In [1]:
# get stock market index data by yfinance library
import yfinance as yf
import datetime
import time
import pandas as pd
In [2]:
# the reddit data is from 2021-01-01 to 2022-08-31
# but the stock market is closed at the first three days in 2021
# get 2020-12-31 data to fill the NaN value of the first three days

# get daily s&p 500 index 
sp500_df = yf.download('^GSPC', start='2020-12-31', end='2022-08-31')
sp500_df
[*********************100%***********************]  1 of 1 completed
Out[2]:
Open High Low Close Adj Close Volume
Date
2020-12-31 3733.270020 3760.199951 3726.879883 3756.070068 3756.070068 3172510000
2021-01-04 3764.610107 3769.989990 3662.709961 3700.649902 3700.649902 5006680000
2021-01-05 3698.020020 3737.830078 3695.070068 3726.860107 3726.860107 4582620000
2021-01-06 3712.199951 3783.040039 3705.340088 3748.139893 3748.139893 6049970000
2021-01-07 3764.709961 3811.550049 3764.709961 3803.790039 3803.790039 5080870000
... ... ... ... ... ... ...
2022-08-24 4126.549805 4156.560059 4119.970215 4140.770020 4140.770020 3056910000
2022-08-25 4153.259766 4200.540039 4147.589844 4199.120117 4199.120117 2976050000
2022-08-26 4198.740234 4203.040039 4057.659912 4057.659912 4057.659912 3175260000
2022-08-29 4034.580078 4062.989990 4017.419922 4030.610107 4030.610107 2963020000
2022-08-30 4041.250000 4044.979980 3965.209961 3986.159912 3986.159912 3190580000

419 rows × 6 columns

In [3]:
# get daily NASDAQ index 
nasdaq_df = yf.download('^IXIC', start='2020-12-31', end='2022-08-31')
nasdaq_df
[*********************100%***********************]  1 of 1 completed
Out[3]:
Open High Low Close Adj Close Volume
Date
2020-12-31 12877.089844 12902.070312 12821.230469 12888.280273 12888.280273 4771390000
2021-01-04 12958.519531 12958.719727 12543.240234 12698.450195 12698.450195 6546740000
2021-01-05 12665.650391 12828.269531 12665.650391 12818.959961 12818.959961 6904420000
2021-01-06 12666.150391 12909.629883 12649.990234 12740.790039 12740.790039 7648340000
2021-01-07 12867.339844 13090.910156 12867.339844 13067.480469 13067.480469 6777010000
... ... ... ... ... ... ...
2022-08-24 12375.150391 12504.330078 12350.160156 12431.530273 12431.530273 3896570000
2022-08-25 12506.370117 12641.259766 12471.980469 12639.269531 12639.269531 4290090000
2022-08-26 12630.580078 12655.839844 12141.519531 12141.709961 12141.709961 4510480000
2022-08-29 12021.049805 12124.870117 11981.419922 12017.669922 12017.669922 4183740000
2022-08-30 12093.059570 12101.849609 11790.019531 11883.139648 11883.139648 4630720000

419 rows × 6 columns

In [4]:
# get daily dow jones index
dj_df = yf.download('^DJI', start='2020-12-31', end='2022-08-31')
dj_df
[*********************100%***********************]  1 of 1 completed
Out[4]:
Open High Low Close Adj Close Volume
Date
2020-12-31 30417.640625 30637.470703 30344.500000 30606.480469 30606.480469 284470000
2021-01-04 30627.470703 30674.279297 29881.820312 30223.890625 30223.890625 475080000
2021-01-05 30204.250000 30504.890625 30141.779297 30391.599609 30391.599609 350910000
2021-01-06 30362.779297 31022.650391 30313.070312 30829.400391 30829.400391 500430000
2021-01-07 30901.179688 31193.400391 30897.859375 31041.130859 31041.130859 427810000
... ... ... ... ... ... ...
2022-08-24 32899.011719 33095.500000 32828.558594 32969.230469 32969.230469 256900000
2022-08-25 33029.269531 33306.089844 32889.851562 33291.781250 33291.781250 308530000
2022-08-26 33293.429688 33364.699219 32278.220703 32283.400391 32283.400391 352850000
2022-08-29 32188.000000 32325.160156 31972.789062 32098.990234 32098.990234 317820000
2022-08-30 32163.839844 32205.660156 31647.220703 31790.869141 31790.869141 317150000

419 rows × 6 columns

In [5]:
# merge three stock market index together
df_index = pd.DataFrame(sp500_df['Close'])
df_index.columns = ['sp500']
df_index['nasdaq'] = nasdaq_df['Close']
df_index['dj'] = dj_df['Close']
df_index['date'] = df_index.index.astype(str)
df_index
Out[5]:
sp500 nasdaq dj date
Date
2020-12-31 3756.070068 12888.280273 30606.480469 2020-12-31
2021-01-04 3700.649902 12698.450195 30223.890625 2021-01-04
2021-01-05 3726.860107 12818.959961 30391.599609 2021-01-05
2021-01-06 3748.139893 12740.790039 30829.400391 2021-01-06
2021-01-07 3803.790039 13067.480469 31041.130859 2021-01-07
... ... ... ... ...
2022-08-24 4140.770020 12431.530273 32969.230469 2022-08-24
2022-08-25 4199.120117 12639.269531 33291.781250 2022-08-25
2022-08-26 4057.659912 12141.709961 32283.400391 2022-08-26
2022-08-29 4030.610107 12017.669922 32098.990234 2022-08-29
2022-08-30 3986.159912 11883.139648 31790.869141 2022-08-30

419 rows × 4 columns

In [6]:
# generate consecutive date data from 2020-12-31 to 2022-08-31
date_list = []
begin_date = datetime.datetime.strptime('2020-12-31', "%Y-%m-%d")
end_date = datetime.datetime.strptime('2022-08-31', "%Y-%m-%d")
while begin_date <= end_date:
    date_str = begin_date.strftime("%Y-%m-%d")
    date_list.append(date_str)
    begin_date += datetime.timedelta(days=1)
In [7]:
df_date = pd.DataFrame(date_list, columns = ['date'])
df_date
Out[7]:
date
0 2020-12-31
1 2021-01-01
2 2021-01-02
3 2021-01-03
4 2021-01-04
... ...
604 2022-08-27
605 2022-08-28
606 2022-08-29
607 2022-08-30
608 2022-08-31

609 rows × 1 columns

In [8]:
# merge stock index df and consecutive date df
# stock market closed dates are filled with NaN
df = pd.merge(df_date,df_index,how='left', on='date')
df
Out[8]:
date sp500 nasdaq dj
0 2020-12-31 3756.070068 12888.280273 30606.480469
1 2021-01-01 NaN NaN NaN
2 2021-01-02 NaN NaN NaN
3 2021-01-03 NaN NaN NaN
4 2021-01-04 3700.649902 12698.450195 30223.890625
... ... ... ... ...
604 2022-08-27 NaN NaN NaN
605 2022-08-28 NaN NaN NaN
606 2022-08-29 4030.610107 12017.669922 32098.990234
607 2022-08-30 3986.159912 11883.139648 31790.869141
608 2022-08-31 NaN NaN NaN

609 rows × 4 columns

In [9]:
# fill the NaN value with valid stock market index from the previous day
stock_data = df.fillna(method = 'ffill')
stock_data
Out[9]:
date sp500 nasdaq dj
0 2020-12-31 3756.070068 12888.280273 30606.480469
1 2021-01-01 3756.070068 12888.280273 30606.480469
2 2021-01-02 3756.070068 12888.280273 30606.480469
3 2021-01-03 3756.070068 12888.280273 30606.480469
4 2021-01-04 3700.649902 12698.450195 30223.890625
... ... ... ... ...
604 2022-08-27 4057.659912 12141.709961 32283.400391
605 2022-08-28 4057.659912 12141.709961 32283.400391
606 2022-08-29 4030.610107 12017.669922 32098.990234
607 2022-08-30 3986.159912 11883.139648 31790.869141
608 2022-08-31 3986.159912 11883.139648 31790.869141

609 rows × 4 columns

In [10]:
# drop the 2020-12-31 row
stock_data =stock_data.drop(0)
In [11]:
stock_data
Out[11]:
date sp500 nasdaq dj
1 2021-01-01 3756.070068 12888.280273 30606.480469
2 2021-01-02 3756.070068 12888.280273 30606.480469
3 2021-01-03 3756.070068 12888.280273 30606.480469
4 2021-01-04 3700.649902 12698.450195 30223.890625
5 2021-01-05 3726.860107 12818.959961 30391.599609
... ... ... ... ...
604 2022-08-27 4057.659912 12141.709961 32283.400391
605 2022-08-28 4057.659912 12141.709961 32283.400391
606 2022-08-29 4030.610107 12017.669922 32098.990234
607 2022-08-30 3986.159912 11883.139648 31790.869141
608 2022-08-31 3986.159912 11883.139648 31790.869141

608 rows × 4 columns

In [12]:
stock_data.to_csv('stock_index.csv')
In [ ]: