# get stock market index data by yfinance library
import yfinance as yf
import datetime
import time
import pandas as pd
# the reddit data is from 2021-01-01 to 2022-08-31
# but the stock market is closed at the first three days in 2021
# get 2020-12-31 data to fill the NaN value of the first three days
# get daily s&p 500 index
sp500_df = yf.download('^GSPC', start='2020-12-31', end='2022-08-31')
sp500_df
[*********************100%***********************] 1 of 1 completed
Open | High | Low | Close | Adj Close | Volume | |
---|---|---|---|---|---|---|
Date | ||||||
2020-12-31 | 3733.270020 | 3760.199951 | 3726.879883 | 3756.070068 | 3756.070068 | 3172510000 |
2021-01-04 | 3764.610107 | 3769.989990 | 3662.709961 | 3700.649902 | 3700.649902 | 5006680000 |
2021-01-05 | 3698.020020 | 3737.830078 | 3695.070068 | 3726.860107 | 3726.860107 | 4582620000 |
2021-01-06 | 3712.199951 | 3783.040039 | 3705.340088 | 3748.139893 | 3748.139893 | 6049970000 |
2021-01-07 | 3764.709961 | 3811.550049 | 3764.709961 | 3803.790039 | 3803.790039 | 5080870000 |
... | ... | ... | ... | ... | ... | ... |
2022-08-24 | 4126.549805 | 4156.560059 | 4119.970215 | 4140.770020 | 4140.770020 | 3056910000 |
2022-08-25 | 4153.259766 | 4200.540039 | 4147.589844 | 4199.120117 | 4199.120117 | 2976050000 |
2022-08-26 | 4198.740234 | 4203.040039 | 4057.659912 | 4057.659912 | 4057.659912 | 3175260000 |
2022-08-29 | 4034.580078 | 4062.989990 | 4017.419922 | 4030.610107 | 4030.610107 | 2963020000 |
2022-08-30 | 4041.250000 | 4044.979980 | 3965.209961 | 3986.159912 | 3986.159912 | 3190580000 |
419 rows × 6 columns
# get daily NASDAQ index
nasdaq_df = yf.download('^IXIC', start='2020-12-31', end='2022-08-31')
nasdaq_df
[*********************100%***********************] 1 of 1 completed
Open | High | Low | Close | Adj Close | Volume | |
---|---|---|---|---|---|---|
Date | ||||||
2020-12-31 | 12877.089844 | 12902.070312 | 12821.230469 | 12888.280273 | 12888.280273 | 4771390000 |
2021-01-04 | 12958.519531 | 12958.719727 | 12543.240234 | 12698.450195 | 12698.450195 | 6546740000 |
2021-01-05 | 12665.650391 | 12828.269531 | 12665.650391 | 12818.959961 | 12818.959961 | 6904420000 |
2021-01-06 | 12666.150391 | 12909.629883 | 12649.990234 | 12740.790039 | 12740.790039 | 7648340000 |
2021-01-07 | 12867.339844 | 13090.910156 | 12867.339844 | 13067.480469 | 13067.480469 | 6777010000 |
... | ... | ... | ... | ... | ... | ... |
2022-08-24 | 12375.150391 | 12504.330078 | 12350.160156 | 12431.530273 | 12431.530273 | 3896570000 |
2022-08-25 | 12506.370117 | 12641.259766 | 12471.980469 | 12639.269531 | 12639.269531 | 4290090000 |
2022-08-26 | 12630.580078 | 12655.839844 | 12141.519531 | 12141.709961 | 12141.709961 | 4510480000 |
2022-08-29 | 12021.049805 | 12124.870117 | 11981.419922 | 12017.669922 | 12017.669922 | 4183740000 |
2022-08-30 | 12093.059570 | 12101.849609 | 11790.019531 | 11883.139648 | 11883.139648 | 4630720000 |
419 rows × 6 columns
# get daily dow jones index
dj_df = yf.download('^DJI', start='2020-12-31', end='2022-08-31')
dj_df
[*********************100%***********************] 1 of 1 completed
Open | High | Low | Close | Adj Close | Volume | |
---|---|---|---|---|---|---|
Date | ||||||
2020-12-31 | 30417.640625 | 30637.470703 | 30344.500000 | 30606.480469 | 30606.480469 | 284470000 |
2021-01-04 | 30627.470703 | 30674.279297 | 29881.820312 | 30223.890625 | 30223.890625 | 475080000 |
2021-01-05 | 30204.250000 | 30504.890625 | 30141.779297 | 30391.599609 | 30391.599609 | 350910000 |
2021-01-06 | 30362.779297 | 31022.650391 | 30313.070312 | 30829.400391 | 30829.400391 | 500430000 |
2021-01-07 | 30901.179688 | 31193.400391 | 30897.859375 | 31041.130859 | 31041.130859 | 427810000 |
... | ... | ... | ... | ... | ... | ... |
2022-08-24 | 32899.011719 | 33095.500000 | 32828.558594 | 32969.230469 | 32969.230469 | 256900000 |
2022-08-25 | 33029.269531 | 33306.089844 | 32889.851562 | 33291.781250 | 33291.781250 | 308530000 |
2022-08-26 | 33293.429688 | 33364.699219 | 32278.220703 | 32283.400391 | 32283.400391 | 352850000 |
2022-08-29 | 32188.000000 | 32325.160156 | 31972.789062 | 32098.990234 | 32098.990234 | 317820000 |
2022-08-30 | 32163.839844 | 32205.660156 | 31647.220703 | 31790.869141 | 31790.869141 | 317150000 |
419 rows × 6 columns
# merge three stock market index together
df_index = pd.DataFrame(sp500_df['Close'])
df_index.columns = ['sp500']
df_index['nasdaq'] = nasdaq_df['Close']
df_index['dj'] = dj_df['Close']
df_index['date'] = df_index.index.astype(str)
df_index
sp500 | nasdaq | dj | date | |
---|---|---|---|---|
Date | ||||
2020-12-31 | 3756.070068 | 12888.280273 | 30606.480469 | 2020-12-31 |
2021-01-04 | 3700.649902 | 12698.450195 | 30223.890625 | 2021-01-04 |
2021-01-05 | 3726.860107 | 12818.959961 | 30391.599609 | 2021-01-05 |
2021-01-06 | 3748.139893 | 12740.790039 | 30829.400391 | 2021-01-06 |
2021-01-07 | 3803.790039 | 13067.480469 | 31041.130859 | 2021-01-07 |
... | ... | ... | ... | ... |
2022-08-24 | 4140.770020 | 12431.530273 | 32969.230469 | 2022-08-24 |
2022-08-25 | 4199.120117 | 12639.269531 | 33291.781250 | 2022-08-25 |
2022-08-26 | 4057.659912 | 12141.709961 | 32283.400391 | 2022-08-26 |
2022-08-29 | 4030.610107 | 12017.669922 | 32098.990234 | 2022-08-29 |
2022-08-30 | 3986.159912 | 11883.139648 | 31790.869141 | 2022-08-30 |
419 rows × 4 columns
# generate consecutive date data from 2020-12-31 to 2022-08-31
date_list = []
begin_date = datetime.datetime.strptime('2020-12-31', "%Y-%m-%d")
end_date = datetime.datetime.strptime('2022-08-31', "%Y-%m-%d")
while begin_date <= end_date:
date_str = begin_date.strftime("%Y-%m-%d")
date_list.append(date_str)
begin_date += datetime.timedelta(days=1)
df_date = pd.DataFrame(date_list, columns = ['date'])
df_date
date | |
---|---|
0 | 2020-12-31 |
1 | 2021-01-01 |
2 | 2021-01-02 |
3 | 2021-01-03 |
4 | 2021-01-04 |
... | ... |
604 | 2022-08-27 |
605 | 2022-08-28 |
606 | 2022-08-29 |
607 | 2022-08-30 |
608 | 2022-08-31 |
609 rows × 1 columns
# merge stock index df and consecutive date df
# stock market closed dates are filled with NaN
df = pd.merge(df_date,df_index,how='left', on='date')
df
date | sp500 | nasdaq | dj | |
---|---|---|---|---|
0 | 2020-12-31 | 3756.070068 | 12888.280273 | 30606.480469 |
1 | 2021-01-01 | NaN | NaN | NaN |
2 | 2021-01-02 | NaN | NaN | NaN |
3 | 2021-01-03 | NaN | NaN | NaN |
4 | 2021-01-04 | 3700.649902 | 12698.450195 | 30223.890625 |
... | ... | ... | ... | ... |
604 | 2022-08-27 | NaN | NaN | NaN |
605 | 2022-08-28 | NaN | NaN | NaN |
606 | 2022-08-29 | 4030.610107 | 12017.669922 | 32098.990234 |
607 | 2022-08-30 | 3986.159912 | 11883.139648 | 31790.869141 |
608 | 2022-08-31 | NaN | NaN | NaN |
609 rows × 4 columns
# fill the NaN value with valid stock market index from the previous day
stock_data = df.fillna(method = 'ffill')
stock_data
date | sp500 | nasdaq | dj | |
---|---|---|---|---|
0 | 2020-12-31 | 3756.070068 | 12888.280273 | 30606.480469 |
1 | 2021-01-01 | 3756.070068 | 12888.280273 | 30606.480469 |
2 | 2021-01-02 | 3756.070068 | 12888.280273 | 30606.480469 |
3 | 2021-01-03 | 3756.070068 | 12888.280273 | 30606.480469 |
4 | 2021-01-04 | 3700.649902 | 12698.450195 | 30223.890625 |
... | ... | ... | ... | ... |
604 | 2022-08-27 | 4057.659912 | 12141.709961 | 32283.400391 |
605 | 2022-08-28 | 4057.659912 | 12141.709961 | 32283.400391 |
606 | 2022-08-29 | 4030.610107 | 12017.669922 | 32098.990234 |
607 | 2022-08-30 | 3986.159912 | 11883.139648 | 31790.869141 |
608 | 2022-08-31 | 3986.159912 | 11883.139648 | 31790.869141 |
609 rows × 4 columns
# drop the 2020-12-31 row
stock_data =stock_data.drop(0)
stock_data
date | sp500 | nasdaq | dj | |
---|---|---|---|---|
1 | 2021-01-01 | 3756.070068 | 12888.280273 | 30606.480469 |
2 | 2021-01-02 | 3756.070068 | 12888.280273 | 30606.480469 |
3 | 2021-01-03 | 3756.070068 | 12888.280273 | 30606.480469 |
4 | 2021-01-04 | 3700.649902 | 12698.450195 | 30223.890625 |
5 | 2021-01-05 | 3726.860107 | 12818.959961 | 30391.599609 |
... | ... | ... | ... | ... |
604 | 2022-08-27 | 4057.659912 | 12141.709961 | 32283.400391 |
605 | 2022-08-28 | 4057.659912 | 12141.709961 | 32283.400391 |
606 | 2022-08-29 | 4030.610107 | 12017.669922 | 32098.990234 |
607 | 2022-08-30 | 3986.159912 | 11883.139648 | 31790.869141 |
608 | 2022-08-31 | 3986.159912 | 11883.139648 | 31790.869141 |
608 rows × 4 columns
stock_data.to_csv('stock_index.csv')