The purpose of this notebook is to analyze the aggregated data that was downloaded in data_grab and cleaned in data_clean.
'''
Necessary packages
'''
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
import yfinance as yf
import pandas_datareader as pdr
'''
Data Import and Aggregation
---
Loads the cleaned datasets produced by the data_clean step and splits each
year into one DataFrame per quarter (the dfs list), pairing that quarter's
cosine-similarity column with its return column.
NOTE: rows with a cosine value of 0 are dropped, since zero similarity
implies the two documents are completely different (i.e. one does not exist).
'''
data_2010 = pd.read_csv('2010_dataset.csv', index_col='Unnamed: 0')  # cleaned 2010 data
data_2020 = pd.read_csv('2020_dataset.csv', index_col='Unnamed: 0')  # cleaned 2020 data
# Column names: one cosine-similarity column and one return column per quarter.
cosine_index = ['10k', 'q1', 'q2', 'q3']
return_index = ['q1_ret', 'q2_ret', 'q3_ret', 'q4_ret']

dfs = []  # one DataFrame per (year, quarter): 2010 q1..q4, then 2020 q1..q4
for yearly in (data_2010, data_2020):
    for quarter in range(4):  # all four quarters of the year
        cos_col = cosine_index[quarter]
        ret_col = return_index[quarter]
        nonzero = yearly[cos_col] > 0  # keep only nonzero cosine values
        dfs.append(yearly[[cos_col, ret_col]].loc[nonzero])
# Build a long/short portfolio for each (year, quarter) DataFrame in dfs:
# long the top cosine-similarity quintile, short the bottom quintile.
ports = []    # long/short portfolio per quarter
avg_ret = []  # long-leg mean return minus short-leg mean return
var_ret = []  # variance of the sign-weighted portfolio returns

for i, df in enumerate(dfs):
    offset = 0 if i <= 3 else 4       # fold indices 0-7 back onto the 4 column names
    cos_col = cosine_index[i - offset]
    # qcut with retbins=True returns (categorical, bin_edges); keep the 6 edges.
    edges = pd.qcut(df.iloc[:, 0], q=5, retbins=True)[1]

    # Long leg: +1 for tickers at or above the top-quintile edge.
    df['long_short'] = df[cos_col].apply(lambda x: 1 if x >= edges[-2] else 0)
    port = df[df['long_short'] == 1]
    # Short leg: -1 for tickers at or below the bottom-quintile edge.
    df['long_short'] = df[cos_col].apply(lambda x: -1 if x <= edges[1] else 0)
    port = pd.concat([port, df[df['long_short'] == -1]], ignore_index=False)

    ports.append(port)
    # groupby mean: row 0 is the short leg (-1), row 1 the long leg (+1);
    # column 1 is the return column.
    grouped = port.groupby('long_short').mean()
    avg_ret.append(grouped.iloc[1, 1] - grouped.iloc[0, 1])  # long minus short return
    # Variance of each position's return times its +/-1 direction flag.
    var_ret.append(np.var(np.array(port.iloc[:, 1] * port.iloc[:, 2])))
'''
Benchmark Returns
---
Downloads SPY adjusted closes for the same quarterly windows in 2010 and
2020 and records each window's buy-and-hold return as the market benchmark.
'''
first_dates = ['01-29-2010','04-30-2010','07-30-2010','10-29-2010','01-28-2011']  # 2010 quarter boundary dates
second_dates = ['01-29-2020','04-30-2020','07-30-2020','10-29-2020','01-28-2021']  # 2020 quarter boundary dates
rets = []  # SPY holding-period return per window: 2010 windows first, then 2020
vars = []  # NOTE(review): never populated and shadows the builtin vars() — looks removable, confirm no later cell reads it
# Single loop replaces the two previously duplicated per-year loops;
# consecutive date pairs define each quarterly window.
for dates in (first_dates, second_dates):
    for start, end in zip(dates, dates[1:]):
        spy = pdr.DataReader('SPY', data_source='yahoo', start=start, end=end)['Adj Close']
        rets.append(spy[-1] / spy[0] - 1)  # last close over first close, minus 1
# Assemble per-quarter portfolio stats alongside the SPY benchmark and
# compute alpha as the portfolio return in excess of the market return.
bnchmk_df = pd.DataFrame(
    [avg_ret, var_ret, rets],
    index=['port_ret', 'port_var', 'mkt'],
).transpose()
bnchmk_df['alpha'] = bnchmk_df['port_ret'] - bnchmk_df['mkt']  # excess return over SPY
# bnchmk_df['sharpe'] = bnchmk_df['port_ret'] / np.sqrt(bnchmk_df['port_var']) (attempt at Sharpe ratio to try and further describe our findings)
bnchmk_df  # display the benchmark dataframe
port_ret | port_var | mkt | alpha | |
---|---|---|---|---|
0 | 0.040033 | 0.016541 | 0.110897 | -0.070864 |
1 | 0.024167 | 0.022347 | -0.067464 | 0.091631 |
2 | -0.017025 | 0.032635 | 0.080297 | -0.097322 |
3 | -0.026291 | 0.039989 | 0.083566 | -0.109857 |
4 | 0.064801 | 0.035176 | -0.105419 | 0.170220 |
5 | 0.006286 | 0.021847 | 0.120165 | -0.113879 |
6 | -0.044570 | 0.080878 | 0.022660 | -0.067229 |
7 | 0.049167 | 0.066373 | 0.149281 | -0.100114 |
bnchmk_df.mean() # prints the benchmark dataframe mean
port_ret 0.012071
port_var 0.039473
mkt 0.049248
alpha -0.037177
dtype: float64