The purpose of this notebook is to analyze the aggregated data that was downloaded in data_grab and cleaned in data_clean.
'''
Necessary packages
'''
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
import yfinance as yf
import pandas_datareader as pdr
'''
Data Import and Aggregation
---
Loads the cleaned datasets produced by the data_clean step and splits each
year into one DataFrame per quarter (the dfs list), pairing that quarter's
cosine-similarity column with its return column.
NOTE: rows with a cosine value of 0 are dropped, since zero similarity
implies the two documents are completely different (i.e. one does not exist).
'''
data_2010 = pd.read_csv('2010_dataset.csv', index_col='Unnamed: 0')  # cleaned 2010 data
data_2020 = pd.read_csv('2020_dataset.csv', index_col='Unnamed: 0')  # cleaned 2020 data
# Column names: one cosine-similarity column and one return column per quarter.
cosine_index = ['10k', 'q1', 'q2', 'q3']
return_index = ['q1_ret', 'q2_ret', 'q3_ret', 'q4_ret']

dfs = []  # one DataFrame per (year, quarter): 2010 q1..q4, then 2020 q1..q4
for yearly in (data_2010, data_2020):
    for quarter in range(4):  # all four quarters of the year
        cos_col = cosine_index[quarter]
        ret_col = return_index[quarter]
        nonzero = yearly[cos_col] > 0  # keep only nonzero cosine values
        dfs.append(yearly[[cos_col, ret_col]].loc[nonzero])
# Build a long/short portfolio for each (year, quarter) DataFrame in dfs:
# long the top cosine-similarity quintile, short the bottom quintile.
ports = []    # long/short portfolio per quarter
avg_ret = []  # long-leg mean return minus short-leg mean return
var_ret = []  # variance of the sign-weighted portfolio returns

for i, df in enumerate(dfs):
    offset = 0 if i <= 3 else 4       # fold indices 0-7 back onto the 4 column names
    cos_col = cosine_index[i - offset]
    # qcut with retbins=True returns (categorical, bin_edges); keep the 6 edges.
    edges = pd.qcut(df.iloc[:, 0], q=5, retbins=True)[1]

    # Long leg: +1 for tickers at or above the top-quintile edge.
    df['long_short'] = df[cos_col].apply(lambda x: 1 if x >= edges[-2] else 0)
    port = df[df['long_short'] == 1]
    # Short leg: -1 for tickers at or below the bottom-quintile edge.
    df['long_short'] = df[cos_col].apply(lambda x: -1 if x <= edges[1] else 0)
    port = pd.concat([port, df[df['long_short'] == -1]], ignore_index=False)

    ports.append(port)
    # groupby mean: row 0 is the short leg (-1), row 1 the long leg (+1);
    # column 1 is the return column.
    grouped = port.groupby('long_short').mean()
    avg_ret.append(grouped.iloc[1, 1] - grouped.iloc[0, 1])  # long minus short return
    # Variance of each position's return times its +/-1 direction flag.
    var_ret.append(np.var(np.array(port.iloc[:, 1] * port.iloc[:, 2])))
'''
Benchmark Returns
---
Downloads SPY adjusted closes for the same quarterly windows in 2010 and
2020 and records each window's buy-and-hold return as the market benchmark.
'''
first_dates = ['01-29-2010','04-30-2010','07-30-2010','10-29-2010','01-28-2011']  # 2010 quarter boundary dates
second_dates = ['01-29-2020','04-30-2020','07-30-2020','10-29-2020','01-28-2021']  # 2020 quarter boundary dates
rets = []  # SPY holding-period return per window: 2010 windows first, then 2020
vars = []  # NOTE(review): never populated and shadows the builtin vars() — looks removable, confirm no later cell reads it
# Single loop replaces the two previously duplicated per-year loops;
# consecutive date pairs define each quarterly window.
for dates in (first_dates, second_dates):
    for start, end in zip(dates, dates[1:]):
        spy = pdr.DataReader('SPY', data_source='yahoo', start=start, end=end)['Adj Close']
        rets.append(spy[-1] / spy[0] - 1)  # last close over first close, minus 1
# Assemble per-quarter portfolio stats alongside the SPY benchmark and
# compute alpha as the portfolio return in excess of the market return.
bnchmk_df = pd.DataFrame(
    [avg_ret, var_ret, rets],
    index=['port_ret', 'port_var', 'mkt'],
).transpose()
bnchmk_df['alpha'] = bnchmk_df['port_ret'] - bnchmk_df['mkt']  # excess return over SPY
# bnchmk_df['sharpe'] = bnchmk_df['port_ret'] / np.sqrt(bnchmk_df['port_var']) (attempt at Sharpe ratio to try and further describe our findings)
bnchmk_df  # display the benchmark dataframe
port_ret | port_var | mkt | alpha | |
---|---|---|---|---|
0 | 0.040033 | 0.016541 | 0.110897 | -0.070864 |
1 | 0.024167 | 0.022347 | -0.067464 | 0.091631 |
2 | -0.017025 | 0.032635 | 0.080297 | -0.097322 |
3 | -0.026291 | 0.039989 | 0.083566 | -0.109857 |
4 | 0.064801 | 0.035176 | -0.105419 | 0.170220 |
5 | 0.006286 | 0.021847 | 0.120165 | -0.113879 |
6 | -0.044570 | 0.080878 | 0.022660 | -0.067229 |
7 | 0.049167 | 0.066373 | 0.149281 | -0.100114 |
bnchmk_df.mean() # prints the benchmark dataframe mean
port_ret 0.012071
port_var 0.039473
mkt 0.049248
alpha -0.037177
dtype: float64