-
Notifications
You must be signed in to change notification settings - Fork 0
/
returns_data.py
64 lines (42 loc) · 2.19 KB
/
returns_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
import pandas as pd
import numpy as np
# returns a dataframe with results for Google and S&P 500
def read_goog_sp500_dataframe(goog_file='data/GOOG.csv', sp_file='data/SP_500.csv'):
    """Return the period-over-period returns of Google and the S&P 500.

    Both CSV files are read with their first row treated as a header;
    column 0 is taken as the date and column 5 as the price series
    (presumably the adjusted close of a Yahoo-style export -- confirm
    against the data files).

    Args:
        goog_file: Path to the Google price CSV. Defaults to the location
            the function has always read from.
        sp_file: Path to the S&P 500 price CSV. Defaults to the location
            the function has always read from.

    Returns:
        A DataFrame with the numeric columns ``Goog`` and ``SP500`` holding
        the fractional change from the previous date (ascending date
        order). The first row is NaN because it has no predecessor.
    """
    goog = pd.read_csv(goog_file, sep=",", usecols=[0, 5], names=['Date', 'Goog'], header=0)
    sp = pd.read_csv(sp_file, sep=",", usecols=[0, 5], names=['Date', 'SP500'], header=0)

    # The date object is a string, format it as a date (in both frames so
    # that they can be joined on it)
    goog['Date'] = pd.to_datetime(goog['Date'], format='%Y-%m-%d')
    sp['Date'] = pd.to_datetime(sp['Date'], format='%Y-%m-%d')

    # Pair the two series by date rather than by row position: a positional
    # assignment silently mismatches prices when the files differ in row
    # order or row count.
    prices = goog.merge(sp, on='Date', how='inner')
    prices = prices.sort_values(['Date'], ascending=[True])

    # Only the numeric price columns take part in the returns; the Date
    # column is dropped by the dtype filter.
    returns = prices.select_dtypes(include=['float64', 'int64']).pct_change()
    return returns
# returns dataframe with the results for Google and S&P 500 set up for the logistic regression
def read_goog_sp500_logistic_data():
    """Build the inputs for a logistic regression of Google on the S&P 500.

    Returns:
        A tuple ``(xData, yData)``. ``xData`` is a 2-column numpy array with
        the S&P 500 return and a constant intercept of 1 for each kept row.
        ``yData`` is a boolean pandas Series that is True where the Google
        return is positive.
    """
    frame = read_goog_sp500_dataframe()
    # constant column so the regression can fit an intercept term
    frame['Intercept'] = 1

    # The first row is dropped because its return is NaN (no previous value).
    # The last row is dropped too; the original note says it has no value
    # for the returns.
    # NOTE(review): after the ascending sort only the first row is NaN --
    # confirm whether trimming the last row is still intended.
    predictors = frame[["SP500", "Intercept"]][1:-1]
    xData = np.array(predictors)

    moved_up = frame["Goog"] > 0
    yData = moved_up[1:-1]
    return (xData, yData)
# returns a tuple with 2 fields, the returns for google and S&P 500. Each return is 1D array.
def read_goog_sp500_data(goog_file='data/GOOG.csv', sp_file='data/SP_500.csv'):
    """Return the S&P 500 and Google returns as two aligned 1D arrays.

    Both CSV files are read with their first row treated as a header;
    column 0 is taken as the date and column 5 as the price series
    (presumably the adjusted close of a Yahoo-style export -- confirm
    against the data files).

    Args:
        goog_file: Path to the Google price CSV. Defaults to the location
            the function has always read from.
        sp_file: Path to the S&P 500 price CSV. Defaults to the location
            the function has always read from.

    Returns:
        A tuple ``(xData, yData)`` of 1D numpy arrays: the S&P 500 returns
        and the Google returns, in ascending date order, without the first
        row (which has no previous value to compute a return from).
    """
    goog = pd.read_csv(goog_file, sep=",", usecols=[0, 5], names=['Date', 'Goog'], header=0)
    sp = pd.read_csv(sp_file, sep=",", usecols=[0, 5], names=['Date', 'SP500'], header=0)

    # The date object is a string, format it as a date (in both frames so
    # that they can be joined on it)
    goog['Date'] = pd.to_datetime(goog['Date'], format='%Y-%m-%d')
    sp['Date'] = pd.to_datetime(sp['Date'], format='%Y-%m-%d')

    # Pair the two series by date rather than by row position: a positional
    # assignment silently mismatches prices when the files differ in row
    # order or row count.
    prices = goog.merge(sp, on='Date', how='inner')
    prices = prices.sort_values(['Date'], ascending=[True])

    # Only the numeric price columns take part in the returns; the Date
    # column is dropped by the dtype filter.
    returns = prices.select_dtypes(include=['float64', 'int64']).pct_change()

    # Filter out the very first row which does not have any value for returns
    xData = np.array(returns['SP500'])[1:]
    yData = np.array(returns["Goog"])[1:]
    return (xData, yData)