© Ran Aroussi
@aroussi | aroussi.com | github.com/ranaroussi
September, 2018
* Note that ML research is a close neighbour of data mining, and hence overfitting is something you should pay very close attention to.
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
# Tickers as they are keyed in the raw data set.
symbols = ["^GSPC", "^VIX", "^VXV"]

# NOTE(review): unpickling can execute arbitrary code — only load this file
# if it comes from a trusted source.
raw = pd.read_pickle('ml-raw-data.pkl')

# Build one frame holding the closing price of every symbol, one column each.
df = pd.DataFrame()
for symbol in symbols:
    # Column names drop the "^" prefix and use SPX in place of GSPC.
    new_symbol = symbol.replace("^", "").replace("GSPC", "SPX")
    df[new_symbol] = raw[symbol]['Close']
SPX | VIX | VXV | |
Date | |||
2018-09-14 | 2904.979980 | 12.07 | 14.37 |
2018-09-17 | 2888.800049 | 13.68 | 15.28 |
2018-09-18 | 2904.310059 | 12.79 | 15.04 |
2018-09-19 | 2907.949951 | 11.75 | 14.65 |
2018-09-20 | 2932.149902 | 11.52 | NaN |
# Shorthand for the S&P 500 price/volume frame, which most features read.
spx = raw['^GSPC']

# Day-over-day percentage changes (in percent) of volume and closing prices.
# Insertion order is kept, so the columns land in df in this order.
pct_sources = {
    'SPXVOL': spx['Volume'],
    'SPX1D': spx['Close'],
    'VIX1D': raw['^VIX']['Close'],
    'VXV1D': raw['^VXV']['Close'],
}
for column, source in pct_sources.items():
    df[column] = source.pct_change() * 100

# Intraday S&P 500 measures, in index points.
# NOTE(review): despite the "HL" name this is Open minus Low, not High minus
# Low — confirm that this is intended.
df['SPXHL'] = spx['Open'] - spx['Low']
df['SPXOC'] = spx['Open'] - spx['Close']

# Open-to-close return of the S&P 500, in percent.
df['SPXO2C'] = ((spx['Close'] / spx['Open']) - 1) * 100
SPX | VIX | VXV | SPXVOL | SPX1D | VIX1D | VXV1D | SPXHL | SPXOC | SPXO2C | |
Date | ||||||||||
2018-09-13 | 2904.179932 | 12.37 | 14.77 | -0.306285 | 0.528225 | -5.859970 | -3.590078 | 0.460205 | -7.329834 | 0.253028 |
2018-09-14 | 2904.979980 | 12.07 | 14.37 | -3.229870 | 0.027548 | -2.425222 | -2.708192 | 10.609863 | 1.399903 | -0.048167 |
2018-09-17 | 2888.800049 | 13.68 | 15.28 | -6.414376 | -0.556972 | 13.338857 | 6.332637 | 17.670166 | 15.030029 | -0.517593 |
2018-09-18 | 2904.310059 | 12.79 | 15.04 | 4.303268 | 0.536901 | -6.505848 | -1.570681 | 0.310058 | -13.570069 | 0.469432 |
2018-09-19 | 2907.949951 | 11.75 | 14.65 | 6.680847 | 0.125327 | -8.131353 | -2.593085 | 2.780030 | -1.349853 | 0.046441 |
# Label each day by the sign of its open-to-close return: 1 when flat or up,
# -1 when down (0 is only produced where SPXO2C is NaN and neither test holds).
df['target'] = np.where(df["SPXO2C"] >= 0, 1, np.where(df["SPXO2C"] < 0, -1, 0))
# Move every label one row up so each row carries the NEXT day's direction.
df['target'] = df['target'].shift(-1) # next day
# The shift leaves the last row without a label and the pct_change features
# leave NaN in the first row; drop incomplete rows so no NaN reaches the model.
df.dropna(inplace=True)
SPX | VIX | VXV | SPXVOL | SPX1D | VIX1D | VXV1D | SPXHL | SPXOC | SPXO2C | target | |
Date | |||||||||||
2018-09-05 | 2888.600098 | 13.91 | 15.92 | 5.335938 | -0.280313 | 5.699088 | 3.042071 | 14.670166 | 2.989990 | -0.103403 | -1.0 |
2018-09-06 | 2878.050049 | 14.65 | 16.34 | -3.136444 | -0.365231 | 5.319914 | 2.638191 | 21.349854 | 10.589844 | -0.366603 | 1.0 |
2018-09-07 | 2871.679932 | 14.88 | 16.59 | -6.157492 | -0.221334 | 1.569966 | 1.529988 | 4.139893 | -3.419922 | 0.119233 | -1.0 |
2018-09-10 | 2877.129883 | 14.16 | 16.17 | -7.292950 | 0.189783 | -4.838710 | -2.531646 | 5.449952 | 4.260010 | -0.147846 | 1.0 |
2018-09-11 | 2887.889893 | 13.22 | 15.54 | 6.160211 | 0.373984 | -6.638418 | -3.896104 | 4.790039 | -16.319825 | 0.568324 | 1.0 |
2018-09-12 | 2888.919922 | 13.14 | 15.32 | 12.596994 | 0.035667 | -0.605144 | -1.415701 | 9.090088 | -0.629883 | 0.021808 | 1.0 |
2018-09-13 | 2904.179932 | 12.37 | 14.77 | -0.306285 | 0.528225 | -5.859970 | -3.590078 | 0.460205 | -7.329834 | 0.253028 | -1.0 |
2018-09-14 | 2904.979980 | 12.07 | 14.37 | -3.229870 | 0.027548 | -2.425222 | -2.708192 | 10.609863 | 1.399903 | -0.048167 | -1.0 |
2018-09-17 | 2888.800049 | 13.68 | 15.28 | -6.414376 | -0.556972 | 13.338857 | 6.332637 | 17.670166 | 15.030029 | -0.517593 | 1.0 |
2018-09-18 | 2904.310059 | 12.79 | 15.04 | 4.303268 | 0.536901 | -6.505848 | -1.570681 | 0.310058 | -13.570069 | 0.469432 | 1.0 |
# Every column except the label is a model input.
feature_cols = [col for col in df.columns if col != 'target']
features = df[feature_cols].values
labels = df['target'].values.flatten()

# split df chronologically: the oldest `train_test_split` share of the rows
# trains the model, the most recent rows are held out for testing.
train_test_split = .9
n_rows = len(df.index)
split_at = int(n_rows * train_test_split)
# `sample` is the size of the held-out tail, so slicing with `[-sample:]`
# selects exactly the test rows.
sample = n_rows - split_at
if split_at == 0 or sample == 0:
    raise ValueError(
        "not enough rows (%d) for a %.2f train/test split"
        % (n_rows, train_test_split)
    )

train_features = features[:split_at]
train_labels = labels[:split_at]
test_features = features[split_at:]
test_labels = labels[split_at:]
# normalize: Normalizer rescales each sample (row) on its own
normalizer = preprocessing.Normalizer()
train_features = normalizer.fit_transform(train_features)
# The held-out rows must go through the same transform, otherwise the tree is
# asked to predict on inputs scaled differently from the ones it was fit on.
test_features = normalizer.transform(test_features)
# init classifier
clf = DecisionTreeClassifier()
# fit
clf.fit(train_features, train_labels)
DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None, max_features=None, max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, min_samples_leaf=1, min_samples_split=2, min_weight_fraction_leaf=0.0, presort=False, random_state=None, splitter='best')
# Predict the held-out rows with the fitted tree.
prediction = clf.predict(test_features)

# Hold-out accuracy of those predictions.
accuracy_score(y_true=test_labels, y_pred=prediction)

# 10-fold cross-validation over the full data set.
# NOTE(review): this is fed the raw `features` array, not the normalized
# training features the classifier above was fit on — confirm intent.
scores = cross_val_score(estimator=clf, X=features, y=labels, cv=10)
mean_pct = scores.mean()*100
std_pct = scores.std()*100
print("Mean %.2f%%, Std: %.2f" % (mean_pct, std_pct))
Mean 51.38%, Std: 2.72
# Copy of the last `sample` rows of df with the model's predictions attached
# as an extra 'predicted' column (assign returns a new frame, df is untouched).
testdf = df.iloc[-sample:].assign(predicted=prediction)
SPX | VIX | VXV | SPXVOL | SPX1D | VIX1D | VXV1D | SPXHL | SPXOC | SPXO2C | target | predicted | |
Date | ||||||||||||
2018-09-12 | 2888.919922 | 13.14 | 15.32 | 12.596994 | 0.035667 | -0.605144 | -1.415701 | 9.090088 | -0.629883 | 0.021808 | 1.0 | 1.0 |
2018-09-13 | 2904.179932 | 12.37 | 14.77 | -0.306285 | 0.528225 | -5.859970 | -3.590078 | 0.460205 | -7.329834 | 0.253028 | -1.0 | 1.0 |
2018-09-14 | 2904.979980 | 12.07 | 14.37 | -3.229870 | 0.027548 | -2.425222 | -2.708192 | 10.609863 | 1.399903 | -0.048167 | -1.0 | -1.0 |
2018-09-17 | 2888.800049 | 13.68 | 15.28 | -6.414376 | -0.556972 | 13.338857 | 6.332637 | 17.670166 | 15.030029 | -0.517593 | 1.0 | 1.0 |
2018-09-18 | 2904.310059 | 12.79 | 15.04 | 4.303268 | 0.536901 | -6.505848 | -1.570681 | 0.310058 | -13.570069 | 0.469432 | 1.0 | 1.0 |
# A prediction made on one row is about the next day, so move it down one row
# to line it up with the open-to-close return it was forecasting.
signal = testdf['predicted'].shift(1)
testdf['strategy'] = signal * testdf['SPXO2C']

# Plot the running sums of the daily S&P 500 change and of the strategy.
testdf[['SPX1D', 'strategy']].cumsum().plot()
<matplotlib.axes._subplots.AxesSubplot at 0x1a1c9abb00>
This was part 4 of the 4-part webinar series.
Webinars available @ aroussi.com/webinars
© Ran Aroussi
@aroussi | aroussi.com | github.com/ranaroussi
September, 2018