cortex/machinelearning.py at master · notdon/cortex · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
from collections import Counter
import numpy as np
import pandas as pd
import pickle
from sklearn import svm, cross_validation, neighbors
from sklearn.ensemble import VotingClassifier, RandomForestClassifier
from statistics import mean


def process_data_for_labels(ticker):
    """Load the joined price table and append forward-change columns for *ticker*.

    Reads ``sp500joined.csv`` (first column used as the index). For each
    offset ``i`` from 1 to 5 a column named ``<ticker>_<i>d`` is added holding
    ``(value i rows later - current value) / current value``.

    Returns a ``(tickers, df)`` tuple: the list of column names that were in
    the CSV before the derived columns were added, and the augmented frame
    with NaN values replaced by 0.
    """
    horizon = 5
    frame = pd.read_csv('sp500joined.csv', index_col=0)

    # Snapshot the column names before any derived columns are appended.
    original_columns = frame.columns.values.tolist()
    frame.fillna(0, inplace=True)

    current = frame[ticker]
    for offset in range(1, horizon + 1):
        # shift(-offset) aligns each row with the value `offset` rows later.
        later = current.shift(-offset)
        frame['{}_{}d'.format(ticker, offset)] = (later - current) / current

    # The trailing rows have no later value to compare against; zero them.
    frame.fillna(0, inplace=True)
    return original_columns, frame


def buy_sell_hold(*args, buy_threshold=0.027, sell_threshold=-0.025):
    """Turn a sequence of fractional changes into a 1 / -1 / 0 label.

    The changes are scanned in the order given, and the first one that
    crosses either threshold decides the result.

    Args:
        *args: Fractional changes to inspect (e.g. ``0.03`` for +3%).
        buy_threshold: A change strictly greater than this returns ``1``.
        sell_threshold: A change strictly less than this returns ``-1``.

    Returns:
        ``1`` (buy) or ``-1`` (sell) for the first change that crosses a
        threshold, otherwise ``0`` (hold) -- including when no changes are
        supplied at all.
    """
    for change in args:
        if change > buy_threshold:
            return 1
        if change < sell_threshold:
            return -1

    return 0

def extract_featuresets(ticker):
    """Build the feature matrix and label vector used to train on *ticker*.

    Labels come from applying ``buy_sell_hold`` row by row to the five
    ``<ticker>_1d`` .. ``<ticker>_5d`` columns produced by
    ``process_data_for_labels``; they are stored in ``<ticker>_target``.
    Features are the row-over-row percentage changes of every original
    column, with infinities and NaN replaced by 0.

    Prints the label distribution as a side effect.

    Returns ``(X, y, df)``: the feature array, the label array, and the
    cleaned frame they were taken from.
    """
    tickers, df = process_data_for_labels(ticker)
    target_col = '{}_target'.format(ticker)

    # One forward-change series per look-ahead offset (1 through 5 rows).
    horizon_series = [df['{}_{}d'.format(ticker, day)] for day in range(1, 6)]
    df[target_col] = [buy_sell_hold(*changes) for changes in zip(*horizon_series)]

    labels = df[target_col].values.tolist()
    print('Data Spread:', Counter(str(label) for label in labels))

    # Rows that contain an infinity anywhere are discarded: convert inf to
    # NaN first so dropna() removes them.
    df.fillna(0, inplace=True)
    df = df.replace([np.inf, -np.inf], np.nan)
    df.dropna(inplace=True)

    # Features: percentage change of each original column between rows.
    df_vals = df[tickers].pct_change()
    df_vals = df_vals.replace([np.inf, -np.inf], 0)
    df_vals.fillna(0, inplace=True)

    X = df_vals.values
    y = df[target_col].values

    return X, y, df
#extract_featuresets('SRIL')
def do_ml(ticker):
    """Train a voting classifier for *ticker* and return its test accuracy.

    The feature set from ``extract_featuresets`` is split 75% / 25% into
    training and test data. A ``VotingClassifier`` combining a linear SVC,
    a k-nearest-neighbours classifier and a random forest is fitted on the
    training part and scored on the test part.

    Prints the accuracy and the distribution of predicted labels.

    Returns:
        The accuracy reported by ``clf.score`` on the held-out test split.
    """
    # ``sklearn.cross_validation`` was deprecated in scikit-learn 0.18 and
    # removed in 0.20. Prefer its replacement and fall back to the legacy
    # module only on old installations.
    # NOTE: the file-level ``from sklearn import ... cross_validation`` import
    # must also be updated before this file can load on scikit-learn >= 0.20.
    try:
        from sklearn.model_selection import train_test_split
    except ImportError:
        train_test_split = cross_validation.train_test_split

    X, y, _ = extract_featuresets(ticker)

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

    clf = VotingClassifier([
        ('lsvc', svm.LinearSVC()),
        ('knn', neighbors.KNeighborsClassifier()),
        ('rfor', RandomForestClassifier()),
    ])

    clf.fit(X_train, y_train)

    confidence = clf.score(X_test, y_test)
    print('Accuracy:', confidence)

    predictions = clf.predict(X_test)
    print('Predicted spread:', Counter(predictions))

    return confidence

# df = pd.read_csv('sp500joined.csv', index_col=0)
# tickers = df.columns.values.tolist()
# for ticker in tickers:
#     print("\n"+ticker + " \n")
#     do_ml(ticker)
# Load the collection of ticker symbols to evaluate from the pickle file.
with open("sp500tickers.pickle", "rb") as f:
    tickers = pickle.load(f)

# Train one model per ticker, reporting each accuracy together with the
# running average over all tickers processed so far.
accuracies = []
for count, ticker in enumerate(tickers):
    accuracy = do_ml(ticker)
    accuracies += [accuracy]
    print(
        "\n{} accuracy: {}. Average accuracy:{}\n".format(
            ticker, accuracy, mean(accuracies)
        )
    )
    print("------------------------------------------------------------")