forked from hengchao0248/ccf2016_sougou
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdm_nn_stack.py
More file actions
86 lines (70 loc) · 2.88 KB
/
dm_nn_stack.py
File metadata and controls
86 lines (70 loc) · 2.88 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
# dm-nn stack for education/age/gender
import pandas as pd
import numpy as np
from datetime import datetime
from sklearn.model_selection import KFold
from gensim.models import Doc2Vec
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation
from keras.utils import np_utils
import cfg
# -----------------------myfunc-----------------------
def myAcc(y_true, y_pred):
    """Return the fraction of samples whose arg-max prediction equals the label.

    Args:
        y_true: Class indices, one per sample. Any array-like holding exactly
            one entry per sample is accepted (e.g. shape ``(n,)`` or
            ``(n, 1)``); it is flattened before the comparison.
        y_pred: 2-D array-like of per-class scores, one row per sample.

    Returns:
        The mean of the element-wise match between ``y_true`` and the
        arg-max of each row of ``y_pred``.

    Raises:
        ValueError: If ``y_true`` and ``y_pred`` describe a different number
            of samples.
    """
    # Flatten so a column vector cannot broadcast against the 1-D arg-max
    # result into an (n, n) comparison matrix.
    y_true = np.asarray(y_true).ravel()
    y_pred = np.argmax(y_pred, axis=1)
    if y_true.shape[0] != y_pred.shape[0]:
        raise ValueError(
            'y_true has {} samples but y_pred has {}'.format(
                y_true.shape[0], y_pred.shape[0]))
    return np.mean(y_true == y_pred)
# -----------------------load dataset----------------------
# Read the id and the three label columns; nrows caps the read at 200000 rows.
df_all = pd.read_csv(cfg.data_path + 'all_v2.csv', encoding='utf8',
                     usecols=['Id', 'Education', 'age', 'gender'], nrows=200000)

# One label array per prediction target, keyed by its column name.
ys = {}
for label in ['Education', 'age', 'gender']:
    ys[label] = np.array(df_all[label])

# Load the trained Doc2Vec model and collect one document vector per CSV row.
# The count follows len(df_all) rather than a literal 200000 so the feature
# matrix and the label arrays always have the same number of rows, even if
# the CSV holds fewer rows than the nrows cap.
# NOTE(review): assumes document vector i corresponds to CSV row i - confirm
# against the script that trained dm_d2v.model.
model = Doc2Vec.load(cfg.data_path + 'dm_d2v.model')
X_sp = np.array([model.docvecs[i] for i in range(len(df_all))])
# ----------------------dmd2v stack for Education/age/gender---------------------------
def _build_classifier(input_dim, nb_classes):
    """Return a compiled network: Dense(300) -> Dropout -> tanh -> softmax."""
    net = Sequential()
    net.add(Dense(300, input_shape=(input_dim,)))
    net.add(Dropout(0.1))
    net.add(Activation('tanh'))
    net.add(Dense(nb_classes))
    net.add(Activation('softmax'))
    net.compile(loss='categorical_crossentropy',
                optimizer='adadelta',
                metrics=['accuracy'])
    return net


# Output frame: one row per loaded sample, one column per (label, class) pair.
df_stack = pd.DataFrame(index=range(len(df_all)))
TR = 100000  # the first TR rows are the training part, the rest the test part
n = 5        # number of cross-validation folds
X = X_sp[:TR]
X_te = X_sp[TR:]
feat = 'dmd2v'

# The float32 copy of the test features is the same for every label and fold.
X_test = X_te.astype('float32')

for lb in ['Education', 'age', 'gender']:
    num_class = len(pd.value_counts(ys[lb]))
    y = ys[lb][:TR]
    y_te = ys[lb][TR:]

    # stack: each training row is filled once, by the fold that held it out.
    # stack_te: test-part predictions summed over folds, averaged afterwards.
    stack = np.zeros((X.shape[0], num_class))
    stack_te = np.zeros((X_te.shape[0], num_class))

    # NOTE(review): to_categorical with num_class columns presumably requires
    # labels coded as integers 0..num_class-1 - confirm against all_v2.csv.
    Y_test = np_utils.to_categorical(y_te, num_class)

    # sklearn.model_selection.KFold takes n_splits and yields index pairs
    # from .split(); the old cross_validation form KFold(len(y), n_folds=n)
    # is not accepted by the class this file imports.
    for k, (tr, va) in enumerate(KFold(n_splits=n).split(X)):
        print('{} stack:{}/{}'.format(datetime.now(), k+1, n))
        X_train = X[tr].astype('float32')
        Y_train = np_utils.to_categorical(y[tr], num_class)

        model = _build_classifier(X_train.shape[1], num_class)
        # The test part is passed as validation data, so its metrics are
        # reported each epoch.
        model.fit(X_train, Y_train, shuffle=True,
                  batch_size=128, nb_epoch=35,
                  verbose=2, validation_data=(X_test, Y_test))

        y_pred_va = model.predict_proba(X[va])
        y_pred_te = model.predict_proba(X_te)
        print('va acc:', myAcc(y[va], y_pred_va))
        print('te acc:', myAcc(y_te, y_pred_te))
        stack[va] += y_pred_va
        stack_te += y_pred_te

    # Turn the summed test-part predictions into the mean over the n folds.
    stack_te /= n
    stack_all = np.vstack([stack, stack_te])
    for cls_idx in range(stack_all.shape[1]):
        df_stack['{}_{}_{}'.format(feat, lb, cls_idx)] = stack_all[:, cls_idx]

df_stack.to_csv(cfg.data_path + 'dmd2v_stack_20W.csv', encoding='utf8', index=None)
print(datetime.now(), 'save dmd2v stack done!')