forked from hengchao0248/ccf2016_sougou
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtrain_d2v_model.py
More file actions
97 lines (81 loc) · 3.37 KB
/
train_d2v_model.py
File metadata and controls
97 lines (81 loc) · 3.37 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
# train dbow/dm for education/age/gender
import pandas as pd
import jieba
from datetime import datetime
from collections import namedtuple
from gensim.models.doc2vec import Doc2Vec
import subprocess
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
import codecs
import cfg
import numpy as np
df_all = pd.read_csv(cfg.data_path + 'all_v2.csv', encoding='utf8')
# -------------------add row number to query----------------------
# Tokenize the first 200k users' queries with jieba and write one document
# per line in the form "_*<row_id> tok1 tok2 ...", the format Doc_list parses.
# `with` guarantees the file is closed even if tokenization raises.
with codecs.open('alldata-id.txt', 'w', encoding='utf8') as doc_f:
    for i, queries in enumerate(df_all.iloc[:200000]['query']):
        words = []
        # queries is a tab-separated concatenation of one user's search queries
        for query in queries.split('\t'):
            words.extend(list(jieba.cut(query)))
        if i % 10000 == 0:
            print(datetime.now(), i)
        # BUG FIX: newline appended — the original wrote no '\n', collapsing the
        # entire corpus onto a single line, so Doc_list would yield one document.
        doc_f.write('_*{} {}\n'.format(i, ' '.join(words)))
# -------------------------prepare to train--------------------------------------------
def run_cmd(cmd):
    """Run *cmd* through the shell, echo its combined stdout/stderr line by
    line as it arrives, and return the process exit code."""
    print(cmd)
    process = subprocess.Popen(cmd, shell=True,
                               stdout=subprocess.PIPE,
                               stderr=subprocess.STDOUT)
    # Stream output instead of buffering it all; the original bound an unused
    # enumerate() index here — removed.
    for line in iter(process.stdout.readline, b''):
        print(line.decode('utf8').rstrip())
    # Drain/close the pipe and wait so returncode is populated.
    process.communicate()
    return process.returncode
# A labeled training document: token list plus the list of integer tags
# (here a single row id) used by doc2vec.
SentimentDocument = namedtuple('SentimentDocument', 'words tags')


class Doc_list(object):
    """Stream SentimentDocument items from a corpus file whose lines look
    like '_*<id> tok1 tok2 ...' (the format produced by the preprocessing
    step above)."""

    def __init__(self, f):
        self.f = f  # path to the corpus file

    def __iter__(self):
        for line in codecs.open(self.f, encoding='utf8'):
            tokens = line.split()
            # tokens[0] is '_*<id>'; strip the two-char prefix for the tag.
            yield SentimentDocument(tokens[1:], [int(tokens[0][2:])])
# DBOW doc2vec model: dm=0 selects the distributed bag-of-words architecture;
# negative sampling (negative=5, hs=0), 300-dim vectors.
# NOTE(review): `size` is the pre-gensim-4 keyword (renamed vector_size), and
# the later bare d2v.train(doc_list) calls also match the old API — confirm
# the installed gensim version.
d2v = Doc2Vec(dm=0, size=300, negative=5, hs=0, min_count=3, window=30,
sample=1e-5, workers=8, alpha=0.025, min_alpha=0.025)
# Build the vocabulary from one streaming pass over the corpus.
doc_list = Doc_list('alldata-id.txt')
d2v.build_vocab(doc_list)
# -------------------train dbow doc2vec---------------------------------------------
# Load the ground-truth labels for the same 200k rows so embedding quality can
# be scored with a quick logistic-regression cross-validation after each pass.
df_lb = pd.read_csv(cfg.data_path + 'all_v2.csv', usecols=['Education', 'age', 'gender'], nrows=200000)
ys = {}
for lb in ['Education', 'age', 'gender']:
    ys[lb] = np.array(df_lb[lb])
for i in range(2):
    print(datetime.now(), 'pass:', i + 1)
    # Re-shuffle the corpus each pass so training sees a fresh document order.
    run_cmd('shuf alldata-id.txt > alldata-id-shuf.txt')
    # BUG FIX: train on the shuffled file — the original re-read the
    # unshuffled 'alldata-id.txt', making the shuf step a no-op.
    doc_list = Doc_list('alldata-id-shuf.txt')
    d2v.train(doc_list)
    # Vectors are addressed by the row-id tags, so shuffled training order
    # does not affect which vector belongs to which row.
    X_d2v = np.array([d2v.docvecs[j] for j in range(200000)])
    for lb in ['Education', 'age', 'gender']:
        scores = cross_val_score(LogisticRegression(C=3), X_d2v, ys[lb], cv=5)
        print('dbow', lb, scores, np.mean(scores))
# NOTE(review): indentation was lost in the scraped source; the save is placed
# after the loop (final model) — confirm against upstream if per-pass
# checkpointing was intended.
d2v.save(cfg.data_path + 'dbow_d2v.model')
print(datetime.now(), 'save done')
# DM doc2vec model: dm=1 selects the distributed-memory architecture; narrower
# window (10) and higher starting learning rate (alpha=0.05) than the DBOW run.
# NOTE(review): same pre-gensim-4 `size` keyword as the DBOW model — confirm
# the installed gensim version.
d2v = Doc2Vec(dm=1, size=300, negative=5, hs=0, min_count=3, window=10,
sample=1e-5, workers=8, alpha=0.05, min_alpha=0.025)
# Rebuild the vocabulary for the fresh model with another corpus pass.
doc_list = Doc_list('alldata-id.txt')
d2v.build_vocab(doc_list)
# ---------------train dm doc2vec-----------------------------------------------------
# Ten passes for the DM model, with per-pass CV scoring as in the DBOW run
# (reuses `ys` built above).
for i in range(10):
    print(datetime.now(), 'pass:', i)
    # Re-shuffle the corpus each pass so training sees a fresh document order.
    run_cmd('shuf alldata-id.txt > alldata-id-shuf.txt')
    # BUG FIX: train on the shuffled file — the original re-read the
    # unshuffled 'alldata-id.txt', making the shuf step a no-op.
    doc_list = Doc_list('alldata-id-shuf.txt')
    d2v.train(doc_list)
    X_d2v = np.array([d2v.docvecs[j] for j in range(200000)])
    for lb in ['Education', 'age', 'gender']:
        scores = cross_val_score(LogisticRegression(C=3), X_d2v, ys[lb], cv=5)
        print('dm', lb, scores, np.mean(scores))
# NOTE(review): indentation was lost in the scraped source; the save is placed
# after the loop (final model) — confirm against upstream if per-pass
# checkpointing was intended.
d2v.save(cfg.data_path + 'dm_d2v.model')
print(datetime.now(), 'save done')