-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathtrain_questions.py
More file actions
82 lines (68 loc) · 2.74 KB
/
train_questions.py
File metadata and controls
82 lines (68 loc) · 2.74 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction import DictVectorizer
from sklearn.externals import joblib
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import LabelEncoder
import pandas as pd
import spacy
import numpy as np
import get_affirmations
nlp = spacy.load('en')
with open('affirmations.txt') as f:
affirmations = f.read().split('\n')
class TrainModel(object):
def __init__(self, filepath):
df = pd.read_table(filepath, delimiter=" ,,, ", names=['sentence', 'category'])
self.sents = df.sentence
self.cat = df.category
self.special_features = []
def get_sent_features(self, sent):
text = nlp(unicode(sent, encoding='utf-8'))
s_feature = {
'tag':"",
'is_wh':False,
'is_affirmation':False
}
for token in text:
if token.text.lower().startswith('wh'):
s_feature['is_wh'] = True
if token.text.lower() in affirmations:
s_feature['is_affirmation'] = True
s_feature['tag'] = token.tag_
break
self.special_features.append(s_feature)
def label_encoder(self):
lencoder = LabelEncoder().fit(self.cat)
print 'Dumping Label Encoder to models directory'
joblib.dump(lencoder, 'models/label_encoder.pkl')
return lencoder
def feature_extraction(self):
#Get Special Features
self.sents.apply(self.get_sent_features)
special_vect = DictVectorizer(sparse=False)
X_spec = special_vect.fit_transform(self.special_features)
print 'Dumping Dict Vectorizer to models directory'
joblib.dump(special_vect, 'models/dict_vect.pkl')
#Get TfIdf Vector
tfidf_vect = TfidfVectorizer(ngram_range=[1, 2], encoding='utf-8')
X_tfidf = tfidf_vect.fit_transform(self.sents).toarray()
print 'Dumping TfIdf vectorizer to models directory'
joblib.dump(tfidf_vect, 'models/tfidf_vect.pkl')
#Merge 2 feature vectors
self.X = np.concatenate((X_spec, X_tfidf), axis=1)
print self.X.shape
def train(self):
#Extract Features
print 'Extracting Features'
self.feature_extraction()
#Pre Processing
print 'Encoding Labels'
lencoder = self.label_encoder()
print 'Training Model'
model = GradientBoostingClassifier(random_state=42)#Life Universe and Everything
y = lencoder.transform(self.cat)
model.fit(self.X, y)
print 'Training Complete, dumping model to models directory'
joblib.dump(model, 'models/GradientBoostingClassifier.pkl')
#model = TrainModel('data/LabelledData (1).txt')
#model.train()