-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathclassifier.py
More file actions
121 lines (91 loc) · 3.64 KB
/
classifier.py
File metadata and controls
121 lines (91 loc) · 3.64 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
import numpy as np
import pickle
import time
import csv
from sklearn import svm
def calculate_avg_emb(tweet, vocab, embed):
    """Return the average embedding vector for the words of *tweet*.

    Args:
        tweet: Raw tweet text; tokenised by whitespace.
        vocab: Mapping from word -> row index into *embed*.
        embed: 2-D array of word embeddings, one row per vocab entry.

    Returns:
        A 1-D numpy array holding the mean of the embeddings of all
        in-vocabulary words, or the empty list ``[]`` when no word of
        the tweet is in the vocabulary (callers rely on this sentinel).
    """
    # `word in vocab` replaces the original `v != None` check (PEP 8:
    # comparisons to None must use `is`/`is not`; membership is clearer
    # still and handles a valid index of 0 correctly).
    embeds = [embed[vocab[word]] for word in tweet.split() if word in vocab]
    # np.mean(..., axis=0) is the idiomatic form of sum(rows)/len(rows).
    # Keep the [] sentinel for tweets that are entirely out-of-vocabulary.
    return np.mean(embeds, axis=0) if embeds else []
def main():
    """Train an SVM sentiment classifier on averaged word embeddings.

    Loads precomputed embeddings (embeddings.npy) and a vocabulary
    (vocab.pkl), builds average embeddings for positive and negative
    training tweets, trains an SVM, estimates accuracy on held-out
    tweets, then writes a timestamped submission CSV for test_data.txt.
    """
    # Params
    training_samples_per_class = 5000
    testing_samples_per_class = 10000
    dataset_dir = "twitter-datasets/"  # directory for the dataset
    out_dir = "out/"  # directory for submissions

    # Loading data
    print("\nLoading embeddings...")
    embed = np.load('embeddings.npy')
    print(len(embed), "embeddings loaded\n")
    print("Loading vocab...")
    with open('vocab.pkl', 'rb') as f:
        vocab = pickle.load(f)
    print(len(vocab), "vocab lines loaded\n")
    print("Loading dataset...")
    # Context managers replace the manual open/close pairs so the files
    # are closed even if a read fails.
    with open(dataset_dir + "train_pos_full.txt", "r") as fp:
        tweets_pos = fp.readlines()
    print(len(tweets_pos), "positive tweets loaded")
    with open(dataset_dir + "train_neg_full.txt", "r") as fn:
        tweets_neg = fn.readlines()
    print(len(tweets_neg), "negative tweets loaded\n")

    # Generating embeddings for tweets. Tweets whose words are all
    # out-of-vocabulary yield the [] sentinel and are skipped.
    # len(...) > 0 replaces `avg_emb != []`, which triggers an ambiguous
    # elementwise comparison / DeprecationWarning on numpy arrays.
    embs = []
    for tweet in tweets_pos[:training_samples_per_class]:
        avg_emb = calculate_avg_emb(tweet, vocab, embed)
        if len(avg_emb) > 0:
            embs.append(avg_emb)
    num_pos = len(embs)
    for tweet in tweets_neg[:training_samples_per_class]:
        avg_emb = calculate_avg_emb(tweet, vocab, embed)
        if len(avg_emb) > 0:
            embs.append(avg_emb)
    num_neg = len(embs) - num_pos

    # Training the model: label +1 for positive tweets, -1 for negative.
    print("Training on", len(embs), "samples...\n")
    clf = svm.SVC(gamma=0.001, C=100., verbose=1)
    X = embs
    y = np.append(np.ones(num_pos, dtype=int), -np.ones(num_neg, dtype=int))
    clf.fit(X, y)

    # Testing (on unused samples).
    # BUG FIX: the original sliced [:testing_samples_per_class], which
    # re-used the first training_samples_per_class tweets from training
    # and inflated the reported accuracy. Slice past the training region
    # so evaluation really happens on unseen tweets.
    print('\n\nTesting on unused samples...')
    lo = training_samples_per_class
    hi = training_samples_per_class + testing_samples_per_class
    test_data = tweets_pos[lo:hi] + tweets_neg[lo:hi]
    correct_predictions = 0
    for i, tweet in enumerate(test_data):
        avg_emb = calculate_avg_emb(tweet, vocab, embed)
        prediction = 1  # Default value for tweets we can't analyse
        if len(avg_emb) > 0:
            prediction = clf.predict([avg_emb])[0]
        # First half of test_data is positive (+1), second half negative (-1).
        correct_answer = int(i < testing_samples_per_class) * 2 - 1
        if prediction == correct_answer:
            correct_predictions += 1
    print("\nPredicted accuracy: " + str(correct_predictions/(2 * testing_samples_per_class)))

    # Generating submission for test_data
    print("\nLoading test_data.txt...\n")
    with open(dataset_dir + "test_data.txt", "r") as fp:
        test_data = fp.readlines()
    print(len(test_data), "test tweets loaded\n")
    # Timestamp in the filename keeps successive runs from overwriting
    # each other; ':' is replaced because it is invalid in Windows paths.
    localtime = time.asctime(time.localtime(time.time())).replace(':', '.')
    # newline='' is required by the csv module to avoid blank rows on
    # Windows (see csv docs).
    with open(out_dir + "submission " + localtime[4:-5] + ".csv", "w", newline='') as fp:
        fieldnames = ['Id', 'Prediction']
        writer = csv.DictWriter(fp, fieldnames=fieldnames)
        writer.writeheader()
        print("Generating predictions...\n")
        for tweet in test_data:
            i, t = tweet.split(",", maxsplit=1)  # Splitting the index from the tweet text
            avg_emb = calculate_avg_emb(t, vocab, embed)
            prediction = 1  # Default value for tweets we can't analyse
            if len(avg_emb) > 0:
                prediction = clf.predict([avg_emb])[0]
            writer.writerow({'Id': str(i), 'Prediction': str(prediction)})
    print("Done.")
# Script entry point: run training + submission generation only when
# executed directly, not when imported as a module.
if __name__ == "__main__":
    main()