-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathclassifier.py
More file actions
121 lines (91 loc) · 3.64 KB
/
classifier.py
File metadata and controls
121 lines (91 loc) · 3.64 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
import numpy as np
import pickle
import time
import csv
from sklearn import svm
def calculate_avg_emb(tweet, vocab, embed):
    """Return the average embedding vector for the words of *tweet*.

    Args:
        tweet: Raw tweet text; tokenised by whitespace.
        vocab: Mapping from word -> row index into *embed*.
        embed: 2-D array of word embeddings, one row per vocab entry.

    Returns:
        A 1-D numpy array holding the mean of the embeddings of all
        in-vocabulary words, or the empty list ``[]`` when no word of
        the tweet is in the vocabulary (callers rely on this sentinel).
    """
    # `word in vocab` replaces the original `v != None` check (PEP 8:
    # comparisons to None must use `is`/`is not`; membership is clearer
    # still and handles a valid index of 0 correctly).
    embeds = [embed[vocab[word]] for word in tweet.split() if word in vocab]
    # np.mean(..., axis=0) is the idiomatic form of sum(rows)/len(rows).
    # Keep the [] sentinel for tweets that are entirely out-of-vocabulary.
    return np.mean(embeds, axis=0) if embeds else []
def main():
    """Train an SVM sentiment classifier on averaged word embeddings.

    Loads precomputed embeddings (embeddings.npy) and a vocabulary
    (vocab.pkl), builds average embeddings for positive and negative
    training tweets, trains an SVM, estimates accuracy on held-out
    tweets, then writes a timestamped submission CSV for test_data.txt.
    """
    # Params
    training_samples_per_class = 5000
    testing_samples_per_class = 10000
    dataset_dir = "twitter-datasets/"  # directory for the dataset
    out_dir = "out/"  # directory for submissions

    # Loading data
    print("\nLoading embeddings...")
    embed = np.load('embeddings.npy')
    print(len(embed), "embeddings loaded\n")
    print("Loading vocab...")
    with open('vocab.pkl', 'rb') as f:
        vocab = pickle.load(f)
    print(len(vocab), "vocab lines loaded\n")
    print("Loading dataset...")
    # Context managers replace the manual open/close pairs so the files
    # are closed even if a read fails.
    with open(dataset_dir + "train_pos_full.txt", "r") as fp:
        tweets_pos = fp.readlines()
    print(len(tweets_pos), "positive tweets loaded")
    with open(dataset_dir + "train_neg_full.txt", "r") as fn:
        tweets_neg = fn.readlines()
    print(len(tweets_neg), "negative tweets loaded\n")

    # Generating embeddings for tweets. Tweets whose words are all
    # out-of-vocabulary yield the [] sentinel and are skipped.
    # len(...) > 0 replaces `avg_emb != []`, which triggers an ambiguous
    # elementwise comparison / DeprecationWarning on numpy arrays.
    embs = []
    for tweet in tweets_pos[:training_samples_per_class]:
        avg_emb = calculate_avg_emb(tweet, vocab, embed)
        if len(avg_emb) > 0:
            embs.append(avg_emb)
    num_pos = len(embs)
    for tweet in tweets_neg[:training_samples_per_class]:
        avg_emb = calculate_avg_emb(tweet, vocab, embed)
        if len(avg_emb) > 0:
            embs.append(avg_emb)
    num_neg = len(embs) - num_pos

    # Training the model: label +1 for positive tweets, -1 for negative.
    print("Training on", len(embs), "samples...\n")
    clf = svm.SVC(gamma=0.001, C=100., verbose=1)
    X = embs
    y = np.append(np.ones(num_pos, dtype=int), -np.ones(num_neg, dtype=int))
    clf.fit(X, y)

    # Testing (on unused samples).
    # BUG FIX: the original sliced [:testing_samples_per_class], which
    # re-used the first training_samples_per_class tweets from training
    # and inflated the reported accuracy. Slice past the training region
    # so evaluation really happens on unseen tweets.
    print('\n\nTesting on unused samples...')
    lo = training_samples_per_class
    hi = training_samples_per_class + testing_samples_per_class
    test_data = tweets_pos[lo:hi] + tweets_neg[lo:hi]
    correct_predictions = 0
    for i, tweet in enumerate(test_data):
        avg_emb = calculate_avg_emb(tweet, vocab, embed)
        prediction = 1  # Default value for tweets we can't analyse
        if len(avg_emb) > 0:
            prediction = clf.predict([avg_emb])[0]
        # First half of test_data is positive (+1), second half negative (-1).
        correct_answer = int(i < testing_samples_per_class) * 2 - 1
        if prediction == correct_answer:
            correct_predictions += 1
    print("\nPredicted accuracy: " + str(correct_predictions/(2 * testing_samples_per_class)))

    # Generating submission for test_data
    print("\nLoading test_data.txt...\n")
    with open(dataset_dir + "test_data.txt", "r") as fp:
        test_data = fp.readlines()
    print(len(test_data), "test tweets loaded\n")
    # Timestamp in the filename keeps successive runs from overwriting
    # each other; ':' is replaced because it is invalid in Windows paths.
    localtime = time.asctime(time.localtime(time.time())).replace(':', '.')
    # newline='' is required by the csv module to avoid blank rows on
    # Windows (see csv docs).
    with open(out_dir + "submission " + localtime[4:-5] + ".csv", "w", newline='') as fp:
        fieldnames = ['Id', 'Prediction']
        writer = csv.DictWriter(fp, fieldnames=fieldnames)
        writer.writeheader()
        print("Generating predictions...\n")
        for tweet in test_data:
            i, t = tweet.split(",", maxsplit=1)  # Splitting the index from the tweet text
            avg_emb = calculate_avg_emb(t, vocab, embed)
            prediction = 1  # Default value for tweets we can't analyse
            if len(avg_emb) > 0:
                prediction = clf.predict([avg_emb])[0]
            writer.writerow({'Id': str(i), 'Prediction': str(prediction)})
    print("Done.")
# Script entry point: run training + submission generation only when
# executed directly, not when imported as a module.
if __name__ == "__main__":
    main()