# Source: SentimentAnalysisSVMClassifier/probabiltyDistribution.py (master branch of HisashiQ/SentimentAnalysisSVMClassifier on GitHub)
# coding: utf-8

from nltk.probability import ConditionalFreqDist
from nltk.probability import ConditionalProbDist, LaplaceProbDist, MLEProbDist
from nltk.util import bigrams
import unicodecsv
import nltk
from sklearn.svm import LinearSVC
from nltk.classify import SklearnClassifier
from nltk.tokenize import TweetTokenizer
from random import shuffle
from datetime import datetime
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_recall_fscore_support
import re
from nltk.corpus import stopwords
from gensim import corpora, models, similarities
### PART B ###
##############
####
## A bigram model using the NLTK built-in functions
####

# Given a list of preprocessed tweets (each one a list of tokens),
# getBigrams returns a single flat list of pairs containing all the bigrams
# that are observed across those tweets.

def getBigrams(tweets):
    """Collect the bigrams of every tweet into one flat list.

    Args:
        tweets: iterable of token sequences, one sequence per tweet.

    Returns:
        A list holding, in input order, every item produced by calling
        ``bigrams`` on each tweet. An empty input gives an empty list.
    """
    return [pair for tokens in tweets for pair in bigrams(tokens)]

# conditionalProbDist returns a conditional probability distribution built
# from a list of bigrams, using the specified probability distribution constructor.
def conditionalProbDist(probDist, bigrams):
    """Build a conditional probability distribution from a list of bigrams.

    The bigrams are first counted into a ``ConditionalFreqDist``; that count
    table is then wrapped in a ``ConditionalProbDist`` that uses ``probDist``
    as the per-condition estimator factory.

    Args:
        probDist: probability-distribution constructor (the script passes
            ``MLEProbDist``; ``LaplaceProbDist`` is the documented alternative).
        bigrams: sized collection of bigram pairs. Note that inside this
            function the parameter name hides the imported ``bigrams`` helper.

    Returns:
        The resulting ``ConditionalProbDist``.
    """
    freqByCondition = ConditionalFreqDist(bigrams)
    # ``bins`` is forwarded to the estimator factory and is set to the total
    # number of bigrams supplied.
    # NOTE(review): for smoothing estimators this is presumably meant to be the
    # number of distinct outcomes rather than the token count -- confirm.
    return ConditionalProbDist(freqByCondition, probDist, bins=len(bigrams))

# Module-level store of (timestamp, tokens) pairs filled in by loadData.
londonTweetData = []


def loadData(path):
    """Read the tweet CSV at *path* and append its rows to ``londonTweetData``.

    The first row is treated as a header and skipped. Each remaining row is
    split by ``parseTweet`` into a timestamp and the tweet text, the text is
    tokenized by ``preProcess``, and the resulting ``(timestamp, tokens)``
    pair is appended to the module-level ``londonTweetData`` list. The list is
    never cleared here, so repeated calls accumulate rows.

    Args:
        path: location of the CSV file; it is opened in binary mode and
            decoded as UTF-8 by ``unicodecsv``.
    """
    with open(path, 'rb') as f:
        reader = unicodecsv.reader(f, encoding='utf-8')
        # Skip the header row. The default value keeps a completely empty
        # file from raising StopIteration; it simply loads nothing.
        next(reader, None)
        for line in reader:
            (dt, tweet) = parseTweet(line)
            londonTweetData.append((dt, preProcess(tweet)))

def preProcess(text):
    """Tokenize *text* with a freshly constructed ``TweetTokenizer``.

    The text is tokenized exactly as given: no lowercasing and no hashtag
    rewriting is applied before tokenization.

    Args:
        text: the raw tweet text.

    Returns:
        Whatever ``TweetTokenizer().tokenize(text)`` produces (the token list).
    """
    return TweetTokenizer().tokenize(text)

def parseTweet(tweetLine):
    """Extract the timestamp and the text from one CSV row.

    Args:
        tweetLine: indexable row; element 1 holds the timestamp formatted as
            ``YYYY-MM-DD HH:MM:SS`` and element 4 holds the tweet text.

    Returns:
        A ``(timestamp, content)`` tuple where ``timestamp`` is a ``datetime``
        and ``content`` is element 4 of the row, unchanged.

    Raises:
        ValueError: if element 1 does not match the expected format.
        IndexError: if the row is too short.
    """
    # Parse first, then read the text, so a malformed timestamp is reported
    # before a missing text column.
    parsed = datetime.strptime(tweetLine[1], "%Y-%m-%d %H:%M:%S")
    return (parsed, tweetLine[4])

# This is the function where you can put your main script, which you can then
# toggle on or off for test purposes.
def mainScript():
    """Run the bigram analysis on the London tweet CSV and print the results.

    Steps:
      1. Load ``london_2017_tweets.csv`` into ``londonTweetData``.
      2. Build conditional bigram distributions for the whole dataset and for
         the tweets whose timestamp falls on day 5 and on day 9.
      3. Print P("strike" | "tube") under each of the three distributions.
      4. For each of the two days, print the ten bigrams whose probability
         exceeds the whole-dataset probability by the largest margin.

    Nothing is returned; all output goes to stdout.
    """
    loadData("london_2017_tweets.csv")
    # To use Laplace smoothing instead, uncomment the next line and comment
    # out the 'factory = MLEProbDist' line.
    # factory = LaplaceProbDist
    factory = MLEProbDist

    wholeMonth = calculateProp(getContentWholeSet(), factory)
    fifthOfJan = calculateProp(getContentFifth(), factory)
    ninthOfJan = calculateProp(getContentNinth(), factory)

    headlineReports = (
        ("Whole dataset: {}", wholeMonth),
        ("5th of Jan: {}", fifthOfJan),
        ("9th of Jan: {}", ninthOfJan),
    )
    for template, distribution in headlineReports:
        print(template.format((distribution["tube"].prob("strike"))))

    # Both comparisons are computed before anything is sorted or printed.
    fifthRows = getRatio(fifthOfJan, wholeMonth)
    ninthRows = getRatio(ninthOfJan, wholeMonth)

    comparisonReports = (
        ("Fifth vs Whole month:   \n{}", fifthRows),
        ("Ninth vs Whole month:   \n{}", ninthRows),
    )
    for template, rows in comparisonReports:
        # Element 1 of each row is the day-minus-whole probability difference.
        ranked = sorted(rows, key=lambda row: row[1], reverse=True)
        print(template.format(ranked[:10]))

def getRatio(specificDayProb, wholeDatasetProb):
    """Compare a single day's bigram probabilities with the whole dataset's.

    Despite the name, the comparison is a difference, not a quotient.

    Args:
        specificDayProb: mapping from a context word to a distribution object
            offering ``samples()`` and ``prob(sample)``.
        wholeDatasetProb: mapping of the same shape for the full dataset; it
            is indexed with every context found in ``specificDayProb``.

    Returns:
        A list with one entry per (context, word) pair seen on the day:
        ``[(context, word), dayProb - wholeProb, dayProb]``.
    """
    rows = []
    for context in specificDayProb:
        dayDist = specificDayProb[context]
        for word in dayDist.samples():
            baseline = wholeDatasetProb[context].prob(word)
            dayProb = dayDist.prob(word)
            rows.append([(context, word), dayProb - baseline, dayProb])
    return rows

def calculateProp(bigrams, factory):
    """Turn a collection of tokenized tweets into a conditional bigram model.

    Args:
        bigrams: the tokenized tweets (token lists), not yet paired up; they
            are handed to ``getBigrams`` to produce the actual bigram list.
        factory: probability-distribution constructor forwarded to
            ``conditionalProbDist``.

    Returns:
        The distribution returned by ``conditionalProbDist``.
    """
    return conditionalProbDist(factory, getBigrams(bigrams))

def getContentWholeSet():
    """Return the message part of every entry in ``londonTweetData``.

    Each entry is a ``(timestamp, msg)`` pair; the timestamps are dropped and
    the messages are returned in stored order as a new list.
    """
    return [msg for (_timestamp, msg) in londonTweetData]

def getContentFifth():
    """Return the messages of the entries whose timestamp is on day 5.

    Only the day-of-month is checked; month and year are not looked at.
    The messages are returned in stored order as a new list.
    """
    return [msg for (timestamp, msg) in londonTweetData if timestamp.day == 5]

def getContentNinth():
    """Return the messages of the entries whose timestamp is on day 9.

    Only the day-of-month is checked; month and year are not looked at.
    The messages are returned in stored order as a new list.
    """
    return [msg for (timestamp, msg) in londonTweetData if timestamp.day == 9]

# The line below can be commented out to disable execution of the main script.
# NOTE: as written it runs every time this module is loaded, and because
# mainScript has no return statement, `results` is bound to None.
results = mainScript()