-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathanalyzetwitter.py
More file actions
executable file
·111 lines (103 loc) · 3.79 KB
/
analyzetwitter.py
File metadata and controls
executable file
·111 lines (103 loc) · 3.79 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
import csv
import re
from collections import Counter

import matplotlib.pyplot as plt
def gettweets():
    """Load the tweet archive from ``tweets.csv`` in the working directory.

    The first row is treated as a header and skipped.  From every remaining
    row, columns 1, 3, 5 and 6 are kept, in that order.  The rest of this
    file reads index 1 of each result as a timestamp, index 2 as the tweet
    text and index 3 as a retweet marker that is empty for original tweets.

    Returns:
        list[list[str]]: one four-element list per usable data row.

    Raises:
        OSError: if ``tweets.csv`` cannot be opened.
        UnicodeDecodeError: if the file is not valid UTF-8.
    """
    tweetinfo = []
    # newline='' hands raw line endings to the csv module, as its
    # documentation asks, so line breaks inside quoted tweet text survive.
    # The explicit encoding keeps parsing independent of the platform default.
    with open('tweets.csv', 'r', encoding='utf-8', newline='') as tweetfile:
        tweetreader = csv.reader(tweetfile, delimiter=",")
        # Skip the header through the reader itself; the None default makes
        # an empty file yield an empty result instead of StopIteration.
        next(tweetreader, None)
        for row in tweetreader:
            # Blank or truncated rows do not have the columns needed below;
            # skip them rather than failing with IndexError.
            if len(row) < 7:
                continue
            tweetinfo.append([row[1], row[3], row[5], row[6]])
    return tweetinfo
def analyzeHourOfDay(utc_offset_hours=-4):
    """Count tweets by hour of the day.

    The hour is parsed from the second whitespace-separated piece of each
    tweet's timestamp (``tweet[1]``), taking the part before the first
    colon, and is then shifted by ``utc_offset_hours`` modulo 24.

    Args:
        utc_offset_hours: whole hours added to the parsed hour.  The default
            of -4 reproduces the shift that used to be hard-coded here.
            NOTE(review): the previous comment called this "CST", but a
            four-hour shift corresponds to UTC-4 if the archive timestamps
            are in UTC -- confirm which zone is intended.

    Returns:
        dict[int, int]: shifted hour (0-23) mapped to the number of tweets
        in that hour.  Hours with no tweets are absent.

    Raises:
        IndexError, ValueError: if a timestamp does not have the expected
        ``<date> <HH>:<MM>...`` layout.
    """
    hourDict = {}
    for tweet in gettweets():
        clock = tweet[1].split()[1]
        hour = (int(clock.split(":")[0]) + utc_offset_hours) % 24
        hourDict[hour] = hourDict.get(hour, 0) + 1
    return hourDict
def analyzeWordFreq():
    """Print the 250 most frequent words used in non-retweet tweets.

    Each tweet's text (``tweet[2]``) is split on whitespace and every
    character outside A-Z / a-z is removed from each token.  Counting is
    case-sensitive.  The result is printed as a single list of words, most
    frequent first; words with equal counts keep first-seen order.

    Returns:
        None.  The list is written to standard output.
    """
    non_letters = re.compile(r'[^A-Za-z]+')
    wordCounts = Counter()
    for tweet in gettweets():
        # Disregard the tweet if it is a retweet (tweet[3] is not empty).
        if tweet[3]:
            continue
        for token in tweet[2].split():
            word = non_letters.sub('', token)
            # Tokens with no letters at all (numbers, punctuation, emoji)
            # collapse to ''.  Counting them would add a meaningless empty
            # "word" to the ranking, so they are dropped.
            if word:
                wordCounts[word] += 1
    print([word for word, _ in wordCounts.most_common(250)])
def analyzeRetweets():
    """Print how often the user retweets and whom they retweet most.

    A tweet counts as a retweet when its retweet marker (``tweet[3]``) is
    non-empty.  The retweeted handle is taken from the ``RT @<handle>:``
    prefix in the tweet text (``tweet[2]``).

    Output, in order:
      * the number of retweets out of all tweets,
      * that share as a percentage,
      * the ten most retweeted handles with their counts.

    Returns:
        None.  Everything is written to standard output.
    """
    rt_prefix = re.compile(r'RT @(?P<handle>.+?):')
    retweetHandles = Counter()
    tweetinfo = gettweets()
    retweetCount = 0
    for tweet in tweetinfo:
        if not tweet[3]:
            continue
        # Count the retweet itself: the summary is about how many tweets are
        # retweets, not how many distinct accounts were retweeted.
        retweetCount += 1
        m = rt_prefix.search(tweet[2])
        # A retweet whose text lacks the "RT @handle:" prefix has no handle
        # to attribute; skip the attribution instead of crashing on None.
        if m is None:
            continue
        retweetHandles[m.group('handle')] += 1
    total = len(tweetinfo)
    # Guard against an empty archive, which would otherwise divide by zero.
    percent = (retweetCount / total) * 100 if total else 0.0
    print("You have retweeted someone for {0} out of {1} tweets".format(
        retweetCount, total))
    print("which is {0}% of your tweets".format(percent))
    for handle, count in retweetHandles.most_common(10):
        print(handle, count)
def analyzeTweetAt():
    """Print how often the user mentions others and whom they mention most.

    Only non-retweets (empty ``tweet[3]``) are examined.  A mention is an
    ``@`` followed by one or more letters, digits or underscores, where the
    ``@`` is not directly preceded by one of those characters.  This keeps
    e-mail-like text such as ``name@example.com`` from being counted and
    separates adjacent mentions such as ``@a/@b``.

    Output, in order:
      * the number of tweets containing at least one mention, out of all
        tweets,
      * that share as a percentage,
      * the fifteen most mentioned handles with their counts.

    Returns:
        None.  Everything is written to standard output.
    """
    mention = re.compile(r'(?<![A-Za-z0-9_])@([A-Za-z0-9_]+)')
    handleDict = Counter()
    tweetinfo = gettweets()
    tweetsWithMention = 0
    for tweet in tweetinfo:
        if tweet[3]:
            continue
        handles = mention.findall(tweet[2])
        if handles:
            # The summary is about tweets that mention someone, so each
            # tweet is counted once however many handles it contains.
            tweetsWithMention += 1
            handleDict.update(handles)
    total = len(tweetinfo)
    # Guard against an empty archive, which would otherwise divide by zero.
    percent = (tweetsWithMention / total) * 100 if total else 0.0
    print("You have mentioned someone in {0} out of {1} tweets".format(
        tweetsWithMention, total))
    print("which is {0}% of your tweets".format(percent))
    for handle, count in handleDict.most_common(15):
        print(handle, count)
def graphHours(hourDict):
    """Show a bar chart of tweet counts for each hour of the day.

    Args:
        hourDict: mapping of hour (0-23) to tweet count.  Hours missing from
            the mapping are drawn with a height of zero.

    Returns:
        None.  A matplotlib window titled "Tweets per Hour" is shown.
    """
    # Tick labels in clock order: 12 AM, 1 AM ... 11 AM, 12 PM, 1 PM ... 11 PM.
    times = []
    for suffix in ("AM", "PM"):
        times.append("12 {0}".format(suffix))
        times.extend("{0} {1}".format(h, suffix) for h in range(1, 12))

    hours = list(range(24))
    # Look each hour up explicitly.  A dict's values() come back in
    # insertion order and only for hours that occurred, which would put bars
    # under the wrong labels or fail when fewer than 24 hours are present.
    counts = [hourDict.get(hour, 0) for hour in hours]

    plt.figure()
    ax = plt.subplot(111)
    ax.bar(hours, counts)
    ax.set_xticks(hours)
    ax.set_xticklabels(times, rotation=45)
    plt.title("Tweets per Hour")
    plt.show()
def _main():
    """Script entry point: print the word-frequency report."""
    analyzeWordFreq()


# Run the report only when executed as a script, not when imported.
if __name__ == "__main__":
    _main()