forked from sd17spring/TextMining
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtext_mining.py
More file actions
56 lines (42 loc) · 1.43 KB
/
text_mining.py
File metadata and controls
56 lines (42 loc) · 1.43 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
"""
NAME: Prava
This program is the main program for the text_mining project. It takes all
the inaugural addresses and generates a list of the top words in total and
the words that are used across the most speeches. Then, a word cloud is
generated for both.
"""
import os
from text_analysis import count_words, top50_cloud
# Load the stop-word list. The file is split on any whitespace, so every
# whitespace-separated token becomes one entry of `stop_words`.
# The context manager guarantees the handle is closed even if reading fails.
with open('/home/prava/TextMining/stopwords.txt') as f:
    stop_words = f.read().split()
# Directory that holds the text files to analyse (trailing slash required,
# because file names are appended to it by plain concatenation below).
path = '/home/prava/TextMining/textfiles/'
# Names of the files directly inside `path`: the first tuple produced by
# os.walk is (dirpath, dirnames, filenames), and index 2 is the file list.
files = next(os.walk(path))[2]
# One count_words() result per text file, in the same order as `files`.
count = []
for file_name in files:
    # Use a context manager so each handle is released as soon as the file
    # has been read, instead of leaking one open handle per document.
    with open(path + file_name) as f:
        count.append(count_words(f.read(), stop_words))
# Merge the per-document tallies into one mapping of word -> total count
# across every document.
count_all = {}
for per_doc in count:
    for term in per_doc:
        # A word seen for the first time starts from 0 before adding.
        count_all[term] = count_all.get(term, 0) + per_doc[term]
# shows the top 50 words from all the documents together
top50_cloud(count_all, 'Words used most in all texts together', 'WORD\tCOUNT')
# For every word seen in any document, tally the number of documents whose
# per-document counts contain that word (0 if none do).
count_doc = {
    term: sum(1 for per_doc in count if term in per_doc)
    for term in count_all
}
# shows the top 50 words that appear in the most number of documents
top50_cloud(count_doc, 'Words used in most texts', 'WORD\tDOCUMENTS')