forked from sd16spring/TextMining
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathTextMining.py
More file actions
240 lines (204 loc) · 6.62 KB
/
TextMining.py
File metadata and controls
240 lines (204 loc) · 6.62 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
"""
Finds the relevant words associated with the main motifs in the novel
@author: Kevin Guo
"""
from pattern.en import *
import matplotlib.pyplot as plt
import pickle
import re
from pattern.web import *
import nltk
from nltk.corpus import wordnet as wn
import numpy as np
def pickle_files(text):
    """
    Pickle `text` into five numbered files (pickle0.txt .. pickle4.txt).

    Bug fix: pickle requires a binary-mode handle; the original opened the
    files in text mode ('w'), which corrupts the stream on Python 3. The
    files are now opened with a context manager so they are always closed.
    """
    for i in range(5):
        name = 'pickle' + str(i) + '.txt'
        with open(name, 'wb') as handle:
            pickle.dump(text, handle)
def create_dictionary(words):
    """
    Build a word-frequency histogram from a list of words.

    >>> create_dictionary(['bug','bug','chess'])
    {'bug': 2, 'chess': 1}
    """
    histogram = dict()
    # accumulate counts; get() supplies 0 for first-seen words
    for token in words:
        histogram[token] = histogram.get(token, 0) + 1
    return histogram
def read_main_dictionary():
    """
    Build the word-frequency dictionary for the main text (Text1.txt),
    filtered via threshold() to remove implausible motif candidates.

    Improvement: the file is read inside a context manager so the handle
    is closed even if reading raises.
    """
    with open("Text1.txt") as infile:
        book = infile.read().lower()
    # strip every non-alphabetic character so punctuation and digits
    # do not produce bogus "words"
    book = re.sub('[^a-z]', ' ', book)
    words = book.split()
    # histogram for the main text
    dictionary = create_dictionary(words)
    # drop words whose frequency is outside the plausible motif range
    threshold(dictionary, .02, .002)
    return dictionary
def create_dictionary_list():
    """
    Build word-frequency dictionaries for the four comparison texts
    (Text2.txt .. Text5.txt) by the same author.

    Improvement: the original duplicated the read/clean/histogram code
    four times; this loops over the filenames instead and uses context
    managers so the file handles are always closed.
    """
    dictionaries = []
    for i in range(2, 6):
        with open("Text%d.txt" % i) as infile:
            book = infile.read().lower()
        # keep only alphabetic characters, matching read_main_dictionary
        book = re.sub('[^a-z]', ' ', book)
        dictionaries.append(create_dictionary(book.split()))
    return dictionaries
def threshold(dictionary, upper_threshold_percent, lower_threshold_percent):
    """
    Remove, in place, every entry whose count is above the upper threshold
    or at/below the lower threshold.

    NOTE: the thresholds are fractions of the number of DISTINCT words
    (len(dictionary)), not of the total word count — preserved as-is.

    Bug fix: the original deleted keys while iterating dictionary.items(),
    which raises RuntimeError on Python 3; iterate over a snapshot instead.
    """
    length = len(dictionary)
    # convert percentages to absolute count thresholds
    upper_threshold = upper_threshold_percent * length
    lower_threshold = lower_threshold_percent * length
    # iterate a materialized copy so deletion is safe
    for word, count in list(dictionary.items()):
        if count > upper_threshold or count <= lower_threshold:
            del dictionary[word]
def alphabetical(dictionary):
    """
    Return the dictionary's words as an alphabetically sorted list.
    """
    # iterating a dict yields its keys; sorted() returns a new list
    return sorted(dictionary)
def most_common(dictionary):
    """
    Return the words ordered by ascending frequency, ties broken
    alphabetically (the natural sort order of (count, word) tuples).
    """
    ordered_pairs = sorted((count, word) for word, count in dictionary.items())
    return [word for count, word in ordered_pairs]
def subtract_dictionaries(threshold):
    """
    Remove an entry from the main dictionary if the word appears about as
    often in the author's other books (identifying words the author simply
    uses a lot, versus words actually significant to this text).

    `threshold` is the minimum ratio (main-text frequency / other-text
    frequency) a word must reach to stay in the dictionary.

    Bug fixes / improvements:
    - the original deleted from main_dictionary while iterating its
      items(), which raises RuntimeError on Python 3;
    - the inner scan of the whole main dictionary for every word is
      replaced by a direct O(1) membership test.
    """
    # frequency dictionaries of the author's other books
    dictionary_list = create_dictionary_list()
    # partially filtered frequencies for the main book
    main_dictionary = read_main_dictionary()
    for other in dictionary_list:
        for word, freq in other.items():
            # a word is insignificant if its frequency in the main text is
            # not high enough relative to its frequency in the other text
            if word in main_dictionary and float(main_dictionary[word]) / freq < threshold:
                del main_dictionary[word]
    return main_dictionary
def motif_sentiment(lst):
    """
    Map each word in `lst` to its sentiment score (unused helper).

    NOTE(review): `sentiment` comes from the module-level star import of
    pattern.en — presumably returning a (polarity, subjectivity) pair;
    confirm against the pattern library docs.
    """
    return {word: sentiment(word) for word in lst}
def plot_texts(dictionary):
    """
    Plot the words as a bar chart, most frequent first, for easier
    visualization of the dominant themes.

    NOTE(review): the y-axis is fixed at 0..200, so taller bars are
    clipped — confirm that range suits the input texts.
    """
    fig = plt.figure()
    axes = fig.add_subplot(111)
    bar_count = len(dictionary)
    # order (count, word) pairs from most to least frequent; ties break
    # by reverse-alphabetical word, matching the tuple sort
    ordered = sorted(((count, word) for word, count in dictionary.items()),
                     reverse=True)
    labels = [word for count, word in ordered]
    heights = [count for count, word in ordered]
    positions = np.arange(bar_count)  # x locations for the bars
    bar_width = 0.35                  # width of each bar
    axes.bar(positions, heights, bar_width, color='blue')
    # axes limits and labelling
    axes.set_xlim(-bar_width, len(positions) + bar_width)
    axes.set_ylim(0, 200)
    axes.set_ylabel('Frequency')
    axes.set_title('Frequency of most common themes')
    axes.set_xticks(positions + bar_width)
    tick_names = axes.set_xticklabels(labels)
    plt.setp(tick_names, rotation=90, fontsize=8)
    plt.show()
if __name__ == '__main__':
    # Find the significant motif words: with threshold 1.0 a word survives
    # only if it appears at least as frequently in the main text as in
    # each of the author's other books. Then visualize their frequencies.
    motifs = subtract_dictionaries(1.0)
    plot_texts(motifs)