-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathprocess_reviews.py
More file actions
91 lines (77 loc) · 2.66 KB
/
process_reviews.py
File metadata and controls
91 lines (77 loc) · 2.66 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
#!/usr/bin/python3.5
import re
import csv
import string
import hashlib
from py2neo import Graph
from nltk import tokenize
from nltk.tokenize import WordPunctTokenizer
# Handle to the Neo4j database, addressed over HTTP on localhost:7474.
# NOTE(review): the neo4j/neo4j credentials are embedded in the URL —
# confirm this script is only ever pointed at a local development instance.
graphdb = Graph('http://neo4j:neo4j@localhost:7474/db/data')
# NLTK tokenizer instance; nothing in the visible part of this file uses it.
word_punct_tokenizer = WordPunctTokenizer()
# Character class that matches any single character of string.punctuation.
regex = re.compile('[%s]' % re.escape(string.punctuation))
# Module-level transaction, begun at import time; main() runs its read
# queries on it. insert_reviews() begins its own, separate transactions.
tx = graphdb.begin()
# Cypher statement executed once per sentence (parameter: {sentence}).
# It lower-cases the sentence, splits it on single spaces, drops the words
# "the", "and", "i", "it" and "to", and then, for every pair of adjacent
# remaining words, MERGEs both :Word nodes and the :NEXT relationship between
# them, creating each with count = 1 or incrementing the existing count.
# NOTE(review): a word in the middle of a sentence is matched once as w1 and
# once as w2, so its node count appears to grow by 2 per occurrence — confirm
# this is intended.
INSERT_QUERY = """
WITH split(tolower({sentence}), " ") AS words
WITH [w in words WHERE NOT w IN ["the","and","i", "it", "to"]] AS text
UNWIND range(0,size(text)-2) AS i
MERGE (w1:Word {word: text[i]})
ON CREATE SET w1.count = 1 ON MATCH SET w1.count = w1.count + 1
MERGE (w2:Word {word: text[i+1]})
ON CREATE SET w2.count = 1 ON MATCH SET w2.count = w2.count + 1
MERGE (w1)-[r:NEXT]->(w2)
ON CREATE SET r.count = 1
ON MATCH SET r.count = r.count + 1;
"""
# Returns the ten :Word nodes with the highest count, as (Word, Word_count).
FREQ_QUERY = """
MATCH (w:Word)
RETURN w.word AS Word, w.count AS Word_count
ORDER BY w.count DESC LIMIT 10
"""
# Returns the ten :NEXT relationships with the highest count, each reported
# as the [first word, second word] pair together with that count.
FREQ_PAIRS_QUERY = """
MATCH (w1:Word)-[r:NEXT]->(w2:Word)
RETURN [w1.word, w2.word] AS Word_pair, r.count AS Word_count
ORDER BY r.count DESC LIMIT 10
"""
# Follows paths of one to four :NEXT hops starting at the word 'breakfast'.
# Each path is weighted as the sum of its relationship counts divided by its
# number of nodes; the ten heaviest paths are returned as lists of words.
SUMMARIZE_QUERY = """
MATCH p=(:Word { word: 'breakfast' } )-[r:NEXT*1..4]->(:Word) WITH p
WITH reduce(s = 0, x IN relationships(p) | s + x.count) AS total, p
WITH nodes(p) AS text, 1.0*total/size(nodes(p)) AS weight
RETURN extract(x IN text | x.word) AS phrase, weight ORDER BY weight DESC LIMIT 10
"""
def paragraph_to_sentence(paragraph: str) -> list:
    """Split *paragraph* into sentence fragments on every period.

    The split is purely textual, so periods inside numbers or abbreviations
    also end a fragment. Fragments keep their surrounding whitespace;
    fragments that are empty or whitespace-only (produced by a trailing
    period, an ellipsis or an empty paragraph) are dropped so that callers do
    not process blank sentences.

    Args:
        paragraph: Free text, e.g. the body of one review.

    Returns:
        The non-blank fragments, in their original order.
    """
    return [fragment for fragment in paragraph.split(".") if fragment.strip()]
# Translation table that deletes every character of string.punctuation.
_PUNCTUATION_TABLE = str.maketrans("", "", string.punctuation)


def clean_sentence(sentence: str) -> str:
    """Normalize *sentence* into lower-case words separated by single spaces.

    Punctuation is removed first and whitespace is collapsed afterwards.
    Doing it in this order matters: deleting a punctuation mark that stood
    between spaces (or at the edge of the text) would otherwise leave doubled
    or dangling spaces, and a consumer that splits the result on " " would
    then see empty-string words.

    Args:
        sentence: Raw sentence text.

    Returns:
        The cleaned sentence; an empty string if nothing but punctuation
        and whitespace was given.
    """
    without_punctuation = sentence.lower().translate(_PUNCTUATION_TABLE)
    # split() with no argument discards leading/trailing whitespace and
    # treats any run of whitespace (including newlines) as one separator.
    return " ".join(without_punctuation.split())
def read_reviews(path='7282_1.csv'):
    """Load (review text, venue name) pairs from a reviews CSV file.

    The file must have a header row containing the columns 'reviews.text'
    and 'name'.

    Args:
        path: Location of the CSV file. Defaults to '7282_1.csv' in the
            current working directory.

    Returns:
        A list of (row['reviews.text'], row['name']) tuples, one per data
        row, in file order.

    Raises:
        OSError: If the file cannot be opened.
        KeyError: If either required column is missing.
    """
    # newline='' lets the csv module do its own line handling, so quoted
    # fields that contain line breaks are parsed as a single value.
    with open(path, newline='') as csvfile:
        return [
            (row['reviews.text'], row['name'])
            for row in csv.DictReader(csvfile)
        ]
def insert_reviews(venue='Best Western Plus Waterville Grand Hotel'):
    """Feed every sentence of one venue's reviews into the word graph.

    Reviews are loaded with read_reviews(); those whose venue name differs
    from *venue* are skipped. Each remaining review is split into sentences,
    each sentence is cleaned and sent to the database with INSERT_QUERY.
    Statements are grouped into transactions that are committed once more
    than 100 statements have accumulated, and the final, partially filled
    transaction is committed when the loop ends.

    Args:
        venue: Exact venue name (the CSV 'name' column) whose reviews are
            inserted.
    """
    tx = graphdb.begin()
    pending = 0
    for review, venue_name in read_reviews():
        if venue_name != venue:
            continue
        for sentence in paragraph_to_sentence(review):
            tx.append(INSERT_QUERY, {'sentence': clean_sentence(sentence)})
            tx.process()
            pending += 1
            if pending > 100:
                tx.commit()
                tx = graphdb.begin()
                pending = 0
    # Commit whatever is left in the last batch; without this the trailing
    # statements would stay in an open transaction and never be persisted.
    tx.commit()
def main():
    """Print the three graph reports to standard output.

    Runs FREQ_QUERY, FREQ_PAIRS_QUERY and SUMMARIZE_QUERY, in that order, on
    the module-level transaction and dumps each result as a table.
    """
    # One-off load step; uncomment to insert the reviews before reporting.
    # insert_reviews()
    for query in (FREQ_QUERY, FREQ_PAIRS_QUERY, SUMMARIZE_QUERY):
        # dump() writes the table to stdout itself and returns None, so it is
        # not wrapped in print() (which would emit a stray "None" line).
        tx.run(query).dump()
# Run the reports only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == '__main__':
    main()