-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathlanguage_processing.py
More file actions
153 lines (133 loc) · 4.41 KB
/
language_processing.py
File metadata and controls
153 lines (133 loc) · 4.41 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
"""
Responsible for doing Natural Language Processing.
It should ideally operate on the extracted text.
It should have ability to perform things like:
- Parts of Speech tagging.
- Named Entity Recognition
- Finding digits, words etc.
- Stopwords removal
- Compute Lexical Diversity
Later, we want it to perform:
- Summarization
- Answer basic questions
"""
import logging
# Third-party: spaCy provides the tokenizer, tagger, parser and NER used below.
import spacy
# Module-level logger per the logging.getLogger(__name__) convention.
logger = logging.getLogger(__name__)
# Load the small English pipeline once at import time; every function in this
# module shares this single pipeline instance. Requires the model to be
# installed (python -m spacy download en_core_web_sm), else this raises OSError.
nlp = spacy.load("en_core_web_sm")
def parts_of_speech(text: str):
    """
    Extract proper nouns and verbs from *text*.

    Runs the shared spaCy pipeline over the text and buckets tokens by
    their coarse POS tag.

    Returns a dict with two keys:
        "nouns": list of Token objects tagged PROPN (proper nouns only)
        "verbs": list of Token objects tagged VERB
    """
    doc = nlp(text)
    return {
        "nouns": [tok for tok in doc if tok.pos_ == "PROPN"],
        "verbs": [tok for tok in doc if tok.pos_ == "VERB"],
    }
def entities(text: str):
    """Return the surface text of every named entity spaCy finds in *text*."""
    recognized = []
    for ent in nlp(text).ents:
        recognized.append(ent.text)
    return recognized
def remove_punctuations(text: str):
    """Tokenize *text* and return the token texts with punctuation dropped."""
    kept = []
    for tok in nlp(text):
        if tok.is_punct:
            continue
        kept.append(tok.text)
    return kept
def remove_stopwords(text: str):
    """Tokenize *text* and return the token texts with stopwords dropped."""
    kept = []
    for tok in nlp(text):
        if tok.is_stop:
            continue
        kept.append(tok.text)
    return kept
def remove_punctuations_and_stopwords(text: str):
    """
    Tokenize *text* and return the token texts with both punctuation and
    stopwords dropped.

    Returns a list of strings, consistent with remove_punctuations() and
    remove_stopwords() (the previous version inconsistently returned raw
    spaCy Token objects instead of their text).
    """
    doc = nlp(text)
    return [
        token.text
        for token in doc
        if not token.is_stop and not token.is_punct
    ]
def summarize(text: str):
    """Summarize *text*. Not implemented yet; currently returns None."""
    return None
def converse(text: str, question: str):
    """
    Answer simple wh-questions about a single sentence in *text*.

    Supported question forms (matched as substrings of the lowered question):
    - Who did something?       -> proper noun / syntactic subject
    - Where did it happen?     -> object, or the token after a preposition
    - How much does it take?   -> first numeric-like token
    - When did it happen?      -> first DATE entity
    - What happened?           -> first object

    The heuristics rely on:
    - Parts of Speech (POS tagging)
    - Named Entities (NER)
    - Syntactic Dependencies (dep_)
    - Rule based matching. In addition to regex, use token attributes like
      is_punct, is_stop etc.

    Returns the best-guess answer string, or None when no rule matches.
    """
    proper_nouns = []
    verbs = []
    subjects = []
    objects = []
    prepositions = []
    numerics = []
    dates = []
    doc = nlp(text)
    lowered_question = question.lower()
    for token in doc:
        # Lazy %-style args so formatting only happens if INFO is enabled.
        logger.info("Token: %s, POS: %s, Dep: %s", token.text, token.pos_, token.dep_)
        if token.pos_ == "PROPN":
            proper_nouns.append(token)
        if token.pos_ == "VERB":
            verbs.append(token)
        if token.dep_ == "nsubj":
            subjects.append(token)
        if token.dep_ in ("pobj", "dobj"):
            objects.append(token)
        if token.pos_ == "ADP":
            prepositions.append(token)
        if token.like_num:
            numerics.append(token)
    for ent in doc.ents:
        logger.info("Entity: %s, Type: %s", ent.text, ent.label_)
        if ent.label_ == "DATE":
            dates.append(ent)
    logger.info("Nouns: %s", proper_nouns)
    logger.info("Verbs: %s", verbs)
    logger.info("Subjects: %s", subjects)
    logger.info("Objects: %s", objects)
    logger.info("Prepositions: %s", prepositions)
    if "who" in lowered_question:
        # The answer should probably be a proper noun.
        if len(proper_nouns) == 1:
            return proper_nouns[0].text
        # If there are multiple nouns, then most probably the subject instead
        # of the object is the answer. Dependency parsing gives us that.
        # We are currently dealing with single sentences.
        # TODO: Modify it to get more context from the question, and then
        # infer the correct subject.
        # BUGFIX: guard against sentences with no nsubj token — the previous
        # version raised IndexError here.
        if subjects:
            return subjects[0].text
    if "where" in lowered_question:
        # We want a place as answer; the answer should probably be a noun,
        # very likely inside a prepositional phrase.
        # Examples: They went "to" Colombo, kept on "the" table, etc.
        if objects:
            return objects[0].text
        # Statements like "apaar went to play": here "play" is not an object,
        # so use the token appearing right after the preposition.
        # BUGFIX: guard against the preposition being the last token — the
        # previous version raised IndexError indexing past the end of doc.
        if prepositions:
            prep = prepositions[0]
            if prep.i + 1 < len(doc):
                return doc[prep.i + 1].text
    if "how much" in lowered_question:
        # A quantity has to be returned; a quantity means a numeric token.
        if numerics:
            return numerics[0].text
    if "when" in lowered_question:
        # A date has to be returned.
        if dates:
            return dates[0].text
    if "what" in lowered_question:
        if objects:
            return objects[0].text
    return None