-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathlanguage_processing.py
More file actions
153 lines (133 loc) · 4.41 KB
/
language_processing.py
File metadata and controls
153 lines (133 loc) · 4.41 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
"""
Responsible for doing Natural Language Processing.
It should ideally operate on the extracted text.
It should have ability to perform things like:
- Parts of Speech tagging.
- Named Entity Recognition
- Finding digits, words etc.
- Stopwords removal
- Compute Lexical Diversity
Later, we want it to perform:
- Summarization
- Answer basic questions
"""
import logging
# Third-party: spaCy provides the tokenizer, tagger, parser and NER used below.
import spacy
# Module-level logger per the logging.getLogger(__name__) convention.
logger = logging.getLogger(__name__)
# Load the small English pipeline once at import time; every function in this
# module shares this single pipeline instance. Requires the model to be
# installed (python -m spacy download en_core_web_sm), else this raises OSError.
nlp = spacy.load("en_core_web_sm")
def parts_of_speech(text: str):
    """
    Extract proper nouns and verbs from *text*.

    Runs the shared spaCy pipeline over the text and buckets tokens by
    their coarse POS tag.

    Returns a dict with two keys:
        "nouns": list of Token objects tagged PROPN (proper nouns only)
        "verbs": list of Token objects tagged VERB
    """
    doc = nlp(text)
    return {
        "nouns": [tok for tok in doc if tok.pos_ == "PROPN"],
        "verbs": [tok for tok in doc if tok.pos_ == "VERB"],
    }
def entities(text: str):
    """Return the surface text of every named entity spaCy finds in *text*."""
    recognized = []
    for ent in nlp(text).ents:
        recognized.append(ent.text)
    return recognized
def remove_punctuations(text: str):
    """Tokenize *text* and return the token texts with punctuation dropped."""
    kept = []
    for tok in nlp(text):
        if tok.is_punct:
            continue
        kept.append(tok.text)
    return kept
def remove_stopwords(text: str):
    """Tokenize *text* and return the token texts with stopwords dropped."""
    kept = []
    for tok in nlp(text):
        if tok.is_stop:
            continue
        kept.append(tok.text)
    return kept
def remove_punctuations_and_stopwords(text: str):
    """
    Tokenize *text* and return the token texts with both punctuation and
    stopwords dropped.

    Returns a list of strings, consistent with remove_punctuations() and
    remove_stopwords() (the previous version inconsistently returned raw
    spaCy Token objects instead of their text).
    """
    doc = nlp(text)
    return [
        token.text
        for token in doc
        if not token.is_stop and not token.is_punct
    ]
def summarize(text: str):
    """Summarize *text*. Not implemented yet; currently returns None."""
    return None
def converse(text: str, question: str):
    """
    Answer simple wh-questions about a single sentence in *text*.

    Supported question forms (matched as substrings of the lowered question):
    - Who did something?       -> proper noun / syntactic subject
    - Where did it happen?     -> object, or the token after a preposition
    - How much does it take?   -> first numeric-like token
    - When did it happen?      -> first DATE entity
    - What happened?           -> first object

    The heuristics rely on:
    - Parts of Speech (POS tagging)
    - Named Entities (NER)
    - Syntactic Dependencies (dep_)
    - Rule based matching. In addition to regex, use token attributes like
      is_punct, is_stop etc.

    Returns the best-guess answer string, or None when no rule matches.
    """
    proper_nouns = []
    verbs = []
    subjects = []
    objects = []
    prepositions = []
    numerics = []
    dates = []
    doc = nlp(text)
    lowered_question = question.lower()
    for token in doc:
        # Lazy %-style args so formatting only happens if INFO is enabled.
        logger.info("Token: %s, POS: %s, Dep: %s", token.text, token.pos_, token.dep_)
        if token.pos_ == "PROPN":
            proper_nouns.append(token)
        if token.pos_ == "VERB":
            verbs.append(token)
        if token.dep_ == "nsubj":
            subjects.append(token)
        if token.dep_ in ("pobj", "dobj"):
            objects.append(token)
        if token.pos_ == "ADP":
            prepositions.append(token)
        if token.like_num:
            numerics.append(token)
    for ent in doc.ents:
        logger.info("Entity: %s, Type: %s", ent.text, ent.label_)
        if ent.label_ == "DATE":
            dates.append(ent)
    logger.info("Nouns: %s", proper_nouns)
    logger.info("Verbs: %s", verbs)
    logger.info("Subjects: %s", subjects)
    logger.info("Objects: %s", objects)
    logger.info("Prepositions: %s", prepositions)
    if "who" in lowered_question:
        # The answer should probably be a proper noun.
        if len(proper_nouns) == 1:
            return proper_nouns[0].text
        # If there are multiple nouns, then most probably the subject instead
        # of the object is the answer. Dependency parsing gives us that.
        # We are currently dealing with single sentences.
        # TODO: Modify it to get more context from the question, and then
        # infer the correct subject.
        # BUGFIX: guard against sentences with no nsubj token — the previous
        # version raised IndexError here.
        if subjects:
            return subjects[0].text
    if "where" in lowered_question:
        # We want a place as answer; the answer should probably be a noun,
        # very likely inside a prepositional phrase.
        # Examples: They went "to" Colombo, kept on "the" table, etc.
        if objects:
            return objects[0].text
        # Statements like "apaar went to play": here "play" is not an object,
        # so use the token appearing right after the preposition.
        # BUGFIX: guard against the preposition being the last token — the
        # previous version raised IndexError indexing past the end of doc.
        if prepositions:
            prep = prepositions[0]
            if prep.i + 1 < len(doc):
                return doc[prep.i + 1].text
    if "how much" in lowered_question:
        # A quantity has to be returned; a quantity means a numeric token.
        if numerics:
            return numerics[0].text
    if "when" in lowered_question:
        # A date has to be returned.
        if dates:
            return dates[0].text
    if "what" in lowered_question:
        if objects:
            return objects[0].text
    return None