Supervised-Approach-for-Keyphrase-Extraction/pre_processing_api_output.py at main · Femme-js/Supervised-Approach-for-Keyphrase-Extraction · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
import pandas as pd
import ast
import argparse
import numpy as np
import spacy
from rich.console import Console
from nltk.stem.porter import PorterStemmer

# Module-level Porter stemmer; generate_features() uses it to compare tag
# words and document tokens by stem rather than by exact surface form.
ps = PorterStemmer()

# Rich console handle; not referenced in the visible part of this file.
console = Console()

# spaCy English pipeline, loaded once at import time and used by
# generating_df_values() to annotate each input text.
nlp = spacy.load('en_core_web_sm')

def print_pos_df(doc):
    """Collect per-token attributes of *doc* into a dict keyed by position.

    Args:
        doc: Iterable of token-like objects exposing ``text``,
            ``text_with_ws``, ``lemma_``, ``pos_``, ``tag_``, ``dep_``,
            ``shape_``, ``is_alpha``, ``is_stop``, ``ent_type_`` and
            ``ent_iob_`` (the caller in this file passes the result of
            ``nlp(text)``).

    Returns:
        A dict mapping each token's 0-based position to a
        ``{attribute name: value}`` dict. Empty when *doc* has no tokens.
    """
    # Column names are identical to the token attribute names, so the same
    # list drives both the attribute lookup and the DataFrame header.
    cols = ['text', 'text_with_ws', 'lemma_', 'pos_', 'tag_', 'dep_',
            'shape_', 'is_alpha', 'is_stop', 'ent_type_', 'ent_iob_']

    recs = [[getattr(token, col) for col in cols] for token in doc]

    # Build the frame once, after all tokens are collected. Doing this inside
    # the loop rebuilt the frame per token and left it undefined for an
    # empty doc.
    pos_df = pd.DataFrame(recs, columns=cols)

    return pos_df.to_dict(orient='index')

def generate_features(feature, features_dict, list_x):
    """Look up one token attribute for every word of every tag phrase.

    Words are matched to document tokens by Porter stem (via the module-level
    ``ps``). For each word, the value of *feature* is taken from the first
    token, in ``features_dict`` order, whose stem equals the word's stem.
    Words with no matching token contribute nothing, so an inner list can be
    shorter than the phrase it was built from.

    Args:
        feature: Key to read from each token record (e.g. ``'pos_'``).
        features_dict: Mapping of token position to a record dict, as
            returned by ``print_pos_df``; each record must contain ``'text'``
            and *feature*.
        list_x: Iterable of phrases, each an iterable of word strings.

    Returns:
        A list with one inner list per phrase, holding the matched feature
        values in word order.
    """
    # Stem every document token once and remember only the first token seen
    # for each stem. This reproduces the original "first match wins" scan
    # without re-stemming the whole document for every tag word.
    first_value_by_stem = {}
    for record in features_dict.values():
        first_value_by_stem.setdefault(ps.stem(record['text']), record[feature])

    matched = []
    for words in list_x:
        values = []
        for word in words:
            stem = ps.stem(word)
            if stem in first_value_by_stem:
                values.append(first_value_by_stem[stem])
        matched.append(values)

    return matched


def generating_df_values(text_intro, list_of_tags):
    """Build a per-tag feature table for a single text.

    The text is run through the module-level ``nlp`` pipeline and summarised
    with ``print_pos_df``. Every tag then becomes one row holding the source
    text, the tag, the tag split on single spaces (``temp``), and one column
    per token attribute filled in by ``generate_features``.

    Args:
        text_intro: The text the tags belong to.
        list_of_tags: Iterable of tag strings for that text.

    Returns:
        A DataFrame with columns ``source``, ``Tags``, ``temp`` followed by
        the eleven token-attribute columns, one row per tag.
    """
    features_dict = print_pos_df(nlp(text_intro))

    tags = list(list_of_tags)
    dfx = pd.DataFrame({'source': [text_intro] * len(tags), 'Tags': tags})

    # Each tag phrase broken into its space-separated words.
    list_x = [tag.split(' ') for tag in dfx['Tags'].to_list()]
    dfx['temp'] = list_x

    feature_names = (
        'text', 'text_with_ws', 'lemma_', 'pos_', 'tag_', 'dep_',
        'shape_', 'is_alpha', 'is_stop', 'ent_type_', 'ent_iob_',
    )
    for feature in feature_names:
        dfx[feature] = generate_features(feature, features_dict, list_x)

    return dfx


def pre_process(df):
    """Expand every (text, tags) row of *df* into per-tag feature rows.

    Args:
        df: DataFrame with a ``text_intro`` column (the text) and a
            ``list_of_tags`` column holding either Python-literal strings
            (e.g. ``"['a b', 'c']"``) or already-parsed lists. The
            ``list_of_tags`` column is replaced in place with the parsed
            values, as before.

    Returns:
        The row-wise concatenation of ``generating_df_values`` for each input
        row. Each per-text frame keeps its own 0-based index, so index labels
        repeat across texts. For an empty *df*, an empty DataFrame with the
        same columns is returned.
    """
    # Parse only string cells: literal_eval rejects values that are already
    # lists, which made a second call on the same frame fail.
    df['list_of_tags'] = df['list_of_tags'].apply(
        lambda cell: ast.literal_eval(cell) if isinstance(cell, str) else cell
    )

    # Iterate positionally rather than by index label so that frames with
    # duplicate index labels still yield one scalar text per row.
    frames = [
        generating_df_values(text, tags)
        for text, tags in zip(df['text_intro'], df['list_of_tags'])
    ]

    if not frames:
        # pd.concat raises ValueError on an empty list; return an empty frame
        # with the columns generating_df_values produces instead.
        empty_cols = ['source', 'Tags', 'temp', 'text', 'text_with_ws',
                      'lemma_', 'pos_', 'tag_', 'dep_', 'shape_', 'is_alpha',
                      'is_stop', 'ent_type_', 'ent_iob_']
        return pd.DataFrame(columns=empty_cols)

    return pd.concat(frames)


if __name__ == "__main__":
    # Command-line entry point: read the CSV named by --file_path, expand it
    # with pre_process(), and write the result to 'pre_processed_data.csv' in
    # the current working directory.
    parser = argparse.ArgumentParser()

    # Required: without it the path was None and pd.read_csv failed with an
    # unrelated-looking error instead of a usage message.
    parser.add_argument(
        "--file_path",
        type=str,
        required=True,
        help="CSV file with 'text_intro' and 'list_of_tags' columns",
    )

    # read the arguments from the command line
    args = parser.parse_args()
    path = args.file_path

    df = pd.read_csv(path)

    dfinal = pre_process(df)

    dfinal.to_csv('pre_processed_data.csv')