import numpy as np
import pandas as pd
import re
import math
import nltk
from sklearn.model_selection import train_test_split
from collections import Counter
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize


class SNSDataset:
    def __init__(self, config=None):
        self.config = config
        self.vocab = None
        # Load data
        self.train_data = self._load_data('train')
        self.test_data = self._load_data('test')
        # Preprocess the training data
        self.train_data = self._sns_data_processing(self.train_data)
        # Vectorize the text and encode the labels
        self.train_data = self._vectorize_and_encode(self.train_data, is_train=True)
        self.test_data = self._vectorize_and_encode(self.test_data, is_train=False)
        # Split into input and target arrays
        self.X_train, self.y_train = self._split_data(self.train_data)
        self.X_test, self.y_test = self._split_data(self.test_data)

    def _load_data(self, option):
        if option == 'train':
            data = pd.read_csv('data/train.csv')
        else:
            data = pd.read_csv('data/test.csv')
        return data

    def _sns_data_processing(self, data):
        """
        ### Preprocessing ###
        Perform preprocessing as needed, e.g. remove stop words, strip special
        characters, unify case, drop missing values, and drop sentences that
        are too short.
        Preprocess the text in data['content'] and store the result back in
        the data['content'] field, e.g.:
            data['content'] = data['content'].apply(lambda x: x.lower())
        *** Every field except 'sentiment' may be used as training data. ***
        """
        ############################################## EDIT ################################################
        nltk.download('stopwords')
        nltk.download('punkt')
        # Drop rows with missing content (copy to avoid SettingWithCopyWarning)
        data = data.dropna(subset=['content']).copy()
        # Unify case
        data['content'] = data['content'].apply(lambda x: x.lower())
        # Remove special characters
        data['content'] = data['content'].apply(lambda x: re.sub(r'[^\w\s]', '', x))
        # Remove stop words
        stopWordSet = set(stopwords.words('english'))
        data['content'] = data['content'].apply(
            lambda x: ' '.join(word for word in word_tokenize(x) if word not in stopWordSet))
        # Drop sentences that are too short (fewer than three tokens)
        data = data[data['content'].apply(lambda x: len(word_tokenize(x)) >= 3)]
        ####################################################################################################
        return data
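
    # For instance, the pipeline above maps a raw post such as
    #   "I love this new Phone!!"
    # first to "i love this new phone" (lowercased, punctuation stripped), then
    # to "love new phone" after stop-word removal; with three tokens left, the
    # row survives the length filter.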

    def _vectorize_and_encode(self, data, is_train):
        data = self._vectorize_text(data, is_train)
        data = self._label_encoding(data)
        return data

    def _vectorize_text(self, data, is_train):
        """
        ### Vectorization ###
        Vectorize the text in data['content'] with a technique such as one-hot
        encoding or TF-IDF, store the result in the data['vector'] field, and
        return the data.
        Below, a vocabulary is built from word frequencies on the training
        split, and each document is encoded as a TF-IDF vector over it.
        """
        ############################################## EDIT ################################################
        if is_train:
            # Build the vocabulary from words occurring at least min_frequency times
            words = [word for text in data['content'] for word in text.lower().split()]
            wordCnt = Counter(words)
            min_frequency = 25
            vocab = {word for word, count in wordCnt.items() if count >= min_frequency}
            self.vocab = vocab
            print("vocab length: ", len(vocab))
            # Document frequency for each vocabulary word
            docFreq = {word: 0 for word in vocab}
            for text in data['content']:
                inText = set(text.lower().split())
                for word in inText:
                    if word in docFreq:
                        docFreq[word] += 1
            totalDocs = len(data['content'])
            # Smoothed inverse document frequency: log(N / (1 + df))
            self.idf = {word: math.log(totalDocs / (1 + docFreq[word])) for word in vocab}
        # Sort the vocabulary so the column order is deterministic across runs
        vocabDict = {word: idx for idx, word in enumerate(sorted(self.vocab))}
        tfidfVectorList = []
        for text in data['content']:
            wordList = text.lower().split()
            wordCnt = Counter(wordList)
            lenDocs = max(len(wordList), 1)  # guard against empty documents
            tfidfVectorCache = [0.0 for _ in range(len(vocabDict))]
            for word, count in wordCnt.items():
                if word in vocabDict:
                    tf = count / lenDocs    # term frequency
                    idf = self.idf[word]    # inverse document frequency
                    tfidfVectorCache[vocabDict[word]] = tf * idf
            tfidfVectorList.append(tfidfVectorCache)
        data['vector'] = tfidfVectorList
        ####################################################################################################
        return data
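
    # Worked example of the arithmetic above (hypothetical numbers): if "great"
    # appears 2 times in a 4-word document and occurs in 10 of 100 training
    # documents, then
    #   tf     = 2 / 4 = 0.5
    #   idf    = log(100 / (1 + 10)) ≈ 2.207   (natural log)
    #   tf-idf = 0.5 * 2.207 ≈ 1.104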

    def _label_encoding(self, data):
        """
        ### Label Encoding ###
        The targets are Positive, Neutral, Negative, and Irrelevant. A model
        cannot consume these strings directly, so they must be converted to
        numbers: Positive -> 0, Neutral -> 1, Negative -> 2, Irrelevant -> 3.
        Convert the labels in data['sentiment'] and store them back in the
        data['sentiment'] field.
        """
        ############################################## EDIT ################################################
        labelMap = {"Positive": 0, "Neutral": 1, "Negative": 2, "Irrelevant": 3}
        # Any label outside the four known classes falls back to 3, matching
        # the catch-all branch of an if/elif chain
        data['sentiment'] = [labelMap.get(sentiment, 3) for sentiment in data['sentiment']]
        ####################################################################################################
        return data

    def _split_data(self, data):
        X = np.array(data['vector'].tolist())
        y = data['sentiment'].to_numpy().reshape(-1)
        return X, y
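

# Minimal usage sketch, assuming data/train.csv and data/test.csv exist with
# 'content' and 'sentiment' columns (paths follow _load_data above); this
# block is illustrative, not part of the class.
if __name__ == "__main__":
    dataset = SNSDataset()
    # Feature matrices and label vectors, ready for an sklearn-style classifier
    print(dataset.X_train.shape, dataset.y_train.shape)
    print(dataset.X_test.shape, dataset.y_test.shape)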