This repository was archived by the owner on Apr 6, 2023. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathcreate_data.py
More file actions
92 lines (70 loc) · 2.36 KB
/
create_data.py
File metadata and controls
92 lines (70 loc) · 2.36 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
# %% import relevant libraries
import os
import numpy as np
import pandas as pd
import spacy
# Spacy imports
from spacy import displacy
from spacy.lang.en import English
from spacy.tokens import DocBin
# ML
from sklearn.model_selection import train_test_split
# tqdm shows a progress bar while executing cells
from tqdm.auto import tqdm
# %% Load in data
data_path = r'IMDB Dataset.csv'
data = pd.read_csv(data_path)
# Work on a seeded 10% sample of the rows so repeated runs see the same data.
data_sample = data.sample(frac=.1, random_state=1)
data_sample.value_counts(subset='sentiment')

# %% split data into train, valid, and test datasets
# 60% of the sample goes to train; the remaining 40% is halved into
# validation and test. Both splits are seeded (same seed as the sample above)
# so the .spacy files written later are reproducible from run to run.
X_train, X_rem, y_train, y_rem = train_test_split(
    data_sample['review'],
    data_sample['sentiment'],
    train_size=.6,
    random_state=1,
)
X_valid, X_test, y_valid, y_test = train_test_split(
    X_rem,
    y_rem,
    train_size=.5,
    random_state=1,
)
train_df = pd.DataFrame({'review': X_train, 'sentiment': y_train})
valid_df = pd.DataFrame({'review': X_valid, 'sentiment': y_valid})
test_df = pd.DataFrame({'review': X_test, 'sentiment': y_test})

# %% confirm balance of categories for each dataframe
print(train_df.shape)
print(valid_df.shape)
print(test_df.shape)
print(train_df.value_counts(subset='sentiment'))
print(valid_df.value_counts(subset='sentiment'))
print(test_df.value_counts(subset='sentiment'))

# format data for spaCy
# Each row becomes a (review, sentiment) tuple; from here on the *_df names
# hold plain lists of tuples rather than DataFrames.
train_df = [tuple(x) for x in train_df.to_numpy()]
valid_df = [tuple(x) for x in valid_df.to_numpy()]
test_df = [tuple(x) for x in test_df.to_numpy()]
# %% make_docs takes in data and converts it to a spaCy document
def make_docs(data):
    """Turn (text, label) pairs into spaCy docs tagged with sentiment cats.

    Args:
        data: A sized collection of ``(text, label)`` tuples. ``len(data)``
            is used as the progress-bar total.

    Returns:
        A list of the docs produced by ``nlp.pipe``, each with
        ``doc.cats["positive"]`` and ``doc.cats["negative"]`` set to 0 or 1.

    Note:
        Uses the module-level ``nlp`` pipeline, which must be assigned before
        this function is called. Any label other than ``"negative"`` is
        treated as positive.
    """
    annotated = []
    piped = nlp.pipe(data, as_tuples=True)
    for spacy_doc, sentiment in tqdm(piped, total=len(data)):
        is_negative = sentiment == "negative"
        spacy_doc.cats["positive"] = 0 if is_negative else 1
        spacy_doc.cats["negative"] = 1 if is_negative else 0
        annotated.append(spacy_doc)
    return annotated
# %% Load the pre-trained spaCy model for the english language
nlp = spacy.load("en_core_web_sm")

# %% create spaCy docbins and save to disk
# exist_ok=True creates the output directory only when it is missing and
# avoids the check-then-create race of a separate os.path.exists() test.
os.makedirs('data', exist_ok=True)

train_docs = make_docs(train_df)
doc_bin = DocBin(docs=train_docs)
doc_bin.to_disk("./data/train.spacy")

valid_docs = make_docs(valid_df)
doc_bin = DocBin(docs=valid_docs)
doc_bin.to_disk("./data/valid.spacy")

# %%
test_docs = make_docs(test_df)
doc_bin = DocBin(docs=test_docs)
doc_bin.to_disk("./data/test.spacy")
# %%