-
Notifications
You must be signed in to change notification settings - Fork 3
Expand file tree
/
Copy pathdata.py
More file actions
34 lines (28 loc) · 1.03 KB
/
data.py
File metadata and controls
34 lines (28 loc) · 1.03 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
import numpy as np
import os.path
from normalizer import normalizer
def data_load(Config, *, maxlen=20, step=3):
    """Load a text corpus and one-hot encode it as next-character training data.

    The file at ``Config['data_set']`` is read, passed through the
    project's ``normalizer`` function, and cut into overlapping windows of
    ``maxlen`` characters taken every ``step`` characters.  Each window is
    paired with the single character that immediately follows it.

    Args:
        Config: Mapping that provides the corpus path under the key
            ``'data_set'``.
        maxlen: Number of characters in each input window.
        step: Offset between the starts of consecutive windows.

    Returns:
        A tuple ``(chars, char_indices, indices_char, maxlen, X, y, text)``:

        * ``chars`` - set of the distinct characters in the normalized text.
        * ``char_indices`` - dict mapping each character to its index.
        * ``indices_char`` - dict mapping each index back to its character.
        * ``maxlen`` - the window length that was used.
        * ``X`` - bool array of shape ``(n_windows, maxlen, len(chars))``
          holding the one-hot encoded windows.
        * ``y`` - bool array of shape ``(n_windows, len(chars))`` holding
          the one-hot encoded following characters.
        * ``text`` - the normalized corpus.
    """
    path = Config['data_set']
    # Use a context manager so the file handle is closed deterministically.
    with open(path) as corpus_file:
        text = corpus_file.read()
    text = normalizer(text)
    print('courpus length:', len(text))

    chars = set(text)
    print('total chars:', len(chars))

    # Iterating a set of strings yields an order that varies between
    # interpreter runs (hash randomization).  Enumerating the sorted
    # characters keeps the char <-> index mapping reproducible, so indices
    # produced in one process remain meaningful in another.
    ordered_chars = sorted(chars)
    char_indices = {c: i for i, c in enumerate(ordered_chars)}
    indices_char = dict(enumerate(ordered_chars))

    sentences = []
    next_chars = []
    # Stop at len(text) - maxlen so text[i + maxlen] is always in range.
    for i in range(0, len(text) - maxlen, step):
        sentences.append(text[i: i + maxlen])
        next_chars.append(text[i + maxlen])
    print('nb sequences:', len(sentences))

    print('vectorization...')
    # The builtin ``bool`` is used as the dtype: the ``np.bool`` alias was
    # deprecated in NumPy 1.20 and removed in 1.24, and it referred to the
    # builtin anyway, so the resulting arrays are unchanged.
    X = np.zeros((len(sentences), maxlen, len(chars)), dtype=bool)
    y = np.zeros((len(sentences), len(chars)), dtype=bool)
    for i, (sentence, next_char) in enumerate(zip(sentences, next_chars)):
        for t, char in enumerate(sentence):
            X[i, t, char_indices[char]] = 1
        y[i, char_indices[next_char]] = 1

    return (chars, char_indices, indices_char, maxlen, X, y, text)