-
Notifications
You must be signed in to change notification settings - Fork 3
Expand file tree
/
Copy pathnmt(e2g)_preprocessing.py
More file actions
63 lines (53 loc) · 1.84 KB
/
nmt(e2g)_preprocessing.py
File metadata and controls
63 lines (53 loc) · 1.84 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
import re
import string
from pickle import dump
from unicodedata import normalize
from numpy import array
# Load the data into memory as a single blob of text, decoding as UTF-8 to preserve the German Unicode characters
def load_doc(filename):
    """Read an entire UTF-8 text file and return its contents as one string.

    Args:
        filename: Path of the text file to read.

    Returns:
        The full decoded contents of the file.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # The context manager closes the handle even when read() raises
    # (e.g. on a decoding error), which a manual close() would skip.
    with open(filename, mode='rt', encoding='utf-8') as file:
        return file.read()
# Split the loaded document into lines, then split each line into its tab-separated sentences (English and German separately)
def to_pairs(doc):
    """Break a tab-separated document into a list of per-line field lists.

    Leading and trailing whitespace is stripped from the whole document
    first; each remaining line is then split on tab characters.

    Args:
        doc: The document text, one record per line, fields tab-separated.

    Returns:
        A list with one entry per line, each entry being the list of
        tab-separated fields found on that line.
    """
    rows = []
    for record in doc.strip().split('\n'):
        rows.append(record.split('\t'))
    return rows
#Data Cleaning Operations
def clean_pairs(lines):
    """Normalize every sentence in a collection of sentence groups.

    Each sentence is reduced to ASCII (accents dropped after NFD
    decomposition), split on whitespace, lower-cased, stripped of
    punctuation and non-printable characters, and filtered down to purely
    alphabetic tokens. The surviving tokens are re-joined with single
    spaces.

    Args:
        lines: An iterable of sentence groups, each an iterable of strings.

    Returns:
        A numpy array built from the nested list of cleaned sentences.
    """
    # Matches any character that is not in string.printable.
    non_printable = re.compile('[^%s]' % re.escape(string.printable))
    # Translation table that deletes every punctuation character.
    strip_punct = str.maketrans('', '', string.punctuation)

    def _scrub(sentence):
        # Decompose accented characters, then drop the non-ASCII marks.
        ascii_bytes = normalize('NFD', sentence).encode('ascii', 'ignore')
        kept = []
        for token in ascii_bytes.decode('UTF-8').split():
            token = token.lower().translate(strip_punct)
            token = non_printable.sub('', token)
            # Discard tokens that are empty or contain digits/other symbols.
            if token.isalpha():
                kept.append(token)
        return ' '.join(kept)

    return array([[_scrub(sentence) for sentence in group] for group in lines])
#Saving the clean sentences to a file
def save_clean_data(sentences, filename):
    """Pickle the cleaned sentences to disk and report the destination.

    Args:
        sentences: The object to serialize (any picklable value).
        filename: Path of the output file; it is created or overwritten.

    Raises:
        OSError: If the file cannot be opened for writing.
    """
    # Use a context manager so the file is flushed and closed
    # deterministically instead of leaking the handle until garbage
    # collection.
    with open(filename, 'wb') as out_file:
        dump(sentences, out_file)
    print('Saved: %s' % filename)
# Driver: load the raw corpus, split it into sentence pairs, clean them,
# persist the result, and print a sample for a quick visual check.
filename = 'deu.txt'
doc = load_doc(filename)
pairs = to_pairs(doc)
# Keep the result under its own name so the clean_pairs function is not
# overwritten by the array it returns and stays callable afterwards.
cleaned_pairs = clean_pairs(pairs)
save_clean_data(cleaned_pairs, 'english-german.pkl')
# Preview at most the first 100 rows; bounding by the row count avoids an
# IndexError when the corpus holds fewer than 100 pairs.
for i in range(min(100, len(cleaned_pairs))):
    print('[%s] ==> [%s]' % (cleaned_pairs[i, 0], cleaned_pairs[i, 1]))