-
Notifications
You must be signed in to change notification settings - Fork 4
Expand file tree
/
Copy pathembed-vis.py
More file actions
114 lines (84 loc) · 3.31 KB
/
embed-vis.py
File metadata and controls
114 lines (84 loc) · 3.31 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
##### Author: Xingchi Li #####
# Forked from work by Jingfeng Yang
## Read word embeddings, sentences and sentence embeddings, build training and test dataset ##
import numpy as np
from sklearn.manifold import TSNE
class Sent(object):
    """A sentence paired with its class label and its embedding vector."""

    def __init__(self, sent, label, ebd):
        # Tokenized sentence (list of word strings).
        self.sent = sent
        # Class label, kept as the raw string read from the label file.
        self.label = label
        # Sentence embedding (average of word embeddings).
        self.emb = ebd
def read_word_embeds(file='mr_workspace/word.emb'):
    """Read a word2vec-style text embedding file.

    Expected format: the first line is ``"<voc_size> <ebd_dim>"``; each
    following line is ``"<word> <v1> ... <v_ebd_dim>"``.

    Parameters
    ----------
    file : str
        Path to the embedding file.

    Returns
    -------
    tuple[list[str], np.ndarray]
        The vocabulary in file order and a ``(voc_size, ebd_dim)``
        float64 matrix whose row ``i`` is the embedding of ``voc[i]``.

    Raises
    ------
    ValueError
        If the file is empty or the number of embedding rows does not
        match the declared vocabulary size.
    """
    voc = []
    ebd = None
    voc_size = 0
    with open(file, 'r') as reader:
        for i, line in enumerate(reader):
            tokens = line.strip().split()
            if i == 0:
                # Header line: vocabulary size and embedding dimensionality.
                voc_size = int(tokens[0])
                ebd_dim = int(tokens[1])
                ebd = np.zeros((voc_size, ebd_dim), dtype=np.float64)
            else:
                voc.append(tokens[0])
                # Take the LAST ebd_dim fields, so a token that itself
                # contains separators cannot shift the vector parse.
                ebd[i - 1] = np.array([float(v) for v in tokens[-ebd_dim:]],
                                      dtype=np.float64)
    # Explicit errors instead of `assert` (asserts vanish under `python -O`)
    # and instead of a NameError on an empty file.
    if ebd is None:
        raise ValueError("empty embedding file: %s" % file)
    if len(voc) != voc_size:
        raise ValueError("expected %d embeddings, found %d in %s"
                         % (voc_size, len(voc), file))
    return voc, ebd
def readData(train_label_file='data/mr/label_train.txt',train_text_file='data/mr/text_train.txt',
             test_label_file='data/mr/label_test.txt',test_text_file='data/mr/text_test.txt',
             word_ebd_file='mr_workspace/word.emb',all_text_file='data/mr/text_all.txt'):
    """Build labeled train/test corpora of sentence embeddings and dump
    2-D t-SNE projections of both splits to CSV under ``output/``.

    A sentence embedding is the average of the word embeddings of its
    in-vocabulary words; a sentence with no known word gets a zero vector.
    NOTE(review): ``all_text_file`` is assumed to contain the train
    sentences followed by the test sentences, in order — confirm upstream.

    Parameters
    ----------
    train_label_file, train_text_file : str
        One label / one whitespace-tokenized sentence per line.
    test_label_file, test_text_file : str
        Same format for the test split.
    word_ebd_file : str
        Embedding file read by :func:`read_word_embeds`.
    all_text_file : str
        Concatenation of train then test sentences.

    Returns
    -------
    tuple[list[Sent], list[Sent]]
        ``(trainCorpus, testCorpus)``.

    Raises
    ------
    ValueError
        If a split's text file disagrees with ``all_text_file``.
    """
    voc, word_ebd = read_word_embeds(file=word_ebd_file)
    # Word -> row index in the embedding matrix.  (The original shadowed
    # `voc` with its own loop variable here; keep the names distinct.)
    dic = {word: idx for idx, word in enumerate(voc)}

    # (tokens, averaged embedding) for every sentence in the combined file.
    allText = []
    with open(all_text_file) as reader1:
        for line1 in reader1:
            sent = line1.strip().split()
            sent_ebd = [word_ebd[dic[word]] for word in sent if word in dic]
            if not sent_ebd:
                # No in-vocabulary word: fall back to a zero embedding.
                sent_ebd = [np.zeros_like(word_ebd[0], dtype=np.float64)]
            allText.append((sent, np.average(np.array(sent_ebd, dtype=np.float64), axis=0)))

    trainCorpus = _build_corpus(train_text_file, train_label_file, allText)

    # Number of lines in the train text file = offset of the test
    # sentences inside allText (train precedes test in all_text_file).
    with open(train_text_file) as reader:
        totalTrainCount = sum(1 for _ in reader)
    testCorpus = _build_corpus(test_text_file, test_label_file, allText[totalTrainCount:])

    _dump_tsne(trainCorpus, 'output/embed-vis-mr-train.csv')
    # BUGFIX: the original wrote the TRAIN matrix (`data`) to the test CSV.
    _dump_tsne(testCorpus, 'output/embed-vis-mr-test.csv')
    return trainCorpus,testCorpus


def _build_corpus(text_file, label_file, texts):
    """Pair each (tokens, embedding) in *texts* with its label from
    *label_file*, checking that *text_file* matches the precomputed tokens."""
    corpus = []
    with open(text_file) as reader1, open(label_file) as reader2:
        for line1, line2, text in zip(reader1, reader2, texts):
            # Explicit raise instead of `assert` (stripped under -O).
            if line1.strip().split() != text[0]:
                raise ValueError("%s does not match the all-text ordering" % text_file)
            corpus.append(Sent(text[0], line2.strip(), text[1]))
    return corpus


def _dump_tsne(corpus, out_file):
    """t-SNE-project the corpus embeddings to 2-D and write rows of
    ``x, y, label`` to *out_file* (labels must be numeric strings)."""
    n = len(corpus)
    print(n)
    data = np.zeros((n, 3))
    points = []
    for i, sent in enumerate(corpus):
        # numpy coerces the numeric label string to float on assignment.
        data[i, 2] = sent.label
        points.append(sent.emb)
    print("doing TSNE")
    data[:, :2] = TSNE(n_components=2).fit_transform(points)
    np.savetxt(out_file, data, delimiter=',', fmt='%10.5f')
if __name__ == "__main__":
    # readData() loads the word embeddings itself; the original also called
    # read_word_embeds() here first and discarded the result, re-reading the
    # whole embedding file for nothing — go straight to building the datasets.
    readData()