44"""
55import argparse
66import os
7+ from collections import OrderedDict , defaultdict
78
89from tqdm .auto import tqdm
910import xmltodict
1011from sqlalchemy import create_engine
1112from sqlalchemy .orm import sessionmaker
1213
1314from ruwordnet .models import Sense , Synset , Base , hypernymy_table , domains_table , meronymy_table , pos_synonymy_table , \
14- antonymy_table , composition_table , entailment_table , cause_table , derivation_table , instances_table
15+ antonymy_table , composition_table , entailment_table , cause_table , derivation_table , instances_table , related_table
16+ from ruwordnet .models import WNSynset , WNSense , ili_table
1517
1618
1719def load_from_xml (root = '.' , parts = 'NVA' , file_name = 'ruwordnet/static/ruwordnet.db' ):
1820 dirname = os .path .dirname (file_name )
1921 if not os .path .exists (dirname ):
2022 os .makedirs (dirname )
2123
24+ if os .path .exists (file_name ):
25+ os .remove (file_name )
2226 engine = create_engine (f'sqlite:///{ file_name } ' , echo = False )
2327 Base .metadata .create_all (engine )
2428
@@ -60,6 +64,44 @@ def load_from_xml(root='.', parts='NVA', file_name='ruwordnet/static/ruwordnet.d
6064
6165 session .commit ()
6266
67+ # load interlingual index
68+ fn = os .path .join (root , f'ili.xml' )
69+ if os .path .exists (fn ):
70+ print ('creating foreign Wordnet...' )
71+ with open (fn , 'r' , encoding = 'utf-8' ) as f :
72+ ili_raw = xmltodict .parse (f .read (), process_namespaces = True )
73+ pairs_to_insert = set ()
74+ already = set ()
75+ for match in tqdm (ili_raw ['ili' ]['match' ]):
76+ wn_synsets = match ['wn-synset' ]
77+ if isinstance (wn_synsets , OrderedDict ):
78+ wn_synsets = [wn_synsets ]
79+ for wnss in wn_synsets :
80+ pairs_to_insert .add ((match ['rwn-synset' ]['@id' ], wnss ['@id' ]))
81+ if wnss ['@id' ] in already :
82+ continue
83+ already .add (wnss ['@id' ])
84+ lemmas = wnss ['lemma' ]
85+ if isinstance (lemmas , OrderedDict ):
86+ lemmas = [lemmas ]
87+ for s in lemmas :
88+ if s ['@key' ] in already :
89+ continue
90+ already .add (s ['@key' ])
91+ wn_sense = WNSense (name = s ['@name' ], key = s ['@key' ], synset_id = wnss ['@id' ])
92+ session .add (wn_sense )
93+ wn_synset = WNSynset (
94+ id = wnss ['@id' ],
95+ definition = wnss ['@definition' ],
96+ )
97+ session .add (wn_synset )
98+ session .commit ()
99+ print ('connecting synsets with foreign Wordnet...' )
100+ conn = engine .connect ()
101+ conn .execute (ili_table .insert (), [dict (ruwn_id = id1 , wn_id = id2 ) for id1 , id2 in pairs_to_insert ])
102+ else :
103+ print ('interlingual index does not exist; skipping it!' )
104+
63105 conn = engine .connect ()
64106
65107 # load synset relations
@@ -68,75 +110,119 @@ def load_from_xml(root='.', parts='NVA', file_name='ruwordnet/static/ruwordnet.d
68110 fn = os .path .join (root , f'synset_relations.{ part } .xml' )
69111 with open (fn , 'r' , encoding = 'utf-8' ) as f :
70112 relations = xmltodict .parse (f .read (), process_namespaces = True )
113+ rel2values = defaultdict (set )
71114 for relation in tqdm (relations ['relations' ]['relation' ]):
72115 parent_id = relation ['@parent_id' ]
73116 child_id = relation ['@child_id' ]
74117 # parent = session.query(Synset).filter_by(id=parent_id).first()
75118 # child = session.query(Synset).filter_by(id=child_id).first()
76- if relation ['@name' ] == 'hypernym' :
77- insert = hypernymy_table .insert ().values (hyponym_id = parent_id , hypernym_id = child_id )
78- conn .execute (insert )
79- elif relation ['@name' ] == 'instance hypernym' :
80- insert = instances_table .insert ().values (instance_id = parent_id , class_id = child_id )
81- conn .execute (insert )
82- elif relation ['@name' ] == 'domain' :
83- insert = domains_table .insert ().values (domain_item_id = parent_id , domain_id = child_id )
84- conn .execute (insert )
85- elif relation ['@name' ] == 'part holonym' :
86- insert = meronymy_table .insert ().values (meronym_id = parent_id , holonym_id = child_id )
87- conn .execute (insert )
88- elif relation ['@name' ] == 'POS-synonymy' :
89- insert = pos_synonymy_table .insert ().values (left_id = parent_id , right_id = child_id )
90- conn .execute (insert )
119+ rel2values [relation ['@name' ]].add ((parent_id , child_id ))
120+
121+ # ['hypernym', 'related', 'POS-synonymy', 'hyponym', 'domain', 'part holonym', 'instance hypernym',
122+ # 'instance hyponym', 'part meronym', 'antonym'])
123+ # ['hypernym', 'entailment', 'domain', 'POS-synonymy', 'hyponym', 'cause', 'antonym']
124+ # ['POS-synonymy', 'domain', 'hypernym', 'hyponym', 'antonym']
125+ # uncovered: related, hyponym, instance hyponym, part meronym
126+ for relation_name , pairs in rel2values .items ():
127+ if relation_name == 'hypernym' :
128+ conn .execute (
129+ hypernymy_table .insert (),
130+ [dict (hyponym_id = parent_id , hypernym_id = child_id ) for parent_id , child_id in pairs ]
131+ )
132+ elif relation_name == 'instance hypernym' :
133+ conn .execute (
134+ instances_table .insert (),
135+ [dict (instance_id = parent_id , class_id = child_id ) for parent_id , child_id in pairs ]
136+ )
137+ elif relation_name == 'domain' :
138+ conn .execute (
139+ domains_table .insert (),
140+ [dict (domain_item_id = parent_id , domain_id = child_id ) for parent_id , child_id in pairs ]
141+ )
142+ elif relation_name == 'part holonym' :
143+ conn .execute (
144+ meronymy_table .insert (),
145+ [dict (meronym_id = parent_id , holonym_id = child_id ) for parent_id , child_id in pairs ]
146+ )
147+ elif relation_name == 'POS-synonymy' :
148+ conn .execute (
149+ pos_synonymy_table .insert (),
150+ [dict (left_id = parent_id , right_id = child_id ) for parent_id , child_id in pairs ]
151+ )
91152 # synonyms are already duplicated in the data
92153 # insert = pos_synonymy_table.insert().values(right_id=parent_id, left_id=child_id)
93154 # conn.execute(insert)
94- elif relation ['@name' ] == 'antonym' :
95- insert = antonymy_table .insert ().values (left_id = parent_id , right_id = child_id )
96- conn .execute (insert )
155+ elif relation_name == 'antonym' :
156+ conn .execute (
157+ antonymy_table .insert (),
158+ [dict (left_id = parent_id , right_id = child_id ) for parent_id , child_id in pairs ]
159+ )
97160 # antonyms are already duplicated in the data
98161 # insert = antonymy_table.insert().values(right_id=parent_id, left_id=child_id)
99162 # conn.execute(insert)
100- elif relation ['@name' ] == 'entailment' :
101- insert = entailment_table .insert ().values (premise_id = parent_id , conclusion_id = child_id )
102- conn .execute (insert )
103- elif relation ['@name' ] == 'cause' :
104- insert = cause_table .insert ().values (cause_id = parent_id , effect_id = child_id )
105- conn .execute (insert )
163+ elif relation_name == 'entailment' :
164+ conn .execute (
165+ entailment_table .insert (),
166+ [dict (premise_id = parent_id , conclusion_id = child_id ) for parent_id , child_id in pairs ]
167+ )
168+ elif relation_name == 'cause' :
169+ conn .execute (
170+ cause_table .insert (),
171+ [dict (cause_id = parent_id , effect_id = child_id ) for parent_id , child_id in pairs ]
172+ )
173+ elif relation_name == 'related' :
174+ conn .execute (
175+ related_table .insert (),
176+ [dict (left_id = parent_id , right_id = child_id ) for parent_id , child_id in pairs ]
177+ )
178+ else :
179+ print ('unknown relation name' , relation_name )
180+ print ('relation types' , rel2values .keys ())
106181
107182 print ('loading phrases' )
108183 fn = os .path .join (root , 'composed_of.xml' )
109184 with open (fn , 'r' , encoding = 'utf-8' ) as f :
110185 relations = xmltodict .parse (f .read (), process_namespaces = True )
186+ pairs_to_insert = set ()
111187 for relation in tqdm (relations ['senses' ]['sense' ]):
112188 phrase_id = relation ['@id' ]
113189 words = relation ['composed_of' ]['sense' ]
114190 if not isinstance (words , list ):
115191 words = [words ]
116192 for word in words :
117193 word_id = word ['@id' ]
118- insert = composition_table .insert ().values (word_id = word_id , phrase_id = phrase_id )
119- conn .execute (insert )
194+ pairs_to_insert .add ((word_id , phrase_id ))
195+ conn .execute (
196+ composition_table .insert (),
197+ [dict (word_id = word_id , phrase_id = phrase_id ) for word_id , phrase_id in pairs_to_insert ]
198+ )
120199
121200 print ('loading derivations' )
122201 fn = os .path .join (root , 'derived_from.xml' )
123202 with open (fn , 'r' , encoding = 'utf-8' ) as f :
124203 relations = xmltodict .parse (f .read (), process_namespaces = True )
204+ pairs_to_insert = set ()
125205 for relation in tqdm (relations ['senses' ]['sense' ]):
126206 source_id = relation ['@id' ]
127207 derivatives = relation ['derived_from' ]['sense' ]
128208 if not isinstance (derivatives , list ):
129209 derivatives = [derivatives ]
130210 for derivative in derivatives :
131211 derivative_id = derivative ['@id' ]
132- insert = derivation_table .insert ().values (source_id = source_id , derivative_id = derivative_id )
133- conn .execute (insert )
212+ pairs_to_insert .add ((source_id , derivative_id ))
213+ conn .execute (
214+ derivation_table .insert (),
215+ [dict (source_id = source_id , derivative_id = derivative_id ) for source_id , derivative_id in pairs_to_insert ]
216+ )
217+ print ('All loaded successfully!' )
134218
135219
if __name__ == '__main__':
    # CLI entry point: convert the RuWordNet XML distribution into a sqlite database.
    parser = argparse.ArgumentParser(description='Convert RuWordNet from xml to sqlite')
    parser.add_argument(
        '-s', '--source', default='data/rwn-2021',
        help='name of the directory with the source xml files',
    )
    parser.add_argument(
        '-d', '--destination', default='ruwordnet/static/ruwordnet-2021.db',
        help='destination database filename',
    )
    # load_from_xml already accepts a `parts` argument (default 'NVA'),
    # but the CLI previously hard-wired it; expose it, keeping the same default
    # so existing invocations behave identically.
    parser.add_argument(
        '-p', '--parts', default='NVA',
        help="parts of speech to load, as a string of letters from 'NVA' "
             "(nouns, verbs, adjectives)",
    )
    args = parser.parse_args()
    load_from_xml(root=args.source, parts=args.parts, file_name=args.destination)
# (removed web-scrape artifact: "0 commit comments")