@@ -36,13 +36,13 @@ const parseAnn = (function() {
3636
3737 const re = / : + (? = [ T E R ] \d + $ ) / ;
3838
/**
 * Builds a TextBoundMention from the tokens of a text-bound ("T") annotation line.
 *
 * @param {string[]} tokens - `[id, label, charStart, charEnd]`, where the id is a
 *   one-character prefix followed by a number (e.g. `T12`).
 * @param {number} textLength - Length of the annotated text, used as the upper
 *   bound for `charEnd`.
 * @returns {TextBoundMention|undefined} The mention, or `undefined` when the id
 *   or the character offsets are missing, non-integer, or out of range.
 */
function parseTextBoundMention(tokens, textLength) {
  const id = Number(tokens[0].slice(1));
  const label = tokens[1];
  const charStart = Number(tokens[2]);
  const charEnd = Number(tokens[3]);

  // Missing or non-numeric tokens become NaN and fractional values are not
  // usable as character offsets; Number.isInteger rejects both.
  const isWellFormed =
    Number.isInteger(id) &&
    Number.isInteger(charStart) &&
    Number.isInteger(charEnd);

  // charEnd is an exclusive offset, so a mention that runs to the very end of
  // the text has charEnd === textLength and must be accepted.
  if (
    isWellFormed &&
    id > 0 &&
    charStart >= 0 &&
    charStart < charEnd &&
    charEnd <= textLength
  ) {
    return new TextBoundMention(`T${id}`, label, charStart, charEnd);
  }

  return undefined;
}
@@ -113,6 +113,8 @@ const parseAnn = (function() {
113113 relations : [ ] ,
114114 attributes : [ ] ,
115115 unparsedLines : [ ] ,
116+ text : null ,
117+ tokens : [ ] ,
116118 mentions : { }
117119 }
118120
@@ -124,6 +126,7 @@ const parseAnn = (function() {
124126 return output ;
125127 }
126128
129+ let textLength = text . length ;
127130 let unparsedLines = [ ] ;
128131 let mentions = { } ;
129132
@@ -147,31 +150,35 @@ const parseAnn = (function() {
147150
148151 switch ( tokens [ 0 ] . charAt ( 0 ) ) {
149152 case 'T' :
150- let tbm = parseTextBoundMention ( tokens , text ) ;
153+ let tbm = parseTextBoundMention ( tokens , textLength ) ;
151154 if ( tbm ) {
152155 output . texts . push ( tbm ) ;
153156 mentions [ tbm . id ] = tbm ;
157+ parseIsSuccessful = true ;
154158 }
155159 break ;
156160 case 'E' :
157161 let em = parseEventMention ( tokens , mentions ) ;
158162 if ( em ) {
159163 output . events . push ( em ) ;
160164 mentions [ em . id ] = em ;
165+ parseIsSuccessful = true ;
161166 }
162167 break ;
163168 case 'R' :
164169 let rm = parseRelationMention ( tokens , mentions ) ;
165170 if ( rm ) {
166171 output . relations . push ( rm ) ;
167172 mentions [ rm . id ] = rm ;
173+ parseIsSuccessful = true ;
168174 }
169175 break ;
170176 case 'A' :
171177 let a = parseAttribute ( tokens , mentions ) ;
172178 if ( a ) {
173179 output . attributes . push ( a ) ;
174180 mentions [ a . id ] = a ;
181+ parseIsSuccessful = true ;
175182 }
176183 break ;
177184 }
@@ -181,6 +188,58 @@ const parseAnn = (function() {
181188 }
182189 }
183190
191+ // split text into tokens
192+ output . texts . sort ( ( a , b ) => {
193+ if ( a . charEnd - b . charEnd != 0 ) {
194+ return a . charEnd - b . charEnd ;
195+ }
196+ else {
197+ return a . charStart - b . charStart ;
198+ }
199+ } ) ;
200+
201+ let tokens = [ ] ;
202+ let tbm_i = 0 ;
203+ let token_start = 0 ;
204+ for ( let ch = 0 ; ch < textLength ; ++ ch ) {
205+ let tbm = output . texts [ tbm_i ] ;
206+ while ( text [ token_start ] === ' ' ) {
207+ ++ token_start ;
208+ }
209+ if ( tbm && tbm . charStart <= ch ) {
210+ tokens . push ( {
211+ word : text . slice ( tbm . charStart , tbm . charEnd ) ,
212+ start : tbm . charStart ,
213+ end : tbm . charEnd
214+ } ) ;
215+ token_start = tbm . charEnd ;
216+
217+ while ( output . texts [ tbm_i ] && output . texts [ tbm_i ] . charStart <= ch ) {
218+ output . texts [ tbm_i ] . tokenId = tokens . length - 1 ;
219+ ++ tbm_i ;
220+ }
221+ }
222+ else if ( text [ ch ] === ' ' ) {
223+ if ( token_start < ch ) {
224+ tokens . push ( {
225+ word : text . slice ( token_start , ch ) ,
226+ start : token_start ,
227+ end : ch
228+ } ) ;
229+ token_start = ch + 1 ;
230+ }
231+ }
232+ }
233+ if ( token_start < textLength ) {
234+ tokens . push ( {
235+ word : text . slice ( token_start , textLength ) ,
236+ start : token_start ,
237+ end : textLength
238+ } ) ;
239+ }
240+
241+ output . tokens = tokens ;
242+ output . text = text ;
184243 output . mentions = mentions ;
185244 output . unparsedLines = unparsedLines ;
186245
0 commit comments