Convert data[6-8].json to example[1-3].ann (brat standoff format)

hmsklee · hmsklee · commit a99db248e2cd · 2018-01-16T14:21:31.000-06:00
- Overlapping/nested text-bound mentions (TBMs) are not handled yet.
diff --git a/data/example1.ann b/data/example1.ann
@@ -1,6 +1,6 @@
-un lock able
+unlockable
 T1 BOUND 0 2 un
-T2 FREE 4 7 lock
-T3 BOUND 8 12 able
+T2 FREE 2 6 lock
+T3 BOUND 6 10 able
 R1 unlock controlled:T1 controller:T2
 R2 unlockable controller:R1 controlled:T3
diff --git a/data/example2.ann b/data/example2.ann
@@ -1,11 +1,6 @@
-Induction of p21 by p53 following DNA damage inhibits both Cdk4 and Cdk2 activities
-T1  Gene_or_gene_product 13 16  p21
-T2  Gene_or_gene_product 20 23  p53
-T3  BioProcess 34 44    DNA damage
-T4  Gene_or_gene_product 59 63  Cdk4
-T5  Gene_or_gene_product 68 72  Cdk2
-T6  Positive_activation 0 9  Induction
-T7  Negative_regulation 45 53    inhibits
-E1  Positive_activation:T6 Controller:T2 Controlled:T1
-E2  Negative_regulation::T7 Controller:E1 Controlled:T4
-E3  Negative_regulation::T7 Controller:E1 Controlled:T5
+unlockable
+T1 BOUND 0 2 un
+T2 FREE 2 6 lock
+T3 BOUND 6 10 able
+R1 lockable controlled:T2 controller:T3
+R2 unlockable controller:T1 controlled:R1
diff --git a/data/example3.ann b/data/example3.ann
@@ -0,0 +1,11 @@
+Induction of p21 by p53 following DNA damage inhibits both Cdk4 and Cdk2 activities.
+T1  Gene_or_gene_product 13 16  p21
+T2  Gene_or_gene_product 20 23  p53
+T3  BioProcess 34 44    DNA damage
+T4  Gene_or_gene_product 59 63  Cdk4
+T5  Gene_or_gene_product 68 72  Cdk2
+T6  Positive_activation 0 9  Induction
+T7  Negative_regulation 45 53    inhibits
+E1  Positive_activation:T6 Controller:T2 Controlled:T1
+E2  Negative_regulation::T7 Controller:E1 Controlled:T4
+E3  Negative_regulation::T7 Controller:E1 Controlled:T5
diff --git a/index.html b/index.html
@@ -30,11 +30,11 @@
 <div id="brat-input" class="modal">
   <div>
     <header>
-      <span class="tab active">Custom input in .ann format</span>
+      <span class="tab active">Load file</span>
     </header>
     <div class="page active">
       <div>
-        <button><label for="file-input">Load .ann file</label></button>
+        <button><label for="file-input">Upload</label></button>
         <input type="file" id="file-input" style="display:none;">
       </div>
       <textarea></textarea>
diff --git a/js/ann-parser.js b/js/ann-parser.js
@@ -36,13 +36,13 @@ const parseAnn = (function() {
 
     const re = /:+(?=[TER]\d+$)/;
 
-    function parseTextBoundMention(tokens, text) {
+    function parseTextBoundMention(tokens, textLength) {
         const id = +tokens[0].slice(1),
             label = tokens[1],
             charStart = +tokens[2],
             charEnd = +tokens[3];
 
-        if (id > 0 && charStart >= 0 && charStart < charEnd && charEnd < text.length) {
+        if (id > 0 && charStart >= 0 && charStart < charEnd && charEnd <= textLength) {
             return new TextBoundMention('T' + id, label, charStart, charEnd);
         }    
     }
@@ -113,6 +113,8 @@ const parseAnn = (function() {
             relations: [],
             attributes: [],
             unparsedLines: [],
+            text: null,
+            tokens: [],
             mentions: {}
         }
 
@@ -124,6 +126,7 @@ const parseAnn = (function() {
             return output;
         }
 
+        let textLength = text.length;
         let unparsedLines = [];
         let mentions = {};
 
@@ -147,31 +150,35 @@ const parseAnn = (function() {
 
             switch (tokens[0].charAt(0)) {
                 case 'T':
-                    let tbm = parseTextBoundMention(tokens, text);
+                    let tbm = parseTextBoundMention(tokens, textLength);
                     if (tbm) {
                         output.texts.push(tbm);
                         mentions[tbm.id] = tbm;
+                        parseIsSuccessful = true;
                     }
                     break;
                 case 'E':
                     let em = parseEventMention(tokens, mentions);
                     if (em) {
                         output.events.push(em);
                         mentions[em.id] = em;
+                        parseIsSuccessful = true;
                     }
                     break;
                 case 'R':
                     let rm = parseRelationMention(tokens, mentions);
                     if (rm) {
                         output.relations.push(rm);
                         mentions[rm.id] = rm;
+                        parseIsSuccessful = true;
                     }
                     break;
                 case 'A':
                     let a = parseAttribute(tokens, mentions);
                     if (a) {
                         output.attributes.push(a);
                         mentions[a.id] = a;
+                        parseIsSuccessful = true;
                     }
                     break;
             }
@@ -181,6 +188,58 @@ const parseAnn = (function() {
             }
         }
 
+        // split text into tokens
+        output.texts.sort((a,b) => {
+            if (a.charEnd - b.charEnd != 0) {
+                return a.charEnd - b.charEnd;
+            }
+            else {
+                return a.charStart - b.charStart;
+            }
+        });
+
+        let tokens = [];
+        let tbm_i = 0;
+        let token_start = 0;
+        for (let ch = 0; ch < textLength; ++ch) {
+            let tbm = output.texts[tbm_i];
+            while (text[token_start] === ' ') {
+                ++token_start;
+            }
+            if (tbm && tbm.charStart <= ch) {
+                tokens.push({
+                    word: text.slice(tbm.charStart, tbm.charEnd),
+                    start: tbm.charStart,
+                    end: tbm.charEnd
+                });
+                token_start = tbm.charEnd;
+
+                while(output.texts[tbm_i] && output.texts[tbm_i].charStart <= ch){
+                    output.texts[tbm_i].tokenId = tokens.length - 1;
+                    ++tbm_i;
+                }
+            }
+            else if (text[ch] === ' ') {
+                if (token_start < ch) {
+                    tokens.push({
+                        word: text.slice(token_start, ch),
+                        start: token_start,
+                        end: ch
+                    });
+                    token_start = ch + 1;
+                }
+            }
+        }
+        if (token_start < textLength) {
+            tokens.push({
+                word: text.slice(token_start, textLength),
+                start: token_start,
+                end: textLength
+            });
+        }
+
+        output.tokens = tokens;
+        output.text = text;
         output.mentions = mentions;
         output.unparsedLines = unparsedLines;
 
diff --git a/js/main.js b/js/main.js
diff --git a/js/parser.js b/js/parser.js