|
| 1 | +#!/usr/bin/env python |
| 2 | +# -*- encoding: utf-8 -*- |
| 3 | + |
| 4 | +# isis2json.py: convert ISIS and ISO-2709 files to JSON |
| 5 | +# |
| 6 | +# Copyright (C) 2010 BIREME/PAHO/WHO |
| 7 | +# |
| 8 | +# This program is free software: you can redistribute it and/or modify |
| 9 | +# it under the terms of the GNU Lesser General Public License as published |
| 10 | +# by the Free Software Foundation, either version 2.1 of the License, or |
| 11 | +# (at your option) any later version. |
| 12 | + |
| 13 | +# This program is distributed in the hope that it will be useful, |
| 14 | +# but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 15 | +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
| 16 | +# GNU Lesser General Public License for more details. |
| 17 | + |
| 18 | +# You should have received a copy of the GNU Lesser General Public License |
| 19 | +# along with this program. If not, see <http://www.gnu.org/licenses/>. |
| 20 | + |
| 21 | +############################ |
| 22 | +# BEGIN ISIS2JSON |
| 23 | +# this script works with Python or Jython (versions >=2.5 and <3) |
| 24 | + |
| 25 | +import sys |
| 26 | +import argparse |
| 27 | +from uuid import uuid4 |
| 28 | +import os |
| 29 | + |
| 30 | +try: |
| 31 | + import json |
| 32 | +except ImportError: |
| 33 | + if os.name == 'java': # running Jython |
| 34 | + from com.xhaus.jyson import JysonCodec as json |
| 35 | + else: |
| 36 | + import simplejson as json |
| 37 | + |
# When true, iter_mst_records() skips records whose status is not ACTIVE.
SKIP_INACTIVE = True
# Default for -q/--qty, which the command-line help describes as "ALL".
DEFAULT_QTY = 2**31
# Record key under which iter_mst_records() stores the record's MFN.
ISIS_MFN_KEY = 'mfn'
# Record key for the active flag, stored only when SKIP_INACTIVE is false.
ISIS_ACTIVE_KEY = 'active'
# Prepended to each subfield id when .mst fields are rebuilt as type 1 strings.
SUBFIELD_DELIMITER = '^'
# Encoding used to decode field values read from .iso files.
INPUT_ENCODING = 'cp1252'
| 44 | + |
| 45 | + |
def iter_iso_records(iso_file_name, isis_json_type):
    """Yield one dict per record read from an ISO-2709 file.

    Each yielded dict maps a field tag (a decimal string without leading
    zeroes) to the list of occurrences of that tag in the record.  The
    shape of each occurrence depends on ``isis_json_type``:

    * 1 -- the field value decoded with ``INPUT_ENCODING``;
    * 2 -- the result of ``subfield.expand()`` on the decoded value;
    * 3 -- ``dict()`` applied to the result of ``subfield.expand()``.

    Raises NotImplementedError when a field is converted with any other
    ``isis_json_type``.

    The input file is closed when the generator is exhausted, when an
    error is raised, and when the consumer stops early and the generator
    is closed or garbage collected.
    """
    from iso2709 import IsoFile
    from subfield import expand

    iso = IsoFile(iso_file_name)
    try:
        for record in iso:
            fields = {}
            for field in record.directory:
                field_key = str(int(field.tag))  # remove leading zeroes
                field_occurrences = fields.setdefault(field_key, [])
                # undecodable bytes are replaced rather than raising
                content = field.value.decode(INPUT_ENCODING, 'replace')
                if isis_json_type == 1:
                    field_occurrences.append(content)
                elif isis_json_type == 2:
                    field_occurrences.append(expand(content))
                elif isis_json_type == 3:
                    # NOTE(review): dict() keeps one value per key, so a
                    # subfield id repeated inside a field presumably
                    # keeps only its last value -- confirm against
                    # subfield.expand().
                    field_occurrences.append(dict(expand(content)))
                else:
                    raise NotImplementedError(
                        'ISIS-JSON type %s conversion '
                        'not yet implemented for .iso input'
                        % isis_json_type)

            yield fields
    finally:
        # A plain close() after the loop is skipped whenever the consumer
        # stops iterating before the end (e.g. a --qty limit) or an
        # exception propagates; finally covers all three exits.
        iso.close()
| 69 | + |
| 70 | + |
def iter_mst_records(master_file_name, isis_json_type):
    """Yield one dict per record read from an ISIS .mst master file.

    Reading is delegated to ``bruma.master``; when that module cannot be
    imported the function raises SystemExit with an explanatory message
    (printed to stderr, exit status 1).

    Each yielded dict holds the record MFN under ``ISIS_MFN_KEY`` and maps
    every field id (as a string) to the list of its occurrences.  When
    ``SKIP_INACTIVE`` is true, records whose status is not ACTIVE are
    skipped; otherwise every record is yielded with a boolean under
    ``ISIS_ACTIVE_KEY``.

    The shape of each occurrence depends on ``isis_json_type``:

    * 1 -- a single string: the content of the ``*`` subfield first, then
      ``SUBFIELD_DELIMITER`` + subfield id + content for the others;
    * 3 -- a dict with the content of the ``*`` subfield under ``'_'``
      and, for every other subfield id, a list of its contents.

    Raises NotImplementedError when a field is converted with any other
    ``isis_json_type``.

    The master file is closed when the generator is exhausted, when an
    error is raised, and when the consumer stops early and the generator
    is closed or garbage collected.
    """
    try:
        from bruma.master import MasterFactory, Record
    except ImportError:
        # SystemExit with a message writes it to stderr and exits with
        # status 1; a bare SystemExit would report success (status 0).
        raise SystemExit('IMPORT ERROR: Jython 2.5 and Bruma.jar '
                         'are required to read .mst files')
    mst = MasterFactory.getInstance(master_file_name).open()
    try:
        for record in mst:
            fields = {}
            if SKIP_INACTIVE:
                if record.getStatus() != Record.Status.ACTIVE:
                    continue
            else:
                # the status is only worth saving when non-active
                # records can appear in the output
                fields[ISIS_ACTIVE_KEY] = (record.getStatus() ==
                                           Record.Status.ACTIVE)
            fields[ISIS_MFN_KEY] = record.getMfn()
            for field in record.getFields():
                field_key = str(field.getId())
                field_occurrences = fields.setdefault(field_key, [])
                if isis_json_type == 3:
                    content = {}
                    for subfield in field.getSubfields():
                        subfield_key = subfield.getId()
                        if subfield_key == '*':
                            content['_'] = subfield.getContent()
                        else:
                            subfield_occurrences = content.setdefault(
                                subfield_key, [])
                            subfield_occurrences.append(
                                subfield.getContent())
                    field_occurrences.append(content)
                elif isis_json_type == 1:
                    content = []
                    for subfield in field.getSubfields():
                        subfield_key = subfield.getId()
                        if subfield_key == '*':
                            # the '*' subfield always leads the string
                            content.insert(0, subfield.getContent())
                        else:
                            content.append(SUBFIELD_DELIMITER +
                                           subfield_key +
                                           subfield.getContent())
                    field_occurrences.append(''.join(content))
                else:
                    raise NotImplementedError(
                        'ISIS-JSON type %s conversion '
                        'not yet implemented for .mst input'
                        % isis_json_type)
            yield fields
    finally:
        # A plain close() after the loop is skipped whenever the consumer
        # stops iterating before the end (e.g. a --qty limit) or an
        # exception propagates; finally covers all three exits.
        mst.close()
| 116 | + |
| 117 | + |
def _with_mfn(msg, record):
    """Append the record's MFN, when it has one, to an error message."""
    if ISIS_MFN_KEY in record:
        msg += ' (mfn=%s)' % record[ISIS_MFN_KEY]
    return msg


def write_json(input_gen, file_name, output, qty, skip, id_tag,
               gen_uuid, mongo, mfn, isis_json_type, prefix, constant):
    """Serialize records from ``input_gen`` to ``output`` as JSON.

    Arguments:
    input_gen -- iterable of record dicts (field tag -> occurrences)
    file_name -- input file name (not used here; kept for callers)
    output -- file-like object with a ``write`` method
    qty -- maximum number of records to write
    skip -- number of leading records to discard
    id_tag -- field tag whose single occurrence becomes ``_id`` (falsy
        to disable); takes precedence over ``gen_uuid`` and ``mfn``
    gen_uuid -- when true, set ``_id`` to a random UUID string
    mongo -- when true, write one JSON document per line with no
        enclosing array; otherwise write a JSON array
    mfn -- when true, set ``_id`` to the record's ``ISIS_MFN_KEY`` value
    isis_json_type -- 1, 2 or 3; tells how to read the id occurrence
    prefix -- string prepended to every all-digit field tag
    constant -- ``'KEY:VALUE'`` pair added to every record (falsy to
        disable); only the first ``:`` separates key from value

    Raises:
    KeyError -- the id tag is missing from a record
    TypeError -- the id tag occurs more than once, or an id is repeated
    ValueError -- ``constant`` has no ``:`` separator
    NotImplementedError -- ``isis_json_type`` is not 1, 2 or 3 while an
        id tag is in use
    """
    start = skip
    end = start + qty
    id_tag = str(id_tag) if id_tag else ''
    seen_ids = set()
    if constant:
        # partition splits on the first ':' only, so values such as URLs
        # survive intact
        constant_key, separator, constant_value = constant.partition(':')
        if not separator:
            raise ValueError('constant must have the form TAG:VALUE, '
                             'got %r' % (constant,))
    if not mongo:
        # opened before the loop so that an empty input (or qty=0) still
        # yields a well-formed "[]"
        output.write('[')
    for i, record in enumerate(input_gen):
        if i >= end:
            break
        if i < start:
            continue
        if not mongo and i > start:
            output.write(',')
        if id_tag:
            occurrences = record.get(id_tag)
            if not occurrences:
                msg = 'id tag #%s not found in record %s' % (id_tag, i)
                raise KeyError(_with_mfn(msg, record))
            if len(occurrences) > 1:
                msg = ('multiple id tags #%s found in record %s'
                       % (id_tag, i))
                raise TypeError(_with_mfn(msg, record))
            # ok, we have one and only one id field
            if isis_json_type == 1:
                record_id = occurrences[0]
            elif isis_json_type == 2:
                # second item of the first pair of the occurrence
                record_id = occurrences[0][0][1]
            elif isis_json_type == 3:
                record_id = occurrences[0]['_']
            else:
                raise NotImplementedError(
                    'ISIS-JSON type %s id extraction not implemented'
                    % isis_json_type)
            if record_id in seen_ids:
                msg = ('duplicate id %s in tag #%s, record %s'
                       % (record_id, id_tag, i))
                raise TypeError(_with_mfn(msg, record))
            record['_id'] = record_id
            seen_ids.add(record_id)
        elif gen_uuid:
            # str() serializes identically and exists on Python 2 and 3
            record['_id'] = str(uuid4())
        elif mfn:
            record['_id'] = record[ISIS_MFN_KEY]
        if prefix:
            # iterate over a snapshot of the tags because the dict is
            # modified inside the loop
            for tag in tuple(record):
                if str(tag).isdigit():
                    record[prefix + tag] = record.pop(tag)
        if constant:
            record[constant_key] = constant_value
        line = json.dumps(record)
        if not isinstance(line, str):
            # a Python 2 unicode result must be encoded before writing
            line = line.encode('utf-8')
        output.write(line)
        output.write('\n')
    if not mongo:
        output.write(']\n')
| 180 | + |
| 181 | + |
def main():
    """Parse the command line and convert the input file to JSON.

    Input whose name ends in ``.mst`` (case-insensitive) is read with
    ``iter_mst_records``; anything else is read with ``iter_iso_records``.
    The result is written to the ``-o/--out`` file, or to standard output
    when none is given.

    Exits with a non-zero status and a message on stderr when
    ``-n/--mfn`` is requested for non-.mst input, or when ``-c/--couch``
    and ``-m/--mongo`` are combined.
    """
    # create the parser
    parser = argparse.ArgumentParser(
        description='Convert an ISIS .mst or .iso file to a JSON array')

    # add the arguments
    parser.add_argument(
        'file_name', metavar='INPUT.(mst|iso)',
        help='.mst or .iso file to read')
    parser.add_argument(
        '-o', '--out', type=argparse.FileType('w'), default=sys.stdout,
        metavar='OUTPUT.json',
        help='the file where the JSON output should be written'
             ' (default: write to stdout)')
    parser.add_argument(
        '-c', '--couch', action='store_true',
        help='output array within a "docs" item in a JSON document'
             ' for bulk insert to CouchDB via POST to db/_bulk_docs')
    parser.add_argument(
        '-m', '--mongo', action='store_true',
        help='output individual records as separate JSON dictionaries,'
             ' one per line for bulk insert to MongoDB via mongoimport utility')
    parser.add_argument(
        '-t', '--type', type=int, metavar='ISIS_JSON_TYPE', default=1,
        help='ISIS-JSON type, sets field structure: 1=string, 2=alist, 3=dict (default=1)')
    parser.add_argument(
        '-q', '--qty', type=int, default=DEFAULT_QTY,
        help='maximum quantity of records to read (default=ALL)')
    parser.add_argument(
        '-s', '--skip', type=int, default=0,
        help='records to skip from start of .mst (default=0)')
    parser.add_argument(
        '-i', '--id', type=int, metavar='TAG_NUMBER', default=0,
        help='generate an "_id" from the given unique TAG field number'
             ' for each record')
    parser.add_argument(
        '-u', '--uuid', action='store_true',
        help='generate an "_id" with a random UUID for each record')
    parser.add_argument(
        '-p', '--prefix', type=str, metavar='PREFIX', default='',
        help='concatenate prefix to every numeric field tag (ex. 99 becomes "v99")')
    parser.add_argument(
        '-n', '--mfn', action='store_true',
        help='generate an "_id" from the MFN of each record'
             ' (available only for .mst input)')
    parser.add_argument(
        '-k', '--constant', type=str, metavar='TAG:VALUE', default='',
        help='Include a constant tag:value in every record (ex. -k type:AS)')

    # TODO: implement a -r/--repeat option (type=int, default=1) to export
    # large quantities of records to CouchDB by saving multiple JSON
    # files (-r 0 would repeat until the end of the input)

    # parse the command line
    args = parser.parse_args()
    if args.couch and args.mongo:
        # the CouchDB wrapper needs an array, which --mongo suppresses;
        # together they would produce a malformed document
        parser.error('-c/--couch and -m/--mongo cannot be combined')
    if args.file_name.lower().endswith('.mst'):
        input_gen_func = iter_mst_records
    else:
        if args.mfn:
            # message goes to stderr and the exit status is 1
            raise SystemExit('UNSUPPORTED: -n/--mfn option only '
                             'available for .mst input.')
        input_gen_func = iter_iso_records
    input_gen = input_gen_func(args.file_name, args.type)
    try:
        if args.couch:
            args.out.write('{ "docs" : ')
        write_json(input_gen, args.file_name, args.out, args.qty, args.skip,
                   args.id, args.uuid, args.mongo, args.mfn, args.type,
                   args.prefix, args.constant)
        if args.couch:
            args.out.write('}\n')
    finally:
        if args.out is sys.stdout:
            # stdout belongs to the process, not to this function
            args.out.flush()
        else:
            args.out.close()
| 256 | + |
| 257 | + |
# Run the command-line converter only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == '__main__':
    main()
| 260 | +# END ISIS2JSON |
0 commit comments