|
| 1 | +import json |
| 2 | +import requests |
| 3 | +import time |
| 4 | +import csv |
| 5 | +import urllib3 |
| 6 | +import argparse |
| 7 | +import os |
| 8 | +import re |
| 9 | + |
| 10 | + |
def main():
    """Export the bitstreams of a DSpace item or collection via the REST API.

    Parses the command-line options, fills every unset connection option
    from a secrets module, logs in at ``/rest/login``, looks the handle up
    at ``/rest/handle/<handle>`` and calls ``processItem`` for the item itself
    or for each item of a collection. The session is logged out afterwards
    and the total run time is printed.

    Side effects: sets the module-level globals ``header`` and ``cookies``
    that ``processItem`` reads for its own requests.

    Raises:
        requests.HTTPError: when the login or an object request is answered
            with an error status.
    """
    # NOTE: this is the local secrets.py settings file, not a library module
    import secrets

    # raw_input only exists on Python 2; fall back to input on Python 3
    try:
        read_input = raw_input
    except NameError:
        read_input = input

    # define defaults
    default_response_timeout = 1
    default_limit = 100

    # define globals for requests, so we needn't pass too many arguments to our functions
    global header
    global cookies

    # begin: argument parsing
    parser = argparse.ArgumentParser()

    parser.add_argument('-v', '--verbose', action='store_true',
                        help='increase output verbosity')

    parser.add_argument('-i', '--handle',
                        help='handle of the object to retreive. optional - if not provided, the script will ask for input')

    # bitstream formats:
    # REM: set number of args
    # '+' == 1 or more.
    # '*' == 0 or more.
    # '?' == 0 or 1.
    # An int is an explicit number of arguments to accept.
    parser.add_argument('-f', '--formats', nargs='*',
                        help='optional list of bitstream formats. will return all formats if not provided')

    parser.add_argument('-b', '--bundles', nargs='*',
                        help='optional list of bundles (e.g. ORIGINAL or LICENSE). will return all bundles if not provided')

    parser.add_argument('-dl', '--download', action='store_true',
                        help='download bitstreams (rather than just retreive metadata about them). default: false')

    parser.add_argument('-rt', '--rtimeout', type=int,
                        help='response timeout - number of seconds to wait for a response. not a timeout for a download or run of the entire script. default: ' + str(default_response_timeout))

    parser.add_argument('-l', '--limit', type=int,
                        help='limit to the number of objects to return in a given request. default: ' + str(default_limit))

    parser.add_argument('-u', '--baseURL',
                        help='url of the dspace instance. can be read from the secrets file')

    parser.add_argument('-e', '--email',
                        help='email of an authorized dspace user. can be read from the secrets file')

    parser.add_argument('-p', '--password',
                        help='password of an authorized dspace user. can be read from the secrets file')

    parser.add_argument('-d', '--filePath',
                        help='directory into which output files will be written. can be read from the secrets file')

    parser.add_argument('-s', '--verify',
                        help='ssl verification enabled (boolean) OR the path to a CA_BUNDLE file or directory with certificates of trusted CAs. use false if using an ssh tunnel to connect to the dspace api. can be read from the secrets file')

    args = parser.parse_args()

    # An alternative secrets module may be named interactively; if it cannot
    # be imported, the default secrets module imported above stays in use.
    secretsVersion = read_input('To edit production server, enter the name of the secrets file: ')
    if secretsVersion != '':
        try:
            secrets = __import__(secretsVersion)
            print('Accessing Production')
        except ImportError:
            print('Accessing Stage')
    else:
        print('Accessing Stage')

    if not args.rtimeout:
        args.rtimeout = default_response_timeout

    if not args.limit:
        args.limit = default_limit

    if not args.baseURL:
        args.baseURL = secrets.baseURL

    if not args.email:
        args.email = secrets.email

    if not args.password:
        args.password = secrets.password

    if not args.filePath:
        args.filePath = secrets.filePath

    if not args.verify:
        args.verify = secrets.verify

    # argparse delivers --verify as text, and any non-empty string (including
    # 'false') would be passed to requests as a CA bundle path. Turn the
    # boolean spellings into real booleans; other strings remain paths.
    if hasattr(args.verify, 'lower') and args.verify.strip().lower() in ('true', 'false'):
        args.verify = args.verify.strip().lower() == 'true'

    if args.handle:
        handle = args.handle
    else:
        handle = read_input('Enter handle: ')

    # NOTE(review): the reviewed copy of this file had lost its indentation;
    # the option summaries below are assumed to be verbose-only - confirm.
    if args.verbose:
        print('verbosity turned on')

        if args.handle:
            print('retreiving object with handle {}'.format(args.handle))

        if args.formats:
            print('filtering results to the following bitstream formats: {}'.format(args.formats))
        else:
            print('returning bitstreams of any format')

        if args.bundles:
            print('filtering results to the following bundles: {}'.format(args.bundles))
        else:
            print('returning bitstreams from any bundle')

        if args.download:
            print('downloading bitstreams')

        if args.rtimeout:
            print('response_timeout set to {}'.format(args.rtimeout))

    # end: argument parsing

    urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

    startTime = time.time()
    data = {'email': args.email, 'password': args.password}
    header = {'content-type': 'application/json', 'accept': 'application/json'}
    login = requests.post(args.baseURL + '/rest/login', headers=header, verify=args.verify,
                          params=data, timeout=args.rtimeout)
    login.raise_for_status()  # fail with the HTTP error rather than a missing-cookie KeyError
    cookies = {'JSESSIONID': login.cookies['JSESSIONID']}
    print('authenticated')

    try:
        # NOTE: expanding items (of collections) and bitstreams (of items) to get the count
        endpoint = args.baseURL + '/rest/handle/' + handle + '?expand=items,bitstreams'
        dsResponse = requests.get(endpoint, headers=header, cookies=cookies,
                                  verify=args.verify, timeout=args.rtimeout)
        dsResponse.raise_for_status()  # ensure we notice bad responses
        dsObject = dsResponse.json()
        if args.verbose:
            print(dsObject)

        # TODO: extend
        if dsObject['type'] == 'collection':
            if args.verbose:
                print(dsObject['type'])

            itemCount = len(dsObject['items'])
            print('{} items'.format(itemCount))
            for collItem in dsObject['items']:
                endpoint = args.baseURL + collItem['link'] + '?expand=bitstreams'
                itemResponse = requests.get(endpoint, headers=header, cookies=cookies,
                                            verify=args.verify, timeout=args.rtimeout)
                itemResponse.raise_for_status()  # ensure we notice bad responses
                processItem(itemResponse.json(), args)

        elif dsObject['type'] == 'item':
            processItem(dsObject, args)

        else:
            print('object is of an invalid type for this script ({}). please enter the handle of an item or a collection.'.format(dsObject['type']))
    finally:
        # end the session even when one of the requests above failed
        requests.post(args.baseURL + '/rest/logout', headers=header, cookies=cookies,
                      verify=args.verify, timeout=args.rtimeout)

    # round to whole seconds first so the seconds field can never show ':60'
    elapsedTime = int(round(time.time() - startTime))
    m, s = divmod(elapsedTime, 60)
    h, m = divmod(m, 60)
    print('Total script run time: {:01.0f}:{:02.0f}:{:02.0f}'.format(h, m, s))
| 171 | + |
| 172 | + |
def processItem(dsObject, args):
    """Write a CSV listing of one item's bitstreams and optionally download them.

    A directory named after the last segment of the item's handle is created
    under ``args.filePath``. It receives ``<id>_bitstreams.csv`` with one row
    (sequenceId, name, format, bundleName) per bitstream that passes the
    format and bundle filters. With ``args.download`` set, each listed
    bitstream is also saved into that directory unless a file of the same
    name is already there.

    Args:
        dsObject: item dict; the keys 'type', 'handle', 'uuid' and
            'bitstreams' are read.
        args: parsed options; verbose, filePath, limit, formats, bundles,
            download, baseURL, verify and rtimeout are read.

    Relies on the module-level globals ``header`` and ``cookies`` for the
    HTTP requests.

    Raises:
        requests.HTTPError: when a listing or download request is answered
            with an error status.
    """
    import sys  # local import: only used to choose the csv file mode below

    if args.verbose:
        print(dsObject['type'])

    itemHandle = dsObject['handle']
    handleID = re.sub(r'.*\/', '', itemHandle)
    itemPath = args.filePath + '/' + handleID + '/'
    if not os.path.exists(itemPath):
        os.makedirs(itemPath)

    # The csv module wants a binary file on Python 2 and a text file opened
    # with newline='' on Python 3.
    csvPath = itemPath + handleID + '_bitstreams.csv'
    if sys.version_info[0] < 3:
        csvFile = open(csvPath, 'wb')
    else:
        csvFile = open(csvPath, 'w', newline='')

    itemID = dsObject['uuid']
    bitstreamCount = len(dsObject['bitstreams'])
    dlBitstreams = []
    offset = 0
    limit = args.limit

    with csvFile:
        f = csv.writer(csvFile)
        f.writerow(['sequenceId', 'name', 'format', 'bundleName'])

        while bitstreamCount > 0:
            # don't retrieve more bitstreams than we have left
            if limit > bitstreamCount:
                limit = bitstreamCount
            print('bitstreamCount: {0} offset: {1} limit: {2}'.format(bitstreamCount, offset, limit))
            response = requests.get(args.baseURL + '/rest/items/' + str(itemID) + '/bitstreams?limit=' + str(limit) + '&offset=' + str(offset),
                                    headers=header, cookies=cookies, verify=args.verify, timeout=args.rtimeout)
            response.raise_for_status()  # ensure we notice bad responses
            for bitstream in response.json():
                # Each filter is applied only when it was given, and both
                # must pass. The earlier single and/or chain accepted every
                # bitstream whenever no bundles were requested.
                formatWanted = not args.formats or bitstream['format'] in args.formats
                bundleWanted = not args.bundles or bitstream['bundleName'] in args.bundles
                if not (formatWanted and bundleWanted):
                    continue

                if args.verbose:
                    print(bitstream)
                f.writerow([str(bitstream['sequenceId']), bitstream['name'],
                            bitstream['format'], bitstream['bundleName']])

                if args.download:
                    dlBitstreams.append(bitstream)
            offset += limit
            bitstreamCount -= limit

    for dlBitstream in dlBitstreams:
        targetPath = itemPath + dlBitstream['name']
        # files that already exist are left untouched
        if os.path.isfile(targetPath):
            continue
        response = requests.get(args.baseURL + str(dlBitstream['retrieveLink']),
                                headers=header, cookies=cookies, verify=args.verify, timeout=args.rtimeout)
        response.raise_for_status()  # ensure we notice bad responses
        with open(targetPath, 'wb') as outFile:
            outFile.write(response.content)
| 222 | + |
| 223 | + |
# Run main() only when the file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
0 commit comments