Merge pull request #2 from dheles/bitstreams

ehanson8 · web-flow · commit 2be9b58880ff · 2018-06-13T13:18:11.000-04:00
Add get bitstreams script
diff --git a/.gitignore b/.gitignore
@@ -45,6 +45,20 @@ $RECYCLE.BIN/
 Network Trash Folder
 Temporary Items
 .apdisk
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Rope project settings
+.ropeproject
+
+# Local
 secrets.py
 secretsProd.py
 *.pyc
diff --git a/data/.keep b/data/.keep
diff --git a/getBitstreams.py b/getBitstreams.py
@@ -0,0 +1,224 @@
+import json
+import requests
+import time
+import csv
+import urllib3
+import argparse
+import os
+import re
+
+
+def main():
+    # NOTE: this is the secrets file, not a module
+    import secrets
+
+    # define defaults
+    default_response_timeout = 1
+    default_limit = 100
+
+    # define globals for requests, so we needn't pass too many arguments to our functions
+    global header
+    global cookies
+
+    # begin: argument parsing
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument('-v', '--verbose', action='store_true',
+                        help='increase output verbosity')
+
+    parser.add_argument('-i', '--handle',
+                        help='handle of the object to retreive. optional - if not provided, the script will ask for input')
+
+    # bitstream formats:
+    # REM: set number of args
+    # '+' == 1 or more.
+    # '*' == 0 or more.
+    # '?' == 0 or 1.
+    # An int is an explicit number of arguments to accept.
+    parser.add_argument('-f', '--formats', nargs='*',
+                        help='optional list of bitstream formats. will return all formats if not provided')
+
+    parser.add_argument('-b', '--bundles', nargs='*',
+                        help='optional list of bundles (e.g. ORIGINAL or LICENSE). will return all bundles if not provided')
+
+    parser.add_argument('-dl', '--download', action='store_true',
+                        help='download bitstreams (rather than just retreive metadata about them). default: false')
+
+    parser.add_argument('-rt', '--rtimeout', type=int,
+                        help='response timeout - number of seconds to wait for a response. not a timeout for a download or run of the entire script. default: ' + str(default_response_timeout))
+
+    parser.add_argument('-l', '--limit', type=int,
+                        help='limit to the number of objects to return in a given request. default: ' + str(default_limit))
+
+    parser.add_argument('-u', '--baseURL',
+                        help='url of the dspace instance. can be read from the secrets file')
+
+    parser.add_argument('-e', '--email',
+                        help='email of an authorized dspace user. can be read from the secrets file')
+
+    parser.add_argument('-p', '--password',
+                        help='password of an authorized dspace user. can be read from the secrets file')
+
+    parser.add_argument('-d', '--filePath',
+                        help='directory into which output files will be written. can be read from the secrets file')
+
+    parser.add_argument('-s', '--verify',
+                        help='ssl verification enabled (boolean) OR the path to a CA_BUNDLE file or directory with certificates of trusted CAs. use false if using an ssh tunnel to connect to the dspace api. can be read from the secrets file')
+
+    args = parser.parse_args()
+
+    secretsVersion = raw_input('To edit production server, enter the name of the secrets file: ')
+    if secretsVersion != '':
+        try:
+            secrets = __import__(secretsVersion)
+            print('Accessing Production')
+        except ImportError:
+            print('Accessing Stage')
+    else:
+        print('Accessing Stage')
+
+    if not args.rtimeout:
+        args.rtimeout = default_response_timeout
+
+    if not args.limit:
+        args.limit = default_limit
+
+    if not args.baseURL:
+        args.baseURL = secrets.baseURL
+
+    if not args.email:
+        args.email = secrets.email
+
+    if not args.password:
+        args.password = secrets.password
+
+    if not args.filePath:
+        args.filePath = secrets.filePath
+
+    if not args.verify:
+        args.verify = secrets.verify
+
+    if args.handle:
+        handle = args.handle
+    else:
+        handle = raw_input('Enter handle: ')
+
+    if args.verbose:
+        print('verbosity turned on')
+
+        if args.handle:
+            print('retreiving object with handle {}').format(args.handle)
+
+        if args.formats:
+            print('filtering results to the following bitstream formats: {}').format(args.formats)
+        else:
+            print('returning bitstreams of any format')
+
+        if args.bundles:
+            print('filtering results to the following bundles: {}').format(args.bundles)
+        else:
+            print('returning bitstreams from any bundle')
+
+        if args.download:
+            print('downloading bitstreams')
+
+        if args.rtimeout:
+            print('response_timeout set to {}').format(args.rtimeout)
+
+    # end: argument parsing
+
+    urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
+
+    startTime = time.time()
+    data = {'email': args.email, 'password': args.password}
+    header = {'content-type': 'application/json', 'accept': 'application/json'}
+    session = requests.post(args.baseURL+'/rest/login', headers=header, verify=args.verify, params=data, timeout=args.rtimeout).cookies['JSESSIONID']
+    cookies = {'JSESSIONID': session}
+    print 'authenticated'
+
+    # NOTE: expanding items (of collections) and bitstreams (of items) to get the count
+    endpoint = args.baseURL+'/rest/handle/'+handle+'?expand=items,bitstreams'
+    dsObject = requests.get(endpoint, headers=header, cookies=cookies, verify=args.verify, timeout=args.rtimeout)
+    dsObject.raise_for_status()  # ensure we notice bad responses
+    dsObject = dsObject.json()
+    if args.verbose: print dsObject
+    dsObjectID = dsObject['uuid']
+    # TODO: extend
+    if dsObject['type'] == 'collection':
+        if args.verbose: print dsObject['type']
+
+        itemCount = len(dsObject['items'])
+        print('{} items').format(itemCount)
+        for collItem in dsObject['items']:
+            endpoint = args.baseURL + collItem['link'] + '?expand=bitstreams'
+            item = requests.get(endpoint, headers=header, cookies=cookies, verify=args.verify, timeout=args.rtimeout)
+            item.raise_for_status()  # ensure we notice bad responses
+            item = item.json()
+            processItem(item, args)
+
+    elif dsObject['type'] == 'item':
+        processItem(dsObject, args)
+
+    else:
+        print('object is of an invalid type for this script ({}). please enter the handle of an item or a collection.').format(dsObject['type'])
+
+    logout = requests.post(args.baseURL+'/rest/logout', headers=header, cookies=cookies, verify=args.verify, timeout=args.rtimeout)
+
+    elapsedTime = time.time() - startTime
+    m, s = divmod(elapsedTime, 60)
+    h, m = divmod(m, 60)
+    print('Total script run time: {:01.0f}:{:02.0f}:{:02.0f}').format(h, m, s)
+
+
+def processItem(dsObject, args):
+    if args.verbose: print(dsObject['type'])
+
+    itemHandle = dsObject['handle']
+    handleID = re.sub(r'.*\/', '', itemHandle)
+    itemPath = args.filePath + '/' + handleID + '/'
+    if not os.path.exists(itemPath):
+        os.makedirs(itemPath)
+
+    f = csv.writer(open(itemPath + handleID + '_bitstreams.csv', 'wb'))
+    f.writerow(['sequenceId']+['name']+['format']+['bundleName'])
+
+    itemID = dsObject['uuid']
+    bitstreamCount = len(dsObject['bitstreams'])
+    dlBitstreams = []
+    offset = 0
+    limit = args.limit
+    bitstreams = ''
+    # while bitstreams != []:
+    while bitstreamCount > 0:
+        # don't retreive more bitstreams than we have left
+        if limit > bitstreamCount:
+            limit = bitstreamCount
+        print('bitstreamCount: {0} offset: {1} limit: {2}').format(bitstreamCount, offset, limit)
+        bitstreams = requests.get(args.baseURL+'/rest/items/' + str(itemID) + '/bitstreams?limit=' + str(limit) + '&offset='+str(offset), headers=header, cookies=cookies, verify=args.verify, timeout=args.rtimeout)
+        bitstreams.raise_for_status()  # ensure we notice bad responses
+        bitstreams = bitstreams.json()
+        for bitstream in bitstreams:
+            if (args.formats and bitstream['format'] in args.formats or not args.formats
+                    and args.bundles and bitstream['bundleName'] in args.bundles or not args.bundles):
+                if args.verbose: print(bitstream)
+                sequenceId = str(bitstream['sequenceId'])
+                fileName = bitstream['name']
+                fileFormat = bitstream['format']
+                bundleName = bitstream['bundleName']
+                f.writerow([sequenceId]+[fileName]+[fileFormat]+[bundleName])
+
+                if args.download:
+                    dlBitstreams.append(bitstream)
+        offset += limit
+        bitstreamCount -= limit
+
+    for dlBitstream in dlBitstreams:
+        if not os.path.isfile(itemPath + dlBitstream['name']):
+            response = requests.get(args.baseURL + str(dlBitstream['retrieveLink']), headers=header, cookies=cookies, verify=args.verify, timeout=args.rtimeout)
+            response.raise_for_status()  # ensure we notice bad responses
+            file = open(itemPath + dlBitstream['name'], 'wb')
+            file.write(response.content)
+            file.close()
+
+
+if __name__ == "__main__": main()