Skip to content

Commit 2be9b58

Browse files
authored
Merge pull request #2 from dheles/bitstreams
Add get bitstreams script
2 parents 55a52b7 + 3ee5867 commit 2be9b58

File tree

3 files changed

+238
-0
lines changed

3 files changed

+238
-0
lines changed

.gitignore

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,20 @@ $RECYCLE.BIN/
4545
Network Trash Folder
4646
Temporary Items
4747
.apdisk
48+
49+
# Environments
50+
.env
51+
.venv
52+
env/
53+
venv/
54+
ENV/
55+
env.bak/
56+
venv.bak/
57+
58+
# Rope project settings
59+
.ropeproject
60+
61+
# Local
4862
secrets.py
4963
secretsProd.py
5064
*.pyc

data/.keep

Whitespace-only changes.

getBitstreams.py

Lines changed: 224 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,224 @@
1+
import json
2+
import requests
3+
import time
4+
import csv
5+
import urllib3
6+
import argparse
7+
import os
8+
import re
9+
10+
11+
def _parse_verify(value):
    """Turn the textual ``--verify`` setting into what ``requests`` expects.

    The command line always delivers a string, so ``-s false`` would
    otherwise be handed to ``requests`` as a (non-existent) CA bundle path.
    The strings ``true`` / ``false`` (any case) become booleans; every other
    value (a real boolean from the secrets file, or a path) is returned
    unchanged.
    """
    if isinstance(value, str):
        lowered = value.strip().lower()
        if lowered == 'true':
            return True
        if lowered == 'false':
            return False
    return value


def main():
    """Fetch bitstream metadata (and optionally files) for a DSpace handle.

    Parses the command line, fills missing connection settings from the
    secrets file, logs in to the REST API, looks the handle up and hands the
    item (or every item embedded in a collection) to ``processItem``.
    The session is logged out again even when processing fails.

    Raises:
        requests.exceptions.HTTPError: when login or a lookup returns an
            error status.
    """
    # NOTE: this is the secrets file, not a module
    import secrets

    # raw_input only exists on Python 2; input is the Python 3 equivalent
    try:
        read_input = raw_input
    except NameError:
        read_input = input

    # define defaults
    default_response_timeout = 1
    default_limit = 100

    # define globals for requests, so we needn't pass too many arguments to our functions
    global header
    global cookies

    # begin: argument parsing
    parser = argparse.ArgumentParser()

    parser.add_argument('-v', '--verbose', action='store_true',
                        help='increase output verbosity')

    parser.add_argument('-i', '--handle',
                        help='handle of the object to retreive. optional - if not provided, the script will ask for input')

    # bitstream formats:
    # REM: set number of args
    # '+' == 1 or more.
    # '*' == 0 or more.
    # '?' == 0 or 1.
    # An int is an explicit number of arguments to accept.
    parser.add_argument('-f', '--formats', nargs='*',
                        help='optional list of bitstream formats. will return all formats if not provided')

    parser.add_argument('-b', '--bundles', nargs='*',
                        help='optional list of bundles (e.g. ORIGINAL or LICENSE). will return all bundles if not provided')

    parser.add_argument('-dl', '--download', action='store_true',
                        help='download bitstreams (rather than just retreive metadata about them). default: false')

    parser.add_argument('-rt', '--rtimeout', type=int,
                        help='response timeout - number of seconds to wait for a response. not a timeout for a download or run of the entire script. default: ' + str(default_response_timeout))

    parser.add_argument('-l', '--limit', type=int,
                        help='limit to the number of objects to return in a given request. default: ' + str(default_limit))

    parser.add_argument('-u', '--baseURL',
                        help='url of the dspace instance. can be read from the secrets file')

    parser.add_argument('-e', '--email',
                        help='email of an authorized dspace user. can be read from the secrets file')

    parser.add_argument('-p', '--password',
                        help='password of an authorized dspace user. can be read from the secrets file')

    parser.add_argument('-d', '--filePath',
                        help='directory into which output files will be written. can be read from the secrets file')

    parser.add_argument('-s', '--verify',
                        help='ssl verification enabled (boolean) OR the path to a CA_BUNDLE file or directory with certificates of trusted CAs. use false if using an ssh tunnel to connect to the dspace api. can be read from the secrets file')

    args = parser.parse_args()

    # an alternative secrets module may be named interactively; when it
    # cannot be imported the default secrets file stays in effect
    secretsVersion = read_input('To edit production server, enter the name of the secrets file: ')
    if secretsVersion != '':
        try:
            secrets = __import__(secretsVersion)
            print('Accessing Production')
        except ImportError:
            print('Accessing Stage')
    else:
        print('Accessing Stage')

    if not args.rtimeout:
        args.rtimeout = default_response_timeout

    if not args.limit:
        args.limit = default_limit

    if not args.baseURL:
        args.baseURL = secrets.baseURL

    if not args.email:
        args.email = secrets.email

    if not args.password:
        args.password = secrets.password

    if not args.filePath:
        args.filePath = secrets.filePath

    if not args.verify:
        args.verify = secrets.verify
    # '-s false' arrives as text; requests needs a real boolean
    args.verify = _parse_verify(args.verify)

    if args.handle:
        handle = args.handle
    else:
        # stray whitespace from the prompt would end up inside the URL
        handle = read_input('Enter handle: ').strip()

    if args.verbose:
        print('verbosity turned on')

        if args.handle:
            print('retreiving object with handle {}'.format(args.handle))

        if args.formats:
            print('filtering results to the following bitstream formats: {}'.format(args.formats))
        else:
            print('returning bitstreams of any format')

        if args.bundles:
            print('filtering results to the following bundles: {}'.format(args.bundles))
        else:
            print('returning bitstreams from any bundle')

        if args.download:
            print('downloading bitstreams')

        if args.rtimeout:
            print('response_timeout set to {}'.format(args.rtimeout))

    # end: argument parsing

    urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

    startTime = time.time()
    data = {'email': args.email, 'password': args.password}
    header = {'content-type': 'application/json', 'accept': 'application/json'}
    login = requests.post(args.baseURL+'/rest/login', headers=header, verify=args.verify, params=data, timeout=args.rtimeout)
    login.raise_for_status()  # a rejected login should not surface as a KeyError on the cookie
    cookies = {'JSESSIONID': login.cookies['JSESSIONID']}
    print('authenticated')

    try:
        # NOTE: expanding items (of collections) and bitstreams (of items) to get the count
        endpoint = args.baseURL+'/rest/handle/'+handle+'?expand=items,bitstreams'
        dsObject = requests.get(endpoint, headers=header, cookies=cookies, verify=args.verify, timeout=args.rtimeout)
        dsObject.raise_for_status()  # ensure we notice bad responses
        dsObject = dsObject.json()
        if args.verbose:
            print(dsObject)

        # TODO: extend
        # NOTE(review): only the items embedded in this response are
        # processed; whether the server caps that list is not visible here
        if dsObject['type'] == 'collection':
            if args.verbose:
                print(dsObject['type'])

            itemCount = len(dsObject['items'])
            print('{} items'.format(itemCount))
            for collItem in dsObject['items']:
                endpoint = args.baseURL + collItem['link'] + '?expand=bitstreams'
                item = requests.get(endpoint, headers=header, cookies=cookies, verify=args.verify, timeout=args.rtimeout)
                item.raise_for_status()  # ensure we notice bad responses
                item = item.json()
                processItem(item, args)

        elif dsObject['type'] == 'item':
            processItem(dsObject, args)

        else:
            print('object is of an invalid type for this script ({}). please enter the handle of an item or a collection.'.format(dsObject['type']))
    finally:
        # always release the server-side session, even after a failure
        requests.post(args.baseURL+'/rest/logout', headers=header, cookies=cookies, verify=args.verify, timeout=args.rtimeout)

    elapsedTime = time.time() - startTime
    m, s = divmod(elapsedTime, 60)
    h, m = divmod(m, 60)
    print('Total script run time: {:01.0f}:{:02.0f}:{:02.0f}'.format(h, m, s))
171+
172+
173+
def _open_csv_for_writing(path):
    """Open *path* in the mode the csv module needs on the running Python."""
    import sys
    if sys.version_info[0] >= 3:
        # Python 3: text mode, with newline='' so csv controls line endings
        return open(path, 'w', newline='', encoding='utf-8')
    # Python 2: the csv module writes bytes
    return open(path, 'wb')


def _bitstream_wanted(bitstream, formats, bundles):
    """Return True when *bitstream* passes both optional filters.

    An empty or missing filter accepts everything; when both filters are
    given the bitstream must satisfy both.
    """
    format_ok = not formats or bitstream['format'] in formats
    bundle_ok = not bundles or bitstream['bundleName'] in bundles
    return format_ok and bundle_ok


def processItem(dsObject, args):
    """Write a CSV of an item's bitstreams and optionally download them.

    Creates ``<args.filePath>/<handle suffix>/`` and writes
    ``<handle suffix>_bitstreams.csv`` there with one row per bitstream that
    passes the ``args.formats`` / ``args.bundles`` filters. Bitstreams are
    fetched page by page (``args.limit`` per request) using the module-level
    ``header`` and ``cookies``. With ``args.download`` set, each selected
    bitstream not already present in the directory is downloaded.

    Args:
        dsObject: parsed item JSON; read keys are ``type``, ``handle``,
            ``uuid`` and ``bitstreams``.
        args: parsed command-line namespace (``verbose``, ``filePath``,
            ``limit``, ``formats``, ``bundles``, ``download``, ``baseURL``,
            ``verify``, ``rtimeout``).

    Raises:
        ValueError: if ``args.limit`` is smaller than 1.
        requests.exceptions.HTTPError: when a request returns an error status.
    """
    if args.verbose:
        print(dsObject['type'])

    limit = args.limit
    if limit < 1:
        # a non-positive page size would make the paging loop never finish
        raise ValueError('limit must be at least 1, got {}'.format(limit))

    itemHandle = dsObject['handle']
    # keep only the part of the handle after the last slash
    handleID = itemHandle.rsplit('/', 1)[-1]
    itemPath = os.path.join(args.filePath, handleID)
    if not os.path.exists(itemPath):
        os.makedirs(itemPath)

    itemID = dsObject['uuid']
    bitstreamCount = len(dsObject['bitstreams'])
    dlBitstreams = []
    offset = 0

    csvFile = _open_csv_for_writing(os.path.join(itemPath, handleID + '_bitstreams.csv'))
    try:
        writer = csv.writer(csvFile)
        writer.writerow(['sequenceId', 'name', 'format', 'bundleName'])

        while bitstreamCount > 0:
            # don't retreive more bitstreams than we have left
            if limit > bitstreamCount:
                limit = bitstreamCount
            print('bitstreamCount: {0} offset: {1} limit: {2}'.format(bitstreamCount, offset, limit))
            bitstreams = requests.get(args.baseURL+'/rest/items/' + str(itemID) + '/bitstreams?limit=' + str(limit) + '&offset='+str(offset), headers=header, cookies=cookies, verify=args.verify, timeout=args.rtimeout)
            bitstreams.raise_for_status()  # ensure we notice bad responses
            for bitstream in bitstreams.json():
                if not _bitstream_wanted(bitstream, args.formats, args.bundles):
                    continue
                if args.verbose:
                    print(bitstream)
                writer.writerow([
                    str(bitstream['sequenceId']),
                    bitstream['name'],
                    bitstream['format'],
                    bitstream['bundleName'],
                ])
                if args.download:
                    dlBitstreams.append(bitstream)
            offset += limit
            bitstreamCount -= limit
    finally:
        # make sure the rows reach disk and the handle is released
        csvFile.close()

    for dlBitstream in dlBitstreams:
        # the name comes from the server; drop any directory part so the
        # file cannot be written outside the item directory
        target = os.path.join(itemPath, os.path.basename(dlBitstream['name']))
        if os.path.isfile(target):
            continue
        response = requests.get(args.baseURL + str(dlBitstream['retrieveLink']), headers=header, cookies=cookies, verify=args.verify, timeout=args.rtimeout)
        response.raise_for_status()  # ensure we notice bad responses
        with open(target, 'wb') as outFile:
            outFile.write(response.content)
222+
223+
224+
# run the script only when executed directly, not when imported
if __name__ == "__main__":
    main()

0 commit comments

Comments
 (0)