Skip to content

Commit 352b8ca

Browse files
committed
Add bitstreams script
1 parent 2f5f810 commit 352b8ca

File tree

3 files changed

+201
-0
lines changed

3 files changed

+201
-0
lines changed

.gitignore

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,20 @@ $RECYCLE.BIN/
4545
Network Trash Folder
4646
Temporary Items
4747
.apdisk
48+
49+
# Environments
50+
.env
51+
.venv
52+
env/
53+
venv/
54+
ENV/
55+
env.bak/
56+
venv.bak/
57+
58+
# Rope project settings
59+
.ropeproject
60+
61+
# Local
4862
secrets.py
4963
secretsProd.py
5064
*.pyc

data/.keep

Whitespace-only changes.

getBitstreams.py

Lines changed: 187 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,187 @@
1+
# NOTE: this imports the local secrets.py settings file, not the standard-library `secrets` module
2+
import secrets
3+
4+
import json
5+
import requests
6+
import time
7+
import csv
8+
import urllib3
9+
import argparse
10+
11+
# TODO: use main() to remove need to define defaults up here
# Built-in defaults; -rt/--rtimeout and -l/--limit replace them further down.
response_timeout = 1
limit = 100

# begin: argument parsing
# The help texts quote the built-in defaults, so build them before any
# command-line override is applied.
rtimeout_help = "response timeout - number of seconds to wait for a response. not a timeout for a download or run of the entire script. default: " + str(response_timeout)
limit_help = "limit to the number of objects to return in a given request. default = " + str(limit)

parser = argparse.ArgumentParser()
parser.add_argument("-v", "--verbose", action="store_true",
                    help="increase output verbosity")
parser.add_argument("-i", "--handle",
                    help="handle of the object to retreive. optional - if not provided, the script will ask for input")

# bitstream formats:
# REM: set number of args
# '+' == 1 or more.
# '*' == 0 or more.
# '?' == 0 or 1.
# An int is an explicit number of arguments to accept.
parser.add_argument('-f', '--formats', nargs='*',
                    help="optional list of bitstream formats. will return all formats if not provided")
parser.add_argument("-dl", "--download", action="store_true",
                    help="download bitstreams (rather than just retreive metadata about them). default: false")
parser.add_argument("-rt", "--rtimeout", type=int, help=rtimeout_help)
parser.add_argument("-l", "--limit", type=int, help=limit_help)

args = parser.parse_args()

# An option that was not given (or was given as 0) keeps the built-in default.
response_timeout = args.rtimeout or response_timeout
limit = args.limit or limit

# Echo the chosen options back to the user.
if args.verbose:
    print("verbosity turned on")

if args.handle:
    print("retreiving object with handle {}".format(args.handle))

formats_message = ('filtering results to the following bitstream formats: %s' % str(args.formats)
                   if args.formats
                   else 'returning bitstreams of any format')
print(formats_message)

if args.download:
    print("downloading bitstreams")

if args.rtimeout:
    print('response_timeout set to ' + str(response_timeout))

# end: argument parsing
68+
69+
# Silence urllib3's InsecureRequestWarning for the requests made below.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

# An empty answer, or a module name that cannot be imported, keeps the
# `secrets` module imported at the top of the file (reported as Stage).
secretsVersion = raw_input('To edit production server, enter the name of the secrets file: ')
environmentMessage = 'Editing Stage'
if secretsVersion != '':
    try:
        secrets = __import__(secretsVersion)
    except ImportError:
        pass
    else:
        environmentMessage = 'Editing Production'
print(environmentMessage)

# Connection settings, read from whichever secrets module ended up selected.
baseURL, email, password, filePath, verify = (
    secrets.baseURL,
    secrets.email,
    secrets.password,
    secrets.filePath,
    secrets.verify,
)
86+
87+
# Use the handle from the command line when given; otherwise prompt for it.
handle = args.handle if args.handle else raw_input('Enter handle: ')

# The run timer starts once all interactive input has been collected.
startTime = time.time()

# Log in and keep the session cookie for every later request.
header = {'content-type': 'application/json', 'accept': 'application/json'}
data = {'email': email, 'password': password}
loginResponse = requests.post(baseURL+'/rest/login', headers=header, verify=verify, params=data, timeout=response_timeout)
session = loginResponse.cookies['JSESSIONID']
cookies = {'JSESSIONID': session}
headerFileUpload = {'accept': 'application/json'}
cookiesFileUpload = cookies

# Read the session status; a response without 'fullname' raises KeyError here.
statusResponse = requests.get(baseURL+'/rest/status', headers=header, cookies=cookies, verify=verify, timeout=response_timeout)
status = statusResponse.json()
userFullName = status['fullname']
print('authenticated')
102+
103+
# Resolve the handle to a repository object, then write a CSV listing the
# bitstreams of that item, or of every item in that collection, under filePath.
# For an item, --formats filters the rows and --download also saves the files.

# Non-200 responses are retried 5 seconds apart; give up after this many tries
# (about a minute) instead of looping forever on e.g. a 404.
MAX_FETCH_ATTEMPTS = 12


def _csv_cell(value):
    """Return value in a form the Python 2 csv module can write.

    The Python 2 csv module cannot write unicode objects that contain
    non-ASCII characters, so unicode text is encoded as UTF-8 first.
    Any other value is returned unchanged.
    """
    if isinstance(value, unicode):
        return value.encode('utf-8')
    return value


def _get_with_retry(url):
    """GET url with the session settings and return the 200 response.

    A non-200 response is retried after a 5 second pause, up to
    MAX_FETCH_ATTEMPTS attempts in total.

    Raises:
        requests.HTTPError: the last response was a 4xx/5xx error.
        RuntimeError: the last response was any other non-200 status.
    """
    for attempt in range(MAX_FETCH_ATTEMPTS):
        response = requests.get(url, headers=header, cookies=cookies, verify=verify, timeout=response_timeout)
        if response.status_code == 200:
            return response
        if attempt < MAX_FETCH_ATTEMPTS - 1:
            time.sleep(5)
    response.raise_for_status()
    raise RuntimeError('unexpected HTTP status %d from %s' % (response.status_code, url))


# NOTE: expanding bitstreams to get the count, in case this is an item
endpoint = baseURL+'/rest/handle/'+handle+'?expand=bitstreams'
dsObjectResponse = requests.get(endpoint, headers=header, cookies=cookies, verify=verify, timeout=response_timeout)
dsObjectResponse.raise_for_status()  # fail clearly on an unknown handle
dsObject = dsObjectResponse.json()
if args.verbose:
    print(dsObject)
dsObjectID = dsObject['uuid']
# TODO: extend
if dsObject['type'] == 'collection':
    if args.verbose:
        print(dsObject['type'])

    # Page through the collection, keeping each item's REST path together
    # with that item's own handle so the CSV rows pair them correctly.
    itemList = []
    offset = 0
    while True:
        items = _get_with_retry(baseURL+'/rest/collections/'+str(dsObjectID)+'/items?limit='+str(limit)+'&offset='+str(offset)).json()
        if not items:
            break
        for collectionItem in items:
            itemList.append(('/rest/items/'+collectionItem['uuid'], collectionItem['handle']))
        # Advance by what was actually returned so a short page skips nothing.
        offset += len(items)

    with open(filePath+'handlesAndBitstreams.csv', 'wb') as csvFile:
        f = csv.writer(csvFile)
        f.writerow(['bitstream']+['handle'])

        for itemPath, itemHandle in itemList:
            bitstreamsResponse = requests.get(baseURL+itemPath+'/bitstreams', headers=header, cookies=cookies, verify=verify, timeout=response_timeout)
            bitstreamsResponse.raise_for_status()  # ensure we notice bad responses
            for bitstream in bitstreamsResponse.json():
                # str.replace returns a new string; keep the result
                fileName = bitstream['name'].replace('.pdf', '')
                f.writerow([_csv_cell(fileName), _csv_cell(itemHandle)])

elif dsObject['type'] == 'item':
    if args.verbose:
        print(dsObject['type'])

    itemHandle = dsObject['handle']
    bitstreamCount = len(dsObject['bitstreams'])
    dlBitstreams = []
    offset = 0

    with open(filePath+itemHandle.replace('/', '-')+'_bitstreams.csv', 'wb') as csvFile:
        f = csv.writer(csvFile)
        f.writerow(['sequenceId']+['name']+['format']+['bundleName'])

        # Stop once every counted bitstream has been requested, so no
        # limit=0 request is sent and the global limit is left untouched.
        while bitstreamCount > 0:
            # don't retrieve more bitstreams than we have left
            pageSize = min(limit, bitstreamCount)
            print('bitstreamCount: {0} offset: {1} limit: {2}'.format(bitstreamCount, offset, pageSize))
            bitstreamsResponse = requests.get(baseURL+'/rest/items/' + str(dsObjectID) + '/bitstreams?limit=' + str(pageSize) + '&offset='+str(offset), headers=header, cookies=cookies, verify=verify, timeout=response_timeout)
            bitstreamsResponse.raise_for_status()  # ensure we notice bad responses
            bitstreams = bitstreamsResponse.json()
            if not bitstreams:
                # the server has nothing more, whatever the count said
                break
            for bitstream in bitstreams:
                if not args.formats or bitstream['format'] in args.formats:
                    if args.verbose:
                        print(bitstream)
                    sequenceId = str(bitstream['sequenceId'])
                    fileName = bitstream['name']
                    fileFormat = bitstream['format']
                    bundleName = bitstream['bundleName']
                    f.writerow([sequenceId, _csv_cell(fileName), _csv_cell(fileFormat), _csv_cell(bundleName)])

                    if args.download:
                        dlBitstreams.append(bitstream)
            offset += pageSize
            bitstreamCount -= pageSize

    for dlBitstream in dlBitstreams:
        response = requests.get(baseURL + str(dlBitstream['retrieveLink']), headers=header, cookies=cookies, verify=verify, timeout=response_timeout)
        response.raise_for_status()  # ensure we notice bad responses
        with open(filePath + dlBitstream['name'], "wb") as bitstreamFile:
            bitstreamFile.write(response.content)
else:
    print('object is of an invalid type for this script (' + dsObject['type'] + '). please enter the handle of an item or a collection.')
180+
181+
182+
# End the REST session that was opened at login.
logout = requests.post(baseURL+'/rest/logout', headers=header, cookies=cookies, verify=verify, timeout=response_timeout)

# Report the wall-clock run time since startTime as H:MM:SS.
elapsedTime = time.time() - startTime
totalMinutes, s = divmod(elapsedTime, 60)
h, m = divmod(totalMinutes, 60)
runTime = '%d:%02d:%02d' % (h, m, s)
print(' '.join(['Total script run time: ', runTime]))

0 commit comments

Comments
 (0)