Skip to content

Commit a0257b8

Browse files
committed
Process collections
1 parent 8a5b779 commit a0257b8

File tree

1 file changed

+122
-108
lines changed

1 file changed

+122
-108
lines changed

getBitstreams.py

Lines changed: 122 additions & 108 deletions
Original file line numberDiff line numberDiff line change
@@ -13,8 +13,12 @@ def main():
1313
import secrets
1414

1515
# define defaults
16-
response_timeout = 1
17-
limit = 100
16+
default_response_timeout = 1
17+
default_limit = 100
18+
19+
# define globals for requests, so we needn't pass too many arguments to our functions
20+
global header
21+
global cookies
1822

1923
# begin: argument parsing
2024
parser = argparse.ArgumentParser()
@@ -38,18 +42,63 @@ def main():
3842
help='download bitstreams (rather than just retreive metadata about them). default: false')
3943

4044
parser.add_argument('-rt', '--rtimeout', type=int,
41-
help='response timeout - number of seconds to wait for a response. not a timeout for a download or run of the entire script. default: ' + str(response_timeout))
45+
help='response timeout - number of seconds to wait for a response. not a timeout for a download or run of the entire script. default: ' + str(default_response_timeout))
4246

4347
parser.add_argument('-l', '--limit', type=int,
44-
help='limit to the number of objects to return in a given request. default: ' + str(limit))
48+
help='limit to the number of objects to return in a given request. default: ' + str(default_limit))
49+
50+
parser.add_argument('-u', '--baseURL',
51+
help='url of the dspace instance. can be read from the secrets file')
52+
53+
parser.add_argument('-e', '--email',
54+
help='email of an authorized dspace user. can be read from the secrets file')
55+
56+
parser.add_argument('-p', '--password',
57+
help='password of an authorized dspace user. can be read from the secrets file')
58+
59+
parser.add_argument('-d', '--filePath',
60+
help='directory into which output files will be written. can be read from the secrets file')
61+
62+
parser.add_argument('-s', '--verify',
63+
help='ssl verification enabled (boolean) OR the path to a CA_BUNDLE file or directory with certificates of trusted CAs. use false if using an ssh tunnel to connect to the dspace api. can be read from the secrets file')
4564

4665
args = parser.parse_args()
4766

67+
secretsVersion = raw_input('To edit production server, enter the name of the secrets file: ')
68+
if secretsVersion != '':
69+
try:
70+
secrets = __import__(secretsVersion)
71+
print('Accessing Production')
72+
except ImportError:
73+
print('Accessing Stage')
74+
else:
75+
print('Accessing Stage')
76+
4877
if args.rtimeout:
49-
response_timeout = args.rtimeout
78+
args.rtimeout = default_response_timeout
5079

5180
if args.limit:
52-
limit = args.limit
81+
args.limit = default_limit
82+
83+
if not args.baseURL:
84+
args.baseURL = secrets.baseURL
85+
86+
if not args.email:
87+
args.email = secrets.email
88+
89+
if not args.password:
90+
args.password = secrets.password
91+
92+
if not args.filePath:
93+
args.filePath = secrets.filePath
94+
95+
if not args.verify:
96+
args.verify = secrets.verify
97+
98+
if args.handle:
99+
handle = args.handle
100+
else:
101+
handle = raw_input('Enter handle: ')
53102

54103
if args.verbose:
55104
print('verbosity turned on')
@@ -66,44 +115,22 @@ def main():
66115
print('downloading bitstreams')
67116

68117
if args.rtimeout:
69-
print('response_timeout set to {}').format(response_timeout)
118+
print('response_timeout set to {}').format(args.rtimeout)
70119

71120
# end: argument parsing
72121

73122
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
74123

75-
secretsVersion = raw_input('To edit production server, enter the name of the secrets file: ')
76-
if secretsVersion != '':
77-
try:
78-
secrets = __import__(secretsVersion)
79-
print('Accessing Production')
80-
except ImportError:
81-
print('Accessing Stage')
82-
else:
83-
print('Accessing Stage')
84-
85-
baseURL = secrets.baseURL
86-
email = secrets.email
87-
password = secrets.password
88-
filePath = secrets.filePath
89-
verify = secrets.verify
90-
91-
if args.handle:
92-
handle = args.handle
93-
else:
94-
handle = raw_input('Enter handle: ')
95-
96124
startTime = time.time()
97-
data = {'email': email, 'password': password}
125+
data = {'email': args.email, 'password': args.password}
98126
header = {'content-type': 'application/json', 'accept': 'application/json'}
99-
session = requests.post(baseURL+'/rest/login', headers=header, verify=verify, params=data, timeout=response_timeout).cookies['JSESSIONID']
127+
session = requests.post(args.baseURL+'/rest/login', headers=header, verify=args.verify, params=data, timeout=args.rtimeout).cookies['JSESSIONID']
100128
cookies = {'JSESSIONID': session}
101-
status = requests.get(baseURL+'/rest/status', headers=header, cookies=cookies, verify=verify, timeout=response_timeout).json()
102129
print 'authenticated'
103130

104-
# NOTE: expanding bitstreams to get the count, in case this is an item
105-
endpoint = baseURL+'/rest/handle/'+handle+'?expand=bitstreams'
106-
dsObject = requests.get(endpoint, headers=header, cookies=cookies, verify=verify, timeout=response_timeout)
131+
# NOTE: expanding items (of collections) and bitstreams (of items) to get the count
132+
endpoint = args.baseURL+'/rest/handle/'+handle+'?expand=items,bitstreams'
133+
dsObject = requests.get(endpoint, headers=header, cookies=cookies, verify=args.verify, timeout=args.rtimeout)
107134
dsObject.raise_for_status() # ensure we notice bad responses
108135
dsObject = dsObject.json()
109136
if args.verbose: print dsObject
@@ -112,90 +139,77 @@ def main():
112139
if dsObject['type'] == 'collection':
113140
if args.verbose: print dsObject['type']
114141

115-
itemList = []
116-
offset = 0
117-
items = ''
118-
while items != []:
119-
items = requests.get(baseURL+'/rest/collections/'+str(dsObjectID)+'/items?limit=200&offset='+str(offset), headers=header, cookies=cookies, verify=verify, timeout=response_timeout)
120-
while items.status_code != 200:
121-
time.sleep(5)
122-
items = requests.get(baseURL+'/rest/collections/'+str(dsObjectID)+'/items?limit=200&offset='+str(offset), headers=header, cookies=cookies, verify=verify, timeout=response_timeout)
123-
items = items.json()
124-
for k in range(0, len(items)):
125-
itemID = items[k]['uuid']
126-
itemID = '/rest/items/'+itemID
127-
itemHandle = items[k]['handle']
128-
itemList.append(itemID)
129-
offset = offset + 200
130-
131-
f = csv.writer(open(filePath+'handlesAndBitstreams.csv', 'wb'))
132-
f.writerow(['bitstream']+['handle'])
133-
134-
for item in itemList:
135-
bitstreams = requests.get(baseURL+itemID+'/bitstreams', headers=header, cookies=cookies, verify=verify, timeout=response_timeout)
136-
bitstreams.raise_for_status() # ensure we notice bad responses
137-
bitstreams = bitstreams.json()
138-
for bitstream in bitstreams:
139-
fileName = bitstream['name']
140-
fileName.replace('.pdf', '')
141-
f.writerow([fileName]+[itemHandle])
142+
itemCount = len(dsObject['items'])
143+
print('{} items').format(itemCount)
144+
for collItem in dsObject['items']:
145+
endpoint = args.baseURL + collItem['link'] + '?expand=bitstreams'
146+
item = requests.get(endpoint, headers=header, cookies=cookies, verify=args.verify, timeout=args.rtimeout)
147+
item.raise_for_status() # ensure we notice bad responses
148+
item = item.json()
149+
processItem(item, args)
142150

143151
elif dsObject['type'] == 'item':
144-
if args.verbose: print(dsObject['type'])
145-
146-
itemHandle = dsObject['handle']
147-
handleID = re.sub(r'.*\/', '', itemHandle)
148-
itemPath = filePath + '/' + handleID + '/'
149-
if not os.path.exists(itemPath):
150-
os.makedirs(itemPath)
151-
152-
f = csv.writer(open(itemPath + handleID + '_bitstreams.csv', 'wb'))
153-
f.writerow(['sequenceId']+['name']+['format']+['bundleName'])
154-
155-
bitstreamCount = len(dsObject['bitstreams'])
156-
dlBitstreams = []
157-
offset = 0
158-
bitstreams = ''
159-
# while bitstreams != []:
160-
while bitstreamCount > 0:
161-
# don't retreive more bitstreams than we have left
162-
if limit > bitstreamCount:
163-
limit = bitstreamCount
164-
print('bitstreamCount: {0} offset: {1} limit: {2}').format(bitstreamCount, offset, limit)
165-
bitstreams = requests.get(baseURL+'/rest/items/' + str(dsObjectID) + '/bitstreams?limit=' + str(limit) + '&offset='+str(offset), headers=header, cookies=cookies, verify=verify, timeout=response_timeout)
166-
bitstreams.raise_for_status() # ensure we notice bad responses
167-
bitstreams = bitstreams.json()
168-
for bitstream in bitstreams:
169-
if args.formats and bitstream['format'] in args.formats or not args.formats:
170-
if args.verbose: print(bitstream)
171-
sequenceId = str(bitstream['sequenceId'])
172-
fileName = bitstream['name']
173-
fileFormat = bitstream['format']
174-
bundleName = bitstream['bundleName']
175-
f.writerow([sequenceId]+[fileName]+[fileFormat]+[bundleName])
176-
177-
if args.download:
178-
dlBitstreams.append(bitstream)
179-
offset += limit
180-
bitstreamCount -= limit
181-
182-
for dlBitstream in dlBitstreams:
183-
if not os.path.isfile(itemPath + dlBitstream['name']):
184-
response = requests.get(baseURL + str(dlBitstream['retrieveLink']), headers=header, cookies=cookies, verify=verify, timeout=response_timeout)
185-
response.raise_for_status() # ensure we notice bad responses
186-
file = open(itemPath + dlBitstream['name'], 'wb')
187-
file.write(response.content)
188-
file.close()
152+
processItem(dsObject, args)
153+
189154
else:
190155
print('object is of an invalid type for this script ({}). please enter the handle of an item or a collection.').format(dsObject['type'])
191156

192-
193-
logout = requests.post(baseURL+'/rest/logout', headers=header, cookies=cookies, verify=verify, timeout=response_timeout)
157+
logout = requests.post(args.baseURL+'/rest/logout', headers=header, cookies=cookies, verify=args.verify, timeout=args.rtimeout)
194158

195159
elapsedTime = time.time() - startTime
196160
m, s = divmod(elapsedTime, 60)
197161
h, m = divmod(m, 60)
198162
print('Total script run time: {:01.0f}:{:02.0f}:{:02.0f}').format(h, m, s)
199163

200164

165+
def _open_csv_for_write(path):
    """Open *path* for use with csv.writer on the running interpreter."""
    import sys
    if sys.version_info[0] < 3:
        # Python 2's csv module writes bytes and expects a binary-mode file.
        return open(path, 'wb')
    # Python 3's csv module writes text; newline='' leaves line endings to csv.
    return open(path, 'w', newline='')


def processItem(dsObject, args):
    """List one item's bitstreams in a CSV file and optionally download them.

    dsObject -- decoded JSON of a single item; the keys read here are
                'type', 'handle', 'uuid' and 'bitstreams'.
    args     -- parsed command-line arguments; the attributes read here are
                verbose, filePath, limit, formats, download, baseURL, verify
                and rtimeout.

    Output goes to <args.filePath>/<handle suffix>/, which is created when
    missing: '<handle suffix>_bitstreams.csv' plus, when args.download is
    set, one file per selected bitstream (files already present are kept).

    The requests are sent with the module-level globals `header` and
    `cookies`, which main() assigns after logging in. raise_for_status() is
    called on every response, so an HTTP error status aborts processing.
    """
    if args.verbose:
        print(dsObject['type'])

    # Keep only the part of the handle after the last '/'.
    handle_id = re.sub(r'.*\/', '', dsObject['handle'])
    item_dir = os.path.join(args.filePath, handle_id)
    if not os.path.isdir(item_dir):
        os.makedirs(item_dir)

    item_id = dsObject['uuid']
    remaining = len(dsObject['bitstreams'])
    offset = 0
    to_download = []

    csv_path = os.path.join(item_dir, handle_id + '_bitstreams.csv')
    # The context manager guarantees the listing is flushed and closed even
    # when a request fails part-way through.
    with _open_csv_for_write(csv_path) as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(['sequenceId', 'name', 'format', 'bundleName'])

        while remaining > 0:
            # Don't retrieve more bitstreams than we have left. A missing or
            # non-positive limit would otherwise break the paging arithmetic,
            # so fall back to fetching everything that remains in one page.
            page_size = args.limit
            if page_size is None or page_size <= 0 or page_size > remaining:
                page_size = remaining
            print('bitstreamCount: {0} offset: {1} limit: {2}'.format(remaining, offset, page_size))

            endpoint = (args.baseURL + '/rest/items/' + str(item_id)
                        + '/bitstreams?limit=' + str(page_size)
                        + '&offset=' + str(offset))
            page = requests.get(endpoint, headers=header, cookies=cookies,
                                verify=args.verify, timeout=args.rtimeout)
            page.raise_for_status()  # ensure we notice bad responses

            for bitstream in page.json():
                # With no format filter every bitstream is selected.
                if args.formats and bitstream['format'] not in args.formats:
                    continue
                if args.verbose:
                    print(bitstream)
                writer.writerow([
                    str(bitstream['sequenceId']),
                    bitstream['name'],
                    bitstream['format'],
                    bitstream['bundleName'],
                ])
                if args.download:
                    to_download.append(bitstream)

            offset += page_size
            remaining -= page_size

    for bitstream in to_download:
        target = os.path.join(item_dir, bitstream['name'])
        if os.path.isfile(target):
            # A file with this name is already there; leave it untouched.
            continue
        response = requests.get(args.baseURL + str(bitstream['retrieveLink']),
                                headers=header, cookies=cookies,
                                verify=args.verify, timeout=args.rtimeout)
        response.raise_for_status()  # ensure we notice bad responses
        with open(target, 'wb') as out_file:
            out_file.write(response.content)
213+
214+
201215
# Run main() only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()

0 commit comments

Comments
 (0)