@@ -13,8 +13,12 @@ def main():
     import secrets

     # define defaults
-    response_timeout = 1
-    limit = 100
+    default_response_timeout = 1
+    default_limit = 100
+
+    # define globals for requests, so we needn't pass too many arguments to our functions
+    global header
+    global cookies

     # begin: argument parsing
     parser = argparse.ArgumentParser()
@@ -38,18 +42,63 @@ def main():
                         help='download bitstreams (rather than just retrieve metadata about them). default: false')

     parser.add_argument('-rt', '--rtimeout', type=int,
-                        help='response timeout - number of seconds to wait for a response. not a timeout for a download or run of the entire script. default: ' + str(response_timeout))
+                        help='response timeout - number of seconds to wait for a response. not a timeout for a download or run of the entire script. default: ' + str(default_response_timeout))

     parser.add_argument('-l', '--limit', type=int,
-                        help='limit to the number of objects to return in a given request. default: ' + str(limit))
+                        help='limit to the number of objects to return in a given request. default: ' + str(default_limit))
+
+    parser.add_argument('-u', '--baseURL',
+                        help='url of the dspace instance. can be read from the secrets file')
+
+    parser.add_argument('-e', '--email',
+                        help='email of an authorized dspace user. can be read from the secrets file')
+
+    parser.add_argument('-p', '--password',
+                        help='password of an authorized dspace user. can be read from the secrets file')
+
+    parser.add_argument('-d', '--filePath',
+                        help='directory into which output files will be written. can be read from the secrets file')
+
+    parser.add_argument('-s', '--verify',
+                        help='ssl verification enabled (boolean) OR the path to a CA_BUNDLE file or directory with certificates of trusted CAs. use false if using an ssh tunnel to connect to the dspace api. can be read from the secrets file')

     args = parser.parse_args()

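+    # optionally switch to a different secrets module (e.g. a production config);
+    # an empty answer or a failed import keeps the default stage secrets imported above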
+    secretsVersion = raw_input('To edit production server, enter the name of the secrets file: ')
+    if secretsVersion != '':
+        try:
+            secrets = __import__(secretsVersion)
+            print('Accessing Production')
+        except ImportError:
+            print('Accessing Stage')
+    else:
+        print('Accessing Stage')
+
-    if args.rtimeout:
-        response_timeout = args.rtimeout
+    if not args.rtimeout:
+        args.rtimeout = default_response_timeout

-    if args.limit:
-        limit = args.limit
+    if not args.limit:
+        args.limit = default_limit
+
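+    # any connection setting not supplied on the command line falls back to the value in the secrets module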
+    if not args.baseURL:
+        args.baseURL = secrets.baseURL
+
+    if not args.email:
+        args.email = secrets.email
+
+    if not args.password:
+        args.password = secrets.password
+
+    if not args.filePath:
+        args.filePath = secrets.filePath
+
+    if not args.verify:
+        args.verify = secrets.verify
+
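+    # the handle identifies the item or collection to process; prompt for one if it wasn't passed in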
+    if args.handle:
+        handle = args.handle
+    else:
+        handle = raw_input('Enter handle: ')

     if args.verbose:
         print('verbosity turned on')
@@ -66,44 +115,22 @@ def main():
         print('downloading bitstreams')

     if args.rtimeout:
-        print('response_timeout set to {}').format(response_timeout)
+        print('response_timeout set to {}'.format(args.rtimeout))

     # end: argument parsing

     urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

-    secretsVersion = raw_input('To edit production server, enter the name of the secrets file: ')
-    if secretsVersion != '':
-        try:
-            secrets = __import__(secretsVersion)
-            print('Accessing Production')
-        except ImportError:
-            print('Accessing Stage')
-    else:
-        print('Accessing Stage')
-
-    baseURL = secrets.baseURL
-    email = secrets.email
-    password = secrets.password
-    filePath = secrets.filePath
-    verify = secrets.verify
-
-    if args.handle:
-        handle = args.handle
-    else:
-        handle = raw_input('Enter handle: ')
-
     startTime = time.time()
-    data = {'email': email, 'password': password}
+    data = {'email': args.email, 'password': args.password}
     header = {'content-type': 'application/json', 'accept': 'application/json'}
-    session = requests.post(baseURL + '/rest/login', headers=header, verify=verify, params=data, timeout=response_timeout).cookies['JSESSIONID']
+    session = requests.post(args.baseURL + '/rest/login', headers=header, verify=args.verify, params=data, timeout=args.rtimeout).cookies['JSESSIONID']
     cookies = {'JSESSIONID': session}
-    status = requests.get(baseURL + '/rest/status', headers=header, cookies=cookies, verify=verify, timeout=response_timeout).json()
     print 'authenticated'

-    # NOTE: expanding bitstreams to get the count, in case this is an item
-    endpoint = baseURL + '/rest/handle/' + handle + '?expand=bitstreams'
-    dsObject = requests.get(endpoint, headers=header, cookies=cookies, verify=verify, timeout=response_timeout)
+    # NOTE: expanding items (of collections) and bitstreams (of items) to get the count
+    endpoint = args.baseURL + '/rest/handle/' + handle + '?expand=items,bitstreams'
+    dsObject = requests.get(endpoint, headers=header, cookies=cookies, verify=args.verify, timeout=args.rtimeout)
     dsObject.raise_for_status()  # ensure we notice bad responses
     dsObject = dsObject.json()
     if args.verbose: print dsObject
@@ -112,90 +139,77 @@ def main():
     if dsObject['type'] == 'collection':
         if args.verbose: print dsObject['type']

-        itemList = []
-        offset = 0
-        items = ''
-        while items != []:
-            items = requests.get(baseURL + '/rest/collections/' + str(dsObjectID) + '/items?limit=200&offset=' + str(offset), headers=header, cookies=cookies, verify=verify, timeout=response_timeout)
-            while items.status_code != 200:
-                time.sleep(5)
-                items = requests.get(baseURL + '/rest/collections/' + str(dsObjectID) + '/items?limit=200&offset=' + str(offset), headers=header, cookies=cookies, verify=verify, timeout=response_timeout)
-            items = items.json()
-            for k in range(0, len(items)):
-                itemID = items[k]['uuid']
-                itemID = '/rest/items/' + itemID
-                itemHandle = items[k]['handle']
-                itemList.append(itemID)
-            offset = offset + 200
-
-        f = csv.writer(open(filePath + 'handlesAndBitstreams.csv', 'wb'))
-        f.writerow(['bitstream'] + ['handle'])
-
-        for item in itemList:
-            bitstreams = requests.get(baseURL + itemID + '/bitstreams', headers=header, cookies=cookies, verify=verify, timeout=response_timeout)
-            bitstreams.raise_for_status()  # ensure we notice bad responses
-            bitstreams = bitstreams.json()
-            for bitstream in bitstreams:
-                fileName = bitstream['name']
-                fileName.replace('.pdf', '')
-                f.writerow([fileName] + [itemHandle])
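+        # the expanded 'items' list holds a link for each item in the collection;
+        # fetch each item with its bitstreams expanded and hand it to processItem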
+        itemCount = len(dsObject['items'])
+        print('{} items'.format(itemCount))
+        for collItem in dsObject['items']:
+            endpoint = args.baseURL + collItem['link'] + '?expand=bitstreams'
+            item = requests.get(endpoint, headers=header, cookies=cookies, verify=args.verify, timeout=args.rtimeout)
+            item.raise_for_status()  # ensure we notice bad responses
+            item = item.json()
+            processItem(item, args)

     elif dsObject['type'] == 'item':
-        if args.verbose: print(dsObject['type'])
-
-        itemHandle = dsObject['handle']
-        handleID = re.sub(r'.*\/', '', itemHandle)
-        itemPath = filePath + '/' + handleID + '/'
-        if not os.path.exists(itemPath):
-            os.makedirs(itemPath)
-
-        f = csv.writer(open(itemPath + handleID + '_bitstreams.csv', 'wb'))
-        f.writerow(['sequenceId'] + ['name'] + ['format'] + ['bundleName'])
-
-        bitstreamCount = len(dsObject['bitstreams'])
-        dlBitstreams = []
-        offset = 0
-        bitstreams = ''
-        # while bitstreams != []:
-        while bitstreamCount > 0:
-            # don't retreive more bitstreams than we have left
-            if limit > bitstreamCount:
-                limit = bitstreamCount
-            print('bitstreamCount: {0} offset: {1} limit: {2}').format(bitstreamCount, offset, limit)
-            bitstreams = requests.get(baseURL + '/rest/items/' + str(dsObjectID) + '/bitstreams?limit=' + str(limit) + '&offset=' + str(offset), headers=header, cookies=cookies, verify=verify, timeout=response_timeout)
-            bitstreams.raise_for_status()  # ensure we notice bad responses
-            bitstreams = bitstreams.json()
-            for bitstream in bitstreams:
-                if args.formats and bitstream['format'] in args.formats or not args.formats:
-                    if args.verbose: print(bitstream)
-                    sequenceId = str(bitstream['sequenceId'])
-                    fileName = bitstream['name']
-                    fileFormat = bitstream['format']
-                    bundleName = bitstream['bundleName']
-                    f.writerow([sequenceId] + [fileName] + [fileFormat] + [bundleName])
-
-                    if args.download:
-                        dlBitstreams.append(bitstream)
-            offset += limit
-            bitstreamCount -= limit
-
-        for dlBitstream in dlBitstreams:
-            if not os.path.isfile(itemPath + dlBitstream['name']):
-                response = requests.get(baseURL + str(dlBitstream['retrieveLink']), headers=header, cookies=cookies, verify=verify, timeout=response_timeout)
-                response.raise_for_status()  # ensure we notice bad responses
-                file = open(itemPath + dlBitstream['name'], 'wb')
-                file.write(response.content)
-                file.close()
+        processItem(dsObject, args)
+
     else:
         print('object is of an invalid type for this script ({}). please enter the handle of an item or a collection.').format(dsObject['type'])

-
-    logout = requests.post(baseURL + '/rest/logout', headers=header, cookies=cookies, verify=verify, timeout=response_timeout)
+    logout = requests.post(args.baseURL + '/rest/logout', headers=header, cookies=cookies, verify=args.verify, timeout=args.rtimeout)

     elapsedTime = time.time() - startTime
     m, s = divmod(elapsedTime, 60)
     h, m = divmod(m, 60)
     print('Total script run time: {:01.0f}:{:02.0f}:{:02.0f}').format(h, m, s)


+def processItem(dsObject, args):
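+    """Write <handleID>_bitstreams.csv of the item's bitstream metadata and, if --download was given, fetch the bitstreams themselves."""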
+    if args.verbose: print(dsObject['type'])
+
+    itemHandle = dsObject['handle']
+    handleID = re.sub(r'.*\/', '', itemHandle)
+    itemPath = args.filePath + '/' + handleID + '/'
+    if not os.path.exists(itemPath):
+        os.makedirs(itemPath)
+
+    f = csv.writer(open(itemPath + handleID + '_bitstreams.csv', 'wb'))
+    f.writerow(['sequenceId'] + ['name'] + ['format'] + ['bundleName'])
+
+    itemID = dsObject['uuid']
+    bitstreamCount = len(dsObject['bitstreams'])
+    dlBitstreams = []
+    offset = 0
+    limit = args.limit
+    bitstreams = ''
+    # while bitstreams != []:
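+    # page through the item's bitstreams, requesting at most args.limit per call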
+    while bitstreamCount > 0:
+        # don't retrieve more bitstreams than we have left
+        if limit > bitstreamCount:
+            limit = bitstreamCount
+        print('bitstreamCount: {0} offset: {1} limit: {2}'.format(bitstreamCount, offset, limit))
+        bitstreams = requests.get(args.baseURL + '/rest/items/' + str(itemID) + '/bitstreams?limit=' + str(limit) + '&offset=' + str(offset), headers=header, cookies=cookies, verify=args.verify, timeout=args.rtimeout)
+        bitstreams.raise_for_status()  # ensure we notice bad responses
+        bitstreams = bitstreams.json()
+        for bitstream in bitstreams:
+            if (args.formats and bitstream['format'] in args.formats) or not args.formats:
+                if args.verbose: print(bitstream)
+                sequenceId = str(bitstream['sequenceId'])
+                fileName = bitstream['name']
+                fileFormat = bitstream['format']
+                bundleName = bitstream['bundleName']
+                f.writerow([sequenceId] + [fileName] + [fileFormat] + [bundleName])
+
+                if args.download:
+                    dlBitstreams.append(bitstream)
+        offset += limit
+        bitstreamCount -= limit
+
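+    # download each selected bitstream unless a file with that name is already in itemPath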
+    for dlBitstream in dlBitstreams:
+        if not os.path.isfile(itemPath + dlBitstream['name']):
+            response = requests.get(args.baseURL + str(dlBitstream['retrieveLink']), headers=header, cookies=cookies, verify=args.verify, timeout=args.rtimeout)
+            response.raise_for_status()  # ensure we notice bad responses
+            file = open(itemPath + dlBitstream['name'], 'wb')
+            file.write(response.content)
+            file.close()
+
+
 if __name__ == "__main__": main()