Skip to content

Commit 30e2636

Browse files
author
ehanson8
committed
updates
1 parent 7431f99 commit 30e2636

18 files changed

+476
-320
lines changed

README.md

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
# dspace-data-collection
22

3+
**Note**: These scripts were updated in 05/2018 for the new authentication method used by DSpace 6.x.
4+
35
All of these scripts require a secrets.py file in the same directory that must contain the following text:
46
```
57
baseURL='https://dspace.myuni.edu'
@@ -13,23 +15,21 @@ The 'filePath' is directory into which output files will be written and 'handleP
1315

1416
If you are using both a development server and a production server, you can create a separate secrets.py file with a different name (e.g. secretsProd.py) that contains the production server information. When running each of these scripts, you will be prompted to enter the file name (e.g. 'secretsProd' without '.py') of an alternate secrets file. If you skip the prompt or incorrectly type the file name, the scripts will default to the information in the secrets.py file. This ensures that you will only access the production server if you really intend to.
1517

16-
The command 'requests.packages.urllib3.disable_warnings()' is used to disable the excessive warnings that will be produced if the 'verify' variable is set to False, which is necessary if you are using an SSH tunnel to connect to the DSpace API.
17-
18-
**Note**: All of these scripts skip collection '24' for local reasons. To change this, edit the following portion of the script (typically between lines 27-39)
18+
**Note**: All of these scripts skip collection '4dccec82-4cfb-4583-a728-2cb823b15ef0' for local reasons. To change this, edit the following portion of the script (typically between lines 27-39)
1919

2020

21-
Skips collection 24:
21+
Skips collection 4dccec82-4cfb-4583-a728-2cb823b15ef0:
2222

2323
for j in range (0, len (collections)):
24-
collectionID = collections[j]['id']
25-
if collectionID != 24:
24+
collectionID = collections[j]['uuid']
25+
if collectionID != '4dccec82-4cfb-4583-a728-2cb823b15ef0':
2626
offset = 0
2727

2828

2929
No collections skipped:
3030

3131
for j in range (0, len (collections)):
32-
collectionID = collections[j]['id']
32+
collectionID = collections[j]['uuid']
3333
if collectionID != 0:
3434
offset = 0
3535

compareTwoKeysInCommunity.py

Lines changed: 27 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,9 @@
33
import secrets
44
import csv
55
import time
6+
import urllib3
7+
8+
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
69

710
secretsVersion = raw_input('To edit production server, enter the name of the secrets file: ')
811
if secretsVersion != '':
@@ -13,44 +16,50 @@
1316
print 'Editing Stage'
1417
else:
1518
print 'Editing Stage'
16-
19+
1720
baseURL = secrets.baseURL
1821
email = secrets.email
1922
password = secrets.password
2023
filePath = secrets.filePath
2124
verify = secrets.verify
2225

23-
requests.packages.urllib3.disable_warnings()
24-
25-
communityID = raw_input('Enter community ID: ')
26+
communityHandle = raw_input('Enter community handle: ')
2627
key = raw_input('Enter first key: ')
2728
key2 = raw_input('Enter second key: ')
2829

2930
startTime = time.time()
30-
data = json.dumps({'email':email,'password':password})
31+
data = {'email':email,'password':password}
3132
header = {'content-type':'application/json','accept':'application/json'}
32-
session = requests.post(baseURL+'/rest/login', headers=header, verify=verify, data=data).content
33-
headerAuth = {'content-type':'application/json','accept':'application/json', 'rest-dspace-token':session}
33+
session = requests.post(baseURL+'/rest/login', headers=header, verify=verify, params=data).cookies['JSESSIONID']
34+
cookies = {'JSESSIONID': session}
35+
headerFileUpload = {'accept':'application/json'}
36+
cookiesFileUpload = cookies
37+
status = requests.get(baseURL+'/rest/status', headers=header, cookies=cookies, verify=verify).json()
3438
print 'authenticated'
3539

40+
endpoint = baseURL+'/rest/handle/'+communityHandle
41+
community = requests.get(endpoint, headers=header, cookies=cookies, verify=verify).json()
42+
communityID = community['uuid']
43+
3644
itemList = []
3745
endpoint = baseURL+'/rest/communities'
38-
collections = requests.get(baseURL+'/rest/communities/'+str(communityID)+'/collections', headers=headerAuth, verify=verify).json()
46+
collections = requests.get(baseURL+'/rest/communities/'+str(communityID)+'/collections', headers=header, cookies=cookies, verify=verify).json()
3947
for j in range (0, len (collections)):
40-
collectionID = collections[j]['id']
41-
if collectionID != 24:
48+
collectionID = collections[j]['uuid']
49+
print collectionID
50+
if collectionID != '4dccec82-4cfb-4583-a728-2cb823b15ef0':
4251
offset = 0
4352
items = ''
4453
while items != []:
45-
items = requests.get(baseURL+'/rest/collections/'+str(collectionID)+'/items?limit=1000&offset='+str(offset), headers=headerAuth, verify=verify)
54+
items = requests.get(baseURL+'/rest/collections/'+str(collectionID)+'/items?limit=200&offset='+str(offset), headers=header, cookies=cookies, verify=verify)
4655
while items.status_code != 200:
4756
time.sleep(5)
48-
items = requests.get(baseURL+'/rest/collections/'+str(collectionID)+'/items?limit=1000&offset='+str(offset), headers=headerAuth, verify=verify)
57+
items = requests.get(baseURL+'/rest/collections/'+str(collectionID)+'/items?limit=200&offset='+str(offset), headers=header, cookies=cookies, verify=verify)
4958
items = items.json()
5059
for k in range (0, len (items)):
51-
itemID = items[k]['id']
60+
itemID = items[k]['uuid']
5261
itemList.append(itemID)
53-
offset = offset + 1000
62+
offset = offset + 200
5463
elapsedTime = time.time() - startTime
5564
m, s = divmod(elapsedTime, 60)
5665
h, m = divmod(m, 60)
@@ -60,16 +69,16 @@
6069
for number, itemID in enumerate(itemList):
6170
itemsRemaining = len(itemList) - number
6271
print 'Items remaining: ', itemsRemaining, 'ItemID: ', itemID
63-
metadata = requests.get(baseURL+'/rest/items/'+str(itemID)+'/metadata', headers=headerAuth, verify=verify).json()
72+
metadata = requests.get(baseURL+'/rest/items/'+str(itemID)+'/metadata', headers=header, cookies=cookies, verify=verify).json()
6473
itemTuple = (itemID,)
6574
tupleValue1 = ''
6675
tupleValue2 = ''
6776
for l in range (0, len (metadata)):
6877
if metadata[l]['key'] == key:
69-
metadataValue = metadata[l]['value']
78+
metadataValue = metadata[l]['value'].encode('utf-8')
7079
tupleValue1 = metadataValue
7180
if metadata[l]['key'] == key2:
72-
metadataValue = metadata[l]['value']
81+
metadataValue = metadata[l]['value'].encode('utf-8')
7382
tupleValue2 = metadataValue
7483
itemTuple = itemTuple + (tupleValue1 , tupleValue2)
7584
valueList.append(itemTuple)
@@ -86,7 +95,7 @@
8695
for i in range (0, len (valueList)):
8796
f.writerow([valueList[i][0]]+[valueList[i][1]]+[valueList[i][2]])
8897

89-
logout = requests.post(baseURL+'/rest/logout', headers=headerAuth, verify=verify)
98+
logout = requests.post(baseURL+'/rest/logout', headers=header, cookies=cookies, verify=verify)
9099

91100
elapsedTime = time.time() - startTime
92101
m, s = divmod(elapsedTime, 60)

findBogusUris.py

Lines changed: 24 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,9 @@
33
import secrets
44
import csv
55
import time
6+
import urllib3
7+
8+
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
69

710
secretsVersion = raw_input('To edit production server, enter the name of the secrets file: ')
811
if secretsVersion != '':
@@ -13,44 +16,48 @@
1316
print 'Editing Stage'
1417
else:
1518
print 'Editing Stage'
16-
19+
1720
baseURL = secrets.baseURL
1821
email = secrets.email
1922
password = secrets.password
2023
filePath = secrets.filePath
2124
handlePrefix = secrets.handlePrefix
2225
verify = secrets.verify
2326

24-
requests.packages.urllib3.disable_warnings()
25-
2627
startTime = time.time()
27-
data = json.dumps({'email':email,'password':password})
28+
data = {'email':email,'password':password}
2829
header = {'content-type':'application/json','accept':'application/json'}
29-
session = requests.post(baseURL+'/rest/login', headers=header, verify=verify, data=data).content
30-
headerAuth = {'content-type':'application/json','accept':'application/json', 'rest-dspace-token':session}
30+
session = requests.post(baseURL+'/rest/login', headers=header, verify=verify, params=data).cookies['JSESSIONID']
31+
cookies = {'JSESSIONID': session}
32+
headerFileUpload = {'accept':'application/json'}
33+
cookiesFileUpload = cookies
34+
status = requests.get(baseURL+'/rest/status', headers=header, cookies=cookies, verify=verify).json()
35+
userFullName = status['fullname']
3136
print 'authenticated'
3237

3338
itemList = []
3439
endpoint = baseURL+'/rest/communities'
35-
communities = requests.get(endpoint, headers=headerAuth, verify=verify).json()
40+
communities = requests.get(endpoint, headers=header, cookies=cookies, verify=verify).json()
3641
for i in range (0, len (communities)):
37-
communityID = communities[i]['id']
38-
collections = requests.get(baseURL+'/rest/communities/'+str(communityID)+'/collections', headers=headerAuth, verify=verify).json()
42+
communityID = communities[i]['uuid']
43+
collections = requests.get(baseURL+'/rest/communities/'+str(communityID)+'/collections', headers=header, cookies=cookies, verify=verify).json()
3944
for j in range (0, len (collections)):
40-
collectionID = collections[j]['id']
41-
if collectionID != 24:
45+
collectionID = collections[j]['uuid']
46+
print collectionID
47+
if collectionID != '4dccec82-4cfb-4583-a728-2cb823b15ef0':
4248
offset = 0
4349
items = ''
4450
while items != []:
45-
items = requests.get(baseURL+'/rest/collections/'+str(collectionID)+'/items?limit=1000&offset='+str(offset), headers=headerAuth, verify=verify)
51+
items = requests.get(baseURL+'/rest/collections/'+str(collectionID)+'/items?limit=200&offset='+str(offset), headers=header, cookies=cookies, verify=verify)
4652
while items.status_code != 200:
4753
time.sleep(5)
48-
items = requests.get(baseURL+'/rest/collections/'+str(collectionID)+'/items?limit=1000&offset='+str(offset), headers=headerAuth, verify=verify)
54+
items = requests.get(baseURL+'/rest/collections/'+str(collectionID)+'/items?limit=200&offset='+str(offset), headers=header, cookies=cookies, verify=verify)
4955
items = items.json()
5056
for k in range (0, len (items)):
51-
itemID = items[k]['id']
57+
itemID = items[k]['uuid']
5258
itemList.append(itemID)
53-
offset = offset + 1000
59+
offset = offset + 200
60+
print offset
5461
elapsedTime = time.time() - startTime
5562
m, s = divmod(elapsedTime, 60)
5663
h, m = divmod(m, 60)
@@ -62,14 +69,14 @@
6269
for number, itemID in enumerate(itemList):
6370
itemsRemaining = len(itemList) - number
6471
print 'Items remaining: ', itemsRemaining, 'ItemID: ', itemID
65-
metadata = requests.get(baseURL+'/rest/items/'+str(itemID)+'/metadata', headers=headerAuth, verify=verify).json()
72+
metadata = requests.get(baseURL+'/rest/items/'+str(itemID)+'/metadata', headers=header, cookies=cookies, verify=verify).json()
6673
for l in range (0, len (metadata)):
6774
if metadata[l]['key'] == 'dc.identifier.uri':
6875
uri = str(metadata[l]['value'])
6976
if uri.startswith(handlePrefix) == False:
7077
f.writerow([itemID]+[uri])
7178

72-
logout = requests.post(baseURL+'/rest/logout', headers=headerAuth, verify=verify)
79+
logout = requests.post(baseURL+'/rest/logout', headers=header, cookies=cookies, verify=verify)
7380

7481
elapsedTime = time.time() - startTime
7582
m, s = divmod(elapsedTime, 60)

findDuplicateKeys.py

Lines changed: 31 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,9 @@
33
import secrets
44
import time
55
import csv
6+
import urllib3
7+
8+
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
69

710
secretsVersion = raw_input('To edit production server, enter the name of the secrets file: ')
811
if secretsVersion != '':
@@ -20,36 +23,43 @@
2023
filePath = secrets.filePath
2124
verify = secrets.verify
2225

23-
requests.packages.urllib3.disable_warnings()
24-
2526
key = raw_input('Enter key: ')
2627
searchString = "\""+key+"\""
2728

2829
startTime = time.time()
29-
data = json.dumps({'email':email,'password':password})
30+
data = {'email':email,'password':password}
3031
header = {'content-type':'application/json','accept':'application/json'}
31-
session = requests.post(baseURL+'/rest/login', headers=header, verify=verify, data=data).content
32-
headerAuth = {'content-type':'application/json','accept':'application/json', 'rest-dspace-token':session}
32+
session = requests.post(baseURL+'/rest/login', headers=header, verify=verify, params=data).cookies['JSESSIONID']
33+
cookies = {'JSESSIONID': session}
34+
headerFileUpload = {'accept':'application/json'}
35+
cookiesFileUpload = cookies
36+
status = requests.get(baseURL+'/rest/status', headers=header, cookies=cookies, verify=verify).json()
37+
userFullName = status['fullname']
3338
print 'authenticated'
3439

35-
3640
itemList = []
3741
endpoint = baseURL+'/rest/communities'
38-
communities = requests.get(endpoint, headers=headerAuth, verify=verify).json()
42+
communities = requests.get(endpoint, headers=header, cookies=cookies, verify=verify).json()
3943
for i in range (0, len (communities)):
40-
communityID = communities[i]['id']
41-
collections = requests.get(baseURL+'/rest/communities/'+str(communityID)+'/collections', headers=headerAuth, verify=verify).json()
44+
communityID = communities[i]['uuid']
45+
collections = requests.get(baseURL+'/rest/communities/'+str(communityID)+'/collections', headers=header, cookies=cookies, verify=verify).json()
4246
for j in range (0, len (collections)):
43-
collectionID = collections[j]['id']
44-
if collectionID != 24:
45-
items = requests.get(baseURL+'/rest/collections/'+str(collectionID)+'/items?limit=100000', headers=headerAuth, verify=verify)
46-
while items.status_code != 200:
47-
time.sleep(5)
48-
items = requests.get(baseURL+'/rest/collections/'+str(collectionID)+'/items?limit=100000', headers=headerAuth, verify=verify)
49-
items = items.json()
50-
for k in range (0, len (items)):
51-
itemID = items[k]['id']
52-
itemList.append(itemID)
47+
collectionID = collections[j]['uuid']
48+
print collectionID
49+
if collectionID != '4dccec82-4cfb-4583-a728-2cb823b15ef0':
50+
offset = 0
51+
items = ''
52+
while items != []:
53+
items = requests.get(baseURL+'/rest/collections/'+str(collectionID)+'/items?limit=200&offset='+str(offset), headers=header, cookies=cookies, verify=verify)
54+
while items.status_code != 200:
55+
time.sleep(5)
56+
items = requests.get(baseURL+'/rest/collections/'+str(collectionID)+'/items?limit=200&offset='+str(offset), headers=header, cookies=cookies, verify=verify)
57+
items = items.json()
58+
for k in range (0, len (items)):
59+
itemID = items[k]['uuid']
60+
itemList.append(itemID)
61+
offset = offset + 200
62+
print offset
5363
elapsedTime = time.time() - startTime
5464
m, s = divmod(elapsedTime, 60)
5565
h, m = divmod(m, 60)
@@ -60,12 +70,12 @@
6070
for number, itemID in enumerate(itemList):
6171
itemsRemaining = len(itemList) - number
6272
print 'Items remaining: ', itemsRemaining, 'ItemID: ', itemID
63-
metadata = requests.get(baseURL+'/rest/items/'+str(itemID)+'/metadata', headers=headerAuth, verify=verify).json()
73+
metadata = requests.get(baseURL+'/rest/items/'+str(itemID)+'/metadata', headers=header, cookies=cookies, verify=verify).json()
6474
metadata = json.dumps(metadata)
6575
if metadata.find(searchString) != metadata.rfind(searchString):
6676
f.writerow([itemID])
6777

68-
logout = requests.post(baseURL+'/rest/logout', headers=headerAuth, verify=verify)
78+
logout = requests.post(baseURL+'/rest/logout', headers=header, cookies=cookies, verify=verify)
6979

7080
elapsedTime = time.time() - startTime
7181
m, s = divmod(elapsedTime, 60)

getCollectionMetadataJson.py

Lines changed: 19 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,9 @@
22
import requests
33
import secrets
44
import time
5+
import urllib3
6+
7+
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
58

69
secretsVersion = raw_input('To edit production server, enter the name of the secrets file: ')
710
if secretsVersion != '':
@@ -12,45 +15,47 @@
1215
print 'Editing Stage'
1316
else:
1417
print 'Editing Stage'
15-
18+
1619
baseURL = secrets.baseURL
1720
email = secrets.email
1821
password = secrets.password
1922
filePath = secrets.filePath
2023
verify = secrets.verify
2124

22-
requests.packages.urllib3.disable_warnings()
23-
2425
handle = raw_input('Enter handle: ')
2526

26-
data = json.dumps({'email':email,'password':password})
27+
startTime = time.time()
28+
data = {'email':email,'password':password}
2729
header = {'content-type':'application/json','accept':'application/json'}
28-
session = requests.post(baseURL+'/rest/login', headers=header, verify=verify, data=data).content
29-
headerAuth = {'content-type':'application/json','accept':'application/json', 'rest-dspace-token':session}
30+
session = requests.post(baseURL+'/rest/login', headers=header, verify=verify, params=data).cookies['JSESSIONID']
31+
cookies = {'JSESSIONID': session}
32+
headerFileUpload = {'accept':'application/json'}
33+
cookiesFileUpload = cookies
34+
status = requests.get(baseURL+'/rest/status', headers=header, cookies=cookies, verify=verify).json()
35+
userFullName = status['fullname']
3036
print 'authenticated'
31-
startTime = time.time()
3237

3338
endpoint = baseURL+'/rest/handle/'+handle
34-
collection = requests.get(endpoint, headers=headerAuth, verify=verify).json()
35-
collectionID = collection['id']
36-
collectionTitle = requests.get(endpoint, headers=headerAuth, verify=verify).json()
39+
collection = requests.get(endpoint, headers=header, cookies=cookies, verify=verify).json()
40+
collectionID = collection['uuid']
41+
collectionTitle = requests.get(endpoint, headers=header, cookies=cookies, verify=verify).json()
3742
endpoint = baseURL+'/rest/collections/'+str(collectionID)+'/items'
38-
output = requests.get(endpoint, headers=headerAuth, verify=verify).json()
43+
output = requests.get(endpoint, headers=header, cookies=cookies, verify=verify).json()
3944

4045
itemList = []
4146
for i in range (0, len (output)):
4247
name = output[i]['name']
43-
itemID = output[i]['id']
48+
itemID = output[i]['uuid']
4449
itemList.append(itemID)
4550

4651
f=open(filePath+handle.replace('/','-')+'.json', 'w')
4752
metadataGroup = []
4853
for itemID in itemList:
49-
metadata = requests.get(baseURL+'/rest/items/'+str(itemID)+'/metadata', headers=headerAuth, verify=verify).json()
54+
metadata = requests.get(baseURL+'/rest/items/'+str(itemID)+'/metadata', headers=header, cookies=cookies, verify=verify).json()
5055
metadataGroup.append(metadata)
5156
json.dump(metadataGroup, f)
5257

53-
logout = requests.post(baseURL+'/rest/logout', headers=headerAuth, verify=verify)
58+
logout = requests.post(baseURL+'/rest/logout', headers=header, cookies=cookies, verify=verify)
5459

5560
elapsedTime = time.time() - startTime
5661
m, s = divmod(elapsedTime, 60)

0 commit comments

Comments
 (0)