Skip to content

Commit 5f2dc68

Browse files
author
ehanson8
committed
updates
1 parent 1e217f2 commit 5f2dc68

7 files changed

+355
-4
lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,7 @@ secrets.py
4949
secretsProd.py
5050
*.pyc
5151
data/*
52+
local/*
5253
!data/.keep
5354
.profile
5455
*.csv

getCompleteAndUniqueValuesForAllKeys.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -71,12 +71,12 @@
7171
if metadata[l]['key'] != 'dc.description.provenance':
7272
key = metadata[l]['key']
7373
value = metadata[l]['value'].encode('utf-8')
74-
if os.path.isfile(filePathComplete+key+'Values.csv') == False:
75-
f=csv.writer(open(filePathComplete+key+'Values.csv', 'wb'))
74+
if os.path.isfile(filePathComplete+key+'ValuesComplete.csv') == False:
75+
f=csv.writer(open(filePathComplete+key+'ValuesComplete.csv', 'wb'))
7676
f.writerow(['itemID']+['value'])
7777
f.writerow([itemID]+[value])
7878
else:
79-
f=csv.writer(open(filePathComplete+key+'Values.csv', 'a'))
79+
f=csv.writer(open(filePathComplete+key+'ValuesComplete.csv', 'a'))
8080
f.writerow([itemID]+[value])
8181

8282
elapsedTime = time.time() - startTime
@@ -86,6 +86,7 @@
8686

8787
for fileName in os.listdir(filePathComplete):
8888
reader = csv.DictReader(open(filePathComplete+fileName))
89+
fileName = fileName.replace('Complete', 'Unique')
8990
valueList = []
9091
for row in reader:
9192
valueList.append(row['value'])
Lines changed: 109 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,109 @@
1+
import json
2+
import requests
3+
import secrets
4+
import csv
5+
import time
6+
import os.path
7+
from collections import Counter
8+
from datetime import datetime
9+
# Export the metadata values of every item in one DSpace community.
#
# For each metadata key (except dc.description.provenance) the script writes
#   <filePath>completeValueLists<community><timestamp>/<key>ValuesComplete.csv
#       one row per (itemID, value) occurrence
#   <filePath>uniqueValueLists<community><timestamp>/<key>ValuesUnique.csv
#       one row per distinct value with its zero-padded occurrence count
#
# NOTE: this is a Python 2 script (raw_input, byte-mode csv files). The
# single-argument print(...) calls below behave identically as Python 2
# print statements.
# NOTE(review): indentation was reconstructed from a flattened diff view;
# confirm the nesting against the committed file.

# Collection that is always skipped. The reason is not recorded in this
# script -- TODO confirm it still applies to the target repository.
SKIPPED_COLLECTION_ID = 24
# Number of items requested per page from the collection items endpoint.
PAGE_SIZE = 1000
# Seconds to wait before re-requesting a page that did not return HTTP 200.
RETRY_DELAY_SECONDS = 5


def formatElapsed(seconds):
    """Return a duration given in seconds formatted as H:MM:SS."""
    m, s = divmod(seconds, 60)
    h, m = divmod(m, 60)
    return '%d:%02d:%02d' % (h, m, s)


# Optionally replace the default secrets module with one named by the user.
# An empty answer or an unimportable name keeps the default module.
secretsVersion = raw_input('To edit production server, enter the name of the secrets file: ')
if secretsVersion != '':
    try:
        secrets = __import__(secretsVersion)
        print('Editing Production')
    except ImportError:
        print('Editing Stage')
else:
    print('Editing Stage')

baseURL = secrets.baseURL
email = secrets.email
password = secrets.password
filePath = secrets.filePath
verify = secrets.verify

handle = raw_input('Enter community handle: ')

requests.packages.urllib3.disable_warnings()

# Log in; the response body is sent back as the rest-dspace-token header.
startTime = time.time()
data = json.dumps({'email': email, 'password': password})
header = {'content-type': 'application/json', 'accept': 'application/json'}
session = requests.post(baseURL+'/rest/login', headers=header, verify=verify, data=data).content
headerAuth = {'content-type': 'application/json', 'accept': 'application/json', 'rest-dspace-token': session}
print('authenticated')

# Resolve the handle to the community record.
itemList = []
endpoint = baseURL+'/rest/handle/'+handle
community = requests.get(endpoint, headers=headerAuth, verify=verify).json()
communityName = community['name'].replace(' ', '')
communityID = community['id']

# Take the timestamp once so both output directories carry the same stamp
# even when the clock ticks over between the two assignments.
runStamp = datetime.now().strftime('%Y-%m-%d %H.%M.%S')
filePathComplete = filePath+'completeValueLists'+communityName+runStamp+'/'
filePathUnique = filePath+'uniqueValueLists'+communityName+runStamp+'/'

# Page through every collection of the community and collect the item IDs.
collections = requests.get(baseURL+'/rest/communities/'+str(communityID)+'/collections', headers=headerAuth, verify=verify).json()
for collection in collections:
    collectionID = collection['id']
    if collectionID == SKIPPED_COLLECTION_ID:
        continue
    offset = 0
    while True:
        pageURL = baseURL+'/rest/collections/'+str(collectionID)+'/items?limit='+str(PAGE_SIZE)+'&offset='+str(offset)
        response = requests.get(pageURL, headers=headerAuth, verify=verify)
        # Retries indefinitely until the server answers with HTTP 200.
        while response.status_code != 200:
            time.sleep(RETRY_DELAY_SECONDS)
            response = requests.get(pageURL, headers=headerAuth, verify=verify)
        items = response.json()
        if not items:
            # An empty page marks the end of the collection.
            break
        for item in items:
            itemList.append(item['id'])
        offset = offset + PAGE_SIZE
print('Item list creation time:  %s' % formatElapsed(time.time() - startTime))

# First pass: append every (itemID, value) pair to the per-key "complete" file.
os.mkdir(filePathComplete)
os.mkdir(filePathUnique)
for number, itemID in enumerate(itemList):
    itemsRemaining = len(itemList) - number
    print('Items remaining:  %s ItemID:  %s' % (itemsRemaining, itemID))
    metadata = requests.get(baseURL+'/rest/items/'+str(itemID)+'/metadata', headers=headerAuth, verify=verify).json()
    for metadataElement in metadata:
        key = metadataElement['key']
        if key == 'dc.description.provenance':
            continue
        value = metadataElement['value'].encode('utf-8')
        valuesFile = filePathComplete+key+'ValuesComplete.csv'
        isNewFile = not os.path.isfile(valuesFile)
        # Binary append mode creates the file when missing and matches the
        # 'wb' mode the Python 2 csv module expects. The with-block flushes
        # and closes each handle before the second pass reads the file.
        with open(valuesFile, 'ab') as valuesHandle:
            writer = csv.writer(valuesHandle)
            if isNewFile:
                writer.writerow(['itemID', 'value'])
            writer.writerow([itemID, value])

print('Complete value list creation time:  %s' % formatElapsed(time.time() - startTime))

# Second pass: count the distinct values in every "complete" file.
for fileName in os.listdir(filePathComplete):
    with open(filePathComplete+fileName, 'rb') as completeHandle:
        valueListCount = Counter(row['value'] for row in csv.DictReader(completeHandle))
    uniqueFileName = fileName.replace('Complete', 'Unique')
    with open(filePathUnique+uniqueFileName, 'wb') as uniqueHandle:
        writer = csv.writer(uniqueHandle)
        writer.writerow(['value', 'count'])
        for uniqueValue, count in valueListCount.items():
            # Counts are zero-padded to six digits, as in the original output.
            writer.writerow([uniqueValue, str(count).zfill(6)])

logout = requests.post(baseURL+'/rest/logout', headers=headerAuth, verify=verify)

print('Total script run time:  %s' % formatElapsed(time.time() - startTime))

getRecordsAndValuesForKey.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -69,7 +69,7 @@
6969
for l in range (0, len (metadata)):
7070
if metadata[l]['key'] == 'dc.identifier.uri':
7171
uri = metadata[l]['value']
72-
f.writerow([itemID]+[uri]+[metadataValue])
72+
f.writerow([itemID]+[uri]+[metadataValue.encode('utf8')])
7373

7474
logout = requests.post(baseURL+'/rest/logout', headers=headerAuth, verify=verify)
7575

Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,80 @@
1+
import json
2+
import requests
3+
import secrets
4+
import time
5+
import csv
6+
from datetime import datetime
7+
# Report metadata key:value pairs that occur more than once on the same item,
# across every community returned by the /rest/communities endpoint.
#
# Output: <filePath>DuplicatesRecordsDiffLangTags<timestamp>.csv with one row
# (itemID, key:value) for each repeated occurrence of a pair.
#
# NOTE: this is a Python 2 script (raw_input, byte-mode csv files). The
# single-argument print(...) calls below behave identically as Python 2
# print statements.
# NOTE(review): indentation was reconstructed from a flattened diff view;
# confirm the nesting against the committed file.

# Collection that is always skipped. The reason is not recorded in this
# script -- TODO confirm it still applies to the target repository.
SKIPPED_COLLECTION_ID = 24
# Number of items requested per page from the collection items endpoint.
PAGE_SIZE = 1000
# Seconds to wait before re-requesting a page that did not return HTTP 200.
RETRY_DELAY_SECONDS = 5


def formatElapsed(seconds):
    """Return a duration given in seconds formatted as H:MM:SS."""
    m, s = divmod(seconds, 60)
    h, m = divmod(m, 60)
    return '%d:%02d:%02d' % (h, m, s)


# Optionally replace the default secrets module with one named by the user.
# An empty answer or an unimportable name keeps the default module.
secretsVersion = raw_input('To edit production server, enter the name of the secrets file: ')
if secretsVersion != '':
    try:
        secrets = __import__(secretsVersion)
        print('Editing Production')
    except ImportError:
        print('Editing Stage')
else:
    print('Editing Stage')

baseURL = secrets.baseURL
email = secrets.email
password = secrets.password
filePath = secrets.filePath
verify = secrets.verify

requests.packages.urllib3.disable_warnings()

# Log in; the response body is sent back as the rest-dspace-token header.
startTime = time.time()
data = json.dumps({'email': email, 'password': password})
header = {'content-type': 'application/json', 'accept': 'application/json'}
session = requests.post(baseURL+'/rest/login', headers=header, verify=verify, data=data).content
headerAuth = {'content-type': 'application/json', 'accept': 'application/json', 'rest-dspace-token': session}
print('authenticated')

# Walk every community and collection and collect the item IDs.
itemList = []
endpoint = baseURL+'/rest/communities'
communities = requests.get(endpoint, headers=headerAuth, verify=verify).json()
for community in communities:
    communityID = community['id']
    collections = requests.get(baseURL+'/rest/communities/'+str(communityID)+'/collections', headers=headerAuth, verify=verify).json()
    for collection in collections:
        collectionID = collection['id']
        if collectionID == SKIPPED_COLLECTION_ID:
            continue
        offset = 0
        while True:
            pageURL = baseURL+'/rest/collections/'+str(collectionID)+'/items?limit='+str(PAGE_SIZE)+'&offset='+str(offset)
            response = requests.get(pageURL, headers=headerAuth, verify=verify)
            # Retries indefinitely until the server answers with HTTP 200.
            while response.status_code != 200:
                time.sleep(RETRY_DELAY_SECONDS)
                response = requests.get(pageURL, headers=headerAuth, verify=verify)
            items = response.json()
            if not items:
                # An empty page marks the end of the collection.
                break
            for item in items:
                itemList.append(item['id'])
            offset = offset + PAGE_SIZE
print('Item list creation time:  %s' % formatElapsed(time.time() - startTime))

reportPath = filePath+'DuplicatesRecordsDiffLangTags'+datetime.now().strftime('%Y-%m-%d %H.%M.%S')+'.csv'
# The with-block guarantees the report is flushed and closed even when a
# request fails part-way through the item list.
with open(reportPath, 'wb') as reportHandle:
    writer = csv.writer(reportHandle)
    writer.writerow(['itemID', 'key:value'])
    for number, itemID in enumerate(itemList):
        # key:value pairs already seen on this item; a set keeps the
        # membership test constant-time.
        seenKeyValues = set()
        itemsRemaining = len(itemList) - number
        print('Items remaining:  %s ItemID:  %s' % (itemsRemaining, itemID))
        metadata = requests.get(baseURL+'/rest/items/'+str(itemID)+'/metadata', headers=headerAuth, verify=verify).json()
        for metadataElement in metadata:
            keyValue = metadataElement['key']+':'+metadataElement['value']
            if keyValue in seenKeyValues:
                # The Python 2 csv writer cannot serialise non-ASCII unicode,
                # so encode explicitly (same fix as getRecordsAndValuesForKey.py).
                writer.writerow([itemID, keyValue.encode('utf-8')])
            else:
                seenKeyValues.add(keyValue)

logout = requests.post(baseURL+'/rest/logout', headers=headerAuth, verify=verify)

print('Total script run time:  %s' % formatElapsed(time.time() - startTime))
Lines changed: 81 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,81 @@
1+
import json
2+
import requests
3+
import secrets
4+
import time
5+
import csv
6+
from datetime import datetime
7+
# Report the items of one DSpace community that carry a given metadata key
# more than once.
#
# Output: <filePath>recordsWithDuplicate<key><timestamp>.csv with one row per
# affected item: the item ID followed by the list of values found for the key.
#
# NOTE: this is a Python 2 script (raw_input, byte-mode csv files). The
# single-argument print(...) calls below behave identically as Python 2
# print statements.
# NOTE(review): indentation was reconstructed from a flattened diff view; in
# particular the "more than one value" check is assumed to run once per item,
# after all of its metadata has been scanned -- confirm against the
# committed file.

# Collection that is always skipped. The reason is not recorded in this
# script -- TODO confirm it still applies to the target repository.
SKIPPED_COLLECTION_ID = 24
# Number of items requested per page from the collection items endpoint.
PAGE_SIZE = 1000
# Seconds to wait before re-requesting a page that did not return HTTP 200.
RETRY_DELAY_SECONDS = 5


def formatElapsed(seconds):
    """Return a duration given in seconds formatted as H:MM:SS."""
    m, s = divmod(seconds, 60)
    h, m = divmod(m, 60)
    return '%d:%02d:%02d' % (h, m, s)


# Optionally replace the default secrets module with one named by the user.
# An empty answer or an unimportable name keeps the default module.
secretsVersion = raw_input('To edit production server, enter the name of the secrets file: ')
if secretsVersion != '':
    try:
        secrets = __import__(secretsVersion)
        print('Editing Production')
    except ImportError:
        print('Editing Stage')
else:
    print('Editing Stage')

baseURL = secrets.baseURL
email = secrets.email
password = secrets.password
filePath = secrets.filePath
verify = secrets.verify

requests.packages.urllib3.disable_warnings()

handle = raw_input('Enter community handle: ')
key = raw_input('Enter key: ')

# Log in; the response body is sent back as the rest-dspace-token header.
startTime = time.time()
data = json.dumps({'email': email, 'password': password})
header = {'content-type': 'application/json', 'accept': 'application/json'}
session = requests.post(baseURL+'/rest/login', headers=header, verify=verify, data=data).content
headerAuth = {'content-type': 'application/json', 'accept': 'application/json', 'rest-dspace-token': session}
print('authenticated')

# Resolve the handle to the community record.
itemList = []
endpoint = baseURL+'/rest/handle/'+handle
community = requests.get(endpoint, headers=headerAuth, verify=verify).json()
communityID = community['id']

# Page through every collection of the community and collect the item IDs.
collections = requests.get(baseURL+'/rest/communities/'+str(communityID)+'/collections', headers=headerAuth, verify=verify).json()
for collection in collections:
    collectionID = collection['id']
    if collectionID == SKIPPED_COLLECTION_ID:
        continue
    offset = 0
    while True:
        pageURL = baseURL+'/rest/collections/'+str(collectionID)+'/items?limit='+str(PAGE_SIZE)+'&offset='+str(offset)
        response = requests.get(pageURL, headers=headerAuth, verify=verify)
        # Retries indefinitely until the server answers with HTTP 200.
        while response.status_code != 200:
            time.sleep(RETRY_DELAY_SECONDS)
            response = requests.get(pageURL, headers=headerAuth, verify=verify)
        items = response.json()
        if not items:
            # An empty page marks the end of the collection.
            break
        for item in items:
            itemList.append(item['id'])
        offset = offset + PAGE_SIZE
print('Item list creation time:  %s' % formatElapsed(time.time() - startTime))

reportPath = filePath+'recordsWithDuplicate'+key+datetime.now().strftime('%Y-%m-%d %H.%M.%S')+'.csv'
# The with-block guarantees the report is flushed and closed even when a
# request fails part-way through the item list.
with open(reportPath, 'wb') as reportHandle:
    writer = csv.writer(reportHandle)
    writer.writerow(['itemID', 'key'])
    for number, itemID in enumerate(itemList):
        itemsRemaining = len(itemList) - number
        print('Items remaining:  %s ItemID:  %s' % (itemsRemaining, itemID))
        metadata = requests.get(baseURL+'/rest/items/'+str(itemID)+'/metadata', headers=headerAuth, verify=verify).json()
        # Every value this item holds for the requested key.
        keyValues = [element['value'] for element in metadata if element['key'] == key]
        if len(keyValues) > 1:
            # The list object itself is written as the second cell, so the
            # csv writer stores its string representation (unchanged from
            # the original output format).
            writer.writerow([itemID]+[keyValues])

logout = requests.post(baseURL+'/rest/logout', headers=headerAuth, verify=verify)

print('Total script run time:  %s' % formatElapsed(time.time() - startTime))
Lines changed: 79 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,79 @@
1+
import json
2+
import requests
3+
import secrets
4+
import time
5+
import csv
6+
from datetime import datetime
7+
# Report the items of one DSpace community that have no metadata element
# with a given key.
#
# Output: <filePath>recordsMissing<key><timestamp>.csv with one row
# (itemID, key) per item lacking the key.
#
# NOTE: this is a Python 2 script (raw_input, byte-mode csv files). The
# single-argument print(...) calls below behave identically as Python 2
# print statements.
# NOTE(review): indentation was reconstructed from a flattened diff view; the
# "key is missing" check is assumed to run once per item, after all of its
# metadata has been scanned -- confirm against the committed file.

# Collection that is always skipped. The reason is not recorded in this
# script -- TODO confirm it still applies to the target repository.
SKIPPED_COLLECTION_ID = 24
# Number of items requested per page from the collection items endpoint.
PAGE_SIZE = 1000
# Seconds to wait before re-requesting a page that did not return HTTP 200.
RETRY_DELAY_SECONDS = 5


def formatElapsed(seconds):
    """Return a duration given in seconds formatted as H:MM:SS."""
    m, s = divmod(seconds, 60)
    h, m = divmod(m, 60)
    return '%d:%02d:%02d' % (h, m, s)


# Optionally replace the default secrets module with one named by the user.
# An empty answer or an unimportable name keeps the default module.
secretsVersion = raw_input('To edit production server, enter the name of the secrets file: ')
if secretsVersion != '':
    try:
        secrets = __import__(secretsVersion)
        print('Editing Production')
    except ImportError:
        print('Editing Stage')
else:
    print('Editing Stage')

baseURL = secrets.baseURL
email = secrets.email
password = secrets.password
filePath = secrets.filePath
verify = secrets.verify

requests.packages.urllib3.disable_warnings()

handle = raw_input('Enter community handle: ')
key = raw_input('Enter key: ')

# Log in; the response body is sent back as the rest-dspace-token header.
startTime = time.time()
data = json.dumps({'email': email, 'password': password})
header = {'content-type': 'application/json', 'accept': 'application/json'}
session = requests.post(baseURL+'/rest/login', headers=header, verify=verify, data=data).content
headerAuth = {'content-type': 'application/json', 'accept': 'application/json', 'rest-dspace-token': session}
print('authenticated')

# Resolve the handle to the community record.
itemList = []
endpoint = baseURL+'/rest/handle/'+handle
community = requests.get(endpoint, headers=headerAuth, verify=verify).json()
communityID = community['id']

# Page through every collection of the community and collect the item IDs.
collections = requests.get(baseURL+'/rest/communities/'+str(communityID)+'/collections', headers=headerAuth, verify=verify).json()
for collection in collections:
    collectionID = collection['id']
    if collectionID == SKIPPED_COLLECTION_ID:
        continue
    offset = 0
    while True:
        pageURL = baseURL+'/rest/collections/'+str(collectionID)+'/items?limit='+str(PAGE_SIZE)+'&offset='+str(offset)
        response = requests.get(pageURL, headers=headerAuth, verify=verify)
        # Retries indefinitely until the server answers with HTTP 200.
        while response.status_code != 200:
            time.sleep(RETRY_DELAY_SECONDS)
            response = requests.get(pageURL, headers=headerAuth, verify=verify)
        items = response.json()
        if not items:
            # An empty page marks the end of the collection.
            break
        for item in items:
            itemList.append(item['id'])
        offset = offset + PAGE_SIZE
print('Item list creation time:  %s' % formatElapsed(time.time() - startTime))

reportPath = filePath+'recordsMissing'+key+datetime.now().strftime('%Y-%m-%d %H.%M.%S')+'.csv'
# The with-block guarantees the report is flushed and closed even when a
# request fails part-way through the item list.
with open(reportPath, 'wb') as reportHandle:
    writer = csv.writer(reportHandle)
    writer.writerow(['itemID', 'key'])
    for number, itemID in enumerate(itemList):
        itemsRemaining = len(itemList) - number
        print('Items remaining:  %s ItemID:  %s' % (itemsRemaining, itemID))
        metadata = requests.get(baseURL+'/rest/items/'+str(itemID)+'/metadata', headers=headerAuth, verify=verify).json()
        hasKey = any(element['key'] == key for element in metadata)
        if not hasKey:
            # Write the key as well so every row matches the two-column
            # header; the original rows carried only the item ID.
            writer.writerow([itemID, key])

logout = requests.post(baseURL+'/rest/logout', headers=headerAuth, verify=verify)

print('Total script run time:  %s' % formatElapsed(time.time() - startTime))

0 commit comments

Comments
 (0)