
Commit ce76806

new script added
1 parent 28d835a commit ce76806

2 files changed: +105 -0 lines changed

README.md

Lines changed: 3 additions & 0 deletions
@@ -36,6 +36,9 @@ No collections skipped:
 #### [compareTwoKeysInCommunity.py](compareTwoKeysInCommunity.py)
 Based on user input, extracts the values of two specified keys from a specified community to a CSV file for comparison.
 
+#### [countInitialedNamesByCollection.py](countInitialedNamesByCollection.py)
+Based on [mjanowiecki's](https://github.com/mjanowiecki) [findInitialedNamesByCollection.py](https://github.com/mjanowiecki/dspace-data-collection/blob/master/findInitialedNamesByCollection.py), finds values in name fields that appear to have first initials that could be expanded to full names and provides a count for each collection when the count is more than zero.
+
 #### [exportSelectedRecordMetadataToCSV.py](exportSelectedRecordMetadataToCSV.py)
 Based on a CSV of item handles, extracts all metadata (except 'dc.description.provenance' values) from the selected items to a CSV file.
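
For a quick sense of what the new script flags, the sketch below applies the same three regular expressions it introduces to a few invented name strings; roughly, a name is counted only when it matches the initials pattern and is not excluded by the other two checks (a spelled-out given name after the comma, or parentheses). The sample names are made up for illustration.

import re

# Patterns copied from countInitialedNamesByCollection.py; the names below are invented.
initials = r'(\s|,|[A-Z]|([A-Z]\.))[A-Z](\s|$|\.|,)'
middle_initial = r'((\w{2,},\s)|(\w{2,},))\w[a-z]+'
parentheses = r'\(|\)'

for name in ['Smith, J.', 'Smith, John', 'Smith, John Q.', 'Smith, J. (Jane)']:
    flagged = (re.search(initials, name)
               and not re.search(middle_initial, name)
               and not re.search(parentheses, name))
    print('%s -> %s' % (name, 'counted' if flagged else 'skipped'))
# Only 'Smith, J.' is counted; the others contain a spelled-out given name or parentheses.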

countInitialedNamesByCollection.py

Lines changed: 102 additions & 0 deletions
@@ -0,0 +1,102 @@
import json
import requests
import secrets
import csv
import re
import time
import urllib3

# Choose which secrets file to load: the default stage configuration,
# or a production configuration named at the prompt.
secretsVersion = raw_input('To edit production server, enter the name of the secrets file: ')
if secretsVersion != '':
    try:
        secrets = __import__(secretsVersion)
        print 'Editing Production'
    except ImportError:
        print 'Editing Stage'
else:
    print 'Editing Stage'

urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

baseURL = secrets.baseURL
email = secrets.email
password = secrets.password
filePath = secrets.filePath
verify = secrets.verify

# Authenticate against the DSpace REST API and keep the session cookie.
startTime = time.time()
data = {'email': email, 'password': password}
header = {'content-type': 'application/json', 'accept': 'application/json'}
session = requests.post(baseURL+'/rest/login', headers=header, verify=verify, params=data).cookies['JSESSIONID']
cookies = {'JSESSIONID': session}
headerFileUpload = {'accept': 'application/json'}
cookiesFileUpload = cookies
status = requests.get(baseURL+'/rest/status', headers=header, cookies=cookies, verify=verify).json()
userFullName = status['fullname']
print 'authenticated'

# Gather the UUIDs of all collections in all communities, skipping one excluded collection.
collectionIds = []
endpoint = baseURL+'/rest/communities'
communities = requests.get(endpoint, headers=header, cookies=cookies, verify=verify).json()
for community in communities:
    communityID = community['uuid']
    collections = requests.get(baseURL+'/rest/communities/'+str(communityID)+'/collections', headers=header, cookies=cookies, verify=verify).json()
    for collection in collections:
        collectionID = collection['uuid']
        if collectionID != '45794375-6640-4efe-848e-082e60bae375':
            collectionIds.append(collectionID)

# Name fields to check for initialed values.
names = []
keys = ['dc.contributor.advisor', 'dc.contributor.author', 'dc.contributor.committeeMember', 'dc.contributor.editor', 'dc.contributor.illustrator', 'dc.contributor.other', 'dc.creator']

f = csv.writer(open('initialCountInCollection.csv', 'wb'))
f.writerow(['collectionName']+['handle']+['initialCount'])

for number, collectionID in enumerate(collectionIds):
    initialCount = 0
    collectionsRemaining = len(collectionIds) - number
    print collectionID, 'Collections remaining: ', collectionsRemaining
    collection = requests.get(baseURL+'/rest/collections/'+str(collectionID), headers=header, cookies=cookies, verify=verify).json()
    collectionName = collection['name'].encode('utf-8')
    collectionHandle = collection['handle']
    collSels = '&collSel[]=' + collectionID
    offset = 0
    recordsEdited = 0
    items = ''
    # Page through the items in the collection that have a value in each name field.
    while items != []:
        for key in keys:
            endpoint = baseURL+'/rest/filtered-items?query_field[]='+key+'&query_op[]=exists&query_val[]='+collSels+'&limit=100&offset='+str(offset)
            print endpoint
            response = requests.get(endpoint, headers=header, cookies=cookies, verify=verify).json()
            items = response['items']
            for item in items:
                itemLink = item['link']
                metadata = requests.get(baseURL + itemLink + '/metadata', headers=header, cookies=cookies, verify=verify).json()
                for metadata_element in metadata:
                    if metadata_element['key'] == key:
                        individual_name = metadata_element['value'].encode('utf-8')
                for metadata_element in metadata:
                    if metadata_element['key'] == 'dc.identifier.uri':
                        uri = metadata_element['value']
                # Test the name value against the three patterns and count it
                # when only the initials pattern matches.
                contains_initials = re.search(r'(\s|,|[A-Z]|([A-Z]\.))[A-Z](\s|$|\.|,)', individual_name)
                contains_middleinitial = re.search(r'((\w{2,},\s)|(\w{2,},))\w[a-z]+', individual_name)
                contains_parentheses = re.search(r'\(|\)', individual_name)
                if contains_middleinitial:
                    continue
                elif contains_parentheses:
                    continue
                elif contains_initials:
                    initialCount += 1
                else:
                    continue
        offset = offset + 200
        print offset
    # Write a row only for collections where at least one initialed name was found.
    if initialCount > 0:
        f.writerow([collectionName]+[baseURL+'/'+collectionHandle]+[str(initialCount).zfill(6)])

logout = requests.post(baseURL+'/rest/logout', headers=header, cookies=cookies, verify=verify)

elapsedTime = time.time() - startTime
m, s = divmod(elapsedTime, 60)
h, m = divmod(m, 60)
print 'Total script run time: ', '%d:%02d:%02d' % (h, m, s)
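
The script reads its connection settings from a secrets module (stage by default, or a production file named at the prompt). A minimal sketch of such a file is shown below; the attribute names come from the script, while the values are placeholders and not part of the commit.

# secrets.py - stage configuration (example values only; replace with real settings)
baseURL = 'https://dspace-stage.example.edu'  # assumed DSpace REST base URL
email = 'restadmin@example.edu'               # account used for /rest/login
password = 'changeme'                         # placeholder
filePath = '/path/to/working/directory/'      # imported by the script; placeholder path
verify = False                                # False disables TLS verification; a CA bundle path also works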
