Skip to content

Commit c92cde0

Browse files
authored
Merge pull request #16 from MITLibraries/reconcile-refactor
reconcile refactor
2 parents 097ce53 + c8d52b6 commit c92cde0

File tree

7 files changed

+240
-70
lines changed

7 files changed

+240
-70
lines changed

dsaps/cli.py

Lines changed: 9 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
import click
1010
import structlog
1111

12-
from dsaps import models
12+
from dsaps import models, workflows
1313

1414
logger = structlog.get_logger()
1515

@@ -91,40 +91,17 @@ def newcoll(ctx, comm_handle, coll_name, metadata, file_path, file_type,
9191
@main.command()
9292
@click.option('-m', '--metadata_csv', prompt='Enter the metadata CSV file',
9393
help='The path of the CSV file of metadata.')
94+
@click.option('-o', '--output_path', prompt='Enter the output path',
95+
default='', help='The path of the output files, include '
96+
'/ at the end of the path')
9497
@click.option('-f', '--file_path', prompt='Enter the path',
95-
help='The path of the content, a URL or local drive path.')
98+
help='The path of the content, a URL or local drive path.'
99+
'Include / at the end of a local drive path.')
96100
@click.option('-t', '--file_type', prompt='Enter the file type',
97101
help='The file type to be uploaded.')
98-
def reconcile(metadata_csv, file_path, file_type):
99-
if file_path.startswith('http'):
100-
file_dict = models.build_file_dict_remote(file_path, file_type, {})
101-
else:
102-
files = glob.glob(f'{file_path}/**/*.{file_type}', recursive=True)
103-
for file in files:
104-
file_name = os.path.splitext(os.path.basename(file))[0]
105-
file_dict[file_name] = file
106-
metadata_ids = []
107-
with open(metadata_csv) as csvfile:
108-
reader = csv.DictReader(csvfile)
109-
for row in reader:
110-
value = row['file_identifier']
111-
metadata_ids.append(value)
112-
file_matches = []
113-
file_ids = []
114-
for file_id, v in file_dict.items():
115-
file_ids.append(file_id)
116-
for metadata_id in [m for m in metadata_ids if file_id == m]:
117-
file_matches.append(file_id)
118-
metadata_matches = []
119-
for metadata_id in metadata_ids:
120-
for file_id in file_dict:
121-
if file_id == metadata_id:
122-
metadata_matches.append(metadata_id)
123-
no_files = set(metadata_ids) - set(metadata_matches)
124-
no_metadata = set(file_ids) - set(file_matches)
125-
models.create_csv_from_list(no_metadata, 'no_metadata.csv')
126-
models.create_csv_from_list(no_files, 'no_files.csv')
127-
models.create_csv_from_list(metadata_matches, 'metadata_matches.csv')
102+
def reconcile(metadata_csv, file_path, file_type, output_path):
103+
workflows.reconcile_files_and_metadata(metadata_csv, output_path,
104+
file_path, file_type)
128105

129106

130107
@main.command()

dsaps/models.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -128,11 +128,11 @@ def post_bitstreams_to_item(self, item_id, file_identifier, file_dict,
128128
"""Post a sorted set of bitstreams to a specified item."""
129129
file_dict = collections.OrderedDict(sorted(file_dict.items()))
130130
for bitstream, v in file_dict.items():
131-
bit_id = self.post_bitstream(item_id, file_identifier, file_dict,
132-
ingest_type, bitstream)
131+
bit_id = self.post_bitstream(item_id, file_dict, ingest_type,
132+
bitstream)
133133
yield bit_id
134134

135-
def post_bitstream(self, item_id, file_identifier, file_dict, ingest_type,
135+
def post_bitstream(self, item_id, file_dict, ingest_type,
136136
bitstream):
137137
"""Post a bitstream to a specified item."""
138138
bitstream_path = file_dict[bitstream]

dsaps/workflows.py

Lines changed: 79 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,79 @@
1+
import csv
2+
import glob
3+
import os
4+
5+
from dsaps import models
6+
7+
8+
def create_file_dict(file_path, file_type):
9+
"""Creates a dict of file IDs and file paths."""
10+
if file_path.startswith('http'):
11+
file_dict = models.build_file_dict_remote(file_path, file_type, {})
12+
else:
13+
files = glob.glob(f'{file_path}/**/*.{file_type}', recursive=True)
14+
file_dict = {}
15+
for file in files:
16+
file_name = os.path.splitext(os.path.basename(file))[0]
17+
file_dict[file_name] = file
18+
return file_dict
19+
20+
21+
def create_metadata_id_list(metadata_csv):
22+
"""Creates a list of IDs from a metadata CSV"""
23+
metadata_ids = []
24+
with open(metadata_csv) as csvfile:
25+
reader = csv.DictReader(csvfile)
26+
for row in reader:
27+
value = row['file_identifier']
28+
metadata_ids.append(value)
29+
return metadata_ids
30+
31+
32+
def match_files_to_metadata(file_dict, metadata_ids):
    """Create a list of files matched to metadata records.

    A file ID matches when it starts with a metadata identifier; it is
    appended once per identifier it matches, so a file matching several
    identifiers appears several times.
    """
    file_matches = []
    # Iterate keys only — the original iterated .items() and discarded
    # the values.
    for file_id in file_dict:
        for metadata_id in metadata_ids:
            if file_id.startswith(metadata_id):
                file_matches.append(file_id)
    return file_matches
40+
41+
42+
def match_metadata_to_files(file_dict, metadata_ids):
    """Create a list of metadata records matched to files.

    An identifier is appended once for every file ID that starts with
    it, so identifiers with multiple matching files repeat.
    """
    return [metadata_id
            for metadata_id in metadata_ids
            for file_id in file_dict
            if file_id.startswith(metadata_id)]
50+
51+
52+
def reconcile_files_and_metadata(metadata_csv, output_path, file_path,
                                 file_type):
    """Run a reconciliation of files and metadata.

    Writes three report CSVs under output_path (no_metadata, no_files,
    metadata_matches) and an updated metadata CSV restricted to matched
    records. output_path is prefixed verbatim, so it must end with '/'
    to land in a directory.
    """
    file_dict = create_file_dict(file_path, file_type)
    metadata_ids = create_metadata_id_list(metadata_csv)
    file_matches = match_files_to_metadata(file_dict, metadata_ids)
    metadata_matches = match_metadata_to_files(file_dict, metadata_ids)
    # Identifiers on one side with no counterpart on the other.
    no_files = set(metadata_ids) - set(metadata_matches)
    no_metadata = set(file_dict) - set(file_matches)
    models.create_csv_from_list(no_metadata, f'{output_path}no_metadata')
    models.create_csv_from_list(no_files, f'{output_path}no_files')
    models.create_csv_from_list(metadata_matches,
                                f'{output_path}metadata_matches')
    update_metadata_csv(metadata_csv, output_path, metadata_matches)
67+
68+
69+
def update_metadata_csv(metadata_csv, output_path, metadata_matches):
70+
"""Creates an updated CSV of metadata records with matching files."""
71+
with open(metadata_csv) as csvfile:
72+
reader = csv.DictReader(csvfile)
73+
upd_md_file_name = f'updated-{os.path.basename(metadata_csv)}'
74+
with open(f'{output_path}{upd_md_file_name}', 'w') as updated_csv:
75+
writer = csv.DictWriter(updated_csv, fieldnames=reader.fieldnames)
76+
writer.writeheader()
77+
for row in reader:
78+
if row['file_identifier'] in metadata_matches:
79+
writer.writerow(row)

tests/conftest.py

Lines changed: 33 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,13 @@
1+
import csv
2+
13
from click.testing import CliRunner
24
import pytest
35
import requests_mock
46

57
from dsaps import models
68

79

8-
@pytest.fixture(autouse=True)
10+
@pytest.fixture()
911
def client():
1012
client = models.Client('mock://example.com/')
1113
client.header = {}
@@ -15,7 +17,7 @@ def client():
1517

1618

1719
@pytest.fixture(autouse=True)
18-
def ds_mock():
20+
def web_mock():
1921
with requests_mock.Mocker() as m:
2022
cookies = {'JSESSIONID': '11111111'}
2123
m.post('mock://example.com/login', cookies=cookies)
@@ -37,33 +39,44 @@ def ds_mock():
3739
item_json = {'uuid': 'a1b2', 'handle': '1111.1/1111'}
3840
m.post('mock://example.com/collections/789/items', json=item_json)
3941
b_json_1 = {'uuid': 'c3d4'}
40-
url_1 = 'mock://example.com/items/a1b2/bitstreams?name=123_1.pdf'
42+
url_1 = 'mock://example.com/items/a1b2/bitstreams?name=test_01.pdf'
4143
m.post(url_1, json=b_json_1)
4244
b_json_2 = {'uuid': 'e5f6'}
43-
url_2 = 'mock://example.com/items/a1b2/bitstreams?name=123_2.pdf'
45+
url_2 = 'mock://example.com/items/a1b2/bitstreams?name=test_02.pdf'
4446
m.post(url_2, json=b_json_2)
47+
m.get('mock://remoteserver.com/files/test_01.pdf', content=b'')
4548
yield m
4649

4750

48-
@pytest.fixture(autouse=True)
51+
@pytest.fixture()
4952
def runner():
5053
return CliRunner()
5154

5255

53-
@pytest.fixture(autouse=True)
54-
def sample_content_1(tmp_path):
55-
content = 'test'
56-
dir = tmp_path / 'sub'
57-
dir.mkdir()
58-
sample_content = dir / '123_1.pdf'
59-
sample_content.write_text(content)
60-
return sample_content
56+
@pytest.fixture()
def input_dir(tmp_path):
    """Create an input directory of sample files plus a metadata CSV.

    Layout: files/{test_01.pdf, best_01.pdf, test_01.jpg, metadata.csv}
    and files/more_files/test_02.pdf. Returns the path as a string with
    a trailing slash.
    """
    input_dir = tmp_path / 'files'
    input_dir.mkdir()
    input_2nd_lvl = input_dir / 'more_files'
    input_2nd_lvl.mkdir()
    for sample_file in (input_dir / 'test_01.pdf',
                        input_2nd_lvl / 'test_02.pdf',
                        input_dir / 'best_01.pdf',
                        input_dir / 'test_01.jpg'):
        sample_file.touch()
    with open(f'{input_dir}/metadata.csv', 'w') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(['uri', 'title', 'file_identifier'])
        writer.writerow(['/repo/0/ao/123', 'Test Item', 'test'])
        writer.writerow(['/repo/0/ao/456', 'Tast Item', 'tast'])
    return f'{input_dir}/'
6176

6277

63-
@pytest.fixture(autouse=True)
64-
def sample_content_2(tmp_path):
65-
content = 'test'
66-
dir = tmp_path / 'sub'
67-
sample_content = dir / '123_2.pdf'
68-
sample_content.write_text(content)
69-
return sample_content
78+
@pytest.fixture()
def output_dir(tmp_path):
    """Create an empty output directory, returned with a trailing slash."""
    out = tmp_path / 'output'
    out.mkdir()
    return f'{out}/'

tests/test_cli.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
from dsaps.cli import main
2+
3+
4+
def test_reconcile(runner, input_dir, output_dir):
    """Test reconcile command exits cleanly."""
    cli_args = [
        '--url', 'mock://example.com/',
        '--email', 'test@test.mock',
        '--password', '1234',
        'reconcile',
        '--metadata_csv', f'{input_dir}/metadata.csv',
        # NOTE(review): relative path — presumably resolves against the
        # test cwd, not input_dir; confirm files are actually found.
        '--file_path', 'files',
        '--file_type', 'pdf',
        '--output_path', f'{output_dir}',
    ]
    result = runner.invoke(main, cli_args)
    assert result.exit_code == 0

tests/test_models.py

Lines changed: 27 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -45,35 +45,48 @@ def test_post_coll_to_comm(client):
4545
assert coll_id == '5678'
4646

4747

48-
def test_post_items_to_coll(client, sample_content_1):
48+
def test_post_items_to_coll(client, input_dir):
4949
"""Test post_items_to_coll method."""
5050
coll_metadata = [{"metadata": [
5151
{"key": "file_identifier",
52-
"value": "123"},
52+
"value": "test"},
5353
{"key": "dc.title", "value":
5454
"Monitoring Works: Getting Teachers",
5555
"language": "en_US"},
5656
{"key": "dc.relation.isversionof",
5757
"value": "repo/0/ao/123"}]}]
5858
coll_id = '789'
5959
ingest_type = 'local'
60-
file_dict = {'123': sample_content_1}
60+
file_dict = {'test_01': f'{input_dir}test_01.pdf'}
6161
item_ids = client.post_items_to_coll(coll_id, coll_metadata, file_dict,
6262
ingest_type)
6363
for item_id in item_ids:
6464
assert 'a1b2' == item_id
6565

6666

67-
def test_post_bitstreams_to_item(client, sample_content_1, sample_content_2):
67+
def test_post_bitstreams_to_item(client, input_dir):
6868
"""Test post_bitstreams_to_item method."""
6969
item_id = 'a1b2'
7070
ingest_type = 'local'
7171
file_identifier = '123'
72-
file_dict = {'123': sample_content_1}
72+
file_dict = {'test_02': f'{input_dir}more_files/test_02.pdf',
73+
'test_01': f'{input_dir}test_01.pdf'}
7374
bit_ids = client.post_bitstreams_to_item(item_id, file_identifier,
7475
file_dict, ingest_type)
75-
for bit_id in bit_ids:
76-
assert 'c3d4' == bit_id
76+
assert next(bit_ids) == 'c3d4'
77+
assert next(bit_ids) == 'e5f6'
78+
79+
80+
def test_post_bitstream(client, input_dir):
81+
"""Test post_bitstream method."""
82+
item_id = 'a1b2'
83+
file_dict = {'test_01': f'{input_dir}test_01.pdf'}
84+
bitstream = 'test_01'
85+
bit_id = client.post_bitstream(item_id, file_dict, 'local', bitstream)
86+
assert 'c3d4' == bit_id
87+
file_dict = {'test_01': 'mock://remoteserver.com/files/test_01.pdf'}
88+
bit_id = client.post_bitstream(item_id, file_dict, 'remote', bitstream)
89+
assert 'c3d4' == bit_id
7790

7891

7992
def test__pop_inst(client):
@@ -110,15 +123,14 @@ def test_build_file_dict_remote():
110123
assert '999' in file_list
111124

112125

113-
def test_create_csv_from_list(runner):
126+
def test_create_csv_from_list(output_dir):
114127
"""Test create_csv_from_list function."""
115-
with runner.isolated_filesystem():
116-
list_name = ['123']
117-
models.create_csv_from_list(list_name, 'output')
118-
with open('output.csv') as csvfile:
119-
reader = csv.DictReader(csvfile)
120-
for row in reader:
121-
assert row['id'] == '123'
128+
list_name = ['123']
129+
models.create_csv_from_list(list_name, f'{output_dir}output')
130+
with open(f'{output_dir}output.csv') as csvfile:
131+
reader = csv.DictReader(csvfile)
132+
for row in reader:
133+
assert row['id'] == '123'
122134

123135

124136
def test_metadata_elems_from_row():

0 commit comments

Comments
 (0)