Commit 4c1d830

reconcile refactor

1 parent 05c977e · commit 4c1d830

6 files changed: 231 additions, 54 deletions


dsaps/cli.py

Lines changed: 2 additions & 30 deletions

@@ -9,7 +9,7 @@
 import click
 import structlog

-from dsaps import models
+from dsaps import models, workflows

 logger = structlog.get_logger()

@@ -96,35 +96,7 @@ def newcoll(ctx, comm_handle, coll_name, metadata, file_path, file_type,
 @click.option('-t', '--file_type', prompt='Enter the file type',
               help='The file type to be uploaded.')
 def reconcile(metadata_csv, file_path, file_type):
-    if file_path.startswith('http'):
-        file_dict = models.build_file_dict_remote(file_path, file_type, {})
-    else:
-        files = glob.glob(f'{file_path}/**/*.{file_type}', recursive=True)
-        for file in files:
-            file_name = os.path.splitext(os.path.basename(file))[0]
-            file_dict[file_name] = file
-    metadata_ids = []
-    with open(metadata_csv) as csvfile:
-        reader = csv.DictReader(csvfile)
-        for row in reader:
-            value = row['file_identifier']
-            metadata_ids.append(value)
-    file_matches = []
-    file_ids = []
-    for file_id, v in file_dict.items():
-        file_ids.append(file_id)
-        for metadata_id in [m for m in metadata_ids if file_id == m]:
-            file_matches.append(file_id)
-    metadata_matches = []
-    for metadata_id in metadata_ids:
-        for file_id in file_dict:
-            if file_id == metadata_id:
-                metadata_matches.append(metadata_id)
-    no_files = set(metadata_ids) - set(metadata_matches)
-    no_metadata = set(file_ids) - set(file_matches)
-    models.create_csv_from_list(no_metadata, 'no_metadata.csv')
-    models.create_csv_from_list(no_files, 'no_files.csv')
-    models.create_csv_from_list(metadata_matches, 'metadata_matches.csv')
+    workflows.reconcile_files_and_metadata(metadata_csv, file_path, file_type)


 @main.command()

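With this change the reconcile command body shrinks to a single call into the new workflows module. For reference, a minimal sketch of the resulting thin-wrapper pattern is below; only the -t/--file_type option is visible in this diff, so the other option declarations are assumed from the long flags exercised in tests/test_cli.py later in this commit, and in the real CLI the command is presumably registered on the main group via @main.command() rather than @click.command().

import click

from dsaps import workflows


@click.command()
@click.option('--metadata_csv', prompt='Enter the metadata CSV path')  # assumed
@click.option('--file_path', prompt='Enter the path of the files')  # assumed
@click.option('-t', '--file_type', prompt='Enter the file type',
              help='The file type to be uploaded.')
def reconcile(metadata_csv, file_path, file_type):
    workflows.reconcile_files_and_metadata(metadata_csv, file_path, file_type)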
dsaps/workflows.py

Lines changed: 79 additions & 0 deletions

@@ -0,0 +1,79 @@
+import csv
+import glob
+import os
+
+from dsaps import models
+
+
+def create_file_dict_and_list(file_path, file_type):
+    """Creates a dict of file IDs and file paths and a list of file IDs."""
+    if file_path.startswith('http'):
+        file_dict = models.build_file_dict_remote(file_path, file_type, {})
+    else:
+        files = glob.glob(f'{file_path}/**/*.{file_type}', recursive=True)
+        file_dict = {}
+        file_ids = []
+        for file in files:
+            file_name = os.path.splitext(os.path.basename(file))[0]
+            file_dict[file_name] = file
+            file_ids.append(file_name)
+    return file_dict, file_ids
+
+
+def create_metadata_id_list(metadata_csv):
+    """Creates a list of IDs from a metadata CSV"""
+    metadata_ids = []
+    with open(metadata_csv) as csvfile:
+        reader = csv.DictReader(csvfile)
+        for row in reader:
+            value = row['file_identifier']
+            metadata_ids.append(value)
+    return metadata_ids
+
+
+def match_files_to_metadata(file_dict, file_ids, metadata_ids):
+    """Creates a list of files matched to metadata records."""
+    file_matches = []
+    for file_id, v in file_dict.items():
+        for metadata_id in [m for m in metadata_ids
+                            if file_id.startswith(m)]:
+            file_matches.append(file_id)
+    return file_matches
+
+
+def match_metadata_to_files(file_dict, metadata_ids):
+    """Creates a list of metadata records matched to files."""
+    metadata_matches = []
+    for metadata_id in metadata_ids:
+        for file_id in file_dict:
+            if file_id.startswith(metadata_id):
+                metadata_matches.append(metadata_id)
+    return metadata_matches
+
+
+def reconcile_files_and_metadata(metadata_csv, file_path, file_type):
+    """Runs a reconciliation of files and metadata."""
+    file_dict, file_ids = create_file_dict_and_list(file_path, file_type)
+    metadata_ids = create_metadata_id_list(metadata_csv)
+    metadata_matches = match_metadata_to_files(file_dict, metadata_ids)
+    file_matches = match_files_to_metadata(file_dict, file_ids, metadata_ids)
+    no_files = set(metadata_ids) - set(metadata_matches)
+    no_metadata = set(file_ids) - set(file_matches)
+    models.create_csv_from_list(no_metadata, 'no_metadata')
+    models.create_csv_from_list(no_files, 'no_files')
+    models.create_csv_from_list(metadata_matches, 'metadata_matches')
+    update_metadata_csv(metadata_csv, metadata_matches)
+
+
+def update_metadata_csv(metadata_csv, metadata_matches):
+    """Creates an updated CSV of metadata records with matching files."""
+    with open(metadata_csv) as csvfile:
+        reader = csv.DictReader(csvfile)
+        upd_md_file_name = f'updated-{os.path.basename(metadata_csv)}'
+        with open(f'{upd_md_file_name}', 'w') as updated_csv:
+            writer = csv.DictWriter(updated_csv, fieldnames=reader.fieldnames)
+            writer.writeheader()
+            csvfile.seek(0)
+            for row in reader:
+                if row['file_identifier'] in metadata_matches:
+                    writer.writerow(row)

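A minimal usage sketch of the new entry point, assuming a local layout like the fixture introduced in tests/conftest.py below (a files/ directory of PDFs plus a metadata.csv with a file_identifier column). Note that the extracted helpers match on prefix (file_id.startswith(metadata_id)), whereas the old CLI code required exact equality, so a metadata ID of 'test' now claims files named test_01, test_02, and so on.

from dsaps import workflows

# Per the tests added in this commit, this writes no_files.csv, no_metadata.csv,
# metadata_matches.csv and updated-metadata.csv into the working directory.
workflows.reconcile_files_and_metadata('metadata.csv', 'files', 'pdf')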
tests/conftest.py

Lines changed: 19 additions & 18 deletions

@@ -1,3 +1,5 @@
+import csv
+
 from click.testing import CliRunner
 import pytest
 import requests_mock

@@ -37,10 +39,10 @@ def ds_mock():
         item_json = {'uuid': 'a1b2', 'handle': '1111.1/1111'}
         m.post('mock://example.com/collections/789/items', json=item_json)
         b_json_1 = {'uuid': 'c3d4'}
-        url_1 = 'mock://example.com/items/a1b2/bitstreams?name=123_1.pdf'
+        url_1 = 'mock://example.com/items/a1b2/bitstreams?name=test_01.pdf'
         m.post(url_1, json=b_json_1)
         b_json_2 = {'uuid': 'e5f6'}
-        url_2 = 'mock://example.com/items/a1b2/bitstreams?name=123_2.pdf'
+        url_2 = 'mock://example.com/items/a1b2/bitstreams?name=test_02.pdf'
         m.post(url_2, json=b_json_2)
         yield m

@@ -51,19 +53,18 @@ def runner():


 @pytest.fixture(autouse=True)
-def sample_content_1(tmp_path):
-    content = 'test'
-    dir = tmp_path / 'sub'
-    dir.mkdir()
-    sample_content = dir / '123_1.pdf'
-    sample_content.write_text(content)
-    return sample_content
-
-
-@pytest.fixture(autouse=True)
-def sample_content_2(tmp_path):
-    content = 'test'
-    dir = tmp_path / 'sub'
-    sample_content = dir / '123_2.pdf'
-    sample_content.write_text(content)
-    return sample_content
+def sample_files_dir(tmp_path):
+    sample_files_dir = tmp_path / 'files'
+    sample_files_dir.mkdir()
+    with open(f'{sample_files_dir}/test_01.pdf', 'w'):
+        pass
+    with open(f'{sample_files_dir}/test_02.pdf', 'w'):
+        pass
+    with open(f'{sample_files_dir}/best_01.pdf', 'w'):
+        pass
+    with open(f'{sample_files_dir}/metadata.csv', 'w') as csvfile:
+        writer = csv.writer(csvfile)
+        writer.writerow(['uri'] + ['title'] + ['file_identifier'])
+        writer.writerow(['/repo/0/ao/123'] + ['Test Item'] + ['test'])
+        writer.writerow(['/repo/0/ao/456'] + ['Tast Item'] + ['tast'])
+    return str(sample_files_dir)

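The new sample_files_dir fixture replaces the two single-file fixtures with one directory holding test_01.pdf, test_02.pdf, best_01.pdf and a metadata.csv, and it returns the directory path as a string. A hypothetical test sketch showing how a test consumes it (the test name and assertions here are illustrative; tests/test_workflows.py below uses the same pattern):

def test_uses_sample_files_dir(sample_files_dir):
    # The fixture returns str(tmp_path / 'files'), so paths are built with f-strings.
    with open(f'{sample_files_dir}/metadata.csv') as csvfile:
        header = csvfile.readline().strip().split(',')
    assert header == ['uri', 'title', 'file_identifier']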
tests/test_cli.py

Lines changed: 30 additions & 0 deletions

@@ -0,0 +1,30 @@
+import csv
+import os
+import requests_mock
+
+from dsaps.cli import main
+
+
+def test_reconcile(runner):
+    """Test reconcile command."""
+    with requests_mock.Mocker() as m:
+        with runner.isolated_filesystem():
+            os.mkdir('files')
+            with open('metadata.csv', 'w') as csvfile:
+                writer = csv.writer(csvfile)
+                writer.writerow(['uri'] + ['title'] + ['file_identifier'])
+                writer.writerow(['/repo/0/ao/123'] + ['Test Item'] + ['test'])
+            cookies = {'JSESSIONID': '11111111'}
+            user_json = {'fullname': 'User Name'}
+            m.post('mock://example.com/login', cookies=cookies)
+            m.get('mock://example.com/status', json=user_json)
+            result = runner.invoke(main,
+                                   ['--url', 'mock://example.com/',
+                                    '--email', 'test@test.mock',
+                                    '--password', '1234',
+                                    'reconcile',
+                                    '--metadata_csv', 'metadata.csv',
+                                    '--file_path', 'files',
+                                    '--file_type', 'pdf'
+                                    ])
+            assert result.exit_code == 0

tests/test_models.py

Lines changed: 22 additions & 6 deletions

@@ -45,35 +45,51 @@ def test_post_coll_to_comm(client):
     assert coll_id == '5678'


-def test_post_items_to_coll(client, sample_content_1):
+def test_post_items_to_coll(client, sample_files_dir):
     """Test post_items_to_coll method."""
     coll_metadata = [{"metadata": [
                      {"key": "file_identifier",
-                      "value": "123"},
+                      "value": "test"},
                      {"key": "dc.title", "value":
                       "Monitoring Works: Getting Teachers",
                       "language": "en_US"},
                      {"key": "dc.relation.isversionof",
                       "value": "repo/0/ao/123"}]}]
     coll_id = '789'
     ingest_type = 'local'
-    file_dict = {'123': sample_content_1}
+    file_dict = {'test_01': f'{sample_files_dir}/test_01.pdf'}
     item_ids = client.post_items_to_coll(coll_id, coll_metadata, file_dict,
                                          ingest_type)
     for item_id in item_ids:
         assert 'a1b2' == item_id


-def test_post_bitstreams_to_item(client, sample_content_1, sample_content_2):
+def test_post_bitstreams_to_item(client, sample_files_dir):
     """Test post_bitstreams_to_item method."""
     item_id = 'a1b2'
     ingest_type = 'local'
     file_identifier = '123'
-    file_dict = {'123': sample_content_1}
+    file_dict = {'test_02': f'{sample_files_dir}/test_02.pdf',
+                 'test_01': f'{sample_files_dir}/test_01.pdf'}
     bit_ids = client.post_bitstreams_to_item(item_id, file_identifier,
                                              file_dict, ingest_type)
+    bit_ids_output = []
     for bit_id in bit_ids:
-        assert 'c3d4' == bit_id
+        bit_ids_output.append(bit_id)
+    assert bit_ids_output[0] == 'c3d4'
+    assert bit_ids_output[1] == 'e5f6'
+
+
+def test_post_bitstream(client, sample_files_dir):
+    """Test post_bitstream method."""
+    item_id = 'a1b2'
+    ingest_type = 'local'
+    file_identifier = '123'
+    file_dict = {'test_01': f'{sample_files_dir}/test_01.pdf'}
+    bitstream = 'test_01'
+    bit_id = client.post_bitstream(item_id, file_identifier, file_dict,
+                                   ingest_type, bitstream)
+    assert 'c3d4' == bit_id


 def test__pop_inst(client):

tests/test_workflows.py

Lines changed: 79 additions & 0 deletions

@@ -0,0 +1,79 @@
+import csv
+
+from dsaps import workflows
+
+
+def test_create_file_dict_and_id_list(runner, sample_files_dir):
+    """Test create_file_dict_and_id_list function."""
+    file_path = sample_files_dir
+    file_dict, file_ids = workflows.create_file_dict_and_list(sample_files_dir,
+                                                              'pdf')
+    assert file_dict['test_02'] == f'{file_path}/test_02.pdf'
+    assert file_dict['test_01'] == f'{file_path}/test_01.pdf'
+    assert file_dict['best_01'] == f'{file_path}/best_01.pdf'
+    for id in ['test_02', 'test_01', 'best_01']:
+        assert id in file_ids
+
+
+def test_create_metadata_id_list(runner, sample_files_dir):
+    """Test create_metadata_id_list function."""
+    metadata_path = f'{sample_files_dir}/metadata.csv'
+    metadata_ids = workflows.create_metadata_id_list(metadata_path)
+    assert 'test' in metadata_ids
+
+
+def test_match_files_to_metadata():
+    """Test match_files_to_metadata function."""
+    file_dict = {'test_01': 'files/test_01.pdf'}
+    file_ids = ['test_01']
+    metadata_ids = ['test', 'tast']
+    file_matches = workflows.match_files_to_metadata(file_dict, file_ids,
+                                                     metadata_ids)
+    assert len(file_matches) == 1
+    assert 'test_01' in file_matches
+
+
+def test_match_metadata_to_files():
+    """Test match_metadata_to_files function."""
+    file_dict = {'test_01': 'files/test_01.pdf',
+                 'tast_01': 'files/tast_01.pdf'}
+    metadata_ids = ['test']
+    file_matches = workflows.match_metadata_to_files(file_dict, metadata_ids)
+    assert len(file_matches) == 1
+    assert 'test' in file_matches
+
+
+def test_reconcile_files_and_metadata(runner, sample_files_dir):
+    """Test reconcile function."""
+    with runner.isolated_filesystem():
+        metadata_path = f'{sample_files_dir}/metadata.csv'
+        workflows.reconcile_files_and_metadata(metadata_path, sample_files_dir,
+                                               'pdf')
+        with open('updated-metadata.csv') as csvfile2:
+            reader = csv.DictReader(csvfile2)
+            for row in reader:
+                assert row['uri'] == '/repo/0/ao/123'
+                assert row['title'] == 'Test Item'
+                assert row['file_identifier'] == 'test'
+        with open('no_metadata.csv') as csvfile3:
+            reader = csv.DictReader(csvfile3)
+            for row in reader:
+                assert row['id'] == 'best_01'
+        with open('no_files.csv') as csvfile4:
+            reader = csv.DictReader(csvfile4)
+            for row in reader:
+                assert row['id'] == 'tast'
+
+
+def test_update_metadata_csv(runner, sample_files_dir):
+    """Test update_metadata_csv function."""
+    with runner.isolated_filesystem():
+        metadata_matches = ['test']
+        workflows.update_metadata_csv(f'{sample_files_dir}/metadata.csv',
+                                      metadata_matches)
+        with open('updated-metadata.csv') as csvfile2:
+            reader = csv.DictReader(csvfile2)
+            for row in reader:
+                assert row['uri'] == '/repo/0/ao/123'
+                assert row['title'] == 'Test Item'
+                assert row['file_identifier'] == 'test'

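To make the expected CSV contents in test_reconcile_files_and_metadata concrete, here is the reconciliation worked out by hand for the fixture data; the values follow directly from the assertions above.

# Fixture inputs: file IDs from files/*.pdf, metadata IDs from metadata.csv.
file_ids = ['test_01', 'test_02', 'best_01']
metadata_ids = ['test', 'tast']

# Prefix matching as implemented in workflows.py:
file_matches = ['test_01', 'test_02']     # both file IDs start with 'test'
metadata_matches = ['test', 'test']       # appended once per matching file

no_files = set(metadata_ids) - set(metadata_matches)   # {'tast'} -> no_files.csv
no_metadata = set(file_ids) - set(file_matches)        # {'best_01'} -> no_metadata.csv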