Skip to content

Commit c92cde0

Browse files
authored
Merge pull request #16 from MITLibraries/reconcile-refactor
reconcile refactor
2 parents 097ce53 + c8d52b6 commit c92cde0

File tree

7 files changed

+240
-70
lines changed

7 files changed

+240
-70
lines changed

dsaps/cli.py

Lines changed: 9 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
import click
1010
import structlog
1111

12-
from dsaps import models
12+
from dsaps import models, workflows
1313

1414
logger = structlog.get_logger()
1515

@@ -91,40 +91,17 @@ def newcoll(ctx, comm_handle, coll_name, metadata, file_path, file_type,
9191
@main.command()
9292
@click.option('-m', '--metadata_csv', prompt='Enter the metadata CSV file',
9393
help='The path of the CSV file of metadata.')
94+
@click.option('-o', '--output_path', prompt='Enter the output path',
95+
default='', help='The path of the output files, include '
96+
'/ at the end of the path')
9497
@click.option('-f', '--file_path', prompt='Enter the path',
95-
help='The path of the content, a URL or local drive path.')
98+
help='The path of the content, a URL or local drive path.'
99+
'Include / at the end of a local drive path.')
96100
@click.option('-t', '--file_type', prompt='Enter the file type',
97101
help='The file type to be uploaded.')
98-
def reconcile(metadata_csv, file_path, file_type):
99-
if file_path.startswith('http'):
100-
file_dict = models.build_file_dict_remote(file_path, file_type, {})
101-
else:
102-
files = glob.glob(f'{file_path}/**/*.{file_type}', recursive=True)
103-
for file in files:
104-
file_name = os.path.splitext(os.path.basename(file))[0]
105-
file_dict[file_name] = file
106-
metadata_ids = []
107-
with open(metadata_csv) as csvfile:
108-
reader = csv.DictReader(csvfile)
109-
for row in reader:
110-
value = row['file_identifier']
111-
metadata_ids.append(value)
112-
file_matches = []
113-
file_ids = []
114-
for file_id, v in file_dict.items():
115-
file_ids.append(file_id)
116-
for metadata_id in [m for m in metadata_ids if file_id == m]:
117-
file_matches.append(file_id)
118-
metadata_matches = []
119-
for metadata_id in metadata_ids:
120-
for file_id in file_dict:
121-
if file_id == metadata_id:
122-
metadata_matches.append(metadata_id)
123-
no_files = set(metadata_ids) - set(metadata_matches)
124-
no_metadata = set(file_ids) - set(file_matches)
125-
models.create_csv_from_list(no_metadata, 'no_metadata.csv')
126-
models.create_csv_from_list(no_files, 'no_files.csv')
127-
models.create_csv_from_list(metadata_matches, 'metadata_matches.csv')
102+
def reconcile(metadata_csv, file_path, file_type, output_path):
103+
workflows.reconcile_files_and_metadata(metadata_csv, output_path,
104+
file_path, file_type)
128105

129106

130107
@main.command()

dsaps/models.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -128,11 +128,11 @@ def post_bitstreams_to_item(self, item_id, file_identifier, file_dict,
128128
"""Post a sorted set of bitstreams to a specified item."""
129129
file_dict = collections.OrderedDict(sorted(file_dict.items()))
130130
for bitstream, v in file_dict.items():
131-
bit_id = self.post_bitstream(item_id, file_identifier, file_dict,
132-
ingest_type, bitstream)
131+
bit_id = self.post_bitstream(item_id, file_dict, ingest_type,
132+
bitstream)
133133
yield bit_id
134134

135-
def post_bitstream(self, item_id, file_identifier, file_dict, ingest_type,
135+
def post_bitstream(self, item_id, file_dict, ingest_type,
136136
bitstream):
137137
"""Post a bitstream to a specified item."""
138138
bitstream_path = file_dict[bitstream]

dsaps/workflows.py

Lines changed: 79 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,79 @@
1+
import csv
2+
import glob
3+
import os
4+
5+
from dsaps import models
6+
7+
8+
def create_file_dict(file_path, file_type):
9+
"""Creates a dict of file IDs and file paths."""
10+
if file_path.startswith('http'):
11+
file_dict = models.build_file_dict_remote(file_path, file_type, {})
12+
else:
13+
files = glob.glob(f'{file_path}/**/*.{file_type}', recursive=True)
14+
file_dict = {}
15+
for file in files:
16+
file_name = os.path.splitext(os.path.basename(file))[0]
17+
file_dict[file_name] = file
18+
return file_dict
19+
20+
21+
def create_metadata_id_list(metadata_csv):
22+
"""Creates a list of IDs from a metadata CSV"""
23+
metadata_ids = []
24+
with open(metadata_csv) as csvfile:
25+
reader = csv.DictReader(csvfile)
26+
for row in reader:
27+
value = row['file_identifier']
28+
metadata_ids.append(value)
29+
return metadata_ids
30+
31+
32+
def match_files_to_metadata(file_dict, metadata_ids):
    """Create a list of files matched to metadata records.

    A file ID matches when it starts with a metadata identifier; it is
    appended once per identifier it matches, so a file matching several
    identifiers appears several times.
    """
    file_matches = []
    # Iterate keys only — the original iterated .items() and discarded
    # the values.
    for file_id in file_dict:
        for metadata_id in metadata_ids:
            if file_id.startswith(metadata_id):
                file_matches.append(file_id)
    return file_matches
40+
41+
42+
def match_metadata_to_files(file_dict, metadata_ids):
    """Create a list of metadata records matched to files.

    An identifier is appended once for every file ID that starts with
    it, so identifiers with multiple matching files repeat.
    """
    return [metadata_id
            for metadata_id in metadata_ids
            for file_id in file_dict
            if file_id.startswith(metadata_id)]
50+
51+
52+
def reconcile_files_and_metadata(metadata_csv, output_path, file_path,
                                 file_type):
    """Run a reconciliation of files and metadata.

    Writes three report CSVs under output_path (no_metadata, no_files,
    metadata_matches) and an updated metadata CSV restricted to matched
    records. output_path is prefixed verbatim, so it must end with '/'
    to land in a directory.
    """
    file_dict = create_file_dict(file_path, file_type)
    metadata_ids = create_metadata_id_list(metadata_csv)
    file_matches = match_files_to_metadata(file_dict, metadata_ids)
    metadata_matches = match_metadata_to_files(file_dict, metadata_ids)
    # Identifiers on one side with no counterpart on the other.
    no_files = set(metadata_ids) - set(metadata_matches)
    no_metadata = set(file_dict) - set(file_matches)
    models.create_csv_from_list(no_metadata, f'{output_path}no_metadata')
    models.create_csv_from_list(no_files, f'{output_path}no_files')
    models.create_csv_from_list(metadata_matches,
                                f'{output_path}metadata_matches')
    update_metadata_csv(metadata_csv, output_path, metadata_matches)
67+
68+
69+
def update_metadata_csv(metadata_csv, output_path, metadata_matches):
70+
"""Creates an updated CSV of metadata records with matching files."""
71+
with open(metadata_csv) as csvfile:
72+
reader = csv.DictReader(csvfile)
73+
upd_md_file_name = f'updated-{os.path.basename(metadata_csv)}'
74+
with open(f'{output_path}{upd_md_file_name}', 'w') as updated_csv:
75+
writer = csv.DictWriter(updated_csv, fieldnames=reader.fieldnames)
76+
writer.writeheader()
77+
for row in reader:
78+
if row['file_identifier'] in metadata_matches:
79+
writer.writerow(row)

tests/conftest.py

Lines changed: 33 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,13 @@
1+
import csv
2+
13
from click.testing import CliRunner
24
import pytest
35
import requests_mock
46

57
from dsaps import models
68

79

8-
@pytest.fixture(autouse=True)
10+
@pytest.fixture()
911
def client():
1012
client = models.Client('mock://example.com/')
1113
client.header = {}
@@ -15,7 +17,7 @@ def client():
1517

1618

1719
@pytest.fixture(autouse=True)
18-
def ds_mock():
20+
def web_mock():
1921
with requests_mock.Mocker() as m:
2022
cookies = {'JSESSIONID': '11111111'}
2123
m.post('mock://example.com/login', cookies=cookies)
@@ -37,33 +39,44 @@ def ds_mock():
3739
item_json = {'uuid': 'a1b2', 'handle': '1111.1/1111'}
3840
m.post('mock://example.com/collections/789/items', json=item_json)
3941
b_json_1 = {'uuid': 'c3d4'}
40-
url_1 = 'mock://example.com/items/a1b2/bitstreams?name=123_1.pdf'
42+
url_1 = 'mock://example.com/items/a1b2/bitstreams?name=test_01.pdf'
4143
m.post(url_1, json=b_json_1)
4244
b_json_2 = {'uuid': 'e5f6'}
43-
url_2 = 'mock://example.com/items/a1b2/bitstreams?name=123_2.pdf'
45+
url_2 = 'mock://example.com/items/a1b2/bitstreams?name=test_02.pdf'
4446
m.post(url_2, json=b_json_2)
47+
m.get('mock://remoteserver.com/files/test_01.pdf', content=b'')
4548
yield m
4649

4750

48-
@pytest.fixture(autouse=True)
51+
@pytest.fixture()
4952
def runner():
5053
return CliRunner()
5154

5255

53-
@pytest.fixture(autouse=True)
54-
def sample_content_1(tmp_path):
55-
content = 'test'
56-
dir = tmp_path / 'sub'
57-
dir.mkdir()
58-
sample_content = dir / '123_1.pdf'
59-
sample_content.write_text(content)
60-
return sample_content
56+
@pytest.fixture()
def input_dir(tmp_path):
    """Create an input directory of sample files plus a metadata CSV.

    Layout: files/{test_01.pdf, best_01.pdf, test_01.jpg, metadata.csv}
    and files/more_files/test_02.pdf. Returns the path as a string with
    a trailing slash.
    """
    input_dir = tmp_path / 'files'
    input_dir.mkdir()
    input_2nd_lvl = input_dir / 'more_files'
    input_2nd_lvl.mkdir()
    for sample_file in (input_dir / 'test_01.pdf',
                        input_2nd_lvl / 'test_02.pdf',
                        input_dir / 'best_01.pdf',
                        input_dir / 'test_01.jpg'):
        sample_file.touch()
    with open(f'{input_dir}/metadata.csv', 'w') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(['uri', 'title', 'file_identifier'])
        writer.writerow(['/repo/0/ao/123', 'Test Item', 'test'])
        writer.writerow(['/repo/0/ao/456', 'Tast Item', 'tast'])
    return f'{input_dir}/'
6176

6277

63-
@pytest.fixture(autouse=True)
64-
def sample_content_2(tmp_path):
65-
content = 'test'
66-
dir = tmp_path / 'sub'
67-
sample_content = dir / '123_2.pdf'
68-
sample_content.write_text(content)
69-
return sample_content
78+
@pytest.fixture()
def output_dir(tmp_path):
    """Create an empty output directory, returned with a trailing slash."""
    out = tmp_path / 'output'
    out.mkdir()
    return f'{out}/'

tests/test_cli.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
from dsaps.cli import main
2+
3+
4+
def test_reconcile(runner, input_dir, output_dir):
    """Test reconcile command exits cleanly."""
    cli_args = [
        '--url', 'mock://example.com/',
        '--email', 'test@test.mock',
        '--password', '1234',
        'reconcile',
        '--metadata_csv', f'{input_dir}/metadata.csv',
        # NOTE(review): relative path — presumably resolves against the
        # test cwd, not input_dir; confirm files are actually found.
        '--file_path', 'files',
        '--file_type', 'pdf',
        '--output_path', f'{output_dir}',
    ]
    result = runner.invoke(main, cli_args)
    assert result.exit_code == 0

tests/test_models.py

Lines changed: 27 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -45,35 +45,48 @@ def test_post_coll_to_comm(client):
4545
assert coll_id == '5678'
4646

4747

48-
def test_post_items_to_coll(client, sample_content_1):
48+
def test_post_items_to_coll(client, input_dir):
4949
"""Test post_items_to_coll method."""
5050
coll_metadata = [{"metadata": [
5151
{"key": "file_identifier",
52-
"value": "123"},
52+
"value": "test"},
5353
{"key": "dc.title", "value":
5454
"Monitoring Works: Getting Teachers",
5555
"language": "en_US"},
5656
{"key": "dc.relation.isversionof",
5757
"value": "repo/0/ao/123"}]}]
5858
coll_id = '789'
5959
ingest_type = 'local'
60-
file_dict = {'123': sample_content_1}
60+
file_dict = {'test_01': f'{input_dir}test_01.pdf'}
6161
item_ids = client.post_items_to_coll(coll_id, coll_metadata, file_dict,
6262
ingest_type)
6363
for item_id in item_ids:
6464
assert 'a1b2' == item_id
6565

6666

67-
def test_post_bitstreams_to_item(client, sample_content_1, sample_content_2):
67+
def test_post_bitstreams_to_item(client, input_dir):
6868
"""Test post_bitstreams_to_item method."""
6969
item_id = 'a1b2'
7070
ingest_type = 'local'
7171
file_identifier = '123'
72-
file_dict = {'123': sample_content_1}
72+
file_dict = {'test_02': f'{input_dir}more_files/test_02.pdf',
73+
'test_01': f'{input_dir}test_01.pdf'}
7374
bit_ids = client.post_bitstreams_to_item(item_id, file_identifier,
7475
file_dict, ingest_type)
75-
for bit_id in bit_ids:
76-
assert 'c3d4' == bit_id
76+
assert next(bit_ids) == 'c3d4'
77+
assert next(bit_ids) == 'e5f6'
78+
79+
80+
def test_post_bitstream(client, input_dir):
81+
"""Test post_bitstream method."""
82+
item_id = 'a1b2'
83+
file_dict = {'test_01': f'{input_dir}test_01.pdf'}
84+
bitstream = 'test_01'
85+
bit_id = client.post_bitstream(item_id, file_dict, 'local', bitstream)
86+
assert 'c3d4' == bit_id
87+
file_dict = {'test_01': 'mock://remoteserver.com/files/test_01.pdf'}
88+
bit_id = client.post_bitstream(item_id, file_dict, 'remote', bitstream)
89+
assert 'c3d4' == bit_id
7790

7891

7992
def test__pop_inst(client):
@@ -110,15 +123,14 @@ def test_build_file_dict_remote():
110123
assert '999' in file_list
111124

112125

113-
def test_create_csv_from_list(runner):
126+
def test_create_csv_from_list(output_dir):
114127
"""Test create_csv_from_list function."""
115-
with runner.isolated_filesystem():
116-
list_name = ['123']
117-
models.create_csv_from_list(list_name, 'output')
118-
with open('output.csv') as csvfile:
119-
reader = csv.DictReader(csvfile)
120-
for row in reader:
121-
assert row['id'] == '123'
128+
list_name = ['123']
129+
models.create_csv_from_list(list_name, f'{output_dir}output')
130+
with open(f'{output_dir}output.csv') as csvfile:
131+
reader = csv.DictReader(csvfile)
132+
for row in reader:
133+
assert row['id'] == '123'
122134

123135

124136
def test_metadata_elems_from_row():

0 commit comments

Comments
 (0)