2 changes: 1 addition & 1 deletion apply_labels.py
@@ -122,7 +122,7 @@ def apply_labels(
print("Reading csv...")
new_df = pd.read_csv(new_export_path)
name_to_file_id = {row["file_name"]: row["file_id"] for _, row in new_df.iterrows()}
client = IndicoClient(config=IndicoConfig(host=host, api_token_path=api_token_path))
client = IndicoClient(config=IndicoConfig(host=host, api_token_path=api_token_path,verify_ssl=False))
print("Getting dataset details...")
dataset = client.call(GetDataset(id=new_dataset_id))
print("Loading revised labels...")
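The one-line change above passes verify_ssl=False to IndicoConfig, which turns off TLS certificate verification in the Indico client. That is usually what you want against a self-hosted instance fronted by a self-signed certificate, but it also removes protection against intercepted connections, so it should stay out of code that talks to public hosts. A minimal sketch of the pattern, with a placeholder host and token path:

from indico import IndicoClient, IndicoConfig

# Placeholder host and token path; verify_ssl=False skips certificate
# checks, so limit this to trusted internal deployments.
config = IndicoConfig(
    host="indico.example.internal",
    api_token_path="indico_api_token.txt",
    verify_ssl=False,
)
client = IndicoClient(config=config)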
140 changes: 139 additions & 1 deletion get_datasets.py
@@ -191,6 +191,7 @@ def get_dataset(
    my_config = IndicoConfig(
        host=host,
        api_token_path=api_token_path,
        verify_ssl=False,
    )
    client = IndicoClient(config=my_config)
    if labelset_id:
@@ -250,5 +251,142 @@ def get_dataset(
    pd.DataFrame.from_records(output_records).to_csv(csv_path, index=False)


def get_dataset_from_remaining(
    name,
    dataset_id,
    # index,
    labelset_id=None,
    label_col="labels",
    text_col="document",
    filename_col="file_name",
    host="app.indico.io",
    api_token_path="prod_api_token.txt",
):

    my_config = IndicoConfig(
        host=host,
        api_token_path=api_token_path,
        verify_ssl=False,
    )
    client = IndicoClient(config=my_config)
    if labelset_id:
        labelset = next(
            labelset
            for labelset in client.call(GetLabelsetName(datasetId=dataset_id))[
                "dataset"
            ]["labelsets"]
            if labelset["id"] == labelset_id
        )
        label_col = labelset["name"]

    export_path = os.path.join(name, "raw_export.csv")

    if not os.path.exists(export_path):
        raw_export = get_export(client, dataset_id, labelset_id)
        raw_export.to_csv(export_path)
    else:
        raw_export = pd.read_csv(export_path)

    records = raw_export.to_dict("records")
    output_records = []
    label_col = label_col.rsplit("_", 1)[0]

    # Resume support: only process rows whose documents are not already
    # downloaded into <name>/files.
    all_rows = list(records)
    all_filenames = [
        os.path.splitext(os.path.basename(row[filename_col]))[0] for row in all_rows
    ]
    existing_filenames = {
        f.split(".pdf")[0] for f in os.listdir(os.path.join(name, "files"))
    }
    remaining_filenames = [f for f in all_filenames if f not in existing_filenames]
    records_to_run = [
        row
        for row in all_rows
        if os.path.splitext(os.path.basename(row[filename_col]))[0]
        not in existing_filenames
    ]
    # records_to_run = [(i, row) for i, row in enumerate(records) if i > index]
    for row in tqdm.tqdm(records_to_run):

        filename = os.path.splitext(os.path.basename(row[filename_col]))[0]
        document_path = os.path.join(
            name, "files", filename + "." + row["file_name"].split(".")[-1]
        )
        try:
            page_ocrs, page_image_paths = get_ocr_by_datafile_id(
                client, row["file_id"], dataset_dir=name, filename=filename
            )

            # Try to get text from the export, but fall back to reconstructing
            # it from the per-page OCR.
            if text_col in row:
                text = row[text_col]
            else:
                text = text_from_ocr(page_ocrs)

            # The export may lack a label column, or labels may be null for a file.
            if label_col not in row or pd.isna(row[label_col]):
                labels = None
            else:
                labels = reformat_labels(row[label_col], text)

            output_record = {"ocr": json.dumps(page_ocrs), "text": text, "labels": labels}
            output_record["image_files"] = json.dumps(page_image_paths)
            output_record["document_path"] = document_path

            with open(document_path, "wb") as fp:
                fp.write(client.call(RetrieveStorageObject(row["file_url"])))

            output_records.append(output_record)
        except Exception as e:
            print(document_path, "is problematic:", e)

    csv_path = os.path.join(name, "all_labels.csv")
    logger.info("Creating CSV...")
    pd.DataFrame.from_records(output_records).to_csv(csv_path, index=False)

def get_docs_failed(name, dataset_id, host, api_token_path):
    my_config = IndicoConfig(
        host=host,
        api_token_path=api_token_path,
        verify_ssl=False,
    )
    client = IndicoClient(config=my_config)
    export_path = os.path.join(name, "raw_export.csv")

    if not os.path.exists(export_path):
        raw_export = get_export(client, dataset_id)
        raw_export.to_csv(export_path)
    else:
        raw_export = pd.read_csv(export_path)

    records = raw_export.to_dict("records")

    all_rows = list(records)
    all_row_ids = [d["file_id"] for d in all_rows]
    # A datafile whose page list is empty never finished processing.
    failed_ids = []
    for fid in tqdm.tqdm(all_row_ids):
        d = client.call(GetDatafileByID(datafileId=fid))
        if d["datafile"]["pages"] == []:
            failed_ids.append(fid)
    return [row["file_name"] for row in all_rows if row["file_id"] in failed_ids]
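# Hedged usage sketch (dataset id and host are placeholders, not from this repo):
#   failed_names = get_docs_failed(
#       "new", 646, host="indico.example.internal",
#       api_token_path="indico_api_token.txt",
#   )
#   print(failed_names)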

def copy_file(orig_dir_path, out_dir_path, file_path):
    import shutil  # local import; the module-level import block is outside this diff

    # Build the source file path and the destination directory.
    source_file = os.path.join(os.path.abspath(orig_dir_path), file_path)
    destination_folder = os.path.abspath(out_dir_path)

    # Copy the file.
    shutil.copy(source_file, destination_folder)

    print(f"{file_path} copied successfully!")

def copy_failed_docs(dataset_id, host, api_token_path):
    # NOTE: dataset name and folders are currently hardcoded ("new", old/files, failed/).
    l_failed = get_docs_failed("new", dataset_id, host, api_token_path)
    for f in l_failed:
        copy_file("old/files", "failed", f)

if __name__ == "__main__":
    # fire.Fire(get_dataset)
    # fire.Fire(get_dataset_from_remaining)
    get_dataset_from_remaining(
        "ghg_old",
        646,
        1242,
        host="indico.dv-lz.aws.ics.intcx.net",
        api_token_path="../indico_token/indico_api_token_dv_v5.txt",
    )
    # get_dataset_from_remaining("new", 10, host="dev.indico-v6.dv-lz.aws.ics.intcx.net", api_token_path="../indico_token/indico_api_token_dv_v6.txt")
    # failed_docs_names = get_docs_failed("new", 10, host="dev.indico-v6.dv-lz.aws.ics.intcx.net", api_token_path="../indico_token/indico_api_token_dv_v6.txt")
    # print(failed_docs_names[:2])
    # copy_file(orig_dir_path="old/files", out_dir_path="failed", file_path="4461_Industrial Bank of Korea_Sustainability report.pdf")
    # copy_failed_docs(10, host="dev.indico-v6.dv-lz.aws.ics.intcx.net", api_token_path="../indico_token/indico_api_token_dv_v6.txt")
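The heart of get_dataset_from_remaining is a resume-by-filename pass: diff the export's basenames against what is already on disk under <name>/files and reprocess only the gap. A minimal, self-contained sketch of that pattern — the function name and sample rows are illustrative, not part of this repo:

import os


def remaining_rows(rows, files_dir, filename_col="file_name"):
    """Return the rows whose documents are not yet downloaded into files_dir."""

    def stem(path):
        return os.path.splitext(os.path.basename(path))[0]

    # A set keeps each membership test O(1), which matters for large exports.
    existing = {stem(f) for f in os.listdir(files_dir)}
    return [row for row in rows if stem(row[filename_col]) not in existing]


# Hypothetical example:
# rows = [{"file_name": "a.pdf"}, {"file_name": "b.pdf"}]
# remaining_rows(rows, "ghg_old/files")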
4 changes: 2 additions & 2 deletions old_to_new.yaml
@@ -11,5 +11,5 @@ granularity: tokens
PARTIAL_AFFINE: True
min_keypoint_match_ratio: 0.2
debug: True
source_folder: "old-metlife"
target_folder: "new-metlife"
source_folder: "old" #"old-metlife"
target_folder: "new" #"new-metlife"
4 changes: 2 additions & 2 deletions requirements.txt
@@ -56,15 +56,15 @@ nest-asyncio==1.5.6
nltk==3.8.1
notebook==6.5.2
notebook_shim==0.2.2
-numpy==1.24.2
+numpy
opencv-jupyter-ui==1.4.2
opencv-python==4.7.0.68
packaging==23.0
pandas==1.5.3
pandocfilters==1.5.0
parso==0.8.3
pathspec==0.11.0
-pathtools==0.1.2
+pathtools
pexpect==4.8.0
pickleshare==0.7.5
Pillow==9.4.0