2 changes: 1 addition & 1 deletion apply_labels.py
@@ -122,7 +122,7 @@ def apply_labels(
print("Reading csv...")
new_df = pd.read_csv(new_export_path)
name_to_file_id = {row["file_name"]: row["file_id"] for _, row in new_df.iterrows()}
client = IndicoClient(config=IndicoConfig(host=host, api_token_path=api_token_path))
client = IndicoClient(config=IndicoConfig(host=host, api_token_path=api_token_path,verify_ssl=False))
print("Getting dataset details...")
dataset = client.call(GetDataset(id=new_dataset_id))
print("Loading revised labels...")
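The one-line change above passes verify_ssl=False to IndicoConfig, which turns off TLS certificate verification in the Indico client. That is usually what you want against a self-hosted instance fronted by a self-signed certificate, but it also removes protection against intercepted connections, so it should stay out of code that talks to public hosts. A minimal sketch of the pattern, with a placeholder host and token path:

from indico import IndicoClient, IndicoConfig

# Placeholder host and token path; verify_ssl=False skips certificate
# checks, so limit this to trusted internal deployments.
config = IndicoConfig(
    host="indico.example.internal",
    api_token_path="indico_api_token.txt",
    verify_ssl=False,
)
client = IndicoClient(config=config)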
140 changes: 139 additions & 1 deletion get_datasets.py
@@ -191,6 +191,7 @@ def get_dataset(
    my_config = IndicoConfig(
        host=host,
        api_token_path=api_token_path,
        verify_ssl=False,
    )
    client = IndicoClient(config=my_config)
    if labelset_id:
@@ -250,5 +251,142 @@ def get_dataset(
    pd.DataFrame.from_records(output_records).to_csv(csv_path, index=False)


def get_dataset_from_remaining(
    name,
    dataset_id,
    # index,
    labelset_id=None,
    label_col="labels",
    text_col="document",
    filename_col="file_name",
    host="app.indico.io",
    api_token_path="prod_api_token.txt",
):

    my_config = IndicoConfig(
        host=host,
        api_token_path=api_token_path,
        verify_ssl=False,
    )
    client = IndicoClient(config=my_config)
    if labelset_id:
        labelset = next(
            labelset
            for labelset in client.call(GetLabelsetName(datasetId=dataset_id))[
                "dataset"
            ]["labelsets"]
            if labelset["id"] == labelset_id
        )
        label_col = labelset["name"]

    export_path = os.path.join(name, "raw_export.csv")

    if not os.path.exists(export_path):
        raw_export = get_export(client, dataset_id, labelset_id)
        raw_export.to_csv(export_path)
    else:
        raw_export = pd.read_csv(export_path)

    records = raw_export.to_dict("records")
    output_records = []
    label_col = label_col.rsplit("_", 1)[0]

    # Resume support: only process rows whose documents are not already
    # downloaded into <name>/files.
    all_rows = list(records)
    all_filenames = [
        os.path.splitext(os.path.basename(row[filename_col]))[0] for row in all_rows
    ]
    existing_filenames = {
        f.split(".pdf")[0] for f in os.listdir(os.path.join(name, "files"))
    }
    remaining_filenames = [f for f in all_filenames if f not in existing_filenames]
    records_to_run = [
        row
        for row in all_rows
        if os.path.splitext(os.path.basename(row[filename_col]))[0]
        not in existing_filenames
    ]
    # records_to_run = [(i, row) for i, row in enumerate(records) if i > index]
    for row in tqdm.tqdm(records_to_run):

        filename = os.path.splitext(os.path.basename(row[filename_col]))[0]
        document_path = os.path.join(
            name, "files", filename + "." + row["file_name"].split(".")[-1]
        )
        try:
            page_ocrs, page_image_paths = get_ocr_by_datafile_id(
                client, row["file_id"], dataset_dir=name, filename=filename
            )

            # Try to get text from the export, but fall back to reconstructing
            # it from the per-page OCR.
            if text_col in row:
                text = row[text_col]
            else:
                text = text_from_ocr(page_ocrs)

            # The export may lack a label column, or labels may be null for a file.
            if label_col not in row or pd.isna(row[label_col]):
                labels = None
            else:
                labels = reformat_labels(row[label_col], text)

            output_record = {"ocr": json.dumps(page_ocrs), "text": text, "labels": labels}
            output_record["image_files"] = json.dumps(page_image_paths)
            output_record["document_path"] = document_path

            with open(document_path, "wb") as fp:
                fp.write(client.call(RetrieveStorageObject(row["file_url"])))

            output_records.append(output_record)
        except Exception as e:
            print(document_path, "is problematic:", e)

    csv_path = os.path.join(name, "all_labels.csv")
    logger.info("Creating CSV...")
    pd.DataFrame.from_records(output_records).to_csv(csv_path, index=False)

def get_docs_failed(name, dataset_id, host, api_token_path):
    my_config = IndicoConfig(
        host=host,
        api_token_path=api_token_path,
        verify_ssl=False,
    )
    client = IndicoClient(config=my_config)
    export_path = os.path.join(name, "raw_export.csv")

    if not os.path.exists(export_path):
        raw_export = get_export(client, dataset_id)
        raw_export.to_csv(export_path)
    else:
        raw_export = pd.read_csv(export_path)

    records = raw_export.to_dict("records")

    all_rows = list(records)
    all_row_ids = [d["file_id"] for d in all_rows]
    # A datafile whose page list is empty never finished processing.
    failed_ids = []
    for fid in tqdm.tqdm(all_row_ids):
        d = client.call(GetDatafileByID(datafileId=fid))
        if d["datafile"]["pages"] == []:
            failed_ids.append(fid)
    return [row["file_name"] for row in all_rows if row["file_id"] in failed_ids]
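# Hedged usage sketch (dataset id and host are placeholders, not from this repo):
#   failed_names = get_docs_failed(
#       "new", 646, host="indico.example.internal",
#       api_token_path="indico_api_token.txt",
#   )
#   print(failed_names)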

def copy_file(orig_dir_path, out_dir_path, file_path):
    import shutil  # local import; the module-level import block is outside this diff

    # Build the source file path and the destination directory.
    source_file = os.path.join(os.path.abspath(orig_dir_path), file_path)
    destination_folder = os.path.abspath(out_dir_path)

    # Copy the file.
    shutil.copy(source_file, destination_folder)

    print(f"{file_path} copied successfully!")

def copy_failed_docs(dataset_id, host, api_token_path):
    # NOTE: dataset name and folders are currently hardcoded ("new", old/files, failed/).
    l_failed = get_docs_failed("new", dataset_id, host, api_token_path)
    for f in l_failed:
        copy_file("old/files", "failed", f)

if __name__ == "__main__":
    # fire.Fire(get_dataset)
    # fire.Fire(get_dataset_from_remaining)
    get_dataset_from_remaining(
        "ghg_old",
        646,
        1242,
        host="indico.dv-lz.aws.ics.intcx.net",
        api_token_path="../indico_token/indico_api_token_dv_v5.txt",
    )
    # get_dataset_from_remaining("new", 10, host="dev.indico-v6.dv-lz.aws.ics.intcx.net", api_token_path="../indico_token/indico_api_token_dv_v6.txt")
    # failed_docs_names = get_docs_failed("new", 10, host="dev.indico-v6.dv-lz.aws.ics.intcx.net", api_token_path="../indico_token/indico_api_token_dv_v6.txt")
    # print(failed_docs_names[:2])
    # copy_file(orig_dir_path="old/files", out_dir_path="failed", file_path="4461_Industrial Bank of Korea_Sustainability report.pdf")
    # copy_failed_docs(10, host="dev.indico-v6.dv-lz.aws.ics.intcx.net", api_token_path="../indico_token/indico_api_token_dv_v6.txt")
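The heart of get_dataset_from_remaining is a resume-by-filename pass: diff the export's basenames against what is already on disk under <name>/files and reprocess only the gap. A minimal, self-contained sketch of that pattern — the function name and sample rows are illustrative, not part of this repo:

import os


def remaining_rows(rows, files_dir, filename_col="file_name"):
    """Return the rows whose documents are not yet downloaded into files_dir."""

    def stem(path):
        return os.path.splitext(os.path.basename(path))[0]

    # A set keeps each membership test O(1), which matters for large exports.
    existing = {stem(f) for f in os.listdir(files_dir)}
    return [row for row in rows if stem(row[filename_col]) not in existing]


# Hypothetical example:
# rows = [{"file_name": "a.pdf"}, {"file_name": "b.pdf"}]
# remaining_rows(rows, "ghg_old/files")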
4 changes: 2 additions & 2 deletions old_to_new.yaml
@@ -11,5 +11,5 @@ granularity: tokens
PARTIAL_AFFINE: True
min_keypoint_match_ratio: 0.2
debug: True
source_folder: "old-metlife"
target_folder: "new-metlife"
source_folder: "old" #"old-metlife"
target_folder: "new" #"new-metlife"
4 changes: 2 additions & 2 deletions requirements.txt
@@ -56,15 +56,15 @@ nest-asyncio==1.5.6
nltk==3.8.1
notebook==6.5.2
notebook_shim==0.2.2
-numpy==1.24.2
+numpy
opencv-jupyter-ui==1.4.2
opencv-python==4.7.0.68
packaging==23.0
pandas==1.5.3
pandocfilters==1.5.0
parso==0.8.3
pathspec==0.11.0
-pathtools==0.1.2
+pathtools
pexpect==4.8.0
pickleshare==0.7.5
Pillow==9.4.0