Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion src/damast/core/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,5 @@

# NOTE(review): imported next to its only visible use because the module's
# import block is outside this view -- hoist to the top of the file if `Any`
# is not already imported there.
from typing import Any

# Default keyword arguments for csv import.
# NOTE(review): the keys match keyword arguments of polars' csv readers
# (scan_csv/read_csv) -- presumably this dict is forwarded there; confirm
# against the call site.
#
# 'quote_char' is deliberately NOT overridden: setting it to None disables
# quote handling, so a quoted field such as "a,b;c" would be split at the
# embedded separator instead of being read as one value.
DAMAST_CSV_DEFAULT_ARGS: dict[str, Any] = {
    # Literal strings that are interpreted as missing values
    'null_values': ["None", "none", "Null", "null"],
    # None: do not limit the number of rows used for schema inference
    'infer_schema_length': None,
}
12 changes: 7 additions & 5 deletions src/damast/core/metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
import re
import traceback
import warnings
from difflib import SequenceMatcher
from enum import Enum
from pathlib import Path
from typing import Any, Dict, List, Optional, Union
Expand Down Expand Up @@ -1207,7 +1208,7 @@ def get_fulfillment(self, expected_specs: List[DataSpecification]) -> Fulfillmen
return md_fulfillment

@classmethod
def search(cls, files: list[str | Path]) -> tuple[MetaData | None, str | None]:
def search(cls, files: list[str | Path]) -> tuple[MetaData | None, str | None, dict[str, float]]:
"""
Search for the metadata specfile for a given list of files
"""
Expand All @@ -1216,18 +1217,19 @@ def search(cls, files: list[str | Path]) -> tuple[MetaData | None, str | None]:
commonpath = os.path.commonpath(files)
except Exception as e:
logger.debug(e)
return None, None, None
return None, None, {}

if len(files) == 1:
commonpath = Path(commonpath).parent

commonprefix = os.path.commonprefix([Path(x).stem for x in files])
metadata_file_candidates = [x for x in Path(commonpath).glob(f"{commonprefix}*{DAMAST_SPEC_SUFFIX}")]
for f in metadata_file_candidates:
metadata_file_candidates = { x: SequenceMatcher(None, commonprefix, str(x.stem)).ratio() for x in Path(commonpath).glob(f"{commonprefix}*{DAMAST_SPEC_SUFFIX}") }
for f in dict(sorted(metadata_file_candidates.items(), key=lambda x: x[1], reverse=True)):
try:
Path(f).stem == commonprefix
return MetaData.load_yaml(filename=f), f, metadata_file_candidates
except Exception as e:
logger.debug(f"Loading {f} as metadata file failed -- {e}")
logger.warning(f"Loading {f} as metadata file failed -- {e}")

return None, None, metadata_file_candidates

Expand Down
17 changes: 17 additions & 0 deletions tests/damast/core/test_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -177,6 +177,23 @@ def test_annotated_dataframe_import_csv(data_path):
name="height", abbreviation="height", category=DataCategory.STATIC,
unit=units.m, value_range=MinMax(min=0, max=40), representation_type=int)

def test_annotated_dataframe_import_csv_with_quotes(data_path):
    """
    Test the annotated dataframe import for a csv file with quoted fields

    The 'name' column of the test file holds double-quoted strings that embed
    ',' and ';' -- each of them has to be imported as a single value.
    """
    quoted_csv = data_path / "test_dataframe_with_quotes.csv"
    annotated = AnnotatedDataFrame.from_file(quoted_csv)

    # Only the two header columns must exist, with the expected dtypes
    assert annotated.column_names == ["id", "name"]
    for column, expected_dtype in (('id', polars.Int64), ('name', polars.String)):
        assert annotated.dtype(column) == expected_dtype

    # The imported frame has to match a plain polars scan of the same file
    reference = polars.scan_csv(quoted_csv, null_values=["None", "none"])
    assert XDataFrame(annotated._dataframe).equals(XDataFrame(reference))

    # Quoted values keep their embedded separators
    collected = annotated.dataframe.collect()
    for row, expected_name in enumerate(["a,b;c", "d;e,f"]):
        assert collected[row, 1] == expected_name


def test_set_dtype(data_path):
"""
Test if conversion from int -> str in representation_type is consistent
Expand Down
3 changes: 3 additions & 0 deletions tests/damast/data/test_dataframe_with_quotes.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
id,name
0,"a,b;c"
1,"d;e,f"
19 changes: 19 additions & 0 deletions tests/damast/data/test_dataframe_with_quotes.spec.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
columns:
- name: id
is_optional: false
abbreviation: id
category: static
value_range:
MinMax:
min: 0
max: 1
allow_missing: true
representation_type: int
- name: name
is_optional: false
abbreviation: name
category: static
representation_type: str
annotations:
license: MIT License
comment: test dataframe
Loading