
Commit d7c2eb4

Added some load functions
1 parent 8501b42 commit d7c2eb4

8 files changed

Lines changed: 121 additions & 75 deletions


src/dsff/VERSION.txt

Lines changed: 1 addition & 1 deletion
@@ -1 +1 @@
-1.2.2
+1.2.3

src/dsff/formats/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -23,7 +23,7 @@
 
 __all__ = ["DSFF"]
 for k in list(globals().keys()):
-    if k.startswith("is_"):
+    if k.startswith("is_") or k.startswith("load_"):
         __all__.append(k)
 
 _FORMAT_TEXT_ALIAS = {'arff': "ARFF", 'csv': "CSV", 'db': "SQL", 'orc': "ORC"}
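
This loop re-exports every is_* checker, and now every load_* loader, from the formats package. A minimal consumer-side sketch (the import path is an assumption based on the package layout):

    # Hypothetical usage; assumes dsff.formats re-exports these names via __all__.
    from dsff.formats import is_csv, load_csv

    if is_csv("dataset.csv"):                      # file name is illustrative
        print(load_csv("dataset.csv")['features'])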

src/dsff/formats/arff.py

Lines changed: 21 additions & 13 deletions
@@ -2,20 +2,20 @@
 from .__common__ import *
 
 
-__all__ = ["from_arff", "is_arff", "to_arff"]
+__all__ = ["from_arff", "is_arff", "load_arff", "to_arff"]
 
 
-def _parse(text_or_fh, target=TARGET_NAME, missing=MISSING_TOKEN):
+def _parse(text_or_fh, target=TARGET_NAME, missing=MISSING_TOKEN, **kw):
     d, features, metadata, title = [], {}, {}, ""
     relation, attributes, data = False, [False, False], False
     for n, l in enumerate(t.splitlines() if isinstance(t := text_or_fh, str) else t):
         l, pf = l.strip(), f"Line {n}: "
         # the file shall start with "@RELATION"
         if not relation:
-            if l.startswith("@RELATION "):
+            if l.upper().startswith("@RELATION "):
                 relation = True
                 try:
-                    title = re.match(r"@RELATION\s+('[^']*'|\"[^\"]*\")$", l).group(1).strip("'\"")
+                    title = re.match(r"@RELATION\s+('[^']*'|\"[^\"]*\")$", l, re.I).group(1).strip("'\"")
                     continue
                 except Exception as e:
                     raise BadInputData(f"{pf}failed on @RELATION ({e})")
@@ -34,9 +34,8 @@ def _parse(text_or_fh, target=TARGET_NAME, missing=MISSING_TOKEN):
         if attributes[0] and not attributes[1]:
             # close the attributes block
             attributes[1] = True
-            n_cols = len(d[0])
             continue
-        if l.startswith("@ATTRIBUTE "):
+        if l.upper().startswith("@ATTRIBUTE "):
             if not attributes[0]:
                 attributes[0] = True
                 if len(d) == 0:
@@ -45,21 +44,23 @@ def _parse(text_or_fh, target=TARGET_NAME, missing=MISSING_TOKEN):
             if attributes[1]:
                 raise BadInputData(f"{pf}found @ATTRIBUTE out of the attributes block)")
             try:
-                header = re.match(r"@ATTRIBUTE\s+([^\s]+)\s+[A-Z]+$", l).group(1)
+                header = re.match(r"@ATTRIBUTE\s+([^\s]+)\s+(?:[a-zA-Z]+|\{.*?\})$", l, re.I).group(1).strip("'\"")
                 if header == "class":
                     header = target
+                else:
+                    features.setdefault(header, "")
                 d[0].append(header)
                 continue
             except AttributeError:
                 raise BadInputData(f"{pf}failed on @ATTRIBUTE (bad type)")
         if not data:
-            if l == "@DATA":
+            if l.upper() == "@DATA":
                 data = True
+                n_cols = len(d[0])
                 continue
             else:
                 raise BadInputData(f"{pf}did not find @DATA where expected")
-        row = list(map(lambda x: x.strip("'\""), re.split(r",\s+", l)))
-        if len(row) != n_cols:
+        if len(row := list(map(lambda x: x.strip("'\""), re.split(r",\s*", l)))) != n_cols:
             raise BadInputData(f"{pf}this row does not match the number of columns")
         d.append(row)
     for i in range(n_cols):
@@ -78,7 +79,7 @@ def _parse(text_or_fh, target=TARGET_NAME, missing=MISSING_TOKEN):
     return d, features, metadata, title
 
 
-def from_arff(dsff, path=None, target=TARGET_NAME, missing=MISSING_TOKEN):
+def from_arff(dsff, path=None, target=TARGET_NAME, missing=MISSING_TOKEN, **kw):
     """ Populate the DSFF file from an ARFF file. """
     with open(path) as f:
         d, ft, md, t = _parse(f, target, missing)
@@ -87,7 +88,7 @@ def from_arff(dsff, path=None, target=TARGET_NAME, missing=MISSING_TOKEN):
 
 
 @text_or_path
-def is_arff(text):
+def is_arff(text, target=TARGET_NAME, missing=MISSING_TOKEN, **kw):
     """ Check if the input text or path is a valid ARFF. """
     try:
         _parse(ensure_str(text))
@@ -96,7 +97,14 @@ def is_arff(text):
     return False
 
 
-def to_arff(dsff, path=None, target=TARGET_NAME, exclude=DEFAULT_EXCL, missing=MISSING_TOKEN, text=False):
+def load_arff(path, target=TARGET_NAME, missing=MISSING_TOKEN, **kw):
+    """ Load an ARFF file as a dictionary with data, features and metadata. """
+    with open(path) as f:
+        d, ft, md, _ = _parse(f, target, missing)
+    return {'data': d, 'features': ft, 'metadata': md}
+
+
+def to_arff(dsff, path=None, target=TARGET_NAME, exclude=DEFAULT_EXCL, missing=MISSING_TOKEN, text=False, **kw):
     """ Output the dataset in ARFF format, suitable for use with the Weka framework, saved as a file or output as a
         string. """
     name = splitext(basename(path))[0]
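
Net effect of the changes above: keyword matching is now case-insensitive (@relation, @attribute, @data all work), nominal attribute types such as {a,b,c} are accepted, quoted attribute names are unquoted, rows may be separated by commas without spaces, and the parsing logic becomes reusable on its own through load_arff. A minimal usage sketch (file name and import path are assumptions):

    # Hypothetical standalone use of the new ARFF loader.
    from dsff.formats import load_arff   # import path assumed

    ds = load_arff("dataset.arff")       # {'data': [...], 'features': {...}, 'metadata': {...}}
    headers, rows = ds['data'][0], ds['data'][1:]
    print(headers, len(rows), ds['features'])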

src/dsff/formats/csv.py

Lines changed: 14 additions & 13 deletions
@@ -2,24 +2,16 @@
 from .__common__ import *
 
 
-__all__ = ["from_csv", "is_csv", "to_csv"]
+__all__ = ["from_csv", "is_csv", "load_csv", "to_csv"]
 
 
-def from_csv(dsff, path=None, exclude=DEFAULT_EXCL):
+def from_csv(dsff, path=None, exclude=DEFAULT_EXCL, **kw):
     """ Populate the DSFF file from a CSV file. """
-    dsff.write(path)
-    features = {}
-    for headers in dsff['data'].rows:
-        for header in headers:
-            if header.value in exclude:
-                continue
-            features[header.value] = ""
-        break
-    dsff.write(features=features)
+    dsff.write(**load_csv(path))
 
 
 @text_or_path
-def is_csv(text):
+def is_csv(text, **kw):
     """ Check if the input text or path is a valid CSV. """
     try:
         dialect = csvmod.Sniffer().sniff(text := ensure_str(text))
@@ -29,7 +21,16 @@ def is_csv(text):
     return False
 
 
-def to_csv(dsff, path=None, text=False):
+def load_csv(path, exclude=DEFAULT_EXCL, **kw):
+    """ Load a CSV file as a dictionary with data, features and metadata. """
+    data = {'metadata': {}}
+    with open(expanduser(path)) as f:
+        data['data'] = [r for r in csvmod.reader(f, delimiter=CSV_DELIMITER)]
+    data['features'] = {h: "" for h in data['data'][0] if h not in exclude}
+    return data
+
+
+def to_csv(dsff, path=None, text=False, **kw):
     """ Create a CSV from the data worksheet, saved as a file or output as a string. """
     with (StringIO() if text else open(path, 'w+')) as f:
         writer = csvmod.writer(f, delimiter=";")
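
from_csv now delegates to load_csv, so a single code path reads the file with csvmod.reader and derives the feature map from the header row. A minimal usage sketch (file name and import path are assumptions; CSV_DELIMITER comes from __common__):

    # Hypothetical standalone use of the new CSV loader.
    from dsff.formats import load_csv   # import path assumed

    ds = load_csv("dataset.csv")
    assert set(ds) == {"data", "features", "metadata"}
    print(ds['data'][0])                # header row, as parsed by csvmod.reader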

src/dsff/formats/dataset.py

Lines changed: 22 additions & 8 deletions
@@ -1,25 +1,25 @@
 # -*- coding: UTF-8 -*-
 from .__common__ import *
+from .csv import load_csv
 
 
-__all__ = ["from_dataset", "is_dataset", "to_dataset"]
+__all__ = ["from_dataset", "is_dataset", "load_dataset", "to_dataset"]
 
 
 def _parse(path):
-    if not isdir(path):
+    if not isdir(expanduser(path)):
         raise BadInputData("Not a folder")
-    else:
-        if len(missing := [f for f in ["data.csv", "features.json", "metadata.json"] if not isfile(join(path, f))]) > 0:
-            raise BadInputData(f"Not a valid dataset folder (missing: {', '.join(missing)})")
+    if len(missing := [f for f in ["data.csv", "features.json", "metadata.json"] if not isfile(join(path, f))]) > 0:
+        raise BadInputData(f"Not a valid dataset folder (missing: {', '.join(missing)})")
 
 
-def from_dataset(dsff, path=None):
+def from_dataset(dsff, path=None, **kw):
     """ Populate the DSFF file from a Dataset structure. """
     _parse(path)
     dsff.write(path)
 
 
-def is_dataset(path):
+def is_dataset(path, **kw):
     """ Check if the input path is a valid Dataset. """
     try:
         _parse(path)
@@ -28,7 +28,21 @@ def is_dataset(path):
     return False
 
 
-def to_dataset(dsff, path=None):
+def load_dataset(path, **kw):
+    """ Load a dataset folder as a dictionary with data, features and metadata. """
+    if not isdir(d := expanduser(str(path))):
+        raise BadInputData("Not a folder")
+    dp, fp, mp = join(d, "data.csv"), join(d, "features.json"), join(d, "metadata.json")
+    data = {}
+    data['data'] = load_csv(dp)['data']
+    with open(fp) as f:
+        data['features'] = json.load(f)
+    with open(mp) as f:
+        data['metadata'] = json.load(f)
+    return data
+
+
+def to_dataset(dsff, path=None, **kw):
     """ Create a dataset folder according to the following structure ;
         name
          +-- data.csv
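
load_dataset reuses load_csv for data.csv and json.load for the two JSON files, raising BadInputData when the path is not a folder. A minimal usage sketch (folder name, import paths and the BadInputData location are assumptions):

    # Hypothetical standalone use of the new dataset-folder loader.
    from dsff.formats import load_dataset             # import path assumed
    from dsff.formats.__common__ import BadInputData  # location assumed

    try:
        ds = load_dataset("~/datasets/test")          # expanduser() is applied
        print(sorted(ds['features']))
    except BadInputData as e:
        print(f"not a dataset folder: {e}")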

src/dsff/formats/db.py

Lines changed: 32 additions & 25 deletions
@@ -2,36 +2,16 @@
 from .__common__ import *
 
 
-__all__ = ["from_db", "is_db", "to_db"]
+__all__ = ["from_db", "is_db", "load_db", "to_db"]
 
 
-def from_db(dsff, path=None, exclude=DEFAULT_EXCL):
+def from_db(dsff, path=None, **kw):
     """ Populate the DSFF file from a SQLDB file. """
-    from json import loads
-    from sqlite3 import connect
-    conn = connect(path)
-    cursor = conn.cursor()
-    # list tables
-    cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
-    tables = [table[0] for table in cursor.fetchall()]
-    if not all(t in tables for t in ["data", "features", "metadata"]):  # pragma: no cover
-        raise BadInputData("The target SQLDB does not have the right format")
-    # import data
-    cursor.execute("PRAGMA table_info('data')")
-    headers = [[col[1] for col in cursor.fetchall()]]
-    cursor.execute("SELECT * FROM data;")
-    dsff.write(headers + [r for r in cursor.fetchall()])
-    # import feature definitions
-    cursor.execute("SELECT name,description FROM features;")
-    dsff.write(features={r[0]: r[1] for r in cursor.fetchall()})
-    # import metadata
-    cursor.execute("SELECT key,value FROM metadata;")
-    dsff.write(metadata={r[0]: loads(r[1]) if isinstance(r[1], str) else r[1] for r in cursor.fetchall()})
-    conn.close()
+    dsff.write(**load_db(path))
 
 
 @text_or_path
-def is_db(data):
+def is_db(data, **kw):
     """ Check if the input data or path is a valid SQL database. """
     from sqlite3 import connect, Error
     from sys import version_info
@@ -60,7 +40,34 @@ def is_db(data):
     return False
 
 
-def to_db(dsff, path=None, text=False, primary_index=0):
+def load_db(path, **kw):
+    """ Load a SQLDB file as a dictionary with data, features and metadata. """
+    from json import loads
+    from os.path import basename, splitext
+    from sqlite3 import connect
+    conn = connect(path)
+    cursor, data = conn.cursor(), {}
+    # list tables
+    cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
+    tables = [table[0] for table in cursor.fetchall()]
+    if not all(t in tables for t in ["data", "features", "metadata"]):  # pragma: no cover
+        raise BadInputData("The target SQLDB does not have the right format")
+    # import data
+    cursor.execute("PRAGMA table_info('data')")
+    headers = [[col[1] for col in cursor.fetchall()]]
+    cursor.execute("SELECT * FROM data;")
+    data['data'] = headers + [r for r in cursor.fetchall()]
+    # import feature definitions
+    cursor.execute("SELECT name,description FROM features;")
+    data['features'] = {r[0]: r[1] for r in cursor.fetchall()}
+    # import metadata
+    cursor.execute("SELECT key,value FROM metadata;")
+    data['metadata'] = {r[0]: loads(r[1]) if isinstance(r[1], str) else r[1] for r in cursor.fetchall()}
+    conn.close()
+    return data
+
+
+def to_db(dsff, path=None, text=False, primary_index=0, **kw):
     """ Create a SQLDB from the data worksheet, saved as a file or output as a string. """
     from json import dumps
     from sqlite3 import connect
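
The SQLite import logic moved wholesale from from_db into load_db, which returns the same three-key dictionary, so from_db reduces to dsff.write(**load_db(path)). A minimal usage sketch (file name and import path are assumptions):

    # Hypothetical standalone use of the new SQLite loader.
    from dsff.formats import load_db   # import path assumed

    ds = load_db("dataset.db")         # expects tables: data, features, metadata
    print(ds['data'][0])               # column names from PRAGMA table_info('data')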

src/dsff/formats/pa.py

Lines changed: 20 additions & 10 deletions
@@ -9,28 +9,38 @@ def _nowrite(m):
     raise NotImplementedError(f"none of {m}.write_table and {m}.write_{m} is implemented")
 
 
+def _parse(ds):
+    return {
+        'data': [ds.schema.names] + [list(r.values()) for r in ds.to_pylist()],
+        'features': {k.decode(): v.decode() for k, v in ds.schema.metadata.items()},
+        'metadata': literal_eval(ds.schema.metadata.pop(b'__metadata__', b"{}").decode()),
+    }
+
+
 for module in ["feather", "orc", "parquet"]:
-    __all__ += [f"from_{module}", f"is_{module}", f"to_{module}"]
+    __all__ += [f"from_{module}", f"is_{module}", f"load_{module}", f"to_{module}"]
     def gen_func(m):
-        def from_(dsff, path=None, exclude=DEFAULT_EXCL):
-            dataset = globals()[m].read_table(path)
-            dsff.write(data=[dataset.schema.names] + [list(r.values()) for r in dataset.to_pylist()],
-                       metadata=literal_eval(dataset.schema.metadata.pop(b'__metadata__', b"{}").decode()),
-                       features={k.decode(): v.decode() for k, v in dataset.schema.metadata.items()})
+        def from_(dsff, path=None, **kw):
+            from os.path import basename, splitext
+            dsff.write(**_parse(globals()[m].read_table(path)))
         from_.__name__ = f"from_{m}"
-        def is_(data):
+        def is_(data, **kw):
             try:
                 globals()[m].read_table(pyarrow.BufferReader(data))
                 return True
             except Exception:
                 return False
         is_.__name__ = f"is_{m}"
-        def to_(dsff, path=None, text=False):
+        def load_(path, **kw):
+            return _parse(globals()[m].read_table(path))
+        load_.__name__ = f"load_{m}"
+        def to_(dsff, path=None, text=False, **kw):
             with (BytesIO() if text else open(path, 'wb+')) as f:
                 getattr(globals()[m], "write_table", getattr(globals()[m], f"write_{m}", _nowrite))(dsff._to_table(), f)
             if text:
                 return f.getvalue()
         to_.__name__ = f"to_{m}"
-        return from_, text_or_path(is_), to_
-    globals()[f'from_{module}'], globals()[f'is_{module}'], globals()[f'to_{module}'] = gen_func(module)
+        return from_, text_or_path(is_), load_, to_
+    globals()[f'from_{module}'], globals()[f'is_{module}'], globals()[f'load_{module}'], globals()[f'to_{module}'] = \
+        gen_func(module)
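
Because gen_func now also emits a load_ closure around the shared _parse helper, each pyarrow-backed format (feather, orc, parquet) gains a loader for free. A minimal usage sketch (file name and import path are assumptions):

    # Hypothetical use of a generated pyarrow-backed loader.
    from dsff.formats import load_parquet   # load_feather and load_orc work alike

    ds = load_parquet("dataset.parquet")
    print(ds['data'][0], ds['metadata'])    # schema names, then decoded __metadata__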

tests/test_dsff.py

Lines changed: 10 additions & 4 deletions
@@ -89,6 +89,7 @@ def test_conversion_arff(self):
         with open(arff := f"{TEST_BASENAME}.arff", 'w') as f:
             f.write(TEST_ARFF)
         self.assertTrue(is_arff(arff))
+        self.assertIsInstance(load_arff(arff), dict)
         with DSFF() as f:
             f.from_arff(TEST_BASENAME)
         # test for multiple error scenarios
@@ -130,7 +131,8 @@ def test_conversion_csv(self):
         with DSFF(TEST) as f:
             self.assertIsNotNone(f.to_csv(text=True))
             self.assertIsNone(f.to_csv())
-        self.assertTrue(is_csv(f"{TEST}.csv"))
+        self.assertTrue(is_csv(csv := f"{TEST_BASENAME}.csv"))
+        self.assertIsInstance(load_csv(csv), dict)
         # CSV to DSFF
         with DSFF() as f:
             f.from_csv(TEST_BASENAME)
@@ -143,7 +145,9 @@ def test_conversion_dataset(self):
         # FilelessDataset to DSFF
         with DSFF() as f:
             f.from_dataset(TEST_BASENAME)
-        self.assertTrue(is_dataset(f"{TEST_BASENAME}"))
+        self.assertTrue(is_dataset(TEST_BASENAME))
+        self.assertRaises(BadInputData, load_dataset, 0)
+        self.assertIsInstance(load_dataset(TEST_BASENAME), dict)
         # FilelessDataset to DSFF (bad input dataset)
         os.remove(os.path.join(TEST_BASENAME, "metadata.json"))
         with DSFF() as f:
@@ -167,7 +171,8 @@ def test_conversion_db(self):
         with DSFF(TEST) as f:
            self.assertIsNotNone(f.to_db(text=True))
            self.assertIsNone(f.to_db())
-        self.assertTrue(is_db(f"{TEST_BASENAME}.db"))
+        self.assertTrue(is_db(db := f"{TEST_BASENAME}.db"))
+        self.assertIsInstance(load_db(db), dict)
         # SQL database to DSFF
         with DSFF() as f:
             f.from_db(TEST_BASENAME)
@@ -183,8 +188,9 @@ def test_conversion_pyarrow_formats(self):
             self.assertIsNone(getattr(f, f"to_{fmt}")(TEST_BASENAME))
             self.assertIsNotNone(getattr(f, f"to_{fmt}")(text=True))
             is_ = globals()[f'is_{fmt}']
-            self.assertTrue(is_(f"{TEST_BASENAME}.{fmt}"))
+            self.assertTrue(is_(fn := f"{TEST_BASENAME}.{fmt}"))
             self.assertFalse(is_(b"PK\x03\x04\x14\x00\x00\x00\x08\x00P\xb3T\\F\xc7MH"))
+            self.assertIsInstance(globals()[f'load_{fmt}'](fn), dict)
             with DSFF(INMEMORY) as f:
                 self.assertIsNone(getattr(f, f"from_{fmt}")(TEST_BASENAME))
             f.to_dataset(path=TEST_BASENAME)
