22from .__common__ import *
33
44
5- __all__ = ["from_arff" , "is_arff" , "to_arff" ]
5+ __all__ = ["from_arff" , "is_arff" , "load_arff" , " to_arff" ]
66
77
8- def _parse (text_or_fh , target = TARGET_NAME , missing = MISSING_TOKEN ):
8+ def _parse (text_or_fh , target = TARGET_NAME , missing = MISSING_TOKEN , ** kw ):
99 d , features , metadata , title = [], {}, {}, ""
1010 relation , attributes , data = False , [False , False ], False
1111 for n , l in enumerate (t .splitlines () if isinstance (t := text_or_fh , str ) else t ):
1212 l , pf = l .strip (), f"Line { n } : "
1313 # the file shall start with "@RELATION"
1414 if not relation :
15- if l .startswith ("@RELATION " ):
15+ if l .upper (). startswith ("@RELATION " ):
1616 relation = True
1717 try :
18- title = re .match (r"@RELATION\s+('[^']*'|\"[^\"]*\")$" , l ).group (1 ).strip ("'\" " )
18+ title = re .match (r"@RELATION\s+('[^']*'|\"[^\"]*\")$" , l , re . I ).group (1 ).strip ("'\" " )
1919 continue
2020 except Exception as e :
2121 raise BadInputData (f"{ pf } failed on @RELATION ({ e } )" )
@@ -34,9 +34,8 @@ def _parse(text_or_fh, target=TARGET_NAME, missing=MISSING_TOKEN):
3434 if attributes [0 ] and not attributes [1 ]:
3535 # close the attributes block
3636 attributes [1 ] = True
37- n_cols = len (d [0 ])
3837 continue
39- if l .startswith ("@ATTRIBUTE " ):
38+ if l .upper (). startswith ("@ATTRIBUTE " ):
4039 if not attributes [0 ]:
4140 attributes [0 ] = True
4241 if len (d ) == 0 :
@@ -45,21 +44,23 @@ def _parse(text_or_fh, target=TARGET_NAME, missing=MISSING_TOKEN):
4544 if attributes [1 ]:
4645 raise BadInputData (f"{ pf } found @ATTRIBUTE out of the attributes block)" )
4746 try :
48- header = re .match (r"@ATTRIBUTE\s+([^\s]+)\s+[A- Z]+$" , l ).group (1 )
47+ header = re .match (r"@ATTRIBUTE\s+([^\s]+)\s+(?:[a-zA- Z]+|\{.*?\}) $" , l , re . I ).group (1 ). strip ( "' \" " )
4948 if header == "class" :
5049 header = target
50+ else :
51+ features .setdefault (header , "" )
5152 d [0 ].append (header )
5253 continue
5354 except AttributeError :
5455 raise BadInputData (f"{ pf } failed on @ATTRIBUTE (bad type)" )
5556 if not data :
56- if l == "@DATA" :
57+ if l . upper () == "@DATA" :
5758 data = True
59+ n_cols = len (d [0 ])
5860 continue
5961 else :
6062 raise BadInputData (f"{ pf } did not find @DATA where expected" )
61- row = list (map (lambda x : x .strip ("'\" " ), re .split (r",\s+" , l )))
62- if len (row ) != n_cols :
63+ if len (row := list (map (lambda x : x .strip ("'\" " ), re .split (r",\s*" , l )))) != n_cols :
6364 raise BadInputData (f"{ pf } this row does not match the number of columns" )
6465 d .append (row )
6566 for i in range (n_cols ):
@@ -78,7 +79,7 @@ def _parse(text_or_fh, target=TARGET_NAME, missing=MISSING_TOKEN):
7879 return d , features , metadata , title
7980
8081
81- def from_arff (dsff , path = None , target = TARGET_NAME , missing = MISSING_TOKEN ):
82+ def from_arff (dsff , path = None , target = TARGET_NAME , missing = MISSING_TOKEN , ** kw ):
8283 """ Populate the DSFF file from an ARFF file. """
8384 with open (path ) as f :
8485 d , ft , md , t = _parse (f , target , missing )
@@ -87,7 +88,7 @@ def from_arff(dsff, path=None, target=TARGET_NAME, missing=MISSING_TOKEN):
8788
8889
8990@text_or_path
90- def is_arff (text ):
91+ def is_arff (text , target = TARGET_NAME , missing = MISSING_TOKEN , ** kw ):
9192 """ Check if the input text or path is a valid ARFF. """
9293 try :
9394 _parse (ensure_str (text ))
@@ -96,7 +97,14 @@ def is_arff(text):
9697 return False
9798
9899
99- def to_arff (dsff , path = None , target = TARGET_NAME , exclude = DEFAULT_EXCL , missing = MISSING_TOKEN , text = False ):
def load_arff(path, target=TARGET_NAME, missing=MISSING_TOKEN, **kw):
    """ Read the ARFF file located at the given path and return its parsed content as a dictionary.

    The open file handle is passed to _parse together with the target and missing tokens. Of the four values
    returned by the parser, the data, features and metadata are kept under the 'data', 'features' and 'metadata'
    keys; the relation title is discarded. Extra keyword arguments are accepted but not used by this function.
    """
    with open(path) as handle:
        rows, features, metadata, _title = _parse(handle, target, missing)
    return dict(data=rows, features=features, metadata=metadata)
105+
106+
107+ def to_arff (dsff , path = None , target = TARGET_NAME , exclude = DEFAULT_EXCL , missing = MISSING_TOKEN , text = False , ** kw ):
100108 """ Output the dataset in ARFF format, suitable for use with the Weka framework, saved as a file or output as a
101109 string. """
102110 name = splitext (basename (path ))[0 ]
0 commit comments