openml · LennartPurucker · Jun 17, 2025 · Oct 16, 2024 · Oct 16, 2024 · Oct 17, 2024
diff --git a/README.md b/README.md
@@ -5,7 +5,7 @@
 <div id="user-content-toc">
   <ul align="center" style="list-style: none;">
     <summary>
-      <img src="https://github.com/openml/openml.org/blob/master/app/public/static/svg/logo.svg" width="50" alt="OpenML Logo"/> 
+      <img src="https://github.com/openml/openml.org/blob/master/app/public/static/svg/logo.svg" width="50" alt="OpenML Logo"/>
       <h1>OpenML-Python</h1>
       <img src="https://github.com/openml/docs/blob/master/docs/img/python.png" width="50" alt="Python Logo"/>
     </summary>

diff --git a/examples/20_basic/simple_datasets_tutorial.py b/examples/20_basic/simple_datasets_tutorial.py
@@ -19,7 +19,7 @@
 # List datasets
 # =============
 
-datasets_df = openml.datasets.list_datasets(output_format="dataframe")
+datasets_df = openml.datasets.list_datasets()
 print(datasets_df.head(n=10))
 
 ############################################################################
@@ -48,7 +48,7 @@
 # attribute_names - the names of the features for the examples (X) and
 # target feature (y)
 X, y, categorical_indicator, attribute_names = dataset.get_data(
-    dataset_format="dataframe", target=dataset.default_target_attribute
+    target=dataset.default_target_attribute
 )
 
 ############################################################################
@@ -63,9 +63,9 @@
 # Visualize the dataset
 # =====================
 
+import matplotlib.pyplot as plt
 import pandas as pd
 import seaborn as sns
-import matplotlib.pyplot as plt
 
 sns.set_style("darkgrid")
 

diff --git a/examples/20_basic/simple_flows_and_runs_tutorial.py b/examples/20_basic/simple_flows_and_runs_tutorial.py
@@ -7,9 +7,9 @@
 
 # License: BSD 3-Clause
 
-import openml
 from sklearn import ensemble, neighbors
 
+import openml
 
 ############################################################################
 # .. warning::

diff --git a/examples/30_extended/datasets_tutorial.py b/examples/30_extended/datasets_tutorial.py
@@ -8,29 +8,24 @@
 
 # License: BSD 3-Clauses
 
-import openml
 import pandas as pd
+
+import openml
 from openml.datasets import edit_dataset, fork_dataset, get_dataset
 
 ############################################################################
 # Exercise 0
 # **********
 #
-# * List datasets
-#
-#   * Use the output_format parameter to select output type
-#   * Default gives 'dict' (other option: 'dataframe', see below)
-#
-# Note: list_datasets will return a pandas dataframe by default from 0.15. When using
-# openml-python 0.14, `list_datasets` will warn you to use output_format='dataframe'.
-datalist = openml.datasets.list_datasets(output_format="dataframe")
+# * List datasets and return a dataframe
+datalist = openml.datasets.list_datasets()
 datalist = datalist[["did", "name", "NumberOfInstances", "NumberOfFeatures", "NumberOfClasses"]]
 
 print(f"First 10 of {len(datalist)} datasets...")
 datalist.head(n=10)
 
 # The same can be done with lesser lines of code
-openml_df = openml.datasets.list_datasets(output_format="dataframe")
+openml_df = openml.datasets.list_datasets()
 openml_df.head(n=10)
 
 ############################################################################

diff --git a/examples/30_extended/fetch_evaluations_tutorial.py b/examples/30_extended/fetch_evaluations_tutorial.py
@@ -32,9 +32,7 @@
 # Required filters can be applied to retrieve results from runs as required.
 
 # We shall retrieve a small set (only 10 entries) to test the listing function for evaluations
-openml.evaluations.list_evaluations(
-    function="predictive_accuracy", size=10, output_format="dataframe"
-)
+openml.evaluations.list_evaluations(function="predictive_accuracy", size=10)
 
 # Using other evaluation metrics, 'precision' in this case
 evals = openml.evaluations.list_evaluations(
@@ -94,7 +92,7 @@ def plot_cdf(values, metric="predictive_accuracy"):
     plt.minorticks_on()
     plt.grid(visible=True, which="minor", linestyle="--")
     plt.axvline(max_val, linestyle="--", color="gray")
-    plt.text(max_val, 0, "%.3f" % max_val, fontsize=9)
+    plt.text(max_val, 0, f"{max_val:.3f}", fontsize=9)
     plt.show()
 
 
@@ -162,7 +160,10 @@ def plot_flow_compare(evaluations, top_n=10, metric="predictive_accuracy"):
 # List evaluations in descending order based on predictive_accuracy with
 # hyperparameters
 evals_setups = openml.evaluations.list_evaluations_setups(
-    function="predictive_accuracy", tasks=[31], size=100, sort_order="desc"
+    function="predictive_accuracy",
+    tasks=[31],
+    size=100,
+    sort_order="desc",
 )
 
 ""

diff --git a/examples/30_extended/flows_and_runs_tutorial.py b/examples/30_extended/flows_and_runs_tutorial.py
@@ -7,9 +7,9 @@
 
 # License: BSD 3-Clause
 
-import openml
-from sklearn import compose, ensemble, impute, neighbors, preprocessing, pipeline, tree
+from sklearn import compose, ensemble, impute, neighbors, pipeline, preprocessing, tree
 
+import openml
 
 ############################################################################
 # We'll use the test server for the rest of this tutorial.

diff --git a/examples/30_extended/plot_svm_hyperparameters_tutorial.py b/examples/30_extended/plot_svm_hyperparameters_tutorial.py
@@ -6,9 +6,10 @@
 
 # License: BSD 3-Clause
 
-import openml
 import numpy as np
 
+import openml
+
 ####################################################################################################
 # First step - obtaining the data
 # ===============================
@@ -22,7 +23,6 @@
     function="predictive_accuracy",
     flows=[8353],
     tasks=[6],
-    output_format="dataframe",
     # Using this flag incorporates the hyperparameters into the returned dataframe. Otherwise,
     # the dataframe would contain a field ``paramaters`` containing an unparsed dictionary.
     parameters_in_separate_columns=True,

diff --git a/examples/30_extended/study_tutorial.py b/examples/30_extended/study_tutorial.py
@@ -17,16 +17,11 @@
 
 import openml
 
-
 ############################################################################
 # Listing studies
 # ***************
-#
-# * Use the output_format parameter to select output type
-# * Default gives ``dict``, but we'll use ``dataframe`` to obtain an
-#   easier-to-work-with data structure
 
-studies = openml.study.list_studies(output_format="dataframe", status="all")
+studies = openml.study.list_studies(status="all")
 print(studies.head(n=10))
 
 
@@ -52,8 +47,8 @@
 # the evaluations available for the conducted runs:
 evaluations = openml.evaluations.list_evaluations(
     function="predictive_accuracy",
-    output_format="dataframe",
     study=study.study_id,
+    output_format="dataframe",
 )
 print(evaluations.head())
 
@@ -81,7 +76,7 @@
 # To verify
 # https://test.openml.org/api/v1/study/1
 suite = openml.study.get_suite("OpenML100")
-print(all([t_id in suite.tasks for t_id in tasks]))
+print(all(t_id in suite.tasks for t_id in tasks))
 
 run_ids = []
 for task_id in tasks:

diff --git a/examples/30_extended/suites_tutorial.py b/examples/30_extended/suites_tutorial.py
@@ -19,16 +19,11 @@
 
 import openml
 
-
 ############################################################################
 # Listing suites
 # **************
-#
-# * Use the output_format parameter to select output type
-# * Default gives ``dict``, but we'll use ``dataframe`` to obtain an
-#   easier-to-work-with data structure
 
-suites = openml.study.list_suites(output_format="dataframe", status="all")
+suites = openml.study.list_suites(status="all")
 print(suites.head(n=10))
 
 ############################################################################
@@ -51,7 +46,7 @@
 
 ############################################################################
 # And we can use the task listing functionality to learn more about them:
-tasks = openml.tasks.list_tasks(output_format="dataframe")
+tasks = openml.tasks.list_tasks()
 
 # Using ``@`` in `pd.DataFrame.query <
 # https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.query.html>`_
@@ -76,7 +71,7 @@
 
 # We'll take a random subset of at least ten tasks of all available tasks on
 # the test server:
-all_tasks = list(openml.tasks.list_tasks(output_format="dataframe")["tid"])
+all_tasks = list(openml.tasks.list_tasks()["tid"])
 task_ids_for_suite = sorted(np.random.choice(all_tasks, replace=False, size=20))
 
 # The study needs a machine-readable and unique alias. To obtain this,

diff --git a/examples/30_extended/task_manual_iteration_tutorial.py b/examples/30_extended/task_manual_iteration_tutorial.py
@@ -68,7 +68,7 @@
 ####################################################################################################
 # And then split the data based on this:
 
-X, y = task.get_X_and_y(dataset_format="dataframe")
+X, y = task.get_X_and_y()
 X_train = X.iloc[train_indices]
 y_train = y.iloc[train_indices]
 X_test = X.iloc[test_indices]
@@ -88,7 +88,7 @@
 
 task_id = 3
 task = openml.tasks.get_task(task_id)
-X, y = task.get_X_and_y(dataset_format="dataframe")
+X, y = task.get_X_and_y()
 n_repeats, n_folds, n_samples = task.get_split_dimensions()
 print(
     "Task {}: number of repeats: {}, number of folds: {}, number of samples {}.".format(
@@ -132,7 +132,7 @@
 
 task_id = 1767
 task = openml.tasks.get_task(task_id)
-X, y = task.get_X_and_y(dataset_format="dataframe")
+X, y = task.get_X_and_y()
 n_repeats, n_folds, n_samples = task.get_split_dimensions()
 print(
     "Task {}: number of repeats: {}, number of folds: {}, number of samples {}.".format(
@@ -176,7 +176,7 @@
 
 task_id = 1702
 task = openml.tasks.get_task(task_id)
-X, y = task.get_X_and_y(dataset_format="dataframe")
+X, y = task.get_X_and_y()
 n_repeats, n_folds, n_samples = task.get_split_dimensions()
 print(
     "Task {}: number of repeats: {}, number of folds: {}, number of samples {}.".format(

diff --git a/examples/30_extended/tasks_tutorial.py b/examples/30_extended/tasks_tutorial.py
@@ -9,7 +9,6 @@
 
 import openml
 from openml.tasks import TaskType
-import pandas as pd
 
 ############################################################################
 #
@@ -30,14 +29,11 @@
 # ^^^^^^^^^^^^^
 #
 # We will start by simply listing only *supervised classification* tasks.
-# **openml.tasks.list_tasks()** returns a dictionary of dictionaries by default, but we
-# request a
+# **openml.tasks.list_tasks()** returns a
 # `pandas dataframe <https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.html>`_
-# instead to have better visualization capabilities and easier access:
+# which offers good visualization capabilities and easy access:
 
-tasks = openml.tasks.list_tasks(
-    task_type=TaskType.SUPERVISED_CLASSIFICATION, output_format="dataframe"
-)
+tasks = openml.tasks.list_tasks(task_type=TaskType.SUPERVISED_CLASSIFICATION)
 print(tasks.columns)
 print(f"First 5 of {len(tasks)} tasks:")
 print(tasks.head())
@@ -71,21 +67,21 @@
 #
 # Similar to listing tasks by task type, we can list tasks by tags:
 
-tasks = openml.tasks.list_tasks(tag="OpenML100", output_format="dataframe")
+tasks = openml.tasks.list_tasks(tag="OpenML100")
 print(f"First 5 of {len(tasks)} tasks:")
 print(tasks.head())
 
 ############################################################################
 # Furthermore, we can list tasks based on the dataset id:
 
-tasks = openml.tasks.list_tasks(data_id=1471, output_format="dataframe")
+tasks = openml.tasks.list_tasks(data_id=1471)
 print(f"First 5 of {len(tasks)} tasks:")
 print(tasks.head())
 
 ############################################################################
 # In addition, a size limit and an offset can be applied both separately and simultaneously:
 
-tasks = openml.tasks.list_tasks(size=10, offset=50, output_format="dataframe")
+tasks = openml.tasks.list_tasks(size=10, offset=50)
 print(tasks)
 
 ############################################################################
@@ -101,7 +97,7 @@
 # Finally, it is also possible to list all tasks on OpenML with:
 
 ############################################################################
-tasks = openml.tasks.list_tasks(output_format="dataframe")
+tasks = openml.tasks.list_tasks()
 print(len(tasks))
 
 ############################################################################
@@ -195,7 +191,7 @@
     # Error code for 'task already exists'
     if e.code == 614:
         # Lookup task
-        tasks = openml.tasks.list_tasks(data_id=128, output_format="dataframe")
+        tasks = openml.tasks.list_tasks(data_id=128)
         tasks = tasks.query(
             'task_type == "Supervised Classification" '
             'and estimation_procedure == "10-fold Crossvalidation" '

diff --git a/examples/40_paper/2015_neurips_feurer_example.py b/examples/40_paper/2015_neurips_feurer_example.py
@@ -13,12 +13,10 @@
 | Matthias Feurer, Aaron Klein, Katharina Eggensperger, Jost Springenberg, Manuel Blum and Frank Hutter
 | In *Advances in Neural Information Processing Systems 28*, 2015
 | Available at https://papers.nips.cc/paper/5872-efficient-and-robust-automated-machine-learning.pdf
-"""  # noqa F401
+"""
 
 # License: BSD 3-Clause
 
-import pandas as pd
-
 import openml
 
 ####################################################################################################
@@ -60,15 +58,14 @@
 tasks = openml.tasks.list_tasks(
     task_type=openml.tasks.TaskType.SUPERVISED_CLASSIFICATION,
     status="all",
-    output_format="dataframe",
 )
 
 # Query only those with holdout as the resampling startegy.
 tasks = tasks.query('estimation_procedure == "33% Holdout set"')
 
 task_ids = []
 for did in dataset_ids:
-    tasks_ = list(tasks.query("did == {}".format(did)).tid)
+    tasks_ = list(tasks.query(f"did == {did}").tid)
     if len(tasks_) >= 1:  # if there are multiple task, take the one with lowest ID (oldest).
         task_id = min(tasks_)
     else:

diff --git a/examples/40_paper/2018_ida_strang_example.py b/examples/40_paper/2018_ida_strang_example.py
@@ -17,8 +17,8 @@
 # License: BSD 3-Clause
 
 import matplotlib.pyplot as plt
+
 import openml
-import pandas as pd
 
 ##############################################################################
 # A basic step for each data-mining or machine learning task is to determine
@@ -47,13 +47,17 @@
 
 # Downloads all evaluation records related to this study
 evaluations = openml.evaluations.list_evaluations(
-    measure, size=None, flows=flow_ids, study=study_id, output_format="dataframe"
+    measure,
+    size=None,
+    flows=flow_ids,
+    study=study_id,
+    output_format="dataframe",
 )
 # gives us a table with columns data_id, flow1_value, flow2_value
 evaluations = evaluations.pivot(index="data_id", columns="flow_id", values="value").dropna()
 # downloads all data qualities (for scatter plot)
 data_qualities = openml.datasets.list_datasets(
-    data_id=list(evaluations.index.values), output_format="dataframe"
+    data_id=list(evaluations.index.values),
 )
 # removes irrelevant data qualities
 data_qualities = data_qualities[meta_features]
@@ -86,10 +90,9 @@
 def determine_class(val_lin, val_nonlin):
     if val_lin < val_nonlin:
         return class_values[0]
-    elif val_nonlin < val_lin:
+    if val_nonlin < val_lin:
         return class_values[1]
-    else:
-        return class_values[2]
+    return class_values[2]
 
 
 evaluations["class"] = evaluations.apply(