Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
7a6f845
refactor: Remove "array" format
eddiebergman Oct 16, 2024
6296f37
refactor: Explicitly name parameter to listing functions
eddiebergman Oct 16, 2024
a31dafc
fix: Don't double call list
eddiebergman Oct 17, 2024
3bd924c
update...
eddiebergman Nov 4, 2024
e98c70a
attempted to fixed merge conflicts for examples
SubhadityaMukherjee Apr 1, 2025
2c82122
rename target_names back to target
SubhadityaMukherjee Apr 1, 2025
ce57881
fix: resolve kdd_rijn example to be without dataframe parameter and r…
LennartPurucker Jun 16, 2025
ddda711
fix: ensure dtypes as in original code
LennartPurucker Jun 16, 2025
063a8e6
fix: remove incorrect parsing of sparse pandas
LennartPurucker Jun 16, 2025
2e6c4c7
fix: make sklearn tests work with pandas
LennartPurucker Jun 16, 2025
12dedb0
fix: fix listing calls and test for utils
LennartPurucker Jun 16, 2025
4aae48b
Merge remote-tracking branch 'upstream/develop' into refactor-default…
LennartPurucker Jun 16, 2025
6517f6a
fix/maint: update and fix tests for new dataframe default
LennartPurucker Jun 16, 2025
466022e
fix/maint: resolve tests that used old default format
LennartPurucker Jun 16, 2025
bd120f5
fix: remove OrdinalEncoder
LennartPurucker Jun 16, 2025
de597b5
fix: update test to new assert with onehot
LennartPurucker Jun 16, 2025
32e6fbf
fix/maint: update examples
LennartPurucker Jun 16, 2025
bae06ca
fix: example revert
LennartPurucker Jun 16, 2025
78b1888
fix: add impute for tests to work with older sklearn version
LennartPurucker Jun 16, 2025
22b6b52
fix: make examples work
LennartPurucker Jun 16, 2025
232b37c
Update openml/utils.py
LennartPurucker Jun 17, 2025
f14fce6
Update openml/utils.py
LennartPurucker Jun 17, 2025
7fb5eb2
Update openml/setups/setup.py
LennartPurucker Jun 17, 2025
f45530f
Update openml/setups/setup.py
LennartPurucker Jun 17, 2025
7fb31ce
remove comment we do not understand
LennartPurucker Jun 17, 2025
9c2800e
Merge remote-tracking branch 'upstream/refactor-default-dataframe' in…
LennartPurucker Jun 17, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
<div id="user-content-toc">
<ul align="center" style="list-style: none;">
<summary>
<img src="https://github.com/openml/openml.org/blob/master/app/public/static/svg/logo.svg" width="50" alt="OpenML Logo"/>
<img src="https://github.com/openml/openml.org/blob/master/app/public/static/svg/logo.svg" width="50" alt="OpenML Logo"/>
<h1>OpenML-Python</h1>
<img src="https://github.com/openml/docs/blob/master/docs/img/python.png" width="50" alt="Python Logo"/>
</summary>
Expand Down
6 changes: 3 additions & 3 deletions examples/20_basic/simple_datasets_tutorial.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
# List datasets
# =============

datasets_df = openml.datasets.list_datasets(output_format="dataframe")
datasets_df = openml.datasets.list_datasets()
print(datasets_df.head(n=10))

############################################################################
Expand Down Expand Up @@ -48,7 +48,7 @@
# attribute_names - the names of the features for the examples (X) and
# target feature (y)
X, y, categorical_indicator, attribute_names = dataset.get_data(
dataset_format="dataframe", target=dataset.default_target_attribute
target=dataset.default_target_attribute
)

############################################################################
Expand All @@ -63,9 +63,9 @@
# Visualize the dataset
# =====================

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

sns.set_style("darkgrid")

Expand Down
2 changes: 1 addition & 1 deletion examples/20_basic/simple_flows_and_runs_tutorial.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,9 @@

# License: BSD 3-Clause

import openml
from sklearn import ensemble, neighbors

import openml

############################################################################
# .. warning::
Expand Down
15 changes: 5 additions & 10 deletions examples/30_extended/datasets_tutorial.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,29 +8,24 @@

# License: BSD 3-Clause

import openml
import pandas as pd

import openml
from openml.datasets import edit_dataset, fork_dataset, get_dataset

############################################################################
# Exercise 0
# **********
#
# * List datasets
#
# * Use the output_format parameter to select output type
# * Default gives 'dict' (other option: 'dataframe', see below)
#
# Note: list_datasets will return a pandas dataframe by default from 0.15. When using
# openml-python 0.14, `list_datasets` will warn you to use output_format='dataframe'.
datalist = openml.datasets.list_datasets(output_format="dataframe")
# * List datasets and return a dataframe
datalist = openml.datasets.list_datasets()
datalist = datalist[["did", "name", "NumberOfInstances", "NumberOfFeatures", "NumberOfClasses"]]

print(f"First 10 of {len(datalist)} datasets...")
datalist.head(n=10)

# The same can be done with fewer lines of code
openml_df = openml.datasets.list_datasets(output_format="dataframe")
openml_df = openml.datasets.list_datasets()
openml_df.head(n=10)

############################################################################
Expand Down
11 changes: 6 additions & 5 deletions examples/30_extended/fetch_evaluations_tutorial.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,9 +32,7 @@
# Required filters can be applied to retrieve results from runs as required.

# We shall retrieve a small set (only 10 entries) to test the listing function for evaluations
openml.evaluations.list_evaluations(
function="predictive_accuracy", size=10, output_format="dataframe"
)
openml.evaluations.list_evaluations(function="predictive_accuracy", size=10)

# Using other evaluation metrics, 'precision' in this case
evals = openml.evaluations.list_evaluations(
Expand Down Expand Up @@ -94,7 +92,7 @@ def plot_cdf(values, metric="predictive_accuracy"):
plt.minorticks_on()
plt.grid(visible=True, which="minor", linestyle="--")
plt.axvline(max_val, linestyle="--", color="gray")
plt.text(max_val, 0, "%.3f" % max_val, fontsize=9)
plt.text(max_val, 0, f"{max_val:.3f}", fontsize=9)
plt.show()


Expand Down Expand Up @@ -162,7 +160,10 @@ def plot_flow_compare(evaluations, top_n=10, metric="predictive_accuracy"):
# List evaluations in descending order based on predictive_accuracy with
# hyperparameters
evals_setups = openml.evaluations.list_evaluations_setups(
function="predictive_accuracy", tasks=[31], size=100, sort_order="desc"
function="predictive_accuracy",
tasks=[31],
size=100,
sort_order="desc",
)

""
Expand Down
4 changes: 2 additions & 2 deletions examples/30_extended/flows_and_runs_tutorial.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,9 @@

# License: BSD 3-Clause

import openml
from sklearn import compose, ensemble, impute, neighbors, preprocessing, pipeline, tree
from sklearn import compose, ensemble, impute, neighbors, pipeline, preprocessing, tree

import openml

############################################################################
# We'll use the test server for the rest of this tutorial.
Expand Down
4 changes: 2 additions & 2 deletions examples/30_extended/plot_svm_hyperparameters_tutorial.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,10 @@

# License: BSD 3-Clause

import openml
import numpy as np

import openml

####################################################################################################
# First step - obtaining the data
# ===============================
Expand All @@ -22,7 +23,6 @@
function="predictive_accuracy",
flows=[8353],
tasks=[6],
output_format="dataframe",
# Using this flag incorporates the hyperparameters into the returned dataframe. Otherwise,
# the dataframe would contain a field ``parameters`` containing an unparsed dictionary.
parameters_in_separate_columns=True,
Expand Down
11 changes: 3 additions & 8 deletions examples/30_extended/study_tutorial.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,16 +17,11 @@

import openml


############################################################################
# Listing studies
# ***************
#
# * Use the output_format parameter to select output type
# * Default gives ``dict``, but we'll use ``dataframe`` to obtain an
# easier-to-work-with data structure

studies = openml.study.list_studies(output_format="dataframe", status="all")
studies = openml.study.list_studies(status="all")
print(studies.head(n=10))


Expand All @@ -52,8 +47,8 @@
# the evaluations available for the conducted runs:
evaluations = openml.evaluations.list_evaluations(
function="predictive_accuracy",
output_format="dataframe",
study=study.study_id,
output_format="dataframe",
)
print(evaluations.head())

Expand Down Expand Up @@ -81,7 +76,7 @@
# To verify
# https://test.openml.org/api/v1/study/1
suite = openml.study.get_suite("OpenML100")
print(all([t_id in suite.tasks for t_id in tasks]))
print(all(t_id in suite.tasks for t_id in tasks))

run_ids = []
for task_id in tasks:
Expand Down
11 changes: 3 additions & 8 deletions examples/30_extended/suites_tutorial.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,16 +19,11 @@

import openml


############################################################################
# Listing suites
# **************
#
# * Use the output_format parameter to select output type
# * Default gives ``dict``, but we'll use ``dataframe`` to obtain an
# easier-to-work-with data structure

suites = openml.study.list_suites(output_format="dataframe", status="all")
suites = openml.study.list_suites(status="all")
print(suites.head(n=10))

############################################################################
Expand All @@ -51,7 +46,7 @@

############################################################################
# And we can use the task listing functionality to learn more about them:
tasks = openml.tasks.list_tasks(output_format="dataframe")
tasks = openml.tasks.list_tasks()

# Using ``@`` in `pd.DataFrame.query <
# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.query.html>`_
Expand All @@ -76,7 +71,7 @@

# We'll take a random subset of at least ten tasks of all available tasks on
# the test server:
all_tasks = list(openml.tasks.list_tasks(output_format="dataframe")["tid"])
all_tasks = list(openml.tasks.list_tasks()["tid"])
task_ids_for_suite = sorted(np.random.choice(all_tasks, replace=False, size=20))

# The study needs a machine-readable and unique alias. To obtain this,
Expand Down
8 changes: 4 additions & 4 deletions examples/30_extended/task_manual_iteration_tutorial.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@
####################################################################################################
# And then split the data based on this:

X, y = task.get_X_and_y(dataset_format="dataframe")
X, y = task.get_X_and_y()
X_train = X.iloc[train_indices]
y_train = y.iloc[train_indices]
X_test = X.iloc[test_indices]
Expand All @@ -88,7 +88,7 @@

task_id = 3
task = openml.tasks.get_task(task_id)
X, y = task.get_X_and_y(dataset_format="dataframe")
X, y = task.get_X_and_y()
n_repeats, n_folds, n_samples = task.get_split_dimensions()
print(
"Task {}: number of repeats: {}, number of folds: {}, number of samples {}.".format(
Expand Down Expand Up @@ -132,7 +132,7 @@

task_id = 1767
task = openml.tasks.get_task(task_id)
X, y = task.get_X_and_y(dataset_format="dataframe")
X, y = task.get_X_and_y()
n_repeats, n_folds, n_samples = task.get_split_dimensions()
print(
"Task {}: number of repeats: {}, number of folds: {}, number of samples {}.".format(
Expand Down Expand Up @@ -176,7 +176,7 @@

task_id = 1702
task = openml.tasks.get_task(task_id)
X, y = task.get_X_and_y(dataset_format="dataframe")
X, y = task.get_X_and_y()
n_repeats, n_folds, n_samples = task.get_split_dimensions()
print(
"Task {}: number of repeats: {}, number of folds: {}, number of samples {}.".format(
Expand Down
20 changes: 8 additions & 12 deletions examples/30_extended/tasks_tutorial.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@

import openml
from openml.tasks import TaskType
import pandas as pd

############################################################################
#
Expand All @@ -30,14 +29,11 @@
# ^^^^^^^^^^^^^
#
# We will start by simply listing only *supervised classification* tasks.
# **openml.tasks.list_tasks()** returns a dictionary of dictionaries by default, but we
# request a
# **openml.tasks.list_tasks()** returns a
# `pandas dataframe <https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.html>`_
# instead to have better visualization capabilities and easier access:
# to have good visualization capabilities and easier access:

tasks = openml.tasks.list_tasks(
task_type=TaskType.SUPERVISED_CLASSIFICATION, output_format="dataframe"
)
tasks = openml.tasks.list_tasks(task_type=TaskType.SUPERVISED_CLASSIFICATION)
print(tasks.columns)
print(f"First 5 of {len(tasks)} tasks:")
print(tasks.head())
Expand Down Expand Up @@ -71,21 +67,21 @@
#
# Similar to listing tasks by task type, we can list tasks by tags:

tasks = openml.tasks.list_tasks(tag="OpenML100", output_format="dataframe")
tasks = openml.tasks.list_tasks(tag="OpenML100")
print(f"First 5 of {len(tasks)} tasks:")
print(tasks.head())

############################################################################
# Furthermore, we can list tasks based on the dataset id:

tasks = openml.tasks.list_tasks(data_id=1471, output_format="dataframe")
tasks = openml.tasks.list_tasks(data_id=1471)
print(f"First 5 of {len(tasks)} tasks:")
print(tasks.head())

############################################################################
# In addition, a size limit and an offset can be applied both separately and simultaneously:

tasks = openml.tasks.list_tasks(size=10, offset=50, output_format="dataframe")
tasks = openml.tasks.list_tasks(size=10, offset=50)
print(tasks)

############################################################################
Expand All @@ -101,7 +97,7 @@
# Finally, it is also possible to list all tasks on OpenML with:

############################################################################
tasks = openml.tasks.list_tasks(output_format="dataframe")
tasks = openml.tasks.list_tasks()
print(len(tasks))

############################################################################
Expand Down Expand Up @@ -195,7 +191,7 @@
# Error code for 'task already exists'
if e.code == 614:
# Lookup task
tasks = openml.tasks.list_tasks(data_id=128, output_format="dataframe")
tasks = openml.tasks.list_tasks(data_id=128)
tasks = tasks.query(
'task_type == "Supervised Classification" '
'and estimation_procedure == "10-fold Crossvalidation" '
Expand Down
7 changes: 2 additions & 5 deletions examples/40_paper/2015_neurips_feurer_example.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,12 +13,10 @@
| Matthias Feurer, Aaron Klein, Katharina Eggensperger, Jost Springenberg, Manuel Blum and Frank Hutter
| In *Advances in Neural Information Processing Systems 28*, 2015
| Available at https://papers.nips.cc/paper/5872-efficient-and-robust-automated-machine-learning.pdf
""" # noqa F401
"""

# License: BSD 3-Clause

import pandas as pd

import openml

####################################################################################################
Expand Down Expand Up @@ -60,15 +58,14 @@
tasks = openml.tasks.list_tasks(
task_type=openml.tasks.TaskType.SUPERVISED_CLASSIFICATION,
status="all",
output_format="dataframe",
)

# Query only those with holdout as the resampling strategy.
tasks = tasks.query('estimation_procedure == "33% Holdout set"')

task_ids = []
for did in dataset_ids:
tasks_ = list(tasks.query("did == {}".format(did)).tid)
tasks_ = list(tasks.query(f"did == {did}").tid)
if len(tasks_) >= 1: # if there are multiple task, take the one with lowest ID (oldest).
task_id = min(tasks_)
else:
Expand Down
15 changes: 9 additions & 6 deletions examples/40_paper/2018_ida_strang_example.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,8 @@
# License: BSD 3-Clause

import matplotlib.pyplot as plt

import openml
import pandas as pd

##############################################################################
# A basic step for each data-mining or machine learning task is to determine
Expand Down Expand Up @@ -47,13 +47,17 @@

# Downloads all evaluation records related to this study
evaluations = openml.evaluations.list_evaluations(
measure, size=None, flows=flow_ids, study=study_id, output_format="dataframe"
measure,
size=None,
flows=flow_ids,
study=study_id,
output_format="dataframe",
)
# gives us a table with columns data_id, flow1_value, flow2_value
evaluations = evaluations.pivot(index="data_id", columns="flow_id", values="value").dropna()
# downloads all data qualities (for scatter plot)
data_qualities = openml.datasets.list_datasets(
data_id=list(evaluations.index.values), output_format="dataframe"
data_id=list(evaluations.index.values),
)
# removes irrelevant data qualities
data_qualities = data_qualities[meta_features]
Expand Down Expand Up @@ -86,10 +90,9 @@
def determine_class(val_lin, val_nonlin):
if val_lin < val_nonlin:
return class_values[0]
elif val_nonlin < val_lin:
if val_nonlin < val_lin:
return class_values[1]
else:
return class_values[2]
return class_values[2]


evaluations["class"] = evaluations.apply(
Expand Down
Loading
Loading