diff --git a/configs/regular/svm.json b/configs/regular/svm.json
index f83e1be1..4a1bb915 100644
--- a/configs/regular/svm.json
+++ b/configs/regular/svm.json
@@ -14,10 +14,6 @@
             "data": { "dataset": "ijcnn", "split_kwargs": { "train_size": 20000, "test_size": null } },
             "algorithm": { "estimator_params": { "C": 100.0, "kernel": "rbf" } }
         },
-        {
-            "data": { "dataset": "epsilon", "split_kwargs": { "train_size": 10000, "test_size": 10000 } },
-            "algorithm": { "estimator_params": { "C": 1.0, "kernel": ["poly", "rbf"] } }
-        },
         {
             "data": { "dataset": "gisette", "preprocessing_kwargs": { "normalize": false } },
             "algorithm": {
diff --git a/configs/regular/train_test_split.json b/configs/regular/train_test_split.json
index 607a8f26..a55b6e51 100644
--- a/configs/regular/train_test_split.json
+++ b/configs/regular/train_test_split.json
@@ -10,7 +10,6 @@
                 "susy",
                 "sift",
                 "gist",
-                "epsilon",
                 "svhn"
             ]
         }
diff --git a/configs/regular/xgboost_binary.json b/configs/regular/xgboost_binary.json
index ec1d9c2d..bd1ac2c3 100644
--- a/configs/regular/xgboost_binary.json
+++ b/configs/regular/xgboost_binary.json
@@ -42,23 +42,6 @@
                 }
             }
         },
-        {
-            "data": {
-                "dataset": "epsilon",
-                "split_kwargs": {
-                    "train_size": 10000,
-                    "test_size": 100000
-                }
-            },
-            "algorithm": {
-                "estimator_params": {
-                    "max_depth": 8,
-                    "colsample_bytree": 0.1,
-                    "colsample_bynode": 0.1,
-                    "n_estimators": 200
-                }
-            }
-        },
         {
             "data": {
                 "dataset": "gisette",
diff --git a/envs/requirements-sklearn.txt b/envs/requirements-sklearn.txt
index 064c9e90..a2536eed 100644
--- a/envs/requirements-sklearn.txt
+++ b/envs/requirements-sklearn.txt
@@ -18,3 +18,4 @@ tqdm
 psutil
 requests
 py-cpuinfo
+openml
diff --git a/sklbench/datasets/downloaders.py b/sklbench/datasets/downloaders.py
index a93787f9..d75f5ea3 100644
--- a/sklbench/datasets/downloaders.py
+++ b/sklbench/datasets/downloaders.py
@@ -15,59 +15,113 @@
 # ===============================================================================
 
 import os
+import time
 from typing import Callable, List, Union
 
 import numpy as np
+import openml
 import pandas as pd
 import requests
 from scipy.sparse import csr_matrix
-from sklearn.datasets import fetch_openml
 
 
-def retrieve(url: str, filename: str) -> None:
+def retrieve(url: str, filename: str, max_retries: int = 3) -> None:
+    """Download a file from a URL with basic retry logic."""
     if os.path.isfile(filename):
         return
-    elif url.startswith("http"):
-        response = requests.get(url, stream=True)
-        if response.status_code != 200:
+
+    if not url.startswith("http"):
+        raise ValueError(f"URL must start with http:// or https://, got: {url}")
+
+    for attempt in range(max_retries):
+        try:
+            response = requests.get(url, stream=True, timeout=120)
+            if response.status_code != 200:
+                raise AssertionError(
+                    f"Failed to download from {url}. "
+                    f"Response returned status code {response.status_code}"
+                )
+
+            total_size = int(response.headers.get("content-length", 0))
+            block_size = 8192
+
+            with open(filename, "wb") as datafile:
+                bytes_written = 0
+                for data in response.iter_content(block_size):
+                    if data:
+                        datafile.write(data)
+                        bytes_written += len(data)
+
+            # Verify download completeness if size is known
+            if total_size > 0 and bytes_written != total_size:
+                os.remove(filename)
+                if attempt < max_retries - 1:
+                    time.sleep(1)
+                    continue
+                raise AssertionError(
+                    f"Incomplete download from {url}. "
+                    f"Expected {total_size} bytes, got {bytes_written}"
+                )
+            return
+
+        except (
+            requests.exceptions.RequestException,
+            IOError,
+        ) as e:
+            if os.path.isfile(filename):
+                os.remove(filename)
+            if attempt < max_retries - 1:
+                time.sleep(1)
+                continue
             raise AssertionError(
-                f"Failed to download from {url}.\n"
-                f"Response returned status code {response.status_code}"
-            )
-        total_size = int(response.headers.get("content-length", 0))
-        block_size = 8192
-        n = 0
-        with open(filename, "wb+") as datafile:
-            for data in response.iter_content(block_size):
-                n += len(data) / 1024
-                datafile.write(data)
-        if total_size != 0 and n != total_size / 1024:
-            raise AssertionError("Some content was present but not downloaded/written")
+                f"Failed to download {url} after {max_retries} attempts: {e}"
+            ) from e
 
 
 def fetch_and_correct_openml(
     data_id: int, raw_data_cache_dir: str, as_frame: str = "auto"
 ):
-    x, y = fetch_openml(
-        data_id=data_id, return_X_y=True, as_frame=as_frame, data_home=raw_data_cache_dir
+    """Fetch OpenML dataset using the openml package."""
+    # Configure openml cache directory
+    openml_cache = os.path.join(raw_data_cache_dir, "openml")
+    os.makedirs(openml_cache, exist_ok=True)
+    openml.config.set_root_cache_directory(openml_cache)
+
+    # Fetch the dataset
+    dataset = openml.datasets.get_dataset(
+        data_id,
+        download_data=True,
+        download_qualities=False,
+        download_features_meta_data=False,
     )
-    if (
-        isinstance(x, csr_matrix)
-        or isinstance(x, pd.DataFrame)
-        or isinstance(x, np.ndarray)
-    ):
-        pass
-    else:
-        raise ValueError(f'Unknown "{type(x)}" x type was returned from fetch_openml')
+
+    # Get the data with target column specified
+    x, y, _, _ = dataset.get_data(
+        dataset_format="dataframe" if as_frame is True else "array",
+        target=dataset.default_target_attribute,
+    )
+
+    # Validate x type
+    if not isinstance(x, (csr_matrix, pd.DataFrame, np.ndarray)):
+        raise ValueError(f'Unknown x type "{type(x)}" returned from openml')
+
+    # Convert sparse DataFrame to dense format
+    if isinstance(x, pd.DataFrame):
+        if any(pd.api.types.is_sparse(x[col]) for col in x.columns):
+            x = x.sparse.to_dense()
+
+    # Convert y to numpy array if needed
     if isinstance(y, pd.Series):
-        # label transforms to cat.codes if it is passed as categorical series
         if isinstance(y.dtype, pd.CategoricalDtype):
             y = y.cat.codes
-        y = y.values
-    elif isinstance(y, np.ndarray):
-        pass
-    else:
-        raise ValueError(f'Unknown "{type(y)}" y type was returned from fetch_openml')
+        # Use to_numpy() for sparse arrays to densify them, otherwise use values
+        if pd.api.types.is_sparse(y):
+            y = y.to_numpy()
+        else:
+            y = y.values
+    elif not isinstance(y, np.ndarray):
+        raise ValueError(f'Unknown y type "{type(y)}" returned from openml')
+
     return x, y
 
 