From 6342f1d2fb9ce2406bef2bf483277baca0f65134 Mon Sep 17 00:00:00 2001
From: ethanglaser
Date: Mon, 9 Feb 2026 18:07:43 -0800
Subject: [PATCH 1/7] Initial attempt at dataset download issue resolutions

---
 sklbench/datasets/downloaders.py | 208 ++++++++++++++++++++++++++++---
 1 file changed, 188 insertions(+), 20 deletions(-)

diff --git a/sklbench/datasets/downloaders.py b/sklbench/datasets/downloaders.py
index a93787f9..31f737d5 100644
--- a/sklbench/datasets/downloaders.py
+++ b/sklbench/datasets/downloaders.py
@@ -15,6 +15,8 @@
 # ===============================================================================
 
 import os
+import time
+import warnings
 from typing import Callable, List, Union
 
 import numpy as np
@@ -24,33 +26,197 @@ from sklearn.datasets import fetch_openml
 
 
-def retrieve(url: str, filename: str) -> None:
+def retrieve(url: str, filename: str, max_retries: int = 5) -> None:
+    """
+    Download a file from a URL with retry logic and resume capability.
+
+    Args:
+        url: URL to download from
+        filename: Local file path to save to
+        max_retries: Maximum number of retry attempts for failed downloads
+    """
     if os.path.isfile(filename):
-        return
-    elif url.startswith("http"):
-        response = requests.get(url, stream=True)
-        if response.status_code != 200:
-            raise AssertionError(
-                f"Failed to download from {url}.\n"
-                f"Response returned status code {response.status_code}"
+        # Check if file is complete by comparing size
+        try:
+            head_response = requests.head(url, allow_redirects=True, timeout=30)
+            expected_size = int(head_response.headers.get("content-length", 0))
+            actual_size = os.path.getsize(filename)
+
+            if expected_size > 0 and actual_size == expected_size:
+                # File exists and is complete
+                return
+            else:
+                warnings.warn(
+                    f"Existing file {filename} is incomplete ({actual_size}/{expected_size} bytes). "
+                    f"Will attempt to resume download.",
+                    RuntimeWarning
+                )
+        except Exception as e:
+            # If we can't verify, assume file is complete
+            warnings.warn(
+                f"Could not verify file completeness for {filename}: {e}. Assuming complete.",
+                RuntimeWarning
+            )
+            return
+
+    if not url.startswith("http"):
+        raise ValueError(f"URL must start with http:// or https://, got: {url}")
+
+    temp_filename = filename + ".partial"
+    block_size = 8192
+
+    for attempt in range(max_retries):
+        try:
+            # Check if we can resume a partial download
+            resume_pos = 0
+            if os.path.isfile(temp_filename):
+                resume_pos = os.path.getsize(temp_filename)
+                headers = {"Range": f"bytes={resume_pos}-"}
+                mode = "ab"  # Append mode
+                warnings.warn(
+                    f"Resuming download of {url} from byte {resume_pos}",
+                    RuntimeWarning
+                )
+            else:
+                headers = {}
+                mode = "wb"
+
+            response = requests.get(url, stream=True, headers=headers, timeout=60)
+
+            # Handle different response codes
+            if response.status_code == 200:
+                # Full download
+                mode = "wb"
+                resume_pos = 0
+            elif response.status_code == 206:
+                # Partial content (resume successful)
+                pass
+            elif response.status_code == 416:
+                # Range not satisfiable - file might be complete
+                if os.path.isfile(temp_filename):
+                    os.rename(temp_filename, filename)
+                    return
+            else:
+                raise AssertionError(
+                    f"Failed to download from {url}. "
+                    f"Response returned status code {response.status_code}"
+                )
+
+            # Get expected total size
+            if response.status_code == 206:
+                content_range = response.headers.get("content-range", "")
+                if content_range:
+                    total_size = int(content_range.split("/")[1])
+                else:
+                    total_size = 0
+            else:
+                total_size = int(response.headers.get("content-length", 0))
+
+            # Download the file
+            bytes_downloaded = resume_pos
+            with open(temp_filename, mode) as datafile:
+                for data in response.iter_content(block_size):
+                    if data:  # filter out keep-alive chunks
+                        datafile.write(data)
+                        bytes_downloaded += len(data)
+
+            # Verify download completeness
+            if total_size > 0:
+                actual_size = os.path.getsize(temp_filename)
+                if actual_size != total_size:
+                    warnings.warn(
+                        f"Download incomplete: {actual_size}/{total_size} bytes. "
+                        f"Attempt {attempt + 1}/{max_retries}",
+                        RuntimeWarning
+                    )
+                    if attempt < max_retries - 1:
+                        continue  # Retry
+                    else:
+                        raise AssertionError(
+                            f"Failed to completely download {url} after {max_retries} attempts. "
+                            f"Got {actual_size}/{total_size} bytes"
+                        )
+
+            # Download successful, rename temp file to final filename
+            os.rename(temp_filename, filename)
+            return
+
+        except (requests.exceptions.ChunkedEncodingError,
+                requests.exceptions.ConnectionError,
+                requests.exceptions.Timeout) as e:
+            warnings.warn(
+                f"Download interrupted for {url}: {type(e).__name__}: {e}. "
+                f"Attempt {attempt + 1}/{max_retries}",
+                RuntimeWarning
             )
-        total_size = int(response.headers.get("content-length", 0))
-        block_size = 8192
-        n = 0
-        with open(filename, "wb+") as datafile:
-            for data in response.iter_content(block_size):
-                n += len(data) / 1024
-                datafile.write(data)
-        if total_size != 0 and n != total_size / 1024:
-            raise AssertionError("Some content was present but not downloaded/written")
+            if attempt < max_retries - 1:
+                wait_time = 2 ** attempt  # Exponential backoff: 1s, 2s, 4s, 8s, 16s
+                warnings.warn(f"Waiting {wait_time}s before retry...", RuntimeWarning)
+                time.sleep(wait_time)
+                continue
+            else:
+                # Clean up partial file if all retries failed
+                if os.path.isfile(temp_filename):
+                    os.remove(temp_filename)
+                raise AssertionError(
+                    f"Failed to download {url} after {max_retries} attempts. "
+                    f"Last error: {type(e).__name__}: {e}"
+                ) from e
 
 
 def fetch_and_correct_openml(
     data_id: int, raw_data_cache_dir: str, as_frame: str = "auto"
 ):
-    x, y = fetch_openml(
-        data_id=data_id, return_X_y=True, as_frame=as_frame, data_home=raw_data_cache_dir
-    )
+    """
+    Fetch OpenML dataset with fallback for MD5 checksum errors.
+
+    First tries sklearn's fetch_openml. If that fails due to MD5 checksum mismatch,
+    falls back to using the openml package directly, which has updated checksums.
+    """
+    try:
+        # Try sklearn's fetch_openml first
+        x, y = fetch_openml(
+            data_id=data_id, return_X_y=True, as_frame=as_frame, data_home=raw_data_cache_dir
+        )
+    except ValueError as e:
+        # Check if it's an MD5 checksum error
+        if "md5 checksum" in str(e).lower():
+            warnings.warn(
+                f"MD5 checksum validation failed for OpenML dataset {data_id}. "
+                f"Falling back to using openml package directly. "
+                f"Original error: {e}",
+                RuntimeWarning
+            )
+
+            # Fall back to openml package which might have updated checksums
+            try:
+                import openml
+                # Configure openml to use the provided cache directory
+                openml_cache = os.path.join(raw_data_cache_dir, "openml_direct")
+                os.makedirs(openml_cache, exist_ok=True)
+                openml.config.set_root_cache_directory(openml_cache)
+
+                dataset = openml.datasets.get_dataset(
+                    data_id,
+                    download_data=True,
+                    download_qualities=False,
+                    download_features_meta_data=False
+                )
+                #Get the data with target column specified
+                x, y, _, _ = dataset.get_data(
+                    dataset_format="dataframe" if as_frame == "auto" or as_frame else "array",
+                    target=dataset.default_target_attribute
+                )
+            except Exception as openml_error:
+                raise ValueError(
+                    f"Failed to load OpenML dataset {data_id} using both sklearn and openml package. "
+                    f"sklearn error: {e}. openml error: {openml_error}"
+                ) from openml_error
+        else:
+            # Not a checksum error, re-raise
+            raise
+
+    # Validate and convert return types
     if (
         isinstance(x, csr_matrix)
         or isinstance(x, pd.DataFrame)
@@ -59,6 +225,7 @@ def fetch_and_correct_openml(
         pass
     else:
         raise ValueError(f'Unknown "{type(x)}" x type was returned from fetch_openml')
+
     if isinstance(y, pd.Series):
         # label transforms to cat.codes if it is passed as categorical series
         if isinstance(y.dtype, pd.CategoricalDtype):
@@ -68,6 +235,7 @@ def fetch_and_correct_openml(
         pass
    else:
         raise ValueError(f'Unknown "{type(y)}" y type was returned from fetch_openml')
+
     return x, y
 
 
From 2141a21e08950d5b20927ed144d484bbedf8c401 Mon Sep 17 00:00:00 2001
From: ethanglaser
Date: Mon, 9 Feb 2026 18:12:18 -0800
Subject: [PATCH 2/7] add openml to deps

---
 envs/requirements-sklearn.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/envs/requirements-sklearn.txt b/envs/requirements-sklearn.txt
index 064c9e90..a2536eed 100644
--- a/envs/requirements-sklearn.txt
+++ b/envs/requirements-sklearn.txt
@@ -18,3 +18,4 @@ tqdm
 psutil
 requests
 py-cpuinfo
+openml

From 580610b94d0f1a4faadf08d3043a7662b05b1aad Mon Sep 17 00:00:00 2001
From: ethanglaser
Date: Tue, 10 Feb 2026 23:16:30 -0800
Subject: [PATCH 3/7] reduce unnecessary diff

---
 sklbench/datasets/downloaders.py | 243 ++++++++-----------------------
 1 file changed, 60 insertions(+), 183 deletions(-)

diff --git a/sklbench/datasets/downloaders.py b/sklbench/datasets/downloaders.py
index 31f737d5..8fff5794 100644
--- a/sklbench/datasets/downloaders.py
+++ b/sklbench/datasets/downloaders.py
@@ -16,225 +16,102 @@
 
 import os
 import time
-import warnings
 from typing import Callable, List, Union
 
 import numpy as np
+import openml
 import pandas as pd
 import requests
 from scipy.sparse import csr_matrix
-from sklearn.datasets import fetch_openml
 
 
-def retrieve(url: str, filename: str, max_retries: int = 5) -> None:
-    """
-    Download a file from a URL with retry logic and resume capability.
-
-    Args:
-        url: URL to download from
-        filename: Local file path to save to
-        max_retries: Maximum number of retry attempts for failed downloads
-    """
+def retrieve(url: str, filename: str, max_retries: int = 3) -> None:
+    """Download a file from a URL with basic retry logic."""
     if os.path.isfile(filename):
-        # Check if file is complete by comparing size
-        try:
-            head_response = requests.head(url, allow_redirects=True, timeout=30)
-            expected_size = int(head_response.headers.get("content-length", 0))
-            actual_size = os.path.getsize(filename)
-
-            if expected_size > 0 and actual_size == expected_size:
-                # File exists and is complete
-                return
-            else:
-                warnings.warn(
-                    f"Existing file {filename} is incomplete ({actual_size}/{expected_size} bytes). "
-                    f"Will attempt to resume download.",
-                    RuntimeWarning
-                )
-        except Exception as e:
-            # If we can't verify, assume file is complete
-            warnings.warn(
-                f"Could not verify file completeness for {filename}: {e}. Assuming complete.",
-                RuntimeWarning
-            )
-            return
+        return
 
     if not url.startswith("http"):
         raise ValueError(f"URL must start with http:// or https://, got: {url}")
 
-    temp_filename = filename + ".partial"
-    block_size = 8192
-
     for attempt in range(max_retries):
         try:
-            # Check if we can resume a partial download
-            resume_pos = 0
-            if os.path.isfile(temp_filename):
-                resume_pos = os.path.getsize(temp_filename)
-                headers = {"Range": f"bytes={resume_pos}-"}
-                mode = "ab"  # Append mode
-                warnings.warn(
-                    f"Resuming download of {url} from byte {resume_pos}",
-                    RuntimeWarning
-                )
-            else:
-                headers = {}
-                mode = "wb"
-
-            response = requests.get(url, stream=True, headers=headers, timeout=60)
-
-            # Handle different response codes
-            if response.status_code == 200:
-                # Full download
-                mode = "wb"
-                resume_pos = 0
-            elif response.status_code == 206:
-                # Partial content (resume successful)
-                pass
-            elif response.status_code == 416:
-                # Range not satisfiable - file might be complete
-                if os.path.isfile(temp_filename):
-                    os.rename(temp_filename, filename)
-                    return
-            else:
+            response = requests.get(url, stream=True, timeout=120)
+            if response.status_code != 200:
                 raise AssertionError(
                     f"Failed to download from {url}. "
                     f"Response returned status code {response.status_code}"
                 )
 
-            # Get expected total size
-            if response.status_code == 206:
-                content_range = response.headers.get("content-range", "")
-                if content_range:
-                    total_size = int(content_range.split("/")[1])
-                else:
-                    total_size = 0
-            else:
-                total_size = int(response.headers.get("content-length", 0))
+            total_size = int(response.headers.get("content-length", 0))
+            block_size = 8192
 
-            # Download the file
-            bytes_downloaded = resume_pos
-            with open(temp_filename, mode) as datafile:
+            with open(filename, "wb") as datafile:
+                bytes_written = 0
                 for data in response.iter_content(block_size):
-                    if data:  # filter out keep-alive chunks
+                    if data:
                         datafile.write(data)
-                        bytes_downloaded += len(data)
-
-            # Verify download completeness
-            if total_size > 0:
-                actual_size = os.path.getsize(temp_filename)
-                if actual_size != total_size:
-                    warnings.warn(
-                        f"Download incomplete: {actual_size}/{total_size} bytes. "
-                        f"Attempt {attempt + 1}/{max_retries}",
-                        RuntimeWarning
-                    )
-                    if attempt < max_retries - 1:
-                        continue  # Retry
-                    else:
-                        raise AssertionError(
-                            f"Failed to completely download {url} after {max_retries} attempts. "
-                            f"Got {actual_size}/{total_size} bytes"
-                        )
-
-            # Download successful, rename temp file to final filename
-            os.rename(temp_filename, filename)
+                        bytes_written += len(data)
+
+            # Verify download completeness if size is known
+            if total_size > 0 and bytes_written != total_size:
+                os.remove(filename)
+                if attempt < max_retries - 1:
+                    time.sleep(1)
+                    continue
+                raise AssertionError(
+                    f"Incomplete download from {url}. "
+                    f"Expected {total_size} bytes, got {bytes_written}"
+                )
             return
 
-        except (requests.exceptions.ChunkedEncodingError,
-                requests.exceptions.ConnectionError,
-                requests.exceptions.Timeout) as e:
-            warnings.warn(
-                f"Download interrupted for {url}: {type(e).__name__}: {e}. "
-                f"Attempt {attempt + 1}/{max_retries}",
-                RuntimeWarning
-            )
+        except (
+            requests.exceptions.RequestException,
+            IOError,
+        ) as e:
+            if os.path.isfile(filename):
+                os.remove(filename)
             if attempt < max_retries - 1:
-                wait_time = 2 ** attempt  # Exponential backoff: 1s, 2s, 4s, 8s, 16s
-                warnings.warn(f"Waiting {wait_time}s before retry...", RuntimeWarning)
-                time.sleep(wait_time)
+                time.sleep(1)
                 continue
-            else:
-                # Clean up partial file if all retries failed
-                if os.path.isfile(temp_filename):
-                    os.remove(temp_filename)
-                raise AssertionError(
-                    f"Failed to download {url} after {max_retries} attempts. "
-                    f"Last error: {type(e).__name__}: {e}"
-                ) from e
+            raise AssertionError(
+                f"Failed to download {url} after {max_retries} attempts: {e}"
+            ) from e
 
 
 def fetch_and_correct_openml(
     data_id: int, raw_data_cache_dir: str, as_frame: str = "auto"
 ):
-    """
-    Fetch OpenML dataset with fallback for MD5 checksum errors.
-
-    First tries sklearn's fetch_openml. If that fails due to MD5 checksum mismatch,
-    falls back to using the openml package directly, which has updated checksums.
-    """
-    try:
-        # Try sklearn's fetch_openml first
-        x, y = fetch_openml(
-            data_id=data_id, return_X_y=True, as_frame=as_frame, data_home=raw_data_cache_dir
-        )
-    except ValueError as e:
-        # Check if it's an MD5 checksum error
-        if "md5 checksum" in str(e).lower():
-            warnings.warn(
-                f"MD5 checksum validation failed for OpenML dataset {data_id}. "
-                f"Falling back to using openml package directly. "
-                f"Original error: {e}",
-                RuntimeWarning
-            )
-
-            # Fall back to openml package which might have updated checksums
-            try:
-                import openml
-                # Configure openml to use the provided cache directory
-                openml_cache = os.path.join(raw_data_cache_dir, "openml_direct")
-                os.makedirs(openml_cache, exist_ok=True)
-                openml.config.set_root_cache_directory(openml_cache)
-
-                dataset = openml.datasets.get_dataset(
-                    data_id,
-                    download_data=True,
-                    download_qualities=False,
-                    download_features_meta_data=False
-                )
-                #Get the data with target column specified
-                x, y, _, _ = dataset.get_data(
-                    dataset_format="dataframe" if as_frame == "auto" or as_frame else "array",
-                    target=dataset.default_target_attribute
-                )
-            except Exception as openml_error:
-                raise ValueError(
-                    f"Failed to load OpenML dataset {data_id} using both sklearn and openml package. "
-                    f"sklearn error: {e}. openml error: {openml_error}"
-                ) from openml_error
-        else:
-            # Not a checksum error, re-raise
-            raise
-
-    # Validate and convert return types
-    if (
-        isinstance(x, csr_matrix)
-        or isinstance(x, pd.DataFrame)
-        or isinstance(x, np.ndarray)
-    ):
-        pass
-    else:
-        raise ValueError(f'Unknown "{type(x)}" x type was returned from fetch_openml')
-
+    """Fetch OpenML dataset using the openml package."""
+    # Configure openml cache directory
+    openml_cache = os.path.join(raw_data_cache_dir, "openml")
+    os.makedirs(openml_cache, exist_ok=True)
+    openml.config.set_root_cache_directory(openml_cache)
+
+    # Fetch the dataset
+    dataset = openml.datasets.get_dataset(
+        data_id,
+        download_data=True,
+        download_qualities=False,
+        download_features_meta_data=False,
+    )
+
+    # Get the data with target column specified
+    x, y, _, _ = dataset.get_data(
+        dataset_format="dataframe" if as_frame == "auto" or as_frame else "array",
+        target=dataset.default_target_attribute,
+    )
+
+    # Validate x type
+    if not isinstance(x, (csr_matrix, pd.DataFrame, np.ndarray)):
+        raise ValueError(f'Unknown x type "{type(x)}" returned from openml')
+
+    # Convert y to numpy array if needed
     if isinstance(y, pd.Series):
-        # label transforms to cat.codes if it is passed as categorical series
         if isinstance(y.dtype, pd.CategoricalDtype):
             y = y.cat.codes
         y = y.values
-    elif isinstance(y, np.ndarray):
-        pass
-    else:
-        raise ValueError(f'Unknown "{type(y)}" y type was returned from fetch_openml')
+    elif not isinstance(y, np.ndarray):
+        raise ValueError(f'Unknown y type "{type(y)}" returned from openml')
 
     return x, y
 

From 23d69ec13101cc0c110868269a79a6c45a0ee312 Mon Sep 17 00:00:00 2001
From: ethanglaser
Date: Wed, 11 Feb 2026 23:14:15 -0800
Subject: [PATCH 4/7] SparseArray error workaround

---
 sklbench/datasets/downloaders.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/sklbench/datasets/downloaders.py b/sklbench/datasets/downloaders.py
index 8fff5794..48fbf36d 100644
--- a/sklbench/datasets/downloaders.py
+++ b/sklbench/datasets/downloaders.py
@@ -109,7 +109,11 @@ def fetch_and_correct_openml(
     if isinstance(y, pd.Series):
         if isinstance(y.dtype, pd.CategoricalDtype):
             y = y.cat.codes
-        y = y.values
+        # Use to_numpy() for sparse arrays to densify them, otherwise use values
+        if pd.api.types.is_sparse(y):
+            y = y.to_numpy()
+        else:
+            y = y.values
     elif not isinstance(y, np.ndarray):
         raise ValueError(f'Unknown y type "{type(y)}" returned from openml')
 

From 7bfda99275241ad327ac8a77cf447d6e9e1ccc60 Mon Sep 17 00:00:00 2001
From: ethanglaser
Date: Thu, 12 Feb 2026 08:37:23 -0800
Subject: [PATCH 5/7] add sparse workaround for X too

---
 sklbench/datasets/downloaders.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/sklbench/datasets/downloaders.py b/sklbench/datasets/downloaders.py
index 48fbf36d..e386ea71 100644
--- a/sklbench/datasets/downloaders.py
+++ b/sklbench/datasets/downloaders.py
@@ -105,6 +105,11 @@ def fetch_and_correct_openml(
     if not isinstance(x, (csr_matrix, pd.DataFrame, np.ndarray)):
         raise ValueError(f'Unknown x type "{type(x)}" returned from openml')
 
+    # Convert sparse DataFrame to dense format
+    if isinstance(x, pd.DataFrame):
+        if any(pd.api.types.is_sparse(x[col]) for col in x.columns):
+            x = x.sparse.to_dense()
+
     # Convert y to numpy array if needed
     if isinstance(y, pd.Series):
         if isinstance(y.dtype, pd.CategoricalDtype):

From 3f6da9685518e20e41e238e8c684476a4f1f7fa3 Mon Sep 17 00:00:00 2001
From: ethanglaser
Date: Thu, 12 Feb 2026 09:44:56 -0800
Subject: [PATCH 6/7] another fix

---
 sklbench/datasets/downloaders.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sklbench/datasets/downloaders.py b/sklbench/datasets/downloaders.py
index e386ea71..d75f5ea3 100644
--- a/sklbench/datasets/downloaders.py
+++ b/sklbench/datasets/downloaders.py
@@ -97,7 +97,7 @@ def fetch_and_correct_openml(
 
     # Get the data with target column specified
     x, y, _, _ = dataset.get_data(
-        dataset_format="dataframe" if as_frame == "auto" or as_frame else "array",
+        dataset_format="dataframe" if as_frame is True else "array",
         target=dataset.default_target_attribute,
     )
 

From 3588a11ea8220db63e335bda4738701d4bfbd124 Mon Sep 17 00:00:00 2001
From: ethanglaser
Date: Thu, 12 Feb 2026 12:41:10 -0800
Subject: [PATCH 7/7] drop problematic epsilon dataset

---
 configs/regular/svm.json              |  4 ----
 configs/regular/train_test_split.json |  1 -
 configs/regular/xgboost_binary.json   | 17 -----------------
 3 files changed, 22 deletions(-)

diff --git a/configs/regular/svm.json b/configs/regular/svm.json
index f83e1be1..4a1bb915 100644
--- a/configs/regular/svm.json
+++ b/configs/regular/svm.json
@@ -14,10 +14,6 @@
             "data": { "dataset": "ijcnn", "split_kwargs": { "train_size": 20000, "test_size": null } },
             "algorithm": { "estimator_params": { "C": 100.0, "kernel": "rbf" } }
         },
-        {
-            "data": { "dataset": "epsilon", "split_kwargs": { "train_size": 10000, "test_size": 10000 } },
-            "algorithm": { "estimator_params": { "C": 1.0, "kernel": ["poly", "rbf"] } }
-        },
         {
             "data": { "dataset": "gisette", "preprocessing_kwargs": { "normalize": false } },
             "algorithm": {
diff --git a/configs/regular/train_test_split.json b/configs/regular/train_test_split.json
index 607a8f26..a55b6e51 100644
--- a/configs/regular/train_test_split.json
+++ b/configs/regular/train_test_split.json
@@ -10,7 +10,6 @@
                     "susy",
                     "sift",
                     "gist",
-                    "epsilon",
                     "svhn"
                 ]
             }
diff --git a/configs/regular/xgboost_binary.json b/configs/regular/xgboost_binary.json
index ec1d9c2d..bd1ac2c3 100644
--- a/configs/regular/xgboost_binary.json
+++ b/configs/regular/xgboost_binary.json
@@ -42,23 +42,6 @@
                 }
             }
         },
-        {
-            "data": {
-                "dataset": "epsilon",
-                "split_kwargs": {
-                    "train_size": 10000,
-                    "test_size": 100000
-                }
-            },
-            "algorithm": {
-                "estimator_params": {
-                    "max_depth": 8,
-                    "colsample_bytree": 0.1,
-                    "colsample_bynode": 0.1,
-                    "n_estimators": 200
-                }
-            }
-        },
         {
             "data": {
                 "dataset": "gisette",