From 6342f1d2fb9ce2406bef2bf483277baca0f65134 Mon Sep 17 00:00:00 2001
From: ethanglaser
Date: Mon, 9 Feb 2026 18:07:43 -0800
Subject: [PATCH 1/7] Initial attempt at dataset download issue resolutions

---
 sklbench/datasets/downloaders.py | 208 ++++++++++++++++++++++++++++---
 1 file changed, 188 insertions(+), 20 deletions(-)

diff --git a/sklbench/datasets/downloaders.py b/sklbench/datasets/downloaders.py
index a93787f9..31f737d5 100644
--- a/sklbench/datasets/downloaders.py
+++ b/sklbench/datasets/downloaders.py
@@ -15,6 +15,8 @@
 # ===============================================================================
 
 import os
+import time
+import warnings
 from typing import Callable, List, Union
 
 import numpy as np
@@ -24,33 +26,197 @@ from sklearn.datasets import fetch_openml
 
 
-def retrieve(url: str, filename: str) -> None:
+def retrieve(url: str, filename: str, max_retries: int = 5) -> None:
+    """
+    Download a file from a URL with retry logic and resume capability.
+
+    Args:
+        url: URL to download from
+        filename: Local file path to save to
+        max_retries: Maximum number of retry attempts for failed downloads
+    """
     if os.path.isfile(filename):
-        return
-    elif url.startswith("http"):
-        response = requests.get(url, stream=True)
-        if response.status_code != 200:
-            raise AssertionError(
-                f"Failed to download from {url}.\n"
-                f"Response returned status code {response.status_code}"
+        # Check if file is complete by comparing size
+        try:
+            head_response = requests.head(url, allow_redirects=True, timeout=30)
+            expected_size = int(head_response.headers.get("content-length", 0))
+            actual_size = os.path.getsize(filename)
+
+            if expected_size > 0 and actual_size == expected_size:
+                # File exists and is complete
+                return
+            else:
+                warnings.warn(
+                    f"Existing file {filename} is incomplete ({actual_size}/{expected_size} bytes). "
+                    f"Will attempt to resume download.",
+                    RuntimeWarning
+                )
+        except Exception as e:
+            # If we can't verify, assume file is complete
+            warnings.warn(
+                f"Could not verify file completeness for {filename}: {e}. Assuming complete.",
+                RuntimeWarning
+            )
+            return
+
+    if not url.startswith("http"):
+        raise ValueError(f"URL must start with http:// or https://, got: {url}")
+
+    temp_filename = filename + ".partial"
+    block_size = 8192
+
+    for attempt in range(max_retries):
+        try:
+            # Check if we can resume a partial download
+            resume_pos = 0
+            if os.path.isfile(temp_filename):
+                resume_pos = os.path.getsize(temp_filename)
+                headers = {"Range": f"bytes={resume_pos}-"}
+                mode = "ab"  # Append mode
+                warnings.warn(
+                    f"Resuming download of {url} from byte {resume_pos}",
+                    RuntimeWarning
+                )
+            else:
+                headers = {}
+                mode = "wb"
+
+            response = requests.get(url, stream=True, headers=headers, timeout=60)
+
+            # Handle different response codes
+            if response.status_code == 200:
+                # Full download
+                mode = "wb"
+                resume_pos = 0
+            elif response.status_code == 206:
+                # Partial content (resume successful)
+                pass
+            elif response.status_code == 416:
+                # Range not satisfiable - file might be complete
+                if os.path.isfile(temp_filename):
+                    os.rename(temp_filename, filename)
+                    return
+            else:
+                raise AssertionError(
+                    f"Failed to download from {url}. "
+                    f"Response returned status code {response.status_code}"
+                )
+
+            # Get expected total size
+            if response.status_code == 206:
+                content_range = response.headers.get("content-range", "")
+                if content_range:
+                    total_size = int(content_range.split("/")[1])
+                else:
+                    total_size = 0
+            else:
+                total_size = int(response.headers.get("content-length", 0))
+
+            # Download the file
+            bytes_downloaded = resume_pos
+            with open(temp_filename, mode) as datafile:
+                for data in response.iter_content(block_size):
+                    if data:  # filter out keep-alive chunks
+                        datafile.write(data)
+                        bytes_downloaded += len(data)
+
+            # Verify download completeness
+            if total_size > 0:
+                actual_size = os.path.getsize(temp_filename)
+                if actual_size != total_size:
+                    warnings.warn(
+                        f"Download incomplete: {actual_size}/{total_size} bytes. "
+                        f"Attempt {attempt + 1}/{max_retries}",
+                        RuntimeWarning
+                    )
+                    if attempt < max_retries - 1:
+                        continue  # Retry
+                    else:
+                        raise AssertionError(
+                            f"Failed to completely download {url} after {max_retries} attempts. "
+                            f"Got {actual_size}/{total_size} bytes"
+                        )
+
+            # Download successful, rename temp file to final filename
+            os.rename(temp_filename, filename)
+            return
+
+        except (requests.exceptions.ChunkedEncodingError,
+                requests.exceptions.ConnectionError,
+                requests.exceptions.Timeout) as e:
+            warnings.warn(
+                f"Download interrupted for {url}: {type(e).__name__}: {e}. "
+                f"Attempt {attempt + 1}/{max_retries}",
+                RuntimeWarning
             )
-        total_size = int(response.headers.get("content-length", 0))
-        block_size = 8192
-        n = 0
-        with open(filename, "wb+") as datafile:
-            for data in response.iter_content(block_size):
-                n += len(data) / 1024
-                datafile.write(data)
-        if total_size != 0 and n != total_size / 1024:
-            raise AssertionError("Some content was present but not downloaded/written")
+            if attempt < max_retries - 1:
+                wait_time = 2 ** attempt  # Exponential backoff: 1s, 2s, 4s, 8s, 16s
+                warnings.warn(f"Waiting {wait_time}s before retry...", RuntimeWarning)
+                time.sleep(wait_time)
+                continue
+            else:
+                # Clean up partial file if all retries failed
+                if os.path.isfile(temp_filename):
+                    os.remove(temp_filename)
+                raise AssertionError(
+                    f"Failed to download {url} after {max_retries} attempts. "
+                    f"Last error: {type(e).__name__}: {e}"
+                ) from e
 
 
 def fetch_and_correct_openml(
     data_id: int, raw_data_cache_dir: str, as_frame: str = "auto"
 ):
-    x, y = fetch_openml(
-        data_id=data_id, return_X_y=True, as_frame=as_frame, data_home=raw_data_cache_dir
-    )
+    """
+    Fetch OpenML dataset with fallback for MD5 checksum errors.
+
+    First tries sklearn's fetch_openml. If that fails due to MD5 checksum mismatch,
+    falls back to using the openml package directly, which has updated checksums.
+    """
+    try:
+        # Try sklearn's fetch_openml first
+        x, y = fetch_openml(
+            data_id=data_id, return_X_y=True, as_frame=as_frame, data_home=raw_data_cache_dir
+        )
+    except ValueError as e:
+        # Check if it's an MD5 checksum error
+        if "md5 checksum" in str(e).lower():
+            warnings.warn(
+                f"MD5 checksum validation failed for OpenML dataset {data_id}. "
+                f"Falling back to using openml package directly. "
+                f"Original error: {e}",
+                RuntimeWarning
+            )
+
+            # Fall back to openml package which might have updated checksums
+            try:
+                import openml
+                # Configure openml to use the provided cache directory
+                openml_cache = os.path.join(raw_data_cache_dir, "openml_direct")
+                os.makedirs(openml_cache, exist_ok=True)
+                openml.config.set_root_cache_directory(openml_cache)
+
+                dataset = openml.datasets.get_dataset(
+                    data_id,
+                    download_data=True,
+                    download_qualities=False,
+                    download_features_meta_data=False
+                )
+                #Get the data with target column specified
+                x, y, _, _ = dataset.get_data(
+                    dataset_format="dataframe" if as_frame == "auto" or as_frame else "array",
+                    target=dataset.default_target_attribute
+                )
+            except Exception as openml_error:
+                raise ValueError(
+                    f"Failed to load OpenML dataset {data_id} using both sklearn and openml package. "
+                    f"sklearn error: {e}. openml error: {openml_error}"
+                ) from openml_error
+        else:
+            # Not a checksum error, re-raise
+            raise
+
+    # Validate and convert return types
     if (
         isinstance(x, csr_matrix)
         or isinstance(x, pd.DataFrame)
@@ -59,6 +225,7 @@ def fetch_and_correct_openml(
         pass
     else:
         raise ValueError(f'Unknown "{type(x)}" x type was returned from fetch_openml')
+
     if isinstance(y, pd.Series):
         # label transforms to cat.codes if it is passed as categorical series
         if isinstance(y.dtype, pd.CategoricalDtype):
@@ -68,6 +235,7 @@ def fetch_and_correct_openml(
         pass
    else:
         raise ValueError(f'Unknown "{type(y)}" y type was returned from fetch_openml')
+
     return x, y
 
 
From 2141a21e08950d5b20927ed144d484bbedf8c401 Mon Sep 17 00:00:00 2001
From: ethanglaser
Date: Mon, 9 Feb 2026 18:12:18 -0800
Subject: [PATCH 2/7] add openml to deps

---
 envs/requirements-sklearn.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/envs/requirements-sklearn.txt b/envs/requirements-sklearn.txt
index 064c9e90..a2536eed 100644
--- a/envs/requirements-sklearn.txt
+++ b/envs/requirements-sklearn.txt
@@ -18,3 +18,4 @@ tqdm
 psutil
 requests
 py-cpuinfo
+openml

From 580610b94d0f1a4faadf08d3043a7662b05b1aad Mon Sep 17 00:00:00 2001
From: ethanglaser
Date: Tue, 10 Feb 2026 23:16:30 -0800
Subject: [PATCH 3/7] reduce unnecessary diff

---
 sklbench/datasets/downloaders.py | 243 ++++++++-----------------------
 1 file changed, 60 insertions(+), 183 deletions(-)

diff --git a/sklbench/datasets/downloaders.py b/sklbench/datasets/downloaders.py
index 31f737d5..8fff5794 100644
--- a/sklbench/datasets/downloaders.py
+++ b/sklbench/datasets/downloaders.py
@@ -16,225 +16,102 @@
 
 import os
 import time
-import warnings
 from typing import Callable, List, Union
 
 import numpy as np
+import openml
 import pandas as pd
 import requests
 from scipy.sparse import csr_matrix
-from sklearn.datasets import fetch_openml
 
 
-def retrieve(url: str, filename: str, max_retries: int = 5) -> None:
-    """
-    Download a file from a URL with retry logic and resume capability.
-
-    Args:
-        url: URL to download from
-        filename: Local file path to save to
-        max_retries: Maximum number of retry attempts for failed downloads
-    """
+def retrieve(url: str, filename: str, max_retries: int = 3) -> None:
+    """Download a file from a URL with basic retry logic."""
     if os.path.isfile(filename):
-        # Check if file is complete by comparing size
-        try:
-            head_response = requests.head(url, allow_redirects=True, timeout=30)
-            expected_size = int(head_response.headers.get("content-length", 0))
-            actual_size = os.path.getsize(filename)
-
-            if expected_size > 0 and actual_size == expected_size:
-                # File exists and is complete
-                return
-            else:
-                warnings.warn(
-                    f"Existing file {filename} is incomplete ({actual_size}/{expected_size} bytes). "
-                    f"Will attempt to resume download.",
-                    RuntimeWarning
-                )
-        except Exception as e:
-            # If we can't verify, assume file is complete
-            warnings.warn(
-                f"Could not verify file completeness for {filename}: {e}. Assuming complete.",
-                RuntimeWarning
-            )
-            return
+        return
 
     if not url.startswith("http"):
         raise ValueError(f"URL must start with http:// or https://, got: {url}")
 
-    temp_filename = filename + ".partial"
-    block_size = 8192
-
     for attempt in range(max_retries):
         try:
-            # Check if we can resume a partial download
-            resume_pos = 0
-            if os.path.isfile(temp_filename):
-                resume_pos = os.path.getsize(temp_filename)
-                headers = {"Range": f"bytes={resume_pos}-"}
-                mode = "ab"  # Append mode
-                warnings.warn(
-                    f"Resuming download of {url} from byte {resume_pos}",
-                    RuntimeWarning
-                )
-            else:
-                headers = {}
-                mode = "wb"
-
-            response = requests.get(url, stream=True, headers=headers, timeout=60)
-
-            # Handle different response codes
-            if response.status_code == 200:
-                # Full download
-                mode = "wb"
-                resume_pos = 0
-            elif response.status_code == 206:
-                # Partial content (resume successful)
-                pass
-            elif response.status_code == 416:
-                # Range not satisfiable - file might be complete
-                if os.path.isfile(temp_filename):
-                    os.rename(temp_filename, filename)
-                    return
-            else:
+            response = requests.get(url, stream=True, timeout=120)
+            if response.status_code != 200:
                 raise AssertionError(
                     f"Failed to download from {url}. "
                     f"Response returned status code {response.status_code}"
                 )
 
-            # Get expected total size
-            if response.status_code == 206:
-                content_range = response.headers.get("content-range", "")
-                if content_range:
-                    total_size = int(content_range.split("/")[1])
-                else:
-                    total_size = 0
-            else:
-                total_size = int(response.headers.get("content-length", 0))
+            total_size = int(response.headers.get("content-length", 0))
+            block_size = 8192
 
-            # Download the file
-            bytes_downloaded = resume_pos
-            with open(temp_filename, mode) as datafile:
+            with open(filename, "wb") as datafile:
+                bytes_written = 0
                 for data in response.iter_content(block_size):
-                    if data:  # filter out keep-alive chunks
+                    if data:
                         datafile.write(data)
-                        bytes_downloaded += len(data)
-
-            # Verify download completeness
-            if total_size > 0:
-                actual_size = os.path.getsize(temp_filename)
-                if actual_size != total_size:
-                    warnings.warn(
-                        f"Download incomplete: {actual_size}/{total_size} bytes. "
-                        f"Attempt {attempt + 1}/{max_retries}",
-                        RuntimeWarning
-                    )
-                    if attempt < max_retries - 1:
-                        continue  # Retry
-                    else:
-                        raise AssertionError(
-                            f"Failed to completely download {url} after {max_retries} attempts. "
-                            f"Got {actual_size}/{total_size} bytes"
-                        )
-
-            # Download successful, rename temp file to final filename
-            os.rename(temp_filename, filename)
+                        bytes_written += len(data)
+
+            # Verify download completeness if size is known
+            if total_size > 0 and bytes_written != total_size:
+                os.remove(filename)
+                if attempt < max_retries - 1:
+                    time.sleep(1)
+                    continue
+                raise AssertionError(
+                    f"Incomplete download from {url}. "
+                    f"Expected {total_size} bytes, got {bytes_written}"
+                )
             return
 
-        except (requests.exceptions.ChunkedEncodingError,
-                requests.exceptions.ConnectionError,
-                requests.exceptions.Timeout) as e:
-            warnings.warn(
-                f"Download interrupted for {url}: {type(e).__name__}: {e}. "
-                f"Attempt {attempt + 1}/{max_retries}",
-                RuntimeWarning
-            )
+        except (
+            requests.exceptions.RequestException,
+            IOError,
+        ) as e:
+            if os.path.isfile(filename):
+                os.remove(filename)
             if attempt < max_retries - 1:
-                wait_time = 2 ** attempt  # Exponential backoff: 1s, 2s, 4s, 8s, 16s
-                warnings.warn(f"Waiting {wait_time}s before retry...", RuntimeWarning)
-                time.sleep(wait_time)
+                time.sleep(1)
                 continue
-            else:
-                # Clean up partial file if all retries failed
-                if os.path.isfile(temp_filename):
-                    os.remove(temp_filename)
-                raise AssertionError(
-                    f"Failed to download {url} after {max_retries} attempts. "
-                    f"Last error: {type(e).__name__}: {e}"
-                ) from e
+            raise AssertionError(
+                f"Failed to download {url} after {max_retries} attempts: {e}"
+            ) from e
 
 
 def fetch_and_correct_openml(
     data_id: int, raw_data_cache_dir: str, as_frame: str = "auto"
 ):
-    """
-    Fetch OpenML dataset with fallback for MD5 checksum errors.
-
-    First tries sklearn's fetch_openml. If that fails due to MD5 checksum mismatch,
-    falls back to using the openml package directly, which has updated checksums.
-    """
-    try:
-        # Try sklearn's fetch_openml first
-        x, y = fetch_openml(
-            data_id=data_id, return_X_y=True, as_frame=as_frame, data_home=raw_data_cache_dir
-        )
-    except ValueError as e:
-        # Check if it's an MD5 checksum error
-        if "md5 checksum" in str(e).lower():
-            warnings.warn(
-                f"MD5 checksum validation failed for OpenML dataset {data_id}. "
-                f"Falling back to using openml package directly. "
-                f"Original error: {e}",
-                RuntimeWarning
-            )
-
-            # Fall back to openml package which might have updated checksums
-            try:
-                import openml
-                # Configure openml to use the provided cache directory
-                openml_cache = os.path.join(raw_data_cache_dir, "openml_direct")
-                os.makedirs(openml_cache, exist_ok=True)
-                openml.config.set_root_cache_directory(openml_cache)
-
-                dataset = openml.datasets.get_dataset(
-                    data_id,
-                    download_data=True,
-                    download_qualities=False,
-                    download_features_meta_data=False
-                )
-                #Get the data with target column specified
-                x, y, _, _ = dataset.get_data(
-                    dataset_format="dataframe" if as_frame == "auto" or as_frame else "array",
-                    target=dataset.default_target_attribute
-                )
-            except Exception as openml_error:
-                raise ValueError(
-                    f"Failed to load OpenML dataset {data_id} using both sklearn and openml package. "
-                    f"sklearn error: {e}. openml error: {openml_error}"
-                ) from openml_error
-        else:
-            # Not a checksum error, re-raise
-            raise
-
-    # Validate and convert return types
-    if (
-        isinstance(x, csr_matrix)
-        or isinstance(x, pd.DataFrame)
-        or isinstance(x, np.ndarray)
-    ):
-        pass
-    else:
-        raise ValueError(f'Unknown "{type(x)}" x type was returned from fetch_openml')
-
+    """Fetch OpenML dataset using the openml package."""
+    # Configure openml cache directory
+    openml_cache = os.path.join(raw_data_cache_dir, "openml")
+    os.makedirs(openml_cache, exist_ok=True)
+    openml.config.set_root_cache_directory(openml_cache)
+
+    # Fetch the dataset
+    dataset = openml.datasets.get_dataset(
+        data_id,
+        download_data=True,
+        download_qualities=False,
+        download_features_meta_data=False,
+    )
+
+    # Get the data with target column specified
+    x, y, _, _ = dataset.get_data(
+        dataset_format="dataframe" if as_frame == "auto" or as_frame else "array",
+        target=dataset.default_target_attribute,
+    )
+
+    # Validate x type
+    if not isinstance(x, (csr_matrix, pd.DataFrame, np.ndarray)):
+        raise ValueError(f'Unknown x type "{type(x)}" returned from openml')
+
+    # Convert y to numpy array if needed
     if isinstance(y, pd.Series):
-        # label transforms to cat.codes if it is passed as categorical series
         if isinstance(y.dtype, pd.CategoricalDtype):
             y = y.cat.codes
         y = y.values
-    elif isinstance(y, np.ndarray):
-        pass
-    else:
-        raise ValueError(f'Unknown "{type(y)}" y type was returned from fetch_openml')
+    elif not isinstance(y, np.ndarray):
+        raise ValueError(f'Unknown y type "{type(y)}" returned from openml')
 
     return x, y
 

From 23d69ec13101cc0c110868269a79a6c45a0ee312 Mon Sep 17 00:00:00 2001
From: ethanglaser
Date: Wed, 11 Feb 2026 23:14:15 -0800
Subject: [PATCH 4/7] SparseArray error workaround

---
 sklbench/datasets/downloaders.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/sklbench/datasets/downloaders.py b/sklbench/datasets/downloaders.py
index 8fff5794..48fbf36d 100644
--- a/sklbench/datasets/downloaders.py
+++ b/sklbench/datasets/downloaders.py
@@ -109,7 +109,11 @@ def fetch_and_correct_openml(
     if isinstance(y, pd.Series):
         if isinstance(y.dtype, pd.CategoricalDtype):
             y = y.cat.codes
-        y = y.values
+        # Use to_numpy() for sparse arrays to densify them, otherwise use values
+        if pd.api.types.is_sparse(y):
+            y = y.to_numpy()
+        else:
+            y = y.values
     elif not isinstance(y, np.ndarray):
         raise ValueError(f'Unknown y type "{type(y)}" returned from openml')
 

From 7bfda99275241ad327ac8a77cf447d6e9e1ccc60 Mon Sep 17 00:00:00 2001
From: ethanglaser
Date: Thu, 12 Feb 2026 08:37:23 -0800
Subject: [PATCH 5/7] add sparse workaround for X too

---
 sklbench/datasets/downloaders.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/sklbench/datasets/downloaders.py b/sklbench/datasets/downloaders.py
index 48fbf36d..e386ea71 100644
--- a/sklbench/datasets/downloaders.py
+++ b/sklbench/datasets/downloaders.py
@@ -105,6 +105,11 @@ def fetch_and_correct_openml(
     if not isinstance(x, (csr_matrix, pd.DataFrame, np.ndarray)):
         raise ValueError(f'Unknown x type "{type(x)}" returned from openml')
 
+    # Convert sparse DataFrame to dense format
+    if isinstance(x, pd.DataFrame):
+        if any(pd.api.types.is_sparse(x[col]) for col in x.columns):
+            x = x.sparse.to_dense()
+
     # Convert y to numpy array if needed
     if isinstance(y, pd.Series):
         if isinstance(y.dtype, pd.CategoricalDtype):

From 3f6da9685518e20e41e238e8c684476a4f1f7fa3 Mon Sep 17 00:00:00 2001
From: ethanglaser
Date: Thu, 12 Feb 2026 09:44:56 -0800
Subject: [PATCH 6/7] another fix

---
 sklbench/datasets/downloaders.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sklbench/datasets/downloaders.py b/sklbench/datasets/downloaders.py
index e386ea71..d75f5ea3 100644
--- a/sklbench/datasets/downloaders.py
+++ b/sklbench/datasets/downloaders.py
@@ -97,7 +97,7 @@ def fetch_and_correct_openml(
 
     # Get the data with target column specified
     x, y, _, _ = dataset.get_data(
-        dataset_format="dataframe" if as_frame == "auto" or as_frame else "array",
+        dataset_format="dataframe" if as_frame is True else "array",
         target=dataset.default_target_attribute,
     )
 

From 3588a11ea8220db63e335bda4738701d4bfbd124 Mon Sep 17 00:00:00 2001
From: ethanglaser
Date: Thu, 12 Feb 2026 12:41:10 -0800
Subject: [PATCH 7/7] drop problematic epsilon dataset

---
 configs/regular/svm.json              |  4 ----
 configs/regular/train_test_split.json |  1 -
 configs/regular/xgboost_binary.json   | 17 -----------------
 3 files changed, 22 deletions(-)

diff --git a/configs/regular/svm.json b/configs/regular/svm.json
index f83e1be1..4a1bb915 100644
--- a/configs/regular/svm.json
+++ b/configs/regular/svm.json
@@ -14,10 +14,6 @@
             "data": { "dataset": "ijcnn", "split_kwargs": { "train_size": 20000, "test_size": null } },
             "algorithm": { "estimator_params": { "C": 100.0, "kernel": "rbf" } }
         },
-        {
-            "data": { "dataset": "epsilon", "split_kwargs": { "train_size": 10000, "test_size": 10000 } },
-            "algorithm": { "estimator_params": { "C": 1.0, "kernel": ["poly", "rbf"] } }
-        },
         {
             "data": { "dataset": "gisette", "preprocessing_kwargs": { "normalize": false } },
             "algorithm": {
diff --git a/configs/regular/train_test_split.json b/configs/regular/train_test_split.json
index 607a8f26..a55b6e51 100644
--- a/configs/regular/train_test_split.json
+++ b/configs/regular/train_test_split.json
@@ -10,7 +10,6 @@
                     "susy",
                     "sift",
                     "gist",
-                    "epsilon",
                     "svhn"
                 ]
             }
diff --git a/configs/regular/xgboost_binary.json b/configs/regular/xgboost_binary.json
index ec1d9c2d..bd1ac2c3 100644
--- a/configs/regular/xgboost_binary.json
+++ b/configs/regular/xgboost_binary.json
@@ -42,23 +42,6 @@
                 }
             }
         },
-        {
-            "data": {
-                "dataset": "epsilon",
-                "split_kwargs": {
-                    "train_size": 10000,
-                    "test_size": 100000
-                }
-            },
-            "algorithm": {
-                "estimator_params": {
-                    "max_depth": 8,
-                    "colsample_bytree": 0.1,
-                    "colsample_bynode": 0.1,
-                    "n_estimators": 200
-                }
-            }
-        },
         {
             "data": {
                 "dataset": "gisette",