From bddca29edd0d3bcce4f595159180f2d54cc88ad8 Mon Sep 17 00:00:00 2001
From: "Ralf W. Grosse-Kunstleve" <rgrossekunst@nvidia.com>
Date: Wed, 4 Feb 2026 14:54:59 -0800
Subject: [PATCH 1/4] Manually disable the cuMemAllocManaged binding

---
 cuda_bindings/cuda/bindings/driver.pyx.in | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cuda_bindings/cuda/bindings/driver.pyx.in b/cuda_bindings/cuda/bindings/driver.pyx.in
index 4ed8409a4d..f28b48a410 100644
--- a/cuda_bindings/cuda/bindings/driver.pyx.in
+++ b/cuda_bindings/cuda/bindings/driver.pyx.in
@@ -29191,7 +29191,7 @@ def cuMemHostGetFlags(p):
     return (_dict_CUresult[err], pFlags)
 {{endif}}
 
-{{if 'cuMemAllocManaged' in found_functions}}
+{{if 'MANUALLYDISABLEDcuMemAllocManaged' in found_functions}}
 
 @cython.embedsignature(True)
 def cuMemAllocManaged(size_t bytesize, unsigned int flags):

From b9f8452fa294e8f1e7bb45ec90fc01aaa0a13911 Mon Sep 17 00:00:00 2001
From: "Ralf W. Grosse-Kunstleve" <rgrossekunst@nvidia.com>
Date: Wed, 4 Feb 2026 17:03:59 -0800
Subject: [PATCH 2/4] Skip managed-memory tests when concurrent access is
 disabled

Treat missing cuMemAllocManaged as disabled access and gate managed-memory
test paths in cuda_core and cuda_bindings to avoid false failures.

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 .../bindings/_test_helpers/managed_memory.py  | 68 +++++++++++++++
 cuda_bindings/tests/test_cuda.py              |  9 ++
 cuda_core/tests/conftest.py                   | 11 +++
 cuda_core/tests/memory_ipc/test_event_ipc.py  |  2 +
 cuda_core/tests/memory_ipc/test_memory_ipc.py |  3 +
 .../tests/memory_ipc/test_peer_access.py      |  2 +
 .../tests/memory_ipc/test_send_buffers.py     |  2 +
 cuda_core/tests/memory_ipc/test_serialize.py  |  3 +
 cuda_core/tests/memory_ipc/test_workerpool.py |  2 +
 cuda_core/tests/test_graph_mem.py             |  2 +
 cuda_core/tests/test_helpers.py               |  2 +
 cuda_core/tests/test_memory.py                | 86 ++++++++++++++-----
 cuda_core/tests/test_memory_peer_access.py    |  2 +
 13 files changed, 174 insertions(+), 20 deletions(-)
 create mode 100644 cuda_bindings/cuda/bindings/_test_helpers/managed_memory.py

diff --git a/cuda_bindings/cuda/bindings/_test_helpers/managed_memory.py b/cuda_bindings/cuda/bindings/_test_helpers/managed_memory.py
new file mode 100644
index 0000000000..b7a077d6fb
--- /dev/null
+++ b/cuda_bindings/cuda/bindings/_test_helpers/managed_memory.py
@@ -0,0 +1,68 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
+
+from __future__ import annotations
+
+from functools import cache
+
+import pytest
+
+try:
+    from cuda.bindings import driver
+except Exception:
+    from cuda import cuda as driver
+
+
+def _resolve_device_id(device) -> int:
+    if device is None:
+        return 0  # no device given: use device id 0
+    if hasattr(device, "device_id"):
+        return int(device.device_id)  # object exposing a ``device_id`` attribute
+    try:
+        return int(device)  # anything else that int() accepts
+    except Exception:
+        return 0  # best effort: fall back to device id 0 instead of raising
+
+
+def _cu_init_ok() -> bool:  # True only if driver.cuInit(0) reports CUDA_SUCCESS
+    try:
+        (err,) = driver.cuInit(0)
+    except Exception:
+        return False  # any exception from the call or the unpacking counts as failure
+    return err == driver.CUresult.CUDA_SUCCESS
+
+
+@cache  # memoized per device_id; a None (failed) result is cached as well
+def _get_concurrent_managed_access(device_id: int) -> int | None:  # None when the query fails
+    if not _cu_init_ok():
+        return None
+    try:
+        attr = driver.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS
+    except Exception:
+        return None  # enum member could not be looked up on this driver module
+    try:
+        err, value = driver.cuDeviceGetAttribute(attr, device_id)
+    except Exception:
+        return None  # the attribute query itself raised
+    if err != driver.CUresult.CUDA_SUCCESS:
+        return None  # the driver reported an error for this query
+    return int(value)
+
+
+def managed_memory_skip_reason(device=None) -> str | None:
+    """Return a skip reason (str) if managed-memory tests should be skipped for ``device``, else ``None``."""
+    if not hasattr(driver, "cuMemAllocManaged"):
+        return "cuMemAllocManaged is unavailable; treating concurrent managed access as disabled"
+    device_id = _resolve_device_id(device)
+    value = _get_concurrent_managed_access(device_id)
+    if value is None:
+        return "Unable to query CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS"
+    if value == 0:
+        return "CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS == 0"
+    return None  # attribute is non-zero: nothing to skip
+
+
+def skip_if_concurrent_managed_access_disabled(device=None) -> None:  # pytest.skip() if a reason exists
+    reason = managed_memory_skip_reason(device)
+    if reason:
+        pytest.skip(reason)
diff --git a/cuda_bindings/tests/test_cuda.py b/cuda_bindings/tests/test_cuda.py
index 51810cf3c7..2a14aefd05 100644
--- a/cuda_bindings/tests/test_cuda.py
+++ b/cuda_bindings/tests/test_cuda.py
@@ -10,6 +10,7 @@
 import numpy as np
 import pytest
 from cuda.bindings import driver
+from cuda.bindings._test_helpers.managed_memory import managed_memory_skip_reason
 
 
 def driverVersionLessThan(target):
@@ -38,6 +39,12 @@ def callableBinary(name):
     return shutil.which(name) is not None
 
 
+def skip_if_concurrent_managed_access_disabled():  # skip the calling test when a skip reason is reported
+    reason = managed_memory_skip_reason()  # no device argument: the helper's default is used
+    if reason:
+        pytest.skip(reason)
+
+
 def test_cuda_memcpy():
     # Get device
 
@@ -323,6 +330,7 @@ def test_cuda_memPool_attr():
     driverVersionLessThan(11030) or not supportsManagedMemory(), reason="When new attributes were introduced"
 )
 def test_cuda_pointer_attr():
+    skip_if_concurrent_managed_access_disabled()
     err, ptr = cuda.cuMemAllocManaged(0x1000, cuda.CUmemAttach_flags.CU_MEM_ATTACH_GLOBAL.value)
     assert err == cuda.CUresult.CUDA_SUCCESS
 
@@ -388,6 +396,7 @@ def test_pointer_get_attributes_device_ordinal():
 
 @pytest.mark.skipif(not supportsManagedMemory(), reason="When new attributes were introduced")
 def test_cuda_mem_range_attr(device):
+    skip_if_concurrent_managed_access_disabled()
     size = 0x1000
     location_device = cuda.CUmemLocation()
     location_device.type = cuda.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE
diff --git a/cuda_core/tests/conftest.py b/cuda_core/tests/conftest.py
index 340e632719..cc8953a7ae 100644
--- a/cuda_core/tests/conftest.py
+++ b/cuda_core/tests/conftest.py
@@ -6,6 +6,7 @@
 
 import helpers
 import pytest
+from cuda.bindings._test_helpers.managed_memory import managed_memory_skip_reason
 
 try:
     from cuda.bindings import driver
@@ -35,6 +36,9 @@ def skip_if_pinned_memory_unsupported(device):
 
 
 def skip_if_managed_memory_unsupported(device):
+    reason = managed_memory_skip_reason(device)
+    if reason:
+        pytest.skip(reason)
     try:
         if not device.properties.memory_pools_supported or not device.properties.concurrent_managed_access:
             pytest.skip("Device does not support managed memory pool operations")
@@ -51,6 +55,13 @@ def create_managed_memory_resource_or_skip(*args, **kwargs):
         raise
 
 
+@pytest.fixture
+def requires_concurrent_managed_access():  # skips the requesting test when a skip reason is reported
+    reason = managed_memory_skip_reason()  # no device argument: the helper's default is used
+    if reason:
+        pytest.skip(reason)
+
+
 @pytest.fixture(scope="session", autouse=True)
 def session_setup():
     # Always init CUDA.
diff --git a/cuda_core/tests/memory_ipc/test_event_ipc.py b/cuda_core/tests/memory_ipc/test_event_ipc.py
index 1fabaeddda..767bc55140 100644
--- a/cuda_core/tests/memory_ipc/test_event_ipc.py
+++ b/cuda_core/tests/memory_ipc/test_event_ipc.py
@@ -17,6 +17,8 @@
 class TestEventIpc:
     """Check the basic usage of IPC-enabled events with a latch kernel."""
 
+    pytestmark = pytest.mark.usefixtures("requires_concurrent_managed_access")
+
     def test_main(self, ipc_device, ipc_memory_resource):
         log = TimestampedLogger(prefix="parent: ", enabled=ENABLE_LOGGING)
         device = ipc_device
diff --git a/cuda_core/tests/memory_ipc/test_memory_ipc.py b/cuda_core/tests/memory_ipc/test_memory_ipc.py
index d92a28ab5a..e1728e685f 100644
--- a/cuda_core/tests/memory_ipc/test_memory_ipc.py
+++ b/cuda_core/tests/memory_ipc/test_memory_ipc.py
@@ -3,6 +3,7 @@
 
 import multiprocessing as mp
 
+import pytest
 from cuda.core import Buffer, DeviceMemoryResource
 from helpers.buffers import PatternGen
 
@@ -11,6 +12,8 @@
 NWORKERS = 2
 NTASKS = 2
 
+pytestmark = pytest.mark.usefixtures("requires_concurrent_managed_access")
+
 
 class TestIpcMempool:
     def test_main(self, ipc_device, ipc_memory_resource):
diff --git a/cuda_core/tests/memory_ipc/test_peer_access.py b/cuda_core/tests/memory_ipc/test_peer_access.py
index 5a06133c9b..5928dc9590 100644
--- a/cuda_core/tests/memory_ipc/test_peer_access.py
+++ b/cuda_core/tests/memory_ipc/test_peer_access.py
@@ -57,6 +57,8 @@ class TestBufferPeerAccessAfterImport:
     setting peer access on the imported memory resource, and that access can be revoked.
     """
 
+    pytestmark = pytest.mark.usefixtures("requires_concurrent_managed_access")
+
     @pytest.mark.parametrize("grant_access_in_parent", [True, False])
     def test_main(self, mempool_device_x2, grant_access_in_parent):
         dev0, dev1 = mempool_device_x2
diff --git a/cuda_core/tests/memory_ipc/test_send_buffers.py b/cuda_core/tests/memory_ipc/test_send_buffers.py
index 2df3fe1bbc..302a00c91c 100644
--- a/cuda_core/tests/memory_ipc/test_send_buffers.py
+++ b/cuda_core/tests/memory_ipc/test_send_buffers.py
@@ -14,6 +14,8 @@
 NTASKS = 7
 POOL_SIZE = 2097152
 
+pytestmark = pytest.mark.usefixtures("requires_concurrent_managed_access")
+
 
 class TestIpcSendBuffers:
     @pytest.mark.parametrize("nmrs", (1, NMRS))
diff --git a/cuda_core/tests/memory_ipc/test_serialize.py b/cuda_core/tests/memory_ipc/test_serialize.py
index 546c8a91aa..6f150836bd 100644
--- a/cuda_core/tests/memory_ipc/test_serialize.py
+++ b/cuda_core/tests/memory_ipc/test_serialize.py
@@ -5,6 +5,7 @@
 import multiprocessing.reduction
 import os
 
+import pytest
 from cuda.core import Buffer, Device, DeviceMemoryResource
 from helpers.buffers import PatternGen
 
@@ -12,6 +13,8 @@
 NBYTES = 64
 POOL_SIZE = 2097152
 
+pytestmark = pytest.mark.usefixtures("requires_concurrent_managed_access")
+
 
 class TestObjectSerializationDirect:
     """
diff --git a/cuda_core/tests/memory_ipc/test_workerpool.py b/cuda_core/tests/memory_ipc/test_workerpool.py
index b13b9896a1..dbc4c28cea 100644
--- a/cuda_core/tests/memory_ipc/test_workerpool.py
+++ b/cuda_core/tests/memory_ipc/test_workerpool.py
@@ -16,6 +16,8 @@
 NTASKS = 20
 POOL_SIZE = 2097152
 
+pytestmark = pytest.mark.usefixtures("requires_concurrent_managed_access")
+
 
 class TestIpcWorkerPool:
     """
diff --git a/cuda_core/tests/test_graph_mem.py b/cuda_core/tests/test_graph_mem.py
index bcb8a800a1..aaa7e42c87 100644
--- a/cuda_core/tests/test_graph_mem.py
+++ b/cuda_core/tests/test_graph_mem.py
@@ -74,6 +74,7 @@ def free(self, buffers):
         self.stream.sync()
 
 
+@pytest.mark.usefixtures("requires_concurrent_managed_access")
 @pytest.mark.parametrize("mode", ["no_graph", "global", "thread_local", "relaxed"])
 @pytest.mark.parametrize("action", ["incr", "fill"])
 def test_graph_alloc(mempool_device, mode, action):
@@ -142,6 +143,7 @@ def apply_kernels(mr, stream, out):
             assert compare_buffer_to_constant(out, 6)
 
 
+@pytest.mark.usefixtures("requires_concurrent_managed_access")
 @pytest.mark.skipif(IS_WINDOWS or IS_WSL, reason="auto_free_on_launch not supported on Windows")
 @pytest.mark.parametrize("mode", ["global", "thread_local", "relaxed"])
 def test_graph_alloc_with_output(mempool_device, mode):
diff --git a/cuda_core/tests/test_helpers.py b/cuda_core/tests/test_helpers.py
index 76712e8432..d4d9c6734c 100644
--- a/cuda_core/tests/test_helpers.py
+++ b/cuda_core/tests/test_helpers.py
@@ -16,6 +16,8 @@
 ENABLE_LOGGING = False  # Set True for test debugging and development
 NBYTES = 64
 
+pytestmark = pytest.mark.usefixtures("requires_concurrent_managed_access")
+
 
 def test_latchkernel():
     """Test LatchKernel."""
diff --git a/cuda_core/tests/test_memory.py b/cuda_core/tests/test_memory.py
index 9a88f5f483..485d9e8995 100644
--- a/cuda_core/tests/test_memory.py
+++ b/cuda_core/tests/test_memory.py
@@ -164,13 +164,26 @@ def buffer_initialization(dummy_mr: MemoryResource):
     buffer.close()
 
 
-def test_buffer_initialization():
+@pytest.mark.parametrize(
+    ("mr_factory", "needs_device"),
+    [
+        (DummyDeviceMemoryResource, True),
+        (DummyHostMemoryResource, False),
+        (DummyUnifiedMemoryResource, True),
+        (DummyPinnedMemoryResource, True),
+    ],
+    ids=["device", "host", "unified", "pinned"],
+)
+def test_buffer_initialization(mr_factory, needs_device, request):
     device = Device()
     device.set_current()
-    buffer_initialization(DummyDeviceMemoryResource(device))
-    buffer_initialization(DummyHostMemoryResource())
-    buffer_initialization(DummyUnifiedMemoryResource(device))
-    buffer_initialization(DummyPinnedMemoryResource(device))
+    if mr_factory is DummyUnifiedMemoryResource:  # only this case is gated by the skip fixture
+        request.getfixturevalue("requires_concurrent_managed_access")
+    mr = mr_factory(device) if needs_device else mr_factory()  # the host resource takes no device
+    buffer_initialization(mr)
+
+
+def test_buffer_initialization_invalid_mr():  # a bare MemoryResource() must fail with TypeError
     with pytest.raises(TypeError):
         buffer_initialization(MemoryResource())
 
@@ -198,12 +211,22 @@ def buffer_copy_to(dummy_mr: MemoryResource, device: Device, check=False):
     src_buffer.close()
 
 
-def test_buffer_copy_to():
+@pytest.mark.parametrize(
+    ("mr_factory", "check"),
+    [
+        (DummyDeviceMemoryResource, False),
+        (DummyUnifiedMemoryResource, False),
+        (DummyPinnedMemoryResource, True),
+    ],
+    ids=["device", "unified", "pinned"],
+)
+def test_buffer_copy_to(mr_factory, check, request):
     device = Device()
     device.set_current()
-    buffer_copy_to(DummyDeviceMemoryResource(device), device)
-    buffer_copy_to(DummyUnifiedMemoryResource(device), device)
-    buffer_copy_to(DummyPinnedMemoryResource(device), device, check=True)
+    if mr_factory is DummyUnifiedMemoryResource:  # only this case is gated by the skip fixture
+        request.getfixturevalue("requires_concurrent_managed_access")
+    mr = mr_factory(device)
+    buffer_copy_to(mr, device, check=check)  # check=True only for the pinned case, as before
 
 
 def buffer_copy_from(dummy_mr: MemoryResource, device, check=False):
@@ -229,12 +252,22 @@ def buffer_copy_from(dummy_mr: MemoryResource, device, check=False):
     src_buffer.close()
 
 
-def test_buffer_copy_from():
+@pytest.mark.parametrize(
+    ("mr_factory", "check"),
+    [
+        (DummyDeviceMemoryResource, False),
+        (DummyUnifiedMemoryResource, False),
+        (DummyPinnedMemoryResource, True),
+    ],
+    ids=["device", "unified", "pinned"],
+)
+def test_buffer_copy_from(mr_factory, check, request):
     device = Device()
     device.set_current()
-    buffer_copy_from(DummyDeviceMemoryResource(device), device)
-    buffer_copy_from(DummyUnifiedMemoryResource(device), device)
-    buffer_copy_from(DummyPinnedMemoryResource(device), device, check=True)
+    if mr_factory is DummyUnifiedMemoryResource:  # only this case is gated by the skip fixture
+        request.getfixturevalue("requires_concurrent_managed_access")
+    mr = mr_factory(device)
+    buffer_copy_from(mr, device, check=check)  # check=True only for the pinned case, as before
 
 
 def _bytes_repeat(pattern: bytes, size: int) -> bytes:
@@ -256,6 +289,7 @@ def fill_env(request):
     if request.param == "device":
         mr = DummyDeviceMemoryResource(device)
     elif request.param == "unified":
+        request.getfixturevalue("requires_concurrent_managed_access")
         mr = DummyUnifiedMemoryResource(device)
     else:
         mr = DummyPinnedMemoryResource(device)
@@ -345,13 +379,23 @@ def buffer_close(dummy_mr: MemoryResource):
     assert buffer.memory_resource is None
 
 
-def test_buffer_close():
+@pytest.mark.parametrize(
+    ("mr_factory", "needs_device"),
+    [
+        (DummyDeviceMemoryResource, True),
+        (DummyHostMemoryResource, False),
+        (DummyUnifiedMemoryResource, True),
+        (DummyPinnedMemoryResource, True),
+    ],
+    ids=["device", "host", "unified", "pinned"],
+)
+def test_buffer_close(mr_factory, needs_device, request):
     device = Device()
     device.set_current()
-    buffer_close(DummyDeviceMemoryResource(device))
-    buffer_close(DummyHostMemoryResource())
-    buffer_close(DummyUnifiedMemoryResource(device))
-    buffer_close(DummyPinnedMemoryResource(device))
+    if mr_factory is DummyUnifiedMemoryResource:  # only this case is gated by the skip fixture
+        request.getfixturevalue("requires_concurrent_managed_access")
+    mr = mr_factory(device) if needs_device else mr_factory()  # the host resource takes no device
+    buffer_close(mr)
 
 
 def test_buffer_external_host():
@@ -447,7 +491,7 @@ def test_buffer_external_pinned_registered(change_device):
 
 
 @pytest.mark.parametrize("change_device", [True, False])
-def test_buffer_external_managed(change_device):
+def test_buffer_external_managed(change_device, requires_concurrent_managed_access):
     n = ccx_system.get_num_devices()
     if n < 1:
         pytest.skip("No devices found")
@@ -517,9 +561,11 @@ def test_buffer_dunder_dlpack():
         (DummyPinnedMemoryResource, (DLDeviceType.kDLCUDAHost, 0)),
     ],
 )
-def test_buffer_dunder_dlpack_device_success(DummyMR, expected):
+def test_buffer_dunder_dlpack_device_success(DummyMR, expected, request):
     device = Device()
     device.set_current()
+    if DummyMR is DummyUnifiedMemoryResource:
+        request.getfixturevalue("requires_concurrent_managed_access")
     dummy_mr = DummyMR() if DummyMR is DummyHostMemoryResource else DummyMR(device)
     buffer = dummy_mr.allocate(size=1024)
     assert buffer.__dlpack_device__() == expected
diff --git a/cuda_core/tests/test_memory_peer_access.py b/cuda_core/tests/test_memory_peer_access.py
index bcae9576da..14c41849fa 100644
--- a/cuda_core/tests/test_memory_peer_access.py
+++ b/cuda_core/tests/test_memory_peer_access.py
@@ -10,6 +10,7 @@
 NBYTES = 1024
 
 
+@pytest.mark.usefixtures("requires_concurrent_managed_access")
 def test_peer_access_basic(mempool_device_x2):
     """Basic tests for dmr.peer_accessible_by."""
     dev0, dev1 = mempool_device_x2
@@ -78,6 +79,7 @@ def check(expected):
         dmr.peer_accessible_by = [num_devices]  # device ID out of bounds
 
 
+@pytest.mark.usefixtures("requires_concurrent_managed_access")
 def test_peer_access_transitions(mempool_device_x3):
     """Advanced tests for dmr.peer_accessible_by."""
 

From 85f76f527402b746c6a28cf976a8833240b2ea73 Mon Sep 17 00:00:00 2001
From: "Ralf W. Grosse-Kunstleve" <rgrossekunst@nvidia.com>
Date: Wed, 4 Feb 2026 17:17:42 -0800
Subject: [PATCH 3/4] Partial revert of
 6bdcda0e15f65f8cc3addb7ae7fae8e6739d6c6e (#1567): Remove stream.sync() in
 cuda_core/tests/test_launcher.py::test_launch_invalid_values

---
 cuda_core/tests/test_launcher.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/cuda_core/tests/test_launcher.py b/cuda_core/tests/test_launcher.py
index ab304bb9bc..ae3e5531c1 100644
--- a/cuda_core/tests/test_launcher.py
+++ b/cuda_core/tests/test_launcher.py
@@ -150,7 +150,6 @@ def test_launch_invalid_values(init_cuda):
         launch(StreamWrapper(stream), config, ker)
 
     launch(stream, config, ker)
-    stream.sync()  # TODO(#1539)
 
 
 # Parametrize: (python_type, cpp_type, init_value)

From a48565f5a1c92f0b32fa8250aff9133a57a2767e Mon Sep 17 00:00:00 2001
From: "Ralf W. Grosse-Kunstleve" <rgrossekunst@nvidia.com>
Date: Thu, 5 Feb 2026 14:16:13 -0800
Subject: [PATCH 4/4] Share managed-memory test helper

Move the managed-memory skip logic into cuda_python_test_helpers and point
bindings/core tests at the shared module, with path bootstrapping to prefer
in-repo helpers. This avoids relying on bindings test helpers that are absent
in 12.9.x wheels.

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 cuda_bindings/tests/conftest.py                      |  8 ++++++++
 cuda_bindings/tests/test_cuda.py                     |  2 +-
 cuda_core/tests/conftest.py                          |  2 +-
 cuda_core/tests/helpers/__init__.py                  | 12 ++++++------
 .../cuda_python_test_helpers}/managed_memory.py      |  6 +++---
 5 files changed, 19 insertions(+), 11 deletions(-)
 rename {cuda_bindings/cuda/bindings/_test_helpers => cuda_python_test_helpers/cuda_python_test_helpers}/managed_memory.py (92%)

diff --git a/cuda_bindings/tests/conftest.py b/cuda_bindings/tests/conftest.py
index f0a426406a..d936d9a7e8 100644
--- a/cuda_bindings/tests/conftest.py
+++ b/cuda_bindings/tests/conftest.py
@@ -1,6 +1,14 @@
 # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 
+import pathlib
+import sys
+
+helpers_root = pathlib.Path(__file__).resolve().parents[2] / "cuda_python_test_helpers"  # parents[2] = repo root
+if helpers_root.is_dir() and str(helpers_root) not in sys.path:
+    # Prefer the in-repo helpers over any installed copy.
+    sys.path.insert(0, str(helpers_root))
+
 import cuda.bindings.driver as cuda
 import pytest
 
diff --git a/cuda_bindings/tests/test_cuda.py b/cuda_bindings/tests/test_cuda.py
index 2a14aefd05..f4ca78d043 100644
--- a/cuda_bindings/tests/test_cuda.py
+++ b/cuda_bindings/tests/test_cuda.py
@@ -10,7 +10,7 @@
 import numpy as np
 import pytest
 from cuda.bindings import driver
-from cuda.bindings._test_helpers.managed_memory import managed_memory_skip_reason
+from cuda_python_test_helpers.managed_memory import managed_memory_skip_reason
 
 
 def driverVersionLessThan(target):
diff --git a/cuda_core/tests/conftest.py b/cuda_core/tests/conftest.py
index cc8953a7ae..4c8bb9e299 100644
--- a/cuda_core/tests/conftest.py
+++ b/cuda_core/tests/conftest.py
@@ -6,7 +6,7 @@
 
 import helpers
 import pytest
-from cuda.bindings._test_helpers.managed_memory import managed_memory_skip_reason
+from cuda_python_test_helpers.managed_memory import managed_memory_skip_reason
 
 try:
     from cuda.bindings import driver
diff --git a/cuda_core/tests/helpers/__init__.py b/cuda_core/tests/helpers/__init__.py
index ad9d281c16..54ddbe586f 100644
--- a/cuda_core/tests/helpers/__init__.py
+++ b/cuda_core/tests/helpers/__init__.py
@@ -22,12 +22,12 @@
             CCCL_INCLUDE_PATHS = (path,) + CCCL_INCLUDE_PATHS
 
 
-try:
-    from cuda_python_test_helpers import *  # noqa: F403
-except ImportError:
-    # Import shared platform helpers for tests across repos
-    sys.path.insert(0, str(pathlib.Path(__file__).resolve().parents[3] / "cuda_python_test_helpers"))
-    from cuda_python_test_helpers import *  # noqa: F403
+helpers_root = pathlib.Path(__file__).resolve().parents[3] / "cuda_python_test_helpers"  # parents[3] = repo root
+if helpers_root.is_dir() and str(helpers_root) not in sys.path:
+    # Prefer the in-repo helpers over any installed copy.
+    sys.path.insert(0, str(helpers_root))
+
+from cuda_python_test_helpers import *  # noqa: E402, F403
 
 
 @functools.cache
diff --git a/cuda_bindings/cuda/bindings/_test_helpers/managed_memory.py b/cuda_python_test_helpers/cuda_python_test_helpers/managed_memory.py
similarity index 92%
rename from cuda_bindings/cuda/bindings/_test_helpers/managed_memory.py
rename to cuda_python_test_helpers/cuda_python_test_helpers/managed_memory.py
index b7a077d6fb..8c535706cc 100644
--- a/cuda_bindings/cuda/bindings/_test_helpers/managed_memory.py
+++ b/cuda_python_test_helpers/cuda_python_test_helpers/managed_memory.py
@@ -1,5 +1,5 @@
 # SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
+# SPDX-License-Identifier: Apache-2.0
 
 from __future__ import annotations
 
@@ -8,9 +8,9 @@
 import pytest
 
 try:
-    from cuda.bindings import driver
+    from cuda.bindings import driver  # type: ignore
 except Exception:
-    from cuda import cuda as driver
+    from cuda import cuda as driver  # type: ignore
 
 
 def _resolve_device_id(device) -> int: