2 changes: 1 addition & 1 deletion cuda_bindings/cuda/bindings/driver.pyx.in
@@ -29191,7 +29191,7 @@ def cuMemHostGetFlags(p):
     return (_dict_CUresult[err], pFlags)
 {{endif}}
 
-{{if 'cuMemAllocManaged' in found_functions}}
+{{if 'MANUALLYDISABLEDcuMemAllocManaged' in found_functions}}
 
 @cython.embedsignature(True)
 def cuMemAllocManaged(size_t bytesize, unsigned int flags):
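The one-line change above flips a code-generation guard: `driver.pyx.in` is a template, and each `{{if '<name>' in found_functions}}` block is only emitted into the generated bindings when that driver symbol was discovered at build time. Prefixing the key with `MANUALLYDISABLED` makes the membership test fail, so the `cuMemAllocManaged` binding is dropped from the build, presumably a draft-only way to exercise the new skip paths. A minimal sketch of the guard's effect (the `found_functions` value here is a made-up stand-in for the real build-time collection):

```python
# Hypothetical stand-in for the driver symbols discovered at build time.
found_functions = {"cuMemHostGetFlags", "cuMemAllocManaged"}

print("cuMemAllocManaged" in found_functions)                  # True  -> binding emitted
print("MANUALLYDISABLEDcuMemAllocManaged" in found_functions)  # False -> binding dropped
```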
8 changes: 8 additions & 0 deletions cuda_bindings/tests/conftest.py
@@ -1,6 +1,14 @@
 # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 
+import pathlib
+import sys
+
+helpers_root = pathlib.Path(__file__).resolve().parents[2] / "cuda_python_test_helpers"
+if helpers_root.is_dir() and str(helpers_root) not in sys.path:
+    # Prefer the in-repo helpers over any installed copy.
+    sys.path.insert(0, str(helpers_root))
+
 import cuda.bindings.driver as cuda
 import pytest
 
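This conftest change makes the in-repo `cuda_python_test_helpers` package importable ahead of any installed copy. The `managed_memory_skip_reason` helper that the rest of this PR imports is not shown in the diff; below is a hypothetical sketch of what such a helper might look like, assuming it queries `CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS` and returns a skip-reason string or `None`, and assuming CUDA has already been initialized by the test session:

```python
# Hypothetical sketch only; the real implementation lives in
# cuda_python_test_helpers/managed_memory.py and may differ.
from cuda.bindings import driver


def managed_memory_skip_reason(device=None):
    # Accept a cuda.core Device (has .device_id), a bare ordinal, or None.
    ordinal = getattr(device, "device_id", device)
    if ordinal is None:
        ordinal = 0
    err, supported = driver.cuDeviceGetAttribute(
        driver.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS, ordinal
    )
    if err != driver.CUresult.CUDA_SUCCESS:
        return f"cuDeviceGetAttribute failed with {err}"
    if not supported:
        return "Device does not support concurrent managed access"
    return None  # managed memory tests can run
```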
9 changes: 9 additions & 0 deletions cuda_bindings/tests/test_cuda.py
@@ -10,6 +10,7 @@
 import numpy as np
 import pytest
 from cuda.bindings import driver
+from cuda_python_test_helpers.managed_memory import managed_memory_skip_reason
 
 
 def driverVersionLessThan(target):
@@ -38,6 +39,12 @@ def callableBinary(name):
     return shutil.which(name) is not None
 
 
+def skip_if_concurrent_managed_access_disabled():
+    reason = managed_memory_skip_reason()
+    if reason:
+        pytest.skip(reason)
+
+
 def test_cuda_memcpy():
     # Get device
 
@@ -323,6 +330,7 @@ def test_cuda_memPool_attr():
     driverVersionLessThan(11030) or not supportsManagedMemory(), reason="When new attributes were introduced"
 )
 def test_cuda_pointer_attr():
+    skip_if_concurrent_managed_access_disabled()
     err, ptr = cuda.cuMemAllocManaged(0x1000, cuda.CUmemAttach_flags.CU_MEM_ATTACH_GLOBAL.value)
     assert err == cuda.CUresult.CUDA_SUCCESS
 
@@ -388,6 +396,7 @@ def test_pointer_get_attributes_device_ordinal():
 
 @pytest.mark.skipif(not supportsManagedMemory(), reason="When new attributes were introduced")
 def test_cuda_mem_range_attr(device):
+    skip_if_concurrent_managed_access_disabled()
     size = 0x1000
     location_device = cuda.CUmemLocation()
     location_device.type = cuda.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE
11 changes: 11 additions & 0 deletions cuda_core/tests/conftest.py
@@ -6,6 +6,7 @@
 
 import helpers
 import pytest
+from cuda_python_test_helpers.managed_memory import managed_memory_skip_reason
 
 try:
     from cuda.bindings import driver
@@ -35,6 +36,9 @@ def skip_if_pinned_memory_unsupported(device):
 
 
 def skip_if_managed_memory_unsupported(device):
+    reason = managed_memory_skip_reason(device)
+    if reason:
+        pytest.skip(reason)
     try:
         if not device.properties.memory_pools_supported or not device.properties.concurrent_managed_access:
             pytest.skip("Device does not support managed memory pool operations")
@@ -51,6 +55,13 @@ def create_managed_memory_resource_or_skip(*args, **kwargs):
         raise
 
 
+@pytest.fixture
+def requires_concurrent_managed_access():
+    reason = managed_memory_skip_reason()
+    if reason:
+        pytest.skip(reason)
+
+
 @pytest.fixture(scope="session", autouse=True)
 def session_setup():
     # Always init CUDA.
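The fixture defined above is consumed in two ways across this PR: module- or class-wide via a `pytestmark` assignment, and per-test via `@pytest.mark.usefixtures`. A condensed illustration (the test bodies are placeholders):

```python
import pytest

# Module-wide: every test in this file runs the skip check first.
pytestmark = pytest.mark.usefixtures("requires_concurrent_managed_access")


# Per-test: the mark can also be applied to a single test or class.
@pytest.mark.usefixtures("requires_concurrent_managed_access")
def test_something_managed():
    ...
```

Using `usefixtures` keeps the fixture out of the test signatures, which matters for tests that are also parametrized.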
12 changes: 6 additions & 6 deletions cuda_core/tests/helpers/__init__.py
@@ -22,12 +22,12 @@
     CCCL_INCLUDE_PATHS = (path,) + CCCL_INCLUDE_PATHS
 
 
-try:
-    from cuda_python_test_helpers import *  # noqa: F403
-except ImportError:
-    # Import shared platform helpers for tests across repos
-    sys.path.insert(0, str(pathlib.Path(__file__).resolve().parents[3] / "cuda_python_test_helpers"))
-    from cuda_python_test_helpers import *  # noqa: F403
+helpers_root = pathlib.Path(__file__).resolve().parents[3] / "cuda_python_test_helpers"
+if helpers_root.is_dir() and str(helpers_root) not in sys.path:
+    # Prefer the in-repo helpers over any installed copy.
+    sys.path.insert(0, str(helpers_root))
+
+from cuda_python_test_helpers import *  # noqa: E402, F403
 
 
 @functools.cache
2 changes: 2 additions & 0 deletions cuda_core/tests/memory_ipc/test_event_ipc.py
@@ -17,6 +17,8 @@
 class TestEventIpc:
     """Check the basic usage of IPC-enabled events with a latch kernel."""
 
+    pytestmark = pytest.mark.usefixtures("requires_concurrent_managed_access")
+
     def test_main(self, ipc_device, ipc_memory_resource):
         log = TimestampedLogger(prefix="parent: ", enabled=ENABLE_LOGGING)
         device = ipc_device
3 changes: 3 additions & 0 deletions cuda_core/tests/memory_ipc/test_memory_ipc.py
@@ -3,6 +3,7 @@
 
 import multiprocessing as mp
 
+import pytest
 from cuda.core import Buffer, DeviceMemoryResource
 from helpers.buffers import PatternGen
 
@@ -11,6 +12,8 @@
 NWORKERS = 2
 NTASKS = 2
 
+pytestmark = pytest.mark.usefixtures("requires_concurrent_managed_access")
+
 
 class TestIpcMempool:
     def test_main(self, ipc_device, ipc_memory_resource):
2 changes: 2 additions & 0 deletions cuda_core/tests/memory_ipc/test_peer_access.py
@@ -57,6 +57,8 @@ class TestBufferPeerAccessAfterImport:
     setting peer access on the imported memory resource, and that access can be revoked.
     """
 
+    pytestmark = pytest.mark.usefixtures("requires_concurrent_managed_access")
+
     @pytest.mark.parametrize("grant_access_in_parent", [True, False])
     def test_main(self, mempool_device_x2, grant_access_in_parent):
         dev0, dev1 = mempool_device_x2
2 changes: 2 additions & 0 deletions cuda_core/tests/memory_ipc/test_send_buffers.py
@@ -14,6 +14,8 @@
 NTASKS = 7
 POOL_SIZE = 2097152
 
+pytestmark = pytest.mark.usefixtures("requires_concurrent_managed_access")
+
 
 class TestIpcSendBuffers:
     @pytest.mark.parametrize("nmrs", (1, NMRS))
3 changes: 3 additions & 0 deletions cuda_core/tests/memory_ipc/test_serialize.py
@@ -5,13 +5,16 @@
 import multiprocessing.reduction
 import os
 
+import pytest
 from cuda.core import Buffer, Device, DeviceMemoryResource
 from helpers.buffers import PatternGen
 
 CHILD_TIMEOUT_SEC = 20
 NBYTES = 64
 POOL_SIZE = 2097152
 
+pytestmark = pytest.mark.usefixtures("requires_concurrent_managed_access")
+
 
 class TestObjectSerializationDirect:
     """
2 changes: 2 additions & 0 deletions cuda_core/tests/memory_ipc/test_workerpool.py
@@ -16,6 +16,8 @@
 NTASKS = 20
 POOL_SIZE = 2097152
 
+pytestmark = pytest.mark.usefixtures("requires_concurrent_managed_access")
+
 
 class TestIpcWorkerPool:
     """
2 changes: 2 additions & 0 deletions cuda_core/tests/test_graph_mem.py
@@ -74,6 +74,7 @@ def free(self, buffers):
         self.stream.sync()
 
 
+@pytest.mark.usefixtures("requires_concurrent_managed_access")
 @pytest.mark.parametrize("mode", ["no_graph", "global", "thread_local", "relaxed"])
 @pytest.mark.parametrize("action", ["incr", "fill"])
 def test_graph_alloc(mempool_device, mode, action):
@@ -142,6 +143,7 @@ def apply_kernels(mr, stream, out):
     assert compare_buffer_to_constant(out, 6)
 
 
+@pytest.mark.usefixtures("requires_concurrent_managed_access")
 @pytest.mark.skipif(IS_WINDOWS or IS_WSL, reason="auto_free_on_launch not supported on Windows")
 @pytest.mark.parametrize("mode", ["global", "thread_local", "relaxed"])
 def test_graph_alloc_with_output(mempool_device, mode):
2 changes: 2 additions & 0 deletions cuda_core/tests/test_helpers.py
@@ -16,6 +16,8 @@
 ENABLE_LOGGING = False  # Set True for test debugging and development
 NBYTES = 64
 
+pytestmark = pytest.mark.usefixtures("requires_concurrent_managed_access")
+
 
 def test_latchkernel():
     """Test LatchKernel."""
1 change: 0 additions & 1 deletion cuda_core/tests/test_launcher.py
@@ -150,7 +150,6 @@ def test_launch_invalid_values(init_cuda):
         launch(StreamWrapper(stream), config, ker)
 
     launch(stream, config, ker)
-    stream.sync()  # TODO(#1539)
 
 
 # Parametrize: (python_type, cpp_type, init_value)
86 changes: 66 additions & 20 deletions cuda_core/tests/test_memory.py
@@ -164,13 +164,26 @@ def buffer_initialization(dummy_mr: MemoryResource):
     buffer.close()
 
 
-def test_buffer_initialization():
+@pytest.mark.parametrize(
+    ("mr_factory", "needs_device"),
+    [
+        (DummyDeviceMemoryResource, True),
+        (DummyHostMemoryResource, False),
+        (DummyUnifiedMemoryResource, True),
+        (DummyPinnedMemoryResource, True),
+    ],
+    ids=["device", "host", "unified", "pinned"],
+)
+def test_buffer_initialization(mr_factory, needs_device, request):
     device = Device()
     device.set_current()
-    buffer_initialization(DummyDeviceMemoryResource(device))
-    buffer_initialization(DummyHostMemoryResource())
-    buffer_initialization(DummyUnifiedMemoryResource(device))
-    buffer_initialization(DummyPinnedMemoryResource(device))
+    if mr_factory is DummyUnifiedMemoryResource:
+        request.getfixturevalue("requires_concurrent_managed_access")
+    mr = mr_factory(device) if needs_device else mr_factory()
+    buffer_initialization(mr)
 
 
+def test_buffer_initialization_invalid_mr():
+    with pytest.raises(TypeError):
+        buffer_initialization(MemoryResource())
 
@@ -198,12 +211,22 @@ def buffer_copy_to(dummy_mr: MemoryResource, device: Device, check=False):
     src_buffer.close()
 
 
-def test_buffer_copy_to():
+@pytest.mark.parametrize(
+    ("mr_factory", "check"),
+    [
+        (DummyDeviceMemoryResource, False),
+        (DummyUnifiedMemoryResource, False),
+        (DummyPinnedMemoryResource, True),
+    ],
+    ids=["device", "unified", "pinned"],
+)
+def test_buffer_copy_to(mr_factory, check, request):
     device = Device()
     device.set_current()
-    buffer_copy_to(DummyDeviceMemoryResource(device), device)
-    buffer_copy_to(DummyUnifiedMemoryResource(device), device)
-    buffer_copy_to(DummyPinnedMemoryResource(device), device, check=True)
+    if mr_factory is DummyUnifiedMemoryResource:
+        request.getfixturevalue("requires_concurrent_managed_access")
+    mr = mr_factory(device)
+    buffer_copy_to(mr, device, check=check)
 
 
 def buffer_copy_from(dummy_mr: MemoryResource, device, check=False):
@@ -229,12 +252,22 @@ def buffer_copy_from(dummy_mr: MemoryResource, device, check=False):
     src_buffer.close()
 
 
-def test_buffer_copy_from():
+@pytest.mark.parametrize(
+    ("mr_factory", "check"),
+    [
+        (DummyDeviceMemoryResource, False),
+        (DummyUnifiedMemoryResource, False),
+        (DummyPinnedMemoryResource, True),
+    ],
+    ids=["device", "unified", "pinned"],
+)
+def test_buffer_copy_from(mr_factory, check, request):
     device = Device()
     device.set_current()
-    buffer_copy_from(DummyDeviceMemoryResource(device), device)
-    buffer_copy_from(DummyUnifiedMemoryResource(device), device)
-    buffer_copy_from(DummyPinnedMemoryResource(device), device, check=True)
+    if mr_factory is DummyUnifiedMemoryResource:
+        request.getfixturevalue("requires_concurrent_managed_access")
+    mr = mr_factory(device)
+    buffer_copy_from(mr, device, check=check)
 
 
 def _bytes_repeat(pattern: bytes, size: int) -> bytes:
@@ -256,6 +289,7 @@ def fill_env(request):
     if request.param == "device":
         mr = DummyDeviceMemoryResource(device)
     elif request.param == "unified":
+        request.getfixturevalue("requires_concurrent_managed_access")
        mr = DummyUnifiedMemoryResource(device)
     else:
         mr = DummyPinnedMemoryResource(device)
@@ -345,13 +379,23 @@ def buffer_close(dummy_mr: MemoryResource):
     assert buffer.memory_resource is None
 
 
-def test_buffer_close():
+@pytest.mark.parametrize(
+    ("mr_factory", "needs_device"),
+    [
+        (DummyDeviceMemoryResource, True),
+        (DummyHostMemoryResource, False),
+        (DummyUnifiedMemoryResource, True),
+        (DummyPinnedMemoryResource, True),
+    ],
+    ids=["device", "host", "unified", "pinned"],
+)
+def test_buffer_close(mr_factory, needs_device, request):
     device = Device()
     device.set_current()
-    buffer_close(DummyDeviceMemoryResource(device))
-    buffer_close(DummyHostMemoryResource())
-    buffer_close(DummyUnifiedMemoryResource(device))
-    buffer_close(DummyPinnedMemoryResource(device))
+    if mr_factory is DummyUnifiedMemoryResource:
+        request.getfixturevalue("requires_concurrent_managed_access")
+    mr = mr_factory(device) if needs_device else mr_factory()
+    buffer_close(mr)
 
 
 def test_buffer_external_host():
@@ -447,7 +491,7 @@ def test_buffer_external_pinned_registered(change_device):
 
 
 @pytest.mark.parametrize("change_device", [True, False])
-def test_buffer_external_managed(change_device):
+def test_buffer_external_managed(change_device, requires_concurrent_managed_access):
     n = ccx_system.get_num_devices()
     if n < 1:
         pytest.skip("No devices found")
@@ -517,9 +561,11 @@ def test_buffer_dunder_dlpack():
         (DummyPinnedMemoryResource, (DLDeviceType.kDLCUDAHost, 0)),
     ],
 )
-def test_buffer_dunder_dlpack_device_success(DummyMR, expected):
+def test_buffer_dunder_dlpack_device_success(DummyMR, expected, request):
     device = Device()
     device.set_current()
+    if DummyMR is DummyUnifiedMemoryResource:
+        request.getfixturevalue("requires_concurrent_managed_access")
     dummy_mr = DummyMR() if DummyMR is DummyHostMemoryResource else DummyMR(device)
     buffer = dummy_mr.allocate(size=1024)
     assert buffer.__dlpack_device__() == expected
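The test_memory.py changes above rely on a third consumption pattern: `request.getfixturevalue(...)` instantiates a fixture lazily inside the test body, so when `pytest.skip()` fires in the fixture, only the `unified` parametrize case is skipped and the remaining cases still run. A standalone sketch of the idiom (the feature flag and names are illustrative):

```python
import pytest

FEATURE_AVAILABLE = False  # hypothetical capability probe


@pytest.fixture
def requires_feature():
    if not FEATURE_AVAILABLE:
        pytest.skip("feature unavailable")


@pytest.mark.parametrize("kind", ["plain", "feature"])
def test_kinds(kind, request):
    if kind == "feature":
        # Instantiated on demand; the skip applies to this case only.
        request.getfixturevalue("requires_feature")
    assert kind in ("plain", "feature")
```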
2 changes: 2 additions & 0 deletions cuda_core/tests/test_memory_peer_access.py
@@ -10,6 +10,7 @@
 NBYTES = 1024
 
 
+@pytest.mark.usefixtures("requires_concurrent_managed_access")
 def test_peer_access_basic(mempool_device_x2):
     """Basic tests for dmr.peer_accessible_by."""
     dev0, dev1 = mempool_device_x2
@@ -78,6 +79,7 @@ def check(expected):
         dmr.peer_accessible_by = [num_devices]  # device ID out of bounds
 
 
+@pytest.mark.usefixtures("requires_concurrent_managed_access")
 def test_peer_access_transitions(mempool_device_x3):
     """Advanced tests for dmr.peer_accessible_by."""
 