2 changes: 1 addition & 1 deletion cuda_bindings/cuda/bindings/driver.pyx.in
@@ -29191,7 +29191,7 @@ def cuMemHostGetFlags(p):
     return (_dict_CUresult[err], pFlags)
 {{endif}}
 
-{{if 'cuMemAllocManaged' in found_functions}}
+{{if 'MANUALLYDISABLEDcuMemAllocManaged' in found_functions}}
 
 @cython.embedsignature(True)
 def cuMemAllocManaged(size_t bytesize, unsigned int flags):
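The one-line change above flips a code-generation guard: `driver.pyx.in` is a template, and each `{{if '<name>' in found_functions}}` block is only emitted into the generated bindings when that driver symbol was discovered at build time. Prefixing the key with `MANUALLYDISABLED` makes the membership test fail, so the `cuMemAllocManaged` binding is dropped from the build, presumably a draft-only way to exercise the new skip paths. A minimal sketch of the guard's effect (the `found_functions` value here is a made-up stand-in for the real build-time collection):

```python
# Hypothetical stand-in for the driver symbols discovered at build time.
found_functions = {"cuMemHostGetFlags", "cuMemAllocManaged"}

print("cuMemAllocManaged" in found_functions)                  # True  -> binding emitted
print("MANUALLYDISABLEDcuMemAllocManaged" in found_functions)  # False -> binding dropped
```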
8 changes: 8 additions & 0 deletions cuda_bindings/tests/conftest.py
@@ -1,6 +1,14 @@
 # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 
+import pathlib
+import sys
+
+helpers_root = pathlib.Path(__file__).resolve().parents[2] / "cuda_python_test_helpers"
+if helpers_root.is_dir() and str(helpers_root) not in sys.path:
+    # Prefer the in-repo helpers over any installed copy.
+    sys.path.insert(0, str(helpers_root))
+
 import cuda.bindings.driver as cuda
 import pytest
 
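This conftest change makes the in-repo `cuda_python_test_helpers` package importable ahead of any installed copy. The `managed_memory_skip_reason` helper that the rest of this PR imports is not shown in the diff; below is a hypothetical sketch of what such a helper might look like, assuming it queries `CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS` and returns a skip-reason string or `None`, and assuming CUDA has already been initialized by the test session:

```python
# Hypothetical sketch only; the real implementation lives in
# cuda_python_test_helpers/managed_memory.py and may differ.
from cuda.bindings import driver


def managed_memory_skip_reason(device=None):
    # Accept a cuda.core Device (has .device_id), a bare ordinal, or None.
    ordinal = getattr(device, "device_id", device)
    if ordinal is None:
        ordinal = 0
    err, supported = driver.cuDeviceGetAttribute(
        driver.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS, ordinal
    )
    if err != driver.CUresult.CUDA_SUCCESS:
        return f"cuDeviceGetAttribute failed with {err}"
    if not supported:
        return "Device does not support concurrent managed access"
    return None  # managed memory tests can run
```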
9 changes: 9 additions & 0 deletions cuda_bindings/tests/test_cuda.py
@@ -10,6 +10,7 @@
 import numpy as np
 import pytest
 from cuda.bindings import driver
+from cuda_python_test_helpers.managed_memory import managed_memory_skip_reason
 
 
 def driverVersionLessThan(target):
@@ -38,6 +39,12 @@ def callableBinary(name):
     return shutil.which(name) is not None
 
 
+def skip_if_concurrent_managed_access_disabled():
+    reason = managed_memory_skip_reason()
+    if reason:
+        pytest.skip(reason)
+
+
 def test_cuda_memcpy():
     # Get device
 
@@ -323,6 +330,7 @@ def test_cuda_memPool_attr():
     driverVersionLessThan(11030) or not supportsManagedMemory(), reason="When new attributes were introduced"
 )
 def test_cuda_pointer_attr():
+    skip_if_concurrent_managed_access_disabled()
     err, ptr = cuda.cuMemAllocManaged(0x1000, cuda.CUmemAttach_flags.CU_MEM_ATTACH_GLOBAL.value)
     assert err == cuda.CUresult.CUDA_SUCCESS
 
@@ -388,6 +396,7 @@ def test_pointer_get_attributes_device_ordinal():
 
 @pytest.mark.skipif(not supportsManagedMemory(), reason="When new attributes were introduced")
 def test_cuda_mem_range_attr(device):
+    skip_if_concurrent_managed_access_disabled()
     size = 0x1000
     location_device = cuda.CUmemLocation()
     location_device.type = cuda.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE
11 changes: 11 additions & 0 deletions cuda_core/tests/conftest.py
@@ -6,6 +6,7 @@
 
 import helpers
 import pytest
+from cuda_python_test_helpers.managed_memory import managed_memory_skip_reason
 
 try:
     from cuda.bindings import driver
@@ -35,6 +36,9 @@ def skip_if_pinned_memory_unsupported(device):
 
 
 def skip_if_managed_memory_unsupported(device):
+    reason = managed_memory_skip_reason(device)
+    if reason:
+        pytest.skip(reason)
     try:
         if not device.properties.memory_pools_supported or not device.properties.concurrent_managed_access:
             pytest.skip("Device does not support managed memory pool operations")
@@ -51,6 +55,13 @@ def create_managed_memory_resource_or_skip(*args, **kwargs):
         raise
 
 
+@pytest.fixture
+def requires_concurrent_managed_access():
+    reason = managed_memory_skip_reason()
+    if reason:
+        pytest.skip(reason)
+
+
 @pytest.fixture(scope="session", autouse=True)
 def session_setup():
     # Always init CUDA.
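The fixture defined above is consumed in two ways across this PR: module- or class-wide via a `pytestmark` assignment, and per-test via `@pytest.mark.usefixtures`. A condensed illustration (the test bodies are placeholders):

```python
import pytest

# Module-wide: every test in this file runs the skip check first.
pytestmark = pytest.mark.usefixtures("requires_concurrent_managed_access")


# Per-test: the mark can also be applied to a single test or class.
@pytest.mark.usefixtures("requires_concurrent_managed_access")
def test_something_managed():
    ...
```

Using `usefixtures` keeps the fixture out of the test signatures, which matters for tests that are also parametrized.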
12 changes: 6 additions & 6 deletions cuda_core/tests/helpers/__init__.py
@@ -22,12 +22,12 @@
     CCCL_INCLUDE_PATHS = (path,) + CCCL_INCLUDE_PATHS
 
 
-try:
-    from cuda_python_test_helpers import *  # noqa: F403
-except ImportError:
-    # Import shared platform helpers for tests across repos
-    sys.path.insert(0, str(pathlib.Path(__file__).resolve().parents[3] / "cuda_python_test_helpers"))
-    from cuda_python_test_helpers import *  # noqa: F403
+helpers_root = pathlib.Path(__file__).resolve().parents[3] / "cuda_python_test_helpers"
+if helpers_root.is_dir() and str(helpers_root) not in sys.path:
+    # Prefer the in-repo helpers over any installed copy.
+    sys.path.insert(0, str(helpers_root))
+
+from cuda_python_test_helpers import *  # noqa: E402, F403
 
 
 @functools.cache
2 changes: 2 additions & 0 deletions cuda_core/tests/memory_ipc/test_event_ipc.py
@@ -17,6 +17,8 @@
 class TestEventIpc:
     """Check the basic usage of IPC-enabled events with a latch kernel."""
 
+    pytestmark = pytest.mark.usefixtures("requires_concurrent_managed_access")
+
     def test_main(self, ipc_device, ipc_memory_resource):
         log = TimestampedLogger(prefix="parent: ", enabled=ENABLE_LOGGING)
         device = ipc_device
3 changes: 3 additions & 0 deletions cuda_core/tests/memory_ipc/test_memory_ipc.py
@@ -3,6 +3,7 @@
 
 import multiprocessing as mp
 
+import pytest
 from cuda.core import Buffer, DeviceMemoryResource
 from helpers.buffers import PatternGen
 
@@ -11,6 +12,8 @@
 NWORKERS = 2
 NTASKS = 2
 
+pytestmark = pytest.mark.usefixtures("requires_concurrent_managed_access")
+
 
 class TestIpcMempool:
     def test_main(self, ipc_device, ipc_memory_resource):
2 changes: 2 additions & 0 deletions cuda_core/tests/memory_ipc/test_peer_access.py
@@ -57,6 +57,8 @@ class TestBufferPeerAccessAfterImport:
     setting peer access on the imported memory resource, and that access can be revoked.
     """
 
+    pytestmark = pytest.mark.usefixtures("requires_concurrent_managed_access")
+
     @pytest.mark.parametrize("grant_access_in_parent", [True, False])
     def test_main(self, mempool_device_x2, grant_access_in_parent):
         dev0, dev1 = mempool_device_x2
2 changes: 2 additions & 0 deletions cuda_core/tests/memory_ipc/test_send_buffers.py
@@ -14,6 +14,8 @@
 NTASKS = 7
 POOL_SIZE = 2097152
 
+pytestmark = pytest.mark.usefixtures("requires_concurrent_managed_access")
+
 
 class TestIpcSendBuffers:
     @pytest.mark.parametrize("nmrs", (1, NMRS))
3 changes: 3 additions & 0 deletions cuda_core/tests/memory_ipc/test_serialize.py
@@ -5,13 +5,16 @@
 import multiprocessing.reduction
 import os
 
+import pytest
 from cuda.core import Buffer, Device, DeviceMemoryResource
 from helpers.buffers import PatternGen
 
 CHILD_TIMEOUT_SEC = 20
 NBYTES = 64
 POOL_SIZE = 2097152
 
+pytestmark = pytest.mark.usefixtures("requires_concurrent_managed_access")
+
 
 class TestObjectSerializationDirect:
     """
2 changes: 2 additions & 0 deletions cuda_core/tests/memory_ipc/test_workerpool.py
@@ -16,6 +16,8 @@
 NTASKS = 20
 POOL_SIZE = 2097152
 
+pytestmark = pytest.mark.usefixtures("requires_concurrent_managed_access")
+
 
 class TestIpcWorkerPool:
     """
2 changes: 2 additions & 0 deletions cuda_core/tests/test_graph_mem.py
@@ -74,6 +74,7 @@ def free(self, buffers):
         self.stream.sync()
 
 
+@pytest.mark.usefixtures("requires_concurrent_managed_access")
 @pytest.mark.parametrize("mode", ["no_graph", "global", "thread_local", "relaxed"])
 @pytest.mark.parametrize("action", ["incr", "fill"])
 def test_graph_alloc(mempool_device, mode, action):
@@ -142,6 +143,7 @@ def apply_kernels(mr, stream, out):
     assert compare_buffer_to_constant(out, 6)
 
 
+@pytest.mark.usefixtures("requires_concurrent_managed_access")
 @pytest.mark.skipif(IS_WINDOWS or IS_WSL, reason="auto_free_on_launch not supported on Windows")
 @pytest.mark.parametrize("mode", ["global", "thread_local", "relaxed"])
 def test_graph_alloc_with_output(mempool_device, mode):
2 changes: 2 additions & 0 deletions cuda_core/tests/test_helpers.py
@@ -16,6 +16,8 @@
 ENABLE_LOGGING = False  # Set True for test debugging and development
 NBYTES = 64
 
+pytestmark = pytest.mark.usefixtures("requires_concurrent_managed_access")
+
 
 def test_latchkernel():
     """Test LatchKernel."""
1 change: 0 additions & 1 deletion cuda_core/tests/test_launcher.py
@@ -150,7 +150,6 @@ def test_launch_invalid_values(init_cuda):
         launch(StreamWrapper(stream), config, ker)
 
     launch(stream, config, ker)
-    stream.sync()  # TODO(#1539)
 
 
 # Parametrize: (python_type, cpp_type, init_value)
86 changes: 66 additions & 20 deletions cuda_core/tests/test_memory.py
@@ -164,13 +164,26 @@ def buffer_initialization(dummy_mr: MemoryResource):
     buffer.close()
 
 
-def test_buffer_initialization():
+@pytest.mark.parametrize(
+    ("mr_factory", "needs_device"),
+    [
+        (DummyDeviceMemoryResource, True),
+        (DummyHostMemoryResource, False),
+        (DummyUnifiedMemoryResource, True),
+        (DummyPinnedMemoryResource, True),
+    ],
+    ids=["device", "host", "unified", "pinned"],
+)
+def test_buffer_initialization(mr_factory, needs_device, request):
     device = Device()
     device.set_current()
-    buffer_initialization(DummyDeviceMemoryResource(device))
-    buffer_initialization(DummyHostMemoryResource())
-    buffer_initialization(DummyUnifiedMemoryResource(device))
-    buffer_initialization(DummyPinnedMemoryResource(device))
+    if mr_factory is DummyUnifiedMemoryResource:
+        request.getfixturevalue("requires_concurrent_managed_access")
+    mr = mr_factory(device) if needs_device else mr_factory()
+    buffer_initialization(mr)
 
 
+def test_buffer_initialization_invalid_mr():
+    with pytest.raises(TypeError):
+        buffer_initialization(MemoryResource())
 
@@ -198,12 +211,22 @@ def buffer_copy_to(dummy_mr: MemoryResource, device: Device, check=False):
     src_buffer.close()
 
 
-def test_buffer_copy_to():
+@pytest.mark.parametrize(
+    ("mr_factory", "check"),
+    [
+        (DummyDeviceMemoryResource, False),
+        (DummyUnifiedMemoryResource, False),
+        (DummyPinnedMemoryResource, True),
+    ],
+    ids=["device", "unified", "pinned"],
+)
+def test_buffer_copy_to(mr_factory, check, request):
     device = Device()
     device.set_current()
-    buffer_copy_to(DummyDeviceMemoryResource(device), device)
-    buffer_copy_to(DummyUnifiedMemoryResource(device), device)
-    buffer_copy_to(DummyPinnedMemoryResource(device), device, check=True)
+    if mr_factory is DummyUnifiedMemoryResource:
+        request.getfixturevalue("requires_concurrent_managed_access")
+    mr = mr_factory(device)
+    buffer_copy_to(mr, device, check=check)
 
 
 def buffer_copy_from(dummy_mr: MemoryResource, device, check=False):
@@ -229,12 +252,22 @@ def buffer_copy_from(dummy_mr: MemoryResource, device, check=False):
     src_buffer.close()
 
 
-def test_buffer_copy_from():
+@pytest.mark.parametrize(
+    ("mr_factory", "check"),
+    [
+        (DummyDeviceMemoryResource, False),
+        (DummyUnifiedMemoryResource, False),
+        (DummyPinnedMemoryResource, True),
+    ],
+    ids=["device", "unified", "pinned"],
+)
+def test_buffer_copy_from(mr_factory, check, request):
     device = Device()
     device.set_current()
-    buffer_copy_from(DummyDeviceMemoryResource(device), device)
-    buffer_copy_from(DummyUnifiedMemoryResource(device), device)
-    buffer_copy_from(DummyPinnedMemoryResource(device), device, check=True)
+    if mr_factory is DummyUnifiedMemoryResource:
+        request.getfixturevalue("requires_concurrent_managed_access")
+    mr = mr_factory(device)
+    buffer_copy_from(mr, device, check=check)
 
 
 def _bytes_repeat(pattern: bytes, size: int) -> bytes:
@@ -256,6 +289,7 @@ def fill_env(request):
     if request.param == "device":
         mr = DummyDeviceMemoryResource(device)
     elif request.param == "unified":
+        request.getfixturevalue("requires_concurrent_managed_access")
        mr = DummyUnifiedMemoryResource(device)
     else:
         mr = DummyPinnedMemoryResource(device)
@@ -345,13 +379,23 @@ def buffer_close(dummy_mr: MemoryResource):
     assert buffer.memory_resource is None
 
 
-def test_buffer_close():
+@pytest.mark.parametrize(
+    ("mr_factory", "needs_device"),
+    [
+        (DummyDeviceMemoryResource, True),
+        (DummyHostMemoryResource, False),
+        (DummyUnifiedMemoryResource, True),
+        (DummyPinnedMemoryResource, True),
+    ],
+    ids=["device", "host", "unified", "pinned"],
+)
+def test_buffer_close(mr_factory, needs_device, request):
     device = Device()
     device.set_current()
-    buffer_close(DummyDeviceMemoryResource(device))
-    buffer_close(DummyHostMemoryResource())
-    buffer_close(DummyUnifiedMemoryResource(device))
-    buffer_close(DummyPinnedMemoryResource(device))
+    if mr_factory is DummyUnifiedMemoryResource:
+        request.getfixturevalue("requires_concurrent_managed_access")
+    mr = mr_factory(device) if needs_device else mr_factory()
+    buffer_close(mr)
 
 
 def test_buffer_external_host():
@@ -447,7 +491,7 @@ def test_buffer_external_pinned_registered(change_device):
 
 
 @pytest.mark.parametrize("change_device", [True, False])
-def test_buffer_external_managed(change_device):
+def test_buffer_external_managed(change_device, requires_concurrent_managed_access):
     n = ccx_system.get_num_devices()
     if n < 1:
         pytest.skip("No devices found")
@@ -517,9 +561,11 @@ def test_buffer_dunder_dlpack():
         (DummyPinnedMemoryResource, (DLDeviceType.kDLCUDAHost, 0)),
     ],
 )
-def test_buffer_dunder_dlpack_device_success(DummyMR, expected):
+def test_buffer_dunder_dlpack_device_success(DummyMR, expected, request):
     device = Device()
     device.set_current()
+    if DummyMR is DummyUnifiedMemoryResource:
+        request.getfixturevalue("requires_concurrent_managed_access")
     dummy_mr = DummyMR() if DummyMR is DummyHostMemoryResource else DummyMR(device)
     buffer = dummy_mr.allocate(size=1024)
     assert buffer.__dlpack_device__() == expected
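The test_memory.py changes above rely on a third consumption pattern: `request.getfixturevalue(...)` instantiates a fixture lazily inside the test body, so when `pytest.skip()` fires in the fixture, only the `unified` parametrize case is skipped and the remaining cases still run. A standalone sketch of the idiom (the feature flag and names are illustrative):

```python
import pytest

FEATURE_AVAILABLE = False  # hypothetical capability probe


@pytest.fixture
def requires_feature():
    if not FEATURE_AVAILABLE:
        pytest.skip("feature unavailable")


@pytest.mark.parametrize("kind", ["plain", "feature"])
def test_kinds(kind, request):
    if kind == "feature":
        # Instantiated on demand; the skip applies to this case only.
        request.getfixturevalue("requires_feature")
    assert kind in ("plain", "feature")
```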
2 changes: 2 additions & 0 deletions cuda_core/tests/test_memory_peer_access.py
@@ -10,6 +10,7 @@
 NBYTES = 1024
 
 
+@pytest.mark.usefixtures("requires_concurrent_managed_access")
 def test_peer_access_basic(mempool_device_x2):
     """Basic tests for dmr.peer_accessible_by."""
     dev0, dev1 = mempool_device_x2
@@ -78,6 +79,7 @@ def check(expected):
         dmr.peer_accessible_by = [num_devices]  # device ID out of bounds
 
 
+@pytest.mark.usefixtures("requires_concurrent_managed_access")
 def test_peer_access_transitions(mempool_device_x3):
     """Advanced tests for dmr.peer_accessible_by."""
 