4 changes: 2 additions & 2 deletions cuda_core/cuda/core/_memory/_memory_pool.pyx
@@ -348,8 +348,8 @@ cdef int _MP_init_current(_MemPool self, int dev_id, _MemPoolOptions opts) excep
     elif opts._type == cydriver.CUmemAllocationType.CU_MEM_ALLOCATION_TYPE_PINNED \
             and opts._location == cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_HOST_NUMA:
         IF CUDA_CORE_BUILD_MAJOR >= 13:
-            assert dev_id == 0
-            loc.id = 0
+            assert dev_id >= 0
+            loc.id = dev_id
             loc.type = opts._location
             with nogil:
                 HANDLE_RETURN(cydriver.cuMemGetMemPool(&pool, &loc, opts._type))
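The id written into loc.id here is now a host NUMA node rather than a hard-coded 0. As a point of reference, a minimal Python sketch of how that value can be queried for a device, assuming cuda-python's cuda.bindings.driver module and its (error, result) return convention; CU_DEVICE_ATTRIBUTE_HOST_NUMA_ID requires a recent toolkit/driver and may be unsupported or negative, in which case the sketch falls back to node 0, mirroring the helper added in _pinned_memory_resource.pyx below.

# Sketch only: assumes cuda-python's cuda.bindings.driver is installed.
from cuda.bindings import driver

def host_numa_id_for(device_ordinal: int = 0) -> int:
    """Best-effort lookup of the host NUMA node closest to a device."""
    (err,) = driver.cuInit(0)
    if err != driver.CUresult.CUDA_SUCCESS:
        return 0
    err, dev = driver.cuDeviceGet(device_ordinal)
    if err != driver.CUresult.CUDA_SUCCESS:
        return 0
    err, numa_id = driver.cuDeviceGetAttribute(
        driver.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_HOST_NUMA_ID, dev
    )
    if err != driver.CUresult.CUDA_SUCCESS or numa_id < 0:
        # Attribute unsupported or not meaningful on this platform; use node 0.
        return 0
    return numa_id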
46 changes: 40 additions & 6 deletions cuda_core/cuda/core/_memory/_pinned_memory_resource.pyx
@@ -76,15 +76,47 @@ def _check_numa_nodes():
     if numa_count is not None and numa_count > 1:
         warnings.warn(
             f"System has {numa_count} NUMA nodes. IPC-enabled pinned memory "
-            f"uses location ID 0, which may not work correctly with multiple "
-            f"NUMA nodes.",
+            f"targets the host NUMA node closest to the current device; "
+            f"this may not work correctly with multiple NUMA nodes.",
             UserWarning,
             stacklevel=3
         )

     _numa_warning_shown = True

+
+def _host_numa_id_for_current_device() -> int:
+    """Return host NUMA node closest to current device (fallback to 0)."""
+    cdef cydriver.CUdevice dev
+    cdef cydriver.CUresult err
+    cdef int host_numa_id
+
+    with nogil:
+        err = cydriver.cuCtxGetDevice(&dev)
+    if err in (
+        cydriver.CUresult.CUDA_ERROR_INVALID_CONTEXT,
+        cydriver.CUresult.CUDA_ERROR_NOT_INITIALIZED,
+    ):
+        return 0
+    HANDLE_RETURN(err)
+
+    with nogil:
+        err = cydriver.cuDeviceGetAttribute(
+            &host_numa_id,
+            cydriver.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_HOST_NUMA_ID,
+            dev
+        )
+    if err in (
+        cydriver.CUresult.CUDA_ERROR_INVALID_VALUE,
+        cydriver.CUresult.CUDA_ERROR_NOT_SUPPORTED,
+    ):
+        return 0
+    HANDLE_RETURN(err)
+    if host_numa_id < 0:
+        return 0
+    return host_numa_id
+
+
 __all__ = ['PinnedMemoryResource', 'PinnedMemoryResourceOptions']

@@ -134,10 +166,10 @@ cdef class PinnedMemoryResource(_MemPool):
     allocations between processes, specify ``ipc_enabled=True`` in the initializer
     option. When IPC is enabled, the location type is automatically set to
     CU_MEM_LOCATION_TYPE_HOST_NUMA instead of CU_MEM_LOCATION_TYPE_HOST,
-    with location ID 0.
+    using the host NUMA node closest to the current device.

-    Note: IPC support for pinned memory requires a single NUMA node. A warning
-    is issued if multiple NUMA nodes are detected.
+    Note: IPC support for pinned memory can be sensitive to NUMA placement. A
+    warning is issued if multiple NUMA nodes are detected.

     See :class:`DeviceMemoryResource` for more details on IPC usage patterns.
     """
@@ -150,13 +182,15 @@ cdef class PinnedMemoryResource(_MemPool):
         cdef _MemPoolOptions opts_base = _MemPoolOptions()

         cdef bint ipc_enabled = False
+        cdef int location_id = -1
         if opts:
             ipc_enabled = opts.ipc_enabled
             if ipc_enabled and not _ipc.is_supported():
                 raise RuntimeError(f"IPC is not available on {platform.system()}")
             if ipc_enabled:
                 # Check for multiple NUMA nodes on Linux
                 _check_numa_nodes()
+                location_id = _host_numa_id_for_current_device()
             opts_base._max_size = opts.max_size
             opts_base._use_current = False
             opts_base._ipc_enabled = ipc_enabled
@@ -166,7 +200,7 @@ cdef class PinnedMemoryResource(_MemPool):
         opts_base._location = cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_HOST
         opts_base._type = cydriver.CUmemAllocationType.CU_MEM_ALLOCATION_TYPE_PINNED

-        super().__init__(0 if ipc_enabled else -1, opts_base)
+        super().__init__(location_id if ipc_enabled else -1, opts_base)

     def __reduce__(self):
         return PinnedMemoryResource.from_registry, (self.uuid,)
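Taken together, these changes mean an IPC-enabled PinnedMemoryResource reports the host NUMA node as its device_id. A usage sketch under stated assumptions: the import path (cuda.core.experimental) and the way the options object is passed to the constructor are not shown in this diff and are assumed here; Device(0), set_current(), properties.host_numa_id, and mr.device_id come from the updated test below.

# Sketch; import path and constructor call are assumptions, not confirmed by this PR.
from cuda.core.experimental import Device, PinnedMemoryResource, PinnedMemoryResourceOptions

device = Device(0)
device.set_current()

# ipc_enabled=True switches the pool location to CU_MEM_LOCATION_TYPE_HOST_NUMA,
# bound to the host NUMA node closest to the current device.
options = PinnedMemoryResourceOptions(ipc_enabled=True)
mr = PinnedMemoryResource(options)  # exact signature assumed; see the class __init__ above

expected = device.properties.host_numa_id
if expected < 0:
    expected = 0  # attribute unavailable: the pool falls back to node 0
assert mr.device_id == expected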
8 changes: 6 additions & 2 deletions cuda_core/tests/test_memory.py
@@ -978,7 +978,7 @@ def test_mempool_ipc_errors(mempool_device):

 def test_pinned_mempool_ipc_basic():
     """Test basic IPC functionality for PinnedMemoryResource."""
-    device = Device()
+    device = Device(0)
     device.set_current()

     skip_if_pinned_memory_unsupported(device)
@@ -995,7 +995,11 @@ def test_pinned_mempool_ipc_basic():
     assert mr.is_ipc_enabled
     assert mr.is_device_accessible
     assert mr.is_host_accessible
-    assert mr.device_id == 0  # IPC-enabled uses location id 0

Contributor:
Shouldn't the device_id still be zero?

Also, perhaps line 981 should be updated to say Device(0) explicitly? Otherwise, it uses the currently active device, but that doesn't make sense if tests are supposed to be isolated.

Contributor:
Or is device_id not really the device ID; rather it's the NUMA ID? That seems confusing.

Collaborator Author:
Cursor told me:

> On Device() vs Device(0): Device() resolves to the current device if a context exists, else it defaults to 0 (cudart‑like behavior). We immediately call set_current(), so the test is deterministic and follows the device whose host NUMA ID we should use. I’m happy to make it explicit (Device(0)) if you prefer stricter isolation; it doesn’t change the intent.

I then asked it to make Device(0) explicit.

> Shouldn't the device_id still be zero?

Cursor explained:

> device_id here isn’t a GPU ordinal for pinned host pools. For host allocations we use the CUmemLocation.id as the “device_id” field: -1 for plain host memory, and host NUMA node ID for CU_MEM_LOCATION_TYPE_HOST_NUMA. So with IPC‑enabled pinned pools, device_id reflects the host NUMA node closest to the current device, not necessarily 0. That’s why the test now checks device.properties.host_numa_id (falling back to 0 if the attribute is unavailable).

Do you think that's correct? I assumed so and asked it to add a terse comment.
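
Restated as a tiny illustrative helper, so the two cases in the quoted explanation are explicit. This is not cuda.core code; the -1 value comes from the non-IPC branch of the constructor shown above.

# Illustrative only: summarizes the device_id semantics described in the quoted explanation.
def describe_pinned_pool_location(device_id: int, ipc_enabled: bool) -> str:
    if not ipc_enabled:
        # CU_MEM_LOCATION_TYPE_HOST: the location id is unused and stored as -1.
        return "pinned host memory, no NUMA binding (id == -1)"
    # CU_MEM_LOCATION_TYPE_HOST_NUMA: the id is the host NUMA node closest to
    # the device that was current when the pool was created.
    return f"pinned host memory bound to host NUMA node {device_id}"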

Contributor:
Maybe @leofang can chime in. For most memory resources, the device_id is the device ordinal. It seems like, here, it is overloaded with another meaning. I'm not sure it has a clear meaning for pinned memory that is not associated with a specific device. Perhaps it should be None? Another opinion would help.

Collaborator:
See https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__TYPES.html#group__CUDA__TYPES_1g75cfd5b9fa5c1c6ee2be2547bfbe882e

The underlying driver overloads the id member of the CUmemLocation_v1 struct, but I don't think we ultimately do the same. Maybe we should introduce some specific properties like device_id and numa_id?
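
A minimal sketch of the split suggested in the preceding comment, purely illustrative: these properties do not exist in cuda.core, and the location_type/location_id fields are hypothetical stand-ins for the CUmemLocation data the pool already tracks.

# Hypothetical sketch of separate device_id / numa_id properties; not library code.
class PoolLocationView:
    def __init__(self, location_type: str, location_id: int):
        self._location_type = location_type  # stand-in for CUmemLocationType
        self._location_id = location_id      # raw CUmemLocation.id value

    @property
    def device_id(self):
        """GPU ordinal for device-located pools, otherwise None."""
        return self._location_id if self._location_type == "DEVICE" else None

    @property
    def numa_id(self):
        """Host NUMA node for HOST_NUMA pools, otherwise None."""
        return self._location_id if self._location_type == "HOST_NUMA" else None

# e.g. an IPC-enabled pinned pool on NUMA node 1:
# PoolLocationView("HOST_NUMA", 1).device_id is None and .numa_id == 1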

Collaborator Author:
Thanks @kkraus14

Sorry I'm really out of my depth here (all that's here came from Cursor). I hope @Andy-Jost can take over this PR.

+    # For IPC-enabled pinned pools, device_id reflects the host NUMA location ID.
+    expected_numa_id = device.properties.host_numa_id
+    if expected_numa_id < 0:
+        expected_numa_id = 0
+    assert mr.device_id == expected_numa_id

     # Test allocation handle export
     alloc_handle = mr.get_allocation_handle()