From 2afcb2051e7e9471fa36d06d2910328d9520ba8a Mon Sep 17 00:00:00 2001
From: Leo Fang
Date: Fri, 21 Feb 2025 00:16:27 +0000
Subject: [PATCH 1/3] cache cc to speed it up

---
 cuda_core/cuda/core/experimental/_device.py | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

diff --git a/cuda_core/cuda/core/experimental/_device.py b/cuda_core/cuda/core/experimental/_device.py
index 0cbd462cd..36a111e3e 100644
--- a/cuda_core/cuda/core/experimental/_device.py
+++ b/cuda_core/cuda/core/experimental/_device.py
@@ -1029,13 +1029,11 @@ def properties(self) -> DeviceProperties:
     @property
     def compute_capability(self) -> ComputeCapability:
         """Return a named tuple with 2 fields: major and minor."""
-        major = handle_return(
-            runtime.cudaDeviceGetAttribute(runtime.cudaDeviceAttr.cudaDevAttrComputeCapabilityMajor, self._id)
-        )
-        minor = handle_return(
-            runtime.cudaDeviceGetAttribute(runtime.cudaDeviceAttr.cudaDevAttrComputeCapabilityMinor, self._id)
-        )
-        return ComputeCapability(major, minor)
+        if "compute_capability" in self.properties._cache:
+            return self.properties._cache["compute_capability"]
+        cc = ComputeCapability(self.properties.compute_capability_major, self.properties.compute_capability_minor)
+        self.properties._cache["compute_capability"] = cc
+        return cc
 
     @property
     @precondition(_check_context_initialized)
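Note: the hunk above memoizes the ComputeCapability tuple in the DeviceProperties._cache dict, so repeated reads of Device.compute_capability reuse the first result instead of issuing two attribute queries every time. Below is a minimal, standalone sketch of that memoize-in-a-dict property pattern; the FakeProperties/FakeDevice names and the `queries` counter are illustrative stand-ins, not cuda.core code.

    from collections import namedtuple

    ComputeCapability = namedtuple("ComputeCapability", ("major", "minor"))

    class FakeProperties:
        def __init__(self):
            self._cache = {}
            self.queries = 0  # stands in for expensive per-attribute driver queries

        @property
        def compute_capability_major(self):
            self.queries += 1
            return 8

        @property
        def compute_capability_minor(self):
            self.queries += 1
            return 9

    class FakeDevice:
        def __init__(self):
            self.properties = FakeProperties()

        @property
        def compute_capability(self):
            # same shape as the patched Device.compute_capability above
            if "compute_capability" in self.properties._cache:
                return self.properties._cache["compute_capability"]
            cc = ComputeCapability(self.properties.compute_capability_major,
                                   self.properties.compute_capability_minor)
            self.properties._cache["compute_capability"] = cc
            return cc

    d = FakeDevice()
    assert d.compute_capability == (8, 9)
    assert d.compute_capability == (8, 9)
    assert d.properties.queries == 2  # the attributes were only queried once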
From 87405ad907bce2f802e331dd9044b9815264df63 Mon Sep 17 00:00:00 2001
From: Leo Fang
Date: Fri, 21 Feb 2025 19:26:58 +0000
Subject: [PATCH 2/3] avoid using cudart APIs in Device constructor

---
 cuda_core/cuda/core/experimental/_device.py | 61 +++++++++++++--------
 cuda_core/tests/conftest.py                 |  3 +-
 2 files changed, 39 insertions(+), 25 deletions(-)

diff --git a/cuda_core/cuda/core/experimental/_device.py b/cuda_core/cuda/core/experimental/_device.py
index 36a111e3e..d703e0161 100644
--- a/cuda_core/cuda/core/experimental/_device.py
+++ b/cuda_core/cuda/core/experimental/_device.py
@@ -11,7 +11,8 @@
 from cuda.core.experimental._utils import ComputeCapability, CUDAError, driver, handle_return, precondition, runtime
 
 _tls = threading.local()
-_tls_lock = threading.Lock()
+_lock = threading.Lock()
+_is_cuInit = False
 
 
 class DeviceProperties:
@@ -938,37 +939,51 @@ class Device:
     __slots__ = ("_id", "_mr", "_has_inited", "_properties")
 
     def __new__(cls, device_id=None):
+        global _is_cuInit
+        if _is_cuInit is False:
+            with _lock:
+                handle_return(driver.cuInit(0))
+                _is_cuInit = True
+
         # important: creating a Device instance does not initialize the GPU!
         if device_id is None:
-            device_id = handle_return(runtime.cudaGetDevice())
+            err, dev = driver.cuCtxGetDevice()
+            if err == 0:
+                device_id = int(dev)
+            else:
+                ctx = handle_return(driver.cuCtxGetCurrent())
+                assert int(ctx) == 0
+                device_id = 0  # cudart behavior
             assert isinstance(device_id, int), f"{device_id=}"
         else:
-            total = handle_return(runtime.cudaGetDeviceCount())
+            total = handle_return(driver.cuDeviceGetCount())
            if not isinstance(device_id, int) or not (0 <= device_id < total):
                 raise ValueError(f"device_id must be within [0, {total}), got {device_id}")
 
         # ensure Device is singleton
-        with _tls_lock:
-            if not hasattr(_tls, "devices"):
-                total = handle_return(runtime.cudaGetDeviceCount())
-                _tls.devices = []
-                for dev_id in range(total):
-                    dev = super().__new__(cls)
-                    dev._id = dev_id
-                    # If the device is in TCC mode, or does not support memory pools for some other reason,
-                    # use the SynchronousMemoryResource which does not use memory pools.
-                    if (
-                        handle_return(
-                            runtime.cudaDeviceGetAttribute(runtime.cudaDeviceAttr.cudaDevAttrMemoryPoolsSupported, 0)
+        if not hasattr(_tls, "devices"):
+            total = handle_return(driver.cuDeviceGetCount())
+            _tls.devices = []
+            for dev_id in range(total):
+                dev = super().__new__(cls)
+
+                dev._id = dev_id
+                # If the device is in TCC mode, or does not support memory pools for some other reason,
+                # use the SynchronousMemoryResource which does not use memory pools.
+                if (
+                    handle_return(
+                        driver.cuDeviceGetAttribute(
+                            driver.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_MEMORY_POOLS_SUPPORTED, dev_id
                         )
-                    ) == 1:
-                        dev._mr = _DefaultAsyncMempool(dev_id)
-                    else:
-                        dev._mr = _SynchronousMemoryResource(dev_id)
-
-                    dev._has_inited = False
-                    dev._properties = None
-                    _tls.devices.append(dev)
+                    )
+                ) == 1:
+                    dev._mr = _DefaultAsyncMempool(dev_id)
+                else:
+                    dev._mr = _SynchronousMemoryResource(dev_id)
+                dev._has_inited = False
+                dev._properties = None
+
+                _tls.devices.append(dev)
 
         return _tls.devices[device_id]
 
diff --git a/cuda_core/tests/conftest.py b/cuda_core/tests/conftest.py
index dc50585ab..72bbeae83 100644
--- a/cuda_core/tests/conftest.py
+++ b/cuda_core/tests/conftest.py
@@ -42,8 +42,7 @@ def _device_unset_current():
         return
     handle_return(driver.cuCtxPopCurrent())
     if hasattr(_device._tls, "devices"):
-        with _device._tls_lock:
-            del _device._tls.devices
+        del _device._tls.devices
 
 
 @pytest.fixture(scope="function")
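Note: besides switching the constructor to driver-only APIs, the hunk above adds a one-time, lock-guarded cuInit(0) keyed on a module-level flag, and falls back to device 0 (matching cudart behavior) when no context is current. Below is a minimal, standalone sketch of that lazy-init pattern; ensure_initialized() and expensive_init() are placeholders, not CUDA or cuda.core APIs. Because the flag is only checked outside the lock, two racing threads may both reach the guarded call, which is acceptable only when that call is idempotent, as cuInit is.

    import threading

    _lock = threading.Lock()
    _is_initialized = False

    def expensive_init():
        # stands in for driver.cuInit(0); must be safe to call more than once
        print("initialized")

    def ensure_initialized():
        # same shape as the guard added at the top of Device.__new__
        global _is_initialized
        if _is_initialized is False:   # cheap check on every call
            with _lock:                # serialize the first-time work
                expensive_init()
                _is_initialized = True

    for _ in range(3):
        ensure_initialized()           # the real work runs only on the first pass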
From 95777c478ca02d2c94f5e56aced9254872038fbf Mon Sep 17 00:00:00 2001
From: Leo Fang
Date: Fri, 21 Feb 2025 19:36:25 +0000
Subject: [PATCH 3/3] avoid silly, redundant lock

---
 cuda_core/cuda/core/experimental/_device.py | 50 ++++++++++++---------
 cuda_core/tests/conftest.py                 |  3 +-
 2 files changed, 29 insertions(+), 24 deletions(-)

diff --git a/cuda_core/cuda/core/experimental/_device.py b/cuda_core/cuda/core/experimental/_device.py
index 36a111e3e..74888dca8 100644
--- a/cuda_core/cuda/core/experimental/_device.py
+++ b/cuda_core/cuda/core/experimental/_device.py
@@ -11,7 +11,8 @@
 from cuda.core.experimental._utils import ComputeCapability, CUDAError, driver, handle_return, precondition, runtime
 
 _tls = threading.local()
-_tls_lock = threading.Lock()
+_lock = threading.Lock()
+_is_cuInit = False
 
 
 class DeviceProperties:
@@ -938,6 +939,12 @@ class Device:
     __slots__ = ("_id", "_mr", "_has_inited", "_properties")
 
     def __new__(cls, device_id=None):
+        global _is_cuInit
+        if _is_cuInit is False:
+            with _lock:
+                handle_return(driver.cuInit(0))
+                _is_cuInit = True
+
         # important: creating a Device instance does not initialize the GPU!
         if device_id is None:
             device_id = handle_return(runtime.cudaGetDevice())
@@ -948,27 +955,26 @@ def __new__(cls, device_id=None):
             raise ValueError(f"device_id must be within [0, {total}), got {device_id}")
 
         # ensure Device is singleton
-        with _tls_lock:
-            if not hasattr(_tls, "devices"):
-                total = handle_return(runtime.cudaGetDeviceCount())
-                _tls.devices = []
-                for dev_id in range(total):
-                    dev = super().__new__(cls)
-                    dev._id = dev_id
-                    # If the device is in TCC mode, or does not support memory pools for some other reason,
-                    # use the SynchronousMemoryResource which does not use memory pools.
-                    if (
-                        handle_return(
-                            runtime.cudaDeviceGetAttribute(runtime.cudaDeviceAttr.cudaDevAttrMemoryPoolsSupported, 0)
-                        )
-                    ) == 1:
-                        dev._mr = _DefaultAsyncMempool(dev_id)
-                    else:
-                        dev._mr = _SynchronousMemoryResource(dev_id)
-
-                    dev._has_inited = False
-                    dev._properties = None
-                    _tls.devices.append(dev)
+        if not hasattr(_tls, "devices"):
+            total = handle_return(runtime.cudaGetDeviceCount())
+            _tls.devices = []
+            for dev_id in range(total):
+                dev = super().__new__(cls)
+                dev._id = dev_id
+                # If the device is in TCC mode, or does not support memory pools for some other reason,
+                # use the SynchronousMemoryResource which does not use memory pools.
+                if (
+                    handle_return(
+                        runtime.cudaDeviceGetAttribute(runtime.cudaDeviceAttr.cudaDevAttrMemoryPoolsSupported, 0)
+                    )
+                ) == 1:
+                    dev._mr = _DefaultAsyncMempool(dev_id)
+                else:
+                    dev._mr = _SynchronousMemoryResource(dev_id)
+
+                dev._has_inited = False
+                dev._properties = None
+                _tls.devices.append(dev)
 
         return _tls.devices[device_id]
 
diff --git a/cuda_core/tests/conftest.py b/cuda_core/tests/conftest.py
index dc50585ab..72bbeae83 100644
--- a/cuda_core/tests/conftest.py
+++ b/cuda_core/tests/conftest.py
@@ -42,8 +42,7 @@ def _device_unset_current():
         return
     handle_return(driver.cuCtxPopCurrent())
     if hasattr(_device._tls, "devices"):
-        with _device._tls_lock:
-            del _device._tls.devices
+        del _device._tls.devices
 
 
 @pytest.fixture(scope="function")
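Note on why the lock can go away entirely: the device list is stored on a threading.local() object, so each thread lazily builds and reads its own `devices` attribute and no state is shared across threads. Below is a standalone sketch of that isolation, with purely illustrative names (worker, results), not cuda.core code.

    import threading

    _tls = threading.local()
    results = []

    def worker(name):
        # each thread lazily creates its own private list, mirroring the
        # per-thread `_tls.devices` cache in Device.__new__; no lock is needed
        # because no other thread can see this attribute
        if not hasattr(_tls, "devices"):
            _tls.devices = [f"{name}-dev{i}" for i in range(2)]
        results.append((name, list(_tls.devices)))

    threads = [threading.Thread(target=worker, args=(f"t{i}",)) for i in range(3)]
    for t in threads:
        t.start()
    for t in threads:
        t.join()

    for name, devices in sorted(results):
        print(name, devices)  # every thread sees only the list it built itself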