From 2afcb2051e7e9471fa36d06d2910328d9520ba8a Mon Sep 17 00:00:00 2001
From: Leo Fang
Date: Fri, 21 Feb 2025 00:16:27 +0000
Subject: [PATCH 1/3] cache cc to speed it up

---
 cuda_core/cuda/core/experimental/_device.py | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

diff --git a/cuda_core/cuda/core/experimental/_device.py b/cuda_core/cuda/core/experimental/_device.py
index 0cbd462cd..36a111e3e 100644
--- a/cuda_core/cuda/core/experimental/_device.py
+++ b/cuda_core/cuda/core/experimental/_device.py
@@ -1029,13 +1029,11 @@ def properties(self) -> DeviceProperties:
     @property
     def compute_capability(self) -> ComputeCapability:
         """Return a named tuple with 2 fields: major and minor."""
-        major = handle_return(
-            runtime.cudaDeviceGetAttribute(runtime.cudaDeviceAttr.cudaDevAttrComputeCapabilityMajor, self._id)
-        )
-        minor = handle_return(
-            runtime.cudaDeviceGetAttribute(runtime.cudaDeviceAttr.cudaDevAttrComputeCapabilityMinor, self._id)
-        )
-        return ComputeCapability(major, minor)
+        if "compute_capability" in self.properties._cache:
+            return self.properties._cache["compute_capability"]
+        cc = ComputeCapability(self.properties.compute_capability_major, self.properties.compute_capability_minor)
+        self.properties._cache["compute_capability"] = cc
+        return cc
 
     @property
     @precondition(_check_context_initialized)
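Note: the hunk above memoizes the ComputeCapability tuple in the DeviceProperties._cache dict, so repeated reads of Device.compute_capability reuse the first result instead of issuing two attribute queries every time. Below is a minimal, standalone sketch of that memoize-in-a-dict property pattern; the FakeProperties/FakeDevice names and the `queries` counter are illustrative stand-ins, not cuda.core code.

    from collections import namedtuple

    ComputeCapability = namedtuple("ComputeCapability", ("major", "minor"))

    class FakeProperties:
        def __init__(self):
            self._cache = {}
            self.queries = 0  # stands in for expensive per-attribute driver queries

        @property
        def compute_capability_major(self):
            self.queries += 1
            return 8

        @property
        def compute_capability_minor(self):
            self.queries += 1
            return 9

    class FakeDevice:
        def __init__(self):
            self.properties = FakeProperties()

        @property
        def compute_capability(self):
            # same shape as the patched Device.compute_capability above
            if "compute_capability" in self.properties._cache:
                return self.properties._cache["compute_capability"]
            cc = ComputeCapability(self.properties.compute_capability_major,
                                   self.properties.compute_capability_minor)
            self.properties._cache["compute_capability"] = cc
            return cc

    d = FakeDevice()
    assert d.compute_capability == (8, 9)
    assert d.compute_capability == (8, 9)
    assert d.properties.queries == 2  # the attributes were only queried once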
From 87405ad907bce2f802e331dd9044b9815264df63 Mon Sep 17 00:00:00 2001
From: Leo Fang
Date: Fri, 21 Feb 2025 19:26:58 +0000
Subject: [PATCH 2/3] avoid using cudart APIs in Device constructor

---
 cuda_core/cuda/core/experimental/_device.py | 61 +++++++++++++--------
 cuda_core/tests/conftest.py                 |  3 +-
 2 files changed, 39 insertions(+), 25 deletions(-)

diff --git a/cuda_core/cuda/core/experimental/_device.py b/cuda_core/cuda/core/experimental/_device.py
index 36a111e3e..d703e0161 100644
--- a/cuda_core/cuda/core/experimental/_device.py
+++ b/cuda_core/cuda/core/experimental/_device.py
@@ -11,7 +11,8 @@
 from cuda.core.experimental._utils import ComputeCapability, CUDAError, driver, handle_return, precondition, runtime
 
 _tls = threading.local()
-_tls_lock = threading.Lock()
+_lock = threading.Lock()
+_is_cuInit = False
 
 
 class DeviceProperties:
@@ -938,37 +939,51 @@ class Device:
     __slots__ = ("_id", "_mr", "_has_inited", "_properties")
 
     def __new__(cls, device_id=None):
+        global _is_cuInit
+        if _is_cuInit is False:
+            with _lock:
+                handle_return(driver.cuInit(0))
+                _is_cuInit = True
+
         # important: creating a Device instance does not initialize the GPU!
         if device_id is None:
-            device_id = handle_return(runtime.cudaGetDevice())
+            err, dev = driver.cuCtxGetDevice()
+            if err == 0:
+                device_id = int(dev)
+            else:
+                ctx = handle_return(driver.cuCtxGetCurrent())
+                assert int(ctx) == 0
+                device_id = 0  # cudart behavior
             assert isinstance(device_id, int), f"{device_id=}"
         else:
-            total = handle_return(runtime.cudaGetDeviceCount())
+            total = handle_return(driver.cuDeviceGetCount())
            if not isinstance(device_id, int) or not (0 <= device_id < total):
                 raise ValueError(f"device_id must be within [0, {total}), got {device_id}")
 
         # ensure Device is singleton
-        with _tls_lock:
-            if not hasattr(_tls, "devices"):
-                total = handle_return(runtime.cudaGetDeviceCount())
-                _tls.devices = []
-                for dev_id in range(total):
-                    dev = super().__new__(cls)
-                    dev._id = dev_id
-                    # If the device is in TCC mode, or does not support memory pools for some other reason,
-                    # use the SynchronousMemoryResource which does not use memory pools.
-                    if (
-                        handle_return(
-                            runtime.cudaDeviceGetAttribute(runtime.cudaDeviceAttr.cudaDevAttrMemoryPoolsSupported, 0)
+        if not hasattr(_tls, "devices"):
+            total = handle_return(driver.cuDeviceGetCount())
+            _tls.devices = []
+            for dev_id in range(total):
+                dev = super().__new__(cls)
+
+                dev._id = dev_id
+                # If the device is in TCC mode, or does not support memory pools for some other reason,
+                # use the SynchronousMemoryResource which does not use memory pools.
+                if (
+                    handle_return(
+                        driver.cuDeviceGetAttribute(
+                            driver.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_MEMORY_POOLS_SUPPORTED, dev_id
                         )
-                    ) == 1:
-                        dev._mr = _DefaultAsyncMempool(dev_id)
-                    else:
-                        dev._mr = _SynchronousMemoryResource(dev_id)
-
-                    dev._has_inited = False
-                    dev._properties = None
-                    _tls.devices.append(dev)
+                    )
+                ) == 1:
+                    dev._mr = _DefaultAsyncMempool(dev_id)
+                else:
+                    dev._mr = _SynchronousMemoryResource(dev_id)
+                dev._has_inited = False
+                dev._properties = None
+
+                _tls.devices.append(dev)
 
         return _tls.devices[device_id]
 
diff --git a/cuda_core/tests/conftest.py b/cuda_core/tests/conftest.py
index dc50585ab..72bbeae83 100644
--- a/cuda_core/tests/conftest.py
+++ b/cuda_core/tests/conftest.py
@@ -42,8 +42,7 @@ def _device_unset_current():
         return
     handle_return(driver.cuCtxPopCurrent())
     if hasattr(_device._tls, "devices"):
-        with _device._tls_lock:
-            del _device._tls.devices
+        del _device._tls.devices
 
 
 @pytest.fixture(scope="function")
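Note: besides switching the constructor to driver-only APIs, the hunk above adds a one-time, lock-guarded cuInit(0) keyed on a module-level flag, and falls back to device 0 (matching cudart behavior) when no context is current. Below is a minimal, standalone sketch of that lazy-init pattern; ensure_initialized() and expensive_init() are placeholders, not CUDA or cuda.core APIs. Because the flag is only checked outside the lock, two racing threads may both reach the guarded call, which is acceptable only when that call is idempotent, as cuInit is.

    import threading

    _lock = threading.Lock()
    _is_initialized = False

    def expensive_init():
        # stands in for driver.cuInit(0); must be safe to call more than once
        print("initialized")

    def ensure_initialized():
        # same shape as the guard added at the top of Device.__new__
        global _is_initialized
        if _is_initialized is False:   # cheap check on every call
            with _lock:                # serialize the first-time work
                expensive_init()
                _is_initialized = True

    for _ in range(3):
        ensure_initialized()           # the real work runs only on the first pass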
From 95777c478ca02d2c94f5e56aced9254872038fbf Mon Sep 17 00:00:00 2001
From: Leo Fang
Date: Fri, 21 Feb 2025 19:36:25 +0000
Subject: [PATCH 3/3] avoid silly, redundant lock

---
 cuda_core/cuda/core/experimental/_device.py | 50 ++++++++++++---------
 cuda_core/tests/conftest.py                 |  3 +-
 2 files changed, 29 insertions(+), 24 deletions(-)

diff --git a/cuda_core/cuda/core/experimental/_device.py b/cuda_core/cuda/core/experimental/_device.py
index 36a111e3e..74888dca8 100644
--- a/cuda_core/cuda/core/experimental/_device.py
+++ b/cuda_core/cuda/core/experimental/_device.py
@@ -11,7 +11,8 @@
 from cuda.core.experimental._utils import ComputeCapability, CUDAError, driver, handle_return, precondition, runtime
 
 _tls = threading.local()
-_tls_lock = threading.Lock()
+_lock = threading.Lock()
+_is_cuInit = False
 
 
 class DeviceProperties:
@@ -938,6 +939,12 @@ class Device:
     __slots__ = ("_id", "_mr", "_has_inited", "_properties")
 
     def __new__(cls, device_id=None):
+        global _is_cuInit
+        if _is_cuInit is False:
+            with _lock:
+                handle_return(driver.cuInit(0))
+                _is_cuInit = True
+
         # important: creating a Device instance does not initialize the GPU!
         if device_id is None:
             device_id = handle_return(runtime.cudaGetDevice())
@@ -948,27 +955,26 @@ def __new__(cls, device_id=None):
             raise ValueError(f"device_id must be within [0, {total}), got {device_id}")
 
         # ensure Device is singleton
-        with _tls_lock:
-            if not hasattr(_tls, "devices"):
-                total = handle_return(runtime.cudaGetDeviceCount())
-                _tls.devices = []
-                for dev_id in range(total):
-                    dev = super().__new__(cls)
-                    dev._id = dev_id
-                    # If the device is in TCC mode, or does not support memory pools for some other reason,
-                    # use the SynchronousMemoryResource which does not use memory pools.
-                    if (
-                        handle_return(
-                            runtime.cudaDeviceGetAttribute(runtime.cudaDeviceAttr.cudaDevAttrMemoryPoolsSupported, 0)
-                        )
-                    ) == 1:
-                        dev._mr = _DefaultAsyncMempool(dev_id)
-                    else:
-                        dev._mr = _SynchronousMemoryResource(dev_id)
-
-                    dev._has_inited = False
-                    dev._properties = None
-                    _tls.devices.append(dev)
+        if not hasattr(_tls, "devices"):
+            total = handle_return(runtime.cudaGetDeviceCount())
+            _tls.devices = []
+            for dev_id in range(total):
+                dev = super().__new__(cls)
+                dev._id = dev_id
+                # If the device is in TCC mode, or does not support memory pools for some other reason,
+                # use the SynchronousMemoryResource which does not use memory pools.
+                if (
+                    handle_return(
+                        runtime.cudaDeviceGetAttribute(runtime.cudaDeviceAttr.cudaDevAttrMemoryPoolsSupported, 0)
+                    )
+                ) == 1:
+                    dev._mr = _DefaultAsyncMempool(dev_id)
+                else:
+                    dev._mr = _SynchronousMemoryResource(dev_id)
+
+                dev._has_inited = False
+                dev._properties = None
+                _tls.devices.append(dev)
 
         return _tls.devices[device_id]
 
diff --git a/cuda_core/tests/conftest.py b/cuda_core/tests/conftest.py
index dc50585ab..72bbeae83 100644
--- a/cuda_core/tests/conftest.py
+++ b/cuda_core/tests/conftest.py
@@ -42,8 +42,7 @@ def _device_unset_current():
         return
     handle_return(driver.cuCtxPopCurrent())
     if hasattr(_device._tls, "devices"):
-        with _device._tls_lock:
-            del _device._tls.devices
+        del _device._tls.devices
 
 
 @pytest.fixture(scope="function")
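Note on why the lock can go away entirely: the device list is stored on a threading.local() object, so each thread lazily builds and reads its own `devices` attribute and no state is shared across threads. Below is a standalone sketch of that isolation, with purely illustrative names (worker, results), not cuda.core code.

    import threading

    _tls = threading.local()
    results = []

    def worker(name):
        # each thread lazily creates its own private list, mirroring the
        # per-thread `_tls.devices` cache in Device.__new__; no lock is needed
        # because no other thread can see this attribute
        if not hasattr(_tls, "devices"):
            _tls.devices = [f"{name}-dev{i}" for i in range(2)]
        results.append((name, list(_tls.devices)))

    threads = [threading.Thread(target=worker, args=(f"t{i}",)) for i in range(3)]
    for t in threads:
        t.start()
    for t in threads:
        t.join()

    for name, devices in sorted(results):
        print(name, devices)  # every thread sees only the list it built itself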