Skip to content

Commit 2aca306

Browse files
authored
Merge pull request #559 from carterbox/dching/event-timing-message
NEW: Make event timing error messages more specific and actionable
2 parents c71ed39 + 9904f3d commit 2aca306

File tree

3 files changed

+149
-33
lines changed

3 files changed

+149
-33
lines changed

.github/workflows/test-wheel-windows.yml

+2-2
Original file line numberDiff line numberDiff line change
@@ -55,9 +55,9 @@ jobs:
5555
5656
if ('${{ inputs.local-ctk }}' -eq '1') {
5757
if ($TEST_CUDA_MAJOR -eq '12') {
58-
$MINI_CTK_DEPS = '["nvcc", "nvrtc", "nvjitlink"]'
58+
$MINI_CTK_DEPS = '["nvcc", "nvrtc", "nvjitlink", "thrust"]'
5959
} else {
60-
$MINI_CTK_DEPS = '["nvcc", "nvrtc"]'
60+
$MINI_CTK_DEPS = '["nvcc", "nvrtc", "thrust"]'
6161
}
6262
}
6363

cuda_core/cuda/core/experimental/_event.py

+32-6
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,15 @@
88
from dataclasses import dataclass
99
from typing import TYPE_CHECKING, Optional
1010

11-
from cuda.core.experimental._utils.cuda_utils import CUDAError, check_or_create_options, driver, handle_return
11+
from cuda.core.experimental._utils.cuda_utils import (
12+
CUDAError,
13+
check_or_create_options,
14+
driver,
15+
handle_return,
16+
)
17+
from cuda.core.experimental._utils.cuda_utils import (
18+
_check_driver_error as raise_if_driver_error,
19+
)
1220

1321
if TYPE_CHECKING:
1422
import cuda.bindings
@@ -117,13 +125,31 @@ def __rsub__(self, other):
117125

118126
def __sub__(self, other):
119127
# return self - other (in milliseconds)
128+
err, timing = driver.cuEventElapsedTime(other.handle, self.handle)
120129
try:
121-
timing = handle_return(driver.cuEventElapsedTime(other.handle, self.handle))
130+
raise_if_driver_error(err)
131+
return timing
122132
except CUDAError as e:
123-
raise RuntimeError(
124-
"Timing capability must be enabled in order to subtract two Events; timing is disabled by default."
125-
) from e
126-
return timing
133+
if err == driver.CUresult.CUDA_ERROR_INVALID_HANDLE:
134+
if self.is_timing_disabled or other.is_timing_disabled:
135+
explanation = (
136+
"Both Events must be created with timing enabled in order to subtract them; "
137+
"use EventOptions(enable_timing=True) when creating both events."
138+
)
139+
else:
140+
explanation = (
141+
"Both Events must be recorded before they can be subtracted; "
142+
"use Stream.record() to record both events to a stream."
143+
)
144+
elif err == driver.CUresult.CUDA_ERROR_NOT_READY:
145+
explanation = (
146+
"One or both events have not completed; "
147+
"use Event.sync(), Stream.sync(), or Device.sync() to wait for the events to complete "
148+
"before subtracting them."
149+
)
150+
else:
151+
raise e
152+
raise RuntimeError(explanation) from e
127153

128154
@property
129155
def is_timing_disabled(self) -> bool:

cuda_core/tests/test_event.py

+115-25
Original file line numberDiff line numberDiff line change
@@ -7,50 +7,43 @@
77
# is strictly prohibited.
88

99
import os
10+
import pathlib
1011
import time
1112

13+
import numpy as np
1214
import pytest
1315

1416
import cuda.core.experimental
15-
from cuda.core.experimental import Device, EventOptions
17+
from cuda.core.experimental import Device, EventOptions, LaunchConfig, Program, ProgramOptions, launch
18+
from cuda.core.experimental._memory import _DefaultPinnedMemorySource
1619

1720

1821
def test_event_init_disabled():
1922
with pytest.raises(RuntimeError, match=r"^Event objects cannot be instantiated directly\."):
2023
cuda.core.experimental._event.Event() # Ensure back door is locked.
2124

2225

23-
@pytest.mark.parametrize("enable_timing", [True, False, None])
24-
def test_timing(init_cuda, enable_timing):
25-
options = EventOptions(enable_timing=enable_timing)
26+
def test_timing_success(init_cuda):
27+
options = EventOptions(enable_timing=True)
2628
stream = Device().create_stream()
2729
delay_seconds = 0.5
2830
e1 = stream.record(options=options)
2931
time.sleep(delay_seconds)
3032
e2 = stream.record(options=options)
3133
e2.sync()
32-
for e in (e1, e2):
33-
assert e.is_timing_disabled == (True if enable_timing is None else not enable_timing)
34-
if enable_timing:
35-
elapsed_time_ms = e2 - e1
36-
assert isinstance(elapsed_time_ms, float)
37-
# Using a generous tolerance, to avoid flaky tests:
38-
# We only want to exercise the __sub__ method, this test is not meant
39-
# to stress-test the CUDA driver or time.sleep().
40-
delay_ms = delay_seconds * 1000
41-
if os.name == "nt": # noqa: SIM108
42-
# For Python <=3.10, the Windows timer resolution is typically limited to 15.6 ms by default.
43-
generous_tolerance = 100
44-
else:
45-
# Most modern Linux kernels have a default timer resolution of 1 ms.
46-
generous_tolerance = 20
47-
assert delay_ms - generous_tolerance <= elapsed_time_ms < delay_ms + generous_tolerance
34+
elapsed_time_ms = e2 - e1
35+
assert isinstance(elapsed_time_ms, float)
36+
# Using a generous tolerance, to avoid flaky tests:
37+
# We only want to exercise the __sub__ method, this test is not meant
38+
# to stress-test the CUDA driver or time.sleep().
39+
delay_ms = delay_seconds * 1000
40+
if os.name == "nt": # noqa: SIM108
41+
# For Python <=3.10, the Windows timer resolution is typically limited to 15.6 ms by default.
42+
generous_tolerance = 100
4843
else:
49-
with pytest.raises(RuntimeError) as e:
50-
elapsed_time_ms = e2 - e1
51-
msg = str(e)
52-
assert "disabled by default" in msg
53-
assert "CUDA_ERROR_INVALID_HANDLE" in msg
44+
# Most modern Linux kernels have a default timer resolution of 1 ms.
45+
generous_tolerance = 20
46+
assert delay_ms - generous_tolerance <= elapsed_time_ms < delay_ms + generous_tolerance
5447

5548

5649
def test_is_sync_busy_waited(init_cuda):
@@ -80,3 +73,100 @@ def test_is_done(init_cuda):
8073
# Without a sync, the captured work might not have yet completed
8174
# Therefore this check should never raise an exception
8275
assert event.is_done in (True, False)
76+
77+
78+
def test_error_timing_disabled():
79+
device = Device()
80+
device.set_current()
81+
enabled = EventOptions(enable_timing=True)
82+
disabled = EventOptions(enable_timing=False)
83+
stream = device.create_stream()
84+
85+
event1 = stream.record(options=enabled)
86+
event2 = stream.record(options=disabled)
87+
assert not event1.is_timing_disabled
88+
assert event2.is_timing_disabled
89+
stream.sync()
90+
with pytest.raises(RuntimeError, match="^Both Events must be created with timing enabled"):
91+
event2 - event1
92+
93+
event1 = stream.record(options=disabled)
94+
event2 = stream.record(options=disabled)
95+
stream.sync()
96+
with pytest.raises(RuntimeError, match="^Both Events must be created with timing enabled"):
97+
event2 - event1
98+
99+
100+
def test_error_timing_recorded():
101+
device = Device()
102+
device.set_current()
103+
enabled = EventOptions(enable_timing=True)
104+
stream = device.create_stream()
105+
106+
event1 = stream.record(options=enabled)
107+
event2 = device.create_event(options=enabled)
108+
event3 = device.create_event(options=enabled)
109+
110+
stream.sync()
111+
with pytest.raises(RuntimeError, match="^Both Events must be recorded"):
112+
event2 - event1
113+
with pytest.raises(RuntimeError, match="^Both Events must be recorded"):
114+
event1 - event2
115+
with pytest.raises(RuntimeError, match="^Both Events must be recorded"):
116+
event3 - event2
117+
118+
119+
# TODO: improve this once path finder can find headers
120+
@pytest.mark.skipif(os.environ.get("CUDA_PATH") is None, reason="need libcu++ header")
121+
@pytest.mark.skipif(tuple(int(i) for i in np.__version__.split(".")[:2]) < (2, 1), reason="need numpy 2.1.0+")
122+
def test_error_timing_incomplete():
123+
device = Device()
124+
device.set_current()
125+
126+
# This kernel is designed to busy loop until a signal is received
127+
code = """
128+
#include <cuda/atomic>
129+
130+
extern "C"
131+
__global__ void wait(int* val) {
132+
cuda::atomic_ref<int, cuda::thread_scope_system> signal{*val};
133+
while (true) {
134+
if (signal.load(cuda::memory_order_relaxed)) {
135+
break;
136+
}
137+
}
138+
}
139+
"""
140+
141+
arch = "".join(f"{i}" for i in device.compute_capability)
142+
program_options = ProgramOptions(
143+
std="c++17",
144+
arch=f"sm_{arch}",
145+
include_path=str(pathlib.Path(os.environ["CUDA_PATH"]) / pathlib.Path("include")),
146+
)
147+
prog = Program(code, code_type="c++", options=program_options)
148+
mod = prog.compile(target_type="cubin")
149+
ker = mod.get_kernel("wait")
150+
151+
mr = _DefaultPinnedMemorySource()
152+
b = mr.allocate(4)
153+
arr = np.from_dlpack(b).view(np.int32)
154+
arr[0] = 0
155+
156+
config = LaunchConfig(grid=1, block=1)
157+
ker_args = (arr.ctypes.data,)
158+
159+
enabled = EventOptions(enable_timing=True)
160+
stream = device.create_stream()
161+
162+
event1 = stream.record(options=enabled)
163+
launch(stream, config, ker, *ker_args)
164+
event3 = stream.record(options=enabled)
165+
166+
# event3 will never complete because the stream is waiting on wait() to complete
167+
with pytest.raises(RuntimeError, match="^One or both events have not completed."):
168+
event3 - event1
169+
170+
arr[0] = 1
171+
event3.sync()
172+
event3 - event1 # this should work

0 commit comments

Comments
 (0)