 # is strictly prohibited.

 import os
+import pathlib
 import time

+import numpy as np
 import pytest

 import cuda.core.experimental
-from cuda.core.experimental import Device, EventOptions
+from cuda.core.experimental import Device, EventOptions, LaunchConfig, Program, ProgramOptions, launch
+from cuda.core.experimental._memory import _DefaultPinnedMemorySource


 def test_event_init_disabled():
     with pytest.raises(RuntimeError, match=r"^Event objects cannot be instantiated directly\."):
         cuda.core.experimental._event.Event()  # Ensure back door is locked.


-@pytest.mark.parametrize("enable_timing", [True, False, None])
-def test_timing(init_cuda, enable_timing):
-    options = EventOptions(enable_timing=enable_timing)
+def test_timing_success(init_cuda):
+    options = EventOptions(enable_timing=True)
     stream = Device().create_stream()
     delay_seconds = 0.5
     e1 = stream.record(options=options)
     time.sleep(delay_seconds)
     e2 = stream.record(options=options)
     e2.sync()
-    for e in (e1, e2):
-        assert e.is_timing_disabled == (True if enable_timing is None else not enable_timing)
-    if enable_timing:
-        elapsed_time_ms = e2 - e1
-        assert isinstance(elapsed_time_ms, float)
-        # Using a generous tolerance, to avoid flaky tests:
-        # We only want to exercise the __sub__ method, this test is not meant
-        # to stress-test the CUDA driver or time.sleep().
-        delay_ms = delay_seconds * 1000
-        if os.name == "nt":  # noqa: SIM108
-            # For Python <=3.10, the Windows timer resolution is typically limited to 15.6 ms by default.
-            generous_tolerance = 100
-        else:
-            # Most modern Linux kernels have a default timer resolution of 1 ms.
-            generous_tolerance = 20
-        assert delay_ms - generous_tolerance <= elapsed_time_ms < delay_ms + generous_tolerance
+    elapsed_time_ms = e2 - e1
+    assert isinstance(elapsed_time_ms, float)
+    # Using a generous tolerance, to avoid flaky tests:
+    # We only want to exercise the __sub__ method, this test is not meant
+    # to stress-test the CUDA driver or time.sleep().
+    delay_ms = delay_seconds * 1000
+    if os.name == "nt":  # noqa: SIM108
+        # For Python <=3.10, the Windows timer resolution is typically limited to 15.6 ms by default.
+        generous_tolerance = 100
     else:
-        with pytest.raises(RuntimeError) as e:
-            elapsed_time_ms = e2 - e1
-        msg = str(e)
-        assert "disabled by default" in msg
-        assert "CUDA_ERROR_INVALID_HANDLE" in msg
+        # Most modern Linux kernels have a default timer resolution of 1 ms.
+        generous_tolerance = 20
+    assert delay_ms - generous_tolerance <= elapsed_time_ms < delay_ms + generous_tolerance


 def test_is_sync_busy_waited(init_cuda):
@@ -80,3 +73,100 @@ def test_is_done(init_cuda):
     # Without a sync, the captured work might not have yet completed
     # Therefore this check should never raise an exception
     assert event.is_done in (True, False)
+
+
+def test_error_timing_disabled():
+    device = Device()
+    device.set_current()
+    enabled = EventOptions(enable_timing=True)
+    disabled = EventOptions(enable_timing=False)
+    stream = device.create_stream()
+
+    event1 = stream.record(options=enabled)
+    event2 = stream.record(options=disabled)
+    assert not event1.is_timing_disabled
+    assert event2.is_timing_disabled
+    stream.sync()
+    with pytest.raises(RuntimeError, match="^Both Events must be created with timing enabled"):
+        event2 - event1
+
+    event1 = stream.record(options=disabled)
+    event2 = stream.record(options=disabled)
+    stream.sync()
+    with pytest.raises(RuntimeError, match="^Both Events must be created with timing enabled"):
+        event2 - event1
+
+
+def test_error_timing_recorded():
+    device = Device()
+    device.set_current()
+    enabled = EventOptions(enable_timing=True)
+    stream = device.create_stream()
+
+    event1 = stream.record(options=enabled)
+    event2 = device.create_event(options=enabled)
+    event3 = device.create_event(options=enabled)
+
+    stream.sync()
+    with pytest.raises(RuntimeError, match="^Both Events must be recorded"):
+        event2 - event1
+    with pytest.raises(RuntimeError, match="^Both Events must be recorded"):
+        event1 - event2
+    with pytest.raises(RuntimeError, match="^Both Events must be recorded"):
+        event3 - event2
+
+
+# TODO: improve this once path finder can find headers
+@pytest.mark.skipif(os.environ.get("CUDA_PATH") is None, reason="need libcu++ header")
+@pytest.mark.skipif(tuple(int(i) for i in np.__version__.split(".")[:2]) < (2, 1), reason="need numpy 2.1.0+")
+def test_error_timing_incomplete():
+    device = Device()
+    device.set_current()
+
+    # This kernel is designed to busy loop until a signal is received
+    code = """
+#include <cuda/atomic>
+
+extern "C"
+__global__ void wait(int* val) {
+    cuda::atomic_ref<int, cuda::thread_scope_system> signal{*val};
+    while (true) {
+        if (signal.load(cuda::memory_order_relaxed)) {
+            break;
+        }
+    }
+}
+"""
+
+    arch = "".join(f"{i}" for i in device.compute_capability)
+    program_options = ProgramOptions(
+        std="c++17",
+        arch=f"sm_{arch}",
+        include_path=str(pathlib.Path(os.environ["CUDA_PATH"]) / pathlib.Path("include")),
+    )
+    prog = Program(code, code_type="c++", options=program_options)
+    mod = prog.compile(target_type="cubin")
+    ker = mod.get_kernel("wait")
+
+    mr = _DefaultPinnedMemorySource()
+    b = mr.allocate(4)
+    arr = np.from_dlpack(b).view(np.int32)
+    arr[0] = 0
+
+    config = LaunchConfig(grid=1, block=1)
+    ker_args = (arr.ctypes.data,)
+
+    enabled = EventOptions(enable_timing=True)
+    stream = device.create_stream()
+
+    event1 = stream.record(options=enabled)
+    launch(stream, config, ker, *ker_args)
+    event3 = stream.record(options=enabled)
+
+    # event3 will never complete because the stream is waiting on wait() to complete
+    with pytest.raises(RuntimeError, match="^One or both events have not completed."):
+        event3 - event1
+
+    arr[0] = 1
+    event3.sync()
+    event3 - event1  # this should work