Skip to content

Commit 90f9269

Browse files
carterboxleofang
andauthored
CI: Install latest compute-sanitizer separately from CTK (#594)
* CI: Install latest compute-sanitizer separately from CTK * Use mkdir -p to reuse CUDA_PATH * BUG: Always start with clean CUDA_PATH when building cache * REF: Setup CUDA component cache in TMP directory Prevent collision with components already installed to the CUDA_PATH * DEV: Skip non python-3.12 for debugging * fix * Revert "Merge pull request #593 from carterbox/dching/latest-sanitizer" This reverts commit bd770e1, reversing changes made to 19df0d9. * use guess_latest.sh to get the latest CTK ver * fix cache restoration * fix cache restoration * guess_latest needs wget * dunno what I was thinking * not sure if this would work on git for windows... * fix * give rsync a shot * debug * trailing slash, it's always the trailing slash * for some reason --strip-components=1 does not work... * the test runners do not have rsync * restore full matrix * move * avoid rsync... * fix * add inline comments --------- Co-authored-by: Leo Fang <leof@nvidia.com> Co-authored-by: Leo Fang <leo80042@gmail.com>
1 parent bd770e1 commit 90f9269

File tree

4 files changed

+95
-36
lines changed

4 files changed

+95
-36
lines changed

.github/actions/fetch_ctk/action.yml

+29-22
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ inputs:
1717
description: "A list of the CTK components to install as a comma-separated list. e.g. 'cuda_nvcc,cuda_nvrtc,cuda_cudart'"
1818
required: false
1919
type: string
20-
default: "cuda_nvcc,cuda_cudart,cuda_nvrtc,cuda_profiler_api,cuda_cccl,cuda_sanitizer_api,libnvjitlink"
20+
default: "cuda_nvcc,cuda_cudart,cuda_nvrtc,cuda_profiler_api,cuda_cccl,libnvjitlink"
2121

2222
runs:
2323
using: composite
@@ -50,38 +50,40 @@ runs:
5050
if: ${{ steps.ctk-get-cache.outputs.cache-hit != 'true' }}
5151
shell: bash --noprofile --norc -xeuo pipefail {0}
5252
run: |
53-
CUDA_PATH="./cuda_toolkit"
54-
mkdir $CUDA_PATH
53+
# Everything under this folder is packed and stored in the GitHub Cache space,
54+
# and unpacked after retrieving from the cache.
55+
CACHE_TMP_DIR="./cache_tmp_dir"
56+
rm -rf $CACHE_TMP_DIR
57+
mkdir $CACHE_TMP_DIR
5558
5659
# The binary archives (redist) are guaranteed to be updated as part of the release posting.
5760
CTK_BASE_URL="https://developer.download.nvidia.com/compute/cuda/redist/"
61+
CTK_JSON_URL="$CTK_BASE_URL/redistrib_${{ inputs.cuda-version }}.json"
5862
if [[ "${{ inputs.host-platform }}" == linux* ]]; then
5963
if [[ "${{ inputs.host-platform }}" == "linux-64" ]]; then
6064
CTK_SUBDIR="linux-x86_64"
6165
elif [[ "${{ inputs.host-platform }}" == "linux-aarch64" ]]; then
6266
CTK_SUBDIR="linux-sbsa"
6367
fi
6468
function extract() {
65-
tar -xvf $1 -C $CUDA_PATH --strip-components=1
69+
tar -xvf $1 -C $CACHE_TMP_DIR --strip-components=1
6670
}
6771
elif [[ "${{ inputs.host-platform }}" == "win-64" ]]; then
6872
CTK_SUBDIR="windows-x86_64"
6973
function extract() {
7074
_TEMP_DIR_=$(mktemp -d)
7175
unzip $1 -d $_TEMP_DIR_
72-
cp -r $_TEMP_DIR_/*/* $CUDA_PATH
76+
cp -r $_TEMP_DIR_/*/* $CACHE_TMP_DIR
7377
rm -rf $_TEMP_DIR_
7478
}
7579
fi
76-
7780
function populate_cuda_path() {
7881
# take the component name as an argument
7982
function download() {
8083
curl -kLSs $1 -o $2
8184
}
82-
local CTK_COMPONENT=$1
83-
local CTK_VERSION=$2
84-
CTK_COMPONENT_REL_PATH="$(curl -s ${CTK_BASE_URL}/redistrib_${CTK_VERSION}.json |
85+
CTK_COMPONENT=$1
86+
CTK_COMPONENT_REL_PATH="$(curl -s $CTK_JSON_URL |
8587
python -c "import sys, json; print(json.load(sys.stdin)['${CTK_COMPONENT}']['${CTK_SUBDIR}']['relative_path'])")"
8688
CTK_COMPONENT_URL="${CTK_BASE_URL}/${CTK_COMPONENT_REL_PATH}"
8789
CTK_COMPONENT_COMPONENT_FILENAME="$(basename $CTK_COMPONENT_REL_PATH)"
@@ -98,23 +100,23 @@ runs:
98100
CTK_CACHE_COMPONENTS="${CTK_CACHE_COMPONENTS//,,/,}"
99101
# Get headers and shared libraries in place
100102
for item in $(echo $CTK_CACHE_COMPONENTS | tr ',' ' '); do
101-
ctk_version="${{ inputs.cuda-version }}"
102-
if [[ "$item" == "cuda_sanitizer_api" ]]; then
103-
# Always use latest CTK for cuda_sanitizer_api
104-
# FIXME: Automatically track latest CTK version
105-
CUDA_MAJOR="$(cut -d '.' -f 1 <<< ${{ inputs.cuda-version }})"
106-
if [[ "$CUDA_MAJOR" == "12" ]]; then
107-
# TODO: Automatically track latest CTK minor version
108-
ctk_version="12.8.0"
109-
fi
110-
fi
111-
populate_cuda_path "$item" "$ctk_version"
103+
populate_cuda_path "$item"
112104
done
113-
ls -l $CUDA_PATH
105+
ls -l $CACHE_TMP_DIR
114106
115107
# Prepare the cache
116108
# Note: try to escape | and > ...
117-
tar -czvf ${CTK_CACHE_FILENAME} ${CUDA_PATH}
109+
tar -czvf ${CTK_CACHE_FILENAME} ${CACHE_TMP_DIR}
110+
111+
# "Move" files from temp dir to CUDA_PATH
112+
CUDA_PATH="./cuda_toolkit"
113+
mkdir -p $CUDA_PATH
114+
# Unfortunately we cannot use "rsync -av $CACHE_TMP_DIR/ $CUDA_PATH" because
115+
# not all runners have rsync pre-installed (or even installable, such as
116+
# Git Bash). We do it in the dumb way.
117+
cp -r $CACHE_TMP_DIR/* $CUDA_PATH
118+
rm -rf $CACHE_TMP_DIR
119+
ls -l $CUDA_PATH
118120
119121
- name: Upload CTK cache
120122
if: ${{ always() &&
@@ -129,8 +131,13 @@ runs:
129131
shell: bash --noprofile --norc -xeuo pipefail {0}
130132
run: |
131133
ls -l
134+
CACHE_TMP_DIR="./cache_tmp_dir"
132135
CUDA_PATH="./cuda_toolkit"
136+
mkdir -p $CUDA_PATH
133137
tar -xzvf $CTK_CACHE_FILENAME
138+
# Can't use rsync here, see above
139+
cp -r $CACHE_TMP_DIR/* $CUDA_PATH
140+
rm -rf $CACHE_TMP_DIR $CTK_CACHE_FILENAME
134141
ls -l $CUDA_PATH
135142
if [ ! -d "$CUDA_PATH/include" ]; then
136143
exit 1

.github/workflows/guess_latest.sh

+33
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
#!/bin/bash
2+
# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
3+
#
4+
# SPDX-License-Identifier: Apache-2.0
5+
6+
# URL to search
7+
URL="https://developer.download.nvidia.com/compute/cuda/redist/"
8+
9+
# Fetch the directory listing and extract the latest version number
10+
#######################################
# Print the newest CTK redist version advertised in the directory listing.
#
# The listing page references manifests named redistrib_X.Y.Z.json; every
# X.Y.Z found is collected and the highest one (version-aware sort) is printed.
#
# Globals:   URL (read) - address of the redist directory listing
# Arguments: none
# Outputs:   the latest version (e.g. "12.8.0") on stdout; diagnostics on
#            stderr only, so callers capturing stdout never receive an error
#            message in place of a version
# Returns:   0 on success, 1 if the download fails or no manifest is found
#######################################
get_latest_version() {
  # Declare separately from the assignments: "local var=$(cmd)" would report
  # the status of "local" and hide a failing command.
  local html_content versions latest_version

  # Get the HTML content of the page
  if ! html_content=$(wget -q -O - "$URL"); then
    echo "Failed to fetch the directory listing from $URL" >&2
    return 1
  fi

  # Extract the X.Y.Z part of every redistrib_X.Y.Z.json reference.
  # grep exits non-zero when nothing matches; the emptiness check below
  # handles that case, so the pipeline status is deliberately ignored.
  versions=$(printf '%s\n' "$html_content" \
    | grep -oE 'redistrib_[0-9]+\.[0-9]+\.[0-9]+\.json' \
    | sed -E 's/^redistrib_(.*)\.json$/\1/') || true

  if [ -z "$versions" ]; then
    echo "No files matching the pattern were found." >&2
    return 1
  fi

  # Version-aware sort so that e.g. 12.10.0 ranks above 12.8.0
  latest_version=$(printf '%s\n' "$versions" | sort -V | tail -n 1)
  echo "$latest_version"
}
31+
# Resolve the latest version and print it. Exit non-zero when the lookup
# fails so that callers can detect the failure instead of silently consuming
# whatever text ended up in the variable.
latest_version=$(get_latest_version) || exit 1
echo "$latest_version"

.github/workflows/install_gpu_driver.ps1

+4-2
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,6 @@
1-
#Requires -RunAsAdministrator
1+
# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
2+
#
3+
# SPDX-License-Identifier: Apache-2.0
24

35
# Install the driver
46
function Install-Driver {
@@ -23,7 +25,7 @@ function Install-Driver {
2325
$ProgressPreference = $ProgressPreference_tmp
2426
Write-Output 'Download complete!'
2527

26-
# Install the file with the specified path from earlier as well as the RunAs admin option
28+
# Install the file with the specified path from earlier
2729
Write-Output 'Running the driver installer...'
2830
Start-Process -FilePath $file_dir -ArgumentList $install_args -Wait
2931
Write-Output 'Done!'

.github/workflows/test-wheel-linux.yml

+29-12
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,14 @@ jobs:
5252
with:
5353
fetch-depth: 0
5454

55+
- name: Install dependencies
56+
uses: ./.github/actions/install_unix_deps
57+
continue-on-error: false
58+
with:
59+
# gcc for Cython tests, jq/wget for artifact fetching
60+
dependencies: "build-essential jq wget"
61+
dependent_exes: "gcc jq wget"
62+
5563
- name: Set environment variables
5664
run: |
5765
PYTHON_VERSION_FORMATTED=$(echo '${{ inputs.python-version }}' | tr -d '.')
@@ -78,6 +86,17 @@ jobs:
7886
fi
7987
fi
8088
89+
# We don't test compute-sanitizer on CTK<12 because backporting fixes is too much effort
90+
# We only test compute-sanitizer on python 3.12 arbitrarily; we don't need to use sanitizer on the entire matrix
91+
# Only local ctk installs have compute-sanitizer; there is no wheel for it
92+
if [[ "${{ inputs.python-version }}" == "3.12" && "${{ inputs.cuda-version }}" != "11.8.0" && "${{ inputs.local-ctk }}" == 1 ]]; then
93+
SETUP_SANITIZER=1
94+
echo "LATEST_CUDA_VERSION=$(bash .github/workflows/guess_latest.sh)" >> $GITHUB_ENV
95+
else
96+
SETUP_SANITIZER=0
97+
fi
98+
echo "SETUP_SANITIZER=${SETUP_SANITIZER}" >> $GITHUB_ENV
99+
81100
# make outputs from the previous job as env vars
82101
CUDA_CORE_ARTIFACT_BASENAME="cuda-core-python${PYTHON_VERSION_FORMATTED}-${{ inputs.host-platform }}"
83102
echo "PYTHON_VERSION_FORMATTED=${PYTHON_VERSION_FORMATTED}" >> $GITHUB_ENV
@@ -91,14 +110,6 @@ jobs:
91110
echo "SKIP_CUDA_BINDINGS_TEST=${SKIP_CUDA_BINDINGS_TEST}" >> $GITHUB_ENV
92111
echo "SKIP_CUDA_CORE_CYTHON_TEST=${SKIP_CUDA_CORE_CYTHON_TEST}" >> $GITHUB_ENV
93112
94-
- name: Install dependencies
95-
uses: ./.github/actions/install_unix_deps
96-
continue-on-error: false
97-
with:
98-
# gcc for Cython tests, jq/wget for artifact fetching
99-
dependencies: "build-essential jq wget"
100-
dependent_exes: "gcc jq wget"
101-
102113
- name: Download cuda-python build artifacts
103114
if: ${{ env.SKIP_CUDA_BINDINGS_TEST == '0'}}
104115
uses: actions/download-artifact@v4
@@ -184,12 +195,18 @@ jobs:
184195
host-platform: ${{ inputs.host-platform }}
185196
cuda-version: ${{ inputs.cuda-version }}
186197

198+
- name: Set up latest cuda_sanitizer_api
199+
if: ${{ env.SETUP_SANITIZER == '1' }}
200+
uses: ./.github/actions/fetch_ctk
201+
continue-on-error: false
202+
with:
203+
host-platform: ${{ inputs.host-platform }}
204+
cuda-version: ${{ env.LATEST_CUDA_VERSION }}
205+
cuda-components: "cuda_sanitizer_api"
206+
187207
- name: Set up compute-sanitizer
188208
run: |
189-
# We don't test compute-sanitizer on CTK<12 because backporting fixes is too much effort
190-
# We only test compute-sanitizer on python 3.12 arbitrarily; we don't need to use sanitizer on the entire matrix
191-
# Only local ctk installs have compute-sanitizer; there is not wheel for it
192-
if [[ "${{ inputs.python-version }}" == "3.12" && "${{ inputs.cuda-version }}" != "11.8.0" && "${{ inputs.local-ctk }}" == 1 ]]; then
209+
if [[ "${SETUP_SANITIZER}" == 1 ]]; then
193210
COMPUTE_SANITIZER="${CUDA_HOME}/bin/compute-sanitizer"
194211
COMPUTE_SANITIZER_VERSION=$(${COMPUTE_SANITIZER} --version | grep -Eo "[0-9]{4}\.[0-9]\.[0-9]" | sed -e 's/\.//g')
195212
SANITIZER_CMD="${COMPUTE_SANITIZER} --target-processes=all --launch-timeout=0 --tool=memcheck --error-exitcode=1"

0 commit comments

Comments
 (0)