Skip to content

ci: add verify-spdx-headers action #4912

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 18 additions & 3 deletions .github/workflows/linting.yml
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
# Copyright (C) 2025 Intel Corporation
# SPDX-License-Identifier: GPL-3.0-or-later

name: Linting

on:
Expand All @@ -15,7 +18,7 @@ jobs:
strategy:
fail-fast: false
matrix:
tool: ['isort', 'black', 'pyupgrade', 'flake8', 'bandit', 'gitlint', 'mypy', 'interrogate']
tool: ['isort', 'black', 'pyupgrade', 'flake8', 'bandit', 'gitlint', 'mypy', 'interrogate', 'verify-spdx-headers']
steps:
- name: Harden Runner
uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0
Expand All @@ -34,7 +37,7 @@ jobs:
python -m pip install --upgrade pre-commit
pre-commit install
- name: Run ${{ matrix.tool }} using pre-commit
if: ${{ matrix.tool != 'gitlint' }}
if: ${{ matrix.tool != 'gitlint' && matrix.tool != 'verify-spdx-headers' }}
run: |
pre-commit run ${{ matrix.tool }} --all-files
- name: Run gitlint
Expand All @@ -44,4 +47,16 @@ jobs:
run: |
python -m pip install --upgrade gitlint
echo "$TITLE" | gitlint

# Runs the SPDX header checker once per file reported by `git diff` for the
# triggering change. Only active for the 'verify-spdx-headers' matrix entry.
- name: Run verify-spdx-headers
if: ${{ matrix.tool == 'verify-spdx-headers' }}
# The checker script reads its configuration from these environment
# variables; each value is a JSON-encoded list.
# NOTE(review): the script only consults IGNORE_DIRS while walking a directory;
# when it is handed a single file (as below) that list is not applied - confirm
# whether files under these directories are meant to be checked here.
env:
INPUT_LICENSES: '["Apache-2.0", "GPL-3.0-or-later"]'
IGNORE_DIRS: '[".github", "fuzz", "test"]'
COPYRIGHT_HOLDERS: '["Intel Corporation"]'
# NOTE(review): `github.event.before` is presumably only populated for push
# events - confirm the diff range is valid for pull_request triggers and that
# the checkout fetches enough history for both commits.
# NOTE(review): the unquoted $(...) is word-split by the shell, so a changed
# path containing whitespace would be checked as several bogus names.
# The `-f` test skips paths that no longer exist (e.g. deleted files).
run: |
for file in $(git diff --name-only ${{ github.event.before }} ${{ github.sha }}); do
if [[ -f "$file" ]]; then
echo "Checking $file"
python .github/workflows/verify-spdx-headers.py "$file"
fi
done
355 changes: 355 additions & 0 deletions .github/workflows/verify-spdx-headers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,355 @@
#!/usr/bin/env python3
# Copyright (C) 2025 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

# This script has been adapted from https://github.com/enarx/spdx

import json
import os
import re
import sys
from typing import Generator

# Characters this script accepts in an SPDX license expression.
SLUG = re.compile(r"[- a-zA-Z0-9.]+")
# Captures the license expression that follows the SPDX tag.
SPDX = re.compile(rf"SPDX-License-Identifier:\s+({SLUG.pattern})")
# Captures the holder name after "Copyright [(C)] <years>". The "(C)" marker and
# the whitespace following it are optional *as a unit*, so both
# "Copyright (C) 2025 Foo" and "Copyright 2025 Foo" are recognised (the marker
# used to be optional while its surrounding spaces were not).
CRIGHT = re.compile(r"Copyright\s+(?:\([Cc]\)\s+)?[0-9-]+\s+([a-zA-Z0-9\s,/&]+).?")


class Language:
    """
    Comment syntax of one language, used to locate SPDX and copyright headers.

    Each comment style passed to the constructor is turned into two anchored
    regular expressions: one matching a whole-line SPDX license tag and one
    matching a whole-line copyright notice written in that comment style.

    Attributes:
        __shebang (bool): Whether files of this language may start with a shebang.
            It is recorded only; header extraction does not consult it.
        __match_license (list): Compiled patterns matching SPDX license lines.
        __match_copyright (list): Compiled patterns matching copyright lines.

    Methods:
        license_copyright(path):
            Return the license and copyright holder found in the given file.
    """

    def __init__(self, *comments: "str | tuple[str, str]", shebang: bool = False) -> None:
        """
        Compile the header matchers for the given comment styles.

        Args:
            *comments: Comment delimiters, written as regular-expression
                fragments. A plain string is a line-comment opener; a 2-tuple
                is an (opener, closer) pair for block comments.
            shebang (bool): Whether the file may contain a shebang line.
                Defaults to False.

        Raises:
            SystemExit: If the shebang argument is not a boolean.
        """
        if not isinstance(shebang, bool):
            print("The shebang argument must be a boolean value!")
            sys.exit(1)

        self.__shebang = shebang
        self.__match_license = []
        self.__match_copyright = []

        for comment in comments:
            # A bare string has no closing delimiter.
            init, fini = comment if isinstance(comment, tuple) else (comment, "")

            self.__match_license.append(
                re.compile(rf"^{init}\s*{SPDX.pattern}\s*{fini}\s*$")
            )
            self.__match_copyright.append(
                re.compile(rf"^{init}\s*{CRIGHT.pattern}\s*{fini}\s*$")
            )

    @staticmethod
    def _first_capture(matchers: list, line: str) -> "str | None":
        """Return the stripped first group of the first matcher that matches *line*."""
        for matcher in matchers:
            match = matcher.match(line)
            if match:
                return match.group(1).strip()
        return None

    def license_copyright(self, path: str) -> list:
        """
        Find the license and copyright holder declared in the given file.

        The file is read line by line and reading stops as soon as both values
        have been found. For each value the first matching line wins.

        Args:
            path (str): The file to read.

        Returns:
            list: A two-element list:
                - the SPDX license expression (str), or None if not found;
                - the copyright holder (str), or None if not found.
        """
        found = [None, None]

        # Decode explicitly as UTF-8 so the result does not depend on the
        # runner's locale, and replace undecodable bytes instead of aborting
        # the whole check on a single stray byte.
        with open(path, encoding="utf-8", errors="replace") as handle:
            for line in handle:
                if found[0] is None:
                    found[0] = self._first_capture(self.__match_license, line)
                if found[1] is None:
                    found[1] = self._first_capture(self.__match_copyright, line)
                if found[0] is not None and found[1] is not None:
                    break

        return found


class Index:
    """
    Map files to their language and extract SPDX headers from them.

    Attributes:
        INTERPRETERS (dict): Maps shebang interpreter names to language names.
        EXTENSIONS (dict): Maps file extensions to language names.
        __languages (dict): Maps language names to Language objects.
        __ignore_dirs (list): Directory names pruned while walking a tree.

    Methods:
        determine_language(path):
            Resolve the Language of a file from its extension or shebang.
        determinge_language(path):
            Backward-compatible alias of determine_language.
        scan_file(file):
            Extract the license and copyright holder of a single file.
        scan_directory(root):
            Walk a directory tree and yield the header data of every
            recognised file.
    """

    INTERPRETERS = {
        "python3": "python",
        "python2": "python",
        "python": "python",
        "ruby": "ruby",
        "tsm": "typescript",
        # Values must be keys of the language table built in __init__; the
        # shell entry there is called "shell".
        "sh": "shell",
        "bash": "shell",
    }

    # Extensions whose language name has no entry in the language table built
    # in __init__ (".json", ".toml", ".md") resolve to no language at all.
    EXTENSIONS = {
        ".py": "python",
        ".proto": "protobuf",
        ".rs": "rust",
        ".yml": "yaml",
        ".yaml": "yaml",
        ".json": "json",
        ".toml": "toml",
        ".md": "md",
        ".rb": "ruby",
        ".c": "c",
        ".h": "c",
        ".cpp": "c++",
        ".hpp": "c++",
        ".cc": "c++",
        ".hh": "c++",
        ".cu": "cuda-c",
        ".cuh": "cuda-c",
        ".td": "tablegen",
        ".ts": "typescript",
        ".sh": "shell",
    }

    def __init__(self, ignore: "list | None" = None) -> None:
        """
        Build the language table and the list of ignored directories.

        Args:
            ignore (list, optional): Directory names to skip in addition to
                ".git" when walking a tree. Defaults to None.
        """
        self.__languages = {
            "python": Language("#+", shebang=True),
            "yaml": Language("#+"),
            "ruby": Language("#+", shebang=True),
            "c": Language("//+", ("/\\*", "\\*/")),
            "c++": Language("//+", ("/\\*", "\\*/")),
            "cuda-c": Language("//+", ("/\\*", "\\*/")),
            "rust": Language("//+", "//!", ("/\\*", "\\*/")),
            "protobuf": Language("//+", "//!", ("/\\*", "\\*/")),
            "tablegen": Language("//+"),
            "typescript": Language("//+", ("/\\*", "\\*/"), shebang=True),
            "shell": Language("#+", shebang=True),
        }
        self.__ignore_dirs = [".git"]

        if ignore:
            self.__ignore_dirs.extend(ignore)

    @staticmethod
    def _shebang_interpreter(command: str) -> "str | None":
        """
        Return the interpreter name from the text that follows "#!".

        Handles a direct path ("/usr/bin/python3 -u" -> "python3") as well as
        the "env" indirection ("/usr/bin/env -S python3" -> "python3").
        """
        tokens = command.split()
        if not tokens:
            return None

        # Shebang paths are POSIX-style regardless of the host platform.
        program = tokens[0].rsplit("/", 1)[-1]
        if program != "env":
            return program

        for token in tokens[1:]:
            # Skip env's own options and VAR=value assignments.
            if token.startswith("-") or "=" in token:
                continue
            return token.rsplit("/", 1)[-1]
        return None

    def determine_language(self, path: str) -> "Language | None":
        """
        Resolve the Language of a file from its extension or its shebang.

        The extension is looked up in EXTENSIONS first. Only when the
        extension is unknown is the first line inspected: if it starts with
        "#!", the interpreter it names is looked up in INTERPRETERS.

        Args:
            path (str): The file path to check.

        Returns:
            Language or None: The matching Language object, or None when the
            file's language is not recognised or has no comment syntax
            registered.
        """
        name = self.EXTENSIONS.get(os.path.splitext(path)[1])

        if name is None:
            with open(path, "rb") as stream:
                # Peek at two bytes first so that arbitrary binary files are
                # not read any further than necessary.
                command = stream.readline() if stream.read(2) == b"#!" else None
            if command is not None:
                interpreter = self._shebang_interpreter(
                    command.decode("utf-8", errors="replace")
                )
                name = self.INTERPRETERS.get(interpreter)

        return self.__languages.get(name)

    # The method was originally published under this misspelled name; keep it
    # so existing callers continue to work.
    determinge_language = determine_language

    def scan_file(self, file: str) -> tuple:
        """
        Extract the license and copyright holder of a single file.

        Args:
            file (str): The path to the file to be scanned.

        Returns:
            tuple: (license, copyright holder). Either element is None when it
            was not found; both are None when the language is not recognised.
        """
        language = self.determine_language(file)

        if language is None:
            return None, None

        # Normalise to a tuple so both return paths have the same type.
        return tuple(language.license_copyright(file))

    def scan_directory(self, root: str) -> Generator[tuple, None, None]:
        """
        Walk a directory tree and yield the header data of recognised files.

        Notes:
            - Directories named in the ignore list are not descended into.
            - Symlinked files are ignored.
            - Empty files are skipped.
            - Files with unrecognised languages are skipped.

        Args:
            root (str): The root directory to scan.

        Yields:
            tuple: (file path, [license, copyright holder]) for each file.
        """
        for current, dirs, files in os.walk(root):
            # Prune in place so os.walk does not descend into ignored dirs.
            dirs[:] = [d for d in dirs if d not in self.__ignore_dirs]

            for file in files:
                path = os.path.join(current, file)

                if os.path.islink(path):
                    continue
                if os.path.getsize(path) == 0:
                    continue

                language = self.determine_language(path)
                if language is None:
                    continue

                yield (path, language.license_copyright(path))


def is_license_valid(license: str, valid_licenses: list[str], path: str) -> bool:
    """
    Report whether a file's license is one of the accepted licenses.

    A diagnostic line is printed for every rejected file.

    Args:
        license (str): The license found in the file; None when the file has
            no SPDX tag.
        valid_licenses (list): The accepted licenses.
        path (str): The file being checked; used in the diagnostic only.

    Returns:
        bool: True if the license is accepted, otherwise False.
    """
    if license in valid_licenses:
        return True

    # A missing tag gets its own message; anything else is an unexpected value.
    if license is None:
        report = f"NO SPDX {path}"
    else:
        report = f"{license:16} {path} not in {valid_licenses}"
    print(report)
    return False


def is_copyright_valid(copyright: str, valid_holders: list[str], path: str) -> bool:
    """
    Report whether a file's copyright holder is one of the accepted holders.

    When no accepted holders are configured the check is disabled and every
    file passes. A diagnostic line is printed for every rejected file.

    Args:
        copyright (str): The holder found in the file; None when the file has
            no copyright notice.
        valid_holders (list): The accepted copyright holders.
        path (str): The file being checked; used in the diagnostic only.

    Returns:
        bool: True if the holder is accepted or no holders are configured,
        False if the holder is missing or not accepted.
    """
    # The emptiness test comes first so an unset holder list is never searched.
    if not valid_holders or copyright in valid_holders:
        return True

    if copyright is None:
        print(f"No Copyright found: {path}")
    else:
        print(
            f"Unexpected Copyright holder: {copyright:16} {path} not in {valid_holders}"
        )
    return False


def _load_json_env(name: str):
    """Return the JSON-decoded value of environment variable *name*, or None if unset or empty."""
    raw = os.getenv(name)
    return json.loads(raw) if raw else None


def _in_ignored_dir(path: str, ignore_dirs: list) -> bool:
    """Return True if any directory component of *path* is named in *ignore_dirs*."""
    parents = os.path.normpath(path).split(os.sep)[:-1]
    return any(part in ignore_dirs for part in parents)


def main(argv: "list[str] | None" = None) -> int:
    """
    Check SPDX license and copyright headers and return a process exit code.

    Configuration is read from the environment, each value a JSON list:
        IGNORE_DIRS: directory names to skip.
        INPUT_LICENSES: accepted SPDX license expressions (required).
        COPYRIGHT_HOLDERS: accepted copyright holders (optional; the holder
            check is disabled when unset).

    The last command-line argument names the file or directory to check; the
    current directory is used when no argument is given.

    Args:
        argv: Command-line arguments without the program name. Defaults to
            sys.argv[1:].

    Returns:
        int: 0 if every checked file is valid, otherwise 1.
    """
    args = sys.argv[1:] if argv is None else argv

    ignore = _load_json_env("IGNORE_DIRS")
    licenses = _load_json_env("INPUT_LICENSES")
    copyright_holders = _load_json_env("COPYRIGHT_HOLDERS")

    # Without a license list the membership test further down cannot work;
    # fail with a clear message instead of a TypeError.
    if licenses is None:
        print("INPUT_LICENSES must be set to a JSON list of accepted licenses!")
        return 1
    for candidate in licenses:
        # fullmatch: the whole identifier must consist of allowed characters,
        # not merely start with one.
        if not SLUG.fullmatch(candidate):
            print(f"Invalid license '{candidate}'!")
            return 1

    valid = True
    index = Index(ignore=ignore)
    path_to_scan = args[-1] if args else "."

    if os.path.isdir(path_to_scan):
        for path, (found_license, found_holder) in index.scan_directory(path_to_scan):
            valid &= is_license_valid(found_license, licenses, path)
            valid &= is_copyright_valid(found_holder, copyright_holders, path)
    elif os.path.isfile(path_to_scan):
        # Apply the same exclusions a directory scan applies, so checking a
        # file on its own gives the same verdict as checking its directory.
        # Paths are expected to be relative to the scanned tree.
        if _in_ignored_dir(path_to_scan, [".git", *(ignore or [])]):
            print(f"Skipping {path_to_scan}: inside an ignored directory")
        elif os.path.islink(path_to_scan):
            print(f"Skipping {path_to_scan}: symlink")
        elif os.path.getsize(path_to_scan) == 0:
            print(f"Skipping {path_to_scan}: empty file")
        elif index.determinge_language(path_to_scan) is None:
            print(f"Skipping {path_to_scan}: unrecognised file type")
        else:
            found_license, found_holder = index.scan_file(path_to_scan)
            valid &= is_license_valid(found_license, licenses, path_to_scan)
            valid &= is_copyright_valid(found_holder, copyright_holders, path_to_scan)
    else:
        # A path that does not exist must not be reported as valid.
        print(f"Path not found: {path_to_scan}")
        valid = False

    return 0 if valid else 1


if __name__ == "__main__":
    sys.exit(main())
Loading