From 66dc9c4298f3f0c8753c1b1525929e21e5837319 Mon Sep 17 00:00:00 2001 From: jrmencha Date: Mon, 3 Mar 2025 12:23:24 -0700 Subject: [PATCH 1/2] ci: add verify-spdx-headers action Add SPDX header verification to the CI pipeline. This action will check all changed files for SPDX headers and verify that the license and copyright holders are correct. Co-authored-by: Thomas, Hailee Signed-off-by: Patel, Narendra Signed-off-by: Courier, Taylor fixes: #4219 --- .github/workflows/linting.yml | 21 +- .github/workflows/verify-spdx-headers.py | 351 +++++++++++++++++++++++ 2 files changed, 369 insertions(+), 3 deletions(-) create mode 100644 .github/workflows/verify-spdx-headers.py diff --git a/.github/workflows/linting.yml b/.github/workflows/linting.yml index da8a4a647d..1d0511e660 100644 --- a/.github/workflows/linting.yml +++ b/.github/workflows/linting.yml @@ -1,3 +1,6 @@ +# Copyright (C) 2025 Intel Corporation +# SPDX-License-Identifier: GPL-3.0-or-later + name: Linting on: @@ -15,7 +18,7 @@ jobs: strategy: fail-fast: false matrix: - tool: ['isort', 'black', 'pyupgrade', 'flake8', 'bandit', 'gitlint', 'mypy', 'interrogate'] + tool: ['isort', 'black', 'pyupgrade', 'flake8', 'bandit', 'gitlint', 'mypy', 'interrogate', 'verify-spdx-headers'] steps: - name: Harden Runner uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0 @@ -34,7 +37,7 @@ jobs: python -m pip install --upgrade pre-commit pre-commit install - name: Run ${{ matrix.tool }} using pre-commit - if: ${{ matrix.tool != 'gitlint' }} + if: ${{ matrix.tool != 'gitlint' && matrix.tool != 'verify-spdx-headers' }} run: | pre-commit run ${{ matrix.tool }} --all-files - name: Run gitlint @@ -44,4 +47,16 @@ jobs: run: | python -m pip install --upgrade gitlint echo "$TITLE" | gitlint - + - name: Run verify-spdx-headers + if: ${{ matrix.tool == 'verify-spdx-headers' }} + env: + INPUT_LICENSES: '["Apache-2.0", "GPL-3.0-or-later"]' + IGNORE_DIRS: '[".github", "fuzz", "test"]' + COPYRIGHT_HOLDERS: '["Intel Corporation"]' + run: | + for file in $(git diff --name-only ${{ github.event.before }} ${{ github.sha }}); do + if [[ -f "$file" ]]; then + echo "Checking $file" + python .github/workflows/verify-spdx-headers "$file" + fi + done diff --git a/.github/workflows/verify-spdx-headers.py b/.github/workflows/verify-spdx-headers.py new file mode 100644 index 0000000000..d17a148d00 --- /dev/null +++ b/.github/workflows/verify-spdx-headers.py @@ -0,0 +1,351 @@ +#!/usr/bin/env python3 +# Copyright (C) 2025 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +# This script has been adapted from https://github.com/enarx/spdx + +import json +import os +import re +import sys +from typing import Generator + +SLUG = re.compile("[- a-zA-Z0-9.]+") +SPDX = re.compile(rf"SPDX-License-Identifier:\s+({SLUG.pattern})") +CRIGHT = re.compile(r"Copyright (?:\([Cc]\))?\s+[0-9-]+\s+([a-zA-Z0-9\s,/&]+).?") + + +class Language: + """ + A class to represent a programming language and its comment styles for SPDX and copyright headers. + + Attributes: + __shebang (bool): Indicates if the language uses a shebang. + __match_license (list): List of compiled regex patterns to match SPDX license headers. + __match_copyright (list): List of compiled regex patterns to match copyright headers. + + Methods: + __init__(self, *comments, shebang=False): + Initializes the Language object with comment styles and optional shebang. + license_copyright(self, path): + Finds and returns the license and copyright from the SPDX header in the given file. + """ + + def __init__(self, *comments: str, shebang: bool = False) -> None: + """ + Initialize the VerifySpdxHeaders class. + + This method compiles regular expressions to match SPDX license identifiers and copyright notices based on the provided comment delimiters. + + Args: + *comments (str): Variable length argument list of comment delimiters. Each comment can be a string or a tuple of strings. + shebang (bool): Optional; If True, indicates that the file may contain a shebang line. Defaults to False. + """ + if not isinstance(shebang, bool): + print("The shebang argument must be a boolean value!") + sys.exit(1) + + self.__shebang = shebang + self.__match_license = [] + self.__match_copyright = [] + + for comment in comments: + (init, fini) = (comment, "") + if isinstance(comment, tuple): + (init, fini) = comment + + pattern = rf"^{init}\s*{SPDX.pattern}\s*{fini}\s*$" + self.__match_license.append(re.compile(pattern)) + + pattern = rf"^{init}\s*{CRIGHT.pattern}\s*{fini}\s*$" + self.__match_copyright.append(re.compile(pattern)) + + def license_copyright(self, path: str) -> list: + """ + Find the license and copyright information from the SPDX header in the given file. + + Args: + path (str): The file path to read and extract the SPDX header from. + + Returns: + list: A list containing two elements: + - The first element is the license information (str) if found, otherwise None. + - The second element is the copyright information (str) if found, otherwise None. + """ + res = [None, None] + matched_license = matched_copyright = False + + with open(path) as f: + lines = f.readlines() + for line in lines: + if matched_license and matched_copyright: + break + if not matched_license: + for matcher in self.__match_license: + match = matcher.match(line) + if match: + res[0] = match.group(1).strip() + matched_license = True + break + if not matched_copyright: + for matcher in self.__match_copyright: + match = matcher.match(line) + if match: + res[1] = match.group(1).strip() + matched_copyright = True + break + + return res + + +class Index: + """ + A class to represent an index for verifying SPDX headers in files. + + Attributes: + INTERPRETERS (dict): A dictionary mapping interpreter names to language names. + EXTENSIONS (dict): A dictionary mapping file extensions to language names. + __languages (dict): A dictionary mapping language names to Language objects. + __ignore_dirs (list): A list of directories to ignore during scanning. + + Methods: + __init__(ignore=None): + Initializes the Index with optional directories to ignore. + language(path): + Determines the language of a file based on its extension or shebang. + scan_file(file): + Scans a single file for its SPDX header. + scan(root): + Recursively scans a directory for files and their SPDX headers. + """ + + INTERPRETERS = { + "python3": "python", + "python2": "python", + "python": "python", + "ruby": "ruby", + "tsm": "typescript", + "sh": "sh", + } + + EXTENSIONS = { + ".py": "python", + ".proto": "protobuf", + ".rs": "rust", + ".yml": "yaml", + ".yaml": "yaml", + ".json": "json", + ".toml": "toml", + ".md": "md", + ".rb": "ruby", + ".c": "c", + ".h": "c", + ".cpp": "c++", + ".hpp": "c++", + ".cc": "c++", + ".hh": "c++", + ".cu": "cuda-c", + ".cuh": "cuda-c", + ".td": "tablegen", + ".ts": "typescript", + ".sh": "shell", + } + + def __init__(self, ignore: list = None) -> None: + """ + Initializes the VerifySpdxHeaders class. + + Args: + ignore (list, optional): A list of directories to ignore. Defaults to None. + + Attributes: + __languages (dict): A dictionary mapping file types to their respective Language objects. + __ignore_dirs (list): A list of directories to ignore during processing. + """ + + self.__languages = { + "python": Language("#+", shebang=True), + "yaml": Language("#+"), + "ruby": Language("#+", shebang=True), + "c": Language("//+", ("/\\*", "\\*/")), + "c++": Language("//+", ("/\\*", "\\*/")), + "cuda-c": Language("//+", ("/\\*", "\\*/")), + "rust": Language("//+", "//!", ("/\\*", "\\*/")), + "protobuf": Language("//+", "//!", ("/\\*", "\\*/")), + "tablegen": Language("//+"), + "typescript": Language("//+", ("/\\*", "\\*/"), shebang=True), + "shell": Language("#+", shebang=True), + } + self.__ignore_dirs = [".git"] + + if ignore: + self.__ignore_dirs.extend(ignore) + + def determinge_language(self, path: str) -> Language: + """ + Determines the programming language of a given file based on its extension or interpreter. + + The method first attempts to determine the language by the file extension using the EXTENSIONS dictionary. + If the extension is not recognized, it checks the file's shebang line (if present) to identify the interpreter + and then uses the INTERPRETERS dictionary to determine the language. + + Args: + path (str): The file path to check. + + Returns: + Language or None: The name of the programming language if recognized, otherwise None. + """ + name = self.EXTENSIONS.get(os.path.splitext(path)[1]) + + if name is None: + with open(path, "rb") as f: + if f.read(2) == bytearray(b"#!"): + # assume a text file and retry as text file + with open(path, encoding="utf-8") as t: + interpreter = t.readline().rstrip().rsplit(os.path.sep)[-1] + name = self.INTERPRETERS.get(interpreter) + + return self.__languages.get(name) + + def scan_file(self, file: str) -> tuple: + """ + Scans the given file to determine its programming language and parse its SPDX header. + + Args: + file (str): The path to the file to be scanned. + + Returns: + tuple: A tuple containing the license and copyright information if the language is recognized, + otherwise (None, None). + """ + language = self.determinge_language(file) + + if language is None: + return None, None + + # Parse the SPDX header for the language. + return language.license_copyright(file) + + def scan_directory(self, root: str) -> Generator[tuple, None, None]: + """ + Scans the given root directory and its subdirectories for files, ignoring specified directories, + and yields the path and SPDX license information for each file. + + Notes: + - Symlink files are ignored. + - Empty files are skipped. + - Files with unrecognized languages are skipped. + + Args: + root (str): The root directory to scan. + + Yields: + tuple: A tuple containing the file path and the SPDX license information. + """ + + for root, dirs, files in os.walk(root): + # Ignore the specified directories. + dirs[:] = [d for d in dirs if d not in self.__ignore_dirs] + + for file in files: + path = os.path.join(root, file) + + # If the file is a symlink, don't bother + if os.path.islink(path): + continue + # If the file is empty skip. + if os.path.getsize(path) == 0: + continue + # Find the language of the file. + language = self.determinge_language(path) + if language is None: + continue + + # Parse the SPDX header for the language. + yield (path, language.license_copyright(path)) + + +def is_license_valid(license: str, valid_licenses: list[str], path: str) -> bool: + """ + Checks if the given license is in the list of valid licenses. + + Args: + license (str): The license to check. + valid_licenses (list): A list of valid licenses. + path (str): The file path being checked. + + Returns: + bool: Returns True if the license is not valid, otherwise returns False. + """ + if license not in valid_licenses: + if license is None: + print(f"NO SPDX {path}") + else: + print(f"{license:16} {path} not in {valid_licenses}") + return True + return False + + +def is_copyright_valid(copyright: str, valid_holders: list[str], path: str) -> bool: + """ + Checks if the given copyright holder is in the list of valid holders. + + Args: + copyright (str): The copyright holder to check. + valid_holders (list): A list of valid copyright holders. + path (str): The file path being checked. + + Returns: + bool: Returns True if the copyright holder is valid or if there are no valid holders. + Returns False if the copyright holder is not valid or is None. + """ + + if not valid_holders: + return True + if copyright not in valid_holders: + if copyright is None: + print(f"No Copyright found: {path}") + else: + print( + f"Unexpected Copyright holder: {copyright:16} {path} not in {valid_holders}" + ) + return False + return True + + +if __name__ == "__main__": + # Get arguments through environment + ignore = os.getenv("IGNORE_DIRS") + licenses = os.getenv("INPUT_LICENSES") + copyright_holders = os.getenv("COPYRIGHT_HOLDERS") + + # Load and validate the arguments + if copyright_holders: + copyright_holders = json.loads(copyright_holders) + if ignore: + ignore = json.loads(ignore) + if licenses is None: + licenses = sys.argv[1:] + else: + licenses = json.loads(licenses) + for license in licenses: + if not SLUG.match(license): + print("Invalid license '%s'!" % license) + raise SystemExit(1) + + rv = 0 + + index = Index(ignore=ignore) + path_to_scan = "." + if sys.argv[1:]: + path_to_scan = sys.argv[1:][-1] + if os.path.isdir(path_to_scan): + for path, (license, copyright) in index.scan_directory(path_to_scan): + rv |= is_license_valid(license, licenses, path) + rv |= is_copyright_valid(copyright, copyright_holders, path) + elif os.path.isfile(path_to_scan): + license, copyright = index.scan_file(path_to_scan) + rv |= is_license_valid(license, licenses, path_to_scan) + rv |= is_copyright_valid(copyright, copyright_holders, path_to_scan) + + raise SystemExit(rv) From 9e9637db591cf07ae346dd3a98b2636477af773f Mon Sep 17 00:00:00 2001 From: Jesus R Menchaca Date: Fri, 7 Mar 2025 19:54:56 -0500 Subject: [PATCH 2/2] ci: adding verify copyright and license to pre-commit --- .github/workflows/linting.yml | 2 +- .github/workflows/verify-spdx-headers.py | 28 ++++++++++++++---------- .pre-commit-config.yaml | 15 +++++++++++++ 3 files changed, 32 insertions(+), 13 deletions(-) diff --git a/.github/workflows/linting.yml b/.github/workflows/linting.yml index 1d0511e660..2b48fefa8e 100644 --- a/.github/workflows/linting.yml +++ b/.github/workflows/linting.yml @@ -57,6 +57,6 @@ jobs: for file in $(git diff --name-only ${{ github.event.before }} ${{ github.sha }}); do if [[ -f "$file" ]]; then echo "Checking $file" - python .github/workflows/verify-spdx-headers "$file" + python .github/workflows/verify-spdx-headers.py "$file" fi done diff --git a/.github/workflows/verify-spdx-headers.py b/.github/workflows/verify-spdx-headers.py index d17a148d00..bf2c50ca23 100644 --- a/.github/workflows/verify-spdx-headers.py +++ b/.github/workflows/verify-spdx-headers.py @@ -40,6 +40,9 @@ def __init__(self, *comments: str, shebang: bool = False) -> None: Args: *comments (str): Variable length argument list of comment delimiters. Each comment can be a string or a tuple of strings. shebang (bool): Optional; If True, indicates that the file may contain a shebang line. Defaults to False. + + Raises: + SystemExit: If the shebang argument is not a boolean. """ if not isinstance(shebang, bool): print("The shebang argument must be a boolean value!") @@ -275,15 +278,15 @@ def is_license_valid(license: str, valid_licenses: list[str], path: str) -> bool path (str): The file path being checked. Returns: - bool: Returns True if the license is not valid, otherwise returns False. + bool: Returns True if the license is valid, otherwise returns False. """ if license not in valid_licenses: if license is None: print(f"NO SPDX {path}") else: print(f"{license:16} {path} not in {valid_licenses}") - return True - return False + return False + return True def is_copyright_valid(copyright: str, valid_holders: list[str], path: str) -> bool: @@ -324,16 +327,15 @@ def is_copyright_valid(copyright: str, valid_holders: list[str], path: str) -> b copyright_holders = json.loads(copyright_holders) if ignore: ignore = json.loads(ignore) - if licenses is None: - licenses = sys.argv[1:] - else: + if licenses: licenses = json.loads(licenses) for license in licenses: if not SLUG.match(license): print("Invalid license '%s'!" % license) raise SystemExit(1) - rv = 0 + valid = True + rv = 1 index = Index(ignore=ignore) path_to_scan = "." @@ -341,11 +343,13 @@ def is_copyright_valid(copyright: str, valid_holders: list[str], path: str) -> b path_to_scan = sys.argv[1:][-1] if os.path.isdir(path_to_scan): for path, (license, copyright) in index.scan_directory(path_to_scan): - rv |= is_license_valid(license, licenses, path) - rv |= is_copyright_valid(copyright, copyright_holders, path) + valid &= is_license_valid(license, licenses, path) + valid &= is_copyright_valid(copyright, copyright_holders, path) elif os.path.isfile(path_to_scan): license, copyright = index.scan_file(path_to_scan) - rv |= is_license_valid(license, licenses, path_to_scan) - rv |= is_copyright_valid(copyright, copyright_holders, path_to_scan) + valid &= is_license_valid(license, licenses, path_to_scan) + valid &= is_copyright_valid(copyright, copyright_holders, path_to_scan) - raise SystemExit(rv) + if valid: + rv = 0 + exit(rv) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 9d371aedf8..8a82505190 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -97,3 +97,18 @@ repos: ] language: python types: [text] + + - repo: local + hooks: + - id: verify-spdx-headers + name: Verify SPDX Headers + entry: | + env INPUT_LICENSES='["Apache-2.0", "GPL-3.0-or-later"]' + env IGNORE_DIRS='[".git", "fuzz", "test"]' + env COPYRIGHT_HOLDERS='["Intel Corporation"]' + python .github/workflows/verify-spdx-headers.py + language: system + files: \.py$ + pass_filenames: true + stages: [pre-commit] + additional_dependencies: []