intel · hai1337 · Mar 3, 2025 · Mar 8, 2025
diff --git a/.github/workflows/linting.yml b/.github/workflows/linting.yml
@@ -1,3 +1,6 @@
+# Copyright (C) 2025 Intel Corporation
+# SPDX-License-Identifier: GPL-3.0-or-later
+
 name: Linting
 
 on:
@@ -15,7 +18,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        tool: ['isort', 'black', 'pyupgrade', 'flake8', 'bandit', 'gitlint', 'mypy', 'interrogate']
+        tool: ['isort', 'black', 'pyupgrade', 'flake8', 'bandit', 'gitlint', 'mypy', 'interrogate', 'verify-spdx-headers']
     steps:
       - name: Harden Runner
         uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0
@@ -34,7 +37,7 @@ jobs:
           python -m pip install --upgrade pre-commit
           pre-commit install
       - name: Run ${{ matrix.tool }} using pre-commit
-        if: ${{ matrix.tool != 'gitlint' }}
+        if: ${{ matrix.tool != 'gitlint' && matrix.tool != 'verify-spdx-headers'  }}
         run: |
           pre-commit run ${{ matrix.tool }} --all-files
       - name: Run gitlint
@@ -44,4 +47,16 @@ jobs:
         run: |
           python -m pip install --upgrade gitlint
           echo "$TITLE" | gitlint
-
+      - name: Run verify-spdx-headers
+        if: ${{ matrix.tool == 'verify-spdx-headers' }}
+        env:  # JSON-encoded lists read by verify-spdx-headers.py
+          INPUT_LICENSES: '["Apache-2.0", "GPL-3.0-or-later"]'
+          IGNORE_DIRS: '[".github", "fuzz", "test"]'
+          COPYRIGHT_HOLDERS: '["Intel Corporation"]'
+        run: |
+          for file in $(git diff --name-only ${{ github.event.pull_request.base.sha || github.event.before }} ${{ github.sha }}); do
+            if [[ -f "$file" ]]; then  # paths deleted by the change cannot be checked
+              echo "Checking $file"
+              python .github/workflows/verify-spdx-headers.py "$file"
+            fi
+          done  # diff base: PR base commit on pull_request events, previous head on push events
diff --git a/.github/workflows/verify-spdx-headers.py b/.github/workflows/verify-spdx-headers.py
@@ -0,0 +1,355 @@
+#!/usr/bin/env python3
+# Copyright (C) 2025 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+# This script has been adapted from https://github.com/enarx/spdx
+
+import json
+import os
+import re
+import sys
+from typing import Generator
+
+SLUG = re.compile("[- a-zA-Z0-9.]+")  # letters, digits, space, "-" and "." only
+SPDX = re.compile(rf"SPDX-License-Identifier:\s+({SLUG.pattern})")  # group 1: license
+CRIGHT = re.compile(r"Copyright\s+(?:\([Cc]\)\s+)?[0-9-]+\s+([a-zA-Z0-9\s,/&]+).?")  # "(C)" optional; group 1: holder
+
+
+class Language:
+    """
+    Comment syntax of one file type, used to find SPDX and copyright header lines.
+
+    Attributes:
+        __shebang (bool): Whether files of this type may start with a shebang line.
+            It is stored but not consulted when matching headers.
+        __match_license (list): Compiled patterns, one per comment style, that match
+            an SPDX-License-Identifier line and capture the license.
+        __match_copyright (list): Compiled patterns, one per comment style, that match
+            a copyright line and capture the copyright holder.
+    """
+
+    def __init__(self, *comments: str, shebang: bool = False) -> None:
+        """
+        Compile the header patterns for every given comment style.
+
+        Each pattern requires the SPDX or copyright text to be the only content of
+        its comment line.
+
+        Args:
+            *comments: Comment delimiters written as regex fragments. A plain string
+                opens a line comment; a 2-tuple is an (opener, closer) pair.
+            shebang (bool): True if the file type may carry a shebang line.
+
+        Raises:
+            SystemExit: If the shebang argument is not a boolean.
+        """
+        if not isinstance(shebang, bool):
+            print("The shebang argument must be a boolean value!")
+            sys.exit(1)
+
+        self.__shebang = shebang
+        self.__match_license = []
+        self.__match_copyright = []
+
+        for comment in comments:
+            (init, fini) = (comment, "")
+            if isinstance(comment, tuple):
+                (init, fini) = comment
+
+            pattern = rf"^{init}\s*{SPDX.pattern}\s*{fini}\s*$"
+            self.__match_license.append(re.compile(pattern))
+
+            pattern = rf"^{init}\s*{CRIGHT.pattern}\s*{fini}\s*$"
+            self.__match_copyright.append(re.compile(pattern))
+
+    def license_copyright(self, path: str) -> list:
+        """
+        Read the license and copyright holder from the header lines of a file.
+
+        The file is decoded as UTF-8 with undecodable bytes replaced, so the result
+        does not depend on the platform's default encoding. Lines are read lazily
+        and reading stops as soon as both values are known.
+
+        Args:
+            path (str): The file to read.
+
+        Returns:
+            list: [license, copyright holder]; an element is None if no line matched.
+        """
+        res = [None, None]
+        matchers = (self.__match_license, self.__match_copyright)
+
+        with open(path, encoding="utf-8", errors="replace") as f:
+            for line in f:
+                for slot, patterns in enumerate(matchers):
+                    # Keep the first hit; later lines must not overwrite it.
+                    if res[slot] is None:
+                        res[slot] = self._first_group(patterns, line)
+                # Both values found: no need to read the rest of the file.
+                if None not in res:
+                    break
+
+        return res
+
+    @staticmethod
+    def _first_group(patterns: list, line: str):
+        """Return the stripped first capture of the first matching pattern, or None."""
+        for pattern in patterns:
+            match = pattern.match(line)
+            if match:
+                return match.group(1).strip()
+        return None
+
+
+class Index:
+    """
+    Maps files to their Language and extracts SPDX header data from them.
+
+    Attributes:
+        INTERPRETERS (dict): A dictionary mapping interpreter names to language names.
+        EXTENSIONS (dict): A dictionary mapping file extensions to language names.
+        __languages (dict): A dictionary mapping language names to Language objects.
+        __ignore_dirs (list): A list of directory names to skip during scanning.
+
+    Methods:
+        determinge_language(path):
+            Determines the Language of a file based on its extension or shebang.
+        scan_file(file):
+            Scans a single file for its SPDX header.
+        scan_directory(root):
+            Recursively scans a directory for files and their SPDX headers.
+    """
+
+    INTERPRETERS = {
+        "python3": "python",
+        "python2": "python",
+        "python": "python",
+        "ruby": "ruby",
+        "tsm": "typescript",
+        "sh": "shell",
+    }
+
+    EXTENSIONS = {
+        ".py": "python",
+        ".proto": "protobuf",
+        ".rs": "rust",
+        ".yml": "yaml",
+        ".yaml": "yaml",
+        ".json": "json",
+        ".toml": "toml",
+        ".md": "md",
+        ".rb": "ruby",
+        ".c": "c",
+        ".h": "c",
+        ".cpp": "c++",
+        ".hpp": "c++",
+        ".cc": "c++",
+        ".hh": "c++",
+        ".cu": "cuda-c",
+        ".cuh": "cuda-c",
+        ".td": "tablegen",
+        ".ts": "typescript",
+        ".sh": "shell",
+    }
+
+    def __init__(self, ignore: list = None) -> None:
+        """
+        Set up the comment styles of the supported languages and the ignore list.
+
+        Language names without an entry in __languages (the ones ".json", ".toml"
+        and ".md" map to) are treated as unrecognized by determinge_language.
+
+        Args:
+            ignore (list, optional): Extra directory names to skip when scanning a
+                directory; ".git" is always skipped. Defaults to None.
+        """
+
+        self.__languages = {
+            "python": Language("#+", shebang=True),
+            "yaml": Language("#+"),
+            "ruby": Language("#+", shebang=True),
+            "c": Language("//+", ("/\\*", "\\*/")),
+            "c++": Language("//+", ("/\\*", "\\*/")),
+            "cuda-c": Language("//+", ("/\\*", "\\*/")),
+            "rust": Language("//+", "//!", ("/\\*", "\\*/")),
+            "protobuf": Language("//+", "//!", ("/\\*", "\\*/")),
+            "tablegen": Language("//+"),
+            "typescript": Language("//+", ("/\\*", "\\*/"), shebang=True),
+            "shell": Language("#+", shebang=True),
+        }
+        self.__ignore_dirs = [".git"]
+
+        if ignore:
+            self.__ignore_dirs.extend(ignore)
+
+    def determinge_language(self, path: str) -> Language:
+        """
+        Determine the Language of a file from its extension or, failing that, its shebang.
+
+        If the extension is not in EXTENSIONS, the first line is inspected: when it is
+        a shebang, the interpreter's base name (skipping an "env" launcher) is looked
+        up in INTERPRETERS. Files without a shebang are unrecognized.
+
+        Args:
+            path (str): The file path to check.
+
+        Returns:
+            Language or None: The Language object if recognized, otherwise None.
+        """
+        name = self.EXTENSIONS.get(os.path.splitext(path)[1])
+
+        if name is None:
+            interpreter = None
+            with open(path, "rb") as f:
+                first_line = f.readline(256)
+            if first_line.startswith(b"#!"):
+                words = first_line[2:].decode("utf-8", errors="replace").split()
+                # "#!/usr/bin/env python3" names the interpreter in the next word.
+                if words and os.path.basename(words[0]) == "env":
+                    words = words[1:]
+                if words:
+                    interpreter = os.path.basename(words[0])
+            name = self.INTERPRETERS.get(interpreter)
+
+        return self.__languages.get(name)
+
+    def scan_file(self, file: str) -> tuple:
+        """
+        Determine the language of a file and parse its SPDX header.
+
+        Args:
+            file (str): The path to the file to be scanned.
+
+        Returns:
+            tuple: The license and copyright holder if the language is recognized,
+                   otherwise (None, None).
+        """
+        language = self.determinge_language(file)
+
+        if language is None:
+            return None, None
+
+        # Parse the SPDX header for the language.
+        return language.license_copyright(file)
+
+    def scan_directory(self, root: str) -> Generator[tuple, None, None]:
+        """
+        Walk a directory tree and yield the SPDX header data of every recognized file.
+
+        Notes:
+            - Directories named in the ignore list are not descended into.
+            - Symlinks, empty files and files with unrecognized languages are skipped.
+
+        Args:
+            root (str): The root directory to scan.
+
+        Yields:
+            tuple: A tuple containing the file path and the SPDX license information.
+        """
+        for root, dirs, files in os.walk(root):
+            # Prune in place so os.walk does not descend into ignored directories.
+            dirs[:] = [d for d in dirs if d not in self.__ignore_dirs]
+
+            for file in files:
+                path = os.path.join(root, file)
+
+                # If the file is a symlink, don't bother
+                if os.path.islink(path):
+                    continue
+                # If the file is empty skip.
+                if os.path.getsize(path) == 0:
+                    continue
+                # Find the language of the file.
+                language = self.determinge_language(path)
+                if language is None:
+                    continue
+
+                # Parse the SPDX header for the language.
+                yield (path, language.license_copyright(path))
+
+
+def is_license_valid(license: str, valid_licenses: list[str], path: str) -> bool:
+    """
+    Report whether the license found in a file is one of the accepted licenses.
+
+    Args:
+        license (str): The license read from the file, or None if none was found.
+        valid_licenses (list): The accepted licenses.
+        path (str): The file the license was read from; used only in messages.
+
+    Returns:
+        bool: True if the license is accepted; otherwise prints why and returns False.
+    """
+    if license in valid_licenses:
+        return True
+    if license is None:
+        print(f"NO SPDX {path}")
+    else:
+        print(f"{license:16} {path} not in {valid_licenses}")
+    return False
+
+
+def is_copyright_valid(copyright: str, valid_holders: list[str], path: str) -> bool:
+    """
+    Report whether the copyright holder found in a file is an accepted holder.
+
+    Args:
+        copyright (str): The holder read from the file, or None if none was found.
+        valid_holders (list): The accepted holders; empty or None disables the check.
+        path (str): The file the holder was read from; used only in messages.
+
+    Returns:
+        bool: True if the holder is accepted or the check is disabled; otherwise
+            prints why and returns False.
+    """
+
+    # Without a list of accepted holders every file passes.
+    if not valid_holders or copyright in valid_holders:
+        return True
+    if copyright is None:
+        message = f"No Copyright found: {path}"
+    else:
+        message = (
+            f"Unexpected Copyright holder: {copyright:16} {path} not in {valid_holders}"
+        )
+    print(message)
+    return False
+
+
+if __name__ == "__main__":
+    # Settings arrive as JSON-encoded lists in environment variables; an unset
+    # or empty variable is treated as an empty list.
+    ignore = json.loads(os.getenv("IGNORE_DIRS") or "[]")
+    licenses = json.loads(os.getenv("INPUT_LICENSES") or "[]")
+    copyright_holders = json.loads(os.getenv("COPYRIGHT_HOLDERS") or "[]")
+
+    # Without an allow-list no file could pass, so report the misconfiguration
+    # before scanning anything.
+    if not licenses:
+        print("INPUT_LICENSES must be a non-empty JSON list of licenses!")
+        raise SystemExit(1)
+    for license in licenses:
+        # fullmatch: the whole identifier must be a slug, not merely a prefix.
+        if not SLUG.fullmatch(license):
+            print("Invalid license '%s'!" % license)
+            raise SystemExit(1)
+
+    valid = True
+
+    index = Index(ignore=ignore)
+    # The last command-line argument, if any, is the file or directory to check.
+    path_to_scan = sys.argv[-1] if sys.argv[1:] else "."
+    if os.path.isdir(path_to_scan):
+        for path, (license, copyright) in index.scan_directory(path_to_scan):
+            valid &= is_license_valid(license, licenses, path)
+            valid &= is_copyright_valid(copyright, copyright_holders, path)
+    elif os.path.isfile(path_to_scan):
+        license, copyright = index.scan_file(path_to_scan)
+        valid &= is_license_valid(license, licenses, path_to_scan)
+        valid &= is_copyright_valid(copyright, copyright_holders, path_to_scan)
+    else:
+        # A path that does not exist must not be reported as compliant.
+        print(f"Path not found: {path_to_scan}")
+        valid = False
+
+    sys.exit(0 if valid else 1)