gh-112302: Add Software Bill-of-Materials (SBOM) tracking for dependencies (#112303)

This commit is contained in:
Seth Michael Larson 2023-12-07 10:01:58 -06:00 committed by GitHub
parent 2d76be251d
commit 21221c398f
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
7 changed files with 2499 additions and 1 deletions

4
.github/CODEOWNERS vendored
View file

@ -190,3 +190,7 @@ Doc/howto/clinic.rst @erlend-aasland
# WebAssembly
/Tools/wasm/ @brettcannon
# SBOM
/Misc/sbom.spdx.json @sethmlarson
/Tools/build/generate_sbom.py @sethmlarson

View file

@ -9,6 +9,7 @@ on:
paths:
- ".github/workflows/mypy.yml"
- "Lib/test/libregrtest/**"
- "Tools/build/generate_sbom.py"
- "Tools/cases_generator/**"
- "Tools/clinic/**"
- "Tools/peg_generator/**"
@ -34,6 +35,7 @@ jobs:
matrix:
target: [
"Lib/test/libregrtest",
"Tools/build/",
"Tools/cases_generator",
"Tools/clinic",
"Tools/peg_generator",

View file

@ -1359,7 +1359,7 @@ regen-unicodedata:
regen-all: regen-cases regen-typeslots \
regen-token regen-ast regen-keyword regen-sre regen-frozen \
regen-pegen-metaparser regen-pegen regen-test-frozenmain \
regen-test-levenshtein regen-global-objects
regen-test-levenshtein regen-global-objects regen-sbom
@echo
@echo "Note: make regen-stdlib-module-names, make regen-limited-abi, "
@echo "make regen-configure and make regen-unicodedata should be run manually"
@ -2651,6 +2651,10 @@ autoconf:
regen-configure:
$(srcdir)/Tools/build/regen-configure.sh
.PHONY: regen-sbom
regen-sbom:
$(PYTHON_FOR_REGEN) $(srcdir)/Tools/build/generate_sbom.py
# Create a tags file for vi
tags::
ctags -w $(srcdir)/Include/*.h $(srcdir)/Include/cpython/*.h $(srcdir)/Include/internal/*.h

View file

@ -0,0 +1,2 @@
Created a Software Bill-of-Materials document and tooling for tracking
dependencies.

2294
Misc/sbom.spdx.json Normal file

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,179 @@
"""Tool for generating Software Bill of Materials (SBOM) for Python's dependencies"""
import re
import hashlib
import json
import glob
import pathlib
import subprocess
import typing
# Before adding a new entry to this list, double check that
# the license expression is a valid SPDX license expression:
# See: https://spdx.org/licenses
ALLOWED_LICENSE_EXPRESSIONS = {
"MIT",
"CC0-1.0",
"Apache-2.0",
"BSD-2-Clause",
}
# Properties which are required for our purposes.
REQUIRED_PROPERTIES_PACKAGE = frozenset([
"SPDXID",
"name",
"versionInfo",
"downloadLocation",
"checksums",
"licenseConcluded",
"externalRefs",
"originator",
"primaryPackagePurpose",
])
class PackageFiles(typing.NamedTuple):
"""Structure for describing the files of a package"""
include: list[str]
exclude: list[str] | None = None
# SBOMS don't have a method to specify the sources of files
# so we need to do that external to the SBOM itself. Add new
# values to 'exclude' if we create new files within tracked
# directories that aren't sourced from third-party packages.
PACKAGE_TO_FILES = {
"mpdecimal": PackageFiles(
include=["Modules/_decimal/libmpdec/**"]
),
"expat": PackageFiles(
include=["Modules/expat/**"]
),
"pip": PackageFiles(
include=["Lib/ensurepip/_bundled/pip-23.3.1-py3-none-any.whl"]
),
"macholib": PackageFiles(
include=["Lib/ctypes/macholib/**"],
exclude=[
"Lib/ctypes/macholib/README.ctypes",
"Lib/ctypes/macholib/fetch_macholib",
"Lib/ctypes/macholib/fetch_macholib.bat",
],
),
"libb2": PackageFiles(
include=["Modules/_blake2/impl/**"]
),
"hacl-star": PackageFiles(
include=["Modules/_hacl/**"],
exclude=[
"Modules/_hacl/refresh.sh",
"Modules/_hacl/README.md",
"Modules/_hacl/python_hacl_namespace.h",
]
),
}
def spdx_id(value: str) -> str:
"""Encode a value into characters that are valid in an SPDX ID"""
return re.sub(r"[^a-zA-Z0-9.\-]+", "-", value)
def filter_gitignored_paths(paths: list[str]) -> list[str]:
"""
Filter out paths excluded by the gitignore file.
The output of 'git check-ignore --non-matching --verbose' looks
like this for non-matching (included) files:
'::<whitespace><path>'
And looks like this for matching (excluded) files:
'.gitignore:9:*.a Tools/lib.a'
"""
# Filter out files in gitignore.
# Non-matching files show up as '::<whitespace><path>'
git_check_ignore_proc = subprocess.run(
["git", "check-ignore", "--verbose", "--non-matching", *paths],
check=False,
stdout=subprocess.PIPE,
)
# 1 means matches, 0 means no matches.
assert git_check_ignore_proc.returncode in (0, 1)
# Return the list of paths sorted
git_check_ignore_lines = git_check_ignore_proc.stdout.decode().splitlines()
return sorted([line.split()[-1] for line in git_check_ignore_lines if line.startswith("::")])
def main() -> None:
root_dir = pathlib.Path(__file__).parent.parent.parent
sbom_path = root_dir / "Misc/sbom.spdx.json"
sbom_data = json.loads(sbom_path.read_bytes())
# Make a bunch of assertions about the SBOM data to ensure it's consistent.
assert {package["name"] for package in sbom_data["packages"]} == set(PACKAGE_TO_FILES)
for package in sbom_data["packages"]:
# Properties and ID must be properly formed.
assert set(package.keys()) == REQUIRED_PROPERTIES_PACKAGE
assert package["SPDXID"] == spdx_id(f"SPDXRef-PACKAGE-{package['name']}")
# Version must be in the download and external references.
version = package["versionInfo"]
assert version in package["downloadLocation"]
assert all(version in ref["referenceLocator"] for ref in package["externalRefs"])
# License must be on the approved list for SPDX.
assert package["licenseConcluded"] in ALLOWED_LICENSE_EXPRESSIONS, package["licenseConcluded"]
# Regenerate file information from current data.
sbom_files = []
sbom_relationships = []
# We call 'sorted()' here a lot to avoid filesystem scan order issues.
for name, files in sorted(PACKAGE_TO_FILES.items()):
package_spdx_id = spdx_id(f"SPDXRef-PACKAGE-{name}")
exclude = files.exclude or ()
for include in sorted(files.include):
# Find all the paths and then filter them through .gitignore.
paths = glob.glob(include, root_dir=root_dir, recursive=True)
paths = filter_gitignored_paths(paths)
assert paths, include # Make sure that every value returns something!
for path in paths:
# Skip directories and excluded files
if not (root_dir / path).is_file() or path in exclude:
continue
# SPDX requires SHA1 to be used for files, but we provide SHA256 too.
data = (root_dir / path).read_bytes()
checksum_sha1 = hashlib.sha1(data).hexdigest()
checksum_sha256 = hashlib.sha256(data).hexdigest()
file_spdx_id = spdx_id(f"SPDXRef-FILE-{path}")
sbom_files.append({
"SPDXID": file_spdx_id,
"fileName": path,
"checksums": [
{"algorithm": "SHA1", "checksumValue": checksum_sha1},
{"algorithm": "SHA256", "checksumValue": checksum_sha256},
],
})
# Tie each file back to its respective package.
sbom_relationships.append({
"spdxElementId": package_spdx_id,
"relatedSpdxElement": file_spdx_id,
"relationshipType": "CONTAINS",
})
# Update the SBOM on disk
sbom_data["files"] = sbom_files
sbom_data["relationships"] = sbom_relationships
sbom_path.write_text(json.dumps(sbom_data, indent=2, sort_keys=True))
if __name__ == "__main__":
main()

13
Tools/build/mypy.ini Normal file
View file

@ -0,0 +1,13 @@
[mypy]
files = Tools/build/generate_sbom.py
pretty = True
# Make sure Python can still be built
# using Python 3.10 for `PYTHON_FOR_REGEN`...
python_version = 3.10
# ...And be strict:
strict = True
strict_concatenate = True
enable_error_code = ignore-without-code,redundant-expr,truthy-bool,possibly-undefined
warn_unreachable = True