Skip to content

Commit cc25975

Browse files
schlenk (Michael Schlenker) and jkowalleck
authored
Handle misencoded license text files gracefully. (#884)
--------- Signed-off-by: Michael Schlenker <michael.schlenker@contact-software.com> Signed-off-by: Jan Kowalleck <jan.kowalleck@gmail.com> Co-authored-by: Michael Schlenker <michael.schlenker@contact-software.com> Co-authored-by: Jan Kowalleck <jan.kowalleck@gmail.com>
1 parent 9861a46 commit cc25975

File tree

62 files changed

+1241
-45
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

62 files changed

+1241
-45
lines changed

cyclonedx_py/_internal/utils/bytes.py

+28
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
# This file is part of CycloneDX Python
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
#
15+
# SPDX-License-Identifier: Apache-2.0
16+
# Copyright (c) OWASP Foundation. All Rights Reserved.
17+
18+
from sys import getdefaultencoding
19+
20+
from chardet import detect as chardetect
21+
22+
23+
def bytes2str(data: bytes, *, errors: str = 'strict') -> str:
24+
# see https://docs.python.org/3/library/codecs.html#standard-encodings
25+
encoding = (chardetect(data)['encoding'] or getdefaultencoding()).replace(
26+
# replace Windows-encoding with code-page
27+
'Windows-', 'cp')
28+
return data.decode(encoding, errors)

cyclonedx_py/_internal/utils/io.py

+2-8
Original file line numberDiff line numberDiff line change
@@ -15,20 +15,14 @@
1515
# SPDX-License-Identifier: Apache-2.0
1616
# Copyright (c) OWASP Foundation. All Rights Reserved.
1717

18-
from sys import getdefaultencoding
1918
from tempfile import NamedTemporaryFile
2019
from typing import BinaryIO
2120

22-
from chardet import detect as chardetect
21+
from .bytes import bytes2str
2322

2423

2524
def io2str(io: BinaryIO, *, errors: str = 'strict') -> str:
26-
data = io.read()
27-
# see https://docs.python.org/3/library/codecs.html#standard-encodings
28-
encoding = (chardetect(data)['encoding'] or getdefaultencoding()).replace(
29-
# replace Windows-encoding with code-page
30-
'Windows-', 'cp')
31-
return data.decode(encoding, errors)
25+
return bytes2str(io.read(), errors=errors)
3226

3327

3428
def io2file(io: BinaryIO, *, errors: str = 'strict') -> str:

cyclonedx_py/_internal/utils/pep639.py

+19-6
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@
3030
from cyclonedx.model import AttachedText, Encoding
3131
from cyclonedx.model.license import DisjunctiveLicense, LicenseAcknowledgement
3232

33+
from .bytes import bytes2str
3334
from .mimetypes import guess_type
3435

3536
if TYPE_CHECKING: # pragma: no cover
@@ -38,6 +39,10 @@
3839

3940
from cyclonedx.model.license import License
4041

42+
# per spec > license files are stored in the `.dist-info/licenses/` subdirectory of the produced wheel.
43+
# but in practice, other locations are used, too.
44+
_LICENSE_LOCATIONS = ('licenses', 'license_files', '')
45+
4146

4247
def dist2licenses(
4348
dist: 'Distribution',
@@ -55,12 +60,20 @@ def dist2licenses(
5560
for mlfile in set(metadata.get_all('License-File', ())):
5661
# see spec: https://peps.python.org/pep-0639/#add-license-file-field
5762
# latest spec rev: https://discuss.python.org/t/pep-639-round-3-improving-license-clarity-with-better-package-metadata/53020 # noqa: E501
58-
59-
# per spec > license files are stored in the `.dist-info/licenses/` subdirectory of the produced wheel.
60-
# but in practice, other locations are used, too.
61-
content = dist.read_text(join('licenses', mlfile)) \
62-
or dist.read_text(join('license_files', mlfile)) \
63-
or dist.read_text(mlfile)
63+
content = None
64+
for mlpath in _LICENSE_LOCATIONS:
65+
try:
66+
content = dist.read_text(join(mlpath, mlfile))
67+
except UnicodeDecodeError as err:
68+
try:
69+
content = bytes2str(err.object)
70+
except UnicodeDecodeError:
71+
pass
72+
else:
73+
break # for-loop
74+
else:
75+
if content is not None:
76+
break # for-loop
6477
if content is None: # pragma: no cover
6578
logger.debug('Error: failed to read license file %r for dist %r',
6679
mlfile, metadata['Name'])
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
# EditorConfig is awesome: https://editorconfig.org
2+
3+
[my_licenses/utf-8*]
4+
charset = utf-8
5+
6+
[my_licenses/utf-16le*]
7+
charset = utf-16le
8+
9+
[my_licenses/utf-16be*]
10+
charset = utf-16be
11+
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
Licenses/* binary
2+
Licenses/*.txt binary diff=txt
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
# PEP 639 - regression 868
2+
3+
see <https://github.com/CycloneDX/cyclonedx-python/issues/868>
4+
5+
PEP 639 expects license files to be UTF-8 encoded text.
6+
Some license files may not be text, and some may not be UTF-8 encoded, yet they may still be added as license files.
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
this file is
2+
utf-8 encoded
3+
without BOM
4+
😃
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
this file is
2+
utf-8 encoded
3+
with BOM
4+
😃
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
[build-system]
2+
# Known broken version
3+
requires = ["setuptools == 78.1.0"]
4+
build-backend = "setuptools.build_meta"
5+
6+
[project]
7+
name = "regression-issue868"
8+
version = "0.1"
9+
license-files = ["my_licenses/*"]
10+
readme = "README.md"
11+
12+
[tool.setuptools]
13+
include-package-data = false
14+
exclude-package-data = { "*" = ["*", "**"] }
15+
[tool.setuptools.package-data]
16+
# do not want any content installed

tests/_data/infiles/environment/with-license-pep639/init.py

+3
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,9 @@ def main() -> None:
7373
'lxml',
7474
# with expression-like License AND License-File
7575
'cryptography==43.0.1', # https://github.com/CycloneDX/cyclonedx-python/issues/826
76+
# with possibly unexpected license files
77+
# https://github.com/CycloneDX/cyclonedx-python/issues/868
78+
'../../_helpers/local_pckages/with-license-pep639_regression-issue868',
7679
)
7780

7881

tests/_data/infiles/environment/with-license-pep639/pyproject.toml

+13-11
Original file line numberDiff line numberDiff line change
@@ -4,14 +4,16 @@ name = "with-extras"
44
version = "0.1.0"
55
description = "depenndencies with license declaration accoring to PEP 639"
66

7-
dependencies = [
8-
# with License-Expression
9-
"attrs",
10-
# with License-File
11-
"boolean.py",
12-
"jsonpointer",
13-
"license_expression",
14-
"lxml",
15-
# with expression-like License AND License-File
16-
"cryptography",
17-
]
7+
[project.dependencies]
8+
# with License-Expression
9+
"attrs" = { }
10+
# with License-File
11+
"boolean.py" = { }
12+
"jsonpointer" = { }
13+
"license_expression" = { }
14+
"lxml" = { }
15+
# with expression-like License AND License-File
16+
"cryptography" = { }
17+
# with possibly unexpected license files
18+
"regression-issue868" = { path = "../../_helpers/local_pckages/with-license-pep639_regression-issue868" }
19+

tests/_data/snapshots/environment/pep639-texts_with-license-pep639_1.0.xml.bin

+5
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

tests/_data/snapshots/environment/pep639-texts_with-license-pep639_1.1.xml.bin

+48
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

tests/_data/snapshots/environment/pep639-texts_with-license-pep639_1.2.json.bin

+66-1
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)