diff --git a/.github/workflows/github-ci.yaml b/.github/workflows/github-ci.yaml index 1ae80a1e0..18b94aae0 100644 --- a/.github/workflows/github-ci.yaml +++ b/.github/workflows/github-ci.yaml @@ -70,7 +70,7 @@ jobs: sudo apt-get update - name: Install APT dependencies run: - sudo apt-get install ghostscript poppler-utils + sudo apt-get install ghostscript jbig2dec poppler-utils - name: Checkout Code uses: actions/checkout@v4 with: diff --git a/docs/user/installation.md b/docs/user/installation.md index 56f86d089..27f7ac433 100644 --- a/docs/user/installation.md +++ b/docs/user/installation.md @@ -48,6 +48,14 @@ If you plan to use image extraction, you need Pillow: pip install pypdf[image] ``` +For JBIG2 support, you need to install a global OS-level package as well: +[`jbig2dec`](https://github.com/ArtifexSoftware/jbig2dec). The installation procedure +depends on your operating system. For Ubuntu, just use the following for example: + +``` +sudo apt-get install jbig2dec +``` + ## Python Version Support Since pypdf 4.0, every release, including point releases, should work with all diff --git a/pypdf/constants.py b/pypdf/constants.py index 7bf5aae0c..5a20ff323 100644 --- a/pypdf/constants.py +++ b/pypdf/constants.py @@ -245,6 +245,7 @@ class FilterTypes(StrEnum): CCITT_FAX_DECODE = "/CCITTFaxDecode" # abbreviation: CCF DCT_DECODE = "/DCTDecode" # abbreviation: DCT JPX_DECODE = "/JPXDecode" + JBIG2_DECODE = "/JBIG2Decode" class FilterTypeAbbreviations: diff --git a/pypdf/filters.py b/pypdf/filters.py index 29b3a622e..4f7aa8807 100644 --- a/pypdf/filters.py +++ b/pypdf/filters.py @@ -35,11 +35,15 @@ __author_email__ = "biziqe@mathieu.fenniak.net" import math +import os +import shutil import struct +import subprocess import zlib from base64 import a85decode from dataclasses import dataclass from io import BytesIO +from tempfile import NamedTemporaryFile from typing import Any, Dict, List, Optional, Tuple, Union, cast from ._codecs._codecs import LzwCodec as _LzwCodec @@ 
# Absolute path of the ``jbig2dec`` executable, or None when it is not on PATH.
# Resolved once at import time.
_JBIG2DEC_BINARY = shutil.which("jbig2dec")


class JBIG2Decode:
    """
    Decode ``/JBIG2Decode`` streams by delegating to the external ``jbig2dec`` binary.

    The stream is handed to ``jbig2dec`` as a temporary file and the PNG that
    the tool writes to stdout is returned as the decoded data.
    """

    @staticmethod
    def decode(
        data: bytes,
        decode_parms: Optional[DictionaryObject] = None,
        **kwargs: Any,
    ) -> bytes:
        """
        Convert embedded JBIG2 stream data to PNG bytes.

        Args:
            data: The raw JBIG2 stream data.
            decode_parms: Unused here.
            **kwargs: Accepted and ignored.

        Returns:
            The bytes ``jbig2dec`` wrote to stdout (requested format: PNG).

        Raises:
            DependencyError: If the ``jbig2dec`` binary is not available or
                does not know the ``--embedded`` option.
            PdfStreamError: If ``jbig2dec`` exits with a non-zero status.
        """
        if _JBIG2DEC_BINARY is None:
            raise DependencyError("jbig2dec binary is not available.")

        # The file is closed before jbig2dec opens it by name: a still-open
        # NamedTemporaryFile cannot be opened a second time on Windows.
        # ``delete=False`` keeps it on disk after closing, so it is removed
        # explicitly once the subprocess has finished.
        infile = NamedTemporaryFile(suffix=".jbig2", delete=False)
        try:
            with infile:
                infile.write(data)
            # Force the C locale so the stderr text matched below is not localized.
            environment = os.environ.copy()
            environment["LC_ALL"] = "C"
            result = subprocess.run(  # noqa: S603
                [_JBIG2DEC_BINARY, "--embedded", "--format", "png", "--output", "-", infile.name],
                capture_output=True,
                env=environment,
            )
        finally:
            os.unlink(infile.name)

        if b"unrecognized option '--embedded'" in result.stderr:
            raise DependencyError("jbig2dec>=0.15 is required.")
        if result.returncode != 0:
            # Do not hand empty or truncated output to the image code.
            raise PdfStreamError(f"Unable to decode JBIG2 data. Exit code: {result.returncode}")
        return result.stdout

    @staticmethod
    def _is_binary_compatible() -> bool:
        """
        Check whether the available ``jbig2dec`` binary is at least version 0.15.

        Returns:
            False if no binary was found or its ``--version`` output does not
            contain a second whitespace-separated field; otherwise the result
            of comparing that field against ``0.15``.
        """
        if not _JBIG2DEC_BINARY:  # pragma: no cover
            return False
        result = subprocess.run(  # noqa: S603
            [_JBIG2DEC_BINARY, "--version"],
            capture_output=True,
            text=True,
        )
        # Expected shape is "<name> <version>"; anything else counts as incompatible.
        fields = result.stdout.split()
        if len(fields) < 2:  # pragma: no cover
            return False

        from ._utils import Version
        return Version(fields[1]) >= Version("0.15")
# tests/test_filters.py
def test_jbig2decode__binary_errors():
    """JBIG2Decode.decode raises DependencyError for a missing or too old jbig2dec."""
    # No binary found at all.
    with mock.patch("pypdf.filters._JBIG2DEC_BINARY", None):
        with pytest.raises(DependencyError, match="jbig2dec binary is not available."):
            JBIG2Decode.decode(b"dummy")

    # Binary present, but it rejects the ``--embedded`` option.
    usage_output = (
        b"jbig2dec: unrecognized option '--embedded'\n"
        b"Usage: jbig2dec [options] \n"
        b" or jbig2dec [options] \n"
    )
    completed = subprocess.CompletedProcess(
        args=["dummy"],
        returncode=0,
        stdout=b"",
        stderr=usage_output,
    )
    with mock.patch("pypdf.filters.subprocess.run", return_value=completed):
        with mock.patch("pypdf.filters._JBIG2DEC_BINARY", "/usr/bin/jbig2dec"):
            with pytest.raises(DependencyError, match="jbig2dec>=0.15 is required."):
                JBIG2Decode.decode(b"dummy")


# tests/test_images.py
@pytest.mark.enable_socket
@pytest.mark.skipif(condition=not JBIG2Decode._is_binary_compatible(), reason="Requires recent jbig2dec")
def test_jbig2decode():
    """The first image of the sample PDF is decoded to a PNG matching the reference image."""
    pdf_url = "https://github.com/py-pdf/pypdf/files/12090692/New.Jersey.Coinbase.staking.securities.charges.2023-0606_Coinbase-Penalty-and-C-D.pdf"
    pdf_bytes = get_data_from_url(pdf_url, name="jbig2.pdf")

    first_page = PdfReader(BytesIO(pdf_bytes)).pages[0]
    extracted = next(iter(first_page.images))
    assert extracted.image.size == (5138, 6630)
    assert extracted.image.mode == "1"
    assert extracted.image.format == "PNG"

    reference_url = "https://github.com/user-attachments/assets/d6f88c80-a2e0-4ea9-b1e0-34442041d004"
    reference_bytes = get_data_from_url(reference_url, name="jbig2.png")
    reference = Image.open(BytesIO(reference_bytes))

    assert image_similarity(extracted.image, reference) >= 0.999