From edba14ff2c73bcd37828cb39064835b3fc3f2dcf Mon Sep 17 00:00:00 2001 From: RunasSudo Date: Fri, 9 May 2025 00:05:36 +1000 Subject: [PATCH] Implement DJVU output (foreground JB2, background IW44) --- README.md | 6 ++- pdf_segmented/__init__.py | 52 ++++++++++++++++++++-- pdf_segmented/__main__.py | 6 +-- pdf_segmented/compression/__init__.py | 17 +++++++- pdf_segmented/compression/iw44.py | 55 ++++++++++++++++++++++++ pdf_segmented/compression/jb2.py | 49 +++++++++++++++++++++ pdf_segmented/output/djvu.py | 62 +++++++++++++++++++++++++++ pdf_segmented/output/pdf.py | 5 +++ pdf_segmented/util.py | 25 +++++++++++ 9 files changed, 267 insertions(+), 10 deletions(-) create mode 100644 pdf_segmented/compression/iw44.py create mode 100644 pdf_segmented/compression/jb2.py create mode 100644 pdf_segmented/output/djvu.py diff --git a/README.md b/README.md index 10400e7..6314a48 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ # pdf-segmented -Generate PDFs using separate compression for foreground and background +Generate PDFs (or DJVU) using separate compression for foreground and background ## Usage @@ -14,7 +14,9 @@ All black pixels (#000000) will be considered to be foreground, and all remainin The foreground will be compressed losslessly using [JBIG2](https://en.wikipedia.org/wiki/JBIG2). The background will be compressed lossily using [JPEG](https://en.wikipedia.org/wiki/JPEG). JPEG quality can be controlled using the `--jpeg-quality` option; the default is the Pillow default (75% at time of writing). -Dependencies: +Additional compression algorithms are supported (JPEG 2000, PNG); see `--help` for detailed options. DJVU output (foreground JB2, background IW44) is also supported. 
+ +## Dependencies * [Python 3](https://www.python.org/) (tested using 3.13.3) * [NumPy](https://numpy.org/) (tested using 2.2.5) diff --git a/pdf_segmented/__init__.py b/pdf_segmented/__init__.py index 0a84634..5693b66 100644 --- a/pdf_segmented/__init__.py +++ b/pdf_segmented/__init__.py @@ -16,6 +16,7 @@ from .compression import CompressionOptions, compress_pages from .input.xcf import xcf_get_pages +from .output.djvu import djvu_write_pages from .output.pdf import pdf_write_pages from .segmentation import segment_pages @@ -29,8 +30,8 @@ def convert_file( output_file: str, input_format: Optional[str] = None, output_format: Optional[str] = None, - fg_compression: str = 'jbig2', - bg_compression: str = 'jpeg', + fg_compression: Optional[str] = None, + bg_compression: Optional[str] = None, options: CompressionOptions = CompressionOptions() ) -> None: # Create temporary directory @@ -43,11 +44,48 @@ def convert_file( input_format = 'xcf' else: print('Warning: Unknown input file extension, assuming XCF', file=sys.stderr) + if output_format is None: if output_file.endswith('.pdf'): output_format = 'pdf' + elif output_file.endswith('.djvu'): + output_format = 'djvu' else: - print('Warning: Unknown output file extension, assuming PDF', file=sys.stderr) + print('Error: Unknown output file extension (try --output-format)', file=sys.stderr) + sys.exit(1) + + if fg_compression is None: + if output_format == 'pdf': + fg_compression = 'jbig2' + elif output_format == 'djvu': + fg_compression = 'jb2' + else: + raise NotImplementedError() + + if bg_compression is None: + if output_format == 'pdf': + bg_compression = 'jpeg' + elif output_format == 'djvu': + bg_compression = 'iw44' + else: + raise NotImplementedError() + + # Validate format compatibility + if output_format == 'pdf': + if bg_compression not in ('jp2', 'jpeg', 'png'): + print('Error: Unsupported --bg-compression for PDF format (supported: jp2, jpeg, png)') + sys.exit(1) + if fg_compression not in ('jbig2', 'png'): + 
print('Error: Unsupported --fg-compression for PDF format (supported: jbig2, png)') + sys.exit(1) + + if output_format == 'djvu': + if bg_compression != 'iw44': + print('Error: Unsupported --bg-compression for DJVU format (supported: iw44)') + sys.exit(1) + if fg_compression != 'jb2': + print('Error: Unsupported --fg-compression for DJVU format (supported: jb2)') + sys.exit(1) # Get input pages if input_format == 'xcf': @@ -60,6 +98,7 @@ def convert_file( # Compress layers compressed_pages = compress_pages( + input_pages=input_pages, segmented_pages=segmented_pages, fg_compression=fg_compression, bg_compression=bg_compression, @@ -74,6 +113,13 @@ def convert_file( compressed_pages=compressed_pages, output_file=output_file ) + elif output_format == 'djvu': + djvu_write_pages( + input_pages=input_pages, + compressed_pages=compressed_pages, + output_file=output_file, + tempdir=tempdir + ) else: raise NotImplementedError() finally: diff --git a/pdf_segmented/__main__.py b/pdf_segmented/__main__.py index 0efabcd..e81dea7 100644 --- a/pdf_segmented/__main__.py +++ b/pdf_segmented/__main__.py @@ -27,9 +27,9 @@ parser = argparse.ArgumentParser( parser.add_argument('input_file') parser.add_argument('output_file') parser.add_argument('--input-format', choices=['xcf']) -parser.add_argument('--output-format', choices=['pdf']) -parser.add_argument('--fg-compression', default='jbig2', choices=['jbig2', 'png']) -parser.add_argument('--bg-compression', default='jpeg', choices=['jpeg', 'jp2', 'png']) +parser.add_argument('--output-format', choices=['pdf', 'djvu']) +parser.add_argument('--fg-compression', choices=['jbig2', 'png', 'jb2']) +parser.add_argument('--bg-compression', choices=['jpeg', 'jp2', 'png', 'iw44']) parser.add_argument('--jp2-lossless', action='store_true') parser.add_argument('--jp2-rate', type=float) parser.add_argument('--jpeg-quality', type=float) diff --git a/pdf_segmented/compression/__init__.py b/pdf_segmented/compression/__init__.py index 
d28496c..de5137b 100644 --- a/pdf_segmented/compression/__init__.py +++ b/pdf_segmented/compression/__init__.py @@ -17,14 +17,17 @@ class CompressedLayer: # Superclass for all compressed layer types (JPEG, JBIG2, etc.) - def cleanup(): + def cleanup(self): # Clean up any temporary files, etc. pass +from .iw44 import iw44_compress_layer +from .jb2 import jb2_compress_layer from .jbig2 import jbig2_compress_layer from .jp2 import jp2_compress_layer from .jpeg import jpeg_compress_layer from .png import png_compress_layer +from ..input import InputPages from ..segmentation import SegmentedPage from PIL import Image @@ -44,6 +47,7 @@ class CompressedPage: bg: CompressedLayer def compress_pages( + input_pages: InputPages, segmented_pages: Iterable[SegmentedPage], fg_compression: str, bg_compression: str, @@ -54,6 +58,7 @@ def compress_pages( # Compress foreground and background layers on each segmented page for segmented_page in segmented_pages: yield compress_page( + input_pages=input_pages, segmented_page=segmented_page, fg_compression=fg_compression, bg_compression=bg_compression, @@ -62,6 +67,7 @@ def compress_pages( ) def compress_page( + input_pages: InputPages, segmented_page: SegmentedPage, fg_compression: str, bg_compression: str, @@ -72,6 +78,7 @@ def compress_page( # Compress foreground and background layers return CompressedPage( fg=compress_layer( + input_pages=input_pages, layer=segmented_page.fg, compression=fg_compression, is_foreground=True, @@ -79,6 +86,7 @@ def compress_page( tempdir=tempdir ), bg=compress_layer( + input_pages=input_pages, layer=segmented_page.bg, compression=bg_compression, is_foreground=False, @@ -88,6 +96,7 @@ def compress_page( ) def compress_layer( + input_pages: InputPages, layer: Image, compression: str, is_foreground: bool, @@ -96,7 +105,11 @@ def compress_layer( ) -> CompressedLayer: # Compress the given layer - if compression == 'jbig2': + if compression == 'iw44': + return iw44_compress_layer(layer=layer, 
dpi=input_pages.dpi, tempdir=tempdir) + elif compression == 'jb2': + return jb2_compress_layer(layer=layer, dpi=input_pages.dpi, tempdir=tempdir) + elif compression == 'jbig2': return jbig2_compress_layer(layer=layer, tempdir=tempdir) elif compression == 'jp2': return jp2_compress_layer(layer=layer, jp2_lossless=options.jp2_lossless, jp2_rate=options.jp2_rate) diff --git a/pdf_segmented/compression/iw44.py b/pdf_segmented/compression/iw44.py new file mode 100644 index 0000000..8d633ab --- /dev/null +++ b/pdf_segmented/compression/iw44.py @@ -0,0 +1,55 @@ +# pdf-segmented: Generate PDFs using separate compression for foreground and background +# Copyright (C) 2025 Lee Yingtong Li +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <https://www.gnu.org/licenses/>. + +from . 
import CompressedLayer +from ..util import assert_has_c44, assert_has_djvuextract + +from PIL import Image + +from dataclasses import dataclass +import io +import os +import subprocess +import tempfile + +@dataclass +class IW44Layer(CompressedLayer): + filename: str + + def cleanup(self): + os.unlink(self.filename) + +def iw44_compress_layer(layer: Image, dpi: float, tempdir: str) -> IW44Layer: + assert_has_c44('IW44 compression requires DjvuLibre') + assert_has_djvuextract('IW44 compression requires DjvuLibre') + + # Save image to PPM temporarily + _, ppm_file = tempfile.mkstemp(suffix='.ppm', dir=tempdir) + layer.save(ppm_file, format='ppm') + + # Convert image to IW44 + _, djvu_file = tempfile.mkstemp(suffix='.djvu', dir=tempdir) + subprocess.run(['c44', '-dpi', str(round(dpi)), ppm_file, djvu_file], check=True) + + # Extract background IW44 file + _, iw44_file = tempfile.mkstemp(suffix='.iw44', dir=tempdir) + subprocess.run(['djvuextract', djvu_file, 'BG44={}'.format(iw44_file)], check=True, capture_output=True) + + # Clean up + os.unlink(ppm_file) + os.unlink(djvu_file) + + return IW44Layer(filename=iw44_file) diff --git a/pdf_segmented/compression/jb2.py b/pdf_segmented/compression/jb2.py new file mode 100644 index 0000000..02b3956 --- /dev/null +++ b/pdf_segmented/compression/jb2.py @@ -0,0 +1,49 @@ +# pdf-segmented: Generate PDFs using separate compression for foreground and background +# Copyright (C) 2025 Lee Yingtong Li +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. 
+# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <https://www.gnu.org/licenses/>. + +from . import CompressedLayer +from ..util import assert_has_cjb2 + +from PIL import Image + +from dataclasses import dataclass +import io +import os +import subprocess +import tempfile + +@dataclass +class JB2Layer(CompressedLayer): + filename: str + + def cleanup(self): + os.unlink(self.filename) + +def jb2_compress_layer(layer: Image, dpi: float, tempdir: str) -> JB2Layer: + assert_has_cjb2('JB2 compression requires DjvuLibre') + + # Save image to PBM temporarily + _, pbm_file = tempfile.mkstemp(suffix='.pbm', dir=tempdir) + layer.convert('1').save(pbm_file, format='ppm') + + # Convert image to JB2 + _, jb2_file = tempfile.mkstemp(suffix='.djvu', dir=tempdir) + subprocess.run(['cjb2', '-dpi', str(round(dpi)), pbm_file, jb2_file], check=True) + + # Clean up + os.unlink(pbm_file) + + return JB2Layer(filename=jb2_file) diff --git a/pdf_segmented/output/djvu.py b/pdf_segmented/output/djvu.py new file mode 100644 index 0000000..9675caa --- /dev/null +++ b/pdf_segmented/output/djvu.py @@ -0,0 +1,62 @@ +# pdf-segmented: Generate PDFs using separate compression for foreground and background +# Copyright (C) 2025 Lee Yingtong Li +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <https://www.gnu.org/licenses/>. 
+ +from ..compression import CompressedPage +from ..input import InputPages +from ..util import assert_has_djvm, assert_has_djvumake + +import os +import subprocess +import tempfile +from typing import Generator + +def djvu_write_pages( + input_pages: InputPages, + compressed_pages: Generator[CompressedPage], + output_file: str, + tempdir: str +) -> None: + + assert_has_djvm('DJVU output requires DjvuLibre') + assert_has_djvumake('DJVU output requires DjvuLibre') + + djvu_page_files = [] + + try: + # Write each page + for compressed_page in compressed_pages: + try: + # Combine foreground and background + _, page_djvu_file = tempfile.mkstemp(suffix='.djvu', dir=tempdir) + + # TODO: Handle case where empty background or foreground + args = ['djvumake', page_djvu_file, 'INFO={},{},{}'.format(input_pages.width, input_pages.height, round(input_pages.dpi))] + args.append('Sjbz={}'.format(compressed_page.fg.filename)) + args.append('BG44={}'.format(compressed_page.bg.filename)) + subprocess.run(args, check=True, capture_output=True) + + djvu_page_files.append(page_djvu_file) + finally: + # Clean up + compressed_page.bg.cleanup() + compressed_page.fg.cleanup() + + # Combine pages + subprocess.run(['djvm', '-c', output_file] + djvu_page_files, check=True) + finally: + # Clean up + for page_djvu_file in djvu_page_files: + os.unlink(page_djvu_file) diff --git a/pdf_segmented/output/pdf.py b/pdf_segmented/output/pdf.py index f098b39..622f269 100644 --- a/pdf_segmented/output/pdf.py +++ b/pdf_segmented/output/pdf.py @@ -42,10 +42,15 @@ def pdf_write_pages( page = pdf.add_blank_page(page_size=(width_pt, height_pt)) # Write each layer to the page + # TODO: Handle case where empty background or foreground content_instructions = [] pdf_write_layer(input_pages=input_pages, pdf=pdf, page=page, layer=compressed_page.bg, is_foreground=False, content_instructions=content_instructions) pdf_write_layer(input_pages=input_pages, pdf=pdf, page=page, layer=compressed_page.fg, 
is_foreground=True, content_instructions=content_instructions) + # Clean up + compressed_page.bg.cleanup() + compressed_page.fg.cleanup() + # Generate content stream wrapped_instructions = [ ContentStreamInstruction([], Operator('q')), diff --git a/pdf_segmented/util.py b/pdf_segmented/util.py index 7ae2547..a541371 100644 --- a/pdf_segmented/util.py +++ b/pdf_segmented/util.py @@ -17,6 +17,31 @@ import shutil import sys +def assert_has_c44(error_message: str = 'DjvuLibre is required') -> None: + if shutil.which('c44') is None: + print('Error: {} (c44 not found on PATH)'.format(error_message), file=sys.stderr) + sys.exit(1) + +def assert_has_cjb2(error_message: str = 'DjvuLibre is required') -> None: + if shutil.which('cjb2') is None: + print('Error: {} (cjb2 not found on PATH)'.format(error_message), file=sys.stderr) + sys.exit(1) + +def assert_has_djvm(error_message: str = 'DjvuLibre is required') -> None: + if shutil.which('djvm') is None: + print('Error: {} (djvm not found on PATH)'.format(error_message), file=sys.stderr) + sys.exit(1) + +def assert_has_djvuextract(error_message: str = 'DjvuLibre is required') -> None: + if shutil.which('djvuextract') is None: + print('Error: {} (djvuextract not found on PATH)'.format(error_message), file=sys.stderr) + sys.exit(1) + +def assert_has_djvumake(error_message: str = 'DjvuLibre is required') -> None: + if shutil.which('djvumake') is None: + print('Error: {} (djvumake not found on PATH)'.format(error_message), file=sys.stderr) + sys.exit(1) + def assert_has_imagemagick(error_message: str = 'ImageMagick is required') -> None: if shutil.which('magick') is None: print('Error: {} (magick not found on PATH)'.format(error_message), file=sys.stderr)