diff --git a/README.md b/README.md
index 10400e7..6314a48 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,6 @@
# pdf-segmented
-Generate PDFs using separate compression for foreground and background
+Generate PDFs (or DjVu files) using separate compression for foreground and background
## Usage
@@ -14,7 +14,9 @@ All black pixels (#000000) will be considered to be foreground, and all remainin
The foreground will be compressed losslessly using [JBIG2](https://en.wikipedia.org/wiki/JBIG2). The background will be compressed lossily using [JPEG](https://en.wikipedia.org/wiki/JPEG). JPEG quality can be controlled using the `--jpeg-quality` option; the default is the Pillow default (75% at time of writing).
-Dependencies:
+Additional compression algorithms are supported for PDF output (JPEG 2000 or PNG for the background, PNG for the foreground); see `--help` for detailed options. DjVu output (foreground JB2, background IW44) is also supported.
+
+## Dependencies
* [Python 3](https://www.python.org/) (tested using 3.13.3)
* [NumPy](https://numpy.org/) (tested using 2.2.5)
diff --git a/pdf_segmented/__init__.py b/pdf_segmented/__init__.py
index 0a84634..5693b66 100644
--- a/pdf_segmented/__init__.py
+++ b/pdf_segmented/__init__.py
@@ -16,6 +16,7 @@
from .compression import CompressionOptions, compress_pages
from .input.xcf import xcf_get_pages
+from .output.djvu import djvu_write_pages
from .output.pdf import pdf_write_pages
from .segmentation import segment_pages
@@ -29,8 +30,8 @@ def convert_file(
output_file: str,
input_format: Optional[str] = None,
output_format: Optional[str] = None,
- fg_compression: str = 'jbig2',
- bg_compression: str = 'jpeg',
+ fg_compression: Optional[str] = None,
+ bg_compression: Optional[str] = None,
options: CompressionOptions = CompressionOptions()
) -> None:
# Create temporary directory
@@ -43,11 +44,48 @@ def convert_file(
input_format = 'xcf'
else:
print('Warning: Unknown input file extension, assuming XCF', file=sys.stderr)
+
if output_format is None:
if output_file.endswith('.pdf'):
output_format = 'pdf'
+ elif output_file.endswith('.djvu'):
+ output_format = 'djvu'
else:
- print('Warning: Unknown output file extension, assuming PDF', file=sys.stderr)
+ print('Error: Unknown output file extension (try --output-format)', file=sys.stderr)
+ sys.exit(1)
+
+ if fg_compression is None:
+ if output_format == 'pdf':
+ fg_compression = 'jbig2'
+ elif output_format == 'djvu':
+ fg_compression = 'jb2'
+ else:
+ raise NotImplementedError()
+
+ if bg_compression is None:
+ if output_format == 'pdf':
+ bg_compression = 'jpeg'
+ elif output_format == 'djvu':
+ bg_compression = 'iw44'
+ else:
+ raise NotImplementedError()
+
+        # Validate format compatibility (errors are reported on stderr, like the other diagnostics here)
+        if output_format == 'pdf':
+            if bg_compression not in ('jp2', 'jpeg', 'png'):
+                print('Error: Unsupported --bg-compression for PDF format (supported: jp2, jpeg, png)', file=sys.stderr)
+                sys.exit(1)
+            if fg_compression not in ('jbig2', 'png'):
+                print('Error: Unsupported --fg-compression for PDF format (supported: jbig2, png)', file=sys.stderr)
+                sys.exit(1)
+
+        if output_format == 'djvu':
+            if bg_compression != 'iw44':
+                print('Error: Unsupported --bg-compression for DJVU format (supported: iw44)', file=sys.stderr)
+                sys.exit(1)
+            if fg_compression != 'jb2':
+                print('Error: Unsupported --fg-compression for DJVU format (supported: jb2)', file=sys.stderr)
+                sys.exit(1)
# Get input pages
if input_format == 'xcf':
@@ -60,6 +98,7 @@ def convert_file(
# Compress layers
compressed_pages = compress_pages(
+ input_pages=input_pages,
segmented_pages=segmented_pages,
fg_compression=fg_compression,
bg_compression=bg_compression,
@@ -74,6 +113,13 @@ def convert_file(
compressed_pages=compressed_pages,
output_file=output_file
)
+ elif output_format == 'djvu':
+ djvu_write_pages(
+ input_pages=input_pages,
+ compressed_pages=compressed_pages,
+ output_file=output_file,
+ tempdir=tempdir
+ )
else:
raise NotImplementedError()
finally:
diff --git a/pdf_segmented/__main__.py b/pdf_segmented/__main__.py
index 0efabcd..e81dea7 100644
--- a/pdf_segmented/__main__.py
+++ b/pdf_segmented/__main__.py
@@ -27,9 +27,9 @@ parser = argparse.ArgumentParser(
parser.add_argument('input_file')
parser.add_argument('output_file')
parser.add_argument('--input-format', choices=['xcf'])
-parser.add_argument('--output-format', choices=['pdf'])
-parser.add_argument('--fg-compression', default='jbig2', choices=['jbig2', 'png'])
-parser.add_argument('--bg-compression', default='jpeg', choices=['jpeg', 'jp2', 'png'])
+parser.add_argument('--output-format', choices=['pdf', 'djvu'])
+parser.add_argument('--fg-compression', choices=['jbig2', 'png', 'jb2'])
+parser.add_argument('--bg-compression', choices=['jpeg', 'jp2', 'png', 'iw44'])
parser.add_argument('--jp2-lossless', action='store_true')
parser.add_argument('--jp2-rate', type=float)
parser.add_argument('--jpeg-quality', type=float)
diff --git a/pdf_segmented/compression/__init__.py b/pdf_segmented/compression/__init__.py
index d28496c..de5137b 100644
--- a/pdf_segmented/compression/__init__.py
+++ b/pdf_segmented/compression/__init__.py
@@ -17,14 +17,17 @@
class CompressedLayer:
# Superclass for all compressed layer types (JPEG, JBIG2, etc.)
- def cleanup():
+ def cleanup(self):
# Clean up any temporary files, etc.
pass
+from .iw44 import iw44_compress_layer
+from .jb2 import jb2_compress_layer
from .jbig2 import jbig2_compress_layer
from .jp2 import jp2_compress_layer
from .jpeg import jpeg_compress_layer
from .png import png_compress_layer
+from ..input import InputPages
from ..segmentation import SegmentedPage
from PIL import Image
@@ -44,6 +47,7 @@ class CompressedPage:
bg: CompressedLayer
def compress_pages(
+ input_pages: InputPages,
segmented_pages: Iterable[SegmentedPage],
fg_compression: str,
bg_compression: str,
@@ -54,6 +58,7 @@ def compress_pages(
# Compress foreground and background layers on each segmented page
for segmented_page in segmented_pages:
yield compress_page(
+ input_pages=input_pages,
segmented_page=segmented_page,
fg_compression=fg_compression,
bg_compression=bg_compression,
@@ -62,6 +67,7 @@ def compress_pages(
)
def compress_page(
+ input_pages: InputPages,
segmented_page: SegmentedPage,
fg_compression: str,
bg_compression: str,
@@ -72,6 +78,7 @@ def compress_page(
# Compress foreground and background layers
return CompressedPage(
fg=compress_layer(
+ input_pages=input_pages,
layer=segmented_page.fg,
compression=fg_compression,
is_foreground=True,
@@ -79,6 +86,7 @@ def compress_page(
tempdir=tempdir
),
bg=compress_layer(
+ input_pages=input_pages,
layer=segmented_page.bg,
compression=bg_compression,
is_foreground=False,
@@ -88,6 +96,7 @@ def compress_page(
)
def compress_layer(
+ input_pages: InputPages,
layer: Image,
compression: str,
is_foreground: bool,
@@ -96,7 +105,11 @@ def compress_layer(
) -> CompressedLayer:
# Compress the given layer
- if compression == 'jbig2':
+ if compression == 'iw44':
+ return iw44_compress_layer(layer=layer, dpi=input_pages.dpi, tempdir=tempdir)
+ elif compression == 'jb2':
+ return jb2_compress_layer(layer=layer, dpi=input_pages.dpi, tempdir=tempdir)
+ elif compression == 'jbig2':
return jbig2_compress_layer(layer=layer, tempdir=tempdir)
elif compression == 'jp2':
return jp2_compress_layer(layer=layer, jp2_lossless=options.jp2_lossless, jp2_rate=options.jp2_rate)
diff --git a/pdf_segmented/compression/iw44.py b/pdf_segmented/compression/iw44.py
new file mode 100644
index 0000000..8d633ab
--- /dev/null
+++ b/pdf_segmented/compression/iw44.py
@@ -0,0 +1,81 @@
+# pdf-segmented: Generate PDFs using separate compression for foreground and background
+# Copyright (C) 2025 Lee Yingtong Li
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/>.
+
+from . import CompressedLayer
+from ..util import assert_has_c44, assert_has_djvuextract
+
+from PIL import Image
+
+from dataclasses import dataclass
+import io
+import os
+import subprocess
+import tempfile
+
+@dataclass
+class IW44Layer(CompressedLayer):
+    # Layer compressed to IW44, held in a temporary file
+
+    # Path to the temporary file containing the extracted BG44 chunk
+    filename: str
+
+    def cleanup(self):
+        # Delete the temporary IW44 file
+        os.unlink(self.filename)
+
+def _make_closed_tempfile(suffix: str, tempdir: str) -> str:
+    # Create an empty temporary file in tempdir and return its path
+    # mkstemp also returns an open file descriptor, which must be closed so it is not leaked
+    fd, filename = tempfile.mkstemp(suffix=suffix, dir=tempdir)
+    os.close(fd)
+    return filename
+
+def iw44_compress_layer(layer: Image, dpi: float, tempdir: str) -> IW44Layer:
+    # Compress the given layer to IW44 using the DjVuLibre command-line tools
+    #
+    # layer: image to compress
+    # dpi: resolution passed to c44 (rounded to the nearest integer)
+    # tempdir: directory in which temporary files are created
+    #
+    # Returns an IW44Layer; the caller is responsible for calling cleanup() on it
+    # Raises subprocess.CalledProcessError if c44 or djvuextract exits with an error
+
+    assert_has_c44('IW44 compression requires DjvuLibre')
+    assert_has_djvuextract('IW44 compression requires DjvuLibre')
+
+    ppm_file = _make_closed_tempfile('.ppm', tempdir)
+    djvu_file = _make_closed_tempfile('.djvu', tempdir)
+    iw44_file = _make_closed_tempfile('.iw44', tempdir)
+
+    try:
+        # Save image to PPM temporarily
+        layer.save(ppm_file, format='ppm')
+
+        # Convert image to IW44
+        subprocess.run(['c44', '-dpi', str(round(dpi)), ppm_file, djvu_file], check=True)
+
+        # Extract background IW44 file
+        subprocess.run(['djvuextract', djvu_file, 'BG44={}'.format(iw44_file)], check=True, capture_output=True)
+    except BaseException:
+        # Compression failed, so the output file will never be handed to the caller
+        os.unlink(iw44_file)
+        raise
+    finally:
+        # Clean up intermediate files, whether or not compression succeeded
+        os.unlink(ppm_file)
+        os.unlink(djvu_file)
+
+    return IW44Layer(filename=iw44_file)
diff --git a/pdf_segmented/compression/jb2.py b/pdf_segmented/compression/jb2.py
new file mode 100644
index 0000000..02b3956
--- /dev/null
+++ b/pdf_segmented/compression/jb2.py
@@ -0,0 +1,76 @@
+# pdf-segmented: Generate PDFs using separate compression for foreground and background
+# Copyright (C) 2025 Lee Yingtong Li
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/>.
+
+from . import CompressedLayer
+from ..util import assert_has_cjb2
+
+from PIL import Image
+
+from dataclasses import dataclass
+import io
+import os
+import subprocess
+import tempfile
+
+@dataclass
+class JB2Layer(CompressedLayer):
+    # Layer compressed to JB2, held in a temporary file
+
+    # Path to the temporary file written by cjb2
+    filename: str
+
+    def cleanup(self):
+        # Delete the temporary JB2 file
+        os.unlink(self.filename)
+
+def _make_closed_tempfile(suffix: str, tempdir: str) -> str:
+    # Create an empty temporary file in tempdir and return its path
+    # mkstemp also returns an open file descriptor, which must be closed so it is not leaked
+    fd, filename = tempfile.mkstemp(suffix=suffix, dir=tempdir)
+    os.close(fd)
+    return filename
+
+def jb2_compress_layer(layer: Image, dpi: float, tempdir: str) -> JB2Layer:
+    # Compress the given layer to JB2 using the DjVuLibre cjb2 tool
+    #
+    # layer: image to compress (converted to 1-bit before encoding)
+    # dpi: resolution passed to cjb2 (rounded to the nearest integer)
+    # tempdir: directory in which temporary files are created
+    #
+    # Returns a JB2Layer; the caller is responsible for calling cleanup() on it
+    # Raises subprocess.CalledProcessError if cjb2 exits with an error
+
+    assert_has_cjb2('JB2 compression requires DjvuLibre')
+
+    pbm_file = _make_closed_tempfile('.pbm', tempdir)
+    jb2_file = _make_closed_tempfile('.djvu', tempdir)
+
+    try:
+        # Save image to PBM temporarily
+        # Pillow's PPM writer emits a bitonal PBM for mode '1' images, hence format='ppm'
+        layer.convert('1').save(pbm_file, format='ppm')
+
+        # Convert image to JB2
+        subprocess.run(['cjb2', '-dpi', str(round(dpi)), pbm_file, jb2_file], check=True)
+    except BaseException:
+        # Compression failed, so the output file will never be handed to the caller
+        os.unlink(jb2_file)
+        raise
+    finally:
+        # Clean up the intermediate file, whether or not compression succeeded
+        os.unlink(pbm_file)
+
+    return JB2Layer(filename=jb2_file)
diff --git a/pdf_segmented/output/djvu.py b/pdf_segmented/output/djvu.py
new file mode 100644
index 0000000..9675caa
--- /dev/null
+++ b/pdf_segmented/output/djvu.py
@@ -0,0 +1,75 @@
+# pdf-segmented: Generate PDFs using separate compression for foreground and background
+# Copyright (C) 2025 Lee Yingtong Li
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/>.
+
+from ..compression import CompressedPage
+from ..input import InputPages
+from ..util import assert_has_djvm, assert_has_djvumake
+
+import os
+import subprocess
+import tempfile
+from typing import Generator
+
+def djvu_write_pages(
+    input_pages: InputPages,
+    compressed_pages: Generator[CompressedPage],
+    output_file: str,
+    tempdir: str
+) -> None:
+    # Assemble each compressed page with djvumake, then combine the pages into output_file with djvm
+    #
+    # input_pages: supplies the width, height and DPI passed in each page's INFO argument
+    # compressed_pages: pages to write; both layers of a page are cleaned up once it has been processed
+    # output_file: path of the DjVu file to create
+    # tempdir: directory in which temporary per-page files are created
+    #
+    # Raises subprocess.CalledProcessError if djvumake or djvm exits with an error
+
+    assert_has_djvm('DJVU output requires DjvuLibre')
+    assert_has_djvumake('DJVU output requires DjvuLibre')
+
+    djvu_page_files = []
+
+    try:
+        # Write each page
+        for compressed_page in compressed_pages:
+            try:
+                # Create the per-page file, closing the descriptor opened by mkstemp so it is not leaked
+                fd, page_djvu_file = tempfile.mkstemp(suffix='.djvu', dir=tempdir)
+                os.close(fd)
+
+                # Register the file before running djvumake so it is removed even if djvumake fails
+                djvu_page_files.append(page_djvu_file)
+
+                # Combine foreground and background
+                # TODO: Handle case where empty background or foreground
+                args = ['djvumake', page_djvu_file, 'INFO={},{},{}'.format(input_pages.width, input_pages.height, round(input_pages.dpi))]
+                args.append('Sjbz={}'.format(compressed_page.fg.filename))
+                args.append('BG44={}'.format(compressed_page.bg.filename))
+                subprocess.run(args, check=True, capture_output=True)
+            finally:
+                # Clean up both layers, even if cleaning up the first one fails
+                try:
+                    compressed_page.bg.cleanup()
+                finally:
+                    compressed_page.fg.cleanup()
+
+        # Combine pages
+        subprocess.run(['djvm', '-c', output_file] + djvu_page_files, check=True)
+    finally:
+        # Clean up
+        for page_djvu_file in djvu_page_files:
+            os.unlink(page_djvu_file)
diff --git a/pdf_segmented/output/pdf.py b/pdf_segmented/output/pdf.py
index f098b39..622f269 100644
--- a/pdf_segmented/output/pdf.py
+++ b/pdf_segmented/output/pdf.py
@@ -42,10 +42,15 @@ def pdf_write_pages(
page = pdf.add_blank_page(page_size=(width_pt, height_pt))
# Write each layer to the page
+ # TODO: Handle case where empty background or foreground
content_instructions = []
pdf_write_layer(input_pages=input_pages, pdf=pdf, page=page, layer=compressed_page.bg, is_foreground=False, content_instructions=content_instructions)
pdf_write_layer(input_pages=input_pages, pdf=pdf, page=page, layer=compressed_page.fg, is_foreground=True, content_instructions=content_instructions)
+ # Clean up
+ compressed_page.bg.cleanup()
+ compressed_page.fg.cleanup()
+
# Generate content stream
wrapped_instructions = [
ContentStreamInstruction([], Operator('q')),
diff --git a/pdf_segmented/util.py b/pdf_segmented/util.py
index 7ae2547..a541371 100644
--- a/pdf_segmented/util.py
+++ b/pdf_segmented/util.py
@@ -17,6 +17,32 @@
import shutil
import sys
+def _assert_has_executable(executable: str, error_message: str) -> None:
+    # Print an error to stderr and exit with status 1 if executable is not found on PATH
+    if shutil.which(executable) is None:
+        print('Error: {} ({} not found on PATH)'.format(error_message, executable), file=sys.stderr)
+        sys.exit(1)
+
+def assert_has_c44(error_message: str = 'DjvuLibre is required') -> None:
+    # Require c44 (used to convert images to IW44)
+    _assert_has_executable('c44', error_message)
+
+def assert_has_cjb2(error_message: str = 'DjvuLibre is required') -> None:
+    # Require cjb2 (used to convert images to JB2)
+    _assert_has_executable('cjb2', error_message)
+
+def assert_has_djvm(error_message: str = 'DjvuLibre is required') -> None:
+    # Require djvm (used to combine pages into the output file)
+    _assert_has_executable('djvm', error_message)
+
+def assert_has_djvuextract(error_message: str = 'DjvuLibre is required') -> None:
+    # Require djvuextract (used to extract the BG44 chunk)
+    _assert_has_executable('djvuextract', error_message)
+
+def assert_has_djvumake(error_message: str = 'DjvuLibre is required') -> None:
+    # Require djvumake (used to combine foreground and background into a page)
+    _assert_has_executable('djvumake', error_message)
+
def assert_has_imagemagick(error_message: str = 'ImageMagick is required') -> None:
if shutil.which('magick') is None:
print('Error: {} (magick not found on PATH)'.format(error_message), file=sys.stderr)