Implement DJVU output (foreground JB2, background IW44)

Implement PNG compression
Implement JPEG2000 compression
2025-05-09 00:06:34 +10:00 · 2025-05-09 00:06:34 +10:00 · 2025-05-09 00:06:34 +10:00
11 changed files with 367 additions and 10 deletions
--- a/README.md
+++ b/README.md
@ -1,6 +1,6 @@
 # pdf-segmented

-Generate PDFs using separate compression for foreground and background
+Generate PDFs (or DJVU) using separate compression for foreground and background

 ## Usage

@ -14,7 +14,9 @@ All black pixels (#000000) will be considered to be foreground, and all remainin

 The foreground will be compressed losslessly using [JBIG2](https://en.wikipedia.org/wiki/JBIG2). The background will be compressed lossily using [JPEG](https://en.wikipedia.org/wiki/JPEG). JPEG quality can be controlled using the `--jpeg-quality` option; the default is the Pillow default (75% at time of writing).

-Dependencies:
+Additional compression algorithms are supported (JPEG 2000, PNG); see `--help` for detailed options. DJVU output (foreground JB2, background IW44) is also supported.
+
+## Dependencies

 * [Python 3](https://www.python.org/) (tested using 3.13.3)
 	* [NumPy](https://numpy.org/) (tested using 2.2.5)
--- a/pdf_segmented/init.py
+++ b/pdf_segmented/init.py
@ -16,6 +16,7 @@

 from .compression import CompressionOptions, compress_pages
 from .input.xcf import xcf_get_pages
+from .output.djvu import djvu_write_pages
 from .output.pdf import pdf_write_pages
 from .segmentation import segment_pages

@ -29,8 +30,8 @@ def convert_file(
 	output_file: str,
 	input_format: Optional[str] = None,
 	output_format: Optional[str] = None,
-	fg_compression: str = 'jbig2',
-	bg_compression: str = 'jpeg',
+	fg_compression: Optional[str] = None,
+	bg_compression: Optional[str] = None,
 	options: CompressionOptions = CompressionOptions()
 ) -> None:
 	# Create temporary directory
@ -43,11 +44,48 @@ def convert_file(
 				input_format = 'xcf'
 			else:
 				print('Warning: Unknown input file extension, assuming XCF', file=sys.stderr)
+		
 		if output_format is None:
 			if output_file.endswith('.pdf'):
 				output_format = 'pdf'
+			elif output_file.endswith('.djvu'):
+				output_format = 'djvu'
 			else:
-				print('Warning: Unknown output file extension, assuming PDF', file=sys.stderr)
+				print('Error: Unknown output file extension (try --output-format)', file=sys.stderr)
+				sys.exit(1)
+		
+		if fg_compression is None:
+			if output_format == 'pdf':
+				fg_compression = 'jbig2'
+			elif output_format == 'djvu':
+				fg_compression = 'jb2'
+			else:
+				raise NotImplementedError()
+		
+		if bg_compression is None:
+			if output_format == 'pdf':
+				bg_compression = 'jpeg'
+			elif output_format == 'djvu':
+				bg_compression = 'iw44'
+			else:
+				raise NotImplementedError()
+		
+		# Validate format compatibility
+		if output_format == 'pdf':
+			if bg_compression not in ('jp2', 'jpeg', 'png'):
+				print('Error: Unsupported --bg-compression for PDF format (supported: jp2, jpeg, png)')
+				sys.exit(1)
+			if fg_compression not in ('jbig2', 'png'):
+				print('Error: Unsupported --fg-compression for PDF format (supported: jp2, jpeg, png)')
+				sys.exit(1)
+		
+		if output_format == 'djvu':
+			if bg_compression != 'iw44':
+				print('Error: Unsupported --bg-compression for DJVU format (supported: iw44)')
+				sys.exit(1)
+			if fg_compression != 'jb2':
+				print('Error: Unsupported --fg-compression for DJVU format (supported: jb2)')
+				sys.exit(1)
 		
 		# Get input pages
 		if input_format == 'xcf':
@ -60,6 +98,7 @@ def convert_file(
 		
 		# Compress layers
 		compressed_pages = compress_pages(
+			input_pages=input_pages,
 			segmented_pages=segmented_pages,
 			fg_compression=fg_compression,
 			bg_compression=bg_compression,
@ -74,6 +113,13 @@ def convert_file(
 				compressed_pages=compressed_pages,
 				output_file=output_file
 			)
+		elif output_format == 'djvu':
+			djvu_write_pages(
+				input_pages=input_pages,
+				compressed_pages=compressed_pages,
+				output_file=output_file,
+				tempdir=tempdir
+			)
 		else:
 			raise NotImplementedError()
 	finally:
--- a/pdf_segmented/main.py
+++ b/pdf_segmented/main.py
@ -27,9 +27,9 @@ parser = argparse.ArgumentParser(
 parser.add_argument('input_file')
 parser.add_argument('output_file')
 parser.add_argument('--input-format', choices=['xcf'])
-parser.add_argument('--output-format', choices=['pdf'])
-parser.add_argument('--fg-compression', default='jbig2', choices=['jbig2', 'png'])
-parser.add_argument('--bg-compression', default='jpeg', choices=['jpeg', 'jp2', 'png'])
+parser.add_argument('--output-format', choices=['pdf', 'djvu'])
+parser.add_argument('--fg-compression', choices=['jbig2', 'png', 'jb2'])
+parser.add_argument('--bg-compression', choices=['jpeg', 'jp2', 'png', 'iw44'])
 parser.add_argument('--jp2-lossless', action='store_true')
 parser.add_argument('--jp2-rate', type=float)
 parser.add_argument('--jpeg-quality', type=float)
--- a/pdf_segmented/compression/init.py
+++ b/pdf_segmented/compression/init.py
@ -17,14 +17,17 @@
 class CompressedLayer:
 	# Superclass for all compressed layer types (JPEG, JBIG2, etc.)
 	
-	def cleanup():
+	def cleanup(self):
 		# Clean up any temporary files, etc.
 		pass

+from .iw44 import iw44_compress_layer
+from .jb2 import jb2_compress_layer
 from .jbig2 import jbig2_compress_layer
 from .jp2 import jp2_compress_layer
 from .jpeg import jpeg_compress_layer
 from .png import png_compress_layer
+from ..input import InputPages
 from ..segmentation import SegmentedPage

 from PIL import Image
@ -44,6 +47,7 @@ class CompressedPage:
 	bg: CompressedLayer

 def compress_pages(
+	input_pages: InputPages,
 	segmented_pages: Iterable[SegmentedPage],
 	fg_compression: str,
 	bg_compression: str,
@ -54,6 +58,7 @@ def compress_pages(
 	# Compress foreground and background layers on each segmented page
 	for segmented_page in segmented_pages:
 		yield compress_page(
+			input_pages=input_pages,
 			segmented_page=segmented_page,
 			fg_compression=fg_compression,
 			bg_compression=bg_compression,
@ -62,6 +67,7 @@ def compress_pages(
 		)

 def compress_page(
+	input_pages: InputPages,
 	segmented_page: SegmentedPage,
 	fg_compression: str,
 	bg_compression: str,
@ -72,6 +78,7 @@ def compress_page(
 	# Compress foreground and background layers
 	return CompressedPage(
 		fg=compress_layer(
+			input_pages=input_pages,
 			layer=segmented_page.fg,
 			compression=fg_compression,
 			is_foreground=True,
@ -79,6 +86,7 @@ def compress_page(
 			tempdir=tempdir
 		),
 		bg=compress_layer(
+			input_pages=input_pages,
 			layer=segmented_page.bg,
 			compression=bg_compression,
 			is_foreground=False,
@ -88,6 +96,7 @@ def compress_page(
 	)

 def compress_layer(
+	input_pages: InputPages,
 	layer: Image,
 	compression: str,
 	is_foreground: bool,
@ -96,7 +105,11 @@ def compress_layer(
 ) -> CompressedLayer:
 	
 	# Compress the given layer
-	if compression == 'jbig2':
+	if compression == 'iw44':
+		return iw44_compress_layer(layer=layer, dpi=input_pages.dpi, tempdir=tempdir)
+	elif compression == 'jb2':
+		return jb2_compress_layer(layer=layer, dpi=input_pages.dpi, tempdir=tempdir)
+	elif compression == 'jbig2':
 		return jbig2_compress_layer(layer=layer, tempdir=tempdir)
 	elif compression == 'jp2':
 		return jp2_compress_layer(layer=layer, jp2_lossless=options.jp2_lossless, jp2_rate=options.jp2_rate)
--- a/pdf_segmented/compression/iw44.py
+++ b/pdf_segmented/compression/iw44.py
@ -0,0 +1,55 @@
+#   pdf-segmented: Generate PDFs using separate compression for foreground and background
+#   Copyright (C) 2025  Lee Yingtong Li
+#
+#   This program is free software: you can redistribute it and/or modify
+#   it under the terms of the GNU Affero General Public License as published by
+#   the Free Software Foundation, either version 3 of the License, or
+#   (at your option) any later version.
+#
+#   This program is distributed in the hope that it will be useful,
+#   but WITHOUT ANY WARRANTY; without even the implied warranty of
+#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+#   GNU Affero General Public License for more details.
+#
+#   You should have received a copy of the GNU Affero General Public License
+#   along with this program.  If not, see <https://www.gnu.org/licenses/>.
+
+from . import CompressedLayer
+from ..util import assert_has_c44, assert_has_djvuextract
+
+from PIL import Image
+
+from dataclasses import dataclass
+import io
+import os
+import subprocess
+import tempfile
+
+@dataclass
+class IW44Layer(CompressedLayer):
+	# Compressed layer whose IW44 data lives in a file on disk
+	
+	# Path of the file holding the compressed layer data
+	filename: str
+	
+	def cleanup(self) -> None:
+		# Delete the backing file once the layer is no longer needed
+		os.remove(self.filename)
+
+def iw44_compress_layer(layer: Image, dpi: float, tempdir: str) -> IW44Layer:
+	# Compress the given layer to IW44 using the DjvuLibre command line tools
+	#
+	# layer: image to compress; it is written out through Pillow as a PPM file
+	# dpi: resolution passed to c44 (rounded to the nearest integer)
+	# tempdir: directory in which every temporary file is created
+	#
+	# Returns an IW44Layer pointing at the BG44 data extracted by djvuextract; the caller
+	# is responsible for calling cleanup() on it
+	# Raises subprocess.CalledProcessError if c44 or djvuextract exits with an error
+	
+	assert_has_c44('IW44 compression requires DjvuLibre')
+	assert_has_djvuextract('IW44 compression requires DjvuLibre')
+	
+	# Reserve the temporary files
+	# mkstemp returns an already-open file descriptor; the files are only ever accessed by
+	# name, so close each descriptor straight away rather than leaking one per file
+	ppm_fd, ppm_file = tempfile.mkstemp(suffix='.ppm', dir=tempdir)
+	os.close(ppm_fd)
+	djvu_fd, djvu_file = tempfile.mkstemp(suffix='.djvu', dir=tempdir)
+	os.close(djvu_fd)
+	iw44_fd, iw44_file = tempfile.mkstemp(suffix='.iw44', dir=tempdir)
+	os.close(iw44_fd)
+	
+	try:
+		# Save image to PPM temporarily
+		layer.save(ppm_file, format='ppm')
+		
+		# Convert image to IW44
+		subprocess.run(['c44', '-dpi', str(round(dpi)), ppm_file, djvu_file], check=True)
+		
+		# Extract background IW44 file
+		subprocess.run(['djvuextract', djvu_file, 'BG44={}'.format(iw44_file)], check=True, capture_output=True)
+	except BaseException:
+		# Nothing will be returned, so nobody else can clean up the output file
+		os.unlink(iw44_file)
+		raise
+	finally:
+		# Clean up the intermediate files whether or not the conversion succeeded
+		os.unlink(ppm_file)
+		os.unlink(djvu_file)
+	
+	return IW44Layer(filename=iw44_file)
--- a/pdf_segmented/compression/jb2.py
+++ b/pdf_segmented/compression/jb2.py
@ -0,0 +1,49 @@
+#   pdf-segmented: Generate PDFs using separate compression for foreground and background
+#   Copyright (C) 2025  Lee Yingtong Li
+#
+#   This program is free software: you can redistribute it and/or modify
+#   it under the terms of the GNU Affero General Public License as published by
+#   the Free Software Foundation, either version 3 of the License, or
+#   (at your option) any later version.
+#
+#   This program is distributed in the hope that it will be useful,
+#   but WITHOUT ANY WARRANTY; without even the implied warranty of
+#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+#   GNU Affero General Public License for more details.
+#
+#   You should have received a copy of the GNU Affero General Public License
+#   along with this program.  If not, see <https://www.gnu.org/licenses/>.
+
+from . import CompressedLayer
+from ..util import assert_has_cjb2
+
+from PIL import Image
+
+from dataclasses import dataclass
+import io
+import os
+import subprocess
+import tempfile
+
+@dataclass
+class JB2Layer(CompressedLayer):
+	# Compressed layer whose JB2 data lives in a file on disk
+	
+	# Path of the file holding the compressed layer data
+	filename: str
+	
+	def cleanup(self) -> None:
+		# Delete the backing file once the layer is no longer needed
+		os.remove(self.filename)
+
+def jb2_compress_layer(layer: Image, dpi: float, tempdir: str) -> JB2Layer:
+	# Compress the given layer to JB2 using the DjvuLibre cjb2 tool
+	#
+	# layer: image to compress; it is converted to 1-bit mode before being written out
+	# dpi: resolution passed to cjb2 (rounded to the nearest integer)
+	# tempdir: directory in which every temporary file is created
+	#
+	# Returns a JB2Layer pointing at the file written by cjb2; the caller is responsible
+	# for calling cleanup() on it
+	# Raises subprocess.CalledProcessError if cjb2 exits with an error
+	
+	assert_has_cjb2('JB2 compression requires DjvuLibre')
+	
+	# Reserve the temporary files
+	# mkstemp returns an already-open file descriptor; the files are only ever accessed by
+	# name, so close each descriptor straight away rather than leaking one per file
+	pbm_fd, pbm_file = tempfile.mkstemp(suffix='.pbm', dir=tempdir)
+	os.close(pbm_fd)
+	jb2_fd, jb2_file = tempfile.mkstemp(suffix='.djvu', dir=tempdir)
+	os.close(jb2_fd)
+	
+	try:
+		# Save image to PBM temporarily
+		# (format='ppm' selects Pillow's PBM/PGM/PPM writer; the image is 1-bit after convert('1'))
+		layer.convert('1').save(pbm_file, format='ppm')
+		
+		# Convert image to JB2
+		subprocess.run(['cjb2', '-dpi', str(round(dpi)), pbm_file, jb2_file], check=True)
+	except BaseException:
+		# Nothing will be returned, so nobody else can clean up the output file
+		os.unlink(jb2_file)
+		raise
+	finally:
+		# Clean up the intermediate file whether or not the conversion succeeded
+		os.unlink(pbm_file)
+	
+	return JB2Layer(filename=jb2_file)
--- a/pdf_segmented/compression/jp2.py
+++ b/pdf_segmented/compression/jp2.py
@ -0,0 +1,39 @@
+#   pdf-segmented: Generate PDFs using separate compression for foreground and background
+#   Copyright (C) 2025  Lee Yingtong Li
+#
+#   This program is free software: you can redistribute it and/or modify
+#   it under the terms of the GNU Affero General Public License as published by
+#   the Free Software Foundation, either version 3 of the License, or
+#   (at your option) any later version.
+#
+#   This program is distributed in the hope that it will be useful,
+#   but WITHOUT ANY WARRANTY; without even the implied warranty of
+#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+#   GNU Affero General Public License for more details.
+#
+#   You should have received a copy of the GNU Affero General Public License
+#   along with this program.  If not, see <https://www.gnu.org/licenses/>.
+
+from . import CompressedLayer
+
+from PIL import Image
+
+from dataclasses import dataclass
+import io
+from typing import Optional
+
+@dataclass
+class JP2Layer(CompressedLayer):
+	# Compressed layer holding JPEG 2000 data in memory
+	
+	# Encoded image bytes, as written by Pillow's 'jpeg2000' writer in jp2_compress_layer
+	data: bytes
+
+def jp2_compress_layer(layer: Image, jp2_lossless: bool, jp2_rate: Optional[float]) -> JP2Layer:
+	# Encode the given layer as JPEG 2000 in memory
+	#
+	# jp2_lossless: when true, the image is saved with irreversible=False
+	# jp2_rate: when given, it is passed to Pillow as the single entry of quality_layers,
+	#           with quality_mode set to 'rates'; when None, neither option is passed
+	
+	# Only pass the rate options when a rate was actually requested
+	rate_options = {} if jp2_rate is None else {
+		'quality_mode': 'rates',
+		'quality_layers': [jp2_rate],
+	}
+	
+	# Save image to JPEG 2000
+	buffer = io.BytesIO()
+	layer.save(
+		buffer,
+		format='jpeg2000',
+		no_jp2=False,
+		irreversible=not jp2_lossless,
+		**rate_options
+	)
+	encoded = buffer.getvalue()
+	
+	return JP2Layer(data=encoded)
--- a/pdf_segmented/compression/png.py
+++ b/pdf_segmented/compression/png.py
@ -0,0 +1,61 @@
+#   pdf-segmented: Generate PDFs using separate compression for foreground and background
+#   Copyright (C) 2025  Lee Yingtong Li
+#
+#   This program is free software: you can redistribute it and/or modify
+#   it under the terms of the GNU Affero General Public License as published by
+#   the Free Software Foundation, either version 3 of the License, or
+#   (at your option) any later version.
+#
+#   This program is distributed in the hope that it will be useful,
+#   but WITHOUT ANY WARRANTY; without even the implied warranty of
+#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+#   GNU Affero General Public License for more details.
+#
+#   You should have received a copy of the GNU Affero General Public License
+#   along with this program.  If not, see <https://www.gnu.org/licenses/>.
+
+from . import CompressedLayer
+
+from PIL import Image
+
+from dataclasses import dataclass
+import io
+import struct
+
+@dataclass
+class PNGLayer(CompressedLayer):
+	# Compressed layer holding a complete PNG file in memory
+	
+	# Raw bytes of the PNG file
+	data: bytes
+	
+	def get_flate_data(self):
+		# Return the concatenated payloads of every IDAT chunk in the PNG data
+		raw = self.data
+		idat_parts = []
+		
+		# Skip the 8-byte PNG header, then walk the chunks until the data runs out
+		offset = 8
+		while offset < len(raw):
+			# Each chunk is: 4-byte big-endian length, 4-byte chunk id, payload, 4-byte CRC
+			(chunk_length,) = struct.unpack('>I', raw[offset:offset + 4])
+			chunk_id = raw[offset + 4:offset + 8]
+			payload_start = offset + 8
+			payload_end = payload_start + chunk_length
+			
+			# IDAT chunk contains DEFLATE data
+			if chunk_id == b'IDAT':
+				idat_parts.append(raw[payload_start:payload_end])
+			
+			# Step over the payload and the trailing CRC (which is not checked)
+			offset = payload_end + 4
+		
+		return b''.join(idat_parts)
+
+def png_compress_layer(layer: Image, is_foreground: bool) -> PNGLayer:
+	# Encode the given layer as PNG in memory
+	#
+	# is_foreground: when true, the layer is converted to 1-bit mode before saving
+	
+	# Foreground is 1bpp; the background is saved as-is
+	source = layer.convert('1') if is_foreground else layer
+	
+	# Save image to PNG
+	buffer = io.BytesIO()
+	source.save(buffer, format='png', optimize=True)
+	encoded = buffer.getvalue()
+	
+	return PNGLayer(data=encoded)
--- a/pdf_segmented/output/djvu.py
+++ b/pdf_segmented/output/djvu.py
@ -0,0 +1,62 @@
+#   pdf-segmented: Generate PDFs using separate compression for foreground and background
+#   Copyright (C) 2025  Lee Yingtong Li
+#
+#   This program is free software: you can redistribute it and/or modify
+#   it under the terms of the GNU Affero General Public License as published by
+#   the Free Software Foundation, either version 3 of the License, or
+#   (at your option) any later version.
+#
+#   This program is distributed in the hope that it will be useful,
+#   but WITHOUT ANY WARRANTY; without even the implied warranty of
+#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+#   GNU Affero General Public License for more details.
+#
+#   You should have received a copy of the GNU Affero General Public License
+#   along with this program.  If not, see <https://www.gnu.org/licenses/>.
+
+from ..compression import CompressedPage
+from ..input import InputPages
+from ..util import assert_has_djvm, assert_has_djvumake
+
+import os
+import subprocess
+import tempfile
+from typing import Generator
+
+def djvu_write_pages(
+	input_pages: InputPages,
+	compressed_pages: Generator[CompressedPage],
+	output_file: str,
+	tempdir: str
+) -> None:
+	# Assemble the compressed pages into a single DJVU document at output_file
+	#
+	# input_pages: supplies the width, height and dpi written to each page's INFO chunk
+	# compressed_pages: pages whose fg and bg layers expose a filename attribute; each
+	#                   layer's cleanup() is called as soon as its page has been processed
+	# output_file: path of the combined document written by djvm
+	# tempdir: directory in which the per-page DJVU files are created
+	#
+	# Raises subprocess.CalledProcessError if djvumake or djvm exits with an error
+	
+	assert_has_djvm('DJVU output requires DjvuLibre')
+	assert_has_djvumake('DJVU output requires DjvuLibre')
+	
+	djvu_page_files = []
+	
+	try:
+		# Write each page
+		for compressed_page in compressed_pages:
+			try:
+				# Combine foreground and background
+				# mkstemp returns an already-open file descriptor; djvumake writes the file
+				# by name, so close the descriptor rather than leaking one per page
+				page_fd, page_djvu_file = tempfile.mkstemp(suffix='.djvu', dir=tempdir)
+				os.close(page_fd)
+				
+				# Register the file before running djvumake so it is deleted even on failure
+				djvu_page_files.append(page_djvu_file)
+				
+				# TODO: Handle case where empty background or foreground
+				args = ['djvumake', page_djvu_file, 'INFO={},{},{}'.format(input_pages.width, input_pages.height, round(input_pages.dpi))]
+				args.append('Sjbz={}'.format(compressed_page.fg.filename))
+				args.append('BG44={}'.format(compressed_page.bg.filename))
+				subprocess.run(args, check=True, capture_output=True)
+			finally:
+				# Clean up
+				compressed_page.bg.cleanup()
+				compressed_page.fg.cleanup()
+		
+		# Combine pages
+		subprocess.run(['djvm', '-c', output_file] + djvu_page_files, check=True)
+	finally:
+		# Clean up
+		for page_djvu_file in djvu_page_files:
+			os.unlink(page_djvu_file)
--- a/pdf_segmented/output/pdf.py
+++ b/pdf_segmented/output/pdf.py
@ -42,10 +42,15 @@ def pdf_write_pages(
 		page = pdf.add_blank_page(page_size=(width_pt, height_pt))
 		
 		# Write each layer to the page
+		# TODO: Handle case where empty background or foreground
 		content_instructions = []
 		pdf_write_layer(input_pages=input_pages, pdf=pdf, page=page, layer=compressed_page.bg, is_foreground=False, content_instructions=content_instructions)
 		pdf_write_layer(input_pages=input_pages, pdf=pdf, page=page, layer=compressed_page.fg, is_foreground=True, content_instructions=content_instructions)
 		
+		# Clean up
+		compressed_page.bg.cleanup()
+		compressed_page.fg.cleanup()
+		
 		# Generate content stream
 		wrapped_instructions = [
 			ContentStreamInstruction([], Operator('q')),
--- a/pdf_segmented/util.py
+++ b/pdf_segmented/util.py
@ -17,6 +17,31 @@
 import shutil
 import sys

+def assert_has_c44(error_message: str = 'DjvuLibre is required') -> None:
+	# Exit with status 1 and a message on stderr unless c44 can be found on PATH
+	if shutil.which('c44') is not None:
+		return
+	
+	print('Error: {} (c44 not found on PATH)'.format(error_message), file=sys.stderr)
+	sys.exit(1)
+
+def assert_has_cjb2(error_message: str = 'DjvuLibre is required') -> None:
+	# Exit with status 1 and a message on stderr unless cjb2 can be found on PATH
+	if shutil.which('cjb2') is not None:
+		return
+	
+	print('Error: {} (cjb2 not found on PATH)'.format(error_message), file=sys.stderr)
+	sys.exit(1)
+
+def assert_has_djvm(error_message: str = 'DjvuLibre is required') -> None:
+	# Exit with status 1 and a message on stderr unless djvm can be found on PATH
+	if shutil.which('djvm') is not None:
+		return
+	
+	print('Error: {} (djvm not found on PATH)'.format(error_message), file=sys.stderr)
+	sys.exit(1)
+
+def assert_has_djvuextract(error_message: str = 'DjvuLibre is required') -> None:
+	# Exit with status 1 and a message on stderr unless djvuextract can be found on PATH
+	if shutil.which('djvuextract') is not None:
+		return
+	
+	print('Error: {} (djvuextract not found on PATH)'.format(error_message), file=sys.stderr)
+	sys.exit(1)
+
+def assert_has_djvumake(error_message: str = 'DjvuLibre is required') -> None:
+	# Exit with status 1 and a message on stderr unless djvumake can be found on PATH
+	if shutil.which('djvumake') is not None:
+		return
+	
+	print('Error: {} (djvumake not found on PATH)'.format(error_message), file=sys.stderr)
+	sys.exit(1)
+
 def assert_has_imagemagick(error_message: str = 'ImageMagick is required') -> None:
 	if shutil.which('magick') is None:
 		print('Error: {} (magick not found on PATH)'.format(error_message), file=sys.stderr)
Author	SHA1	Message	Date
RunasSudo	edba14ff2c	Implement DJVU output (foreground JB2, background IW44)	2025-05-09 00:06:34 +10:00
RunasSudo	4abbe79d5a	Implement PNG compression	2025-05-09 00:06:34 +10:00
RunasSudo	f6cbe3215b	Implement JPEG2000 compression	2025-05-09 00:06:34 +10:00