From edba14ff2c73bcd37828cb39064835b3fc3f2dcf Mon Sep 17 00:00:00 2001
From: RunasSudo <runassudo@yingtongli.me>
Date: Fri, 9 May 2025 00:05:36 +1000
Subject: [PATCH] Implement DJVU output (foreground JB2, background IW44)

---
 README.md                             |  6 ++-
 pdf_segmented/__init__.py             | 52 ++++++++++++++++++++--
 pdf_segmented/__main__.py             |  6 +--
 pdf_segmented/compression/__init__.py | 17 +++++++-
 pdf_segmented/compression/iw44.py     | 55 ++++++++++++++++++++++++
 pdf_segmented/compression/jb2.py      | 49 +++++++++++++++++++++
 pdf_segmented/output/djvu.py          | 62 +++++++++++++++++++++++++++
 pdf_segmented/output/pdf.py           |  5 +++
 pdf_segmented/util.py                 | 25 +++++++++++
 9 files changed, 267 insertions(+), 10 deletions(-)
 create mode 100644 pdf_segmented/compression/iw44.py
 create mode 100644 pdf_segmented/compression/jb2.py
 create mode 100644 pdf_segmented/output/djvu.py

diff --git a/README.md b/README.md
index 10400e7..6314a48 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,6 @@
 # pdf-segmented
 
-Generate PDFs using separate compression for foreground and background
+Generate PDFs (or DJVU) using separate compression for foreground and background
 
 ## Usage
 
@@ -14,7 +14,9 @@ All black pixels (#000000) will be considered to be foreground, and all remainin
 
 The foreground will be compressed losslessly using [JBIG2](https://en.wikipedia.org/wiki/JBIG2). The background will be compressed lossily using [JPEG](https://en.wikipedia.org/wiki/JPEG). JPEG quality can be controlled using the `--jpeg-quality` option; the default is the Pillow default (75% at time of writing).
 
-Dependencies:
+Additional compression algorithms are supported (JPEG 2000, PNG); see `--help` for detailed options. DJVU output (foreground JB2, background IW44) is also supported.
+
+## Dependencies
 
 * [Python 3](https://www.python.org/) (tested using 3.13.3)
 	* [NumPy](https://numpy.org/) (tested using 2.2.5)
diff --git a/pdf_segmented/__init__.py b/pdf_segmented/__init__.py
index 0a84634..5693b66 100644
--- a/pdf_segmented/__init__.py
+++ b/pdf_segmented/__init__.py
@@ -16,6 +16,7 @@
 
 from .compression import CompressionOptions, compress_pages
 from .input.xcf import xcf_get_pages
+from .output.djvu import djvu_write_pages
 from .output.pdf import pdf_write_pages
 from .segmentation import segment_pages
 
@@ -29,8 +30,8 @@ def convert_file(
 	output_file: str,
 	input_format: Optional[str] = None,
 	output_format: Optional[str] = None,
-	fg_compression: str = 'jbig2',
-	bg_compression: str = 'jpeg',
+	fg_compression: Optional[str] = None,
+	bg_compression: Optional[str] = None,
 	options: CompressionOptions = CompressionOptions()
 ) -> None:
 	# Create temporary directory
@@ -43,11 +44,48 @@ def convert_file(
 				input_format = 'xcf'
 			else:
 				print('Warning: Unknown input file extension, assuming XCF', file=sys.stderr)
+		
 		if output_format is None:
 			if output_file.endswith('.pdf'):
 				output_format = 'pdf'
+			elif output_file.endswith('.djvu'):
+				output_format = 'djvu'
 			else:
-				print('Warning: Unknown output file extension, assuming PDF', file=sys.stderr)
+				print('Error: Unknown output file extension (try --output-format)', file=sys.stderr)
+				sys.exit(1)
+		
+		if fg_compression is None:
+			if output_format == 'pdf':
+				fg_compression = 'jbig2'
+			elif output_format == 'djvu':
+				fg_compression = 'jb2'
+			else:
+				raise NotImplementedError()
+		
+		if bg_compression is None:
+			if output_format == 'pdf':
+				bg_compression = 'jpeg'
+			elif output_format == 'djvu':
+				bg_compression = 'iw44'
+			else:
+				raise NotImplementedError()
+		
+		# Validate that the chosen compression schemes are supported by the output format (errors go to stderr)
+		if output_format == 'pdf':
+			if bg_compression not in ('jp2', 'jpeg', 'png'):
+				print('Error: Unsupported --bg-compression for PDF format (supported: jp2, jpeg, png)', file=sys.stderr)
+				sys.exit(1)
+			if fg_compression not in ('jbig2', 'png'):
+				print('Error: Unsupported --fg-compression for PDF format (supported: jbig2, png)', file=sys.stderr)
+				sys.exit(1)
+		
+		if output_format == 'djvu':
+			if bg_compression != 'iw44':
+				print('Error: Unsupported --bg-compression for DJVU format (supported: iw44)', file=sys.stderr)
+				sys.exit(1)
+			if fg_compression != 'jb2':
+				print('Error: Unsupported --fg-compression for DJVU format (supported: jb2)', file=sys.stderr)
+				sys.exit(1)
 		
 		# Get input pages
 		if input_format == 'xcf':
@@ -60,6 +98,7 @@ def convert_file(
 		
 		# Compress layers
 		compressed_pages = compress_pages(
+			input_pages=input_pages,
 			segmented_pages=segmented_pages,
 			fg_compression=fg_compression,
 			bg_compression=bg_compression,
@@ -74,6 +113,13 @@ def convert_file(
 				compressed_pages=compressed_pages,
 				output_file=output_file
 			)
+		elif output_format == 'djvu':
+			djvu_write_pages(
+				input_pages=input_pages,
+				compressed_pages=compressed_pages,
+				output_file=output_file,
+				tempdir=tempdir
+			)
 		else:
 			raise NotImplementedError()
 	finally:
diff --git a/pdf_segmented/__main__.py b/pdf_segmented/__main__.py
index 0efabcd..e81dea7 100644
--- a/pdf_segmented/__main__.py
+++ b/pdf_segmented/__main__.py
@@ -27,9 +27,9 @@ parser = argparse.ArgumentParser(
 parser.add_argument('input_file')
 parser.add_argument('output_file')
 parser.add_argument('--input-format', choices=['xcf'])
-parser.add_argument('--output-format', choices=['pdf'])
-parser.add_argument('--fg-compression', default='jbig2', choices=['jbig2', 'png'])
-parser.add_argument('--bg-compression', default='jpeg', choices=['jpeg', 'jp2', 'png'])
+parser.add_argument('--output-format', choices=['pdf', 'djvu'])
+parser.add_argument('--fg-compression', choices=['jbig2', 'png', 'jb2'])
+parser.add_argument('--bg-compression', choices=['jpeg', 'jp2', 'png', 'iw44'])
 parser.add_argument('--jp2-lossless', action='store_true')
 parser.add_argument('--jp2-rate', type=float)
 parser.add_argument('--jpeg-quality', type=float)
diff --git a/pdf_segmented/compression/__init__.py b/pdf_segmented/compression/__init__.py
index d28496c..de5137b 100644
--- a/pdf_segmented/compression/__init__.py
+++ b/pdf_segmented/compression/__init__.py
@@ -17,14 +17,17 @@
 class CompressedLayer:
 	# Superclass for all compressed layer types (JPEG, JBIG2, etc.)
 	
-	def cleanup():
+	def cleanup(self):
 		# Clean up any temporary files, etc.
 		pass
 
+from .iw44 import iw44_compress_layer
+from .jb2 import jb2_compress_layer
 from .jbig2 import jbig2_compress_layer
 from .jp2 import jp2_compress_layer
 from .jpeg import jpeg_compress_layer
 from .png import png_compress_layer
+from ..input import InputPages
 from ..segmentation import SegmentedPage
 
 from PIL import Image
@@ -44,6 +47,7 @@ class CompressedPage:
 	bg: CompressedLayer
 
 def compress_pages(
+	input_pages: InputPages,
 	segmented_pages: Iterable[SegmentedPage],
 	fg_compression: str,
 	bg_compression: str,
@@ -54,6 +58,7 @@ def compress_pages(
 	# Compress foreground and background layers on each segmented page
 	for segmented_page in segmented_pages:
 		yield compress_page(
+			input_pages=input_pages,
 			segmented_page=segmented_page,
 			fg_compression=fg_compression,
 			bg_compression=bg_compression,
@@ -62,6 +67,7 @@ def compress_pages(
 		)
 
 def compress_page(
+	input_pages: InputPages,
 	segmented_page: SegmentedPage,
 	fg_compression: str,
 	bg_compression: str,
@@ -72,6 +78,7 @@ def compress_page(
 	# Compress foreground and background layers
 	return CompressedPage(
 		fg=compress_layer(
+			input_pages=input_pages,
 			layer=segmented_page.fg,
 			compression=fg_compression,
 			is_foreground=True,
@@ -79,6 +86,7 @@ def compress_page(
 			tempdir=tempdir
 		),
 		bg=compress_layer(
+			input_pages=input_pages,
 			layer=segmented_page.bg,
 			compression=bg_compression,
 			is_foreground=False,
@@ -88,6 +96,7 @@ def compress_page(
 	)
 
 def compress_layer(
+	input_pages: InputPages,
 	layer: Image,
 	compression: str,
 	is_foreground: bool,
@@ -96,7 +105,11 @@ def compress_layer(
 ) -> CompressedLayer:
 	
 	# Compress the given layer
-	if compression == 'jbig2':
+	if compression == 'iw44':
+		return iw44_compress_layer(layer=layer, dpi=input_pages.dpi, tempdir=tempdir)
+	elif compression == 'jb2':
+		return jb2_compress_layer(layer=layer, dpi=input_pages.dpi, tempdir=tempdir)
+	elif compression == 'jbig2':
 		return jbig2_compress_layer(layer=layer, tempdir=tempdir)
 	elif compression == 'jp2':
 		return jp2_compress_layer(layer=layer, jp2_lossless=options.jp2_lossless, jp2_rate=options.jp2_rate)
diff --git a/pdf_segmented/compression/iw44.py b/pdf_segmented/compression/iw44.py
new file mode 100644
index 0000000..8d633ab
--- /dev/null
+++ b/pdf_segmented/compression/iw44.py
@@ -0,0 +1,64 @@
+#   pdf-segmented: Generate PDFs using separate compression for foreground and background
+#   Copyright (C) 2025  Lee Yingtong Li
+#
+#   This program is free software: you can redistribute it and/or modify
+#   it under the terms of the GNU Affero General Public License as published by
+#   the Free Software Foundation, either version 3 of the License, or
+#   (at your option) any later version.
+#
+#   This program is distributed in the hope that it will be useful,
+#   but WITHOUT ANY WARRANTY; without even the implied warranty of
+#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+#   GNU Affero General Public License for more details.
+#
+#   You should have received a copy of the GNU Affero General Public License
+#   along with this program.  If not, see <https://www.gnu.org/licenses/>.
+
+from . import CompressedLayer
+from ..util import assert_has_c44, assert_has_djvuextract
+
+from PIL import Image
+
+from dataclasses import dataclass
+import io
+import os
+import subprocess
+import tempfile
+
+@dataclass
+class IW44Layer(CompressedLayer):
+	# Path to a temporary file containing the IW44 data extracted by djvuextract
+	filename: str
+	
+	def cleanup(self):
+		# Delete the temporary IW44 file
+		os.unlink(self.filename)
+
+def iw44_compress_layer(layer: Image, dpi: float, tempdir: str) -> IW44Layer:
+	# Compress the given layer to IW44 using c44, then extract the BG44 data using djvuextract
+	# Temporary files are created in tempdir; the returned layer's file is removed by its cleanup()
+	
+	assert_has_c44('IW44 compression requires DjvuLibre')
+	assert_has_djvuextract('IW44 compression requires DjvuLibre')
+	
+	# Save image to PPM temporarily
+	# (mkstemp returns an open file descriptor, which must be closed or one is leaked per call)
+	fd, ppm_file = tempfile.mkstemp(suffix='.ppm', dir=tempdir)
+	os.close(fd)
+	layer.save(ppm_file, format='ppm')
+	
+	# Convert image to IW44
+	fd, djvu_file = tempfile.mkstemp(suffix='.djvu', dir=tempdir)
+	os.close(fd)
+	subprocess.run(['c44', '-dpi', str(round(dpi)), ppm_file, djvu_file], check=True)
+	
+	# Extract background IW44 file
+	fd, iw44_file = tempfile.mkstemp(suffix='.iw44', dir=tempdir)
+	os.close(fd)
+	subprocess.run(['djvuextract', djvu_file, 'BG44={}'.format(iw44_file)], check=True, capture_output=True)
+	
+	# Clean up
+	os.unlink(ppm_file)
+	os.unlink(djvu_file)
+	
+	return IW44Layer(filename=iw44_file)
diff --git a/pdf_segmented/compression/jb2.py b/pdf_segmented/compression/jb2.py
new file mode 100644
index 0000000..02b3956
--- /dev/null
+++ b/pdf_segmented/compression/jb2.py
@@ -0,0 +1,58 @@
+#   pdf-segmented: Generate PDFs using separate compression for foreground and background
+#   Copyright (C) 2025  Lee Yingtong Li
+#
+#   This program is free software: you can redistribute it and/or modify
+#   it under the terms of the GNU Affero General Public License as published by
+#   the Free Software Foundation, either version 3 of the License, or
+#   (at your option) any later version.
+#
+#   This program is distributed in the hope that it will be useful,
+#   but WITHOUT ANY WARRANTY; without even the implied warranty of
+#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+#   GNU Affero General Public License for more details.
+#
+#   You should have received a copy of the GNU Affero General Public License
+#   along with this program.  If not, see <https://www.gnu.org/licenses/>.
+
+from . import CompressedLayer
+from ..util import assert_has_cjb2
+
+from PIL import Image
+
+from dataclasses import dataclass
+import io
+import os
+import subprocess
+import tempfile
+
+@dataclass
+class JB2Layer(CompressedLayer):
+	# Path to the temporary DJVU file written by cjb2, which holds the JB2 data
+	filename: str
+	
+	def cleanup(self):
+		# Delete the temporary cjb2 output file
+		os.unlink(self.filename)
+
+def jb2_compress_layer(layer: Image, dpi: float, tempdir: str) -> JB2Layer:
+	# Compress the given layer to JB2 using cjb2, after converting it to a 1-bit image
+	# Temporary files are created in tempdir; the returned layer's file is removed by its cleanup()
+	
+	assert_has_cjb2('JB2 compression requires DjvuLibre')
+	
+	# Save image to PBM temporarily
+	# (mkstemp returns an open file descriptor, which must be closed or one is leaked per call)
+	fd, pbm_file = tempfile.mkstemp(suffix='.pbm', dir=tempdir)
+	os.close(fd)
+	# NOTE(review): Pillow's 'ppm' writer is expected to emit PBM data for a mode '1' image - confirm
+	layer.convert('1').save(pbm_file, format='ppm')
+	
+	# Convert image to JB2
+	fd, jb2_file = tempfile.mkstemp(suffix='.djvu', dir=tempdir)
+	os.close(fd)
+	subprocess.run(['cjb2', '-dpi', str(round(dpi)), pbm_file, jb2_file], check=True)
+	
+	# Clean up
+	os.unlink(pbm_file)
+	
+	return JB2Layer(filename=jb2_file)
diff --git a/pdf_segmented/output/djvu.py b/pdf_segmented/output/djvu.py
new file mode 100644
index 0000000..9675caa
--- /dev/null
+++ b/pdf_segmented/output/djvu.py
@@ -0,0 +1,68 @@
+#   pdf-segmented: Generate PDFs using separate compression for foreground and background
+#   Copyright (C) 2025  Lee Yingtong Li
+#
+#   This program is free software: you can redistribute it and/or modify
+#   it under the terms of the GNU Affero General Public License as published by
+#   the Free Software Foundation, either version 3 of the License, or
+#   (at your option) any later version.
+#
+#   This program is distributed in the hope that it will be useful,
+#   but WITHOUT ANY WARRANTY; without even the implied warranty of
+#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+#   GNU Affero General Public License for more details.
+#
+#   You should have received a copy of the GNU Affero General Public License
+#   along with this program.  If not, see <https://www.gnu.org/licenses/>.
+
+from ..compression import CompressedPage
+from ..input import InputPages
+from ..util import assert_has_djvm, assert_has_djvumake
+
+import os
+import subprocess
+import tempfile
+from typing import Iterable
+
+def djvu_write_pages(
+	input_pages: InputPages,
+	compressed_pages: Iterable[CompressedPage],
+	output_file: str,
+	tempdir: str
+) -> None:
+	
+	# Build one single-page DJVU file per compressed page using djvumake, then bundle them into output_file using djvm
+	# Each page's compressed layers are cleaned up as soon as that page has been processed, whether or not it succeeded
+	
+	assert_has_djvm('DJVU output requires DjvuLibre')
+	assert_has_djvumake('DJVU output requires DjvuLibre')
+	
+	djvu_page_files = []
+	
+	try:
+		# Write each page
+		for compressed_page in compressed_pages:
+			try:
+				# Combine foreground and background
+				# (mkstemp returns an open file descriptor, which must be closed or one is leaked per page)
+				fd, page_djvu_file = tempfile.mkstemp(suffix='.djvu', dir=tempdir)
+				os.close(fd)
+				
+				# Register the file straight away so it is removed below even if djvumake fails
+				djvu_page_files.append(page_djvu_file)
+				
+				# TODO: Handle case where empty background or foreground
+				args = ['djvumake', page_djvu_file, 'INFO={},{},{}'.format(input_pages.width, input_pages.height, round(input_pages.dpi))]
+				args.append('Sjbz={}'.format(compressed_page.fg.filename))
+				args.append('BG44={}'.format(compressed_page.bg.filename))
+				subprocess.run(args, check=True, capture_output=True)
+			finally:
+				# Clean up
+				compressed_page.bg.cleanup()
+				compressed_page.fg.cleanup()
+		
+		# Combine pages
+		subprocess.run(['djvm', '-c', output_file] + djvu_page_files, check=True)
+	finally:
+		# Clean up
+		for page_djvu_file in djvu_page_files:
+			os.unlink(page_djvu_file)
diff --git a/pdf_segmented/output/pdf.py b/pdf_segmented/output/pdf.py
index f098b39..622f269 100644
--- a/pdf_segmented/output/pdf.py
+++ b/pdf_segmented/output/pdf.py
@@ -42,10 +42,15 @@ def pdf_write_pages(
 		page = pdf.add_blank_page(page_size=(width_pt, height_pt))
 		
 		# Write each layer to the page
+		# TODO: Handle case where empty background or foreground
 		content_instructions = []
 		pdf_write_layer(input_pages=input_pages, pdf=pdf, page=page, layer=compressed_page.bg, is_foreground=False, content_instructions=content_instructions)
 		pdf_write_layer(input_pages=input_pages, pdf=pdf, page=page, layer=compressed_page.fg, is_foreground=True, content_instructions=content_instructions)
 		
+		# Clean up
+		compressed_page.bg.cleanup()
+		compressed_page.fg.cleanup()
+		
 		# Generate content stream
 		wrapped_instructions = [
 			ContentStreamInstruction([], Operator('q')),
diff --git a/pdf_segmented/util.py b/pdf_segmented/util.py
index 7ae2547..a541371 100644
--- a/pdf_segmented/util.py
+++ b/pdf_segmented/util.py
@@ -17,6 +17,31 @@
 import shutil
 import sys
 
+def assert_has_c44(error_message: str = 'DjvuLibre is required') -> None:
+	if shutil.which('c44') is None:
+		print('Error: {} (c44 not found on PATH)'.format(error_message), file=sys.stderr)
+		sys.exit(1)
+
+def assert_has_cjb2(error_message: str = 'DjvuLibre is required') -> None:
+	if shutil.which('cjb2') is None:
+		print('Error: {} (cjb2 not found on PATH)'.format(error_message), file=sys.stderr)
+		sys.exit(1)
+
+def assert_has_djvm(error_message: str = 'DjvuLibre is required') -> None:
+	if shutil.which('djvm') is None:
+		print('Error: {} (djvm not found on PATH)'.format(error_message), file=sys.stderr)
+		sys.exit(1)
+
+def assert_has_djvuextract(error_message: str = 'DjvuLibre is required') -> None:
+	if shutil.which('djvuextract') is None:
+		print('Error: {} (djvuextract not found on PATH)'.format(error_message), file=sys.stderr)
+		sys.exit(1)
+
+def assert_has_djvumake(error_message: str = 'DjvuLibre is required') -> None:
+	if shutil.which('djvumake') is None:
+		print('Error: {} (djvumake not found on PATH)'.format(error_message), file=sys.stderr)
+		sys.exit(1)
+
 def assert_has_imagemagick(error_message: str = 'ImageMagick is required') -> None:
 	if shutil.which('magick') is None:
 		print('Error: {} (magick not found on PATH)'.format(error_message), file=sys.stderr)