Compare commits
3 Commits
60c6ee92e5
...
edba14ff2c
Author | SHA1 | Date | |
---|---|---|---|
edba14ff2c | |||
4abbe79d5a | |||
f6cbe3215b |
@ -1,6 +1,6 @@
|
|||||||
# pdf-segmented
|
# pdf-segmented
|
||||||
|
|
||||||
Generate PDFs using separate compression for foreground and background
|
Generate PDFs (or DJVU) using separate compression for foreground and background
|
||||||
|
|
||||||
## Usage
|
## Usage
|
||||||
|
|
||||||
@ -14,7 +14,9 @@ All black pixels (#000000) will be considered to be foreground, and all remainin
|
|||||||
|
|
||||||
The foreground will be compressed losslessly using [JBIG2](https://en.wikipedia.org/wiki/JBIG2). The background will be compressed lossily using [JPEG](https://en.wikipedia.org/wiki/JPEG). JPEG quality can be controlled using the `--jpeg-quality` option; the default is the Pillow default (75% at time of writing).
|
The foreground will be compressed losslessly using [JBIG2](https://en.wikipedia.org/wiki/JBIG2). The background will be compressed lossily using [JPEG](https://en.wikipedia.org/wiki/JPEG). JPEG quality can be controlled using the `--jpeg-quality` option; the default is the Pillow default (75% at time of writing).
|
||||||
|
|
||||||
Dependencies:
|
Additional compression algorithms are supported (JPEG 2000, PNG); see `--help` for detailed options. DJVU output (foreground JB2, background IW44) is also supported.
|
||||||
|
|
||||||
|
## Dependencies
|
||||||
|
|
||||||
* [Python 3](https://www.python.org/) (tested using 3.13.3)
|
* [Python 3](https://www.python.org/) (tested using 3.13.3)
|
||||||
* [NumPy](https://numpy.org/) (tested using 2.2.5)
|
* [NumPy](https://numpy.org/) (tested using 2.2.5)
|
||||||
|
@ -14,8 +14,9 @@
|
|||||||
# You should have received a copy of the GNU Affero General Public License
|
# You should have received a copy of the GNU Affero General Public License
|
||||||
# along with this program. If not, see <https://www.gnu.org/licenses/>.
|
# along with this program. If not, see <https://www.gnu.org/licenses/>.
|
||||||
|
|
||||||
from .compression import compress_pages
|
from .compression import CompressionOptions, compress_pages
|
||||||
from .input.xcf import xcf_get_pages
|
from .input.xcf import xcf_get_pages
|
||||||
|
from .output.djvu import djvu_write_pages
|
||||||
from .output.pdf import pdf_write_pages
|
from .output.pdf import pdf_write_pages
|
||||||
from .segmentation import segment_pages
|
from .segmentation import segment_pages
|
||||||
|
|
||||||
@ -29,9 +30,9 @@ def convert_file(
|
|||||||
output_file: str,
|
output_file: str,
|
||||||
input_format: Optional[str] = None,
|
input_format: Optional[str] = None,
|
||||||
output_format: Optional[str] = None,
|
output_format: Optional[str] = None,
|
||||||
fg_compression: str = 'jbig2',
|
fg_compression: Optional[str] = None,
|
||||||
bg_compression: str = 'jpeg',
|
bg_compression: Optional[str] = None,
|
||||||
jpeg_quality: Optional[float] = None
|
options: CompressionOptions = CompressionOptions()
|
||||||
) -> None:
|
) -> None:
|
||||||
# Create temporary directory
|
# Create temporary directory
|
||||||
tempdir = tempfile.mkdtemp('pdf-segmented')
|
tempdir = tempfile.mkdtemp('pdf-segmented')
|
||||||
@ -43,11 +44,48 @@ def convert_file(
|
|||||||
input_format = 'xcf'
|
input_format = 'xcf'
|
||||||
else:
|
else:
|
||||||
print('Warning: Unknown input file extension, assuming XCF', file=sys.stderr)
|
print('Warning: Unknown input file extension, assuming XCF', file=sys.stderr)
|
||||||
|
|
||||||
if output_format is None:
|
if output_format is None:
|
||||||
if output_file.endswith('.pdf'):
|
if output_file.endswith('.pdf'):
|
||||||
output_format = 'pdf'
|
output_format = 'pdf'
|
||||||
|
elif output_file.endswith('.djvu'):
|
||||||
|
output_format = 'djvu'
|
||||||
else:
|
else:
|
||||||
print('Warning: Unknown output file extension, assuming PDF', file=sys.stderr)
|
print('Error: Unknown output file extension (try --output-format)', file=sys.stderr)
|
||||||
|
sys.exit(1)
|
||||||
|
|
||||||
|
if fg_compression is None:
|
||||||
|
if output_format == 'pdf':
|
||||||
|
fg_compression = 'jbig2'
|
||||||
|
elif output_format == 'djvu':
|
||||||
|
fg_compression = 'jb2'
|
||||||
|
else:
|
||||||
|
raise NotImplementedError()
|
||||||
|
|
||||||
|
if bg_compression is None:
|
||||||
|
if output_format == 'pdf':
|
||||||
|
bg_compression = 'jpeg'
|
||||||
|
elif output_format == 'djvu':
|
||||||
|
bg_compression = 'iw44'
|
||||||
|
else:
|
||||||
|
raise NotImplementedError()
|
||||||
|
|
||||||
|
# Validate format compatibility
|
||||||
|
if output_format == 'pdf':
|
||||||
|
if bg_compression not in ('jp2', 'jpeg', 'png'):
|
||||||
|
print('Error: Unsupported --bg-compression for PDF format (supported: jp2, jpeg, png)')
|
||||||
|
sys.exit(1)
|
||||||
|
if fg_compression not in ('jbig2', 'png'):
|
||||||
|
print('Error: Unsupported --fg-compression for PDF format (supported: jp2, jpeg, png)')
|
||||||
|
sys.exit(1)
|
||||||
|
|
||||||
|
if output_format == 'djvu':
|
||||||
|
if bg_compression != 'iw44':
|
||||||
|
print('Error: Unsupported --bg-compression for DJVU format (supported: iw44)')
|
||||||
|
sys.exit(1)
|
||||||
|
if fg_compression != 'jb2':
|
||||||
|
print('Error: Unsupported --fg-compression for DJVU format (supported: jb2)')
|
||||||
|
sys.exit(1)
|
||||||
|
|
||||||
# Get input pages
|
# Get input pages
|
||||||
if input_format == 'xcf':
|
if input_format == 'xcf':
|
||||||
@ -60,10 +98,11 @@ def convert_file(
|
|||||||
|
|
||||||
# Compress layers
|
# Compress layers
|
||||||
compressed_pages = compress_pages(
|
compressed_pages = compress_pages(
|
||||||
|
input_pages=input_pages,
|
||||||
segmented_pages=segmented_pages,
|
segmented_pages=segmented_pages,
|
||||||
fg_compression=fg_compression,
|
fg_compression=fg_compression,
|
||||||
bg_compression=bg_compression,
|
bg_compression=bg_compression,
|
||||||
jpeg_quality=jpeg_quality,
|
options=options,
|
||||||
tempdir=tempdir
|
tempdir=tempdir
|
||||||
)
|
)
|
||||||
|
|
||||||
@ -74,6 +113,13 @@ def convert_file(
|
|||||||
compressed_pages=compressed_pages,
|
compressed_pages=compressed_pages,
|
||||||
output_file=output_file
|
output_file=output_file
|
||||||
)
|
)
|
||||||
|
elif output_format == 'djvu':
|
||||||
|
djvu_write_pages(
|
||||||
|
input_pages=input_pages,
|
||||||
|
compressed_pages=compressed_pages,
|
||||||
|
output_file=output_file,
|
||||||
|
tempdir=tempdir
|
||||||
|
)
|
||||||
else:
|
else:
|
||||||
raise NotImplementedError()
|
raise NotImplementedError()
|
||||||
finally:
|
finally:
|
||||||
|
@ -15,6 +15,7 @@
|
|||||||
# along with this program. If not, see <https://www.gnu.org/licenses/>.
|
# along with this program. If not, see <https://www.gnu.org/licenses/>.
|
||||||
|
|
||||||
from . import convert_file
|
from . import convert_file
|
||||||
|
from .compression import CompressionOptions
|
||||||
|
|
||||||
import argparse
|
import argparse
|
||||||
|
|
||||||
@ -26,9 +27,11 @@ parser = argparse.ArgumentParser(
|
|||||||
parser.add_argument('input_file')
|
parser.add_argument('input_file')
|
||||||
parser.add_argument('output_file')
|
parser.add_argument('output_file')
|
||||||
parser.add_argument('--input-format', choices=['xcf'])
|
parser.add_argument('--input-format', choices=['xcf'])
|
||||||
parser.add_argument('--output-format', choices=['pdf'])
|
parser.add_argument('--output-format', choices=['pdf', 'djvu'])
|
||||||
parser.add_argument('--fg-compression', default='jbig2', choices=['jbig2'])
|
parser.add_argument('--fg-compression', choices=['jbig2', 'png', 'jb2'])
|
||||||
parser.add_argument('--bg-compression', default='jpeg', choices=['jpeg'])
|
parser.add_argument('--bg-compression', choices=['jpeg', 'jp2', 'png', 'iw44'])
|
||||||
|
parser.add_argument('--jp2-lossless', action='store_true')
|
||||||
|
parser.add_argument('--jp2-rate', type=float)
|
||||||
parser.add_argument('--jpeg-quality', type=float)
|
parser.add_argument('--jpeg-quality', type=float)
|
||||||
|
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
@ -41,5 +44,9 @@ convert_file(
|
|||||||
output_format=args.output_format,
|
output_format=args.output_format,
|
||||||
fg_compression=args.fg_compression,
|
fg_compression=args.fg_compression,
|
||||||
bg_compression=args.bg_compression,
|
bg_compression=args.bg_compression,
|
||||||
jpeg_quality=args.jpeg_quality
|
options=CompressionOptions(
|
||||||
|
jp2_lossless=args.jp2_lossless,
|
||||||
|
jp2_rate=args.jp2_rate,
|
||||||
|
jpeg_quality=args.jpeg_quality
|
||||||
|
)
|
||||||
)
|
)
|
||||||
|
@ -17,12 +17,17 @@
|
|||||||
class CompressedLayer:
|
class CompressedLayer:
|
||||||
# Superclass for all compressed layer types (JPEG, JBIG2, etc.)
|
# Superclass for all compressed layer types (JPEG, JBIG2, etc.)
|
||||||
|
|
||||||
def cleanup():
|
def cleanup(self):
|
||||||
# Clean up any temporary files, etc.
|
# Clean up any temporary files, etc.
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
from .iw44 import iw44_compress_layer
|
||||||
|
from .jb2 import jb2_compress_layer
|
||||||
from .jbig2 import jbig2_compress_layer
|
from .jbig2 import jbig2_compress_layer
|
||||||
|
from .jp2 import jp2_compress_layer
|
||||||
from .jpeg import jpeg_compress_layer
|
from .jpeg import jpeg_compress_layer
|
||||||
|
from .png import png_compress_layer
|
||||||
|
from ..input import InputPages
|
||||||
from ..segmentation import SegmentedPage
|
from ..segmentation import SegmentedPage
|
||||||
|
|
||||||
from PIL import Image
|
from PIL import Image
|
||||||
@ -30,64 +35,87 @@ from PIL import Image
|
|||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
from typing import Generator, Iterable, Optional
|
from typing import Generator, Iterable, Optional
|
||||||
|
|
||||||
|
@dataclass(frozen=True)
class CompressionOptions:
    # Codec-specific tuning knobs, passed unchanged from the caller down to the individual layer compressors
    # Frozen because a single instance is used as the default value of convert_file's `options` parameter;
    # an immutable object cannot leak modifications from one call into the next

    # JPEG 2000: True selects the reversible (lossless) transform, False the irreversible (lossy) one
    jp2_lossless: bool = False
    # JPEG 2000: rate forwarded to the encoder as its only quality layer; None leaves the encoder default
    jp2_rate: Optional[float] = None
    # JPEG: quality forwarded to the JPEG compressor; None leaves the encoder default
    jpeg_quality: Optional[float] = None
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
class CompressedPage:
|
class CompressedPage:
|
||||||
fg: CompressedLayer
|
fg: CompressedLayer
|
||||||
bg: CompressedLayer
|
bg: CompressedLayer
|
||||||
|
|
||||||
def compress_pages(
|
def compress_pages(
|
||||||
|
input_pages: InputPages,
|
||||||
segmented_pages: Iterable[SegmentedPage],
|
segmented_pages: Iterable[SegmentedPage],
|
||||||
fg_compression: str,
|
fg_compression: str,
|
||||||
bg_compression: str,
|
bg_compression: str,
|
||||||
jpeg_quality: Optional[float],
|
options: CompressionOptions,
|
||||||
tempdir: str
|
tempdir: str
|
||||||
) -> Generator[CompressedPage]:
|
) -> Generator[CompressedPage]:
|
||||||
|
|
||||||
# Compress foreground and background layers on each segmented page
|
# Compress foreground and background layers on each segmented page
|
||||||
for segmented_page in segmented_pages:
|
for segmented_page in segmented_pages:
|
||||||
yield compress_page(
|
yield compress_page(
|
||||||
|
input_pages=input_pages,
|
||||||
segmented_page=segmented_page,
|
segmented_page=segmented_page,
|
||||||
fg_compression=fg_compression,
|
fg_compression=fg_compression,
|
||||||
bg_compression=bg_compression,
|
bg_compression=bg_compression,
|
||||||
jpeg_quality=jpeg_quality,
|
options=options,
|
||||||
tempdir=tempdir
|
tempdir=tempdir
|
||||||
)
|
)
|
||||||
|
|
||||||
def compress_page(
|
def compress_page(
|
||||||
|
input_pages: InputPages,
|
||||||
segmented_page: SegmentedPage,
|
segmented_page: SegmentedPage,
|
||||||
fg_compression: str,
|
fg_compression: str,
|
||||||
bg_compression: str,
|
bg_compression: str,
|
||||||
jpeg_quality: Optional[float],
|
options: CompressionOptions,
|
||||||
tempdir: str
|
tempdir: str
|
||||||
) -> CompressedPage:
|
) -> CompressedPage:
|
||||||
|
|
||||||
# Compress foreground and background layers
|
# Compress foreground and background layers
|
||||||
return CompressedPage(
|
return CompressedPage(
|
||||||
fg=compress_layer(
|
fg=compress_layer(
|
||||||
|
input_pages=input_pages,
|
||||||
layer=segmented_page.fg,
|
layer=segmented_page.fg,
|
||||||
compression=fg_compression,
|
compression=fg_compression,
|
||||||
jpeg_quality=jpeg_quality,
|
is_foreground=True,
|
||||||
|
options=options,
|
||||||
tempdir=tempdir
|
tempdir=tempdir
|
||||||
),
|
),
|
||||||
bg=compress_layer(
|
bg=compress_layer(
|
||||||
|
input_pages=input_pages,
|
||||||
layer=segmented_page.bg,
|
layer=segmented_page.bg,
|
||||||
compression=bg_compression,
|
compression=bg_compression,
|
||||||
jpeg_quality=jpeg_quality,
|
is_foreground=False,
|
||||||
|
options=options,
|
||||||
tempdir=tempdir
|
tempdir=tempdir
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
def compress_layer(
|
def compress_layer(
|
||||||
|
input_pages: InputPages,
|
||||||
layer: Image,
|
layer: Image,
|
||||||
compression: str,
|
compression: str,
|
||||||
jpeg_quality: Optional[float],
|
is_foreground: bool,
|
||||||
|
options: CompressionOptions,
|
||||||
tempdir: str
|
tempdir: str
|
||||||
) -> CompressedLayer:
|
) -> CompressedLayer:
|
||||||
|
|
||||||
# Compress the given layer
|
# Compress the given layer
|
||||||
if compression == 'jbig2':
|
if compression == 'iw44':
|
||||||
|
return iw44_compress_layer(layer=layer, dpi=input_pages.dpi, tempdir=tempdir)
|
||||||
|
elif compression == 'jb2':
|
||||||
|
return jb2_compress_layer(layer=layer, dpi=input_pages.dpi, tempdir=tempdir)
|
||||||
|
elif compression == 'jbig2':
|
||||||
return jbig2_compress_layer(layer=layer, tempdir=tempdir)
|
return jbig2_compress_layer(layer=layer, tempdir=tempdir)
|
||||||
|
elif compression == 'jp2':
|
||||||
|
return jp2_compress_layer(layer=layer, jp2_lossless=options.jp2_lossless, jp2_rate=options.jp2_rate)
|
||||||
elif compression == 'jpeg':
|
elif compression == 'jpeg':
|
||||||
return jpeg_compress_layer(layer=layer, jpeg_quality=jpeg_quality)
|
return jpeg_compress_layer(layer=layer, jpeg_quality=options.jpeg_quality)
|
||||||
|
elif compression == 'png':
|
||||||
|
return png_compress_layer(layer=layer, is_foreground=is_foreground)
|
||||||
else:
|
else:
|
||||||
raise NotImplementedError()
|
raise NotImplementedError()
|
||||||
|
55
pdf_segmented/compression/iw44.py
Normal file
55
pdf_segmented/compression/iw44.py
Normal file
@ -0,0 +1,55 @@
|
|||||||
|
# pdf-segmented: Generate PDFs using separate compression for foreground and background
|
||||||
|
# Copyright (C) 2025 Lee Yingtong Li
|
||||||
|
#
|
||||||
|
# This program is free software: you can redistribute it and/or modify
|
||||||
|
# it under the terms of the GNU Affero General Public License as published by
|
||||||
|
# the Free Software Foundation, either version 3 of the License, or
|
||||||
|
# (at your option) any later version.
|
||||||
|
#
|
||||||
|
# This program is distributed in the hope that it will be useful,
|
||||||
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
# GNU Affero General Public License for more details.
|
||||||
|
#
|
||||||
|
# You should have received a copy of the GNU Affero General Public License
|
||||||
|
# along with this program. If not, see <https://www.gnu.org/licenses/>.
|
||||||
|
|
||||||
|
from . import CompressedLayer
|
||||||
|
from ..util import assert_has_c44, assert_has_djvuextract
|
||||||
|
|
||||||
|
from PIL import Image
|
||||||
|
|
||||||
|
from dataclasses import dataclass
|
||||||
|
import io
|
||||||
|
import os
|
||||||
|
import subprocess
|
||||||
|
import tempfile
|
||||||
|
|
||||||
|
@dataclass
class IW44Layer(CompressedLayer):
    # Layer compressed with IW44, stored on disk rather than in memory

    # Path of the file holding the compressed data
    filename: str

    def cleanup(self):
        # Delete the backing file once the layer has been consumed
        os.remove(self.filename)
|
||||||
|
|
||||||
|
def _iw44_temp_path(suffix: str, tempdir: str) -> str:
    # Reserve a uniquely named empty file inside tempdir and return its path
    # mkstemp also hands back an open OS-level descriptor; the file is only ever accessed by name,
    # so the descriptor is closed immediately instead of being leaked
    fd, path = tempfile.mkstemp(suffix=suffix, dir=tempdir)
    os.close(fd)
    return path

def iw44_compress_layer(layer: Image, dpi: float, tempdir: str) -> IW44Layer:
    # Compress the layer to IW44 using the DjvuLibre command line tools
    #
    # layer: image to compress
    # dpi: resolution passed to c44 (rounded to the nearest integer)
    # tempdir: directory in which all temporary files are created
    #
    # Returns an IW44Layer whose file must be removed by the caller via cleanup()
    # Raises subprocess.CalledProcessError if c44 or djvuextract exits with a non-zero status
    assert_has_c44('IW44 compression requires DjvuLibre')
    assert_has_djvuextract('IW44 compression requires DjvuLibre')

    ppm_file = _iw44_temp_path('.ppm', tempdir)
    djvu_file = _iw44_temp_path('.djvu', tempdir)
    iw44_file = _iw44_temp_path('.iw44', tempdir)

    try:
        # Save image to PPM temporarily
        layer.save(ppm_file, format='ppm')

        # Convert image to IW44 (wrapped in a DJVU file)
        subprocess.run(['c44', '-dpi', str(round(dpi)), ppm_file, djvu_file], check=True)

        # Extract background IW44 file
        subprocess.run(['djvuextract', djvu_file, 'BG44={}'.format(iw44_file)], check=True, capture_output=True)
    except BaseException:
        # Nothing will be returned, so nobody else would ever delete the output file
        os.unlink(iw44_file)
        raise
    finally:
        # Clean up the intermediate files whether or not the conversion succeeded
        os.unlink(ppm_file)
        os.unlink(djvu_file)

    return IW44Layer(filename=iw44_file)
|
49
pdf_segmented/compression/jb2.py
Normal file
49
pdf_segmented/compression/jb2.py
Normal file
@ -0,0 +1,49 @@
|
|||||||
|
# pdf-segmented: Generate PDFs using separate compression for foreground and background
|
||||||
|
# Copyright (C) 2025 Lee Yingtong Li
|
||||||
|
#
|
||||||
|
# This program is free software: you can redistribute it and/or modify
|
||||||
|
# it under the terms of the GNU Affero General Public License as published by
|
||||||
|
# the Free Software Foundation, either version 3 of the License, or
|
||||||
|
# (at your option) any later version.
|
||||||
|
#
|
||||||
|
# This program is distributed in the hope that it will be useful,
|
||||||
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
# GNU Affero General Public License for more details.
|
||||||
|
#
|
||||||
|
# You should have received a copy of the GNU Affero General Public License
|
||||||
|
# along with this program. If not, see <https://www.gnu.org/licenses/>.
|
||||||
|
|
||||||
|
from . import CompressedLayer
|
||||||
|
from ..util import assert_has_cjb2
|
||||||
|
|
||||||
|
from PIL import Image
|
||||||
|
|
||||||
|
from dataclasses import dataclass
|
||||||
|
import io
|
||||||
|
import os
|
||||||
|
import subprocess
|
||||||
|
import tempfile
|
||||||
|
|
||||||
|
@dataclass
class JB2Layer(CompressedLayer):
    # Layer compressed with JB2, stored on disk rather than in memory

    # Path of the file holding the compressed data
    filename: str

    def cleanup(self):
        # Delete the backing file once the layer has been consumed
        os.remove(self.filename)
|
||||||
|
|
||||||
|
def _jb2_temp_path(suffix: str, tempdir: str) -> str:
    # Reserve a uniquely named empty file inside tempdir and return its path
    # mkstemp also hands back an open OS-level descriptor; the file is only ever accessed by name,
    # so the descriptor is closed immediately instead of being leaked
    fd, path = tempfile.mkstemp(suffix=suffix, dir=tempdir)
    os.close(fd)
    return path

def jb2_compress_layer(layer: Image, dpi: float, tempdir: str) -> JB2Layer:
    # Compress the layer to JB2 using the DjvuLibre cjb2 tool
    #
    # layer: image to compress; it is converted to 1 bit per pixel first
    # dpi: resolution passed to cjb2 (rounded to the nearest integer)
    # tempdir: directory in which all temporary files are created
    #
    # Returns a JB2Layer whose file must be removed by the caller via cleanup()
    # Raises subprocess.CalledProcessError if cjb2 exits with a non-zero status
    assert_has_cjb2('JB2 compression requires DjvuLibre')

    pbm_file = _jb2_temp_path('.pbm', tempdir)
    jb2_file = _jb2_temp_path('.djvu', tempdir)

    try:
        # Save image to PBM temporarily
        # (Pillow's 'ppm' writer emits a bilevel PBM file for mode '1' images)
        layer.convert('1').save(pbm_file, format='ppm')

        # Convert image to JB2
        subprocess.run(['cjb2', '-dpi', str(round(dpi)), pbm_file, jb2_file], check=True)
    except BaseException:
        # Nothing will be returned, so nobody else would ever delete the output file
        os.unlink(jb2_file)
        raise
    finally:
        # Clean up the intermediate file whether or not the conversion succeeded
        os.unlink(pbm_file)

    return JB2Layer(filename=jb2_file)
|
39
pdf_segmented/compression/jp2.py
Normal file
39
pdf_segmented/compression/jp2.py
Normal file
@ -0,0 +1,39 @@
|
|||||||
|
# pdf-segmented: Generate PDFs using separate compression for foreground and background
|
||||||
|
# Copyright (C) 2025 Lee Yingtong Li
|
||||||
|
#
|
||||||
|
# This program is free software: you can redistribute it and/or modify
|
||||||
|
# it under the terms of the GNU Affero General Public License as published by
|
||||||
|
# the Free Software Foundation, either version 3 of the License, or
|
||||||
|
# (at your option) any later version.
|
||||||
|
#
|
||||||
|
# This program is distributed in the hope that it will be useful,
|
||||||
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
# GNU Affero General Public License for more details.
|
||||||
|
#
|
||||||
|
# You should have received a copy of the GNU Affero General Public License
|
||||||
|
# along with this program. If not, see <https://www.gnu.org/licenses/>.
|
||||||
|
|
||||||
|
from . import CompressedLayer
|
||||||
|
|
||||||
|
from PIL import Image
|
||||||
|
|
||||||
|
from dataclasses import dataclass
|
||||||
|
import io
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
|
@dataclass
class JP2Layer(CompressedLayer):
    # Layer compressed as JPEG 2000 and held entirely in memory

    # The encoded JPEG 2000 data
    data: bytes
|
||||||
|
|
||||||
|
def jp2_compress_layer(layer: Image, jp2_lossless: bool, jp2_rate: Optional[float]) -> JP2Layer:
    # Encode the layer as JPEG 2000 in memory
    #
    # jp2_lossless: True selects the reversible transform, False the irreversible one
    # jp2_rate: if given, used as the single quality layer in 'rates' mode; otherwise the encoder defaults apply

    # Options that are always passed to the encoder
    encoder_args = {'no_jp2': False, 'irreversible': not jp2_lossless}

    # Rate control is only requested when the caller supplied a rate
    if jp2_rate is not None:
        encoder_args.update(quality_mode='rates', quality_layers=[jp2_rate])

    # Save image to JPEG 2000
    buffer = io.BytesIO()
    layer.save(buffer, format='jpeg2000', **encoder_args)

    return JP2Layer(data=buffer.getvalue())
|
61
pdf_segmented/compression/png.py
Normal file
61
pdf_segmented/compression/png.py
Normal file
@ -0,0 +1,61 @@
|
|||||||
|
# pdf-segmented: Generate PDFs using separate compression for foreground and background
|
||||||
|
# Copyright (C) 2025 Lee Yingtong Li
|
||||||
|
#
|
||||||
|
# This program is free software: you can redistribute it and/or modify
|
||||||
|
# it under the terms of the GNU Affero General Public License as published by
|
||||||
|
# the Free Software Foundation, either version 3 of the License, or
|
||||||
|
# (at your option) any later version.
|
||||||
|
#
|
||||||
|
# This program is distributed in the hope that it will be useful,
|
||||||
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
# GNU Affero General Public License for more details.
|
||||||
|
#
|
||||||
|
# You should have received a copy of the GNU Affero General Public License
|
||||||
|
# along with this program. If not, see <https://www.gnu.org/licenses/>.
|
||||||
|
|
||||||
|
from . import CompressedLayer
|
||||||
|
|
||||||
|
from PIL import Image
|
||||||
|
|
||||||
|
from dataclasses import dataclass
|
||||||
|
import io
|
||||||
|
import struct
|
||||||
|
|
||||||
|
@dataclass
class PNGLayer(CompressedLayer):
    # Layer compressed as PNG and held entirely in memory

    # The complete PNG file
    data: bytes

    def get_flate_data(self):
        # Return the DEFLATE stream of the image, i.e. the payloads of all IDAT chunks joined in file order
        stream = io.BytesIO(self.data)
        stream.seek(8)  # Skip the 8-byte PNG header

        idat_payloads = []

        # Each chunk is: 4-byte big-endian payload length, 4-byte chunk type, payload, 4-byte CRC
        while (size_field := stream.read(4)) != b'':  # Empty read means EOF
            (payload_size,) = struct.unpack('>I', size_field)
            chunk_type = stream.read(4)
            payload = stream.read(payload_size)
            stream.read(4)  # CRC is read past but not checked

            # IDAT chunk contains DEFLATE data
            if chunk_type == b'IDAT':
                idat_payloads.append(payload)

        return b''.join(idat_payloads)
|
||||||
|
|
||||||
|
def png_compress_layer(layer: Image, is_foreground: bool) -> PNGLayer:
    # Encode the layer as an optimised PNG in memory
    #
    # is_foreground: foreground layers are reduced to 1bpp before encoding; background layers are encoded as given

    # Foreground is 1bpp
    source = layer.convert('1') if is_foreground else layer

    # Save image to PNG
    buffer = io.BytesIO()
    source.save(buffer, format='png', optimize=True)

    return PNGLayer(data=buffer.getvalue())
|
62
pdf_segmented/output/djvu.py
Normal file
62
pdf_segmented/output/djvu.py
Normal file
@ -0,0 +1,62 @@
|
|||||||
|
# pdf-segmented: Generate PDFs using separate compression for foreground and background
|
||||||
|
# Copyright (C) 2025 Lee Yingtong Li
|
||||||
|
#
|
||||||
|
# This program is free software: you can redistribute it and/or modify
|
||||||
|
# it under the terms of the GNU Affero General Public License as published by
|
||||||
|
# the Free Software Foundation, either version 3 of the License, or
|
||||||
|
# (at your option) any later version.
|
||||||
|
#
|
||||||
|
# This program is distributed in the hope that it will be useful,
|
||||||
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
# GNU Affero General Public License for more details.
|
||||||
|
#
|
||||||
|
# You should have received a copy of the GNU Affero General Public License
|
||||||
|
# along with this program. If not, see <https://www.gnu.org/licenses/>.
|
||||||
|
|
||||||
|
from ..compression import CompressedPage
|
||||||
|
from ..input import InputPages
|
||||||
|
from ..util import assert_has_djvm, assert_has_djvumake
|
||||||
|
|
||||||
|
import os
|
||||||
|
import subprocess
|
||||||
|
import tempfile
|
||||||
|
from typing import Generator
|
||||||
|
|
||||||
|
def djvu_write_pages(
    input_pages: InputPages,
    compressed_pages: Generator[CompressedPage],
    output_file: str,
    tempdir: str
) -> None:
    # Assemble the compressed pages into a single multi-page DJVU file using DjvuLibre
    #
    # input_pages: supplies the page width, height and DPI written to each page's INFO chunk
    # compressed_pages: pages to write; each page's fg and bg layers must expose a `filename`
    # output_file: path of the DJVU document to create
    # tempdir: directory in which the temporary single-page files are created
    #
    # Raises subprocess.CalledProcessError if djvumake or djvm exits with a non-zero status

    assert_has_djvm('DJVU output requires DjvuLibre')
    assert_has_djvumake('DJVU output requires DjvuLibre')

    djvu_page_files = []

    try:
        # Write each page
        for compressed_page in compressed_pages:
            try:
                # Combine foreground and background
                # mkstemp returns an open descriptor as well as the path; close it so it is not leaked
                fd, page_djvu_file = tempfile.mkstemp(suffix='.djvu', dir=tempdir)
                os.close(fd)

                # Register the file before running djvumake so that it is removed even if djvumake fails
                djvu_page_files.append(page_djvu_file)

                # TODO: Handle case where empty background or foreground
                args = ['djvumake', page_djvu_file, 'INFO={},{},{}'.format(input_pages.width, input_pages.height, round(input_pages.dpi))]
                args.append('Sjbz={}'.format(compressed_page.fg.filename))
                args.append('BG44={}'.format(compressed_page.bg.filename))
                subprocess.run(args, check=True, capture_output=True)
            finally:
                # Clean up; the foreground is released even if releasing the background fails
                try:
                    compressed_page.bg.cleanup()
                finally:
                    compressed_page.fg.cleanup()

        # Combine pages
        subprocess.run(['djvm', '-c', output_file] + djvu_page_files, check=True)
    finally:
        # Clean up
        for page_djvu_file in djvu_page_files:
            os.unlink(page_djvu_file)
|
@ -16,10 +16,12 @@
|
|||||||
|
|
||||||
from ..compression import CompressedLayer, CompressedPage
|
from ..compression import CompressedLayer, CompressedPage
|
||||||
from ..compression.jbig2 import JBIG2Layer
|
from ..compression.jbig2 import JBIG2Layer
|
||||||
|
from ..compression.jp2 import JP2Layer
|
||||||
from ..compression.jpeg import JPEGLayer
|
from ..compression.jpeg import JPEGLayer
|
||||||
|
from ..compression.png import PNGLayer
|
||||||
from ..input import InputPages
|
from ..input import InputPages
|
||||||
|
|
||||||
from pikepdf import ContentStreamInstruction, Name, Operator, Page, Pdf, Stream, unparse_content_stream
|
from pikepdf import ContentStreamInstruction, Dictionary, Name, Operator, Page, Pdf, Stream, unparse_content_stream
|
||||||
|
|
||||||
from typing import Generator
|
from typing import Generator
|
||||||
|
|
||||||
@ -40,9 +42,14 @@ def pdf_write_pages(
|
|||||||
page = pdf.add_blank_page(page_size=(width_pt, height_pt))
|
page = pdf.add_blank_page(page_size=(width_pt, height_pt))
|
||||||
|
|
||||||
# Write each layer to the page
|
# Write each layer to the page
|
||||||
|
# TODO: Handle case where empty background or foreground
|
||||||
content_instructions = []
|
content_instructions = []
|
||||||
pdf_write_layer(input_pages=input_pages, pdf=pdf, page=page, layer=compressed_page.bg, content_instructions=content_instructions)
|
pdf_write_layer(input_pages=input_pages, pdf=pdf, page=page, layer=compressed_page.bg, is_foreground=False, content_instructions=content_instructions)
|
||||||
pdf_write_layer(input_pages=input_pages, pdf=pdf, page=page, layer=compressed_page.fg, content_instructions=content_instructions)
|
pdf_write_layer(input_pages=input_pages, pdf=pdf, page=page, layer=compressed_page.fg, is_foreground=True, content_instructions=content_instructions)
|
||||||
|
|
||||||
|
# Clean up
|
||||||
|
compressed_page.bg.cleanup()
|
||||||
|
compressed_page.fg.cleanup()
|
||||||
|
|
||||||
# Generate content stream
|
# Generate content stream
|
||||||
wrapped_instructions = [
|
wrapped_instructions = [
|
||||||
@ -62,6 +69,7 @@ def pdf_write_layer(
|
|||||||
pdf: Pdf,
|
pdf: Pdf,
|
||||||
page: Page,
|
page: Page,
|
||||||
layer: CompressedLayer,
|
layer: CompressedLayer,
|
||||||
|
is_foreground: bool,
|
||||||
content_instructions,
|
content_instructions,
|
||||||
) -> None:
|
) -> None:
|
||||||
|
|
||||||
@ -78,6 +86,17 @@ def pdf_write_layer(
|
|||||||
BitsPerComponent=1,
|
BitsPerComponent=1,
|
||||||
Mask=[1, 1] # Layer mask
|
Mask=[1, 1] # Layer mask
|
||||||
)
|
)
|
||||||
|
elif isinstance(layer, JP2Layer):
|
||||||
|
pdf_write_image(
|
||||||
|
input_pages=input_pages,
|
||||||
|
pdf=pdf,
|
||||||
|
page=page,
|
||||||
|
value=layer.data,
|
||||||
|
content_instructions=content_instructions,
|
||||||
|
ColorSpace=Name.DeviceRGB,
|
||||||
|
Filter=Name.JPXDecode,
|
||||||
|
BitsPerComponent=8
|
||||||
|
)
|
||||||
elif isinstance(layer, JPEGLayer):
|
elif isinstance(layer, JPEGLayer):
|
||||||
pdf_write_image(
|
pdf_write_image(
|
||||||
input_pages=input_pages,
|
input_pages=input_pages,
|
||||||
@ -89,6 +108,42 @@ def pdf_write_layer(
|
|||||||
Filter=Name.DCTDecode,
|
Filter=Name.DCTDecode,
|
||||||
BitsPerComponent=8
|
BitsPerComponent=8
|
||||||
)
|
)
|
||||||
|
elif isinstance(layer, PNGLayer):
|
||||||
|
if is_foreground:
|
||||||
|
# See PDF 1.7 section 7.4.4.3
|
||||||
|
# See also the implementation in img2pdf
|
||||||
|
pdf_write_image(
|
||||||
|
input_pages=input_pages,
|
||||||
|
pdf=pdf,
|
||||||
|
page=page,
|
||||||
|
value=layer.get_flate_data(),
|
||||||
|
content_instructions=content_instructions,
|
||||||
|
ColorSpace=Name.DeviceGray,
|
||||||
|
Filter=Name.FlateDecode,
|
||||||
|
BitsPerComponent=1,
|
||||||
|
Mask=[1, 1], # Layer mask
|
||||||
|
DecodeParms=Dictionary(
|
||||||
|
Predictor=15, # PNG prediction (on encoding, PNG optimum) - this is the only allowed value in a PNG file
|
||||||
|
BitsPerComponent=1, # Default is 8 so must set this here
|
||||||
|
Columns=input_pages.width
|
||||||
|
)
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
pdf_write_image(
|
||||||
|
input_pages=input_pages,
|
||||||
|
pdf=pdf,
|
||||||
|
page=page,
|
||||||
|
value=layer.get_flate_data(),
|
||||||
|
content_instructions=content_instructions,
|
||||||
|
ColorSpace=Name.DeviceRGB,
|
||||||
|
Filter=Name.FlateDecode,
|
||||||
|
BitsPerComponent=8,
|
||||||
|
DecodeParms=Dictionary(
|
||||||
|
Predictor=15,
|
||||||
|
Colors=3, # Default is 1 so must set this here
|
||||||
|
Columns=input_pages.width
|
||||||
|
)
|
||||||
|
)
|
||||||
else:
|
else:
|
||||||
raise NotImplementedError()
|
raise NotImplementedError()
|
||||||
|
|
||||||
|
@ -17,6 +17,31 @@
|
|||||||
import shutil
|
import shutil
|
||||||
import sys
|
import sys
|
||||||
|
|
||||||
|
def _assert_has_executable(executable: str, error_message: str) -> None:
    # Shared implementation of the DjvuLibre assert_has_* checks below
    # If `executable` cannot be found on PATH, print an error to stderr and exit with status 1
    if shutil.which(executable) is None:
        print('Error: {} ({} not found on PATH)'.format(error_message, executable), file=sys.stderr)
        sys.exit(1)

def assert_has_c44(error_message: str = 'DjvuLibre is required') -> None:
    # Exit with an error unless the c44 tool is available
    _assert_has_executable('c44', error_message)

def assert_has_cjb2(error_message: str = 'DjvuLibre is required') -> None:
    # Exit with an error unless the cjb2 tool is available
    _assert_has_executable('cjb2', error_message)

def assert_has_djvm(error_message: str = 'DjvuLibre is required') -> None:
    # Exit with an error unless the djvm tool is available
    _assert_has_executable('djvm', error_message)

def assert_has_djvuextract(error_message: str = 'DjvuLibre is required') -> None:
    # Exit with an error unless the djvuextract tool is available
    _assert_has_executable('djvuextract', error_message)

def assert_has_djvumake(error_message: str = 'DjvuLibre is required') -> None:
    # Exit with an error unless the djvumake tool is available
    _assert_has_executable('djvumake', error_message)
|
||||||
|
|
||||||
def assert_has_imagemagick(error_message: str = 'ImageMagick is required') -> None:
|
def assert_has_imagemagick(error_message: str = 'ImageMagick is required') -> None:
|
||||||
if shutil.which('magick') is None:
|
if shutil.which('magick') is None:
|
||||||
print('Error: {} (magick not found on PATH)'.format(error_message), file=sys.stderr)
|
print('Error: {} (magick not found on PATH)'.format(error_message), file=sys.stderr)
|
||||||
|
Loading…
x
Reference in New Issue
Block a user