172 lines
4.9 KiB
Python

# pdf-segmented: Generate PDFs using separate compression for foreground and background
# Copyright (C) 2025 Lee Yingtong Li
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
from ..compression import CompressedLayer, CompressedPage
from ..compression.jbig2 import JBIG2Layer
from ..compression.jp2 import JP2Layer
from ..compression.jpeg import JPEGLayer
from ..compression.png import PNGLayer
from ..input import InputPages
from pikepdf import ContentStreamInstruction, Dictionary, Name, Operator, Page, Pdf, Stream, unparse_content_stream
from typing import Generator
def pdf_write_pages(
    input_pages: InputPages,
    compressed_pages: Generator[CompressedPage],
    output_file: str
) -> None:
    """Build a PDF with one page per compressed page and save it.

    Every page gets the same size, derived from the pixel dimensions and
    DPI of ``input_pages``. On each page the background layer is drawn
    first and the foreground layer second.

    Args:
        input_pages: Supplies ``width``, ``height`` (pixels) and ``dpi``.
        compressed_pages: Pages to write; each provides ``bg`` and ``fg``
            layers.
        output_file: Path passed to ``Pdf.save``.
    """
    # Page dimensions in PostScript points (72 points per inch)
    page_size = (
        input_pages.width / input_pages.dpi * 72,
        input_pages.height / input_pages.dpi * 72,
    )
    width_pt, height_pt = page_size

    document = Pdf.new()

    for compressed_page in compressed_pages:
        pdf_page = document.add_blank_page(page_size=page_size)

        # Gather the draw instructions, background before foreground
        draw_instructions = []
        for is_foreground in (False, True):
            pdf_write_layer(
                input_pages=input_pages,
                pdf=document,
                page=pdf_page,
                layer=compressed_page.fg if is_foreground else compressed_page.bg,
                is_foreground=is_foreground,
                content_instructions=draw_instructions,
            )

        # Wrap the draws in q ... Q, with a cm matrix that scales by the
        # page size in points
        content_stream = unparse_content_stream([
            ContentStreamInstruction([], Operator('q')),
            ContentStreamInstruction([width_pt, 0, 0, height_pt, 0, 0], Operator('cm')),
            *draw_instructions,
            ContentStreamInstruction([], Operator('Q')),
        ])
        pdf_page.Contents.write(content_stream)

    # Write the finished document to disk
    document.save(output_file)
def pdf_write_layer(
    input_pages: InputPages,
    pdf: Pdf,
    page: Page,
    layer: CompressedLayer,
    is_foreground: bool,
    content_instructions,
) -> None:
    """Write one compressed layer to the page as an image.

    The concrete layer type selects the image data and the image
    dictionary entries (colour space, filter, bit depth and, where
    needed, mask and decode parameters). The actual write is delegated
    to ``pdf_write_image``.

    Args:
        input_pages: Supplies the pixel dimensions of the image.
        pdf: Document that will own the image stream.
        page: Page that receives the image resource.
        layer: Compressed layer to write.
        is_foreground: Only consulted for ``PNGLayer``: selects the
            1-bit masked form (True) or the 8-bit RGB form (False).
        content_instructions: List that the draw instruction is
            appended to.

    Raises:
        NotImplementedError: If ``layer`` is not one of the supported
            layer types.
    """
    # Pick the raw stream data and the image dictionary entries
    if isinstance(layer, JBIG2Layer):
        image_data = layer.data
        image_attrs = {
            'ColorSpace': Name.DeviceGray,
            'Filter': Name.JBIG2Decode,
            'BitsPerComponent': 1,
            'Mask': [1, 1],  # Layer mask
        }
    elif isinstance(layer, JP2Layer):
        image_data = layer.data
        image_attrs = {
            'ColorSpace': Name.DeviceRGB,
            'Filter': Name.JPXDecode,
            'BitsPerComponent': 8,
        }
    elif isinstance(layer, JPEGLayer):
        image_data = layer.data
        image_attrs = {
            'ColorSpace': Name.DeviceRGB,
            'Filter': Name.DCTDecode,
            'BitsPerComponent': 8,
        }
    elif isinstance(layer, PNGLayer):
        image_data = layer.get_flate_data()
        if is_foreground:
            # See PDF 1.7 section 7.4.4.3
            # See also the implementation in img2pdf
            image_attrs = {
                'ColorSpace': Name.DeviceGray,
                'Filter': Name.FlateDecode,
                'BitsPerComponent': 1,
                'Mask': [1, 1],  # Layer mask
                'DecodeParms': Dictionary(
                    # PNG prediction (on encoding, PNG optimum) - this is
                    # the only allowed value in a PNG file
                    Predictor=15,
                    BitsPerComponent=1,  # Default is 8 so must set this here
                    Columns=input_pages.width,
                ),
            }
        else:
            image_attrs = {
                'ColorSpace': Name.DeviceRGB,
                'Filter': Name.FlateDecode,
                'BitsPerComponent': 8,
                'DecodeParms': Dictionary(
                    Predictor=15,
                    Colors=3,  # Default is 1 so must set this here
                    Columns=input_pages.width,
                ),
            }
    else:
        raise NotImplementedError()

    # Single write path shared by all layer types
    pdf_write_image(
        input_pages=input_pages,
        pdf=pdf,
        page=page,
        value=image_data,
        content_instructions=content_instructions,
        **image_attrs
    )
def pdf_write_image(
    input_pages: InputPages,
    pdf: Pdf,
    page: Page,
    value: bytes,
    content_instructions,
    **kwargs
) -> None:
    """Add an Image XObject to the page and queue its draw instruction.

    Args:
        input_pages: Supplies the image ``Width`` and ``Height`` in pixels.
        pdf: Document that owns the new stream.
        page: Page whose ``/XObject`` resources receive the image.
        value: Encoded image data stored as the stream contents.
        content_instructions: List that the ``Do`` instruction is
            appended to (mutated in place).
        **kwargs: Extra image dictionary entries passed through to
            ``Stream`` (e.g. ``ColorSpace``, ``Filter``).
    """
    # Register the image stream as an XObject resource of the page
    resource_name = page.add_resource(
        Stream(
            pdf,
            value,
            Type=Name.XObject,
            Subtype=Name.Image,
            Width=input_pages.width,
            Height=input_pages.height,
            **kwargs
        ),
        '/XObject'
    )

    # Queue the instruction that paints the image
    draw_image = ContentStreamInstruction([resource_name], Operator('Do'))
    content_instructions.append(draw_image)