pdf-segmented/pdf_segmented/output/pdf.py

#   pdf-segmented: Generate PDFs using separate compression for foreground and background
#   Copyright (C) 2025  Lee Yingtong Li
#
#   This program is free software: you can redistribute it and/or modify
#   it under the terms of the GNU Affero General Public License as published by
#   the Free Software Foundation, either version 3 of the License, or
#   (at your option) any later version.
#
#   This program is distributed in the hope that it will be useful,
#   but WITHOUT ANY WARRANTY; without even the implied warranty of
#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#   GNU Affero General Public License for more details.
#
#   You should have received a copy of the GNU Affero General Public License
#   along with this program.  If not, see <https://www.gnu.org/licenses/>.

from ..compression import CompressedLayer, CompressedPage
from ..compression.jbig2 import JBIG2Layer
from ..compression.jp2 import JP2Layer
from ..compression.jpeg import JPEGLayer
from ..compression.png import PNGLayer
from ..input import InputPages

from pikepdf import ContentStreamInstruction, Dictionary, Name, Operator, Page, Pdf, Stream, unparse_content_stream

from typing import Generator

def pdf_write_pages(
	input_pages: InputPages,
	# All three Generator parameters are spelled out: the one-parameter form is only accepted from Python 3.13
	compressed_pages: Generator[CompressedPage, None, None],
	output_file: str
) -> None:
	"""
	Write every compressed page to a new PDF and save it to output_file.

	All pages share one size, computed from input_pages.width, input_pages.height and input_pages.dpi.
	For each page the background layer (compressed_page.bg) is drawn first and the foreground layer
	(compressed_page.fg) is drawn on top of it.

	Parameters:
		input_pages: Source of the pixel dimensions and resolution of the pages
		compressed_pages: Iterable of pages to write, each exposing .bg and .fg layers
		output_file: Path passed to Pdf.save

	Raises:
		NotImplementedError: Propagated from pdf_write_layer for an unsupported layer type
	"""

	# Get size of image in PostScript points (72 points per inch)
	width_pt = input_pages.width / input_pages.dpi * 72
	height_pt = input_pages.height / input_pages.dpi * 72

	# Build PDF - the context manager closes the Pdf even if writing a page fails
	with Pdf.new() as pdf:
		# Write each page
		for compressed_page in compressed_pages:
			page = pdf.add_blank_page(page_size=(width_pt, height_pt))

			# Write each layer to the page - background first so the foreground is painted over it
			content_instructions = []
			pdf_write_layer(input_pages=input_pages, pdf=pdf, page=page, layer=compressed_page.bg, is_foreground=False, content_instructions=content_instructions)
			pdf_write_layer(input_pages=input_pages, pdf=pdf, page=page, layer=compressed_page.fg, is_foreground=True, content_instructions=content_instructions)

			# Generate content stream
			# Images are painted into the 1x1 unit square, so scale that square up to the page size
			# The q/Q pair saves and restores the graphics state around the transformation
			wrapped_instructions = [
				ContentStreamInstruction([], Operator('q')),
				ContentStreamInstruction([width_pt, 0, 0, height_pt, 0, 0], Operator('cm'))
			] + content_instructions + [
				ContentStreamInstruction([], Operator('Q')),
			]
			content_stream = unparse_content_stream(wrapped_instructions)
			page.Contents.write(content_stream)

		# Save PDF
		pdf.save(output_file)

def pdf_write_layer(
	input_pages: InputPages,
	pdf: Pdf,
	page: Page,
	layer: CompressedLayer,
	is_foreground: bool,
	content_instructions,
) -> None:
	"""
	Embed one compressed layer into the page as an Image XObject.

	The concrete layer class selects the PDF stream filter and image attributes; the actual embedding
	is delegated to pdf_write_image, which also appends the drawing instruction to content_instructions.
	is_foreground only affects PNG layers.

	Raises:
		NotImplementedError: If layer is not a JBIG2Layer, JP2Layer, JPEGLayer or PNGLayer
	"""

	# Work out the raw stream bytes and the image dictionary entries for this kind of layer
	if isinstance(layer, JBIG2Layer):
		stream_data = layer.data
		image_attrs = {
			'ColorSpace': Name.DeviceGray,
			'Filter': Name.JBIG2Decode,
			'BitsPerComponent': 1,
			'Mask': [1, 1],  # Layer mask (colour key mask - samples with value 1 are not painted)
		}
	elif isinstance(layer, JP2Layer):
		stream_data = layer.data
		image_attrs = {
			'ColorSpace': Name.DeviceRGB,
			'Filter': Name.JPXDecode,
			'BitsPerComponent': 8,
		}
	elif isinstance(layer, JPEGLayer):
		stream_data = layer.data
		image_attrs = {
			'ColorSpace': Name.DeviceRGB,
			'Filter': Name.DCTDecode,
			'BitsPerComponent': 8,
		}
	elif isinstance(layer, PNGLayer):
		# See PDF 1.7 section 7.4.4.3
		# See also the implementation in img2pdf
		stream_data = layer.get_flate_data()

		if is_foreground:
			# Foreground PNG is declared as a 1-bit greyscale image with a layer mask
			image_attrs = {
				'ColorSpace': Name.DeviceGray,
				'Filter': Name.FlateDecode,
				'BitsPerComponent': 1,
				'Mask': [1, 1],  # Layer mask
				'DecodeParms': Dictionary(
					Predictor=15,  # PNG prediction (on encoding, PNG optimum) - this is the only allowed value in a PNG file
					BitsPerComponent=1,  # Default is 8 so must set this here
					Columns=input_pages.width
				),
			}
		else:
			# Background PNG is declared as an 8-bit RGB image without a mask
			image_attrs = {
				'ColorSpace': Name.DeviceRGB,
				'Filter': Name.FlateDecode,
				'BitsPerComponent': 8,
				'DecodeParms': Dictionary(
					Predictor=15,
					Colors=3,  # Default is 1 so must set this here
					Columns=input_pages.width
				),
			}
	else:
		raise NotImplementedError()

	# Write the layer to PDF
	pdf_write_image(
		input_pages=input_pages,
		pdf=pdf,
		page=page,
		value=stream_data,
		content_instructions=content_instructions,
		**image_attrs
	)

def pdf_write_image(
	input_pages: InputPages,
	pdf: Pdf,
	page: Page,
	value: bytes,
	content_instructions,
	**kwargs
) -> None:
	"""
	Add value to the page as an Image XObject and queue an instruction to draw it.

	The image dictionary always carries Type, Subtype, Width and Height (the latter two taken from
	input_pages); any further entries such as ColorSpace or Filter are supplied through kwargs.
	A 'Do' instruction referencing the new resource is appended to content_instructions in place.
	"""

	# Build the Image stream object owned by this PDF
	image_stream = Stream(
		pdf,
		value,
		Type=Name.XObject,
		Subtype=Name.Image,
		Width=input_pages.width,
		Height=input_pages.height,
		**kwargs
	)

	# Register the stream in the page's XObject resources - the returned name is how the content stream refers to it
	resource_name = page.add_resource(image_stream, '/XObject')

	# Queue the render instruction for the content stream
	draw_instruction = ContentStreamInstruction([resource_name], Operator('Do'))
	content_instructions.append(draw_instruction)