我学会了如何使用 PyPDF2 和 PyMuPDF (fitz) 拆分 PDF 页面。但拆分时经常遇到这样的情况:页面上只有四分之一的区域有文本,而程序把四个四分之一都写进了新文件——其中一个有文本,其余都是空白。我需要让空白的部分不被保存。我想用某种方式进行检查,但由于知识不足没有成功。我还尝试读取新生成的文件并删除空白页,但提取结果显示每一页都有文字,甚至空白页也是如此;而我在 Acrobat Reader 中打开该文件时,这些页面却是空白的,我不明白这是为什么。
这是我的代码,供参考我目前的做法:https://paste.aiogram.dev/opiquhehus.py
这是我第一次在这里发帖,我不知道如何附加文件。示例 PDF 文件放在这个 Telegram 频道里:https://t.me/+Tq7WpP1ImcjQXSZF。
import copy
import logging
import random
from pathlib import Path
import PyPDF2
import fitz
from PyPDF2.filters import decodeStreamData, ASCII85Decode
from PyPDF2.generic import EncodedStreamObject, DecodedStreamObject
def from_a4_to_a6_not_sync(input_file, output_file):
    """Write the two upper quarters of a PDF's first page to a new PDF (PyPDF2).

    The first page of ``input_file`` is deep-copied once per quarter and each
    copy's media box is shrunk so that only that quarter stays visible. The
    page content itself is not modified; only the visible area changes.

    NOTE(review): only the upper-left and upper-right quarters are emitted,
    and only page 0 is processed. The lower quarters would be obtained the
    same way by setting ``mediaBox.upperRight`` / ``mediaBox.upperLeft`` to
    the page centre. Confirm whether emitting just two quarters is intended.

    Args:
        input_file: ``pathlib.Path`` of the source PDF (``.absolute()`` is
            called on it).
        output_file: Path of the PDF to create; opened with ``open(..., 'wb')``.
    """
    pdf_reader = PyPDF2.PdfFileReader(str(input_file.absolute()))
    first_page = pdf_reader.getPage(0)

    # Halving the upper-right corner yields the page centre; this assumes the
    # media box's lower-left corner is at (0, 0).
    page_width = first_page.mediaBox.upperRight[0]
    page_height = first_page.mediaBox.upperRight[1]
    centre = (page_width / 2, page_height / 2)

    # Upper-left quarter: pull the lower-right corner up to the centre.
    left_up_side = copy.deepcopy(first_page)
    left_up_side.mediaBox.lowerRight = centre

    # Upper-right quarter: pull the lower-left corner up to the centre.
    right_up_side = copy.deepcopy(first_page)
    right_up_side.mediaBox.lowerLeft = centre

    pdf_writer = PyPDF2.PdfFileWriter()
    pdf_writer.addPage(left_up_side)
    pdf_writer.addPage(right_up_side)

    # The ``with`` statement closes the file; no explicit close() is needed.
    with open(output_file, 'wb') as file:
        pdf_writer.write(file)
def fitz_four_piaces(input_file, output_file, *, skip_empty=False):
    """Cut every page of a PDF into a 2 x 2 grid of quarter pages (PyMuPDF).

    For each source page, four rectangles (top-left, top-right, bottom-left,
    bottom-right) are computed and each one is rendered onto its own new page
    of the output document via ``show_pdf_page`` with a ``clip``.

    Args:
        input_file: ``pathlib.Path`` of the source PDF (``.absolute()`` is
            called on it).
        output_file: Destination passed unchanged to ``Document.save``.
        skip_empty: When true, a quarter is written only if the matching
            region of the *source* page contains non-whitespace text. The
            check is done on the source page because text extracted from the
            generated pages is not a reliable indicator (presumably the
            clipped-away content is still embedded; see the PyMuPDF
            ``show_pdf_page`` documentation). Quarters holding only images or
            drawings count as empty. Defaults to ``False``, which keeps all
            four quarters of every page.

    NOTE(review): if no page is produced (for example every quarter was
    skipped), ``save`` is still called and PyMuPDF is expected to reject the
    empty document. Confirm the exact exception before relying on it.
    """
    input_file = str(input_file.absolute())

    # Context managers make sure both documents are closed even when
    # rendering or saving raises.
    with fitz.open(input_file) as src, fitz.open() as doc:
        for spage in src:
            r = spage.rect  # input page rectangle
            # CropBox displacement, relevant when the crop box does not
            # start at (0, 0).
            d = fitz.Rect(spage.cropbox_position, spage.cropbox_position)

            r1 = r / 2                                # top-left quarter
            r2 = r1 + (r1.width, 0, r1.width, 0)      # top-right quarter
            r3 = r1 + (0, r1.height, 0, r1.height)    # bottom-left quarter
            r4 = fitz.Rect(r1.br, r.br)               # bottom-right quarter

            for quarter in (r1, r2, r3, r4):
                # Text lookup uses page coordinates, i.e. the rectangle
                # *before* the CropBox displacement is applied.
                if skip_empty and not spage.get_text("text", clip=quarter).strip():
                    continue

                clip = quarter + d  # add the CropBox displacement
                page = doc.new_page(-1, width=clip.width, height=clip.height)
                page.show_pdf_page(
                    page.rect,     # fill the whole new page
                    src,           # input document
                    spage.number,  # input page number
                    clip=clip,     # part of the input page to show
                )

        doc.save(
            output_file,
            garbage=3,     # eliminate duplicate objects
            deflate=True,  # compress streams where possible
        )
def fitz_four_piaces_read(input_file):
    """Print the page count and the extracted plain text of each page of a PDF.

    Debug helper: output goes to stdout in ``expr=value`` form.

    Args:
        input_file: ``pathlib.Path`` of the PDF to inspect (``.absolute()`` is
            called on it).
    """
    input_file = str(input_file.absolute())
    # The context manager closes the document once all pages are printed.
    with fitz.open(input_file) as src:
        print(f'{src.page_count=}')
        for page in src:
            print(f'{page.get_text("text")=}')
# Folder (relative to the current working directory) holding the sample PDFs;
# it is created if it does not exist yet.
destination = Path() / "MAKETS"
destination.mkdir(parents=True, exist_ok=True)

# Source PDF to split. "up_lef_up_rig_low_lef_low_rig" is presumably the name
# of another sample file -- verify against the sample set.
destination_input = destination / 'up_lef.pdf'

# Output name carries two random numbers so repeated runs rarely collide.
destination_output = destination / (
    f'output_a6_{random.randint(1, 100)}_{random.randint(1, 200)}.pdf'
)

# Split with PyMuPDF, then dump the text of the result for inspection.
# The PyPDF2 variant, from_a4_to_a6_not_sync, takes the same two arguments
# and is not run here.
fitz_four_piaces(destination_input, destination_output)
fitz_four_piaces_read(destination_output)