Convert python-docx to pdf and generate consecutive page numbers (no page numbers on the cover)

Brief description of content: Preset docx file template, automatically generate data, optionally merge multiple different templates, distinguish cover pages, add page numbers, and finally generate pdf

To keep the templates easy to modify and extend, the docx files are generated by rendering with the docxtpl package: each preset key in the template, written between double curly brackets, is replaced with its value. (There are many online tutorials, and it is easy to learn.)

Basic function implementation:

  1. Render a docx file list based on the template file
  2. docx merge
  3. docx to pdf
class PDFBuilder:
    """Helpers that render a docx template, merge docx files and convert docx to pdf.

    The third-party packages are imported inside each method, so defining the
    class does not require them to be importable.
    """

    @staticmethod
    def create_docx_replace_data(model_path: str, replace_dict: dict, output_path: str) -> None:
        """ Replace the template data and create a new docx document

        :model_path: template file path
        :replace_dict: Replacement parameter dictionary passed to the template's render call
        :output_path: generated file path (docx)
        """
        from docxtpl import DocxTemplate
        #--------------------------------
        result_doc = DocxTemplate(model_path)  # Template object
        result_doc.render(replace_dict)  # Replace data
        result_doc.save(output_path)

    @staticmethod
    def merge_docx(merged_path_list: list, output_path: str) -> None:
        """ Merge multiple docx documents in list order

        The first file is the main document; every following file is appended
        to it after a page break.

        :merged_path_list: List of file paths to be merged (docx), at least one entry
        :output_path: generated file path (docx)
        :raises IndexError: if merged_path_list is empty
        """
        if not merged_path_list:
            # Same exception type the unchecked ``merged_path_list[0]`` lookup
            # used to raise, but with a message that names the actual problem.
            raise IndexError("merged_path_list must contain at least one docx path")
        from docx import Document
        from docxcompose.composer import Composer
        #--------------------------------
        # Take the first file in the list as the main document
        merged_doc = Document(merged_path_list[0])
        # Merge processing
        remaining_paths = merged_path_list[1:]
        if remaining_paths:
            # One Composer is enough for all appends to the same main document
            composer = Composer(merged_doc)
            for each_docx_path in remaining_paths:
                merged_doc.add_page_break()  # Start each appended document on a new page
                composer.append(Document(each_docx_path))  # Merge
        merged_doc.save(output_path)

    @staticmethod
    def docx_to_pdf(docx_path: str, pdf_path: str) -> None:
        """ Convert docx file to pdf

        :docx_path: docx file path
        :pdf_path: Generate pdf file path
        """
        from docx2pdf import convert
        #--------------------------------
        convert(docx_path, pdf_path)

Generating continuous page numbers while leaving the cover unnumbered is relatively difficult. A docx file is essentially a zip-compressed set of XML files, similar to HTML: every page we see is the result of Word or WPS paginating the content when rendering it, and the file itself contains no pre-divided pages.

A relatively simple approach is to first convert the docx into a PDF file (a PDF is comparable to a vector image: its format and style are completely fixed), then, based on the now-known page count of that PDF, generate a second PDF with the same number of pages that contains only page numbers, and finally merge the two (similar to overlaying layers in Photoshop).

There are other ideas. One is to read the docx page settings through code (including but not limited to page width and height, page margins, etc.) and then compute the content length to work out where the pages break; this would be very painful to implement, and I have not studied it. Another is to preset automatic page numbers in the docx template in Word; this is not suitable when the cover must be optional, and a custom docx cover is not easy to handle manually…

Another approach is to add section breaks when merging the docx files, so that the unnumbered cover and the numbered body text sit in different sections and the page numbers are generated automatically. However, this method is less general and relatively hard to understand, so it is placed in the code at the end.

Method of overlapping and merging layers to append page numbers (based on the old version of PyPDF2):

 @staticmethod
    def add_page_numbers(update_pdf_path, start_page_num=0) -> None:
        """ Append page numbers to a pdf file, overwriting the file in place

        Pages whose index is below start_page_num (for example a cover) are
        kept in the output but receive no page number.

        :update_pdf_path: pdf file path; the file is read and then overwritten
        :start_page_num: Which PDF page index does page number '1' start from (default starts from 0 as the first page)
        """
        import io
        from PyPDF2 import PdfFileWriter, PdfFileReader
        from reportlab.pdfgen import canvas
        #--------------------------------
        pdf_reader = PdfFileReader(update_pdf_path)
        pdf_writer = PdfFileWriter()

        # Get the total number of pages of the PDF to be modified
        total_pages = len(pdf_reader.pages)

        for page_num in range(total_pages):
            page = pdf_reader.pages[page_num]
            page_number = page_num + 1 - start_page_num
            if page_number >= 1:
                # Build a one-page PDF holding only the page number, in memory,
                # so no temporary file is left behind or shared between calls.
                stamp_buffer = io.BytesIO()
                c = canvas.Canvas(stamp_buffer)  # canvas object, used to add page numbers
                c.setFont("Helvetica", 10)  # Customize font and font size
                # NOTE(review): x=300, y=20 are fixed coordinates; presumably chosen for
                # the bottom centre of an A4-sized page — confirm for other page sizes.
                c.drawCentredString(300, 20, str(page_number))
                c.save()
                stamp_buffer.seek(0)

                # Merge the original page with the page-number overlay
                watermark = PdfFileReader(stamp_buffer)
                page.mergePage(watermark.pages[0])
            # Pages before start_page_num are still written, just without a number
            pdf_writer.addPage(page)

        # Save the merged PDF file and overwrite the original file
        with open(update_pdf_path, "wb") as new_file:
            pdf_writer.write(new_file)

Page number appending (based on new version PyPDF2):

 @staticmethod
    def add_page_numbers(update_pdf_path, start_page_num=0) -> None:
        """ Append page numbers to a pdf file, overwriting the file in place

        Pages whose index is below start_page_num (for example a cover) are
        kept in the output but receive no page number.

        :update_pdf_path: pdf file path; the file is read and then overwritten
        :start_page_num: Which PDF page index does page number '1' start from (default starts from 0 as the first page)
        """
        import io
        from PyPDF2 import PdfWriter, PdfReader
        from reportlab.pdfgen import canvas
        #--------------------------------
        pdf_reader = PdfReader(update_pdf_path)
        pdf_writer = PdfWriter()

        # Get the total number of pages of the PDF to be modified
        total_pages = len(pdf_reader.pages)

        for page_num in range(total_pages):
            page = pdf_reader.pages[page_num]
            page_number = page_num + 1 - start_page_num
            if page_number >= 1:
                # Build a one-page PDF holding only the page number, in memory,
                # so no temporary file is left behind or shared between calls.
                stamp_buffer = io.BytesIO()
                c = canvas.Canvas(stamp_buffer)  # canvas object, used to add page numbers
                c.setFont("Helvetica", 10)  # Customize font and font size
                # NOTE(review): x=300, y=20 are fixed coordinates; presumably chosen for
                # the bottom centre of an A4-sized page — confirm for other page sizes.
                c.drawCentredString(300, 20, str(page_number))
                c.save()
                stamp_buffer.seek(0)

                # Merge the original page with the page-number overlay
                watermark = PdfReader(stamp_buffer)
                page.merge_page(watermark.pages[0])
            # Pages before start_page_num are still written, just without a number
            pdf_writer.add_page(page)

        # Save the merged PDF file and overwrite the original file
        with open(update_pdf_path, "wb") as new_file:
            pdf_writer.write(new_file)

Continue to optimize so that files are always operated in memory;

To fit the project structure, the version below uses section breaks to add page numbers. (I personally think this does not make much sense, and the comments on docx section breaks, page breaks and so on are not written in detail, because merging layers is the better approach.)

Complete call and test:

# -*- coding: utf-8 -*-
# Author: otto
# Description: Simulate the docx template processing and then convert to pdf process

# Standard library Required modules: os, io, shutil, tempfile
import os
import io
from io import BytesIO
import shutil # Advanced file operations
import tempfile #Create temporary files and directories

# pip library Required modules: docx, docxtpl, docxcompose, docx2pdf
from docx import Document
from docx.oxml.ns import nsdecls
from docx.oxml import parse_xml
from docx.enum.text import WD_PARAGRAPH_ALIGNMENT
from docx.enum.section import WD_SECTION_START
from docxtpl import DocxTemplate
from docxcompose.composer import Composer
from docx2pdf import convert


def action_pdf(file_name_list) -> None: # TODO parameter
    """ Test function main process: render each template, merge the results, convert to pdf

    :file_name_list: The name of the docx file that needs to be merged, which will be changed to the identifier later.
    :return: None
    """
    template_paths = get_model_path_dict() # TODO file identification/path acquisition

    # Render every requested template, in order, into an in-memory docx
    rendered_docs = [
        PDFBuilder.create_docx_replace_data(
            model_path=template_paths[name], # Get template
            render_dict=get_render_dict(), # TODO Get rendering data
        )
        for name in file_name_list
    ]

    # Cover judgment
    has_cover = True # TODO

    # Merge docx in list order to generate page numbers
    combined_docx = PDFBuilder.merge_docx(
        merged_bytes_list=rendered_docs,
        cover_flg=has_cover,
    )

    # docx to pdf
    PDFBuilder.docx_to_pdf(docx_bytes=combined_docx)


def get_model_path_dict() -> dict: # TODO file identification/path acquisition
    """ Map each known template file name to its path under the module-level own_path

    :return: dictionary of file name -> own_path + file name
    """
    template_names = ('Cover.docx', 'Empty text template.docx', '123.docx')
    return {name: own_path + name for name in template_names}


def get_render_dict() -> dict: # TODO Get rendering data
    """ Return the placeholder rendering data used by the test run

    :return: a new dictionary on every call
    """
    return {'empty': '', 'code': 'abc'}


class PDFBuilder:
    """In-memory helpers: render a docx template, merge docx files, convert docx to pdf."""

    @staticmethod
    def create_docx_replace_data(model_path: str, render_dict: dict) -> BytesIO:
        """ Render template data and create a new docx document

        :model_path: template file path
        :render_dict: Replacement parameter dictionary
        :return: memory file (docx), positioned at the start
        """
        docx_file = io.BytesIO()
        doc_template = DocxTemplate(model_path)
        doc_template.render(render_dict)  # Render template, replace data
        doc_template.save(docx_file)
        docx_file.seek(0)  # Hand back a stream that can be read from the beginning
        return docx_file

    @staticmethod
    def merge_docx(merged_bytes_list: list, cover_flg: bool) -> BytesIO:
        """ Merge multiple docx documents in list order and add a centred PAGE field to the footer

        :merged_bytes_list: List of in-memory docx files to be merged; the first one is the main document
        :cover_flg: Whether there is a cover (footer processing)
        :return: memory file (docx), positioned at the start
        :raises IndexError: if merged_bytes_list is empty
        """
        if not merged_bytes_list:
            # Same exception type the unchecked ``merged_bytes_list[0]`` lookup
            # used to raise, but with a message that names the actual problem.
            raise IndexError("merged_bytes_list must contain at least one docx file")
        merged_doc = Document(merged_bytes_list[0])  # Take the first file in the list as the main document
        # Cover processing
        if cover_flg:
            new_section = merged_doc.add_section(WD_SECTION_START.CONTINUOUS)  # Add section breaks (continuous)
            # NOTE(review): python-docx documents this setting as
            # `different_first_page_header_footer`, and no `starting_number`
            # property is known to me — verify these two assignments take effect.
            new_section.different_first_page = True  # The first page is different
            new_section.starting_number = 1  # Starting number of page number
            footer = new_section.footer
            footer.is_linked_to_previous = False  # Do not associate the footer with the footer of the previous section
        # Merge processing
        remaining_docs = merged_bytes_list[1:]
        if remaining_docs:
            # One Composer is enough for all appends to the same main document
            composer = Composer(merged_doc)
            for each_docx_bytes in remaining_docs:
                merged_doc.add_page_break()  # Start each appended document on a new page
                composer.append(Document(each_docx_bytes))  # Merge
        # Footer processing
        new_section = merged_doc.add_section(WD_SECTION_START.CONTINUOUS)  # Add section breaks (continuous)
        new_section.different_first_page = True  # NOTE(review): see the note on this attribute above
        footer = new_section.footer
        # NOTE(review): the paragraph is fetched BEFORE the footer is unlinked.
        # While still linked, it presumably belongs to the preceding section's
        # footer, so the order of the next two statements may be what puts the
        # page number on the body pages — do not reorder without testing.
        footer_paragraph = footer.paragraphs[0]
        footer.is_linked_to_previous = False  # Do not associate the footer with the footer of the previous section
        # Continuous page numbers
        field_code = 'PAGE'
        field = parse_xml(f'<w:fldSimple {nsdecls("w")} w:instr="{field_code}"/>')
        run = footer_paragraph.add_run()
        run._r.append(field)
        footer_paragraph.alignment = WD_PARAGRAPH_ALIGNMENT.CENTER  # Center alignment of footer
        # Save data
        docx_file = io.BytesIO()
        merged_doc.save(docx_file)
        with open(merged_docx_path, 'wb') as file:  # TODO for testing
            file.write(docx_file.getvalue())
        docx_file.seek(0)  # Hand back a stream that can be read from the beginning
        return docx_file

    @staticmethod
    def docx_to_pdf(docx_bytes) -> BytesIO:
        """ Convert an in-memory docx file to pdf

        The conversion goes through two temporary files on disk; both are
        removed afterwards, even when the conversion fails.

        :docx_bytes: memory file (docx); its full contents are used regardless of the stream position
        :return: memory file (pdf), positioned at the start
        """
        temp_paths = []
        try:
            with tempfile.NamedTemporaryFile(suffix='.docx', delete=False) as temp_docx:
                temp_paths.append(temp_docx.name)
                temp_docx.write(docx_bytes.getvalue())
            # Created and closed immediately: only the reserved file name is needed
            with tempfile.NamedTemporaryFile(suffix='.pdf', delete=False) as temp_pdf:
                temp_paths.append(temp_pdf.name)
            convert(temp_docx.name, temp_pdf.name)
            with open(temp_pdf.name, 'rb') as file:
                pdf_file = io.BytesIO(file.read())
        finally:
            # Clean up whatever was created, whether or not convert() succeeded
            for temp_path in temp_paths:
                if os.path.exists(temp_path):
                    os.remove(temp_path)
        with open(output_pdf_path, 'wb') as file:  # TODO for testing
            file.write(pdf_file.getvalue())
        return pdf_file


if __name__ == '__main__':
    # Test parameters
    # Backslashes are doubled on purpose: in a plain string literal `\U` starts
    # a unicode escape and a trailing `\'` escapes the closing quote, so the
    # single-backslash form of this Windows path does not parse.
    own_path = 'C:\\Users\\11379\\Desktop\\'
    output_pdf_path = own_path + 'test.pdf'  # Read by PDFBuilder.docx_to_pdf (test output)
    merged_docx_path = own_path + 'test.docx'  # Read by PDFBuilder.merge_docx (test output)

    # Templates to render and merge, in output order
    file_name_list = [
        'Cover.docx',
        'Empty text template.docx',
        '123.docx',
    ]
    action_pdf(file_name_list)

The knowledge points of the article match the official knowledge files, and you can further learn relevant knowledge. Python entry skill treeHomepageOverview 345,295 people are learning the system