Use iTextSharp to process PDFs

Create PDF files

using System;
using System.Collections.Generic;
using System.Text;
using iTextSharp.text;
using iTextSharp.text.pdf;
using System.IO;
using System.Windows.Forms;
namespace ConsoleApplication4
{
/// <summary>
/// Minimal iTextSharp sample: creates a PDF file containing a single paragraph.
/// </summary>
class Program
{
    /// <summary>
    /// Writes the sample document to <c>I:\chap1.pdf</c>, shows a confirmation
    /// message box, and then waits for console input before exiting.
    /// </summary>
    /// <param name="args">Command-line arguments (not used).</param>
    static void Main(string[] args)
    {
        // The using block releases the file handle even if document creation throws.
        using (FileStream output = new FileStream(@"I:\chap1.pdf", FileMode.Create))
        {
            iTextSharp.text.Document pdf_doc = new Document();
            // The writer must be attached to the document before it is opened;
            // the returned instance itself is not needed afterwards.
            PdfWriter.GetInstance(pdf_doc, output);
            pdf_doc.Open();
            pdf_doc.Add(new Paragraph("new pdf!"));
            pdf_doc.Close();
        }
        MessageBox.Show("OK!", Environment.UserName);
        Console.Read();
    }
}
}

Use the iTextSharp library to get the number of pages in a PDF file

using iTextSharp.text.pdf;
// Count the pages of a PDF document.
// filename: path of the PDF file to inspect.
// Returns the value of PdfReader.NumberOfPages for that file.
private int pdf_pages(string filename)
{
    PdfReader pdf = new PdfReader(filename);
    try
    {
        return pdf.NumberOfPages;
    }
    finally
    {
        // Release the reader's underlying file access once the count is read.
        pdf.Close();
    }
}

Use iTextSharp library to obtain PDF file information

using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.Text;
using System.Windows.Forms;
using System.IO;
using System.Text.RegularExpressions;
using iTextSharp.text.pdf;
namespace WindowsApplication2
{
    /// <summary>
    /// Form that scans a chosen folder for PDF files and writes a summary
    /// (drawing name, drawing type, page count) to <c>pdf_pages.txt</c> in that folder.
    /// </summary>
    public partial class Form1 : Form
    {
        public Form1()
        {
            InitializeComponent();
        }

        /// <summary>Lets the user pick the folder and copies the selection into the path text box.</summary>
        private void btn_browse_Click(object sender, EventArgs e)
        {
            DialogResult dia_dir_res = folderBrowserDialog_dir.ShowDialog();
            if (dia_dir_res == DialogResult.OK)
            {
                tbx_dir.Text = folderBrowserDialog_dir.SelectedPath;
            }
        }

        /// <summary>Closes the application.</summary>
        private void btn_cancel_Click(object sender, EventArgs e)
        {
            Application.Exit();
        }

        /// <summary>
        /// Renames PDF files whose names contain spaces (spaces become '-'),
        /// then writes one line per PDF file to <c>pdf_pages.txt</c>:
        /// drawing name, drawing type, page count and "None".
        /// </summary>
        private void btn_ok_Click(object sender, EventArgs e)
        {
            string str_root = tbx_dir.Text;

            //Determine whether the folder exists
            if (!Directory.Exists(str_root))
            {
                MessageBox.Show("ERR: NO THIS DIR!");
                return;
            }

            //Extract PDF files in the folder
            string[] str_dirs = Directory.GetFiles(str_root, "*.pdf");
            if (str_dirs.Length == 0)
            {
                MessageBox.Show("ERR: NO PDF FILE!");
                return;
            }

            //Change the spaces in the file name to -
            //Only the file name is rewritten: replacing spaces in the whole path
            //would also alter the folder names and point at a folder that does not exist.
            foreach (string str_dir in str_dirs)
            {
                string str_file_name = Path.GetFileName(str_dir);
                if (str_file_name.Contains(" "))
                {
                    string str_new_dir = Path.Combine(str_root, str_file_name.Replace(" ", "-"));
                    //Like the copy it replaces, this throws if the target name already exists.
                    File.Move(str_dir, str_new_dir);
                }
            }

            //Set drawing type
            string str_drawing_type;
            if (rbtn_final.Checked)
            {
                str_drawing_type = "Complete drawing";
            }
            else
            {
                str_drawing_type = "Approval drawing";
            }

            //Re-extract PDF files in the folder (names may have changed above)
            str_dirs = Directory.GetFiles(str_root, "*.pdf");
            if (str_dirs.Length == 0)
            {
                MessageBox.Show("ERR: NO PDF FILE!");
                return;
            }

            //String written to the file
            StringBuilder str_flush = new StringBuilder();
            foreach (string str_dir in str_dirs)
            {
                //Drawing name = file name without folder and without extension
                string str_drawing_name = Path.GetFileNameWithoutExtension(str_dir);
                str_flush.Append(str_drawing_name).Append(" ");
                str_flush.Append(str_drawing_type).Append(" ");
                str_flush.Append(pdf_pages(str_dir).ToString()).Append(" ");
                str_flush.Append("None").Append(System.Environment.NewLine);
            }

            //Write to file; the using block closes the writer even if writing fails
            using (StreamWriter sw = new StreamWriter(Path.Combine(str_root, "pdf_pages.txt"), false, System.Text.Encoding.Default))
            {
                sw.Write(str_flush.ToString());
            }
            MessageBox.Show("OK!");
        }

        /// <summary>Counts the pages of a PDF document.</summary>
        /// <param name="filename">Path of the PDF file to inspect.</param>
        /// <returns>The value of <c>PdfReader.NumberOfPages</c> for the file.</returns>
        private int pdf_pages(string filename)
        {
            PdfReader pdf = new PdfReader(filename);
            try
            {
                return pdf.NumberOfPages;
            }
            finally
            {
                //Release the reader once the page count has been read
                pdf.Close();
            }
        }
    }
}

Extract PDF text content

The text content of a PDF file can be extracted using iTextSharp.

1. Define the reader, parser, and reading strategy as shown below.

The reading process is:

a) Generate a reader by specifying the file path;

b) Generate a parser based on the reader;

c) Obtain a text-extraction strategy by calling the parser's generic method ProcessContent with the page number and a strategy instance (in the code below, i is the page number, the second argument is the strategy object, and the type argument in the angle brackets specifies the strategy type used to read the text);

d) Obtain the entire text content of the specified page through the GetResultantText method of the extraction strategy;

//Define the reader, the parser and the extraction strategy.
//tbx_pdf, i (page number) and str_pdf are expected to be defined by the surrounding code.
PdfReader pr = new PdfReader(tbx_pdf.Text.Trim());
PdfReaderContentParser prcp = new PdfReaderContentParser(pr);
ITextExtractionStrategy ites;
ites = prcp.ProcessContent<SimpleTextExtractionStrategy>(i, new SimpleTextExtractionStrategy());
//Text of page i as collected by the strategy
str_pdf = ites.GetResultantText();

e) The code above requires the following namespaces: iTextSharp.text.pdf and iTextSharp.text.pdf.parser.

Specific examples are as follows:

using iTextSharp.text.pdf;
using iTextSharp.text.pdf.parser;
using System.Text.RegularExpressions;
//Read PDF file (pdfPath is expected to be defined by the surrounding code)
PdfReader pr = new PdfReader(pdfPath);
//PDF parser
PdfReaderContentParser prcp = new PdfReaderContentParser(pr);
//PDF text extraction method, where i is the page of the PDF being read, the page number starts from 1, and the total number of pages can be obtained using PdfReader.NumberOfPages
ITextExtractionStrategy ites = prcp.ProcessContent<SimpleTextExtractionStrategy>(i, new SimpleTextExtractionStrategy());
//Read the text from the strategy declared above
string pdfText = ites.GetResultantText();

Split PDF file (extract specified pages)

iTextSharp can be used to split PDF files. The process has the following steps (illustrated by the code below):

1. Define an empty document (the Document type in iTextSharp); it is opened with its Open method once the PdfCopy object of step 2 has been created;

2. Use the Document object and file stream (specify the file stream mode and the path of the split file storage) to generate an object of the PdfCopy class (this object associates the Document object with the output file stream);

4. Define a PdfImportedPage object, which is used to hold one PDF page extracted from the source PDF;

5. Use the GetImportedPage method of the PdfCopy object to extract a page (the second parameter of the method) from a PdfReader reader (associated with the source PDF file);

6. Add the extracted PdfImportedPage object (one PDF page) to the associated Document through the AddPage method of the PdfCopy object;

//Copy page i of the source reader pr into a new PDF file.
//tbx_targpath, str_id, pr and i are expected to be defined by the surrounding code.
Document doc = new Document();
//FileMode.Create starts a fresh file: appending PDF output to an existing file would corrupt it.
PdfCopy pc = new PdfCopy(doc, new FileStream(tbx_targpath.Text + @"\KC" + str_id + ".pdf", System.IO.FileMode.Create));
PdfImportedPage pip = null;
//The document is opened only after the PdfCopy object has been attached to it.
doc.Open();
pip = pc.GetImportedPage(pr, i);
pc.AddPage(pip);
//Closing the document finalizes the output file.
doc.Close();