.NET PDF to text (Tesseract OCR + PdfiumViewer)

Environment setup

For installation and setup, refer to the following articles:

Tesseract OCR: .NET Tesseract OCR – Nuggets (juejin.cn)

PdfiumViewer: .NET Convert PDF to image using PdfiumViewer – Nuggets (juejin.cn)

Code integration

Create OCRHelper.cs

using System;
using System.IO;
using System.Text.RegularExpressions;
using Tesseract;

namespace OCR_7
{
    public static class OCRHelper
    {
        // Working directory that receives the rendered page images. This is a static field
        // initializer, so GuidTo16String() is evaluated once and every Scan call reuses the same path.
        private static string imagePath = $"{Environment.CurrentDirectory}/ocr_file/image_{GuidTo16String()}";
        // Path handed to the TesseractEngine constructor in ScanPng.
        // NOTE(review): hard-coded absolute path; presumably the folder holding the "eng" language data — confirm and adjust per machine.
        private static string tesseractPath = $@"Z:\.net_project\OCR_7\OCR_7\file\tesseract";

        /// <summary>
        /// Runs OCR over a PDF file: creates the working image directory, renders the
        /// PDF pages into it as PNG files, then recognizes the text of those images.
        /// </summary>
        /// <param name="path">Path of the PDF file to read.</param>
        /// <returns>The text produced by <c>PngToText</c> for the rendered pages.</returns>
        /// <exception cref="ArgumentException"><paramref name="path"/> is null, empty or whitespace.</exception>
        public static string Scan(string path)
        {
            // Fail fast with a clear message instead of creating the working
            // directory and then failing somewhere inside the PDF loader.
            if (string.IsNullOrWhiteSpace(path))
            {
                throw new ArgumentException("The PDF path must not be null or empty.", nameof(path));
            }

            CreateDir();
            PDFToPng(path);
            return PngToText();
        }

        /// <summary>
        /// Builds a 16-character lowercase hexadecimal string from a freshly generated GUID
        /// combined with the current clock ticks.
        /// </summary>
        /// <returns>A string of exactly 16 hexadecimal digits.</returns>
        public static string GuidTo16String()
        {
            long product = 1;

            // The product of sixteen factors in 1..256 routinely exceeds long.MaxValue. Wrapping is
            // intended, so state it explicitly; otherwise a project compiled with /checked would throw.
            unchecked
            {
                foreach (byte b in Guid.NewGuid().ToByteArray())
                {
                    product *= b + 1;
                }

                // "x16" left-pads with zeros so the result is always 16 digits;
                // plain "x" drops leading zeros and can yield a shorter string.
                return (product - DateTime.Now.Ticks).ToString("x16");
            }
        }

        /// <summary>
        /// Writes the given text to <c>ocr_file/text/result.txt</c> under the current directory.
        /// Failures are reported on the console rather than thrown.
        /// </summary>
        /// <param name="result">Text to write.</param>
        private static void SaveText(string result)
        {
            string basePath = Environment.CurrentDirectory + "/ocr_file";

            try
            {
                // StreamWriter does not create missing folders, so make sure the target folder exists.
                Directory.CreateDirectory(basePath + "/text");

                // "using" closes the file even when WriteLine throws.
                using (StreamWriter sw = new StreamWriter(basePath + "/text/result.txt"))
                {
                    sw.WriteLine(result);
                }
            }
            catch (Exception e)
            {
                Console.WriteLine("Exception: " + e.Message);
            }
            finally
            {
                Console.WriteLine("Executing finally block.");
            }
        }

        /// <summary>
        /// Runs OCR on every file in the working image directory, in page order, and returns the
        /// combined text after whitespace normalization. The directory is deleted afterwards.
        /// </summary>
        /// <returns>The normalized text of all page images.</returns>
        private static string PngToText()
        {
            var text = new System.Text.StringBuilder();

            try
            {
                FileInfo[] files = new DirectoryInfo(imagePath).GetFiles();

                // GetFiles does not guarantee any order, and plain name order would put
                // "10.png" before "2.png"; sort by page number so the text stays in reading order.
                Array.Sort(files, ComparePageFiles);

                foreach (FileInfo file in files)
                {
                    text.Append(ScanPng(file.FullName));
                }
            }
            finally
            {
                // Clean up the temporary images even when OCR fails part-way through.
                try
                {
                    Directory.Delete(imagePath, true);
                }
                catch (Exception e)
                {
                    Console.WriteLine("The process failed: {0}", e.Message);
                }
            }

            return handleResult(text.ToString());
        }

        /// <summary>
        /// Orders page image files: names that are plain integers come first in numeric order,
        /// all other names follow in ordinal order.
        /// </summary>
        private static int ComparePageFiles(FileInfo left, FileInfo right)
        {
            int leftPage;
            int rightPage;
            bool leftIsNumber = int.TryParse(Path.GetFileNameWithoutExtension(left.Name), out leftPage);
            bool rightIsNumber = int.TryParse(Path.GetFileNameWithoutExtension(right.Name), out rightPage);

            if (leftIsNumber && rightIsNumber)
            {
                return leftPage.CompareTo(rightPage);
            }

            // Keep the ordering total (numeric names before the rest) so Array.Sort stays well-defined.
            if (leftIsNumber != rightIsNumber)
            {
                return leftIsNumber ? -1 : 1;
            }

            return string.CompareOrdinal(left.Name, right.Name);
        }

        /// <summary>
        /// Normalizes OCR output: every run of whitespace is collapsed into a single space
        /// and leading/trailing whitespace is removed.
        /// </summary>
        /// <param name="result">Raw recognized text.</param>
        /// <returns>The text with whitespace collapsed and trimmed.</returns>
        private static string handleResult(string result)
        {
            string collapsed = Regex.Replace(result, @"\s{1,}", " ", RegexOptions.IgnoreCase);
            return collapsed.Trim();
        }

        /// <summary>
        /// Recognizes the text of a single image file with Tesseract, using the "eng" language.
        /// </summary>
        /// <param name="path">Path of the image file to process.</param>
        /// <returns>The text Tesseract reports for the image.</returns>
        private static string ScanPng(string path)
        {
            // The engine, the image and the page result are all disposable; release them
            // deterministically so they do not pile up while a multi-page PDF is processed.
            using (TesseractEngine engine = new TesseractEngine(tesseractPath, "eng"))
            using (Pix pix = Pix.LoadFromFile(path))
            using (Page page = engine.Process(pix))
            {
                return page.GetText();
            }
        }

        /// <summary>
        /// Renders the PDF at <paramref name="path"/> into the working image directory as PNG
        /// files named by page number, requesting pages 1 through 99.
        /// </summary>
        /// <param name="path">Path of the PDF file to render.</param>
        private static void PDFToPng(string path)
        {
            const int firstPage = 1;
            const int lastPage = 99;

            PdfToImage(
                pdfPath: path,
                imagePath: imagePath,
                imageName: "",
                imagePathFormat: "png",
                imageFormat: System.Drawing.Imaging.ImageFormat.Png,
                startPageNum: firstPage,
                endPageNum: lastPage);
        }



        /// <summary>
        /// Makes sure the working image directory exists.
        /// </summary>
        private static void CreateDir() => CreateDirByPath(imagePath);

        /// <summary>
        /// Creates the directory at <paramref name="path"/> unless it already exists, and reports
        /// the outcome on the console. Any exception is logged to the console and not rethrown.
        /// </summary>
        /// <param name="path">Directory to create.</param>
        private static void CreateDirByPath(string path)
        {
            try
            {
                if (!Directory.Exists(path))
                {
                    Directory.CreateDirectory(path);
                    Console.WriteLine("The directory was created successfully at {0}.", Directory.GetCreationTime(path));
                }
                else
                {
                    Console.WriteLine("That path exists already.");
                }
            }
            catch (Exception e)
            {
                Console.WriteLine("The process failed: {0}", e.ToString());
            }
        }

        /// <summary>
        /// Renders a range of pages of a PDF file to image files, one file per page, named
        /// <c>{imageName}{pageNumber}{suffix}</c> inside <paramref name="imagePath"/>.
        /// </summary>
        /// <param name="pdfPath">pdf path</param>
        /// <param name="imagePath">Output image folder; created when it does not exist</param>
        /// <param name="imageName">Output image name prefix placed before the page number</param>
        /// <param name="imagePathFormat">Output image suffix, with or without the leading dot</param>
        /// <param name="imageFormat">Output image format</param>
        /// <param name="startPageNum">Start page number (1-based); values below 1 are treated as 1</param>
        /// <param name="endPageNum">End page number (1-based, inclusive); values beyond the last page are treated as the last page</param>
        public static void PdfToImage(
            string pdfPath,
            string imagePath,
            string imageName,
            string imagePathFormat,
            System.Drawing.Imaging.ImageFormat imageFormat,
            int startPageNum,
            int endPageNum
            )
        {
            #region Folder and path processing
            // CreateDirectory does nothing when the folder already exists.
            System.IO.Directory.CreateDirectory(imagePath);
            if (!imagePath.EndsWith("\\", StringComparison.Ordinal) && !imagePath.EndsWith("/", StringComparison.Ordinal))
            {
                // Use the platform separator ('\' on Windows) instead of a hard-coded backslash.
                imagePath = imagePath + System.IO.Path.DirectorySeparatorChar;
            }
            if (!imagePathFormat.StartsWith(".", StringComparison.Ordinal))
            {
                imagePathFormat = "." + imagePathFormat;
            }
            #endregion

            // "using" releases the document even when rendering or saving a page throws.
            using (var pdf = PdfiumViewer.PdfDocument.Load(pdfPath))
            {
                var pdfSize = pdf.PageSizes;

                #region start end page
                if (startPageNum > endPageNum)//Start>End: swap so the range runs forward
                {
                    int tempPageNum = startPageNum;
                    startPageNum = endPageNum;
                    endPageNum = tempPageNum;
                }
                // Clamp after swapping so neither bound can index outside 1..PageCount.
                startPageNum = Math.Max(startPageNum, 1);
                endPageNum = Math.Min(endPageNum, pdf.PageCount);
                #endregion

                for (int i = startPageNum; i <= endPageNum; i++)
                {
                    //pdfSize is a list type, the index starts from 0, and the pdf page number starts from 1, so -1 is needed
                    int width = (int)pdfSize[i - 1].Width;
                    int height = (int)pdfSize[i - 1].Height;

                    // NOTE(review): the bitmap is rendered at the raw PageSizes dimensions while 350 is passed
                    // as DPI; presumably PageSizes is in points, which would make this a low-resolution
                    // image for OCR — confirm whether scaling width/height by 350/72 was intended.
                    using (var image = pdf.Render(i - 1, width, height, 350, 350, PdfiumViewer.PdfRenderFlags.Annotations))
                    using (var stream = new System.IO.FileStream($"{imagePath}{imageName}{i}{imagePathFormat}", System.IO.FileMode.Create))
                    {
                        image.Save(stream, imageFormat);
                    }

                    // Deliberately does not launch the output folder via Process.Start: doing so for every
                    // page opens one shell window per page, and a conversion helper should not start UI.
                }
            }
        }

    }
}

Note:

image.png

Test

OCRHelper.Scan(path) reads the PDF file at the given path and returns its recognized text; the program below prints that text to the console.

using System;

namespace OCR_7
{
    /// <summary>
    /// Console entry point that runs OCR on one PDF file and prints the recognized text.
    /// </summary>
    public class Program
    {
        /// <summary>
        /// Scans the PDF named by the first command-line argument, or the sample file when
        /// no argument is given, and writes the recognized text to the console.
        /// </summary>
        /// <param name="args">Optional: the path of the PDF file as the first argument.</param>
        static void Main(string[] args)
        {
            // Verbatim literal (@): in a regular string "\." and "\p" are invalid escape
            // sequences, and "\f" / "\0" would silently become control characters.
            string path = args.Length > 0
                ? args[0]
                : @"Z:\.net_project\OCR_7\OCR_7\file\pdf\01.pdf";

            string s = OCRHelper.Scan(path);
            Console.WriteLine(s);
        }
    }
}

image.png

image.png