Environment setup
Refer to the following articles:
Tesseract OCR: .NET Tesseract OCR – Nuggets (juejin.cn)
PdfiumViewer: .NET Convert PDF to image using PdfiumViewer – Nuggets (juejin.cn)
Code integration
Create OCRHelper.cs
using System;
using System.IO;
using System.Text;
using System.Text.RegularExpressions;
using Tesseract;

namespace OCR_7
{
    /// <summary>
    /// Extracts text from a PDF by rendering its pages to PNG images with PdfiumViewer
    /// and running Tesseract OCR over each rendered page.
    /// </summary>
    public static class OCRHelper
    {
        // Working directory for the rendered page images. It is computed once per process,
        // so every Scan call reuses (and afterwards deletes) the same directory.
        private static readonly string imagePath = $"{Environment.CurrentDirectory}/ocr_file/image_{GuidTo16String()}";

        // Directory handed to TesseractEngine as its data path (language "eng" is loaded from it).
        private static readonly string tesseractPath = @"Z:\.net_project\OCR_7\OCR_7\file\tesseract";

        // Matches any run of whitespace; built once instead of on every handleResult call.
        private static readonly Regex whitespaceRun = new Regex(@"\s{1,}", RegexOptions.IgnoreCase);

        /// <summary>
        /// Renders the PDF at <paramref name="path"/> to images, OCRs every page and returns
        /// the recognized text with all whitespace runs collapsed to single spaces.
        /// </summary>
        /// <param name="path">Path of the PDF file to read.</param>
        /// <returns>The recognized text of the rendered pages, in page order.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="path"/> is null.</exception>
        public static string Scan(string path)
        {
            if (path == null)
            {
                throw new ArgumentNullException(nameof(path));
            }

            CreateDir();
            try
            {
                PDFToPng(path);
                return PngToText();
            }
            finally
            {
                // Remove the temporary images even when rendering or OCR fails.
                DeleteImageDir();
            }
        }

        /// <summary>
        /// Builds a hexadecimal string (at most 16 digits) derived from a new GUID and the current time.
        /// </summary>
        /// <returns>A lowercase hexadecimal string.</returns>
        public static string GuidTo16String()
        {
            long product = 1;
            foreach (byte b in Guid.NewGuid().ToByteArray())
            {
                // The product of 16 factors routinely exceeds 64 bits; wrapping is intended,
                // so make it explicit rather than relying on the project's overflow setting.
                unchecked
                {
                    product *= b + 1;
                }
            }

            return string.Format("{0:x}", unchecked(product - DateTime.Now.Ticks));
        }

        /// <summary>
        /// Writes <paramref name="result"/> to "ocr_file/text/result.txt" under the current directory.
        /// Failures are reported on the console instead of being thrown.
        /// </summary>
        private static void SaveText(string result)
        {
            string basePath = Environment.CurrentDirectory + "/ocr_file";
            try
            {
                // 'using' closes the file even when WriteLine throws.
                using (StreamWriter writer = new StreamWriter(basePath + "/text/result.txt"))
                {
                    writer.WriteLine(result);
                }
            }
            catch (Exception e)
            {
                Console.WriteLine("Exception: " + e.Message);
            }
            finally
            {
                Console.WriteLine("Executing finally block.");
            }
        }

        /// <summary>
        /// OCRs every file in the image directory and returns the normalized, concatenated text.
        /// </summary>
        private static string PngToText()
        {
            FileInfo[] files = new DirectoryInfo(imagePath).GetFiles();

            // GetFiles gives no ordering guarantee (and a lexical order would put "10.png"
            // before "2.png"), so order the pages by their numeric file name.
            Array.Sort(files, ComparePageFiles);

            StringBuilder text = new StringBuilder();

            // One engine is shared by all pages; creating it is the expensive part.
            using (TesseractEngine engine = new TesseractEngine(tesseractPath, "eng"))
            {
                foreach (FileInfo file in files)
                {
                    text.Append(ScanPng(engine, file.FullName));
                }
            }

            return handleResult(text.ToString());
        }

        /// <summary>
        /// Orders page image files by the page number in their name; files whose name is not
        /// a number sort after the numbered ones, by ordinal name.
        /// </summary>
        private static int ComparePageFiles(FileInfo left, FileInfo right)
        {
            int leftPage;
            int rightPage;
            bool leftIsNumber = int.TryParse(Path.GetFileNameWithoutExtension(left.Name), out leftPage);
            bool rightIsNumber = int.TryParse(Path.GetFileNameWithoutExtension(right.Name), out rightPage);

            if (leftIsNumber && rightIsNumber)
            {
                return leftPage.CompareTo(rightPage);
            }

            if (leftIsNumber != rightIsNumber)
            {
                return leftIsNumber ? -1 : 1;
            }

            return string.CompareOrdinal(left.Name, right.Name);
        }

        /// <summary>
        /// Deletes the image directory and its contents; failures are only logged.
        /// </summary>
        private static void DeleteImageDir()
        {
            try
            {
                if (Directory.Exists(imagePath))
                {
                    Directory.Delete(imagePath, true);
                }
            }
            catch (Exception e)
            {
                Console.WriteLine("The process failed: {0}", e.Message);
            }
        }

        /// <summary>
        /// Collapses every run of whitespace in <paramref name="result"/> to a single space and trims the ends.
        /// </summary>
        private static string handleResult(string result)
        {
            return whitespaceRun.Replace(result, " ").Trim();
        }

        /// <summary>
        /// Runs OCR on a single image file and returns the recognized text.
        /// </summary>
        private static string ScanPng(TesseractEngine engine, string path)
        {
            // Pix and Page wrap native resources; the Page must be disposed before the
            // engine can process the next image.
            using (Pix pix = Pix.LoadFromFile(path))
            using (Page page = engine.Process(pix))
            {
                return page.GetText();
            }
        }

        /// <summary>
        /// Renders pages 1 to 99 (clamped to the document length) of the PDF into the image directory as PNG files.
        /// </summary>
        private static void PDFToPng(string path)
        {
            PdfToImage(
                path,
                imagePath,
                "",
                "png",
                System.Drawing.Imaging.ImageFormat.Png,
                1,
                99
            );
        }

        /// <summary>
        /// Ensures the image directory exists.
        /// </summary>
        private static void CreateDir()
        {
            CreateDirByPath(imagePath);
        }

        /// <summary>
        /// Creates <paramref name="path"/> if it does not exist yet, reporting the outcome on the console.
        /// Failures are logged, not thrown.
        /// </summary>
        private static void CreateDirByPath(string path)
        {
            try
            {
                if (Directory.Exists(path))
                {
                    Console.WriteLine("That path exists already.");
                    return;
                }

                Directory.CreateDirectory(path);
                Console.WriteLine("The directory was created successfully at {0}.", Directory.GetCreationTime(path));
            }
            catch (Exception e)
            {
                Console.WriteLine("The process failed: {0}", e.ToString());
            }
        }

        /// <summary>
        /// Convert pdf to image: renders a range of pages, one image file per page, named
        /// "{imageName}{pageNumber}{suffix}" inside <paramref name="imagePath"/>.
        /// </summary>
        /// <param name="pdfPath">pdf path</param>
        /// <param name="imagePath">Output image directory (created when missing)</param>
        /// <param name="imageName">Output image name prefix</param>
        /// <param name="imagePathFormat">Output image suffix, with or without the leading dot</param>
        /// <param name="imageFormat">Output image format</param>
        /// <param name="startPageNum">Start page number (1-based, inclusive)</param>
        /// <param name="endPageNum">End page number (1-based, inclusive)</param>
        /// <remarks>
        /// The two page numbers may be given in either order. The range is clamped to the
        /// pages that exist, so a range entirely outside the document renders nothing.
        /// </remarks>
        public static void PdfToImage(
            string pdfPath,
            string imagePath,
            string imageName,
            string imagePathFormat,
            System.Drawing.Imaging.ImageFormat imageFormat,
            int startPageNum,
            int endPageNum
        )
        {
            #region Folder and path processing
            if (!Directory.Exists(imagePath))
            {
                Directory.CreateDirectory(imagePath);
            }

            if (!imagePathFormat.StartsWith("."))
            {
                imagePathFormat = "." + imagePathFormat;
            }
            #endregion

            using (var pdf = PdfiumViewer.PdfDocument.Load(pdfPath))
            {
                var pdfSize = pdf.PageSizes;

                #region start end page
                // Put the bounds in order first, then clamp them, so that neither bound can
                // end up below 1 or above the page count.
                if (startPageNum > endPageNum)
                {
                    int tempPageNum = startPageNum;
                    startPageNum = endPageNum;
                    endPageNum = tempPageNum;
                }

                if (startPageNum <= 0)
                {
                    startPageNum = 1;
                }

                if (endPageNum > pdf.PageCount)
                {
                    endPageNum = pdf.PageCount;
                }
                #endregion

                for (int i = startPageNum; i <= endPageNum; i++)
                {
                    // pdfSize is zero-based while page numbers start at 1, hence the -1.
                    int width = (int)pdfSize[i - 1].Width;
                    int height = (int)pdfSize[i - 1].Height;

                    // Path.Combine inserts a separator only when imagePath does not already end with one.
                    string target = Path.Combine(imagePath, $"{imageName}{i}{imagePathFormat}");

                    using (var stream = new FileStream(target, FileMode.Create))
                    using (var image = pdf.Render(i - 1, width, height, 350, 350, PdfiumViewer.PdfRenderFlags.Annotations))
                    {
                        image.Save(stream, imageFormat);
                    }

                    // NOTE: the previous per-page Process.Start(imagePath) call was removed on
                    // purpose. It launched a shell once per page, on a directory that Scan
                    // deletes right after OCR.
                }
            }
        }
    }
}
Notice:
Test
OCRHelper.Scan(path): reads the PDF file at the given path and returns its recognized text, which the test program below prints to the console.
using System;

namespace OCR_7
{
    /// <summary>
    /// Console entry point that OCRs a sample PDF and prints the recognized text.
    /// </summary>
    public class Program
    {
        /// <summary>
        /// Scans the sample PDF with <c>OCRHelper.Scan</c> and writes the result to the console.
        /// </summary>
        /// <param name="args">Command-line arguments (unused).</param>
        static void Main(string[] args)
        {
            // Verbatim string: in a regular literal "\." and "\O" do not compile, and
            // "\f" / "\0" would silently become control characters.
            string s = OCRHelper.Scan(@"Z:\.net_project\OCR_7\OCR_7\file\pdf\01.pdf");
            Console.WriteLine(s);
        }
    }
}