C#解析PDF的方式有很多,比较好用的有iTextSharp和PdfBox。

PDF内容页如果是图片类型,例如扫描件,则需要进行OCR(光学字符识别)。

文本内容的PDF文档,解析的过程中,我目前仅发现能以字符串的形式读取的,不能够读取其中的表格。据说PDF文档结构中是没有表格概念的,因此这个自然是读不到的,如果果真如此,则PDF中表格内容的解析,只能对获取到的字符串按照一定的逻辑自行解析了。

iTextSharp是一个C#开源项目,PdfBox为Java开源项目,借助于IKVM在.Net平台下有实现。

Pdf转换Image,使用的是GhostScript,可以以API的方式调用,也可以以Windows命令行的方式调用。

OCR使用的是Asprise,识别效果较好(商业),另外还可以使用MS的Image Scanning(2007)或OneNote(2010)(需要依赖Office组件),Tesseract(HP->Google)(效果很差)。

附上iTextSharp、PdfBox对PDF的解析代码。

iTextSharp辅助类

 using System;
using System.Collections.Generic;
using System.Text; using iTextSharp.text.pdf;
using iTextSharp.text.pdf.parser;
using System.IO; namespace eyuan
{
/// <summary>
/// Helper methods that read PDF files through iTextSharp.
/// Failures are written to <c>LogHandler</c> and reported through the
/// return value rather than thrown.
/// </summary>
public static class ITextSharpHandler
{
    /// <summary>
    /// Extracts the text of every page of a PDF file.
    /// </summary>
    /// <param name="fileName">Path of the PDF file to read.</param>
    /// <returns>
    /// The text of all pages, one <c>AppendLine</c> per page.
    /// <see cref="string.Empty"/> when the file does not exist or cannot be opened.
    /// If extraction fails part-way, the text gathered so far is returned.
    /// </returns>
    public static string ReadPdf(string fileName)
    {
        PdfReader reader = OpenReader(fileName);
        if (reader == null)
        {
            return string.Empty;
        }

        StringBuilder sbFileContent = new StringBuilder();
        try
        {
            // iTextSharp page numbers are 1-based.
            for (int i = 1; i <= reader.NumberOfPages; i++)
            {
                sbFileContent.AppendLine(PdfTextExtractor.GetTextFromPage(reader, i));
            }
        }
        catch (Exception ex)
        {
            LogHandler.LogWrite(string.Format(@"解析PDF文件{0}失败,错误:{1}", fileName, ex.ToString()));
        }
        finally
        {
            reader.Close();
        }

        return sbFileContent.ToString();
    }

    /// <summary>
    /// Gets the number of pages in a PDF file.
    /// </summary>
    /// <param name="fileName">Path of the PDF file to inspect.</param>
    /// <returns>
    /// The page count, or -1 when the file does not exist or cannot be opened.
    /// </returns>
    public static int GetPdfPageCount(string fileName)
    {
        PdfReader reader = OpenReader(fileName);
        if (reader == null)
        {
            return -1;
        }

        try
        {
            return reader.NumberOfPages;
        }
        finally
        {
            // The reader must be closed here as well; leaving it open leaks the reader.
            reader.Close();
        }
    }

    /// <summary>
    /// Opens a <c>PdfReader</c> for the given file, logging the reason on failure.
    /// </summary>
    /// <param name="fileName">Path of the PDF file to open.</param>
    /// <returns>An open reader the caller must close, or null on failure.</returns>
    private static PdfReader OpenReader(string fileName)
    {
        if (!File.Exists(fileName))
        {
            LogHandler.LogWrite(@"指定的PDF文件不存在:" + fileName);
            return null;
        }

        try
        {
            return new PdfReader(fileName);
        }
        catch (Exception ex)
        {
            LogHandler.LogWrite(string.Format(@"加载PDF文件{0}失败,错误:{1}", fileName, ex.ToString()));
            return null;
        }
    }
}
}

PDFBox辅助类

 using org.pdfbox.pdmodel;
using org.pdfbox.util;
using System;
using System.Collections.Generic;
using System.IO;
using System.Text; namespace eyuan
{
/// <summary>
/// Helper that extracts text from a PDF file through the PDFBox (IKVM) port.
/// </summary>
public static class PdfBoxHandler
{
    /// <summary>
    /// Reads the text content of a PDF file with PDFBox.
    /// </summary>
    /// <param name="input">Path of the PDF file.</param>
    /// <returns>
    /// The extracted text, or null when the file does not exist, cannot be
    /// loaded, or cannot be parsed (each failure is logged).
    /// </returns>
    public static string ReadPdf(string input)
    {
        // Guard: nothing to do for a missing file.
        if (!File.Exists(input))
        {
            LogHandler.LogWrite(@"指定的PDF文件不存在:" + input);
            return null;
        }

        // Step 1: load the document.
        PDDocument document = null;
        try
        {
            document = PDDocument.load(input);
        }
        catch (Exception loadError)
        {
            LogHandler.LogWrite(string.Format(@"加载PDF文件{0}失败,错误:{1}", new string[] { input, loadError.ToString() }));
            return null;
        }

        // Step 2: strip the text, always closing the document afterwards.
        string extractedText = null;
        try
        {
            PDFTextStripper textStripper = new PDFTextStripper();
            extractedText = textStripper.getText(document);
        }
        catch (Exception parseError)
        {
            LogHandler.LogWrite(string.Format(@"解析PDF文件{0}失败,错误:{1}", new string[] { input, parseError.ToString() }));
        }
        finally
        {
            if (document != null)
            {
                document.close();
                document = null;
            }
        }

        return extractedText;
    }
}
}

另外附上PDF转Image,然后对Image进行OCR的代码。

转换PDF为Jpeg图片代码(GhostScript辅助类)

 using System;
using System.Collections;
using System.Collections.Generic;
using System.Runtime.InteropServices;
using System.Text; namespace eyuan
{
/// <summary>
/// Converts PDF pages to image files by driving the Ghostscript C API
/// (gsdll32.dll) through P/Invoke.
/// </summary>
public class GhostscriptHandler
{
    #region Ghostscript imports

    /// <summary>
    /// Creates a new Ghostscript instance.
    /// This instance is passed to most other gsapi functions.
    /// The caller_handle will be provided to callback functions.
    /// At this stage, Ghostscript supports only one instance.
    /// </summary>
    /// <param name="pinstance">Receives the instance handle.</param>
    /// <param name="caller_handle">Opaque handle passed back to callbacks.</param>
    /// <returns>0 on success, a negative value on error.</returns>
    [DllImport("gsdll32.dll", EntryPoint = "gsapi_new_instance")]
    private static extern int gsapi_new_instance(out IntPtr pinstance, IntPtr caller_handle);

    /// <summary>
    /// Initialises the interpreter with the given arguments; this is the call
    /// that performs the conversion.
    /// </summary>
    /// <param name="instance">Instance handle from gsapi_new_instance.</param>
    /// <param name="argc">Number of arguments.</param>
    /// <param name="argv">Pointer to an array of null-terminated argument strings.</param>
    /// <returns>0 on success, a negative value otherwise.</returns>
    [DllImport("gsdll32.dll", EntryPoint = "gsapi_init_with_args")]
    private static extern int gsapi_init_with_args(IntPtr instance, int argc, IntPtr argv);

    /// <summary>
    /// Exits the interpreter.
    /// This must be called on shutdown if gsapi_init_with_args() has been called,
    /// and just before gsapi_delete_instance().
    /// </summary>
    /// <param name="instance">Instance handle.</param>
    /// <returns>Ghostscript return code.</returns>
    [DllImport("gsdll32.dll", EntryPoint = "gsapi_exit")]
    private static extern int gsapi_exit(IntPtr instance);

    /// <summary>
    /// Destroys an instance of Ghostscript.
    /// Before you call this, Ghostscript must have finished.
    /// If Ghostscript has been initialised, you must call gsapi_exit before gsapi_delete_instance.
    /// </summary>
    /// <param name="instance">Instance handle.</param>
    [DllImport("gsdll32.dll", EntryPoint = "gsapi_delete_instance")]
    private static extern void gsapi_delete_instance(IntPtr instance);

    #endregion

    #region Fields

    // Ghostscript reports "quit was executed" with this code; it is not an error.
    private const int GsErrorQuit = -101;

    private string _sDeviceFormat;
    private int _iWidth;
    private int _iHeight;
    private int _iResolutionX;
    private int _iResolutionY;
    private int _iJPEGQuality;
    private Boolean _bFitPage;
    private IntPtr _objHandle;

    #endregion

    #region Properties

    /// <summary>
    /// Ghostscript output device name (passed as -sDEVICE).
    /// Overwritten by the deviceFormat argument of each <see cref="Convert"/> call.
    /// </summary>
    public string OutputFormat
    {
        get { return _sDeviceFormat; }
        set { _sDeviceFormat = value; }
    }

    /// <summary>
    /// Output width; emitted as part of -g{Width}x{Height} when both
    /// Width and Height are positive.
    /// </summary>
    public int Width
    {
        get { return _iWidth; }
        set { _iWidth = value; }
    }

    /// <summary>
    /// Output height; emitted as part of -g{Width}x{Height} when both
    /// Width and Height are positive.
    /// </summary>
    public int Height
    {
        get { return _iHeight; }
        set { _iHeight = value; }
    }

    /// <summary>
    /// Horizontal resolution (-r). Overwritten by the width argument of each
    /// <see cref="Convert"/> call.
    /// </summary>
    public int ResolutionX
    {
        get { return _iResolutionX; }
        set { _iResolutionX = value; }
    }

    /// <summary>
    /// Vertical resolution (-r). Overwritten by the height argument of each
    /// <see cref="Convert"/> call.
    /// </summary>
    public int ResolutionY
    {
        get { return _iResolutionY; }
        set { _iResolutionY = value; }
    }

    /// <summary>
    /// When true, -dPDFFitPage is added to the Ghostscript arguments.
    /// </summary>
    public Boolean FitPage
    {
        get { return _bFitPage; }
        set { _bFitPage = value; }
    }

    /// <summary>
    /// JPEG compression quality; emitted as -dJPEGQ when in the range 1..100
    /// and the output device is JPEG.
    /// </summary>
    public int JPEGQuality
    {
        get { return _iJPEGQuality; }
        set { _iJPEGQuality = value; }
    }

    #endregion

    #region Construction

    /// <summary>
    /// Creates a handler whose caller handle is passed to gsapi_new_instance.
    /// </summary>
    /// <param name="objHandle">Caller handle forwarded to Ghostscript.</param>
    public GhostscriptHandler(IntPtr objHandle)
    {
        _objHandle = objHandle;
    }

    /// <summary>
    /// Creates a handler with a null caller handle.
    /// </summary>
    public GhostscriptHandler()
    {
        _objHandle = IntPtr.Zero;
    }

    #endregion

    #region String handling

    /// <summary>
    /// Converts a string to a null-terminated ANSI byte array for Ghostscript.
    /// The caller pins the array (GCHandle.Alloc with GCHandleType.Pinned) and
    /// passes AddrOfPinnedObject() to the native API.
    /// </summary>
    /// <param name="str">String to convert.</param>
    /// <returns>Encoded bytes followed by a single 0 terminator.</returns>
    private byte[] StringToAnsiZ(string str)
    {
        // Encode with the system default encoding instead of truncating each
        // char to a byte, which corrupted any non-ASCII path. ASCII input
        // produces the same bytes as before.
        // NOTE(review): Encoding.Default is the ANSI code page on .NET
        // Framework but UTF-8 on .NET Core - confirm the target runtime.
        byte[] encoded = Encoding.Default.GetBytes(str);
        byte[] aAnsi = new byte[encoded.Length + 1];
        Array.Copy(encoded, aAnsi, encoded.Length);
        // The final element is already 0 and serves as the terminator.
        return aAnsi;
    }

    #endregion

    #region Conversion

    /// <summary>
    /// Converts a page range of a PDF file to image output using Ghostscript.
    /// </summary>
    /// <param name="inputFile">Path of the input PDF file.</param>
    /// <param name="outputFile">Output path passed as -sOutputFile.</param>
    /// <param name="firstPage">First page to convert (-dFirstPage).</param>
    /// <param name="lastPage">Last page to convert (-dLastPage).</param>
    /// <param name="deviceFormat">Ghostscript device name (-sDEVICE), e.g. "jpeg".</param>
    /// <param name="width">Stored as ResolutionX and used for -r, despite the name.</param>
    /// <param name="height">Stored as ResolutionY and used for -r, despite the name.</param>
    public void Convert(string inputFile, string outputFile,
        int firstPage, int lastPage, string deviceFormat, int width, int height)
    {
        if (!System.IO.File.Exists(inputFile))
        {
            LogHandler.LogWrite(string.Format("文件{0}不存在", inputFile));
            return;
        }

        string[] sArgs = GetGeneratedArgs(inputFile, outputFile,
            firstPage, lastPage, deviceFormat, width, height);
        int intElementCount = sArgs.Length;

        IntPtr[] aPtrArgs = new IntPtr[intElementCount];
        GCHandle[] aGCHandle = new GCHandle[intElementCount];
        GCHandle gchandleArgs = default(GCHandle);
        IntPtr intGSInstanceHandle = IntPtr.Zero;
        bool initAttempted = false;

        try
        {
            // Pin a null-terminated ANSI copy of every argument and record its address.
            for (int intCounter = 0; intCounter < intElementCount; intCounter++)
            {
                aGCHandle[intCounter] = GCHandle.Alloc(StringToAnsiZ(sArgs[intCounter]), GCHandleType.Pinned);
                aPtrArgs[intCounter] = aGCHandle[intCounter].AddrOfPinnedObject();
            }

            // Pin the pointer array itself; its address is argv.
            gchandleArgs = GCHandle.Alloc(aPtrArgs, GCHandleType.Pinned);
            IntPtr intptrArgs = gchandleArgs.AddrOfPinnedObject();

            int intReturn = gsapi_new_instance(out intGSInstanceHandle, _objHandle);
            if (intReturn < 0)
            {
                // Without an instance there is nothing to initialise or tear down.
                LogHandler.LogWrite(string.Format("创建Ghostscript实例失败,返回码:{0}", intReturn));
                intGSInstanceHandle = IntPtr.Zero;
                return;
            }

            try
            {
                initAttempted = true;
                intReturn = gsapi_init_with_args(intGSInstanceHandle, intElementCount, intptrArgs);
                if (intReturn < 0 && intReturn != GsErrorQuit)
                {
                    LogHandler.LogWrite(string.Format("PDF文件{0}转换失败,Ghostscript返回码:{1}", inputFile, intReturn));
                }
            }
            catch (Exception ex)
            {
                LogHandler.LogWrite(string.Format("PDF文件{0}转换失败.\n错误:{1}", new string[] { inputFile, ex.ToString() }));
            }
        }
        finally
        {
            if (intGSInstanceHandle != IntPtr.Zero)
            {
                if (initAttempted)
                {
                    gsapi_exit(intGSInstanceHandle);
                }
                gsapi_delete_instance(intGSInstanceHandle);
            }

            if (gchandleArgs.IsAllocated)
            {
                gchandleArgs.Free();
            }

            // Free every pinned argument. Bounding this loop by the Ghostscript
            // return code freed nothing on success (return code 0).
            for (int intCounter = 0; intCounter < intElementCount; intCounter++)
            {
                if (aGCHandle[intCounter].IsAllocated)
                {
                    aGCHandle[intCounter].Free();
                }
            }
        }
    }

    #endregion

    #region Argument generation

    /// <summary>
    /// Builds the Ghostscript command-line arguments for one conversion.
    /// Also stores deviceFormat, width and height into the OutputFormat,
    /// ResolutionX and ResolutionY state of this instance.
    /// </summary>
    /// <param name="inputFile">Input PDF path (last argument).</param>
    /// <param name="outputFile">Output path (-sOutputFile).</param>
    /// <param name="firstPage">First page (-dFirstPage).</param>
    /// <param name="lastPage">Last page (-dLastPage).</param>
    /// <param name="deviceFormat">Device name (-sDEVICE).</param>
    /// <param name="width">Horizontal resolution for -r.</param>
    /// <param name="height">Vertical resolution for -r.</param>
    /// <returns>
    /// Fixed switches first, then optional switches, then the output file and
    /// finally the input file.
    /// </returns>
    private string[] GetGeneratedArgs(string inputFile, string outputFile,
        int firstPage, int lastPage, string deviceFormat, int width, int height)
    {
        this._sDeviceFormat = deviceFormat;
        this._iResolutionX = width;
        this._iResolutionY = height;

        List<string> args = new List<string>();

        // Fixed switches.
        args.Add("pdf2img");            // argv[0]; Ghostscript ignores it
        args.Add("-dNOPAUSE");          // do not pause after each page
        args.Add("-dBATCH");            // exit after the last file
        args.Add("-dPARANOIDSAFER");    // run in safe mode
        args.Add("-sDEVICE=" + _sDeviceFormat);
        args.Add("-q");
        args.Add("-dQUIET");
        args.Add("-dNOPROMPT");
        args.Add("-dMaxBitmap=500000000");
        args.Add(String.Format("-dFirstPage={0}", firstPage));
        args.Add(String.Format("-dLastPage={0}", lastPage));
        args.Add("-dAlignToPixels=0");
        args.Add("-dGridFitTT=0");
        args.Add("-dTextAlphaBits=4");
        args.Add("-dGraphicsAlphaBits=4");

        // Optional switches. The Ghostscript device is named "jpeg"; "jpg" is
        // still accepted so existing callers keep their behaviour.
        bool isJpegDevice = _sDeviceFormat == "jpeg" || _sDeviceFormat == "jpg";
        if (isJpegDevice && _iJPEGQuality > 0 && _iJPEGQuality < 101)
        {
            args.Add("-dJPEGQ=" + _iJPEGQuality);
        }
        if (_iWidth > 0 && _iHeight > 0)
        {
            args.Add("-g" + _iWidth + "x" + _iHeight);
        }
        if (_bFitPage)
        {
            args.Add("-dPDFFitPage");
        }
        if (_iResolutionX > 0)
        {
            if (_iResolutionY > 0)
            {
                args.Add("-r" + _iResolutionX + "x" + _iResolutionY);
            }
            else
            {
                args.Add("-r" + _iResolutionX);
            }
        }

        // Output and input files go last.
        args.Add(string.Format("-sOutputFile={0}", outputFile));
        args.Add(string.Format("{0}", inputFile));

        return args.ToArray();
    }

    #endregion
}
}

OCR,识别Image代码(AsPrise辅助类)

 using System;
using System.Collections.Generic;
using System.Runtime.InteropServices;
using System.Text; namespace PDFCaptureService
{
/// <summary>
/// Thin P/Invoke wrapper around the native AspriseOCR.dll.
/// </summary>
public static class AspriseOCRHandler
{
    #region Native imports

    [DllImport("AspriseOCR.dll", EntryPoint = "OCR", CallingConvention = CallingConvention.Cdecl)]
    public static extern IntPtr OCR(string file, int type);

    [DllImport("AspriseOCR.dll", EntryPoint = "OCRpart", CallingConvention = CallingConvention.Cdecl)]
    static extern IntPtr OCRpart(string file, int type, int startX, int
    startY, int width, int height);

    [DllImport("AspriseOCR.dll", EntryPoint = "OCRBarCodes", CallingConvention = CallingConvention.Cdecl)]
    static extern IntPtr OCRBarCodes(string file, int type);

    [DllImport("AspriseOCR.dll", EntryPoint = "OCRpartBarCodes", CallingConvention = CallingConvention.Cdecl)]
    static extern IntPtr OCRpartBarCodes(string file, int type, int
    startX, int startY, int width, int height);

    #endregion

    /// <summary>
    /// Runs OCR over an image file and returns the recognised text.
    /// </summary>
    /// <param name="fileName">Path of the image file to recognise.</param>
    /// <returns>
    /// The text returned by the native OCR call, marshalled as an ANSI string;
    /// null if the native call returns a null pointer.
    /// </returns>
    public static string ReadImage(string fileName)
    {
        // NOTE(review): -1 restores the literal lost from the published snippet;
        // presumably it asks Asprise to auto-detect the image type - confirm
        // against the Asprise OCR documentation.
        IntPtr ptrFileContent = OCR(fileName, -1);
        string fileContent = Marshal.PtrToStringAnsi(ptrFileContent);
        return fileContent;
    }
}
}

调用示例

 // Render the PDF to a JPEG with Ghostscript, then OCR the rendered image.
GhostscriptHandler ghostscriptHandler = new GhostscriptHandler();
string tempJpgFileName = string.Format(GhostScriptImageName, Guid.NewGuid().ToString());
int pdfPageCount = ITextSharpHandler.GetPdfPageCount(fileName);
// Pages are 1-based. The last two arguments are used by Convert as the X/Y resolution.
// NOTE(review): the original resolution values were lost from the published snippet;
// 300 is an assumption - confirm the value that suits your OCR engine.
ghostscriptHandler.Convert(fileName, tempJpgFileName, 1, pdfPageCount, "jpeg", 300, 300);
// OCR the generated image, not the source PDF.
fileContent = AspriseOCRHandler.ReadImage(tempJpgFileName);

C#解析PDF的更多相关文章

  1. WPF解析PDF为图片

    偶遇需要解析PDF文件为单张图,此做, http://git.oschina.net/jiailiuyan/OfficeDecoder using System; using System.Colle ...

  2. Apache-Tika解析PDF文档

    通常在使用爬虫时,爬取到网上的文章都是各式各样的格式处理起来比较麻烦,这里我们使用Apache-Tika来处理PDF格式的文章,如下: package com.mengyao.tika.app; im ...

  3. Python解析PDF三法

    span{line-height:2em} --> 最近做调研想知道一些NZ当地的旅游信息,于是在NZ留学的友人自高奋勇地帮我去各个加油站拿了一堆旅游小册子,扫描了发给我. 但是他扫描出的高清图 ...

  4. Python使用PDFMiner解析PDF

    近期在做爬虫时有时会遇到网站只提供pdf的情况,这样就不能使用scrapy直接抓取页面内容了,只能通过解析PDF的方式处理,目前的解决方案大致只有pyPDF和PDFMiner.因为据说PDFMiner ...

  5. LIMS系统仪器数据采集-使用xpdf解析pdf内容

    不同语言解析PDF内容都有各自的库,比如Java的pdfbox,.net的itextsharp. c#解析PDF文本,关键代码可参考: http://www.cnblogs.com/mahongbia ...

  6. C#仪器数据文件解析-PDF文件

    不少仪器工作站输出的数据报告文件为PDF格式,PDF格式用于排版打印,但不易于数据解析,因此解析PDF数据需要首先读取到PDF文件中的文本内容,然后根据内容规则解析有意义的数据信息. C#解析PDF文 ...

  7. Java仪器数据文件解析-PDF文件

    一.概述 使用pdfbox可生成Pdf文件,同样可以解析PDF文本内容. pdfbox链接:https://pdfbox.apache.org/ 二.PDF文本内容解析 File file = new ...

  8. PHP通过PDFParser解析PDF文件

    之前一直找到的资料都是教你怎么生成pdf文档,比如:TCPDF.FPDF.wkhtmltopdf.而我碰到的项目里需要验证从远程获取的pdf文件是否受损.文件内容是否一致这些问题,这些都不能直接提供给 ...

  9. 代码片段,使用TIKA来解析PDF,WORD和EMAIL

    /** * com.jiaoyiping.pdstest.TestTika.java * Copyright (c) 2009 Hewlett-Packard Development Company, ...

随机推荐

  1. PHP中count()和sizeof()

    php中获取数组长度函数有两个count()和sizeof(),如果传递给这个函数的数组是一个空数组,或者是一个没有经过设定的变量,返回的数组元素个数就是0.两函数功能一样,手册上讲,sizeof() ...

  2. CVE-2018-15982漏洞复现

    作者:欧根 漏洞信息:CVE-2018-15982 Adobe已发布适用于Windows,macOS,Linux和Chrome OS的Adobe Flash Player安全更新.这些更新解决一个  ...

  3. 如何到python模块路径linux

    执行命令whereis python即可显示出python相关的所有的路径,包括可执行文件路径,安装路径等,该方法适用于大部分类似的场景抄自百度知道

  4. windows环境下ElasticSearch5以上版本安装head插件

    我的ElasticSearch版本是5以上的,网上搜了好多安装方式,都不对. 还好找到一个成功的,转载过来做记录. 原文地址:ElasticSearch-5.0安装head插件 步骤 下载node.j ...

  5. 注解中用于@target的方法annotation/--ElementType.METHOD,ElementType.TYPE对应方法,类接

    @Target: @Target说明了Annotation所修饰的对象范围:Annotation可被用于 packages.types(类.接口.枚举.Annotation类型).类型成员(方法.构造 ...

  6. Laravel 的核心概念

    工欲善其事,必先利其器.在开发Xblog的过程中,稍微领悟了一点Laravel的思想.确实如此,这篇文章读完你可能并不能从无到有写出一个博客,但知道Laravel的核心概念之后,当你再次写起Larav ...

  7. Ionic2 自学须知的基本知识点

    http://www.cnblogs.com/zsl123/p/5991336.html Ionic(ionicframework)一款接近原生的HTML5移动App开发框架. IONIC 是目前最有 ...

  8. iOS --UIScrollView的学习(三)自动轮播

    1.前面两章讲的都是基本的用法,这次讲一下比较重要的功能分页和自动播放 2.UIPageControl--分页 2.1只要将UIScrollView的pageEnabled属性设置为YES,UIScr ...

  9. Python笔记之字典循环

    Python笔记之字典循环   1.问题 Python是一门比较好入门的编程语言,但是入门简单,当然坑也是有的,今天就来介绍一个我遇到的坑吧,也是很简单的一个,就是当时脑子有点转不过弯来了. 先看代码 ...

  10. Exponentiation POJ-1001

    http://poj.org/problem?id=1001 //10000000 100000 #include<iostream> #include<cstring> us ...