程序调用:

using iTextSharp.text.pdf;
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks; namespace TestIText
{
/// <summary>
/// Console demo: runs <see cref="PdfHelper.LocationTextExtractionStrategyEx"/> over one page
/// of a PDF file and prints the extracted text.
/// </summary>
class Program
{
    // Page handed to the content parser. The literal was missing from the original listing;
    // 1 selects the first page (iText page numbers start at 1).
    private const int PageNumber = 1;

    /// <summary>
    /// Opens the sample PDF, extracts the text of <see cref="PageNumber"/>, writes it to the
    /// console and waits for Enter before exiting.
    /// </summary>
    /// <param name="args">Command-line arguments (unused).</param>
    static void Main(string[] args)
    {
        PdfReader readerTemp = new PdfReader(@"D:\_Number position.pdf");
        try
        {
            PdfHelper.LocationTextExtractionStrategyEx pz = new PdfHelper.LocationTextExtractionStrategyEx();
            iTextSharp.text.pdf.parser.PdfReaderContentParser p = new iTextSharp.text.pdf.parser.PdfReaderContentParser(readerTemp);
            p.ProcessContent<PdfHelper.LocationTextExtractionStrategyEx>(PageNumber, pz);

            // Prints the extracted text. After this call the per-line coordinates
            // are available through pz.TextLocationInfo.
            Console.WriteLine(pz.GetResultantText());
        }
        finally
        {
            // Release the file handle even when parsing throws.
            readerTemp.Close();
        }

        Console.ReadLine();
    }
}
}

PdfHelper帮助类:

using System;
using System.Collections.Generic;
using System.Linq;
using System.Text; using iTextSharp.text.pdf.parser; namespace PdfHelper
{
/// <summary>
/// Taken from http://www.java-frameworks.com/java/itext/com/itextpdf/text/pdf/parser/LocationTextExtractionStrategy.java.html
/// </summary>
class LocationTextExtractionStrategyEx : LocationTextExtractionStrategy
{
    // Every text chunk received through RenderText, in arrival order until GetResultantText sorts it.
    private List<TextChunk> m_locationResult = new List<TextChunk>();

    // One entry per detected line of text; rebuilt by each GetResultantText call.
    private List<TextInfo> m_TextLocationInfo = new List<TextInfo>();

    /// <summary>
    /// The raw text chunks collected so far.
    /// </summary>
    public List<TextChunk> LocationResult
    {
        get { return m_locationResult; }
    }

    /// <summary>
    /// Per-line text and corner coordinates. Populated by <see cref="GetResultantText"/>.
    /// </summary>
    public List<TextInfo> TextLocationInfo
    {
        get { return m_TextLocationInfo; }
    }

    /// <summary>
    /// Creates a new LocationTextExtractionStrategyEx
    /// </summary>
    public LocationTextExtractionStrategyEx()
    {
    }

    /// <summary>
    /// Returns the result so far. As a side effect the collected chunks are sorted and
    /// <see cref="TextLocationInfo"/> is rebuilt with one entry per line.
    /// </summary>
    /// <returns>a String with the resulting text, lines separated by '\n'</returns>
    public override String GetResultantText()
    {
        m_locationResult.Sort();

        // Start from an empty list so that calling this method more than once
        // does not append the same lines again.
        m_TextLocationInfo.Clear();

        StringBuilder sb = new StringBuilder();
        TextChunk lastChunk = null;
        TextInfo lastTextInfo = null;
        foreach (TextChunk chunk in m_locationResult)
        {
            if (lastChunk == null || !chunk.sameLine(lastChunk))
            {
                // First chunk overall, or first chunk of a new line.
                if (lastChunk != null)
                {
                    sb.Append('\n');
                }
                sb.Append(chunk.Text);
                lastTextInfo = new TextInfo(chunk);
                m_TextLocationInfo.Add(lastTextInfo);
            }
            else
            {
                float dist = chunk.distanceFromEndOf(lastChunk);
                if (dist < -chunk.CharSpaceWidth)
                {
                    sb.Append(' ');
                    lastTextInfo.addSpace();
                }
                // append a space if the trailing char of the prev string wasn't a space
                // && the 1st char of the current string isn't a space
                else if (dist > chunk.CharSpaceWidth / 2.0f
                    && !StartsWithSpace(chunk.Text)
                    && !EndsWithSpace(lastChunk.Text))
                {
                    sb.Append(' ');
                    lastTextInfo.addSpace();
                }
                sb.Append(chunk.Text);
                lastTextInfo.appendText(chunk);
            }
            lastChunk = chunk;
        }
        return sb.ToString();
    }

    // True when text is non-empty and begins with a space; empty/null text never throws.
    private static bool StartsWithSpace(string text)
    {
        return !string.IsNullOrEmpty(text) && text[0] == ' ';
    }

    // True when text is non-empty and ends with a space; empty/null text never throws.
    private static bool EndsWithSpace(string text)
    {
        return !string.IsNullOrEmpty(text) && text[text.Length - 1] == ' ';
    }

    /// <summary>
    /// Records one rendered piece of text together with its baseline, single-space width,
    /// ascent line and descent line.
    /// </summary>
    /// <param name="renderInfo">render information supplied by the content parser</param>
    public override void RenderText(TextRenderInfo renderInfo)
    {
        LineSegment segment = renderInfo.GetBaseline();
        TextChunk location = new TextChunk(renderInfo.GetText(), segment.GetStartPoint(), segment.GetEndPoint(), renderInfo.GetSingleSpaceWidth(), renderInfo.GetAscentLine(), renderInfo.GetDescentLine());
        m_locationResult.Add(location);
    }

    /// <summary>
    /// Represents a chunk of text, it's orientation, and location relative to the orientation vector
    /// </summary>
    public class TextChunk : IComparable, ICloneable
    {
        string m_text;
        Vector m_startLocation;
        Vector m_endLocation;
        // Unit vector pointing from the start to the end of the baseline.
        Vector m_orientationVector;
        // Baseline angle in radians, scaled by 1000 and truncated to an int.
        int m_orientationMagnitude;
        // Truncated perpendicular distance used to decide whether two chunks share a line.
        int m_distPerpendicular;
        // Positions of the baseline start/end projected onto the orientation vector.
        float m_distParallelStart;
        float m_distParallelEnd;
        float m_charSpaceWidth;

        public LineSegment AscentLine;
        public LineSegment DecentLine;

        /// <summary>
        /// Creates a new chunk from the same text, locations, space width and ascent/descent lines.
        /// </summary>
        /// <returns>the new TextChunk</returns>
        public object Clone()
        {
            TextChunk copy = new TextChunk(m_text, m_startLocation, m_endLocation, m_charSpaceWidth, AscentLine, DecentLine);
            return copy;
        }

        public string Text
        {
            get { return m_text; }
            set { m_text = value; }
        }

        public float CharSpaceWidth
        {
            get { return m_charSpaceWidth; }
            set { m_charSpaceWidth = value; }
        }

        public Vector StartLocation
        {
            get { return m_startLocation; }
            set { m_startLocation = value; }
        }

        public Vector EndLocation
        {
            get { return m_endLocation; }
            set { m_endLocation = value; }
        }

        /// <summary>
        /// Creates a chunk and pre-computes its orientation and its distances relative to
        /// the orientation vector.
        /// </summary>
        /// <param name="txt">the text of the chunk</param>
        /// <param name="startLoc">start point of the baseline</param>
        /// <param name="endLoc">end point of the baseline</param>
        /// <param name="charSpaceWidth">width of a single space character</param>
        /// <param name="ascentLine">ascent line of the text</param>
        /// <param name="decentLine">descent line of the text</param>
        public TextChunk(string txt, Vector startLoc, Vector endLoc, float charSpaceWidth, LineSegment ascentLine, LineSegment decentLine)
        {
            m_text = txt;
            m_startLocation = startLoc;
            m_endLocation = endLoc;
            m_charSpaceWidth = charSpaceWidth;
            AscentLine = ascentLine;
            DecentLine = decentLine;

            Vector direction = m_endLocation.Subtract(m_startLocation);
            if (direction[Vector.I1] == 0f && direction[Vector.I2] == 0f && direction[Vector.I3] == 0f)
            {
                // A zero-length baseline cannot be normalized (it would yield NaN components);
                // fall back to a unit vector along the x axis.
                direction = new Vector(1, 0, 0);
            }
            m_orientationVector = direction.Normalize();
            m_orientationMagnitude = (int)(Math.Atan2(m_orientationVector[Vector.I2], m_orientationVector[Vector.I1]) * 1000);

            // see http://mathworld.wolfram.com/Point-LineDistance2-Dimensional.html
            // the two vectors we are crossing are in the same plane, so the result will be purely
            // in the z-axis (out of plane) direction, so we just take the I3 component of the result
            Vector origin = new Vector(0, 0, 1);
            m_distPerpendicular = (int)(m_startLocation.Subtract(origin)).Cross(m_orientationVector)[Vector.I3];

            m_distParallelStart = m_orientationVector.Dot(m_startLocation);
            m_distParallelEnd = m_orientationVector.Dot(m_endLocation);
        }

        /// <summary>
        /// true if this location is on the the same line as the other text chunk
        /// </summary>
        /// <param name="textChunkToCompare">the location to compare to</param>
        /// <returns>true if both chunks have the same orientation and perpendicular distance</returns>
        public bool sameLine(TextChunk textChunkToCompare)
        {
            return m_orientationMagnitude == textChunkToCompare.m_orientationMagnitude
                && m_distPerpendicular == textChunkToCompare.m_distPerpendicular;
        }

        /// <summary>
        /// Computes the distance between the end of 'other' and the beginning of this chunk
        /// in the direction of this chunk's orientation vector. Note that it's a bad idea
        /// to call this for chunks that aren't on the same line and orientation, but we don't
        /// explicitly check for that condition for performance reasons.
        /// </summary>
        /// <param name="other">the chunk that precedes this one</param>
        /// <returns>the signed distance from the end of 'other' to the beginning of this chunk;
        /// negative when this chunk starts before 'other' ends</returns>
        public float distanceFromEndOf(TextChunk other)
        {
            return m_distParallelStart - other.m_distParallelEnd;
        }

        /// <summary>
        /// Compares based on orientation, perpendicular distance, then parallel distance
        /// </summary>
        /// <param name="obj">the TextChunk to compare with</param>
        /// <returns>negative, zero or positive when this chunk sorts before, together with,
        /// or after <paramref name="obj"/></returns>
        /// <exception cref="ArgumentException">obj is null or not a TextChunk</exception>
        public int CompareTo(object obj)
        {
            TextChunk rhs = obj as TextChunk;
            if (rhs == null)
            {
                throw new ArgumentException("Object is not a TextChunk");
            }
            if (ReferenceEquals(this, rhs))
            {
                return 0;
            }

            // CompareTo instead of subtraction: same sign, but no risk of int overflow.
            int rslt = m_orientationMagnitude.CompareTo(rhs.m_orientationMagnitude);
            if (rslt != 0)
            {
                return rslt;
            }

            rslt = m_distPerpendicular.CompareTo(rhs.m_distPerpendicular);
            if (rslt != 0)
            {
                return rslt;
            }

            // Returning 0 for equal start positions keeps the ordering consistent
            // (a.CompareTo(b) and b.CompareTo(a) agree), which List.Sort relies on.
            return m_distParallelStart.CompareTo(rhs.m_distParallelStart);
        }
    }

    /// <summary>
    /// Text of one line together with two corner points taken from its chunks.
    /// </summary>
    public class TextInfo
    {
        // Start point of the ascent line of the first chunk of the line.
        public Vector TopLeft;
        // End point of the descent line of the most recently appended chunk.
        public Vector BottomRight;
        private string m_Text;

        public string Text
        {
            get { return m_Text; }
        }

        /// <summary>
        /// Create a TextInfo.
        /// </summary>
        /// <param name="initialTextChunk">the first chunk of the line</param>
        public TextInfo(TextChunk initialTextChunk)
        {
            TopLeft = initialTextChunk.AscentLine.GetStartPoint();
            BottomRight = initialTextChunk.DecentLine.GetEndPoint();
            m_Text = initialTextChunk.Text;
        }

        /// <summary>
        /// Add more text to this TextInfo and move the bottom-right corner to the end of the new chunk.
        /// </summary>
        /// <param name="additionalTextChunk">the chunk that continues the line</param>
        public void appendText(TextChunk additionalTextChunk)
        {
            BottomRight = additionalTextChunk.DecentLine.GetEndPoint();
            m_Text += additionalTextChunk.Text;
        }

        /// <summary>
        /// Add a space to the TextInfo. This will leave the endpoint out of sync with the text.
        /// The assumption is that you will add more text after the space which will correct the endpoint.
        /// </summary>
        public void addSpace()
        {
            m_Text += ' ';
        }
    }
}
}

C# 使用itextsharp 读取pdf中文字坐标的更多相关文章

  1. Java 读取PDF中的表格

    一.概述 本文以Java示例展示读取PDF中的表格的方法.这里导入Spire.PDF for Java中的jar包,并使用其提供的相关类及方法来实现获取表格中的文本内容.下表中整理了本次代码使用到的主 ...

  2. Java 读取PDF中的文本和图片

    本文将介绍通过Java程序来读取PDF文档中的文本和图片的方法.分别调用方法extractText()和extractImages()来读取.   使用工具:Free Spire.PDF for Ja ...

  3. 读取pdf中的内容

    import com.spire.pdf.PdfDocument;import com.spire.pdf.PdfPageBase;import java.io.*; public class Ext ...

  4. 利用百度AI OCR图片识别,Java实现PDF中的图片转换成文字

    序言:我们在读一些PDF版书籍的时候,如果PDF中不是图片,做起读书笔记的还好:如果PDF中的是图片的话,根本无法编辑,做起笔记来,还是很痛苦的.我是遇到过了.我们搞技术的,当然得自己学着解决现在的痛 ...

  5. SQL 横转竖 、竖转横 (转载) 使用Dapper.Contrib 开发.net core程序,兼容多种数据库 C# 读取PDF多级书签 Json.net日期格式化设置 ASPNET 下载共享文件 ASPNET 文件批量下载 递归,循环,尾递归 利用IDisposable接口构建包含非托管资源对象 《.NET 进阶指南》读书笔记2------定义不可改变类型

    SQL 横转竖 、竖转横 (转载)   普通行列转换 问题:假设有张学生成绩表(tb)如下: 姓名 课程 分数 张三 语文 74 张三 数学 83 张三 物理 93 李四 语文 74 李四 数学 84 ...

  6. java读取pdf文本转换html

    补充:以下代码基于maven,现将依赖的jar包单独导出 地址:pdf jar 完整代码地址 也就两个文件 java读取pdf中的纯文字,这里使用的是pdfbox工具包 maven引入如下配置 < ...

  7. C# 读取PDF多级书签

    在PDF中,书签作为一种导航的有效工具,能帮助我们快速地定位到文档中的指定段落.同时,书签也能让人对文档结构一目了然,在某种程度上也可作为目录使用.对于C#操作PDF中的书签,在上一篇文章中介绍了具体 ...

  8. Java 添加、替换、删除PDF中的图片

    概述 本文介绍通过java程序向PDF文档添加图片,以及替换和删除PDF中已有的图片.另外,关于图片的操作还可参考设置PDF 图片背景.设置PDF图片水印.读取PDF中的图片.将PDF保存为图片等文章 ...

  9. 在.NET中使用iTextSharp创建/读取PDF报告: Part I [翻译]

    原文地址:Create/Read Advance PDF Report using iTextSharp in C# .NET: Part I    By Debopam Pal, 27 Nov 20 ...

随机推荐

  1. 201521123018 《Java程序设计》第6周学习总结

    1. 本章学习总结 2. 书面作业 一.clone方法 1.1 Object对象中的clone方法是被protected修饰,在自定义的类中覆盖clone方法时需要注意什么? 用protected修饰 ...

  2. 201521123061 《Java程序设计》第一周学习总结

    1.本周学习总结 (1) Java的来历与版本演进 最早是Sun公司绿色项目Green Project 中所撰写的Star7应用程序的程序语言: (2)Java根据应用领域分为三大平台:Java S ...

  3. 201521123064 《Java程序设计》第10周学习总结

    1. 本章学习总结 1.1 以你喜欢的方式(思维导图或其他)归纳总结异常与多线程相关内容. ① 定义Thread类的子类,覆盖Thread类的run()方法,然后创建该子类的实例(一般不用该方法,开销 ...

  4. 201521123065《java程序设计》第14周学习总结

    1. 本周学习总结 1.大部分情况下使用的数据库是关系型的数据库,使用表存储数据: 2.关系型数据库可以通过唯一的主键查找记录,也可以通过多个信息确定主键: 3.Mysql操作:显示-show dat ...

  5. 201521123070 《JAVA程序设计》第12周学习总结

    1. 本章学习总结 1.1 以你喜欢的方式(思维导图或其他)归纳总结多流与文件相关内容. 2. 书面作业 将Student对象(属性:int id, String name,int age,doubl ...

  6. Hyperledger Fabric 1.0 从零开始(二)——环境构建(公网)

    1:环境构建 在本文中用到的宿主机环境是Centos ,版本为Centos.x86_647.2,通过Docker 容器来运行Fabric的节点,版本为v1.0.因此,启动Fabric网络中的节点需要先 ...

  7. yum仓库管理

    yum在线管理 rpm包的管理分为 rpm命令管理和yum在线管理,rpm命令管理由于可能需要解决各种依赖问题,在安装软件的时候可能显得比较麻烦,然而,yum在线管理正好和它相反.Yum(全称为 Ye ...

  8. mybatis-resultMap使用与详解

    1,当数据库的字段名与属性名称不一致时,在mybatis中如何处理? 第一种方式: 采用投影对字段重命名<select id="load" parameterType=&qu ...

  9. 在Myeclipse中用Java语言操作mysql数据库

    package OperateMysql; import java.sql.*; public class MysqlTest { public static void main(String[] a ...

  10. 我的Spring学习记录(一)

    spring是一个框架,一个我理解为对象的大熔炉,它生产着各种bean,还可以对生产的对象进行加工. 这里有些概念需要理解一下,就是IOC和DI以及AOP,接下来,我们进入主题. spring简介 上 ...