1、引入maven依赖

  <!-- Apache PDFBox: supplies the org.apache.pdfbox classes (PDDocument, PDFTextStripper) imported by PdfParser below. -->
  1. <dependency>
  2. <groupId>org.apache.pdfbox</groupId>
  3. <artifactId>pdfbox</artifactId>
  4. <version>2.0.4</version>
  5. </dependency>

2、相关工具类:PdfParser.java

  1. package com.insurance.tool;
  2.  
  3. import java.io.File;
  4. import java.io.IOException;
  5. import java.io.InputStream;
  6. import java.util.ArrayList;
  7. import java.util.List;
  8. import java.util.regex.Matcher;
  9. import java.util.regex.Pattern;
  10.  
  11. import org.apache.pdfbox.pdmodel.PDDocument;
  12. import org.apache.pdfbox.pdmodel.encryption.InvalidPasswordException;
  13. import org.apache.pdfbox.text.PDFTextStripper;
  14.  
  15. import com.insurance.pojo.Insurance;
  16. import com.insurance.pojo.InsuranceOrder;
  17. import com.insurance.pojo.InsuranceProgram;
  18.  
  19. public class PdfParser {
  20.  
  21. public static void main(String[] args) {
  22. readPDF("C:\\Users\\yinz\\Desktop\\场景1\\场景1_样例_电子保单识别.pdf");
  23. }
  24.  
  25. public static List<InsuranceOrder> readPDF(InputStream stream) throws Exception{
  26. List<InsuranceOrder> orderList = new ArrayList<InsuranceOrder>();
  27. PDDocument document = null;
  28. document=PDDocument.load(stream);
  29.  
  30. // 获取页码
  31. int pages = document.getNumberOfPages();
  32.  
  33. // 读文本内容
  34. PDFTextStripper stripper=new PDFTextStripper();
  35. // 设置按顺序输出
  36. stripper.setSortByPosition(true);
  37. /*stripper.setStartPage(1);
  38. stripper.setEndPage(pages);
  39. String content = stripper.getText(document);
  40. System.out.println(content);*/
  41.  
  42. for(int page = 1; page <= pages; page++) {
  43. stripper.setStartPage(page);
  44. stripper.setEndPage(page);
  45. String content = stripper.getText(document);
  46. //System.out.println(content);
  47. parseContent(content, orderList);
  48. }
  49.  
  50. System.out.println(orderList);
  51. return orderList;
  52. }
  53.  
  54. public static void readPDF(String filePath) {
  55. List<InsuranceOrder> orderList = new ArrayList<InsuranceOrder>();
  56. File pdfFile = new File(filePath);
  57. PDDocument document = null;
  58. try
  59. {
  60. document=PDDocument.load(pdfFile);
  61.  
  62. // 获取页码
  63. int pages = document.getNumberOfPages();
  64.  
  65. // 读文本内容
  66. PDFTextStripper stripper=new PDFTextStripper();
  67. // 设置按顺序输出
  68. stripper.setSortByPosition(true);
  69. /*stripper.setStartPage(1);
  70. stripper.setEndPage(pages);
  71. String content = stripper.getText(document);
  72. System.out.println(content);*/
  73.  
  74. for(int page = 1; page <= pages; page++) {
  75. stripper.setStartPage(page);
  76. stripper.setEndPage(page);
  77. String content = stripper.getText(document);
  78. //System.out.println(content);
  79. parseContent(content, orderList);
  80. }
  81. System.out.println(orderList);
  82. }
  83. catch(Exception e)
  84. {
  85. System.out.println(e);
  86. }
  87.  
  88. }
  89.  
  90. private static Pattern insurancePoliceNoP = Pattern.compile("保险单号\\s(.*?)\\s");
  91. private static Pattern insuranceApplicationNoP = Pattern.compile("投保单号\\s(.*?)\\s");
  92. private static Pattern policeHolderP = Pattern.compile("投 保 人.*\r\n");
  93. private static Pattern insuredP = Pattern.compile("被保险人.*\r\n");
  94. private static Pattern insuredAgeP = Pattern.compile("被保险人投保年龄\\s(.*?)(\r\n|\\s)");
  95. private static Pattern beneficiaryP = Pattern.compile("身故受益人及分配方式\\s(.*?)(\r\n|\\s)");
  96. private static Pattern insuranceNameP = Pattern.compile("险种名称及款式\\s(.*?)(\r\n|\\s)");
  97. private static Pattern validPeriodP = Pattern.compile("保险期间\\s(.*?)\\s合同生效日", Pattern.DOTALL);
  98. private static Pattern effectiveDateP = Pattern.compile("合同生效日\\s(.*?)(\r\n|\\s)");
  99. private static Pattern chargeWayP = Pattern.compile("交费方式\\s(.*?)\\s");
  100. private static Pattern feeP = Pattern.compile("保 险 费\\s(.*?)(\r\n|\\s)");
  101. private static Pattern policeHolderCount = Pattern.compile("投保份数\\s(.*?)(\r\n|\\s)");
  102. private static Pattern programListP = Pattern.compile("保险金额(.*?)保险责任与责任免除详见条款", Pattern.DOTALL);
  103. /*private static Pattern validPeriodP = Pattern.compile("保险期间\\s(.*?)\\s");
  104. private static Pattern effectiveDateP = Pattern.compile("合同生效日\\s(.*?)\\s");*/
  105. private static void parseContent(String content, List<InsuranceOrder> list) {
  106. if(content == null || content.trim().length() == 0) {
  107. return;
  108. }
  109. if(content.startsWith("个 人 人 身 保 险 保 险 单")) {
  110. //个人信息
  111. InsuranceOrder order = new InsuranceOrder();
  112. String insurancePoliceNo = retriveText(content, insurancePoliceNoP, 1);
  113. if(insurancePoliceNo == null || insurancePoliceNo.length() <= 0) {
  114. return;
  115. }
  116. list.add(order);
  117. order.setInsurancePoliceNo(insurancePoliceNo);
  118. order.setInsuranceApplicationNo(retriveText(content, insuranceApplicationNoP, 1));
  119.  
  120. String policeHolderInfo = retriveTextWithInnnerBlank(content, policeHolderP, 0);
  121. if(policeHolderInfo != null) {
  122. Pattern policeHolderNameP = Pattern.compile("投 保 人(.*?)性别");
  123. Pattern policeHolderGenderP = Pattern.compile("性别(.*?)出生日期");
  124. Pattern policeHolderBirthdayP = Pattern.compile("出生日期(.*?)证件号码");
  125. Pattern policeHolderIDP = Pattern.compile("证件号码(.*)$");
  126.  
  127. order.setPoliceHolderName(retriveText(policeHolderInfo, policeHolderNameP, 1));
  128. order.setPoliceHolderGender(retriveText(policeHolderInfo, policeHolderGenderP, 1));
  129. order.setPoliceHolderBirthday(retriveText(policeHolderInfo, policeHolderBirthdayP, 1));
  130. order.setPoliceHolderID(retriveText(policeHolderInfo, policeHolderIDP, 1));
  131. }
  132. String insuredInfo = retriveTextWithInnnerBlank(content, insuredP, 0);
  133. if(insuredInfo != null) {
  134. Pattern insuredNameP = Pattern.compile("被保险人(.*?)性别");
  135. Pattern insuredGenderP = Pattern.compile("性别(.*?)出生日期");
  136. Pattern insuredBirthdayP = Pattern.compile("出生日期(.*?)证件号码");
  137. Pattern insuredIDP = Pattern.compile("证件号码(.*)$");
  138.  
  139. order.setInsuredName(retriveText(insuredInfo, insuredNameP, 1));
  140. order.setInsuredGender(retriveText(insuredInfo, insuredGenderP, 1));
  141. order.setInsuredBirthday(retriveText(insuredInfo, insuredBirthdayP, 1));
  142. order.setInsuredID(retriveText(insuredInfo, insuredIDP, 1));
  143. }
  144. order.setInsuredAge(retriveText(content, insuredAgeP, 1));
  145. order.setBeneficiary(retriveText(content, beneficiaryP, 1));
  146.  
  147. //保险信息
  148. Insurance insurance = new Insurance();
  149. order.setInsurance(insurance);
  150. insurance.setName(retriveText(content, insuranceNameP, 1));
  151. insurance.setValidPeriod(retriveText(content, validPeriodP, 1).replaceAll("\r\n", ""));
  152. insurance.setEffectiveDate(retriveText(content, effectiveDateP, 1));
  153. insurance.setChargeWay(retriveText(content, chargeWayP, 1));
  154. insurance.setFee(retriveText(content, feeP, 1));
  155. insurance.setPoliceHolderCount(retriveText(content, policeHolderCount, 1));
  156.  
  157. //保险项目信息
  158. String programList = retriveTextWithInnnerBlank(content, programListP, 1);
  159. if(programList != null) {
  160. String[] pArr = programList.split("\r\n");
  161. for(String str : pArr) {
  162. if(str != null && str.trim().length() > 0) {
  163. String[] subArr = str.split(" ");
  164. InsuranceProgram program = new InsuranceProgram();
  165. order.getProgramList().add(program);
  166. program.setName(subArr[0]);
  167. program.setFee(subArr[1]);
  168. }
  169. }
  170. }
  171. }
  172. }
  173.  
  174. private static String retriveText(String content, Pattern p, int position) {
  175. Matcher m = p.matcher(content);
  176. if(m.find()) {
  177. return m.group(position).trim().replace(" ", "");
  178. }
  179. return "";
  180. }
  181.  
  182. private static String retriveTextWithInnnerBlank(String content, Pattern p, int position) {
  183. Matcher m = p.matcher(content);
  184. if(m.find()) {
  185. return m.group(position).trim();
  186. }
  187. return "";
  188. }
  189. }

相关实体类:InsuranceOrder.java

  1. package com.insurance.pojo;
  2.  
  3. import java.util.ArrayList;
  4. import java.util.List;
  5.  
  6. public class InsuranceOrder {
  7.  
  8. private String insurancePoliceNo; //保险单号
  9. private String insuranceApplicationNo; //投保单号
  10. private String policeHolderName; // 投保人
  11. private String policeHolderBirthday; //投保人出生日期
  12. private String policeHolderGender; //投保人性别
  13. private String policeHolderID; // 投保人证件号码
  14. private String insuredName; //被保险人
  15. private String insuredGender; //被保险人性别
  16. private String insuredBirthday; //被保险人出生日期
  17. private String insuredID; //被保险人证件号
  18. private String insuredAge; //被保险人投保年龄
  19. private String beneficiary; //身故受益人及分配方式
  20.  
  21. private Insurance insurance; //险种
  22. private List<InsuranceProgram> programList = new ArrayList<InsuranceProgram>(); //保险项目
  23.  
  24. public String getPoliceHolderBirthday() {
  25. return policeHolderBirthday;
  26. }
  27. public void setPoliceHolderBirthday(String policeHolderBirthday) {
  28. this.policeHolderBirthday = policeHolderBirthday;
  29. }
  30. public String getInsuredBirthday() {
  31. return insuredBirthday;
  32. }
  33. public void setInsuredBirthday(String insuredBirthday) {
  34. this.insuredBirthday = insuredBirthday;
  35. }
  36. public String getInsurancePoliceNo() {
  37. return insurancePoliceNo;
  38. }
  39. public void setInsurancePoliceNo(String insurancePoliceNo) {
  40. this.insurancePoliceNo = insurancePoliceNo;
  41. }
  42. public String getInsuranceApplicationNo() {
  43. return insuranceApplicationNo;
  44. }
  45. public void setInsuranceApplicationNo(String insuranceApplicationNo) {
  46. this.insuranceApplicationNo = insuranceApplicationNo;
  47. }
  48. public String getPoliceHolderName() {
  49. return policeHolderName;
  50. }
  51. public void setPoliceHolderName(String policeHolderName) {
  52. this.policeHolderName = policeHolderName;
  53. }
  54. public String getPoliceHolderGender() {
  55. return policeHolderGender;
  56. }
  57. public void setPoliceHolderGender(String policeHolderGender) {
  58. this.policeHolderGender = policeHolderGender;
  59. }
  60. public String getPoliceHolderID() {
  61. return policeHolderID;
  62. }
  63. public void setPoliceHolderID(String policeHolderID) {
  64. this.policeHolderID = policeHolderID;
  65. }
  66. public String getInsuredName() {
  67. return insuredName;
  68. }
  69. public void setInsuredName(String insuredName) {
  70. this.insuredName = insuredName;
  71. }
  72. public String getInsuredGender() {
  73. return insuredGender;
  74. }
  75. public void setInsuredGender(String insuredGender) {
  76. this.insuredGender = insuredGender;
  77. }
  78. public String getInsuredID() {
  79. return insuredID;
  80. }
  81. public void setInsuredID(String insuredID) {
  82. this.insuredID = insuredID;
  83. }
  84. public String getInsuredAge() {
  85. return insuredAge;
  86. }
  87. public void setInsuredAge(String insuredAge) {
  88. this.insuredAge = insuredAge;
  89. }
  90. public String getBeneficiary() {
  91. return beneficiary;
  92. }
  93. public void setBeneficiary(String beneficiary) {
  94. this.beneficiary = beneficiary;
  95. }
  96. public Insurance getInsurance() {
  97. return insurance;
  98. }
  99. public void setInsurance(Insurance insurance) {
  100. this.insurance = insurance;
  101. }
  102. public List<InsuranceProgram> getProgramList() {
  103. return programList;
  104. }
  105. public void setProgramList(List<InsuranceProgram> programList) {
  106. this.programList = programList;
  107. }
  108. @Override
  109. public String toString() {
  110. return "InsuranceOrder [insurancePoliceNo=" + insurancePoliceNo
  111. + ", insuranceApplicationNo=" + insuranceApplicationNo
  112. + ", policeHolderName=" + policeHolderName
  113. + ", policeHolderBirthday=" + policeHolderBirthday
  114. + ", policeHolderGender=" + policeHolderGender
  115. + ", policeHolderID=" + policeHolderID + ", insuredName="
  116. + insuredName + ", insuredGender=" + insuredGender
  117. + ", insuredBirthday=" + insuredBirthday + ", insuredID="
  118. + insuredID + ", insuredAge=" + insuredAge + ", beneficiary="
  119. + beneficiary + ", insurance=" + insurance + ", programList="
  120. + programList + "]";
  121. }
  122.  
  123. }

InsuranceProgram.java

  1. package com.insurance.pojo;
  2.  
  /**
   * An insurance item: an item name together with its amount.
   *
   * @author yinz
   */
  public class InsuranceProgram {

      /** Item name. */
      private String name;

      /** Amount, kept as text. */
      private String fee;

      /** Returns the item name, or {@code null} if none has been set. */
      public String getName() {
          return this.name;
      }

      /** Sets the item name. */
      public void setName(String name) {
          this.name = name;
      }

      /** Returns the amount, or {@code null} if none has been set. */
      public String getFee() {
          return this.fee;
      }

      /** Sets the amount. */
      public void setFee(String fee) {
          this.fee = fee;
      }

      /** Renders the item as {@code InsuranceProgram [name=..., fee=...]}. */
      @Override
      public String toString() {
          StringBuilder text = new StringBuilder("InsuranceProgram [");
          text.append("name=").append(name);
          text.append(", fee=").append(fee);
          return text.append("]").toString();
      }

  }

此处用于读取的pdf文件:http://files.cnblogs.com/files/yinz/场景1_样例_电子保单识别.rar

pdfBox 读取pdf文件的更多相关文章

  1. java 用PDFBox 删除 PDF文件中的某一页

    依赖: <dependency> <groupId>org.apache.pdfbox</groupId> <artifactId>pdfbox-app ...

  2. 深入学习python解析并读取PDF文件内容的方法

    这篇文章主要学习了python解析并读取PDF文件内容的方法,包括对学习库的应用,python2.7和python3.6中python解析PDF文件内容库的更新,包括对pdfminer库的详细解释和应 ...

  3. Java 使用PDFBox提取PDF文件中的图片

    今天做PDF文件解析,遇到一个需求:提取文件中的图片并保存.使用的是流行的apache开源jar包pdfbox, 但还是遇到坑了,比如pdfbox版本太高或太低都不能用!!这个包竟然没有很好地做好兼容 ...

  4. pdf.js如何跨域读取pdf文件?

    今天,上线一个客户网站之后(使用的是广州新一代虚拟空间)发现在读取上传的pdf文件的时候读取错误,通过直接在浏览器输入文件地址的时候发现文件地址被重定向了(呵呵!),结果就是pdf文件源由本地直接变成 ...

  5. python3用pdfminer3k在线读取pdf文件

    import importlib import sys import random from urllib.request import urlopen from urllib.request imp ...

  6. java 库 pdfbox 将 pdf 文件转换成高清图片方法

    近期需要将 pdf 文件转成高清图片,使用库是 pdfbox.fontbox.可以使用 renderImageWithDPI 方法指定转换的清晰度,当然清晰度越高,转换需要的时间越长,转换出来的图片越 ...

  7. 读取pdf文件 .选择了itextsharp 库

    此库还是比较成熟.看博客园很多文章都介绍了此库 用法 如果项目用到读取pdf.  我这只是提供个思路.或者提供个方法.用itextsharp 能方便实现 StringBuilder text = ne ...

  8. 记一次为解决Python读取PDF文件的Shell操作

    目录 一.背景 二.问题 三.解决 四.一顿分析及 Shell 操作 五.后续 一.背景 本想将 PDF 文件转换为 Word 文档,然后网上搜索了一下发现有挺多转换的软件.有的是免费的.收费,咱也不 ...

  9. C# PDFBox 解析PDF文件

    下载 PDFBox-0.7.3.zip PDFBox-0.7.3.dlllucene-demos-2.0.0.dlllucene-core-2.0.0.dllbcmail-jdk14-132.dllb ...

随机推荐

  1. Mapper动态代理开发

    在开发的过程中只需要写Dao层的借口,无需写其实现类,实现类有框架自己补充. 框架是根据mapper文件自动补充的,因此需要满足下面四个条件 Mapper接口开发需要遵循以下规范: Mapper.xm ...

  2. jvm内存参数配置

    qunar国内旗舰店master  (4核 8G) qunar国内旗舰店hub(4核 8G) qunar国内旗舰店provider(4核 8G)

  3. GetAdaptersInfo & GetAdaptersAddresses

    I use GetAdaptersInfo to get MAC addresses of interfaces.   GetAdaptersInfo exist on old and new ver ...

  4. Delphi Thread

    Thread给几点说明:1.MyThread的实例作为TForm1的成员变量2.不要使用Form1这个全局变量,线程中可要使用它的Handle,你可以在Form中创建MyThread的实例时把Hand ...

  5. Ado.Net基础拾遗一:读取数据

    从数据库中读取数据: 使用DataReader对象从数据库中读取数据 首先需要添加几个命名空间 //需要添加的命名空间 using System.Configuration; using System ...

  6. [Linux] Ubuntu下非常给力的下载工具

    转载:http://blog.csdn.net/luojiming1990/article/details/9078447 Windows下的下载工具--迅雷,之所以下载速度快,乃是它能搜索资源.为己 ...

  7. C指针解析 ------ 运算符&和*

    本文是自己学习所做笔记,欢迎转载,但请注明出处:http://blog.csdn.net/jesson20121020 & 是取地址运算符.* 叫做指针运算符或间接运算符.&a 的运算 ...

  8. windows 7 提示升级到windows 10补丁

    如果不需要这个提示,可以卸载KB3035583和KB2952664这两个系统更新补丁.   other update:KB2976978   and  KB2977759

  9. C++中的INL(转)

    inl 文件是内联函数的源文件. 内联函数通常在c++头文件中实现,但有的时候内联函数较多或者出于一些别的考虑(使头文件看起来更简洁等), 往往会将这部分具体定义的代码添加到INL文件中,然后在该头文 ...

  10. 【DB2】NICKNAME报错:SQL0206N "A0.CST_NM" 在使用它的上下文中无效。 SQLSTATE=42703

    1.环境展示: 2.操作描述 现在修改数据库A中CUST_INFO物理表的表结构,新增一个字段为desc varchar(100) ALTER TABLE CUST_INFO DROP COLUMN ...