pdfBox 读取pdf文件
1、引入maven依赖
- <dependency>
- <groupId>org.apache.pdfbox</groupId>
- <artifactId>pdfbox</artifactId>
- <version>2.0.4</version>
- </dependency>
2、相关工具类:PdfParser.java
- package com.insurance.tool;
- import java.io.File;
- import java.io.IOException;
- import java.io.InputStream;
- import java.util.ArrayList;
- import java.util.List;
- import java.util.regex.Matcher;
- import java.util.regex.Pattern;
- import org.apache.pdfbox.pdmodel.PDDocument;
- import org.apache.pdfbox.pdmodel.encryption.InvalidPasswordException;
- import org.apache.pdfbox.text.PDFTextStripper;
- import com.insurance.pojo.Insurance;
- import com.insurance.pojo.InsuranceOrder;
- import com.insurance.pojo.InsuranceProgram;
- public class PdfParser {
- public static void main(String[] args) {
- readPDF("C:\\Users\\yinz\\Desktop\\场景1\\场景1_样例_电子保单识别.pdf");
- }
- public static List<InsuranceOrder> readPDF(InputStream stream) throws Exception{
- List<InsuranceOrder> orderList = new ArrayList<InsuranceOrder>();
- PDDocument document = null;
- document=PDDocument.load(stream);
- // 获取页码
- int pages = document.getNumberOfPages();
- // 读文本内容
- PDFTextStripper stripper=new PDFTextStripper();
- // 设置按顺序输出
- stripper.setSortByPosition(true);
- /*stripper.setStartPage(1);
- stripper.setEndPage(pages);
- String content = stripper.getText(document);
- System.out.println(content);*/
- for(int page = 1; page <= pages; page++) {
- stripper.setStartPage(page);
- stripper.setEndPage(page);
- String content = stripper.getText(document);
- //System.out.println(content);
- parseContent(content, orderList);
- }
- System.out.println(orderList);
- return orderList;
- }
- public static void readPDF(String filePath) {
- List<InsuranceOrder> orderList = new ArrayList<InsuranceOrder>();
- File pdfFile = new File(filePath);
- PDDocument document = null;
- try
- {
- document=PDDocument.load(pdfFile);
- // 获取页码
- int pages = document.getNumberOfPages();
- // 读文本内容
- PDFTextStripper stripper=new PDFTextStripper();
- // 设置按顺序输出
- stripper.setSortByPosition(true);
- /*stripper.setStartPage(1);
- stripper.setEndPage(pages);
- String content = stripper.getText(document);
- System.out.println(content);*/
- for(int page = 1; page <= pages; page++) {
- stripper.setStartPage(page);
- stripper.setEndPage(page);
- String content = stripper.getText(document);
- //System.out.println(content);
- parseContent(content, orderList);
- }
- System.out.println(orderList);
- }
- catch(Exception e)
- {
- System.out.println(e);
- }
- }
- private static Pattern insurancePoliceNoP = Pattern.compile("保险单号\\s(.*?)\\s");
- private static Pattern insuranceApplicationNoP = Pattern.compile("投保单号\\s(.*?)\\s");
- private static Pattern policeHolderP = Pattern.compile("投 保 人.*\r\n");
- private static Pattern insuredP = Pattern.compile("被保险人.*\r\n");
- private static Pattern insuredAgeP = Pattern.compile("被保险人投保年龄\\s(.*?)(\r\n|\\s)");
- private static Pattern beneficiaryP = Pattern.compile("身故受益人及分配方式\\s(.*?)(\r\n|\\s)");
- private static Pattern insuranceNameP = Pattern.compile("险种名称及款式\\s(.*?)(\r\n|\\s)");
- private static Pattern validPeriodP = Pattern.compile("保险期间\\s(.*?)\\s合同生效日", Pattern.DOTALL);
- private static Pattern effectiveDateP = Pattern.compile("合同生效日\\s(.*?)(\r\n|\\s)");
- private static Pattern chargeWayP = Pattern.compile("交费方式\\s(.*?)\\s");
- private static Pattern feeP = Pattern.compile("保 险 费\\s(.*?)(\r\n|\\s)");
- private static Pattern policeHolderCount = Pattern.compile("投保份数\\s(.*?)(\r\n|\\s)");
- private static Pattern programListP = Pattern.compile("保险金额(.*?)保险责任与责任免除详见条款", Pattern.DOTALL);
- /*private static Pattern validPeriodP = Pattern.compile("保险期间\\s(.*?)\\s");
- private static Pattern effectiveDateP = Pattern.compile("合同生效日\\s(.*?)\\s");*/
- private static void parseContent(String content, List<InsuranceOrder> list) {
- if(content == null || content.trim().length() == 0) {
- return;
- }
- if(content.startsWith("个 人 人 身 保 险 保 险 单")) {
- //个人信息
- InsuranceOrder order = new InsuranceOrder();
- String insurancePoliceNo = retriveText(content, insurancePoliceNoP, 1);
- if(insurancePoliceNo == null || insurancePoliceNo.length() <= 0) {
- return;
- }
- list.add(order);
- order.setInsurancePoliceNo(insurancePoliceNo);
- order.setInsuranceApplicationNo(retriveText(content, insuranceApplicationNoP, 1));
- String policeHolderInfo = retriveTextWithInnnerBlank(content, policeHolderP, 0);
- if(policeHolderInfo != null) {
- Pattern policeHolderNameP = Pattern.compile("投 保 人(.*?)性别");
- Pattern policeHolderGenderP = Pattern.compile("性别(.*?)出生日期");
- Pattern policeHolderBirthdayP = Pattern.compile("出生日期(.*?)证件号码");
- Pattern policeHolderIDP = Pattern.compile("证件号码(.*)$");
- order.setPoliceHolderName(retriveText(policeHolderInfo, policeHolderNameP, 1));
- order.setPoliceHolderGender(retriveText(policeHolderInfo, policeHolderGenderP, 1));
- order.setPoliceHolderBirthday(retriveText(policeHolderInfo, policeHolderBirthdayP, 1));
- order.setPoliceHolderID(retriveText(policeHolderInfo, policeHolderIDP, 1));
- }
- String insuredInfo = retriveTextWithInnnerBlank(content, insuredP, 0);
- if(insuredInfo != null) {
- Pattern insuredNameP = Pattern.compile("被保险人(.*?)性别");
- Pattern insuredGenderP = Pattern.compile("性别(.*?)出生日期");
- Pattern insuredBirthdayP = Pattern.compile("出生日期(.*?)证件号码");
- Pattern insuredIDP = Pattern.compile("证件号码(.*)$");
- order.setInsuredName(retriveText(insuredInfo, insuredNameP, 1));
- order.setInsuredGender(retriveText(insuredInfo, insuredGenderP, 1));
- order.setInsuredBirthday(retriveText(insuredInfo, insuredBirthdayP, 1));
- order.setInsuredID(retriveText(insuredInfo, insuredIDP, 1));
- }
- order.setInsuredAge(retriveText(content, insuredAgeP, 1));
- order.setBeneficiary(retriveText(content, beneficiaryP, 1));
- //保险信息
- Insurance insurance = new Insurance();
- order.setInsurance(insurance);
- insurance.setName(retriveText(content, insuranceNameP, 1));
- insurance.setValidPeriod(retriveText(content, validPeriodP, 1).replaceAll("\r\n", ""));
- insurance.setEffectiveDate(retriveText(content, effectiveDateP, 1));
- insurance.setChargeWay(retriveText(content, chargeWayP, 1));
- insurance.setFee(retriveText(content, feeP, 1));
- insurance.setPoliceHolderCount(retriveText(content, policeHolderCount, 1));
- //保险项目信息
- String programList = retriveTextWithInnnerBlank(content, programListP, 1);
- if(programList != null) {
- String[] pArr = programList.split("\r\n");
- for(String str : pArr) {
- if(str != null && str.trim().length() > 0) {
- String[] subArr = str.split(" ");
- InsuranceProgram program = new InsuranceProgram();
- order.getProgramList().add(program);
- program.setName(subArr[0]);
- program.setFee(subArr[1]);
- }
- }
- }
- }
- }
- private static String retriveText(String content, Pattern p, int position) {
- Matcher m = p.matcher(content);
- if(m.find()) {
- return m.group(position).trim().replace(" ", "");
- }
- return "";
- }
- private static String retriveTextWithInnnerBlank(String content, Pattern p, int position) {
- Matcher m = p.matcher(content);
- if(m.find()) {
- return m.group(position).trim();
- }
- return "";
- }
- }
相关实体类:InsuranceOrder.java
- package com.insurance.pojo;
- import java.util.ArrayList;
- import java.util.List;
- public class InsuranceOrder {
- private String insurancePoliceNo; //保险单号
- private String insuranceApplicationNo; //投保单号
- private String policeHolderName; // 投保人
- private String policeHolderBirthday; //投保人出生日期
- private String policeHolderGender; //投保人性别
- private String policeHolderID; // 投保人证件号码
- private String insuredName; //被保险人
- private String insuredGender; //被保险人性别
- private String insuredBirthday; //被保险人出生日期
- private String insuredID; //被保险人证件号
- private String insuredAge; //被保险人投保年龄
- private String beneficiary; //身故受益人及分配方式
- private Insurance insurance; //险种
- private List<InsuranceProgram> programList = new ArrayList<InsuranceProgram>(); //保险项目
- public String getPoliceHolderBirthday() {
- return policeHolderBirthday;
- }
- public void setPoliceHolderBirthday(String policeHolderBirthday) {
- this.policeHolderBirthday = policeHolderBirthday;
- }
- public String getInsuredBirthday() {
- return insuredBirthday;
- }
- public void setInsuredBirthday(String insuredBirthday) {
- this.insuredBirthday = insuredBirthday;
- }
- public String getInsurancePoliceNo() {
- return insurancePoliceNo;
- }
- public void setInsurancePoliceNo(String insurancePoliceNo) {
- this.insurancePoliceNo = insurancePoliceNo;
- }
- public String getInsuranceApplicationNo() {
- return insuranceApplicationNo;
- }
- public void setInsuranceApplicationNo(String insuranceApplicationNo) {
- this.insuranceApplicationNo = insuranceApplicationNo;
- }
- public String getPoliceHolderName() {
- return policeHolderName;
- }
- public void setPoliceHolderName(String policeHolderName) {
- this.policeHolderName = policeHolderName;
- }
- public String getPoliceHolderGender() {
- return policeHolderGender;
- }
- public void setPoliceHolderGender(String policeHolderGender) {
- this.policeHolderGender = policeHolderGender;
- }
- public String getPoliceHolderID() {
- return policeHolderID;
- }
- public void setPoliceHolderID(String policeHolderID) {
- this.policeHolderID = policeHolderID;
- }
- public String getInsuredName() {
- return insuredName;
- }
- public void setInsuredName(String insuredName) {
- this.insuredName = insuredName;
- }
- public String getInsuredGender() {
- return insuredGender;
- }
- public void setInsuredGender(String insuredGender) {
- this.insuredGender = insuredGender;
- }
- public String getInsuredID() {
- return insuredID;
- }
- public void setInsuredID(String insuredID) {
- this.insuredID = insuredID;
- }
- public String getInsuredAge() {
- return insuredAge;
- }
- public void setInsuredAge(String insuredAge) {
- this.insuredAge = insuredAge;
- }
- public String getBeneficiary() {
- return beneficiary;
- }
- public void setBeneficiary(String beneficiary) {
- this.beneficiary = beneficiary;
- }
- public Insurance getInsurance() {
- return insurance;
- }
- public void setInsurance(Insurance insurance) {
- this.insurance = insurance;
- }
- public List<InsuranceProgram> getProgramList() {
- return programList;
- }
- public void setProgramList(List<InsuranceProgram> programList) {
- this.programList = programList;
- }
- @Override
- public String toString() {
- return "InsuranceOrder [insurancePoliceNo=" + insurancePoliceNo
- + ", insuranceApplicationNo=" + insuranceApplicationNo
- + ", policeHolderName=" + policeHolderName
- + ", policeHolderBirthday=" + policeHolderBirthday
- + ", policeHolderGender=" + policeHolderGender
- + ", policeHolderID=" + policeHolderID + ", insuredName="
- + insuredName + ", insuredGender=" + insuredGender
- + ", insuredBirthday=" + insuredBirthday + ", insuredID="
- + insuredID + ", insuredAge=" + insuredAge + ", beneficiary="
- + beneficiary + ", insurance=" + insurance + ", programList="
- + programList + "]";
- }
- }
InsuranceProgram.java
- package com.insurance.pojo;
/**
 * A single insurance program: its name and the associated amount.
 *
 * @author yinz
 */
public class InsuranceProgram {

    /** Program name. */
    private String name;
    /** Amount, kept as the raw text it was read as. */
    private String fee;

    /** @return the program name, or {@code null} if it was never set */
    public String getName() {
        return this.name;
    }

    /** @param name the program name */
    public void setName(String name) {
        this.name = name;
    }

    /** @return the amount, or {@code null} if it was never set */
    public String getFee() {
        return this.fee;
    }

    /** @param fee the amount */
    public void setFee(String fee) {
        this.fee = fee;
    }

    /** Renders the program as {@code InsuranceProgram [name=..., fee=...]}. */
    @Override
    public String toString() {
        StringBuilder text = new StringBuilder("InsuranceProgram [");
        text.append("name=").append(name);
        text.append(", fee=").append(fee);
        return text.append("]").toString();
    }
}
此处用于读取的pdf文件:http://files.cnblogs.com/files/yinz/场景1_样例_电子保单识别.rar
pdfBox 读取pdf文件的更多相关文章
- java 用PDFBox 删除 PDF文件中的某一页
依赖: <dependency> <groupId>org.apache.pdfbox</groupId> <artifactId>pdfbox-app ...
- 深入学习python解析并读取PDF文件内容的方法
这篇文章主要学习了python解析并读取PDF文件内容的方法,包括对学习库的应用,python2.7和python3.6中python解析PDF文件内容库的更新,包括对pdfminer库的详细解释和应 ...
- Java 使用PDFBox提取PDF文件中的图片
今天做PDF文件解析,遇到一个需求:提取文件中的图片并保存.使用的是流行的apache开源jar包pdfbox, 但还是遇到坑了,比如pdfbox版本太高或太低都不能用!!这个包竟然没有很好地做好兼容 ...
- pdf.js如何跨域读取pdf文件?
今天,上线一个客户网站之后(使用的是广州新一代虚拟空间)发现在读取上传的pdf文件的时候读取错误,通过直接在浏览器输入文件地址的时候发现文件地址被重定向了(呵呵!),结果就是pdf文件源由本地直接变成 ...
- python3用pdfminer3k在线读取pdf文件
import importlib import sys import random from urllib.request import urlopen from urllib.request imp ...
- java 库 pdfbox 将 pdf 文件转换成高清图片方法
近期需要将 pdf 文件转成高清图片,使用库是 pdfbox.fontbox.可以使用 renderImageWithDPI 方法指定转换的清晰度,当然清晰度越高,转换需要的时间越长,转换出来的图片越 ...
- 读取pdf文件 .选择了itextsharp 库
此库还是比较成熟.看博客园很多文章都介绍了此库 用法 如果项目用到读取pdf. 我这只是提供个思路.或者提供个方法.用itextsharp 能方便实现 StringBuilder text = ne ...
- 记一次为解决Python读取PDF文件的Shell操作
目录 一.背景 二.问题 三.解决 四.一顿分析及 Shell 操作 五.后续 一.背景 本想将 PDF 文件转换为 Word 文档,然后网上搜索了一下发现有挺多转换的软件.有的是免费的.收费,咱也不 ...
- C# PDFBox 解析PDF文件
下载 PDFBox-0.7.3.zip PDFBox-0.7.3.dlllucene-demos-2.0.0.dlllucene-core-2.0.0.dllbcmail-jdk14-132.dllb ...
随机推荐
- iOS中的场景转换机制的浅显分析
目前Apple推荐的场景转换的方法有以下几个: 一般的跳转方法: presentViewController Discussion In a horizontally compact environm ...
- CRC代码实现
CRC代码实现1: #include <stdio.h> #include <string.h> unsigned int cfgCrc32(const unsigned ch ...
- [Linux] ubuntu 格式化u盘
$ sudo fdisk -l 基本功,格式化命令,以格式化 /dev/sdb4 分区为例:$ sudo umount /dev/sdb4 # 必须先卸载该分区 # 格式化为 FAT 分区$ s ...
- 部署web Service到tomcat
建立项目 打开jdeveloper 12c,然后新建一个java项目,点击java,生成web services. package simple; import javax.jws.WebMethod ...
- Mac下Git的安装和卸载
1.安装最新版本:https://git-scm.com/download/mac,下载pkg进行安装 2.卸载:运行/usr/local/git/uninstall.sh
- npm install -S -D -g 有什么区别
npm install module_name -S 即 npm install module_name --save 写入dependencies npm install modu ...
- 更改Windows用户文件夹(Users)默认位置到其它盘
一.把 C盘Users文件夹里的用户数据,迁移到D盘Users文件夹中 系统环境:windows7 1.mklink命令详解 C:>mklink 创建符号链接. MKLINK [[/D] | [ ...
- ES6/ES2015核心内容(下)
import export 这两个家伙对应的就是es6自己的module功能. 我们之前写的Javascript一直都没有模块化的体系,无法将一个庞大的js工程拆分成一个个功能相对独立但相互依赖的小工 ...
- dubbo方法调用的timeout设置
方法调用的默认超时时间为1s,但是具体的超时时间受限于服务端方法性能.服务端个数.客户端的并发数等因素,所以超时时间需要根据不同的场景进行调试. 基本步骤为: 测试服务端的TPS,单位为 任务数或线程 ...
- maven命令解释
打包:mvn package编译:mvn compile编译测试程序:mvn test-compile清空:mvn clean运行测试:mvn test生成站点目录: mvn site生成站点目录并发 ...