使用POI将doc文件转换为html

需要的jar包有（其中一些是依赖包，可以使用maven下载）：Apache POI（poi、poi-scratchpad）和jsoup；后面将html转换为pdf时还需要flying-saucer（xhtmlrenderer）和iText。

doc文件转换为html文件

package com.gsww.sxzz.controller.service;

import org.apache.poi.hwpf.HWPFDocument;

import org.apache.poi.hwpf.converter.PicturesManager;

import org.apache.poi.hwpf.converter.WordToHtmlConverter;

import org.apache.poi.hwpf.usermodel.Picture;

import org.apache.poi.hwpf.usermodel.PictureType;

import org.jsoup.Jsoup;

import org.w3c.dom.Document;

import javax.xml.parsers.DocumentBuilderFactory;

import javax.xml.parsers.ParserConfigurationException;

import javax.xml.transform.OutputKeys;

import javax.xml.transform.Transformer;

import javax.xml.transform.TransformerException;

import javax.xml.transform.TransformerFactory;

import javax.xml.transform.dom.DOMSource;

import javax.xml.transform.stream.StreamResult;

import java.io.*;

import java.util.List;

/**
 * Created by Carey on 15-2-2.
 *
 * <p>Converts a binary Word document ({@code .doc}, read through POI's
 * {@link HWPFDocument}) into a UTF-8 HTML file. Pictures embedded in the
 * document are written to {@link #PICTURE_DIR} and referenced from the HTML
 * with the relative prefix {@link #PICTURE_URL_PREFIX}.
 */
public class docTohtml {

    /** Directory on disk that receives the pictures extracted from the document. */
    private static final String PICTURE_DIR = "D:/test/";

    /** Prefix put in front of each picture name in the generated HTML. */
    private static final String PICTURE_URL_PREFIX = "test/";

    /**
     * Demo entry point: converts {@code D:\b.doc} to {@code D:\1.html}.
     *
     * @param argv ignored
     */
    public static void main(String argv[]) {
        try {
            convert2Html("D:\\b.doc", "D:\\1.html");
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Post-processes the generated HTML and writes it to {@code path} as UTF-8.
     *
     * <p>The {@code <head>} is rebuilt so that it contains only a UTF-8
     * {@code Content-Type} meta tag and a single {@code <style>} element holding
     * the original CSS with every {@code font-family} value replaced by SimSun.
     * I/O failures are reported on standard error and do not propagate.
     *
     * @param content HTML markup to post-process
     * @param path    destination file; it is created or overwritten
     */
    public static void writeFile(String content, String path) {
        org.jsoup.nodes.Document doc = Jsoup.parse(content);
        String styleOld = doc.getElementsByTag("style").html();

        // Force a single font (SimSun). The value is matched only up to the next
        // ';' or '}' so that the declarations following font-family on the same
        // line are kept, and a font-family that is the last declaration of a rule
        // is replaced as well.
        styleOld = styleOld.replaceAll("font-family:[^;}]+", "font-family:SimSun");

        doc.getElementsByTag("head").empty();
        doc.getElementsByTag("head").append("<meta http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\"></meta>");
        doc.getElementsByTag("head").append(" <style type=\"text/css\"></style>");
        doc.getElementsByTag("style").append(styleOld);

        content = doc.html();
        // jsoup serializes <meta> without an end tag; put the explicit end tag
        // back into the serialized markup.
        content = content.replace("<meta http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\">", "<meta http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\"></meta>");

        try (FileOutputStream fos = new FileOutputStream(new File(path));
             BufferedWriter bw = new BufferedWriter(new OutputStreamWriter(fos, "UTF-8"))) {
            bw.write(content);
        } catch (IOException ioe) {
            // Best effort, as before: report the failure and carry on.
            ioe.printStackTrace();
        }
    }

    /**
     * Converts a Word {@code .doc} file to HTML.
     *
     * <p>Embedded pictures are saved under {@link #PICTURE_DIR}; the HTML is
     * serialized as UTF-8 and handed to {@link #writeFile(String, String)}.
     *
     * @param fileName   path of the {@code .doc} file to read
     * @param outPutFile path of the HTML file to write
     * @throws TransformerException         if the DOM cannot be serialized
     * @throws IOException                  if the input cannot be read or a picture cannot be written
     * @throws ParserConfigurationException if no DOM builder can be created
     */
    public static void convert2Html(String fileName, String outPutFile)
            throws TransformerException, IOException,
            ParserConfigurationException {
        HWPFDocument wordDocument;
        // Close the input stream as soon as the document object has been built.
        try (InputStream in = new FileInputStream(fileName)) {
            wordDocument = new HWPFDocument(in);
        }

        WordToHtmlConverter wordToHtmlConverter = new WordToHtmlConverter(
                DocumentBuilderFactory.newInstance().newDocumentBuilder()
                        .newDocument());
        wordToHtmlConverter.setPicturesManager(new PicturesManager() {
            public String savePicture(byte[] content,
                                      PictureType pictureType, String suggestedName,
                                      float widthInches, float heightInches) {
                // Only the link is produced here; the picture bytes are written
                // by savePictures(...) below.
                return PICTURE_URL_PREFIX + suggestedName;
            }
        });
        wordToHtmlConverter.processDocument(wordDocument);

        savePictures(wordDocument);

        Document htmlDocument = wordToHtmlConverter.getDocument();
        ByteArrayOutputStream out = new ByteArrayOutputStream();
        Transformer serializer = TransformerFactory.newInstance().newTransformer();
        serializer.setOutputProperty(OutputKeys.ENCODING, "UTF-8");
        serializer.setOutputProperty(OutputKeys.INDENT, "yes");
        serializer.setOutputProperty(OutputKeys.METHOD, "HTML");
        serializer.transform(new DOMSource(htmlDocument), new StreamResult(out));

        // Decode with the same charset the serializer was told to use; the
        // platform default would corrupt non-ASCII text on non-UTF-8 systems.
        writeFile(out.toString("UTF-8"), outPutFile);
    }

    /**
     * Writes every picture of the document into {@link #PICTURE_DIR}.
     *
     * @param wordDocument document whose pictures table is read
     * @throws IOException if writing or closing a picture file fails
     */
    private static void savePictures(HWPFDocument wordDocument) throws IOException {
        List<Picture> pics = wordDocument.getPicturesTable().getAllPictures();
        if (pics == null || pics.isEmpty()) {
            return;
        }
        // Create the target directory if it is missing. If that fails, opening
        // the file below throws FileNotFoundException, which is reported per picture.
        new File(PICTURE_DIR).mkdirs();
        for (Picture pic : pics) {
            try (OutputStream picOut = new FileOutputStream(PICTURE_DIR
                    + pic.suggestFullFileName())) {
                pic.writeImageContent(picOut);
            } catch (FileNotFoundException e) {
                e.printStackTrace();
            }
        }
    }
}

遇到的问题：doc转换为html时，不会把图形的线条转换过来，只有在table表格中才可以转换为span标签。如果要做下划线，可以放一个只设定了下边框的table单元格，这样就可以完美转换为html了。

将html转换为pdf

package com.gsww.sxzz.controller.service;

import com.lowagie.text.pdf.BaseFont;

import org.xhtmlrenderer.pdf.ITextFontResolver;

import org.xhtmlrenderer.pdf.ITextRenderer;

import java.io.File;

import java.io.FileNotFoundException;

import java.io.FileOutputStream;

import java.io.OutputStream;

/**
 * Created by Carey on 15-2-2.
 *
 * <p>Renders an HTML file to PDF with {@link ITextRenderer}, registering the
 * SimSun font so that Chinese text can be drawn.
 */
public class htmlToPdf {

    /**
     * Converts the HTML file {@code inputFile} into the PDF file {@code outputFile}.
     *
     * <p>The output file is opened (created or truncated) before the input is
     * loaded. Any failure is printed to standard error and reported through the
     * return value instead of being thrown.
     *
     * @param inputFile  path of the HTML file to render
     * @param outputFile path of the PDF file to write
     * @return {@code true} if the PDF was written, {@code false} if any step failed
     */
    public boolean convertHtmlToPdf(String inputFile, String outputFile) {
        // try-with-resources closes the stream even when rendering fails.
        try (OutputStream os = new FileOutputStream(outputFile)) {
            ITextRenderer renderer = new ITextRenderer();
            String url = new File(inputFile).toURI().toURL().toString();
            renderer.setDocument(url);

            // Chinese support: register the SimSun font collection. The path is
            // a Windows-specific absolute location.
            ITextFontResolver fontResolver = renderer.getFontResolver();
            fontResolver.addFont("C:\\Windows\\Fonts\\simsun.ttc", BaseFont.IDENTITY_H, BaseFont.NOT_EMBEDDED);

            // NOTE(review): presumably set so that relative resources (e.g. the
            // "test/..." pictures) resolve under D:/ - confirm.
            renderer.getSharedContext().setBaseURL("file:/D:/");
            renderer.layout();
            renderer.createPDF(os);
            os.flush();
            return true;
        } catch (Exception e) {
            e.printStackTrace();
            // Report the failure to the caller instead of claiming success.
            return false;
        }
    }

    /**
     * Demo entry point: converts {@code D:\1.html} to {@code D:\index.pdf}.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        htmlToPdf html2Pdf = new htmlToPdf();
        try {
            html2Pdf.convertHtmlToPdf("D:\\1.html", "D:\\index.pdf");
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}

使用POI将doc文件转换为html的更多相关文章

Python如何实现doc文件转换为docx文件？
Python如何实现doc文件转换为docx文件? 在开发过程中遇到一个关于读写doc和docx的问题: 一个文件夹中有两种文件, 一种为doc结尾, 一种为docx结尾, 需要将这些文件全部重命名. ...
java将doc文件转换为pdf文件的三种方法
http://feifei.im/archives/93 —————————————————————————————————————————————— 项目要用到doc转pdf的功能,一番google ...
使用poi将Excel文件转换为data数据
pom <?xml version="1.0" encoding="UTF-8"?> <project xmlns="http:// ...
如何把rtf、doc文件转换为HTML文件
//retText是路径 1 public string ExtractHtml(string rtfText) { try { //Create word object Word.Applicati ...
使用POI转换word doc文件
目录 1 转换为Html文件 2 转换为Xml文件 3 转换为Text文件在POI中还存在有针对于word doc文件进行格式转换的功能.我们可以将word的内容 ...
POI转换word doc文件为（html,xml,txt）
在POI中还存在有针对于word doc文件进行格式转换的功能.我们可以将word的内容转换为对应的Html文件,也可以把它转换为底层用来描述doc文档的xml文件,还可以把它转换为底层用来描述doc ...
java使用poi读取doc和docx文件
这几天在学习java io流的东西,有一个网友看到博客后问了一个问题,就是说他的doc文档为什么用我所说的方法死活就是乱码. 我一开始以为是他方法问题,结果自己试了之后发现和他的结果一样也是乱码. 于 ...
使用POI读写Word doc文件
使用POI读写word doc文件目录 1 读word doc文件 1.1 通过WordExtractor读文件 1.2 通过HWPFDocument读文件 2 写w ...
android使用POI读写word doc文件
目录 1 读word doc文件 1.1 通过WordExtractor读文件 1.2 通过HWPFDocument读文件 2 写word doc文件 Apache p ...

随机推荐

Modern.IE，创建现代网站的给力开发工具！
Modern.IE是微软推出的用来帮助开发者创建现代网站的基本开发工具.作为Web攻城师,最头疼的问题莫过于浏览器兼容性测试,各种类型浏览器,各种版本的浏览器,还有各种头疼的前缀等等.Modern.I ...
PMD：Java源代码扫描器
PMD是一个开源代码分析器.可以查找常见编程缺陷,比如未使用的变量.空catch代码块.不必要的对象创建等.支持Java.JavaScript.PLSQL.Apache Velocity.XML.XS ...
phpcms控制器变量分配到模板
跟TP.CI框架不同,phpcmsv9分配变量的方式是: 控制器中声明了变量$a='zrp'或$data=array('aa','bb'); 在模板中就可以直接输出: 字符串:{$a} 数组:遍历 { ...
Java是否存在内存泄露
会的. 原因:长生命周期的对象持有短生命周期对象的引用,导致短生命周期对象不能被回收,由此可能发生内存泄露. 举例参考:http://blog.csdn.net/yakihappy/article/d ...
StarUML破解教程
StarUML破解教程 StarUML官方下载地址:http://staruml.io/download StarUML是一个非常好用的画UML图的工具,但是它是收费软件,以下是破解方法: 1.使用E ...
hdu 5969 最大的位或
最大的位或 Time Limit: 2000/1000 MS (Java/Others) Memory Limit: 65536/65536 K (Java/Others)Total Submi ...
EasyNVR无插件直播服务器软件使用详情功能 - 录像功能说明
背景介绍 EasyNVR不仅仅拥有无插件的直播功能,更拥有对于直播录像的存储和日期检索功能: 本篇博文主要用于介绍EasyNVR的录像功能. 之前有博文介绍相关的录像功能,本篇主要为了介绍录像的新功能 ...
iOS9 3D Touch使用
http://www.cnblogs.com/zhanglinfeng/p/5133939.html
OC中第三方库MJExtension的使用
MJExtension是一套常用的"字典和模型之间互相转换"的框架,在项目中也使用过,现在记录一下.随着Swift的普及,在Swift中也有一个类似功能的框架HandyJSON 也 ...
【python】-- 函数非固定参数，返回值(return)
函数非固定参数 1.默认参数: 代码如下: def information_register(name,age,country,sex): print("----注册信息------&quo ...

使用POI将doc文件转换为html

使用POI将doc文件转换为html的更多相关文章

随机推荐

热门专题