Lucene full-text search for Word and PDF files: source code
Creating the index:

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.text.SimpleDateFormat;
import java.util.Date;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.DateTools;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.SimpleFSDirectory;
import org.apache.lucene.util.Version;
import org.apache.pdfbox.pdfparser.PDFParser;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.util.PDFTextStripper;
import org.apache.poi.hslf.HSLFSlideShow;
import org.apache.poi.hslf.model.Slide;
import org.apache.poi.hslf.model.TextRun;
import org.apache.poi.hslf.usermodel.SlideShow;
import org.apache.poi.hssf.usermodel.HSSFCell;
import org.apache.poi.hssf.usermodel.HSSFDateUtil;
import org.apache.poi.hssf.usermodel.HSSFRow;
import org.apache.poi.hssf.usermodel.HSSFSheet;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.usermodel.Paragraph;
import org.apache.poi.hwpf.usermodel.Range;
/**
* Creates the index (Lucene 3.0+).
* @author Administrator
*
*/
public class indexer {

    /**
     * @param args
     * @throws Exception
     */
    public static void main(String[] args) throws Exception {
        // directory where the index files will be stored
        String indexDir = "data\\test\\indexDir";
        // directory containing the files to be indexed
        String dateDir = "data\\test\\dateDir";
        IndexWriter indexWriter = null;
        // create the Directory object
        Directory dir = new SimpleFSDirectory(new File(indexDir));
        // Create the IndexWriter object.
        // The first argument is the Directory, the second the analyzer,
        // the third indicates whether to create a new index (false means an existing index is updated),
        // and the fourth limits how many tokens per field are indexed;
        // for example new MaxFieldLength(2) keeps only the first two tokens.
        // IndexWriter.MaxFieldLength.LIMITED (or UNLIMITED) is normally used.
        indexWriter = new IndexWriter(dir, new StandardAnalyzer(Version.LUCENE_30), true,
                IndexWriter.MaxFieldLength.UNLIMITED);
        File[] files = new File(dateDir).listFiles();
        for (int i = 0; i < files.length; i++) {
            Document doc = null;
            if (files[i].getName().endsWith(".txt")) {
                doc = new Document();
                // create the Field objects and add them to the Document
                doc.add(new Field("contents", new FileReader(files[i])));
                doc.add(new Field("filename", files[i].getName(),
                        Field.Store.YES, Field.Index.NOT_ANALYZED));
                doc.add(new Field("indexDate", DateTools.dateToString(new Date(), DateTools.Resolution.DAY),
                        Field.Store.YES, Field.Index.NOT_ANALYZED));
            } else if (files[i].getName().endsWith(".doc")) {
                doc = getDocument(files[i]);
            } else if (files[i].getName().endsWith(".ppt")) {
                doc = getPPT(files[i]);
            } else if (files[i].getName().endsWith(".xls")) {
                doc = getExcel(files[i]);
            } else if (files[i].getName().endsWith(".pdf")) {
                doc = getPdf(files[i]);
            } else {
                // any other file type is treated as plain text
                doc = new Document();
                doc.add(new Field("contents", new FileReader(files[i])));
                doc.add(new Field("filename", files[i].getName(),
                        Field.Store.YES, Field.Index.NOT_ANALYZED));
                doc.add(new Field("indexDate", DateTools.dateToString(new Date(), DateTools.Resolution.DAY),
                        Field.Store.YES, Field.Index.NOT_ANALYZED));
            }
            // add the document to the index
            if (doc != null) {
                indexWriter.addDocument(doc);
            }
        }
        // print how many documents the index now contains
        System.out.println("numDocs:" + indexWriter.numDocs());
        indexWriter.close();
    }

    /**
     * Extracts the text of a Word (.doc) file with POI HWPF and builds a Lucene Document.
     */
    public static Document getDocument(File file) throws Exception {
        String docPath = file.getAbsolutePath();
        String title = file.getName();
        // create the Document
        Document document = new Document();
        StringBuffer contents = new StringBuffer(); // extracted document text
        try {
            FileInputStream fs = new FileInputStream(docPath);
            HWPFDocument doc = new HWPFDocument(fs);
            Range range = doc.getRange();
            int paragraphCount = range.numParagraphs(); // number of paragraphs
            for (int i = 0; i < paragraphCount; i++) { // read the text paragraph by paragraph
                Paragraph pp = range.getParagraph(i);
                contents.append(pp.text());
            }
            fs.close();
        } catch (Exception e) {
            e.printStackTrace();
        }
        String cont = contents.toString().trim();
        document.add(new Field("filename", title, Field.Store.YES,
                Field.Index.ANALYZED)); // formerly Field.Index.TOKENIZED
        document.add(new Field("contents", cont, Field.Store.YES, Field.Index.ANALYZED));
        //document.add(new Field("path", docPath, Field.Store.YES, Field.Index.ANALYZED));
        document.add(new Field("indexDate", DateTools.dateToString(new Date(), DateTools.Resolution.DAY),
                Field.Store.YES, Field.Index.NOT_ANALYZED));
        return document;
    }

    /**
     * Extracts the text of a PowerPoint (.ppt) file with POI HSLF and builds a Lucene Document.
     */
    public static Document getPPT(File pptFile) throws IOException {
        String docPath = pptFile.getAbsolutePath();
        String title = pptFile.getName();
        StringBuffer contents = new StringBuffer(); // extracted document text
        InputStream is = new FileInputStream(pptFile);
        SlideShow ppt = new SlideShow(new HSLFSlideShow(is));
        Slide[] slides = ppt.getSlides();
        // extract the text of every slide
        for (int i = 0; i < slides.length; i++) {
            TextRun[] t = slides[i].getTextRuns(); // the TextRuns hold the slide text
            for (int j = 0; j < t.length; j++) {
                contents.append(t[j].getText()); // append the slide text to contents
            }
            //contents.append(slides[i].getTitle());
        }
        is.close();
        Document document = new Document();
        String cont = contents.toString().trim();
        document.add(new Field("filename", title, Field.Store.YES,
                Field.Index.ANALYZED)); // formerly Field.Index.TOKENIZED
        document.add(new Field("contents", cont, Field.Store.YES, Field.Index.ANALYZED));
        //document.add(new Field("path", docPath, Field.Store.YES, Field.Index.ANALYZED));
        document.add(new Field("indexDate", DateTools.dateToString(new Date(), DateTools.Resolution.DAY),
                Field.Store.YES, Field.Index.NOT_ANALYZED));
        return document;
    }

    /**
     * Extracts the text of a PDF file with PDFBox and builds a Lucene Document.
     */
    public static Document getPdf(File pdf) {
        String pdfpath = pdf.getAbsolutePath();
        String title = pdf.getName();
        String result = "";
        // open an input stream and parse the PDF
        FileInputStream is = null;
        PDDocument doc = null;
        try {
            is = new FileInputStream(pdf);
            PDFParser parser = new PDFParser(is);
            parser.parse();
            doc = parser.getPDDocument();
            PDFTextStripper stripper = new PDFTextStripper();
            result = stripper.getText(doc);
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            if (is != null) {
                try {
                    is.close();
                } catch (Exception e) {
                    e.printStackTrace();
                }
            }
            if (doc != null) {
                try {
                    doc.close();
                } catch (Exception e) {
                    e.printStackTrace();
                }
            }
        }
        Document document = new Document();
        document.add(new Field("filename", title, Field.Store.YES,
                Field.Index.ANALYZED)); // formerly Field.Index.TOKENIZED
        document.add(new Field("contents", result, Field.Store.YES,
                Field.Index.ANALYZED));
        //document.add(new Field("path", pdfpath, Field.Store.YES, Field.Index.ANALYZED));
        return document;
    }

    /**
     * Extracts the text of an Excel (.xls) file with POI HSSF and builds a Lucene Document.
     */
    public static Document getExcel(File fileExcel) throws Exception {
        InputStream is = new FileInputStream(fileExcel);
        StringBuffer content = new StringBuffer();
        HSSFWorkbook workbook = new HSSFWorkbook(is);
        for (int numSheets = 0; numSheets < workbook.getNumberOfSheets(); numSheets++) {
            HSSFSheet aSheet = workbook.getSheetAt(numSheets); // get one sheet
            content.append("\n");
            if (null == aSheet) {
                continue;
            }
            for (int rowNum = 0; rowNum <= aSheet.getLastRowNum(); rowNum++) {
                content.append("\n");
                HSSFRow aRow = aSheet.getRow(rowNum);
                if (null == aRow) {
                    continue;
                }
                for (short cellNum = 0; cellNum < aRow.getLastCellNum(); cellNum++) {
                    HSSFCell aCell = aRow.getCell(cellNum);
                    if (null == aCell) {
                        continue;
                    }
                    if (aCell.getCellType() == HSSFCell.CELL_TYPE_STRING) {
                        content.append(aCell.getRichStringCellValue().getString());
                    } else if (aCell.getCellType() == HSSFCell.CELL_TYPE_NUMERIC) {
                        boolean b = HSSFDateUtil.isCellDateFormatted(aCell);
                        if (b) {
                            Date date = aCell.getDateCellValue();
                            SimpleDateFormat df = new SimpleDateFormat("yyyy-MM-dd");
                            content.append(df.format(date));
                        }
                    }
                }
            }
        }
        is.close();
        String cont = content.toString();
        Document document = new Document();
        document.add(new Field("filename", fileExcel.getName(), Field.Store.YES,
                Field.Index.ANALYZED)); // formerly Field.Index.TOKENIZED
        document.add(new Field("contents", cont, Field.Store.YES,
                Field.Index.ANALYZED));
        return document;
    }

    /**
     * Reads an HTML (or any text) file from disk and returns its content as a String.
     */
    public static String readHtml(String urlString) {
        StringBuffer content = new StringBuffer();
        File file = new File(urlString);
        FileInputStream fis = null;
        try {
            fis = new FileInputStream(file);
            // read the page line by line; the charset here must match the one declared
            // in the HTML head, otherwise the text comes out garbled
            BufferedReader reader = new BufferedReader(new InputStreamReader(fis, "utf-8"));
            String line = null;
            while ((line = reader.readLine()) != null) {
                content.append(line + "\n");
            }
            reader.close();
        } catch (Exception e) {
            e.printStackTrace();
        }
        String contentString = content.toString();
        return contentString;
    }
}
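One detail worth noting: in main, the .txt branch (and the fallback branch) indexes "contents" through a FileReader, so that text is searchable but not stored, while the .doc/.ppt/.xls/.pdf helpers also store the extracted text in the index. If stored contents are wanted for plain-text files as well, a helper along the following lines could be added to the class above. This is only a sketch, not part of the original post: the method name getText is mine, it reuses the readHtml method shown above, and it assumes the files are UTF-8 encoded.

    /**
     * Sketch only: builds a Document for a plain-text file with its contents stored,
     * mirroring what getDocument/getPPT/getExcel/getPdf do for the other formats.
     * Reuses the readHtml helper above and assumes UTF-8 encoded files.
     */
    public static Document getText(File txtFile) {
        String text = readHtml(txtFile.getAbsolutePath());
        Document document = new Document();
        document.add(new Field("filename", txtFile.getName(),
                Field.Store.YES, Field.Index.NOT_ANALYZED));
        document.add(new Field("contents", text, Field.Store.YES, Field.Index.ANALYZED));
        document.add(new Field("indexDate",
                DateTools.dateToString(new Date(), DateTools.Resolution.DAY),
                Field.Store.YES, Field.Index.NOT_ANALYZED));
        return document;
    }

With such a helper, the .txt branch in main would simply become doc = getText(files[i]); instead of building the Document inline.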
Searching the index:
import java.io.File;
import java.io.IOException;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.SimpleFSDirectory;
import org.apache.lucene.util.Version;
/**
* Searches the index (Lucene 3.0+).
* @author Administrator
*
*/
public class searcher {

    public static void main(String[] args) throws IOException, ParseException {
        // directory where the index files are stored
        String indexDir = "data\\test\\indexDir";
        Directory dir = new SimpleFSDirectory(new File(indexDir));
        // create the IndexSearcher; unlike IndexWriter, it only needs the index directory
        IndexSearcher indexSearch = new IndexSearcher(dir);
        // create the QueryParser: the first argument is the Lucene version,
        // the second the field to search, the third the analyzer to use
        QueryParser queryParser = new QueryParser(Version.LUCENE_30,
                "contents", new StandardAnalyzer(Version.LUCENE_30));
        // build the Query object
        Query query = queryParser.parse("arcgis");
        // run the search; TopDocs holds a scoreDocs[] array with the matching document ids
        TopDocs hits = indexSearch.search(query, 10);
        // hits.totalHits is the total number of matches
        System.out.println("Found " + hits.totalHits + " document(s)");
        // loop over hits.scoreDocs, restore each Document with indexSearch.doc and read its fields
        for (int i = 0; i < hits.scoreDocs.length; i++) {
            ScoreDoc sdoc = hits.scoreDocs[i];
            Document doc = indexSearch.doc(sdoc.doc);
            System.out.println(doc.get("filename"));
        }
        indexSearch.close();
    }
}
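The query term above is hard-coded to "arcgis". As a small variation, the sketch below takes the term as a parameter and also prints the stored indexDate field that the indexer adds to each document. The method wrapper and its name are assumptions on my part, not part of the original post; it could be added to the searcher class and called from main.

    public static void search(String term) throws IOException, ParseException {
        String indexDir = "data\\test\\indexDir";
        Directory dir = new SimpleFSDirectory(new File(indexDir));
        IndexSearcher indexSearch = new IndexSearcher(dir);
        QueryParser queryParser = new QueryParser(Version.LUCENE_30,
                "contents", new StandardAnalyzer(Version.LUCENE_30));
        Query query = queryParser.parse(term);
        TopDocs hits = indexSearch.search(query, 10);
        System.out.println("Found " + hits.totalHits + " document(s) for '" + term + "'");
        for (ScoreDoc sdoc : hits.scoreDocs) {
            Document doc = indexSearch.doc(sdoc.doc);
            // filename and indexDate are stored fields, so they can be read back here
            System.out.println(doc.get("filename") + "  (indexed " + doc.get("indexDate") + ")");
        }
        indexSearch.close();
    }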
<!DOCTYPE html> <html> <head> <meta charset="utf-8"> <style typ ...