Lucene full-text search for Word and PDF files: source code
Creating the index:

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.text.SimpleDateFormat;
import java.util.Date;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.DateTools;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.SimpleFSDirectory;
import org.apache.lucene.util.Version;
import org.apache.pdfbox.pdfparser.PDFParser;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.util.PDFTextStripper;
import org.apache.poi.hslf.HSLFSlideShow;
import org.apache.poi.hslf.model.Slide;
import org.apache.poi.hslf.model.TextRun;
import org.apache.poi.hslf.usermodel.SlideShow;
import org.apache.poi.hssf.usermodel.HSSFCell;
import org.apache.poi.hssf.usermodel.HSSFDateUtil;
import org.apache.poi.hssf.usermodel.HSSFRow;
import org.apache.poi.hssf.usermodel.HSSFSheet;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.usermodel.Paragraph;
import org.apache.poi.hwpf.usermodel.Range;
/**
* Creates the index (Lucene 3.0+).
* @author Administrator
*
*/
public class indexer {

    /**
     * @param args
     * @throws Exception
     */
    public static void main(String[] args) throws Exception {
        // directory where the index files will be stored
        String indexDir = "data\\test\\indexDir";
        // directory containing the files to be indexed
        String dateDir = "data\\test\\dateDir";
        IndexWriter indexWriter = null;
        // create the Directory object
        Directory dir = new SimpleFSDirectory(new File(indexDir));
        // Create the IndexWriter object.
        // The first argument is the Directory, the second the analyzer,
        // the third indicates whether to create a new index (false means an existing index is updated),
        // and the fourth limits how many tokens per field are indexed;
        // for example new MaxFieldLength(2) keeps only the first two tokens.
        // IndexWriter.MaxFieldLength.LIMITED (or UNLIMITED) is normally used.
        indexWriter = new IndexWriter(dir, new StandardAnalyzer(Version.LUCENE_30), true,
                IndexWriter.MaxFieldLength.UNLIMITED);
        File[] files = new File(dateDir).listFiles();
        for (int i = 0; i < files.length; i++) {
            Document doc = null;
            if (files[i].getName().endsWith(".txt")) {
                doc = new Document();
                // create the Field objects and add them to the Document
                doc.add(new Field("contents", new FileReader(files[i])));
                doc.add(new Field("filename", files[i].getName(),
                        Field.Store.YES, Field.Index.NOT_ANALYZED));
                doc.add(new Field("indexDate", DateTools.dateToString(new Date(), DateTools.Resolution.DAY),
                        Field.Store.YES, Field.Index.NOT_ANALYZED));
            } else if (files[i].getName().endsWith(".doc")) {
                doc = getDocument(files[i]);
            } else if (files[i].getName().endsWith(".ppt")) {
                doc = getPPT(files[i]);
            } else if (files[i].getName().endsWith(".xls")) {
                doc = getExcel(files[i]);
            } else if (files[i].getName().endsWith(".pdf")) {
                doc = getPdf(files[i]);
            } else {
                // any other file type is treated as plain text
                doc = new Document();
                doc.add(new Field("contents", new FileReader(files[i])));
                doc.add(new Field("filename", files[i].getName(),
                        Field.Store.YES, Field.Index.NOT_ANALYZED));
                doc.add(new Field("indexDate", DateTools.dateToString(new Date(), DateTools.Resolution.DAY),
                        Field.Store.YES, Field.Index.NOT_ANALYZED));
            }
            // add the document to the index
            if (doc != null) {
                indexWriter.addDocument(doc);
            }
        }
        // print how many documents the index now contains
        System.out.println("numDocs:" + indexWriter.numDocs());
        indexWriter.close();
    }

    /**
     * Extracts the text of a Word (.doc) file with POI HWPF and builds a Lucene Document.
     */
    public static Document getDocument(File file) throws Exception {
        String docPath = file.getAbsolutePath();
        String title = file.getName();
        // create the Document
        Document document = new Document();
        StringBuffer contents = new StringBuffer(); // extracted document text
        try {
            FileInputStream fs = new FileInputStream(docPath);
            HWPFDocument doc = new HWPFDocument(fs);
            Range range = doc.getRange();
            int paragraphCount = range.numParagraphs(); // number of paragraphs
            for (int i = 0; i < paragraphCount; i++) { // read the text paragraph by paragraph
                Paragraph pp = range.getParagraph(i);
                contents.append(pp.text());
            }
            fs.close();
        } catch (Exception e) {
            e.printStackTrace();
        }
        String cont = contents.toString().trim();
        document.add(new Field("filename", title, Field.Store.YES,
                Field.Index.ANALYZED)); // formerly Field.Index.TOKENIZED
        document.add(new Field("contents", cont, Field.Store.YES, Field.Index.ANALYZED));
        //document.add(new Field("path", docPath, Field.Store.YES, Field.Index.ANALYZED));
        document.add(new Field("indexDate", DateTools.dateToString(new Date(), DateTools.Resolution.DAY),
                Field.Store.YES, Field.Index.NOT_ANALYZED));
        return document;
    }

    /**
     * Extracts the text of a PowerPoint (.ppt) file with POI HSLF and builds a Lucene Document.
     */
    public static Document getPPT(File pptFile) throws IOException {
        String docPath = pptFile.getAbsolutePath();
        String title = pptFile.getName();
        StringBuffer contents = new StringBuffer(); // extracted document text
        InputStream is = new FileInputStream(pptFile);
        SlideShow ppt = new SlideShow(new HSLFSlideShow(is));
        Slide[] slides = ppt.getSlides();
        // extract the text of every slide
        for (int i = 0; i < slides.length; i++) {
            TextRun[] t = slides[i].getTextRuns(); // the TextRuns hold the slide text
            for (int j = 0; j < t.length; j++) {
                contents.append(t[j].getText()); // append the slide text to contents
            }
            //contents.append(slides[i].getTitle());
        }
        is.close();
        Document document = new Document();
        String cont = contents.toString().trim();
        document.add(new Field("filename", title, Field.Store.YES,
                Field.Index.ANALYZED)); // formerly Field.Index.TOKENIZED
        document.add(new Field("contents", cont, Field.Store.YES, Field.Index.ANALYZED));
        //document.add(new Field("path", docPath, Field.Store.YES, Field.Index.ANALYZED));
        document.add(new Field("indexDate", DateTools.dateToString(new Date(), DateTools.Resolution.DAY),
                Field.Store.YES, Field.Index.NOT_ANALYZED));
        return document;
    }

    /**
     * Extracts the text of a PDF file with PDFBox and builds a Lucene Document.
     */
    public static Document getPdf(File pdf) {
        String pdfpath = pdf.getAbsolutePath();
        String title = pdf.getName();
        String result = "";
        // open an input stream and parse the PDF
        FileInputStream is = null;
        PDDocument doc = null;
        try {
            is = new FileInputStream(pdf);
            PDFParser parser = new PDFParser(is);
            parser.parse();
            doc = parser.getPDDocument();
            PDFTextStripper stripper = new PDFTextStripper();
            result = stripper.getText(doc);
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            if (is != null) {
                try {
                    is.close();
                } catch (Exception e) {
                    e.printStackTrace();
                }
            }
            if (doc != null) {
                try {
                    doc.close();
                } catch (Exception e) {
                    e.printStackTrace();
                }
            }
        }
        Document document = new Document();
        document.add(new Field("filename", title, Field.Store.YES,
                Field.Index.ANALYZED)); // formerly Field.Index.TOKENIZED
        document.add(new Field("contents", result, Field.Store.YES,
                Field.Index.ANALYZED));
        //document.add(new Field("path", pdfpath, Field.Store.YES, Field.Index.ANALYZED));
        return document;
    }

    /**
     * Extracts the text of an Excel (.xls) file with POI HSSF and builds a Lucene Document.
     */
    public static Document getExcel(File fileExcel) throws Exception {
        InputStream is = new FileInputStream(fileExcel);
        StringBuffer content = new StringBuffer();
        HSSFWorkbook workbook = new HSSFWorkbook(is);
        for (int numSheets = 0; numSheets < workbook.getNumberOfSheets(); numSheets++) {
            HSSFSheet aSheet = workbook.getSheetAt(numSheets); // get one sheet
            content.append("\n");
            if (null == aSheet) {
                continue;
            }
            for (int rowNum = 0; rowNum <= aSheet.getLastRowNum(); rowNum++) {
                content.append("\n");
                HSSFRow aRow = aSheet.getRow(rowNum);
                if (null == aRow) {
                    continue;
                }
                for (short cellNum = 0; cellNum < aRow.getLastCellNum(); cellNum++) {
                    HSSFCell aCell = aRow.getCell(cellNum);
                    if (null == aCell) {
                        continue;
                    }
                    if (aCell.getCellType() == HSSFCell.CELL_TYPE_STRING) {
                        content.append(aCell.getRichStringCellValue().getString());
                    } else if (aCell.getCellType() == HSSFCell.CELL_TYPE_NUMERIC) {
                        boolean b = HSSFDateUtil.isCellDateFormatted(aCell);
                        if (b) {
                            Date date = aCell.getDateCellValue();
                            SimpleDateFormat df = new SimpleDateFormat("yyyy-MM-dd");
                            content.append(df.format(date));
                        }
                    }
                }
            }
        }
        is.close();
        String cont = content.toString();
        Document document = new Document();
        document.add(new Field("filename", fileExcel.getName(), Field.Store.YES,
                Field.Index.ANALYZED)); // formerly Field.Index.TOKENIZED
        document.add(new Field("contents", cont, Field.Store.YES,
                Field.Index.ANALYZED));
        return document;
    }

    /**
     * Reads an HTML (or any text) file from disk and returns its content as a String.
     */
    public static String readHtml(String urlString) {
        StringBuffer content = new StringBuffer();
        File file = new File(urlString);
        FileInputStream fis = null;
        try {
            fis = new FileInputStream(file);
            // read the page line by line; the charset here must match the one declared
            // in the HTML head, otherwise the text comes out garbled
            BufferedReader reader = new BufferedReader(new InputStreamReader(fis, "utf-8"));
            String line = null;
            while ((line = reader.readLine()) != null) {
                content.append(line + "\n");
            }
            reader.close();
        } catch (Exception e) {
            e.printStackTrace();
        }
        String contentString = content.toString();
        return contentString;
    }
}
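One detail worth noting: in main, the .txt branch (and the fallback branch) indexes "contents" through a FileReader, so that text is searchable but not stored, while the .doc/.ppt/.xls/.pdf helpers also store the extracted text in the index. If stored contents are wanted for plain-text files as well, a helper along the following lines could be added to the class above. This is only a sketch, not part of the original post: the method name getText is mine, it reuses the readHtml method shown above, and it assumes the files are UTF-8 encoded.

    /**
     * Sketch only: builds a Document for a plain-text file with its contents stored,
     * mirroring what getDocument/getPPT/getExcel/getPdf do for the other formats.
     * Reuses the readHtml helper above and assumes UTF-8 encoded files.
     */
    public static Document getText(File txtFile) {
        String text = readHtml(txtFile.getAbsolutePath());
        Document document = new Document();
        document.add(new Field("filename", txtFile.getName(),
                Field.Store.YES, Field.Index.NOT_ANALYZED));
        document.add(new Field("contents", text, Field.Store.YES, Field.Index.ANALYZED));
        document.add(new Field("indexDate",
                DateTools.dateToString(new Date(), DateTools.Resolution.DAY),
                Field.Store.YES, Field.Index.NOT_ANALYZED));
        return document;
    }

With such a helper, the .txt branch in main would simply become doc = getText(files[i]); instead of building the Document inline.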
Searching the index:
import java.io.File;
import java.io.IOException;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.SimpleFSDirectory;
import org.apache.lucene.util.Version;
/**
* Searches the index (Lucene 3.0+).
* @author Administrator
*
*/
public class searcher {

    public static void main(String[] args) throws IOException, ParseException {
        // directory where the index files are stored
        String indexDir = "data\\test\\indexDir";
        Directory dir = new SimpleFSDirectory(new File(indexDir));
        // create the IndexSearcher; unlike IndexWriter, it only needs the index directory
        IndexSearcher indexSearch = new IndexSearcher(dir);
        // create the QueryParser: the first argument is the Lucene version,
        // the second the field to search, the third the analyzer to use
        QueryParser queryParser = new QueryParser(Version.LUCENE_30,
                "contents", new StandardAnalyzer(Version.LUCENE_30));
        // build the Query object
        Query query = queryParser.parse("arcgis");
        // run the search; TopDocs holds a scoreDocs[] array with the matching document ids
        TopDocs hits = indexSearch.search(query, 10);
        // hits.totalHits is the total number of matches
        System.out.println("Found " + hits.totalHits + " document(s)");
        // loop over hits.scoreDocs, restore each Document with indexSearch.doc and read its fields
        for (int i = 0; i < hits.scoreDocs.length; i++) {
            ScoreDoc sdoc = hits.scoreDocs[i];
            Document doc = indexSearch.doc(sdoc.doc);
            System.out.println(doc.get("filename"));
        }
        indexSearch.close();
    }
}
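The query term above is hard-coded to "arcgis". As a small variation, the sketch below takes the term as a parameter and also prints the stored indexDate field that the indexer adds to each document. The method wrapper and its name are assumptions on my part, not part of the original post; it could be added to the searcher class and called from main.

    public static void search(String term) throws IOException, ParseException {
        String indexDir = "data\\test\\indexDir";
        Directory dir = new SimpleFSDirectory(new File(indexDir));
        IndexSearcher indexSearch = new IndexSearcher(dir);
        QueryParser queryParser = new QueryParser(Version.LUCENE_30,
                "contents", new StandardAnalyzer(Version.LUCENE_30));
        Query query = queryParser.parse(term);
        TopDocs hits = indexSearch.search(query, 10);
        System.out.println("Found " + hits.totalHits + " document(s) for '" + term + "'");
        for (ScoreDoc sdoc : hits.scoreDocs) {
            Document doc = indexSearch.doc(sdoc.doc);
            // filename and indexDate are stored fields, so they can be read back here
            System.out.println(doc.get("filename") + "  (indexed " + doc.get("indexDate") + ")");
        }
        indexSearch.close();
    }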
<!DOCTYPE html> <html> <head> <meta charset="utf-8"> <style typ ...