Getting Started with Lucene
References:
https://blog.csdn.net/u014209975/article/details/50525624
https://www.cnblogs.com/hanyinglong/p/5395600.html
http://lucene.apache.org/core/4_0_0/core/overview-summary.html
https://www.jianshu.com/p/0a2bbe0f4c42
Dependencies (the sample code below targets the Lucene 3.6 API, per Version.LUCENE_36; exact jar file names vary by distribution):
lucene-analyzers.jar
lucene-benchmark.jar
lucene-core.jar
lucene-highlighter.jar
lucene-memory.jar
lucene-parser.jar
lucene-remote.jar
lucene-smartcn.jar
Entity class:
package com.h3c.lucence;

import java.io.Serializable;

public class Entity implements Serializable {

    private static final long serialVersionUID = 3701082756628915138L;

    private Integer id;
    private String type;
    private String virtualDoc;
    private String summary;
    private float score;

    public Integer getId() {
        return id;
    }

    public void setId(Integer id) {
        this.id = id;
    }

    public String getType() {
        return type;
    }

    public void setType(String type) {
        this.type = type;
    }

    public String getVirtualDoc() {
        if (null == virtualDoc) {
            // TODO Build a virtual document from the entity's values, covering every attribute
            // and its value, to be used for full-text search.
            // Format: field1:value1,field2:value2,...
        }
        return virtualDoc;
    }

    public void setVirtualDoc(String virtualDoc) {
        this.virtualDoc = virtualDoc;
    }
    public String getSummary() {
        if (null == summary) {
            // Guard against an unset summary (not in the original, which would throw a NullPointerException here).
            return "";
        }
        StringBuilder sb = new StringBuilder();
        // Strip the highlighter markup so the fragment can be located inside the full virtual document.
        String tmpSum = summary;
        tmpSum = tmpSum.replace("<SPAN style=\"color:red;\">", "");
        tmpSum = tmpSum.replace("</SPAN>", "");
        String virtualDoc2 = getVirtualDoc();
        int length = tmpSum.length();
        int firstIndex = virtualDoc2.indexOf(tmpSum);
        // Add leading/trailing ellipses when the fragment is cut out of a longer document.
        if (firstIndex > 0) {
            sb.append("...");
        }
        sb.append(summary);
        if (firstIndex + length < virtualDoc2.length()) {
            sb.append("...");
        }
        return sb.toString();
    }

    public void setSummary(String summary) {
        this.summary = summary;
    }

    public float getScore() {
        return score;
    }

    public void setScore(float score) {
        this.score = score;
    }
}
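The TODO in getVirtualDoc() is left open in the original. As a minimal sketch of one way to fill it in — assuming the virtual document is concatenated from whatever business attributes your entity actually carries (id and type are used below only as placeholders) — the method body could look like this:

    public String getVirtualDoc() {
        if (null == virtualDoc) {
            // Hypothetical implementation (not from the original post): concatenate the
            // entity's attributes in the "field1:value1,field2:value2,..." format
            // described by the TODO above.
            StringBuilder sb = new StringBuilder();
            sb.append("id:").append(id);
            sb.append(",type:").append(type);
            virtualDoc = sb.toString();
        }
        return virtualDoc;
    }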
Demo class:
package com.h3c.lucence;
import java.io.Closeable;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.PrefixQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.WildcardQuery;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.InvalidTokenOffsetsException;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
import org.apache.lucene.search.highlight.SimpleSpanFragmenter;
import org.apache.lucene.search.highlight.TokenSources;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
public class Demo {

    /** Lucene index directory. */
    private static Directory ciIndexDir;

    private static final String CI_CONTENT_FLAG = "virtualDoc";

    /** Analyzer: the StandardAnalyzer indexes single CJK characters and consecutive English words as terms. */
    private static final Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_36);

    private static Pattern VALID_IPV4_PATTERN = null;
    private static Pattern VALID_IPV6_PATTERN = null;
    private static final String ipv4Pattern = "(([01]?\\d\\d?|2[0-4]\\d|25[0-5])\\.){3}([01]?\\d\\d?|2[0-4]\\d|25[0-5])";
    private static final String ipv6Pattern = "([0-9a-f]{1,4}:){7}([0-9a-f]){1,4}";

    private static IndexWriter indexWriter;

    static {
        VALID_IPV4_PATTERN = Pattern.compile(ipv4Pattern, Pattern.CASE_INSENSITIVE);
        VALID_IPV6_PATTERN = Pattern.compile(ipv6Pattern, Pattern.CASE_INSENSITIVE);
        IndexWriterConfig conf = new IndexWriterConfig(Version.LUCENE_36, analyzer);
        conf.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND);
        try {
            indexWriter = new IndexWriter(getCiIndexDir(), conf);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
    private static Directory getCiIndexDir() {
        if (null == ciIndexDir) {
            try {
                ciIndexDir = FSDirectory.open(new File("D://indexs"));
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
        return ciIndexDir;
    }

    private static boolean isIpAddress(String ipAddress) {
        Matcher m1 = VALID_IPV4_PATTERN.matcher(ipAddress);
        Matcher m2 = VALID_IPV6_PATTERN.matcher(ipAddress);
        return m1.matches() || m2.matches();
    }

    private static boolean isChinese(char c) {
        Character.UnicodeBlock ub = Character.UnicodeBlock.of(c);
        if (ub == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS || ub == Character.UnicodeBlock.CJK_COMPATIBILITY_IDEOGRAPHS
                || ub == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A || ub == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B
                || ub == Character.UnicodeBlock.CJK_SYMBOLS_AND_PUNCTUATION || ub == Character.UnicodeBlock.HALFWIDTH_AND_FULLWIDTH_FORMS
                || ub == Character.UnicodeBlock.GENERAL_PUNCTUATION) {
            return true;
        }
        return false;
    }
    private static BooleanQuery parseChineseCharacters(String inputString) {
        BooleanQuery query = new BooleanQuery();
        // An IP address is searched as a single exact term.
        if (isIpAddress(inputString)) {
            query.add(new TermQuery(new Term(CI_CONTENT_FLAG, inputString)), BooleanClause.Occur.MUST);
            return query;
        }
        BooleanQuery fieldQuery = new BooleanQuery();
        boolean isWord = false;
        StringBuilder tempWord = new StringBuilder();
        inputString = inputString.toLowerCase();
        BooleanQuery booleanQuery = new BooleanQuery();
        int length = inputString.length();
        Query termQuery = null;
        for (int i = 0; i < length; i++) {
            char c = inputString.charAt(i);
            if (c >= 'a' && c <= 'z' || c >= '0' && c <= '9') { // English letter or digit
                isWord = true;
                tempWord.append(c);
            } else { // Delimiter or Chinese character
                isWord = false;
                if (tempWord.length() > 0) {
                    // Flush the accumulated English/digit run as a prefix query.
                    termQuery = new PrefixQuery(new Term(CI_CONTENT_FLAG, tempWord.toString()));
                    // booleanQuery.add(termQuery, BooleanClause.Occur.MUST);
                    booleanQuery.add(termQuery, BooleanClause.Occur.SHOULD);
                    tempWord = new StringBuilder();
                }
            }
            if (!isWord) {
                termQuery = new TermQuery(new Term(CI_CONTENT_FLAG, String.valueOf(c)));
                if (isChinese(c)) { // Chinese character
                    // booleanQuery.add(termQuery, BooleanClause.Occur.MUST);
                    booleanQuery.add(termQuery, BooleanClause.Occur.SHOULD);
                } else { // Delimiter
                    booleanQuery.add(termQuery, BooleanClause.Occur.SHOULD);
                }
            }
        }
        if (tempWord.length() > 0) {
            // The input ends with an English/digit run: also match it as a prefix or suffix.
            termQuery = new WildcardQuery(new Term(CI_CONTENT_FLAG, tempWord.toString() + "*"));
            booleanQuery.add(termQuery, BooleanClause.Occur.SHOULD);
            termQuery = new WildcardQuery(new Term(CI_CONTENT_FLAG, "*" + tempWord.toString()));
            booleanQuery.add(termQuery, BooleanClause.Occur.SHOULD);
        }
        // Begin: match the whole input string against the field
        termQuery = new WildcardQuery(new Term(CI_CONTENT_FLAG, inputString + "*"));
        booleanQuery.add(termQuery, BooleanClause.Occur.SHOULD);
        termQuery = new WildcardQuery(new Term(CI_CONTENT_FLAG, "*" + inputString));
        booleanQuery.add(termQuery, BooleanClause.Occur.SHOULD);
        termQuery = new WildcardQuery(new Term(CI_CONTENT_FLAG, "*" + inputString + "*"));
        booleanQuery.add(termQuery, BooleanClause.Occur.SHOULD);
        // End: match the whole input string against the field
        BooleanClause clause = new BooleanClause(booleanQuery, BooleanClause.Occur.MUST);
        fieldQuery.add(clause);
        BooleanClause fieldClause = new BooleanClause(fieldQuery, BooleanClause.Occur.MUST);
        query.add(fieldClause);
        return query;
    }
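    // Worked example (my illustration, not in the original post): for the input "web服务器",
    // parseChineseCharacters builds a BooleanQuery whose SHOULD clauses on the virtualDoc field are roughly:
    //   virtualDoc:web                                                       (PrefixQuery for the English run "web")
    //   virtualDoc:服, virtualDoc:务, virtualDoc:器                           (one TermQuery per Chinese character)
    //   virtualDoc:web服务器*, virtualDoc:*web服务器, virtualDoc:*web服务器*   (WildcardQuery over the whole input)
    // and the whole group is wrapped as a single MUST clause of the returned query.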
    /**
     * Full-text search.
     * @param queryStr
     * @param highlight whether to build highlighted summaries
     * @throws Exception
     */
    private static void contentSearch(String queryStr, boolean highlight) throws Exception {
        IndexReader indexReader = null;
        IndexSearcher indexSearcher = null;
        try {
            indexReader = IndexReader.open(getCiIndexDir());
            indexSearcher = new IndexSearcher(indexReader);
            // Build the combined query; adapt this to your own business rules.
            Query query = parseChineseCharacters(queryStr);
            TopDocs hits = indexSearcher.search(query, Integer.MAX_VALUE);
            if (hits.totalHits > 0) {
                if (highlight) {
                    QueryScorer scorer = new QueryScorer(query, CI_CONTENT_FLAG);
                    SimpleHTMLFormatter formatter = new SimpleHTMLFormatter("<SPAN style=\"color:red;\">", "</SPAN>");
                    Highlighter highlighter = new Highlighter(formatter, scorer);
                    highlighter.setTextFragmenter(new SimpleSpanFragmenter(scorer, 100));
                    for (ScoreDoc scoreDoc : hits.scoreDocs) {
                        Document doc = indexSearcher.doc(scoreDoc.doc);
                        System.out.println(doc.get("virtualDoc"));
                        Entity entity = convertToEntity(doc, indexSearcher.getIndexReader(), scoreDoc.doc, highlighter);
                        entity.setScore(scoreDoc.score);
                    }
                } else {
                    for (ScoreDoc scoreDoc : hits.scoreDocs) {
                        Document doc = indexSearcher.doc(scoreDoc.doc);
                        System.out.println(doc.get("virtualDoc"));
                        Entity entity = convertToEntity(doc);
                        entity.setScore(scoreDoc.score);
                    }
                }
            }
        } catch (IOException ioe) {
            ioe.printStackTrace();
        } finally {
            close(indexSearcher);
            close(indexReader);
        }
    }
    /**
     * Close anything that implements Closeable, ignoring I/O errors.
     * @param object
     */
    private static void close(Closeable object) {
        if (null != object) {
            try {
                object.close();
            } catch (IOException e) {
                // ignore
            }
        }
    }
    /**
     * Convert an Entity to a Lucene Document.
     * @param entity
     * @return
     */
    public static Document convertToDocument(Entity entity) {
        Document doc = new Document();
        String virtualDoc = entity.getVirtualDoc();
        // Field.Store.YES stores the value; Field.Index.ANALYZED tokenizes it for full-text search.
        doc.add(new Field("id", String.valueOf(entity.getId()), Field.Store.YES, Field.Index.NOT_ANALYZED));
        doc.add(new Field("type", entity.getType(), Field.Store.YES, Field.Index.NOT_ANALYZED));
        doc.add(new Field(CI_CONTENT_FLAG, null == virtualDoc ? " " : virtualDoc, Field.Store.YES, Field.Index.ANALYZED));
        return doc;
    }
    /**
     * Convert a Lucene Document back to an Entity.
     * @param doc
     * @return
     */
    public static Entity convertToEntity(Document doc) {
        Entity ci = new Entity();
        ci.setId(Integer.valueOf(doc.get("id")));
        ci.setType(doc.get("type"));
        ci.setVirtualDoc(doc.get(CI_CONTENT_FLAG));
        return ci;
    }
    /**
     * Convert a search hit to an Entity, including a highlighted summary.
     * @param doc
     * @param indexReader
     * @param docId
     * @param highlighter
     * @return
     * @throws IOException
     * @throws InvalidTokenOffsetsException
     */
    public static Entity convertToEntity(Document doc, IndexReader indexReader, int docId, Highlighter highlighter)
            throws IOException, InvalidTokenOffsetsException {
        Entity entity = convertToEntity(doc);
        String virtualDoc = entity.getVirtualDoc();
        TokenStream stream = TokenSources.getAnyTokenStream(indexReader, docId, CI_CONTENT_FLAG, doc, analyzer);
        String highlighterSummary = highlighter.getBestFragment(stream, virtualDoc);
        if (highlighterSummary == null) {
            // No fragment matched; fall back to the full virtual document.
            highlighterSummary = virtualDoc;
        }
        entity.setSummary(highlighterSummary);
        return entity;
    }
    /**
     * Add an index entry for the entity (any existing entry with the same id is deleted first).
     * @param entity
     */
    public static void addIndex(Entity entity) {
        try {
            deleteIndex(entity);
            Document doc = convertToDocument(entity);
            indexWriter.addDocument(doc);
            indexWriter.commit();
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
    /**
     * Add index entries in batch.
     * @param list
     */
    public static void addIndexs(List<Entity> list) {
        try {
            List<Document> docs = new ArrayList<Document>();
            deleteIndexs(list);
            for (Entity entity : list) {
                Document doc = convertToDocument(entity);
                docs.add(doc);
            }
            indexWriter.addDocuments(docs);
            indexWriter.commit();
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
    /**
     * Update the index entry for the entity.
     * @param entity
     */
    public static void updateIndex(Entity entity) {
        try {
            addIndex(entity);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
    /**
     * Delete the index entries for a list of entities.
     * @param list
     */
    public static void deleteIndexs(List<Entity> list) {
        try {
            int size = list.size();
            Term[] terms = new Term[size];
            for (int i = 0; i < size; i++) {
                terms[i] = new Term("id", list.get(i).getId().toString());
            }
            indexWriter.deleteDocuments(terms);
            indexWriter.commit();
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
    /**
     * Delete the index entry for the entity.
     * @param entity
     */
    public static void deleteIndex(Entity entity) {
        try {
            indexWriter.deleteDocuments(new Term("id", entity.getId().toString()));
            indexWriter.commit();
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
    /**
     * Delete all index entries of the given entity type.
     * @param type
     */
    public static void deleteIndexByType(String type) {
        try {
            indexWriter.deleteDocuments(new Term("type", type));
            indexWriter.commit();
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
    @Override
    protected void finalize() throws Throwable {
        indexWriter.close();
    }

    public static void main(String[] args) throws Exception {
        String queryStr = "http://mail6c1.shenzhenair.com";
        contentSearch(queryStr, true);
    }
}
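The main method above only runs a search against an existing index. For reference, a slightly fuller variant — a sketch under assumptions, not part of the original post: the index directory D://indexs must be writable, getVirtualDoc() must be implemented, and the sample field values are invented — that indexes one entity before searching:

    public static void main(String[] args) throws Exception {
        // Index one sample entity (hypothetical field values).
        Entity entity = new Entity();
        entity.setId(1);
        entity.setType("server");
        entity.setVirtualDoc("name:mail6c1.shenzhenair.com,ip:192.168.1.10");
        addIndex(entity);

        // Search with highlighting; matches in the summary are wrapped in
        // <SPAN style="color:red;"> ... </SPAN> by the SimpleHTMLFormatter above.
        contentSearch("mail6c1", true);
    }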