参考:

https://blog.csdn.net/u014209975/article/details/50525624

https://www.cnblogs.com/hanyinglong/p/5395600.html

http://lucene.apache.org/core/4_0_0/core/overview-summary.html

https://www.jianshu.com/p/0a2bbe0f4c42

依赖:

lucene-analyzers.jar
lucene-benchmark.jar
lucene-core.jar
lucene-highlighter.jar
lucene-memory.jar
lucene-parser.jar
lucene-remote.jar
lucene-smartcn.jar

实体类:

package com.h3c.lucence;

import java.io.Serializable;

/**
 * Search entity that is indexed into, and restored from, the Lucene index.
 *
 * <p>Besides its identifying fields it carries a "virtual document" (the text
 * used for full-text search), an optional highlighted summary fragment and the
 * relevance score assigned by a search.
 */
public class Entity implements Serializable {

    private static final long serialVersionUID = 3701082756628915138L;

    /** Opening tag the highlighter wraps around matched terms. */
    private static final String HIGHLIGHT_OPEN_TAG = "<SPAN style=\"color:red;\">";

    /** Closing tag the highlighter wraps around matched terms. */
    private static final String HIGHLIGHT_CLOSE_TAG = "</SPAN>";

    /** Marker added where the summary is cut out of a longer virtual document. */
    private static final String ELLIPSIS = "...";

    private Integer id;

    private String type;

    private String virtualDoc;

    private String summary;

    private float score;

    public Integer getId() {
        return id;
    }

    public void setId(Integer id) {
        this.id = id;
    }

    public String getType() {
        return type;
    }

    public void setType(String type) {
        this.type = type;
    }

    /**
     * Returns the virtual document used for full-text search.
     *
     * @return the stored virtual document, or {@code null} if none has been set
     */
    public String getVirtualDoc() {
        // TODO: when virtualDoc is null, build it from this entity's values,
        // covering every attribute and its value, for use in full-text search.
        // Format: field1:value1,field2:value2,...
        return virtualDoc;
    }

    public void setVirtualDoc(String virtualDoc) {
        this.virtualDoc = virtualDoc;
    }

    /**
     * Returns the highlighted summary, decorated with "..." on each side where
     * the fragment does not reach the corresponding end of the virtual document.
     *
     * <p>The position of the fragment is found by stripping the highlight tags
     * from the summary and locating the remaining text in the virtual document.
     *
     * @return {@code null} if no summary has been set; the raw summary if there
     *         is no virtual document or the fragment cannot be located in it;
     *         otherwise the summary with leading/trailing ellipses as needed
     */
    public String getSummary() {
        if (summary == null) {
            // Entities built without highlighting never receive a summary.
            return null;
        }
        String document = getVirtualDoc();
        if (document == null) {
            return summary;
        }

        String plainFragment = summary
                .replace(HIGHLIGHT_OPEN_TAG, "")
                .replace(HIGHLIGHT_CLOSE_TAG, "");
        int start = document.indexOf(plainFragment);
        if (start < 0) {
            // Position unknown: do not guess where the text was truncated.
            return summary;
        }

        StringBuilder decorated = new StringBuilder();
        if (start > 0) {
            decorated.append(ELLIPSIS);
        }
        decorated.append(summary);
        if (start + plainFragment.length() < document.length()) {
            decorated.append(ELLIPSIS);
        }
        return decorated.toString();
    }

    public void setSummary(String summary) {
        this.summary = summary;
    }

    public float getScore() {
        return score;
    }

    public void setScore(float score) {
        this.score = score;
    }
}

Demo类:

package com.h3c.lucence;

import java.io.Closeable;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern; import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.PrefixQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.WildcardQuery;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.InvalidTokenOffsetsException;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
import org.apache.lucene.search.highlight.SimpleSpanFragmenter;
import org.apache.lucene.search.highlight.TokenSources;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version; public class Demo {
/** lucene索引目录 */
private static Directory ciIndexDir; private static final String CI_CONTENT_FLAG = "virtualDoc"; /** 分词分析工具,使用标准分析工具,单个含字和连续的英文单词作为索引。 */
private static final Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_36); private static Pattern VALID_IPV4_PATTERN = null;
private static Pattern VALID_IPV6_PATTERN = null;
private static final String ipv4Pattern = "(([01]?\\d\\d?|2[0-4]\\d|25[0-5])\\.){3}([01]?\\d\\d?|2[0-4]\\d|25[0-5])";
private static final String ipv6Pattern = "([0-9a-f]{1,4}:){7}([0-9a-f]){1,4}"; private static IndexWriter indexWriter; static {
VALID_IPV4_PATTERN = Pattern.compile(ipv4Pattern, Pattern.CASE_INSENSITIVE);
VALID_IPV6_PATTERN = Pattern.compile(ipv6Pattern, Pattern.CASE_INSENSITIVE);
IndexWriterConfig conf = new IndexWriterConfig(Version.LUCENE_36, analyzer);
conf.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND);
try {
indexWriter = new IndexWriter(getCiIndexDir(), conf);
} catch (IOException e) {
e.printStackTrace();
}
} private static Directory getCiIndexDir() {
if (null == ciIndexDir) {
try {
ciIndexDir = FSDirectory.open(new File("D://indexs"));
} catch (IOException e) {
e.printStackTrace();
}
}
return ciIndexDir;
} private static boolean isIpAddress(String ipAddress) {
Matcher m1 = VALID_IPV4_PATTERN.matcher(ipAddress);
Matcher m2 = VALID_IPV6_PATTERN.matcher(ipAddress);
return m1.matches() || m2.matches();
} private static boolean isChinese(char c) {
Character.UnicodeBlock ub = Character.UnicodeBlock.of(c);
if (ub == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS || ub == Character.UnicodeBlock.CJK_COMPATIBILITY_IDEOGRAPHS
|| ub == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A || ub == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B
|| ub == Character.UnicodeBlock.CJK_SYMBOLS_AND_PUNCTUATION || ub == Character.UnicodeBlock.HALFWIDTH_AND_FULLWIDTH_FORMS
|| ub == Character.UnicodeBlock.GENERAL_PUNCTUATION) {
return true;
}
return false;
} private static BooleanQuery parseChineseCharacters(String inputString){
BooleanQuery query = new BooleanQuery();
if(isIpAddress(inputString)){
query.add(new TermQuery(new Term(CI_CONTENT_FLAG,inputString)), BooleanClause.Occur.MUST);
return query;
}
BooleanQuery fieldQuery = new BooleanQuery();
boolean isWord = false;
StringBuilder tempWord = new StringBuilder();
inputString = inputString.toLowerCase();
BooleanQuery booleanQuery = new BooleanQuery();
int length = inputString.length();
Query termQuery = null;
for(int i=0; i<length; i++){
char c = inputString.charAt(i);
if(c >= 'a' && c <= 'z' || c >= '0' && c <= '9'){//English character
isWord = true;
tempWord.append(c);
}
else{//Delimiter or Chinese character
isWord = false;
if(tempWord.length() > 0){
termQuery = new PrefixQuery(new Term(CI_CONTENT_FLAG,tempWord.toString()));
// booleanQuery.add(termQuery,BooleanClause.Occur.MUST);
booleanQuery.add(termQuery,BooleanClause.Occur.SHOULD);
tempWord = new StringBuilder();
}
}
if(!isWord){
termQuery = new TermQuery(new Term(CI_CONTENT_FLAG,String.valueOf(c)));
if(isChinese(c)){//Chinese character
// booleanQuery.add(termQuery,BooleanClause.Occur.MUST);
booleanQuery.add(termQuery,BooleanClause.Occur.SHOULD);
}
else{//Delimiter
booleanQuery.add(termQuery,BooleanClause.Occur.SHOULD);
} }
}
if(tempWord.length() > 0){
termQuery = new WildcardQuery(new Term(CI_CONTENT_FLAG,tempWord.toString()+"*"));
booleanQuery.add(termQuery,BooleanClause.Occur.SHOULD); termQuery = new WildcardQuery(new Term(CI_CONTENT_FLAG,"*" + tempWord.toString()));
booleanQuery.add(termQuery,BooleanClause.Occur.SHOULD);
} // Begin 处理全局字段匹配
termQuery = new WildcardQuery(new Term(CI_CONTENT_FLAG,inputString+"*"));
booleanQuery.add(termQuery,BooleanClause.Occur.SHOULD); termQuery = new WildcardQuery(new Term(CI_CONTENT_FLAG,"*" + inputString));
booleanQuery.add(termQuery,BooleanClause.Occur.SHOULD); termQuery = new WildcardQuery(new Term(CI_CONTENT_FLAG,"*" + inputString + "*"));
booleanQuery.add(termQuery,BooleanClause.Occur.SHOULD);
// End 处理全局字段匹配 BooleanClause clause = new BooleanClause(booleanQuery, BooleanClause.Occur.MUST);
fieldQuery.add(clause); BooleanClause fieldClause = new BooleanClause(fieldQuery, BooleanClause.Occur.MUST);
query.add(fieldClause); return query;
} /**
* 全文检索
* @param queryStr
* @throws Exception
*/
private static void contentSearch(String queryStr, boolean highlight) throws Exception {
IndexReader indexReader = null;
IndexSearcher indexSearcher = null;
try {
indexReader = IndexReader.open(getCiIndexDir());
indexSearcher = new IndexSearcher(indexReader); //组合查询条件,需要根据业务自己定义
Query query = parseChineseCharacters(queryStr); TopDocs hits = indexSearcher.search(query, Integer.MAX_VALUE);
if(hits.totalHits > 0) {
if (highlight) {
QueryScorer scorer = new QueryScorer(query, CI_CONTENT_FLAG);
SimpleHTMLFormatter formatter = new SimpleHTMLFormatter("<SPAN style=\"color:red;\">", "</SPAN>");
Highlighter highlighter = new Highlighter(formatter, scorer);
highlighter
.setTextFragmenter(new SimpleSpanFragmenter(scorer, 100)); for (ScoreDoc scoreDoc : hits.scoreDocs) {
Document doc = indexSearcher.doc(scoreDoc.doc);
System.out.println(doc.get("virtualDoc"));
Entity entity = null;
entity = convertToEntity(doc, indexSearcher.getIndexReader(), scoreDoc.doc, highlighter);
entity.setScore(scoreDoc.score);
}
} else {
for (ScoreDoc scoreDoc : hits.scoreDocs) {
Document doc = indexSearcher.doc(scoreDoc.doc);
System.out.println(doc.get("virtualDoc"));
Entity entity = null;
entity = convertToEntity(doc);
entity.setScore(scoreDoc.score);
}
}
}
} catch (IOException ioe) {
ioe.printStackTrace();
} finally {
close(indexSearcher);
close(indexReader);
}
} /**
* 对实现Closeable接口的统一关闭
* @param object
*/
private static void close(Closeable object) {
if(null != object) {
try {
object.close();
} catch (IOException e) {
}
}
} /**
* 实体转换为Doc
* @param entity
* @return
*/
public static Document convertToDocument(Entity entity) {
Document doc = new Document();
String virtualDoc = entity.getVirtualDoc();
//Field.Store.Yes存储,Field.Index.ANALYZED分词
doc.add(new Field("id", String.valueOf(entity.getId()), Field.Store.YES, Field.Index.NOT_ANALYZED));
doc.add(new Field("type", entity.getType(), Field.Store.YES, Field.Index.NOT_ANALYZED));
doc.add(new Field(CI_CONTENT_FLAG, null == virtualDoc ? " " : virtualDoc, Field.Store.YES, Field.Index.ANALYZED));
return doc;
} /**
* Doc转换为实体
* @param doc
* @return
*/
public static Entity convertToEntity(Document doc) {
Entity ci = new Entity();
ci.setId(Integer.valueOf(doc.get("id")));
ci.setType(doc.get("type"));
ci.setVirtualDoc(doc.get(CI_CONTENT_FLAG));
return ci;
} /**
* 检索Entity,含高亮信息
* @param doc
* @param indexReader
* @param docId
* @param highlighter
* @return
* @throws IOException
* @throws InvalidTokenOffsetsException
*/
public static Entity convertToEntity(Document doc, IndexReader indexReader, int docId, Highlighter highlighter)
throws IOException, InvalidTokenOffsetsException { Entity entity = convertToEntity(doc);
String virtualDoc = entity.getVirtualDoc();
TokenStream stream = TokenSources.getAnyTokenStream(indexReader, docId, CI_CONTENT_FLAG, doc, analyzer);
String highlighterSummary = highlighter.getBestFragment(stream, virtualDoc);
if(highlighterSummary == null){
highlighterSummary = virtualDoc;
}
entity.setSummary(highlighterSummary); return entity;
} /**
* 给entity信息增加索引
* @param entity
*/
public static void addIndex(Entity entity) {
try {
deleteIndex(entity);
Document doc = convertToDocument(entity);
indexWriter.addDocument(doc);
indexWriter.commit();
} catch (Exception e) {
e.printStackTrace();
}
} /**
* 批量增加索引
* @param list
*/
public static void addIndexs(List<Entity> list) {
try {
List<Document> docs = new ArrayList<Document>();
deleteIndexs(list);
for (Entity entity : list) {
Document doc = convertToDocument(entity);
docs.add(doc);
}
indexWriter.addDocuments(docs);
indexWriter.commit();
} catch (Exception e) {
e.printStackTrace();
}
} /**
* 给实体信息更新索引
* @param entity
*/
public static void updateIndex(Entity entity) {
try {
addIndex(entity);
} catch (Exception e) {
e.printStackTrace();
}
} /**
* 删除entity列表信息对应的索引
* @param entity
*/
public static void deleteIndexs(List<Entity> list) {
try {
int size = list.size();
Term[] terms = new Term[size];
for(int i=0; i<size; i++) {
terms[i] = new Term("id", list.get(i).getId().toString());
}
indexWriter.deleteDocuments(terms);
indexWriter.commit();
} catch (Exception e) {
e.printStackTrace();
}
} /**
* 删除实体信息对应的索引
* @param entity
*/
public static void deleteIndex(Entity entity) {
try {
indexWriter.deleteDocuments(new Term("id", entity.getId().toString()));
indexWriter.commit();
} catch (Exception e) {
e.printStackTrace();
}
} /**
* 删除实体类型对应的所以索引信息
* @param type
*/
public static void deleteIndexByType(String type) {
try {
indexWriter.deleteDocuments(new Term("type", type));
indexWriter.commit();
} catch (Exception e) {
e.printStackTrace();
}
} @Override
protected void finalize() throws Throwable {
indexWriter.close();
} public static void main(String[] args) throws Exception {
String queryStr = "http://mail6c1.shenzhenair.com";
contentSearch(queryStr, true);
}
}

Lucene使用入门的更多相关文章

  1. 【solr专题之一】Solr快速入门 分类: H4_SOLR/LUCENCE 2014-07-02 14:59 2403人阅读 评论(0) 收藏

    一.Solr学习相关资料 1.官方材料 (1)快速入门:http://lucene.apache.org/solr/4_9_0/tutorial.html,以自带的example项目快速介绍发Solr ...

  2. Google Guava入门(一)

    Guava作为Java编程的助手,可以提升开发效率,对Guava设计思想的学习则极大的有益于今后的编程之路.故在此对<Getting Started with Google Guava>一 ...

  3. Elasticsearch7.X 入门学习第一课笔记----基本概念

    原文:Elasticsearch7.X 入门学习第一课笔记----基本概念 版权声明:本文为博主原创文章,遵循CC 4.0 BY-SA版权协议,转载请附上原文出处链接和本声明. 本文链接:https: ...

  4. Lucene从入门到实战

    Lucene 在了解Lucene之前,我们先了解下全文数据查询. 全文数据查询 我们的数据一般分为两种:结构化数据和非结构化数据 结构化数据:有固定格式或有限长度的数据,如数据库中的数据.元数据 非结 ...

  5. Angular2入门系列教程7-HTTP(一)-使用Angular2自带的http进行网络请求

    上一篇:Angular2入门系列教程6-路由(二)-使用多层级路由并在在路由中传递复杂参数 感觉这篇不是很好写,因为涉及到网络请求,如果采用真实的网络请求,这个例子大家拿到手估计还要自己写一个web ...

  6. ABP入门系列(1)——学习Abp框架之实操演练

    作为.Net工地搬砖长工一名,一直致力于挖坑(Bug)填坑(Debug),但技术却不见长进.也曾热情于新技术的学习,憧憬过成为技术大拿.从前端到后端,从bootstrap到javascript,从py ...

  7. Oracle分析函数入门

    一.Oracle分析函数入门 分析函数是什么?分析函数是Oracle专门用于解决复杂报表统计需求的功能强大的函数,它可以在数据中进行分组然后计算基于组的某种统计值,并且每一组的每一行都可以返回一个统计 ...

  8. Angular2入门系列教程6-路由(二)-使用多层级路由并在在路由中传递复杂参数

    上一篇:Angular2入门系列教程5-路由(一)-使用简单的路由并在在路由中传递参数 之前介绍了简单的路由以及传参,这篇文章我们将要学习复杂一些的路由以及传递其他附加参数.一个好的路由系统可以使我们 ...

  9. Angular2入门系列教程5-路由(一)-使用简单的路由并在在路由中传递参数

    上一篇:Angular2入门系列教程-服务 上一篇文章我们将Angular2的数据服务分离出来,学习了Angular2的依赖注入,这篇文章我们将要学习Angular2的路由 为了编写样式方便,我们这篇 ...

随机推荐

  1. WebSocket 网页聊天室

    先给大家开一个原始的websocket的连接使用范例 <?php /* * recv是从套接口接收数据,也就是拿过来,但是不知道是什么 * read是读取拿过来的数据,就是要知道recv过来的是 ...

  2. Flask-Script

    介绍及安装 Flask-Script是一个让你的命令行支持自定义命令的工具,它为Flask程序添加一个命令行解释器.可以让我们的程序从命令行直接执行相应的程序. 安装 pip install Flas ...

  3. top查看进程的参数

    top命令是Linux下常用的性能分析工具,能够实时显示系统中各个进程的资源占用状况,类似于Windows的任务管理器. top显示系统当前的进程和其他状况,是一个动态显示过程,即可以通过用户按键来不 ...

  4. 一个简化的插件框架c#

    利用MEF实现插件加载. 定义了一套接口,分别实现插件主界面,插件,业务插件等. 整套加载完全使用MEF2. 所有插件分开,包括主界面也是插件实现. 用一个应用程序,只有Main和插件加载方法.我管它 ...

  5. VoIP系统大盘点

    一.VoIP拓扑 PBX是程控交换机,程控交换机有实体交换机和软件模拟的交换机. 软件模拟的交换机,即交换机服务器,常用开源的sip服务器有asterisk,freepbx, opensip, fre ...

  6. Sass--伪类嵌套

    其实伪类嵌套和属性嵌套非常类似,只不过他需要借助`&`符号一起配合使用. a { &:link, &:visited { color: blue; } &:hover ...

  7. 启动ABP项目

    1.在官网下载ABP项目 2.打开项目选择解决方案,右击还原NuGet包 3.修改appsettings.json中的ConnectionStrings 例子"ConnectionStrin ...

  8. git 初始化提交项目

    Git初始化本地已有项目,并推送到远端Git仓库操作1. 创建本地项目,在项目根目录执行git init命令git init 2. 在git服务器上创建一个仓库,这里使用GitHub创建一个仓库.例如 ...

  9. python 在图像上写中文字体 (python write Chinese in image)

    本人处理图像的时候经常使用opencv的包,但是 cv2.putText 显示不了中文,所以查找了如何在python在图像上写中文的方法,在伟大的Stack Overflow上面找到一个方法,分享给大 ...

  10. python--前端之CSS

    CSS产生背景: 为了让网页元素的样式更加丰富,也为了让网页的内容和样式能拆分开,CSS由此思想而诞生,CSS是 Cascading Style Sheets 的首字母缩写,意思是层叠样式表. 有了C ...