参考:

https://blog.csdn.net/u014209975/article/details/50525624

https://www.cnblogs.com/hanyinglong/p/5395600.html

http://lucene.apache.org/core/4_0_0/core/overview-summary.html

https://www.jianshu.com/p/0a2bbe0f4c42

依赖:

lucene-analyzers.jar
lucene-benchmark.jar
lucene-core.jar
lucene-highlighter.jar
lucene-memory.jar
lucene-parser.jar
lucene-remote.jar
lucene-smartcn.jar

实体类:

package com.h3c.lucence;

import java.io.Serializable;

/**
 * Search entity that is written to and restored from the Lucene index.
 *
 * <p>Besides its identity ({@code id}, {@code type}) it carries the "virtual document"
 * text used for full-text search, an optional highlighted summary fragment and the
 * relevance score of the hit.
 */
public class Entity implements Serializable {

    private static final long serialVersionUID = 3701082756628915138L;

    /** Opening highlight tag removed from the summary before it is located in the virtual document. */
    private static final String HIGHLIGHT_OPEN = "<SPAN style=\"color:red;\">";

    /** Closing highlight tag removed from the summary before it is located in the virtual document. */
    private static final String HIGHLIGHT_CLOSE = "</SPAN>";

    /** Marker added where the summary does not reach the start/end of the virtual document. */
    private static final String ELLIPSIS = "...";

    private Integer id;

    private String type;

    private String virtualDoc;

    private String summary;

    private float score;

    public Integer getId() {
        return id;
    }

    public void setId(Integer id) {
        this.id = id;
    }

    public String getType() {
        return type;
    }

    public void setType(String type) {
        this.type = type;
    }

    /**
     * Returns the virtual document text, or {@code null} if none has been set.
     *
     * @return the virtual document, possibly {@code null}
     */
    public String getVirtualDoc() {
        if (null == virtualDoc) {
            // TODO build the virtual document from this entity's values, covering every
            // attribute and its value, so it can be used for full-text search.
            // Format: field1:value1,field2:value2,...
        }
        return virtualDoc;
    }

    public void setVirtualDoc(String virtualDoc) {
        this.virtualDoc = virtualDoc;
    }

    /**
     * Returns the summary, decorated with {@code "..."} on each side where the fragment
     * does not touch the corresponding end of the virtual document.
     *
     * <p>The highlight tags are stripped only to find the fragment's position; the
     * returned text still contains them.
     *
     * @return {@code null} when no summary is set; the raw summary when there is no
     *         virtual document or the fragment cannot be located in it; otherwise the
     *         summary with leading/trailing ellipses as needed
     */
    public String getSummary() {
        if (summary == null) {
            // Entities restored without highlighting never get a summary.
            return null;
        }
        String document = getVirtualDoc();
        if (document == null) {
            return summary;
        }
        String plain = summary.replace(HIGHLIGHT_OPEN, "").replace(HIGHLIGHT_CLOSE, "");
        int start = document.indexOf(plain);
        if (start < 0) {
            // Position unknown, so no statement about truncation can be made.
            return summary;
        }
        StringBuilder sb = new StringBuilder();
        if (start > 0) {
            sb.append(ELLIPSIS);
        }
        sb.append(summary);
        if (start + plain.length() < document.length()) {
            sb.append(ELLIPSIS);
        }
        return sb.toString();
    }

    public void setSummary(String summary) {
        this.summary = summary;
    }

    public float getScore() {
        return score;
    }

    public void setScore(float score) {
        this.score = score;
    }
}

Demo类:

package com.h3c.lucence;

import java.io.Closeable;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern; import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.PrefixQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.WildcardQuery;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.InvalidTokenOffsetsException;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
import org.apache.lucene.search.highlight.SimpleSpanFragmenter;
import org.apache.lucene.search.highlight.TokenSources;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version; public class Demo {
/** lucene索引目录 */
private static Directory ciIndexDir; private static final String CI_CONTENT_FLAG = "virtualDoc"; /** 分词分析工具,使用标准分析工具,单个含字和连续的英文单词作为索引。 */
private static final Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_36); private static Pattern VALID_IPV4_PATTERN = null;
private static Pattern VALID_IPV6_PATTERN = null;
private static final String ipv4Pattern = "(([01]?\\d\\d?|2[0-4]\\d|25[0-5])\\.){3}([01]?\\d\\d?|2[0-4]\\d|25[0-5])";
private static final String ipv6Pattern = "([0-9a-f]{1,4}:){7}([0-9a-f]){1,4}"; private static IndexWriter indexWriter; static {
VALID_IPV4_PATTERN = Pattern.compile(ipv4Pattern, Pattern.CASE_INSENSITIVE);
VALID_IPV6_PATTERN = Pattern.compile(ipv6Pattern, Pattern.CASE_INSENSITIVE);
IndexWriterConfig conf = new IndexWriterConfig(Version.LUCENE_36, analyzer);
conf.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND);
try {
indexWriter = new IndexWriter(getCiIndexDir(), conf);
} catch (IOException e) {
e.printStackTrace();
}
} private static Directory getCiIndexDir() {
if (null == ciIndexDir) {
try {
ciIndexDir = FSDirectory.open(new File("D://indexs"));
} catch (IOException e) {
e.printStackTrace();
}
}
return ciIndexDir;
} private static boolean isIpAddress(String ipAddress) {
Matcher m1 = VALID_IPV4_PATTERN.matcher(ipAddress);
Matcher m2 = VALID_IPV6_PATTERN.matcher(ipAddress);
return m1.matches() || m2.matches();
} private static boolean isChinese(char c) {
Character.UnicodeBlock ub = Character.UnicodeBlock.of(c);
if (ub == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS || ub == Character.UnicodeBlock.CJK_COMPATIBILITY_IDEOGRAPHS
|| ub == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A || ub == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B
|| ub == Character.UnicodeBlock.CJK_SYMBOLS_AND_PUNCTUATION || ub == Character.UnicodeBlock.HALFWIDTH_AND_FULLWIDTH_FORMS
|| ub == Character.UnicodeBlock.GENERAL_PUNCTUATION) {
return true;
}
return false;
} private static BooleanQuery parseChineseCharacters(String inputString){
BooleanQuery query = new BooleanQuery();
if(isIpAddress(inputString)){
query.add(new TermQuery(new Term(CI_CONTENT_FLAG,inputString)), BooleanClause.Occur.MUST);
return query;
}
BooleanQuery fieldQuery = new BooleanQuery();
boolean isWord = false;
StringBuilder tempWord = new StringBuilder();
inputString = inputString.toLowerCase();
BooleanQuery booleanQuery = new BooleanQuery();
int length = inputString.length();
Query termQuery = null;
for(int i=0; i<length; i++){
char c = inputString.charAt(i);
if(c >= 'a' && c <= 'z' || c >= '0' && c <= '9'){//English character
isWord = true;
tempWord.append(c);
}
else{//Delimiter or Chinese character
isWord = false;
if(tempWord.length() > 0){
termQuery = new PrefixQuery(new Term(CI_CONTENT_FLAG,tempWord.toString()));
// booleanQuery.add(termQuery,BooleanClause.Occur.MUST);
booleanQuery.add(termQuery,BooleanClause.Occur.SHOULD);
tempWord = new StringBuilder();
}
}
if(!isWord){
termQuery = new TermQuery(new Term(CI_CONTENT_FLAG,String.valueOf(c)));
if(isChinese(c)){//Chinese character
// booleanQuery.add(termQuery,BooleanClause.Occur.MUST);
booleanQuery.add(termQuery,BooleanClause.Occur.SHOULD);
}
else{//Delimiter
booleanQuery.add(termQuery,BooleanClause.Occur.SHOULD);
} }
}
if(tempWord.length() > 0){
termQuery = new WildcardQuery(new Term(CI_CONTENT_FLAG,tempWord.toString()+"*"));
booleanQuery.add(termQuery,BooleanClause.Occur.SHOULD); termQuery = new WildcardQuery(new Term(CI_CONTENT_FLAG,"*" + tempWord.toString()));
booleanQuery.add(termQuery,BooleanClause.Occur.SHOULD);
} // Begin 处理全局字段匹配
termQuery = new WildcardQuery(new Term(CI_CONTENT_FLAG,inputString+"*"));
booleanQuery.add(termQuery,BooleanClause.Occur.SHOULD); termQuery = new WildcardQuery(new Term(CI_CONTENT_FLAG,"*" + inputString));
booleanQuery.add(termQuery,BooleanClause.Occur.SHOULD); termQuery = new WildcardQuery(new Term(CI_CONTENT_FLAG,"*" + inputString + "*"));
booleanQuery.add(termQuery,BooleanClause.Occur.SHOULD);
// End 处理全局字段匹配 BooleanClause clause = new BooleanClause(booleanQuery, BooleanClause.Occur.MUST);
fieldQuery.add(clause); BooleanClause fieldClause = new BooleanClause(fieldQuery, BooleanClause.Occur.MUST);
query.add(fieldClause); return query;
} /**
* 全文检索
* @param queryStr
* @throws Exception
*/
private static void contentSearch(String queryStr, boolean highlight) throws Exception {
IndexReader indexReader = null;
IndexSearcher indexSearcher = null;
try {
indexReader = IndexReader.open(getCiIndexDir());
indexSearcher = new IndexSearcher(indexReader); //组合查询条件,需要根据业务自己定义
Query query = parseChineseCharacters(queryStr); TopDocs hits = indexSearcher.search(query, Integer.MAX_VALUE);
if(hits.totalHits > 0) {
if (highlight) {
QueryScorer scorer = new QueryScorer(query, CI_CONTENT_FLAG);
SimpleHTMLFormatter formatter = new SimpleHTMLFormatter("<SPAN style=\"color:red;\">", "</SPAN>");
Highlighter highlighter = new Highlighter(formatter, scorer);
highlighter
.setTextFragmenter(new SimpleSpanFragmenter(scorer, 100)); for (ScoreDoc scoreDoc : hits.scoreDocs) {
Document doc = indexSearcher.doc(scoreDoc.doc);
System.out.println(doc.get("virtualDoc"));
Entity entity = null;
entity = convertToEntity(doc, indexSearcher.getIndexReader(), scoreDoc.doc, highlighter);
entity.setScore(scoreDoc.score);
}
} else {
for (ScoreDoc scoreDoc : hits.scoreDocs) {
Document doc = indexSearcher.doc(scoreDoc.doc);
System.out.println(doc.get("virtualDoc"));
Entity entity = null;
entity = convertToEntity(doc);
entity.setScore(scoreDoc.score);
}
}
}
} catch (IOException ioe) {
ioe.printStackTrace();
} finally {
close(indexSearcher);
close(indexReader);
}
} /**
* 对实现Closeable接口的统一关闭
* @param object
*/
private static void close(Closeable object) {
if(null != object) {
try {
object.close();
} catch (IOException e) {
}
}
} /**
* 实体转换为Doc
* @param entity
* @return
*/
public static Document convertToDocument(Entity entity) {
Document doc = new Document();
String virtualDoc = entity.getVirtualDoc();
//Field.Store.Yes存储,Field.Index.ANALYZED分词
doc.add(new Field("id", String.valueOf(entity.getId()), Field.Store.YES, Field.Index.NOT_ANALYZED));
doc.add(new Field("type", entity.getType(), Field.Store.YES, Field.Index.NOT_ANALYZED));
doc.add(new Field(CI_CONTENT_FLAG, null == virtualDoc ? " " : virtualDoc, Field.Store.YES, Field.Index.ANALYZED));
return doc;
} /**
* Doc转换为实体
* @param doc
* @return
*/
public static Entity convertToEntity(Document doc) {
Entity ci = new Entity();
ci.setId(Integer.valueOf(doc.get("id")));
ci.setType(doc.get("type"));
ci.setVirtualDoc(doc.get(CI_CONTENT_FLAG));
return ci;
} /**
* 检索Entity,含高亮信息
* @param doc
* @param indexReader
* @param docId
* @param highlighter
* @return
* @throws IOException
* @throws InvalidTokenOffsetsException
*/
public static Entity convertToEntity(Document doc, IndexReader indexReader, int docId, Highlighter highlighter)
throws IOException, InvalidTokenOffsetsException { Entity entity = convertToEntity(doc);
String virtualDoc = entity.getVirtualDoc();
TokenStream stream = TokenSources.getAnyTokenStream(indexReader, docId, CI_CONTENT_FLAG, doc, analyzer);
String highlighterSummary = highlighter.getBestFragment(stream, virtualDoc);
if(highlighterSummary == null){
highlighterSummary = virtualDoc;
}
entity.setSummary(highlighterSummary); return entity;
} /**
* 给entity信息增加索引
* @param entity
*/
public static void addIndex(Entity entity) {
try {
deleteIndex(entity);
Document doc = convertToDocument(entity);
indexWriter.addDocument(doc);
indexWriter.commit();
} catch (Exception e) {
e.printStackTrace();
}
} /**
* 批量增加索引
* @param list
*/
public static void addIndexs(List<Entity> list) {
try {
List<Document> docs = new ArrayList<Document>();
deleteIndexs(list);
for (Entity entity : list) {
Document doc = convertToDocument(entity);
docs.add(doc);
}
indexWriter.addDocuments(docs);
indexWriter.commit();
} catch (Exception e) {
e.printStackTrace();
}
} /**
* 给实体信息更新索引
* @param entity
*/
public static void updateIndex(Entity entity) {
try {
addIndex(entity);
} catch (Exception e) {
e.printStackTrace();
}
} /**
* 删除entity列表信息对应的索引
* @param entity
*/
public static void deleteIndexs(List<Entity> list) {
try {
int size = list.size();
Term[] terms = new Term[size];
for(int i=0; i<size; i++) {
terms[i] = new Term("id", list.get(i).getId().toString());
}
indexWriter.deleteDocuments(terms);
indexWriter.commit();
} catch (Exception e) {
e.printStackTrace();
}
} /**
* 删除实体信息对应的索引
* @param entity
*/
public static void deleteIndex(Entity entity) {
try {
indexWriter.deleteDocuments(new Term("id", entity.getId().toString()));
indexWriter.commit();
} catch (Exception e) {
e.printStackTrace();
}
} /**
* 删除实体类型对应的所以索引信息
* @param type
*/
public static void deleteIndexByType(String type) {
try {
indexWriter.deleteDocuments(new Term("type", type));
indexWriter.commit();
} catch (Exception e) {
e.printStackTrace();
}
} @Override
protected void finalize() throws Throwable {
indexWriter.close();
} public static void main(String[] args) throws Exception {
String queryStr = "http://mail6c1.shenzhenair.com";
contentSearch(queryStr, true);
}
}

Lucene使用入门的更多相关文章

  1. 【solr专题之一】Solr快速入门 分类: H4_SOLR/LUCENCE 2014-07-02 14:59 2403人阅读 评论(0) 收藏

    一.Solr学习相关资料 1.官方材料 (1)快速入门:http://lucene.apache.org/solr/4_9_0/tutorial.html,以自带的example项目快速介绍发Solr ...

  2. Google Guava入门(一)

    Guava作为Java编程的助手,可以提升开发效率,对Guava设计思想的学习则极大的有益于今后的编程之路.故在此对<Getting Started with Google Guava>一 ...

  3. Elasticsearch7.X 入门学习第一课笔记----基本概念

    原文:Elasticsearch7.X 入门学习第一课笔记----基本概念 版权声明:本文为博主原创文章,遵循CC 4.0 BY-SA版权协议,转载请附上原文出处链接和本声明. 本文链接:https: ...

  4. Lucene从入门到实战

    Lucene 在了解Lucene之前,我们先了解下全文数据查询. 全文数据查询 我们的数据一般分为两种:结构化数据和非结构化数据 结构化数据:有固定格式或有限长度的数据,如数据库中的数据.元数据 非结 ...

  5. Angular2入门系列教程7-HTTP(一)-使用Angular2自带的http进行网络请求

    上一篇:Angular2入门系列教程6-路由(二)-使用多层级路由并在在路由中传递复杂参数 感觉这篇不是很好写,因为涉及到网络请求,如果采用真实的网络请求,这个例子大家拿到手估计还要自己写一个web ...

  6. ABP入门系列(1)——学习Abp框架之实操演练

    作为.Net工地搬砖长工一名,一直致力于挖坑(Bug)填坑(Debug),但技术却不见长进.也曾热情于新技术的学习,憧憬过成为技术大拿.从前端到后端,从bootstrap到javascript,从py ...

  7. Oracle分析函数入门

    一.Oracle分析函数入门 分析函数是什么?分析函数是Oracle专门用于解决复杂报表统计需求的功能强大的函数,它可以在数据中进行分组然后计算基于组的某种统计值,并且每一组的每一行都可以返回一个统计 ...

  8. Angular2入门系列教程6-路由(二)-使用多层级路由并在在路由中传递复杂参数

    上一篇:Angular2入门系列教程5-路由(一)-使用简单的路由并在在路由中传递参数 之前介绍了简单的路由以及传参,这篇文章我们将要学习复杂一些的路由以及传递其他附加参数.一个好的路由系统可以使我们 ...

  9. Angular2入门系列教程5-路由(一)-使用简单的路由并在在路由中传递参数

    上一篇:Angular2入门系列教程-服务 上一篇文章我们将Angular2的数据服务分离出来,学习了Angular2的依赖注入,这篇文章我们将要学习Angular2的路由 为了编写样式方便,我们这篇 ...

随机推荐

  1. mybatis关联查询之一对一查询

    一对一也就是 A 表的一条记录对应 B 表的一条记录,下面的测试数据中,从employee 表来看,一个员工对应一个部门,是一对一关系,如果从部门角度来看,则是一对多的关系,一个部门对应多个员工,本节 ...

  2. thinkphp中的exp查询

    今天遇到一个问题,就是在vendor表中查询出vendor_id = vendor_f_id的数据,其实使用原生的sql语句是非常简单的: select * from vendor where ven ...

  3. HDU 6470 /// 矩阵快速幂

    题目大意: f[1]=1 f[2]=2 f[n]=f[n-1]+2*f[n-2]+n^3 在某博客截的图 现在忘记原博位置了 抱歉 根据递推式1和递推式3构造出两个矩阵 #include <bi ...

  4. sublime中使用插件anaconda而在代码中出现方框

    这个标志是说不符合PEP8标准,比如使用了Tab做缩进:一行过长等问题. 可以在可以在 Sublime > Preferences > Package Settings > Anac ...

  5. saltstack基本操作第一篇章

    一.安装saltstack 1)官网安装 http://repo.saltstack.com/#rhel saltstack的模块:   https://www.unixhot.com/docs/sa ...

  6. 选择 NoSQL 需要考虑的 10 个问题

    那么我为什么要写这篇文章呢? 是因为我认为NoSQL解决方案不如RDBMS解决方案吗?当然不! 是因为我专注于SQL的做事方式,而不想陷入一种相对较新的技术的不确定性吗?不,也不是!事实上,我非常兴奋 ...

  7. vue证明题五,组件传值与绑定

    上文中写了一个input组件,该组件需要复用,但是并不是每个组件都相同的 比如我定义了一个组件,是个矿泉水瓶子,这个瓶子分为大中小三个号,定义了三种瓶子的容积,定义了必须有瓶盖,瓶口,瓶子质地 但是瓶 ...

  8. dotNET面试(三)

    1.简述 private. protected. public. internal 修饰符的访问权限.private : 私有成员, 在类的内部才可以访问 ,也就是类内部的函数等成员可以访问.prot ...

  9. go语言从例子开始之Example11.range遍历

    range 迭代各种各样的数据结构.让我们来看看如何在我们已经学过的数据结构上使用 range 吧. package main import "fmt" func main() { ...

  10. IPv6 关于路由器配置静态IPv6路由的命令

    今天在学习路由器配置ipv6 的时候遇到了一点疑惑 一条命令为:ipv6 route FE80:0202::/32 serail 0/1 201 一条命令为:ipv6 route FE80:0202: ...