Lucene 自动补全
package com.pera.suggestion;
import java.io.IOException;
import java.io.Reader;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.ISOLatin1AccentFilter;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.ngram.EdgeNGramTokenFilter;
import org.apache.lucene.analysis.ngram.EdgeNGramTokenFilter.Side;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.spell.LuceneDictionary;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
/**
 * Prefix-based term suggester backed by a dedicated Lucene index.
 *
 * <p>{@link #reIndex(Directory, String)} reads every term of one field from a
 * source index and writes one document per term into the autocomplete index.
 * Each document carries the original term, its front edge n-grams (the
 * searchable prefixes) and the number of source documents containing the term.
 * {@link #suggestTermsFor(String)} then looks the typed prefix up as a single
 * gram and returns the matching original terms, most frequent first.
 *
 * <p>This class does no synchronization of its own; callers that share an
 * instance between threads must coordinate calls to {@code reIndex}.
 */
public class Sugesstion {
    private static final String GRAMMED_WORDS_FIELD = "words";
    private static final String SOURCE_WORD_FIELD = "sourceWord";
    private static final String COUNT_FIELD = "count";
    private static final String[] ENGLISH_STOP_WORDS = {
        "a", "an", "and", "are", "as", "at", "be", "but", "by",
        "for", "i", "if", "in", "into", "is",
        "no", "not", "of", "on", "or", "s", "such",
        "t", "that", "the", "their", "then", "there", "these",
        "they", "this", "to", "was", "will", "with"
    };

    /** Maximum number of suggestions returned per lookup. */
    private static final int MAX_SUGGESTIONS = 5;
    /** Source terms shorter than this are not indexed. */
    private static final int MIN_WORD_LENGTH = 3;
    /** Shortest and longest prefix (in characters) emitted per term. */
    private static final int MIN_GRAM_SIZE = 1;
    private static final int MAX_GRAM_SIZE = 20;

    private final Directory autoCompleteDirectory;
    private IndexReader autoCompleteReader;
    private IndexSearcher autoCompleteSearcher;

    /**
     * Opens the autocomplete index stored at the given file-system path.
     *
     * @param autoCompleteDir path of the autocomplete index directory
     * @throws IOException if the index cannot be opened
     */
    public Sugesstion(String autoCompleteDir) throws IOException {
        this.autoCompleteDirectory = FSDirectory.getDirectory(autoCompleteDir,
                null);
        reOpenReader();
    }

    /**
     * Returns up to five indexed terms that start with the given prefix,
     * ordered by descending document count.
     *
     * @param term the prefix typed so far; matched case-insensitively
     * @return the suggested terms, or an empty list when {@code term} is
     *         {@code null}, empty, or matches nothing
     * @throws IOException if the autocomplete index cannot be read
     */
    public List<String> suggestTermsFor(String term) throws IOException {
        List<String> suggestions = new ArrayList<String>();
        if (term == null || term.length() == 0) {
            return suggestions;
        }
        // A TermQuery is not analyzed, while the indexed grams went through
        // LowerCaseFilter; lowercase here so "Ste" matches the same as "ste".
        String prefix = term.toLowerCase(Locale.ENGLISH);
        Query query = new TermQuery(new Term(GRAMMED_WORDS_FIELD, prefix));
        Sort sort = new Sort(COUNT_FIELD, true); // true = descending
        TopDocs docs = autoCompleteSearcher.search(query, null,
                MAX_SUGGESTIONS, sort);
        for (ScoreDoc doc : docs.scoreDocs) {
            suggestions.add(autoCompleteReader.document(doc.doc).get(
                    SOURCE_WORD_FIELD));
        }
        return suggestions;
    }

    /**
     * Rebuilds the autocomplete index from the terms of one field of a source
     * index, then refreshes the reader used for lookups. The writer is opened
     * with its create flag set, so the previous autocomplete index content is
     * replaced.
     *
     * @param sourceDirectory     index whose terms are harvested
     * @param fieldToAutocomplete name of the field supplying the terms
     * @throws CorruptIndexException if either index is corrupt
     * @throws IOException           on any other index I/O failure
     * @throws IllegalStateException if the source dictionary yields the same
     *                               term twice
     */
    public void reIndex(Directory sourceDirectory, String fieldToAutocomplete)
            throws CorruptIndexException, IOException {
        IndexReader sourceReader = IndexReader.open(sourceDirectory);
        try {
            // Forcibly release a write lock left behind by an earlier,
            // interrupted run (same approach as
            // org.apache.lucene.search.spell.SpellChecker.indexDictionary).
            IndexReader.unlock(autoCompleteDirectory);
            IndexWriter writer = new IndexWriter(autoCompleteDirectory,
                    newEdgeNGramAnalyzer(), true);
            try {
                writer.setMergeFactor(300);
                writer.setMaxBufferedDocs(150);
                Map<String, Integer> wordsMap = collectWordCounts(
                        sourceReader, fieldToAutocomplete);
                for (Map.Entry<String, Integer> entry : wordsMap.entrySet()) {
                    writer.addDocument(toDocument(entry.getKey(),
                            entry.getValue()));
                }
                writer.optimize();
            } finally {
                // Always close so the write lock is released on failure too.
                writer.close();
            }
        } finally {
            sourceReader.close();
        }
        reOpenReader();
    }

    /**
     * Builds the index-time analyzer: standard tokenization, lowercasing,
     * Latin-1 accent folding, stop-word removal and finally front edge
     * n-grams. Static so the analyzer does not capture this instance.
     */
    private static Analyzer newEdgeNGramAnalyzer() {
        return new Analyzer() {
            public TokenStream tokenStream(String fieldName, Reader reader) {
                TokenStream result = new StandardTokenizer(reader);
                result = new StandardFilter(result);
                result = new LowerCaseFilter(result);
                result = new ISOLatin1AccentFilter(result);
                result = new StopFilter(result, ENGLISH_STOP_WORDS);
                result = new EdgeNGramTokenFilter(result, Side.FRONT,
                        MIN_GRAM_SIZE, MAX_GRAM_SIZE);
                return result;
            }
        };
    }

    /**
     * Maps every term of the field that is at least three characters long to
     * the number of source documents it appears in.
     */
    @SuppressWarnings("unchecked") // getWordsIterator() returns a raw Iterator
    private static Map<String, Integer> collectWordCounts(
            IndexReader sourceReader, String fieldToAutocomplete)
            throws IOException {
        LuceneDictionary dict = new LuceneDictionary(sourceReader,
                fieldToAutocomplete);
        Map<String, Integer> wordsMap = new HashMap<String, Integer>();
        Iterator<String> iter = (Iterator<String>) dict.getWordsIterator();
        while (iter.hasNext()) {
            String word = iter.next();
            if (word.length() < MIN_WORD_LENGTH) {
                continue; // too short; there is no upper length limit
            }
            if (wordsMap.containsKey(word)) {
                throw new IllegalStateException(
                        "This should never happen in Lucene 2.3.2");
            }
            // use the number of documents this word appears in
            wordsMap.put(word, sourceReader.docFreq(new Term(
                    fieldToAutocomplete, word)));
        }
        return wordsMap;
    }

    /** Creates the autocomplete document for one source term. */
    private static Document toDocument(String word, Integer count) {
        Document doc = new Document();
        doc.add(new Field(SOURCE_WORD_FIELD, word, Field.Store.YES,
                Field.Index.UN_TOKENIZED)); // original term, returned to callers
        doc.add(new Field(GRAMMED_WORDS_FIELD, word, Field.Store.YES,
                Field.Index.TOKENIZED)); // analyzed into edge n-grams
        doc.add(new Field(COUNT_FIELD, Integer.toString(count),
                Field.Store.NO, Field.Index.UN_TOKENIZED)); // sort key
        return doc;
    }

    /**
     * Opens the reader on first use, otherwise refreshes it, and rebuilds the
     * searcher on top of the current reader.
     */
    private void reOpenReader() throws CorruptIndexException, IOException {
        if (autoCompleteReader == null) {
            autoCompleteReader = IndexReader.open(autoCompleteDirectory);
        } else {
            // reopen() does not refresh in place: it returns a new reader
            // when the index changed and the same instance otherwise.
            IndexReader refreshed = autoCompleteReader.reopen();
            if (refreshed != autoCompleteReader) {
                autoCompleteReader.close();
                autoCompleteReader = refreshed;
            }
        }
        autoCompleteSearcher = new IndexSearcher(autoCompleteReader);
    }

    /** Small demo: prints the suggestions for a fixed prefix. */
    public static void main(String[] args) throws Exception {
        Sugesstion autocomplete = new Sugesstion("/index/autocomplete");
        // run this to re-index from the current index, shouldn't need to do
        // this very often
        // autocomplete.reIndex(FSDirectory.getDirectory("/index/live", null),
        // "content");
        String term = "steve";
        System.out.println(autocomplete.suggestTermsFor(term));
        // prints [steve, steven, stevens, stevenson, stevenage]
    }
}
Lucene 自动补全的更多相关文章
- ES系列十三、Elasticsearch Suggester API(自动补全)
1.概念 1.补全api主要分为四类 Term Suggester(纠错补全,输入错误的情况下补全正确的单词) Phrase Suggester(自动补全短语,输入一个单词补全整个短语) Comple ...
- jQuery 邮箱下拉列表自动补全
综述 我想大家一定见到过,在某个网站填写邮箱的时候,还没有填写完,就会出现一系列下拉列表,帮你自动补全邮箱的功能.现在我们就用jQuery来实现一下. 博主原创代码,如有代码写的不完善的地方还望大家多 ...
- eclipse自动补全的设置
eclipse自动补全的设置 如果你用过Visual Studio的自动补全功能后,再来用eclipse的自动补全功能,相信大家会有些许失望. 但是eclipse其实是非常强大的,eclipse的 ...
- vim 添加php自动补全 并格式化代码
自动补全,修改/etc/vimrc的配置 vim /etc/vimrc 添加: filetype plugin on autocmd FileType php set omnifunc=phpcomp ...
- Eclipse自动补全设置
如果你用过Visual Studio的自动补全功能后,再来用eclipse的自动补全功能,相信大家会有些许失望. 但是eclipse其实是非常强大的,eclipse的自动补全没有VS那么好是因为ecl ...
- Autocomplete 自动补全(Webform实战篇)
开篇语 因为项目中需要用到一个自动补全的功能,功能描述: 需求一:新增收件人的时候,自动下拉显示出数据库中所有的收件人信息(显示的信息包括:姓名-收件地址-联系方式) 需求二:选中一个值的时候,分别赋 ...
- eclipse自动补全的设置(自动提示)
如果你用过Visual Studio的自动补全功能后,再来用eclipse的自动补全功能,相信大家会有些许失望. 但是eclipse其实是非常强大的,eclipse的自动补全没有VS那么好是因为e ...
- jQuery AutoComplete 自动补全
jQuery.AutoComplete是一个基于jQuery的自动补全插件.借助于jQuery优秀的跨浏览器特性,可以兼容Chrome/IE/Firefox/Opera/Safari等多种浏览器. 特 ...
- Vim自动补全神器–YouCompleteMe
一.简介 YouCompleteMe是Vim的自动补全插件,与同类插件相比,具有如下优势 1.基于语义补全 2.整合实现了多种插件 clang_complete.AutoComplPop .Super ...
随机推荐
- linux系统性能监控--网络利用率
Linux中提供了许多有助于评估各种 Linux网络性能的监视工具,其中一些监视工具也可用于解决网络问题以及监视性能. Linux内核为用户提供了大量的网络系统信息,这有助于监视网络的健康状态并检测在 ...
- Kafka系列之-Kafka Protocol实例分析
本文基于A Guide To The Kafka Protocol文档,以及Spark Streaming中实现的org.apache.spark.streaming.kafka.KafkaClust ...
- 六星经典CSAPP-笔记(7)加载与链接(上)
六星经典CSAPP-笔记(7)加载与链接 1.对象文件(Object File) 1.1 文件类型 对象文件有三种形式: 可重定位对象文件(Relocatable object file):包含二进制 ...
- Querying CRM data with LINQ
http://www.powerxrm.com/querying-crm-data-with-linq/ 如果不喜欢看SDK中的示例,这篇里面讲的非常详细,值得一看.
- J2EE进阶(十八)基于留言板分析SSH工作流程
J2EE进阶(十八)基于留言板分析SSH工作流程 留言板采用SSH(Struts1.2 + Spring3.0 + Hibernate3.0)架构. 工作流程(以用户登录为例): 首先是用 ...
- Mybatis源码分析之参数映射及处理ParameterHandler
ParameterHandler是用来设置参数规则的,当StatementHandler调用prepare方法之后,接下来就是调用它来进行设置参数. ParameterHandler接口: publi ...
- Velocity 语法及其在springMVC中的配置
强烈推荐具体的整合博客:http://blog.csdn.net/duqi_2009/article/details/47752169 整合文章中有几处问题: xml中配置的vm视图解析器,应该按照本 ...
- 开源框架Volley的使用《一》
转载本专栏每一篇博客请注明转载出处地址,尊重原创.此博客转载链接地址:小杨的博客 http://blog.csdn.net/qq_32059827/article/details/52785378 本 ...
- collection 中对类排序
首先 写出 一个person类 让他实现Comparable 构造函数和get/set不用说 我们要实现接口中的compareTo方法 代码如下 省略get/set package a; public ...
- lucene索引库的增删改查操作
1. 索引库的操作 保持数据库与索引库的同步 说明:在一个系统中,如果索引功能存在,那么数据库和索引库应该是同时存在的.这个时候需要保证索引库的数据和数据库中的数据保持一致性.可以在对数据库进行增.删 ...