这里使用的Lucene4.7.0和Lucene3.X稍有不同

有下面三段内容,我想对与“船”相关的一系列词(boat、ship、vessel)的搜索进行加分

  bike car jeep truck bus boat

  train car ship boat van subway

  car plane taxi boat vessel railway

  • 定义自定义的MyAnalyzer,实现对字段的有效载荷进行赋值
 package com.pera.lucene.score.payload;

 import java.io.Reader;

 import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.payloads.PayloadEncoder;
import org.apache.lucene.util.Version;

/**
 * Analyzer that tokenizes field text on whitespace and wraps the tokenizer
 * in a {@link MyTokenFilter}, which assigns a payload to each token.
 */
public class MyAnalyzer extends Analyzer
{
    /** Encoder passed on to every {@link MyTokenFilter} this analyzer creates. */
    private PayloadEncoder encoder;

    /**
     * @param encoder encoder the token filter uses to turn score strings into payloads
     */
    MyAnalyzer(PayloadEncoder encoder)
    {
        this.encoder = encoder;
    }

    /**
     * Builds the analysis chain: whitespace tokenizer followed by the payload filter.
     *
     * @param fieldName name of the field being analyzed (not used here)
     * @param reader    source of the field text
     * @return components whose token stream ends in {@link MyTokenFilter}
     */
    @Override
    protected TokenStreamComponents createComponents(String fieldName, Reader reader)
    {
        // The categories inside a field value are separated by whitespace.
        final Tokenizer whitespaceTokenizer = new WhitespaceTokenizer(Version.LUCENE_47, reader);
        // Custom filter that sets the payload value of each token.
        return new TokenStreamComponents(whitespaceTokenizer, new MyTokenFilter(whitespaceTokenizer, encoder));
    }
}
  • 自定义TokenFilter来达到取得字段的PayLoad值或通过字段对PayLoad值进行分析赋值
 package com.pera.lucene.score.payload;

 import java.io.IOException;

 import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.payloads.PayloadEncoder;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;

/**
 * Token filter that looks each term up in {@code App.scoreMap}. A term found
 * there gets its score string encoded as the token payload; every other term
 * gets a {@code null} payload.
 */
public class MyTokenFilter extends TokenFilter
{
    private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
    private final PayloadAttribute payAtt = addAttribute(PayloadAttribute.class);
    private final PayloadEncoder encoder;

    /**
     * @param input   upstream token stream
     * @param encoder encoder used to convert a score string into payload bytes
     */
    public MyTokenFilter(TokenStream input, PayloadEncoder encoder)
    {
        super(input);
        this.encoder = encoder;
    }

    /**
     * Advances the wrapped stream by one token and sets that token's payload.
     *
     * @return {@code true} if a token was produced, {@code false} at end of stream
     * @throws IOException if the wrapped stream fails
     */
    @Override
    public boolean incrementToken() throws IOException
    {
        if (!input.incrementToken())
        {
            return false;
        }
        // ImmutableMap holds no null values, so a null result means "term not in the map".
        final String scoreText = App.scoreMap.get(termAtt.toString());
        payAtt.setPayload(scoreText == null ? null : encoder.encode(scoreText.toCharArray()));
        return true;
    }
}
     /**
      * Maps a term to the score string that {@code MyTokenFilter} encodes as that
      * term's payload. Declared {@code final} so the shared reference cannot be reassigned.
      */
     public static final ImmutableMap<String, String> scoreMap = ImmutableMap.of("boat", "5f", "ship", "20f", "vessel", "100f");
  • 自定义PayloadSimilarity继承DefaultSimilarity 重写scorePayload方法,在检索时获得之前设置的PayLoad值
 package com.pera.lucene.score.payload;

 import org.apache.lucene.analysis.payloads.PayloadHelper;
import org.apache.lucene.search.similarities.DefaultSimilarity;
import org.apache.lucene.util.BytesRef;

/**
 * Similarity that uses the float stored in a token's payload as the payload score.
 */
public class PayloadSimilarity extends DefaultSimilarity
{
    /**
     * Decodes the payload bytes as a float.
     *
     * @param doc     document id (not used)
     * @param start   start position of the match (not used)
     * @param end     end position of the match (not used)
     * @param payload payload bytes of the matched token; may be {@code null}
     * @return the decoded float, or {@code 1f} when there is no payload
     */
    @Override
    public float scorePayload(int doc, int start, int end, BytesRef payload)
    {
        if (payload == null)
        {
            // No payload on this token: fall back to 1, the value the parent
            // DefaultSimilarity is documented to return.
            return 1f;
        }
        // A BytesRef may be a slice of a larger shared array, so the valid bytes
        // begin at payload.offset rather than at index 0.
        return PayloadHelper.decodeFloat(payload.bytes, payload.offset);
    }
}
  • 建立索引 需要将之前定义的Analyzer和PayloadSimilarity设置到Config中
 package com.pera.lucene.score.payload;

 import java.io.File;
import java.io.IOException;
import java.util.Date; import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.payloads.FloatEncoder;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.search.similarities.Similarity;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;

/**
 * Builds the demo index whose tokens carry payloads.
 */
public class Indexing
{
    /**
     * Creates (overwriting any existing one) the index at {@code App.indexPath}
     * and adds three documents with a single {@code tools} field. The custom
     * analyzer and similarity are both set on the writer configuration.
     *
     * @throws IOException if the index directory cannot be opened or written
     */
    public void indexPayload() throws IOException
    {
        final String[] contents = {
            "bike car jeep truck bus boat",
            "train car ship boat van subway",
            "car plane taxi boat vessel railway"
        };

        Directory dir = FSDirectory.open(new File(App.indexPath));
        try
        {
            Analyzer analyzer = new MyAnalyzer(new FloatEncoder());
            Similarity similarity = new PayloadSimilarity();
            IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_47, analyzer);
            iwc.setOpenMode(OpenMode.CREATE).setSimilarity(similarity);

            long startMillis = System.currentTimeMillis();
            System.out.println("Indexing to directory '" + App.indexPath + "'...");

            IndexWriter writer = new IndexWriter(dir, iwc);
            try
            {
                for (String content : contents)
                {
                    Document doc = new Document();
                    doc.add(new TextField("tools", content, Store.YES));
                    writer.addDocument(doc);
                }
            }
            finally
            {
                // Close the writer even when adding a document fails, so it is not leaked.
                writer.close();
            }

            System.out.println(System.currentTimeMillis() - startMillis + " total milliseconds");
        }
        finally
        {
            dir.close();
        }
    }
}
  • 进行检索 检索时要将PayloadSimilarity设置到searcher中
 package com.pera.lucene.score.payload;

 import java.io.File;
import java.io.IOException; import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.Explanation;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.payloads.AveragePayloadFunction;
import org.apache.lucene.search.payloads.PayloadTermQuery;
import org.apache.lucene.store.FSDirectory;

/**
 * Runs the demo payload query against the index at {@code App.indexPath}.
 */
public class Searching
{
    /**
     * Searches the {@code tools} field for "ship", "boat" and "vessel" using
     * optional payload term queries, then prints each hit with its score and
     * score explanation.
     *
     * @throws IOException    if the index cannot be opened or read
     * @throws ParseException declared for callers; nothing in this method parses a query string
     */
    public void searchPayload() throws IOException, ParseException
    {
        final String[] terms = { "ship", "boat", "vessel" };

        FSDirectory dir = FSDirectory.open(new File(App.indexPath));
        try
        {
            IndexReader reader = DirectoryReader.open(dir);
            try
            {
                IndexSearcher searcher = new IndexSearcher(reader);

                BooleanQuery bq = new BooleanQuery();
                for (String term : terms)
                {
                    bq.add(new PayloadTermQuery(new Term("tools", term), new AveragePayloadFunction()), Occur.SHOULD);
                }

                // Use the custom PayloadSimilarity so stored payloads feed into the score.
                searcher.setSimilarity(new PayloadSimilarity());
                TopDocs results = searcher.search(bq, 10);
                ScoreDoc[] hits = results.scoreDocs;

                int numTotalHits = results.totalHits;
                System.out.println(numTotalHits + " total matching documents");

                for (ScoreDoc hit : hits)
                {
                    int docId = hit.doc; // document number
                    float score = hit.score;
                    String tools = searcher.doc(docId).get("tools");
                    System.out.println("DocId:" + docId + "\tLucene Score:" + score + "\tTools:" + tools);
                    Explanation explanation = searcher.explain(bq, docId);
                    System.out.println(explanation.toString());
                }
            }
            finally
            {
                // Release the reader's file handles even if searching fails.
                reader.close();
            }
        }
        finally
        {
            dir.close();
        }
    }
}
  • 检索结果 可以看到Doc2由于带有较高的PayLoad值,排名得到了提升
3 total matching documents
DocId:2 Lucene Score:16.750757 Tools:car plane taxi boat vessel railway
16.750757 = (MATCH) product of:
25.126135 = (MATCH) sum of:
0.3186112 = (MATCH) btq, product of:
0.06372224 = weight(tools:boat in 2) [PayloadSimilarity], result of:
0.06372224 = score(doc=2,freq=0.5 = phraseFreq=0.5
), product of:
0.33736566 = queryWeight, product of:
0.71231794 = idf(docFreq=3, maxDocs=3)
0.4736167 = queryNorm
0.18888181 = fieldWeight in 2, product of:
0.70710677 = tf(freq=0.5), with freq of:
0.5 = phraseFreq=0.5
0.71231794 = idf(docFreq=3, maxDocs=3)
0.375 = fieldNorm(doc=2)
5.0 = AveragePayloadFunction.docScore()
24.807524 = (MATCH) btq, product of:
0.24807523 = weight(tools:vessel in 2) [PayloadSimilarity], result of:
0.24807523 = score(doc=2,freq=0.5 = phraseFreq=0.5
), product of:
0.66565174 = queryWeight, product of:
1.4054651 = idf(docFreq=1, maxDocs=3)
0.4736167 = queryNorm
0.37268022 = fieldWeight in 2, product of:
0.70710677 = tf(freq=0.5), with freq of:
0.5 = phraseFreq=0.5
1.4054651 = idf(docFreq=1, maxDocs=3)
0.375 = fieldNorm(doc=2)
100.0 = AveragePayloadFunction.docScore()
0.6666667 = coord(2/3)
DocId:1 Lucene Score:3.5200772 Tools:train car ship boat van subway
3.5200772 = (MATCH) product of:
5.2801156 = (MATCH) sum of:
4.9615045 = (MATCH) btq, product of:
0.24807523 = weight(tools:ship in 1) [PayloadSimilarity], result of:
0.24807523 = score(doc=1,freq=0.5 = phraseFreq=0.5
), product of:
0.66565174 = queryWeight, product of:
1.4054651 = idf(docFreq=1, maxDocs=3)
0.4736167 = queryNorm
0.37268022 = fieldWeight in 1, product of:
0.70710677 = tf(freq=0.5), with freq of:
0.5 = phraseFreq=0.5
1.4054651 = idf(docFreq=1, maxDocs=3)
0.375 = fieldNorm(doc=1)
20.0 = AveragePayloadFunction.docScore()
0.3186112 = (MATCH) btq, product of:
0.06372224 = weight(tools:boat in 1) [PayloadSimilarity], result of:
0.06372224 = score(doc=1,freq=0.5 = phraseFreq=0.5
), product of:
0.33736566 = queryWeight, product of:
0.71231794 = idf(docFreq=3, maxDocs=3)
0.4736167 = queryNorm
0.18888181 = fieldWeight in 1, product of:
0.70710677 = tf(freq=0.5), with freq of:
0.5 = phraseFreq=0.5
0.71231794 = idf(docFreq=3, maxDocs=3)
0.375 = fieldNorm(doc=1)
5.0 = AveragePayloadFunction.docScore()
0.6666667 = coord(2/3)
DocId:0 Lucene Score:0.106203735 Tools:bike car jeep truck bus boat
0.106203735 = (MATCH) product of:
0.3186112 = (MATCH) sum of:
0.3186112 = (MATCH) btq, product of:
0.06372224 = weight(tools:boat in 0) [PayloadSimilarity], result of:
0.06372224 = score(doc=0,freq=0.5 = phraseFreq=0.5
), product of:
0.33736566 = queryWeight, product of:
0.71231794 = idf(docFreq=3, maxDocs=3)
0.4736167 = queryNorm
0.18888181 = fieldWeight in 0, product of:
0.70710677 = tf(freq=0.5), with freq of:
0.5 = phraseFreq=0.5
0.71231794 = idf(docFreq=3, maxDocs=3)
0.375 = fieldNorm(doc=0)
5.0 = AveragePayloadFunction.docScore()
0.33333334 = coord(1/3)

Lucene 评分机制二 Payload的更多相关文章

  1. Apache Lucene评分机制的内部工作原理

    Apache Lucene评分机制的内部工作原理' 第5章

  2. Lucene 评分机制一

    1. 评分公式 1.1 公式介绍 这个公式是Lucene实际计算时使用的公式,是由原型公式推导而来 tf(t in d) 表示某个term的出现频率,定义了term t出现在当前document d的 ...

  3. lucene 的评分机制

    lucene 的评分机制 elasticsearch是基于lucene的,所以他的评分机制也是基于lucene的.评分就是我们搜索的短语和索引中每篇文档的相关度打分. 如果没有干预评分算法的时候,每次 ...

  4. Lucene Scoring 评分机制

    原文出处:http://blog.chenlb.com/2009/08/lucene-scoring-architecture.html Lucene 评分体系/机制(lucene scoring)是 ...

  5. Lucene 的 Scoring 评分机制

    转自: http://www.oschina.net/question/5189_7707  Lucene 评分体系/机制(lucene scoring)是 Lucene 出名的一核心部分.它对用户来 ...

  6. Solr4.8.0源码分析(19)之缓存机制(二)

    Solr4.8.0源码分析(19)之缓存机制(二) 前文<Solr4.8.0源码分析(18)之缓存机制(一)>介绍了Solr缓存的生命周期,重点介绍了Solr缓存的warn过程.本节将更深 ...

  7. Solr In Action 笔记(2) 之 评分机制(相似性计算)

    Solr In Action 笔记(2) 之评分机制(相似性计算) 1 简述 我们对搜索引擎进行查询时候,很少会有人进行翻页操作.这就要求我们对索引的内容提取具有高度的匹配性,这就搜索引擎文档的相似性 ...

  8. Elasticseach的评分机制

    lucene 的评分机制 elasticsearch是基于lucene的,所以他的评分机制也是基于lucene的.评分就是我们搜索的短语和索引中每篇文档的相关度打分. 如果没有干预评分算法的时候,每次 ...

  9. Wifi 评分机制分析

    从android N开始,引入了wifi评分机制,选择wifi的时候会通过评分来选择. android O源码 frameworks\opt\net\wifi\service\java\com\and ...

随机推荐

  1. Jmeter插件:jp@gc - Dummy Sampler

    Dummy Sampler可以比较方便地模拟测试场景,自定义Request Data和Response Data 1. 安装插件:打开页面插件管理网站,下载plugins-manager.jar. 在 ...

  2. 如何打开rdb文件

    后缀名是RDB用什么软件打开不能用记事本打开后是乱码不知用什么软件写入的... RDB文件是QQ2009SP以后的替代DB文件的一种新的文件格式,是一种数据库文件请下载 百度搜索下载:rdb打包解包工 ...

  3. VS2010-MFC(MFC常用类:MFC异常处理)

    转自:http://www.jizhuomi.com/software/236.html 上一节讲了CFile文件操作类,本节主要来说说MFC异常处理. 在鸡啄米C++编程入门系列的最后一节鸡啄米:C ...

  4. 固定定位fixed,绝对定位absolute,相对定位relative;以及overflow

    固定定位position:fixed /*固定定位 1.定位属性值:fixed 2.在页面中不再占位(浮起来了) 3.一旦定位后,定位的布局方位 top.bottom.left.right都能参与布局 ...

  5. map 与 lambda 的用法

    # 列表中的每个元素进行*2lis = [10, 30, 70]f = map(lambda li: li*2, lis) # 操作lis中的每个元素.print(list(f)) # 返回来一个新的 ...

  6. Linux QtCreator 创建工程

    这一天天的,都快成废物了, 每天忙得要死, 各种乱七八糟杂事,连点学习的时间都没有了, 这才一年不碰Linux,创建工程都不会了, Ubuntu 1N.N.N + QtCreator 创建工程 不安装 ...

  7. 解决在Spring整合Hibernate配置tx事务管理器出现错误的问题

    问题描述: Error occured processing XML 'org/aopalliance/intercept/MethodInterceptor'. See Error Log for ...

  8. 小米手机 DELETE_FAILED_INTERNAL_ERROR Error while Installing APKs

    手机:小米2s,MIUI 9 7.11.16 开发版 手机已处于开发者模式,启用了USB调试,已使用USB线连接了手机,在Android Studio 工具栏点击 "Run ‘app’(Sh ...

  9. echo 改变字体颜色

    字颜色:30—–37 echo -e “\033[30m 黑色字 \033[0m” echo -e “\033[31m 红色字 \033[0m” echo -e “\033[32m 绿色字 \033[ ...

  10. JavaWeb中请求转发和请求重定向的区别

    针对于JavaWeb中请求与重定向的一个cheatsheep: 1.转发 1)完成一次转发,用户浏览器发送一次请求 2)转发之后,浏览器URL地址栏不改变(服务器帮忙完成) 3)请求域中数据不丢失 4 ...