42、lucene和机器学习进行全文搜索,并排序
package com.lucene.test;

import java.io.BufferedInputStream;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.List;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.joone.engine.FullSynapse;
import org.joone.engine.LinearLayer;
import org.joone.engine.Monitor;
import org.joone.engine.NeuralNetEvent;
import org.joone.engine.NeuralNetListener;
import org.joone.engine.SigmoidLayer;
import org.joone.engine.learning.TeachingSynapse;
import org.joone.io.MemoryInputSynapse;
import org.joone.io.MemoryOutputSynapse;
import org.joone.net.NeuralNet;
import org.junit.Test;
import org.wltea.analyzer.lucene.IKAnalyzer; import com.lucene.domain.Article; public class TestLucene implements NeuralNetListener{
private NeuralNet nnet = null;
private MemoryInputSynapse inputSynapse,desireOutputSynapse;
LinearLayer input;
SigmoidLayer hidden,output;
boolean singleThreadMode = true; //XOR input
private double[][] inputArray = new double[][]{
{0.0,0.0},
{0.0,1.0},
{1.0,0.0},
{1.0,1.0}
}; //XOR desired output
private double[][] desiredOutputArray = new double[][]{
{0.0},
{1.0},
{1.0},
{1.0}
}; /**
* 创建索引
* @throws Exception
*/
@Test
public void testCreateIndex() throws Exception{
int fileNum = 1;
List<String> contents = new ArrayList<String>();
InputStream inputStream = null;
String value = null;
File directory = new File("./20_newsgroups");
if(directory.isDirectory()){
File[] files = directory.listFiles();
for (int i = 0; i < 1; i++) {
if(files[i].isDirectory()){
File[] subFiles = files[i].listFiles();
for (int j = 0; j < 10; j++) {
inputStream = new BufferedInputStream(new FileInputStream(subFiles[j]));
StringBuffer tempContent = new StringBuffer();
byte[] bytes = new byte[1024*10];
int len = 0;
while((len = inputStream.read(bytes))!=-1){
tempContent = tempContent.append(new String(bytes));
}
value = tempContent.toString();
System.out.println(value);
inputStream.close();
Article article = new Article(fileNum,subFiles[j].getName(),tempContent.toString());
Directory saveDirectory = FSDirectory.open(Paths.get("./indexDir/"));
//分词器
Analyzer analyzer = new WhitespaceAnalyzer();
IndexWriterConfig iwc = new IndexWriterConfig(analyzer);
Document doc = new Document();
doc.add(new TextField("id", article.getId().toString(), Store.YES));
doc.add(new TextField("title", article.getTitle(), Store.YES));
doc.add(new TextField("content", article.getContent(), Store.YES));
IndexWriter indexWriter = new IndexWriter(saveDirectory,iwc);
System.out.println("have already add file to fileDocment system"+fileNum);
indexWriter.addDocument(doc);
indexWriter.close();//释放资源
fileNum = fileNum+1;
}
}
}
} //1.将需要添加的实体构造成实体对象
Article article = new Article(1,"Lucene是全文检索框架",
"全文检索(Full-Test Retrieval)是以文本作为检索对象,找出含有指定词汇的文本。"+
"全面,准确和快速是衡量全文检索系统的关键指标。"); //2,保存到数据库(此步骤暂时省略) //3、建立索引(lucene)
//索引库目录 //将 Article 转换为Document //保存到索引库中 } /**
* 测试搜索
* @throws IOException
* @throws ParseException
*/
@Test
public void testSearch() throws IOException, ParseException{
//1、搜索条件
String queryCondition = "philosophical"; //2、执行搜索(lucene)
List<Article> articles = new ArrayList<Article>(); //----------搜索代码------------------------
Directory directory = FSDirectory.open(Paths.get("./indexDir/"));
Analyzer analyzer = new WhitespaceAnalyzer();//创建分词器 //把查询字符串转换为Query对象(只在title中查询)
QueryParser queryParser = new QueryParser("content",analyzer);
Query query = queryParser.parse(queryCondition); //2执行搜索得到搜索结果
IndexReader indexReader = DirectoryReader.open(directory);
IndexSearcher indexSearcher = new IndexSearcher(indexReader);
TopDocs topDocs = indexSearcher.search(query, 100); Integer count = topDocs.totalHits;//总结果数量
ScoreDoc[] scoreDocs = topDocs.scoreDocs;//返回前N条结果 //2.3处理结果
for (int i = 0; i < scoreDocs.length; i++) {
ScoreDoc scoreDoc= scoreDocs[i];
int docId = scoreDoc.doc;
System.out.println("得分是:"+scoreDoc.score+"内部编号是:"+docId); //根据内部编号取出真正的Document数据
Document doc = indexSearcher.doc(docId); //将document转化为Article
Article article = new Article(Integer.parseInt(doc.get("id")),doc.get("title"),doc.get("content"));
articles.add(article);
} //------------------------------------------
//3、控制台显示结果
System.err.print("总结果数:"+count);
for (Article article : articles) {
System.out.println("查询结果:ID为:"+article.getId()+",title为:"+article.getTitle());
}
indexSearcher.getIndexReader().close();
} @Test
public void testNeuralNet(){
TestLucene testLucene = new TestLucene();
testLucene.initNeuralNet();
testLucene.train();
testLucene.interrogate();
} public void initNeuralNet(){
//First create the three layers
input = new LinearLayer();
hidden = new SigmoidLayer();
output = new SigmoidLayer(); //set the dimensions of the layers
input.setRows(2);
hidden.setRows(3);
output.setRows(1); input.setLayerName("L.input");
hidden.setLayerName("L.hidden");
output.setLayerName("L.output"); //Now create the two Synapses
FullSynapse synapse_IH = new FullSynapse();//input -->hidden conn
FullSynapse synapse_HO = new FullSynapse();//hidden -->output conn //Connect the input layer whit the hidden layer
input.addOutputSynapse(synapse_IH);
hidden.addInputSynapse(synapse_IH); //Connect the hidden layer whit the output layer
hidden.addOutputSynapse(synapse_HO);
output.addInputSynapse(synapse_HO); //the input to the neural net
inputSynapse = new MemoryInputSynapse();
input.addInputSynapse(inputSynapse); //The Trainer and its desired output
desireOutputSynapse = new MemoryInputSynapse();
TeachingSynapse trainer = new TeachingSynapse(); trainer.setDesired(desireOutputSynapse); //Now we add this structure to a NeuralNet object
nnet = new NeuralNet(); nnet.addLayer(input,NeuralNet.INPUT_LAYER);
nnet.addLayer(hidden,NeuralNet.HIDDEN_LAYER);
nnet.addLayer(output, NeuralNet.OUTPUT_LAYER);
nnet.setTeacher(trainer);
output.addOutputSynapse(trainer);
nnet.addNeuralNetListener(this);
} public void train(){
//set the inputs
inputSynapse.setInputArray(inputArray);
inputSynapse.setAdvancedColumnSelector("1,2");
//set the desired outputs
desireOutputSynapse.setInputArray(desiredOutputArray);
desireOutputSynapse.setAdvancedColumnSelector("1");
//get the monitor object to train or feed forward
Monitor monitor = nnet.getMonitor(); //set the monitor parameters
monitor.setLearningRate(0.8);
monitor.setMomentum(0.3);
monitor.setTrainingPatterns(inputArray.length);
monitor.setTotCicles(5000);
monitor.setLearning(true); long initms = System.currentTimeMillis();
//Run the network in single-thread,synchronized mode
nnet.getMonitor().setSingleThreadMode(singleThreadMode);
nnet.go(true);
System.out.println("Total time="+(System.currentTimeMillis()-initms)+"ms");
} public void interrogate(){
double[][] inputArray = new double[][]{
{0.0,1.0},
{1.0,0.0},
{1.0,1.0},
{0.0,0.0}
};
//set the inputs
inputSynapse.setInputArray(inputArray);
inputSynapse.setAdvancedColumnSelector("1,2");
Monitor monitor = nnet.getMonitor();
monitor.setTrainingPatterns(4);
monitor.setTotCicles(1);
monitor.setLearning(false);
MemoryOutputSynapse memOut = new MemoryOutputSynapse();
//set the output synapse to write the output of the net if(nnet != null){
nnet.addOutputSynapse(memOut);
System.out.println(nnet.check());
nnet.getMonitor().setSingleThreadMode(singleThreadMode);
nnet.go();
for (int i = 0; i < 4; i++) {
double[] pattern = memOut.getNextPattern();
System.out.println("Output pattern #"+(i+1)+"="+pattern[0]);
}
System.out.println("Interrogating Finished");
}
} public void cicleTerminated(NeuralNetEvent arg0) { } public void errorChanged(NeuralNetEvent e) {
Monitor mon=(Monitor) e.getSource();
if(mon.getCurrentCicle()%100==0){
System.out.println("Epoch:"+(mon.getTotCicles()-mon.getCurrentCicle())+"RMSE:"
+mon.getGlobalError());
}
} public void netStarted(NeuralNetEvent e) {
Monitor mon = (Monitor) e.getSource();
System.out.println("Network started for ");
if(mon.isLearning()){
System.out.println("training");
}else{
System.out.println("interrogation.");
}
} public void netStopped(NeuralNetEvent e) {
Monitor mon = (Monitor) e.getSource();
System.out.println("Network stopped . Last RMSE="
+mon.getGlobalError());
} public void netStoppedError(NeuralNetEvent e, String error) {
System.out.println("Network stopped due the following error:"
+error);
}
}
结果
得分是:0.25462872内部编号是:7840
得分是:0.24006625内部编号是:7841
查询结果:ID为:2,title为:51060总结果数:2
查询结果:ID为:1,title为:49960
42、lucene和机器学习进行全文搜索,并排序的更多相关文章
- 基于JieBaNet+Lucene.Net实现全文搜索
实现效果: 上一篇文章有附全文搜索结果的设计图,下面截一张开发完成上线后的实图: 基本风格是模仿的百度搜索结果,绿色的分页略显小清新. 目前已采集并创建索引的文章约3W多篇,索引文件不算太大,查询速度 ...
- Apache Solr采用Java开发、基于Lucene的全文搜索服务器
http://docs.spring.io/spring-data/solr/ 首先介绍一下solr: Apache Solr (读音: SOLer) 是一个开源.高性能.采用Java开发.基于Luc ...
- OSCHina技术导向:Java全文搜索框架Lucene
Lucene 是apache软件基金会一个开放源代码的全文检索引擎工具包,是一个全文检索引擎的架构,提供了完整的查询引擎和索引引擎,部分文本分析引擎.Lucene的目的是为软件开发人员提供一个简单易用 ...
- 记一次企业级爬虫系统升级改造(五):基于JieBaNet+Lucene.Net实现全文搜索
实现效果: 上一篇文章有附全文搜索结果的设计图,下面截一张开发完成上线后的实图: 基本风格是模仿的百度搜索结果,绿色的分页略显小清新. 目前已采集并创建索引的文章约3W多篇,索引文件不算太大,查询速度 ...
- lucene全文搜索之四:创建索引搜索器、6种文档搜索器实现以及搜索结果分析(结合IKAnalyzer分词器的搜索器)基于lucene5.5.3
前言: 前面几章已经很详细的讲解了如何创建索引器对索引进行增删查(没有更新操作).如何管理索引目录以及如何使用分词器,上一章讲解了如何生成索引字段和创建索引文档,并把创建的索引文档保存到索引目录,到这 ...
- lucene全文搜索之三:生成索引字段,创建索引文档(给索引字段加权)基于lucene5.5.3
前言:上一章中我们已经实现了索引器的创建,但是我们没有索引文档,本章将会讲解如何生成字段.创建索引文档,给字段加权以及保存文档到索引器目录 luncene5.5.3集合jar包下载地址:http:// ...
- lucene全文搜索之二:创建索引器(创建IKAnalyzer分词器和索引目录管理)基于lucene5.5.3
前言: lucene全文搜索之一中讲解了lucene开发搜索服务的基本结构,本章将会讲解如何创建索引器.管理索引目录和中文分词器的使用. 包括标准分词器,IKAnalyzer分词器以及两种索引目录的创 ...
- lucene全文搜索之一:lucene的主要功能和基本结构(基于lucene5.5.3)
前言:lucene并不是像solr或elastic那样提供现成的.直接部署可用的系统,而是一套jar包,提供了一些常见语言分词.构建索引和创建搜索器等等功能的API,我们常用到的也就是分词器.索引目录 ...
- C# 全文搜索Lucene
全文出自:https://blog.csdn.net/huangwenhua5000/article/details/9341751 1 lucene简介1.1 什么是luceneLucene是一个全 ...
随机推荐
- .NET微信开发通过Access Token和OpenID获取用户信息
本文介绍如何获得微信公众平台关注用户的基本信息,包括昵称.头像.性别.国家.省份.城市.语言. 本文的方法将囊括订阅号和服务号以及自定义菜单各种场景,无论是否有高级接口权限,都有办法来获得用户基本信息 ...
- android模拟器没法通过localhost访问本地服务器的解决
当android项目访问在一台服务器上的WEB服务时,没法通过localhost或者127.0.0.1来访问.模拟器把它自己作为了localhost,代码中使用localhost或者127.0.0.1 ...
- HackerRank "Fair Rations"
Another fun Greedy problem to work on: we simply go from first to second last person, as long someon ...
- 修复sublime text系统右键菜单
修复sublime text系统右键菜单 安装完Sublime Text2后,拿掉电脑里面的备用硬盘,导致每次使用Open with Sublime Text2的时候,都会出错,打开注册表,找到 HK ...
- ios7 ios8 cell中下划线偏移(separator Insets)处理方法
在ios7中,UITableViewCell左侧会有默认15像素的空白.这时候,设置setSeparatorInset:UIEdgeInsetsZero 能将空白去掉. 但是在ios8中,设置setS ...
- 【学】React的学习之旅2 - React Component的生命周期
分成三个状态: Mounted Update Unmounted Mounted:当我们看到组件在浏览器中从无到有的效果的时候,mounted已经结束了,这个组件已经被mounted了 有这个阶段有2 ...
- html_博客博主
csdn: 工匠若水 http://blog.csdn.net/yanbober yunama: IT蓝豹:http://www.itlanbao.com/: http://ask.dcloud.ne ...
- vim常用操作
vim filename 编辑一个文件 在一般模式里按yy是复制的意思(复制当前行),按yy之前先按相应的数字键就是复制光标所在行到指定的行,然后按p粘贴在一般模式里按dd是删除的意思(也叫做剪切), ...
- Tornado实战项目(伪JD商城)
预备知识 在之前tornado商城项目中,在开始之前需要引入一些项目设计知识,如接口,抽象方法抽象类,组合,程序设计原则等,个人理解项目的合理设计可增加其灵活性, 降低数据之间的耦合性,提高稳定性,下 ...
- Tomcat服务相关
1. 将Tomcat安装成服务. 找到bin\service.bat文件,往cmd命令行窗口一拉,如果只提示service /remove [../..]那就是Tomcat和java的路径配置没问题. ...