Lucene 搜索的初步探究

搜索应用程序和 Lucene 之间的关系

一般的搜索引擎都会采用这样的

Lucene 采用的是一种称为反向索引（inverted index）的机制。反向索引就是说我们维护了一个词 / 短语表，对于这个表中的每个词 / 短语，都有一个链表描述了有哪些文档包含了这个词 / 短语。这样在用户输入查询条件的时候，就能非常快的得到搜索结果。

反向索引

Lucene 软件包的介绍

org.apache.lucene.document

封装要索引的文档所需要的类，比如 Document, Field。 文档一document对象在程序程序中运行

org.apache.lucene.analysis

对文档进行分词，因为文档在建立索引之前必须要进行分词

org.apache.lucene.index

创建索引以及对创建好的索引进行更新

IndexWriter 是用来创建索引并添加文档到索引中的，IndexReader 是用来删除索引中的文档的

org.apache.lucene.search

建立好的索引上进行搜索所需要的类

IndexSearcher 和 Hits, IndexSearcher 定义了在指定的索引上进行搜索的方法，Hits 用来保存搜索得到的结果。

Lucene建立索引和搜索的几个流程

建立索引

1 document 指文档可以死HTML，文本等。并且document是由多个Field对象组成的，可以把一个 Document 对象想象成数据库中的一个记录，而每个 Field 对象就是记录的一个字段。

2 Field Field 对象是用来描述一个文档的某个属性的，比如一封电子邮件的标题和内容可以用两个 Field 对象分别描述。

3 Analyzer 对文档进行分词

4 IndexWriter IndexWriter 是 Lucene 用来创建索引的一个核心的类，他的作用是把一个个的 Document 对象加到索引中来。

5 Directory 代表了Lucene 的索引的存储的位置，有两个实现，第一个是 FSDirectory，存储在文件系统中的索引。 RAMDirectory，存储在内存当中的索引。

搜索文档

之前建立好的索引文档，现在我们可以进行对文档进行搜索

1 Query 是抽象类，有多个继承实现 TermQuery, BooleanQuery, PrefixQuery. 。这些类可以将用户输入的字符串封装成Lucene能够识别的Query

2 Term Term 是搜索的基本单位基本的语法是

Term term = new Term(“fieldName”,”queryWord”);

fieldName 是指要在文档的那个field上查找，queryWord表示要查询的关键词

3 TermQuery TermQuery是抽象的query的一个之类，

	TermQuery termQuery = new TermQuery(new Term(“fieldName”,”queryWord”)); 它的构造函数只接受一个参数，那就是一个 Term 对象。

4 IndexSearcher 在建立好的索引上进行搜索，

5 Hits Hits 是用来保存搜索的结果的。

建立索引的过程和代码

	package LuceneTest.LuceneTest;

import java.io.File;

import java.io.FileReader;

import java.nio.file.Paths;

import org.apache.lucene.analysis.Analyzer;

import org.apache.lucene.analysis.standard.StandardAnalyzer;

import org.apache.lucene.document.Document;

import org.apache.lucene.document.Field.Store;

import org.apache.lucene.document.TextField;

import org.apache.lucene.index.IndexWriter;

import org.apache.lucene.index.IndexWriterConfig;

import org.apache.lucene.store.Directory;

import org.apache.lucene.store.FSDirectory;

/**

 * 建立索引的类

 * @author Ni Shengwu

 *

 */

public class Indexer {

private IndexWriter writer; //写索引实例

    //构造方法，实例化IndexWriter

    public Indexer(String indexDir) throws Exception {

        Directory dir = FSDirectory.open(Paths.get(indexDir));

        Analyzer analyzer = new StandardAnalyzer(); //标准分词器，会自动去掉空格啊，is a the等单词

        IndexWriterConfig config = new IndexWriterConfig(analyzer); //将标准分词器配到写索引的配置中

        writer = new IndexWriter(dir, config); //实例化写索引对象

    }

    //关闭写索引

    public void close() throws Exception {

        writer.close();

    }

    //索引指定目录下的所有文件

    public int indexAll(String dataDir) throws Exception {

        File[] files = new File(dataDir).listFiles(); //获取该路径下的所有文件

        for(File file : files) {

            indexFile(file); //调用下面的indexFile方法，对每个文件进行索引

        }

        return writer.numDocs(); //返回索引的文件数

    }

    //索引指定的文件

    private void indexFile(File file) throws Exception {

        System.out.println("索引文件的路径：" + file.getCanonicalPath());

        Document doc = getDocument(file); //获取该文件的document

        writer.addDocument(doc); //调用下面的getDocument方法，将doc添加到索引中

    }

    //获取文档，文档里再设置每个字段，就类似于数据库中的一行记录

    private Document getDocument(File file) throws Exception{

        Document doc = new Document();

        //添加字段

        doc.add(new TextField("contents", new FileReader(file))); //添加内容

        doc.add(new TextField("fileName", file.getName(), Store.YES)); //添加文件名，并把这个字段存到索引文件里

        doc.add(new TextField("fullPath", file.getCanonicalPath(), Store.YES)); //添加文件路径

        return doc;

    }

    public static void main(String[] args) {

        String indexDir = "D:\\lucene\\index"; //将索引保存到的路径

        String dataDir = "D:\\lucene\\data"; //需要索引的文件数据存放的目录

        Indexer indexer = null;

        int indexedNum = 0;

        long startTime = System.currentTimeMillis(); //记录索引开始时间

        try {

            indexer = new Indexer(indexDir);

            indexedNum = indexer.indexAll(dataDir);

        } catch (Exception e) {

            e.printStackTrace();

        } finally {

            try {

                indexer.close();

            } catch (Exception e) {

                e.printStackTrace();

            }

        }

        long endTime = System.currentTimeMillis(); //记录索引结束时间

        System.out.println("索引耗时" + (endTime-startTime) + "毫秒");

        System.out.println("共索引了" + indexedNum + "个文件");

    }

 }

建立一个搜索的过程

	package LuceneTest.LuceneTest;

import java.nio.file.Paths;

import org.apache.lucene.analysis.Analyzer;

import org.apache.lucene.analysis.standard.StandardAnalyzer;

import org.apache.lucene.document.Document;

import org.apache.lucene.index.DirectoryReader;

import org.apache.lucene.index.IndexReader;

import org.apache.lucene.queryparser.classic.QueryParser;

import org.apache.lucene.search.IndexSearcher;

import org.apache.lucene.search.Query;

import org.apache.lucene.search.ScoreDoc;

import org.apache.lucene.search.TopDocs;

import org.apache.lucene.store.Directory;

import org.apache.lucene.store.FSDirectory;

public class Searcher {

public static void search(String indexDir, String q) throws Exception {

    Directory dir = FSDirectory.open(Paths.get(indexDir)); //获取要查询的路径，也就是索引所在的位置

    IndexReader reader = DirectoryReader.open(dir);

    IndexSearcher searcher = new IndexSearcher(reader);

    Analyzer analyzer = new StandardAnalyzer(); //标准分词器，会自动去掉空格啊，is a the等单词

    QueryParser parser = new QueryParser("contents", analyzer); //查询解析器

    Query query = parser.parse(q); //通过解析要查询的String，获取查询对象

    long startTime = System.currentTimeMillis(); //记录索引开始时间

    TopDocs docs = searcher.search(query, 10);//开始查询，查询前10条数据，将记录保存在docs中

    long endTime = System.currentTimeMillis(); //记录索引结束时间

    System.out.println("匹配" + q + "共耗时" + (endTime-startTime) + "毫秒");

    System.out.println("查询到" + docs.totalHits + "条记录");

    for(ScoreDoc scoreDoc : docs.scoreDocs) { //取出每条查询结果

        Document doc = searcher.doc(scoreDoc.doc); //scoreDoc.doc相当于docID,根据这个docID来获取文档

        System.out.println(doc.get("fullPath")); //fullPath是刚刚建立索引的时候我们定义的一个字段

    }

    reader.close();

}

public static void main(String[] args) {

    String indexDir = "D:\\lucene\\index";

    String q = "import"; //查询这个字符串

    try {

        search(indexDir, q);

    } catch (Exception e) {

        e.printStackTrace();

    }

}

}