利用whoosh对mongoDB的中文文档建立全文检索

1、建立索引

#coding=utf-8

from __future__ import unicode_literals

__author__ = 'zh'

import sys,os

from whoosh.index import create_in,open_dir

from whoosh.fields import *

from jieba.analyse import ChineseAnalyzer

import pymongo

import json

from pymongo.collection import Collection

from pymongo import database

class CreatIndex:
    """Build a Whoosh full-text index from the MongoDB ``webdb.pages`` collection.

    Every document that does not yet carry an ``indexed`` key is added to the
    index stored in ``../whoosh_index`` and then flagged ``indexed: True`` in
    MongoDB so that it is skipped on the next run.
    """

    def __init__(self):
        """Open the MongoDB connection and bind the ``pages`` collection."""
        self.mongoClient = pymongo.MongoClient('192.168.229.128', 27017)
        self.websdb = pymongo.database.Database(self.mongoClient, 'webdb')
        self.pagesCollection = Collection(self.websdb, 'pages')

    @staticmethod
    def _row_to_fields(row):
        """Translate one MongoDB document into ``add_document`` keyword args.

        ``row`` must provide ``_id``, ``name`` and ``information``;
        ``publish_time`` and ``location`` are optional.  An empty-string
        ``publish_time`` is stored as ``None``; ``location`` is stored as its
        JSON encoding (empty string when absent).
        """
        publish_time = row.get('publish_time')
        # Compare directly instead of via str(): under Python 2, str() raises
        # on a non-ASCII unicode value.
        if publish_time == '':
            publish_time = None

        location = u''
        if 'location' in row:
            location = u'%s' % json.dumps(row['location'])

        return {
            # u'%s' makes the unicode coercion explicit (the original relied
            # on ''.join(...) under unicode_literals for the same effect).
            'U_id': u'%s' % (row['_id'],),
            'title': row['name'],
            'location': location,
            'publish_time': publish_time,
            'content': row['information'],
        }

    def BuiltIndex(self):
        """Index all not-yet-indexed pages and commit the result.

        Processing stops at the first document that fails to index.  Whatever
        was added before that point is still committed, so the ``indexed``
        flags already written to MongoDB always match the index on disk.
        """
        analyzer = ChineseAnalyzer()

        # Index schema: title/content are tokenized with jieba, content is
        # searchable but not stored.
        schema = Schema(
            U_id=ID(stored=True),
            title=TEXT(stored=True, analyzer=analyzer),
            location=TEXT(stored=True),
            publish_time=DATETIME(stored=True, sortable=True),
            content=TEXT(stored=False, analyzer=analyzer)
        )

        from whoosh.filedb.filestore import FileStorage

        storage = FileStorage("../whoosh_index")
        if not os.path.exists("../whoosh_index"):
            os.mkdir("../whoosh_index")
            ix = storage.create_index(schema)
            print(u'建立索引文件！')
        else:
            ix = storage.open_index()

        writer = ix.writer()
        num = 0
        try:
            while True:
                row = self.pagesCollection.find_one(
                    {'indexed': {'$exists': False}})
                if row is None:
                    print(u"全部处理完毕")
                    break
                try:
                    writer.add_document(**self._row_to_fields(row))
                    self.pagesCollection.update_one(
                        {"_id": row["_id"]}, {"$set": {"indexed": True}})
                except Exception as exc:
                    # Stop at the first bad document, but say why it failed.
                    print(u'%s 异常: %r' % (row["_id"], exc))
                    break
                num += 1
                print(u'%s 已建立索引！' % (row["_id"],))
        except Exception as exc:
            print(u'异常: %r' % (exc,))
        finally:
            # Always commit: rows already flagged as indexed in MongoDB must
            # reach the index, and the writer lock must be released.
            writer.commit()

        print(u'已处理 %s 共计 %s' % (num, self.pagesCollection.find().count()))

# Script entry: instantiate the indexer and build the index as soon as this
# file is executed (there is no __main__ guard, so importing it also runs it).
creatindext = CreatIndex()

creatindext.BuiltIndex()

注：注意编码——源文件需声明 utf-8 编码，写入 whoosh 索引的文本字段必须是 unicode 字符串（Python 2 下可借助 unicode_literals）。

2、检索

from __future__ import unicode_literals

#coding=utf-8

__author__ = 'zh'

# from whoosh.qparser import QueryParser

from whoosh import qparser,sorting

# from jieba.analyse import ChineseAnalyzer

from whoosh.index import open_dir

from whoosh.query import *

# import pymongo

import datetime

# from pymongo.collection import Collection

# from pymongo import database

class FullText:
    """Keyword search over an existing Whoosh index.

    The index is opened once in ``__init__`` and the same searcher object is
    reused by every ``Query`` call.
    """

    def __init__(self, index_home='whoosh_index'):
        """Open the index directory ``index_home`` and create a searcher."""
        self.index_home = index_home
        self.ix = open_dir(self.index_home)
        self.searcher = self.ix.searcher()

    def Query(self, parameter):
        """Run a full-text query described by the ``parameter`` mapping.

        Recognized keys:

        * ``keys`` (required) -- non-empty sequence of field names to search;
          one name uses ``QueryParser``, several use ``MultifieldParser``.
        * ``keywords`` (required) -- the query string.
        * ``page`` / ``pagesize`` (optional) -- when both are present and
          positive, at most ``page * pagesize`` hits are returned (i.e. all
          hits up to the end of that page); otherwise every hit is returned.
        * ``includeFields`` (optional) -- if it contains ``u'location'``,
          only documents whose ``location`` field contains the term
          ``coordinates`` are returned.

        Returns the Whoosh results object, sorted by score and then by
        ``publish_time`` descending.

        Raises ``ValueError`` if ``keys`` is empty.
        """
        fields = parameter['keys']
        if not fields:
            # Previously this fell through and failed later with an unbound
            # local `parser`; fail early with a clear message instead.
            raise ValueError("parameter['keys'] must name at least one field")

        if len(fields) == 1:
            parser = qparser.QueryParser(fields[0], schema=self.ix.schema)
        else:
            parser = qparser.MultifieldParser(fields, schema=self.ix.schema)

        q = parser.parse(parameter['keywords'])

        scores = sorting.ScoreFacet()
        date = sorting.FieldFacet("publish_time", reverse=True)

        # Paged or full result set; the default is everything.
        _limit = None
        if 'page' in parameter and 'pagesize' in parameter:
            page = parameter['page']
            pagesize = parameter['pagesize']
            # Require both to be positive so the limit can never be negative.
            if page > 0 and pagesize > 0:
                _limit = page * pagesize

        # Optional filter on the location field; the default is no filter.
        allow_q = None
        if u'location' in (parameter.get('includeFields') or ()):
            allow_q = qparser.query.Term("location", u"coordinates")

        return self.searcher.search(
            q, limit=_limit, filter=allow_q, sortedby=[scores, date])

# Usage example from calling code.
# NOTE(review): `fulltext` is not imported anywhere in this snippet --
# presumably the FullText class above is saved as fulltext.py; confirm.
fulltext_query = fulltext.FullText()

注：支持多字段检索、分类、排序等

whoosh参考；提供陕西省POI数据（300万条，sqlserver备份文件）

利用whoosh对mongoDB的中文文档建立全文检索的更多相关文章

python 搜索引擎Whoosh中文文档和代码以及jieba的使用
注意, 数据库的表最好别有下划线中文文档链接: https://mr-zhao.gitbooks.io/whoosh/content/%E5%A6%82%E4%BD%95%E7%B4%A2%E5%B ...
Phoenix综述（史上最全Phoenix中文文档）
个人主页:http://www.linbingdong.com 简书地址:http://www.jianshu.com/users/6cb45a00b49c/latest_articles 网上关于P ...
Django 1.10中文文档—第一个Django应用Part1
在本教程中,我们将引导您完成一个投票应用程序的创建,它包含下面两部分: 一个可以进行投票和查看结果的公开站点: 一个可以进行增删改查的后台admin管理界面: 我们假设你已经安装了Django.您可以 ...
【Chromium中文文档】OS X 沙箱设计
OS X 沙箱设计转载请注明出处:https://ahangchen.gitbooks.io/chromium_doc_zh/content/zh//General_Architecture/OSX ...
【Chromium中文文档】Chrome/Chromium沙箱 - 安全架构设计
沙箱转载请注明出处:https://ahangchen.gitbooks.io/chromium_doc_zh/content/zh//General_Architecture/Sandbox.ht ...
openstack中文文档
http://www.openstack.cn/p392.html openStack Hacker中文文档 http://docs.mirantis.com/fuel-dev/develop/a ...
【Chromium中文文档】进程模型
进程模型转载请注明出处:https://ahangchen.gitbooks.io/chromium_doc_zh/content/zh//General_Architecture/Process_ ...
【Chromium中文文档】Web安全研究
转载请注明出处:https://ahangchen.gitbooks.io/chromium_doc_zh/content/zh//General_Architecture/Extension_Sec ...
Visual Studio Code中文文档
Visual Studio Code中文文档 Visual Studio Code是一个轻量级但是十分强大的源代码编辑器,重要的是它在Windows, OS X 和Linux操作系统的桌面上均可运行. ...

随机推荐

@restcontroller与@controller的区别
这段时间偷偷看了下spring boot.结果引用模板时没注意,把@restcontroller替换了@controlle,结果模板出不来.终究原因是spring的知识不到位. 下面说说这2的说明和区 ...
plx9030触发pci中断
if(((SWAB_16(PLX_INT(0x4C)))&0x04)==0x04) { ErrNo = *(UINT16*)(g_MemBase+0XFFFE*2); /*logMsg(&qu ...
Junit4测试报错
1.字符串数组越界 java.lang.String IndexOutOfBounds Exception:String index out of range:-1 导致: Transaction r ...
FusionCharts ScrollColumn2D图
FusionCharts ScrollColumn2D图 1.JSP页面 ScrollColumn2D.jsp: <%@ page language="java" conte ...
xml文件的规则
一,规则 1.1,样本 <?xml version="1.0" encoding="utf-8"?> <contactList> < ...
Excel VBA TextBox控件的滚动条不能刷新
问题:Excel中Textbox控件以及Form里的Textbox控件,当直接填充的内容很长时,滚动条不能实时刷新. 现象: 修改后: 原因: 虽然已经向Textbox的Text里设置了内容,但此时T ...
Rolling Update - 每天5分钟玩转 Docker 容器技术（140）
滚动更新是一次只更新一小部分副本,成功后,再更新更多的副本,最终完成所有副本的更新.滚动更新的最大的好处是零停机,整个更新过程始终有副本在运行,从而保证了业务的连续性. 下面我们部署三副本应用,初始镜 ...
iOS学习——自动定位
最近在项目中需要做自动定位功能,就是你在参加会议通过扫描二维码签到的时候自动定位并将你的定位信息在签到中上传,这样可以避免我们进行假签到.在这个功能中,主要用到的是系统自带的定位模块,首先我们是需要配 ...
【LightOJ1259】Goldbach`s Conjecture（数论）
[LightOJ1259]Goldbach`s Conjecture(数论) 题面 Vjudge T组询问,每组询问是一个偶数n 验证哥德巴赫猜想回答n=a+b 且a,b(a<=b)是质数的方 ...
[HAOI2012]高速公路
题面在这里题意维护区间加操作+询问区间任选两不同点途中线段权值之和的期望 sol 一道假的期望题... 因为所有事件的发生概率都相同,所以答案就是所有方案的权值总和/总方案数因为区间加法自然想到 ...

利用whoosh对mongoDB的中文文档建立全文检索

利用whoosh对mongoDB的中文文档建立全文检索的更多相关文章

随机推荐

热门专题