Python操作ElasticSearch

Python批量向ElasticSearch插入数据

Python 2 的 multiprocessing 模块无法序列化(pickle)类的实例方法, 因此这里把各个步骤都写成模块级函数的形式.

直接上代码:

#!/usr/bin/python

# -*- coding:utf-8 -*-

import os

import re

import json

import time

import elasticsearch

from elasticsearch.helpers import bulk

from multiprocessing import Pool

def write_file(doc_type, action_list, error_dir="/home"):
    """Append actions that failed to index to ``<error_dir>/<doc_type>_error.json``.

    Each action is written as one JSON document per line so the file can be
    read back line by line and replayed later.

    Args:
        doc_type: Document type name; used as the file name prefix.
        action_list: Iterable of JSON-serialisable bulk actions.
        error_dir: Directory that receives the error file. Defaults to
            ``/home``, the location the script has always used.
    """
    error_path = os.path.join(error_dir, "{}_error.json".format(doc_type))
    # Append mode: several failed batches of the same type share one file.
    with open(error_path, "a") as f:
        f.writelines(json.dumps(action) + "\n" for action in action_list)

def add_one(file_path, doc_type, index):
    """Index a JSON-lines file one document per request.

    Every non-blank line of ``file_path`` is parsed as JSON and sent with a
    separate ``index`` call. A line that fails to parse or to index is
    reported on stdout and skipped, so one bad record does not stop the run.

    Args:
        file_path: Path of the file holding one JSON document per line.
        doc_type: Document type passed to Elasticsearch.
        index: Name of the target index.
    """
    # Single pre-formatted argument keeps the output identical on Python 2 and 3.
    print("%s %s" % (doc_type, index))
    es_client = elasticsearch.Elasticsearch(hosts=[{"host": "localhost", "port": "9200"}])
    with open(file_path, "r") as f:
        for line in f:
            line = line.replace("\n", "")
            if not line.strip():
                # A blank (typically trailing) line is not a failed document.
                continue
            try:
                dict_obj = json.loads(line)
                es_client.index(index=index, doc_type=doc_type, body=dict_obj)
            except Exception as e:
                # Deliberately broad: this loader is best effort per line.
                print("出错了, 错误信息: {}".format(e))

def _flush_actions(es_client, doc_type, index, action_list):
    """Send the pending actions in one bulk request, then empty the list in place."""
    if not action_list:
        return
    try:
        print("Start Bulk {}...".format(doc_type))
        bulk(es_client, action_list, index=index, raise_on_error=True)
        print("End Bulk {}...".format(doc_type))
    except Exception as e:
        # Best effort: report the failure and keep the whole batch on disk so
        # it can be inspected or replayed. Exceptions are not indexable on
        # Python 3 and may carry no args at all, so read args defensively.
        detail = e.args[0] if e.args else e
        print("出错了, Type:{}, 错误信息:{}".format(doc_type, detail))
        write_file(doc_type, action_list)
    finally:
        del action_list[:]


def add_bulk(doc_type, file_path, bulk_num, index, total=50000000):
    """Index a JSON-lines file using bulk requests of ``bulk_num`` documents.

    Lines are read one at a time, so the file is never loaded into memory as
    a whole. Blank lines are skipped; a line that is not valid JSON is
    reported on stdout and skipped. A batch that fails to index is appended
    to the per-type error file via ``write_file``.

    Args:
        doc_type: Document type set on every action.
        file_path: Path of the file holding one JSON document per line.
        bulk_num: Number of actions collected before a bulk request is sent.
        index: Name of the target index.
        total: Maximum number of input lines to read. The default of
            50,000,000 is the trial-run limit for very large files.
    """
    es_client = elasticsearch.Elasticsearch(hosts=[{"host": "localhost", "port": "9200"}])
    action_list = []
    with open(file_path, "r") as f:
        for num, line in enumerate(f):
            if num >= total:
                break
            # Drop the line terminator before parsing.
            line = line.replace("\n", "")
            if not line.strip():
                continue
            try:
                dict_obj = json.loads(line)
            except ValueError as e:
                print("出错了, 错误信息: {}".format(e))
                continue
            # NOTE: if dynamic mapping rejects documents because a field is
            # too long, index a reduced, stringified source instead, e.g.
            # {"ip": dict_obj.get("ip", "None"),
            #  "data": str(dict_obj.get("data", "None"))}.
            action_list.append({
                '_op_type': 'index',
                "_index": index,
                "_type": doc_type,
                "_source": dict_obj,
            })
            if len(action_list) >= bulk_num:
                _flush_actions(es_client, doc_type, index, action_list)
    # Send whatever is left when the line count is not a multiple of bulk_num.
    _flush_actions(es_client, doc_type, index, action_list)

def mulit_process(path, index, bulk_num, data, processes=10):
    """Bulk-load several files in parallel, one ``add_bulk`` task per file.

    Args:
        path: Data directory. Not used here; kept so existing callers work.
        index: Name of the target index.
        bulk_num: Batch size forwarded to ``add_bulk``.
        data: Iterable of dicts with the keys ``"doc_type"`` and
            ``"file_path"``.
        processes: Number of worker processes in the pool.
    """
    pool = Pool(processes)
    pending = []
    try:
        for item in data:
            doc_type = item["doc_type"]
            result = pool.apply_async(
                add_bulk, args=(doc_type, item["file_path"], bulk_num, index))
            pending.append((doc_type, result))
    finally:
        # Always shut the pool down, even if submitting a task failed.
        pool.close()
        pool.join()
    # An exception raised in a worker is only re-raised when its result is
    # fetched; without this loop a failed import would go unnoticed.
    for doc_type, result in pending:
        try:
            result.get()
        except Exception as e:
            print("出错了, Type:{}, 错误信息:{}".format(doc_type, e))

def all_info(path, skip_types=("443",)):
    """List the ``.json`` data files in ``path`` together with their doc type.

    The doc type is the part of the file name before the first underscore,
    e.g. ``80_result.json`` gives ``"80"``.

    Args:
        path: Directory to scan, with or without a trailing separator.
        skip_types: Collection of doc types to leave out of the result.

    Returns:
        A list of ``{"doc_type": ..., "file_path": ...}`` dicts in the order
        ``os.listdir`` returns the files.
    """
    data = []
    for name in os.listdir(path):
        if not name.endswith(".json"):
            continue
        doc_type = name.split("_")[0]
        if doc_type in skip_types:
            continue
        # os.path.join is correct whether or not path ends with a separator.
        data.append({"doc_type": doc_type, "file_path": os.path.join(path, name)})
    return data

def es_insert(process_func=None):
    """Load the data files into Elasticsearch using the selected strategy.

    Args:
        process_func: ``"bulk"`` bulk-loads the single file
            ``80_result.json``; ``"one"`` indexes that file one document per
            request; any other value (including the default ``None``) loads
            every discovered file through the process pool.
    """
    target_index = "test"
    # Documents per bulk request. Lower this if whole-document inserts fail
    # because of over-long fields.
    batch_size = 5000
    data_dir = "/home/data/"
    if data_dir[-1:] != "/":
        data_dir = data_dir + "/"

    # The directory is listed up front, whichever strategy is chosen.
    discovered = all_info(data_dir)
    sample_file = data_dir + "80_result.json"

    if process_func == "bulk":
        add_bulk("80", sample_file, batch_size, target_index)
        return
    if process_func == "one":
        add_one(sample_file, "80", target_index)
        return
    mulit_process(data_dir, target_index, batch_size, discovered)

if __name__ == "__main__":
    # Remember when the run began so the elapsed time can be printed at the end.
    started = time.time()
    scratch_dir = "/home/test"
    if not os.path.exists(scratch_dir):
        os.makedirs(scratch_dir)
    # Load the data with the default (multi-process) strategy.
    es_insert()
    # Elapsed wall-clock time in seconds.
    print(time.time() - started)

Python搜索ElasticSearch

示例:

#!/usr/bin/python

# -*- coding:utf-8 -*-

import json

import elasticsearch

def es_login(host="localhost", port="9200"):
    """Build an Elasticsearch client for the node at ``host``:``port``."""
    node = {"host": host, "port": port}
    return elasticsearch.Elasticsearch(hosts=[node])

def get(es_client, _id):
    """Fetch the document with id ``_id`` from the ``test`` index.

    Returns:
        The client's response serialised as a JSON string.
    """
    # A doc_type (e.g. "80") can be added to the lookup to narrow it.
    lookup = {"index": "test", "id": _id}
    document = es_client.get(**lookup)
    return json.dumps(document)

def search(es_client, query, field="_all"):
    """Run a query-string search against the ``test`` index.

    Args:
        es_client: Client object exposing ``search(index=..., body=...)``.
        query: Query-string expression to look for.
        field: Default field the query string is applied to.

    Returns:
        The client's response serialised as a JSON string.
    """
    text_clause = {
        "query_string": {
            # Field searched when the query names none.
            "default_field": field,
            # The expression being searched for.
            "query": query,
        }
    }
    bool_query = {
        "must": [text_clause, {"match_all": {}}],
        "must_not": [],
        "should": [],
    }
    request_body = {
        "query": {"bool": bool_query},
        "from": 0,
        "size": 10,
        "sort": [],
        # Aggregations are left empty; an example would be
        # {"all_interests": {"terms": {"field": "interests"}}}.
        "aggs": {},
    }
    response = es_client.search(index="test", body=request_body)
    return json.dumps(response)

def main():
    """Entry point: connect to Elasticsearch and print one document as JSON."""
    client = es_login()
    # Alternative demo: search(client, query="123.125.115.110", field="_all")
    print(get(client, "AWTv-ROzCxZ1gYRliWhu"))

# Run the lookup demo only when this file is executed as a script.
if __name__ == "__main__":

    main()

删除ElasticSearch全部数据

curl -X DELETE localhost:9200/test (其中 test 替换为自己的 index 名称; 该命令会删除整个索引及其中的全部数据)

Python操作ElasticSearch的更多相关文章

Python 操作 ElasticSearch
Python 操作 ElasticSearch 学习了:https://www.cnblogs.com/shaosks/p/7592229.html 官网:https://elasticsearch- ...
python操作elasticsearch增、删、改、查
最近接触了个新东西--es数据库这东西虽然被用的很多,但我是前些天刚刚接触的,发现其资料不多,学起来极其痛苦,写个文章记录下导入库from elasticsearch import Elastic ...
python操作Elasticsearch (一、例子)
Elasticsearch是一款分布式搜索引擎,支持在大数据环境中进行实时数据分析.它基于Apache Lucene文本搜索引擎,内部功能通过REST API暴露给外部.除了通过HTTP直接访问El ...
python实现elasticsearch操作-CRUD API
python操作elasticsearch常用API 目录目录 python操作elasticsearch常用API1.基础2.常见增删改操作创建更新删除3.查询操作查询拓展类实现es的CRUD操作 ...
python使用elasticsearch模块操作elasticsearch
1.创建索引命令如下 from elasticsearch import Elasticsearch es = Elasticsearch([{"host":"10.8 ...
java操作elasticsearch实现批量添加数据（bulk）
java操作elasticsearch实现批量添加主要使用了bulk 代码如下: //bulk批量操作(批量添加) @Test public void test7() throws IOExcepti ...
利用NEST2.0 在C#中操作Elasticsearch
前言:本文主要演示了如何通过c#来操作elasticsearch,分两个方面来演示: 索引数据搜索数据 Note: 注意我索引数据和搜索数据是两个不同的例子,没有前后依赖关系准备工作:需要在vis ...
Python 和 Elasticsearch 构建简易搜索
Python 和 Elasticsearch 构建简易搜索作者:白宁超 2019年5月24日17:22:41 导读:软件开发最大的麻烦事之一就是环境配置,操作系统设置,各种库和组件的安装.只有它们都正 ...
笔记13：Python 和 Elasticsearch 构建简易搜索
Python 和 Elasticsearch 构建简易搜索 1 ES基本介绍概念介绍 Elasticsearch是一个基于Lucene库的搜索引擎.它提供了一个分布式.支持多租户的全文搜索引擎,它可 ...

随机推荐

Blend_Effect
原文:Blend_Effect 版权声明:本文为博主原创文章,未经博主允许不得转载. https://blog.csdn.net/u010265681/article/details/76651796 ...
Symfony——如何使用Assetic实现资源管理
1. 安装和启用从Symfony 2.8开始,Assetic不再包含在Symfony Standard Edition中.在使用其任何功能之前,请在您的项目中安装执行此控制台命令的 AsseticB ...
mac 端安装JAVA开发环境
一.maven安装下载地址 https://maven.apache.org/download.cgi 下载 apache-maven-3.5.2-bin-zip 将下载的文件放在某路径下修改 ...
MEF 插件式开发 - WPF 初体验
原文:MEF 插件式开发 - WPF 初体验目录 MEF 在 WPF 中的简单应用加载插件获取元数据依赖注入总结 MEF 在 WPF 中的简单应用 MEF 的开发模式主要适用于插件化的业务场 ...
WPF DataGrid自动生成列
<Window x:Class="DataGridExam.MainWindow" xmlns="http://schemas.microsoft.c ...
delphi中WebBrowser的parent改变时变成空白问题的解决（覆盖CreateWnd和DestroyWnd）
这段时间在做一个delphi界面打开网页的功能,且此网页所在窗口可完整显示,可缩小到另一个窗口的panel上显示可是在改变网页所在窗口时,WebBrowser控件变成了空白上网google了半天, ...
Win8Metro(C#)数字图像处理--2.29图像除法运算
原文:Win8Metro(C#)数字图像处理--2.29图像除法运算 [函数名称] 图像除法函数DivisionProcess(WriteableBitmap src, WriteableBit ...
使用Newtonsoft.Json输出JSON
安装: Install-Package Newtonsoft.Json 代码: //序列化DataTable DataTable dt = new DataTable(); dt.Columns.Ad ...
基于事件驱动的DDD领域驱动设计框架分享（附源代码）
原文:基于事件驱动的DDD领域驱动设计框架分享(附源代码) 补充:现在再回过头来看这篇文章,感觉当初自己偏激了,呵呵.不过没有以前的我,怎么会有现在的我和现在的enode框架呢?发现自己进步了真好! ...
UWP实现时钟
UWP现在的开发确实很方便,不过资料真的好少啊... 前些天看到同学有实实现自定义的时钟,这东东挺简单的,就自己也写个,没成想,遇到个坑,费了好长时间,记下来一下. 效果图: 画个圆,三条线就好.XA ...

Python操作ElasticSearch

Python批量向ElasticSearch插入数据

Python搜索ElasticSearch

删除ElasticSearch全部数据

Python操作ElasticSearch的更多相关文章

随机推荐

热门专题