from ProjectUtil.usingModuleTOMODIFY import getNow

export_q_f, q_l, start_ = '/mnt/mongoexport/superpub-ask-question.csv', [], getNow()

def save_(q_l):
export_q_f = '/mnt/mongoexport/superpub-ask-question-cleaned-NUM-{}.txt'.format(len(q_l))
with open(export_q_f, 'w', encoding='utf-8') as fw:
s = '\n'.join(q_l)
fw.write(s) step = 500000
with open(export_q_f, 'r', encoding='utf-8') as fr:
c = 0
for i in fr:
question = i.rstrip('\n')
c += 1
q_l.append(question)
if c % step == 0:
q_l = [i for i in set(q_l)]
print(step, ':', c / step, 'start_', start_, 'now', getNow(), 'DistinctNum', len(q_l))
save_(q_l)

  

500000 : 1.0 start_ 2018-11-29 09:28:04 now 2018-11-29 09:28:05 DistinctNum 270513
500000 : 2.0 start_ 2018-11-29 09:28:04 now 2018-11-29 09:28:07 DistinctNum 539468
500000 : 3.0 start_ 2018-11-29 09:28:04 now 2018-11-29 09:28:09 DistinctNum 804547
500000 : 4.0 start_ 2018-11-29 09:28:04 now 2018-11-29 09:28:10 DistinctNum 1073529
500000 : 5.0 start_ 2018-11-29 09:28:04 now 2018-11-29 09:28:14 DistinctNum 1342413
500000 : 6.0 start_ 2018-11-29 09:28:04 now 2018-11-29 09:28:17 DistinctNum 1616368
500000 : 7.0 start_ 2018-11-29 09:28:04 now 2018-11-29 09:28:20 DistinctNum 1888643
500000 : 8.0 start_ 2018-11-29 09:28:04 now 2018-11-29 09:28:24 DistinctNum 2159613
500000 : 9.0 start_ 2018-11-29 09:28:04 now 2018-11-29 09:28:28 DistinctNum 2433085
500000 : 10.0 start_ 2018-11-29 09:28:04 now 2018-11-29 09:28:32 DistinctNum 2705454
500000 : 11.0 start_ 2018-11-29 09:28:04 now 2018-11-29 09:28:37 DistinctNum 2978046
500000 : 12.0 start_ 2018-11-29 09:28:04 now 2018-11-29 09:28:43 DistinctNum 3244211
500000 : 13.0 start_ 2018-11-29 09:28:04 now 2018-11-29 09:28:50 DistinctNum 3512526
500000 : 14.0 start_ 2018-11-29 09:28:04 now 2018-11-29 09:28:56 DistinctNum 3782082
500000 : 15.0 start_ 2018-11-29 09:28:04 now 2018-11-29 09:29:02 DistinctNum 4054694
500000 : 16.0 start_ 2018-11-29 09:28:04 now 2018-11-29 09:29:09 DistinctNum 4325960
500000 : 17.0 start_ 2018-11-29 09:28:04 now 2018-11-29 09:29:19 DistinctNum 4595687
500000 : 18.0 start_ 2018-11-29 09:28:04 now 2018-11-29 09:29:26 DistinctNum 4870389
500000 : 19.0 start_ 2018-11-29 09:28:04 now 2018-11-29 09:29:35 DistinctNum 5144203
500000 : 20.0 start_ 2018-11-29 09:28:04 now 2018-11-29 09:29:46 DistinctNum 5416514
500000 : 21.0 start_ 2018-11-29 09:28:04 now 2018-11-29 09:29:56 DistinctNum 5687541
500000 : 22.0 start_ 2018-11-29 09:28:04 now 2018-11-29 09:30:08 DistinctNum 5959566
500000 : 23.0 start_ 2018-11-29 09:28:04 now 2018-11-29 09:30:19 DistinctNum 6235717
500000 : 24.0 start_ 2018-11-29 09:28:04 now 2018-11-29 09:30:31 DistinctNum 6508576
500000 : 25.0 start_ 2018-11-29 09:28:04 now 2018-11-29 09:30:43 DistinctNum 6784810
500000 : 26.0 start_ 2018-11-29 09:28:04 now 2018-11-29 09:30:57 DistinctNum 7057572
500000 : 27.0 start_ 2018-11-29 09:28:04 now 2018-11-29 09:31:10 DistinctNum 7327870
500000 : 28.0 start_ 2018-11-29 09:28:04 now 2018-11-29 09:31:26 DistinctNum 7600230
500000 : 29.0 start_ 2018-11-29 09:28:04 now 2018-11-29 09:31:41 DistinctNum 7874540
500000 : 30.0 start_ 2018-11-29 09:28:04 now 2018-11-29 09:31:58 DistinctNum 8148841
500000 : 31.0 start_ 2018-11-29 09:28:04 now 2018-11-29 09:32:13 DistinctNum 8421791
500000 : 32.0 start_ 2018-11-29 09:28:04 now 2018-11-29 09:32:33 DistinctNum 8695611
500000 : 33.0 start_ 2018-11-29 09:28:04 now 2018-11-29 09:32:48 DistinctNum 8968033

  

500000 : 103.0 start_ 2018-11-29 09:28:04 now 2018-11-29 10:19:54 DistinctNum 28080404
500000 : 104.0 start_ 2018-11-29 09:28:04 now 2018-11-29 10:20:56 DistinctNum 28349367
500000 : 105.0 start_ 2018-11-29 09:28:04 now 2018-11-29 10:22:03 DistinctNum 28618117
500000 : 106.0 start_ 2018-11-29 09:28:04 now 2018-11-29 10:23:07 DistinctNum 28886698
500000 : 107.0 start_ 2018-11-29 09:28:04 now 2018-11-29 10:24:11 DistinctNum 29157115
Traceback (most recent call last):
File "distinctMongoExportQuestion.py", line 23, in <module>
save_(q_l)
File "distinctMongoExportQuestion.py", line 10, in save_
fw.write(s)
MemoryError
[root@e selfPlatformAskAnswerProjeect]# ll -ash /mnt/mongoexport/
total 12G
12K drwxr-xr-x 2 root root 12K Nov 29 10:24 .
4.0K drwxr-xr-x 8 root root 4.0K Nov 26 10:01 ..
1.4G -rw-r--r-- 1 root root 1.4G Nov 29 10:16 superpub-ask-question-cleaned-NUM-27004655.txt
1.4G -rw-r--r-- 1 root root 1.4G Nov 29 10:17 superpub-ask-question-cleaned-NUM-27272026.txt
1.5G -rw-r--r-- 1 root root 1.5G Nov 29 10:18 superpub-ask-question-cleaned-NUM-27537864.txt
1.5G -rw-r--r-- 1 root root 1.5G Nov 29 10:19 superpub-ask-question-cleaned-NUM-27809291.txt
1.5G -rw-r--r-- 1 root root 1.5G Nov 29 10:20 superpub-ask-question-cleaned-NUM-28080404.txt
1.5G -rw-r--r-- 1 root root 1.5G Nov 29 10:21 superpub-ask-question-cleaned-NUM-28349367.txt
1.5G -rw-r--r-- 1 root root 1.5G Nov 29 10:22 superpub-ask-question-cleaned-NUM-28618117.txt
1.5G -rw-r--r-- 1 root root 1.5G Nov 29 10:23 superpub-ask-question-cleaned-NUM-28886698.txt
0 -rw-r--r-- 1 root root 0 Nov 29 10:24 superpub-ask-question-cleaned-NUM-29157115.txt
[root@e selfPlatformAskAnswerProjeect]#

  

修改代码

from ProjectUtil.usingModuleTOMODIFY import getNow

export_q_f, q_l, start_ = '/data/bigdata/mongoexport/superpub-ask-question.csv', [], getNow()

step = 500000
with open(export_q_f, 'r', encoding='utf-8') as fr:
c = 0
for i in fr:
question = i.rstrip('\n')
c += 1
q_l.append(question)
if c % step == 0:
q_l = [i for i in set(q_l)]
print(step, ':', c / step, 'start_', start_, 'now', getNow(), 'DistinctNum', len(q_l))
export_q_f = '/data/bigdata/mongoexport/superpub-ask-question-cleaned-NUM-{}.txt'.format(len(q_l))
with open(export_q_f, 'w', encoding='utf-8') as fw:
s = '\n'.join(q_l)
fw.write(s)

  更换主机为16G--->32G(开启进程前,内存消耗约5G)

6核--->同规格8核(之前cpu消耗情况未统计)

cat /proc/cpuinfo

processor : 7
vendor_id : GenuineIntel
cpu family : 6
model : 63
model name : Intel(R) Xeon(R) CPU E5-2680 v3 @ 2.50GHz
stepping : 2
microcode : 0x1
cpu MHz : 2494.224
cache size : 30720 KB
physical id : 0
siblings : 8
core id : 3
cpu cores : 4
apicid : 7
initial apicid : 7
fpu : yes
fpu_exception : yes
cpuid level : 13
wp : yes
flags : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ss ht syscall nx pdpe1gb rdtscp lm constant_tsc rep_good nopl eagerfpu pni pclmulqdq ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt tsc_deadline_timer aes xsave avx f16c rdrand hypervisor lahf_lm abm xsaveopt fsgsbase bmi1 avx2 smep bmi2 erms invpcid
bogomips : 4988.44
clflush size : 64
cache_alignment : 64
address sizes : 46 bits physical, 48 bits virtual
power management:

500000 : 96.0 start_ 2018-11-29 16:07:50 now 2018-11-29 16:40:37 DistinctNum 26197155
500000 : 97.0 start_ 2018-11-29 16:07:50 now 2018-11-29 16:41:19 DistinctNum 26466813
500000 : 98.0 start_ 2018-11-29 16:07:50 now 2018-11-29 16:42:03 DistinctNum 26737397
500000 : 99.0 start_ 2018-11-29 16:07:50 now 2018-11-29 16:42:45 DistinctNum 27005103
500000 : 100.0 start_ 2018-11-29 16:07:50 now 2018-11-29 16:43:28 DistinctNum 27272487
500000 : 101.0 start_ 2018-11-29 16:07:50 now 2018-11-29 16:44:10 DistinctNum 27538331
500000 : 102.0 start_ 2018-11-29 16:07:50 now 2018-11-29 16:44:55 DistinctNum 27809771
500000 : 103.0 start_ 2018-11-29 16:07:50 now 2018-11-29 16:45:38 DistinctNum 28080901
500000 : 104.0 start_ 2018-11-29 16:07:50 now 2018-11-29 16:46:24 DistinctNum 28349871
500000 : 105.0 start_ 2018-11-29 16:07:50 now 2018-11-29 16:47:10 DistinctNum 28618630
500000 : 106.0 start_ 2018-11-29 16:07:50 now 2018-11-29 16:47:56 DistinctNum 28887233
500000 : 107.0 start_ 2018-11-29 16:07:50 now 2018-11-29 16:48:43 DistinctNum 29157679
500000 : 108.0 start_ 2018-11-29 16:07:50 now 2018-11-29 16:49:33 DistinctNum 29420209
500000 : 109.0 start_ 2018-11-29 16:07:50 now 2018-11-29 16:50:21 DistinctNum 29675048
500000 : 110.0 start_ 2018-11-29 16:07:50 now 2018-11-29 16:51:10 DistinctNum 29934499
500000 : 111.0 start_ 2018-11-29 16:07:50 now 2018-11-29 16:51:59 DistinctNum 30193756
500000 : 112.0 start_ 2018-11-29 16:07:50 now 2018-11-29 16:52:50 DistinctNum 30453618
500000 : 113.0 start_ 2018-11-29 16:07:50 now 2018-11-29 16:53:40 DistinctNum 30712426
500000 : 114.0 start_ 2018-11-29 16:07:50 now 2018-11-29 16:54:31 DistinctNum 30972908
500000 : 115.0 start_ 2018-11-29 16:07:50 now 2018-11-29 16:55:25 DistinctNum 31234766
500000 : 116.0 start_ 2018-11-29 16:07:50 now 2018-11-29 16:56:18 DistinctNum 31495613
500000 : 117.0 start_ 2018-11-29 16:07:50 now 2018-11-29 16:57:13 DistinctNum 31756776

  

500000 : 152.0 start_ 2018-11-29 16:07:50 now 2018-11-29 17:35:05 DistinctNum 40981071
500000 : 153.0 start_ 2018-11-29 16:07:50 now 2018-11-29 17:36:20 DistinctNum 41243684
500000 : 154.0 start_ 2018-11-29 16:07:50 now 2018-11-29 17:37:40 DistinctNum 41511378
500000 : 155.0 start_ 2018-11-29 16:07:50 now 2018-11-29 17:38:57 DistinctNum 41777831
500000 : 156.0 start_ 2018-11-29 16:07:50 now 2018-11-29 17:40:16 DistinctNum 42043333
500000 : 157.0 start_ 2018-11-29 16:07:50 now 2018-11-29 17:41:33 DistinctNum 42308552
500000 : 158.0 start_ 2018-11-29 16:07:50 now 2018-11-29 17:42:49 DistinctNum 42568225
500000 : 159.0 start_ 2018-11-29 16:07:50 now 2018-11-29 17:44:06 DistinctNum 42818269
500000 : 160.0 start_ 2018-11-29 16:07:50 now 2018-11-29 17:45:24 DistinctNum 43069718
500000 : 161.0 start_ 2018-11-29 16:07:50 now 2018-11-29 17:46:42 DistinctNum 43322396
500000 : 162.0 start_ 2018-11-29 16:07:50 now 2018-11-29 17:48:06 DistinctNum 43573573
500000 : 163.0 start_ 2018-11-29 16:07:50 now 2018-11-29 17:49:23 DistinctNum 43826414
500000 : 164.0 start_ 2018-11-29 16:07:50 now 2018-11-29 17:50:42 DistinctNum 44079373
500000 : 165.0 start_ 2018-11-29 16:07:50 now 2018-11-29 17:52:01 DistinctNum 44335042
500000 : 166.0 start_ 2018-11-29 16:07:50 now 2018-11-29 17:53:22 DistinctNum 44593450
500000 : 167.0 start_ 2018-11-29 16:07:50 now 2018-11-29 17:54:46 DistinctNum 44854064
500000 : 168.0 start_ 2018-11-29 16:07:50 now 2018-11-29 17:56:11 DistinctNum 45115737
500000 : 169.0 start_ 2018-11-29 16:07:50 now 2018-11-29 17:57:36 DistinctNum 45378583
500000 : 170.0 start_ 2018-11-29 16:07:50 now 2018-11-29 17:58:58 DistinctNum 45638980
500000 : 171.0 start_ 2018-11-29 16:07:50 now 2018-11-29 18:00:23 DistinctNum 45902750
500000 : 172.0 start_ 2018-11-29 16:07:50 now 2018-11-29 18:01:46 DistinctNum 46163054
500000 : 173.0 start_ 2018-11-29 16:07:50 now 2018-11-29 18:03:12 DistinctNum 46212601
500000 : 174.0 start_ 2018-11-29 16:07:50 now 2018-11-29 18:04:37 DistinctNum 46240277
500000 : 175.0 start_ 2018-11-29 16:07:50 now 2018-11-29 18:06:02 DistinctNum 46269660
500000 : 176.0 start_ 2018-11-29 16:07:50 now 2018-11-29 18:07:28 DistinctNum 46317443
500000 : 177.0 start_ 2018-11-29 16:07:50 now 2018-11-29 18:08:54 DistinctNum 46492828
Traceback (most recent call last):
File "distinctMongoExportQuestion.py", line 17, in <module>
s = '\n'.join(q_l)
MemoryError

  

磁盘监控程序

import os, time

while True:
s = 'find ./* -mmin +3 | grep txt | xargs rm -f'
print(s)
os.system(s)
time.sleep(120)

  

但是 没有对监控的监控,导致,数据都被删除了。。。。。

数据去重优化 MemoryError 内存不足的更多相关文章

  1. 转 iOS和android游戏纹理优化和内存优化(cocos2d-x)

    iOS和android游戏纹理优化和内存优化(cocos2d-x) (未完成) 1.2d游戏最占内存的无疑是图片资源. 2.cocos2d-x不同平台读取纹理的机制不同.ios下面使用CGImage, ...

  2. 老李分享:Android性能优化之内存泄漏1

    老李分享:Android性能优化之内存泄漏   前言 对于内存泄漏,我想大家在开发中肯定都遇到过,只不过内存泄漏对我们来说并不是可见的,因为它是在堆中活动,而要想检测程序中是否有内存泄漏的产生,通常我 ...

  3. Spring+SpringMVC+MyBatis+easyUI整合优化篇(十三)数据层优化-表规范、索引优化

    本文提要 最近写的几篇文章都是关于数据层优化方面的,这几天也在想还有哪些地方可以优化改进,结合日志和项目代码发现,关于数据层的优化,还是有几个方面可以继续修改的,代码方面,整合了druid数据源也开启 ...

  4. jvm性能优化及内存分区

     jvm性能优化及内存分区 2012-09-17 15:51:37 分类: Java Some of the default values for Sun JVMs are listed below. ...

  5. mssql sqlserver 三种数据表数据去重方法分享

    摘要: 下文将分享三种不同的数据去重方法数据去重:需根据某一字段来界定,当此字段出现大于一行记录时,我们就界定为此行数据存在重复. 数据去重方法1: 当表中最在最大流水号时候,我们可以通过关联的方式为 ...

  6. Android 性能优化之内存泄漏检测以及内存优化(中)

    https://blog.csdn.net/self_study/article/details/66969064 上篇博客我们写到了 Java/Android 内存的分配以及相关 GC 的详细分析, ...

  7. 微擎开启性能优化里面的性能优化memcache内存优化及数据库读写分离

    http://www.mitusky.com/forum.php?mod=viewthread&tid=3135 [微擎 安装使用] 微擎开启性能优化里面的性能优化memcache内存优化及数 ...

  8. Kafka丢失数据问题优化总结

    数据丢失是一件非常严重的事情事,针对数据丢失的问题我们需要有明确的思路来确定问题所在,针对这段时间的总结,我个人面对kafka 数据丢失问题的解决思路如下: 是否真正的存在数据丢失问题,比如有很多时候 ...

  9. Redis数据存储优化机制(转)

    原文:Redis学习笔记4--Redis数据存储优化机制 1.zipmap优化hash: 前面谈到将一个对象存储在hash类型中会占用更少的内存,并且可以更方便的存取整个对象.省内存的原因是新建一个h ...

随机推荐

  1. 如何在ChemDraw中绘制分子立体结构

    ChemDraw是当前最常用的的化学结构绘图软件,软件功能包括化学作图.分子模型生成.化学数据库信息管理等,可以说是化学家和生物学家所需要最终极的化学结构绘图工具.本教程主要介绍ChemDraw绘制分 ...

  2. Visual Studio 2013 离线版msdn下载和安装

    Visual Studio 2013出来后,并没有自带msdn安装包,而变成了在线安装msdn,好处是msdn可以随时进行更新,坏处是难道以后每次重新安装系统,都需要重新下载吗,如何解决这个问题呢?本 ...

  3. angular学习(十五)——Provider

    转载请写明来源地址:http://blog.csdn.net/lastsweetop/article/details/60966263 Provider简单介绍 每一个web应用都是由多个对象协作完毕 ...

  4. 用cocos2d 2.1制作一个过河小游戏(4): 游戏主逻辑BaseLayer设计

    前段时间一直在忙.没有时间更新博客.今天还是抽点时间把最后一小部分游戏的实现放上来吧. BaseLayer.h: #import <GameKit/GameKit.h> #import & ...

  5. Serlvet学习笔记之一 ——实现servlet的3种方法

    1.配置环境,从tomcat的lib下面引入servlet-api.jar包.

  6. Serlvet学习笔记之四—对文件的操作

    1.读文件 package com.demo; import java.io.BufferedReader; import java.io.FileReader; import java.io.Pri ...

  7. js遍历商品编码

    function bathAuditGoods(state) { var selections = $("#deliveryGrid").datagrid('getSelectio ...

  8. 检测你的php代码执行效率

    在写程序的时候,经常会为是改用empty()还是isset好,或是用单引号还是双引号来显示连接字符串而发出疑问,现在好了.我们其实可以通过程序很科学的得出精确的答案.知道我们的程序到底怎样写效率会更好 ...

  9. php遍历文件夹下的所有文件及文件夹

    //第一种 遍历放入数据中 function my_scandir($dir) { $files = array(); if ( $handle = opendir($dir) ) { while ( ...

  10. react实现的点击拖拽元素效果

    之前用vue做日程管理组件的时候,用到了点击拖拽的效果,即点击元素,鼠标移动到哪里,元素移动到哪里,鼠标松开,拖拽停止,现在在弄react,于是也在想实现这个效果,经过一番折腾,效果出来了,代码如下: ...