1、

对.bz2 后缀文件 跳过不处理

2、逐行同字段的json文件,压缩后大小为原文件的12.81%

测试文件可近似认为是逐行的 JSON 文本数据;没有进行多文件重复测试,也没有统计耗时;

{"uid":50013896,"uuid":"f32feacf-5f83-4866-8dfe-41bff794b8d4","ip":"666298884","site":0,"source":0,"address":"http:\/\/www.ijntv.cn\/inews\/55821723.html","engine":0,"referer":"","keyword":"","browser":11,"language":0,"screen_color":34,"screen_size":0,"system":14,"platform":61,"operator":2,"country":1,"province":0,"city":0,"flash":"0","java":"0","request_time":1547395198,"create_date":"2019-01-13 23:59:58"}
{"uid":50015357,"uuid":"388b3676-8835-49b4-827b-5c1f3ddf6bc8","ip":"1973056862","site":0,"source":0,"address":"http:\/\/www.ijntv.cn\/inews\/55218551.html","engine":0,"referer":"","keyword":"","browser":11,"language":0,"screen_color":34,"screen_size":0,"system":14,"platform":61,"operator":2,"country":1,"province":0,"city":0,"flash":"0","java":"0","request_time":1547395198,"create_date":"2019-01-13 23:59:58"}
{"uid":50016991,"uuid":"dbd44846-4b4a-4b26-aad2-8a70a7a31c74","ip":"2004569145","site":0,"source":0,"address":"http:\/\/www.ijntv.cn\/inews\/VVZv_q1-Hpas_pCYVW1sfg.html","engine":0,"referer":"","keyword":"","browser":11,"language":0,"screen_color":34,"screen_size":0,"system":14,"platform":61,"operator":1,"country":1,"province":6,"city":77,"flash":"0","java":"0","request_time":1547395198,"create_date":"2019-01-13 23:59:58"}
{"uid":50001228,"uuid":"1b4908cd-1306-40e7-bd4e-df0372bcc749","ip":"3740751066","site":0,"source":0,"address":"http:\/\/www.ijntv.cn\/inews\/CKHzIMoRfJUYOkAwNZTfMg.html","engine":0,"referer":"","keyword":"","browser":11,"language":0,"screen_color":34,"screen_size":0,"system":14,"platform":61,"operator":1,"country":1,"province":14,"city":197,"flash":"0","java":"0","request_time":1547395199,"create_date":"2019-01-13 23:59:59"}
{"ad_slots_id":1002,"uuid":"a369a303-1d70-49eb-9e73-7a2a8f028626","industry_pid":0,"industry_id":0,"ip":"1700604567","site":72,"address":"https:\/\/info.b2b168.com\/s168-47325051.html","create_date":"2019-01-13 23:59:59","ad_id":"50012715","uid":"50012715","keyword":"\u8bbe\u5907","pageinfo":""}
{"ad_slots_id":1002,"uuid":"a369a303-1d70-49eb-9e73-7a2a8f028626","industry_pid":0,"industry_id":0,"ip":"1700604567","site":72,"address":"https:\/\/info.b2b168.com\/s168-47325051.html","create_date":"2019-01-13 23:59:59","ad_id":"50015314","uid":"50015314","keyword":"\u5b81\u6ce2\u6536\u94f6\u8f6f\u4ef6","pageinfo":""}
{"ad_slots_id":1001,"uuid":"5eb7efec-9eb1-4493-9739-e466035606b4","industry_pid":0,"industry_id":0,"ip":"2029060375","site":70,"address":"http:\/\/www.jqw.com\/Businfo\/1688002049073.htm","create_date":"2019-01-13 23:59:59","ad_id":"50020536","uid":"50020536","keyword":"Sup\u53e3\u7ea2\u8272\u53f7\u63a8\u8350","pageinfo":"\u7545\u9500\u7684\u56fe\u96c6\u53f7\u8fbd2011J606\u63a8\u8350 |\u8ba2\u8d2d\u56fe\u96c6\u53f7\u8fbd2011J606_\u4f9b\u6c42\u5546\u673a_\u91d1\u6cc9\u7f51#|^#|^http:\/\/www.jqw.com\/Businfo\/1688002049073.htm"}
{"ad_slots_id":1001,"uuid":"5eb7efec-9eb1-4493-9739-e466035606b4","industry_pid":0,"industry_id":0,"ip":"2029060375","site":70,"address":"http:\/\/www.jqw.com\/Businfo\/1688002049073.htm","create_date":"2019-01-13 23:59:59","ad_id":"34064333","uid":"34064333","keyword":"\u8f6f\u4ef6\u8ba2\u5236","pageinfo":""}
{"ad_slots_id":1001,"uuid":"5eb7efec-9eb1-4493-9739-e466035606b4","industry_pid":0,"industry_id":0,"ip":"2029060375","site":70,"address":"http:\/\/www.jqw.com\/Businfo\/1688002049073.htm","create_date":"2019-01-13 23:59:59","ad_id":"50014483","uid":"50014483","keyword":"\u5c71\u6cc9\u6c34\u6279\u53d1","pageinfo":""}
{"ad_slots_id":1001,"uuid":"5eb7efec-9eb1-4493-9739-e466035606b4","industry_pid":0,"industry_id":0,"ip":"2029060375","site":70,"address":"http:\/\/www.jqw.com\/Businfo\/1688002049073.htm","create_date":"2019-01-13 23:59:59","ad_id":"34022975","uid":"34022975","keyword":"\u718a\u638c\u53f7","pageinfo":""}

  

  

137M -rw-r--r-- 1 root root 137M Jan 10 11:45 visit-2019-01-10
20M -rw-r--r-- 1 root root 20M Jan 10 11:48 visit-2019-01-10.zip

bzip2  visit-2019-01-10

16M -rw-r--r-- 1 root root 16M Jan 10 11:45 visit-2019-01-10.bz2
20M -rw-r--r-- 1 root root 20M Jan 10 11:48 visit-2019-01-10.zip

默认情况下,bzip2 theFile 会删除原文件,结果文件命名为 theFile.bz2(如需保留原文件,使用 -k)

压缩后的体积为zip的0.8

bzip2 -9 visit-2019-01-03-u

890M -rw-r--r-- 1 root root 890M Jan 10 11:59 visit-2019-01-03-u
65M -rw-r--r-- 1 root root 65M Jan 10 11:59 visit-2019-01-03-u.bz2
87M -rw-r--r-- 1 root root 87M Jan 10 12:00 visit-2019-01-03-u.zip

压缩后的体积为zip的0.7475,为原始文件的0.0730

用压缩后的文件覆盖原文件

import sys, glob, os

targetDir, passFeature = sys.argv[1], sys.argv[2]
file_feature = '*-*-*'
targetGlob = targetDir + file_feature
LocalFiles = glob.glob(targetGlob)
for i in LocalFiles:
if passFeature in i:
continue
cmd = 'cd {};bzip2 -9 {}'.format(targetDir, i)
print(cmd)
os.system(cmd)

[root@a data]# tree testBiz2Py/
testBiz2Py/
├── 2-23-3
├── 2-23-a
├── 2-23-b
├── a
└── b

0 directories, 5 files
[root@a data]# python bzip2Action/biz2SaveCost.py /data/testBiz2Py/ b
cd /data/testBiz2Py/;bzip2 -9 /data/testBiz2Py/2-23-a
cd /data/testBiz2Py/;bzip2 -9 /data/testBiz2Py/2-23-3
[root@a data]# tree testBiz2Py/
testBiz2Py/
├── 2-23-3.bz2
├── 2-23-a.bz2
├── 2-23-b
├── a
└── b

0 directories, 5 files

cd /data;du --max-depth=2 -h ./;python bzip2Action/biz2SaveCost.py /data/visitlog/ 2019-01

压缩前

[root@a data]# du --max-depth=2 -h ./
141G ./unionlog
8.0K ./bzip2Action
21G ./visitlog
169G ./
[root@a data]# tree visitlog/
visitlog/
├── visit-2018-09-18
├── visit-2018-09-19
├── visit-2018-09-20

[root@b ~]# cd /data;du --max-depth=2 -h ./;python bzip2Action/biz2SaveCost.py /data/visitlog/ 2019-01

19G ./visitlog
104G ./unionlog

1.1T ./
cd /data/visitlog/;bzip2 -9 /data/visitlog/visit-2018-09-19
cd /data/visitlog/;bzip2 -9 /data/visitlog/visit-2018-09-25

[root@c ~]# cd /data;du --max-depth=2 -h ./;python bzip2Action/biz2SaveCost.py /data/visitlog/ 2019-01

21G ./visitlog
141G ./unionlog

940G ./
cd /data/visitlog/;bzip2 -9 /data/visitlog/visit-2018-11-24
cd /data/visitlog/;bzip2 -9 /data/visitlog/visit-2018-11-01
cd /data/visitlog/;bzip2 -9 /data/visitlog/visit-2018-11-19
cd /data/visitlog/;bzip2 -9 /data/visitlog/visit-2018-10-22

统计压缩速度

单个文件的平均速度

总数据量的平均速度

注意增加计算压缩率的功能代码

# -*- coding: utf-8 -*-

import sys, glob, os, time
import random targetDir, passFeature = sys.argv[1], sys.argv[2]
file_feature = '*-*-*'
targetGlob = targetDir + file_feature
LocalFiles = glob.glob(targetGlob)
allMB, allSeconds, singleSeconds = 0, 0, []
for i in LocalFiles:
if passFeature in i:
continue # 进入原文件目录,压缩后覆盖原文件
cmd = 'cd {};bzip2 -9 {}'.format(targetDir, i) # 研究压缩速度 fileMB = os.stat(i).st_size / 1024 / 1024
t_start = time.time()
print(cmd)
# os.system(cmd)
t = random.random()*10
time.sleep(t)
t_end = time.time()
fileSeconds = t_end - t_start
allMB += fileMB
allSeconds += fileSeconds
singleSeconds.append(fileMB / fileSeconds) # 按照速度大小由小到大排序
singleSeconds = list(sorted(singleSeconds, reverse=True))
singleSeconds = sorted(singleSeconds)
print('averageSpeed(MB/s):', allMB / allSeconds)
print('singleSeconds(MB/s):', singleSeconds) 压缩:主要消耗cpu,计算密集型

压缩后

[root@b data]# cd /data;du --max-depth=2 -h ./;

8.0K ./bzip2Action
4.6G ./visitlog
104G ./unionlog

1016G ./
[root@b data]#

压缩前后比值19G:4.6G =1: 0.2421052631578947,

a节点
4.9G    ./visitlog 21G:4.9G = 1:0.2333

c节点

4.9G ./visitlog

同a节点

[root@a data]# tree visitlog/ -h
visitlog/
├── [6.2M] visit-2018-09-18.bz2
├── [8.4M] visit-2018-09-19.bz2
├── [8.3M] visit-2018-09-20.bz2
├── [8.8M] visit-2018-09-21.bz2
├── [8.7M] visit-2018-09-22.bz2
├── [7.5M] visit-2018-09-23.bz2
├── [7.4M] visit-2018-09-24.bz2
├── [8.8M] visit-2018-09-25.bz2
├── [9.3M] visit-2018-09-26.bz2
├── [9.6M] visit-2018-09-27.bz2
├── [ 12M] visit-2018-09-28.bz2
├── [ 15M] visit-2018-09-29.bz2
├── [ 15M] visit-2018-09-30.bz2
├── [ 13M] visit-2018-10-01.bz2
├── [ 13M] visit-2018-10-02.bz2
├── [ 14M] visit-2018-10-03.bz2
├── [ 14M] visit-2018-10-04.bz2
├── [ 15M] visit-2018-10-05.bz2
├── [ 15M] visit-2018-10-06.bz2
├── [ 15M] visit-2018-10-07.bz2
├── [ 17M] visit-2018-10-08.bz2
├── [ 16M] visit-2018-10-09.bz2
├── [ 17M] visit-2018-10-10.bz2
├── [ 15M] visit-2018-10-11.bz2
├── [ 16M] visit-2018-10-12.bz2
├── [ 16M] visit-2018-10-13.bz2
├── [ 23M] visit-2018-10-14.bz2
├── [ 28M] visit-2018-10-15.bz2
├── [ 25M] visit-2018-10-16.bz2
├── [ 21M] visit-2018-10-17.bz2
├── [ 23M] visit-2018-10-18.bz2
├── [ 21M] visit-2018-10-19.bz2
├── [ 21M] visit-2018-10-20.bz2
├── [ 24M] visit-2018-10-21.bz2
├── [ 18M] visit-2018-10-22.bz2
├── [ 20M] visit-2018-10-23.bz2
├── [ 20M] visit-2018-10-24.bz2
├── [ 20M] visit-2018-10-25.bz2
├── [ 21M] visit-2018-10-26.bz2
├── [ 20M] visit-2018-10-27.bz2
├── [ 18M] visit-2018-10-28.bz2
├── [ 21M] visit-2018-10-29.bz2
├── [ 22M] visit-2018-10-30.bz2
├── [ 21M] visit-2018-10-31.bz2
├── [ 22M] visit-2018-11-01.bz2
├── [ 21M] visit-2018-11-02.bz2
├── [9.8M] visit-2018-11-03.bz2
├── [7.6M] visit-2018-11-04.bz2
├── [9.7M] visit-2018-11-05.bz2
├── [9.6M] visit-2018-11-06.bz2
├── [9.5M] visit-2018-11-07.bz2
├── [ 19M] visit-2018-11-08.bz2
├── [ 12M] visit-2018-11-09.bz2
├── [ 12M] visit-2018-11-10.bz2
├── [ 11M] visit-2018-11-11.bz2
├── [ 13M] visit-2018-11-12.bz2
├── [ 14M] visit-2018-11-13.bz2
├── [ 16M] visit-2018-11-14.bz2
├── [ 16M] visit-2018-11-15.bz2
├── [ 15M] visit-2018-11-16.bz2
├── [ 15M] visit-2018-11-17.bz2
├── [ 17M] visit-2018-11-18.bz2
├── [ 18M] visit-2018-11-19.bz2
├── [ 16M] visit-2018-11-20.bz2
├── [ 20M] visit-2018-11-21.bz2
├── [ 22M] visit-2018-11-22.bz2
├── [ 13M] visit-2018-11-23.bz2
├── [ 11M] visit-2018-11-24.bz2
├── [ 11M] visit-2018-11-25.bz2
├── [ 11M] visit-2018-11-26.bz2
├── [9.7M] visit-2018-11-27.bz2
├── [8.0M] visit-2018-11-28.bz2
├── [ 12M] visit-2018-11-29.bz2
├── [ 15M] visit-2018-11-30.bz2
├── [ 15M] visit-2018-12-01.bz2
├── [ 16M] visit-2018-12-02.bz2
├── [ 20M] visit-2018-12-03.bz2
├── [ 21M] visit-2018-12-04.bz2
├── [ 23M] visit-2018-12-05.bz2
├── [ 25M] visit-2018-12-06.bz2
├── [ 32M] visit-2018-12-07.bz2
├── [ 36M] visit-2018-12-08.bz2
├── [ 35M] visit-2018-12-09.bz2
├── [ 37M] visit-2018-12-10.bz2
├── [ 38M] visit-2018-12-11.bz2
├── [ 35M] visit-2018-12-12.bz2
├── [ 35M] visit-2018-12-13.bz2
├── [ 30M] visit-2018-12-14.bz2
├── [ 32M] visit-2018-12-15.bz2
├── [ 31M] visit-2018-12-16.bz2
├── [ 39M] visit-2018-12-17.bz2
├── [ 39M] visit-2018-12-18.bz2
├── [ 38M] visit-2018-12-19.bz2
├── [ 29M] visit-2018-12-20.bz2
├── [ 43M] visit-2018-12-21.bz2
├── [ 37M] visit-2018-12-22.bz2
├── [ 35M] visit-2018-12-23.bz2
├── [ 38M] visit-2018-12-24.bz2
├── [ 38M] visit-2018-12-25.bz2
├── [ 36M] visit-2018-12-26.bz2
├── [ 38M] visit-2018-12-27.bz2
├── [ 38M] visit-2018-12-28.bz2
├── [ 37M] visit-2018-12-29.bz2
├── [ 30M] visit-2018-12-30.bz2
├── [ 35M] visit-2018-12-31.bz2
├── [296M] visit-2019-01-01
├── [345M] visit-2019-01-02
├── [397M] visit-2019-01-03
├── [331M] visit-2019-01-04
├── [300M] visit-2019-01-05
├── [312M] visit-2019-01-06
├── [311M] visit-2019-01-07
├── [154M] visit-2019-01-08
├── [173M] visit-2019-01-09
└── [176M] visit-2019-01-10

0 directories, 115 files
[root@a data]#

[root@a tmp]# ll -ash
total 32K
4.0K drwxr-xr-x 2 root root 4.0K Jan 11 14:22 .
4.0K drwxr-xr-x 17 root root 4.0K Jan 10 16:51 ..
24K -rw-r--r-- 1 root root 21K Jan 11 14:22 a
[root@a tmp]# bzip2 -9 a
[root@a tmp]# ll -as
total 12
4 drwxr-xr-x 2 root root 4096 Jan 11 14:22 .
4 drwxr-xr-x 17 root root 4096 Jan 10 16:51 ..
4 -rw-r--r-- 1 root root 1036 Jan 11 14:22 a.bz2
[root@a tmp]# bzip2 -9 a.bz2
bzip2: Input file a.bz2 already has .bz2 suffix.
[root@a tmp]# ll -as
total 12
4 drwxr-xr-x 2 root root 4096 Jan 11 14:22 .
4 drwxr-xr-x 17 root root 4096 Jan 10 16:51 ..
4 -rw-r--r-- 1 root root 1036 Jan 11 14:22 a.bz2
[root@a tmp]#

cd /data;du --max-depth=2 -h ./;python bzip2Action/biz2SaveCost.py /data/visitlog/ 2019-01

2019年1月14日

c

6.0G ./visitlog
20G ./unionlog

b

5.8G ./visitlog
18G ./unionlog

a

20G     ./unionlog

6.0G    ./visitlog

27M -rw-r--r-- 1 nginx nginx 27M Dec 30 23:59 visit-2018-12-30.bz2
36M -rw-r--r-- 1 nginx nginx 36M Dec 31 23:59 visit-2018-12-31.bz2
312M -rw-r--r-- 1 nginx nginx 312M Jan 6 23:59 visit-2019-01-06
312M -rw-r--r-- 1 nginx nginx 312M Jan 7 23:59 visit-2019-01-07

44M -rw-r--r-- 1 nginx nginx 44M Dec 30 23:59 visit-2018-12-30.bz2
53M -rw-r--r-- 1 nginx nginx 53M Dec 31 23:59 visit-2018-12-31.bz2

882M -rw-r--r-- 1 nginx nginx 882M Jan 11 23:59 visit-2019-01-11
745M -rw-r--r-- 1 nginx nginx 745M Jan 12 23:59 visit-2019-01-12
707M -rw-r--r-- 1 nginx nginx 707M Jan 13 23:59 visit-2019-01-13
232M -rw-r--r-- 1 nginx nginx 232M Jan 14 09:21 visit-2019-01-14

压缩率计算

因为每日都有新文件写入,处理前的数据没有记录,如果不解压还原数据的话,无法计算准确的压缩率

压缩速度计算

认为cpu、内存资源充足

控制台输出的日志

cd /data/unionlog/;bzip2 -9 /data/unionlog/visit-2018-11-09.bz2
bzip2: Input file /data/unionlog/visit-2018-11-09.bz2 already has .bz2 suffix.
('averageSpeed(MB/s):', 4.211187493937172)
('singleSeconds(MB/s):', [3.9369898031816426, 3.953846125040358, 3.9544741312123928, 3.9555894807291088, 3.96099337092276, 3.983298697446923, 3.9966511209824667, 4.007815560864753, 4.013902687515588, 4.015872734532144, 4.015899236549791, 4.015963246206192, 4.01612198327753, 4.023731445780551, 4.025416758738823, 4.025951959772834, 4.030831979910141, 4.039693901910457, 4.0399486196050765, 4.040242824350764, 4.040648424669689, 4.041098180762507, 4.043051325648554, 4.051655360512291, 4.056658593948987, 4.059627164614112, 4.070020953590698, 4.073870225127285, 4.07503751826594, 4.075686989285653, 4.080265084217549, 4.082345972466677, 4.090936968718271, 4.0944335040477275, 4.099429160013611, 4.102229025161095, 4.104974537958556, 4.110126096413723, 4.118484472726296, 4.119251467116442, 4.121534548809426, 4.125553711713982, 4.12775144900931, 4.129621429296399, 4.129656881725015, 4.1315901550586105, 4.131830165781944, 4.143680130292085, 4.145293603443776, 4.146942161873823, 4.147313376948774, 4.148370367740056, 4.151411958798099, 4.153755223178981, 4.161263788273014, 4.164412810381955, 4.166850751469844, 4.167063598601332, 4.169355624609407, 4.176170448673875, 4.1940635910827355, 4.195109540816128, 4.2000814466148055, 4.200333163905996, 4.2022824476243406, 4.202418248410636, 4.20572094512217, 4.212585249380411, 4.218441487185745, 4.427734600215904, 4.837932137856126, 5.076886456535105, 6.319574088013213, 6.375565540330376, 7.40075797478448, 8.700273234442928, 9.955987719965876, 10.49359459267714, 10.496288104978296, 10.910080297989559, 13.4994372035219, 14.011910382913635, 14.077535801136763, 14.982532672419739, 15.577184977610813, 16.567526614277405, 16.72303989453991, 17.031745290872077, 19.135089160791104, 19.481129833087913, 20.071232748258293, 20.639159774908073, 28.797696550990196, 29.481318443179987, 32.960199161359675, 35.348744965782345, 35.914231265432065, 44.91248986354474, 56.068395022554554, 63.90561770409619, 67.3045079377858, 80.84762211431958, 
88.98706103023787, 175.14815255493826, 557.3579021970233])
[root@a data]#

  

cd /data/unionlog/;bzip2 -9 /data/unionlog/visit-2018-11-03.bz2
bzip2: Input file /data/unionlog/visit-2018-11-03.bz2 already has .bz2 suffix.
('averageSpeed(MB/s):', 4.037645142862762)
('singleSeconds(MB/s):', [3.4115271073680473, 3.5042062998346606, 3.509713341704194, 3.525571499281982, 3.5898929553667154, 3.6505914130679624, 3.7138527066218354, 3.7231339152271996, 3.7267693810378284, 3.7292860405119153, 3.7299857191562316, 3.7326899795857953, 3.756952872366287, 3.757740179198384, 3.758101035864619, 3.762634699575258, 3.771730878546173, 3.7786267621034892, 3.796819445397061, 3.8048003527368794, 3.8085626615863237, 3.8112231318976035, 3.8156227214117053, 3.841775745310672, 3.848201931373685, 3.851350834838122, 3.8566428423319925, 3.857162507505528, 3.863292589421678, 3.863331261341491, 3.8643059756625355, 3.8917795293132476, 3.8927296353495002, 3.893436977500035, 3.8935765838449194, 3.8965081510857744, 3.90837814215203, 3.9189434690852534, 3.931661792967054, 3.9543185154898364, 3.962796230312998, 3.9670385630201, 4.002202845736961, 4.140499586487628, 4.2275292796865545, 4.606845720648893, 4.712329383339015, 4.723474167059724, 4.763673069508994, 4.8135033859300425, 4.842742592123715, 4.950956959538387, 4.964229453203472, 5.00257129469767, 7.159600281109767, 7.301770358234334, 7.7868991551617475, 8.339245078376065, 9.1340349502132, 9.325438851566286, 9.746514302246108, 10.417916214518563, 10.505980787495512, 10.672366971761184, 10.932940199850863, 10.976049990046109, 11.65977574461905, 11.955475170447237, 12.03807663466081, 12.591329733176506, 13.033587237840626, 13.06160835399656, 13.063173024665307, 13.130837040649459, 13.3820749715684, 13.770620023288442, 14.631663068222947, 15.281334268265432, 20.345541381319876, 23.357577176500726, 23.618352333724083, 26.66225279976816, 28.154498014416912, 28.77254085414158, 33.61942092676073, 38.447972182087994, 296.4583324183667, 1415.6438387404276])
[root@b data]#

  

cd /data/unionlog/;bzip2 -9 /data/unionlog/visit-2018-11-30.bz2
bzip2: Input file /data/unionlog/visit-2018-11-30.bz2 already has .bz2 suffix.
('averageSpeed(MB/s):', 4.100344833546164)
('singleSeconds(MB/s):', [3.7181192973321773, 3.7257172341979174, 3.745482158633604, 3.7692114529613185, 3.7872264010472043, 3.7884661064039555, 3.8186428004503985, 3.82068338626644, 3.8231526245648015, 3.83125733853526, 3.8345807959000737, 3.8487431513676458, 3.8533984710523392, 3.8766502888766508, 3.8792671200198057, 3.8794958504143318, 3.8864832683027672, 3.8874083389735117, 3.890098049760509, 3.9004211061797065, 3.9012259669791716, 3.904081019180935, 3.909088169138795, 3.9100598586939057, 3.9182811647981137, 3.9223673146999176, 3.936911082605703, 3.938102928814517, 3.9431581709314845, 3.9469155257226864, 3.9477303083616584, 3.9510218414752734, 3.9544768734685007, 3.9561312758351868, 3.9603868364070123, 3.960529493355076, 3.973218434311659, 3.973952987812832, 3.9750079546493047, 3.9769556093199063, 3.990533382215301, 3.9908648419479373, 3.99253131026352, 3.9993173904820893, 4.000028933408353, 4.009540707394956, 4.0167451008623525, 4.01694265894807, 4.021467667067072, 4.025888190955974, 4.029591312996541, 4.034701091498445, 4.051077667021889, 4.051677223836611, 4.05476273834563, 4.063306221670503, 4.067358092550384, 4.068730730698932, 4.102108601845601, 4.106712519686551, 4.112994160199945, 4.123322845773183, 4.124306143488609, 4.14616216102037, 4.190121259265525, 4.2012301048613345, 4.231113928027722, 4.635490375664297, 4.672385583985039, 4.693718071514089, 4.723321575516211, 4.723548196405968, 4.786506177340032, 4.850884632133513, 4.859158112858001, 6.1953653787024, 9.018906346164437, 9.323122551505794, 10.900660226263645, 11.832227279482243, 12.166883663453696, 12.860438635558914, 13.911042528296319, 13.942537479159501, 14.251319697516088, 15.801254911294155, 18.116126384680413, 19.22717367957711, 20.43278105002856, 20.50204524305958, 21.19584266823886, 27.97025937406051, 28.08823466724362, 35.51835114999952, 36.066163946710745, 37.79058467225096, 46.753210686310574, 47.149060556499826, 55.56960474674869, 67.19344402304698, 67.72819158484143, 
88.15080202857988, 90.13568800307546, 161.22373765616393, 184.2411337594747])
[root@c data]#

  

由于统计时没有标识或过滤 .bz2 文件(对已压缩文件的调用几乎瞬间返回,拉高了部分单文件速度),这里认为压缩速度约为 4MB/s;

# Estimate the unionlog compression ratio on the three nodes.
# Assumption: between the measurement on the morning of the 11th and the one
# on the morning of the 14th, the directory grew by 4 calendar days of new,
# uncompressed logs.  That growth is estimated (in GB) from the average size
# of three daily files: (882 + 745 + 707) / 3 * 4 / 1024.
# res: directory size in GB after compression (including the new files);
# ori: directory size in GB before compression.
res, ori = [20, 18, 20], [141, 104, 141]
# The float literal keeps this a true division on Python 2 as well.
cut = (882 + 745 + 707) / 3.0 * 4 / 1024
# Remove the estimated growth from the "after" size before taking the ratio.
compression_ratio = [(after - cut) / before for after, before in zip(res, ori)]
print(compression_ratio)
print('压缩率均值', sum(compression_ratio) / len(compression_ratio))

[0.12029033687943262, 0.14385516826923078, 0.12029033687943262]
压缩率均值 0.128145280676032

# -*- coding: utf-8 -*-
import sys, glob, os, time targetDir, passFeature = sys.argv[1], sys.argv[2]
file_feature = '*-*-*'
targetGlob = targetDir + file_feature
LocalFiles = glob.glob(targetGlob)
allMBCompressed, allSeconds, singleSeconds = 0, 0, []
allCompressionRatio, singleCompressionRatio = 0, []
for i in LocalFiles:
if not i.endswith('.bz2'):
continue
if passFeature in i:
continue # 进入原文件目录,解压后覆盖原文件
cmd = 'cd {};bzip2 -d {}'.format(targetDir, i)
print(cmd)
# 研究解压速度
fileMBCompressed = os.stat(i).st_size / 1024 / 1024
t_start = time.time()
os.system(cmd)
t_end = time.time()
try:
fileMBDecompressed = os.stat(i.strip('.bz2')).st_size / 1024 / 1024
fileSeconds = t_end - t_start
allMBCompressed += fileMBCompressed
allSeconds += fileSeconds
singleSeconds.append(fileMBCompressed / fileSeconds)
singleCompressionRatio.append(fileMBCompressed / allMBCompressed) # 按照速度大小由大到小排序
singleSeconds = list(sorted(singleSeconds, reverse=True))
singleSeconds = sorted(singleSeconds) print('averageSpeed(MB/s):', allMBCompressed / allSeconds)
print('singleSeconds(MB/s):', singleSeconds) print('singleCompressionRatio:', singleCompressionRatio)
print('arithmeticAverageSingleCompressionRatio:', sum(singleCompressionRatio) / len(singleCompressionRatio)) # 计算压缩率不考虑调和平均数,只考虑算术平均数
except Exception as e:
print(e)
cd /data/unionlog/;bzip2 -9 /data/unionlog/visit-2018-11-03.bz2
bzip2: Input file /data/unionlog/visit-2018-11-03.bz2 already has .bz2 suffix.
('averageSpeed(MB/s):', 4.037645142862762)
('singleSeconds(MB/s):', [3.4115271073680473, 3.5042062998346606, 3.509713341704194, 3.525571499281982, 3.5898929553667154, 3.6505914130679624, 3.7138527066218354, 3.7231339152271996, 3.7267693810378284, 3.7292860405119153, 3.7299857191562316, 3.7326899795857953, 3.756952872366287, 3.757740179198384, 3.758101035864619, 3.762634699575258, 3.771730878546173, 3.7786267621034892, 3.796819445397061, 3.8048003527368794, 3.8085626615863237, 3.8112231318976035, 3.8156227214117053, 3.841775745310672, 3.848201931373685, 3.851350834838122, 3.8566428423319925, 3.857162507505528, 3.863292589421678, 3.863331261341491, 3.8643059756625355, 3.8917795293132476, 3.8927296353495002, 3.893436977500035, 3.8935765838449194, 3.8965081510857744, 3.90837814215203, 3.9189434690852534, 3.931661792967054, 3.9543185154898364, 3.962796230312998, 3.9670385630201, 4.002202845736961, 4.140499586487628, 4.2275292796865545, 4.606845720648893, 4.712329383339015, 4.723474167059724, 4.763673069508994, 4.8135033859300425, 4.842742592123715, 4.950956959538387, 4.964229453203472, 5.00257129469767, 7.159600281109767, 7.301770358234334, 7.7868991551617475, 8.339245078376065, 9.1340349502132, 9.325438851566286, 9.746514302246108, 10.417916214518563, 10.505980787495512, 10.672366971761184, 10.932940199850863, 10.976049990046109, 11.65977574461905, 11.955475170447237, 12.03807663466081, 12.591329733176506, 13.033587237840626, 13.06160835399656, 13.063173024665307, 13.130837040649459, 13.3820749715684, 13.770620023288442, 14.631663068222947, 15.281334268265432, 20.345541381319876, 23.357577176500726, 23.618352333724083, 26.66225279976816, 28.154498014416912, 28.77254085414158, 33.61942092676073, 38.447972182087994, 296.4583324183667, 1415.6438387404276])

  

bzip2, a block-sorting file compressor. Version 1.0.6, 6-Sept-2010.

usage: bzip2 [flags and input files in any order]

-h --help print this message
-d --decompress force decompression
-z --compress force compression
-k --keep keep (don't delete) input files
-f --force overwrite existing output files
-t --test test compressed file integrity
-c --stdout output to standard out
-q --quiet suppress noncritical error messages
-v --verbose be verbose (a 2nd -v gives more)
-L --license display software version & license
-V --version display software version & license
-s --small use less memory (at most 2500k)
-1 .. -9 set block size to 100k .. 900k
--fast alias for -1
--best alias for -9

If invoked as `bzip2', default action is to compress.
as `bunzip2', default action is to decompress.
as `bzcat', default action is to decompress to stdout.

If no file names are given, bzip2 compresses or decompresses
from standard input to standard output. You can combine
short flags, so `-v -4' means the same as -v4 or -4v, &c.



bzip2 zip 压缩后体积比 0.8:1的更多相关文章

  1. ES索引文件和数据文件大小对比——splunk索引文件大小远小于ES,数据文件的压缩比也较ES更低,有趣的现象:ES数据文件zip压缩后大小和splunk的数据文件相当!词典文件tim/tip+倒排doc/pos和cfs文件是索引的大头

    和splunk对比: ES中各个倒排索引文件的分布: 测试说明:ES2.41版本,数据使用500次批量插入,每批数据都不同,大小500条,每条数据50个字段,对应的字符串使用长度为1-10个单词随机生 ...

  2. Android总结之Gzip/Zip压缩

    前言: 做过Android网络开发的都知道,在网络传输中我们一般都会开启GZIP压缩,但是出于刨根问底的天性仅仅知道如何开启就不能满足俺的好奇心的,所以想着写个demo测试一下比较常用的两个数据压缩方 ...

  3. zip压缩详细分析

    该文章转自:http://www.cnblogs.com/esingchan/p/3958962.html (文章写得很详细,让我对zip压缩有了了解,感谢博主,贴在这是为了防止忘了有这么好的文章,侵 ...

  4. Linux系统Zip压缩和解压缩

    Linux系统可以使用Zip来压缩占用空间较大的文件以便进行文件传输,传输完成后再进行解压缩来获取原文件.Linux安装Zip的命令为 apt-get install zip 安装完成后,使用 zip ...

  5. Spark- Spark从SFTP中读取zip压缩文件数据做计算

    我们遇到个特别的需求,一个数据接入的流程跑的太慢,需要升级为用大数据方式去处理,提高效率. 数据: 数据csv文件用Zip 压缩后放置在SFTP中 数据来源: SFTP 数据操作: 文件和它的压缩包一 ...

  6. [拾 得] zip gzip bzip2 & tar 压缩/打包 四大金刚

    坚持知识分享,该文章由Alopex编著, 转载请注明源地址: http://www.cnblogs.com/alopex/    索引: 介绍压缩和打包 gzip bzip2 zip 的基本使用 gz ...

  7. java zip压缩优化版 解决压缩后文件一直被占用无法删除

    最近进行zip操作,从网上找到一个处理方法,但是经过试验存在一些bug,主要是文件流的申明存在问题,导致jvm一直占用文件而不释放,特意把自己修改的发出来,已备记录 import java.io.Bu ...

  8. JAVA 实现将多目录多层级文件打成ZIP包后保留层级目录下载 ZIP压缩 下载

    将文件夹保留目录打包为 ZIP 压缩包并下载 上周做了一个需求,要求将数据库保存的 html 界面取出后将服务器下的css和js文件一起打包压缩为ZIP文件,返回给前台:在数据库中保存的是html标签 ...

  9. zip压缩命令的使用

    file命令可以查看文件的类型 tar类型 .tar gzip类型   .gz  bzip2类型  .bz2 zip类型    .zip 如果一个压缩文件由tar命令解压的前提,2个条件 1.这个文件 ...

随机推荐

  1. [AX]AX2012 R2 HR Jobs, Positions, Department和Workers

    部门.作业(Job的官方翻译)和位置(Position的官方翻译)是AX人力资源管理的基本组织元素,Job和Position在AX有的地方又称作工作和职位,其实这个翻译更为恰当. Job定义的是一个工 ...

  2. HTML 注释

    "<!-- xxx -->" 用于在 HTML 中插入注释,注释是用来给开发人员看的,浏览器不会显示注释内容 <!DOCTYPE HTML> <htm ...

  3. Win8交互UX——触摸板交互

    针对触摸输入优化 Window 应用商店应用设计,并在默认情况下获得触摸板支持. 设计用户可以通过触摸板交互的 Windows 应用商店应用. 触摸板结合间接的多点触控输入和指针设备(如鼠标)的精确输 ...

  4. 百度前端学院js课堂作业合集+分析(更新中...)

    第一课:简陋的登录框 <!DOCTYPE html> <html lang="en"> <head> <meta charset=&quo ...

  5. VS 2008 头文件库文件设置

    在程序开发中,很多时候需要用到别人开发的工具包,如OpenCV和itk.一般而言,在vs2008中,很少使用源文件,大部分是使用对类进行声明的头文件和封装了类的链接库(静态lib或动态dll). 如果 ...

  6. 设置RabbitMQ远程ip登录

    由于账号guest具有所有的操作权限,并且又是默认账号,出于安全因素的考虑,guest用户只能通过localhost登陆使用,并建议修改guest用户的密码以及新建其他账号管理使用rabbitmq. ...

  7. openstack-networking-neutron(一)---端到端和点到点的理解

    本博客已经添加"打赏"功能,"打赏"位置位于右边栏红色框中,感谢您赞助的咖啡. ====本文目的===== 理解搞清楚两个概念: 1.端到端    2.点到点  ...

  8. 【转】Windows socket基础

    转自:http://blog.csdn.net/ithzhang/article/details/8448655 Windows socket 基础 Windows socket是一套在Windows ...

  9. 【BZOJ1417】Pku3156 Interconnect 记忆化搜索

    [BZOJ1417]Pku3156 Interconnect Description 给出无向图G(V, E). 每次操作任意加一条非自环的边(u, v), 每条边的选择是等概率的. 问使得G连通的期 ...

  10. python nose测试框架全面介绍三

    三.nose的测试工具集 nose.tools模块提供了一系列的小工具,包括测试执行时间.异常输出及unittest框架中所有的assert功能. 为了使写用例更加容易,nose.tools提供了部分 ...