# LZ77.py

import math

from bitarray import bitarray

class LZ77Compressor:
    """A simplified implementation of the LZ77 compression algorithm.

    The compressed stream is a sequence of bit-packed tokens:

    * literal: a 0 flag bit followed by the 8 bits of one input byte,
      used when no useful match exists inside the history window;
    * match: a 1 flag bit followed by a 12-bit distance (how far back
      from the current position the match starts) and a 4-bit length.

    The stream is padded with zero bits up to a whole number of bytes.
    """

    MAX_WINDOW_SIZE = 400

    def __init__(self, window_size=20):
        """Create a compressor.

        window_size -- number of already-seen bytes searched for matches;
            values above MAX_WINDOW_SIZE are clamped to MAX_WINDOW_SIZE.
        """
        self.window_size = min(window_size, self.MAX_WINDOW_SIZE)
        self.lookahead_buffer_size = 15  # length of match is at most 4 bits

    def compress(self, input_file_path, output_file_path=None, verbose=False):
        """Compress the file at input_file_path.

        If output_file_path is given, the compressed bytes are written to
        that file and None is returned. Otherwise the compressed data is
        returned as a bitarray.

        If verbose is true, a description of every emitted token is written
        to standard output.

        Raises IOError (after printing a message) if the input file cannot
        be read or the output file cannot be written.
        """
        import sys  # local import: only used for the verbose token dump

        # read the input file
        try:
            with open(input_file_path, 'rb') as input_file:
                data = input_file.read()
        except IOError:
            print('Could not open input file ...')
            raise

        output_buffer = bitarray(endian='big')
        i = 0

        while i < len(data):
            match = self.findLongestMatch(data, i)

            if match:
                # 1 bit flag, followed by 12 bits for the distance and
                # 4 bits for the length of the match.
                (bestMatchDistance, bestMatchLength) = match
                output_buffer.append(True)
                # bytes(bytearray(...)) builds a real byte string on both
                # Python 2 and Python 3 (chr() returns text on Python 3).
                output_buffer.frombytes(bytes(bytearray([
                    bestMatchDistance >> 4,
                    ((bestMatchDistance & 0xf) << 4) | bestMatchLength,
                ])))
                if verbose:
                    sys.stdout.write(
                        "<1, %i, %i> " % (bestMatchDistance, bestMatchLength))
                i += bestMatchLength
            else:
                # No useful match was found: 0 bit flag followed by the
                # 8 bits of the literal byte. Slicing (not indexing) keeps
                # a one-byte string on Python 3 as well.
                literal = data[i:i + 1]
                output_buffer.append(False)
                output_buffer.frombytes(literal)
                if verbose:
                    sys.stdout.write("<0, %s> " % literal)
                i += 1

        # fill the buffer with zeros if the number of bits is not a
        # multiple of 8
        output_buffer.fill()

        # an output file path was not provided, return the compressed data
        if not output_file_path:
            return output_buffer

        # write the compressed data into a binary file
        try:
            with open(output_file_path, 'wb') as output_file:
                output_file.write(output_buffer.tobytes())
        except IOError:
            print('Could not write to output file path. Please check if the path is correct ...')
            raise

        print("File was compressed successfully and saved to output path ...")
        return None

    def decompress(self, input_file_path, output_file_path=None):
        """Decompress the file at input_file_path.

        If output_file_path is given, the original data is written to that
        file and None is returned. Otherwise the decompressed data is
        returned as a byte string.

        Raises IOError (after printing a message) if the input file cannot
        be read or the output file cannot be written, and ValueError if the
        compressed stream is truncated or contains an impossible
        back-reference.
        """
        data = bitarray(endian='big')

        # read the input file
        try:
            with open(input_file_path, 'rb') as input_file:
                data.fromfile(input_file)
        except IOError:
            print('Could not open input file ...')
            raise

        decoded = bytearray()
        total_bits = len(data)
        # Walk the stream with a cursor instead of deleting consumed bits
        # from the front, which would cost O(n) per token.
        pos = 0

        # The shortest token is 9 bits; anything shorter left over at the
        # end is the zero padding added by compress().
        while total_bits - pos >= 9:
            flag = data[pos]
            pos += 1

            if not flag:
                # literal byte
                decoded += data[pos:pos + 8].tobytes()
                pos += 8
                continue

            if total_bits - pos < 16:
                raise ValueError(
                    'Compressed data is truncated inside a match token')

            byte1 = ord(data[pos:pos + 8].tobytes())
            byte2 = ord(data[pos + 8:pos + 16].tobytes())
            pos += 16

            distance = (byte1 << 4) | (byte2 >> 4)
            length = (byte2 & 0xf)

            if length and not 0 < distance <= len(decoded):
                raise ValueError(
                    'Compressed data contains an invalid match distance')

            # Copy one byte at a time: the match may overlap the bytes
            # being produced (distance < length repeats a pattern).
            for _ in range(length):
                decoded.append(decoded[-distance])

        out_data = bytes(decoded)

        if not output_file_path:
            return out_data

        try:
            with open(output_file_path, 'wb') as output_file:
                output_file.write(out_data)
        except IOError:
            print('Could not write to output file path. Please check if the path is correct ...')
            raise

        print('File was decompressed successfully and saved to output path ...')
        return None

    def findLongestMatch(self, data, current_position):
        """Find the longest match for the data starting at current_position.

        The history window (the window_size bytes before current_position)
        is searched for the longest run that equals the upcoming bytes. A
        match may run past current_position, i.e. overlap the bytes it is
        describing.

        Returns a (distance, length) tuple, or None when no match of at
        least 2 bytes exists. Among equally long matches the one that
        starts earliest in the window (largest distance) is returned.
        """
        # Candidate lengths stop one short of the look-ahead size and never
        # run past the end of the data.
        max_length = min(self.lookahead_buffer_size - 1,
                         len(data) - current_position)

        # Optimization: only consider matches of length 2 and greater; a
        # single literal (9 bits) is cheaper than a match token (17 bits).
        if max_length < 2:
            return None

        start_index = max(0, current_position - self.window_size)
        best_match_distance = 0
        best_match_length = 1

        for i in range(start_index, current_position):
            # Extend the match byte by byte. Comparing data[i + k] with
            # data[current_position + k] also covers overlapping matches:
            # once i + k passes current_position it reads bytes that were
            # already verified to repeat the pattern.
            length = 0
            while (length < max_length and
                   data[i + length] == data[current_position + length]):
                length += 1

            if length > best_match_length:
                best_match_distance = current_position - i
                best_match_length = length
                if length == max_length:
                    # Cannot be beaten; later starts would only tie.
                    break

        if best_match_distance > 0:
            return (best_match_distance, best_match_length)
        return None

LZ77.py的更多相关文章

python调用py中rar的路径问题。
1.python调用py,在py中的os.getcwd()获取的不是py的路径,可以通过os.path.split(os.path.realpath(__file__))[0]来获取py的路径. 2. ...
Python导入其他文件中的.py文件即模块
import sys sys.path.append("路径") import .py文件
LZ77压缩算法编码原理详解(结合图片和简单代码)
前言 LZ77算法是无损压缩算法,由以色列人Abraham Lempel发表于1977年.LZ77是典型的基于字典的压缩算法,现在很多压缩技术都是基于LZ77.鉴于其在数据压缩领域的地位,本文将结合图 ...
import renumber.py in pymol
cp renumber.py /usr/local/lib/python2.7/dist-packages/pymol import renumber or run /path/to/renumber ...
python gettitle.py
#!/usr/bin/env python # coding=utf-8 import threading import requests import Queue import sys import ...
解决 odoo.py: error: option --addons-path: The addons-path 'local-addons/' does not seem to a be a valid Addons Directory!
情况说明 odoo源文件路径-/odoo-dev/odoo/: 我的模块插件路径 ~/odoo-dev/local-addons/my-module 在my-module中创建了__init__.py ...
caffe机器学习自带图片分类器classify.py实现输出预测结果的概率及caffe的web_demo例子运行实例
caffe机器学习环境搭建及python接口编译参见我的上一篇博客:机器学习caffe环境搭建--redhat7.1和caffe的python接口编译 1.运行caffe图片分类器python接口还 ...
【转】Windows下使用libsvm中的grid.py和easy.py进行参数调优
libsvm中有进行参数调优的工具grid.py和easy.py可以使用,这些工具可以帮助我们选择更好的参数,减少自己参数选优带来的烦扰. 所需工具:libsvm.gnuplot 本机环境:Windo ...
MySqlNDB使用自带的ndb_setup.py安装集群
在用Mysql做集群时,使用Mysql的NDB版本更易于集群的扩展,稳定和数据的实时性. 我们可以使用Mysql自带的工具进行集群安装与管理:ndb_setup.py.位于Mysql的安装目录bin下 ...

随机推荐

HTTP的长连接和短连接
本文总结&分享网络编程中涉及的长连接.短连接概念. 关键字:Keep-Alive,并发连接数限制,TCP,HTTP 一.什么是长连接 HTTP1.1规定了默认保持长连接(HTT ...
IDDD 实现领域驱动设计－上下文映射图及其相关概念
上一篇:<IDDD 实现领域驱动设计-理解限界上下文> 距离上一篇有几天时间了,<实现领域驱动设计>第三章的内容都是围绕一个词-上下文映射图,我大概断断续续看了几天,总共看了两 ...
YYModel 源码解读（二）之NSObject+YYModel.h (3)
本篇主要介绍的是在真正转之前的几个辅助函数 /** Get number from property. @discussion Caller should hold strong reference ...
Linux下的磁盘分割和文件系统
一.各硬件装置在Linux下的文件名 1.IDE硬盘机在Linux内的文件名: /dev/hd[a-d] (a-d 刚好是四个这个是有原因的具体如下) 解释:以 IDE 接口来说,由于一个 IDE ...
再谈React.js实现原生js拖拽效果
前几天写的那个拖拽,自己留下的疑问...这次在热心博友的提示下又修正了一些小小的bug,也加了拖拽的边缘检测部分...就再聊聊拖拽吧一.不要直接操作dom元素 react中使用了虚拟dom的概念,目 ...
[协议]ICMP协议剖析
1.ICMP简介 ICMP全名为(INTERNET CONTROL MESSAGE PROTOCOL)网络控制消息协议. ICMP的协议号为1. ICMP报文就像是IP报文的小弟,总顶着IP报文的名头 ...
介绍，介绍我的底层支持库 Net.Sz.CFramework
Net.Sz.CFramework 是我自己的底层库,是经过验证的底层库. 包含: socket tcp协议,socket http协议线程池,线程模型,任务模型,定时器模型,日志模块脚本模块一些辅 ...
ADO.NET存取数据库数据
步骤: //数据库连接串 string conStr = "......" //创建连接对象 SqlConnection connection = new SqlConnectio ...
cookie保存中文登录账号获取时乱码问题
登录成功后写入cookie的代码 Response.Cookies["account"].Value = account;//"管理员" Response.Co ...
解决VS调试时断点不会命中
断点调试是VS中的一大利器,有了它我们可以快速定位到代码的问题所在.在某些情况下会导致设置了断点后程序无法在断点处停下,下面分4种情况来解决断点不会命中的问题百度经验:jingyan.baidu.c ...

LZ77.py

LZ77.py的更多相关文章

随机推荐

热门专题