1 初探


  举个具体的例子,分别向数据库db中插入两条数据,"a=1, b=1" 和 "a=1, b=2", 然后想查询a=1的数据可能会使用这样的语句db.query(a=1),结果就是返回前面插入的两条数据; 如果想查询a=1, b=2的数据,就使用这样的语句db.query(a=1, b=2),结果就返回前面的第二条数据。



import pydblite
# Use an in-memory database
pydb = pydblite.Base(':memory:')
# Create the three fields a, b and c
pydb.create('a', 'b', 'c')
# Build an index on field a and on field b
pydb.create_index('a', 'b')
# Insert one record
pydb.insert(a=-1, b=0, c=1)
# Query the records that match the given field values
results = pydb(a=-1, b=0)


import sqlite3
# Use an in-memory database
con = sqlite3.connect(':memory:')
# Create a table with the three fields a, b and c
cur = con.cursor()
cur.execute('create table test (a char(256), b char(256), c char(256));')
# Build one index on field a and one on field b
cur.execute('create index a_index on test(a)')
cur.execute('create index b_index on test(b)')
# Insert one record
cur.execute('insert into test values(?, ?, ?)', (-1,0,1))
# Query the records that match the given field values
cur.execute('select * from test where a=? and b=?',(-1, 0))

2 pydblite和sqlite的性能


import functools
import time

# Number of records inserted / queried by every benchmark below.
count = 100000


def timeit(func):
    """Decorate *func* so that every call prints how long it took.

    The wrapped function is invoked with the caller's arguments; afterwards
    the elapsed wall-clock seconds and the ``des`` keyword argument (a short
    label for the measurement, empty when omitted) are printed on one line.
    The result of *func* is returned unchanged.
    """
    @functools.wraps(func)
    def wrapper(*args, **kws):
        t = time.time()
        # Actually run the benchmark body; without this call only the
        # overhead of the timer itself would be measured.
        result = func(*args, **kws)
        # One pre-formatted argument prints the same text under the
        # Python 2 print statement and the Python 3 print function.
        print('%s %s' % (time.time() - t, kws.get('des', '')))
        return result
    return wrapper


# NOTE: the loops use xrange because this article targets Python 2.

@timeit
def test_insert(mdb, des=''):
    """Insert ``count`` records through the ``insert`` method of *mdb*."""
    for i in xrange(count):
        mdb.insert(a=i-1, b=i, c=i+1)


@timeit
def test_query_object(mdb, des=''):
    """Run ``count`` two-field lookups by calling *mdb* itself."""
    for i in xrange(count):
        mdb(a=i-1, b=i)


@timeit
def test_sqlite_insert(cur, des=''):
    """Insert ``count`` rows into table ``test`` through cursor *cur*."""
    for i in xrange(count):
        cur.execute('insert into test values(?, ?, ?)', (i-1, i, i+1))


@timeit
def test_sqlite_query(cur, des=''):
    """Execute ``count`` two-column selects on table ``test``.

    Only ``execute`` is timed; the result rows are not fetched.
    """
    for i in xrange(count):
        cur.execute('select * from test where a=? and b=?', (i-1, i))


print('-------pydblite--------')
import pydblite
pydb = pydblite.Base(':memory:')
pydb.create('a', 'b', 'c')
pydb.create_index('a', 'b')
test_insert(pydb, des='insert')
test_query_object(pydb, des='query, object call')

print('-------sqlite3--------')
import sqlite3
con = sqlite3.connect(':memory:')
cur = con.cursor()
cur.execute('create table test (a char(256), b char(256), c char(256));')
cur.execute('create index a_index on test(a)')
cur.execute('create index b_index on test(b)')
test_sqlite_insert(cur, des='insert')
test_sqlite_query(cur, des='query')


1.14199995995 insert
0.308000087738 query, object call
0.411999940872 insert
0.30999994278 query


0.0989999771118 insert
5.15300011635 query, object call
0.0169999599457 insert
7.43400001526 query



3 优化



class _BasePy2(_Base):
    """Variant of ``_Base`` selected when running on Python 2."""

    def __iter__(self):
        """Iteration on the records"""
        return iter(self.records.itervalues())


class _BasePy3(_Base):
    """Variant of ``_Base`` selected when running on any other version."""

    def __iter__(self):
        """Iteration on the records"""
        return iter(self.records.values())


# Export the implementation that matches the running interpreter.  The
# ``else`` matters: without it ``Base`` would always be rebound to the
# Python 3 variant, whose ``__iter__`` is the only visible difference.
if sys.version_info[0] == 2:
    Base = _BasePy2
else:
    Base = _BasePy3


class _Base(object):

    def __init__(self, path, protocol=pickle.HIGHEST_PROTOCOL, save_to_file=True,
"""protocol as defined in pickle / pickle.
Defaults to the highest protocol available.
For maximum compatibility use protocol = 0 """
self.path = path
"""The path of the database in the file system"""
self.name = os.path.splitext(os.path.basename(path))[0]
"""The basename of the path, stripped of its extension"""
self.protocol = protocol
self.mode = None
if path == ":memory:":
save_to_file = False
self.save_to_file = save_to_file
self.sqlite_compat = sqlite_compat
self.fields = []
"""The list of the fields (does not include the internal
fields __id__ and __version__)"""
# if base exists, get field names
if save_to_file and self.exists():
if protocol == 0:
_in = open(self.path) # don't specify binary mode !
_in = open(self.path, 'rb')
self.fields = pickle.load(_in)


    def create(self, *fields, **kw):
Create a new base with specified field names. Args:
- \*fields (str): The field names to create.
- mode (str): the mode used when creating the database. - if mode = 'create' : create a new base (the default value)
- if mode = 'open' : open the existing base, ignore the fields
- if mode = 'override' : erase the existing base and create a
new one with the specified fields Returns:
- the database (self).
self.mode = kw.get("mode", 'create')
if self.save_to_file and os.path.exists(self.path):
if not os.path.isfile(self.path):
raise IOError("%s exists and is not a file" % self.path)
elif self.mode is 'create':
raise IOError("Base %s already exists" % self.path)
elif self.mode == "open":
return self.open()
elif self.mode == "override":
raise ValueError("Invalid value given for 'open': '%s'" % open) self.fields = []
self.default_values = {}
for field in fields:
if type(field) is dict:
self.default_values[field["name"]] = field.get("default", None)
elif type(field) is tuple:
self.default_values[field[0]] = field[1]
self.default_values[field] = None self.records = {}
self.next_id = 0
self.indices = {}
return self def create_index(self, *fields):
Create an index on the specified field names An index on a field is a mapping between the values taken by the field
and the sorted list of the ids of the records whose field is equal to
this value For each indexed field, an attribute of self is created, an instance
of the class Index (see above). Its name it the field name, with the
prefix _ to avoid name conflicts Args:
- fields (list): the fields to index
reset = False
for f in fields:
if f not in self.fields:
raise NameError("%s is not a field name %s" % (f, self.fields))
# initialize the indices
if self.mode == "open" and f in self.indices:
reset = True
self.indices[f] = {}
for _id, record in self.records.items():
# use bisect to quickly insert the id in the list
bisect.insort(self.indices[f].setdefault(record[f], []), _id)
# create a new attribute of self, used to find the records
# by this index
setattr(self, '_' + f, Index(self, f))
if reset:



# records
{0: {'__id__': 0, '__version__': 0, 'a': -1, 'b': 0, 'c': 1},
1: {'__id__': 1, '__version__': 0, 'a': 0, 'b': 1, 'c': 2}} # indices
{'a': {-1: [0], 0: [1]}, 'b': {0: [0], 1: [1]}}

  比方说现在我们想查找a=0的数据,那么就会在indices中找key为'a'的value,即{-1: [0], 0: [1]},然后在这里面找key为0的value,即[1],由此我们知道了我们想要的这条数据它的id是1(也可能会有多个);假设我们对数据还有其他要求比如a=0,b=1,那么它会继续上述的查找过程,找到a=0和b=1分别对应的ids,做交集,就得到了满足这两个条件的ids,然后再到records里根据ids找到所有对应的数据。



  python语句,不难看出,整个_Base为了同时兼容python2和python3,不得不使用了2和3都支持的语句,这就导致在部分语句上针对特定版本的python就会造成浪费或者说是性能开销。比如说,d是个字典,那么为了同时兼容python2和3,作者使用了类似于for key in d.keys()这样的语句,在python2中,d.keys()会首先产生一个list,用d.iterkeys()是个更明智的方案。再如,作者会使用类似set(d.keys()) - set([1])这样的语句,但是python2中,使用d.viewkeys() - set([1])效率将会更高,因为它不需要将list转化成set。



4 memlite、pydblite和sqlite的性能


# The decorator is required: the "query, method call" line in the results
# is printed by timeit from the ``des`` keyword, not by the function body.
@timeit
def test_query_method(mdb, des=''):
    """Run ``count`` two-field lookups through the ``query`` method of *mdb*."""
    for i in xrange(count):
        mdb.query(a=i-1, b=i)


print('-------memlite-------')
import memlite
db = memlite.Base()
db.create('a', 'b', 'c')
db.create_index('a', 'b')
test_insert(db, des='insert')
test_query_method(db, des='query, method call')


0.378000020981 insert
0.285000085831 query, method call
1.3140001297 insert
0.309000015259 query, object call
0.414000034332 insert
0.3109998703 query


0.0179998874664 insert
5.90199995041 query, method call
0.0980000495911 insert
4.87400007248 query, object call
0.0170001983643 insert
7.42399978638 query


