本文内容

最近看《写给程序员的数据挖掘指南》,研究推荐算法,书中的测试数据集是 Book-Crossing Dataset 提供的亚马逊用户对书籍评分的真实数据。推荐大家看本书,写得不错,立刻就能对推荐算法上手,甚至应用到你的项目中。

Book-Crossing Dataset 提供两种格式的数据集:CSV 格式和 SQL dump,问题是:

如果你用 UE(UltraEdit)打开 csv 文件,会看到乱码,无论如何转换编码都不行~因为,这个文件是亚马逊通过程序持久化后,再导出来的。你还会发现,文件中有 html 标记,另外,用户名、书名等信息,基本都是德文的(看域名就知道了)~

虽然,作者提供了加载测试数据集的 python 代码,不过不能导入到 MySQL 数据库中,其中,作者只是简单地按分号来分割字段内容(虽然推荐算法并不需要全部字段),可数据集中包含类似“ऩ”或“\“”这样的字符,不可能导入到 MySQL 数据库中~

你也许会问,作者都不导入到数据库,你为什么要导?因为,作者提供的推荐算法属于内存模型,也就是一次性把数据加载到内存,但之前,总还是要持久化吧~

因此,只能改造一下作者的 Python 代码~

Github Demo

改造后测试数据集

Python

# -*- coding: utf-8 -*-

import codecs
import os
import re
import string
import sys
from collections import OrderedDict

try:
    import ConfigParser  # Python 2 module name
except ImportError:
    import configparser as ConfigParser  # Python 3 renamed the module

import mysql.connector

  1.  

  1. class MysqlPythonFacotry(object):

  1. """

  1. Python Class for connecting with MySQL server.

  1. """

  1.  

  1. __instance = None

  1. __host = None

  1. __user = None

  1. __password = None

  1. __database = None

  1. __session = None

  1. __connection = None

  1.  

  1. def __init__(self, host='localhost', user='root', password='', database=''):

  1. self.__host = host

  1. self.__user = user

  1. self.__password = password

  1. self.__database = database

  1. ## End def __init__

  1.  

  1. def open(self):

  1. try:

  1. cnx = mysql.connector.connect(host=self.__host,\

  1. user= self.__user,\

  1. password= self.__password,\

  1. database= self.__database)

  1. self.__connection = cnx

  1. self.__session = cnx.cursor()

  1. except mysql.connector.Error as e:

  1. print('connect fails!{}'.format(e))

  1. ## End def open

  1.  

  1. def close(self):

  1. self.__session.close()

  1. self.__connection.close()

  1. ## End def close

  1.  

  1. def select(self, table, where=None, *args, **kwargs):

  1. result = None

  1. query = 'SELECT '

  1. keys = args

  1. values = tuple(kwargs.values())

  1. l = len(keys) - 1

  1.  

  1. for i, key in enumerate(keys):

  1. query += "`" + key + "`"

  1. if i <; l:

  1. query += ","

  1. ## End for keys

  1.  

  1. query += 'FROM %s' % table

  1.  

  1. if where:

  1. query += " WHERE %s" % where

  1. ## End if where

  1.  

  1. self.__session.execute(query, values)

  1. number_rows = self.__session.rowcount

  1. number_columns = len(self.__session.description)

  1. result = self.__session.fetchall()

  1.  

  1. return result

  1. ## End def select

  1.  

  1. def update(self, table, where=None, *args, **kwargs):

  1. try:

  1. query = "UPDATE %s SET " % table

  1. keys = kwargs.keys()

  1. values = tuple(kwargs.values()) + tuple(args)

  1. l = len(keys) - 1

  1. for i, key in enumerate(keys):

  1. query += "`" + key + "` = %s"

  1. if i <; l:

  1. query += ","

  1. ## End if i less than 1

  1. ## End for keys

  1. query += " WHERE %s" % where

  1.  

  1. self.__session.execute(query, values)

  1. self.__connection.commit()

  1.  

  1. # Obtain rows affected

  1. update_rows = self.__session.rowcount

  1.  

  1. except mysql.connector.Error as e:

  1. print(e.value)

  1.  

  1. return update_rows

  1. ## End function update

  1.  

  1. def insert(self, table, *args, **kwargs):

  1. values = None

  1. query = "INSERT INTO %s " % table

  1. if kwargs:

  1. keys = kwargs.keys()

  1. values = tuple(kwargs.values())

  1. query += "(" + ",".join(["`%s`"] * len(keys)) % tuple(keys) + ") VALUES (" + ",".join(["%s"] * len(values)) + ")"

  1. elif args:

  1. values = args

  1. query += " VALUES(" + ",".join(["%s"] * len(values)) + ")"

  1.  

  1. self.__session.execute(query, values)

  1. self.__connection.commit()

  1. cnt = self.__session.rowcount

  1. return cnt

  1. ## End def insert

  1.  

  1. def delete(self, table, where=None, *args):

  1. query = "DELETE FROM %s" % table

  1. if where:

  1. query += ' WHERE %s' % where

  1.  

  1. values = tuple(args)

  1.  

  1. self.__session.execute(query, values)

  1. self.__connection.commit()

  1. delete_rows = self.__session.rowcount

  1. return delete_rows

  1. ## End def delete

  1.  

  1. def select_advanced(self, sql, *args):

  1. od = OrderedDict(args)

  1. query = sql

  1. values = tuple(od.values())

  1. self.__session.execute(query, values)

  1. number_rows = self.__session.rowcount

  1. number_columns = len(self.__session.description)

  1. result = self.__session.fetchall()

  1. return result

  1. ## End def select_advanced

  1. ## End class

  1.  

  1.  

  1. class ErrorMyProgram(Exception):

  1. """

  1. My Exception Error Class

  1. """

  1. def __init__(self, value):

  1. self.value = value

  1. ##End def __init__

  1. def __str__(self):

  1. return repr(self.value)

  1. ##End def __str__

  1. ## End class ErrorMyProgram

  1. class LoadAppConf(object):

  1. """

  1. Load app.conf Config File Class

  1. """

  1. __configFileName = "app.conf"

  1.  

  1. def __init__(self):

  1. config = ConfigParser.ConfigParser()

  1. config.read(self.__configFileName)

  1.  

  1. self.biz_db_host = config.get("biz_db","host")

  1. self.biz_db_user = config.get("biz_db","user")

  1. self.biz_db_password = config.get("biz_db","password")

  1. self.biz_db_database = config.get("biz_db","database")

  1. ## End def __init__

  1. ## End class LoadAppConf

  1. class Biz_Base(object):

  1. """

  1. biz base class

  1. """

  1. def __init__(self, db):

  1. self.db = db

  1. ## End def __init__

  1. ## End class Biz_Base

  1.  

  1. class Biz_bx_book_ratings(Biz_Base):

  1. """

  1. bx_book_ratings table

  1. """

  1.  

  1. __tableName = "bx_book_ratings"

  1.  

  1. def __init__(self, db):

  1. Biz_Base.__init__(self, db)

  1. ## End def __init__

  1. def insert(self, userid, isbn, bookrating):

  1. cnt = self.db.insert(self.__tableName,\

  1. userid = userid, \

  1. isbn = isbn,\

  1. bookrating = bookrating)

  1. return cnt >; 0

  1. ## End def insert

  1. ## End class Biz_bx_book_ratings

  1.  

  1.  

  1. class Biz_bx_books(Biz_Base):

  1. """

  1. bx_books table

  1. """

  1.  

  1. __tableName = "bx_books"

  1.  

  1. def __init__(self, db):

  1. Biz_Base.__init__(self, db)

  1. ## End def __init__

  1. def insert(self, isbn, booktitle, bookauthor, yearofpublication, publisher, imageurls, imageurlm, imageurll):

  1. cnt = self.db.insert(self.__tableName,\

  1. isbn = isbn, \

  1. booktitle = booktitle, \

  1. bookauthor = bookauthor,\

  1. yearofpublication = yearofpublication, \

  1. publisher = publisher, \

  1. imageurls = imageurls, \

  1. imageurlm = imageurlm, \

  1. imageurll = imageurll)

  1. return cnt >; 0

  1. ## End def insert

  1. ## End class Biz_bx_books

  1.  

  1. class Biz_bx_users(Biz_Base):

  1. """

  1. bx_users table

  1. """

  1.  

  1. __tableName = "bx_users"

  1.  

  1. def __init__(self, db):

  1. Biz_Base.__init__(self, db)

  1. ## End def __init__

  1. def insert(self, userid, location, age):

  1. cnt = self.db.insert(self.__tableName,\

  1. userid = userid, \

  1. location = location,\

  1. age = age)

  1. return cnt >; 0

  1. ## End def insert

  1. ## End class Biz_bx_users

  1.  

  1. def regx(l):

  1. """

  1. split line by regex

  1. """

  1. p = re.compile(r'"[^"]*"')

  1. return p.findall(l)

  1. ## End def regx

  1.  

  1. class LoadDataset(object):

  1. """

  1. bx_books table

  1. """

  1. __loadConf = None

  1. __users = None

  1. __books = None

  1. __book_ratings = None

  1. __bizDb = None

  1.  

  1. def __init__(self):

  1. self.__loadConf = LoadAppConf()

  1. self.__bizDb = MysqlPythonFacotry(self.__loadConf.biz_db_host,\

  1. self.__loadConf.biz_db_user, \

  1. self.__loadConf.biz_db_password,\

  1. self.__loadConf.biz_db_database)

  1.  

  1. self.__users = Biz_bx_users(self.__bizDb)

  1. self.__books = Biz_bx_books(self.__bizDb)

  1. self.__book_ratings = Biz_bx_book_ratings(self.__bizDb)

  1. self.__bizDb.open()

  1. ## End def __init__

  1. def toDB(self, path=''):

  1. """

  1. loads the BX book dataset. Path is where the BX files are

  1. located

  1. """

  1. self.data = {}

  1. i = 0

  1. j = 0

  1. try:

  1. #

  1. # First load book ratings into self.data

  1. #

  1. f = codecs.open(path + "BX-Book-Ratings.csv", 'r', 'utf8')

  1. for line in f:

  1. i += 1

  1. j += 1

  1. print(j)

  1. print(line)

  1. #separate line into fields

  1. fields = line.split(';')

  1. user = fields[0].strip('"')

  1. book = fields[1].strip('"')

  1. rating = int(fields[2].strip().strip('"'))

  1.  

  1. self.__book_ratings.insert(user, book, rating)

  1.  

  1. f.close()

  1. #

  1. # Now load books into self.productid2name

  1. # Books contains isbn, title, and author among other fields

  1. #

  1. j = 0

  1. f = codecs.open(path + "BX-Books.csv", 'r', 'utf8')

  1. for line in f:

  1. i += 1

  1. j += 1

  1.  

  1. print(j)

  1. print(line)

  1. #separate line into fields

  1. fields = regx(line)

  1. isbn = fields[0].strip('"')

  1. title = fields[1].strip('"')

  1. author = fields[2].strip().strip('"')

  1. yearOfPublication = fields[3].strip().strip('"')

  1. publisher = fields[4].strip().strip('"')

  1. imageUrlS = fields[5].strip().strip('"')

  1. imageUrlM = fields[6].strip().strip('"')

  1. imageUrlL = fields[7].strip().strip('"')

  1.  

  1. self.__books.insert(isbn, title, author, yearOfPublication, publisher, imageUrlS, imageUrlM, imageUrlL)

  1. f.close()

  1. #

  1. # Now load user info into both self.userid2name and

  1. # self.username2id

  1. #

  1. j = 0

  1. f = codecs.open(path + "BX-Users.csv", 'r', 'utf8')

  1. for line in f:

  1. i += 1

  1. j += 1

  1. print(j)

  1. print(line)

  1. #separate line into fields

  1. fields = regx(line)

  1. userid = fields[0].strip('"')

  1. location = fields[1].strip('"')

  1. if len(fields) >; 2:

  1. age = fields[2].strip().strip('"')

  1. else:

  1. age = None

  1. if age != None:

  1. value = location + ' (age: ' + age + ')'

  1. else:

  1. value = location

  1.  

  1. if age == None:

  1. age =0

  1. self.__users.insert(userid, location, age)

  1. f.close()

  1. except ErrorMyProgram as e:

  1. print(e.value)

  1. finally:

  1. self.__bizDb.close()

  1.  

  1. print(i)

  1. ## End def toDB

  1. ## End class LoadData

Github Demo

测试数据集

将 Book-Crossing Dataset 书籍推荐算法中 CSV 格式测试数据集导入到MySQL数据库的更多相关文章

  1. 用JDBC把Excel中的数据导入到Mysql数据库中

    步骤:0.在Mysql数据库中先建好table 1.从Excel表格读数据 2.用JDBC连接Mysql数据库 3.把读出的数据导入到Mysql数据库的相应表中 其中,步骤0的table我是先在Mys ...

  2. SQL自连接(源于推荐算法中的反查表问题)

    ”基于用户的协同过滤算法“是推荐算法的一种,这类算法强调的是:把和你有相似爱好的其他的用户的物品推荐给你. 要实现该推荐算法,就需要计算和你有交集的用户,这就要用到物品到用户的反查表. 先举个例子说明 ...

  3. Attention机制在深度学习推荐算法中的应用(转载)

    AFM:Attentional Factorization Machines: Learning the Weight of Feature Interactions via Attention Ne ...

  4. Access数据库导入到mysql数据库中

    做项目时需要查询手机号归属地的,用网上提供的接口,耗时太长,反应慢,只能自己在网上搜了一个包含所有手机号归属地的Access数据库,导入到自己的mysql数据库中 Access数据库导入到mysql中 ...

  5. 虚拟机中ubuntu-16.04 Linux系统下配置mysql数据库,并在windows下使用navicat远程连接

    Linux系统下mysql数据库安装配置步骤: 1.在服务器上安装mysql:sudo apt-get install mysql-server sudo apt-get install mysql- ...

  6. 如何用java POI将word中的内容导入到mysql数据库中

    由于作业需要,要求我们将word文档中的数据直接导入到mysql中,在网上找了很常时间,终于将其解决. 由于比较初级,所以处理的word文档是那种比较规范的那种,条例比较清晰,设计的思路也比较简单,就 ...

  7. MySQL中 如何查询表名中包含某字段的表 ,查询MySql数据库架构信息:数据库,表,表字段

    --查询tablename 数据库中 以"_copy" 结尾的表 select table_name from information_schema.tables where ta ...

  8. 将Hive统计分析结果导入到MySQL数据库表中(一)——Sqoop导入方式

    https://blog.csdn.net/niityzu/article/details/45190787 交通流的数据分析,需求是对于海量的城市交通数据,需要使用MapReduce清洗后导入到HB ...

  9. php中ip转int 并存储在mysql数据库

    遇到一个问题,于是百度一下. 得到最佳答案 http://blog.163.com/metlive@126/blog/static/1026327120104232330131/     如何将四个字 ...

随机推荐

  1. Educational Codeforces Round 10

    A:Gabriel and Caterpillar 题意:蜗牛爬树问题:值得一提的是在第n天如果恰好在天黑时爬到END,则恰好整除,不用再+1: day = (End - Begin - day0)/ ...

  2. 【动态规划】bzoj1664 [Usaco2006 Open]County Fair Events 参加节日庆祝

    将区间按左端点排序. f(i)=max{f(j)+1}(p[j].x+p[j].y<=p[i].x && j<i) #include<cstdio> #incl ...

  3. sql2008日志文件截断

    日志文件比较大时,使用语句减少大小. USE DATABASENAME;GO-- Truncate the log by changing the database recovery model to ...

  4. ThinkPHP提交表单判断上传图片经验总结

    在用TP框架开发程序处理接收到的表单的时候,要判断<input type="file" name="img">是否有上传图片,来决定是否要将对应的图 ...

  5. Excel VBA自动添加证书(二)

    继续上次没有写完的随笔,本来是很想一次性写完的,但是到中午一点了还没有吃东西,其实饿的不行了,还好写博客时会自动保存,中间电脑实然蓝屏,花了二个多小时写的没有点击保存,吓我一下,以为会全没了. 前面讲 ...

  6. 高性能的数据压缩库libzling

    libzling(https://github.com/richox/libzling)是一款高性能的数据压缩库,在压缩时间和压缩率上都超过了流行的zlib/gzip.libzling使用的是ROLZ ...

  7. iPad应用开发者的建议

    原文摘自Smashing Magazine<A Dad’s Plea To Developers of iPad Apps For Children> 我花了很长时间为孩子购买和测试iPa ...

  8. 六天玩转javascript:javascript变量与表达式(2)

    本系列内容为本人平时项目实践和参照MDN,MSDN,<javascript语言精粹>,<Effective Javascript>等资料,并且整理自己EverNote的日常积累 ...

  9. 享受LINQ:判断一组文字是否在字符串中同时出现的最简单方法

    需求是这样的:不允许在一个字符串中同时出现"博", "客", "园", "团", "队"这5个文字. ...

  10. Server Develop (九) Simple Web Server

    Simple Web Server web服务器hello world!-----简单的socket通信实现. HTTP HTTP是Web浏览器与Web服务器之间通信的标准协议,HTTP指明了客户端如 ...