python 爬取豆瓣书籍信息

继爬取猫眼电影TOP100榜单之后，再来爬一下豆瓣的书籍信息（主要是书的信息，评分及占比，评论并未爬取）。原创，转载请联系我。

需求：爬取豆瓣某类型标签下的所有书籍的详细信息及评分

语言：python

支持库：

正则、解析和搜索：re、requests、bs4、lxml（后三者需要安装）
随机数：time、random
Excel 读写：openpyxl（需要安装）

步骤：三步走

访问标签页面，获取该标签下的所有书籍的链接
逐一访问书籍链接，爬取书籍信息和评分
持久化存储书籍信息（这里用了excel，可以使用数据库）

一、访问标签页面，获取该标签下的所有书籍的链接

照例，我们先看一下豆瓣的Robots.txt ，不能爬取禁止的内容。

我们这一步要爬取的标签页面，以小说为例 https://book.douban.com/tag/%E5%B0%8F%E8%AF%B4

先去看看它的HTML结构

发现，每一本书，都在一个<li>标签当中，而我们需要的只是那张图片的链接（就是书籍页面的链接）

这样，就可以写正则或者是利用bs4(BeautifulSoup)来获取书籍的链接。

可以看到，每一页只显示了20本书，所以需要遍历访问所有的页面，它的页面链接也是有规律的。

第二页：https://book.douban.com/tag/%E5%B0%8F%E8%AF%B4?start=20&type=T

第三页：https://book.douban.com/tag/%E5%B0%8F%E8%AF%B4?start=40&type=T

即：start每次递增20就好了。

下面来看代码：

 # -*- coding: utf-8 -*-

 # @Author  : yocichen

 # @Email   : yocichen@126.com

 # @File    : labelListBooks.py

 # @Software: PyCharm

 # @Time    : 2019/11/11 20:10

 import re

 import openpyxl

 import requests

 from requests import RequestException

 from bs4 import BeautifulSoup

 import lxml

 import time

 import random

 # NOTE(review): this list is not referenced by any function or by the main
 # block visible in this script; presumably a leftover - confirm before removing.
 src_list = []

 def get_one_page(url, timeout=10):
     '''
     Fetch a page with the requests module and return its html text.

     :param url: page url
     :param timeout: seconds to wait for the server before giving up; without
         a timeout requests can block indefinitely on a stalled connection
     :return: the response text when the status code is 200, otherwise None
         (also None when requests raises, which includes a timeout)
     '''
     # One of these user-agent strings is picked at random for every request.
     user_agents = ['Mozilla/5.0', 'Chrome/78.0.3904.97', 'Safari/537.36']
     headers = {'user-agent': random.choice(user_agents)}
     # The proxy is optional: remove or replace it when it stops working.
     # NOTE(review): the mapping only has an 'http' key, so per the requests
     # proxy rules it should not apply to https:// urls - confirm.
     proxies = {'http': '171.15.65.195:9999'}
     try:
         response = requests.get(url, headers=headers, proxies=proxies,
                                 timeout=timeout)
     except RequestException:
         return None
     if response.status_code == 200:
         return response.text
     return None

 def get_page_src(html, selector):
     '''
     Collect the href values found inside the elements matched by a selector.

     :param html: html text of a tag (label) page, or None
     :param selector: CSS selector of the elements that hold the book links
     :return: list of href values in document order; an empty list when html
         is None
     '''
     if html is None:
         return []
     matched = BeautifulSoup(html, 'lxml').select(selector)
     # The matched elements are turned back into markup text and every
     # href="..." attribute value in that text is extracted.
     return re.findall('href="(.*?)"', str(matched), re.S)

 def write_excel_xlsx(items, file):
     '''
     Append items to column 1 of the first worksheet of an existing workbook.

     :param items: sequence of values to store (the book links here), one per row
     :param file: path of the *.xlsx file; it is loaded, extended and saved back
     :return: the number of items written
     '''
     count = len(items)
     workbook = openpyxl.load_workbook(file)
     sheet = workbook.worksheets[0]
     # New values go below the last row the sheet reports as used.
     first_free_row = sheet.max_row + 1
     for offset, item in enumerate(items):
         sheet.cell(first_free_row + offset, 1).value = item
     # Persist the workbook back to the same *.xlsx file.
     workbook.save(file)
     return count

 if __name__ == '__main__':
     # Workbook the links are appended to; it has to exist before the run.
     src_file = 'masterpiece_books_src.xlsx'
     # Tag page to crawl. To crawl another tag, replace the percent-encoded
     # tag name after "tag/", for example:
     #   novel       : https://book.douban.com/tag/%E5%B0%8F%E8%AF%B4
     #   program     : https://book.douban.com/tag/%E7%BC%96%E7%A8%8B
     #   computer    : https://book.douban.com/tag/%E8%AE%A1%E7%AE%97%E6%9C%BA
     #   masterpiece : https://book.douban.com/tag/%E5%90%8D%E8%91%97
     tag_url = 'https://book.douban.com/tag/%E5%90%8D%E8%91%97'
     # A single selector that matches the title of every book in the list.
     # The earlier version used li:nth-child(1..20), which parsed the same
     # html 20 times and loaded/saved the workbook 20 times per page.
     selector = '#subject_list > ul > li > div.info > h2'
     total = 0
     # Why 50 pages: the site shows more, but at the time of writing only the
     # first 50 pages (20 books each) returned any data.
     for page_index in range(0, 50):
         url = tag_url + '?start=' + str(page_index*20) + '&type=T'
         # Download the page once, extract all links, store them in one write.
         html = get_one_page(url)
         src = get_page_src(html, selector)
         one_loop_done = write_excel_xlsx(src, src_file)
         total += one_loop_done
         print(one_loop_done, 'done')
     print('Total', total, 'done')

注释比较清楚了，先获取页面HTML，正则或者bs4遍历获取每一页当中的书籍链接，存到excel文件中。

注意：如果需要直接使用我的代码，你只需要去看一下那个标签页面的链接，然后把代码里 URL 中 tag/ 后面的中文标签编码（例如 %E5%90%8D%E8%91%97）替换成对应标签的编码即可；另外要先创建一个excel文件，用以存储爬到的书籍链接。

二、逐一访问书籍链接，爬取书籍信息和评分

上一步我们已经爬到了小说标签下所有书籍的src，这一步就是要逐一访问这些src，爬取每本书的具体信息。

先看看要爬的信息的HTML结构

下面是书籍信息页面结构

再是评分页面结构

这样就可以利用正则表达式和bs4库来匹配到我们所需要的数据了。（试了试纯正则，比较难写，行不通）

下面看代码

 # -*- coding: utf-8 -*-

 # @Author  : yocichen

 # @Email   : yocichen@126.com

 # @File    : doubanBooks.py

 # @Software: PyCharm

 # @Time    : 2019/11/9 11:38

 import re

 import openpyxl

 import requests

 from requests import RequestException

 from bs4 import BeautifulSoup

 import lxml

 import time

 import random

 def get_one_page(url, timeout=10):
     '''
     Fetch a page with the requests module and return its html text.

     :param url: page url
     :param timeout: seconds to wait for the server before giving up; without
         a timeout requests can block indefinitely on a stalled connection
     :return: the response text when the status code is 200, otherwise None
         (also None when requests raises, which includes a timeout)
     '''
     # One of these user-agent strings is picked at random for every request.
     user_agents = ['Mozilla/5.0', 'Chrome/78.0.3904.97', 'Safari/537.36']
     headers = {'user-agent': random.choice(user_agents)}
     try:
         # A proxy can be added here if needed,
         # e.g. proxies={'http':'171.15.65.195:9999'}
         response = requests.get(url, headers=headers, timeout=timeout)
     except RequestException:
         return None
     if response.status_code == 200:
         return response.text
     return None

 def get_request_res(pattern_text, html):
     '''
     Get one piece of book info from the page with a regular expression.

     The text captured by the first match is cut at the first '<' (so any
     nested tag is dropped) and surrounding whitespace is removed.

     :param pattern_text: re pattern with one capturing group
     :param html: page's html text, or None
     :return: the cleaned captured text, or 'NULL' when html is None or the
         pattern does not match
     '''
     if html is None:
         return 'NULL'
     found = re.findall(pattern_text, html, re.S)
     if not found:
         return 'NULL'
     # strip() instead of dropping the first character unconditionally: the
     # value is kept intact when the page has no space after the label.
     return found[0].split('<', 1)[0].strip()

 def get_bs_res(selector, html):
     '''
     Get one piece of book info from the page with a bs4 CSS selector.

     :param selector: info selector
     :param html: page's html text
     :return: the .string of the first matched element, or 'NULL' when the
         selector matches nothing
     '''
     matches = BeautifulSoup(html, 'lxml').select(selector)
     if not matches:
         return 'NULL'
     # NOTE(review): per the bs4 documentation .string can be None (e.g. when
     # the element has several children), so callers may receive None here.
     return matches[0].string

 def get_bs_img_res(selector, html):
     '''
     Get the markup of the first element matched by a bs4 CSS selector.

     :param selector: element selector (used for the cover <img> tag)
     :param html: page's html text
     :return: the first matched element converted to a string, or 'NULL' when
         the selector matches nothing
     '''
     matches = BeautifulSoup(html, 'lxml').select(selector)
     # Plain truthiness test: comparing a length with `is not 0` checks object
     # identity against a literal and raises a SyntaxWarning on Python 3.8+.
     if matches:
         return str(matches[0])
     return 'NULL'

 def parse_one_page(html):
     '''
     Parse the useful info of a book page with the bs4 and re helpers.

     Keys are inserted in a fixed order (Book_name, Author, publisher,
     publish_time, ISBN, img_src, book_intro, author_intro, Score, commments,
     5_stars ... 1_stars); write_bookinfo_excel writes the values as columns
     in dict iteration order, so the order must not change.

     :param html: page's html text
     :return: all of book info(dict); a field that is not found holds 'NULL'
         (fields read through get_bs_res may also hold None)
     '''
     book_info = {}

     book_info['Book_name'] = get_bs_res('div > h1 > span', html)

     author = get_bs_res('div > span:nth-child(1) > a', html)
     if author is None:
         # The first selector matched an element without a single string;
         # try the link directly under #info instead.
         author = get_bs_res('#info > a:nth-child(2)', html)
     if author is None:
         # Neither selector produced text: store the sentinel instead of
         # failing on None.replace below.
         author = 'NULL'
     book_info['Author'] = author.replace(" ", "").replace("\n", "")

     book_info['publisher'] = get_request_res(u'出版社:</span>(.*?)<br/>', html)
     book_info['publish_time'] = get_request_res(u'出版年:</span>(.*?)<br/>', html)
     book_info['ISBN'] = get_request_res(u'ISBN:</span>(.*?)<br/>', html)

     # The cover url is the src attribute of the <img> markup.
     img_label = get_bs_img_res('#mainpic > a > img', html)
     img = re.findall('src="(.*?)"', img_label, re.S)
     book_info['img_src'] = img[0] if img else 'NULL'

     book_info['book_intro'] = get_bs_res(
         '#link-report > div:nth-child(1) > div > p', html)
     book_info['author_intro'] = get_bs_res(
         '#content > div > div.article > div.related_info > div:nth-child(4) > div > div > p',
         html)

     # Strip the whitespace around the score. An empty result means the book
     # has no score yet. Slicing off the first character (the old approach)
     # turned the 'NULL' sentinel into 'ULL' and failed on None.
     grade = get_bs_res('div > div.rating_self.clearfix > strong', html)
     grade = grade.strip() if grade is not None else ''
     book_info['Score'] = grade if grade else 'NULL'

     book_info['commments'] = get_bs_res(
         '#interest_sectl > div > div.rating_self.clearfix > div > div.rating_sum > span > a > span',
         html)

     # Share of each star rating, from five stars down to one.
     star_selectors = (
         ('5_stars', '#interest_sectl > div > span:nth-child(5)'),
         ('4_stars', '#interest_sectl > div > span:nth-child(9)'),
         ('3_stars', '#interest_sectl > div > span:nth-child(13)'),
         ('2_stars', '#interest_sectl > div > span:nth-child(17)'),
         ('1_stars', '#interest_sectl > div > span:nth-child(21)'),
     )
     for key, selector in star_selectors:
         book_info[key] = get_bs_res(selector, html)

     return book_info

 def write_bookinfo_excel(book_info, file):
     '''
     Append one book as a new row of the first worksheet of an existing workbook.

     :param book_info: a dict; its values are written to columns 1, 2, ... in
         the dict's iteration order
     :param file: path of the *.xlsx file; it is loaded, extended and saved back
     :return: how much the sheet's max_row grew (the num of successful item)
     '''
     workbook = openpyxl.load_workbook(file)
     sheet = workbook.worksheets[0]
     rows_before = sheet.max_row
     # The new row goes right below the last row the sheet reports as used.
     target_row = rows_before + 1
     for column, key in enumerate(book_info, start=1):
         sheet.cell(target_row, column).value = book_info[key]
     added = sheet.max_row - rows_before
     workbook.save(file)
     return added

 def read_booksrc_get_info(src_file, info_file, start_row=1):
     '''
     Read the src file, access each src, parse the html and write the info into a file.

     :param src_file: workbook whose first column holds the book links
     :param info_file: workbook the parsed book info is appended to
     :param start_row: 1-based row of src_file to start from. Defaults to the
         first row; pass a later row to resume an interrupted run (the start
         used to be hard-coded to row 868, which skipped every earlier link)
     :return: the num of successful item
     '''
     wb = openpyxl.load_workbook(src_file)
     ws = wb.worksheets[0]
     done = 0
     for i in range(start_row, ws.max_row + 1):
         src = ws.cell(i, 1).value
         if src is None:
             continue
         html = get_one_page(str(src))
         if html is None:
             # get_one_page returns None when the download fails; the parsers
             # cannot work on None (re.findall raises TypeError), so skip the
             # row instead of aborting the whole run.
             print('skip row', i, src)
             continue
         book_info = parse_one_page(html)
         done += write_bookinfo_excel(book_info, info_file)
         # Progress report whenever the running total is a multiple of 10.
         if done % 10 == 0:
             print(done, 'done')
     return done

 if __name__ == '__main__':
     # To try a single book first, fetch one page with get_one_page (for
     # example https://book.douban.com/subject/1770782/), pass the html to
     # parse_one_page and store the resulting dict with write_bookinfo_excel.

     # First argument: the workbook with the book links produced by the
     # previous script; second argument: the workbook the book info is
     # written to.
     finished = read_booksrc_get_info('masterpiece_books_src.xlsx', 'masterpiece_books_info.xlsx')
     print(finished, 'done')

注意：如果要直接使用的话，需要做的只是给参数而已，第一个是上一步获取的src文件，第二个是需要存储书籍信息的文件（需要事先创建一下）

三、持久化存储书籍信息（Excel）

使用excel存储书籍的src列表和书籍的具体信息，需要使用openpyxl库进行读写excel。代码在上面write_*/read_*函数中。

效果

爬到的小说类书籍的src

爬到的书籍详细信息

后记

写这个前后大概花了有两整天吧，爬虫要做的工作还是比较细致的，需要分析HTML页面还要写正则表达式。话说，使用bs4真的是很简单，只需要copy一下selector就ok了，较正则可以大大提高效率。另外，单线程爬虫是比较蠢的。还有很多不足（诸如代码不规整，不够健壮），欢迎指正。

你可能需要的 GitHub 传送门

参考资料

【1】豆瓣robots.txt https://www.douban.com/robots.txt

【2】https://blog.csdn.net/jerrygaoling/article/details/81051447

【3】https://blog.csdn.net/zhangfn2011/article/details/7821642

【4】https://www.kuaidaili.com/free