Python爬取小说+Servlet+C3P0+MVC构建小说api

一、摘要：

使用python爬取网络小说数据存入数据库，利用C3P0数据库连接池获取数据库数据，采用MVC三层架构对数据库数据进行操作，输出JSON格式数据到前端页面

二、内容：

1.gitee外链消失，故删除

2.核心源码分析

2.1 通过Python爬取小说数据存入数据库

import os

from time import sleep

import requests

import parsel

from parsel import Selector #请求网页html数据

import pymysql #请求数据库

def download_one_chapter(chapter_url, bookname, lebieming):
    """Fetch one chapter, save it as a text file and store it in MySQL.

    ``chapter_url`` is split on ``/``: element 4 is stored as the novel id
    (``xiaoshuoid``) and element 5, minus its extension, as the chapter id
    (``zhangjieid``).  The chapter title is the text of the first ``<h1>``
    on the page and the body is every text node directly under ``#content``.

    The text is written to ``biquge/<lebieming>/<bookname>/<title>.txt``
    (the directory must already exist) and the same text is inserted into
    the ``zhangjie`` table as ``zhangjieleirong``.  As in the file, the
    stored text starts with the title, immediately followed by the first
    body line.

    Args:
        chapter_url: Absolute URL of the chapter page.
        bookname: Novel name, used as a directory name.
        lebieming: Category name, used as the parent directory name.

    Raises:
        ValueError: If the page contains no ``<h1>`` title.
        IndexError: If ``chapter_url`` has fewer than six ``/``-separated
            parts.
    """
    # A timeout keeps one stalled connection from hanging the whole crawl.
    response = requests.get(chapter_url, timeout=30)
    # Guess the charset from the body rather than trusting the headers.
    response.encoding = response.apparent_encoding

    url_parts = chapter_url.split('/')
    book_id = url_parts[4]
    chapter_id = url_parts[5].split('.')[0]

    sel = Selector(response.text)
    title = sel.css('h1::text').get()
    if title is None:
        raise ValueError('no <h1> chapter title found at ' + chapter_url)

    # Title first (no separator), then one stripped body line per row.
    body_lines = [line.strip() + '\n' for line in sel.css('#content::text').getall()]
    text = title + ''.join(body_lines)

    chapter_path = os.path.join('biquge', lebieming, bookname, title + '.txt')
    with open(chapter_path, mode='w', encoding='utf-8') as chapter_file:
        chapter_file.write(text)

    if not text:
        # Nothing was written, so there is nothing to store either.
        return

    # Store the complete text.  It is deliberately NOT split on ",": doing so
    # would cut the chapter off at the first ASCII comma.
    content = text.strip('\n')

    conn = pymysql.connect(host='localhost', port=3306, user='root', passwd='123456', db='xiaoshuo', charset='utf8')
    try:
        with conn.cursor() as cursor:
            cursor.execute(
                "insert into zhangjie(zhangjieid,zhangjiename,zhangjieleirong,xiaoshuoid) values(%s,%s,%s,%s)",
                [chapter_id, title, content, book_id])
        conn.commit()
    finally:
        # Always release the connection, even when the insert fails.
        conn.close()

def download_one_book(book_url, bookname, lebieming, skip_chapters=12, max_chapters=2):
    """Register one novel in MySQL and download a slice of its chapters.

    The novel id (``xiaoshuoid``) is element 4 of ``book_url`` split on
    ``/``.  The category id (``xiaoshuofenleiid``) is the 1-based position of
    ``lebieming`` in the fixed category list.  A novel whose directory
    ``biquge/<lebieming>/<bookname>`` already exists is treated as stored and
    skipped without any network or database access.

    Args:
        book_url: Absolute URL of the novel's chapter-list page.
        bookname: Novel name, stored as ``xiaoshuoname`` and used as the
            directory name.
        lebieming: Category name; must be one of the known categories.
        skip_chapters: Number of leading links in the chapter list to skip.
        max_chapters: Maximum number of chapters to download after the
            skipped ones, or ``None`` for all of them.  The defaults
            download the links at positions 12 and 13.

    Raises:
        ValueError: If ``lebieming`` is not a known category.
        IndexError: If ``book_url`` has fewer than five ``/``-separated parts.
    """
    book_dir = os.path.join('biquge', lebieming, bookname)
    if os.path.exists(book_dir):
        print("已存储有" + bookname + "小说")
        return

    # Resolve the category before any side effect so that an unknown category
    # cannot leave a half-created book behind.
    categories = ['玄幻', '武侠', '都市', '历史', '侦探', '网游', '科幻']
    category_id = categories.index(lebieming) + 1

    book_id = book_url.split('/')[4]
    # The cover path uses the id without its last three characters as a
    # sub-directory.
    cover_url = ('http://www.shuquge.com/files/article/image/'
                 + book_id[0:len(book_id) - 3] + '/' + book_id + '/' + book_id + 's.jpg')

    # A timeout keeps one stalled connection from hanging the whole crawl.
    response = requests.get(book_url, timeout=30)
    response.encoding = response.apparent_encoding
    sel = Selector(response.text)

    conn = pymysql.connect(host='localhost', port=3306, user='root', passwd='123456', db='xiaoshuo', charset='utf8')
    try:
        with conn.cursor() as cursor:
            cursor.execute(
                "insert into xiaoshuoinfo(xiaoshuoid,xiaoshuoname,xiaoshuofenleiid,xiaoshuoimg) values(%s,%s,%s,%s)",
                [book_id, bookname, category_id, cover_url])
        conn.commit()
    finally:
        conn.close()

    # The directory doubles as the "already stored" marker, so create it only
    # once the database row exists.
    os.makedirs(book_dir, exist_ok=True)

    chapter_links = sel.css('.listmain a::attr(href)').getall()
    stop = None if max_chapters is None else skip_chapters + max_chapters
    # Slicing tolerates books with fewer links than requested.
    for link in chapter_links[skip_chapters:stop]:
        download_one_chapter('http://www.shuquge.com/txt/' + book_id + '/' + link, bookname, lebieming)

    print("读取" + bookname + "小说完成")

def download_category(category_url, leibieming):
    """Download every novel listed on one category page.

    Novel names and URLs are read from the ``span.s2 a`` links of the page
    and paired in document order.  Each pair is handed to
    ``download_one_book``.

    Args:
        category_url: Absolute URL of the category listing page.
        leibieming: Category name; also the directory created under
            ``biquge/``.
    """
    # A timeout keeps one stalled connection from hanging the whole crawl.
    response = requests.get(category_url, timeout=30)
    response.encoding = response.apparent_encoding
    sel = Selector(response.text)

    # makedirs with exist_ok lets the crawl be re-run and also creates the
    # top-level "biquge" directory on the first run.
    os.makedirs(os.path.join('biquge', leibieming), exist_ok=True)

    book_names = sel.css('span.s2 a::text').getall()
    book_urls = sel.css('span.s2 a::attr(href)').getall()
    # zip stops at the shorter list, so a link without text cannot cause an
    # IndexError part-way through the category.
    for book_url, book_name in zip(book_urls, book_names):
        download_one_book(book_url, book_name, leibieming)

def download_allcategory(website_url):
    """Download the categories linked from the site's navigation bar.

    At most the first seven ``.nav a`` hrefs are processed.  The href at
    position ``k`` (0-based) is paired with the link text at position
    ``k + 1``.
    NOTE(review): the one-item offset presumably compensates for a leading
    nav entry that yields text but no href -- confirm against the page.

    Args:
        website_url: URL of the site's home page.
    """
    page = requests.get(website_url)
    page.encoding = page.apparent_encoding
    nav = Selector(page.text)

    names = nav.css('.nav a::text').getall()
    hrefs = nav.css('.nav a::attr(href)').getall()

    for position, href in enumerate(hrefs[:7], start=1):
        print(href)
        download_category(href, names[position])

# Script entry point: start the crawl from the site's home page.
# This call sits at module level, so it also runs when the file is imported.
download_allcategory('http://www.shuquge.com/')

2.2 获取所有小说信息

2.2.1 Controller—BookServlet

 // Read the "method" request parameter, which selects the operation to run on the book route.

String method = req.getParameter("method");

System.out.println("BookServlet中使用方法为：" + method);

// gson serializes the result; json stays empty until a case below fills it.

Gson gson = new Gson();

String json="";

List<NovelInfo> list=new ArrayList<>();

// Default to findAllBook when no "method" parameter was supplied.

if (method == null) {

    method = "findAllBook";

}

// Dispatch on the method name. In this excerpt a name with no matching case leaves json as "".

switch(method){

    case "findAllBook":

        // Load every novel through the service layer and serialize the list to JSON.

        list=bookService.findAllBook();

        json = gson.toJson(list);

        break;

}

2.2.2 Service—BookService

/**
 * Service-layer contract for novel queries.
 */
public interface BookService {

    /**
     * Returns all novels.
     *
     * @return the list of novels
     */
    List<NovelInfo> findAllBook();
}

/**
 * {@link BookService} implementation that forwards each call to a
 * {@link BookRepository}.
 */
public class BookServiceImpl implements BookService {

    /** Repository that performs the actual data access. */
    private final BookRepository bookRepository = new BookRepositoryImpl();

    /**
     * Returns all novels exactly as the repository supplies them.
     *
     * @return the repository's list of novels
     */
    @Override
    public List<NovelInfo> findAllBook() {
        final List<NovelInfo> novels = bookRepository.findAllBook();
        return novels;
    }
}

2.2.3 Repository—BookRepository

/**
 * Data-access contract for novel records.
 */
public interface BookRepository {

    /**
     * Loads all novels from the data store.
     *
     * @return the list of novels
     */
    List<NovelInfo> findAllBook();
}

/**
 * JDBC implementation of {@link BookRepository} that reads the
 * {@code xiaoshuoinfo} table through connections obtained from
 * {@code JdbcTools}.
 */
public class BookRepositoryImpl implements BookRepository {

    /** Query that loads every row of the novel table. */
    private static final String FIND_ALL_SQL = "SELECT * FROM xiaoshuoinfo";

    /**
     * Loads every row of {@code xiaoshuoinfo} and maps its first four
     * columns, read as strings in table order, to a {@code NovelInfo}.
     *
     * <p>A {@link SQLException} is printed and not rethrown; the rows read
     * before the failure (possibly none) are still returned.
     *
     * @return a new list holding one {@code NovelInfo} per row read
     */
    @Override
    public List<NovelInfo> findAllBook() {
        // Local on purpose: a list kept in a field would accumulate rows
        // across calls (duplicates) and be shared between request threads.
        List<NovelInfo> novels = new ArrayList<>();

        Connection connection = JdbcTools.getConnection();
        PreparedStatement statement = null;
        ResultSet resultSet = null;
        try {
            statement = connection.prepareStatement(FIND_ALL_SQL);
            resultSet = statement.executeQuery();
            while (resultSet.next()) {
                novels.add(new NovelInfo(
                        resultSet.getString(1),
                        resultSet.getString(2),
                        resultSet.getString(3),
                        resultSet.getString(4)));
            }
        } catch (SQLException e) {
            e.printStackTrace();
        } finally {
            // Hand the connection, statement and result set back in every case.
            JdbcTools.release(connection, statement, resultSet);
        }
        return novels;
    }
}

2.3 其他

根据小说种类ID获取该种类所有小说信息、获取小说所有种类、获取所有小说章节、获取一本小说所有章节、根据小说ID获取该小说信息等其余接口，请参看源码链接。

python爬取小说

小说api

Python爬取小说+Servlet+C3P0+MVC构建小说api的更多相关文章

python 爬取qidian某一页全部小说
本文纯粹用于技术练习,请勿用作非法途径 import re import urllib.request from bs4 import BeautifulSoup import time url= ...
python爬取+使用网易卡搭作品数量api
第一步,当然是打开浏览器~ 然后打开卡搭~ 看着熟悉的界面,是不是有点不知所措? 这就对了,咱找点事情干干. 随便找个倒霉蛋,比如这位:"混世大王",打开他的主页! 按下f12(我 ...
python入门学习之Python爬取最新笔趣阁小说
Python爬取新笔趣阁小说,并保存到TXT文件中我写的这篇文章,是利用Python爬取小说编写的程序,这是我学习Python爬虫当中自己独立写的第一个程序,中途也遇到了一些困难,但是最后 ...
python 爬取王者荣耀高清壁纸
代码地址如下:http://www.demodashi.com/demo/13104.html 一.前言打过王者的童鞋一般都会喜欢里边设计出来的英雄吧,特别想把王者荣耀的英雄的高清图片当成电脑桌面 ...
python爬取博客圆首页文章链接+标题
新人一枚,初来乍到,请多关照来到博客园,不知道写点啥,那就去瞄一瞄大家都在干什么好了. 使用python 爬取博客园首页文章链接和标题. 首先当然是环境了,爬虫在window10系统下,python ...
python 爬取豆瓣书籍信息
继爬取猫眼电影TOP100榜单之后,再来爬一下豆瓣的书籍信息(主要是书的信息,评分及占比,评论并未爬取).原创,转载请联系我. 需求:爬取豆瓣某类型标签下的所有书籍的详细信息及评分语言:pyth ...
python爬取《龙岭迷窟》的数据，看看质量剧情还原度到底怎么样
前言文的文字及图片来源于网络,仅供学习.交流使用,不具有任何商业用途,版权归原作者所有,如有问题请及时联系我们以作处理. 作者:简单 PS:如有需要Python学习资料的小伙伴可以加点击下方链接自行 ...
手把手教你使用Python爬取西刺代理数据（下篇）
/1 前言/ 前几天小编发布了手把手教你使用Python爬取西刺代理数据(上篇),木有赶上车的小伙伴,可以戳进去看看.今天小编带大家进行网页结构的分析以及网页数据的提取,具体步骤如下. /2 首页分析 ...
Python 爬取所有51VOA网站的Learn a words文本及mp3音频
Python 爬取所有51VOA网站的Learn a words文本及mp3音频 #!/usr/bin/env python # -*- coding: utf-8 -*- #Python 爬取所有5 ...
python爬取网站数据
开学前接了一个任务,内容是从网上爬取特定属性的数据.正好之前学了python,练练手. 编码问题因为涉及到中文,所以必然地涉及到了编码的问题,这一次借这个机会算是彻底搞清楚了. 问题要从文字的编码讲 ...

随机推荐

Splashtop ：符合 HIPAA 标准的远程桌面软件
如果您正在寻找可帮助您保持 HIPAA 遵从性的远程桌面软件,那么 Splashtop 就是您的最佳选择. 如果您的公司属于美国医疗保健行业,则您知道您必须遵守有关敏感和私人患者信息的联邦 HIPAA ...
我的书《Unity3D动作游戏开发实战》出版了
首先感谢帮助和参与前期检阅的朋友们.本书是我经验积累的提炼,书中既有干货分享也有对基础内容的详解补充. 同时由于是第一次撰写书籍,许多地方仍有不足还请读者朋友们见谅. 在京东或当当等都可以购买到本书: ...
Python函数与模块的精髓与高级特性
本文分享自华为云社区<Python函数与模块的精髓与高级特性>,作者:柠檬味拥抱. Python 是一种功能强大的编程语言,拥有丰富的函数和模块,使得开发者能够轻松地构建复杂的应用程序.本 ...
Pageoffice6 实现后台批量转PDF文档
在实际项目开发中如果遇到批量动态生成PDF文档的需求,只需参考后台批量生成PDF文档,目前网上也有一些针对此需求的方案,如果您想要了解这些方案的对比,请查看后台生成单个Word文档中的"方案 ...
第三方调用saltstack
python api使用实例及工作原理解析指定target 函数,命令等等,就可以了. [root@mcw01 ~]# python Python 2.7.5 (default, Aug 4 ...
EDP .Net开发框架--权限
平台下载地址:https://gitee.com/alwaysinsist/edp 权限介绍权限实际上就是谁有权使用或是访问什么,这里的"谁"可以视作"授权对象" ...
flask blinker信号
Flask框架中的信号基于blinker,其主要就是让开发者可以在flask请求过程中定制一些用户行为. pip3 install blinker 1.内置信号 request_started = _ ...
逆向WeChat(四)
本篇在博客园地址https://www.cnblogs.com/bbqzsl/p/18209439 mars 先回顾一下,在上两篇我对wechat如何使用chrome::base框架的分析中存有错漏. ...
神经网络常见参数解释：epoch、batch、batch size、step、iteration
本文介绍在机器学习.深度学习的神经网络模型中,epoch.batch.batch size.step与iteration等名词的具体含义. epoch:表示将训练数据集中的所有样本都过一遍(且 ...
Qt-FFmpeg开发-视频播放（4）
音视频/FFmpeg #Qt Qt-FFmpeg开发-视频播放[软解码 + OpenGL显示YUV420P图像] 目录音视频/FFmpeg #Qt Qt-FFmpeg开发-视频播放[软解码 + Op ...

Python爬取小说+Servlet+C3P0+MVC构建小说api

一、摘要：

使用python爬取网络小说数据存入数据库，利用C3P0数据库连接池获取数据库数据，采用MVC三层架构对数据库数据进行操作，输出JSON格式数据到前端页面

二、内容：

1.gitee外链消失，故删除

2.核心源码分析

2.1 通过Python爬取小说数据存入数据库

2.2 获取所有小说信息

2.2.1 Controller—BookServlet

2.2.2 Service—BookService

2.2.3 Repository—BookRepository

2.3 其他

根据小说种类ID获取该种类所有小说信息、获取小说所有种类、获取所有小说章节、获取一本小说所有章节、根据小说ID获取该小说信息等其余接口，请参看源码链接。

Python爬取小说+Servlet+C3P0+MVC构建小说api的更多相关文章

随机推荐

热门专题