requests爬取知乎话题和子话题

zhihu.py

# *_*coding:utf-8 *_*

import pymysql

import requests

from lxml import etree

from requests_test.child_topic import GetChildTopic

from requests_test.parent_topic import GetParentTopic

if __name__ == "__main__":
    # Step 1: scrape the top-level topics; the result maps topic id -> title.
    topic_map = GetParentTopic().get_parent_data()

    # Step 2: walk up to 50 result pages of child topics for every parent id.
    crawler = GetChildTopic()
    for parent_id in topic_map:
        print("parent_id:", parent_id)
        crawler.get_child_data(parent_id, 50)

parent_topic.py

# *_*coding:utf-8 *_*

import pymysql

from lxml import etree

import requests

class GetParentTopic(object):
    """Scrape the top-level topics from the Zhihu topics page into MySQL.

    The database connection is opened when the object is created and is
    closed by ``get_parent_data``, so an instance is good for one call.
    """

    def __init__(self):
        # Connection settings are hard-coded for the local development database.
        self.conn = pymysql.connect(host='192.168.33.10', user='root', passwd='root', db='spider', charset='utf8')
        self.cur = self.conn.cursor()

    def get_parent_data(self):
        """Fetch the topics page, store every top-level topic and return them.

        Each ``<li>`` of the category list is inserted into the ``topic``
        table with ``insert ignore``; only ``title``, ``topic_id`` and
        ``create_time`` are supplied.

        Returns:
            dict: mapping of topic id (the ``data-id`` attribute, a string)
            to the topic title.

        Raises:
            IndexError: if a list item lacks the link text or ``data-id``.

        The cursor and connection are always closed before returning or
        propagating an error.
        """
        import time

        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
        }
        url = 'https://www.zhihu.com/topics'

        # Placeholders let the driver escape the scraped values, so a quote in
        # a title can neither break nor inject into the statement.
        sql = "insert ignore into topic(`title`,`topic_id`,`create_time`) values(%s,%s,%s)"

        parent_topic = {}
        try:
            response = requests.get(url, headers=headers)
            html = etree.HTML(response.text)

            for li in html.xpath("//ul[@class='zm-topic-cat-main clearfix']/li"):
                title = li.xpath('./a/text()')[0]
                topic_id = li.xpath('./@data-id')[0]
                parent_topic[topic_id] = title

                # Timestamp formatted like 2016-03-20 11:45:39.
                now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())

                self.cur.execute(sql, (title, topic_id, now))
                # Commit per row so rows stored before a later failure are kept.
                self.conn.commit()
        finally:
            # Release the database resources even when scraping fails.
            self.cur.close()
            self.conn.close()

        return parent_topic

child_topic.py

# *_*coding:utf-8 *_*

import json

import urllib

from time import sleep

import pymysql

from lxml import etree

import requests

class GetChildTopic(object):
    """Scrape the child topics of a Zhihu parent topic and store them in MySQL.

    The database connection is opened in ``__init__`` and released when the
    object is garbage collected (``__del__``).
    """

    # Characters removed by ``sql_filter``.
    _DIRTY_CHARS = "\"\\/*'=-#;<>+%$()@!"
    _STRIP_TABLE = str.maketrans("", "", _DIRTY_CHARS)

    # Upper bound applied to every scraped text field before it is stored.
    _MAX_FIELD_LENGTH = 200

    def __init__(self):
        # Connection settings are hard-coded for the local development database.
        self.conn = pymysql.connect(host='192.168.33.10', user='root', passwd='root', db='spider', charset='utf8')
        self.cur = self.conn.cursor()

    def sql_filter(self, sql, max_length=20):
        """Return ``sql`` without the blacklisted characters, cut to ``max_length``.

        Kept for backward compatibility. The class itself no longer uses it,
        because values are now passed to the driver as query parameters and
        stripping characters such as ``/`` corrupted the stored URLs.

        Args:
            sql: text to clean.
            max_length: maximum length of the returned string.

        Returns:
            str: the cleaned and truncated text.
        """
        return sql.translate(self._STRIP_TABLE)[:max_length]

    def get_child_data(self, parent_id, total_pages):
        """Fetch and store up to ``total_pages`` pages of child topics.

        Args:
            parent_id: id of the parent topic; an int or a numeric string.
            total_pages: maximum number of pages to request (the offset
                advances by 20 per page).

        Raises:
            ValueError: if ``parent_id`` is not numeric.
            IndexError: if a returned item lacks an image or a link.

        Paging stops early as soon as the endpoint returns an empty page.
        """
        from urllib.parse import urljoin

        # The conversion result was previously thrown away; keep it so the
        # request payload and the stored rows use a real integer.
        parent_id = int(parent_id)

        url = "https://www.zhihu.com/node/TopicsPlazzaListV2"
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36",
        }
        limit = self._MAX_FIELD_LENGTH

        for page in range(1, total_pages + 1):
            output = []
            print("now_parent_id", parent_id, "now_page:", page)

            offset = (page - 1) * 20
            data = {'method': 'next', "params": json.dumps({"topic_id": parent_id, "offset": offset, "hash_id": ""})}
            response = requests.post(url, data=data, headers=headers)
            print(url, response)
            print(data)

            res = response.json()['msg']
            # An empty page means there is nothing left to fetch
            # (the old ``len(res) < 0`` test could never be true).
            if not res:
                break

            for item in res:
                html = etree.HTML(item)
                title = html.xpath('//img/@alt')[0]
                img_url = html.xpath('//img/@src')[0]
                topic_url = html.xpath('//a[1]/@href')[0]
                # The topic id is the last path segment of the relative link.
                topic_id = topic_url.split('/')[-1]
                topic_url = urljoin(url, topic_url)

                # Use the description only when exactly one text node exists.
                desc = html.xpath('//p/text()')
                desc = desc[0] if len(desc) == 1 else ''

                output.append({
                    'title': title[:limit],
                    'img_url': img_url[:limit],
                    'topic_url': topic_url[:limit],
                    'desc': desc[:limit],
                    'topic_id': topic_id,
                    'parent_id': parent_id,
                })

            print(output)
            self.save_child_topic(output)

    def save_child_topic(self, data):
        """Insert the scraped child topics into the ``topic`` table.

        Args:
            data: iterable of dicts with the keys ``title``, ``topic_id``,
                ``img_url``, ``parent_id``, ``desc`` and ``topic_url``.

        Rows are written with ``insert ignore`` and ``level`` fixed to 1.
        """
        import time

        # Placeholders let the driver escape every value, so quotes in scraped
        # text cannot break or inject into the statement.
        sql = (
            "insert ignore into topic(`title`,`topic_id`,`img_url`,`parent_id`,"
            "`desc`,`topic_url`,`level`,`create_time`) "
            "values(%s,%s,%s,%s,%s,%s,%s,%s)"
        )

        for item in data:
            # Timestamp formatted like 2016-03-20 11:45:39.
            now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
            params = (
                item['title'], item['topic_id'], item['img_url'], item['parent_id'],
                item['desc'], item['topic_url'], 1, now,
            )
            self.cur.execute(sql, params)
            # Commit per row so rows stored before a later failure are kept.
            self.conn.commit()

    def __del__(self):
        # ``__init__`` may have failed before the attributes were assigned,
        # so only close what actually exists.
        cur = getattr(self, 'cur', None)
        if cur is not None:
            cur.close()
        conn = getattr(self, 'conn', None)
        if conn is not None:
            conn.close()

SQL（建表语句）

-- Table holding both the top-level topics and their child topics.
-- The parent scraper inserts only title, topic_id and create_time, so its
-- rows keep the defaults parent_id = 0 and level = 0; the child scraper
-- inserts rows with level = 1 and the id of the parent topic.
CREATE TABLE `topic` (

  `id` int(11) NOT NULL AUTO_INCREMENT,

  `title` varchar(255) NOT NULL DEFAULT '' COMMENT '标题',

  `topic_id` int(11) NOT NULL,

  `img_url` varchar(255) NOT NULL DEFAULT '' COMMENT '子标题图片',

  `parent_id` int(11) NOT NULL DEFAULT '0',

  `desc` text,

  -- Stored as text; the scrapers write it formatted as "%Y-%m-%d %H:%M:%S".
  `create_time` varchar(255) NOT NULL DEFAULT '',

  `topic_url` varchar(255) DEFAULT '' COMMENT '子标题超链接',

  `level` tinyint(4) NOT NULL DEFAULT '0' COMMENT '0父级 ',

  PRIMARY KEY (`id`),

  -- The scrapers' `insert ignore` statements rely on this unique key to skip
  -- a (topic_id, parent_id) pair that is already stored.
  UNIQUE KEY `uni_top_par` (`topic_id`,`parent_id`),

  KEY `index_parent_id` (`parent_id`),

  -- NOTE(review): topic_id is also the leftmost column of uni_top_par, so
  -- this index looks redundant -- confirm before dropping it.
  KEY `index_topic_id` (`topic_id`)

-- NOTE(review): the table is utf8mb4 while the scrapers connect with
-- charset='utf8'; confirm whether 4-byte characters need to be stored.
) ENGINE=InnoDB AUTO_INCREMENT=8379 DEFAULT CHARSET=utf8mb4;

requests爬取知乎话题和子话题的更多相关文章

爬取知乎热榜标题和连接（python，requests，xpath）
用python爬取知乎的热榜,获取标题和链接. 环境和方法:Ubuntu16.04.python3.requests.xpath 1.用浏览器打开知乎,并登录 2.获取cookie和User—Agen ...
16、爬取知乎大v张佳玮的文章“标题”、“摘要”、“链接”，并存储到本地文件
爬取知乎大v张佳玮的文章“标题”.“摘要”.“链接”,并存储到本地文件 # 爬取知乎大v张佳玮的文章“标题”.“摘要”.“链接”,并存储到本地文件 # URL https://www.zhihu.co ...
教程+资源,python scrapy实战爬取知乎最性感妹子的爆照合集(12G)!
一.出发点: 之前在知乎看到一位大牛(二胖)写的一篇文章:python爬取知乎最受欢迎的妹子(大概题目是这个,具体记不清了),但是这位二胖哥没有给出源码,而我也没用过python,正好顺便学一学,所以 ...
scrapy 爬取知乎问题、答案，并异步写入数据库（mysql）
python版本 python2.7 爬取知乎流程: 一 .分析在访问知乎首页的时候(https://www.zhihu.com),在没有登录的情况下,会进行重定向到(https://www. ...
python 爬取知乎图片
先上完整代码 import requests import time import datetime import os import json import uuid from pyquery im ...
scrapy爬取知乎某个问题下的所有图片
前言: 1.仅仅是想下载图片,别人上传的图片也是没有版权的,下载来可以自己欣赏做手机背景但不商用 2.由于爬虫周期的问题,这个代码写于2019.02.13 1.关于知乎爬虫网上能访问到的理论上都能爬 ...
通过scrapy，从模拟登录开始爬取知乎的问答数据
这篇文章将讲解如何爬取知乎上面的问答数据. 首先,我们需要知道,想要爬取知乎上面的数据,第一步肯定是登录,所以我们先介绍一下模拟登录: 先说一下我的思路: 1.首先我们需要控制登录的入口,重写star ...
使用requests爬取梨视频、bilibili视频、汽车之家，bs4遍历文档树、搜索文档树，css选择器
今日内容概要使用requests爬取梨视频 requests+bs4爬取汽车之家 bs4遍历文档树 bs4搜索文档树 css选择器内容详细 1.使用requests爬取梨视频 # 模拟发送http ...
利用 Scrapy 爬取知乎用户信息
思路:通过获取知乎某个大V的关注列表和被关注列表,查看该大V和其关注用户和被关注用户的详细信息,然后通过层层递归调用,实现获取关注用户和被关注用户的关注列表和被关注列表,最终实现获取大量用户信息. 一 ...

随机推荐

【转载】完成C++不能做到的事 - Visitor模式
原文: 完成C++不能做到的事 - Visitor模式拿着刚磨好的热咖啡,我坐在了显示器前.“美好的一天又开始了”,我想. 昨晚做完了一个非常困难的任务并送给美国同事Review,因此今天只需要根据 ...
一维码EAN 13简介及其解码实现(zxing-cpp)
一维码EAN 13:属于国际标准条码, 由13个数字组成,为EAN的标准编码型式(EAN标准码). 依结构的不同,EAN条码可区分为: 1． EAN 13码: 由13个数字组成,为EAN的标准编码型 ...
LBP人脸识别的python实现
这几天看了看LBP及其人脸识别的流程,并在网络上搜相应的python代码,有,但代码质量不好,于是自己就重新写了下,对于att_faces数据集的识别率能达到95.0%~99.0%(40种类型,每种随 ...
我的Cocos Creator成长之路1环境搭建以及基本的文档阅读
本人原来一直是做cocos-js和cocos-lua的,应公司发展需要,现转型为creator.会在自己的博客上记录自己的成长之路. 1.文档阅读:(cocos的官方文档) http://docs.c ...
微服务构建： Spring Boot
在展开 Spring Cloud 的微服务架构部署之前, 我们先了解一下用于构建微服务的基础框架-Spring Boot. 由于 Spring Cloud 的构建基于 Spring Boot 实现, ...
Flink架构分析之Standalone模式启动流程
概述 FLIP6 对Flink架构进行了改进,引入了Dispatcher组件集成了所有任务共享的一些组件:SubmittedJobGraphStore,LibraryCacheManager等,为了保 ...
Hyperledger Fabric 1.0.1至Hyperledger Fabric 1.0.5所升级的内容及修复的问题
基础更新各版本每次迭代都会有一些基础更新内容,如文档修改覆盖.测试用例完善.用户体验改进及删除冗余无效代码等… 下面分类介绍的是一些版本迭代的重要更新内容,因个人实操和理解有限,部分更新并未明确,如 ...
python FTP服务器实现（Python3）
创建一个ftp.py文件(Linux环境),插入以下代码: from pyftpdlib.authorizers import DummyAuthorizer from pyftpdlib.handl ...
NIO基本概念
1. IO和NIO的区别 IO 面向流(stream oriented) 阻塞(blocking io) 无 NIO 面向缓冲区(buffer orie ...
xml配置文件特殊符号的处理方法
2017.7.19遇到问题:偶然出现“认证失败,请重新登录”的现象在xml中英文问号“?”是可以被正常解析的,但是以下这几种符号是不能正常解析的:分别是“&”.“<”.“>” ...

requests爬取知乎话题和子话题

requests爬取知乎话题和子话题的更多相关文章

随机推荐

热门专题