urllib+BeautifulSoup爬取并解析2345天气王历史天气数据

1、代码

import json

import logging

import urllib.parse

from datetime import date, datetime

from random import randint

from time import sleep

import pymysql

from bs4 import BeautifulSoup

# Third-party HTTP client used to send the history request

import requests

def _parse_history_rows(html, weather_code, area_code):
    """Convert the endpoint's HTML fragment into row tuples ready for insert.

    Args:
        html: HTML text taken from the ``data`` key of the JSON response.
        weather_code: Weather code stored alongside every parsed day.
        area_code: Area code stored alongside every parsed day.

    Returns:
        A list of 8-tuples ``(rdate, weather_code, area_code, wind, winp,
        temp_max, temp_min, weather)``; empty when the expected table is
        missing or no row can be parsed.
    """
    soup = BeautifulSoup(html, 'html.parser')
    table = soup.find('table', attrs={'class': 'history-table'})
    if table is None:
        return []

    rows = []
    # The first <tr> is skipped, exactly as the original index loop did.
    for tr_item in table.find_all('tr')[1:]:
        cells = [td.text for td in tr_item.find_all('td')]
        if len(cells) < 5:
            # Not a complete day row; indexing it would raise IndexError.
            continue
        try:
            # The date cell looks like "2023-01-01 周日": keep the date part
            # only and validate it before it is sent to MySQL.
            day = datetime.strptime(cells[0].split()[0], "%Y-%m-%d")
        except (ValueError, IndexError):
            continue
        # Wind direction and wind force share one cell, separated by "风".
        wind_parts = cells[4].split("风")
        wind = wind_parts[0]
        winp = wind_parts[1] if len(wind_parts) > 1 else ""
        rows.append((
            day.strftime("%Y-%m-%d"),
            weather_code,
            area_code,
            wind,
            winp,
            cells[1],   # highest temperature
            cells[2],   # lowest temperature
            cells[3],   # weather description
        ))
    return rows


def _save_history_rows(rows):
    """Insert one month of parsed rows and always release the connection."""
    insert_sql = (
        "insert into w_weather_day_history "
        "(rdate,weather_code,area_code,wind,winp,temp_max,temp_min,weather) "
        "values(%s,%s,%s,%s,%s,%s,%s,%s)"
    )
    conn_his, cursor_his = get_conn()
    try:
        # Values are passed as query parameters so the driver does the
        # quoting/escaping instead of hand-built string concatenation.
        cursor_his.executemany(insert_sql, rows)
        conn_his.commit()
    finally:
        close_conn(conn_his, cursor_his)


def weather_req(years=(2023,), months=(1, 2, 3, 4, 5, 6)):
    """Crawl 2345 historical weather data and store it in MySQL.

    For every ``(weather_code, area_code)`` pair returned by ``get_code()``
    and every requested year/month, request the ``GetHistory`` endpoint,
    parse the ``history-table`` found in the HTML under the response's
    ``data`` key, and insert one row per day into ``w_weather_day_history``.

    A month whose request or response decoding fails is reported with
    ``print`` and skipped; database errors are not caught and propagate.

    Args:
        years: Iterable of years to crawl. Defaults to ``(2023,)``, the
            year the script previously hard-coded.
        months: Iterable of month numbers to crawl for each year. Defaults
            to January through June, the previously hard-coded list.

    Returns:
        None.
    """
    url = "https://tianqi.2345.com/Pc/GetHistory"
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36 Edg/114.0.1823.58',
    }

    for weather_code, area_code in get_code():
        for year in years:
            for month_item in months:
                print(f"正在爬取天气ID为【{weather_code}】，地区ID为【{area_code}】的第【{month_item}】月的数据！")
                params = {
                    'areaInfo[areaId]': weather_code,
                    'areaInfo[areaType]': 2,
                    'date[year]': year,
                    'date[month]': month_item,
                }
                full_url = url + '?' + urllib.parse.urlencode(params)
                print(full_url)

                try:
                    sleep(randint(1, 3))    # random 1-3 s pause between requests
                    res = requests.get(full_url, headers=headers, timeout=30)
                    res.raise_for_status()
                    weather_data = json.loads(res.text)['data']
                except (requests.RequestException, ValueError, KeyError, TypeError) as e:
                    # requests raises RequestException subclasses (not
                    # urllib.error.URLError); ValueError/KeyError/TypeError
                    # cover a body that is not the expected JSON object.
                    print("发生错误：", e)
                    continue

                weather_list = _parse_history_rows(weather_data, weather_code, area_code)
                print(weather_list)
                if not weather_list:
                    continue

                _save_history_rows(weather_list)
                print("--------------------------------------------------")

def get_code():
    """Read every weather-code / area-code pair from ``f_weather_area_code``.

    The connection and cursor obtained from ``get_conn()`` are closed before
    returning, including when the query raises.

    Returns:
        Whatever ``cursor.fetchall()`` yields for the query: one row per
        record, each holding ``weather_code`` followed by ``area_code``.
    """
    conn, cursor = get_conn()
    try:
        cursor.execute(
            "select fwc.weather_code,fwc.area_code from f_weather_area_code fwc;"
        )
        res = cursor.fetchall()
    finally:
        # Previously the connection was never released.
        close_conn(conn, cursor)
    print(res)
    return res

def get_conn():
    """Open a pymysql connection to the ``weather`` database on 127.0.0.1.

    :return: ``(connection, cursor)`` - the new connection and a cursor
        created from it
    """
    settings = {
        "host": "127.0.0.1",
        "user": "root",
        "password": "reliable",
        "db": "weather",
        "charset": "utf8",
    }
    connection = pymysql.connect(**settings)
    # A plain cursor() is requested, with no cursor class override.
    return connection, connection.cursor()

def close_conn(conn, cursor):
    """Close the cursor first, then the connection, skipping falsy values.

    :param conn: connection object exposing ``close()``, or a falsy value
    :param cursor: cursor object exposing ``close()``, or a falsy value
    """
    for resource in (cursor, conn):
        if resource:
            resource.close()

# Script entry point: run the crawl only when the file is executed directly.
if __name__ == '__main__':

    # get_code()  # debug aid: only fetch and print the code pairs from the database

    weather_req()

2、分析

url构成如下：

基础url：https://tianqi.2345.com/Pc/GetHistory

参数：

# GET parameters sent to the GetHistory endpoint
params = {

        'areaInfo[areaId]': 70809,  # weather code of the region to query

        'areaInfo[areaType]': 2,  # kept at this fixed value by the crawler

        'date[year]': 2023,  # year to query

        'date[month]': 6  # month to query

    }

areaInfo[areaId] 表示某个地区的天气编码，需要自行获取（上面的代码通过 get_code() 从数据库表 f_weather_area_code 中读取）。

areaInfo[areaType] 无需修改，保持示例中的固定值 2 即可。

最后两个参数 date[year] 和 date[month] 分别是要查询的年份和月份。

3、发起请求demo

# Request demo: fetch one month of history for one region.
# Relies on the imports shown in section 1 (json, urllib.parse, randint,
# sleep, requests).

url = "https://tianqi.2345.com/Pc/GetHistory"   # base URL of the endpoint

# Request headers
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36 Edg/114.0.1823.58',
}

# GET parameters
params = {
    'areaInfo[areaId]': 70809,
    'areaInfo[areaType]': 2,
    'date[year]': 2023,
    'date[month]': 6
}

# Encode the parameters and build the final URL
encoded_params = urllib.parse.urlencode(params)
full_url = url + '?' + encoded_params

sleep(randint(1, 3))    # random 1-3 s pause before the request

# Send the request; fail fast on a hung connection or an HTTP error status
res = requests.get(full_url, headers=headers, timeout=30)
res.raise_for_status()

# The response body is JSON; the HTML to parse is stored under 'data'
res_data = json.loads(res.text)
weather_data = res_data['data']

4、解析数据demo

# Parse the HTML held in weather_data (taken from the response's 'data' key)

soup = BeautifulSoup(weather_data, 'html.parser')

# Locate the table whose class is 'history-table'
# NOTE(review): find() returns None when no such table exists, in which case
# the find_all() call below raises AttributeError.

table_data = soup.find('table', attrs={'class': 'history-table'})

# print(type(table_data),'\n',table_data)  # debug: inspect the matched table

all_tr = table_data.find_all('tr')  # every <tr> inside that table

urllib+BeautifulSoup爬取并解析2345天气王历史天气数据的更多相关文章

Python使用urllib,urllib3,requests库+beautifulsoup爬取网页
Python使用urllib/urllib3/requests库+beautifulsoup爬取网页 urllib urllib3 requests 笔者在爬取时遇到的问题 1.结果不全 2.'抓取失 ...
PYTHON 爬虫笔记九:利用Ajax+正则表达式+BeautifulSoup爬取今日头条街拍图集（实战项目二）
利用Ajax+正则表达式+BeautifulSoup爬取今日头条街拍图集目标站点分析今日头条这类的网站制作,从数据形式,CSS样式都是通过数据接口的样式来决定的,所以它的抓取方法和其他网页的抓取方 ...
beautifulsoup爬取糗事百科
# _*_ coding:utf-8 _*_ import urllib2 from bs4 import BeautifulSoup user_agent = "Mozilla/5.0 ( ...
nodejs中使用cheerio爬取并解析html网页
nodejs中使用cheerio爬取并解析html网页转 https://www.jianshu.com/p/8e4a83e7c376 cheerio用于node环境,用法与语法都类似于jquery ...
Python爬取网上车市[http://www.cheshi.com/]的数据
#coding:utf8 #爬取网上车市[http://www.cheshi.com/]的数据 import requests, json, time, re, os, sys, time,urlli ...
python网络爬虫之解析网页的BeautifulSoup(爬取电影图片)[三]
目录前言一.BeautifulSoup的基本语法二.爬取网页图片扩展学习后记前言本章同样是解析一个网页的结构信息在上章内容中(python网络爬虫之解析网页的正则表达式(爬取4k动漫图 ...
python 3.6 urllib库实现天气爬取、邮件定时给妹子发送天气
#由于每天早上要和妹子说早安,于是做个定时任务,每天早上自动爬取天气,发送天气问好邮件##涉及模块:#(1)定时任务:windows的定时任务# 配置教程链接:http://b ...
使用正则表达式和urllib模块爬取最好大学排名信息
题目使用urllib模块编程实现爬取网站的大学排名. (网址:http://www.zuihaodaxue.cn/zuihaodaxuepaiming2016.html) (1)获取网站页面,分析代 ...
Python爬虫学习之使用beautifulsoup爬取招聘网站信息
菜鸟一只,也是在尝试并学习和摸索爬虫相关知识. 1.首先分析要爬取页面结构.可以看到一列搜索的结果,现在需要得到每一个链接,然后才能爬取对应页面. 关键代码思路如下: html = getHtml(& ...
python简单爬虫用beautifulsoup爬取百度百科词条
目标:爬取“湖南大学”百科词条并处理数据需要获取的数据: 源代码: <div class="basic-info cmn-clearfix"> <dl clas ...

随机推荐

Caused by: com.alibaba.druid.pool.DataSourceClosedException: dataSource already closed
报错场景:spring boot+mybatis,线程池执行批量任务.springboot正常启动后,定时任务中数据库查询报错.报错信息如下: 1 Caused by: org.apache.ibat ...
【LeetCode回溯算法#05】分割回文串（复习双指针判断回文以及substr函数使用记录）
分割回文串力扣题目链接给你一个字符串 s,请你将 s 分割成一些子串,使每个子串都是回文串 .返回 s 所有可能的分割方案. 回文串是正着读和反着读都一样的字符串. 示例 1: 输入:s = ...
【Azure 事件中心】 Event Grid(事件网格)+Azure Functions处理IOT Hub中的消息
问题描述使用IOT Hub的Events功能,使用Event Grid(事件网格)订阅IOTHub状态消息,发送到Azure Functions.那如何来创建Event Grid触发的Functio ...
C++ //常用算术生成算法 //#include<numeric> accumulate //fill //向容器中填充指定的元素
1 //常用算术生成算法 //#include<numeric> accumulate 2 //fill //向容器中填充指定的元素 3 #include<iostream> ...
A left join B B表有多条记录，max(create_time)取最新一条
例如:A表合同表t_contract B表合同审核表t_contract_audit.两个表根据contract_id关联.且一条合同有多条审核记录.求:A.合同状态.B.最新审核记录结果. 简单: ...
[学习笔记] CentOS + .Net后端常用的中间件工具安装
Redis 5.0+ 官方文档:https://redis.io/download/#redis-downloads sudo yum install redis RabbitMQ 3.7.11+ 官 ...
favorite 单词学习主要是发音 fa - vor - it 注意 ri不连读是自然带出来的r的尾音
favorite 单词学习主要注意发音 [ ˈfeɪ v(ə)r ɪt ] 主要是发音 fa - vor - it 注意 ri不连读是自然带出来的r的尾音 favor : 来自拉丁语favere, ...
[已读带总结] Effective JavaScript 编写高质量JavaScript代码的68个有效方法
目录电子书下载:https://www.jb51.net/books/328297.html 第2章第11条熟练掌握闭包 https://www.cnblogs.com/wengxuesong/ ...
Android TextView设置某段文字可点击
初次进入app,需要有个勾选隐私协议的UI,其中的隐私协议文字点击是可跳转到新页面对隐私协议机型展示这里选择使用Android自带的SpannedString来设置TextView的文字内容即可设置 ...
C# 中的for/foreach循环
for 循环是一个执行特定次数的循环的重复控制结构. C# 中 for 循环的语法: for ( init; condition; increment ) { statement(s); } 执行流程 ...

urllib+BeautifulSoup爬取并解析2345天气王历史天气数据

1、代码

2、分析

3、发起请求demo

4、解析数据demo

urllib+BeautifulSoup爬取并解析2345天气王历史天气数据的更多相关文章

随机推荐

热门专题