<day001>存储到MySQL、MongoDB数据库+简单的Ajax请求+os模块+进程池+MD5

任务1:记住如何存储到MySQL、MongoDB数据库

'''

存储到Mysql

'''

import pymysql.cursors

class QuotePipeline(object):
    """Scrapy-style item pipeline that writes every item to a MySQL table.

    A connection to the local ``quotes`` database is opened when the
    pipeline is constructed. Each item becomes one row of the ``quote``
    table, with the item's field names used as column names.
    """

    def __init__(self):
        """Open the MySQL connection and the cursor used for the inserts."""
        self.connect = pymysql.connect(
            host='localhost',
            user='root',
            password='',
            database='quotes',
            charset='utf8',
        )
        self.cursor = self.connect.cursor()

    def process_item(self, item, spider):
        """Insert ``item`` into the ``quote`` table.

        Args:
            item: The scraped item; anything ``dict()`` accepts.
            spider: The spider that produced the item (unused here).

        Returns:
            The item converted to a plain ``dict``, whether or not the
            insert succeeded.
        """
        item = dict(item)
        table = 'quote'
        keys = ','.join(item.keys())
        # One placeholder per field: the values are bound by the driver,
        # only the table and column names are formatted into the statement.
        values = ','.join(['%s'] * len(item))
        sql = 'insert into {table}({keys}) values({values})'.format(
            table=table, keys=keys, values=values)
        try:
            if self.cursor.execute(sql, tuple(item.values())):
                self.connect.commit()
                print("Successful!")
        except Exception as exc:
            # A bare ``except:`` would also swallow SystemExit and
            # KeyboardInterrupt; catch ordinary errors only and report the
            # cause so a failed insert can be diagnosed.
            print("Failed!", exc)
            self.connect.rollback()
        return item

    def close_spider(self, spider):
        """Release the cursor and the connection when the spider finishes."""
        self.cursor.close()
        self.connect.close()

'''

存储到mongoDB

'''

import pymongo

class MongoPipeline(object):
    """Scrapy-style item pipeline that stores every item in MongoDB.

    The connection URI and database name are read from the crawler
    settings ``MONGO_URI`` and ``MONGO_DB``.
    """

    # Name of the MongoDB collection the items are written to.
    collection = 'domo'

    def __init__(self, mongo_uri, mongo_db):
        """Remember the connection settings; no connection is opened yet."""
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db

    @classmethod
    def from_crawler(cls, crawler):
        """Build the pipeline from ``crawler.settings``.

        ``cls`` is the pipeline class itself, so subclasses construct
        instances of their own type.
        """
        return cls(
            mongo_uri=crawler.settings.get('MONGO_URI'),
            mongo_db=crawler.settings.get('MONGO_DB'),
        )

    def open_spider(self, spider):
        """Connect to MongoDB and select the configured database."""
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]

    def process_item(self, item, spider):
        """Insert ``item`` as one document and return the item unchanged."""
        # Collection.insert() was deprecated in PyMongo 3 and removed in
        # PyMongo 4; insert_one() is the supported single-document call.
        self.db[self.collection].insert_one(dict(item))
        return item

    def close_spider(self, spider):
        """Close the MongoDB client when the spider finishes."""
        self.client.close()

任务2:爬取微博Ajax加载的数据

# url拼接

from urllib.parse import urlencode

# 去掉html标签

from pyquery import PyQuery as pq

# 请求

import requests

# 链接mongo

from pymongo import MongoClient

# 爬的太快大概36页的时候就会出现418,加点延迟吧

import time

# Connect to MongoDB using the MongoClient() defaults.

client = MongoClient()

# Select the database.

db = client['weibo']

# Select the collection the posts are stored in.

collection = db['weibo_domo2']

# Number of the last page requested by the crawl loop.

max_page = 100

def save_to_mongo(result):
    """Store one parsed post in the module-level MongoDB ``collection``.

    Args:
        result: The document (a dict) to insert.
    """
    # Collection.insert() was deprecated in PyMongo 3 and removed in
    # PyMongo 4; insert_one() is the supported single-document call.
    if collection.insert_one(result).acknowledged:
        print("saved to mongo")

# https://m.weibo.cn/api/container/getIndex?containerid=1076032830678474&page=2

# Look for the Ajax request that carries "X-Requested-With: XMLHttpRequest".

# Base URL; the query string is appended to it with urlencode.

base_url = 'https://m.weibo.cn/api/container/getIndex?'

#https://m.weibo.cn/api/container/getIndex?type=uid&value=1005052830678474&containerid=1005051005052830678474

headers = {

	'host': 'm.weibo.cn',

	# Open the profile on the mobile site, find the link there, then parse it.

	# 'Referer': 'https://m.weibo.cn/p/1005052830678474',

	'Referer': 'https://m.weibo.cn/u/2202323951',

	'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.109 Safari/537.36',

	'X-Requested-With': 'XMLHttpRequest',

}

def get_page(page):
    """Fetch one page of the user's timeline from the Weibo mobile Ajax API.

    Args:
        page: Page number sent as the ``page`` query parameter.

    Returns:
        A ``(payload, page)`` tuple, where ``payload`` is the decoded JSON
        body, or ``None`` when the request fails, the status code is not
        200, or the body is not valid JSON.
    """
    params = {
        'type': 'uid',
        'value': '2202323951',
        'containerid': '1076032202323951',
        'page': page,
    }
    url = base_url + urlencode(params)
    print(url)
    try:
        # Without a timeout a stalled connection would block the crawl
        # indefinitely.
        response = requests.get(url, headers=headers, timeout=10)
        if response.status_code == 200:
            return response.json(), page
        print('Error', 'unexpected status code', response.status_code)
    except (requests.RequestException, ValueError) as e:
        # RequestException covers connection errors and timeouts; ValueError
        # covers a body that cannot be decoded as JSON.
        print('Error', e.args)
    return None

def parse_page(json, page: int):
    """Yield one flat dict per post found in a timeline API response.

    Args:
        json: Decoded API response (a dict); a falsy value yields nothing.
        page: Page number the response belongs to.

    Yields:
        Dicts with the keys ``ID``, ``text`` (HTML tags stripped),
        ``phone``, ``time``, ``attitudes``, ``comments`` and ``reposts``.
    """
    if not json:
        return
    # Only the cards under "data" are of interest; tolerate responses in
    # which either level is missing instead of failing on None.
    cards = (json.get('data') or {}).get('cards') or []
    for index, card in enumerate(cards):
        # On the first page the card at index 1 is not a post, so skip it.
        if index == 1 and page == 1:
            continue
        mblog = card.get('mblog')
        if not mblog:
            # Any other card without an "mblog" entry carries no post.
            continue
        yield {
            # Post id, e.g. "4349509976406880".
            'ID': mblog.get('id'),
            # Post body with the HTML tags removed by pyquery.
            'text': pq(mblog.get('text')).text(),
            # Client/device the post was sent from.
            'phone': mblog.get('source'),
            # NOTE(review): read from "edit_at", although this is meant to
            # be the publish time - confirm the field name against the API.
            'time': mblog.get('edit_at'),
            # Number of likes.
            'attitudes': mblog.get('attitudes_count'),
            # Number of comments.
            'comments': mblog.get('comments_count'),
            # Number of reposts.
            'reposts': mblog.get('reposts_count'),
        }

if __name__ == '__main__':
    for page in range(1, max_page + 1):
        fetched = get_page(page)
        # Throttle the crawl: without a delay the server starts answering
        # with HTTP 418 after a few dozen pages.
        time.sleep(3)
        if fetched is None:
            # get_page() returns None on failure; unpacking it below would
            # raise TypeError, so move on to the next page instead.
            continue
        # fetched is (payload, page); unpack it into parse_page's arguments.
        for result in parse_page(*fetched):
            print(result)
            save_to_mongo(result)

　　总结:

　　1.不加延迟爬到36-38页会出现418　　(418 I’m a teapot 服务器拒绝尝试用 “茶壶冲泡咖啡”。)

　　2. Ajax返回的数据中可能夹杂着不需要的内容,例如微博第1页(page=1)中下标为1(index=1)的card是关注列表信息,而不是微博正文,需要跳过

　　3.使用手机端获取Ajax数据,比在PC端,容易很多.

　　4.启动mongo需要先指定dbpath(数据存储的地方),查询插入文件的数量

　　　　形如:mongod --dbpath="F:\MongoDB\Server\3.4\data"

　　　　形如: db.weibo_domo2.find().count()

　　5.最终爬取出了朱子奇的所有微博,一共848条,web端显示一共894条,去掉文章48条,去掉一条自己舍弃的,刚好848条(成功!)

任务3:理解进程池,os模块,网页端Ajax请求的拼接,MD5

# 拼接URL

from urllib.parse import urlencode

# 请求URL

import requests

# 文件操作

import os

# md5:类似加密,不会重复

from hashlib import md5

# 进程池

from multiprocessing.pool import Pool

# 延迟

import time

# Base URL of the Toutiao search API; the query string is appended with urlencode.

base_url = 'https://www.toutiao.com/api/search/content/?'

# Headers sent with every search request.

headers = {

	# Search page for the URL-encoded keyword (same keyword as in the request params).

	'Referer': 'https://www.toutiao.com/search/?keyword=%E8%A1%97%E6%8B%8D',

	'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.109 Safari/537.36',

	# Marks the request as an Ajax (XMLHttpRequest) call.

	'X-Requested-With': 'XMLHttpRequest',

}

def get_page(offset):
    """Fetch one page of Toutiao search results as decoded JSON.

    The parameters mirror the query string of the site's own Ajax request:
    https://www.toutiao.com/api/search/content/?aid=24&app_name=web_search&offset=0&format=json&keyword=%E8%A1%97%E6%8B%8D&autoload=true&count=20&en_qc=1&cur_tab=1&from=search_tab&pd=synthesis
    Only ``offset`` varies between requests.

    Args:
        offset: Value of the ``offset`` query parameter.

    Returns:
        The decoded JSON body, or ``None`` when the request fails, the
        status code is not 200, or the body is not valid JSON.
    """
    params = {
        'aid': '24',
        'app_name': 'web_search',
        'offset': offset,
        'format': 'json',
        'keyword': '街拍',
        # Was misspelled as 'ture'; the captured request sends autoload=true.
        'autoload': 'true',
        'count': '20',
        'en_qc': '1',
        'cur_tab': '1',
        'from': 'search_tab',
        'pd': 'synthesis',
    }
    url = base_url + urlencode(params)
    try:
        # Without a timeout a stalled connection would block the worker
        # indefinitely.
        response = requests.get(url, headers=headers, timeout=10)
        if response.status_code == 200:
            return response.json()
    except (requests.RequestException, ValueError) as e:
        # RequestException covers connection errors and timeouts; ValueError
        # covers a body that cannot be decoded as JSON.
        print('Error', e.args)
    return None

def get_images(json):
    """Yield one ``{'image': url, 'title': title}`` dict per listed image.

    Args:
        json: Decoded search response (a dict); a falsy value yields
            nothing.

    Yields:
        A dict holding a single image URL and the title of the result it
        belongs to.
    """
    if not json:
        return
    # "data" can be missing or null, and not every result has an
    # "image_list"; treat both as empty instead of iterating over None.
    for item in json.get('data') or []:
        title = item.get('title')
        for image in item.get('image_list') or []:
            yield {
                'image': image.get('url'),
                'title': title,
            }

def save_image(item, base_dir='F:\\domo'):
    """Download one image into ``<base_dir>/<title>/<md5 of content>.jpg``.

    Args:
        item: Dict with the keys ``'image'`` (the image URL) and
            ``'title'`` (used as the folder name).
        base_dir: Directory under which the per-title folders are created.
            Defaults to the location the script has always used.
    """
    title = item.get('title')
    image_url = item.get('image')
    if not title or not image_url:
        # Nothing to download, or no folder name to store it under.
        print("图片下载失败")
        return
    folder = os.path.join(base_dir, title)
    try:
        # exist_ok avoids the exists()/mkdir() race between pool workers
        # that handle images sharing the same title.
        os.makedirs(folder, exist_ok=True)
        response = requests.get(image_url, timeout=10)
        if response.status_code == 200:
            # Name the file after the MD5 of its content so that identical
            # images map to the same path and are written only once.
            file_name = '{0}.{1}'.format(md5(response.content).hexdigest(), 'jpg')
            file_path = os.path.join(folder, file_name)
            if not os.path.exists(file_path):
                with open(file_path, 'wb') as f:
                    f.write(response.content)
            else:
                print("已经下载过这个文件了", file_path)
    except (requests.RequestException, OSError):
        # Network failures and file-system errors (for example a title that
        # is not a valid folder name) are reported, not raised.
        print("图片下载失败")

# First and last page index (inclusive); each index is multiplied by 20 to
# build the list of request offsets.

GROUP_START = 1

GROUP_END = 20

def main(offset):
    """Fetch the result page at ``offset`` and save every image it lists."""
    page_data = get_page(offset)
    found = get_images(page_data)
    for entry in found:
        print(entry)
        save_image(entry)

if __name__ == '__main__':
    pool = Pool()
    # Offsets 20, 40, ..., 400: one request per page of 20 results.
    groups = [x * 20 for x in range(GROUP_START, GROUP_END + 1)]
    try:
        # Run main() once per offset across the worker processes.
        pool.map(main, groups)
    finally:
        # Stop accepting work and wait for the workers even when map()
        # raises, so no worker processes are left behind.
        pool.close()
        pool.join()

总结:1.os模块的基本操作

　　　　os.chdir('路径') --------------------表示改变当前工作目录到路径

　　　　os.path.exists('文件名') ------------判断该路径是否存在(相对路径以当前工作目录为基准),存在返回True,不存在返回False

　　　　os.mkdir()-----------创建文件夹

　　2. 用文件内容的MD5值命名文件,内容相同的文件名字也相同,可以有效地解决重复抓取的问题

　　3.进程池能大大降低爬取时间