scrapy 爬取天猫商品信息

spider

# -*- coding: utf-8 -*-

from urllib.parse import urlencode

import requests

import scrapy

import re

import json

from ..items import TmallItem

# Cookies attached to every search request issued by MianbaoSpider.start_requests
# (passed as the ``cookies=`` argument of scrapy.Request).
# NOTE(review): these look like values captured from a logged-in browser session
# and are hard-coded here; presumably they expire and must be refreshed before a
# crawl -- confirm, and consider loading them from settings instead of source.
cookie = {'thw': 'cn', 'hng': 'CN%7Czh-CN%7CCNY%7C156', 'tracknick': 'yzhy1372', 'tg': '', 'miid': '', 'x': 'e%3D1%26p%3D*%26s%3D0%26c%3D0%26f%3D0%26g%3D0%26t%3D0%26__ll%3D-1%26_ato%3D0', '_cc_': 'UIHiLt3xSw%3D%3D', 'enc': '52fRsc7qpI96LDqf%2FkMA7AfWwN0%2BYmGMXsa4AdC3He4jEbrP%2BRbmYwz%2Bn3xwMrIk4fqBuRCR6BYtQvI%2FP7UBRw%3D%3D', 'UM_distinctid': '165c600d3903a8-0dc9190eb920d3-c343567-100200-165c600d39319', 'cna': 'iSbqEnsQrkoCAXM7KlL0pQWu', 't': '8489c373deedc2a297ebe4c4ad6debb5', '_uab_collina': '', '_umdata': '6AF5B463492A874D05644EF9A3CE888C0BB3EC8395620198BCCF71C40733CB6AAB98C444C566382ECD43AD3E795C914C010C8EDA083E64FAFA9E46E3CF4DEA41', '_m_h5_tk': 'bf46d22c8564ad537f01664eb002112c_1539921942514', '_m_h5_tk_enc': 'f2a1bff4b69d2c036314c66504744070', 'v': '', 'cookie2': '2b9488dea40dbe840f20ea5f14836ef7', '_tb_token_': 'fb83ee7ebeed7', 'alitrackid': 'www.taobao.com', 'lastalitrackid': 'www.taobao.com', 'JSESSIONID': '9787B4CF4D2812E2BA1E407B224AE53A', 'isg': 'BOfnzJhvcDexNPXcxwaGYkk8dhtxxJBNn5b9BrlUMnacqAVqyz-ynoHpzuiTQJPG', 'Hm_lvt_dde6ba2851f3db0ddc415ce0f895822e': '1539912803,1539913323,1539944839,1539944853', 'Hm_lpvt_dde6ba2851f3db0ddc415ce0f895822e': '', 'unb': '', 'uc1': 'cookie16=VFC%2FuZ9az08KUQ56dCrZDlbNdA%3D%3D&cookie21=WqG3DMC9FxUx&cookie15=VT5L2FSpMGV7TQ%3D%3D&existShop=false&pas=0&cookie14=UoTfItnW5e2f1g%3D%3D&tag=8&lng=zh_CN', 'sg': '', '_l_g_': 'Ug%3D%3D', 'skt': '5c93ad4f47f0c1ca', 'cookie1': 'U%2BTs5qAQHjB1CoYPMJcEQ4UfC6zh%2FdhqLG66mPjcz38%3D', 'csg': 'e312c3a6', 'uc3': 'vt3=F8dByRmq%2Bp63ob4wR7I%3D&id2=VW3j%2BbmcVcIV&nk2=GhETDBFSx%2Fs%3D&lg2=VT5L2FSpMGV7TQ%3D%3D', 'existShop': 'MTUzOTk0NTUzNw%3D%3D', 'lgc': 'yzhy1372', 'dnk': 'yzhy1372', '_nk_': 'yzhy1372', 'cookie17': 'VW3j%2BbmcVcIV', 'mt': 'np='}

class MianbaoSpider(scrapy.Spider):
    """Scrape Tmall product data for a fixed search keyword.

    Flow:
      1. ``start_requests`` issues one search request per result page.
      2. ``mianbao`` parses the JSON blob embedded in each search page and
         schedules one request per product detail page.
      3. ``detail_info`` scrapes the detail page, fetches a handful of reviews
         and returns a populated ``TmallItem``.
    """

    name = "mianbao"

    SEARCH_URL = 'https://s.taobao.com/search'
    RATE_URL = 'https://rate.tmall.com/list_detail_rate.htm'
    MAX_REVIEWS = 5          # keep at most this many reviews per product
    REQUEST_TIMEOUT = 10     # seconds, for the blocking review request

    # Compiled once at class creation instead of on every callback.
    _PAGE_CONFIG_RE = re.compile(r'g_page_config = (.*?);\s*g_srp_loadCss', re.S)
    _PRICE_RE = re.compile(r'"defaultItemPrice":"(.*?)",', re.S)
    # Anchored on ?/& so that e.g. ``user_id=`` is not mistaken for ``id=``,
    # and it also matches when ``id`` is the last query parameter.
    _ITEM_ID_RE = re.compile(r'[?&]id=([^&#]+)')
    # Greedy on purpose: review text may itself contain ")" characters, so the
    # payload must run up to the LAST closing parenthesis of the JSONP wrapper.
    _JSONP_RE = re.compile(r'jsonp128\((.*)\)', re.S)

    def start_requests(self):
        """Yield one search request per result page (offsets 0, 44, ..., 440)."""
        pars = {
            'q': '女士上衣',      # search keyword
            'initiative_id': 'staobaoz_20181019',
            'ie': 'utf8',
            # Search scope. Per the original author's note the site accepts
            # 'all' (Tmall + Taobao), 'mall' (Tmall) and 'old' (second-hand).
            'tab': 'mall',
            # Sort order. Per the original author's note: 'default',
            # 'credit-desc' (by credit), 'price-asc', 'price-desc'.
            'sort': 'sale-desc',
        }
        data = urlencode(pars)
        # The 's' parameter is the result offset; it advances by 44 per page.
        for page in range(0, 450, 44):
            page_url = self.SEARCH_URL + '?' + data + '&s=' + str(page)
            yield scrapy.Request(page_url, self.mianbao, cookies=cookie)

    def mianbao(self, response):
        """Parse a search result page and schedule the product detail pages.

        The listing data is read from the ``g_page_config`` JavaScript
        assignment embedded in the page. If that blob is missing or malformed
        (for example when the site answers with a login page) the page is
        skipped with a warning instead of raising.
        """
        found = self._PAGE_CONFIG_RE.search(response.text)
        if found is None:
            self.logger.warning('g_page_config not found in %s; page skipped', response.url)
            return
        try:
            datas = json.loads(found.group(1))['mods']['itemlist']['data']['auctions']
        except (ValueError, KeyError, TypeError):
            self.logger.warning('unexpected g_page_config layout in %s; page skipped',
                                response.url, exc_info=True)
            return

        for i in datas:
            detail_url = 'https:' + i['detail_url']   # product detail page
            meta = {
                'title': i['raw_title'],              # product name
                'nick': i['nick'],                    # shop name
                'view_sales': i['view_sales'],        # number of buyers
                'item_loc': i['item_loc'],            # product location
                'comment_count': i['comment_count'],  # number of reviews
                'pic_url': 'http:' + i['pic_url'],    # list-page thumbnail
                'user_id': i['user_id'],              # seller id, needed for reviews
            }
            yield scrapy.Request(detail_url, self.detail_info, meta=meta)

    def detail_info(self, response):
        """Scrape a product detail page and return the finished ``TmallItem``.

        ``price`` is ``None`` when the page does not contain
        ``defaultItemPrice``; ``good_infos`` falls back to the placeholder
        string when the attribute list is empty; ``rote_list`` is empty when
        reviews cannot be fetched.
        """
        item = TmallItem()

        price_match = self._PRICE_RE.search(response.text)
        if price_match is None:
            self.logger.warning('defaultItemPrice not found on %s', response.url)
            price = None
        else:
            price = price_match.group(1)   # unit price

        # Thumbnail image URLs and the product attribute list.
        good_imgs = response.xpath('//*[@id="J_UlThumb"]/li/a/img/@src').extract()
        good_info = response.xpath('//*[@id="J_AttrUL"]/li/text()').extract()
        good_infos = good_info or '暂无'

        rote_list = self._fetch_reviews(response.url, response.meta['user_id'])

        item['title'] = response.meta['title']
        item['nick'] = response.meta['nick']
        item['price'] = price
        item['view_sales'] = response.meta['view_sales']
        item['item_loc'] = response.meta['item_loc']
        item['comment_count'] = response.meta['comment_count']
        item['pic_url'] = response.meta['pic_url']
        item['good_infos'] = good_infos
        item['good_imgs'] = good_imgs
        item['rote_list'] = rote_list
        return item

    def _fetch_reviews(self, detail_url, seller_id):
        """Best-effort fetch of up to ``MAX_REVIEWS`` reviews for one product.

        Returns a list of dicts with the keys ``auctionSku`` (purchased
        variant), ``rateContent`` (review text) and ``pics`` (review images).
        Any failure is logged and yields an empty list, so a missing review
        feed never costs us the item itself.
        """
        id_match = self._ITEM_ID_RE.search(detail_url)
        if id_match is None:
            self.logger.warning('no item id in %s; reviews skipped', detail_url)
            return []

        data = {
            'itemId': id_match.group(1),   # product id
            'sellerId': seller_id,         # seller id
        }
        headers = {
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36'
        }
        try:
            # NOTE(review): this is a blocking call inside a Scrapy callback and
            # stalls the crawl while it runs; the timeout bounds the damage.
            # Moving it to a chained scrapy.Request would be the proper fix.
            rote_response = requests.get(url=self.RATE_URL, params=data, headers=headers,
                                         timeout=self.REQUEST_TIMEOUT)
            wrapped = self._JSONP_RE.search(rote_response.text)
            if wrapped is None:
                self.logger.warning('no JSONP payload in review response for %s', detail_url)
                return []
            rote_json = json.loads(wrapped.group(1))['rateDetail']['rateList']
            return [
                {
                    'auctionSku': i['auctionSku'],
                    'rateContent': i['rateContent'],
                    'pics': i['pics'],
                }
                for i in rote_json[:self.MAX_REVIEWS]
            ]
        except (requests.RequestException, ValueError, KeyError, TypeError):
            self.logger.warning('该商品评论 无法抓取', exc_info=True)
            return []

pipelines

# -*- coding: utf-8 -*-

import csv
import os

import pymongo
import pymysql
import requests

# MongoDB handle: local server, database "tmall", collection "good_info".
# The pipeline class below writes one document per scraped item to `mongocoll`.
mongo = pymongo.MongoClient('127.0.0.1', 27017)
mongodb = mongo['tmall']
mongocoll = mongodb['good_info']

# MySQL connection and a single shared cursor, both created at import time.
# NOTE(review): credentials are hard-coded; consider reading them from the
# Scrapy settings or the environment.
db = pymysql.connect(
    host='localhost',
    port=3306,
    user='root',
    password='mysql',
    db='test',
    charset='utf8',
)
cursor = db.cursor()

# Define your item pipelines here

#

# Don't forget to add your pipeline to the ITEM_PIPELINES setting

# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html

class TmallPipeline:
    """Persist each scraped item to disk, MongoDB and MySQL.

    For every item the pipeline:
      * downloads the product images into ``tmalls/<title>/<n>.jpg``,
      * writes all item fields to ``tmalls/<title>/商品信息.csv``,
      * inserts the item as a document into the ``mongocoll`` collection,
      * inserts a summary row into the MySQL ``tmall`` table.
    """

    # Characters that cannot appear in a Windows file name (and "/" on POSIX);
    # product titles are free text, so they are replaced before the title is
    # used as a directory name.
    _UNSAFE_PATH_CHARS = '\\/:*?"<>|'
    _DOWNLOAD_TIMEOUT = 10  # seconds per image

    def process_item(self, item, spider):
        """Store *item* and return it unchanged apart from ``good_imgs``.

        ``item['good_imgs']`` is replaced by the list of full-size image URLs
        that were derived from the scraped thumbnail URLs.

        Raises:
            pymysql.MySQLError: if the MySQL insert fails (after a rollback).
        """
        title = item['title']
        folder = ''.join('_' if ch in self._UNSAFE_PATH_CHARS else ch for ch in title)
        path = os.path.join('tmalls', folder)   # per-product output directory
        # makedirs also creates the parent 'tmalls' directory on first use.
        os.makedirs(path, exist_ok=True)

        img = []   # rewritten image URLs
        for count, src in enumerate(item['good_imgs'], start=1):
            # Presumably drops a 13-character thumbnail-size suffix (something
            # like "_60x60q90.jpg") to get the full-size image -- TODO confirm.
            url = 'https:' + src[:-13]
            img.append(url)
            # Download BEFORE opening the file so a failed request does not
            # leave an empty .jpg behind; one bad image must not lose the item.
            try:
                response = requests.get(url, timeout=self._DOWNLOAD_TIMEOUT)
                response.raise_for_status()
            except requests.RequestException:
                spider.logger.warning('image download failed: %s', url, exc_info=True)
                continue
            with open(os.path.join(path, str(count) + '.jpg'), 'wb') as f:
                f.write(response.content)

        # Done once, after the loop: previously this sat inside the loop, so
        # the CSV was rewritten per image and never written for items that had
        # no images at all.
        item['good_imgs'] = img
        csv_path = os.path.join(path, '商品信息' + '.csv')
        with open(csv_path, 'w', encoding='utf-8', newline='') as f:
            writer = csv.writer(f)
            writer.writerows([k, j] for k, j in dict(item).items())
            writer.writerow([])   # trailing blank line

        # Collection.insert() was removed in PyMongo 4; insert_one() is the
        # supported single-document API.
        mongocoll.insert_one(dict(item))

        sql = 'insert into tmall values (0,%s,%s,%s,%s,%s,%s,%s)'
        row = [
            item['title'],
            item['price'],
            str(item['good_infos']),   # may be a list; stored as its repr
            item['view_sales'],
            item['comment_count'],
            item['item_loc'],
            item['nick'],
        ]
        try:
            cursor.execute(sql, row)
            db.commit()
        except pymysql.MySQLError:
            # Leave the shared connection in a clean state for the next item.
            db.rollback()
            raise
        return item

scrapy 爬取天猫商品信息的更多相关文章

selenium模块使用详解、打码平台使用、xpath使用、使用selenium爬取京东商品信息、scrapy框架介绍与安装
今日内容概要 selenium的使用打码平台使用 xpath使用爬取京东商品信息 scrapy 介绍和安装内容详细 1.selenium模块的使用 # 之前咱们学requests,可以发送htt ...
[爬虫]采用Go语言爬取天猫商品页面
最近工作中有一个需求,需要爬取天猫商品的信息,整个需求的过程如下: 修改后端广告交易平台的代码,从阿里上传的素材中解析url,该url格式如下: https://handycam.alicdn.com ...
selenium跳过webdriver检测并爬取天猫商品数据
目录简介编写思路使用教程演示图片源代码 @(文章目录) 简介现在爬取淘宝,天猫商品数据都是需要首先进行登录的.上一节我们已经完成了模拟登录淘宝的步骤,所以在此不详细讲如何模拟登录淘宝.把关 ...
selenium+phantomjs爬取京东商品信息
selenium+phantomjs爬取京东商品信息今天自己实战写了个爬取京东商品信息,和上一篇的思路一样,附上链接:https://www.cnblogs.com/cany/p/10897618. ...
Python爬虫-爬取京东商品信息-按给定关键词
目的:按给定关键词爬取京东商品信息,并保存至mongodb. 字段:title.url.store.store_url.item_id.price.comments_count.comments 工具 ...
爬虫—Selenium爬取JD商品信息
一,抓取分析本次目标是爬取京东商品信息,包括商品的图片,名称,价格,评价人数,店铺名称.抓取入口就是京东的搜索页面,这个链接可以通过直接构造参数访问https://search.jd.com/Sea ...
用 BeautifulSoup爬取58商品信息
最近对Python爬虫比较迷恋,看了些爬虫相关的教程于是乎跟着一起爬取了58上面的一些商品信息,并存入到xlsx文件中,并通过xlsxwirter的方法给表格设置了一些格式.好了,直接贴代码吧~ # ...
利用selenium爬取京东商品信息存放到mongodb
利用selenium爬取京东商城的商品信息思路: 1.首先进入京东的搜索页面,分析搜索页面信息可以得到路由结构 2.根据页面信息可以看到京东在搜索页面使用了懒加载,所以为了解决这个问题,使用递归.等待 ...
爬虫框架之Scrapy——爬取某招聘信息网站
案例1:爬取内容存储为一个文件 1.建立项目 C:\pythonStudy\ScrapyProject>scrapy startproject tenCent New Scrapy projec ...

随机推荐

关于Bootstrap的整理和理解
随着CSS3和HTML5的流行,我们的WEB页面不仅需要更人性化的设计理念,而且需要更酷的页面特效和用户体验.作为开发者,我们需要了解一些宝贵的CSS UI开源框架资源,它们可以帮助我们更快更好地实现 ...
linux的netstat命令详解
简介 Netstat 命令用于显示各种网络相关信息,如网络连接,路由表,接口状态 (Interface Statistics),masquerade 连接,多播成员 (Multicast Member ...
bash shell笔记2 结构化命令
二.使用结构化命令知识内容: # 改变命令流 # 使用if-then逻辑 # 嵌套if-then # 测试条件 # 高级if-then功能许多程序在脚本命令之间需要某些逻辑控制流,有些命令允许脚本 ...
Eclipse oxygen安装中文包
help->install new software Eclipse software repository http://download.eclipse.org/technology/ba ...
js如何解析后台传过来的json字符串
1.js如何解析后台传过来的json字符串? 注意:js是无法直接接收和使用json或者Php的数据,用的话会出现undefined,所以要转换一下. 方式一: var str = '{"r ...
【bzoj1787】[Ahoi2008]Meet 紧急集合
1787: [Ahoi2008]Meet 紧急集合 Time Limit: 20 Sec Memory Limit: 162 MBSubmit: 2466 Solved: 1117[Submit] ...
【bzoj1602】[Usaco2008 Oct]牧场行走
1602: [Usaco2008 Oct]牧场行走 Time Limit: 5 Sec Memory Limit: 64 MBSubmit: 1793 Solved: 935[Submit][St ...
laravel数据迁移(创建错误列不能创建)
创建数据表的命令 php artisan make:migration create_users_table 执行这个迁移的命令, php artisan migrate 其实感觉就像简单的方法创建数 ...
Map-making Robots: A Review of the Occupancy Grid Map Algorithm
栅格地图算法:http://www.ikaros-project.org/articles/2008/gridmaps/
Luogu 3957 [NOIP2017]普及组跳房子
写了好久,感觉自己好菜,唉…… 首先发现这个$g$的取值具有单调性,可以想到二分答案,然后考虑用$dp$来检验,这样子可以写出朴素的转移方程: 设$f_i$表示以$i$结尾的最大价值,那么有$f_i ...

scrapy 爬取天猫商品信息

scrapy 爬取天猫商品信息的更多相关文章

随机推荐

热门专题