爬虫亚马逊Bestselling类别产品数据TOP100
1 # -*- coding: utf-8 -*-
2 # @Time : 2020/9/11 16:23
3 # @Author : Chunfang
4 # @Email : 3470959534@qq.com
5 # @File : amazon_bestseller_cate_url.py
6 # @Software: PyCharm
7
8 import random,requests
9 import re
10
11 def secend_cates_url(url):#正则匹配二级标题
12 # print(url)
13 page_data = get_data(url)
14 url_cates = re.findall("<li><a href='(https://www.amazon.com/Best.*?)'>(.*?)</a></li>", page_data, re.S)
15 # print('二级标题有',url_cates)
16 url_cate_all.append(url_cates)
17 # print(page_data)
18
19 def get_html_data(page_data):#正则匹配一级标题
20 url_cates = re.findall("<li><a href='(https://www.amazon.com/Best.*?)'>(.*?)</a></li>",page_data,re.S)
21 # print('一级标题有',url_cates)
22 url_cate_all.append(url_cates)
23 # secend_cates_url(url_cates[0][0])
24 for i in range(len(url_cates)):
25 secend_cates_url(url_cates[i][0])
26
27 def randHeader():
28 head_connection = ['Keep-Alive', 'close']
29 head_accept = ['text/html, application/xhtml+xml, */*']
30 head_accept_language = ['zh-CN,fr-FR;q=0.5', 'en-US,en;q=0.8,zh-Hans-CN;q=0.5,zh-Hans;q=0.3']
31 head_user_agent = ['Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko',
32 'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1500.95 Safari/537.36',
33 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36',
34 'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; rv:11.0) like Gecko)',
35 'Mozilla/5.0 (Windows; U; Windows NT 5.2) Gecko/2008070208 Firefox/3.0.1',
36 'Mozilla/5.0 (Windows; U; Windows NT 5.1) Gecko/20070309 Firefox/2.0.0.3',
37 'Mozilla/5.0 (Windows; U; Windows NT 5.1) Gecko/20070803 Firefox/1.5.0.12',
38 'Opera/9.27 (Windows NT 5.2; U; zh-cn)',
39 'Mozilla/5.0 (Macintosh; PPC Mac OS X; U; en) Opera 8.0',
40 'Opera/8.0 (Macintosh; PPC Mac OS X; U; en)',
41 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.12) Gecko/20080219 Firefox/2.0.0.12 Navigator/9.0.0.6',
42 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Win64; x64; Trident/4.0)',
43 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0)',
44 'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.2; .NET4.0C; .NET4.0E)',
45 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Maxthon/4.0.6.2000 Chrome/26.0.1410.43 Safari/537.1 ',
46 'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.2; .NET4.0C; .NET4.0E; QQBrowser/7.3.9825.400)',
47 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:21.0) Gecko/20100101 Firefox/21.0 ',
48 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.92 Safari/537.1 LBBROWSER',
49 'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0; BIDUBrowser 2.x)',
50 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/3.0 Safari/536.11']
51
52 header = {
53 'Connection': head_connection[0],
54 'Accept': head_accept[0],
55 'Accept-Language': head_accept_language[1],
56 'User-Agent': head_user_agent[random.randrange(0, len(head_user_agent))]
57 }
58 return header
59
60 def get_data(url):#获取页面数据
61 headers = randHeader()
62 page_data = requests.get(url, headers, timeout=20)
63 import html
64 page_data = html.unescape(page_data.text)
65 return page_data
66
67 def save_to_excel(url_cate_all):
68 url_cate_alls = []
69 for i in range(len(url_cate_all)):
70 for j in range(len(url_cate_all[i])):
71 # print('所有的标题链接:',url_cate_all[i][j])
72 url_cate_alls.append(url_cate_all[i][j])
73 url_cate_all_only = list(set(url_cate_alls))
74 # print('唯一一个链接和分类:',url_cate_all_only)
75 return url_cate_all_only
76
77 def url_cate_all_only():
78 global url_cate_all
79 url_cate_all = []
80 url = 'https://www.amazon.com/Best-Sellers-Womens-Clothing/zgbs/fashion/1040660/ref=zg_bs_unv_3_9522931011_1'
81 page_data = get_data(url)
82 # print(page_data)
83 get_html_data(page_data)
84 # print(url_cate_all)
85 url_cate_all_only = save_to_excel(url_cate_all)
86 return url_cate_all_only
1 # -*- coding: utf-8 -*-
2 # @Time : 2020/9/9 17:30
3 # @Author : Chunfang
4 # @Email : 3470959534@qq.com
5 # @File : amazon_best_sellers.py
6 # @Software: PyCharm
7
8 import requests
9 import re,os,random
10 from openpyxl import load_workbook
11
12 from amazon_bestseller_cate_url2 import url_cate_all_only
13
14 def down_imgs(url_xuhao,url_img,pro_name):#下载图片
15 for i in range(len(url_xuhao)):
16 print('正在下载第' + str(i+1) + '张图片,图片地址:' + str(url_img[i]))
17 try:
18 header = randHeader()
19 pic = requests.get(url_img[i], header,timeout=10)
20 except requests.exceptions.ConnectionError:
21 print('错误!当前图片无法下载')
22 continue
23 dir = cwd + '\\images_amazon\\' + pro_name + '_' + url_xuhao[i] + '.jpg'#运行路径下自己手动新建一个images_amazon文件加,存放图片
24 with open(dir, 'wb') as file:
25 file.write(pic.content)
26
27 def save_pro_to_excel(products_inf,pro_name):#热卖产品数据保存到Excel表
28 t = ws.max_row
29 for i in range(len(products_inf)):
30 for j in range(len(products_inf[i])):
31 # ws.cell(j + 2, i + 1).value = products_inf[i][j] # 序号
32 ws.cell(t+1+j, 1).value = pro_name
33 ws.cell(t+1+j, i+2).value = products_inf[i][j] # 产品信息
34 wb.save(path)
35
36 def down_products(result,pro_name):#正则匹配产品信息
37 products_inf = []
38 # url_title = re.findall('<div class="p13n-sc-truncate p13n-sc-line-clamp-2" aria-hidden="true" data-rows="2">\n (.*?)\n </div>',result,re.S)
39 url_title = re.findall('<div class=".*?" aria-hidden=".*?" data-rows=".*?">\n (.*?)\n </div>',result, re.S)
40 url_pro = re.findall('<div class="a-row"><a class="a-link-normal a-text-normal" href="(.*?)"><span class="a-size-base a-color-price">',result,re.S)
41 url_price = re.findall('<span class="a-size-base a-color-price"><span class=.*?>(.*?)</span>',result,re.S)
42 url_xuhao = re.findall('<span class="zg-badge-text">#(.*?)</span></span>',result,re.S)
43 url_img = re.findall('<div class="a-section a-spacing-small"><img alt=".*?src="(https.*?)" height="200" width="200"></div></span>',result,re.S)
44
45
46 products_inf.append(url_xuhao)
47 products_inf.append(url_title)
48 products_inf.append(url_price)
49 products_inf.append(url_pro)
50 products_inf.append(url_img)
51 print(products_inf)
52
53 save_pro_to_excel(products_inf, pro_name)
54 down_imgs(url_xuhao, url_img, pro_name)
55
56 #生成随机头
57 def randHeader():
58 head_connection = ['Keep-Alive', 'close']
59 head_accept = ['text/html, application/xhtml+xml, */*']
60 head_accept_language = ['zh-CN,fr-FR;q=0.5', 'en-US,en;q=0.8,zh-Hans-CN;q=0.5,zh-Hans;q=0.3']
61 head_user_agent = ['Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko',
62 'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1500.95 Safari/537.36',
63 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36',
64 'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; rv:11.0) like Gecko)',
65 'Mozilla/5.0 (Windows; U; Windows NT 5.2) Gecko/2008070208 Firefox/3.0.1',
66 'Mozilla/5.0 (Windows; U; Windows NT 5.1) Gecko/20070309 Firefox/2.0.0.3',
67 'Mozilla/5.0 (Windows; U; Windows NT 5.1) Gecko/20070803 Firefox/1.5.0.12',
68 'Opera/9.27 (Windows NT 5.2; U; zh-cn)',
69 'Mozilla/5.0 (Macintosh; PPC Mac OS X; U; en) Opera 8.0',
70 'Opera/8.0 (Macintosh; PPC Mac OS X; U; en)',
71 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.12) Gecko/20080219 Firefox/2.0.0.12 Navigator/9.0.0.6',
72 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Win64; x64; Trident/4.0)',
73 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0)',
74 'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.2; .NET4.0C; .NET4.0E)',
75 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Maxthon/4.0.6.2000 Chrome/26.0.1410.43 Safari/537.1 ',
76 'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.2; .NET4.0C; .NET4.0E; QQBrowser/7.3.9825.400)',
77 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:21.0) Gecko/20100101 Firefox/21.0 ',
78 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.92 Safari/537.1 LBBROWSER',
79 'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0; BIDUBrowser 2.x)',
80 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/3.0 Safari/536.11']
81
82 header = {
83 'Connection': head_connection[0],
84 'Accept': head_accept[0],
85 'Accept-Language': head_accept_language[1],
86 'User-Agent': head_user_agent[random.randrange(0, len(head_user_agent))]
87 }
88 return header
89
90 def start_url(pro_name,url):
91 headers = randHeader()
92 result = requests.get(url, headers, timeout=20)
93 import html
94 result = html.unescape(result.text)
95 # print(result)
96 # 调用函数,下载页面产品信息:序号,标题,产品最低价格,产品链接,产品图片和链接
97 down_products(result,pro_name)
98
99 if __name__ == '__main__':
100 cwd = os.getcwd()
101 path = cwd+'\\AmazonBestsellers.xlsx'
102 wb = load_workbook(path)
103 ws = wb.worksheets[0]
104 table_titles = ['产品类别','序号','产品标题','产品最低价格','产品链接','产品图片链接']
105 for i,table_title in enumerate(table_titles):
106 ws.cell(1,i+1).value = table_title
107 wb.save(path)
108
109 # amazon_urls = [
110 # #一级标题--女装衣服
111 # 'https://www.amazon.com/Best-Sellers-Womens-Clothing/zgbs/fashion/1040660/ref=zg_bs_pg_1?_encoding=UTF8&pg=1',
112 # 'https://www.amazon.com/Best-Sellers-Womens-Clothing/zgbs/fashion/1040660/ref=zg_bs_pg_2?_encoding=UTF8&pg=2',
113 # #二级标题--女装裙子
114 # 'https://www.amazon.com/Best-Sellers-Womens-Dresses/zgbs/fashion/1045024/ref=zg_bs_pg_1?_encoding=UTF8&pg=1',
115 # 'https://www.amazon.com/Best-Sellers-Womens-Dresses/zgbs/fashion/1045024/ref=zg_bs_pg_2?_encoding=UTF8&pg=2',
116 # #三级标题--女装日常款裙子
117 # 'https://www.amazon.com/Best-Sellers-Womens-Casual-Dresses/zgbs/fashion/2346727011/ref=zg_bs_pg_1?_encoding=UTF8&pg=1',
118 # 'https://www.amazon.com/Best-Sellers-Womens-Casual-Dresses/zgbs/fashion/2346727011/ref=zg_bs_pg_2?_encoding=UTF8&pg=2'
119 # ]
120
121 amazon_urls = []
122 all_urls = url_cate_all_only()
123 for i in range(len(all_urls)):
124 amazon_urls.append(all_urls[i][0]) # 一级、二级分类下的所有页面链接
125 print(len(amazon_urls))
126 print(amazon_urls)
127
128 for i in range(len(amazon_urls)):
129 pro_name = amazon_urls[i].split('/')
130 print(pro_name[3])
131 print(pro_name[3][13:])
132 start_url(pro_name[3][13:],amazon_urls[i])
爬虫亚马逊Bestselling类别产品数据TOP100的更多相关文章
- 亚马逊副总裁谈Marketplace平台的个性化服务
说到个性化,亚马逊无疑是挖掘与利用数据为消费者打造个性化网购体验的先驱之一.而现在,几乎所有的公司和网站都在利用更加个性化的推荐算法为用户提供更好的购物和浏览体验. 亚马逊近年来尤其重视将其个性化特性 ...
- 国外物联网平台(1):亚马逊AWS IoT
国外物联网平台(1)——亚马逊AWS IoT 马智 平台定位 AWS IoT是一款托管的云平台,使互联设备可以轻松安全地与云应用程序及其他设备交互. AWS IoT可支持数十亿台设备和数万亿条消息,并 ...
- 亚马逊AWS业务副总裁:如何在基础设施上降成本
腾讯科技 林靖东 11月17日编译 亚马逊Amazon Web Services业务的副总裁.著名工程师詹姆斯汉密尔顿(James Hamilton)在AWS re:Invent大会上解释了公司是如何 ...
- (转)来自互联网巨头的46个用户体验面试问题(谷歌,亚马逊,facebook及微软)
原文出处: uxdesign - Eleonora Zucconi 译文出处:UXRen - 邓俊杰 如果你是个正在找工作的用户体验研究员,或是一个招聘经理正急需一些启发性问题来测试你的候选人,这 ...
- 国外物联网平台初探(一) ——亚马逊AWS IoT
平台定位 AWS IoT是一款托管的云平台,使互联设备可以轻松安全地与云应用程序及其他设备交互. AWS IoT可支持数十亿台设备和数万亿条消息,并且可以对这些消息进行处理并将其安全可靠地路由至 AW ...
- 亚马逊云科技现身世界人工智能大会,揭示AI最新技术趋势
2022世界人工智能大会(WAIC)于日前落幕.经过过去四届的发展与沉淀,今天的世界人工智能大会已成为人工智能领域最有影响力的国际盛会之一,今年大咖云集.国际大厂扎堆,充分彰显了大会的国际影响力和磁力 ...
- 如何增加亚马逊listing多个类目节点
流量是电商销售的必要因素,可以说,任何成功的电商平台都离不开流量.亚马逊listing优化做得好,不仅能提高产品的曝光率,还能提升转换率,而好的类目可以吸引大的流量.帮你快速爬升. 首先我们来了解一下 ...
- 面对IBM与亚马逊的犄角攻势,微软云如何招架?
亚马逊AWS和微软Azure是全球公有云的焦点.不就前公布的财报不久前公布的财报,这两家公司云计算的收入越来越接近,从数据显示来看,亚马逊的利润比微软稍高,有人称微软云的高增长来自于捆绑销售,背后真正 ...
- 亚马逊Prime会员的杀价,能说明会员+会越来越便宜吗?
前段时间,京东又坑了!京东调整了物流方案--从原来的购物不满49元只需6元运费,调整到购物不满46元运费15元,运费猛涨了9元!原本京东PLUS会员每月有5张免运费券,但在运费涨价后运费券限制在6元, ...
随机推荐
- APACHE ACTIVEMQ安装
APACHE ACTIVEMQ安装 一.特点 支持各种语言和协议的客户端.如:Java.C.C++.C#.Ruby.Perl.Python及Php. 完全支持JMS的客户端及其他消息代理 完全支持JM ...
- node的内置常量 __dirname和 __filename
node的内置常量 __ dirname和 __ filename __dirname当前文件(你用node运行的文件)所在的文件夹地址 // dirname.js console.log(__dir ...
- java代码常用知识点
1.Assert java断言assert是jdk1.4引入的.assert这个关键字我们称之为"断言".当这个关键字后边的条件为假的时候,程序自动崩溃并抛出AssertionEr ...
- hashCode()方法的作用?
hashCode()方法与equals()方法相似,都是来自java.lang.Object类的方法,都允许用户定义的子类重写这两个方法. 一般来说,equals这个方法是给用户调用的,如果你想根据自 ...
- 使用过 Redis 做异步队列么,你是怎么用的?
答:一般使用 list 结构作为队列,rpush 生产消息,lpop 消费消息.当 lpop 没有 消息的时候,要适当 sleep 一会再重试. 如果对方追问可不可以不用 sleep 呢? list ...
- 一、mycat介绍
一.背景 随着时间和业务的发展,数据库中的数据量增长是不可控的,库和表中的数据会越来越大,随之带来的是更高的磁盘.IO.系统开销,甚至性能上的瓶颈,而一台服务的资源终究是有限的,因此需要对数据库和表进 ...
- Spring Framework 有哪些不同的功能?
轻量级 - Spring 在代码量和透明度方面都很轻便.IOC - 控制反转 AOP - 面向 切面编程可以将应用业务逻辑和系统服务分离,以实现高内聚.容器 - Spring 负 责创建和管理对象(B ...
- Netty学习摘记 —— 预置SSL / HTTP / WebSocket编解码器
本文参考 本篇文章是对<Netty In Action>一书第十一章"预置的ChannelHandler和编解码器"的学习摘记,主要内容为通过 SSL/TLS 保护 N ...
- ctfhub web信息泄露备份文件下载(vim缓存 Ds-Store)
Vim缓存 进入环境由于不懂得vim是什么借鉴大佬的博客 网页提示flag在index.php中我们按着这个思路去找 将文件保存下来因为是swp文件我们用kail进行打开 使用vim -r index ...
- vs code下代码提示图标的含义(c++)
其实不同的语言这些东西的含义还有不同 但差别也不是很大,比如Python中的那个大括号图标就成了模块(module)了