爬取亚马逊 Best Sellers 类别产品数据 TOP100
1 # -*- coding: utf-8 -*-
2 # @Time : 2020/9/11 16:23
3 # @Author : Chunfang
4 # @Email : 3470959534@qq.com
5 # @File : amazon_bestseller_cate_url.py
6 # @Software: PyCharm
7
8 import random,requests
9 import re
10
def secend_cates_url(url):
    """Collect the second-level category links found on one category page.

    The page at ``url`` is downloaded with ``get_data`` and searched for
    ``<li><a href='https://www.amazon.com/Best...'>label</a></li>`` entries.
    The resulting list of ``(link, label)`` tuples is appended as one element
    to the module-level ``url_cate_all`` list; nothing is returned.

    Args:
        url: Address of a first-level category page.
    """
    html_text = get_data(url)
    link_pattern = "<li><a href='(https://www.amazon.com/Best.*?)'>(.*?)</a></li>"
    second_level = re.findall(link_pattern, html_text, re.S)
    url_cate_all.append(second_level)
18
def get_html_data(page_data):
    """Extract first-level category links and crawl each of them.

    The ``(link, label)`` tuples matched in ``page_data`` are appended as one
    element to the module-level ``url_cate_all`` list, then every matched
    link is passed to ``secend_cates_url`` to gather its sub-categories.

    Args:
        page_data: HTML text of the starting best-seller page.
    """
    first_level = re.findall(
        "<li><a href='(https://www.amazon.com/Best.*?)'>(.*?)</a></li>",
        page_data,
        re.S,
    )
    url_cate_all.append(first_level)
    for link, _label in first_level:
        secend_cates_url(link)
26
def randHeader():
    """Build the HTTP request headers used by the crawler.

    Only the ``User-Agent`` value is picked at random; ``Connection``,
    ``Accept`` and ``Accept-Language`` always take the same entry of their
    candidate lists.

    Returns:
        A dict with the keys ``Connection``, ``Accept``, ``Accept-Language``
        and ``User-Agent``.
    """
    connection_options = ['Keep-Alive', 'close']
    accept_options = ['text/html, application/xhtml+xml, */*']
    language_options = ['zh-CN,fr-FR;q=0.5', 'en-US,en;q=0.8,zh-Hans-CN;q=0.5,zh-Hans;q=0.3']
    user_agents = [
        'Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko',
        'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1500.95 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; rv:11.0) like Gecko)',
        'Mozilla/5.0 (Windows; U; Windows NT 5.2) Gecko/2008070208 Firefox/3.0.1',
        'Mozilla/5.0 (Windows; U; Windows NT 5.1) Gecko/20070309 Firefox/2.0.0.3',
        'Mozilla/5.0 (Windows; U; Windows NT 5.1) Gecko/20070803 Firefox/1.5.0.12',
        'Opera/9.27 (Windows NT 5.2; U; zh-cn)',
        'Mozilla/5.0 (Macintosh; PPC Mac OS X; U; en) Opera 8.0',
        'Opera/8.0 (Macintosh; PPC Mac OS X; U; en)',
        'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.12) Gecko/20080219 Firefox/2.0.0.12 Navigator/9.0.0.6',
        'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Win64; x64; Trident/4.0)',
        'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0)',
        'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.2; .NET4.0C; .NET4.0E)',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Maxthon/4.0.6.2000 Chrome/26.0.1410.43 Safari/537.1 ',
        'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.2; .NET4.0C; .NET4.0E; QQBrowser/7.3.9825.400)',
        'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:21.0) Gecko/20100101 Firefox/21.0 ',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.92 Safari/537.1 LBBROWSER',
        'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0; BIDUBrowser 2.x)',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/3.0 Safari/536.11',
    ]

    return {
        'Connection': connection_options[0],
        'Accept': accept_options[0],
        'Accept-Language': language_options[1],
        'User-Agent': random.choice(user_agents),
    }
59
def get_data(url):
    """Download ``url`` and return its body as HTML-unescaped text.

    The request carries the headers produced by ``randHeader`` and gives up
    after 20 seconds.

    Args:
        url: Address of the page to fetch.

    Returns:
        The response text with HTML entities decoded by ``html.unescape``.
    """
    import html  # kept local: the file's top-level imports do not include it

    # Pass the headers by keyword. The second positional parameter of
    # requests.get is ``params``, so a positional dict would be appended to
    # the URL query string instead of being sent as HTTP headers.
    response = requests.get(url, headers=randHeader(), timeout=20)
    return html.unescape(response.text)
66
def save_to_excel(url_cate_all):
    """Flatten the grouped category entries and drop duplicates.

    Despite its name, this function writes nothing to Excel; it only merges
    the per-page match lists into a single list of unique entries.

    Args:
        url_cate_all: Iterable of groups, each group being an iterable of
            hashable entries (the ``(link, label)`` tuples produced by
            ``re.findall`` elsewhere in this file).

    Returns:
        A list holding every distinct entry exactly once, in the order in
        which it was first encountered.
    """
    flattened = (entry for group in url_cate_all for entry in group)
    # dict.fromkeys de-duplicates while keeping insertion order, so repeated
    # runs yield the same ordering (list(set(...)) does not guarantee that).
    return list(dict.fromkeys(flattened))
76
def url_cate_all_only(url=None):
    """Crawl a best-seller page and return its unique category links.

    The module-level ``url_cate_all`` accumulator is reset, filled by
    ``get_html_data`` (which also visits each first-level link), and then
    flattened and de-duplicated by ``save_to_excel``.

    Args:
        url: Best-seller page to start from. When omitted, the Women's
            Clothing best-seller page is used.

    Returns:
        A list of unique ``(link, label)`` tuples.
    """
    global url_cate_all
    if url is None:
        url = 'https://www.amazon.com/Best-Sellers-Womens-Clothing/zgbs/fashion/1040660/ref=zg_bs_unv_3_9522931011_1'
    # Start from an empty accumulator so repeated calls do not mix results.
    url_cate_all = []
    get_html_data(get_data(url))
    return save_to_excel(url_cate_all)
1 # -*- coding: utf-8 -*-
2 # @Time : 2020/9/9 17:30
3 # @Author : Chunfang
4 # @Email : 3470959534@qq.com
5 # @File : amazon_best_sellers.py
6 # @Software: PyCharm
7
8 import requests
9 import re,os,random
10 from openpyxl import load_workbook
11
12 from amazon_bestseller_cate_url2 import url_cate_all_only
13
def down_imgs(url_xuhao, url_img, pro_name):
    """Download every product image into the ``images_amazon`` folder.

    Each image is saved as ``<cwd>/images_amazon/<pro_name>_<rank>.jpg``,
    where ``cwd`` is the module-level variable set by the script entry point.
    The folder is created when missing. An image whose request fails or
    returns an error status is reported and skipped.

    Args:
        url_xuhao: Rank strings, one per product.
        url_img: Image URLs, paired positionally with ``url_xuhao``.
        pro_name: Category name used as the file-name prefix.
    """
    image_dir = os.path.join(cwd, 'images_amazon')
    os.makedirs(image_dir, exist_ok=True)

    for count, (rank, img_url) in enumerate(zip(url_xuhao, url_img), start=1):
        print('正在下载第' + str(count) + '张图片,图片地址:' + str(img_url))
        try:
            # Headers go in by keyword; the second positional parameter of
            # requests.get is ``params`` (query string), not headers.
            pic = requests.get(img_url, headers=randHeader(), timeout=10)
        except requests.exceptions.RequestException:
            # Covers timeouts as well as connection errors.
            print('错误!当前图片无法下载')
            continue
        if not pic.ok:
            # Do not store an error page under a .jpg name.
            print('错误!当前图片无法下载')
            continue
        target = os.path.join(image_dir, pro_name + '_' + rank + '.jpg')
        with open(target, 'wb') as file:
            file.write(pic.content)
26
def save_pro_to_excel(products_inf, pro_name):
    """Append the scraped product fields below the rows already in the sheet.

    ``products_inf`` is a list of columns. The values of column ``k`` are
    written to sheet column ``k + 2``, one row per value, starting on the
    row after the sheet's current last row. Sheet column 1 of each written
    row receives ``pro_name``. The workbook is then saved.

    Uses the module-level ``ws``, ``wb`` and ``path`` objects.

    Args:
        products_inf: List of value lists, one list per product field.
        pro_name: Category name stored in the first column.
    """
    first_free_row = ws.max_row + 1
    for sheet_column, field_values in enumerate(products_inf, start=2):
        for offset, value in enumerate(field_values):
            row = first_free_row + offset
            ws.cell(row, 1).value = pro_name
            ws.cell(row, sheet_column).value = value
    wb.save(path)
35
def down_products(result, pro_name):
    """Extract product fields from a best-seller page, then store them.

    Five regular expressions pull the rank, title, price, product link and
    image link lists out of ``result``. The lists are printed, written to
    the workbook through ``save_pro_to_excel`` and the images are fetched
    through ``down_imgs``.

    Args:
        result: HTML text of one best-seller page.
        pro_name: Category name attached to every stored product.
    """
    # NOTE(review): the title pattern relies on the exact whitespace around
    # the title text in the page markup - confirm against a live page.
    title_pattern = '<div class=".*?" aria-hidden=".*?" data-rows=".*?">\n (.*?)\n </div>'
    link_pattern = '<div class="a-row"><a class="a-link-normal a-text-normal" href="(.*?)"><span class="a-size-base a-color-price">'
    price_pattern = '<span class="a-size-base a-color-price"><span class=.*?>(.*?)</span>'
    rank_pattern = '<span class="zg-badge-text">#(.*?)</span></span>'
    image_pattern = '<div class="a-section a-spacing-small"><img alt=".*?src="(https.*?)" height="200" width="200"></div></span>'

    titles = re.findall(title_pattern, result, re.S)
    links = re.findall(link_pattern, result, re.S)
    prices = re.findall(price_pattern, result, re.S)
    ranks = re.findall(rank_pattern, result, re.S)
    images = re.findall(image_pattern, result, re.S)

    # Column order in the sheet: rank, title, price, product link, image link.
    products_inf = [ranks, titles, prices, links, images]
    print(products_inf)

    save_pro_to_excel(products_inf, pro_name)
    down_imgs(ranks, images, pro_name)
55
def randHeader():
    """Generate request headers with a randomly chosen User-Agent.

    ``Connection``, ``Accept`` and ``Accept-Language`` are fixed picks from
    their candidate lists; only ``User-Agent`` varies between calls.

    Returns:
        A dict with the keys ``Connection``, ``Accept``, ``Accept-Language``
        and ``User-Agent``.
    """
    connections = ['Keep-Alive', 'close']
    accepts = ['text/html, application/xhtml+xml, */*']
    languages = ['zh-CN,fr-FR;q=0.5', 'en-US,en;q=0.8,zh-Hans-CN;q=0.5,zh-Hans;q=0.3']
    agents = [
        'Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko',
        'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1500.95 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; rv:11.0) like Gecko)',
        'Mozilla/5.0 (Windows; U; Windows NT 5.2) Gecko/2008070208 Firefox/3.0.1',
        'Mozilla/5.0 (Windows; U; Windows NT 5.1) Gecko/20070309 Firefox/2.0.0.3',
        'Mozilla/5.0 (Windows; U; Windows NT 5.1) Gecko/20070803 Firefox/1.5.0.12',
        'Opera/9.27 (Windows NT 5.2; U; zh-cn)',
        'Mozilla/5.0 (Macintosh; PPC Mac OS X; U; en) Opera 8.0',
        'Opera/8.0 (Macintosh; PPC Mac OS X; U; en)',
        'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.12) Gecko/20080219 Firefox/2.0.0.12 Navigator/9.0.0.6',
        'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Win64; x64; Trident/4.0)',
        'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0)',
        'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.2; .NET4.0C; .NET4.0E)',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Maxthon/4.0.6.2000 Chrome/26.0.1410.43 Safari/537.1 ',
        'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.2; .NET4.0C; .NET4.0E; QQBrowser/7.3.9825.400)',
        'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:21.0) Gecko/20100101 Firefox/21.0 ',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.92 Safari/537.1 LBBROWSER',
        'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0; BIDUBrowser 2.x)',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/3.0 Safari/536.11',
    ]

    header = {}
    header['Connection'] = connections[0]
    header['Accept'] = accepts[0]
    header['Accept-Language'] = languages[1]
    header['User-Agent'] = random.choice(agents)
    return header
89
def start_url(pro_name, url):
    """Fetch one best-seller page and pass its HTML to ``down_products``.

    ``down_products`` then extracts rank, title, lowest price, product link
    and image link for the products on the page.

    Args:
        pro_name: Category name attached to the products of this page.
        url: Address of the best-seller page to download.
    """
    import html  # kept local: the file's top-level imports do not include it

    # Pass the headers by keyword. The second positional parameter of
    # requests.get is ``params``, so a positional dict would end up in the
    # query string and the User-Agent would never be sent.
    response = requests.get(url, headers=randHeader(), timeout=20)
    down_products(html.unescape(response.text), pro_name)
98
if __name__ == '__main__':
    # cwd, path, wb and ws are module-level names read by down_imgs and
    # save_pro_to_excel.
    cwd = os.getcwd()
    # os.path.join instead of a hard-coded '\\' so the script also runs
    # outside Windows.
    path = os.path.join(cwd, 'AmazonBestsellers.xlsx')
    wb = load_workbook(path)  # opens an existing workbook in the working directory
    ws = wb.worksheets[0]

    # Header row: category, rank, title, lowest price, product link, image link.
    table_titles = ['产品类别', '序号', '产品标题', '产品最低价格', '产品链接', '产品图片链接']
    for column, table_title in enumerate(table_titles, start=1):
        ws.cell(1, column).value = table_title
    wb.save(path)

    # NOTE(review): an earlier hand-written URL list requested two pages per
    # category, e.g.
    #   .../Best-Sellers-Womens-Dresses/zgbs/fashion/1045024/ref=zg_bs_pg_1?_encoding=UTF8&pg=1
    #   .../Best-Sellers-Womens-Dresses/zgbs/fashion/1045024/ref=zg_bs_pg_2?_encoding=UTF8&pg=2
    # The discovered links below are fetched as-is; confirm whether a second
    # page still has to be requested to cover ranks beyond the first page.

    # Keep only the link of every (link, label) pair found by the crawler.
    amazon_urls = [entry[0] for entry in url_cate_all_only()]
    print(len(amazon_urls))
    print(amazon_urls)

    for page_url in amazon_urls:
        # For links shaped like
        # https://www.amazon.com/Best-Sellers-<Category>/zgbs/..., element 3
        # of the '/'-split is 'Best-Sellers-<Category>'; dropping its first
        # 13 characters ('Best-Sellers-') leaves the category name.
        slug = page_url.split('/')[3]
        print(slug)
        print(slug[13:])
        start_url(slug[13:], page_url)
爬虫亚马逊Bestselling类别产品数据TOP100的更多相关文章
- 亚马逊副总裁谈Marketplace平台的个性化服务
说到个性化,亚马逊无疑是挖掘与利用数据为消费者打造个性化网购体验的先驱之一.而现在,几乎所有的公司和网站都在利用更加个性化的推荐算法为用户提供更好的购物和浏览体验. 亚马逊近年来尤其重视将其个性化特性 ...
- 国外物联网平台(1):亚马逊AWS IoT
国外物联网平台(1)——亚马逊AWS IoT 马智 平台定位 AWS IoT是一款托管的云平台,使互联设备可以轻松安全地与云应用程序及其他设备交互. AWS IoT可支持数十亿台设备和数万亿条消息,并 ...
- 亚马逊AWS业务副总裁:如何在基础设施上降成本
腾讯科技 林靖东 11月17日编译 亚马逊Amazon Web Services业务的副总裁.著名工程师詹姆斯汉密尔顿(James Hamilton)在AWS re:Invent大会上解释了公司是如何 ...
- (转)来自互联网巨头的46个用户体验面试问题(谷歌,亚马逊,facebook及微软)
原文出处: uxdesign - Eleonora Zucconi 译文出处:UXRen - 邓俊杰 如果你是个正在找工作的用户体验研究员,或是一个招聘经理正急需一些启发性问题来测试你的候选人,这 ...
- 国外物联网平台初探(一) ——亚马逊AWS IoT
平台定位 AWS IoT是一款托管的云平台,使互联设备可以轻松安全地与云应用程序及其他设备交互. AWS IoT可支持数十亿台设备和数万亿条消息,并且可以对这些消息进行处理并将其安全可靠地路由至 AW ...
- 亚马逊云科技现身世界人工智能大会,揭示AI最新技术趋势
2022世界人工智能大会(WAIC)于日前落幕.经过过去四届的发展与沉淀,今天的世界人工智能大会已成为人工智能领域最有影响力的国际盛会之一,今年大咖云集.国际大厂扎堆,充分彰显了大会的国际影响力和磁力 ...
- 如何增加亚马逊listing多个类目节点
流量是电商销售的必要因素,可以说,任何成功的电商平台都离不开流量.亚马逊listing优化做得好,不仅能提高产品的曝光率,还能提升转换率,而好的类目可以吸引大的流量.帮你快速爬升. 首先我们来了解一下 ...
- 面对IBM与亚马逊的犄角攻势,微软云如何招架?
亚马逊AWS和微软Azure是全球公有云的焦点.不就前公布的财报不久前公布的财报,这两家公司云计算的收入越来越接近,从数据显示来看,亚马逊的利润比微软稍高,有人称微软云的高增长来自于捆绑销售,背后真正 ...
- 亚马逊Prime会员的杀价,能说明会员+会越来越便宜吗?
前段时间,京东又坑了!京东调整了物流方案--从原来的购物不满49元只需6元运费,调整到购物不满46元运费15元,运费猛涨了9元!原本京东PLUS会员每月有5张免运费券,但在运费涨价后运费券限制在6元, ...
随机推荐
- EVM
靶机设置 将靶机导入VirtualBox中,有时候导入VM会出错,扫描不到ip地址. kali:192.168.1.100 kali扫描获得ip地址:192.168.1.107 渗透测试 接着扫描端口 ...
- redis事务及相关命令介绍
redis事务及相关命令介绍 一.概述:和众多其它数据库一样,Redis作为NoSQL数据库也同样提供了事务机制.在Redis中,MULTI/EXEC/DISCARD/WATCH这四个命令是我们实现事 ...
- (转载)linux下Yum的$releasever和$basearch的取值
https://blog.csdn.net/whatday/article/details/51097456
- 什么是B+树??
上一篇中,我们了解了B树,辣么..B+树又是什么呢?? 一:定义:B+树是基于B树的,是B树的变形,也是一种多路搜索树.查询性能更加出色. 1.每个父节点元素出现在子节点中,是子节点的最大或最小元素. ...
- mybatis学习一:基于xml与注解配置入门实例与问题
注:本case参考自:http://www.cnblogs.com/ysocean/p/7277545.html 一:Mybatis的介绍: MyBatis 本是apache的一个开源项目iBatis ...
- DWR是什么?有什么作用?
DWR(Direct Web Remoting)是一个用于改善web页面与Java类交互的远程服务器端Ajax开源框架,可以帮助开发人员开发包含AJAX技术的网站. 它可以允许在浏览器里的代码使用运行 ...
- 哪些是重要的bean生命周期方法? 你能重载它们吗?
有两个重要的bean 生命周期方法,第一个是setup , 它是在容器加载bean的时候被调用.第二个方法是 teardown 它是在容器卸载类的时候被调用. The bean 标签有两个重要的属性 ...
- synchronized、Lock、ReentrantLock 的区别
synchronized 和 Lock: 1. synchronized 是 Java 内置关键字;而 Lock 是一个类 2. synchronized 可以作用于变量、方法、代码块;而 Lock 是显式地指定开始和 ...
- 4-Pandas数据预处理之数据转换(df.map()、df.replace())
在数据分析中,根据需求,有时候需要将一些数据进行转换,而在Pandas中,实现数据转换的常用方法有: 利用函数或是映射 可以将自己定义的或者是其他包提供的函数用在Pandas对象上实现批量修改. ap ...
- Matlab解析LQR与MPC的关系
mathworks社区中的这个资料还是值得一说的. 1 openExample('mpc/mpccustomqp') 我们从几个角度来解析两者关系,简单的说就是MPC是带了约束的LQR. 在陈虹模型预 ...