爬虫亚马逊Bestselling类别产品数据TOP100
1 # -*- coding: utf-8 -*-
2 # @Time : 2020/9/11 16:23
3 # @Author : Chunfang
4 # @Email : 3470959534@qq.com
5 # @File : amazon_bestseller_cate_url.py
6 # @Software: PyCharm
7
8 import random,requests
9 import re
10
def secend_cates_url(url):
    """Collect the second-level category links found on one category page.

    The page at ``url`` is downloaded with ``get_data`` and scanned for
    ``<li><a href='https://www.amazon.com/Best...'>title</a></li>`` entries.
    The resulting list of ``(link, title)`` tuples (possibly empty) is
    appended as a single element to the module-level ``url_cate_all`` list.

    Args:
        url: Address of the category page to scan.
    """
    html_text = get_data(url)
    sub_categories = re.findall(
        "<li><a href='(https://www.amazon.com/Best.*?)'>(.*?)</a></li>",
        html_text,
        re.S,
    )
    url_cate_all.append(sub_categories)
18
def get_html_data(page_data):
    """Extract first-level category links and crawl each one for sub-categories.

    Every ``(link, title)`` pair matched in ``page_data`` is stored (as one
    list) in the module-level ``url_cate_all`` accumulator; afterwards each
    matched link is handed to ``secend_cates_url`` so that its own category
    list is appended as well.

    Args:
        page_data: HTML text of the starting best-seller page.
    """
    top_categories = re.findall(
        "<li><a href='(https://www.amazon.com/Best.*?)'>(.*?)</a></li>",
        page_data,
        re.S,
    )
    url_cate_all.append(top_categories)
    # The pattern has two groups, so each match is a (link, title) tuple.
    for link, _title in top_categories:
        secend_cates_url(link)
26
def randHeader():
    """Build a request-header dict with a randomly chosen User-Agent.

    ``Connection``, ``Accept`` and ``Accept-Language`` are always the same
    fixed entries of their candidate tuples; only ``User-Agent`` varies
    between calls.

    Returns:
        dict: Header names mapped to header values.
    """
    connection_options = ('Keep-Alive', 'close')
    accept_options = ('text/html, application/xhtml+xml, */*',)
    language_options = (
        'zh-CN,fr-FR;q=0.5',
        'en-US,en;q=0.8,zh-Hans-CN;q=0.5,zh-Hans;q=0.3',
    )
    user_agents = (
        'Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko',
        'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1500.95 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; rv:11.0) like Gecko)',
        'Mozilla/5.0 (Windows; U; Windows NT 5.2) Gecko/2008070208 Firefox/3.0.1',
        'Mozilla/5.0 (Windows; U; Windows NT 5.1) Gecko/20070309 Firefox/2.0.0.3',
        'Mozilla/5.0 (Windows; U; Windows NT 5.1) Gecko/20070803 Firefox/1.5.0.12',
        'Opera/9.27 (Windows NT 5.2; U; zh-cn)',
        'Mozilla/5.0 (Macintosh; PPC Mac OS X; U; en) Opera 8.0',
        'Opera/8.0 (Macintosh; PPC Mac OS X; U; en)',
        'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.12) Gecko/20080219 Firefox/2.0.0.12 Navigator/9.0.0.6',
        'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Win64; x64; Trident/4.0)',
        'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0)',
        'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.2; .NET4.0C; .NET4.0E)',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Maxthon/4.0.6.2000 Chrome/26.0.1410.43 Safari/537.1 ',
        'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.2; .NET4.0C; .NET4.0E; QQBrowser/7.3.9825.400)',
        'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:21.0) Gecko/20100101 Firefox/21.0 ',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.92 Safari/537.1 LBBROWSER',
        'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0; BIDUBrowser 2.x)',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/3.0 Safari/536.11',
    )

    return {
        'Connection': connection_options[0],
        'Accept': accept_options[0],
        'Accept-Language': language_options[1],
        'User-Agent': random.choice(user_agents),
    }
59
def get_data(url):
    """Download ``url`` and return its body with HTML entities unescaped.

    Args:
        url: Address of the page to fetch.

    Returns:
        str: Response text after ``html.unescape``.

    Raises:
        requests.exceptions.RequestException: Propagated from ``requests.get``
            (connection failure, timeout after 20 seconds, ...).
    """
    import html

    # The second positional parameter of requests.get is ``params``; the
    # headers must be passed by keyword or they end up in the query string
    # and the randomized User-Agent is never sent.
    response = requests.get(url, headers=randHeader(), timeout=20)
    return html.unescape(response.text)
66
def save_to_excel(url_cate_all):
    """Flatten the collected category lists and drop duplicate entries.

    Despite its name this function writes nothing to disk; it only
    normalizes the crawl result.

    Args:
        url_cate_all: Iterable of lists, each holding hashable entries
            (the crawler stores ``(link, title)`` tuples).

    Returns:
        list: Every distinct entry exactly once, in first-seen order.
    """
    # dict.fromkeys de-duplicates while keeping insertion order, so the
    # result is reproducible between runs (a plain set() is not).
    return list(
        dict.fromkeys(entry for group in url_cate_all for entry in group)
    )
76
def url_cate_all_only():
    """Crawl the Women's Clothing best-seller page and return unique categories.

    Resets the module-level ``url_cate_all`` accumulator, downloads the
    hard-coded start page, lets ``get_html_data`` fill the accumulator with
    the category lists it finds, and finally passes the accumulator through
    ``save_to_excel``.

    Returns:
        list: The value returned by ``save_to_excel`` for the accumulated
        category lists.
    """
    global url_cate_all
    url_cate_all = []

    start_page = 'https://www.amazon.com/Best-Sellers-Womens-Clothing/zgbs/fashion/1040660/ref=zg_bs_unv_3_9522931011_1'
    get_html_data(get_data(start_page))
    return save_to_excel(url_cate_all)
1 # -*- coding: utf-8 -*-
2 # @Time : 2020/9/9 17:30
3 # @Author : Chunfang
4 # @Email : 3470959534@qq.com
5 # @File : amazon_best_sellers.py
6 # @Software: PyCharm
7
8 import requests
9 import re,os,random
10 from openpyxl import load_workbook
11
12 from amazon_bestseller_cate_url2 import url_cate_all_only
13
def down_imgs(url_xuhao, url_img, pro_name):
    """Download product images into ``<cwd>/images_amazon``.

    Each image is stored as ``<pro_name>_<rank>.jpg``. The target folder is
    created when missing. A failed download is reported and skipped so that
    the remaining images are still fetched.

    Args:
        url_xuhao: Rank strings, one per product; used in the file name.
        url_img: Image URLs, paired position by position with ``url_xuhao``.
        pro_name: Category name used as the file-name prefix.
    """
    image_dir = os.path.join(cwd, 'images_amazon')
    os.makedirs(image_dir, exist_ok=True)

    # zip() stops at the shorter list, so a page where the two regexes
    # matched a different number of items cannot raise IndexError.
    for position, (rank, image_url) in enumerate(zip(url_xuhao, url_img), start=1):
        print('正在下载第' + str(position) + '张图片,图片地址:' + str(image_url))
        try:
            # ``headers`` must be a keyword argument: the second positional
            # parameter of requests.get is ``params``.
            pic = requests.get(image_url, headers=randHeader(), timeout=10)
        except requests.exceptions.RequestException:
            # Covers connection errors as well as read timeouts.
            print('错误!当前图片无法下载')
            continue
        target = os.path.join(image_dir, pro_name + '_' + rank + '.jpg')
        with open(target, 'wb') as file:
            file.write(pic.content)
26
def save_pro_to_excel(products_inf, pro_name):
    """Append scraped product data below the existing rows and save the workbook.

    ``products_inf`` is column-oriented: its n-th inner list fills worksheet
    column ``n + 2``, one value per row, starting on the first row after the
    sheet's current last row. Column 1 of every written row receives
    ``pro_name``. Uses the module-level ``ws``, ``wb`` and ``path``.

    Args:
        products_inf: List of lists, one inner list per output column.
        pro_name: Category name written into the first column.
    """
    first_free_row = ws.max_row + 1  # captured once, before any cell is written
    for column_offset, column_values in enumerate(products_inf):
        for row_offset, value in enumerate(column_values):
            row = first_free_row + row_offset
            ws.cell(row, 1).value = pro_name
            ws.cell(row, column_offset + 2).value = value
    wb.save(path)
35
def down_products(result, pro_name):
    """Parse one best-seller page, then store its products and images.

    Five regular expressions pull the rank, title, price, product link and
    image URL lists out of ``result``. The lists are printed, written to the
    workbook through ``save_pro_to_excel`` and the images are fetched through
    ``down_imgs``.

    Args:
        result: Unescaped HTML text of the best-seller page.
        pro_name: Category name attached to every stored product.
    """
    # The title pattern uses wildcards for the class / aria-hidden / data-rows
    # attribute values instead of pinning concrete values.
    titles = re.findall('<div class=".*?" aria-hidden=".*?" data-rows=".*?">\n (.*?)\n </div>', result, re.S)
    links = re.findall('<div class="a-row"><a class="a-link-normal a-text-normal" href="(.*?)"><span class="a-size-base a-color-price">', result, re.S)
    prices = re.findall('<span class="a-size-base a-color-price"><span class=.*?>(.*?)</span>', result, re.S)
    ranks = re.findall('<span class="zg-badge-text">#(.*?)</span></span>', result, re.S)
    images = re.findall('<div class="a-section a-spacing-small"><img alt=".*?src="(https.*?)" height="200" width="200"></div></span>', result, re.S)

    # Column order expected by the worksheet: rank, title, price, link, image.
    products_inf = [ranks, titles, prices, links, images]
    print(products_inf)

    save_pro_to_excel(products_inf, pro_name)
    down_imgs(ranks, images, pro_name)
55
def randHeader():
    """Generate request headers whose User-Agent is picked at random.

    The ``Connection``, ``Accept`` and ``Accept-Language`` values are fixed
    selections from their candidate lists; the ``User-Agent`` is drawn
    uniformly from the list below on every call.

    Returns:
        dict: The four header fields keyed by header name.
    """
    agents = [
        'Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko',
        'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1500.95 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; rv:11.0) like Gecko)',
        'Mozilla/5.0 (Windows; U; Windows NT 5.2) Gecko/2008070208 Firefox/3.0.1',
        'Mozilla/5.0 (Windows; U; Windows NT 5.1) Gecko/20070309 Firefox/2.0.0.3',
        'Mozilla/5.0 (Windows; U; Windows NT 5.1) Gecko/20070803 Firefox/1.5.0.12',
        'Opera/9.27 (Windows NT 5.2; U; zh-cn)',
        'Mozilla/5.0 (Macintosh; PPC Mac OS X; U; en) Opera 8.0',
        'Opera/8.0 (Macintosh; PPC Mac OS X; U; en)',
        'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.12) Gecko/20080219 Firefox/2.0.0.12 Navigator/9.0.0.6',
        'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Win64; x64; Trident/4.0)',
        'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0)',
        'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.2; .NET4.0C; .NET4.0E)',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Maxthon/4.0.6.2000 Chrome/26.0.1410.43 Safari/537.1 ',
        'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.2; .NET4.0C; .NET4.0E; QQBrowser/7.3.9825.400)',
        'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:21.0) Gecko/20100101 Firefox/21.0 ',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.92 Safari/537.1 LBBROWSER',
        'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0; BIDUBrowser 2.x)',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/3.0 Safari/536.11',
    ]
    connections = ['Keep-Alive', 'close']
    accepts = ['text/html, application/xhtml+xml, */*']
    languages = ['zh-CN,fr-FR;q=0.5', 'en-US,en;q=0.8,zh-Hans-CN;q=0.5,zh-Hans;q=0.3']

    header = {}
    header['Connection'] = connections[0]
    header['Accept'] = accepts[0]
    header['Accept-Language'] = languages[1]
    header['User-Agent'] = random.choice(agents)
    return header
89
def start_url(pro_name, url):
    """Fetch one best-seller page and hand its HTML to ``down_products``.

    ``down_products`` extracts rank, title, price, product link and image
    link for each product and takes care of storing them.

    Args:
        pro_name: Category name stored with every product of this page.
        url: Address of the best-seller page.

    Raises:
        requests.exceptions.RequestException: Propagated from ``requests.get``
            (connection failure, timeout after 20 seconds, ...).
    """
    import html

    # Pass the headers by keyword: positionally they would be taken as
    # ``params`` and appended to the URL instead of being sent as headers.
    response = requests.get(url, headers=randHeader(), timeout=20)
    page_text = html.unescape(response.text)
    down_products(page_text, pro_name)
98
if __name__ == '__main__':
    # cwd, path, wb and ws are module-level names that down_imgs and
    # save_pro_to_excel read; keep the names unchanged.
    cwd = os.getcwd()
    # os.path.join instead of a hard-coded '\\' so the script also runs
    # outside Windows. The workbook must already exist in the working dir.
    path = os.path.join(cwd, 'AmazonBestsellers.xlsx')
    wb = load_workbook(path)
    ws = wb.worksheets[0]

    # Header row.
    table_titles = ['产品类别', '序号', '产品标题', '产品最低价格', '产品链接', '产品图片链接']
    for column, table_title in enumerate(table_titles, start=1):
        ws.cell(1, column).value = table_title
    wb.save(path)

    # Examples of hand-written page URLs (pg=1 / pg=2) that were used before
    # the category list was crawled automatically:
    #   first level  - women's clothing:
    #     https://www.amazon.com/Best-Sellers-Womens-Clothing/zgbs/fashion/1040660/ref=zg_bs_pg_1?_encoding=UTF8&pg=1
    #     https://www.amazon.com/Best-Sellers-Womens-Clothing/zgbs/fashion/1040660/ref=zg_bs_pg_2?_encoding=UTF8&pg=2
    #   second level - women's dresses:
    #     https://www.amazon.com/Best-Sellers-Womens-Dresses/zgbs/fashion/1045024/ref=zg_bs_pg_1?_encoding=UTF8&pg=1
    #     https://www.amazon.com/Best-Sellers-Womens-Dresses/zgbs/fashion/1045024/ref=zg_bs_pg_2?_encoding=UTF8&pg=2
    #   third level  - women's casual dresses:
    #     https://www.amazon.com/Best-Sellers-Womens-Casual-Dresses/zgbs/fashion/2346727011/ref=zg_bs_pg_1?_encoding=UTF8&pg=1
    #     https://www.amazon.com/Best-Sellers-Womens-Casual-Dresses/zgbs/fashion/2346727011/ref=zg_bs_pg_2?_encoding=UTF8&pg=2

    # Each crawled entry is indexed at [0] for its link; collect the links
    # of all first- and second-level categories.
    all_urls = url_cate_all_only()
    amazon_urls = [entry[0] for entry in all_urls]
    print(len(amazon_urls))
    print(amazon_urls)

    for category_url in amazon_urls:
        pro_name = category_url.split('/')
        print(pro_name[3])
        # pro_name[3] is the first path segment of the URL; [13:] drops its
        # first 13 characters (the length of 'Best-Sellers-').
        print(pro_name[3][13:])
        start_url(pro_name[3][13:], category_url)
爬虫亚马逊Bestselling类别产品数据TOP100的更多相关文章
- 亚马逊副总裁谈Marketplace平台的个性化服务
说到个性化,亚马逊无疑是挖掘与利用数据为消费者打造个性化网购体验的先驱之一.而现在,几乎所有的公司和网站都在利用更加个性化的推荐算法为用户提供更好的购物和浏览体验. 亚马逊近年来尤其重视将其个性化特性 ...
- 国外物联网平台(1):亚马逊AWS IoT
国外物联网平台(1)——亚马逊AWS IoT 马智 平台定位 AWS IoT是一款托管的云平台,使互联设备可以轻松安全地与云应用程序及其他设备交互. AWS IoT可支持数十亿台设备和数万亿条消息,并 ...
- 亚马逊AWS业务副总裁:如何在基础设施上降成本
腾讯科技 林靖东 11月17日编译 亚马逊Amazon Web Services业务的副总裁.著名工程师詹姆斯汉密尔顿(James Hamilton)在AWS re:Invent大会上解释了公司是如何 ...
- (转)来自互联网巨头的46个用户体验面试问题(谷歌,亚马逊,facebook及微软)
原文出处: uxdesign - Eleonora Zucconi 译文出处:UXRen - 邓俊杰 如果你是个正在找工作的用户体验研究员,或是一个招聘经理正急需一些启发性问题来测试你的候选人,这 ...
- 国外物联网平台初探(一) ——亚马逊AWS IoT
平台定位 AWS IoT是一款托管的云平台,使互联设备可以轻松安全地与云应用程序及其他设备交互. AWS IoT可支持数十亿台设备和数万亿条消息,并且可以对这些消息进行处理并将其安全可靠地路由至 AW ...
- 亚马逊云科技现身世界人工智能大会,揭示AI最新技术趋势
2022世界人工智能大会(WAIC)于日前落幕.经过过去四届的发展与沉淀,今天的世界人工智能大会已成为人工智能领域最有影响力的国际盛会之一,今年大咖云集.国际大厂扎堆,充分彰显了大会的国际影响力和磁力 ...
- 如何增加亚马逊listing多个类目节点
流量是电商销售的必要因素,可以说,任何成功的电商平台都离不开流量.亚马逊listing优化做得好,不仅能提高产品的曝光率,还能提升转换率,而好的类目可以吸引大的流量.帮你快速爬升. 首先我们来了解一下 ...
- 面对IBM与亚马逊的犄角攻势,微软云如何招架?
亚马逊AWS和微软Azure是全球公有云的焦点.不久前公布的财报显示,这两家公司云计算的收入越来越接近,从数据来看,亚马逊的利润比微软稍高,有人称微软云的高增长来自于捆绑销售,背后真正 ...
- 亚马逊Prime会员的杀价,能说明会员+会越来越便宜吗?
前段时间,京东又坑了!京东调整了物流方案--从原来的购物不满49元只需6元运费,调整到购物不满46元运费15元,运费猛涨了9元!原本京东PLUS会员每月有5张免运费券,但在运费涨价后运费券限制在6元, ...
随机推荐
- [转载]详解ssh端口转发(二)
关于使用ssh portforwarding来进行FQ的操作,网络上已经有很多很好的文章,我在这里只是画两个图解释一下. 首先要记住一件事情就是: SSH 端口转发自然需要 SSH 连接,而 SSH ...
- 22.1.7 master公式及O(NLogN)的排序
22.1.7 master公式及O(NLogN)的排序 1 master 公式 (1) 写公式 T(N) = a * T(N/b) + O(N^d); master公式用来求递归行为的时间复杂度,式中 ...
- synchronized已经不在臃肿了,放下对他的成见之初识轻量级锁
前言 物竞天择,适者生存.JDK也在不断的优化中.关于JDK中synchronized锁内部也是不断的优化,前面我们分析了偏向锁用来解决初期问题,随着争抢的不断堆积,轻量级锁应运而生. 关注我,一个不断 ...
- 一文看懂:ChIP实验和qPCR定量分析怎么做|易基因技术
大家好,这里是专注表观组学十余年,领跑多组学科研服务的易基因. 染色质免疫共沉淀(Chromatin Immunoprecipitation,ChIP),是研究体内蛋白质与DNA相互作用的经典方法. ...
- Kafka 缺点?
由于是批量发送,数据并非真正的实时: 对于mqtt协议不支持: 不支持物联网传感数据直接接入: 仅支持统一分区内消息有序,无法实现全局消息有序: 监控不完善,需要安装插件: 依赖zookeeper进行 ...
- MyISAM Static 和 MyISAM Dynamic 有什么区别?
在 MyISAM Static 上的所有字段有固定宽度.动态 MyISAM 表将具有像 TEXT, BLOB 等字段,以适应不同长度的数据类型. MyISAM Static 在受损情况下更容易恢复.
- 说说for循环的两种写法
for循环 执行多次,条件写在()里,语法形式: 1 2 3 for(计数器变量;条件;计数器增减){ // 将要执行的代码 } 示例: 1 2 3 for (int i = 0; i < 5; ...
- ArrayList、LinkedList、Vector、Array
ArrayList 本质是一个数组. 优势:追加元素到数组末尾的时候速度快,同时检索元素的速度也快. 劣势:如果要插入一个元素到数组之间慢:如果要追加的元素数量多于数组的容量,则需要频繁扩容使用Arr ...
- 云计算:Ubuntu下Vue+Springboot前后端分离项目部署(多节点)
一.机器准备 首先准备三台机器: 我是一台WINDOWS系统主机,在WINDOWS里的 VMware 中安装两台Ubuntu系统虚拟机 如果你的虚拟机只有 CentOS,可以参考这篇文章:https: ...
- (stm32学习总结)—对寄存器的理解 _
芯片里面有什么 我们看到的 STM32 芯片是已经封装好的成品,主要由内核和片上外设组成.若与电脑类比,内核与外设就如同电脑上的 CPU 与主板.内存.显卡.硬盘的关系.STM32F103 采用的是 ...