1 # -*- coding: utf-8 -*-
2 # @Time : 2020/9/11 16:23
3 # @Author : Chunfang
4 # @Email : 3470959534@qq.com
5 # @File : amazon_bestseller_cate_url.py
6 # @Software: PyCharm
7
8 import random,requests
9 import re
10
def secend_cates_url(url):
    """Collect the category links listed on a second-level category page.

    The page at ``url`` is downloaded with ``get_data`` and scanned for
    ``<li><a href='...'>name</a></li>`` entries whose link starts with
    ``https://www.amazon.com/Best``.  The resulting list of
    ``(link, name)`` tuples is appended, as one list, to the module-level
    ``url_cate_all`` accumulator.  Nothing is returned.
    """
    html_text = get_data(url)
    link_pattern = "<li><a href='(https://www.amazon.com/Best.*?)'>(.*?)</a></li>"
    # re.S lets '.' span newlines, so entries broken across lines still match.
    found_links = re.findall(link_pattern, html_text, re.S)
    url_cate_all.append(found_links)
18
def get_html_data(page_data):
    """Extract first-level category links and crawl each of them.

    ``page_data`` is the HTML text of the starting page.  Every
    ``(link, name)`` pair found in it is stored (as one list) in the
    module-level ``url_cate_all`` accumulator, then each link is handed to
    ``secend_cates_url`` so that the categories one level down are
    collected as well.
    """
    link_pattern = "<li><a href='(https://www.amazon.com/Best.*?)'>(.*?)</a></li>"
    first_level = re.findall(link_pattern, page_data, re.S)
    url_cate_all.append(first_level)
    # Only the link is needed to descend; the category name is ignored here.
    for link, _name in first_level:
        secend_cates_url(link)
26
def randHeader():
    """Build a request-header dict with a randomly chosen User-Agent.

    ``Connection``, ``Accept`` and ``Accept-Language`` always take the same
    entry from their option lists; only ``User-Agent`` varies between
    calls.  Returns a new dict on every call.
    """
    connection_options = ['Keep-Alive', 'close']
    accept_options = ['text/html, application/xhtml+xml, */*']
    language_options = ['zh-CN,fr-FR;q=0.5', 'en-US,en;q=0.8,zh-Hans-CN;q=0.5,zh-Hans;q=0.3']
    user_agents = [
        'Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko',
        'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1500.95 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; rv:11.0) like Gecko)',
        'Mozilla/5.0 (Windows; U; Windows NT 5.2) Gecko/2008070208 Firefox/3.0.1',
        'Mozilla/5.0 (Windows; U; Windows NT 5.1) Gecko/20070309 Firefox/2.0.0.3',
        'Mozilla/5.0 (Windows; U; Windows NT 5.1) Gecko/20070803 Firefox/1.5.0.12',
        'Opera/9.27 (Windows NT 5.2; U; zh-cn)',
        'Mozilla/5.0 (Macintosh; PPC Mac OS X; U; en) Opera 8.0',
        'Opera/8.0 (Macintosh; PPC Mac OS X; U; en)',
        'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.12) Gecko/20080219 Firefox/2.0.0.12 Navigator/9.0.0.6',
        'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Win64; x64; Trident/4.0)',
        'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0)',
        'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.2; .NET4.0C; .NET4.0E)',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Maxthon/4.0.6.2000 Chrome/26.0.1410.43 Safari/537.1 ',
        'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.2; .NET4.0C; .NET4.0E; QQBrowser/7.3.9825.400)',
        'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:21.0) Gecko/20100101 Firefox/21.0 ',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.92 Safari/537.1 LBBROWSER',
        'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0; BIDUBrowser 2.x)',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/3.0 Safari/536.11',
    ]

    # Pick one User-Agent uniformly at random by index.
    chosen_agent = user_agents[random.randrange(len(user_agents))]
    return {
        'Connection': connection_options[0],
        'Accept': accept_options[0],
        'Accept-Language': language_options[1],
        'User-Agent': chosen_agent,
    }
59
def get_data(url):
    """Download ``url`` and return the response body as HTML-unescaped text.

    A fresh header set from ``randHeader`` is sent with the request and the
    request times out after 20 seconds.  Exceptions raised by
    ``requests.get`` are not handled here and propagate to the caller.

    :param url: address of the page to fetch.
    :return: the page text with HTML character references decoded.
    """
    import html

    headers = randHeader()
    # The headers must be passed by keyword: the second positional parameter
    # of requests.get is ``params``, so passing the dict positionally would
    # append it to the query string instead of sending it as HTTP headers.
    response = requests.get(url, headers=headers, timeout=20)
    return html.unescape(response.text)
66
def save_to_excel(url_cate_all):
    """Flatten the collected category lists and drop duplicate entries.

    Despite its name this function writes nothing to Excel; it only
    reshapes data.

    :param url_cate_all: a list of lists of hashable entries (the crawler
        stores ``(link, name)`` tuples in it).
    :return: a flat list containing every distinct entry once, in the
        order in which it was first seen.
    """
    # dict.fromkeys de-duplicates while keeping first-seen order, so the
    # result is the same on every run (a plain set() would reorder entries
    # differently from run to run because of string hash randomisation).
    flattened = (entry for group in url_cate_all for entry in group)
    return list(dict.fromkeys(flattened))
76
def url_cate_all_only():
    """Crawl the category tree from the fixed start page and return its links.

    Resets the module-level ``url_cate_all`` accumulator, downloads the
    start page, lets ``get_html_data`` fill the accumulator with the
    category links it finds (including those one level down), and returns
    the flattened, de-duplicated result produced by ``save_to_excel``.
    """
    global url_cate_all
    url_cate_all = []

    root_url = 'https://www.amazon.com/Best-Sellers-Womens-Clothing/zgbs/fashion/1040660/ref=zg_bs_unv_3_9522931011_1'
    get_html_data(get_data(root_url))
    return save_to_excel(url_cate_all)
  1 # -*- coding: utf-8 -*-
2 # @Time : 2020/9/9 17:30
3 # @Author : Chunfang
4 # @Email : 3470959534@qq.com
5 # @File : amazon_best_sellers.py
6 # @Software: PyCharm
7
8 import requests
9 import re,os,random
10 from openpyxl import load_workbook
11
12 from amazon_bestseller_cate_url2 import url_cate_all_only
13
def down_imgs(url_xuhao, url_img, pro_name):
    """Download the product images into ``<cwd>/images_amazon``.

    Each image is saved as ``<pro_name>_<rank>.jpg``.  The target directory
    is created on demand.  An image whose request fails is reported and
    skipped so that one bad URL does not stop the remaining downloads.

    :param url_xuhao: list of rank strings, one per product.
    :param url_img: list of image URLs, parallel to ``url_xuhao``.
    :param pro_name: category name used as the file-name prefix.
    :return: None.
    """
    # zip stops at the shorter list, so a page on which fewer images than
    # ranks were matched no longer ends in an IndexError.
    for count, (rank, img_url) in enumerate(zip(url_xuhao, url_img), start=1):
        print('正在下载第' + str(count) + '张图片,图片地址:' + str(img_url))
        try:
            # Keyword argument required: the second positional parameter of
            # requests.get is ``params``, not ``headers``.
            pic = requests.get(img_url, headers=randHeader(), timeout=10)
        except requests.exceptions.RequestException:
            # Covers connection errors as well as timeouts.
            print('错误!当前图片无法下载')
            continue
        # os.path.join keeps the path valid on every platform, not only Windows.
        target = os.path.join(cwd, 'images_amazon', pro_name + '_' + rank + '.jpg')
        os.makedirs(os.path.dirname(target), exist_ok=True)
        with open(target, 'wb') as file:
            file.write(pic.content)
26
def save_pro_to_excel(products_inf, pro_name):
    """Append the scraped product fields to the worksheet and save the workbook.

    ``products_inf`` is a list of field lists; field list number ``k``
    (0-based) is written into column ``k + 2``, one value per row, starting
    on the row below the sheet's current ``max_row``.  Column 1 of every
    written row receives ``pro_name``.  Uses the module-level ``ws``,
    ``wb`` and ``path`` objects.
    """
    first_new_row = ws.max_row + 1
    for field_index, field_values in enumerate(products_inf):
        for row_offset, value in enumerate(field_values):
            row = first_new_row + row_offset
            ws.cell(row, 1).value = pro_name
            ws.cell(row, field_index + 2).value = value  # product field
    wb.save(path)
35
def down_products(result, pro_name):
    """Extract product fields from a best-seller page and persist them.

    ``result`` is the HTML text of one listing page.  Five regular
    expressions pull out rank, title, price, product link and image URL;
    the five lists are printed, written to the worksheet through
    ``save_pro_to_excel`` and the images are fetched through ``down_imgs``.
    """
    # Earlier, stricter title pattern kept for reference:
    # '<div class="p13n-sc-truncate p13n-sc-line-clamp-2" aria-hidden="true" data-rows="2">\n (.*?)\n </div>'
    title_pattern = '<div class=".*?" aria-hidden=".*?" data-rows=".*?">\n (.*?)\n </div>'
    link_pattern = '<div class="a-row"><a class="a-link-normal a-text-normal" href="(.*?)"><span class="a-size-base a-color-price">'
    price_pattern = '<span class="a-size-base a-color-price"><span class=.*?>(.*?)</span>'
    rank_pattern = '<span class="zg-badge-text">#(.*?)</span></span>'
    image_pattern = '<div class="a-section a-spacing-small"><img alt=".*?src="(https.*?)" height="200" width="200"></div></span>'

    titles = re.findall(title_pattern, result, re.S)
    links = re.findall(link_pattern, result, re.S)
    prices = re.findall(price_pattern, result, re.S)
    ranks = re.findall(rank_pattern, result, re.S)
    images = re.findall(image_pattern, result, re.S)

    # Field order determines the worksheet columns: rank, title, price,
    # product link, image link.
    products_inf = [ranks, titles, prices, links, images]
    print(products_inf)

    save_pro_to_excel(products_inf, pro_name)
    down_imgs(ranks, images, pro_name)
55
# Generate a randomised request header
def randHeader():
    """Return HTTP request headers with a randomly selected User-Agent.

    The ``Connection``, ``Accept`` and ``Accept-Language`` values are fixed
    picks from their option lists; the ``User-Agent`` is drawn at random
    from the list below on each call.
    """
    connection_options = ['Keep-Alive', 'close']
    accept_options = ['text/html, application/xhtml+xml, */*']
    language_options = ['zh-CN,fr-FR;q=0.5', 'en-US,en;q=0.8,zh-Hans-CN;q=0.5,zh-Hans;q=0.3']
    user_agents = [
        'Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko',
        'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1500.95 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; rv:11.0) like Gecko)',
        'Mozilla/5.0 (Windows; U; Windows NT 5.2) Gecko/2008070208 Firefox/3.0.1',
        'Mozilla/5.0 (Windows; U; Windows NT 5.1) Gecko/20070309 Firefox/2.0.0.3',
        'Mozilla/5.0 (Windows; U; Windows NT 5.1) Gecko/20070803 Firefox/1.5.0.12',
        'Opera/9.27 (Windows NT 5.2; U; zh-cn)',
        'Mozilla/5.0 (Macintosh; PPC Mac OS X; U; en) Opera 8.0',
        'Opera/8.0 (Macintosh; PPC Mac OS X; U; en)',
        'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.12) Gecko/20080219 Firefox/2.0.0.12 Navigator/9.0.0.6',
        'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Win64; x64; Trident/4.0)',
        'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0)',
        'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.2; .NET4.0C; .NET4.0E)',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Maxthon/4.0.6.2000 Chrome/26.0.1410.43 Safari/537.1 ',
        'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.2; .NET4.0C; .NET4.0E; QQBrowser/7.3.9825.400)',
        'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:21.0) Gecko/20100101 Firefox/21.0 ',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.92 Safari/537.1 LBBROWSER',
        'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0; BIDUBrowser 2.x)',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/3.0 Safari/536.11',
    ]

    agent_index = random.randrange(len(user_agents))
    header = {}
    header['Connection'] = connection_options[0]
    header['Accept'] = accept_options[0]
    header['Accept-Language'] = language_options[1]
    header['User-Agent'] = user_agents[agent_index]
    return header
89
def start_url(pro_name, url):
    """Fetch one best-seller listing page and process the products on it.

    The page is downloaded with a header set from ``randHeader`` and a
    20-second timeout, its HTML character references are decoded, and the
    text is passed to ``down_products``, which extracts rank, title, lowest
    price, product link and image link.  Exceptions raised by
    ``requests.get`` are not handled here.

    :param pro_name: category name stored alongside every product.
    :param url: address of the listing page.
    """
    import html

    headers = randHeader()
    # Pass the headers by keyword: positionally the dict would be taken as
    # ``params`` and end up in the query string rather than in the headers.
    response = requests.get(url, headers=headers, timeout=20)
    page_text = html.unescape(response.text)
    down_products(page_text, pro_name)
98
if __name__ == '__main__':
    # cwd, path, wb and ws are module-level names read by down_imgs and
    # save_pro_to_excel, so they must be assigned here before crawling.
    cwd = os.getcwd()
    # os.path.join instead of a hard-coded '\\' so the script also runs
    # outside Windows.  The workbook must already exist in the working
    # directory, because load_workbook opens an existing file.
    path = os.path.join(cwd, 'AmazonBestsellers.xlsx')
    wb = load_workbook(path)
    ws = wb.worksheets[0]

    # Header row: category, rank, title, lowest price, product link, image link.
    table_titles = ['产品类别','序号','产品标题','产品最低价格','产品链接','产品图片链接']
    for column, table_title in enumerate(table_titles, start=1):
        ws.cell(1, column).value = table_title
    wb.save(path)

    # Example of a hand-written URL list that the crawl below replaces:
    # amazon_urls = [
    #     # first level -- women's clothing
    #     'https://www.amazon.com/Best-Sellers-Womens-Clothing/zgbs/fashion/1040660/ref=zg_bs_pg_1?_encoding=UTF8&pg=1',
    #     'https://www.amazon.com/Best-Sellers-Womens-Clothing/zgbs/fashion/1040660/ref=zg_bs_pg_2?_encoding=UTF8&pg=2',
    #     # second level -- women's dresses
    #     'https://www.amazon.com/Best-Sellers-Womens-Dresses/zgbs/fashion/1045024/ref=zg_bs_pg_1?_encoding=UTF8&pg=1',
    #     'https://www.amazon.com/Best-Sellers-Womens-Dresses/zgbs/fashion/1045024/ref=zg_bs_pg_2?_encoding=UTF8&pg=2',
    #     # third level -- women's casual dresses
    #     'https://www.amazon.com/Best-Sellers-Womens-Casual-Dresses/zgbs/fashion/2346727011/ref=zg_bs_pg_1?_encoding=UTF8&pg=1',
    #     'https://www.amazon.com/Best-Sellers-Womens-Casual-Dresses/zgbs/fashion/2346727011/ref=zg_bs_pg_2?_encoding=UTF8&pg=2'
    # ]

    # Every entry returned by the crawler is a (link, name) pair; only the
    # link of each first- and second-level category is needed here.
    all_urls = url_cate_all_only()
    amazon_urls = [entry[0] for entry in all_urls]
    print(len(amazon_urls))
    print(amazon_urls)

    for amazon_url in amazon_urls:
        pro_name = amazon_url.split('/')
        print(pro_name[3])
        # Path segment 3 looks like 'Best-Sellers-Womens-Dresses' in the
        # example URLs above; dropping its first 13 characters removes the
        # 'Best-Sellers-' prefix and leaves the category name.
        print(pro_name[3][13:])
        start_url(pro_name[3][13:], amazon_url)

爬虫亚马逊Bestselling类别产品数据TOP100的更多相关文章

  1. 亚马逊副总裁谈Marketplace平台的个性化服务

    说到个性化,亚马逊无疑是挖掘与利用数据为消费者打造个性化网购体验的先驱之一.而现在,几乎所有的公司和网站都在利用更加个性化的推荐算法为用户提供更好的购物和浏览体验. 亚马逊近年来尤其重视将其个性化特性 ...

  2. 国外物联网平台(1):亚马逊AWS IoT

    国外物联网平台(1)——亚马逊AWS IoT 马智 平台定位 AWS IoT是一款托管的云平台,使互联设备可以轻松安全地与云应用程序及其他设备交互. AWS IoT可支持数十亿台设备和数万亿条消息,并 ...

  3. 亚马逊AWS业务副总裁:如何在基础设施上降成本

    腾讯科技 林靖东 11月17日编译 亚马逊Amazon Web Services业务的副总裁.著名工程师詹姆斯汉密尔顿(James Hamilton)在AWS re:Invent大会上解释了公司是如何 ...

  4. (转)来自互联网巨头的46个用户体验面试问题(谷歌,亚马逊,facebook及微软)

    原文出处: uxdesign - Eleonora Zucconi   译文出处:UXRen - 邓俊杰 如果你是个正在找工作的用户体验研究员,或是一个招聘经理正急需一些启发性问题来测试你的候选人,这 ...

  5. 国外物联网平台初探(一) ——亚马逊AWS IoT

    平台定位 AWS IoT是一款托管的云平台,使互联设备可以轻松安全地与云应用程序及其他设备交互. AWS IoT可支持数十亿台设备和数万亿条消息,并且可以对这些消息进行处理并将其安全可靠地路由至 AW ...

  6. 亚马逊云科技现身世界人工智能大会,揭示AI最新技术趋势

    2022世界人工智能大会(WAIC)于日前落幕.经过过去四届的发展与沉淀,今天的世界人工智能大会已成为人工智能领域最有影响力的国际盛会之一,今年大咖云集.国际大厂扎堆,充分彰显了大会的国际影响力和磁力 ...

  7. 如何增加亚马逊listing多个类目节点

    流量是电商销售的必要因素,可以说,任何成功的电商平台都离不开流量.亚马逊listing优化做得好,不仅能提高产品的曝光率,还能提升转换率,而好的类目可以吸引大的流量.帮你快速爬升. 首先我们来了解一下 ...

  8. 面对IBM与亚马逊的犄角攻势,微软云如何招架?

    亚马逊AWS和微软Azure是全球公有云的焦点.不就前公布的财报不久前公布的财报,这两家公司云计算的收入越来越接近,从数据显示来看,亚马逊的利润比微软稍高,有人称微软云的高增长来自于捆绑销售,背后真正 ...

  9. 亚马逊Prime会员的杀价,能说明会员+会越来越便宜吗?

    前段时间,京东又坑了!京东调整了物流方案--从原来的购物不满49元只需6元运费,调整到购物不满46元运费15元,运费猛涨了9元!原本京东PLUS会员每月有5张免运费券,但在运费涨价后运费券限制在6元, ...

随机推荐

  1. TiDB 5.0认证指南之PCTA PCTP

    1. TiDB简介 TiDB 是 PingCAP 公司自主设计.研发的开源分布式关系型数据库,是一款同时支持在线事务处理与在线分析处理 (Hybrid Transactional and Analyt ...

  2. 比Tensorflow还强?

    大家好,我是章北海 Python是机器学习和深度学习的首选编程语言,但绝不是唯一.训练机器学习/深度学习模型并部署对外提供服务(尤其是通过浏览器)JavaScript 是一个不错的选择,市面上也出现了 ...

  3. Mybatis框架基础入门(三)--Mapper动态代理方式开发

    使用MyBatis开发Dao,通常有两个方法,即原始Dao开发方法和Mapper动态代理开发方法. 原始Dao开发方法需要程序员编写Dao接口和Dao实现类,此方式开发Dao,存在以下问题: Dao方 ...

  4. redis 是什么?都有哪些使用场景?

    一.什么是redis 首先要说redis,应该先说一下nosql,NoSQL(NoSQL = Not Only SQL ),意即"不仅仅是SQL",泛指非关系型的数据库.随着互联网 ...

  5. “a==b”和”a.equals(b)”有什么区别?

    如果 a 和 b 都是对象,则 a==b 是比较两个对象的引用,只有当 a 和 b 指向的是堆中的同一个对象才会返回 true,而 a.equals(b) 是进行逻辑比较,所以通常需要重写该方法来 ...

  6. input 弹起数字键盘的那些坑

    input ios 踩的大坑 前言:最近有个需求要将全平台的交易密码由原来的 6-16位 复杂密码改为6位纯数字交易密码,涉及到非常多的业务场景,但修改起来也无非两种:设置交易密码,使用交易密码 设置 ...

  7. JS:数组中push对象,覆盖问题

    发现将对象push进数组,后面的值会覆盖前面的值,最后输出的都是最后一次的值.其实这一切都是引用数据类型惹的祸.如果你也有类似问题,可以继续看下去哦.下面代码模拟:将json对象的每个键值对,单独搞成 ...

  8. HTML+CSS基础课程-imooc-【更新完毕】

    6-1 认识CSS样式 CSS全称为"层叠样式表 (Cascading Style Sheets)",它主要是用于定义HTML内容在浏览器内的显示样式,如文字大小.颜色.字体加粗等 ...

  9. 关于css中选择器的小归纳(一)

    关于css中选择器的小归纳 一.基本选择器 基本选择器是我们平常用到的最多的也是最便捷的选择器,其中有元素选择器(类似于a,div,body,ul),类选择器(我们在HTML标签里面为其添加的clas ...

  10. Java/C++实现备忘录模式--撤销操作

    改进课堂上的"用户信息操作撤销"实例,使得系统可以实现多次撤销(可以使用HashMap.ArrayList等集合数据结构实现). 类图: Java代码: import java.u ...