Amazon后台模拟登陆

本文基于python3.4的selenium库打开浏览器，并将浏览器中的登陆cookie保存到本地，那么下次登陆就可以直接利用cookie了：

 # !/usr/bin/python3.4

 # -*- coding: utf-8 -*-

 from selenium import webdriver

 import time

 import requests

 from bs4 import BeautifulSoup

 import os

 import re

 import random

 import xlsxwriter

 # 找出文件夹下所有xml后缀的文件，可选择递归

 def listfiles(rootdir, prefix='.xml', iscur=False):

     file = []

     for parent, dirnames, filenames in os.walk(rootdir):

         if parent == rootdir:

             for filename in filenames:

                 if filename.endswith(prefix):

                     file.append(filename)

             if not iscur:

                 return file

         else:

             if iscur:

                 for filename in filenames:

                     if filename.endswith(prefix):

                         file.append(filename)

             else:

                 pass

     return file

 # 抓取dp的正则表达式

 def getdp(string):

     reg = r'(http.+?/dp/)(.+)'

     all = re.compile(reg)

     alllist = re.findall(all, string)

     return alllist[0][1]

 # 抓取filter的正则表达式

 # https://sellercentral.amazon.com/productsearch?filter=grocery&q=fish

 def getfilter(string):

     reg = r'(https.+?filter=)(.+?)(&)'

     all = re.compile(reg)

     alllist = re.findall(all, string)

     return alllist[0][1]

 # 抓取最大页数的正则

 def getpagenum(string):

     reg = r'(.+?\()(\d+)(\))'

     all = re.compile(reg)

     alllist = re.findall(all, string)

     return alllist[0][1]

 # 创建文件夹

 def createjia(path):

     try:

         os.makedirs(path)

     except:

         pass

 def timetochina(longtime, formats='{}天{}小时{}分钟{}秒'):

     day = 0

     hour = 0

     minutue = 0

     second = 0

     try:

         if longtime > 60:

             second = longtime % 60

             minutue = longtime // 60

         else:

             second = longtime

         if minutue > 60:

             hour = minutue // 60

             minutue = minutue % 60

         if hour > 24:

             day = hour // 24

             hour = hour % 24

         return formats.format(day, hour, minutue, second)

     except:

         raise Exception('时间非法')

 # 打开浏览器抓取cookie

 def openbrowser(url):

     # 打开谷歌浏览器

     # Firefox() Chrome()

     browser = webdriver.Chrome()

     # browser = webdriver.Chrome(executable_path='C:/Python34/chromedriver.exe')

     # 输入网址

     browser.get(url)

     # 打开浏览器时间

     # print("等待10秒打开浏览器...")

     # time.sleep(10)

     # 找到id="ap_email"的对话框

     # 清空输入框

     browser.find_element_by_id("ap_email").clear()

     browser.find_element_by_id("ap_password").clear()

     # 输入账号密码

     inputemail = input("请输入账号：")

     inputpassword = input("请输入密码：")

     browser.find_element_by_id("ap_email").send_keys(inputemail)

     browser.find_element_by_id("ap_password").send_keys(inputpassword)

     # 点击登陆sign in

     # id="signInSubmit"

     browser.find_element_by_id("signInSubmit").click()

     # 等待登陆10秒

     # print('等待登陆10秒...')

     # time.sleep(10)

     print("等待网址加载完毕...")

     select = input("请观察浏览器网站是否已经登陆(y/n)：")

     while 1:

         if select == "y" or select == "Y":

             print("登陆成功！")

             # 获取cookie

             cookie = [item["name"] + ":" + item["value"] for item in browser.get_cookies()]

             cookiestr = ';'.join(item for item in cookie)

             print("正在复制网页cookie...")

             # 写入本地txt

             if "jp" in url:

                 path = "../data/Japcookie.txt"

             else:

                 path = "../data/Amecookie.txt"

             filecookie = open(path, "w")

             filecookie.write(cookiestr)

             filecookie.close()

             time.sleep(1)

             print("准备关闭浏览器...")

             browser.quit()

             # print(cookiestr)

             break

         elif select == "n" or select == "N":

             selectno = input("账号密码错误请按0，验证码出现请按1...")

             # 账号密码错误则重新输入

             if selectno == "":

                 # 找到id="ap_email"的对话框

                 # 清空输入框

                 browser.find_element_by_id("ap_email").clear()

                 browser.find_element_by_id("ap_password").clear()

                 # 输入账号密码

                 inputemail = input("请输入账号：")

                 inputpassword = input("请输入密码：")

                 browser.find_element_by_id("ap_email").send_keys(inputemail)

                 browser.find_element_by_id("ap_password").send_keys(inputpassword)

                 # 点击登陆sign in

                 # id="signInSubmit"

                 browser.find_element_by_id("signInSubmit").click()

             elif selectno == "":

                 # 验证码的id为id="ap_captcha_guess"的对话框

                 input("请在浏览器中输入验证码并登陆...")

                 select = input("请观察浏览器网站是否已经登陆(y/n)：")

         else:

             print("请输入“y”或者“n”！")

             select = input("请观察浏览器网站是否已经登陆(y/n)：")

     return cookiestr

 def gethtml(url):

     # 读取cookie

     # 写入字典

     mycookie = {}

     if "jp" in url:

         path = "../data/Japcookie.txt"

     else:

         path = "../data/Amecookie.txt"

     try:

         filecookie = open(path, "r")

         cookies = filecookie.read().split(";")

         for items in cookies:

             item = items.split(":")

             mycookie[item[0]] = item[1]

         # print(mycookie)

         filecookie.close()

     except:

         print("cookie为空...")

     if "jp" in url:

         referer = "https://sellercentral.amazon.co.jp/"

         host = "www.amazon.co.jp"

     else:

         referer = "https://sellercentral.amazon.com/"

         host = "www.amazon.com"

     # 制作头部

     header = {

         'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:49.0) Gecko/20100101 Firefox/49.0',

         'Referer': referer,

         'Host': host,

         'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',

         'Connection': 'keep-alive',

         'Upgrade-Insecure-Requests': '',

         'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',

         'Accept-Encoding': 'gzip, deflate, br'

     }

     htmlget = requests.get(url=url, headers=header, cookies=mycookie, timeout=60)

     htmlcontent = htmlget.content.decode("UTF-8", "ignore")

     return htmlcontent

 def getinfo(html, Loginurl):

     # BeautifulSoup解析需要的东西

     soups = BeautifulSoup(html, "html.parser")

     # 筛选出商品的div

     sellyours = soups.find_all("div", attrs={"class": "a-box product"})

     information = []

     for item in sellyours:

         # 一个一个商品筛选

         # 第一次筛选，得到有“出售您的”的商品列表

         temp = item.find("a", attrs={"class", "a-button-text"})

         if temp != None:

             if "sellYoursClick" in temp["data-csm"]:

                 # 第二次筛选得到“无数字、无新品”字样的商品列表

                 temp = item.find("span", attrs={"class", "offerCountDetails"})

                 if temp == None:

                     temp = item.find("div", attrs={"class", "a-fixed-right-grid-col description a-col-left"})

                     # 得到详情页网址

                     hrefurl = temp.find('a').get('href')

                     # 得到所有当前class下的文本信息

                     # 包括title、UPC、EAN、Rank

                     try:

                         spans = temp.get_text()

                     except:

                         spans = "Nothing"

                     # 将得到的文本信息写入数组里面

                     temparr = spans.strip().split("\n")

                     # 正则得到Asin

                     asin = getdp(hrefurl)

                     temparr.append(asin)

                     temparr.append(hrefurl)

                     # 这里记录一份副本到txt中，防止程序中断什么都没保存

                     txtcontent = ' '.join(temparr)

                     filename = time.strftime('%Y%m%d', time.localtime())

                     path = "../xls/" + filename

                     createjia(path)

                     file = open(path + "/" + filename + ".txt", "a")

                     file.write("\n" + txtcontent)

                     file.close()

                     # 这里是解析详情页，如果详情页有price，就抓取review下来

                     # 并且将抓取的东西储存到数组，并写入excel中

                     # 解析详情页

                     htmldetail = gethtml(hrefurl)

                     if 'id="words"' in htmldetail or 'ap_email' in htmldetail or "Amazon.com Page Not Found" in htmldetail:

                         print("抓取得太快！需要重新登陆...")

                         openbrowser(Loginurl)

                         htmldetail = gethtml(hrefurl)

                     # BeautifulSoup解析需要的东西

                     soups = BeautifulSoup(htmldetail, "html.parser")

                     # 筛选出商品的centerCol列表

                     centerCols = soups.findAll('div', attrs={'id': "centerCol"})

                     if centerCols:

                         for item in centerCols:

                             temp = item.find("td", attrs={"id": "priceblock_ourprice_lbl"})

                             if temp == None:

                                 # 得到评分等级

                                 star = item.find("a", attrs={"id": "reviewStarsLinkedCustomerReviews"}).get_text()

                                 # 得到评分人数

                                 reviews = item.find("span", attrs={"id": "acrCustomerReviewText"}).get_text()

                                 # 将抓取的东西写入数组

                                 if star:

                                     temparr.append(star.strip().replace(" out of 5 stars", ""))

                                 else:

                                     temparr.append("")

                                 if reviews:

                                     temparr.append(reviews.strip().replace(" customer reviews", ""))

                                 else:

                                     temparr.append("")

                                 information.append(temparr)

                                 print(information)

                     else:

                         temparr.append("")

                         temparr.append("")

                         information.append(temparr)

                         print(information)

     return information

 def begin():

     taoyanbai = '''

             -----------------------------------------

             | 欢迎使用后台爬虫系统                   |

             | 时间：2016年10月21日                  |

             | 出品：技术部                          |

             -----------------------------------------

         '''

     print(taoyanbai)

 if __name__ == "__main__":

     a = time.clock()

     while 1:

         try:

             LoginWhere = int(input("抓取美国请按0，日本请按1："))

             if LoginWhere == 0:

                 Loginurl = "https://sellercentral.amazon.com/"

                 break

             elif LoginWhere == 1:

                 Loginurl = "https://sellercentral.amazon.co.jp/"

                 break

         except:

             print("请正确输入0或1！！")

             LoginWhere = int(input("抓取美国请按0，日本请按1："))

     keywords = input("请输入查找的关键词：")

     keyword = keywords.replace(" ", "+")

     print("正在检查登陆状态...")

     if "jp" in Loginurl:

         seekurl = "https://sellercentral.amazon.co.jp/productsearch?q=" + str(keyword)

     else:

         seekurl = "https://sellercentral.amazon.com/productsearch?q=" + str(keyword)

     try:

         htmlpage = gethtml(seekurl)

     except Exception as err:

         input("网络似乎有点问题...")

         print(err)

         exit()

     while 1:

         if 'ap_email' in htmlpage or "Amazon.com Page Not Found" in htmlpage or "<title>404" in htmlpage:

             print("cookie已经过期，需要重新登陆...")

             print("等待网页打开...")

             openbrowser(Loginurl)

             htmlpage = gethtml(seekurl)

         else:

             print("直接使用cookie登陆...")

             break

     # BeautifulSoup解析需要的东西

     soups = BeautifulSoup(htmlpage, "html.parser")

     # 筛选出类别及其网址

     categorys = soups.findAll('ul', attrs={'class': "a-nostyle a-vertical"})

     categoryurl = []

     categoryname = ""

     pagenum = []

     filtername = []

     for item in categorys:

         for temp in item.find_all("a"):

             hrefurl = temp.get('href')

             categoryurl.append(hrefurl)

         for temp in item.find_all("span", attrs={"class", "a-color-tertiary"}):

             spantext = temp.get_text()

             pagenum.append(getpagenum(spantext))

     for i in range(0, len(categoryurl)):

         name = getfilter(categoryurl[i])

         filtername.append(name)

         categoryname = categoryname + "抓取(" + str(name) + ")请按" + str(i) + ","

     # 选择抓取的类型

     try:

         print(categoryname)

         selectcategory = int(input("请选择你要抓取类型的数字号码："))

     except:

         print("请正确输入前面的数字!!!")

         print(categoryname)

         selectcategory = int(input("请选择你要抓取类型的数字编码："))

     filter = filtername[selectcategory]

     mustpage = int(pagenum[selectcategory]) // 10

     try:

         print("温馨提醒：（1）后台仅仅展现1000页...（2）你要抓取的类型大约有" + str(mustpage) + "页...")

         page = int(input("请问你要抓取多少页？（默认15页）："))

         if page > 1000:

             print("后台最多只能看到1000页！！！")

             page = int(input("后台仅仅展现1000页！！！你要抓取的类型大约有" + str(mustpage) + "页！！！请问你要抓取多少页？（默认15页）："))

     except:

         page = 15

     # 储存抓取到的东西

     information = []

     temparr = []

     for i in range(0, page):

         try:

             if "jp" in Loginurl:

                 # https://sellercentral.amazon.co.jp/productsearch?filter=sporting&q=空気入れ&page=2

                 openurl = "https://sellercentral.amazon.co.jp/productsearch?filter=" + str(filter) + "&q=" + str(

                         keyword) + "&page=" + str(i + 1)

             else:

                 # https://sellercentral.amazon.com/productsearch?filter=pets&q=dog

                 openurl = "https://sellercentral.amazon.com/productsearch?filter=" + str(filter) + "&q=" + str(

                         keyword) + "&page=" + str(i + 1)

             print("开始抓取：" + str(openurl))

             openhtml = gethtml(openurl)

             # BeautifulSoup解析需要的东西

             soups = BeautifulSoup(openhtml, "html.parser")

             # 筛选出商品的div

             sellyours = soups.findAll('div', attrs={'class': "product"})

             if 'ap_email' in openhtml or "Amazon.com Page Not Found" in openhtml:

                 print("抓取得太快！需要重新登陆...")

                 openbrowser(Loginurl)

                 openhtml = gethtml(openurl)

             elif sellyours == None:

                 print("已经翻到最后一页了...")

                 break

             temparr = getinfo(openhtml, Loginurl)

         except Exception as err:

             print(err)

             print("访问抓取过程中出现小错误...")

             print("暂停20秒记录bug并尝试自我修复...")

             time.sleep(20)

         if temparr:

             information.append(temparr[0])

         loadtime = random.randint(5, 10)

         print("防止反爬虫设定暂停" + str(loadtime) + "秒...")

         time.sleep(loadtime)

     print("抓到的列表如下：")

     print(information)

     # 这里写入excel

     # 创建文件夹

     filename = time.strftime('%Y%m%d', time.localtime())

     path = "../xls/" + filename

     createjia(path)

     # 写入excel

     timename = time.strftime('%Y%H%M%S', time.localtime())

     with xlsxwriter.Workbook(path + "/" + timename + '.xlsx') as workbook:

         # workbook = xlsxwriter.Workbook(path + "/" + timename + '.xlsx')

         worksheet = workbook.add_worksheet()

         first = ['title', 'UPC', 'EAN', 'Rank', 'Nothing', 'ASIN', 'DetailUrl', 'Star', 'Reviews']

         # 写入第一行

         for i in range(0, len(first)):

             worksheet.write(0, i, first[i])

         # 写入后面几行

         for m in range(0, len(information)):

             for n in range(0, len(information[m])):

                 insert = str(information[m][n]).replace("UPC: ", "").replace("EAN: ", "").replace("Sales Rank:",

                                                                                                   "").replace(

                         "customer reviews", "").replace("out of 5 stars", "")

                 worksheet.write(m + 1, n, insert)

         workbook.close()

     b = time.clock()

     print('运行时间：' + timetochina(b - a))

     input('请关闭窗口')  ##防止运行完毕后窗口直接关闭而看不到运行时间

由于selenium库支持低版本的浏览器，例如本文的谷歌浏览器需要下载插件，并将插件放到目录C:\Python34即可：

插件为chromedriver.exe，自己搜索，网上很多哒

Amazon后台模拟登陆的更多相关文章

Amazon后台登陆以及跟卖
亚马逊模拟登陆,这里使用的是selenium来登陆,并判断是否登陆成功,以及是否有验证码,并破解验证码登陆. 跟卖主要解决的难题是selenium的新窗口弹出问题,在 # 点击“出售您的” brows ...
php 的curl 模拟登陆
做一个类似这样的web 应用. 1,解决掉验证码其实这是正方的一个小bug,当我们进入登陆界面时,浏览器会去请求服务器,服务器会生成一个验证码图片.如果我们不去请求这个图片,那么正方后台也不会生成相 ...
python模拟登陆之下载
好长时间没有更新博客了,哈哈. 今天公司给了这么一个需求,现在我们需要去淘宝获取上一天的订单号,然后再根据订单号去另一个接口去获取订单详情,然后再给我展示到web! 中间涉及到的技术点有: 模拟登陆 ...
以正方教务系统为例，用php模拟登陆抓取课表、空教室
课程格子和超级课程表这两个应用,想必大学生都很熟悉,使用自己的学号和教务系统的密码,就可以将自己的课表导入,随时随地都可以在手机上查看. 其实稍微了解一点php的话,我们也可以做一个类似这样的web ...
HttpClient+Jsoup模拟登陆贺州学院教务系统，获取学生个人信息
前言注:可能学校的教务系统已经做了升级,当前的程序不知道还能不能成功获取信息,加上已经毕业,我的账户已经被注销,试不了,在这里做下思路跟过程的记录. 在我的毕业设计中”基于SSM框架贺州学院校园二手 ...
使用ApiPost测试接口时需要先登录怎么办？利用Cookie模拟登陆！
ApiPost简介: ApiPost是一个支持团队协作,并可直接生成文档的API调试.管理工具.它支持模拟POST.GET.PUT等常见请求,是后台接口开发者或前端.接口测试人员不可多得的工具 . 下 ...
第三百四十三节，Python分布式爬虫打造搜索引擎Scrapy精讲—scrapy模拟登陆和知乎倒立文字验证码识别
第三百四十三节,Python分布式爬虫打造搜索引擎Scrapy精讲—scrapy模拟登陆和知乎倒立文字验证码识别第一步.首先下载,大神者也的倒立文字验证码识别程序下载地址:https://gith ...
.net+jquery+ashx实现客户端模拟登陆扩展
客户端实现:login namespace LoginApp { partial class Form1 { /// <summary> /// 必需的设计器变量. /// </su ...
Python爬虫学习笔记之模拟登陆并爬去GitHub
(1)环境准备: 请确保已经安装了requests和lxml库 (2)分析登陆过程: 首先要分析登陆的过程,需要探究后台的登陆请求是怎样发送的,登陆之后又有怎样的处理过程. 如果已经 ...

随机推荐

js实现图片预显示
html页面代码 <div id="localImag" style="display:none"><img id="previe ...
Android屏幕适配常识
屏幕适配的注意事项 1. AndroidManifest.xml设置在中Menifest中添加子元素 android:anyDensity="true"时,应用程序安装在不同密度 ...
IOS 作业项目(4)步步完成画图程序(问题处理)终结
一,解决换色程序崩溃问题程序崩溃是因为颜色的内存被释放,添加如下类的内容即可 @implementation TuyaPath - (id)init { self = [super init]; i ...
# 20145210 《Java程序设计》第05周学习总结
教材学习内容总结第八章异常处理 8.1语法与继承架构 •使用 try.catch •Java中所有信息都会被打包为对象,如果愿意,可以尝试(try)捕捉(catch)代表错误的对象后做一些处理 • ...
CODEVS1380 没有上司的舞会（树形DP）
f[i,0] 表示第i个人不参加舞会 f[i,1] 表示第i个人参加舞会 f[i,1]=sigma(f[j,0])+v[i] j 为 i 的孩子 f[i,1]=sigma(max(f[j,0] ...
错误处理：加载文件或程序集“ICSharpCode.SharpZipLib”
解决方案:如果你的程序是2.0的,则删除 C:/WINDOWS/Microsoft.NET/Framework/v2.0.50727/中的所有的文件如果是4.0的,删除C:/WINDOWS/Micr ...
【题解】【BST】【Leetcode】Validate Binary Search Tree
Given a binary tree, determine if it is a valid binary search tree (BST). Assume a BST is defined as ...
CSS2伪类选择器要点
有四个选择器,分别是 hover:鼠标悬停 link:链接不能使用时 visited:链接被点击后 active:链接被点击时,如果鼠标不放,就会一直触发active属性 link在w3c中记录为链接 ...
UIImage加载图片的两种方法区别
Apple官方的文档为生成一个UIImage对象提供了两种方法加载图片: 1. imageNamed,其参数为图片的名字: 2. imageWithContentsOfFile,其参数也是图片文件的路 ...
如何在ALV_Grid的函数中定义下拉列表
转自 http://www.cnblogs.com/VerySky/articles/2392262.htmlABAP--如何在ALV_Grid的函数中定义下拉列表 REPORT. ********* ...

Amazon后台模拟登陆

Amazon后台模拟登陆的更多相关文章

随机推荐

热门专题