python3爬虫-通过selenium获取到jd商品

from selenium import webdriver

from selenium.webdriver.common.by import By

from selenium.webdriver.common.keys import Keys

from selenium.webdriver.support.wait import WebDriverWait

from selenium.webdriver.support import expected_conditions as EC

from selenium.webdriver.chrome.options import Options

from selenium.common.exceptions import NoSuchElementException

from lxml import etree

import time, json

# First page the browser opens after start-up. Despite the "Login" suffix in the
# name, this is the JD home page URL, whose search box is used for every query.
JD_URL_Login = "https://www.jd.com/"

class CustomizeException(Exception):
    """Scraper-specific error that carries a numeric status and a message.

    Status codes used in this file: 10000 when the current goods has no
    further result page, 20000 when the list of goods to scrape is empty.

    Attributes:
        status: numeric code identifying the condition.
        msg: human-readable description of the condition.
    """

    def __init__(self, status, msg):
        # Forward both values so ``args`` stays ``(status, msg)``.
        super().__init__(status, msg)
        self.status, self.msg = status, msg

class JD:
    """Headless-Chrome scraper for JD search results.

    For every search keyword ("goods") the scraper types the keyword into the
    home-page search box, walks through all result pages and appends one JSON
    object per product to ``jd-<goods>.json`` (one object per line).

    Attributes:
        browser: the Selenium Chrome driver, or None once closed.
        wait: WebDriverWait bound to ``browser`` (10 s timeout).
        file: output file of the goods currently being scraped, or None.
    """

    def __init__(self):
        self.browser = None
        self.wait = None
        self.file = None
        self.__init_browser()

    def __init_browser(self):
        """Start headless Chrome, open the JD home page and create the wait helper."""
        options = Options()
        options.add_argument("--headless")
        options.add_experimental_option('excludeSwitches', ['enable-automation'])
        # No-image mode: do not load pictures.
        options.add_experimental_option("prefs", {"profile.managed_default_content_settings.images": 2})
        self.browser = webdriver.Chrome(options=options)
        # Maximize the browser window.
        self.browser.maximize_window()
        # Implicit wait of 3 s for element look-ups.
        self.browser.implicitly_wait(3)
        self.browser.get(JD_URL_Login)
        self.wait = WebDriverWait(self.browser, 10)

    def __close_file(self):
        """Close the current output file, if any."""
        current = getattr(self, "file", None)
        self.file = None
        if current is not None:
            current.close()

    def __search_goods(self, goods):
        """Open the output file for *goods* and submit *goods* to the search box."""
        # Close the previous keyword's file first; otherwise one handle per
        # keyword stays open until the process exits.
        self.__close_file()
        self.file = open("jd-{}.json".format(goods), "a", encoding="utf-8")
        search_input = self.wait.until(EC.presence_of_element_located((By.ID, "key")))
        search_input.clear()
        search_input.send_keys(goods, Keys.ENTER)

    @staticmethod
    def __first(values):
        """Return the first item of an XPath result list, or "" when it is empty."""
        return values[0] if values else ""

    def __get_goods_info(self, page_source):
        """Yield one dict per product container found in *page_source*.

        Fields are extracted relative to each ``div.gl-i-wrap`` container, so a
        product that lacks one field cannot shift the fields of later products.
        A field that is not found is reported as an empty string.

        NOTE(review): this assumes the comment count (``p-commit``) and shop
        link (``curr-shop``) sit inside the same ``gl-i-wrap`` container as the
        name and price -- confirm against the live page markup.

        Yields:
            dict with keys ``goods_name``, ``goods_price``, ``comment_num``
            and ``shop_name``.
        """
        selector_html = etree.HTML(page_source)
        for item in selector_html.xpath("//div[@class='gl-i-wrap']"):
            # TODO: the name is read from the link's title attribute; reading
            # the text content of the name element would be preferable.
            names = item.xpath(".//div[contains(@class,'p-name')]/a/@title")
            prices = item.xpath(".//div[@class='p-price']/strong/i/text()")
            comment_nodes = item.xpath(".//div[@class='p-commit']/strong")
            shops = item.xpath(".//a[@class='curr-shop']/text()")
            yield {
                "goods_name": self.__first(names),
                "goods_price": self.__first(prices),
                # string(.) flattens the <strong> element and its children to text.
                "comment_num": comment_nodes[0].xpath("string(.)") if comment_nodes else "",
                "shop_name": self.__first(shops),
            }

    def __swipe_page(self):
        """Scroll to the bottom until the page stops growing; return its source."""
        height = self.browser.execute_script("return document.body.scrollHeight;")
        while True:
            # scrollTo takes (x, y): keep x at 0 and only move vertically.
            self.browser.execute_script("window.scrollTo(0, arguments[0]);", height)
            # Give lazily loaded content a moment to extend the document.
            time.sleep(1)
            now_height = self.browser.execute_script("return document.body.scrollHeight;")
            if now_height == height:
                return self.browser.page_source
            height = now_height

    def __is_element_exists(self, xpath):
        """Return True when *xpath* matches an element on the current page."""
        try:
            self.browser.find_element(By.XPATH, xpath)
        except NoSuchElementException:
            return False
        return True

    def __click_next_page(self):
        """Click the "next page" link.

        Raises:
            CustomizeException: status 10000 when no link whose class is
                exactly ``pn-next`` exists, which is treated as "no more pages".
            selenium TimeoutException: propagated from the wait when no element
                carrying the ``pn-next`` class shows up within the timeout.
        """
        self.wait.until(EC.presence_of_all_elements_located((By.CLASS_NAME, "pn-next")))
        xpath = "//a[@class='pn-next']"
        if not self.__is_element_exists(xpath):
            raise CustomizeException(10000, "该商品访问完毕")
        self.browser.find_element(By.XPATH, xpath).click()

    def __write_to_json(self, dic: dict):
        """Append *dic* to the current output file as one JSON line."""
        data_json = json.dumps(dic, ensure_ascii=False)
        self.file.write(data_json + "\n")

    def __scrape_goods(self, goods):
        """Search for *goods* and write every result page to its output file."""
        self.__search_goods(goods)
        page = 1
        while True:
            print("正在爬取商品 <{}>---第{}页......".format(goods, page))
            # Let the result page load before scrolling.
            time.sleep(3)
            html = self.__swipe_page()
            for dic in self.__get_goods_info(html):
                self.__write_to_json(dic)
            try:
                self.__click_next_page()
            except CustomizeException:
                # No further page for this goods.
                return
            page += 1

    def run(self, goods, pending=None):
        """Scrape *goods*, then every keyword remaining in *pending*.

        Keywords are handled one after another in a loop rather than by
        recursion, so each result page is scraped exactly once.

        Args:
            goods: first search keyword.
            pending: list of further keywords; it is consumed from the front
                while running. When omitted, a module-level list named
                ``goods_list`` is used if one exists (the behaviour callers of
                the one-argument form rely on); otherwise only *goods* is
                scraped.
        """
        if pending is None:
            pending = globals().get("goods_list")
            if pending is None:
                pending = []
        while True:
            self.__scrape_goods(goods)
            if not pending:
                return
            goods = pending.pop(0)

    def close(self):
        """Close the output file and shut the browser down; safe to call repeatedly."""
        try:
            self.__close_file()
        finally:
            # getattr: the object may be only partially initialised.
            browser = getattr(self, "browser", None)
            self.browser = None
            if browser is not None:
                # quit() ends the whole driver session, not just the window.
                browser.quit()

    def __del__(self):
        self.close()

if __name__ == '__main__':
    jd = JD()

    # Search keywords, scraped in order. JD.run() picks up the remaining
    # entries from this module-level list by name, so it has to stay a global
    # called ``goods_list``.
    goods_list = [
        "纯牛奶", "酸奶", "奶茶", "床上用品",
        "电磁炉", "电视", "小米笔记本", "华硕笔记本",
        "联想笔记本", "男士洗面奶", "女士洗面奶", "沐浴露",
        "洗发露", "牙刷", "牙膏", "拖鞋",
        "剃须刀", "水手服", "运动服", "红龙果",
        "苹果", "香蕉", "洗衣液", "电饭煲",
    ]

    # An empty list leaves nothing to start with.
    if not goods_list:
        raise CustomizeException(20000, "goods_list不能为空")
    goods = goods_list.pop(0)

    try:
        jd.run(goods)
    finally:
        # Dropping the last reference triggers JD.__del__, which releases the
        # browser and the output file.
        del jd

python3爬虫-通过selenium获取到jd商品的更多相关文章

python3爬虫-通过selenium获取TB商品
from selenium import webdriver from selenium.webdriver.support.wait import WebDriverWait from seleni ...
Python3.x：selenium获取iframe内嵌页面的源码
Python3.x:selenium获取iframe内嵌页面的源码前言在一些网页中经常会看到iframe/frame标签,iframe是嵌入式框架一般用来在已有的页面中嵌入另一个页面,当一个元素在 ...
小白学 Python 爬虫：Selenium 获取某大型电商网站商品信息
目标先介绍下我们本篇文章的目标,如图: 本篇文章计划获取商品的一些基本信息,如名称.商店.价格.是否自营.图片路径等等. 准备首先要确认自己本地已经安装好了 Selenium 包括 Chrome ...
python爬虫——用selenium爬取京东商品信息
1.先附上效果图(我偷懒只爬了4页) 2.京东的网址https://www.jd.com/ 3.我这里是不加载图片,加快爬取速度,也可以用Headless无弹窗模式 options = webdri ...
python3爬虫-通过requests获取安居客房屋信息
import requests from fake_useragent import UserAgent from lxml import etree from http import cookiej ...
爬虫之selenium爬取京东商品信息
import json import time from selenium import webdriver """ 发送请求 1.1生成driver对象 2.1窗口最大 ...
python3 [爬虫实战] selenium 爬取安居客
我们爬取的网站:https://www.anjuke.com/sy-city.html 获取的内容:包括地区名,地区链接: 安居客详情一开始直接用requests库进行网站的爬取,会访问不到数据的, ...
【Python3爬虫】selenium入门
selenium 是一个用于Web应用程序测试的工具.Selenium测试直接运行在浏览器中,就像真正的用户在操作一样.支持的浏览器包括IE(7, 8, 9, 10, 11),Mozilla Fire ...
python3爬虫-通过selenium登陆拉钩，爬取职位信息
from selenium import webdriver from selenium.common.exceptions import NoSuchElementException from se ...

随机推荐

vmware centos 安装
一.分区一块硬盘主分区+扩展分区最多只能有4个,其中扩展分区最多只能有1个.扩展分区不能写入数据,只能包含逻辑分区.这些都不是linux的限制,而是硬盘结构的限制. 分区号第一种分区法: |--- ...
python学习：数据类型检查
函数调用时可能会出现数据类型不匹配的问题,为了保证代码的鲁棒性,最好加上数据类型检查. 应用举例: if not isinstance(x, (int, float)): raise Typ ...
Ahead of Time Compilation (AOT) vs Just In Time (JIT) compilation approach
像java这样的高级语言,往往先做好部分编译,在运行时,再使用JIT将前面编译的中间件输出编译为机器语言,放到机器上运行.这可能会影响到运行的性能. 再比如,像angular的web 应用,angul ...
springMVC入门-09
这一节介绍SpringMVC对文件上传的支持,该功能支持需要使用到两个jar包:commons-fileupload-1.2.2.jar和commons-io-2.1.jar. 在controller类 ...
进程间协作---wait,notify,notifyAll
转自牛客网的一篇评论,解释的十分详细在 Java 中,可以通过配合调用 Object 对象的 wait() 方法和 notify()方法或 notifyAll() 方法来实现线程间的通信.在线程中调 ...
让两个对象间建立weak关系
让两个对象间建立weak关系这是为了给两个对象间建立weak关系,当一个对象被释放时,另外一个对象再获取这个值时就是nil,也就是不持有这个对象:) 源码: WeakRelatedDictionar ...
使用FBTweak
使用FBTweak https://github.com/facebook/Tweaks FBTweak是Facebook的一款开源库,用于微调数据而无需我们重复编译跑真机用的,它支持4种类型的cel ...
.Net WebApi 支持跨域访问使用 Microsoft.AspNet.WebApi.Cors
首先导入Cors库,通过程序包管理控制台导入 Install-Package Microsoft.AspNet.WebApi.Cors 引用库之后,我们需要进行简单的配置. 现在WebApiConfi ...
UE4中的AI行为树简单介绍
UE4引擎中可以实现简单AI的方式有很多,行为树是其中比较常用也很实用的AI控制方式,在官网的学习文档中也有最简单的目标跟踪AI操作教程,笔者在这里只作简单介绍. AIController->和 ...
【原创】Apache ab结果参数详解
解释如下: Server Software 服务器软件软件名称. Server Hostname 被测服务器的主机名. Server Port 被测试的Web服务器的监听端口. SSL/TLS Pro ...

python3爬虫-通过selenium获取到jd商品

python3爬虫-通过selenium获取到jd商品的更多相关文章

随机推荐

热门专题