动态网页爬取例子（WebCollector+selenium+phantomjs）

目标：动态网页爬取

说明：这里的动态网页指几种可能：1）需要用户交互，如常见的登录操作；2）网页通过JS /
AJAX动态生成，如一个html里有<div id="test"></div>，通过JS生成<div
id="test"><span>aaa</span></div>。

这里用 WebCollector 2 做爬虫，它用起来很方便；不过要支持动态网页，关键还得靠另一个 API —— selenium 2（集成了 htmlunit 和 phantomjs）。

1）需要登录后的爬取，如新浪微博

import java.util.Set;
import cn.edu.hfut.dmic.webcollector.crawler.DeepCrawler;
import cn.edu.hfut.dmic.webcollector.model.Links;
import cn.edu.hfut.dmic.webcollector.model.Page;
import cn.edu.hfut.dmic.webcollector.net.HttpRequesterImpl;
import org.openqa.selenium.Cookie;
import org.openqa.selenium.WebElement;
import org.openqa.selenium.htmlunit.HtmlUnitDriver;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
/*
* 登录后爬取
* Refer: http://nutcher.org/topics/33
* https://github.com/CrawlScript/WebCollector/blob/master/README.zh-cn.md
* Lib required: webcollector-2.07-bin, selenium-java-2.44.0 & its lib
*/
/**
 * Crawls a weibo.cn timeline as a logged-in user.
 *
 * <p>The constructor logs in through {@link WeiboCN#getSinaCookie(String, String)} and installs
 * the resulting cookie on the crawler's HTTP requester, so every subsequent page request is
 * authenticated.
 */
public class WebCollector1 extends DeepCrawler {

    /** Number of timeline pages seeded by {@link #main(String[])}. */
    private static final int PAGES_TO_CRAWL = 5;

    /**
     * Creates the crawler and tries to obtain a weibo.cn login cookie.
     *
     * <p>The account and password are transmitted in plain text, so use a throwaway account.
     * A failed login is only reported on stderr; the crawler is still created, just without
     * a cookie.
     *
     * @param crawlPath directory handed to {@code DeepCrawler} for its crawl data
     */
    public WebCollector1(String crawlPath) {
        super(crawlPath);
        try {
            String cookie = WebCollector1.WeiboCN.getSinaCookie("yourAccount", "yourPwd");
            HttpRequesterImpl myRequester = (HttpRequesterImpl) this.getHttpRequester();
            myRequester.setCookie(cookie);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Prints the text of every {@code div.c} element (one per weibo post) of the fetched page.
     *
     * @param page the fetched page
     * @return always {@code null}; to crawl comments as well, extract the comment-page URLs
     *         here and return them instead
     */
    @Override
    public Links visitAndGetNextLinks(Page page) {
        Elements weibos = page.getDoc().select("div.c");
        for (Element weibo : weibos) {
            System.out.println(weibo.text());
        }
        return null;
    }

    public static void main(String[] args) {
        WebCollector1 crawler = new WebCollector1("/home/hu/data/weibo");
        crawler.setThreads(3);
        // Seed the first PAGES_TO_CRAWL pages of one user's timeline. Page numbers start at 1
        // here (NOTE(review): assumes weibo.cn pagination is 1-based, so page=0 would not be a
        // distinct page -- confirm against the site).
        for (int pageNo = 1; pageNo <= PAGES_TO_CRAWL; pageNo++) {
            crawler.addSeed("http://weibo.cn/zhouhongyi?vt=4&page=" + pageNo);
        }
        try {
            crawler.start(1);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /** Login helper for weibo.cn. */
    public static class WeiboCN {

        /**
         * Logs in to weibo.cn with a JavaScript-enabled HtmlUnit browser and returns the
         * session cookies as a single {@code name=value;name=value;} string.
         *
         * <p>This works for weibo.cn only, not for weibo.com. weibo.cn transmits the
         * credentials in plain text, so use a throwaway account.
         *
         * @param username Sina Weibo user name
         * @param password Sina Weibo password
         * @return the cookie string of the logged-in session
         * @throws Exception if the returned cookies do not contain {@code gsid_CTandWM}
         *         (login failed), or if the login form could not be driven
         */
        public static String getSinaCookie(String username, String password) throws Exception {
            HtmlUnitDriver driver = new HtmlUnitDriver();
            Set<Cookie> cookieSet;
            try {
                driver.setJavascriptEnabled(true);
                driver.get("http://login.weibo.cn/login/");
                driver.findElementByCssSelector("input[name=mobile]").sendKeys(username);
                // The password field name carries a generated suffix, hence the prefix match.
                driver.findElementByCssSelector("input[name^=password]").sendKeys(password);
                driver.findElementByCssSelector("input[name=remember]").click();
                driver.findElementByCssSelector("input[name=submit]").click();
                cookieSet = driver.manage().getCookies();
            } finally {
                // Release the headless browser even when a lookup or click above throws.
                driver.quit();
            }

            StringBuilder sb = new StringBuilder();
            for (Cookie cookie : cookieSet) {
                sb.append(cookie.getName()).append('=').append(cookie.getValue()).append(';');
            }
            String result = sb.toString();
            if (!result.contains("gsid_CTandWM")) {
                throw new Exception("weibo login failed");
            }
            return result;
        }
    }
}

* 这里有个自定义路径 /home/hu/data/weibo（见 WebCollector1 crawler=new WebCollector1("/home/hu/data/weibo");），它是把爬取信息保存到嵌入式数据库 Berkeley DB 时使用的目录。

* 总体上来自Webcollector 作者的sample。

2）JS动态生成HTML元素的爬取

import java.util.List;
import org.openqa.selenium.By;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.WebElement;
import cn.edu.hfut.dmic.webcollector.crawler.DeepCrawler;
import cn.edu.hfut.dmic.webcollector.model.Links;
import cn.edu.hfut.dmic.webcollector.model.Page;
/*
* JS爬取
* Refer: http://blog.csdn.net/smilings/article/details/7395509
*/
/**
 * Crawls a page whose content is generated by JavaScript, using a real browser engine
 * obtained from {@code PageUtils} to render it before extracting elements.
 */
public class WebCollector3 extends DeepCrawler {

    /**
     * @param crawlPath directory handed to {@code DeepCrawler} for its crawl data
     */
    public WebCollector3(String crawlPath) {
        super(crawlPath);
    }

    /**
     * Renders the page in a browser and prints the text of every {@code #feed_content span}
     * element.
     *
     * <p>Alternatives to {@code PageUtils.getWebDriver(page)} are
     * {@code PageUtils.getDriver(page, BrowserVersion.CHROME)} (HtmlUnit) and
     * {@code PageUtils.getPhantomJSDriver(page)} (raw PhantomJS output as a string).
     *
     * @param page the fetched page; only its URL is used, the browser reloads it
     * @return always {@code null}; no follow-up links are extracted
     */
    @Override
    public Links visitAndGetNextLinks(Page page) {
        WebDriver driver = PageUtils.getWebDriver(page);
        try {
            List<WebElement> divInfos = driver.findElements(By.cssSelector("#feed_content span"));
            for (WebElement divInfo : divInfos) {
                System.out.println("Text是：" + divInfo.getText());
            }
        } finally {
            // One browser is started per visited page; without quit() each one stays alive.
            driver.quit();
        }
        return null;
    }

    public static void main(String[] args) {
        WebCollector3 crawler = new WebCollector3("/home/hu/data/wb");
        // A single fixed URL is crawled, so it is seeded once. For a paginated target, add one
        // seed per page in a loop instead, e.g. "http://www.sogou.com/web?query=...&page=" + n.
        crawler.addSeed("http://cq.qq.com/baoliao/detail.htm?294064");
        try {
            crawler.start(1);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}

PageUtils.java

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import org.openqa.selenium.JavascriptExecutor;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.chrome.ChromeDriver;
import org.openqa.selenium.htmlunit.HtmlUnitDriver;
import org.openqa.selenium.ie.InternetExplorerDriver;
import org.openqa.selenium.phantomjs.PhantomJSDriver;
import com.gargoylesoftware.htmlunit.BrowserVersion;
import cn.edu.hfut.dmic.webcollector.model.Page;
/**
 * Factory helpers that load a crawled page's URL in a JavaScript-capable browser engine
 * (HtmlUnit, PhantomJS through Selenium, or PhantomJS started as a plain process).
 *
 * <p>Every returned driver is owned by the caller, who must call {@code quit()} on it.
 */
public class PageUtils {

    /** System property read by {@code PhantomJSDriver} to locate the PhantomJS executable. */
    private static final String PHANTOMJS_PROPERTY = "phantomjs.binary.path";

    /** PhantomJS executable used when {@link #PHANTOMJS_PROPERTY} is not already set. */
    private static final String DEFAULT_PHANTOMJS_BINARY =
            "D:\\Installs\\Develop\\crawling\\phantomjs-2.0.0-windows\\bin\\phantomjs.exe";

    /** PhantomJS script that prints the rendered HTML of the URL passed as its argument. */
    private static final String PARSER_SCRIPT =
            "D:\\workspace\\crawlTest1\\src\\crawlTest1\\parser.js";

    /**
     * Loads the page in a JavaScript-enabled HtmlUnit browser with the default browser version.
     *
     * @param page page whose URL is loaded
     * @return the driver positioned on the page
     */
    public static HtmlUnitDriver getDriver(Page page) {
        HtmlUnitDriver driver = new HtmlUnitDriver();
        driver.setJavascriptEnabled(true);
        return open(driver, page);
    }

    /**
     * Loads the page in a JavaScript-enabled HtmlUnit browser emulating the given version.
     *
     * @param page page whose URL is loaded
     * @param browserVersion browser version HtmlUnit should emulate
     * @return the driver positioned on the page
     */
    public static HtmlUnitDriver getDriver(Page page, BrowserVersion browserVersion) {
        HtmlUnitDriver driver = new HtmlUnitDriver(browserVersion);
        driver.setJavascriptEnabled(true);
        return open(driver, page);
    }

    /**
     * Loads the page in PhantomJS through Selenium.
     *
     * <p>The executable is taken from the {@code phantomjs.binary.path} system property when
     * it is already set, otherwise from the built-in default path. To try another engine,
     * construct {@code new HtmlUnitDriver(true)} or a {@code ChromeDriver} (after setting
     * {@code webdriver.chrome.driver}) here instead.
     *
     * @param page page whose URL is loaded
     * @return the driver positioned on the page
     */
    public static WebDriver getWebDriver(Page page) {
        System.setProperty(PHANTOMJS_PROPERTY, phantomJsBinary());
        return open(new PhantomJSDriver(), page);
    }

    /**
     * Runs PhantomJS as a separate process (no Selenium) with the parser script and returns
     * everything the process wrote to standard output, decoded as UTF-8.
     *
     * <p>Line terminators in the output are dropped: the lines are concatenated as-is.
     *
     * @param page page whose URL (trimmed) is passed to the script
     * @return the process output, or {@code null} if starting the process or reading its
     *         output failed (the error is printed to stderr)
     */
    public static String getPhantomJSDriver(Page page) {
        Process process = null;
        BufferedReader br = null;
        try {
            // Argument array instead of one concatenated command string: the URL is passed
            // as exactly one argument even if it contains whitespace.
            String[] command = {phantomJsBinary(), PARSER_SCRIPT, page.getUrl().trim()};
            process = Runtime.getRuntime().exec(command);
            br = new BufferedReader(new InputStreamReader(process.getInputStream(), "UTF-8"));
            StringBuilder content = new StringBuilder();
            String line;
            while ((line = br.readLine()) != null) {
                content.append(line);
            }
            return content.toString();
        } catch (IOException e) {
            e.printStackTrace();
            return null;
        } finally {
            closeQuietly(br);
            if (process != null) {
                // Output has been fully read (or reading failed); make sure nothing lingers.
                process.destroy();
            }
        }
    }

    /** Returns the configured PhantomJS executable, falling back to the built-in default. */
    private static String phantomJsBinary() {
        return System.getProperty(PHANTOMJS_PROPERTY, DEFAULT_PHANTOMJS_BINARY);
    }

    /** Navigates the driver to the page URL; quits the driver if navigation throws. */
    private static <D extends WebDriver> D open(D driver, Page page) {
        try {
            driver.get(page.getUrl());
        } catch (RuntimeException e) {
            driver.quit();
            throw e;
        }
        return driver;
    }

    /** Closes the reader if present; a failure to close is reported but not propagated. */
    private static void closeQuietly(BufferedReader reader) {
        if (reader == null) {
            return;
        }
        try {
            reader.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}

2.1）返回 HtmlUnitDriver 的 getDriver 是 selenium 1.x 时期的做法，已经过时了；现在改用返回 WebDriver 的 getWebDriver。

2.2）这里用了几种方法：HtmlUnitDriver、ChromeDriver、PhantomJSDriver 和原生 PhantomJS（参考 http://blog.csdn.net/five3/article/details/19085303 ），它们各自的优缺点如下：

driver类型	优点	缺点	应用
真实浏览器driver	真实模拟用户行为	效率、稳定性低	兼容性测试
HtmlUnit	速度快	js引擎不是主流的浏览器支持的	包含少量js的页面测试
PhantomJS	速度中等、模拟行为接近真实	不能模拟不同/特定浏览器的行为	非GUI的功能性测试

* 真实浏览器driver 包括 Firefox, Chrome, IE

2.3）用 PhantomJSDriver 的时候遇到错误：ClassNotFoundException: org.openqa.selenium.browserlaunchers.Proxies，原因竟然是 selenium 2.44 的 bug，后来通过 maven 找到 phantomjsdriver-1.2.1.jar 才解决了。

2.4）另外，我还试了PhantomJS 原生调用（也就是不用selenium，直接调用PhantomJS，见上面的方法），原生要调用JS，这里的parser.js代码如下：

// PhantomJS script: loads the URL given as the first script argument and prints the
// rendered page HTML to standard output, where the Java caller reads it from the
// process's InputStream.
// Usage: phantomjs parser.js <url>
var system = require('system');

if (system.args.length < 2) {
    // args[0] is the script name; args[1] must be the URL to load.
    console.log('Usage: phantomjs parser.js <url>');
    phantom.exit(1);
} else {
    var url = system.args[1];
    var page = require('webpage').create();
    page.open(url, function (status) {
        if (status !== 'success') {
            console.log('Unable to post!');
            // Non-zero exit status so a caller can tell a failed load from a rendered page.
            phantom.exit(1);
        } else {
            // Emit the rendered document; the caller consumes it as a stream.
            console.log(page.content);
            phantom.exit();
        }
    });
}

3）后话

3.1）HtmlUnitDriver + PhantomJSDriver是当前最可靠的动态抓取方案。

3.2）这过程中用到很多包、exe，遇到很多的墙~，有需要的朋友可以找我要。

Reference

http://www.ibm.com/developerworks/cn/web/1309_fengyq_seleniumvswebdriver/
http://blog.csdn.net/smilings/article/details/7395509
http://phantomjs.org/download.html
http://blog.csdn.net/five3/article/details/19085303
http://phantomjs.org/quick-start.html

动态网页爬取例子（WebCollector+selenium+phantomjs）的更多相关文章

Node.js 动态网页爬取 PhantomJS 使用入门(转)
Node.js 动态网页爬取 PhantomJS 使用入门原创NeverSettle101 发布于2017-03-24 09:34:45 阅读数 8309 收藏展开版权声明:本文为 winte ...
python动态网页爬取——四六级成绩批量爬取
需求: 四六级成绩查询网站我所知道的有两个:学信网(http://www.chsi.com.cn/cet/)和99宿舍(http://cet.99sushe.com/),这两个网站采用的都是动态网页. ...
爬虫入门（三）——动态网页爬取：爬取pexel上的图片
Pexel上有大量精美的图片,没事总想看看有什么好看的自己保存到电脑里可能会很有用但是一个一个保存当然太麻烦了所以不如我们写个爬虫吧(๑•̀ㅂ•́)و✧ 一开始学习爬虫的时候希望爬取pexel上的 ...
动态网页爬取样例（WebCollector+selenium+phantomjs）
目标:动态网页爬取说明:这里的动态网页指几种可能:1)须要用户交互,如常见的登录操作:2)网页通过JS / AJAX动态生成.如一个html里有<div id="test" ...
Python开发爬虫之动态网页抓取篇：爬取博客评论数据——通过Selenium模拟浏览器抓取
区别于上篇动态网页抓取,这里介绍另一种方法,即使用浏览器渲染引擎.直接用浏览器在显示网页时解析 HTML.应用 CSS 样式并执行 JavaScript 的语句. 这个方法在爬虫过程中会打开一个浏览器 ...
【图文详解】scrapy爬虫与动态页面——爬取拉勾网职位信息（2）
上次挖了一个坑,今天终于填上了,还记得之前我们做的拉勾爬虫吗?那时我们实现了一页的爬取,今天让我们再接再厉,实现多页爬取,顺便实现职位和公司的关键词搜索功能. 之前的内容就不再介绍了,不熟悉的请一定要 ...
爬虫系列5：scrapy动态页面爬取的另一种思路
前面有篇文章给出了爬取动态页面的一种思路,即应用Selenium+Firefox(参考<scrapy动态页面爬取>).但是selenium需要运行本地浏览器,比较耗时,不太适合大规模网页抓 ...
python网络爬虫-动态网页抓取（五）
动态抓取的实例在开始爬虫之前,我们需要了解一下Ajax(异步请求).它的价值在于在与后台进行少量的数据交换就可以使网页实现异步更新. 如果使用Ajax加载的动态网页抓取,有两种方法: 通过浏览器审查 ...
使用urllib进行网页爬取
# coding=gbk # 抓取开奖号码 # url:http://datachart.500.com/dlt/zoushi/jbzs_foreback.shtml ''' 对网页逐行迭代,找到目标 ...

随机推荐

【转】Nginx+Tomcat搭建高性能负载均衡集群
最近对负载均衡比较感兴趣,研究了公司的负载均衡的配置,用的是阿里的SLB,相当于不用运维,只需要在后台进行简单的配置就能完成Tomcat的负载均衡,索性在网上找了几篇文章去尝试搭建一个集群,然而很多都 ...
spring注解开发中常用注解以及简单配置
一.spring注解开发中常用注解以及简单配置 1.为什么要用注解开发:spring的核心是Ioc容器和Aop,对于传统的Ioc编程来说我们需要在spring的配置文件中邪大量的bean来向sprin ...
基础算法-查找：线性索引查找（I）
前面介绍的几种查找的算法都是基于数据有序的基础上进行的.但是在实际的应用中,很多数据集可能有惊人的数据量,面对这些海量的数据,要保证记录全部按照当中的某个关键字有序,其时间代价是非常昂贵的,所以这种数 ...
kinect for windows - DepthBasics-D2D详解之一
Depth在kinect中经常被翻译为深度图,指的是图像到摄像头的距离,这些距离数据能让机器知道物理距离有多远.kinect通过两个红外摄像头来实现这个功能的.在这个例子里,就实现了深度图的提取和现实 ...
django在视图中使用模板
在视图中使用模板在学习了模板系统的基础之后,现在让我们使用相关知识来创建视图. 重新打开我们在前一章在 mysite.views 中创建的 current_datetime 视图. 以下是其内容 ...
android 时间滚动控件底部弹出
下载地址:http://download.csdn.net/detail/ldd119/7440895 转载请说明出处先上个效果图 watermark/2/text/aHR0cDovL2Jsb2cu ...
Android Texting（2）Testing Fundamentals 测试基础篇
Testing Fundamentals The Android testing framework, an integral part of the development environment, ...
如何成为CSDN博客专家
先看一下官方给出的要求: 申请CSDN博客专家应具备的条件: 1.原创IT类文章总数超过20篇,并且最近一个月内发布了新的原创IT类文章. 2.博客文章总的浏览量超过5万次以上. 3.文章内容的质量很 ...
ceph之crush map
编辑crush map: 1.获取crush map: 2.反编译crush map: 3.至少编辑一个设备,桶, 规则: 4.重新编译crush map: 5.重新注入crush map: 获取cr ...
haml、sass简单的解释
1. Haml 全名为 HTML Abstract Markup Language,主要就是让开发者能够使用缩排的方式撰写 HTML,做到永不忘记关 Tag 的效果. 例如:%h1= "He ...

动态网页爬取例子（WebCollector+selenium+phantomjs）

动态网页爬取例子（WebCollector+selenium+phantomjs）的更多相关文章

随机推荐

热门专题