/**
 * Creates a new HtmlUnit {@link WebClient} that emulates Firefox 24 and is
 * pre-configured for plain HTML fetching: no JavaScript, no CSS, and no
 * exceptions on failing HTTP status codes or script errors.
 *
 * <p>A fixed set of browser-like request headers is registered on the client.
 *
 * @return a freshly configured client; the caller owns it
 */
private WebClient getAWebClient() {
    WebClient webClient = new WebClient(BrowserVersion.FIREFOX_24);
    // Timeout value is passed straight to HtmlUnit (milliseconds per its documentation).
    webClient.getOptions().setTimeout(20000);
    webClient.getOptions().setThrowExceptionOnFailingStatusCode(false);
    webClient.getOptions().setThrowExceptionOnScriptError(false);
    webClient.getOptions().setCssEnabled(false);
    webClient.getOptions().setJavaScriptEnabled(false);
    // Fixed: the media type was previously the invalid token "textml" instead of "text/html".
    webClient.addRequestHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8");
    webClient.addRequestHeader("Accept-Encoding", "gzip, deflate");
    webClient.addRequestHeader("Accept-Language", "en-US,en;q=0.5");
    webClient.addRequestHeader("Cache-Control", "max-age=0");
    webClient.addRequestHeader("Connection", "keep-alive");
    // NOTE(review): the Host header is hard-coded, so this client presumably must only be
    // used for www.amazon.com URLs - confirm against callers.
    webClient.addRequestHeader("Host", "www.amazon.com");
    webClient.addRequestHeader("User-Agent", "Mozilla/5.0 (X11; Linux x86_64; rv:24.0) Gecko/20100101 Firefox/24.0");
    return webClient;
}
/**
 * Crawls a web page and returns its XML serialization.
 *
 * <p>Before fetching, the client's cookies are cleared and, if the file named by
 * {@code cookiePathAppendRandom()} exists, the cookies serialized in it are
 * loaded into the client. The page is then fetched through {@code MyGetPage}.
 *
 * <p>When no page could be fetched, or the serialized page is 300 characters or
 * shorter, an {@code AmazonCrawlModel} describing this crawl is handed to
 * {@code exceptionFun} and the sentinel text {@code "getNullPage"} is returned.
 *
 * @param url the URL to crawl
 * @return the page as XML, or a builder containing {@code "getNullPage"} on failure
 */
public StringBuilder crawlPage(String url) {
    final String threadName = Thread.currentThread().getName();
    StringBuilder builder = new StringBuilder();
    logger.info(threadName + " crawl " + url);
    // Start from an empty cookie jar, then restore the cookies saved on disk (if any).
    webClient.getCookieManager().clearCookies();
    logger.info(threadName + " webClient.getCookieManager().clearCookies();");
    File file = new File(cookiePathAppendRandom());
    logger.info(threadName + " File file = new File(cookiePathAppendRandom());");
    if (file.exists()) {
        restoreCookiesFromFile(file);
    }
    logger.info(threadName + " MyGetPage start,url:" + url);
    HtmlPage page = MyGetPage(new StringBuffer(url));
    logger.info(threadName + " MyGetPage end,url:" + url);
    if (page == null) {
        // Models that failed during crawling could be collected in one list and sent
        // back to the server so they are re-added to the crawl assignment queue.
        logger.info("Page null!");
        AmazonCrawlModel model = new AmazonCrawlModel(crawlId, crawlURLId, url, depth, ischange);
        exceptionFun(model);
        return new StringBuilder("getNullPage");
    }
    logger.info(threadName + " builder.append(page.asXml());");
    builder.append(page.asXml());
    logger.info(threadName + " return builder;");
    // builder.length() gives the same value as toString().length() without copying the page.
    logger.info(threadName + " CrawlPage $Length=" + builder.length());
    if (builder.length() <= 300) {
        // A page this short is treated as a failed crawl, same as a null page.
        AmazonCrawlModel model = new AmazonCrawlModel(crawlId, crawlURLId, url, depth, ischange);
        exceptionFun(model);
        return new StringBuilder("getNullPage");
    }
    return builder;
}

/**
 * Loads a serialized {@code CookieStore} from {@code cookieFile} and adds each of
 * its cookies to the web client's cookie manager.
 *
 * <p>Any failure to open, read or deserialize the file is logged and the crawl
 * continues without the saved cookies (previously such a failure ended in a
 * NullPointerException). The streams are always closed.
 *
 * <p>NOTE(review): this uses native Java deserialization; it assumes the cookie
 * file is produced by this application and is trusted - confirm.
 *
 * @param cookieFile an existing file expected to hold a serialized cookie store
 */
private void restoreCookiesFromFile(File cookieFile) {
    CookieStore cookieStore = null;
    FileInputStream fin = null;
    ObjectInputStream in = null;
    try {
        fin = new FileInputStream(cookieFile);
        in = new ObjectInputStream(fin);
        Object stored = in.readObject();
        if (stored instanceof CookieStore) {
            cookieStore = (CookieStore) stored;
        } else {
            logger.error(Thread.currentThread().getName() + " cookie file does not contain a CookieStore: "
                    + cookieFile);
        }
    } catch (IOException e) {
        // Also covers FileNotFoundException if the file vanished after the exists() check.
        logger.error(e);
    } catch (ClassNotFoundException e) {
        logger.error(e);
    } finally {
        try {
            if (in != null) {
                in.close(); // also closes the wrapped FileInputStream
            } else if (fin != null) {
                fin.close();
            }
        } catch (IOException e) {
            logger.error(e);
        }
    }
    if (cookieStore == null) {
        return;
    }
    List<org.apache.http.cookie.Cookie> savedCookies = cookieStore.getCookies();
    for (org.apache.http.cookie.Cookie temp : savedCookies) {
        Cookie cookie = new Cookie(temp.getDomain(), temp.getName(), temp.getValue(), temp.getPath(),
                temp.getExpiryDate(), false);
        webClient.getCookieManager().addCookie(cookie);
    }
}
/**
 * Custom page fetch: loads {@code URL} and, whenever the returned page is titled
 * "Robot Check" (a captcha page), submits a captcha answer and repeats until a
 * non-captcha page comes back.
 *
 * <p>Retry policy:
 * <ul>
 *   <li>{@code MalformedURLException}: gives up immediately, since retrying the
 *       same URL cannot succeed (previously this retried forever).</li>
 *   <li>{@code UnknownHostException}: sleeps for the configured
 *       {@code unknowhost_sleeptime} minutes and retries, up to
 *       {@code unknowhost_maxtrytime} attempts.</li>
 *   <li>Any other exception (including a failing HTTP status): waits one second
 *       and retries, giving up once the failure counter exceeds 5.</li>
 *   <li>Thread interruption: the interrupt flag is restored and the method
 *       returns instead of continuing to retry.</li>
 * </ul>
 *
 * <p>After every attempt the method pauses for 1.5 to 2 seconds.
 *
 * <p>NOTE(review): the captcha loop itself is intentionally left unbounded
 * ("until success"); {@code AmazonGetCaptcha.GetCaptcha} presumably returns the
 * recognized captcha text - confirm.
 *
 * @param URL the address to fetch
 * @return the fetched page, or {@code null} when fetching was abandoned
 */
private HtmlPage MyGetPage(StringBuffer URL) {
    final String threadName = Thread.currentThread().getName();
    // Upper bound for consecutive click() failures before the whole fetch is retried.
    final int maxClickAttempts = 5;
    HtmlPage page = null;
    boolean flag = true; // true while another fetch attempt is required
    int TryTimeCnt = 1;
    int UnknowHostTryTimeCnt = 1;
    while (flag) {
        flag = false;
        try {
            logger.info(threadName + " webClient.getPage : " + URL + ",CrawlURL_id:"
                    + crawlURLId);
            page = webClient.getPage(URL.toString());
            Document doc = Jsoup.parse(page.asXml());
            int robotchecknum = 1;
            while (doc.select("title").text().equals("Robot Check")) {
                logger.info(threadName + " " + dayformat1.format(System.currentTimeMillis())
                        + " [Robot Check,URL:" + URL + "]");
                String captcha_str = AmazonGetCaptcha.GetCaptcha(new StringBuilder(doc.toString()));
                logger.info(threadName + " " + dayformat1.format(System.currentTimeMillis())
                        + " end AmazonGetCaptcha.GetCaptcha");
                logger.info(dayformat1.format(new Date()) + " " + threadName + " : "
                        + captcha_str);

                logger.info(threadName + " page.getForms().get(0) Start");
                HtmlForm form = page.getForms().get(0);
                logger.info(threadName + " page.getForms().get(0) End");

                logger.info(threadName + " form.getElementsByTagName(button).get(0) Start");
                HtmlButton button = (HtmlButton) form.getElementsByTagName("button").get(0);
                logger.info(threadName + " form.getElementsByTagName(button).get(0) End");

                logger.info(threadName + " setValueAttribute Start");
                form.getInputByName("field-keywords").setValueAttribute(captcha_str);
                logger.info(threadName + " setValueAttribute End");

                logger.info(threadName + " button.click Start");
                // Retry the click a bounded number of times; after that the last error is
                // rethrown so the outer handlers apply their own retry limits.
                boolean clicked = false;
                for (int attempt = 1; !clicked; attempt++) {
                    try {
                        page = button.click();
                        clicked = true;
                    } catch (Exception e1) {
                        logger.error(threadName + " button.click出错了: " + e1);
                        if (attempt >= maxClickAttempts) {
                            throw e1;
                        }
                    }
                }
                logger.info(threadName + " button.click end");
                while (page.asXml() == null) {
                    logger.info(threadName + " page xml null");
                    logger.info(threadName + " " + page.asXml());
                    // Keep the refreshed page; the result used to be discarded, so this
                    // loop could never make progress.
                    page = (HtmlPage) page.refresh();
                    logger.info(threadName + " refresh End!");
                }
                logger.info(threadName + " button.click End");
                logger.info(threadName + " Start ParsePage!");
                doc = Jsoup.parse(page.asXml());
                if (!doc.select("title").text().equals("Robot Check")) {
                    logger.info(threadName + " " + doc.select("title").text());
                    logger.info(threadName + " "
                            + dayformat1.format(System.currentTimeMillis()) + " [Robot Check,captcha success:"
                            + captcha_str + ",try num:" + robotchecknum + "]");
                }
                robotchecknum++;
            }
        } catch (MalformedURLException e) {
            // The same URL would fail identically on every retry, so stop here.
            logger.error(threadName + " " + e);
            return null;
        } catch (UnknownHostException e) {
            logger.error(threadName + " " + e);
            flag = true;
            int sleepMinutes = Integer.parseInt(Configuration.getProperties("unknowhost_sleeptime"));
            logger.info("found UnknownHostException,start sleep " + sleepMinutes + " min");
            try {
                // long arithmetic: minutes * 60000 must not overflow int
                Thread.sleep(60000L * sleepMinutes);
            } catch (InterruptedException e1) {
                logger.error(threadName + " " + e1);
                Thread.currentThread().interrupt();
                return null;
            }
            logger.info("found UnknownHostException,end sleep " + sleepMinutes + " min");
            UnknowHostTryTimeCnt++; // one more unknown-host failure
            logger.info(threadName + " " + dayformat1.format(System.currentTimeMillis())
                    + " [UnknowHostTryTimeCnt:" + UnknowHostTryTimeCnt + "]");
            if (UnknowHostTryTimeCnt > Integer.parseInt(Configuration.getProperties("unknowhost_maxtrytime"))) {
                return null;
            }
        } catch (Exception eq) {
            // Covers FailingHttpStatusCodeException as well, so it is now bounded too.
            logger.error(threadName + " " + eq);
            TryTimeCnt++; // one more access failure
            logger.info(threadName + " " + dayformat1.format(System.currentTimeMillis())
                    + " [TryTimeCnt:" + TryTimeCnt + "]");
            if (TryTimeCnt > 5) {
                return null;
            }
            try {
                Thread.sleep(1000);
            } catch (InterruptedException e) {
                logger.error(threadName + e);
                Thread.currentThread().interrupt();
                return null;
            }
            flag = true;
        }
        try {
            // Randomized pause between requests.
            Thread.sleep(random.nextInt(500) + 1500);
        } catch (InterruptedException e) {
            logger.error(threadName + e);
            Thread.currentThread().interrupt();
            // Hand back the page if this attempt succeeded; otherwise report failure.
            return flag ? null : page;
        }
    }
    return page;
}

Java WebClient 总结的更多相关文章

  1. Spark案例分析

    一.需求:计算网页访问量前三名 import org.apache.spark.rdd.RDD import org.apache.spark.{SparkConf, SparkContext} /* ...

  2. C#调用JAVA接口WSSE方式用WebClient方式

    C#读取JAVA的WSSE接口的调用代码: 用webclient 方式: /// <summary> /// 调用java cxf ws_security加密的服务wcf客户端对应的加密类 ...

  3. websocket通信 实现java模拟一个client与webclient通信

    发文原由: 熟悉socket通信的同学,对于socket模拟server与client,实现相互通信, 或者使用websocket与java模拟的websocket服务器通信(比如一个聊天室),对于这 ...

  4. Java调用Http/Https接口(7,end)--WebClient调用Http/Https接口

    WebClient是Spring提供的非阻塞.响应式的Http客户端,提供同步及异步的API,将会代替RestTemplate及AsyncRestTemplate.文中所使用到的软件版本:Java 1 ...

  5. java.lang.UnsupportedClassVersionError: com/gargoylesoftware/htmlunit/WebClient : Unsupported major.minor version 52.0 (unable to load class com.gargoylesoftware.htmlunit.WebClient)

    java.lang.UnsupportedClassVersionError: com/gargoylesoftware/htmlunit/WebClient : Unsupported major. ...

  6. htmlunit学习之java.lang.NoSuchMethodError: com.gargoylesoftware.htmlunit.WebClient.getOptions()Lcom/gargoylesoftware/htmlunit/WebClientOptions;

    运行到这里就报错 java.lang.NoSuchMethodError: com.gargoylesoftware.htmlunit.WebClient.getOptions()Lcom/gargo ...

  7. webClient请求JAVA超时解决方案

    private class MyWebClient: WebClient { protected override WebRequest GetWebRequest(Uri uri) { WebReq ...

  8. C#、JAVA操作Hadoop(HDFS、Map/Reduce)真实过程概述。组件、源码下载。无法解决:Response status code does not indicate success: 500。

    一.Hadoop环境配置概述 三台虚拟机,操作系统为:Ubuntu 16.04. Hadoop版本:2.7.2 NameNode:192.168.72.132 DataNode:192.168.72. ...

  9. Atitit.http httpclient实践java c# .net php attilax总结

    Atitit.http httpclient实践java c# .net php attilax总结 1. Navtree>> net .http1 2. Httpclient理论1 2. ...

随机推荐

  1. Javascript+Dom(加减乘除计算器)

    计算器介绍:只能进行加减乘除,提示用户输入数字,正则表达式限制用户只能输入数字(在用户输入时限制),如果出现除零操作答案为0: 有两种针对不同运算符的解决方法: 1. 使用eval() 函数 //函数 ...

  2. Haproxy安装配置及日志输出问题

    简介: 软件负载均衡一般通过两种方式来实现:基于操作系统的软负载实现和基于第三方应用的软负载实现.LVS就是基于Linux操作系统实现的一种软负载,HAProxy就是开源的并且基于第三应用实现的软负载 ...

  3. Memcached【Magent+Memcached】集群

    Memcached介绍  事件处理libevent是个程序库,它将Linux的epoll.BSD类操作系统的kqueue等事件处理功能封装成统一的接口.即使对服务器的连接数增加,也能发挥O(1)的性能 ...

  4. OC-self关键字

    self关键字 1. 成员变量和局部变量同名 当成员变量和局部变量同名时,采取就近原则,访问的是局部变量 用self访问成员变量,区分同名的局部变量 2.使用细节 1)     出现的地方:所有的OC ...

  5. IOS开发中@2x图片等适应不同分辨率手机

    开发中,例如: nanshanImage.image=[UIImage imageNamed:@'index_pic.png']; 在项目中还保存中index_pic@2x.png的图片,此图为了只适 ...

  6. asp.net mvc 4 高级编程学习笔记:第三章 视图(2)

    页面布局 asp.net MVC中提供了布局的支持,默认情况下才布局文件保存到 /View/Shared/目录下的_Layout.cshtml,View目录有个_ViewStart.cshtml文件, ...

  7. vim 创建和管理折叠

    参考文章: http://blog.csdn.net/bendanban/article/details/7743530 首先要有折叠, 然后才能说, 打开和关闭 折叠; 打开: zo: zip op ...

  8. php 经典的算法题你懂的

    有5个人偷了一堆苹果,准备在第二天分赃.晚上,有一人遛出来,把所有苹果分成5份,但是多了一个,顺手把这个扔给树上的猴了,自己先拿1/5藏了.没想到其他四人也都是这么想的,都如第一个人一样分成5份把多的 ...

  9. HTTP协议详解(真的很经典)

    HTTP 是一个属于应用层的面向对象的协议,由于其简捷.快速的方式,适用于分布式超媒体信息系统.它于1990年提出,经过几年的使用与发展,得到不断地完善和 扩展.目前在WWW中使用的是HTTP/1.0 ...

  10. IE6 Must Die

    最近 Twitter 上很多人在推一个名为 IE6 Must Die 的活动, 参与的朋友可以通过头像转换服务在自己的头像上加上一个禁止 IE6 的图标, 很是拉风. Internet Explore ...