java爬虫遇到个页面加密的东西,找了些资料学习学习

做了个java运行js的工具类,希望对大家有用。其中用到的Client(用于获取js)可以自行换成自己的client。主要用到了以下几个组件:

Rhino就是JavaScript引擎,它的目的就是实现Java与JavaScript的互操作性。rhino-1.7R1.jar

Envjs一个纯js方式在无浏览器环境下模拟浏览器的行为。envjs-1.2.js

一般网站js中都会用到jquery,所以还用了jquery.js

import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.io.Reader;
import java.io.StringReader;
import java.lang.ref.SoftReference;
import java.net.URI;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.zip.GZIPInputStream;
import java.util.zip.Inflater;
import java.util.zip.InflaterInputStream;
import java.util.zip.ZipException;

import com.ibm.icu.text.CharsetDetector;
import com.ibm.icu.text.CharsetMatch;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang.StringUtils;
import org.apache.commons.lang.Validate;
import org.apache.http.Header;
import org.apache.http.HeaderElement;
import org.apache.http.HttpEntity;
import org.apache.http.ParseException;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.protocol.HttpClientContext;
import org.apache.http.client.protocol.RequestAcceptEncoding;
import org.apache.http.impl.DefaultConnectionReuseStrategy;
import org.apache.http.impl.client.BasicCookieStore;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClientBuilder;
import org.apache.http.util.Args;
import org.apache.http.util.ByteArrayBuffer;
import org.jsoup.Jsoup;
import org.mozilla.javascript.Context;
import org.mozilla.javascript.ContextFactory;
import org.mozilla.javascript.Function;
import org.mozilla.javascript.Scriptable;

//import net.sourceforge.htmlunit.corejs.javascript.Context;
//import net.sourceforge.htmlunit.corejs.javascript.ContextFactory;
//import net.sourceforge.htmlunit.corejs.javascript.Function;
//import net.sourceforge.htmlunit.corejs.javascript.Scriptable;

/**
 * References:
 * http://mybeautiful.iteye.com/blog/1442839
 * http://m.oschina.net/blog/121347
 * http://blog.csdn.net/dwjmantou/article/details/45276967
 * http://lcllcl987.iteye.com/blog/87423
 * <p>
 * Do NOT use the htmlunit-repackaged classes (the commented-out imports above); they fail with:
 * Cannot call method "setOptimizationLevel" of null
 *
 * @author 5432
 */
/**
 * Evaluates web-page JavaScript from Java through the Rhino engine.
 * <p>
 * {@link #init()} enters a Rhino {@link Context}, defines a global {@code print} function and
 * then loads envjs and jQuery so that page scripts find the globals they expect. Afterwards,
 * scripts can be added with {@link #JSloadString(String, String)}, {@link #loadJS(String, String)}
 * or {@link #evaluateJs(String)} and invoked with {@link #invokFunction(String, Object...)}.
 * <p>
 * An instance must be used from the thread that called {@link #init()}, because the Rhino
 * context is associated with the entering thread. Call {@link #release()} when done.
 */
public class RhinoScaper {

    /** Matches the bootstrap call {@code setTimeout("fn(arg)", 200);}; group 1 = fn, group 2 = arg. */
    private static final Pattern TIMEOUT_CALL_PATTERN =
            Pattern.compile("setTimeout\\(\"(.*)\\((.*)\\)\", 200\\);");

    /** Downloaded when the local envjs file cannot be opened. */
    private static final String ENVJS_FALLBACK_URL =
            "https://raw.githubusercontent.com/ryan-roemer/envjs-1.2/master/env.rhino.1.2.js";

    /** Downloaded when the local jQuery file cannot be opened. */
    private static final String JQUERY_FALLBACK_URL = "http://apps.bdimg.com/libs/jquery/1.6.0/jquery.js";

    private Context context;
    private Scriptable scriptable;

    /**
     * Enters a Rhino context, creates the global scope and preloads envjs and jQuery.
     */
    public void init() {
        context = ContextFactory.getGlobal().enterContext();
        scriptable = context.initStandardObjects(null);
        // -1 selects Rhino's interpreted mode (no bytecode compilation).
        context.setOptimizationLevel(-1);
        context.setLanguageVersion(Context.VERSION_1_5);
        // envjs-1.2.js calls print() without defining it, so define it first.
        // 'v' and 'hah' only exist for the smoke test below.
        context.evaluateString(scriptable,
                "var v='sssaass';"
                + "var print = function(v) {"
                + " java.lang.System.out.println(v);return v ;"
                + " };function hah(){return v }",
                "print", 1, null);
        // Smoke test: make sure functions defined in the scope can be called from Java.
        invokFunction("print", "test");
        invokFunction("hah");

        // NOTE(review): getResource("/") is a URL, so the first entry is probably a "file:/..."
        // string (or "null...") that cannot be opened as a file path; evaluateJs then falls
        // back to downloading envjs. Confirm whether a classpath lookup was intended.
        String[] file = { this.getClass().getResource("/") + "envjs-1.2.js", "./lib/jquery.js" };
        for (String f : file) {
            evaluateJs(f);
        }
    }

    /**
     * Exits the Rhino context entered by {@link #init()}. Must be called on the same thread
     * as {@link #init()}. Calling it on an instance that was never initialised is a no-op.
     */
    public void release() {
        if (context != null) {
            Context.exit();
            context = null;
            scriptable = null;
        }
    }

    /**
     * Calls a JavaScript function defined in the global scope.
     *
     * @param functionName  name of the global function
     * @param functionArags arguments passed to the function
     * @return whatever the JavaScript function returns
     * @throws IllegalArgumentException if {@link #init()} has not been called, the name is
     *                                  {@code null}, or no function with that name exists
     */
    public Object invokFunction(String functionName, Object... functionArags) {
        Validate.notNull(context, "context is null");
        Validate.notNull(scriptable, "scriptable is null");
        Validate.notNull(functionName, "functionName is null");
        Object candidate = scriptable.get(functionName, scriptable);
        // A missing property comes back as a sentinel object, not as a Function.
        if (!(candidate instanceof Function)) {
            throw new IllegalArgumentException("no JavaScript function named '" + functionName + "' in scope");
        }
        Function function = (Function) candidate;
        return function.call(Context.getCurrentContext(), scriptable, function, functionArags);
    }

    /**
     * Evaluates a JavaScript file, read as UTF-8.
     * <p>
     * When the file cannot be opened and its path contains {@code envjs-1.2.js} or
     * {@code jquery.js}, the script is downloaded from a fixed fallback URL instead.
     * Read and download failures are reported on stderr and otherwise ignored.
     *
     * @param f path of the file
     * @throws RuntimeException if the file cannot be opened and has no known fallback
     */
    public void evaluateJs(String f) {
        Validate.notNull(context, "context is null");
        Validate.notNull(scriptable, "scriptable is null");
        Validate.notNull(f, "file path is null");
        try (Reader in = new InputStreamReader(new FileInputStream(f), StandardCharsets.UTF_8)) {
            context.evaluateReader(scriptable, in, f, 1, null);
            return;
        } catch (FileNotFoundException notFound) {
            // Expected when the library is not bundled locally; try the remote copy below.
        } catch (IOException e1) {
            e1.printStackTrace();
            return;
        }

        if (f.contains("envjs-1.2.js")) {
            evaluateRemote(ENVJS_FALLBACK_URL);
        } else if (f.contains("jquery.js")) {
            evaluateRemote(JQUERY_FALLBACK_URL);
        } else {
            throw new RuntimeException("unknown file " + f);
        }
    }

    /** Downloads a script and evaluates it; failures are reported but never thrown (best effort). */
    private void evaluateRemote(String url) {
        try {
            String js = textOf(Client.getHtmlString(url));
            if (js.isEmpty()) {
                System.err.println("RhinoScaper: nothing downloaded from " + url + ", script not loaded");
                return;
            }
            context.evaluateString(scriptable, js, url, 1, null);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /** Returns the referenced string, or an empty string when the reference or its target is gone. */
    private static String textOf(SoftReference<String> reference) {
        String text = reference == null ? null : reference.get();
        return text == null ? "" : text;
    }

    public static void main(String[] args) {
        RhinoScaper rhinoScaper = new RhinoScaper();
        try {
            rhinoScaper.init();

            // China Telecom login: run the page's own password-encryption routine.
            String pwd = "111";
            StringBuilder ascending = new StringBuilder();
            SoftReference<String> htmlString = null;
            try {
                htmlString = Client.getHtmlString("http://login.189.cn/bundles/jquery?v=h3Pl8XT8zdNkoI1VbV5sEZOBrSqsxRXX0TIQ9S_lAlM1");
            } catch (Exception e) {
                e.printStackTrace();
            }
            String jsStr = textOf(htmlString);
            // 'float' and 'throws' are rejected as identifiers by this Rhino language version.
            jsStr = jsStr.replaceAll("float:", "floats:").replaceAll("throws", "throwss");
            ascending.append(jsStr);
            ascending.append(";\n var input=document.createElement(\"input\");input.value='" + pwd + "';;input.id= 'pass';input.type='password';");
            ascending.append("\n function getpassword(){ return $(input).valAesEncryptSet()}");
            rhinoScaper.JSloadString(ascending.toString(), "jsname");
            Object result = rhinoScaper.invokFunction("getpassword");
            System.out.println(result);

            try {
                htmlString = Client.getHtmlString("http://www.youdaili.net/Daili/");
                String runScript = rhinoScaper.runScript(textOf(htmlString));
                System.out.println(runScript);
            } catch (Exception e) {
                e.printStackTrace();
            }
        } finally {
            rhinoScaper.release();
        }
    }

    /**
     * Runs the script embedded in a page that bootstraps itself with
     * {@code setTimeout("fn(arg)", 200);} and returns what {@code fn(arg)} produces.
     *
     * @param html page source
     * @return string form of the function's result
     * @throws IllegalArgumentException if the page contains no such {@code setTimeout} call
     * @throws NumberFormatException    if the captured argument is not an integer
     */
    private String runScript(String html) {
        Matcher m = TIMEOUT_CALL_PATTERN.matcher(html);
        if (!m.find()) {
            throw new IllegalArgumentException("no setTimeout(\"fn(arg)\", 200); call found in page");
        }
        String function = m.group(1); // function name
        int jsfrom = Integer.parseInt(m.group(2).trim()); // its single argument
        // Make the page function return the generated code instead of eval-ing it.
        JSloadString(Jsoup.parse(html).select("script").html().replace("eval(\"qo=eval;qo(po);\")", "return po"), "jsname");
        Object result = invokFunction(function, jsfrom);
        return String.valueOf(result);
    }

    /**
     * Evaluates a JavaScript classpath resource, read as UTF-8.
     *
     * @param sourceName   name reported by Rhino in error messages
     * @param classpathURI resource path, resolved with {@link Class#getResourceAsStream(String)}
     * @throws IllegalArgumentException if the resource does not exist
     * @throws IllegalStateException    if the resource cannot be read
     */
    public void loadJS(String sourceName, String classpathURI) {
        Validate.notNull(context, "context is null");
        Validate.notNull(scriptable, "scriptable is null");
        Validate.notNull(classpathURI, "classpathURI is null");
        String js;
        try (InputStream inputStream = getClass().getResourceAsStream(classpathURI)) {
            if (inputStream == null) {
                throw new IllegalArgumentException("classpath resource not found: " + classpathURI);
            }
            js = IOUtils.toString(inputStream, "UTF-8");
        } catch (IOException e) {
            throw new IllegalStateException("failed to read classpath resource " + classpathURI, e);
        }
        context.evaluateString(scriptable, js, sourceName, 1, null);
    }

    /**
     * Evaluates JavaScript source text in the global scope.
     *
     * @param source     JavaScript source; identifiers such as {@code throws} or {@code float}
     *                   must be renamed beforehand or evaluation fails
     * @param sourceName name reported by Rhino in error messages
     */
    public void JSloadString(String source, String sourceName) {
        Validate.notNull(context, "context is null");
        Validate.notNull(scriptable, "scriptable is null");
        Validate.notNull(source, "source is null");
        context.evaluateString(scriptable, source, sourceName, 1, null);
    }
}
/**
 * Minimal HTTP GET helper: downloads a URL, undoes gzip/deflate content coding by hand and
 * decodes the body to text using the declared charset or, failing that, charset detection.
 */
class Client {

    /** Chunk size used when decompressing. */
    public static final int BUFFER = 1024;

    private static final String USER_AGENT =
            "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.137 Safari/537.36";

    /**
     * One client for the whole JVM, so that requests do not each create a connection manager
     * that is never shut down. Automatic decompression is disabled while the request still
     * advertises {@code Accept-Encoding}; {@link #decode(byte[], HttpEntity)} undoes the coding.
     */
    private static final CloseableHttpClient HTTP_CLIENT = HttpClientBuilder.create()
            .disableContentCompression()
            .setConnectionReuseStrategy(new DefaultConnectionReuseStrategy())
            .setUserAgent(USER_AGENT)
            .addInterceptorLast(new RequestAcceptEncoding())
            .build();

    /** Decoded response body together with the raw Content-Type header value (may be null). */
    private static final class Body {
        final byte[] bytes;
        final String contentType;

        Body(byte[] bytes, String contentType) {
            this.bytes = bytes;
            this.contentType = contentType;
        }
    }

    /**
     * Closes a resource, reporting (not propagating) any failure. {@code null} is ignored.
     *
     * @param close resource to close, may be {@code null}
     */
    public static void close(AutoCloseable close) {
        if (close != null) {
            try {
                close.close();
            } catch (Exception e) {
                e.printStackTrace();
            }
        }
    }

    /**
     * Executes a GET request with a fresh, empty cookie store.
     *
     * @param url absolute URL
     * @return the open response; the caller must close it to release the connection
     * @throws IOException on connection or protocol errors
     */
    public static CloseableHttpResponse HttpGetResponse(String url) throws IOException, ClientProtocolException {
        HttpGet httpGet = new HttpGet(URI.create(url));
        // Per-request cookie store: cookies are never shared between calls.
        HttpClientContext requestContext = HttpClientContext.create();
        requestContext.setCookieStore(new BasicCookieStore());
        return HTTP_CLIENT.execute(httpGet, requestContext);
    }

    /**
     * Downloads a URL and returns its body as text.
     *
     * @param url absolute URL
     * @return soft reference to the decoded text; never {@code null}
     * @throws Exception if the request fails, the response has no body, or the content
     *                   coding is unsupported
     */
    public static SoftReference<String> getHtmlString(String url) throws Exception {
        Body body = fetchBody(url);
        return new SoftReference<String>(toText(body.bytes, body.contentType));
    }

    /**
     * Downloads a URL and returns a reader over its body, decoded with the same charset
     * rules as {@link #getHtmlString(String)}.
     *
     * @param url absolute URL
     * @return soft reference to a buffered reader; never {@code null}
     * @throws Exception if the request fails, the response has no body, or the content
     *                   coding is unsupported
     */
    public static SoftReference<Reader> getHtmlReader(String url) throws Exception {
        Body body = fetchBody(url);
        Reader reader = new BufferedReader(new StringReader(toText(body.bytes, body.contentType)));
        return new SoftReference<Reader>(reader);
    }

    /** Performs the request and reads every needed piece of the response before closing it. */
    private static Body fetchBody(String url) throws Exception {
        CloseableHttpResponse response = HttpGetResponse(url);
        try {
            System.out.println(response.getStatusLine().toString());
            HttpEntity entity = response.getEntity();
            if (entity == null) {
                throw new IOException("no response body for " + url + " (" + response.getStatusLine() + ")");
            }
            // Prints "null" when the server sent no Content-Encoding header.
            System.out.println(entity.getContentEncoding());
            byte[] binary = HttpEntityTOByte(entity);
            Args.notNull(binary, "binary");
            Header contentType = entity.getContentType();
            return new Body(decode(binary, entity), contentType == null ? null : contentType.getValue());
        } finally {
            close(response);
        }
    }

    /**
     * Decodes bytes to text: declared charset first, then ICU detection, then UTF-8.
     */
    private static String toText(byte[] data, String contentType) throws IOException {
        Charset declared = resolveCharset(getContentCharSet(contentType));
        if (declared != null) {
            return new String(data, declared);
        }
        CharsetMatch match = new CharsetDetector().setText(data).detect();
        if (match != null) {
            String detected = match.getString();
            if (detected != null) {
                return detected;
            }
        }
        return new String(data, StandardCharsets.UTF_8);
    }

    /** Looks up a charset by name; returns {@code null} for a null, malformed or unknown name. */
    private static Charset resolveCharset(String name) {
        if (name == null) {
            return null;
        }
        try {
            return Charset.forName(name);
        } catch (IllegalArgumentException unknownOrIllegal) {
            // Covers both illegal and unsupported names; the caller falls back to detection.
            return null;
        }
    }

    /**
     * Extracts the {@code charset} parameter from a Content-Type value such as
     * {@code text/html; charset="UTF-8"}.
     *
     * @param contentType header value, may be {@code null} or empty
     * @return the charset name without surrounding quotes, or {@code null} if absent
     */
    private static String getContentCharSet(String contentType) throws ParseException {
        if (StringUtils.isEmpty(contentType)) {
            return null;
        }
        for (String parameter : contentType.split(";")) {
            String[] pair = parameter.split("=", 2);
            if (pair.length == 2 && "charset".equalsIgnoreCase(pair[0].trim())) {
                String value = pair[1].trim();
                if (value.length() >= 2 && value.startsWith("\"") && value.endsWith("\"")) {
                    value = value.substring(1, value.length() - 1).trim();
                }
                return value.isEmpty() ? null : value;
            }
        }
        return null;
    }

    /**
     * Decompresses gzip data held in memory.
     *
     * @param data gzip-compressed bytes
     * @return the decompressed bytes
     * @throws Exception if the data is not valid gzip
     * @author http://snowolf.iteye.com/blog/643010
     */
    public static byte[] decompress(byte[] data) throws Exception {
        ByteArrayOutputStream baos = new ByteArrayOutputStream();
        decompress(new ByteArrayInputStream(data), baos);
        return baos.toByteArray();
    }

    /**
     * Decompresses a gzip stream into {@code os}. The input stream is closed once the gzip
     * wrapper has been created; the output stream is left open.
     *
     * @param is gzip-compressed input
     * @param os destination for the decompressed bytes
     * @throws Exception if the input is not valid gzip or an I/O error occurs
     */
    public static void decompress(InputStream is, OutputStream os)
            throws Exception {
        try (GZIPInputStream gis = new GZIPInputStream(is)) {
            byte[] data = new byte[BUFFER];
            int count;
            while ((count = gis.read(data, 0, BUFFER)) != -1) {
                os.write(data, 0, count);
            }
        }
    }

    /** Inflates a "deflate" body, trying the zlib-wrapped form first and raw deflate second. */
    private static byte[] inflate(byte[] data) throws IOException {
        try {
            return inflate(data, false);
        } catch (ZipException notZlibWrapped) {
            return inflate(data, true);
        }
    }

    private static byte[] inflate(byte[] data, boolean nowrap) throws IOException {
        Inflater inflater = new Inflater(nowrap);
        try (InflaterInputStream in = new InflaterInputStream(new ByteArrayInputStream(data), inflater)) {
            ByteArrayOutputStream out = new ByteArrayOutputStream();
            byte[] chunk = new byte[BUFFER];
            int count;
            while ((count = in.read(chunk)) != -1) {
                out.write(chunk, 0, count);
            }
            return out.toByteArray();
        } finally {
            // A caller-supplied Inflater is not released by the stream's close().
            inflater.end();
        }
    }

    /**
     * Undoes the content coding named by the entity's Content-Encoding header.
     * Only the first coding listed in the header is applied.
     *
     * @param binary raw body bytes
     * @param entity entity the bytes were read from; supplies the Content-Encoding header
     * @return the decoded bytes, or {@code binary} itself when there is nothing to decode
     * @throws Exception if the coding is not gzip, x-gzip, deflate or identity, or the data
     *                   is corrupt
     */
    public static byte[] decode(byte[] binary, final HttpEntity entity) throws Exception {
        if (binary == null || binary.length == 0 || entity == null || entity.getContentLength() == 0) {
            return binary;
        }
        final Header ceheader = entity.getContentEncoding();
        if (ceheader == null) {
            return binary;
        }
        final HeaderElement[] codecs = ceheader.getElements();
        if (codecs.length == 0) {
            return binary;
        }
        final String codecname = codecs[0].getName().toLowerCase(Locale.US);
        if ("gzip".equals(codecname) || "x-gzip".equals(codecname)) {
            return decompress(binary);
        } else if ("deflate".equals(codecname)) {
            return inflate(binary);
        } else if ("identity".equals(codecname)) {
            /* Don't need to transform the content - no-op */
            return binary;
        }
        throw new IOException("Unsupported Content-Coding: "+codecname );
    }

    /**
     * Reads an entity's content fully into memory and closes the content stream
     * (hand-rolled counterpart of {@code EntityUtils.toByteArray(entity)}).
     *
     * @param entity HttpEntity to read
     * @return the content bytes, or {@code null} if the entity has no content stream
     * @throws IOException              if reading fails
     * @throws IllegalArgumentException if the entity is {@code null} or declares a length
     *                                  above {@link Integer#MAX_VALUE}
     */
    public static byte[] HttpEntityTOByte(HttpEntity entity) throws IOException {
        Args.notNull(entity, "entity");
        try (InputStream instream = entity.getContent()) {
            if (instream == null) {
                return null;
            }
            Args.check(entity.getContentLength() <= Integer.MAX_VALUE,
                    "HTTP entity too large to be buffered in memory");
            int i = (int) entity.getContentLength();
            if (i < 0) {
                // Unknown length (e.g. chunked transfer): start small and let the buffer grow.
                i = 4096;
            }
            final ByteArrayBuffer buffer = new ByteArrayBuffer(i);
            final byte[] tmp = new byte[4096];
            int l;
            while ((l = instream.read(tmp)) != -1) {
                buffer.append(tmp, 0, l);
            }
            return buffer.toByteArray();
        }
    }
}

Rhino+envjs-1.2.js 在java运行网站js 工具类的更多相关文章

  1. java中常用的工具类(一)

    我们java程序员在开发项目的是常常会用到一些工具类.今天我汇总了一下java中常用的工具方法.大家可以在项目中使用.可以收藏!加入IT江湖官方群:383126909 我们一起成长 一.String工 ...

  2. Android PermissionUtils:运行时权限工具类及申请权限的正确姿势

    Android PermissionUtils:运行时权限工具类及申请权限的正确姿势 ifadai 关注 2017.06.16 16:22* 字数 318 阅读 3637评论 1喜欢 6 Permis ...

  3. Java日期时间实用工具类

    Java日期时间实用工具类 1.Date (java.util.Date)    Date();        以当前时间构造一个Date对象    Date(long);        构造函数   ...

  4. Java并发多线程 - 并发工具类JUC

    安全共享对象策略 1.线程限制 : 一个被线程限制的对象,由线程独占,并且只能被占有它的线程修改 2.共享只读 : 一个共享只读的对象,在没有额外同步的情况下,可以被多个线程并发访问, 但是任何线程都 ...

  5. Java 中的并发工具类

    Java 中的并发工具类 CountDownLatch public class JoinCountDownLatchTest { public static void main(String[] a ...

  6. Java线程的并发工具类

    Java线程的并发工具类. 一.fork/join 1. Fork-Join原理 在必要的情况下,将一个大任务,拆分(fork)成若干个小任务,然后再将一个个小任务的结果进行汇总(join). 适用场 ...

  7. Java学习-041-颜色工具类(RGB,HEX)

    在日常的网页开发中,经常需要进行颜色数值获取.转换,例如获取红色,获取蓝色,获取绿色,RGB转十六进制颜色,十六进制颜色转RGB等,因而在学习过程中,写了一个小工具类,仅供各位小主参考! 多不闲言,直 ...

  8. JAVA中封装JSONUtils工具类及使用

    在JAVA中用json-lib-2.3-jdk15.jar包中提供了JSONObject和JSONArray基类,用于JSON的序列化和反序列化的操作.但是我们更习惯将其进一步封装,达到更好的重用. ...

  9. JAVA自动生成正则表达式工具类

    经过很久的努力,终于完成了JAVA自动生成正则表达式工具类.还记得之前需要正则,老是从网上找吗?找了想修改也不会修改.现在不用再为此烦恼了,使用此生成类轻松搞定所有正则表达式.赶快在同事面前炫一下吧. ...

随机推荐

  1. Unattend.xml应答文件制作(WISM)

    将制作好的应答文件unattend.xml拷贝到模板机sysprep目录下,然后在cmd下运行 (unattend.xml文件可自定义名称)   sysprep /generalize /oobe / ...

  2. xpath轴的正确使用姿势

    网上看了许多关于轴的介绍,只介绍了语法,而没有明说具体实际中该怎么使用,百思不得其解. 背景--python中使用xpath:  ----------------------------------- ...

  3. getElementById,getElementsByName,getElementsByTagName的区别

    1.getElementById 作用:一般页面里ID是唯一的,用于准备定为一个元素 语法: document.getElementById(id) 参数:id :必选项为字符串(String) 返回 ...

  4. 【转载】Android 自动化测试 Emmagee

    Emmagee 是一个性能测试小工具 用来监控指定被测应用在使用过程中占用机器的CPU, 内存,流量资源的性能小工具 Emmagee 介绍 Emmagee是网易杭州研究院QA团队开发的一个简单易上手的 ...

  5. fail to create java virtual machine..

    今天打开zend stdio 的时候 出现的错误  fail to create java virtual machine... 然后找度娘了,,都说改xxxxx, 我打开360  ,把内存清理了一遍 ...

  6. Ftp类

    using System; using System.Collections.Generic; using System.Text; using System.Net; using System.IO ...

  7. Oracle--用变量保存查询出来的值

    1:在我们一般编写存储过程中比较常见的是,习惯将查询出来的一个值赋值给一个变量,这个如何实现呢,用into,代码如下   Select ID into 变量1 from 表 where 条件 2:但当 ...

  8. JS函数是如何执行的

    当局部变量和函数参数同名时,该怎么理解呢? function test(a){ var a=a||5; alert(a) } test() //没传参的话,就是5:传参的话就alert参数 ===== ...

  9. 模板方法模式(Template Method)

    一.引言 提到模板,大家肯定不免想到生活中的“简历模板”.“论文模板”.“Word中模版文件”等,在现实生活中,模板的概念就是——有一个规定的格式,然后每个人都可以根据自己的需求或情况去更新它,例如简 ...

  10. $.post() 传递多个参数.

    $("#button").click(function() { /获取表单中id为idname和count的文本值付给property的两个属性 var property={&qu ...