鼓捣phantomjs，做ajax网站的信息采集

版权所有：http://www.cnblogs.com/zeusro/

引用不给稿费的，切你jj

准备工作：

1. phantomjs 的安装

2. phantomjs 环境变量的配置

需求：

采集手机淘宝某店铺的所有商品的ID

难点：

1. 页面是 ajax 加载的，不能用传统方法（webrequest + 正则提取）提取数据，这正是我用 phantomjs 的原因。

对于这部分内容，除了要确保页面加载完成，还要等待其所有资源加载完毕，确保 DOM 符合我们的预期，才开始采集。

2. 模块化

加载到 nodejs 里面，用于批量采集。

方法：把变动的参数（例如店铺地址）做成可传入的配置项。

3. 淘宝的反采集

4. 数据的持久化

开工：

我以http://shop100338207.m.taobao.com/#list 举例。

// PhantomJS modules: `webpage` creates the headless page object, `fs` is the file-system module.
var webpage = require('webpage');
var page = webpage.create();
var fs = require('fs');

// Use a 1024x800 viewport and clip rendered snapshots to that same rectangle.
page.viewportSize = { width: 1024, height: 800 };
page.clipRect = { top: 0, left: 0, width: 1024, height: 800 };

// Page settings are replaced as a whole object, exactly as the script always did.
page.settings = {
    javascriptEnabled: true,
    loadImages: true,
    webSecurityEnabled: false,
    // A Chrome user agent must be sent; per the author, the site could not be browsed with a Firefox one.
    userAgent: 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.137 Safari/537.36 LBBROWSER'
};

// Script-level bookkeeping. None of these are read by the code visible in this
// section; they are kept in case other code relies on them.
var lastReceived = Date.now();
var requestCount = 0;
var responseCount = 0;
var requestIds = [];
var startTime = Date.now();

// Remember the moment the page starts loading; the page.open callback
// subtracts this from the current time to report the total load time.
page.onLoadStarted = function () {
    var loadBegan = new Date();
    page.startTime = loadBegan;
};

// Open the shop's mobile listing page, wait until the AJAX-rendered product
// links exist in the DOM, print every product link from inside the page
// (the output reaches this script through page.onConsoleMessage), save a
// snapshot, and exit.
page.open('http://shop100338207.m.taobao.com/#list', function (status) {
    console.log('start');

    // `status` is the load result handed to this callback by PhantomJS.
    // It must be declared as a parameter, otherwise the check below never
    // looks at the real load result.
    if (status === 'fail') {
        console.log('open page fail!');
        // Exit explicitly: without this the PhantomJS process never terminates.
        phantom.exit(1);
        return;
    }

    waitFor(function () {
        return page.evaluate(function () {
            // Ready signal: at least one product link has been rendered.
            // Native DOM API is used on purpose, so the check does not depend
            // on the page itself providing a `$` before jQuery is injected.
            return document.querySelectorAll('.goods-list-items a:first-child').length > 0;
        });
    }, function () {
        // The DOM is ready: inject jQuery, then scrape.
        // NOTE: the URL below is a placeholder and must be replaced with a
        // reachable jQuery build before running the script.
        page.includeJs("http://xxxx/jquery-1.9.1.min.js", function () {
            page.evaluate(function () {
                // This function runs inside the web page, so console.log writes
                // to the page's console, not to the PhantomJS terminal.
                console.log("jQuery version:" + jQuery.fn.jquery);
                $("a", ".goods-list-items").each(function () {
                    console.log($(this).attr("href"));
                });
            });

            // Elapsed time since page.onLoadStarted stored page.startTime.
            var t = Date.now() - page.startTime;
            console.log('firstLoadPage time :' + t + 'ms');

            // Render after a 2 s grace period and only then shut down.
            // Exiting from a separate, earlier timer would end the process
            // before the snapshot is ever written.
            setTimeout(function () {
                page.render('../snapshot/taoba2o.png');
                console.log("end");
                page.close();
                phantom.exit();
            }, 2000);
        });
    });
});

/**
 * Render the current state of `page` to a file.
 * The name looks like a misspelling of "screen"; it is left as is because
 * waitFor calls it under this name.
 *
 * @param {string} filename - Output path handed to page.render.
 */
function screan(filename) {

    page.render(filename);

}

/**
 * Poll a condition every 250 ms until it becomes truthy or the timeout
 * elapses. A snapshot is written to '../snapshot/taobao.png' before each
 * evaluation of the condition.
 *
 * @param {Function|string} testFx - Condition to evaluate; a function is
 *     called, a string is passed to eval (legacy form, kept for compatibility;
 *     never pass untrusted text).
 * @param {Function|string} onReady - Invoked (or eval'ed) once, on the first
 *     poll after the condition turned truthy.
 * @param {number} [timeOutMillis] - Maximum wait in milliseconds; any falsy
 *     value selects the default of 3000.
 *
 * On timeout the function logs a message and calls phantom.exit(1).
 */
function waitFor(testFx, onReady, timeOutMillis) {
    var maxtimeOutMillis = timeOutMillis ? timeOutMillis : 3000;
    var start = new Date().getTime();
    var condition = false;

    var interval = setInterval(function () {
        var withinTime = new Date().getTime() - start < maxtimeOutMillis;

        if (withinTime && !condition) {
            // Still waiting: snapshot the page, then re-evaluate the condition.
            screan('../snapshot/taobao.png');
            condition = (typeof testFx === "string" ? eval(testFx) : testFx());
            return;
        }

        // Polling is over either way. Stop the timer BEFORE doing anything
        // else, so that a throwing onReady is not re-invoked every 250 ms and
        // a timeout does not trigger phantom.exit more than once.
        clearInterval(interval);

        if (!condition) {
            console.log("'waitFor()' timeout");
            phantom.exit(1);
            return;
        }

        console.log("'waitFor()' finished in " + (new Date().getTime() - start) + "ms.");
        if (typeof onReady === "string") {
            eval(onReady);
        } else {
            onReady();
        }
    }, 250);
}

// Echo whatever the page hands to the onCallback hook, serialized as JSON.
// Example: { hello: 'world' } is printed as  CALLBACK: {"hello":"world"}
page.onCallback = function (data) {
    var serialized = JSON.stringify(data);
    console.log('CALLBACK: ' + serialized);
};

// Surface alert() messages raised by the page in the PhantomJS console.
page.onAlert = function (msg) {
    var line = 'ALERT: ' + msg;
    console.log(line);
};

/**
 * Relay console output produced inside the page and extract product IDs.
 *
 * Every message is echoed with a 'CONSOLE:' prefix. When the message contains
 * an `id` query parameter, e.g.
 *   http://h5.m.taobao.com/awp/core/detail.htm?id=43064483679
 * the numeric ID is additionally printed on its own line.
 *
 * @param {string} msg - Text logged by the page.
 * @param {number} lineNum - Line number reported by PhantomJS (unused).
 * @param {string} sourceId - Source identifier reported by PhantomJS (unused).
 */
page.onConsoleMessage = function (msg, lineNum, sourceId) {
    console.log('CONSOLE:' + msg);

    // Match the literal `id=` parameter after `?` or `&` and capture all of
    // its digits. The earlier pattern used a character class ("[/?id=]+"),
    // which matches any mix of those characters, and assumed exactly 11 digits.
    var idMatch = /[?&]id=(\d+)/.exec(String(msg));
    if (idMatch) {
        console.log(idMatch[1]);
    }
};

// Report JavaScript errors raised inside the page: the message first, then
// one line per stack frame when a trace is available.
page.onError = function (msg, trace) {
    // Format one stack frame as ' -> file: line (in function "name")'.
    function describeFrame(frame) {
        var head = ' -> ' + frame.file + ': ' + frame.line;
        var tail = '';
        if (frame.function) {
            tail = ' (in function "' + frame.function + '")';
        }
        return head + tail;
    }

    var lines = ['ERROR: ' + msg];
    var hasTrace = trace && trace.length;

    if (hasTrace) {
        lines.push('TRACE:');
        trace.forEach(function (frame) {
            lines.push(describeFrame(frame));
        });
    }

    console.error(lines.join('\n'));
};

扯淡

我的算法是：用某个元素的出现作为页面加载完成的信号。页面加载完成后，我用 DOM 操作把数据通过 console.log() 输出；js 那边用 page.onConsoleMessage + 正则筛选，输出我真正需要的信息。

我觉得这玩意的坑点在于

引入jquery（includeJs ，injectJs傻傻分不清啊有木有）并且运用其方法

上面举例的jquery网络地址是不对的，大家自己找一个

console.log()在不同的作用域有不同的语义

这个最坑。我早上浪费了一上午在这个方法里面。用这个框架，首先要把

page.evaluate(function () {} //操作页面事件

这句方法的注释默念一千遍，这个是在页面操作的。比如console.log("草泥马")，不是在我们phantomjs那个控制台里面输出那个文本，而是浏览器的。。。

所以最后在数据的获取的时候，我用了取巧的办法，onConsoleMessage+正则提取

`Unsafe JavaScript attempt to access frame with URL about:blank from frame with URL file://./embed_images.js. Domains, protocols and ports must match.`

这个影响视觉而已，屏蔽这JB玩意用下面的代码退出就行了

setTimeout(function(){

    phantom.exit();

}, 0);

特别的装逼技巧

因为我没有模块化，只是单纯一个文件运行，一般情况下，每次开CMD，然后balala很麻烦的，做成批处理（.bat）打开就可以了

cd  F:\Scripts\

f:

phantomjs test.js

pause

引用不给稿费的，切你jj

参考链接：

中文入门参考

http://my.oschina.net/rasine/blog/335997#OSC_h3_6

phantomjs使用说明

http://www.zhouhua.info/2014/03/19/phantomjs/

waitforjs

https://github.com/ariya/phantomjs/blob/master/examples/waitfor.js

Does Phantom.js capture all AJAX?

http://stackoverflow.com/questions/14747643/does-phantom-js-capture-all-ajax

Using PhantomJS to embed all images of a webpage produces warnings but works

http://stackoverflow.com/questions/26608391/using-phantomjs-to-embed-all-images-of-a-webpage-produces-warnings-but-works

PhantomJS 不支持哪些操作？
http://www.zhihu.com/question/26653233

Using PhantomJS to make your AJAX web applications crawlable by Google

http://blog.istepaniuk.com/phantomjs-to-make-your-ajax-web-crawlable-by-google/

借助Nodejs在服务端使用jQuery采集17173游戏排行信息

咦，貌似文中有一些坑没填平，等下次吧，哈哈。