源代码如下:

 
// (node:9240) Warning: Setting the NODE_TLS_REJECT_UNAUTHORIZED environment variable to '0'
// makes TLS connections and HTTPS requests insecure by disabling certificate verification.
// Workaround for the Node.js CERT_HAS_EXPIRED error: the next statement would have to come
// first in the file. It is kept disabled because it turns certificate checks off.
// process.env.NODE_TLS_REJECT_UNAUTHORIZED = '0';

// NOTE: `http` and `request` are used further down (plain-http downloads and page fetches);
// their require lines had been merged into the preceding comments and were never executed.
const http = require("http");
const https = require("https");
const iconv = require("iconv-lite");
const cheerio = require("cheerio");
const path = require('path');
const fs = require('fs');
const phantom = require('phantom');
const EventEmitter = require('events').EventEmitter;
const request = require('request');

/**
 * Shared event bus used by every crawler object in this file.
 */
class MyEmitter extends EventEmitter {
}

const myEmitter = new MyEmitter();
// 0 = unlimited listeners: each crawler registers several listeners on the shared bus.
myEmitter.setMaxListeners(0);
// const util = require('util');
//var url = "https://www.baidu.com/";
//const getPromise = util.promisify(request.get);
// Pool of User-Agent strings. The fetch/download helpers below pick one entry at random
// (userAgents[random index]) for each request. Some entries appear more than once.
const userAgents = [
'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.0.12) Gecko/20070731 Ubuntu/dapper-security Firefox/1.5.0.12',
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20',
'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER',
'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0) ,Lynx/2.8.5rel.1 libwww-FM/2.14 SSL-MM/1.4.1 GNUTLS/1.2.9',
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)',
'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)',
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)',
'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:2.0b13pre) Gecko/20110307 Firefox/4.0b13pre',
'Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52',
'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.0.12) Gecko/20070731 Ubuntu/dapper-security Firefox/1.5.0.12',
'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; LBBROWSER)',
'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6',
'Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6',
'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)',
'Opera/9.25 (Windows NT 5.1; U; en), Lynx/2.8.5rel.1 libwww-FM/2.14 SSL-MM/1.4.1 GNUTLS/1.2.9',
'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'
]; // selector templates
// CSS selector pairs tried in order when looking for gallery images:
// "normal" matches an image wrapped in a link, "fix" matches the bare image.
let selector_temple = [
    ['#picBody > p > a > img', '#picBody > p > img'],
    ['#picBody > center > a > img', '#picBody > center > img'],
    [
        '#contentV3_article > div.contentV3_body > p > a > img',
        '#contentV3_article > div.contentV3_body > p > img',
    ],
].map(([normal, fix]) => ({normal, fix}));

/**
 * Asynchronous delay.
 * @param {number} time delay in milliseconds
 */
function sleep(time = 0) {
    // Resolves (with no value) once `time` milliseconds have elapsed; never rejects.
    return new Promise((resolve) => {
        setTimeout(resolve, time);
    });
}

/**
 * Crawler for a paginated gallery: fetches each page, extracts the image
 * src/alt pairs and downloads the images below imgs/<type>/<url path>/<title>/.
 * All steps are chained through events on the shared `myEmitter` bus.
 */
class Spider2717 extends EventEmitter {
    /**
     * @param {string} _starturl URL of the first gallery page (".../NNN.html")
     * @param {string} _type     folder name used directly below imgs/
     * @param {number} _nextpage first page number to fetch
     * @param {number} _lastpage last page number to fetch
     */
    constructor(_starturl = 'https://www.2717.com/ent/meinvtupian/2019/316305.html',
                _type = 'meinv',
                _nextpage = 1,
                _lastpage = 1) {
        super();
        // {src, title} entries collected for the page currently being processed
        this.data = [];
        this.starturl = _starturl;
        this.nextpage = _nextpage;
        this.lastpage = _lastpage;
        this.type = _type;
        // Save directory: imgs/<type>/<path of the start URL between host and file name>
        const i1 = this.starturl.lastIndexOf("https://www.2717.com/") + "https://www.2717.com/".length;
        const i2 = this.starturl.lastIndexOf("/");
        const tmpstr = this.starturl.substring(i1, i2);
        this.savedir = path.join('imgs', this.type, tmpstr).toString();
        console.log("savedir:" + this.savedir);
        // mkdirSync is synchronous and takes no callback; it throws on failure.
        fs.mkdirSync(this.savedir, {recursive: true});
        // false after the most recent page fetch failed
        this.get_html_flag = true;
        // number of images of the current page that have been handled (any outcome)
        this.downloaded_imagepage_count = 0;
        // event emitted once per handled image
        this.download_onepage_event = "download_onepage_event";
    }

    /**
     * Fetch a page and emit its GBK-decoded HTML ('' on failure) on myEmitter.
     * @param url        page address
     * @param pno        page number, passed through to the event
     * @param event_name event to emit (default 'html')
     */
    spidermeinvtupian(url, pno, event_name = 'html') {
        const userAgent = userAgents[Math.floor(Math.random() * userAgents.length)];
        request({
            url: url,
            // request has no `UserAgent` option; the header has to be set explicitly
            headers: {'User-Agent': userAgent},
            timeout: 5000,
            encoding: null // keep the body as a Buffer so iconv can decode it
        }, (error, response, body) => {
            // arrow function: `this` is the crawler instance
            if (!error && response.statusCode == 200) {
                this.get_html_flag = true;
                myEmitter.emit(event_name, iconv.decode(body, 'gbk').toString(), pno);
            } else {
                // `error` is null when the server answered with a non-200 status
                const reason = error ? error.message : 'HTTP ' + response.statusCode;
                console.log("获取 " + url + " 失败!--" + reason);
                this.get_html_flag = false;
                myEmitter.emit(event_name, '', pno);
            }
        });
    }

    /**
     * Extract image src/alt pairs from HTML, append them to this.data and
     * emit the accumulated list.
     * @param html       page HTML
     * @param pno        page number, passed through to the event
     * @param event_name event to emit (default 'images')
     */
    getTupianData(html, pno, event_name = 'images') {
        const $ = cheerio.load(html);
        // The last page of a gallery uses a non-clickable image, hence two selectors per template.
        let imgs = [];
        for (let i = 0; i < selector_temple.length; i++) {
            imgs = $(selector_temple[i]['normal']).toArray();
            console.log("selector:" + selector_temple[i]['normal']);
            if (imgs.length > 0) break;
            imgs = $(selector_temple[i]['fix']).toArray();
            console.log("selector:" + selector_temple[i]['fix']);
            if (imgs.length > 0) break;
        }
        console.log("total page1:" + imgs.length);
        for (const img of imgs) {
            this.data.push({src: $(img).attr('src'), title: $(img).attr("alt")});
        }
        myEmitter.emit(event_name, this.data, pno);
    }

    /**
     * Sanitise every title for use as a directory name and start one download per entry.
     * @param data list of {src, title}; titles are rewritten in place
     * @param pno  page number
     */
    downloadphoto(data, pno) {
        if (data.length === 0) {
            // Nothing to download (e.g. the page fetch failed): report once so the
            // page counter in startSpider advances instead of waiting forever.
            myEmitter.emit(this.download_onepage_event, "fail", pno);
            return;
        }
        for (let i = 0; i < data.length; i++) {
            // A missing alt attribute yields undefined; every / \ < > | becomes '_'.
            const rawTitle = data[i].title == null ? '' : String(data[i].title);
            data[i].title = rawTitle.replace(/[\\/<>|]/g, '_');
            this.downloadfile(data[i].src, data[i].title, i, pno);
        }
    }

    /**
     * Download one image after a random delay and report the outcome
     * ("ok" / "fail") through this.download_onepage_event.
     * @param src           image URL
     * @param no            index passed through to the event
     * @param filename      target file
     * @param delaytime     upper bound of the random delay in milliseconds
     * @param redirectsLeft remaining 301/302 hops that will be followed
     */
    calldownload = (src, no, filename, delaytime = 0, redirectsLeft = 5) => {
        if (src == undefined || src.length == 0) {
            console.log(`下载图片src':${src} '非法,跳过下载,继续下一个`);
            myEmitter.emit(this.download_onepage_event, "fail", no);
            return;
        }
        // Report at most once per attempt, whichever of the stream events fires first.
        let reported = false;
        const report = (status) => {
            if (reported) return;
            reported = true;
            myEmitter.emit(this.download_onepage_event, status, no);
        };
        const onError = (err) => {
            console.log("download_onepage_event:failed" + err.message);
            report("fail");
        };
        setTimeout(() => {
            const secure = src.startsWith("https");
            if (!secure && !src.startsWith("http")) {
                console.log("下载:" + filename + " 失败,不支持的URL:" + src);
                report("fail");
                return;
            }
            const client = secure ? https : http;
            const scheme = secure ? "https" : "http";
            const userAgent = userAgents[Math.floor(Math.random() * userAgents.length)];
            try {
                const req = client.get(src, {headers: {'User-Agent': userAgent}}, (res) => {
                    res.on('error', onError);
                    if (res.statusCode == 200) {
                        // Create the file only for a real image response.
                        const writer = fs.createWriteStream(filename);
                        writer.on('error', onError);
                        writer.on('finish', () => {
                            console.log(new Date().toLocaleString() + ",完成下载:" + filename);
                            report("ok");
                        });
                        res.pipe(writer);
                        return;
                    }
                    res.resume(); // discard the body so the socket is released
                    const location = res.headers.location;
                    const isRedirect = res.statusCode == 301 || res.statusCode == 302;
                    if (isRedirect && location && redirectsLeft > 0) {
                        console.log("未完成下载:" + filename + "," + scheme + "返回值:" + res.statusCode);
                        let target;
                        try {
                            target = new URL(location, src).href; // Location may be relative
                        } catch (err) {
                            onError(err);
                            return;
                        }
                        console.log("正在重新跳转正确的URL进行下载:" + target);
                        reported = true; // the follow-up attempt reports the outcome
                        this.calldownload(target, no, filename, 0, redirectsLeft - 1);
                    } else {
                        console.log("下载:" + filename + " 失败," + scheme + "返回值:" + res.statusCode);
                        report("fail");
                    }
                });
                req.on('error', onError);
            } catch (err) {
                // e.g. a malformed URL rejected synchronously by http(s).get
                onError(err);
            }
        }, Math.random() * delaytime);
    };

    /**
     * Download one image to <savedir>/<title>/<pno><ext>, unless a local file
     * larger than 1024 bytes already exists.
     * @param src   image URL
     * @param title sanitised title, used as directory name
     * @param no    index of the image on the page
     * @param pno   page number, used as file name
     */
    downloadfile = (src, title, no, pno) => {
        try {
            console.log("src:" + src);
            const dirpath = path.join(this.savedir, title).toString();
            fs.mkdirSync(dirpath, {recursive: true});
            const filename = path.join(this.savedir, title, pno + path.extname(src));
            if (fs.existsSync(filename) && fs.statSync(filename).size > 1024) {
                console.log("本地文件:" + filename + "已经存在,系统跳过下载");
                myEmitter.emit(this.download_onepage_event, "ingore", no);
                return;
            }
            console.log(new Date().toLocaleDateString() + ",正在下载:" + filename);
            this.calldownload(src, no, filename, 100);
        } catch (e) {
            console.log(e);
            myEmitter.emit(this.download_onepage_event, "ingore", no);
        }
    };

    /**
     * Register the event chain (html -> images -> per-image completion) and
     * start crawling at this.nextpage.
     */
    startSpider = () => {
        myEmitter.on("html", (html, pno) => {
            this.getTupianData(html, pno);
        });
        myEmitter.on("images", (data, pno) => {
            this.downloadphoto(data, pno);
        });
        this.downloaded_imagepage_count = 0;
        this.data = [];
        myEmitter.on(this.download_onepage_event, (status, pno) => {
            console.log("download_onepage_event=>status:" + status);
            this.downloaded_imagepage_count++;
            if (this.downloaded_imagepage_count < this.data.length) return;
            console.log("某单页图片数据抓取完毕!");
            this.downloaded_imagepage_count = 0;
            this.data = [];
            this.nextpage++;
            if (this.nextpage <= this.lastpage) {
                console.log("开启第" + this.nextpage + "页面数据抓取。。。。。。。。。。。。。。。");
                this.spiderpage(this.nextpage);
                return;
            }
            console.log("所有页面图片数据抓取完毕!");
            myEmitter.emit("download_allpage_event", "ok");
            this.data = [];
            myEmitter.removeAllListeners("html");
            myEmitter.removeAllListeners("images");
            // completion marker
            fs.writeFileSync('save.txt', "ok");
        });
        console.log("开启第" + this.nextpage + "页面数据抓取。。。。。。。。。。。。。。。");
        this.spiderpage(this.nextpage);
    };

    /**
     * Fetch one page: page 1 is the start URL, page N is "<start without .html>_N.html".
     * @param pageno page number
     */
    spiderpage = (pageno) => {
        let url = '';
        if (pageno === 1) {
            url = this.starturl;
        } else {
            url = this.starturl.substring(0, this.starturl.length - 5) + "_" + pageno + ".html";
        }
        console.log("url:" + url);
        this.spidermeinvtupian(url, pageno);
    };
}

/**
 * Fetch the text content of a URL with the request module.
 * @param url        page address to fetch
 * @param event_name event emitted on myEmitter with the decoded HTML ('' on failure)
 */
function get_html_by_request(url, event_name = 'get_html') {
    const userAgent = userAgents[Math.floor(Math.random() * userAgents.length)];
    request({
        url: url,
        // request has no `UserAgent` option; the header has to be set explicitly
        headers: {'User-Agent': userAgent},
        encoding: null, // keep the body as a Buffer so iconv can decode it
        strictSSL: true
    }, function (error, response, body) {
        if (!error && response.statusCode == 200) {
            const html = iconv.decode(body, 'gbk').toString(); // decode the GBK/GB2312 page
            myEmitter.emit(event_name, html);
        } else {
            // `response` is undefined when the request itself failed
            const reason = error ? error.message : response.statusCode;
            console.log("获取 " + url + " 失败:" + reason);
            // listeners always get an answer; '' signals the failure
            myEmitter.emit(event_name, '');
        }
    });
}

/**
 * Fetch the HTML content of a URL through PhantomJS.
 * @param url
 * @returns {Promise<string|*>}
 */
async function get_html_from_url_by_phantom(url) {
    const instance = await phantom.create();
    try {
        const page = await instance.createPage();
        await page.on('onResourceRequested', function (requestData) {
            console.info('Requesting', requestData.url);
        });
        // Random User-Agent per call.
        const userAgent = userAgents[Math.floor(Math.random() * userAgents.length)];
        // warn: Using page.settings = ...; is not supported. Use page.property('settings', ...) instead.
        await page.property('settings', {
            javascriptEnabled: true,
            loadImages: true,
            userAgent: userAgent
        });
        const status = await page.open(url);
        if (status !== 'success') {
            // The content is still returned, as before; the status is only reported.
            console.info('page.open status:', status);
        }
        return await page.property('content');
    } finally {
        // Always shut the PhantomJS process down, including when a step above throws.
        await instance.exit();
    }
}

/**
 * Extract the gallery title and the total page count from page HTML.
 * @param html
 * @returns {{title: string, count: number}}
 */
function getPageinfo(html) {
    // Returns {title, count}: title is the text of the first matching <h1>,
    // count is the numeric `pageinfo` attribute of #pageinfo, or -1 when the
    // element or attribute is missing or not a number.
    const $ = cheerio.load(html);
    const headings = $('div.warp.mar.oh > div.warp.oh > h1').toArray();
    const title = $(headings[0]).text();
    const infoNodes = $('#pageinfo').toArray();
    const rawCount = infoNodes.length == 0 ? '-1' : $(infoNodes[0]).attr('pageinfo');
    const parsed = Number(rawCount);
    // Number(undefined) is NaN; callers test `count <= 0`, which NaN would never satisfy.
    const count = Number.isFinite(parsed) ? parsed : -1;
    return {'title': title, 'count': count};
}
//---------------------------------------------------------------------------
/**
 * Image downloader for a page without pagination: extracts every image of
 * one HTML document and saves them as <savedir>/<index><ext>.
 */
class SpiderOnePageBuff {
    /**
     * @param _html       HTML text of the page
     * @param _event_name event emitted on myEmitter once per handled image (status, index)
     * @param _save_dir   target directory; created if missing
     */
    constructor(_html, _event_name, _save_dir) {
        this.savedir = _save_dir;
        // mkdirSync is synchronous and takes no callback; it throws on failure.
        fs.mkdirSync(this.savedir, {recursive: true});
        this.clsname = 'SpiderOnePageBuff=>';
        this.downloaded_one_image = 'downloaded_one_image';
        this.html = _html;
        this.data = [];
        this.imgs = [];
        this.event_name = _event_name;
        /**
         * Number of images handled so far (any outcome).
         * @type {number}
         */
        this.process_event_finish_count = 0;
    }

    /**
     * Download one image after a random delay and report the outcome
     * ("ok" / "fail") on this.event_name, the event start_spider listens to.
     * @param src           image URL
     * @param no            index of the image on the page
     * @param filename      target file
     * @param delaytime     upper bound of the random delay in milliseconds
     * @param redirectsLeft remaining 301/302 hops that will be followed
     */
    calldownload = (src, no, filename, delaytime = 0, redirectsLeft = 5) => {
        // Report at most once per attempt, whichever of the stream events fires first.
        let reported = false;
        const report = (status) => {
            if (reported) return;
            reported = true;
            myEmitter.emit(this.event_name, status, no);
        };
        const onError = (err) => {
            console.log(this.clsname + "download failed:" + err.message);
            report("fail");
        };
        if (src == undefined || src.length == 0) {
            console.log(this.clsname + `下载图片src':${src} '非法,跳过下载,继续下一个`);
            if (this.data[no]) this.data[no].flag = true;
            // Still counts as handled, otherwise the completion counter never fills up.
            report("fail");
            return;
        }
        setTimeout(() => {
            const secure = src.startsWith("https");
            if (!secure && !src.startsWith("http")) {
                console.log(this.clsname + "下载:" + filename + " 失败,不支持的URL:" + src);
                report("fail");
                return;
            }
            const client = secure ? https : http;
            const scheme = secure ? "https" : "http";
            const userAgent = userAgents[Math.floor(Math.random() * userAgents.length)];
            try {
                const req = client.get(src, {headers: {'User-Agent': userAgent}}, (res) => {
                    res.on('error', onError);
                    if (res.statusCode == 200) {
                        // Create the file only for a real image response.
                        const writer = fs.createWriteStream(filename);
                        writer.on('error', onError);
                        writer.on('finish', () => {
                            console.log(this.clsname + new Date().toLocaleString() + ",完成下载:" + filename);
                            report("ok");
                        });
                        res.pipe(writer);
                        return;
                    }
                    res.resume(); // discard the body so the socket is released
                    const location = res.headers.location;
                    const isRedirect = res.statusCode == 301 || res.statusCode == 302;
                    if (isRedirect && location && redirectsLeft > 0) {
                        console.log(this.clsname + "未完成下载:" + filename + "," + scheme + "返回值:" + res.statusCode);
                        let target;
                        try {
                            target = new URL(location, src).href; // Location may be relative
                        } catch (err) {
                            onError(err);
                            return;
                        }
                        console.log(this.clsname + "正在重新跳转正确的URL进行下载:" + target);
                        reported = true; // the follow-up attempt reports the outcome
                        this.calldownload(target, no, filename, 0, redirectsLeft - 1);
                    } else {
                        console.log(this.clsname + "下载:" + filename + " 失败," + scheme + "返回值:" + res.statusCode);
                        report("fail");
                    }
                });
                req.on('error', onError);
            } catch (err) {
                // e.g. a malformed URL rejected synchronously by http(s).get
                onError(err);
            }
        }, Math.random() * delaytime);
    };

    /**
     * Make a title usable as a file name: every / \ < > | becomes '_'.
     * @param _title raw title; null/undefined is treated as ''
     * @returns {string}
     */
    preprocess_title(_title) {
        const title = _title == null ? '' : String(_title);
        return title.replace(/[\\/<>|]/g, '_');
    }

    /**
     * Extract the images of this.html into this.imgs / this.data and, if any
     * were found, emit them on myEmitter.
     * @param event_name event to emit (default 'get_one_image')
     */
    spider_one_image = (event_name = 'get_one_image') => {
        const $ = cheerio.load(this.html);
        // The last page of a gallery uses a non-clickable image, hence two selectors per template.
        for (let i = 0; i < selector_temple.length; i++) {
            this.imgs = $(selector_temple[i]['normal']).toArray();
            console.log("selector:" + selector_temple[i]['normal']);
            if (this.imgs.length > 0) break;
            this.imgs = $(selector_temple[i]['fix']).toArray();
            console.log("selector:" + selector_temple[i]['fix']);
            if (this.imgs.length > 0) break;
        }
        console.log("spider_one_image=>total page1:" + this.imgs.length);
        for (const img of this.imgs) {
            const src = $(img).attr('src');
            const title = this.preprocess_title($(img).attr("alt"));
            this.data.push({src, title});
        }
        if (this.imgs.length > 0) {
            myEmitter.emit(event_name, this.imgs);
        }
    };

    /**
     * Entry point: register the listeners, then extract and download all images.
     * Emits "download_allpage_event" once every image has been handled.
     */
    start_spider = () => {
        this.process_event_finish_count = 0;
        myEmitter.on(this.event_name, (status, no) => {
            console.log(this.clsname + "status:" + status);
            console.log("this.event_name:" + this.event_name);
            this.process_event_finish_count++;
            if (this.process_event_finish_count >= this.imgs.length) {
                // every image handled (successfully or not): notify the caller
                this.process_event_finish_count = 0;
                this.data = [];
                myEmitter.emit("download_allpage_event", "ok");
            }
        });
        myEmitter.on("get_one_image", data => {
            // Snapshot: the completion listener above replaces this.data.
            const items = this.data;
            for (let i = 0; i < items.length; i++) {
                const filename = path.join(this.savedir, (i + 1) + path.extname(items[i].src || ''));
                console.log(this.clsname + new Date().toLocaleDateString() + ",正在下载:" + filename);
                // Skip files that already exist locally (larger than 1024 bytes) ...
                if (fs.existsSync(filename) && fs.statSync(filename).size > 1024) {
                    console.log(this.clsname + "本地文件:" + filename + "已经存在,系统跳过下载");
                    myEmitter.emit(this.event_name, "ingore", i);
                    // ... and carry on with the next image instead of leaving the loop.
                    continue;
                }
                this.calldownload(items[i].src, i, filename, 3000);
            }
        });
        this.spider_one_image();
    };
}
// entry URL of the page to crawl
//!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
// let url = 'https://www.2717.com/beautiful/zhuomianbeijing/2013/4499.html';
/**
 * Crawl one gallery: fetch the start page, read its title and page count,
 * then run the single-page downloader (count <= 0) or the paginated crawler.
 * @param url  URL of the gallery's first page
 * @param type optional folder name below imgs/; when omitted the page title is used
 */
function main(url = 'https://www.2717.com/word/dongwushijie/2018/313620.html', type) {
    let pagecount = 0;
    let title = '';
    let html_buff = '';
    // Same meaning as the former `arguments.length <= 1` check, without `arguments`.
    const hasType = type !== undefined;

    // Called when all images of all pages have been handled.
    myEmitter.on("download_allpage_event", status => {
        // nothing to do yet
    });

    // way 1: page fetched through the request module
    myEmitter.on('get_html', html => {
        const data = getPageinfo(html);
        pagecount = data['count'];
        title = data['title'];
        html_buff = html;
        console.log(title, pagecount);
        if (pagecount <= 0) {
            // No pagination info: every image is on this one page.
            myEmitter.on("main_download_one_image", status => {
                console.log("下载单个图片完成!!!=状态" + status);
                // completion marker
                fs.writeFileSync('save.txt', "ok");
            });
            const savedir = hasType
                ? path.join('imgs', type, title).toString()
                : path.join('imgs', title).toString();
            const spiderbuff = new SpiderOnePageBuff(html_buff, "main_download_one_image", savedir);
            spiderbuff.start_spider();
        } else {
            // Several pages, one image per page.
            const typestr = hasType ? type : title;
            const spider = new Spider2717(
                url,
                typestr,
                1,
                pagecount
            );
            spider.startSpider();
        }
    });
    // Trigger the fetch; the 'get_html' listener above continues from there.
    get_html_by_request(url);
    //end way1
}

/**
 * Main call:
 * only the URL of the gallery's first page needs to be specified.
 */
/*
性感红唇美女暗黑哥特风高清壁纸图片 https://www.2717.com/beautiful/zhuomianbeijing/2019/314110.html
清新浪漫的蓝天白云纯美风景图片高清壁纸下载 https://www.2717.com/beautiful/zhuomianbeijing/2019/313774.html
世外桃源田园山水风景图片壁纸下载 https://www.2717.com/beautiful/zhuomianbeijing/2019/313773.html
祖国山河壮丽的自然风景图片壁纸下载 https://www.2717.com/beautiful/zhuomianbeijing/2019/313772.html
上帝视角俯瞰不一样的自然美景图片 https://www.2717.com/beautiful/zhuomianbeijing/2019/313771.html
小巧可爱的七星瓢虫动物图片壁纸 https://www.2717.com/beautiful/zhuomianbeijing/2019/313769.html
雨后如珠似玉的花卉水珠梦幻特写图壁纸片 https://www.2717.com/beautiful/zhuomianbeijing/2019/313768.html
神奇瑰丽的西藏圣象天门风景图片壁纸下载 https://www.2717.com/beautiful/zhuomianbeijing/2019/313767.html
大自然雄伟雪山美景高清壁纸图片素材 https://www.2717.com/beautiful/zhuomianbeijing/2018/313723.html
唯美图文手机背景高清壁纸图片下载 https://www.2717.com/beautiful/zhuomianbeijing/2018/313722.html
甜美可爱的冬日圣诞女孩手机高清壁纸图片 https://www.2717.com/beautiful/zhuomianbeijing/2018/313721.html
联想桌面壁纸高清图片下载 https://www.2717.com/beautiful/zhuomianbeijing/2018/313635.html
香港乐坛天后容祖儿图片桌面壁纸下载 https://www.2717.com/beautiful/zhuomianbeijing/2018/313634.html
刘德华主演电影高清桌面壁纸图片 https://www.2717.com/beautiful/zhuomianbeijing/2018/313608.html
美女明星杨蓉白色吊带性感连衣裙高清壁纸图片 https://www.2717.com/beautiful/zhuomianbeijing/2018/313590.html
死侍双刀耍酷高清壁纸 https://www.2717.com/beautiful/zhuomianbeijing/2018/313572.html
马思纯露肩性感写真高清壁纸图片 https://www.2717.com/beautiful/zhuomianbeijing/2018/313571.html
温馨幸福的韩系情侣高清壁纸图片 https://www.2717.com/beautiful/zhuomianbeijing/2018/313558.html
韩国女神美女IU拼接图片大全分享 https://www.2717.com/beautiful/zhuomianbeijing/2018/313557.html
你和我的倾城时光金瀚高清剧照图片 https://www.2717.com/beautiful/zhuomianbeijing/2018/313556.html
李易峰高清手机壁纸图片下载 https://www.2717.com/beautiful/zhuomianbeijing/2018/313555.html
最新超级可爱的萌娃拼接图片大全 https://www.2717.com/beautiful/zhuomianbeijing/2018/313552.html
偶像练习生陈立农高清壁纸写真图片 https://www.2717.com/beautiful/zhuomianbeijing/2018/313532.html
白敬亭帅气时尚高清壁纸写真图片 https://www.2717.com/beautiful/zhuomianbeijing/2018/313531.html
悲伤逆流成河顾森湘高清壁纸图片 https://www.2717.com/beautiful/zhuomianbeijing/2018/313517.html
可盐可甜的爱豆高清锁屏壁纸图片大全 https://www.2717.com/beautiful/zhuomianbeijing/2018/313505.html
2016年1月日历精选清新护眼壁纸图片5下载 https://www.2717.com/beautiful/zhuomianbeijing/2018/313494.html
奔驰梅赛德斯SLK55汽车壁纸 https://www.2717.com/beautiful/zhuomianbeijing/2018/313489.html
延禧攻略 清宫浮世绘版海报壁纸 https://www.2717.com/beautiful/zhuomianbeijing/2018/313487.html
海洋世界里的动物蓝色图片桌面壁纸1下载 https://www.2717.com/beautiful/zhuomianbeijing/2018/313485.html
OL制服美女美腿丝袜性感图片桌面壁纸 https://www.2717.com/beautiful/zhuomianbeijing/2018/313470.html
飞檐走壁的美女个性壁纸 https://www.2717.com/beautiful/zhuomianbeijing/2018/313466.html */
// Command line: node <script> [start-url] [type]
// Default start page; other pages that were used while testing are kept for reference.
let url = 'https://www.2717.com/beautiful/qichetuku/2015/17388.html';
// url = "https://www.2717.com/ent/meinvtupian/2019/316305.html";
// url = 'https://www.2717.com/beautiful/zhuomianbeijing/2019/314110.html';
// url = 'https://lq.2717.com/kbtp/2018/313409.html';
// url = 'https://lq.2717.com/kbtp/2017/184385.html';
// url = 'https://www.2717.com/beautiful/zhuomianbeijing/2018/313450.html';

// User-supplied arguments (everything after the script path). The binding is not
// called `arguments`, which is not a legal variable name in strict-mode / ES-module code.
const cliArgs = process.argv.slice(2);
if (cliArgs.length > 0) {
    url = cliArgs[0];
}
// Folder name below imgs/; overridden by the second argument.
let type = '美女图片';
if (cliArgs.length > 1) {
    type = cliArgs[1];
}
main(url, type);

 
本次本来想通过继承 events 的事件驱动类(EventEmitter)来写爬虫,但经过测试始终不行,后来只有使用外部 events 实例的 on、emit 方法才通过;但是如下测试代码通过继承 events 又是可以的:

let EventsDemo = require('events');

/**
 * Demo: a class that inherits from the events module's emitter and both
 * registers and fires its own events through `this`.
 */
class MyEvents extends EventsDemo {
    constructor() {
        super();
    }

    /** Log "call A" and emit "aaa" with two arguments. */
    callA() {
        console.log("call A");
        this.emit("aaa", "a", 123);
    }

    /** Log "call B" and emit 'bbb' with three arguments. */
    callB() {
        console.log("call B");
        this.emit('bbb', 'b', 123, 456);
    }

    /**
     * Register listeners for "test", "aaa" and 'bbb' on this instance and fire
     * each event once. Listeners are registered before the matching emit,
     * because emit only reaches listeners that already exist.
     */
    start() {
        // let myevent = new MyEvents();
        this.on("test", (p1, p2, p3) => {
            let msg = '';
            //msg="p1={$p1},p2={$p2},p3={$p3}";
            msg = "p1=" + p1 + "," + "p2=" + p2 + "," + "p3=" + p3;
            console.log(msg);
        });
        this.emit("test", 1, "abc", 3.1415926);
        console.log("==================================================");
        // myevent = new MyEvents();
        this.on("aaa", (p1, p2) => {
            let msg = '';
            msg = "callA:" + "p1=" + p1 + "," + "p2=" + p2;
            console.log(msg);
        });
        this.on('bbb', (p1, p2, p3) => {
            let msg = '';
            msg = "callB:" + "p1=" + p1 + "," + "p2=" + p2 + "," + "p3=" + p3;
            console.log(msg);
        });
        this.callA();
        this.callB();
    }
}

/**
 * Main part of the demo.
 */
//main();
const myevent = new MyEvents();
myevent.start();

这个问题有点诡异,知道的朋友请指教,谢谢。


 

使用nodejs+http(s)+events+cheerio+iconv-lite爬取2717网站图片数据到本地文件夹的更多相关文章

  1. 第一个nodejs爬虫:爬取豆瓣电影图片

    第一个nodejs爬虫:爬取豆瓣电影图片存入本地: 首先在命令行下 npm install request cheerio express -save; 代码: var http = require( ...

  2. nodejs:本地文件夹http服务器http-server

    一.已经安装nodejs的电脑,有一个方便通过http访问本地文件夹.文件夹服务器 static files over HTTP,并不是我们平常说的node那个web服务器哦 二.好处 可以方便实现跨 ...

  3. nodejs下载图片到本地,根据百度图片查找相应的图片,通过nodejs保存到本地文件夹

    根据百度图片查找相应的图片:输入图片关键字,输入图片数量(默认是30条),通过nodejs将批量保存图片到本地文件夹. 代码已上传到github上:代码github的地址 下载后进去back-end: ...

  4. nodejs爬虫笔记(三)---爬取YouTube网站上的视频信息

    思路:通过笔记(二)中代理的设置,已经可以对YouTube的信息进行爬取了,这几天想着爬取网站下的视频信息.通过分析YouTube,发现可以从订阅号入手,先选择几个订阅号,然后爬取订阅号里面的视频分类 ...

  5. 基于nodejs的 本地文件夹http服务器:http-server

    请记住,是文件夹服务器 $ npm install http-server -g $ cd /tmp && http-server 或: $ http-server /tmp

  6. nodejs 将网上的图片下载到本地文件

    var request = require('request'); var fs = require('fs'); var img_src = 'https://www.baidu.com/img/b ...

  7. NodeJs+http+fs+request+cheerio 采集,保存数据,并在网页上展示(构建web服务器)

    目的: 数据采集 写入本地文件备份 构建web服务器 将文件读取到网页中进行展示 目录结构: package.json文件中的内容与上一篇一样:NodeJs+Request+Cheerio 采集数据 ...

  8. 使用nodejs爬取拉勾苏州和上海的.NET职位信息

    最近开始找工作,本人苏州,面了几家都没有结果很是伤心.在拉勾上按照城市苏州关键字.NET来搜索一共才80来个职位,再用薪水一过滤,基本上没几个能投了.再加上最近苏州的房价蹭蹭的长,房贷压力也是非常大, ...

  9. 基于nodejs模拟浏览器post请求爬取json数据

    今天想爬取某网站的后台传来的数据,中间遇到了很多阻碍,花了2个小时才请求到数据,所以我在此总结了一些经验. 首先,放上我所爬取的请求地址http://api.chuchujie.com/api/?v= ...

随机推荐

  1. 改变默认的多选框 checkbox 样式~

    效果图: HTML代码: <label for="Checkbox1" style="display:none;"></label> & ...

  2. 七、Linux_端口、进程

    Linux_端口.进程 1.查看所有端口 netstat -nlutp 2.停掉使用端口的进程,根据进程pid kill 1818 kill -9 1818 # 强制杀掉进程 3.根据进程名杀死进程: ...

  3. linux内核模块编译makefile

    linux内核可加载模块的makefile 在开发linux内核驱动时,免不了要接触到makefile的编写和修改,尽管网上的makefile模板一大堆,做一些简单的修改就能用到自己的项目上,但是,对 ...

  4. SWD烧录/仿真方式

    单片机在烧写/仿真的时候具有一种方式叫做SWD,这种方式只用到两根线SWDIO,SWCLK.一般SWD和JTAG中的JTMS和JTCK共用的.由于线少,所以使用非常方便,但是速率相对较低. 在接线时, ...

  5. jpa之No property buyerOpenId found for type OrderMaster! Did you mean 'buyerOpenid'?

    java.lang.IllegalStateException: Failed to load ApplicationContext at org.springframework.test.conte ...

  6. Vue的参数请求与传递

    Vue不同模板之间的参数传递 页面路由带参数的跳转: 参数接收: Vue向服务器请求资源的两种方式 VUE-RESOURCE 1.Vue.js是数据驱动的,这使得我们并不需要直接操作DOM,如果我们不 ...

  7. discuz数据批量入库接口

    近期在做社区,首选discuz,数据需要用scrapy爬虫批量入库,就写了一个php入库接口. <?php define('PW', 'abc123456');//一定要修改 if($_REQU ...

  8. @getMapping和@postMapping,@RestController 区别

    @getMapping和@postMapping,@RestController   @RequestMapping   和  @GetMapping @PostMapping 区别 @GetMapp ...

  9. 浅谈C++编译原理 ------ C++编译器与链接器工作原理

    原文:https://blog.csdn.net/zyh821351004/article/details/46425823 第一篇:      首先是预编译,这一步可以粗略的认为只做了一件事情,那就 ...

  10. noip初赛试题

    链接: https://pan.baidu.com/s/1yoOMIUqMRBnBUPprC3o6HQ&shfl=shareset 提取码: m8ns 复制这段内容后打开百度网盘手机App,操 ...