







通过程序伪装成浏览器发送请求的时候,就需要多关注Request Headers里面的信息,一些需要登录的站点也同样需要关注这些信息。Response里面的信息就是服务器返回的内容,这里仅对文本信息进行处理,对图片、音频、视频等信息不做介绍。





*@Description: 获取网页信息基类
package com.lulei.crawl; import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.Map.Entry; import org.apache.commons.httpclient.Header;
import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpException;
import org.apache.commons.httpclient.HttpMethod;
import org.apache.commons.httpclient.HttpStatus;
import org.apache.commons.httpclient.methods.GetMethod;
import org.apache.commons.httpclient.methods.PostMethod;
import org.apache.log4j.Logger; import com.lulei.util.CharsetUtil; public abstract class CrawlBase {
private static Logger log = Logger.getLogger(CrawlBase.class); //链接源码
private String pageSourceCode = "";
private Header[] responseHeaders = null;
private static int connectTimeout = 3500;
private static int readTimeout = 3500;
private static int maxConnectTimes = 3;
private static String charsetName = "iso-8859-1";
private static HttpClient httpClient = new HttpClient(); static {
} /**
* @param urlStr
* @param charsetName
* @param method
* @param params
* @return
* @throws HttpException
* @throws IOException
* @Author: lulei
* @Description: method方式訪问页面
public boolean readPage(String urlStr, String charsetName, String method, HashMap<String, String> params) throws HttpException, IOException {
if ("post".equals(method) || "POST".equals(method)) {
return readPageByPost(urlStr, charsetName, params);
} else {
return readPageByGet(urlStr, charsetName, params);
} /**
* @param urlStr
* @param charsetName
* @param params
* @return 訪问是否成功
* @throws HttpException
* @throws IOException
* @Author: lulei
* @Description: Get方式訪问页面
public boolean readPageByGet(String urlStr, String charsetName, HashMap<String, String> params) throws HttpException, IOException {
GetMethod getMethod = createGetMethod(urlStr, params);
return readPage(getMethod, charsetName, urlStr);
} /**
* @param urlStr
* @param charsetName
* @param params
* @return 訪问是否成功
* @throws HttpException
* @throws IOException
* @Author: lulei
* @Description: Post方式訪问页面
public boolean readPageByPost(String urlStr, String charsetName, HashMap<String, String> params) throws HttpException, IOException{
PostMethod postMethod = createPostMethod(urlStr, params);
return readPage(postMethod, charsetName, urlStr);
} /**
* @param method
* @param defaultCharset
* @param urlStr
* @return 訪问是否成功
* @throws HttpException
* @throws IOException
* @Author: lulei
* @Description: 读取页面信息和头信息
private boolean readPage(HttpMethod method, String defaultCharset, String urlStr) throws HttpException, IOException{
int n = maxConnectTimes;
while (n > 0) {
try {
if (httpClient.executeMethod(method) != HttpStatus.SC_OK){
log.error("can not connect " + urlStr + "\t" + (maxConnectTimes - n + 1) + "\t" + httpClient.executeMethod(method));
} else {
responseHeaders = method.getResponseHeaders();
InputStream inputStream = method.getResponseBodyAsStream();
BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(inputStream, charsetName));
StringBuffer stringBuffer = new StringBuffer();
String lineString = null;
while ((lineString = bufferedReader.readLine()) != null){
pageSourceCode = stringBuffer.toString();
InputStream in =new ByteArrayInputStream(pageSourceCode.getBytes(charsetName));
String charset = CharsetUtil.getStreamCharset(in, defaultCharset);
if ("Big5".equals(charset)) {
charset = "gbk";
if (!charsetName.toLowerCase().equals(charset.toLowerCase())) {
pageSourceCode = new String(pageSourceCode.getBytes(charsetName), charset);
return true;
} catch (Exception e) {
System.out.println(urlStr + " -- can't connect " + (maxConnectTimes - n + 1));
return false;
} /**
* @param urlStr
* @param params
* @return GetMethod
* @Author: lulei
* @Description: 设置get请求參数
private GetMethod createGetMethod(String urlStr, HashMap<String, String> params){
GetMethod getMethod = new GetMethod(urlStr);
if (params == null){
return getMethod;
Iterator iter = params.entrySet().iterator();
while (iter.hasNext()) {
Map.Entry entry = (Map.Entry) iter.next();
String key = (String) entry.getKey();
String val = (String) entry.getValue();
getMethod.setRequestHeader(key, val);
return getMethod;
} /**
* @param urlStr
* @param params
* @return PostMethod
* @Author: lulei
* @Description: 设置post请求參数
private PostMethod createPostMethod(String urlStr, HashMap<String, String> params){
PostMethod postMethod = new PostMethod(urlStr);
if (params == null){
return postMethod;
Iterator<Entry<String, String>> iter = params.entrySet().iterator();
while (iter.hasNext()) {
Map.Entry<String, String> entry = iter.next();
String key = (String) entry.getKey();
String val = (String) entry.getValue();
postMethod.setParameter(key, val);
return postMethod;
} /**
* @param urlStr
* @param charsetName
* @return 訪问是否成功
* @throws IOException
* @Author: lulei
* @Description: 不设置不论什么头信息直接訪问网页
public boolean readPageByGet(String urlStr, String charsetName) throws IOException{
return this.readPageByGet(urlStr, charsetName, null);
} /**
* @return String
* @Author: lulei
* @Description: 获取网页源码
public String getPageSourceCode(){
return pageSourceCode;
} /**
* @return Header[]
* @Author: lulei
* @Description: 获取网页返回头信息
public Header[] getHeader(){
return responseHeaders;
} /**
* @param timeout
* @Author: lulei
* @Description: 设置连接超时时间
public void setConnectTimeout(int timeout){
} /**
* @param timeout
* @Author: lulei
* @Description: 设置读取超时时间
public void setReadTimeout(int timeout){
} /**
* @param maxConnectTimes
* @Author: lulei
* @Description: 设置最大訪问次数,链接失败的情况下使用
public static void setMaxConnectTimes(int maxConnectTimes) {
CrawlBase.maxConnectTimes = maxConnectTimes;
} /**
* @param connectTimeout
* @param readTimeout
* @Author: lulei
* @Description: 设置连接超时时间和读取超时时间
public void setTimeout(int connectTimeout, int readTimeout){
} /**
* @param charsetName
* @Author: lulei
* @Description: 设置默认编码方式
public static void setCharsetName(String charsetName) {
CrawlBase.charsetName = charsetName;


*@Description: 获取页面链接地址信息基类
package com.lulei.crawl; import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List; import com.lulei.util.DoRegex; public abstract class CrawlListPageBase extends CrawlBase {
private String pageurl; /**
* @param urlStr
* @param charsetName
* @throws IOException
public CrawlListPageBase(String urlStr, String charsetName) throws IOException{
readPageByGet(urlStr, charsetName);
pageurl = urlStr;
} /**
* @param urlStr
* @param charsetName
* @param method
* @param params
* @throws IOException
public CrawlListPageBase(String urlStr, String charsetName, String method, HashMap<String, String> params) throws IOException{
readPage(urlStr, charsetName, method, params);
pageurl = urlStr;
} /**
* @return List<String>
* @Author: lulei
* @Description: 返回页面上需求的链接地址
public List<String> getPageUrls(){
List<String> pageUrls = new ArrayList<String>();
pageUrls = DoRegex.getArrayList(getPageSourceCode(), getUrlRegexString(), pageurl, getUrlRegexStringNum());
return pageUrls;
} /**
* @return String
* @Author: lulei
* @Description: 返回页面上需求的网址连接的正則表達式
public abstract String getUrlRegexString(); /**
* @return int
* @Author: lulei
* @Description: 正則表達式中要去的字段位置
public abstract int getUrlRegexStringNum();


* @Description: 正则处理工具
package com.lulei.util; import java.io.UnsupportedEncodingException;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern; public class DoRegex { private static String rootUrlRegex = "(http://.*?/)";
private static String currentUrlRegex = "(http://.*/)";
private static String ChRegex = "([\u4e00-\u9fa5]+)"; /**
* @param dealStr
* @param regexStr
* @param splitStr
* @param n
* @return String
* @Author: lulei
* @Description: 正则匹配结果。每条记录用splitStr切割
public static String getString(String dealStr, String regexStr, String splitStr, int n){
String reStr = "";
if (dealStr == null || regexStr == null || n < 1 || dealStr.isEmpty()){
return reStr;
splitStr = (splitStr == null) ? "" : splitStr;
Pattern pattern = Pattern.compile(regexStr, Pattern.CASE_INSENSITIVE | Pattern.DOTALL);
Matcher matcher = pattern.matcher(dealStr);
StringBuffer stringBuffer = new StringBuffer();
while (matcher.find()) {
reStr = stringBuffer.toString();
if (splitStr != "" && reStr.endsWith(splitStr)){
reStr = reStr.substring(0, reStr.length() - splitStr.length());
return reStr;
} /**
* @param dealStr
* @param regexStr
* @param n
* @return String
* @Author: lulei
* @Description: 正则匹配结果,将所有匹配记录组装成字符串
public static String getString(String dealStr, String regexStr, int n){
return getString(dealStr, regexStr, null, n);
} /**
* @param dealStr
* @param regexStr
* @param n
* @return String
* @Author: lulei
* @Description: 正则匹配第一条结果
public static String getFirstString(String dealStr, String regexStr, int n){
if (dealStr == null || regexStr == null || n < 1 || dealStr.isEmpty()){
return "";
Pattern pattern = Pattern.compile(regexStr, Pattern.CASE_INSENSITIVE | Pattern.DOTALL);
Matcher matcher = pattern.matcher(dealStr);
while (matcher.find()) {
return matcher.group(n).trim();
return "";
} /**
* @param dealStr
* @param regexStr
* @param n
* @return ArrayList<String>
* @Author: lulei
* @Description: 正则匹配结果。将匹配结果组装成数组
public static List<String> getList(String dealStr, String regexStr, int n){
List<String> reArrayList = new ArrayList<String>();
if (dealStr == null || regexStr == null || n < 1 || dealStr.isEmpty()){
return reArrayList;
Pattern pattern = Pattern.compile(regexStr, Pattern.CASE_INSENSITIVE | Pattern.DOTALL);
Matcher matcher = pattern.matcher(dealStr);
while (matcher.find()) {
return reArrayList;
} /**
* @param url
* @param currentUrl
* @return String
* @Author: lulei
* @Description: 组装网址,网页的url
private static String getHttpUrl(String url, String currentUrl){
try {
url = encodeUrlCh(url);
} catch (UnsupportedEncodingException e) {
// TODO Auto-generated catch block
if (url.indexOf("http") == 0){
return url;
if (url.indexOf("/") == 0){
return getFirstString(currentUrl, rootUrlRegex, 1) + url.substring(1);
return getFirstString(currentUrl, currentUrlRegex, 1) + url;
} /**
* @param dealStr
* @param regexStr
* @param currentUrl
* @param n
* @return ArrayList<String>
* @Author: lulei
* @Description: 获取和正则匹配的绝对链接地址
public static List<String> getArrayList(String dealStr, String regexStr, String currentUrl, int n){
List<String> reArrayList = new ArrayList<String>();
if (dealStr == null || regexStr == null || n < 1 || dealStr.isEmpty()){
return reArrayList;
Pattern pattern = Pattern.compile(regexStr, Pattern.CASE_INSENSITIVE | Pattern.DOTALL);
Matcher matcher = pattern.matcher(dealStr);
while (matcher.find()) {
reArrayList.add(getHttpUrl(matcher.group(n).trim(), currentUrl));
return reArrayList;
} /**
* @param url
* @return
* @throws UnsupportedEncodingException
* @Author: lulei
* @Description: 将连接地址中的中文进行编码处理
public static String encodeUrlCh (String url) throws UnsupportedEncodingException {
while (true) {
String s = getFirstString(url, ChRegex, 1);
if ("".equals(s)){
return url;
url = url.replaceAll(s, URLEncoder.encode(s, "utf-8"));
} /**
* @param dealStr
* @param regexStr
* @param array 正则位置数组
* @return
* @Author:lulei
* @Description: 获取所有
public static List<String[]> getListArray(String dealStr, String regexStr, int[] array) {
List<String[]> reArrayList = new ArrayList<String[]>();
if (dealStr == null || regexStr == null || array == null) {
return reArrayList;
for (int i = 0; i < array.length; i++) {
if (array[i] < 1) {
return reArrayList;
Pattern pattern = Pattern.compile(regexStr, Pattern.CASE_INSENSITIVE | Pattern.DOTALL);
Matcher matcher = pattern.matcher(dealStr);
while (matcher.find()) {
String[] ss = new String[array.length];
for (int i = 0; i < array.length; i++) {
ss[i] = matcher.group(array[i]).trim();
return reArrayList;
} /**
* @param dealStr
* @param regexStr
* @param array
* @return
* @Author:lulei
* @Description: 获取所有
public static List<String> getStringArray(String dealStr, String regexStr, int[] array) {
List<String> reStringList = new ArrayList<String>();
if (dealStr == null || regexStr == null || array == null) {
return reStringList;
for (int i = 0; i < array.length; i++) {
if (array[i] < 1) {
return reStringList;
Pattern pattern = Pattern.compile(regexStr, Pattern.CASE_INSENSITIVE | Pattern.DOTALL);
Matcher matcher = pattern.matcher(dealStr);
while (matcher.find()) {
StringBuffer sb = new StringBuffer();
for (int i = 0; i < array.length; i++) {
return reStringList;
} /**
* @param dealStr
* @param regexStr
* @param array 正则位置数组
* @return
* @Author:lulei
* @Description: 获取第一个
public static String[] getFirstArray(String dealStr, String regexStr, int[] array) {
if (dealStr == null || regexStr == null || array == null) {
return null;
for (int i = 0; i < array.length; i++) {
if (array[i] < 1) {
return null;
Pattern pattern = Pattern.compile(regexStr, Pattern.CASE_INSENSITIVE | Pattern.DOTALL);
Matcher matcher = pattern.matcher(dealStr);
while (matcher.find()) {
String[] ss = new String[array.length];
for (int i = 0; i < array.length; i++) {
ss[i] = matcher.group(array[i]).trim();
return ss;
return null;


*@Description: 编码方式检測类
package com.lulei.util; import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.nio.charset.Charset; import info.monitorenter.cpdetector.io.ASCIIDetector;
import info.monitorenter.cpdetector.io.CodepageDetectorProxy;
import info.monitorenter.cpdetector.io.JChardetFacade;
import info.monitorenter.cpdetector.io.ParsingDetector;
import info.monitorenter.cpdetector.io.UnicodeDetector; public class CharsetUtil {
private static final CodepageDetectorProxy detector; static {//初始化探測器
detector = CodepageDetectorProxy.getInstance();
detector.add(new ParsingDetector(false));
} /**
* @param url
* @param defaultCharset
* @Author:lulei
* @return 获取文件的编码方式
public static String getStreamCharset (URL url, String defaultCharset) {
if (url == null) {
return defaultCharset;
try {
Charset charset = detector.detectCodepage(url);
if (charset != null) {
return charset.name();
} catch (Exception e1) {
// TODO Auto-generated catch block
return defaultCharset;
} /**
* @param inputStream
* @param defaultCharset
* @return
* @Author:lulei
* @Description: 获取文件流的编码方式
public static String getStreamCharset (InputStream inputStream, String defaultCharset) {
if (inputStream == null) {
return defaultCharset;
int count = 200;
try {
count = inputStream.available();
} catch (IOException e) {
// TODO Auto-generated catch block
try {
Charset charset = detector.detectCodepage(inputStream, count);
if (charset != null) {
return charset.name();
} catch (Exception e1) {
// TODO Auto-generated catch block
return defaultCharset;



1)找到百度新闻更新列表页,如http://news.baidu.com/n?cmd=4&class=civilnews&pn=1&from=tab ,界面如下图所示:


watermark/2/text/aHR0cDovL2Jsb2cuY3Nkbi5uZXQveGlhb2ppbWFubWFu/font/5a6L5L2T/fontsize/400/fill/I0JBQkFCMA==/dissolve/70/gravity/Center" alt="" />


*@Description: 百度新闻滚动列表页,能够获取当前页面上的链接
package com.lulei.crawl.news; import java.io.IOException;
import java.util.HashMap; import com.lulei.crawl.CrawlListPageBase; public class BaiduNewList extends CrawlListPageBase{
private static HashMap<String, String> params; /**
* 加入相关头信息,对请求进行伪装
static {
params = new HashMap<String, String>();
params.put("Referer", "http://www.baidu.com");
params.put("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.125 Safari/537.36");
} public BaiduNewList(String urlStr) throws IOException {
super(urlStr, "utf-8", "get", params);
} @Override
public String getUrlRegexString() {
// TODO Auto-generated method stub
return "• <a href=\"(.*? )\"";
} @Override
public int getUrlRegexStringNum() {
// TODO Auto-generated method stub
return 1;
} /**
* @param args
* @throws IOException
* @Author:lulei
* @Description: 測试用例
public static void main(String[] args) throws IOException {
// TODO Auto-generated method stub
BaiduNewList baidu = new BaiduNewList("http://news.baidu.com/n? cmd=4&class=sportnews&pn=1&from=tab");
for (String s : baidu.getPageUrls()) {



*@Description: 新闻类站点新闻内容
package com.lulei.crawl.news; import java.io.IOException;
import java.util.HashMap; import org.apache.commons.httpclient.HttpException; import com.lulei.crawl.CrawlBase;
import com.lulei.util.DoRegex; public class News extends CrawlBase{
private String url;
private String content;
private String title;
private String type; private static String contentRegex = "<p.*?>(.*?)</p>";
private static String titleRegex = "<title>(.*?)</title>";
private static int maxLength = 300; private static HashMap<String, String> params;
* 加入相关头信息,对请求进行伪装
static {
params = new HashMap<String, String>();
params.put("Referer", "http://www.baidu.com");
params.put("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.125 Safari/537.36");
} /**
* @Author:lulei
* @Description: 默认p标签内的内容为正文。假设正文长度查过设置的最大长度,则截取前半部分
private void setContent() {
String content = DoRegex.getString(getPageSourceCode(), contentRegex, 1);
content = content.replaceAll("\n", "")
.replaceAll("<script.*?/script>", "")
.replaceAll("<style.*?/style>", "")
.replaceAll("<.*?>", "");
this.content = content.length() > maxLength ? content.substring(0, maxLength) : content;
} /**
* @Author:lulei
* @Description: 默认title标签内的内容为标题
private void setTitle() {
this.title = DoRegex.getString(getPageSourceCode(), titleRegex, 1);;
} public News(String url) throws HttpException, IOException {
this.url = url;
readPageByGet(url, "utf-8", params);
} public String getUrl() {
return url;
} public void setUrl(String url) {
this.url = url;
} public String getContent() {
return content;
} public String getTitle() {
return title;
} public String getType() {
return type;
} public void setType(String type) {
this.type = type;
} public static void setMaxLength(int maxLength) {
News.maxLength = maxLength;
} /**
* @param args
* @throws HttpException
* @throws IOException
* @Author:lulei
* @Description: 測试用例
public static void main(String[] args) throws HttpException, IOException {
// TODO Auto-generated method stub
News news = new News("http://we.sportscn.com/viewnews-1634777.html");
} }



watermark/2/text/aHR0cDovL2Jsb2cuY3Nkbi5uZXQveGlhb2ppbWFubWFu/font/5a6L5L2T/fontsize/400/fill/I0JBQkFCMA==/dissolve/70/gravity/Center" alt="" />


package com.lulei.knn.data; import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List; import com.lulei.crawl.news.BaiduNewList;
import com.lulei.crawl.news.News;
import com.lulei.knn.index.KnnIndex;
import com.lulei.knn.index.KnnSearch;
import com.lulei.util.ParseMD5; public class CrawlNews {
private static List<Info> infos;
private static KnnIndex knnIndex = new KnnIndex();
private static KnnSearch knnSearch = new KnnSearch();
private static HashMap<String, Integer> result; static {
infos = new ArrayList<Info>();
infos.add(new Info("http://news.baidu.com/n? cmd=4&class=sportnews&pn=1&from=tab", "体育类"));
infos.add(new Info("http://news.baidu.com/n?cmd=4&class=sportnews&pn=2&from=tab", "体育类"));
infos.add(new Info("http://news.baidu.com/n?cmd=4&class=sportnews&pn=3&from=tab", "体育类")); infos.add(new Info("http://news.baidu.com/n?cmd=4&class=mil&pn=1&sub=0", "军事类"));
infos.add(new Info("http://news.baidu.com/n? cmd=4&class=mil&pn=2&sub=0", "军事类"));
infos.add(new Info("http://news.baidu.com/n?cmd=4&class=mil&pn=3&sub=0", "军事类")); infos.add(new Info("http://news.baidu.com/n?cmd=4&class=finannews&pn=1&sub=0", "財经类"));
infos.add(new Info("http://news.baidu.com/n?cmd=4&class=finannews&pn=2&sub=0", "財经类"));
infos.add(new Info("http://news.baidu.com/n? cmd=4&class=finannews&pn=3&sub=0", "財经类")); infos.add(new Info("http://news.baidu.com/n? cmd=4&class=internet&pn=1&from=tab", "互联网")); infos.add(new Info("http://news.baidu.com/n? cmd=4&class=housenews&pn=1&sub=0", "房产类"));
infos.add(new Info("http://news.baidu.com/n?cmd=4&class=housenews&pn=2&sub=0", "房产类"));
infos.add(new Info("http://news.baidu.com/n?cmd=4&class=housenews&pn=3&sub=0", "房产类")); infos.add(new Info("http://news.baidu.com/n?cmd=4&class=gamenews&pn=1&sub=0", "游戏类"));
infos.add(new Info("http://news.baidu.com/n? cmd=4&class=gamenews&pn=2&sub=0", "游戏类"));
infos.add(new Info("http://news.baidu.com/n?cmd=4&class=gamenews&pn=3&sub=0", "游戏类"));
} /**
*@Description: 抓取网址信息
static class Info{
String url;
String type;
Info(String url, String type) {
this.url = url;
this.type = type;
} /**
* @param info
* @Author:lulei
* @Description: 抓取一个列表页面下的新闻信息
private void crawl(Info info) {
if (info == null) {
try {
BaiduNewList baiduNewList = new BaiduNewList(info.url);
List<String> urls = baiduNewList.getPageUrls();
for (String url : urls) {
News news = new News(url);
NewsBean newBean = new NewsBean();
if (news.getContent() == null || "".equals(news.getContent())) {
result.put("E", 1+result.get("E"));
if (info.type.equals(knnSearch.getType(news.getContent()))) {
result.put("R", 1+result.get("R"));
} else {
result.put("W", 1+result.get("W"));
} catch (Exception e) {
} /**
* @Author:lulei
* @Description: 启动入口
public void run() {
result = new HashMap<String, Integer>();
result.put("R", 0);
result.put("W", 0);
result.put("E", 0);
for (Info info : infos) {
System.out.println(info.url + "------start");
System.out.println(info.url + "------end");
try {
System.out.println("R = " + result.get("R"));
System.out.println("W = " + result.get("W"));
System.out.println("E = " + result.get("E"));
System.out.println("准确度:" + (result.get("R") * 1.0 / (result.get("R") + result.get("W"))));
} catch (IOException e) {
} public static void main(String[] args) {
new CrawlNews().run();



  1. 【java爬虫】---爬虫+基于接口的网络爬虫

    爬虫+基于接口的网络爬虫 上一篇讲了[java爬虫]---爬虫+jsoup轻松爬博客,该方式有个很大的局限性,就是你通过jsoup爬虫只适合爬静态网页,所以只能爬当前页面的所有新闻.如果需要爬一个网站 ...

  2. 基于Thinkphp5+phpQuery 网络爬虫抓取数据接口,统一输出接口数据api

    TP5_Splider 一个基于Thinkphp5+phpQuery 网络爬虫抓取数据接口 统一输出接口数据api.适合正在学习Vue,AngularJs框架学习 开发demo,需要接口并保证接口不跨 ...

  3. 《精通Python网络爬虫》|百度网盘免费下载|Python爬虫实战

    <精通Python网络爬虫>|百度网盘免费下载|Python爬虫实战 提取码:7wr5 内容简介 为什么写这本书 网络爬虫其实很早就出现了,最开始网络爬虫主要应用在各种搜索引擎中.在搜索引 ...

  4. 基于java的网络爬虫框架(实现京东数据的爬取,并将插入数据库)

    原文地址http://blog.csdn.net/qy20115549/article/details/52203722 本文为原创博客,仅供技术学习使用.未经允许,禁止将其复制下来上传到百度文库等平 ...

  5. android基于MVP小说网络爬虫、宝贝社区APP、仿虎扑钉钉应用、滑动阴影效果等源码

    Android精选源码 android宝贝社区app源码 android仿Tinder最漂亮的一个滑动效果 android仿滴滴打车开具发票页,ListView粘性Header Android基于MV ...

  6. 爬虫学习之基于Scrapy的网络爬虫

    ###概述 在上一篇文章<爬虫学习之一个简单的网络爬虫>中我们对爬虫的概念有了一个初步的认识,并且通过Python的一些第三方库很方便的提取了我们想要的内容,但是通常面对工作当作复杂的需求 ...

  7. 基于perl的网络爬虫

    use Mojo::UserAgent; use Bloom::Filter; use Smart::Comments; use DBI; my $dbname = "bbs_url&quo ...

  8. 2019基于python的网络爬虫系列,爬取糗事百科

    **因为糗事百科的URL改变,正则表达式也发生了改变,导致了网上许多的代码不能使用,所以写下了这一篇博客,希望对大家有所帮助,谢谢!** 废话不多说,直接上代码. 为了方便提取数据,我用的是beaut ...

  9. nutch从搜索引擎到网络爬虫

    人物介绍 姓名:DougCutting 个人名望:开发出开源全文检索引擎工具包Lucene. 个人简介/主要荣誉:除了 Lucene,还开发了著名的网络爬虫工具 Nutch,分布式系统基础架构Hado ...


  1. 关于iframe的高度自适应问题(js)

    function SetCwinHeight() { var cwin=document.getElementById("cwin"); if (document.getEleme ...

  2. barrier and Fence

    barrier 管理的是commandbuffer里面 command之间 fence管理的是queue之间 queue和cpu之间的顺序 通过flag比如等待所有面片画完 ------------- ...

  3. Spring IoC Container and Spring Bean Example Tutorial

    Spring Framework is built on the Inversion of Control (IOC) principle. Dependency injection is the t ...

  4. 修改PHP上传文件的大小限制(post)

    在PHP的默认配置情况下,当上传的文件大小超出一定的限制时,我们将得到如下的错误提示信息: Warning: POST Content-Length of 625523488 bytes exceed ...

  5. 转:myeclipse和eclipse的区别和联系,以及版本间的对应关系

    myeclipse和eclipse的区别和联系,以及版本间的对应关系 Eclipse:IBM花了4千万美金来开发这个IDE(Integrated Development Environment).第一 ...

  6. Java过滤器(Filter)与SpringMVC拦截器(Interceptor)之间的关系与区别

    过滤器和拦截器的区别: ①拦截器是基于java的反射机制的,而过滤器是基于函数回调. ②拦截器不依赖与servlet容器,过滤器依赖与servlet容器. ③拦截器只能对action请求起作用,而过滤 ...

  7. Jenkins 无法下载插件的解决办法

    有时候在安装插件时可能会出现下图的问题: 这应该是由于天朝的墙导致的,所以笔者就用了手动安装的方式 到https://wiki.jenkins-ci.org/display/JENKINS/Plugi ...

  8. webstrom 很卡 底下一直走进度条 scanning files to index

    最近工作总会遇到一些问题 先说说webstrom 其中有一次仅仅开了两个项目 电脑风扇就各种轰鸣 各种重启 安装卸载webstrom 都没有作用 好吧 其实解决很简单 选择一个文件夹,右键, Mark ...

  9. Android学习(八) 打开Activity

    在Android中打开窗口有两种方式,第一种是不需要返回值的,第二种是带返回值的. Main.xml文件,程序从这个窗口开始执行. <LinearLayout xmlns:android=&qu ...

  10. vue笔记二

    七.列表渲染 1.示例 <ul id="example-2"> <li v-for="(item, index) in items"> ...