【JAVA】我的爬虫

简介：一个不太智能的爬虫，效率较低，仅供自娱自乐；使用前需要先观察目标网站的页面结构，再修改相关的正则表达式来获取自己想要的数据。

环境：需要 Apache HttpClient 相关的 jar 包，可以从下面的链接下载：

https://download.csdn.net/download/the_fool_/10046597

重新整理jar包：

<dependency>

    <groupId>org.apache.httpcomponents</groupId>

    <artifactId>httpclient</artifactId>

    <version>4.5</version>

</dependency>

工具类1，获取某个页面HTML代码

package com.zzt.spider;

import java.io.IOException;

import java.util.Random;

import org.apache.http.HttpEntity;

import org.apache.http.HttpResponse;

import org.apache.http.client.ClientProtocolException;

import org.apache.http.client.HttpClient;

import org.apache.http.client.methods.HttpGet;

import org.apache.http.impl.client.DefaultHttpClient;

import org.apache.http.util.EntityUtils;

/**
 * Helper that downloads a single page and returns its body as a string.
 *
 * @author Administrator
 */
public class SpiderChild {

	/**
	 * Charset used to decode a response body whose Content-Type header does
	 * not declare one. Without an explicit default, EntityUtils falls back to
	 * ISO-8859-1, which garbles non-Latin (e.g. Chinese) pages.
	 */
	private static final String DEFAULT_CHARSET = "UTF-8";

	/**
	 * Scratch entry point: prints a random integer between 1 and 8 (inclusive).
	 *
	 * @param args unused
	 */
	public static void main(String[] args) {
		Random r = new Random();
		int nextInt = r.nextInt(8) + 1;
		System.out.println(nextInt);
	}

	/**
	 * Fetches {@code url} with an HTTP GET and returns the response body.
	 *
	 * @param url absolute URL of the page to download
	 * @return the page content, or {@code null} when the response carried no
	 *         entity or the request failed (the failure is printed to stderr)
	 */
	public static String getStringHtml(String url) {
		// A fresh client per call; it is always shut down in the finally block.
		HttpClient client = new DefaultHttpClient();
		String content = null;
		try {
			// Built inside the try so that a malformed URL still releases the client.
			HttpGet getHttp = new HttpGet(url);
			HttpResponse response = client.execute(getHttp);
			HttpEntity entity = response.getEntity();
			if (entity != null) {
				// The charset declared by the server wins; DEFAULT_CHARSET is only the fallback.
				content = EntityUtils.toString(entity, DEFAULT_CHARSET);
			}
		} catch (ClientProtocolException e) {
			e.printStackTrace();
		} catch (IOException e) {
			e.printStackTrace();
		} finally {
			client.getConnectionManager().shutdown();
		}
		return content;
	}

}

主类：

package com.zzt.spider;

import java.io.BufferedReader;

import java.io.File;

import java.io.FileOutputStream;

import java.io.FileWriter;

import java.io.IOException;

import java.io.InputStreamReader;

import java.io.OutputStream;

import java.io.PrintWriter;

import java.net.MalformedURLException;

import java.net.URL;

import java.net.URLConnection;

import java.util.ArrayList;

import java.util.List;

import java.util.regex.Matcher;

import java.util.regex.Pattern;

/**
 * ZX 2017.7.26
 *
 * Site-specific crawler: walks numbered list pages, collects the detail-page
 * links found on each one and saves every detail page to disk. It is tailored
 * to one site layout and is not a general-purpose crawler.
 *
 * @author Administrator
 */
public class BigBugSpiderSu {

	/** Scheme and host that the relative detail links are resolved against. */
	private static final String SITE_ROOT = "http://www.xxxx.cn";

	/**
	 * Matches relative detail links such as {@code /html/20170904/2/7002870.html}.
	 * Compiled once because it is applied to every line of every list page.
	 */
	private static final Pattern DETAIL_LINK = Pattern.compile("/html/[0-9]+/2/[0-9_]+\\.html");

	/** Charset used both to decode list pages and to encode saved pages. */
	private static final String CHARSET = "UTF-8";

	/** Links collected from the list page currently being processed. */
	static List<String> urlUeue = new ArrayList<String>();

	/**
	 * Creates the output directory when it is missing, then starts the crawl.
	 *
	 * @param args unused
	 * @throws Exception declared for compatibility with existing callers
	 */
	public static void main(String[] args) throws Exception {
		File file = new File("E:\\htmlfile");
		if (!file.exists()) {
			boolean createNewFile = file.mkdir();
			System.out.println(createNewFile);
		}
		crawl();
	}

	/**
	 * Visits list pages {@code 0.html} through {@code 30000.html}, downloads
	 * every detail page linked from each of them and writes it to the output
	 * directory. A failure on one page is printed and the crawl moves on; an
	 * interrupt stops the crawl.
	 */
	public static void crawl() {
		System.out.println("begin+++++++++++++++++++++++++++++++++++++++++++");
		String url = "http://www.xxxx.cn/html/";
		for (int i = 0; i <= 30000; i++) {
			String URLS = url + i + ".html";
			System.out.println("共30000页，当前=" + i + ".html");
			try {
				spider(URLS);
				for (String u : urlUeue) {
					System.out.println("页面" + u);
					String stringHtml = SpiderChild.getStringHtml(u);
					System.out.println(u);
					try {
						// Politeness delay between detail-page requests.
						Thread.sleep(1000);
					} catch (InterruptedException e) {
						// Keep the interrupt visible to the caller and stop crawling.
						Thread.currentThread().interrupt();
						return;
					}
					if (stringHtml == null) {
						// Download failed (already reported by SpiderChild); nothing to save.
						continue;
					}
					try {
						writetoFile(stringHtml, fileNameFor(u));
					} catch (Exception e) {
						e.printStackTrace();
					}
				}
			} catch (Exception e) {
				e.printStackTrace();
			} finally {
				// Always reset the queue so links from a failed page are not retried forever.
				urlUeue.clear();
			}
		}
		System.out.println("end+++++++++++++++++++++++++++++++++++++++++++");
		System.out.println("end+++++++++++++++++++++++++++++++++++++++++++");
		System.out.println("end+++++++++++++++++++++++++++++++++++++++++++");
	}

	/**
	 * Derives the local file name for a downloaded page. URLs carrying a query
	 * value keep the historical naming (everything from the first '=' with '='
	 * replaced by 'a', plus ".html"); other URLs use their last path segment.
	 */
	private static String fileNameFor(String u) {
		int eq = u.indexOf('=');
		if (eq >= 0) {
			return (u.substring(eq) + ".html").replace("=", "a");
		}
		String lastSegment = u.substring(u.lastIndexOf('/') + 1);
		return lastSegment.length() == 0 ? "index.html" : lastSegment;
	}

	/**
	 * Downloads one list page and appends every detail link found on it to
	 * {@link #urlUeue}. I/O failures are printed and leave the queue as it was.
	 *
	 * @param URLS absolute URL of the list page
	 * @throws Exception declared for compatibility; I/O errors are handled here
	 */
	private static void spider(String URLS) throws Exception {
		BufferedReader br = null;
		PrintWriter pw = null;
		try {
			URL url = new URL(URLS);
			URLConnection urlconn = url.openConnection();
			urlconn.setRequestProperty("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8");
			// No Accept-Encoding header: URLConnection does not decompress gzip/deflate
			// bodies, so asking for them would make the line-based parsing read garbage.
			urlconn.setRequestProperty("Accept-Language", "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3");
			urlconn.setRequestProperty("Connection", "keep-alive");
			urlconn.setRequestProperty("Cookie", "__jsluid=053c2fe045bbf28d2215a0aa0aa713e5; Hm_lvt_2e64cf4f6ff9f8ccbe097650c83d719e=1502258037,1504571969; Hm_lpvt_2e64cf4f6ff9f8ccbe097650c83d719e=1504574596; sYQDUGqqzHpid=page_0; sYQDUGqqzHtid=tab_0; PHPSESSID=7ktaqicdremii959o4d0p2rgm6; __jsl_clearance=1504575799.118|0|cwzSt6rKCXJZrf5ZOVGhco1TpWw%3");
			urlconn.setRequestProperty("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:47.0) Gecko/20100101 Firefox/47.0");
			urlconn.setRequestProperty("Host", "www.ajxxgk.jcy.cn");
			urlconn.setRequestProperty("Content-Type", "text/html; charset=UTF-8");
			urlconn.setRequestProperty("Referer", "http://www.ajxxgk.jcy.cn/html/zjxflws/2.html");
			// NOTE(review): this file is (re)created on every call but nothing is ever
			// written to it; kept so the on-disk side effect stays the same. Confirm
			// whether the collected links were meant to be logged here.
			pw = new PrintWriter(new FileWriter("e:/url.txt"), true);
			br = new BufferedReader(new InputStreamReader(urlconn.getInputStream(), CHARSET));
			String buf = null;
			while ((buf = br.readLine()) != null) {
				System.out.println(buf);
				Matcher buf_m = DETAIL_LINK.matcher(buf);
				while (buf_m.find()) {
					// The match already starts with '/', so the root has no trailing slash.
					urlUeue.add(SITE_ROOT + buf_m.group());
				}
			}
		} catch (MalformedURLException e) {
			e.printStackTrace();
		} catch (IOException e) {
			e.printStackTrace();
		} finally {
			// Either resource may still be null when opening failed part-way.
			if (br != null) {
				try {
					br.close();
				} catch (IOException e) {
					e.printStackTrace();
				}
			}
			if (pw != null) {
				pw.close();
			}
		}
	}

	/**
	 * Writes {@code context} to {@code E:\htmlfile\<fileName>} encoded as UTF-8.
	 * Best effort: I/O failures are printed, not thrown. A {@code null} content
	 * is reported and skipped so that no empty file is left behind.
	 *
	 * @param context  page content to store
	 * @param fileName name of the file inside the output directory
	 * @throws Exception declared for compatibility with existing callers
	 */
	public static void writetoFile(String context, String fileName) throws Exception {
		if (context == null) {
			System.err.println("writetoFile: no content for " + fileName + ", skipped");
			return;
		}
		File file = new File("E:" + File.separator + "htmlfile" + File.separator + fileName);
		OutputStream out = null;
		try {
			out = new FileOutputStream(file);
			// Explicit charset so the saved bytes do not depend on the platform default.
			out.write(context.getBytes(CHARSET));
		} catch (IOException e) {
			e.printStackTrace();
		} finally {
			if (out != null) {
				try {
					out.close();
				} catch (IOException e) {
					e.printStackTrace();
				}
			}
		}
	}

}

//class Dog implements Runnable{

//	public  List<String> urlUeue;

//	@Override

//	public void run() {

//		for(String u:urlUeue){

//

//			String stringHtml = SpiderChild.getStringHtml(u);

//			String fileName=u.substring(u.indexOf("xiangqing-"));

//			System.out.println("fileName"+fileName);

//			writetoFile(stringHtml,fileName);

//		}

//

//	}

//	 public  void writetoFile(String context,String fileName){

//	        // 构建指定文件

//	        File file = new File("E:" + File.separator + "htmlfile"+File.separator+fileName);

//	        OutputStream out = null;

//	        try {

//	            // 根据文件创建文件的输出流

//	            out = new FileOutputStream(file);

//	            // 把内容转换成字节数组

//	            byte[] data = context.getBytes();

//	            // 向文件写入内容

//	            out.write(data);

//	        } catch (Exception e) {

//	            e.printStackTrace();

//	        } finally {

//	            try {

//	                // 关闭输出流

//	                out.close();

//	            } catch (Exception e) {

//	                e.printStackTrace();

//	            }

//	        }

//	    }

//	public Dog(List<String> urlUeue) {

//		this.urlUeue = urlUeue;

//	}

//	public Dog() {

//		super();

//	}

//

//}

【JAVA】我的爬虫的更多相关文章

Java豆瓣电影爬虫——抓取电影详情和电影短评数据
一直想做个这样的爬虫:定制自己的种子,爬取想要的数据,做点力所能及的小分析.正好,这段时间宝宝出生,一边陪宝宝和宝妈,一边把自己做的这个豆瓣电影爬虫的数据采集部分跑起来.现在做一个概要的介绍和演示. ...
Java开发搜索引擎爬虫
package com.peidon.html; import java.io.BufferedReader; import java.io.File; import java.io.FileOutp ...
Java之网络爬虫WebCollector2.1.2+selenium2.44+phantomjs2.1.1
Java之网络爬虫WebCollector2.1.2+selenium2.44+phantomjs2.1.1 一.简介版本匹配: WebCollector2.12 + selenium2.44.0 ...
java简单web爬虫(网页图片)
java简单web爬虫(网页图片)效果,执行main()方法后图片就下载道C盘的res文件夹中.没有的话创建一个文件夹代码里的常量根据自己的需求修改,代码附到下面. package com.sinit ...
【网络爬虫】【java】微博爬虫（五）：防止爬虫被墙的几个技巧（总结篇）
爬虫的目的就是大规模地.长时间地获取数据,跟我们正常浏览器获取数据相比,虽然机理相差不大,但总是一个IP去爬网站,大规模集中对服务器访问,时间一长就有可能被拒绝.关于爬虫长时间爬取数据,可能会要求验证 ...
【网络爬虫】【java】微博爬虫（一）：小试牛刀——网易微博爬虫（自定义关键字爬取微博数据）（附软件源码）
一.写在前面 (本专栏分为"java版微博爬虫"和"python版网络爬虫"两个项目,系列里所有文章将基于这两个项目讲解,项目完整源码已经整理到我的Github ...
Java学习-058-Jsoup爬虫获取中国所有的三级行政区划数据（三），处理二级编码缺失
通过查看数据可知,直辖市或者某些三级行政区域没有对应的二级区域,为方便后续的地址使用,可自定义缺失的二级地址. 如下示例自定义的二级行政区域的名称为一级区域的名称,对应的源码如下所示: 将此段源码添加 ...
Java豆瓣电影爬虫——小爬虫成长记（附源码）
以前也用过爬虫,比如使用nutch爬取指定种子,基于爬到的数据做搜索,还大致看过一些源码.当然,nutch对于爬虫考虑的是十分全面和细致的.每当看到屏幕上唰唰过去的爬取到的网页信息以及处理信息的时候, ...
Java写的爬虫的基本程序
这是一个web搜索的基本程序,从命令行输入搜索条件(起始的URL.处理url的最大数.要搜索的字符串),它就会逐个对Internet上的URL进行实时搜索,查找并输出匹配搜索条件的页面. 这个程序的原 ...
【转】零基础写Java知乎爬虫之进阶篇
转自:脚本之家说到爬虫,使用Java本身自带的URLConnection可以实现一些基本的抓取页面的功能,但是对于一些比较高级的功能,比如重定向的处理,HTML标记的去除,仅仅使用URLConnec ...

随机推荐

WPF 实现跑马灯效果的Label控件，数据绑定方式实现
原文:WPF 实现跑马灯效果的Label控件,数据绑定方式实现项目中需要使用数据绑定的方式实现跑马灯效果的Label,故重构了Label控件:具体代码如下 using System; using S ...
构建自己的PHP框架（Twig模板引擎）
完整项目地址:https://github.com/Evai/Aier Twig 模板引擎模版引擎 twig 的模板就是普通的文本文件,也不需要特别的扩展名,.html .htm .twig 都可以 ...
hdu 4035 可能性DP 成都网络游戏
http://acm.hdu.edu.cn/showproblem.php?pid=4035 获得: 1.首先推断是不是树.事实上,所有的感觉身影,既看边数==算-1是不成立 2.有时候,我告诉孩子来 ...
潜移默化学会WPF(转载篇)--屏幕显示Label，鼠标移上去变成textBox
原文:潜移默化学会WPF(转载篇)--屏幕显示Label,鼠标移上去变成textBox <Window x:Class="WpfApplication1.Window1" x ...
ef core 数据类型 && 表字段名设置
HasColumnType HasColumnType是指定字段类型 [Column(TypeName = "decimal(18, 2)")] public decimal Mo ...
WPF 多路绑定
using System;using System.Collections.Generic;using System.Linq;using System.Text;using System.Threa ...
JS的innerText和innerHTML
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/ ...
WPF 验证错误模板
<Window x:Class="BindingExam.MainWindow" xmlns="http://schemas.microsoft.co ...
Visual C++ 编译器自动假定带 .C 扩展名的文件是 C 文件而不是 C++ 文件，并且拒绝 C++ 语法和关键字（c语言只能在大括号最前面申明变量）
今天在编译OpenGL红宝书附带源码中的light.c文件时遇到一个诡异的问题: 如图light .c,在不做任何修改的情况编译OK.然而只要在某些地方写了可执行代码,则会无法通过编译器编译! (这几 ...
Windows NT WinLogon Notify
在NT系列Windows操作系统中,恶意软件可以通过关联Winlogon特定的事件来使自身被启动,如Lock,Logoff,Logon,Shutdown,StartScreenSaver,StartS ...

【JAVA】我的爬虫

【JAVA】我的爬虫的更多相关文章

随机推荐

热门专题