这里从车商网上抓取数据,请保持良好的职业道德,不要将数据用于商业用途。工信部官网有汽车方面的公告目录,那里有最全的 PDF 或 Word 数据;鉴于 Word 和 PDF 解析繁琐且耗时,我暂时用这个网站的数据进行测试。

Spider主要代码:

package tk.mybatis.springboot.util;

import java.io.IOException;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import com.mysql.jdbc.Connection;
import com.mysql.jdbc.PreparedStatement;
import tk.mybatis.springboot.model.AutobatchDirectory;
import tk.mybatis.springboot.service.AutobatchDirectoryService;

/**
 * Crawler for the vehicle announcement batches published on cn357.com.
 *
 * <p>The crawl has three levels: the batch list page, the paginated item lists
 * of every batch, and the detail page of every item. Each detail page is parsed
 * into an {@link AutobatchDirectory} and stored either through plain JDBC
 * ({@link #saveByJdbc(AutobatchDirectory)}) or through an
 * {@link AutobatchDirectoryService}.
 *
 * <p>Original listing page: http://www.cn357.com/notice_list/
 */
public class AutoBatchSpider {

    /** Site root; prepended to relative links found in the crawled pages. */
    public static final String web = "http://www.cn357.com";

    /** Timeout handed to Jsoup for every request, in milliseconds. */
    private static final int timeOut = 30000;

    /** Insert statement used by {@link #saveByJdbc(AutobatchDirectory)}; 48 columns, 48 placeholders. */
    private static final String INSERT_SQL = "insert into autobatch_directory ("
            + "F_ANNOU_TYPE, F_ANNOU_BATCH, F_VEHICLE_BRAND, F_VEHICLE_TYPE,"
            + "F_MAX_MASS, F_TOTAL_MASS, F_WHOLE_MASS, F_FUEL_TYPE, "
            + "F_BLOWOFF_STANDARD, F_AXLE_NUMBER, F_WHEELBASE, F_AXLE_WEIGHT,"
            + "F_SPRING_NUMBER, F_TYRE_NUMBER,F_TYRE_SIZE, F_DEPARTURE_ANGLE, "
            + "F_FREAR_SUSPENSION, F_FRONT_GAUGE, F_BACK_GAUGE, F_VIN_CODE, "
            + "F_VEHICLE_LENGTH, F_VEHICLE_WIDTH, F_VEHICLE_HEIGHT,F_CARGO_LENGTH,"
            + "F_CARGO_WIDTH, F_CARGO_HEIGHT,F_MAX_SPEED, F_MAX_PASSENGER,"
            + "F_CAB_NUMBER, F_STEERING_TYPE, F_TOTAL_MASS_TRAILER,"
            + "F_LOAD_MASS_FACTOR, F_MAX_SEMITRAILER, F_ENTERPRISE_NAME, F_ENTERPRISE_ADDRESS, "
            + "F_ENTERPRISE_PHONE, F_ENTERPRISE_FAX, F_POSTCODE, F_CHASSIS_ONE,"
            + "F_CHASSIS_TWO, F_CHASSIS_THREE, F_CHASSIS_FOUR, F_ENGINE_TYPE,"
            + "F_ENGINE_PRO, F_ENGINE_TRADEMARK, F_OUTPUT_VOLUME, F_POWER, F_REMARK) values (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,"
            + "?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)";

    /**
     * Crawls every batch linked from the list page and stores each parsed
     * record with {@link #saveByJdbc(AutobatchDirectory)}.
     *
     * <p>A failed insert is reported on stderr and the crawl continues with the
     * next record. Detail pages whose layout is not recognised are skipped.
     *
     * @param listurl URL of the batch list page
     * @throws InterruptedException if the thread is interrupted during one of the throttling sleeps
     * @throws IOException if a page cannot be fetched or the list page lacks the {@code noticeList} element
     */
    public static void getBatchFromUrl(String listurl) throws InterruptedException, IOException {
        List<String> urls = collectBatchUrls(listurl);
        for (int j = 0, k = urls.size(); j < k; j++) {
            String url = urls.get(j);
            printBatchProgress(url, j, k);
            for (Map<String, Object> item : getDetailsPageFromBatchItems(url)) {
                AutobatchDirectory record = fetchDetail(item);
                if (record == null) {
                    continue;
                }
                try {
                    saveByJdbc(record);
                } catch (SQLException e) {
                    // One bad row must not abort a crawl that takes hours.
                    e.printStackTrace();
                }
            }
        }
    }

    /**
     * Crawls every batch linked from the list page and stores each parsed
     * record through the given service.
     *
     * <p>Detail pages whose layout is not recognised are skipped.
     *
     * @param autobatchDirectoryService service whose {@code save} method receives every parsed record
     * @param listurl URL of the batch list page
     * @throws InterruptedException if the thread is interrupted during one of the throttling sleeps
     * @throws IOException if a page cannot be fetched or the list page lacks the {@code noticeList} element
     */
    public static void getBatchFromUrl(AutobatchDirectoryService autobatchDirectoryService, String listurl)
            throws InterruptedException, IOException {
        List<String> urls = collectBatchUrls(listurl);
        for (int j = 0, k = urls.size(); j < k; j++) {
            String url = urls.get(j);
            printBatchProgress(url, j, k);
            for (Map<String, Object> item : getDetailsPageFromBatchItems(url)) {
                AutobatchDirectory record = fetchDetail(item);
                if (record == null) {
                    continue;
                }
                autobatchDirectoryService.save(record);
            }
        }
    }

    /**
     * Collects the detail-page links of one batch across all of its pages.
     *
     * <p>The batch page is fetched once to read the pager, then every page
     * {@code url_1 .. url_N} is fetched with a two second pause before each
     * request.
     *
     * @param url URL of the batch (without page suffix)
     * @return one map per item holding {@code "href"} (absolute URL),
     *         {@code "href_text"} and, when the item has a "c" cell,
     *         {@code "type_text"}; items without a link are omitted
     * @throws InterruptedException if the thread is interrupted during one of the throttling sleeps
     * @throws IOException if a page cannot be fetched
     */
    public static List<Map<String, Object>> getDetailsPageFromBatchItems(String url)
            throws InterruptedException, IOException {
        Document doc = fetch(url);
        Thread.sleep(1000);

        int totalBatchPage = findMaxPage(doc);
        List<Map<String, Object>> list = new ArrayList<Map<String, Object>>();

        for (int i = 1; i <= totalBatchPage; i++) {
            System.out.println("分页数据获取进度:" + progress(i, totalBatchPage));
            Thread.sleep(2000); // throttle: wait two seconds before each page request
            doc = fetch(url + "_" + i);
            for (Element div : doc.getElementsByAttributeValue("class", "noticeLotItem")) {
                Map<String, Object> item = parseNoticeItem(div);
                if (item != null) {
                    list.add(item);
                }
            }
        }
        System.out.println("Total rows:" + list.size());
        return list;
    }

    /**
     * Fetches one detail page and parses the vehicle and manufacturer data.
     *
     * <p>Two table layouts are understood: 22 rows (no engine parameters) and
     * 23 rows (row 22 carries the engine parameters). In both, the last row is
     * the remark.
     *
     * @param url URL of the detail page
     * @return the parsed record, or {@code null} when the attribute table is
     *         missing or has any other number of rows
     * @throws IOException if the page cannot be fetched
     * @throws InterruptedException if the thread is interrupted during the throttling sleep
     */
    public static AutobatchDirectory getDetailOfAutoBatchInfo(String url) throws IOException, InterruptedException {
        Thread.sleep(2000);
        Document doc = fetch(url);

        Elements tables = doc.getElementsByAttributeValue("class", "noticeAttr mt5");
        if (tables.size() == 0) {
            return null;
        }
        Elements bodies = tables.get(0).getElementsByTag("tbody");
        if (bodies.size() == 0) {
            return null;
        }
        Elements tableElements = bodies.get(0).children();
        int rowCount = tableElements.size();

        boolean hasEngineRow;
        if (rowCount == 22) {
            hasEngineRow = false;
            System.out.println("......22行表示没有发动机参数.....解析中.......");
        } else if (rowCount == 23) {
            hasEngineRow = true;
            System.out.println("......23行表示有发动机参数.....解析中.......");
        } else {
            return null;
        }

        AutobatchDirectory autobatchDirectory = new AutobatchDirectory();
        // Row numbers are 1-based because setPropertyToObject switches on them.
        for (int i = 1; i <= rowCount; i++) {
            Element row = tableElements.get(i - 1);
            if (hasEngineRow && i == 22) {
                setEngineProperties(autobatchDirectory, row);
            } else if (i == rowCount) {
                String remark = row.children().get(1).text();
                autobatchDirectory.setfRemark(remark);
                System.out.println(remark);
            } else {
                setPropertyToObject(autobatchDirectory, i, row);
            }
        }
        return autobatchDirectory;
    }

    /**
     * Inserts one record into {@code autobatch_directory} via JDBC.
     *
     * <p>The engine columns and the remark are stored as empty strings when
     * unset; all other columns are passed through as they are. A {@code null}
     * record is reported on stderr and ignored.
     *
     * @param autobatchDirectory record to insert
     * @throws SQLException if no connection is available or the insert fails
     */
    public static void saveByJdbc(AutobatchDirectory autobatchDirectory) throws SQLException {
        if (autobatchDirectory == null) {
            System.err.println("saveByJdbc: null record, nothing to insert");
            return;
        }

        // Order must match the column list in INSERT_SQL.
        String[] values = {
            autobatchDirectory.getfAnnouType(),
            autobatchDirectory.getfAnnouBatch(),
            autobatchDirectory.getfVehicleBrand(),
            autobatchDirectory.getfVehicleType(),
            autobatchDirectory.getfMaxMass(),
            autobatchDirectory.getfTotalMass(),
            autobatchDirectory.getfWholeMass(),
            autobatchDirectory.getfFuelType(),
            autobatchDirectory.getfBlowoffStandard(),
            autobatchDirectory.getfAxleNumber(),
            autobatchDirectory.getfWheelbase(),
            autobatchDirectory.getfAxleWeight(),
            autobatchDirectory.getfSpringNumber(),
            autobatchDirectory.getfTyreNumber(),
            autobatchDirectory.getfTyreSize(),
            autobatchDirectory.getfDepartureAngle(),
            autobatchDirectory.getfFrearSuspension(),
            autobatchDirectory.getfFrontGauge(),
            autobatchDirectory.getfBackGauge(),
            autobatchDirectory.getfVinCode(),
            autobatchDirectory.getfVehicleLength(),
            autobatchDirectory.getfVehicleWidth(),
            autobatchDirectory.getfVehicleHeight(),
            autobatchDirectory.getfCargoLength(),
            autobatchDirectory.getfCargoWidth(),
            autobatchDirectory.getfCargoHeight(),
            autobatchDirectory.getfMaxSpeed(),
            autobatchDirectory.getfMaxPassenger(),
            autobatchDirectory.getfCabNumber(),
            autobatchDirectory.getfSteeringType(),
            autobatchDirectory.getfTotalMassTrailer(),
            autobatchDirectory.getfLoadMassFactor(),
            autobatchDirectory.getfMaxSemitrailer(),
            autobatchDirectory.getfEnterpriseName(),
            autobatchDirectory.getfEnterpriseAddress(),
            autobatchDirectory.getfEnterprisePhone(),
            autobatchDirectory.getfEnterpriseFax(),
            autobatchDirectory.getfPostcode(),
            autobatchDirectory.getfChassisOne(),
            autobatchDirectory.getfChassisTwo(),
            autobatchDirectory.getfChassisThree(),
            autobatchDirectory.getfChassisFour(),
            nullToEmpty(autobatchDirectory.getfEngineType()),
            nullToEmpty(autobatchDirectory.getfEnginePro()),
            nullToEmpty(autobatchDirectory.getfEngineTrademark()),
            nullToEmpty(autobatchDirectory.getfOutputVolume()),
            nullToEmpty(autobatchDirectory.getfPower()),
            nullToEmpty(autobatchDirectory.getfRemark())
        };

        // NOTE(review): the connection is not closed here, as before; whether
        // MycatJdbc hands out a shared or a fresh connection is not visible
        // from this file - confirm and close it here if it is per-call.
        java.sql.Connection conn = MycatJdbc.getConnection();
        if (conn == null) {
            throw new SQLException("MycatJdbc.getConnection() returned no connection");
        }

        java.sql.PreparedStatement pstm = null;
        try {
            System.out.println(INSERT_SQL);
            pstm = conn.prepareStatement(INSERT_SQL);
            for (int i = 0; i < values.length; i++) {
                pstm.setString(i + 1, values[i]);
            }
            pstm.executeUpdate();
        } finally {
            if (pstm != null) {
                try {
                    pstm.close();
                } catch (SQLException e) {
                    // A failed close must not mask the outcome of the insert.
                    e.printStackTrace();
                }
            }
        }
    }

    /** Fetches and parses a page with the crawler's user agent and timeout. */
    private static Document fetch(String url) throws IOException {
        return Jsoup.connect(url).userAgent("Mozilla/5.0").timeout(timeOut).get();
    }

    /** Returns {@code href} unchanged when it is absolute, otherwise prefixed with the site root. */
    private static String toAbsoluteUrl(String href) {
        if (href.startsWith("http://") || href.startsWith("https://")) {
            return href;
        }
        return web + href;
    }

    /** Completed fraction truncated to two decimals, e.g. 1 of 3 gives 0.33. */
    private static double progress(int done, int total) {
        return (done * 100 / total) / 100.0;
    }

    /** Maps {@code null} to the empty string. */
    private static String nullToEmpty(String value) {
        return value == null ? "" : value;
    }

    /**
     * Reads the batch list page and returns the absolute batch URLs in reverse
     * page order. Children of {@code noticeList} that carry no {@code href}
     * are ignored.
     */
    private static List<String> collectBatchUrls(String listurl) throws IOException {
        Document doc = fetch(listurl);
        Element batchElement = doc.getElementById("noticeList");
        if (batchElement == null) {
            throw new IOException("Element with id 'noticeList' not found on " + listurl);
        }

        List<String> urls = new ArrayList<String>();
        for (Element element : batchElement.children()) {
            String href = element.attr("href");
            if ("".equals(href)) {
                continue; // not a link; would otherwise become the bare site root
            }
            String batchUrl = toAbsoluteUrl(href);
            System.out.println(element.text() + "\t\t" + batchUrl);
            urls.add(batchUrl);
        }

        // Process the batches in the reverse of the order shown on the page.
        Collections.reverse(urls);
        System.out.println("总批次数:" + urls.size());
        return urls;
    }

    /**
     * Prints the progress line for one batch. The batch label is the URL
     * segment after the first underscore; the whole URL is used when there is
     * none.
     */
    private static void printBatchProgress(String url, int index, int total) {
        String[] parts = url.split("_");
        String label = parts.length > 1 ? parts[1] : url;
        System.out.println("第" + label + "批数据获取进度:" + progress(index, total));
    }

    /**
     * Loads the detail page referenced by a list item.
     *
     * @return the parsed record, or {@code null} (after a note on stderr) when
     *         the item has no link or the page layout is not recognised
     */
    private static AutobatchDirectory fetchDetail(Map<String, Object> item)
            throws IOException, InterruptedException {
        String detailUrl = (String) item.get("href");
        if (detailUrl == null) {
            System.err.println("Skipped list item without detail link: " + item);
            return null;
        }
        AutobatchDirectory record = getDetailOfAutoBatchInfo(detailUrl);
        if (record == null) {
            System.err.println("Skipped detail page with unrecognised layout: " + detailUrl);
        }
        return record;
    }

    /**
     * Returns the highest page number shown in the pager, or 1 when the page
     * has no pager. Labels that are not numbers are ignored.
     */
    private static int findMaxPage(Document doc) {
        int max = 1;
        for (Element pager : doc.getElementsByAttributeValue("class", "page")) {
            for (Element link : pager.getElementsByTag("a")) {
                String text = link.text();
                if (text.equals("") || text.equals("下一页") || text.equals("上一页")) {
                    continue; // previous/next links
                }
                try {
                    int index = Integer.parseInt(text);
                    if (index > max) {
                        max = index;
                    }
                } catch (NumberFormatException ignored) {
                    // Any other non-numeric pager label is not a page number.
                }
            }
        }
        return max;
    }

    /**
     * Extracts link and type text from one {@code noticeLotItem} element.
     *
     * @return the item map, or {@code null} when the element holds no usable link
     */
    private static Map<String, Object> parseNoticeItem(Element div) {
        Map<String, Object> item = null;
        for (Element child : div.children()) {
            String cssClass = child.attr("class");
            if (cssClass.equals("m")) {
                Elements links = child.getElementsByAttribute("href");
                if (links.size() == 0) {
                    continue;
                }
                Element anchor = links.get(0);
                String href = anchor.attr("href");
                if ("".equals(href)) {
                    continue;
                }
                item = new HashMap<String, Object>();
                item.put("href", toAbsoluteUrl(href));
                item.put("href_text", anchor.text());
            } else if (cssClass.equals("c") && item != null) {
                // The type text only counts once a link has been seen in this item.
                item.put("type_text", child.text());
            }
        }
        return item;
    }

    /**
     * Copies the engine parameters out of the nested table in the engine row.
     * The cells of the nested table's second row are read by position: model,
     * manufacturer, trademark, displacement, power.
     */
    private static void setEngineProperties(AutobatchDirectory autobatchDirectory, Element element) {
        Element valueTable = element.getElementsByTag("table").get(0);
        Elements cells = valueTable.getElementsByTag("tbody").get(0).children().get(1).children();
        int limit = Math.min(cells.size(), 5);
        for (int idx = 0; idx < limit; idx++) {
            String text = cells.get(idx).text();
            switch (idx) {
            case 0:
                // engine model
                autobatchDirectory.setfEngineType(text);
                break;
            case 1:
                // engine manufacturer
                autobatchDirectory.setfEnginePro(text);
                break;
            case 2:
                // engine trademark
                autobatchDirectory.setfEngineTrademark(text);
                break;
            case 3:
                // displacement
                autobatchDirectory.setfOutputVolume(text);
                break;
            default:
                // idx == 4: power
                autobatchDirectory.setfPower(text);
                break;
            }
        }
    }

    /**
     * Stores the two values of one regular table row on the record.
     *
     * <p>Each regular row holds two label/value pairs; the values sit in the
     * second and fourth cell. Which pair of properties is set depends on the
     * 1-based row number {@code i}; numbers outside 1..21 set nothing.
     */
    private static void setPropertyToObject(AutobatchDirectory autobatchDirectory, int i, Element element) {
        Elements children = element.children();
        String left = children.get(1).text();
        String right = children.get(3).text();
        System.out.println(left + " " + right);
        switch (i) {
        case 1:
            // announcement model / announcement batch
            autobatchDirectory.setfAnnouType(left);
            autobatchDirectory.setfAnnouBatch(right);
            break;
        case 2:
            // brand / vehicle type
            autobatchDirectory.setfVehicleBrand(left);
            autobatchDirectory.setfVehicleType(right);
            break;
        case 3:
            // rated mass (e.g. 32000,32700) / total mass
            autobatchDirectory.setfMaxMass(left);
            autobatchDirectory.setfTotalMass(right);
            break;
        case 4:
            // curb mass (e.g. 8000,7300) / fuel type
            autobatchDirectory.setfWholeMass(left);
            autobatchDirectory.setfFuelType(right);
            break;
        case 5:
            // emission standard / number of axles
            autobatchDirectory.setfBlowoffStandard(left);
            autobatchDirectory.setfAxleNumber(right);
            break;
        case 6:
            // wheelbase (e.g. 7250+1310+1310) / axle load
            autobatchDirectory.setfWheelbase(left);
            autobatchDirectory.setfAxleWeight(right);
            break;
        case 7:
            // leaf spring count (e.g. -/8/8/8) / number of tyres
            autobatchDirectory.setfSpringNumber(left);
            autobatchDirectory.setfTyreNumber(right);
            break;
        case 8:
            // tyre size (e.g. 11.00R20 12PR) / approach and departure angle
            autobatchDirectory.setfTyreSize(left);
            autobatchDirectory.setfDepartureAngle(right);
            break;
        case 9:
            // front/rear overhang (e.g. -/2080) / front track
            autobatchDirectory.setfFrearSuspension(left);
            autobatchDirectory.setfFrontGauge(right);
            break;
        case 10:
            // rear track (e.g. 1830/1830/1830) / identification code
            autobatchDirectory.setfBackGauge(left);
            autobatchDirectory.setfVinCode(right);
            break;
        case 11:
            // vehicle length / vehicle width
            autobatchDirectory.setfVehicleLength(left);
            autobatchDirectory.setfVehicleWidth(right);
            break;
        case 12:
            // vehicle height / cargo box length
            autobatchDirectory.setfVehicleHeight(left);
            autobatchDirectory.setfCargoLength(right);
            break;
        case 13:
            // cargo box width / cargo box height
            autobatchDirectory.setfCargoWidth(left);
            autobatchDirectory.setfCargoHeight(right);
            break;
        case 14:
            // top speed / rated passenger count
            autobatchDirectory.setfMaxSpeed(left);
            autobatchDirectory.setfMaxPassenger(right);
            break;
        case 15:
            // cab seating capacity / steering type
            autobatchDirectory.setfCabNumber(left);
            autobatchDirectory.setfSteeringType(right);
            break;
        case 16:
            // permitted total trailer mass / load mass utilisation factor
            autobatchDirectory.setfTotalMassTrailer(left);
            autobatchDirectory.setfLoadMassFactor(right);
            break;
        case 17:
            // maximum semitrailer saddle load / enterprise name
            autobatchDirectory.setfMaxSemitrailer(left);
            autobatchDirectory.setfEnterpriseName(right);
            break;
        case 18:
            // enterprise address / phone number
            autobatchDirectory.setfEnterpriseAddress(left);
            autobatchDirectory.setfEnterprisePhone(right);
            break;
        case 19:
            // fax number / postcode
            autobatchDirectory.setfEnterpriseFax(left);
            autobatchDirectory.setfPostcode(right);
            break;
        case 20:
            // chassis 1 / chassis 2
            autobatchDirectory.setfChassisOne(left);
            autobatchDirectory.setfChassisTwo(right);
            break;
        case 21:
            // chassis 3 / chassis 4
            autobatchDirectory.setfChassisThree(left);
            autobatchDirectory.setfChassisFour(right);
            break;
        default:
            break;
        }
    }
}

代码没什么难度,都是基本的 HTML 元素解析。

Java Jsoup Spider抓取数据入库的更多相关文章

  1. java抓取网页数据,登录之后抓取数据。

    最近做了一个从网络上抓取数据的一个小程序.主要关于信贷方面,收集的一些黑名单网站,从该网站上抓取到自己系统中. 也找了一些资料,觉得没有一个很好的,全面的例子.因此在这里做个笔记提醒自己. 首先需要一 ...

  2. Java模拟新浪微博登陆抓取数据

    前言:  兄弟们来了来了,最近有人在问如何模拟新浪微博登陆抓取数据,我听后默默地抽了一口老烟,暗暗的对自己说,老汉是时候该你出场了,所以今天有时间就整理整理,浅谈一二. 首先:  要想登陆新浪微博需要 ...

  3. 通过Java进行网页抓取并生成plist创建代码

    抓取网页的方法: 抓取网页可以通过正则表达式也可以通过Java. 通过firefox浏览器,安装Firebug来查看网页的源代码. 首先将要抓取的部分保存到本地,步骤如下: 1.在要抓取的位置右键,选 ...

  4. .net处理页面的抓取数据

    //要抓取数据的页面路径 string url = "http://www.scedu.net/banshi/used-car/lower-secondary-education/middl ...

  5. windows环境下nutch2.x 在eclipse中实现抓取数据存进mysql详细步骤

    nutch2.x 在eclipse中实现抓取数据存进mysql步骤 最近在研究nutch,花了几天时间,也遇到很多问题,最终结果还是成功了,在此记录,并给其他有兴趣的人提供参考,共同进步. 对nutc ...

  6. java做web抓取

    就像许多现代科技一样,从网站提取信息这一功能也有多个框架可以选择.最流行的有JSoup.HTMLUnit和Selenium WebDriver.我们这篇文章讨论JSoup.JSoup是个开源项目,提供 ...

  7. 分布式爬虫:使用Scrapy抓取数据

    分布式爬虫:使用Scrapy抓取数据 Scrapy是Python开发的一个快速,高层次的屏幕抓取和web抓取框架,用于抓取web站点并从页面中提取结构化的数据.Scrapy用途广泛,可以用于数据挖掘. ...

  8. C# 从需要登录的网站上抓取数据

    [转] C# 从需要登录的网站上抓取数据 背景:昨天一个学金融的同学让我帮她从一个网站上抓取数据,然后导出到excel,粗略看了下有1000+条记录,人工统计的话确实不可能.虽说不会,但作为一个学计算 ...

  9. nodejs--实现跨域抓取数据

    最近公司安排给我一个任务,抓取页面数据:http://survey.finance.sina.com.cn/static/20205/20131120.html?pid=20205&dpc=1 ...

随机推荐

  1. 关于可图化序列的一点结论 NEU 1429

    Graphic Sequence A graphic sequence is a sequence of numbers which can be the degree sequence of som ...

  2. 一个javascript继承和使用的例子

    继承可以帮助我们实现代码的重用,把对象的属性写入构造函数,对象的方法写入原型后,以下例子演示继承的使用: 示例的css和js在后 父实例,得到一个间隔1s的轮播: <!DOCTYPE html& ...

  3. python(5)-- 函数

    python 函数  定义:函数是组织好的,可重复使用的,用来实现单一,或相关联功能的代码段.  使用好处:函数能提高应用的模块性,和代码的重复利用率. 分类:(1)python 内建函数:pytho ...

  4. 通过Xode上传代码到GIthub---步骤

    ---恢复内容开始--- 一:打开终端,git命令进行全局配置 由于本人已经配置完成,so,直接查看配置信息 然后在本地创建一个文件夹, 然后在gitHub上创建一个代码库 在终端clone到本地创建 ...

  5. jQuery基础 浅析(含基本方法和选择器)

    1.jQuery与DOM互相转换 jQuery入库函数:$(document).ready(function(){}) $(function(){}) $(“#btn”):jQuery存储的是DOM对 ...

  6. Mysql 取整的方法

    .CEIL() 向上取整 SELECT CEIL(/); .FLOOR() 向下取整 SELECT FLOOR( .ROUND() 四舍五入 SELECT ROUND(

  7. 1180: [CROATIAN2009]OTOCI

    1180: [CROATIAN2009]OTOCI Time Limit: 50 Sec  Memory Limit: 162 MBSubmit: 1032  Solved: 638[Submit][ ...

  8. BZOJ 3384 上帝与集合的正确用法

    上帝与集合的正确用法 [问题描述] [输入格式] 第一行一个T,接下来T行,每行一个正整数p,代表你需要取模的值. [输出格式] T行,每行一个正整数,为答案对p取模后的值. [样例输入] 3236 ...

  9. C#可选参数与具名参数

    可选参数 static void test1() { func1("A"); func1(); Console.ReadKey(); } ) { Console.WriteLine ...

  10. Array拼接字符串

    原文发布时间为:2011-01-12 -- 来源于本人的百度文章 [由搬家工具导入] Array拼接字符串本来就是一种投机取巧的无聊玩意,来源是IE6对字符串的+实现错误一般情况下,如果是语义性的字符 ...