Java Jsoup Spider抓取数据入库
这里从车商网上抓取数据,请保持良好的职业道德,不要将数据用于商业用途。工信部官网有汽车方面的公告目录,那里有最全的 PDF 或 Word 数据;鉴于 Word 和 PDF 的解析繁琐且耗时,我暂时用这个网站的数据进行测试。
Spider主要代码:
package tk.mybatis.springboot.util; import java.io.IOException;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import com.mysql.jdbc.Connection;
import com.mysql.jdbc.PreparedStatement;
import tk.mybatis.springboot.model.AutobatchDirectory;
import tk.mybatis.springboot.service.AutobatchDirectoryService; public class AutoBatchSpider { // 原始来源http://www.cn357.com/notice_list/
// web: site root that is prepended to relative links found while crawling.
// timeOut: value handed to Jsoup's timeout(...) on every request (Jsoup documents this argument as milliseconds).
public static final String web = "http://www.cn357.com"; private static final int timeOut=30000; /**
* Crawls the automobile announcement batches listed on one page and stores every record found.
* <p>
* The batch links are read from the element with id "noticeList"; for each batch the paginated
* item lists are walked, every item's detail page is parsed, and the resulting record is handed
* to saveByJdbc.
*
* @param listurl URL of the page that lists all announcement batches
* @throws InterruptedException if the thread is interrupted during the pauses between requests
* @throws IOException if a page cannot be fetched
*
*/
public static void getBatchFromUrl(String listurl) throws InterruptedException, IOException {
    // Step 1: collect the absolute URL of every batch linked from the list page.
    Document doc = Jsoup.connect(listurl).userAgent("Mozilla/5.0").timeout(timeOut).get();
    Element batchElement = doc.getElementById("noticeList");
    if (batchElement == null) {
        // Fail with a descriptive checked error instead of a NullPointerException.
        throw new IOException("Element with id 'noticeList' not found at " + listurl);
    }
    List<String> urls = new ArrayList<String>();
    for (Element element : batchElement.children()) {
        String href = element.attr("href");
        if (href.isEmpty()) {
            // A child without a link is not a batch entry.
            continue;
        }
        // Relative links are resolved against the site root; absolute ones (http or https) are kept.
        boolean absolute = href.startsWith("http://") || href.startsWith("https://");
        String batchUrl = absolute ? href : web + href;
        System.out.println(element.text() + "\t\t" + batchUrl);
        urls.add(batchUrl);
    }
    // Process the batches in the reverse of their order on the page.
    Collections.reverse(urls);
    System.out.println("总批次数:" + urls.size());

    // Step 2: walk every batch, parse each detail page and store the record.
    for (int j = 0, k = urls.size(); j < k; j++) {
        String url = urls.get(j);
        // The batch number is the part after the first underscore; fall back to the URL if there is none.
        String[] parts = url.split("_");
        String batchNo = parts.length > 1 ? parts[1] : url;
        System.out.println("第" + batchNo + "批数据获取进度:" + (double) Math.round(j * 100 / k) / 100);
        List<Map<String, Object>> items = getDetailsPageFromBatchItems(url);
        for (Map<String, Object> item : items) {
            String detailUrl = (String) item.get("href");
            if (detailUrl == null) {
                // An entry without a link cannot be fetched.
                continue;
            }
            AutobatchDirectory autobatchDirectory = getDetailOfAutoBatchInfo(detailUrl);
            if (autobatchDirectory == null) {
                // The detail parser returns null for layouts it does not recognise; nothing to store.
                System.out.println("Skipping unparsable detail page: " + detailUrl);
                continue;
            }
            try {
                saveByJdbc(autobatchDirectory);
            } catch (SQLException e) {
                // One failed insert must not abort the whole crawl.
                e.printStackTrace();
            }
        }
    }
}

/**
 * Crawls the automobile announcement batches listed on one page and stores every record
 * through the supplied service.
 * <p>
 * Same crawl as the single-argument overload, except that each parsed record is passed to
 * {@code autobatchDirectoryService.save(...)} instead of being written with JDBC.
 *
 * @param autobatchDirectoryService service used to persist each parsed record
 * @param listurl URL of the page that lists all announcement batches
 * @throws InterruptedException if the thread is interrupted during the pauses between requests
 * @throws IOException if a page cannot be fetched
 */
public static void getBatchFromUrl(AutobatchDirectoryService autobatchDirectoryService,String listurl) throws InterruptedException, IOException {
    // Step 1: collect the absolute URL of every batch linked from the list page.
    Document doc = Jsoup.connect(listurl).userAgent("Mozilla/5.0").timeout(timeOut).get();
    Element batchElement = doc.getElementById("noticeList");
    if (batchElement == null) {
        // Fail with a descriptive checked error instead of a NullPointerException.
        throw new IOException("Element with id 'noticeList' not found at " + listurl);
    }
    List<String> urls = new ArrayList<String>();
    for (Element element : batchElement.children()) {
        String href = element.attr("href");
        if (href.isEmpty()) {
            // A child without a link is not a batch entry.
            continue;
        }
        // Relative links are resolved against the site root; absolute ones (http or https) are kept.
        boolean absolute = href.startsWith("http://") || href.startsWith("https://");
        String batchUrl = absolute ? href : web + href;
        System.out.println(element.text() + "\t\t" + batchUrl);
        urls.add(batchUrl);
    }
    // Process the batches in the reverse of their order on the page.
    Collections.reverse(urls);
    System.out.println("总批次数:" + urls.size());

    // Step 2: walk every batch, parse each detail page and hand the record to the service.
    for (int j = 0, k = urls.size(); j < k; j++) {
        String url = urls.get(j);
        // The batch number is the part after the first underscore; fall back to the URL if there is none.
        String[] parts = url.split("_");
        String batchNo = parts.length > 1 ? parts[1] : url;
        System.out.println("第" + batchNo + "批数据获取进度:" + (double) Math.round(j * 100 / k) / 100);
        List<Map<String, Object>> items = getDetailsPageFromBatchItems(url);
        for (Map<String, Object> item : items) {
            String detailUrl = (String) item.get("href");
            if (detailUrl == null) {
                // An entry without a link cannot be fetched.
                continue;
            }
            AutobatchDirectory autobatchDirectory = getDetailOfAutoBatchInfo(detailUrl);
            if (autobatchDirectory == null) {
                // The detail parser returns null for layouts it does not recognise; never pass null to the service.
                System.out.println("Skipping unparsable detail page: " + detailUrl);
                continue;
            }
            autobatchDirectoryService.save(autobatchDirectory);
        }
    }
}

/**
 * Collects the detail-page links of one batch across all of its result pages.
 * <p>
 * The batch page is fetched once to read the highest page number from the links inside the
 * elements whose class is "page"; then page {@code i} is requested at {@code url + "_" + i}
 * and every element with class "noticeLotItem" is turned into one map.
 *
 * @param url URL of the batch
 * @return one map per item, with keys "href" (absolute detail URL), "href_text" (link text)
 *         and, when the item has a cell of class "c", "type_text"
 * @throws InterruptedException if the thread is interrupted during the pauses between requests
 * @throws IOException if a page cannot be fetched
 */
public static List<Map<String, Object>> getDetailsPageFromBatchItems(String url)
        throws InterruptedException, IOException {
    Document doc = Jsoup.connect(url).userAgent("Mozilla/5.0").timeout(timeOut).get();
    Thread.sleep(1000);

    /************ Read the pagination: the highest numeric pager link is the page count ************/
    Elements pages = doc.getElementsByAttributeValue("class", "page");
    int max = 1;
    for (Element pager : pages) {
        for (Element link : pager.getElementsByTag("a")) {
            String text = link.text();
            if (text.equals("") || text.equals("下一页") || text.equals("上一页")) {
                continue;
            }
            try {
                int index = Integer.parseInt(text);
                if (index > max) {
                    max = index;
                }
            } catch (NumberFormatException ignored) {
                // Any other non-numeric pager label carries no page number; skip it
                // instead of aborting the crawl.
            }
        }
    }

    List<Map<String, Object>> list = new ArrayList<Map<String, Object>>();
    /************ Fetch every page and extract its items ************/
    for (int i = 1; i <= max; i++) {
        System.out.println("分页数据获取进度:" + (double) Math.round(i * 100 / max) / 100);
        Thread.sleep(2000);// wait 2 seconds before each request
        doc = Jsoup.connect(url + "_" + i).userAgent("Mozilla/5.0").timeout(timeOut).get();
        Elements divs = doc.getElementsByAttributeValue("class", "noticeLotItem");
        for (Element div : divs) {
            // One map per item, created only once a usable link is found, so that an item
            // without a link never contributes an empty entry.
            Map<String, Object> map = null;
            for (Element element : div.children()) {
                String claszType = element.attr("class");
                if (claszType.equals("m")) {
                    Element a = element.getElementsByAttribute("href").first();
                    if (a == null) {
                        continue;
                    }
                    String href = a.attr("href");
                    if ("".equals(href)) {
                        continue;
                    }
                    map = new HashMap<String, Object>();
                    // Relative links are resolved against the site root.
                    map.put("href", href.startsWith("http://") ? href : web + href);
                    map.put("href_text", a.text());
                } else if (claszType.equals("c") && map != null) {
                    // The type cell is only recorded once the item's link is known.
                    map.put("type_text", element.text());
                }
            }
            if (map != null) {
                list.add(map);
            }
        }
    }
    System.out.println("Total rows:" + list.size());
    return list;
}

/**
 * Fetches one detail page and parses its attribute table into a record.
 * <p>
 * The table is the first element whose class attribute is "noticeAttr mt5". A table of 22 rows
 * has no engine parameters; a table of 23 rows carries them in row 22. The last row holds the
 * remark.
 *
 * @param url URL of the detail page
 * @return the parsed record, or {@code null} when the table layout is not recognised
 * @throws IOException if the page cannot be fetched
 * @throws InterruptedException if the thread is interrupted during the pause before the request
 */
public static AutobatchDirectory getDetailOfAutoBatchInfo(String url) throws IOException, InterruptedException {
    Thread.sleep(2000);
    Document doc = Jsoup.connect(url).userAgent("Mozilla/5.0").timeout(timeOut).get();

    // A page without the attribute table is treated like any other unrecognised layout:
    // return null rather than throwing IndexOutOfBoundsException.
    Element table = doc.getElementsByAttributeValue("class", "noticeAttr mt5").first();
    if (table == null) {
        return null;
    }
    Element tbody = table.getElementsByTag("tbody").first();
    if (tbody == null) {
        return null;
    }
    Elements tableElements = tbody.children();
    int rowCount = tableElements.size();

    boolean hasEngineRow;
    if (rowCount == 22) {
        // 22 rows: no engine parameters
        hasEngineRow = false;
        System.out.println("......22行表示没有发动机参数.....解析中.......");
    } else if (rowCount == 23) {
        // 23 rows: engine parameters are present in row 22
        hasEngineRow = true;
        System.out.println("......23行表示有发动机参数.....解析中.......");
    } else {
        return null;
    }

    AutobatchDirectory autobatchDirectory = new AutobatchDirectory();
    for (int i = 1; i <= rowCount; i++) {
        Element element = tableElements.get(i - 1);
        if (i == rowCount) {
            // The last row holds the remark in its second cell.
            String values = element.children().get(1).text();
            autobatchDirectory.setfRemark(values);
            System.out.println(values);
        } else if (hasEngineRow && i == 22) {
            // The engine row embeds a table; the cells of its second row are read in order.
            Element valueTr = element.getElementsByTag("table").get(0);
            Elements engineCells = valueTr.getElementsByTag("tbody").get(0).children().get(1).children();
            for (int c = 0; c < engineCells.size() && c < 5; c++) {
                String cellText = engineCells.get(c).text();
                switch (c) {
                case 0:
                    // engine model
                    autobatchDirectory.setfEngineType(cellText);
                    break;
                case 1:
                    // engine manufacturer
                    autobatchDirectory.setfEnginePro(cellText);
                    break;
                case 2:
                    // engine trademark
                    autobatchDirectory.setfEngineTrademark(cellText);
                    break;
                case 3:
                    // displacement
                    autobatchDirectory.setfOutputVolume(cellText);
                    break;
                default:
                    // power
                    autobatchDirectory.setfPower(cellText);
                    break;
                }
            }
        } else {
            // Ordinary rows each carry two values.
            setPropertyToObject(autobatchDirectory, i, element);
        }
    }
    return autobatchDirectory;
}

/**
 * Copies the two values of one attribute-table row into the record.
 * <p>
 * The values are the texts of the row's children at index 1 and index 3; both are printed and
 * then assigned to the pair of fields selected by the row number.
 *
 * @param autobatchDirectory record being filled
 * @param i 1-based row number; rows 1 to 21 are mapped, any other number assigns nothing
 * @param element the table row
 */
private static void setPropertyToObject(AutobatchDirectory autobatchDirectory, int i, Element element) {
    Elements cells = element.children();
    // Each row carries two values: one in the cell at index 1, one in the cell at index 3.
    String left = cells.get(1).text();
    String right = cells.get(3).text();
    System.out.println(left + " " + right);

    switch (i) {
    case 1: // announcement model / announcement batch
        autobatchDirectory.setfAnnouType(left);
        autobatchDirectory.setfAnnouBatch(right);
        break;
    case 2: // brand / vehicle type
        autobatchDirectory.setfVehicleBrand(left);
        autobatchDirectory.setfVehicleType(right);
        break;
    case 3: // rated mass (e.g. 32000,32700) / total mass
        autobatchDirectory.setfMaxMass(left);
        autobatchDirectory.setfTotalMass(right);
        break;
    case 4: // curb mass (e.g. 8000,7300) / fuel type
        autobatchDirectory.setfWholeMass(left);
        autobatchDirectory.setfFuelType(right);
        break;
    case 5: // emission standard / number of axles
        autobatchDirectory.setfBlowoffStandard(left);
        autobatchDirectory.setfAxleNumber(right);
        break;
    case 6: // wheelbase (e.g. 7250+1310+1310) / axle load
        autobatchDirectory.setfWheelbase(left);
        autobatchDirectory.setfAxleWeight(right);
        break;
    case 7: // leaf-spring count (e.g. -/8/8/8) / tyre count
        autobatchDirectory.setfSpringNumber(left);
        autobatchDirectory.setfTyreNumber(right);
        break;
    case 8: // tyre size (e.g. 11.00R20 12PR) / approach and departure angle
        autobatchDirectory.setfTyreSize(left);
        autobatchDirectory.setfDepartureAngle(right);
        break;
    case 9: // front and rear overhang (e.g. -/2080) / front track
        autobatchDirectory.setfFrearSuspension(left);
        autobatchDirectory.setfFrontGauge(right);
        break;
    case 10: // rear track (e.g. 1830/1830/1830) / identification code
        autobatchDirectory.setfBackGauge(left);
        autobatchDirectory.setfVinCode(right);
        break;
    case 11: // vehicle length (e.g. 13000) / vehicle width
        autobatchDirectory.setfVehicleLength(left);
        autobatchDirectory.setfVehicleWidth(right);
        break;
    case 12: // vehicle height (e.g. 2970,3030,2760) / cargo box length
        autobatchDirectory.setfVehicleHeight(left);
        autobatchDirectory.setfCargoLength(right);
        break;
    case 13: // cargo box width (e.g. 2400,2450,2470) / cargo box height
        autobatchDirectory.setfCargoWidth(left);
        autobatchDirectory.setfCargoHeight(right);
        break;
    case 14: // maximum speed / rated passengers
        autobatchDirectory.setfMaxSpeed(left);
        autobatchDirectory.setfMaxPassenger(right);
        break;
    case 15: // permitted cab occupants / steering type
        autobatchDirectory.setfCabNumber(left);
        autobatchDirectory.setfSteeringType(right);
        break;
    case 16: // permitted total trailer mass / load-mass utilisation factor
        autobatchDirectory.setfTotalMassTrailer(left);
        autobatchDirectory.setfLoadMassFactor(right);
        break;
    case 17: // maximum semi-trailer saddle load (e.g. 16000,16150) / enterprise name
        autobatchDirectory.setfMaxSemitrailer(left);
        autobatchDirectory.setfEnterpriseName(right);
        break;
    case 18: // enterprise address / telephone number
        autobatchDirectory.setfEnterpriseAddress(left);
        autobatchDirectory.setfEnterprisePhone(right);
        break;
    case 19: // fax number / postal code
        autobatchDirectory.setfEnterpriseFax(left);
        autobatchDirectory.setfPostcode(right);
        break;
    case 20: // chassis 1 / chassis 2
        autobatchDirectory.setfChassisOne(left);
        autobatchDirectory.setfChassisTwo(right);
        break;
    case 21: // chassis 3 / chassis 4
        autobatchDirectory.setfChassisThree(left);
        autobatchDirectory.setfChassisFour(right);
        break;
    default:
        // Any other row number maps to no field.
        break;
    }
}

/**
 * Inserts one record into the autobatch_directory table with JDBC, using the connection
 * returned by MycatJdbc.getConnection().
 *
 * @param autobatchDirectory record to insert
 * @throws SQLException declared for database access errors
 */
public static void saveByJdbc(AutobatchDirectory autobatchDirectory) throws SQLException{
    if (autobatchDirectory == null) {
        // Nothing to insert; report it and carry on, so a missing record stays non-fatal.
        System.out.println("saveByJdbc: record is null, nothing stored");
        return;
    }
    String sql = "insert into autobatch_directory ("
            + "F_ANNOU_TYPE, F_ANNOU_BATCH, F_VEHICLE_BRAND, F_VEHICLE_TYPE,"
            + "F_MAX_MASS, F_TOTAL_MASS, F_WHOLE_MASS, F_FUEL_TYPE, "
            + "F_BLOWOFF_STANDARD, F_AXLE_NUMBER, F_WHEELBASE, F_AXLE_WEIGHT,"
            + "F_SPRING_NUMBER, F_TYRE_NUMBER,F_TYRE_SIZE, F_DEPARTURE_ANGLE, "
            + "F_FREAR_SUSPENSION, F_FRONT_GAUGE, F_BACK_GAUGE, F_VIN_CODE, "
            + "F_VEHICLE_LENGTH, F_VEHICLE_WIDTH, F_VEHICLE_HEIGHT,F_CARGO_LENGTH,"
            + "F_CARGO_WIDTH, F_CARGO_HEIGHT,F_MAX_SPEED, F_MAX_PASSENGER,"
            + "F_CAB_NUMBER, F_STEERING_TYPE, F_TOTAL_MASS_TRAILER,"
            + "F_LOAD_MASS_FACTOR, F_MAX_SEMITRAILER, F_ENTERPRISE_NAME, F_ENTERPRISE_ADDRESS, "
            + "F_ENTERPRISE_PHONE, F_ENTERPRISE_FAX, F_POSTCODE, F_CHASSIS_ONE,"
            + "F_CHASSIS_TWO, F_CHASSIS_THREE, F_CHASSIS_FOUR, F_ENGINE_TYPE,"
            + "F_ENGINE_PRO, F_ENGINE_TRADEMARK, F_OUTPUT_VOLUME, F_POWER, F_REMARK) values (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,"
            + "?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)";

    // Parameter values in exactly the order of the column list above.
    String[] values = {
        autobatchDirectory.getfAnnouType(), autobatchDirectory.getfAnnouBatch(),
        autobatchDirectory.getfVehicleBrand(), autobatchDirectory.getfVehicleType(),
        autobatchDirectory.getfMaxMass(), autobatchDirectory.getfTotalMass(),
        autobatchDirectory.getfWholeMass(), autobatchDirectory.getfFuelType(),
        autobatchDirectory.getfBlowoffStandard(), autobatchDirectory.getfAxleNumber(),
        autobatchDirectory.getfWheelbase(), autobatchDirectory.getfAxleWeight(),
        autobatchDirectory.getfSpringNumber(), autobatchDirectory.getfTyreNumber(),
        autobatchDirectory.getfTyreSize(), autobatchDirectory.getfDepartureAngle(),
        autobatchDirectory.getfFrearSuspension(), autobatchDirectory.getfFrontGauge(),
        autobatchDirectory.getfBackGauge(), autobatchDirectory.getfVinCode(),
        autobatchDirectory.getfVehicleLength(), autobatchDirectory.getfVehicleWidth(),
        autobatchDirectory.getfVehicleHeight(), autobatchDirectory.getfCargoLength(),
        autobatchDirectory.getfCargoWidth(), autobatchDirectory.getfCargoHeight(),
        autobatchDirectory.getfMaxSpeed(), autobatchDirectory.getfMaxPassenger(),
        autobatchDirectory.getfCabNumber(), autobatchDirectory.getfSteeringType(),
        autobatchDirectory.getfTotalMassTrailer(), autobatchDirectory.getfLoadMassFactor(),
        autobatchDirectory.getfMaxSemitrailer(), autobatchDirectory.getfEnterpriseName(),
        autobatchDirectory.getfEnterpriseAddress(), autobatchDirectory.getfEnterprisePhone(),
        autobatchDirectory.getfEnterpriseFax(), autobatchDirectory.getfPostcode(),
        autobatchDirectory.getfChassisOne(), autobatchDirectory.getfChassisTwo(),
        autobatchDirectory.getfChassisThree(), autobatchDirectory.getfChassisFour(),
        autobatchDirectory.getfEngineType(), autobatchDirectory.getfEnginePro(),
        autobatchDirectory.getfEngineTrademark(), autobatchDirectory.getfOutputVolume(),
        autobatchDirectory.getfPower(), autobatchDirectory.getfRemark()
    };
    // The engine fields and the remark (the last six values) are stored as "" when absent;
    // every other value is bound as-is, so a null there becomes SQL NULL.
    for (int i = values.length - 6; i < values.length; i++) {
        if (values[i] == null) {
            values[i] = "";
        }
    }

    // NOTE(review): the connection is not closed here, as before; presumably MycatJdbc owns
    // and reuses it. Confirm against MycatJdbc.
    Connection conn = MycatJdbc.getConnection();
    if (conn == null) {
        throw new SQLException("No database connection available");
    }
    // Use the standard JDBC interface; no driver-specific cast is needed.
    java.sql.PreparedStatement pstm = null;
    try {
        System.out.println(sql);
        pstm = conn.prepareStatement(sql);
        for (int i = 0; i < values.length; i++) {
            pstm.setString(i + 1, values[i]);
        }
        // A failure now propagates as the declared SQLException, so callers can react to it
        // instead of it being swallowed here.
        pstm.executeUpdate();
    } finally {
        if (pstm != null) {
            try {
                pstm.close();
            } catch (SQLException e) {
                // A failed close must not mask the outcome of the insert.
                e.printStackTrace();
            }
        }
    }
}
}
代码没什么难度,都是基本的元素解析。
Java Jsoup Spider抓取数据入库的更多相关文章
- java抓取网页数据,登录之后抓取数据。
最近做了一个从网络上抓取数据的一个小程序.主要关于信贷方面,收集的一些黑名单网站,从该网站上抓取到自己系统中. 也找了一些资料,觉得没有一个很好的,全面的例子.因此在这里做个笔记提醒自己. 首先需要一 ...
- Java模拟新浪微博登陆抓取数据
前言: 兄弟们来了来了,最近有人在问如何模拟新浪微博登陆抓取数据,我听后默默地抽了一口老烟,暗暗的对自己说,老汉是时候该你出场了,所以今天有时间就整理整理,浅谈一二. 首先: 要想登陆新浪微博需要 ...
- 通过Java进行网页抓取并生成plist创建代码
抓取网页的方法: 抓取网页可以通过正则表达式也可以通过Java. 通过firefox浏览器,安装Firebug来查看网页的源代码. 首先将要抓取的部分保存到本地,步骤如下: 1.在要抓取的位置右键,选 ...
- .net处理页面的抓取数据
//要抓取数据的页面路径 string url = "http://www.scedu.net/banshi/used-car/lower-secondary-education/middl ...
- windows环境下nutch2.x 在eclipse中实现抓取数据存进mysql详细步骤
nutch2.x 在eclipse中实现抓取数据存进mysql步骤 最近在研究nutch,花了几天时间,也遇到很多问题,最终结果还是成功了,在此记录,并给其他有兴趣的人提供参考,共同进步. 对nutc ...
- java做web抓取
就像许多现代科技一样,从网站提取信息这一功能也有多个框架可以选择.最流行的有JSoup.HTMLUnit和Selenium WebDriver.我们这篇文章讨论JSoup.JSoup是个开源项目,提供 ...
- 分布式爬虫:使用Scrapy抓取数据
分布式爬虫:使用Scrapy抓取数据 Scrapy是Python开发的一个快速,高层次的屏幕抓取和web抓取框架,用于抓取web站点并从页面中提取结构化的数据.Scrapy用途广泛,可以用于数据挖掘. ...
- C# 从需要登录的网站上抓取数据
[转] C# 从需要登录的网站上抓取数据 背景:昨天一个学金融的同学让我帮她从一个网站上抓取数据,然后导出到excel,粗略看了下有1000+条记录,人工统计的话确实不可能.虽说不会,但作为一个学计算 ...
- nodejs--实现跨域抓取数据
最近公司安排给我一个任务,抓取页面数据:http://survey.finance.sina.com.cn/static/20205/20131120.html?pid=20205&dpc=1 ...
随机推荐
- Windows地址空间
虚拟地址空间 当处理器读取或写入存储器位置时,它使用虚拟地址.作为读或写操作的一部分,处理器将虚拟地址转换为物理地址.通过虚拟地址访问内存具有以下优势: 程序可以使用连续范围的虚拟地址来访问在物理 ...
- Struts框架详解
1.Struts应用框架介绍 (1)框架 框架最简单的形式是指已开发过并已测试过的软件的程序块,这些程序块可以在多个软件开发工程中重用.框架提供了一个概括的体系结构模版,可以用这个模板来构建特定领域中 ...
- php处理ajax
首先安装wamp,若安装过mysql则终止进程防止冲突,可以访问localhost说明成功.在www目录下新建项目,使用localhost访问. php: <?php //3.获取ajax传过来 ...
- python(1)-- 变量类型
常规: Python有五个标准的数据类型: Numbers(数字):数字数据类型用于存储数值.他们是不可改变的数据类型,这意味着改变数字数据类型会分配一个新的对象.String(字符串):由数字.字母 ...
- [CODEVS1051]接龙游戏
题目描述 给出了N个单词,已经按长度排好了序.如果某单词i是某单词j的前缀,i->j算一次接龙(两个相同的单词不能算接龙). 你的任务是:对于输入的单词,找出最长的龙. 输入描述 Input D ...
- 调用Outlook发送邮件
#region 查找与指定文件关联在一起的程序的文件名 /// <summary> /// 查找与指定文件关联在一起的程序的文件名 /// </summary> /// < ...
- [翻译]现代Linux系统上的栈溢出攻击【转】
转自:http://www.codeweblog.com/%E7%BF%BB%E8%AF%91-%E7%8E%B0%E4%BB%A3linux%E7%B3%BB%E7%BB%9F%E4%B8%8A%E ...
- 無法使用 system/bin/r 讀取 pmic pm8937 hardware regitster 的原因
Platform Qualcomm MSM8917 + PM8937 + PMI8940 起因 同事問我 PM8937 的 VREG_L17 如何設定成 3.3V, 從 PM8937 hardware ...
- final、finalize()、finally、static
一.final final的三种情况: 1.变量 1)对于基本类型,final使数值恒定不变:而对于对象引用,final使引用恒定不变,即一旦引用被初始化指向一个对象,就无法再把它改为指向另一个对象, ...
- layui 自定义表单验证 以及提交表单
订购数量</span> <span style="color: red">*</span>: <input type="text ...