基于【 springBoot+jsoup】一 || 爬取全国行政区划数据
一、代码演示
如果爬取中途中断,可以筛选过滤掉已拉取的省份数据后再继续爬取。
/**
* TODO
*
* @author kevin
* @createTime 2019-11-18 19:37
*/
@RestController
public class CityController { @Autowired
private ProvinceService provinceService;
@Autowired
private HttpUtil httpUtil;
private String yearHref = "";
private int index; // {"provincetr", "citytr", "countytr", "towntr", "villagetr"};
@GetMapping("/start")
public ResultTemplate<String> spider() throws Exception {
String url = "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/";
String charset = "gb2312";
Document rootDoc = httpUtil.get(url, charset); if (rootDoc == null) {
return of("fail");
}
Element firstElement = rootDoc.getElementsByClass("center_list_contlist").get(0);
// http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2018/index.html
yearHref = firstElement.select("a").get(0).attr("href"); // 最近一个年份的省份链接
Document doc = httpUtil.get(yearHref, charset);
// 遍历所有的省
Elements provinceElements = doc.getElementsByClass("provincetr");
for (Element element : provinceElements) {
Elements aEles = element.select("a");
for (Element aEle : aEles) {
String name = aEle.text();
// 11.html
String provincesHref = aEle.attr("href");
String code = provincesHref.substring(0, provincesHref.indexOf("."));
index = yearHref.lastIndexOf("/") + 1;
// http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2018/11.html
provincesHref = yearHref.substring(0, index) + provincesHref;
DicProvince province = new DicProvince()
.setProvinceName(name)
.setProvinceCode(code)
.setCountryId(1196612453660643329L)
.setCreateDate(LocalDateTime.now())
.setCreateUserid(1L)
.setCreateUsername("admin");
if ("北京市".equals(name) || "天津市".equals(name) || "河北省".equals(name)) {
System.out.println("未执行市:" + name);
} else {
System.out.println("开始时间:" + LocalDateTime.now());
System.out.println("省名称:" + name);
Long id = provinceService.insertProvince(province);
getCites(provincesHref, charset, id);
}
}
}
return of("spider crawl end.");
} private void getCites(String url, String charset, Long provinceId) throws Exception {
Document rootDoc = null;
int i = 0;
while (rootDoc == null) {
try {
i++;
if (i >= 3) {
System.out.println("循环次数:" + i);
}
rootDoc = httpUtil.get(url, charset);
} catch (Exception e) {
rootDoc = null;
System.out.println("请求网页链接报错");
}
}
i = 0;
if (rootDoc != null) {
Elements cityElements = rootDoc.getElementsByClass("citytr");
for (Element cityElement : cityElements) {
Element aEle = cityElement.select("a").get(1); // 第二个是市的名字
String name = aEle.text();
// 11/1101.html
String cityHref = aEle.attr("href");
int start = cityHref.lastIndexOf("/") + 1;
String code = cityHref.substring(start, cityHref.indexOf("."));
cityHref = yearHref.substring(0, index) + cityHref;
DicCity city = new DicCity()
.setCityName(name)
.setCityCode(code)
.setProvinceId(provinceId)
.setCreateDate(LocalDateTime.now())
.setCreateUserid(1L)
.setCreateUsername("admin");
Long id = provinceService.insertCity(city);
//Long id=1L; getDistrict(cityHref, charset, id);
}
}
} // 区县
private void getDistrict(String url, String charset, Long idDis) throws Exception {
Document rootDoc = null;
int i = 0;
while (rootDoc == null) {
try {
i++;
if (i >= 3) {
System.out.println("循环次数:" + i);
}
rootDoc = httpUtil.get(url, charset);
} catch (Exception e) {
rootDoc = null;
System.out.println("请求网页链接报错");
}
}
i = 0;
if (rootDoc != null) {
Elements cityElements = rootDoc.getElementsByClass("countytr");
for (Element cityElement : cityElements) {
try {
Element aEle = cityElement.select("a").get(1);
String name = aEle.text();
String cityHref = aEle.attr("href");
int start = cityHref.lastIndexOf("/") + 1;
String code = cityHref.substring(start, cityHref.indexOf(".")); int index = url.lastIndexOf("/") + 1;
cityHref = url.substring(0, index) + cityHref; DicDistrict district = new DicDistrict().setDistrictName(name).setDistrictCode(code).setCityId(idDis)
.setCreateDate(LocalDateTime.now())
.setCreateUserid(1L)
.setCreateUsername("admin");
Long id = provinceService.insertDistrict(district);
//Long id=1L;
getStreet(cityHref, charset, id);
} catch (Exception e) {
System.out.println("市辖区");
Element aEle = cityElement.select("td").get(0);
String code = aEle.text(); Element aEle2 = cityElement.select("td").get(1);
String name = aEle2.text(); DicDistrict district = new DicDistrict().setDistrictName(name).setDistrictCode(code).setCityId(idDis);
Long id = provinceService.insertDistrict(district);
System.out.println("执行完毕"); } }
}
} // 街道
private void getStreet(String url, String charset, Long idStr) throws Exception {
Document rootDoc = null;
int i = 0;
while (rootDoc == null) {
try {
i++;
if (i >= 3) {
System.out.println("循环次数:" + i);
}
rootDoc = httpUtil.get(url, charset);
} catch (Exception e) {
rootDoc = null;
System.out.println("请求网页链接报错");
}
}
i = 0;
if (rootDoc != null) {
Elements cityElements = rootDoc.getElementsByClass("towntr");
for (Element cityElement : cityElements) {
Element aEle = cityElement.select("a").get(1); // 第二个是市的名字
String name = aEle.text();
String cityHref = aEle.attr("href");
int start = cityHref.lastIndexOf("/") + 1;
String code = cityHref.substring(start, cityHref.indexOf("."));
int index = url.lastIndexOf("/") + 1;
cityHref = url.substring(0, index) + cityHref;
DicStreet street = new DicStreet()
.setStreetName(name)
.setStreetCode(code)
.setDistrictId(idStr)
.setCreateDate(LocalDateTime.now())
.setCreateUserid(1L)
.setCreateUsername("admin");
Long id = provinceService.insertStreet(street);
//Long id=1L;
getCommunity(cityHref, charset, id);
}
}
} // 社区
private void getCommunity(String url, String charset, Long idPro) throws Exception {
Document rootDoc = null;
int i = 0;
while (rootDoc == null) {
try {
i++;
if (i >= 3) {
System.out.println("循环次数:" + i);
}
rootDoc = httpUtil.get(url, charset);
} catch (Exception e) {
rootDoc = null;
System.out.println("请求网页链接报错");
}
}
i = 0;
if (rootDoc != null) {
Elements cityElements = rootDoc.getElementsByClass("villagetr");
for (Element cityElement : cityElements) {
Element aEle = cityElement.select("td").get(0);
String code = aEle.text(); Element aEle2 = cityElement.select("td").get(1);
String cl_code = aEle2.text(); Element aEle3 = cityElement.select("td").get(2);
String name = aEle3.text(); DicCommunity community = new DicCommunity()
.setCommunityName(name)
.setCommunityCode(code)
.setClassificationCode(cl_code)
.setStreetId(idPro)
.setCreateDate(LocalDateTime.now())
.setCreateUserid(1L)
.setCreateUsername("admin");
Long id = provinceService.insertCommunity(community);
}
}
} }
二、HttpUtil工具类
/**
 * Small HTTP helper that downloads a page with {@link HttpURLConnection} and
 * parses it into a jsoup {@link Document}.
 *
 * @author kevin
 * @createTime 2019-11-20 9:17
 */
@Component
public class HttpUtil {

    /** Browser-like user agent sent with every request. */
    private static final String USER_AGENT =
            "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36";

    /** Connect and read timeout, in milliseconds. */
    private static final int TIMEOUT_MILLIS = 80000;

    /**
     * Performs a GET request and parses the response body as HTML.
     *
     * @param url     absolute URL to download; also used as the document's base URI
     * @param charset charset name used to decode the response body
     * @return the parsed document, or {@code null} when reading or parsing the
     *         response fails (the failure is logged to standard output)
     * @throws IOException if the URL is malformed or the connection cannot be set up
     */
    public Document get(String url, String charset) throws IOException {
        HttpURLConnection connection = (HttpURLConnection) new URL(url).openConnection();
        try {
            connection.setRequestMethod("GET");
            // Caching is enabled by default; the crawler always wants a fresh copy.
            connection.setUseCaches(false);
            connection.addRequestProperty("Connection", "close");
            connection.addRequestProperty("user-agent", USER_AGENT);
            connection.setConnectTimeout(TIMEOUT_MILLIS);
            connection.setReadTimeout(TIMEOUT_MILLIS);

            // The stream is closed even when parsing fails, so no socket is leaked.
            try (java.io.InputStream body = connection.getInputStream()) {
                return Jsoup.parse(body, charset, url);
            } catch (Exception e) {
                // Callers treat null as "retry later"; keep that contract but log the cause.
                System.out.println("parse error: " + url + " (" + e + ")");
                return null;
            }
        } finally {
            connection.disconnect();
        }
    }
}
三、service部分,根据需要自行定义数据库表
/**
 * {@link ProvinceService} implementation that persists each administrative level
 * through its mapper and returns the id read back from the inserted entity.
 *
 * <p>An insert is retried a bounded number of times. When it still has not
 * reported exactly one affected row, an {@link IllegalStateException} is thrown
 * instead of retrying forever.
 *
 * @author kevin
 * @createTime 2019-11-18 20:41
 */
@Service
public class ProvinceServiceImpl implements ProvinceService {

    /** Maximum number of attempts for a single insert. */
    private static final int MAX_INSERT_ATTEMPTS = 3;

    @Autowired
    private ProvinceMapper provinceMapper;
    @Autowired
    private CityMapper cityMapper;
    @Autowired
    private DistrictMapper districtMapper;
    @Autowired
    private StreetMapper streetMapper;
    @Autowired
    private CommunityMapper communityMapper;

    /** Inserts a province and returns the id held by the entity afterwards. */
    @Override
    public Long insertProvince(DicProvince dicProvince) {
        insertWithRetry(() -> provinceMapper.insert(dicProvince), "插入省数据失败");
        return dicProvince.getProvinceId();
    }

    /** Inserts a city and returns the id held by the entity afterwards. */
    @Override
    public Long insertCity(DicCity dicCity) {
        insertWithRetry(() -> cityMapper.insert(dicCity), "插入市数据失败");
        return dicCity.getCityId();
    }

    /** Inserts a district and returns the id held by the entity afterwards. */
    @Override
    public Long insertDistrict(DicDistrict dicDistrict) {
        insertWithRetry(() -> districtMapper.insert(dicDistrict), "插入区县数据失败");
        return dicDistrict.getDistrictId();
    }

    /** Inserts a street and returns the id held by the entity afterwards. */
    @Override
    public Long insertStreet(DicStreet dicStreet) {
        insertWithRetry(() -> streetMapper.insert(dicStreet), "插入街道数据失败");
        return dicStreet.getStreetId();
    }

    /** Inserts a community and returns the id held by the entity afterwards. */
    @Override
    public Long insertCommunity(DicCommunity dicCommunity) {
        insertWithRetry(() -> communityMapper.insert(dicCommunity), "插入社区数据失败");
        return dicCommunity.getCommunityId();
    }

    /** One mapper insert call; returns the number of affected rows. */
    @FunctionalInterface
    private interface InsertCall {
        int run() throws Exception;
    }

    /**
     * Runs an insert until it reports exactly one affected row, up to
     * {@link #MAX_INSERT_ATTEMPTS} times.
     *
     * @param insert         the mapper call to execute
     * @param failureMessage message printed on each failed attempt and used in the
     *                       final exception
     * @throws IllegalStateException when no attempt succeeded; the last caught
     *                               exception, if any, is attached as the cause
     */
    private void insertWithRetry(InsertCall insert, String failureMessage) {
        Exception lastFailure = null;
        for (int attempt = 1; attempt <= MAX_INSERT_ATTEMPTS; attempt++) {
            try {
                if (insert.run() == 1) {
                    return;
                }
            } catch (Exception e) {
                lastFailure = e;
                System.out.println(failureMessage);
                e.printStackTrace();
            }
        }
        throw new IllegalStateException(
                failureMessage + " (attempts: " + MAX_INSERT_ATTEMPTS + ")", lastFailure);
    }
}
基于【 springBoot+jsoup】一 || 爬取全国行政区划数据的更多相关文章
- Java使用Jsoup之爬取博客数据应用实例
导入Maven依赖 <!-- https://mvnrepository.com/artifact/org.jsoup/jsoup --> <dependency> <g ...
- python爬虫学习之爬取全国各省市县级城市邮政编码
实例需求:运用python语言在http://www.ip138.com/post/网站爬取全国各个省市县级城市的邮政编码,并且保存在excel文件中 实例环境:python3.7 requests库 ...
- Python爬取招聘网站数据,给学习、求职一点参考
1.项目背景 随着科技的飞速发展,数据呈现爆发式的增长,任何人都摆脱不了与数据打交道,社会对于“数据”方面的人才需求也在不断增大.因此了解当下企业究竟需要招聘什么样的人才?需要什么样的技能?不管是对于 ...
- Java实现爬取京东手机数据
Java实现爬取京东手机数据 最近看了某马的Java爬虫视频,看完后自己上手操作了下,基本达到了爬数据的要求,HTML页面源码也刚好复习了下,之前发布两篇关于简单爬虫的文章,也刚好用得上.项目没什么太 ...
- Java爬取同花顺股票数据(附源码)
最近有小伙伴问我能不能抓取同花顺的数据,最近股票行情还不错,想把数据抓下来自己分析分析.我大A股,大家都知道的,一个概念火了,相应的股票就都大涨. 如果能及时获取股票涨跌信息,那就能在刚开始火起来的时 ...
- Scrapy 通过登录的方式爬取豆瓣影评数据
Scrapy 通过登录的方式爬取豆瓣影评数据 爬虫 Scrapy 豆瓣 Fly 由于需要爬取影评数据在来做分析,就选择了豆瓣影评来抓取数据,工具使用的是Scrapy工具来实现.scrapy工具使用起来 ...
- selenium跳过webdriver检测并爬取天猫商品数据
目录 简介 编写思路 使用教程 演示图片 源代码 @(文章目录) 简介 现在爬取淘宝,天猫商品数据都是需要首先进行登录的.上一节我们已经完成了模拟登录淘宝的步骤,所以在此不详细讲如何模拟登录淘宝.把关 ...
- 【scrapy_redis】调试后爬取了部分数据,然后重新调试时,直接被去重机制过滤掉无法重头开始爬取
这2天遇到一个问题,之前调试的时候爬取了一些数据,结果第二天重新调试的时候发现爬虫很快结束,而且还没有报错.后来从日志里看到这个: no more duplicates will be shown ( ...
- node 爬虫 --- 将爬取到的数据,保存到 mysql 数据库中
步骤一:安装必要模块 (1)cheerio模块 ,一个类似jQuery的选择器模块,分析HTML利器. (2)request模块,让http请求变的更加简单 (3)mysql模块,node连接mysq ...
随机推荐
- net.ipv4.tcp_fin_timeout的错误理解
按照文档的说法,貌似长久以来我对于tcp_fin_timeout的理解都是错误的 先备份在这里,再验证 提高Linux应对短连接的负载能力 在存在大量短连接的情况下,Linux的TCP栈一般都 ...
- python-learning-第二季-数据处理numpy
https://www.bjsxt.com/down/8468.html numpy-科学计算基础库 例子: import numpy as np #创建数组 a = np.arange() prin ...
- DECODE函数和CASE WHEN 比较
http://blog.csdn.net/zhangbingtao2011/article/details/51384393 一,DECODE函数 其基本语法为: DECODE(value, if1, ...
- 123457123456#0#----com.tym.ErTongFanPai20--前拼后广--儿童FanPai_tym
com.tym.ErTongFanPai20--前拼后广--儿童FanPai_tym
- HR数据分析常用的50个公式
HR数据分析常用的50个公式 HR经常需要和数据打交道,如入职率.离职率.加班费计算等.虽然日常工作中,单个数据的计算并不麻烦,但几十上百个累计在一起,确实很容易混淆,甚至计算失误.今天小编急HR之所 ...
- Win10下载安装PostgreSQL 11.1
下载地址:https://get.enterprisedb.com/postgresql/postgresql-11.1-1-windows-x64.exe Installation Director ...
- 【转】百万年薪挖了p8,难道是水货?
大厦新搬进来一家创业公司,老板红光满面地提着果篮上楼拜访,说是刚拿到了投资人的钱,正准备扩充团队大干一场.那个时候的他踌躇满志,顾盼生辉.当时我想,能在这个大环境下拿到投资的公司,做的产品应该是有前景 ...
- 【WAP触屏】YouKu视频弹窗播放组件
(function(window){ /* youku api : http://open.youku.com/tools 调用方法 : LM_youkuPop.open('XODI5Mzk3MDAw ...
- 【GStreamer开发】GStreamer基础教程04——时间管理
目标 本教程主要讲述一些和时间相关的内容.主要包括: 1. 如何问pipeline查询到流的总时间和当前播放的时间 2. 如何在流内部实现跳转功能 介绍 GstQuery是向一个element或者pa ...
- sudo权限配置
首先要禁止root的用户登录ssh 在ssh配置文件里面把root用户no掉,一般公司不允许用第三方软件直接root登陆. 一.linux给用户添加sudo权限: 有时候,linux下面运行sudo ...