一、代码演示

如果爬取中途中断,可在代码中筛选过滤掉已拉取的省份数据,然后重新执行

/**
* TODO
*
* @author kevin
* @createTime 2019-11-18 19:37
*/
@RestController
public class CityController { @Autowired
private ProvinceService provinceService;
@Autowired
private HttpUtil httpUtil;
private String yearHref = "";
private int index; // {"provincetr", "citytr", "countytr", "towntr", "villagetr"};
@GetMapping("/start")
public ResultTemplate<String> spider() throws Exception {
String url = "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/";
String charset = "gb2312";
Document rootDoc = httpUtil.get(url, charset); if (rootDoc == null) {
return of("fail");
}
Element firstElement = rootDoc.getElementsByClass("center_list_contlist").get(0);
// http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2018/index.html
yearHref = firstElement.select("a").get(0).attr("href"); // 最近一个年份的省份链接
Document doc = httpUtil.get(yearHref, charset);
// 遍历所有的省
Elements provinceElements = doc.getElementsByClass("provincetr");
for (Element element : provinceElements) {
Elements aEles = element.select("a");
for (Element aEle : aEles) {
String name = aEle.text();
// 11.html
String provincesHref = aEle.attr("href");
String code = provincesHref.substring(0, provincesHref.indexOf("."));
index = yearHref.lastIndexOf("/") + 1;
// http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2018/11.html
provincesHref = yearHref.substring(0, index) + provincesHref;
DicProvince province = new DicProvince()
.setProvinceName(name)
.setProvinceCode(code)
.setCountryId(1196612453660643329L)
.setCreateDate(LocalDateTime.now())
.setCreateUserid(1L)
.setCreateUsername("admin");
if ("北京市".equals(name) || "天津市".equals(name) || "河北省".equals(name)) {
System.out.println("未执行市:" + name);
} else {
System.out.println("开始时间:" + LocalDateTime.now());
System.out.println("省名称:" + name);
Long id = provinceService.insertProvince(province);
getCites(provincesHref, charset, id);
}
}
}
return of("spider crawl end.");
} private void getCites(String url, String charset, Long provinceId) throws Exception {
Document rootDoc = null;
int i = 0;
while (rootDoc == null) {
try {
i++;
if (i >= 3) {
System.out.println("循环次数:" + i);
}
rootDoc = httpUtil.get(url, charset);
} catch (Exception e) {
rootDoc = null;
System.out.println("请求网页链接报错");
}
}
i = 0;
if (rootDoc != null) {
Elements cityElements = rootDoc.getElementsByClass("citytr");
for (Element cityElement : cityElements) {
Element aEle = cityElement.select("a").get(1); // 第二个是市的名字
String name = aEle.text();
// 11/1101.html
String cityHref = aEle.attr("href");
int start = cityHref.lastIndexOf("/") + 1;
String code = cityHref.substring(start, cityHref.indexOf("."));
cityHref = yearHref.substring(0, index) + cityHref;
DicCity city = new DicCity()
.setCityName(name)
.setCityCode(code)
.setProvinceId(provinceId)
.setCreateDate(LocalDateTime.now())
.setCreateUserid(1L)
.setCreateUsername("admin");
Long id = provinceService.insertCity(city);
//Long id=1L; getDistrict(cityHref, charset, id);
}
}
} // 区县
private void getDistrict(String url, String charset, Long idDis) throws Exception {
Document rootDoc = null;
int i = 0;
while (rootDoc == null) {
try {
i++;
if (i >= 3) {
System.out.println("循环次数:" + i);
}
rootDoc = httpUtil.get(url, charset);
} catch (Exception e) {
rootDoc = null;
System.out.println("请求网页链接报错");
}
}
i = 0;
if (rootDoc != null) {
Elements cityElements = rootDoc.getElementsByClass("countytr");
for (Element cityElement : cityElements) {
try {
Element aEle = cityElement.select("a").get(1);
String name = aEle.text();
String cityHref = aEle.attr("href");
int start = cityHref.lastIndexOf("/") + 1;
String code = cityHref.substring(start, cityHref.indexOf(".")); int index = url.lastIndexOf("/") + 1;
cityHref = url.substring(0, index) + cityHref; DicDistrict district = new DicDistrict().setDistrictName(name).setDistrictCode(code).setCityId(idDis)
.setCreateDate(LocalDateTime.now())
.setCreateUserid(1L)
.setCreateUsername("admin");
Long id = provinceService.insertDistrict(district);
//Long id=1L;
getStreet(cityHref, charset, id);
} catch (Exception e) {
System.out.println("市辖区");
Element aEle = cityElement.select("td").get(0);
String code = aEle.text(); Element aEle2 = cityElement.select("td").get(1);
String name = aEle2.text(); DicDistrict district = new DicDistrict().setDistrictName(name).setDistrictCode(code).setCityId(idDis);
Long id = provinceService.insertDistrict(district);
System.out.println("执行完毕"); } }
}
} // 街道
private void getStreet(String url, String charset, Long idStr) throws Exception {
Document rootDoc = null;
int i = 0;
while (rootDoc == null) {
try {
i++;
if (i >= 3) {
System.out.println("循环次数:" + i);
}
rootDoc = httpUtil.get(url, charset);
} catch (Exception e) {
rootDoc = null;
System.out.println("请求网页链接报错");
}
}
i = 0;
if (rootDoc != null) {
Elements cityElements = rootDoc.getElementsByClass("towntr");
for (Element cityElement : cityElements) {
Element aEle = cityElement.select("a").get(1); // 第二个是市的名字
String name = aEle.text();
String cityHref = aEle.attr("href");
int start = cityHref.lastIndexOf("/") + 1;
String code = cityHref.substring(start, cityHref.indexOf("."));
int index = url.lastIndexOf("/") + 1;
cityHref = url.substring(0, index) + cityHref;
DicStreet street = new DicStreet()
.setStreetName(name)
.setStreetCode(code)
.setDistrictId(idStr)
.setCreateDate(LocalDateTime.now())
.setCreateUserid(1L)
.setCreateUsername("admin");
Long id = provinceService.insertStreet(street);
//Long id=1L;
getCommunity(cityHref, charset, id);
}
}
} // 社区
private void getCommunity(String url, String charset, Long idPro) throws Exception {
Document rootDoc = null;
int i = 0;
while (rootDoc == null) {
try {
i++;
if (i >= 3) {
System.out.println("循环次数:" + i);
}
rootDoc = httpUtil.get(url, charset);
} catch (Exception e) {
rootDoc = null;
System.out.println("请求网页链接报错");
}
}
i = 0;
if (rootDoc != null) {
Elements cityElements = rootDoc.getElementsByClass("villagetr");
for (Element cityElement : cityElements) {
Element aEle = cityElement.select("td").get(0);
String code = aEle.text(); Element aEle2 = cityElement.select("td").get(1);
String cl_code = aEle2.text(); Element aEle3 = cityElement.select("td").get(2);
String name = aEle3.text(); DicCommunity community = new DicCommunity()
.setCommunityName(name)
.setCommunityCode(code)
.setClassificationCode(cl_code)
.setStreetId(idPro)
.setCreateDate(LocalDateTime.now())
.setCreateUserid(1L)
.setCreateUsername("admin");
Long id = provinceService.insertCommunity(community);
}
}
} }

二、HttpUtil工具类

/**
 * Minimal HTTP helper that downloads a page and parses it into a jsoup {@link Document}.
 *
 * @author kevin
 * @createTime 2019-11-20 9:17
 */
@Component
public class HttpUtil {

    private static final String USER_AGENT =
            "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36";

    /** Connect and read timeout, in milliseconds. */
    private static final int TIMEOUT_MILLIS = 80000;

    /**
     * Issues a GET request and parses the response body.
     *
     * @param url     absolute URL to download; also used as the document's base URI
     * @param charset charset name used to decode the response body
     * @return the parsed document, or {@code null} when the request or parsing fails
     * @throws IOException if {@code url} is malformed or the connection cannot be opened
     */
    public Document get(String url, String charset) throws IOException {
        HttpURLConnection connection = (HttpURLConnection) new URL(url).openConnection();
        connection.setRequestMethod("GET");
        // Bypass caches so every call hits the server.
        connection.setUseCaches(false);
        connection.addRequestProperty("Connection", "close");
        connection.addRequestProperty("user-agent", USER_AGENT);
        connection.setConnectTimeout(TIMEOUT_MILLIS);
        connection.setReadTimeout(TIMEOUT_MILLIS);
        // Close the stream and release the connection on every path, so a long crawl
        // does not leak sockets.
        try (java.io.InputStream body = connection.getInputStream()) {
            return Jsoup.parse(body, charset, url);
        } catch (Exception e) {
            // Callers treat null as "retry", so report the cause and keep that contract.
            System.out.println("parse error: " + url + " (" + e + ")");
            return null;
        } finally {
            connection.disconnect();
        }
    }
}

三、service部分,根据需要自行定义数据库表

/**
 * {@link ProvinceService} implementation that persists each administrative level through
 * its mapper and returns the id of the stored record.
 *
 * <p>Each insert is retried a bounded number of times. A persistent failure raises
 * {@link IllegalStateException} instead of retrying forever.
 *
 * @author kevin
 * @createTime 2019-11-18 20:41
 */
@Service
public class ProvinceServiceImpl implements ProvinceService {

    /** Maximum number of attempts for a single insert. */
    private static final int MAX_INSERT_ATTEMPTS = 3;

    @Autowired
    private ProvinceMapper provinceMapper;

    @Autowired
    private CityMapper cityMapper;

    @Autowired
    private DistrictMapper districtMapper;

    @Autowired
    private StreetMapper streetMapper;

    @Autowired
    private CommunityMapper communityMapper;

    /**
     * Inserts a province record.
     *
     * @return the province id read back from {@code dicProvince} after the insert
     * @throws IllegalStateException if the insert keeps failing
     */
    @Override
    public Long insertProvince(DicProvince dicProvince) {
        insertWithRetry(() -> provinceMapper.insert(dicProvince), "插入省数据失败");
        return dicProvince.getProvinceId();
    }

    /**
     * Inserts a city record.
     *
     * @return the city id read back from {@code dicCity} after the insert
     * @throws IllegalStateException if the insert keeps failing
     */
    @Override
    public Long insertCity(DicCity dicCity) {
        insertWithRetry(() -> cityMapper.insert(dicCity), "插入市数据失败");
        return dicCity.getCityId();
    }

    /**
     * Inserts a district/county record.
     *
     * @return the district id read back from {@code dicDistrict} after the insert
     * @throws IllegalStateException if the insert keeps failing
     */
    @Override
    public Long insertDistrict(DicDistrict dicDistrict) {
        insertWithRetry(() -> districtMapper.insert(dicDistrict), "插入区县数据失败");
        return dicDistrict.getDistrictId();
    }

    /**
     * Inserts a street/town record.
     *
     * @return the street id read back from {@code dicStreet} after the insert
     * @throws IllegalStateException if the insert keeps failing
     */
    @Override
    public Long insertStreet(DicStreet dicStreet) {
        insertWithRetry(() -> streetMapper.insert(dicStreet), "插入街道数据失败");
        return dicStreet.getStreetId();
    }

    /**
     * Inserts a community/village record.
     *
     * @return the community id read back from {@code dicCommunity} after the insert
     * @throws IllegalStateException if the insert keeps failing
     */
    @Override
    public Long insertCommunity(DicCommunity dicCommunity) {
        insertWithRetry(() -> communityMapper.insert(dicCommunity), "插入社区数据失败");
        return dicCommunity.getCommunityId();
    }

    /**
     * Runs {@code insert} until it reports exactly one affected row, at most
     * {@link #MAX_INSERT_ATTEMPTS} times.
     *
     * @param insert         the mapper call; returns the number of affected rows
     * @param failureMessage message logged on each failed attempt and used for the
     *                       final exception
     * @throws IllegalStateException when no attempt succeeded; the cause is the last
     *                               exception thrown by {@code insert}, if any
     */
    private void insertWithRetry(java.util.function.IntSupplier insert, String failureMessage) {
        Exception lastFailure = null;
        for (int attempt = 1; attempt <= MAX_INSERT_ATTEMPTS; attempt++) {
            try {
                if (insert.getAsInt() == 1) {
                    return;
                }
                lastFailure = null;
            } catch (Exception e) {
                lastFailure = e;
                System.out.println(failureMessage);
                e.printStackTrace();
            }
        }
        throw new IllegalStateException(failureMessage, lastFailure);
    }
}

  

基于【 springBoot+jsoup】一 || 爬取全国行政区划数据的更多相关文章

  1. Java使用Jsoup之爬取博客数据应用实例

    导入Maven依赖 <!-- https://mvnrepository.com/artifact/org.jsoup/jsoup --> <dependency> <g ...

  2. python爬虫学习之爬取全国各省市县级城市邮政编码

    实例需求:运用python语言在http://www.ip138.com/post/网站爬取全国各个省市县级城市的邮政编码,并且保存在excel文件中 实例环境:python3.7 requests库 ...

  3. Python爬取招聘网站数据,给学习、求职一点参考

    1.项目背景 随着科技的飞速发展,数据呈现爆发式的增长,任何人都摆脱不了与数据打交道,社会对于“数据”方面的人才需求也在不断增大.因此了解当下企业究竟需要招聘什么样的人才?需要什么样的技能?不管是对于 ...

  4. Java实现爬取京东手机数据

    Java实现爬取京东手机数据 最近看了某马的Java爬虫视频,看完后自己上手操作了下,基本达到了爬数据的要求,HTML页面源码也刚好复习了下,之前发布两篇关于简单爬虫的文章,也刚好用得上.项目没什么太 ...

  5. Java爬取同花顺股票数据(附源码)

    最近有小伙伴问我能不能抓取同花顺的数据,最近股票行情还不错,想把数据抓下来自己分析分析.我大A股,大家都知道的,一个概念火了,相应的股票就都大涨. 如果能及时获取股票涨跌信息,那就能在刚开始火起来的时 ...

  6. Scrapy 通过登录的方式爬取豆瓣影评数据

    Scrapy 通过登录的方式爬取豆瓣影评数据 爬虫 Scrapy 豆瓣 Fly 由于需要爬取影评数据在来做分析,就选择了豆瓣影评来抓取数据,工具使用的是Scrapy工具来实现.scrapy工具使用起来 ...

  7. selenium跳过webdriver检测并爬取天猫商品数据

    目录 简介 编写思路 使用教程 演示图片 源代码 @(文章目录) 简介 现在爬取淘宝,天猫商品数据都是需要首先进行登录的.上一节我们已经完成了模拟登录淘宝的步骤,所以在此不详细讲如何模拟登录淘宝.把关 ...

  8. 【scrapy_redis】调试后爬取了部分数据,然后重新调试时,直接被去重机制过滤掉无法重头开始爬取

    这2天遇到一个问题,之前调试的时候爬取了一些数据,结果第二天重新调试的时候发现爬虫很快结束,而且还没有报错.后来从日志里看到这个: no more duplicates will be shown ( ...

  9. node 爬虫 --- 将爬取到的数据,保存到 mysql 数据库中

    步骤一:安装必要模块 (1)cheerio模块 ,一个类似jQuery的选择器模块,分析HTML利器. (2)request模块,让http请求变的更加简单 (3)mysql模块,node连接mysq ...

随机推荐

  1. ionic生命周期函数

    Ionic4中的生命周期函数和angualr7基本是一样的,下面我们看看Ionic4中的生命周期函数,以及生命周期函数的用法. Ionic4中内置的生命周期函数: ionViewWillEnter — ...

  2. 实战c++中的string系列--std:vector 和std:string相互转换(vector to stringstream)

    string.vector 互转 string 转 vector vector  vcBuf;string        stBuf("Hello DaMao!!!");----- ...

  3. Spring cloud微服务安全实战-4-6搭建OAuth2资源服务器

    认证服务器已经搭建好了. 可以通过认证服务器拿到令牌 下面改造订单服务,让它可以用这个令牌. 争对订单服务要做三个事, 1.让订单服务知道它自己是Oauth协议里面的资源服务器.,它知道这个事后,它才 ...

  4. ES6深入浅出-4 迭代器与生成器-1.字面量增强

    今天的内容 字面量literal 写出来就是它的值 例如字符串hello.这就是自变量. 一个空对象,也是自变量 写出来就是代表它写出来的那个意思就是自变量. 与其相反的就是构造出来的.例如下面的ne ...

  5. 【Java】分布式自增ID算法---雪花算法 (snowflake,Java版)

    一般情况,实现全局唯一ID,有三种方案,分别是通过中间件方式.UUID.雪花算法. 方案一,通过中间件方式,可以是把数据库或者redis缓存作为媒介,从中间件获取ID.这种呢,优点是可以体现全局的递增 ...

  6. QML工程加载main.qml的两种方式

    1. QQmlApplicationEngine engine; engine.load(QUrl(QStringLiteral("qrc:/main.qml"))); if (e ...

  7. Django之Restful API

    理解Restful架构:http://www.ruanyifeng.com/blog/2011/09/restful RESTful设计指南:http://www.ruanyifeng.com/blo ...

  8. 【PromQL】prometheus查询语言

    常用查询: https://songjiayang.gitbooks.io/prometheus/content/exporter/nodeexporter_query.html group by 操 ...

  9. Sublime Text 3能用支持的插件推荐

    从二月份用测试版本build 3012开始用sublime text 3,虽然很多插件在sublime text 3不工作了,因为sublime text 3修复了2的一些bug.提升了性能并集成了不 ...

  10. 推荐Pi(π)币,相当于比特币手机挖矿版

    我为什么推荐这个? 说实话,之所以发出来还是因为如果用我的邀请码注册,双方的挖矿速度都会增加些,我的邀请码:leneing,有问题可以咨询我. Pi币简介 1.在这里强烈推荐Pi币,相当于比特币手机挖 ...