Python爬虫小结

有些数据没有专门的数据集，为了获得神经网络训练所需的数据，自然而然地想到用爬虫来采集。一开始采用了网上的一段动态爬虫代码，结果发现爬取的图片大多是重复的，有效图片很少。

动态爬虫：

from lxml import etree
import requests
import re
import urllib
import json
import time
import os
 
# Directory the downloaded images are written to; created when missing.
local_path = '/home/path/'
if not os.path.exists(local_path):
    os.makedirs(local_path)

# Ask for the search term first, then for the number of images wanted.
# The term is substituted into the "word" query parameter of the first
# result-page URL.
keyword = input('请输入想要搜索图片的关键字:')
first_url = (
    'http://image.baidu.com/search/flip?tn=baiduimage&ipn=r&ct=201326592'
    '&cl=2&lm=-1&st=-1&fm=result&fr=&sf=1&fmq=1530850407660_R&pv=&ic=0&nc=1'
    '&z=&se=1&showtab=0&fb=0&width=&height=&face=0&istype=2&ie=utf-8'
    '&ctd=1530850407660%5E00_1651X792&word={}'
).format(keyword)
want_download = input('请输入想要下载图片的张数:')

# Module-level counters; get_next_page increments page_num and
# download_img increments download_num.
page_num = 1
download_num = 0
 
# Work out which file extension to use when saving an image.
def get_format(pic_url):
    """Return the file extension to save the image at *pic_url* with.

    The text after the last '.' of the URL is returned unchanged (original
    letter case) when it is, case-insensitively, one of bmp / gif / jpg /
    png. Anything else falls back to 'jpg'.
    """
    suffix = pic_url.rsplit('.', 1)[-1]
    if suffix.lower() in ('bmp', 'gif', 'jpg', 'png'):
        return suffix
    return 'jpg'
 
# Fetch a result page and work out the URL of the page that follows it.
def get_next_page(page_url):
    """Return the URL of the next result page, or None when there is none.

    Downloads *page_url*, writes the raw HTML to ``html_info.txt``
    (overwritten on every call) and reads the href of the first
    ``<a class="n">`` element found in it.

    Side effect: the module-level ``page_num`` counter is incremented on
    every call, whether or not a next page was found.
    """
    from urllib.parse import urljoin

    global page_num
    # A timeout keeps a stalled connection from hanging the crawl forever;
    # 15 seconds is the value download_img uses for image requests.
    html = requests.get(page_url, timeout=15).text
    with open('html_info.txt', 'w', encoding='utf-8') as h:
        h.write(html)
    selector = etree.HTML(html)
    try:
        msg = selector.xpath('//a[@class="n"]/@href')
        print(msg[0])
        # urljoin copes with hrefs that do or do not start with '/', so the
        # result never contains a doubled slash after the host name.
        next_page = urljoin('http://image.baidu.com/', msg[0])
        print('现在是第%d页' % (page_num + 1))
    except Exception as e:
        # An empty xpath result (IndexError) or an unparsable page both
        # mean there is nothing further to crawl.
        print('已经没有下一页了')
        print(e)
        next_page = None
    page_num = page_num + 1
    return next_page
 
# Download every image in a list of URLs and save it under local_path.
def download_img(pic_urls):
    """Download each URL in *pic_urls* and save it below ``local_path``.

    Files are named ``page<page_num>_<count>.<ext>``, where ``count`` is the
    1-based position of the URL in *pic_urls* (a failed download still uses
    up its number) and ``ext`` comes from get_format(). Every saved image
    increments the module-level ``download_num`` counter.

    Returns 0 as soon as ``download_num`` has reached the total requested
    in ``want_download``; returns None when the list is exhausted first.
    """
    global download_num
    wanted = int(want_download)
    for count, url in enumerate(pic_urls, start=1):
        time.sleep(1)  # throttle: at most one request per second
        try:
            pic = requests.get(url, timeout=15)
            # Do not store an HTTP error page as if it were an image.
            pic.raise_for_status()
            file_name = 'page%d_%d.%s' % (page_num, count, get_format(url))
            with open(local_path + file_name, 'wb') as f:
                f.write(pic.content)
            download_num = download_num + 1
        except Exception as e:
            # Best effort: report the failure and move on to the next URL.
            print(e)
        # Checked after the try block instead of inside a ``finally`` so a
        # ``return`` can never swallow an in-flight exception such as
        # KeyboardInterrupt. ``>=`` also stops the crawl when the
        # requested amount is zero or negative.
        if download_num >= wanted:
            return 0
 
# Extract the image addresses embedded in a result page.
def get_pic_urls(web_url):
    """Return the image URLs found in the page at *web_url*.

    The page source is searched for ``"objURL":"<address>",`` entries and
    the captured addresses are returned as a list of strings, in the order
    they occur (empty list when nothing matches).
    """
    # Same 15-second timeout as the image requests in download_img, so a
    # stalled connection cannot block the crawl indefinitely.
    html = requests.get(web_url, timeout=15).text
    return re.findall('"objURL":"(.*?)",', html, re.S)
 
if __name__ == "__main__":
    # Walk the result pages in order and download each page exactly once.
    # (Fetching a page both as "next" and again as "first" on the following
    # iteration would overwrite the same files while counting them twice.)
    page_url = first_url
    while True:
        # download_img returns 0 once the requested amount is reached.
        if download_img(get_pic_urls(page_url)) == 0:
            break
        page_url = get_next_page(page_url)
        if page_url is None:
            print('已经没有更多图片')
            break

为了筛除重复的图片，又采用了哈希算法（dHash，差异哈希）进行去重：

 # -*- coding: utf-8 -*-
 
 import sys
 reload(sys)
 sys.setdefaultencoding('utf8')
 
 """
 用dhash判断是否相同照片
 基于渐变比较的hash
 hash可以省略(本文省略)
 By Guanpx
 """
 import os
 from PIL import Image
 from os import listdir
 
 def picPostfix():
     """Return a new set holding the file extensions treated as pictures.

     Entries are matched exactly by callers, so letter case matters: 'JPG'
     is the only upper-case spelling in the set.
     """
     return {
         'bmp', 'jpg', 'png', 'tiff', 'gif', 'pcx', 'tga', 'exif', 'fpx',
         'svg', 'psd', 'cdr', 'pcd', 'dxf', 'ufo', 'eps', 'JPG', 'raw',
         'jpeg',
     }
 
 def getDiff(width, high, image):
     """Return the left-versus-right brightness comparisons of an image.

     *image* is resized to ``width`` x ``high`` and converted to mode 'L'
     (greyscale). Within every row each pixel is then compared with its
     right-hand neighbour.

     Returns a flat list of ``high * (width - 1)`` booleans, row after row;
     an entry is True where the left pixel is brighter than the right one.
     This sequence is the raw material for a difference hash.
     """
     gray = image.resize((width, high)).convert('L')
     pixels = list(gray.getdata())  # grey levels, one value per pixel

     diff = []
     for row in range(high):
         offset = row * width  # index of the first pixel of this row
         diff.extend(
             pixels[offset + col] > pixels[offset + col + 1]
             for col in range(width - 1)
         )
     return diff
 
 def getHamming(diff=(), diff2=()):
     """Return the Hamming distance between two difference sequences.

     Counts the positions ``i`` in ``range(len(diff))`` at which
     ``diff[i] != diff2[i]``. Both arguments default to an empty
     (immutable) sequence, giving a distance of 0.

     Entries of *diff2* beyond ``len(diff)`` are ignored; IndexError is
     raised when *diff2* is shorter than *diff*.
     """
     # Indexing (rather than zip) is deliberate: a too-short diff2 must
     # fail loudly instead of being silently truncated.
     return sum(1 for i in range(len(diff)) if diff[i] != diff2[i])
 
 if __name__ == '__main__':

     width = 32
     high = 32  # size every picture is shrunk to before comparing
     dirName = "/home/yourpath"  # directory holding the pictures
     allDiff = []  # (file name, difference sequence) of each readable picture
     postFix = picPostfix()  # extensions recognised as pictures

     # Pass 1: compute the difference sequence of every picture; files that
     # cannot be opened as an image are deleted.
     for i in os.listdir(dirName):
         if str(i).split('.')[-1] not in postFix:  # exact, case-sensitive match
             continue
         picPath = r'%s/%s' % (dirName, unicode(str(i), "utf-8"))
         try:
             im = Image.open(picPath)
         except OSError as err:
             os.remove(picPath)
             print('OS error : {}'.format(err))
         except IndexError as err:
             os.remove(picPath)
             print('Index Error: {}'.format(err))
         except IOError as err:
             os.remove(picPath)
             print('IOError : {}'.format(err))
         else:
             diff = getDiff(width, high, im)
             allDiff.append((str(i), diff))

     # Pass 2: compare every pair and delete the later picture of each
     # near-identical pair.
     removed = set()  # indices of pictures already deleted as duplicates
     for i in range(len(allDiff)):
         if i in removed:
             # A deleted picture must not serve as the reference, otherwise
             # deletions chain from one look-alike to the next.
             continue
         for j in range(i + 1, len(allDiff)):
             if j in removed:
                 continue
             ans = getHamming(allDiff[i][1], allDiff[j][1])
             if ans <= 5:  # Hamming-distance threshold; tune for your data
                 print(allDiff[i][0], "and", allDiff[j][0], "maybe same photo...")
                 result = dirName + "/" + allDiff[j][0]
                 if os.path.exists(result):
                     os.remove(result)
                 removed.add(j)

用哈希算法筛选后，又发现筛除得太多了，阈值不好控制。于是又尝试了静态爬虫的方法，发现结果还不错，重复的图片不多，也就省去了筛除的步骤。

静态爬虫：

 # -*- coding: utf-8 -*-
 import sys
 reload(sys)
 sys.setdefaultencoding('utf8')
 import time
 # 导入需要的库
 import requests
 # import os
 import json
 import time
 
 # Query Baidu image search page by page and collect the JSON results.
 def getManyPages(keyword, pages):
     '''
     Fetch `pages` pages of Baidu image-search results for `keyword`.

     keyword: search term, sent as both the 'queryWord' and 'word' parameter
     pages:   number of result pages to request; each request asks for 30
              results ('rn') at offsets ('pn') 30, 60, ..., 30 * pages

     Returns a list with one entry per successfully fetched page: the value
     stored under 'data' in that page's JSON reply. A page whose request or
     decoding fails is reported on stdout and skipped, as is a reply that
     has no 'data' entry.
     '''
     # Query parameters shared by every request; only 'pn' changes per page.
     base = {
         'tn': 'resultjson_com',
         'ipn': 'rj',
         'ct': 201326592,
         'is': '',
         'fp': 'result',
         'queryWord': keyword,
         'cl': 2,
         'lm': -1,
         'ie': 'utf-8',
         'oe': 'utf-8',
         'adpicid': '',
         'st': -1,
         'z': '',
         'ic': 0,
         'word': keyword,
         's': '',
         'se': '',
         'tab': '',
         'width': '',
         'height': '',
         'face': 0,
         'istype': 2,
         'qc': '',
         'nc': 1,
         'fr': '',
         'pn': 0,
         'rn': 30,
         'gsm': '1e',
         '': ''
     }
     url = 'https://image.baidu.com/search/acjson'
     urls = []
     for offset in range(30, 30 * pages + 30, 30):
         try:
             # The timeout keeps one stalled request from blocking the run.
             reply = requests.get(url, params=dict(base, pn=offset), timeout=15)
             data = reply.json().get('data')
         except OSError as err:
             print('OS error : {}'.format(err))
         except IndexError as err:
             print('Index Error: {}'.format(err))
         except IOError as err:
             print('IOError : {}'.format(err))
         except Exception:
             # Not a bare "except:", so Ctrl-C (KeyboardInterrupt) still
             # stops the program.
             print('Other error')
         else:
             # A reply without 'data' would put None in the result and
             # break callers that iterate over every page.
             if data is not None:
                 urls.append(data)
     return urls
 
 # Download the thumbnails listed in the search results and save them.
 def getImg(dataList, localPath):
     '''
     Save the thumbnail of every search result below `localPath`.

     dataList:  list of result pages; each page is an iterable of dicts, of
                which only the 'thumbURL' entry is used
     localPath: directory the images are written to; it is created,
                including missing parent directories, when it does not exist

     Images are stored as 0.jpg, 1.jpg, ... in the order they are met. A
     result without a 'thumbURL' is reported on stdout and skipped.
     '''
     # Imported here so the function does not depend on the script's main
     # block having imported os first (the file-level import is commented out).
     import os

     if not os.path.exists(localPath):
         os.makedirs(localPath)
     x = 0  # running number used as the file name
     for page in dataList:
         for item in page:
             thumbURL = item.get('thumbURL')
             if thumbURL is None:
                 print('图片链接不存在')
                 continue
             ir = requests.get(thumbURL)
             # "with" closes the file even if the write fails.
             with open(localPath + '/' + '%d.jpg' % x, 'wb') as f:
                 f.write(ir.content)
             x += 1
 
 # Download images for every keyword directory. Expected layout:
 # <father_path>/<group>/<keyword>/ - the directory name is the search term.
 if __name__ == '__main__':
     import os
     father_path = "/home/yourpath/"
     t0 = time.time()
     for init in os.listdir(father_path):
         print('init is{}'.format(str(init)))
         # os.listdir returns bare names, so the group directory has to be
         # joined with its parent before it can be listed.
         for name in os.listdir(os.path.join(father_path, init)):
             print('name is{}'.format(str(name)))
             t1 = time.time()
             target = os.path.join(father_path, init, name)
             if not os.listdir(target):  # only fill directories that are still empty
                 dataList = getManyPages(name, 30)
                 getImg(dataList, target)
             t2 = time.time()
             print('cost time is', t2 - t1)
     t3 = time.time()
     print('total time is', t3 - t0)
     # Single-keyword usage:
     #     dataList = getManyPages('keyword', page_number)  # search term, number of pages
     #     getImg(dataList, './file_path/')  # second argument: output directory

Python爬虫小结的更多相关文章

python爬虫小结1
先看正则化,正则化就是描述命令和字符切分.查找.筛选等功能的方便方式. http://www.cnblogs.com/fnng/archive/2013/05/20/3089816.html 一个游戏 ...
【Python爬虫】爬虫利器 requests 库小结
requests库 Requests 是一个 Python 的 HTTP 客户端库. 支持许多 HTTP 特性,可以非常方便地进行网页请求.网页分析和处理网页资源,拥有许多强大的功能. 本文主要介绍 ...
Python爬虫股票数据爬取
前一篇提到了与股票数据相关的可能几种数据情况,本篇接着上篇,介绍一下多个网页的数据爬取.目标抓取平安银行(000001)从1989年~2017年的全部财务数据. 数据源分析地址分析 http://m ...
Python字典小结
字典(dict)结构是Python中常用的数据结构,笔者结合自己的实际使用经验,对字典方面的相关知识做个小结,希望能对读者一些启发~ 创建字典常见的字典创建方法就是先建立一个空字典,然后逐一 ...
0.Python 爬虫之Scrapy入门实践指南（Scrapy基础知识）
目录 0.0.Scrapy基础 0.1.Scrapy 框架图 0.2.Scrapy主要包括了以下组件: 0.3.Scrapy简单示例如下: 0.4.Scrapy运行流程如下: 0.5.还有什么? 0. ...
路飞学城Python爬虫课第一章笔记
前言原创文章,转载引用务必注明链接.水平有限,如有疏漏,欢迎指正. 之前看阮一峰的博客文章,介绍到路飞学城爬虫课程限免,看了眼内容还不错,就兴冲冲报了名,99块钱满足以下条件会返还并送书送视频. 缴 ...
Python爬虫视频教程
├─第1章_[第0周]网络爬虫之前奏 │ ├─第1节_"网络爬虫"课程内容导学 │ │ 第1部分_全课程内容导学.mp4 │ │ 第2部分_全课程内容导学(WS00单元)学习资料. ...
Python爬虫之小试牛刀——使用Python抓取百度街景图像
之前用.Net做过一些自动化爬虫程序,听大牛们说使用python来写爬虫更便捷,按捺不住抽空试了一把,使用Python抓取百度街景影像. 这两天,武汉迎来了一个德国总理默克尔这位大人物,又刷了一把武汉 ...
Python 爬虫入门实战
1. 前言首先自我介绍一下,我是一个做 Java 的开发人员,从今年下半年开始,一直在各大技术博客网站发表自己的一些技术文章,差不多有几个月了,之前在 cnblog 博客园加了网站统计代码,看到每天 ...

随机推荐

CentOS7.2 部署Ceph分布式存储
1.1 环境准备主机名 IP地址 ceph-admin 192.168.16.220 ceph-node1,ceph-mon 192.168.16.221 ceph-node2,ceph-mon 1 ...
virtualenv虚拟环境使用及介绍
一.为什么使用virtualenv虚拟环境每个虚拟环境下的依赖相互独立,不同的项目可以单独使用一套python环境,减少各依赖包的影响更容易部署容器化二.virtualenv创建虚拟环境三. ...
Windows和Linux下与VMware虚拟机通过内网IP通讯
首先分两种情况:一种是你的电脑已经在一个内网的环境下且有额外的内网IP地址,和另一种只是想给自己电脑上的虚拟机分配个内网IP来通讯. ①有可用的内网IP 找到一个空闲的IP地址(这里以192.168. ...
入门Grunt前端构建工具
1. 全局安装 grunt:(倘若之前电脑安装过,则跳过此步骤) $ cnpm install -g grunt-cli 2. 作为项目的开发依赖(devDependencies)安装: (此步骤会自 ...
C# 删除指定文件
using System;using System.Collections.Generic;using System.IO;using System.Linq;using System.Text;us ...
EF 使用lambda表达式更新一对多数据时报错
1.需求更新一对多表中的附表数据,表结构如下: 2.思路个人觉得一个个去对比关联的附表数据是删除还是添加比较麻烦,就直接清空主表关联的附表,然后重新建立关联关系. 3.弊端如果附表(前提是附表 ...
MySQL快速回顾：更新和删除操作
前提要述:参考书籍<MySQL必知必会> 6.1 更新数据为了更新(修改)表中的数据,可使用UPDATE语句.可采用两种方式使用UPDATE: 更新表中特定的行: 更新表中所有的行. U ...
3maven常用命令和配置依赖
依赖: 例:spring-context.jar 依赖 spring-aop.jar... A中的某些类需要使用B中的某些类,则称为A依赖于B 在maven项目中,如果要使用一个当时存在的Jar或 ...
springboot +fastdfs 上传文件到到云服务器
fastdfs在云服务器的搭建和配置:https://blog.csdn.net/qq_41592652/article/details/104006289 springboot结构如下: appli ...
CentOS6.8 LAMP
第一次配置LAMP运行环境,上网查询了很多资料,一边试命令一边学习.服务器重置了很多次. 虽然有OneinStack这个方便的网站一键命令部署,但知道这个网站却是我自己踩坑之后的事情了,故此记录. 1 ...

Python爬虫小结

Python爬虫小结的更多相关文章

随机推荐

热门专题