python 百度图片爬虫

# -*- coding:utf-8 -*-

#https://blog.csdn.net/qq_32166627/article/details/60882964

import requests

import os

import pinyin

import simplejson

def getManyPages(keyword, pages):
    """Fetch Baidu image-search result pages for ``keyword``.

    One GET request per page is sent to the ``acjson`` endpoint. The ``pn``
    offset runs 30, 60, ..., 30 * pages and ``rn`` (results per page) is 30.

    Args:
        keyword: Search term, sent as both ``queryWord`` and ``word``.
        pages: Number of result pages to request. Values <= 0 send no
            request at all.

    Returns:
        A list with one entry per usable page, each entry being the value
        of the ``data`` field of the decoded response. Pages whose body is
        not valid JSON, or whose JSON has no ``data`` field, are skipped.
    """
    url = 'https://image.baidu.com/search/acjson'

    # Query parameters shared by every page; only 'pn' changes per request.
    base_params = {
        'tn': 'resultjson_com',
        'ipn': 'rj',
        'ct': 201326592,
        'is': '',
        'fp': 'result',
        'queryWord': keyword,
        'cl': 2,
        'lm': -1,
        'ie': 'utf-8',
        'oe': 'utf-8',
        'adpicid': '',
        'st': -1,
        'z': '',
        'ic': 0,
        'word': keyword,
        's': '',
        'se': '',
        'tab': '',
        'width': '',
        'height': '',
        'face': 0,
        'istype': 2,
        'qc': '',
        'nc': 1,
        'fr': '',
        'pn': 0,  # placeholder, overwritten for each page below
        'rn': 30,
        'gsm': '1e',
        '': '',
    }

    urls = []
    for offset in range(30, 30 * pages + 30, 30):
        response = requests.get(url, params=dict(base_params, pn=offset))
        try:
            payload = response.json()
        except ValueError:
            # Both simplejson's and the stdlib json's JSONDecodeError derive
            # from ValueError, so this works whichever backend requests uses.
            print('【错误】simplejson.scanner.JSONDecodeError ')
            continue

        data = payload.get('data')
        if data is None:
            # No result list in this response; appending None would break
            # consumers that iterate over every page.
            continue
        urls.append(data)

    return urls

def getImg(dataList, localPath, keyword):
    """Download the thumbnail of every search result into ``localPath``.

    Args:
        dataList: Iterable of result pages; each page is an iterable of
            dicts. Pages that are ``None`` or empty are skipped.
        localPath: Destination directory. It is created, including any
            missing parent directories, when it does not exist yet.
        keyword: File-name prefix. Images are written as
            ``<keyword>_<n>.jpg`` where ``n`` counts up from 0 across all
            pages.

    Entries without a ``thumbURL`` value are reported with
    ``image not exist`` and do not consume a file number.
    """
    if not os.path.exists(localPath):
        os.makedirs(localPath)

    saved = 0
    for page in dataList:
        if not page:
            continue
        for item in page:
            thumb_url = item.get('thumbURL')
            if thumb_url is None:
                print('image not exist')
                continue

            print("down " + str(saved) + " image " + thumb_url)
            response = requests.get(thumb_url)
            target = localPath + "/" + keyword + '_%d.jpg' % saved
            # Context manager closes the handle after every image instead of
            # leaving that to garbage collection.
            with open(target, 'wb') as image_file:
                image_file.write(response.content)
            saved += 1

def convert():
    """De-duplicate the names in ``stars_list.txt``.

    Every line of ``stars_list.txt`` (current directory) is split on ``,``
    and its second field is taken as the name. Each distinct name is written
    once, in first-seen order and one per line, to ``stars_list_clean.txt``.
    Lines with fewer than two comma-separated fields (for example blank
    lines) are skipped.
    """
    seen = set()  # O(1) duplicate check
    with open("stars_list.txt", 'r') as face_file, \
            open("stars_list_clean.txt", 'w') as fp:
        for line in face_file:
            line = line.replace('\r', '').replace('\n', '').replace('\t', '')
            line_split = line.strip().split(",")
            if len(line_split) < 2:
                # No second field to read; skip instead of raising IndexError.
                continue

            name = line_split[1]
            print(name)
            if name in seen:
                print(name, " is exist")
                continue

            seen.add(name)
            fp.write('%s\n' % name)

def debug():
    """Run the crawler once for a single hard-coded keyword.

    Requests two result pages for the keyword and stores the thumbnails in
    ``./hanxue`` using the prefix ``hym`` for the file names.
    """
    search_term = '胡因梦'
    file_prefix = "hym"

    # Argument 1: search keyword, argument 2: number of pages to fetch.
    result_pages = getManyPages(search_term, 2)
    # Argument 2: directory the images are saved to.
    getImg(result_pages, './hanxue', file_prefix)

def run():
    """Download Baidu thumbnails for every name in ``stars_list_clean.txt``.

    Steps:
      1. Write the pinyin form of every line, one per line, to
         ``stars_list_en.txt``.
      2. For the name on line ``n`` (0-based) create the directory
         ``./stars_srcimg/<n>_<name>`` and download 5 result pages into it,
         using ``<n>_<name>`` as the image file prefix. A name whose
         directory already exists is skipped. Blank lines are skipped too
         but still occupy their line number.
    """
    with open("stars_list_clean.txt", 'r') as face_file:
        stars_list = face_file.readlines()

    with open("stars_list_en.txt", 'w') as fp:
        for line in stars_list:
            line = line.replace('\r', '').replace('\n', '').replace('\t', '')
            fp.write('%s\n' % pinyin.get(line, format="strip"))

    save_root = "./stars_srcimg/"
    if not os.path.exists(save_root):
        os.mkdir(save_root)

    pages = 5
    maxnum = pages * 30
    print(maxnum)

    # The index is the line number, so directory names stay stable even when
    # some lines are skipped.
    for face_ID_index, line in enumerate(stars_list):
        keyword = line.replace('\r', '').replace('\n', '').replace('\t', '')
        if not keyword:
            # Nothing to search for on a blank line.
            continue

        print(keyword)
        keyword_english = pinyin.get(keyword, format="strip")
        print(keyword_english)

        face_ID = str(face_ID_index) + "_" + keyword
        facesavepath = save_root + face_ID
        print(facesavepath)

        if os.path.exists(facesavepath):
            print(keyword, " exist")
            continue
        os.mkdir(facesavepath)

        print("down " + keyword)
        # Argument 1: search keyword, argument 2: number of pages to fetch.
        dataList = getManyPages(keyword, pages)
        # Argument 2: directory the images are saved to.
        getImg(dataList, facesavepath, face_ID)

if __name__ == "__main__":
    # Entry point: runs the single-keyword debug crawl.
    # Call run() here instead to process the whole stars list.
    debug()

python 百度图片爬虫的更多相关文章

python写的百度图片爬虫
学了一下python正则表达式,写一个百度图片爬虫玩玩. 当技术遇上心术不正的人,就成我这样的2B青年了. python3.6开发.程序已经打包好,下载地址: http://pan.baidu.com ...
百度图片爬虫-python版-如何爬取百度图片?
上一篇我写了如何爬取百度网盘的爬虫,在这里还是重温一下,把链接附上: http://www.cnblogs.com/huangxie/p/5473273.html 这一篇我想写写如何爬取百度图片的爬虫 ...
百度图片爬虫-python版
self.browser=imitate_browser.BrowserBase() self.chance=0 self.chanc ...
【Python网络爬虫四】通过关键字爬取多张百度图片的图片
最近看了女神的新剧《逃避虽然可耻但有用》,同样男主也是一名程序员,所以很有共鸣被大只萝莉萌的一脸一脸的,我们来爬一爬女神的皂片. 百度搜索结果:新垣结衣 本文主要分为4个部分: 1.下载 ...
Python爬虫：通过关键字爬取百度图片
使用工具:Python2.7 点我下载 scrapy框架 sublime text3 一.搭建python(Windows版本) 1.安装python2.7 ---然后在cmd当中输入python,界 ...
如何用Python爬虫实现百度图片自动下载？
Github:https://github.com/nnngu/LearningNotes 制作爬虫的步骤制作一个爬虫一般分以下几个步骤: 分析需求分析网页源代码,配合开发者工具编写正则表达式或 ...
python爬虫获取百度图片（没有精华，只为娱乐）
python3.7,爬虫技术,获取百度图片资源,msg为查询内容,cnt为查询的页数,大家快点来爬起来.注:现在只能爬取到百度的小图片,以后有大图片的方法,我会陆续发贴. #!/usr/bin/env ...
python爬虫-爬取百度图片
python爬虫-爬取百度图片(转) #!/usr/bin/python# coding=utf-8# 作者 :Y0010026# 创建时间 :2018/12/16 16:16# 文件 :spider ...
python爬虫之爬取百度图片
##author:wuhao##爬取指定页码的图片,如果需要爬取某一类的所有图片,整体框架不变,但需要另作分析#import urllib.requestimport urllib.parseimpo ...

随机推荐

PCIE xilinx v5 IP核使用前的研究
外带一个月前啃的一个星期,加本星期心无旁骛,啃出些心得,广惠后人.但愿有用. trn信号是数据链路层的信号 TLP包是数据链路层传给transaction层的包解包需要一个transaction的协 ...
IOS 入门开发之创建标题栏UINavigationBar的使用
转自:http://xys289187120.blog.51cto.com/3361352/685746 IOS 入门开发之创建标题栏UINavigationBar的使用 IOS 开发有关界面 ...
html5自定义数字键盘
原理:使用div模拟输入框,避免手机原生键盘弹出,键盘使用div模拟,点击事件使用js控制,光标闪烁使用css模拟,具体代码如下: <!doctype html> <html lan ...
what-is-a-closure
https://stackoverflow.com/questions/36636/what-is-a-closure https://www.quora.com/What-are-upvalues- ...
[Windows Azure] How to use the Queue Storage Service
How to use the Queue Storage Service version 1.7 version 2.0 This guide will show you how to perform ...
SVN Cleanup failed to process the following paths错误的解决
在使用TortoiseSVN工具执行Cleanup操作时经常出现Cleanup failed to process the following paths的错误,具体如下图: 网上搜索了一下,找到了解 ...
Logstash使用jdbc同步MySQL中的数据
[--26T20::,][WARN ][logstash.inputs.jdbc ] Exception when executing JDBC query {:exception=>#< ...
[转载]WPF控件拖动
这篇博文总结下WPF中的拖动,文章内容主要包括: 1.拖动窗口 2.拖动控件 Using Visual Studio 2.1thumb控件 2.2Drag.Drop(不连续,没有中间动画) 2.3拖动 ...
[Selenium.2.Testing.Tools.Beginners.Guide]读书笔记
Assert, this allows the test to check if the element is on the page, if it is not available then the ...
am335x uboot, kernel 编译
一.设置环境变量 // 写在家目录下面的 .bashrc 里面 export KERNEL_PATH=~/aplex/kernel3.2.0 // kernel 路径 export UBOOT_PAT ...

python 百度图片爬虫

python 百度图片爬虫的更多相关文章

随机推荐

热门专题