IMDB-TOP_250-爬虫

这个小学期Python大作业搞了个获取IMDB TOP 250电影全部信息的爬虫。第二次写爬虫，比在暑假集训时写的熟练多了。欢迎大家评论。

 '''

 ************************************************

 *Time：2017.9.11

 *Target：All movies' information of IMDB TOP_250

 *Resources：http://www.imdb.cn/IMDB250/

 ************************************************

 '''

 import re

 import requests

 import numpy as np

 import matplotlib.pyplot as plt

 from bs4 import BeautifulSoup

 num = 1 # running movie counter; used as the "NO." in progress output and in each record's heading

 All_txt = [] # formatted text of every scraped movie, written to All_infor.txt by main()

 headers={'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.221 Safari/537.36 SE 2.X MetaSr 1.0'}# browser-style User-Agent header sent with every request

 def getHTMLText(url):
     """Download *url* and return its body decoded as UTF-8.

     The request is sent with the module-level ``headers`` and a timeout so a
     stalled connection cannot hang the crawl.

     Args:
         url: Address of the page to fetch.

     Returns:
         The page text, or the sentinel string "错误" when the request fails
         (connection problem, timeout, or an HTTP error status).  Callers
         compare against this sentinel instead of catching exceptions.
     """
     try:
         r = requests.get(url, headers=headers, timeout=10)
         # Treat 4xx/5xx answers as failures rather than handing an error
         # page to the HTML parser.
         r.raise_for_status()
     except requests.RequestException:
         # Only network/HTTP problems are turned into the sentinel; anything
         # else (e.g. KeyboardInterrupt) propagates normally.
         return "错误"
     r.encoding = 'utf-8'
     return r.text

 #从每一部电影的页面中获取全部信息

 def get_all_information(url,page):
     """Scrape one movie detail page and record its fields.

     Downloads ``url`` with ``getHTMLText``, extracts the title, score,
     English-name row, director, cast, three further info rows and the
     synopsis section, and appends the formatted text to the global
     ``All_txt`` list.  The text found between full-width parentheses in the
     title (the year) is appended to ``time.txt``, one line per movie.  The
     global counter ``num`` is advanced by one on every call, including
     calls whose download failed, so numbering keeps following the position
     in the ranking.

     Args:
         url: Address of the movie's detail page.
         page: Number of the listing page the movie came from; only used in
             progress messages.

     Raises:
         IndexError: If the downloaded page lacks one of the expected
             elements (the layout is assumed, not validated).
     """
     global num

     txt = getHTMLText(url)
     fetched = txt != "错误"

     if fetched:
         print('page'+str(page)+' NO.'+str(num)+' Get it!')
     else:
         print('page'+str(page)+' NO.'+str(num)+' Failed!')

     # Progress marker kept from the original: announced when movie 247 starts.
     if num == 247:
         print('Finished!!!')

     if not fetched:
         # Parsing the error sentinel would only raise IndexError below, so
         # skip this movie and keep crawling.
         num += 1
         return

     soup = BeautifulSoup(txt,"html.parser")

     # Title block: <h3> carries the title, <i> carries the score.
     header = str(soup.find_all('div',class_ = 'hdd')[0])

     Cname = ''.join(re.findall(r'<h3>[\s\S]*?</h3>', header))
     Cname = Cname.replace('<h3>','').replace('</h3>','')

     # The year is whatever sits between full-width parentheses in the title.
     time_ = ''.join(re.findall(r'（[\s\S]*?）', Cname))
     with open('time.txt','a',encoding='utf-8') as t:
         t.write( time_.replace('（','').replace('）','') + '\n' )

     Score = ''.join(re.findall(r'<i>[\s\S]*?</i>', header))
     Score = Score.replace('<i>','').replace('</i>','')

     # Detail list: one <li> per attribute row.
     now = soup.find_all('div',class_ = 'bdd clear')
     a = BeautifulSoup(str(now[0]), "html.parser")
     many_infor = a.find_all('li')

     # Row 0: English name, with the wrapping tags stripped textually.
     Ename = str(many_infor[0]).replace('<li>','').replace('<i>','').replace('</i>','').replace('</li>','').replace('<a>','').replace('</a>','')

     # Row 2: director (first link only).
     Director_temp = BeautifulSoup(str(many_infor[2]), "html.parser").find_all('a')
     Director = Director_temp[0].get_text().replace('导演：','')

     # Row 3: cast, one link per name, joined with single spaces.
     Starring_temp = BeautifulSoup(str(many_infor[3]), "html.parser").find_all('a')
     Starring = ''.join(i.get_text().replace(' ','') + ' ' for i in Starring_temp)

     # Rows 4-6: remaining info rows, one output line each.  Each row is
     # re-parsed on its own, so ``children`` yields that row's top-level node.
     Infor = ''
     for j in range(4,7):
         Infor_temp = BeautifulSoup(str(many_infor[j]), "html.parser")
         for i in Infor_temp.children:
             Infor += i.get_text().replace(' ','') + ' '
         Infor += '\n'

     # Synopsis section: heading plus body text.
     content =  soup.find_all('div',class_ = 'fk-4 clear')
     soup_con = BeautifulSoup(str(content[0]), "html.parser")

     title = soup_con.find_all('div',class_ = 'hdd')
     title = str(title[0]).replace('<div class="hdd">','').replace('</div>','\n')

     content_1 = soup_con.find_all('div',class_ = 'bdd clear')
     content_1 = str(content_1[0]).replace('<div class="bdd clear" style="font-size:15px">','').replace('</div>','')
     content_1 = content_1.replace('<!-- <p><a href="#">更多剧情 >></a></p>  -->','').replace('<br/>','\n')

     # Store the formatted record for this movie.
     All_txt.extend([
         '第'+str(num)+'部'+'\n',
         Cname+'\n',
         '【英文名】'+Ename+'\n',
         '【评分】'+Score+'\n',
         '【导演】'+Director+'\n',
         '【主演】'+Starring+'\n',
         Infor+'\n',
         title+'\n'+content_1+'\n',
         '\n',
     ])

     num += 1

 #在每一页中得到当前页的全部电影的url

 def getin_one(url,page):
     """Scrape every movie linked from one page of the ranking list.

     Downloads the listing page, collects the ``href`` of each link inside
     the first ``<div class="ss-3 clear">`` and passes the absolute URL to
     ``get_all_information``.

     Args:
         url: Address of the listing page.
         page: Number of that listing page; forwarded for progress messages.
     """
     txt = getHTMLText(url)

     if txt == "错误":
         # The download failed; there is nothing to parse on this page.
         print('page'+str(page)+' Failed!')
         return

     soup = BeautifulSoup(txt, "html.parser")
     temp = soup.find_all('div',class_="ss-3 clear")

     if not temp:
         # The expected movie list container is missing from the page.
         print('page'+str(page)+' Failed!')
         return

     pattern = re.compile(r'<a href="[\s\S]*?">')

     for anchor in pattern.findall( str(temp[0]) ):
         # Links on the page are site-relative; prepend the host.
         temp_url = 'http://www.imdb.cn'+anchor.replace('<a href="','').replace('">','')
         get_all_information(temp_url,page)

 #将所有电影的年份统计并生成条形图

 def Analyze_some_infor():
     """Tally the years stored in ``time.txt`` by era and draw a bar chart.

     Reads one year per line from ``time.txt``, counts how many fall into
     each 20-year era, saves the chart as ``time_pic.jpg`` and then shows it.
     Lines that are blank, not a plain integer, or equal to 0 are counted
     under the "no information" label.
     """
     plt.rc('font', family='SimHei', size=13)  # font family and size; SimHei is a CJK font for the Chinese labels

     a,b,c,d,e,f = 0,0,0,0,0,0

     # time.txt is written as UTF-8, so read it back the same way, and close
     # the handle when done.
     with open('time.txt', encoding='utf-8') as year_file:
         for line in year_file:
             # Parse with int() instead of eval(): a blank line (no year was
             # found for that movie) must not crash the analysis, and file
             # contents must never be executed as code.
             try:
                 year = int(line.strip())
             except ValueError:
                 year = 0

             if year == 0:
                 f += 1
             elif 1920 <= year < 1940:
                 a += 1
             elif 1940 <= year < 1960:
                 b += 1
             elif 1960 <= year < 1980:
                 c += 1
             elif 1980 <= year < 2000:
                 d += 1
             else:
                 # NOTE: as in the original, any other value (including a
                 # year before 1920) is counted in the last era.
                 e += 1

     times = [a,b,c,d,e,f]
     range_time = ['1920-1940','1940-1960','1960-1980','1980-2000','2000-现在','无信息']

     idx = np.arange(len(range_time))
     width = 0.5

     plt.bar(idx,times,width,color='green')
     plt.xticks(idx+width/2, range_time, rotation=40)
     plt.xlabel('电影年代')
     plt.ylabel('数目')
     plt.savefig('time_pic.jpg')
     plt.show()

 def main():
     """Crawl all listing pages, save the collected text and plot the years.

     Scrapes listing pages 1 to 9, writes everything gathered in ``All_txt``
     to ``All_infor.txt`` (replacing earlier content) and finally calls
     ``Analyze_some_infor``.
     """
     # time.txt is only ever appended to while scraping, so empty it first;
     # otherwise a second run would count every year twice.
     with open('time.txt','w',encoding='utf-8'):
         pass

     getin_one('http://www.imdb.cn/IMDB250/',1)

     for i in range(2,10):
         getin_one( 'http://www.imdb.cn/imdb250/'+str(i) , i )

     # Mode 'w' already discards the previous content, so one open is enough.
     with open('All_infor.txt','w',encoding='utf-8') as x:
         x.writelines(All_txt)

     Analyze_some_infor()

 # Entry point: the crawl starts as soon as this file is executed (or imported).
 main()

作者： LB919
出处：http://www.cnblogs.com/L1B0/
该文章为LB919投入了时间和精力的原创；
如有转载，荣幸之至！请随手标明出处；

IMDB-TOP_250-爬虫的更多相关文章

IMDB TOP 250爬虫
这个小学期Python大作业搞了个获取IMDB TOP 250电影全部信息的爬虫.第二次写爬虫,比在暑假集训时写的熟练多了.欢迎大家评论. ''' ************************** ...
python爬虫https://www.imdb.com/chart/top的电影
目标:爬取https://www.imdb.com/chart/top网页上面的电影top20 直接上main.py代码: #!/usr/bin/python35 # -*- coding:utf-8 ...
Java豆瓣电影爬虫——抓取电影详情和电影短评数据
一直想做个这样的爬虫:定制自己的种子,爬取想要的数据,做点力所能及的小分析.正好,这段时间宝宝出生,一边陪宝宝和宝妈,一边把自己做的这个豆瓣电影爬虫的数据采集部分跑起来.现在做一个概要的介绍和演示. ...
利用Abot爬虫和visjs 呈现漫威宇宙
1. 引言最近接触Abot爬虫也有几天时间了,闲来无事打算从IMDB网站上爬取一些电影数据玩玩.正好美国队长3正在热映,打算爬取漫威近几年的电影并用vis这个JS库呈现下漫威宇宙的相关电影. Abo ...
Abot爬虫和visjs
1. 引言最近接触Abot爬虫也有几天时间了,闲来无事打算从IMDB网站上爬取一些电影数据玩玩.正好美国队长3正在热映,打算爬取漫威近几年的电影并用vis这个JS库呈现下漫威宇宙的相关电影. Abo ...
Python爬虫 -- 抓取电影天堂8分以上电影
看了几天的python语法,还是应该写个东西练练手.刚好假期里面看电影,找不到很好的影片,于是有个想法,何不搞个爬虫把电影天堂里面8分以上的电影爬出来.做完花了两三个小时,撸了这么一个程序.反正蛮简单 ...
python增量爬虫pyspider
1.为了能够将爬取到的数据存入本地数据库,现在本地创建一个MySQL数据库example,然后在数据库中建立一张表格test,示例如下: DROP TABLE IF EXISTS `test`; C ...
一个简单的python爬虫程序
python|网络爬虫概述这是一个简单的python爬虫程序,仅用作技术学习与交流,主要是通过一个简单的实际案例来对网络爬虫有个基础的认识. 什么是网络爬虫简单的讲,网络爬虫就是模拟人访问web ...
我们的爬虫从pyspider开始说起（一）
看各种爬虫文献也有好几天了,总是感觉下不了手,总结一句“提笔忘字,总是因为看的太多而写的太少”.所以从现在开始,把看到的想到的,需要总结的东西慢慢的都沉淀下来,扎扎实实的走好每一步. 先来说这几天遇到 ...

随机推荐

一键安装各个版本boost库（无需编译）
1.NuGet 最简单的,用VS自带的NuGet包管理器安装,一般比较常用的上面都有 2.下载exe安装包在这里https://sourceforge.net/projects/boost/file ...
os 和shutil模块的使用方法
1.python中对文件.文件夹操作时经常用到的os模块和shutil模块常用方法. 1.得到当前工作目录,即当前Python脚本工作的目录路径: os.getcwd() 2.返回指定目录下的所有文件 ...
什么是类的hashcode值
1.要知道什么是类的hashcode值,首要要了解什么是hash(哈希).Hash,一般翻译做“散列”,也有直接音译为“哈希”的,就是把任意长度的输入(又叫做预映射pre-image)通过散列算法变换 ...
windows10打开switchHost，提示无修改权限
1.在C盘找到hosts文件,点击属性,去掉“只读”的勾选. 点击编辑,点击Users,选择完全控制(这会降低电脑安全!),然后确定.
mp
问题 G: Green Bin 时间限制: 1 Sec 内存限制: 128 MB[提交] [状态] 题目描述 We will call a string obtained by arranging ...
Go_Context
如何通知子goroutine退出? 1. 使用全局变量 package main import ( "fmt" "sync" "time" ...
angular6 路由拼接查询参数如 ?id=1 并获取url参数
angular6 路由拼接查询参数如 ?id=1 并获取url参数路由拼接参数: <div class="category-border" [routerLink]=&qu ...
线索二叉树的详细实现（C++）
线索二叉树概述二叉树虽然是非线性结构,但二叉树的遍历却为二又树的结点集导出了一个线性序列.希望很快找到某一结点的前驱或后继,但不希望每次都要对二叉树遍历一遍,这就需要把每个结点的前驱和后继信息记录下 ...
EVE无法安装vim
有些时候,由于一些错误的操作,可能导致vim无法使用,例如如下情况: root@eve-ng:~# vim /etc/profile-bash: vim: command not found 此时,一 ...
[转]使用HttpOnly提升Cookie安全性
原文:https://www.cnblogs.com/zlhff/p/5477943.html 在介绍HttpOnly之前,我想跟大家聊聊Cookie及XSS. 随着B/S的普及,我们平时上网都是依赖 ...

IMDB-TOP_250-爬虫

IMDB-TOP_250-爬虫的更多相关文章

随机推荐

热门专题