基于Python的爬虫案例
案例1:使用爬虫爬取京东华为手机用户评论
本案例借鉴了哔哩哔哩博主的视频教程,感谢该教程为我开启了爬虫之旅:https://www.bilibili.com/video/BV1Yt4y1Y7nt?t=3456。本案例主要是通过京东华为手机页面爬取用户的评论数据,便于从性能、质量、价格等多个维度对该款华为手机进行用户层面的分析。
import requests
import json


class Jdcomment_spider():
    """Scrape JD.com product comments and save them as tab-separated text.

    The output file is opened in ``__init__`` and stays open until
    ``close_files`` is called, so callers should pair the two, ideally
    with ``try``/``finally``.
    """

    def __init__(self, file_name='jingdong_comment'):
        """Open ``./<file_name>.txt`` for writing (UTF-8, truncating).

        Args:
            file_name: Output file name without the ``.txt`` extension.
        """
        # Browser-like User-Agent sent with every request.
        self.headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36'}
        self.fp = open(f'./{file_name}.txt', 'w', encoding='utf-8')
        print(f'爬虫开始,打开{file_name}文件!')

    def parse_one_page(self, url):
        """Fetch one comment page and append each comment as one TSV row.

        A response that is not valid JSON, or that has no ``comments``
        entry, is skipped instead of aborting the whole crawl.

        Args:
            url: Comment API URL. The ``callback`` query parameter must be
                removed so the body is plain JSON rather than JSONP.
        """
        response = requests.get(url, headers=self.headers, timeout=30)
        try:
            # strict=False tolerates raw control characters inside strings.
            js_data = json.loads(response.text, strict=False)
        except json.JSONDecodeError:
            print(f'响应不是合法的JSON,跳过:{url}')
            return

        for comment in js_data.get('comments') or []:
            # 'id' of the comment record (the original labelled it a goods
            # id -- TODO confirm against the API).
            comment_id = comment.get('id')
            nickname = comment.get('nickname')
            print(nickname)
            score = comment.get('score')
            product_size = comment.get('productSize')
            product_color = comment.get('productColor')
            creation_time = comment.get('creationTime')
            # A missing/None content becomes '' instead of raising; newlines
            # are flattened so every comment stays on a single output line.
            content = (comment.get('content') or '').replace('\n', ' ')
            print(content)
            self.fp.write(f'{comment_id}\t{nickname}\t{score}\t{product_size}\t{product_color}\t{creation_time}\t{content}\n')

    def parse_max_page(self, max_page=70, product_id='10025237646790'):
        """Scrape comment pages ``0 .. max_page - 1`` of one product.

        Args:
            max_page: Number of pages to request (10 comments per page).
            product_id: JD product id whose comments are fetched.
        """
        for page_num in range(max_page):
            print(f'正在抓取第{page_num}页的内容')
            url = f'https://club.jd.com/comment/productPageComments.action?productId={product_id}&score=0&sortType=5&page={page_num}&pageSize=10&isShadowSku=0&fold=1'
            self.parse_one_page(url=url)

    def close_files(self):
        """Close the output file opened by ``__init__``."""
        self.fp.close()
        print('爬虫结束,关闭文件!')


if __name__ == '__main__':
    jd_spider = Jdcomment_spider()
    try:
        jd_spider.parse_max_page()
    finally:
        # Always release the file handle, even if a request fails midway.
        jd_spider.close_files()
案例2:使用爬虫下载百度学术论文
本案例主要是通过百度学术与SCI-HUB相结合的方式来获取并下载文献,主要步骤为:先在百度学术搜索界面获取关键词搜索后的URL,再在关键词跳转后的论文页面中查看源代码找到DOI(采用正则表达式进行提取),提取完DOI后利用SCI-HUB搜索文献的URL方式来获取PDF下载的网址。
1 #导入模块
2 import requests
3 import re
4 import os
5 from urllib.request import urlretrieve
6
7 #获取URL信息
def get_url(key):
    """Build the Baidu Scholar search URL for the keyword(s) *key*.

    The keyword is percent-encoded so that spaces and reserved characters
    such as ``&``, ``=`` or ``#`` cannot break or truncate the query string.

    Args:
        key: Search keywords as typed by the user.

    Returns:
        The full search URL as a string.
    """
    from urllib.parse import quote

    return ('https://xueshu.baidu.com/s?wd=' + quote(key, safe='')
            + '&rsv_bp=0&tn=SE_baiduxueshu_c1gjeupa&rsv_spt=3&ie=utf-8&f=8&rsv_sug2=0&sc_f_para=sc_tasktype%3D%7BfirstSimpleSearch%7D')
11
12 #获取headers 反爬虫
13 headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36',
14 'cookie':'PSTM=1566269439; BIDUPSID=3E682072B0A8C093085B76FBCE0C034D; MCITY=-%3A; BAIDUID=320C35B2412D12FCFA87BEAAE26FAC75:FG=1; BDORZ=B490B5EBF6F3CD402E515D22BCDA1598; __yjs_duid=1_ecde685c2a213f89118f49f95351d0131616728035849; BDSFRCVID_BFESS=Bc-OJeC624mAqbveVwGaU7iYMxe-PnbTH6aoaUI4HUrI-lBClSPbEG0P_f8g0Ku-jgOsogKKyeOTHu8F_2uxOjjg8UtVJeC6EG0Ptf8g0M5; H_BDCLCKID_SF_BFESS=tJkt_K-5JKvjD4-k247Hhn8thmT22-us0DAL2hcH0KLKMb6qKt5-bqKWQUQPt-ckB6b-sxJ8Kfb1MRjv-qozjMkAK4uL2UjmaN7T3q5TtUJreCnTDMRhqtIsXpbyKMniMCT9-pnafpQrh459XP68bTkA5bjZKxtq3mkjbPbDfn028DKu-n5jHj5XeH0D3q; delPer=0; PSINO=5; BD_HOME=0; BD_CK_SAM=1; antispam_key_id=45; antispam_site=ae_xueshu_paper; BDRCVFR[A88o6x7IGkt]=mk3SLVN4HKm; ab_sr=1.0.0_MWE3NGYyMTgxMjY0ZGM2NTcxNDAwMjVjZmNiOWU3YzIwNDA4OWNmZmNlNmM4NWUyZmZkNDVmN2E1OTZjOGZkMWFiNGFjYTU4Yzg4NTEyMDRkYTkzZTJlYTg3OTU0NTdl; antispam_data=fba5ca43ae000a429d092bba6e092ef3cf7c4c117f92e52dfe3260bb55855d667471475559fa8e05dc8e013a6316afd57f176fcab9710d0fe3eefb2f7799e44a25af15c58c1aae998deb0b9cf008b74e9e8d346b4156cdb351b74869e25b2990; antispam_sign=adb02c19; BA_HECTOR=aka0a52gah2504agvt1g68d0h0r; H_PS_PSSID=; Hm_lvt_43115ae30293b511088d3cbe41ec099c=1617162379,1617179509,1617179667; Hm_lpvt_43115ae30293b511088d3cbe41ec099c=1617179667; Hm_lvt_f28578486a5410f35e6fbd0da5361e5f=1617162379,1617179509,1617179667; Hm_lpvt_f28578486a5410f35e6fbd0da5361e5f=1617179667; BDRCVFR[w2jhEs_Zudc]=mk3SLVN4HKm; BDSVRTM=173'}
15
16 #提取论文的DOI值
def get_paper_link(headers, key):
    """Collect the DOIs of the papers listed on the search result page.

    Each result link found on the Baidu Scholar search page is fetched and
    its HTML is searched for the DOI field. Pages without a DOI match are
    skipped.

    Args:
        headers: HTTP headers (User-Agent, cookie) sent with every request.
        key: Search keywords, passed on to ``get_url``.

    Returns:
        A list with one entry per paper; each entry is the list of regex
        matches for that page, so the DOI itself is ``entry[0]``.
    """
    response = requests.get(url=get_url(key), headers=headers, timeout=30)
    # The capture group is the scheme-less address of each paper page.
    # NOTE(review): ``(.*)`` is greedy up to the last quote on the line --
    # confirm against the live markup that nothing follows the href.
    paper_links = re.findall(r'<h3 class=\"t c_font\">\n + \n + <a href=\"(.*)\"', response.text)

    doi_list = []
    for link in paper_links:
        paper_url = 'http:' + link
        res_data = requests.get(url=paper_url, headers=headers, timeout=30).text
        paper_doi = re.findall(r'\'doi\'}\">\n +(.*?)\n ', res_data)
        # Keep only pages with a match that contains "10" (DOIs use the
        # "10." prefix); the original swallowed the no-match case with a
        # bare ``except``.
        if paper_doi and '10' in paper_doi[0]:
            doi_list.append(paper_doi)
    return doi_list
33
34 #构建scihub下载链接
def doi_download(headers, key):
    """Download the PDF of every paper found for *key* via Sci-Hub.

    For each DOI the Sci-Hub page is fetched, the PDF address is read from
    the ``<iframe>`` on that page, and the file is saved in the current
    directory as ``<last DOI segment>.pdf``. Papers whose page has no
    matching iframe are reported and skipped.

    Args:
        headers: HTTP headers sent with every request.
        key: Search keywords, passed on to ``get_paper_link``.
    """
    for doi in get_paper_link(headers, key):
        doi_link = "https://sci-hub.tf/" + doi[0]
        print(doi_link)

        res = requests.get(url=doi_link, headers=headers, timeout=30)
        iframe_srcs = re.findall(r'<iframe.*?src="(.*?)" id=.*?<\/iframe>', res.text)
        if not iframe_srcs:
            # No embedded PDF on the page: skip instead of IndexError.
            print('未找到下载链接:' + doi_link)
            continue

        down_link = iframe_srcs[0]
        # The original tested ``doi_link`` for a missing scheme, which can
        # never trigger because it is built with "https://" above. The check
        # is presumably meant for a protocol-relative iframe src ("//host/..").
        if down_link.startswith('//'):
            down_link = 'https:' + down_link
        print(down_link)

        r = requests.get(url=down_link, headers=headers, timeout=30)
        file_stem = doi_link.split('/')[-1]
        with open(file_stem + '.pdf', 'wb') as f:
            f.write(r.content)
        print('下载完毕:' + file_stem)
51
# Ask the user for the search keywords, then download every matching paper.
key = input('请输入需要下载的论文')
doi_download(key=key, headers=headers)
案例3:智联招聘网站信息获取及分析
本案例主要是通过智联招聘网站获取上海地区相关工作岗位的公司性质、公司工资、公司要求、学历要求、经验要求、年龄要求等多种数据,并对所得数据进行简单的分析和处理。
1 import requests
2 import re
3 import openpyxl
4 import time
5
6 head = {
7 'cookie': "x-zp-client-id=780a3405-a5d4-4890-92e1-4664829ca846; sts_deviceid=17861c0ec7262d-0f5a8f3ca09458-7373e61-2073600-17861c0ec73103; adfcid2=none; adfbid2=0; LastCity=%E4%B8%8A%E6%B5%B7; LastCity%5Fid=538; FSSBBIl1UgzbN7N443S=25F.0sssQmVDwwCXGqZv2j9BDZfsjKtlMZTD.MdZsUai9uY_xSd8vUXpVziT_BAC; locationInfo_search={%22code%22:%22576%22%2C%22name%22:%22%E5%A4%AA%E5%8E%9F%22%2C%22message%22:%22%E5%8C%B9%E9%85%8D%E5%88%B0%E5%B8%82%E7%BA%A7%E7%BC%96%E7%A0%81%22}; _uab_collina=161654777909311187692832; at=081be913be4f467dadafa0c178630d50; rt=fc2e0c878e01427b82b32427e4db0ac2; selectCity_search=538; ssxmod_itna=eqjxgDyD0Q3WwxGq0dD=wOEBCGOY3Kat7DRAmx0yGueGzDAxn40iDt=oHPhzFSAY4+WDMm20EaxajfRXWKKeOcGbLQn4qGLDmKDySW3w7DxOq0rD74irDDxD3DbRdDSDWKD9zqi3DEnKGfDDoDYf6uDitD4qDBGhdDKqGg8wGtWA=4g4rMGmUtYCGx8qDMmeGXCBWQOeaaaAXWtqGyIPGu0uU9IqbDCO+bfYpGvDp4IAwh57hbmG53SDhr+7mb+DwtfBhGODxw0Dp0xDfxQABQeD; ssxmod_itna2=eqjxgDyD0Q3WwxGq0dD=wOEBCGOY3Kat7DRADnKSiW5Dsp+DLnaKou/F+dsBtYThjTV43uGLtKrBRhQD6DQ+deiat6hcwV9Zm09TiXhnKnxDO92GOpQwCMOop=Caxz9uc1/WoRt0yhjfeRx7UxYS3xA3m2xWD7QPCxGcDiQPeD==; urlfrom=121122523; urlfrom2=121122523; adfbid=0; sts_sg=1; sts_sid=17872e8425f2d9-0783ac3ef1514f-7373e61-2073600-17872e8426044b; sts_chnlsid=121122523; 
zp_src_url=https%3A%2F%2Fwww.baidu.com%2Fbaidu.php%3Fsc.Kf0000K5cNxA6dzipIkSprEwAQaOYa8tX4WYiAxDWD7I29PDuPkPYZM9hefcGb51rnV7AXHQrtltBH905i_cjRrjyPVYD9Dko0iYc7c3dh3W2rhUXbyDD-pqP_45d4QlbcX_MOIflgmOJ_cm8Pe-FSnpRnSJzRVxWYIuQg_VotdiIxV7tPSoZXhX5kA6e_IvJm6mVHyUZDGcES8kLzQBdcDN9unE.DD_NR2Ar5Od669BCXgjRzeASFDZtwhUVHf632MRRt_Q_DNKnLeMX5DkgboozuPvHWdsHRy2J7jZZOlsfRymoM4EQ9JuIWxDBaurGtIKnLxKfYt_U_DY2yQvTyjtLsqT7jHzlRL5spy59OPt5gKfYtVKnv-WF_tU2lSMkl32AM-9I7fH7fmCuX8a9G4myIrP-SJFWZWlkLfYXLDkexd8WoLurAOtxbOveMmOUSENOoRojPakgkvUSkf.U1Yk0ZDqd_xKJVgfkoWPSPx8YnQNYnp30ZKGm1Ys0Zfqd_xKJVgfkoWPSPx8YnQNYnp30A-V5HczPfKM5yqbXWD0Iybqmh7GuZR0TA-b5Hcv0APGujYznHf0UgfqnH0krNtknjDLg1csPH7xnH0YP7tknjc1g1nvnjD0pvbqn0KzIjYvPW00mhbqnHR3g1csP7tznHIxPH010AdW5HDsnj7xnH63rjRdrj6dP7tznjRkg1Dsn-tkg100TgKGujYs0Z7Wpyfqn0KzuLw9u1Ys0A7B5HKxn0K-ThTqn0KsTjYs0A4vTjYsQW0snj0snj0s0AdYTjYs0AwbUL0qn0KzpWYk0Aw-IWdsmsKhIjYs0ZKC5H00ULnqn0KBI1Ykn0K8IjYs0ZPl5fK9TdqGuAnqTZnVmvY0pywW5R9affKYmgFMugfqPWPxn7tkPH00IZN15H6kPH6Ln10LPHm1njTdPWRLrH00ThNkIjYkPWDvrjndPHcdnHfk0ZPGujdWuHnYm1bLnH0snj9bn1NW0AP1UHY3P1uKnj9jwbNAfHPKnDFK0A7W5HD0TA3qn0KkUgfqn0KkUgnqn0KlIjYs0AdWgvuzUvYqn7tsg1KxnH0YP-tsg100uA78IyF-gLK_my4GuZnqn7tsg1KxnH63nHm4rNtsg100TA7Ygvu_myTqn0Kbmv-b5H00ugwGujYVnfK9TLKWm1Ys0ZNspy4Wm1Ys0Z7VuWYs0AuWIgfqn0KGTvP_5H00XMK_Ignqn0K9uAu_myTqnfK_uhnqn0KbmvPb5H0knRR1rHPanbfkwbP7fWD1wWT1PY7Dn1TvnDNjwjnz0Zwzmyw-5HTvnjcsn6KBuA-b5HnznDn1PRczPjDzwjDzwbPAfH6LfbNKfRRzPbuKf1Td0AqW5HD0mMfqn0KEmgwL5H00ULfqnfKETMKY5HDWnan1c1cWnWR3rHc1nWfWnWDsnanznH0sQW0snj0snankc1cWnanVc108nj0snj0sc1D8nj0snj0s0Z91IZRqnWTdP1fLPsKkgLmqna34PdtsQW0sg108njKxna34n7tsQW61g108n1Pxna3zn7tknW60mMPxTZFEuA-b5H00pgPxmLK95H00mL0qn0K-TLfqn0KWThnqPHcvrjT%26xst%3Dm1YsnH77n1b1fWFDnRujwRcknYmLn1IKwjnLPWK7fYf1n67B5HnznDn1PRczPjDzwjDzwbPAfH6LfbNKfRRzPbuKf1Td0gnqnHf3PH6LnWcdPHTLP1DkP1fLPj9xnWcdg10KI1LyktAJdIjA8nL3dSefsVgfko6KTHLyktAJdIjA8nL3dSefsVgfko6KIHYzP1RLPjTL0gfqnHmkPW61PHRzPf7VTHYs0W0aQf7WpjdhmdqsmsD1PWmzrjb4Pj0z%26word%3D%26ck%3D6335.3.86.234.150.183.138.217%26shh%3Dwww.baid
u.com%26sht%3Dbaidu%26us%3D1.0.1.0.1.301.0%26wd%3D%26bc%3D110101; ZP_OLD_FLAG=false; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%221098833668%22%2C%22first_id%22%3A%2217861c0ed597c3-038f16a76b7241-7373e61-2073600-17861c0ed5aae6%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E7%9B%B4%E6%8E%A5%E6%B5%81%E9%87%8F%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC_%E7%9B%B4%E6%8E%A5%E6%89%93%E5%BC%80%22%2C%22%24latest_referrer%22%3A%22%22%2C%22%24latest_utm_source%22%3A%22baiduPC%22%2C%22%24latest_utm_medium%22%3A%22CPC%22%2C%22%24latest_utm_campaign%22%3A%22pp%22%2C%22%24latest_utm_content%22%3A%22tj%22%2C%22%24latest_utm_term%22%3A%2228701167%22%7D%2C%22%24device_id%22%3A%2217861c0ed597c3-038f16a76b7241-7373e61-2073600-17861c0ed5aae6%22%7D; Hm_lvt_38ba284938d5eddca645bb5e02a02006=1616547737,1616835569; sts_evtseq=2; ZL_REPORT_GLOBAL={%22/resume/new%22:{%22actionid%22:%220581e348-6ff3-47a2-9c46-adc4b33a299e%22%2C%22funczone%22:%22addrsm_ok_rcm%22}}; acw_tc=2760828b16168355886785280e5895223771a1826f480f902918aba10f19b7; d4d6cd0b4a19fa72b8cc377185129bb7=f2d20fdf-aca1-47d5-9419-68a1546651e1; zpfe_probe_token=7e4632aes7eed04bb5852d2b16abd20f2480; Hm_lpvt_38ba284938d5eddca645bb5e02a02006=1616835599; FSSBBIl1UgzbN7N443T=5yfdFrb7rjkmpYEstB19g1UVEj9YDb4LJm9L80MbHBMebRO_SeWcfjNo5j.peEtPAeKPKHUGCLZe28BXFiOi3vQYUGrsICs34JIGqAiY72SywAP0Gs.QTTm8iMdbgqKIKKHnuiMJUeztKS64vZRt6g2PeHj1hkinnFErcamuUhf7EBxP34L9oXRYLCdIRAixDuMhcTOwhxVurVUlQmvGOZ1tiflELTVw..OLkO8esROa.LEKP8AvE_ANpsReRVNz4RveDVngNgZTZ1Zq0fYffsv76AJkEmQuylNIF14LjxqNdVHjOHzBu7TFrL08ID1U3_515gBgJ5Gd3cw_6g2aXhXJh3WXlAFmWZe9NLyS.eplb6F9BK.J59jxPd.4XEzRq0La",
8 'user-agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36"
9 }
def reqdata():
    """Scrape Zhaopin search result pages and collect eight job fields.

    Pages 1-9 are requested for each of sixteen ``re`` filter values, with
    a one-second pause before every request. Each field is extracted from
    the raw HTML with a regular expression.

    Returns:
        A tuple of eight lists, in this order: position names, wage
        ranges, regions, working years, education requirements, enterprise
        names, enterprise natures, enterprise scales. The lists are filled
        independently, so they are only aligned row-by-row when every
        pattern matches once per job posting.
    """
    position_name_data = []
    wage_range_data = []
    region_data = []
    working_years_data = []
    education_requirements_data = []
    enterprise_name_data = []
    enterprise_nature_data = []
    enterprise_scale_data = []

    # Values of the ``re`` query parameter, in the original request order.
    # NOTE(review): presumably district codes within jl=538 -- confirm.
    re_codes = ['2029', '2036', '2035', '2026', '2019', '2030', '2023', '2032',
                '2028', '2031', '2024', '2034', '2033', '2021', '2027', '2022']
    all_list = [
        'https://sou.zhaopin.com/?jl=538&p={}&re={}'.format(num, code)
        for num in range(1, 10)
        for code in re_codes
    ]

    for url in all_list:
        time.sleep(1)  # throttle requests
        print('----正在下载----', url)
        # Drop non-breaking spaces so they do not end up in the field values.
        res = requests.get(url, headers=head, timeout=30).text.replace('\xa0', '')

        # Position name
        position_name_data.extend(
            re.findall('(?<=jobname"><span title=")(.+?)(?=" class="iteminfo)', res))

        # Wage range (may span lines, hence re.S; surrounding whitespace removed)
        wage_r = re.findall('(?<=<p class="iteminfo__line2__jobdesc__salary">)(.+?)(?=<!----></p>)', res, re.S)
        wage_range_data.extend(w.strip() for w in wage_r)

        # Region and working years alternate in the matches of this pattern
        # (only items that are followed by another <li>).
        demand_items = re.findall('(?<=<li class="iteminfo__line2__jobdesc__demand__item">)(.+?)(?=</li> <li class=")', res)
        region_data.extend(demand_items[::2])
        working_years_data.extend(demand_items[1::2])

        # Education requirement: every third item when all items are matched
        education_requirements_data.extend(
            re.findall('(?<=<li class="iteminfo__line2__jobdesc__demand__item">)(.+?)(?=</li>)', res)[2::3])

        # Enterprise name: narrow to the fragments after an alt attribute
        # first, then read the title attribute inside them.
        erro_name = re.findall('(?<=" alt=")(.+?)(?=</span>)', res)
        enterprise_name_data.extend(
            re.findall('(?<=<span title=")(.+?)(?=" class="iteminfo)', str(erro_name)))

        # Enterprise nature and scale alternate in the matches of this pattern
        comp_items = re.findall('(?<=<span class="iteminfo__line2__compdesc__item">)(.+?)(?=</span>)', res)
        enterprise_nature_data.extend(comp_items[::2])
        enterprise_scale_data.extend(comp_items[1::2])

    return position_name_data, wage_range_data, region_data, working_years_data, education_requirements_data, enterprise_name_data, enterprise_nature_data, enterprise_scale_data
if __name__ == '__main__':
    # Scrape the data, then write it to an Excel sheet: one header row
    # followed by one row per job posting.
    columns = reqdata()

    work = openpyxl.Workbook()
    # Insert at index 0 so this is the first sheet in the saved workbook.
    wke = work.create_sheet(index=0, title='招聘信息')
    wke.append(['职位名称', '薪资范围', '地区', '工作年限',
                '学历要求', '企业名称', '企业性质', '企业规模'])

    # zip() stops at the shortest column; report it instead of silently
    # dropping the surplus values of the longer columns.
    if len({len(column) for column in columns}) > 1:
        print('警告:各列数据长度不一致,多余的数据将被丢弃', [len(column) for column in columns])

    # No fixed row limit (the original stopped after 4998 data rows).
    for row in zip(*columns):
        wke.append(row)

    work.save('./招聘信息汇总表.xlsx')
117
118 import pandas as pd
119 import matplotlib.pyplot as plt
120 import matplotlib as mpl
121
# Region column of the exported sheet, as a plain list.
preffered_foot = list(pd.read_excel(r'招聘信息汇总表.xlsx')['地区'])

# Region labels shown in the pie chart, one slice per label.
foot = [
    '上海', '上海-松江区', '上海-徐汇区', '上海-长宁区', '上海-普陀区',
    '上海-虹口区', '上海-崇明区', '上海-杨浦区', '上海-金山区', '上海-黄浦区',
    '上海-闵行区', '上海-宝山区', '上海-嘉定区', '上海-浦东新区', '上海-青浦区',
    '上海-静安区', '上海-奉贤区',
]
# Number of postings per label, in the same order as ``foot``.
counts = [preffered_foot.count(region_name) for region_name in foot]

# Font able to render the Chinese labels.
mpl.rcParams['font.family'] = 'SimHei'
# Figure size in inches and resolution.
plt.figure(figsize=(9, 6), dpi=100)
# Equal aspect ratio keeps the pie circular.
plt.axes(aspect='equal')
# One colour per slice, in label order.
color = [
    'red', 'y', 'c', 'b', 'cyan', '#FF69B4', '#FFB6C1', '#6B4226', 'yellow',
    '#E47833', 'greenyellow', '#545454', '#FF00FF', '#32CD99', '#00FFFF',
    '#545454', '#B5A642',
]
# autopct: percentage text with two decimals; shadow: drop shadow;
# startangle: rotation of the first slice; labeldistance: label radius.
plt.pie(
    counts,
    labels=foot,
    colors=color,
    autopct='%.2f%%',
    shadow=True,
    startangle=30,
    labeldistance=1.1,
)
plt.title('职位地区分布饼状图', fontsize=15)
plt.savefig(fname="职位地区分布饼状图.png")
plt.show()
143
144 import pandas as pd
145 import matplotlib.pyplot as plt
146 import matplotlib.gridspec as gridspec
147 import matplotlib as mpl
148 from matplotlib.ticker import FuncFormatter
# Font able to render the Chinese labels.
plt.rcParams['font.sans-serif'] = ['Microsoft YaHei']
df = pd.read_excel(r"招聘信息汇总表.xlsx")
height = df['工作年限'].value_counts()
# Sort by label, and take the labels from the SAME sorted series so that
# every x label is paired with its own value (the original took labels in
# frequency order but values in label order).
counts = height.sort_index()
skill = [f'{m}' for m in counts.index]
# Share of each category in percent, matching the y-axis label.
percent = (counts / counts.sum() * 100).values

plt.figure(figsize=(12, 5), dpi=100)
# Plot style
plt.style.use('ggplot')
plt.plot(skill[::-1], percent[::-1], linewidth=8, color='y', marker='o',
         markerfacecolor='blue', markersize=12)


def to_percent(temp, position):
    """Format the tick value *temp* as a whole-number percentage.

    *position* (the tick index) is required by FuncFormatter but unused.
    """
    return '%1.0f' % temp + '%'


plt.gca().yaxis.set_major_formatter(FuncFormatter(to_percent))

plt.title('工作年限要求折线图')
plt.xlabel('工作年限')
plt.ylabel('所占百分比')
plt.savefig(fname="工作年限要求折线图.png")
plt.show()
169
170 import pandas as pd
171 import matplotlib.pyplot as plt
172 import matplotlib as mpl
173 from matplotlib.ticker import FuncFormatter
# Font able to render the Chinese labels.
plt.rcParams['font.sans-serif'] = ['Microsoft YaHei']
df = pd.read_excel(r"招聘信息汇总表.xlsx")
skill_count = df['学历要求'].value_counts()
skill = [f'{m}' for m in skill_count.index]  # education levels, most frequent first
counts = skill_count.values.tolist()  # number of postings per level
# Share of each level in percent of all counted postings. The original
# multiplied the raw count by 0.02, which is only a percentage when the
# sheet holds exactly 5000 rows.
total = sum(counts)
percentages = [100 * c / total for c in counts]

# Overrides the font family set above.
mpl.rcParams['font.family'] = 'SimHei'
# Figure size in inches and resolution.
plt.figure(figsize=(9, 6), dpi=100)
# Horizontal bar chart, least frequent level first.
plt.barh(skill[::-1], percentages[::-1], height=0.5, color='#4169E1')
plt.title('学历要求柱状图')
plt.xlabel('人数所占百分比')


def to_percent(temp, position):
    """Format the tick value *temp* as a whole-number percentage.

    *position* (the tick index) is required by FuncFormatter but unused.
    """
    return '%1.0f' % temp + '%'


plt.gca().xaxis.set_major_formatter(FuncFormatter(to_percent))
# NOTE(review): this sets tick positions by category name; it does not
# reorder the bars by education level -- confirm the intended order.
plt.yticks(['学历不限','初中及以下','中专/中技','高中','大专','本科','硕士','MBA/EMBA','博士'])
plt.savefig(fname="学历要求柱状图.png")
plt.show()
基于Python的爬虫案例的更多相关文章
- Python 简单爬虫案例
Python 简单爬虫案例 import requests url = "https://www.sogou.com/web" # 封装参数 wd = input('enter a ...
- 基于python的爬虫项目
一.项目简介 1.1 项目博客地址 https://www.cnblogs.com/xsfa/p/12083913.html 1.2 项目完成的功能与特色 爬虫和拥有三个可视化数据分析 1.3 项目采 ...
- 基于python的爬虫(一)
抓取网页 python核心库 urllib2 实现对静态网页的抓取,不得不说,"人生苦短,我用python"这句话还是有道理的,要是用java来写,这估计得20行代码 (对不住了博 ...
- Python网络爬虫案例(二)——爬取招聘信息网站
利用Python,爬取 51job 上面有关于 IT行业 的招聘信息 版权声明:未经博主授权,内容严禁分享转载 案例代码: # __author : "J" # date : 20 ...
- 基于python的发送邮件案例
#coding:utf-8 #强制使用utf-8编码格式 import smtplib #加载smtplib模块 from email.mime.text import MIMEText from e ...
- 基于python的爬虫流程图(精简版)
网址: https://www.processon.com/view/link/5e1148b8e4b07db4cfa9cf34 如果链接失效,请及时反馈(在评论区评论),博主会及时更新
- 一篇文章教会你利用Python网络爬虫获取电影天堂视频下载链接
[一.项目背景] 相信大家都有一种头疼的体验,要下载电影特别费劲,对吧?要一部一部的下载,而且不能直观的知道最近电影更新的状态. 今天小编以电影天堂为例,带大家更直观的去看自己喜欢的电影,并且下载下来 ...
- python网络爬虫(三)requests库的13个控制访问参数及简单案例
酱酱~小编又来啦~
- 【Machine Learning】决策树案例:基于python的商品购买能力预测系统
决策树在商品购买能力预测案例中的算法实现 作者:白宁超 2016年12月24日22:05:42 摘要:随着机器学习和深度学习的热潮,各种图书层出不穷.然而多数是基础理论知识介绍,缺乏实现的深入理解.本 ...
- 基于python的知乎开源爬虫 zhihu_oauth使用介绍
今天在无意之中发现了一个知乎的开源爬虫,是基于Python的,名字叫zhihu_oauth,看了一下在github上面star数还挺多的,貌似文档也挺详细的,于是就稍微研究了一下.发现果然很好用啊.就 ...
随机推荐
- pg_dump导出表时正则匹配多个表,pg_dump导出表
报错信息:pg_dump: 错误: 没有找到符合的表 报错语句:C:\Users\Admin>pg_dump -h172.16.3.159 -p5432 -dchisapp -nmchs -Um ...
- B树、B+树详解
B树.B+树详解 B树 前言 首先,为什么要总结B树.B+树的知识呢?最近在学习数据库索引调优相关知识,数据库系统普遍采用B-/+Tree作为索引结构(例如mysql的InnoDB引擎使用的B+树 ...
- Connect to D365 CE with multi-factor Authentication using C# sharp
Effective Feb 4, 2020 - Use of the WS-Trust (Web-Service Trust) authentication security protocol whi ...
- F. K-th Power 容斥,莫比乌斯
F. K-th Power 传送门: 牛客:https://ac.nowcoder.com/acm/contest/34866/F cf:https://codeforces.com/group/5z ...
- 关于Electron环境配置与一些相关的错误解决
安装步骤: 1.安装NVM: 这个是nodejs的版本管理器,github上有一个nvm for windows,由于不能的nodejs的版本问题,可以自由进行切换. 下载地址:https://git ...
- django+easyui
django+easyui 快速构建网站 演示地址:http://demo.topjui.com/?from=360tg
- ElseViewer--校稿
1Edit ModeThis presents the proof in continuous scroll to review and make corrections. By default, y ...
- MySQL如何指定字符集和排序规则?
在MySQL中,可以使用以下两种方式指定字符集和排序规则: 创建数据库或表时指定字符集和排序规则 在创建数据库或表时,可以使用 CHARACTER SET 和 COLLATE 选项指定字符集和排序规则 ...
- c/c++指针从浅入深介绍——基于数据内存分配的理解(上)
c/c++指针从浅入深介绍--基于数据内存分配的理解(上) 本文是对自我学习的一个总结以及回顾,文章内容主要是针对代码中的数据在内存中的存储情况以及存储中数值的变化来对指针进行介绍,是对指针以及数据在 ...
- JAVA数据类型以及什么是字节
强类型语言:要求变量的使用要严格符合规定,所有变量都必须先定义才能使用(安全性高) java的数据类型分为两大类 基本类型(primitive type) 引用类型(reference type) / ...