#!/usr/bin/env python
#-*- coding: utf-8 -*-
import urllib
import urllib2
import random
import requests
import os,sys
import MySQLdb
from sgmllib import SGMLParser
from BeautifulSoup import BeautifulSoup
import re
num = 0  # count of cover images saved so far; also used as the next image file name

_STORE_URL = "https://play.google.com"
_CATEGORY_PATH = "/store/apps/category/"
_IMAGE_DIR = "googlemarket"
# Without a timeout a stalled server would block the crawl forever.
_REQUEST_TIMEOUT = 30  # seconds
_IMAGE_HEADERS = {'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; rv:2.2) Gecko/20110201'}

_CATEGORIES = [
    'PERSONALIZATION', 'TRANSPORTATION', 'SPORTS', 'HEALTH_AND_FITNESS',
    'APP_WALLPAPER', 'COMICS', 'MEDICAL', 'BUSINESS', 'BOOKS_AND_REFERENCE',
    'WEATHER', 'ENTERTAINMENT', 'MEDIA_AND_VIDEO', 'APP_WIDGETS', 'TOOLS',
    'PHOTOGRAPHY', 'PRODUCTIVITY', 'EDUCATION', 'NEWS_AND_MAGAZINES',
    'TRAVEL_AND_LOCAL', 'LIFESTYLE', 'SOCIAL', 'FINANCE', 'SHOPPING',
    'LIBRARIES_AND_DEMO', 'COMMUNICATION', 'MUSIC_AND_AUDIO', 'GAME',
]

# All patterns are applied to the prettified page text and compiled once here
# instead of once per page.
_APP_LINK_RE = re.compile(r'<a class="title" href="(.+?)" title')
_NAME_RE = re.compile(r'<div class="document-title" itemprop="name">[\s\S]*?<div>([\s\S]*?)</div>')
_DEVELOPER_RE = re.compile(r'itemprop="name">([\s\S]*?)</a>')
_VERSION_RE = re.compile(r'itemprop="softwareVersion">([\s\S]*?)</div>')
_PUBTIME_RE = re.compile(r'itemprop="datePublished">([\s\S]*?)</div>')
_FILESIZE_RE = re.compile(r'itemprop="fileSize">([\s\S]*?)</div>')
_SUPPORT_RE = re.compile(r'itemprop="operatingSystems">([\s\S]*?)</div>')
_DESCRIPTION_RE = re.compile(r'itemprop="description">[\s\S]*?<div>([\s\S]*?)</div>')
_COVER_RE = re.compile(r'<img class="cover-image" src=(.+?) alt="Cover art" itemprop="image" />')
# Matches whole <br>, <br />, <p> and </p> tags.  The previous pattern was a
# character class and therefore blanked every 'b', 'r', 'p', '/', '<', '>'
# and space in the description.
_MARKUP_RE = re.compile(r'<br\s*/?>|</?p>')

# NOTE(review): index of the line holding the developer name inside the text
# captured by _DEVELOPER_RE; it depends on the prettified page layout --
# confirm against a current page.
_DEVELOPER_LINE = 8

_INSERT_SQL = "insert into address(name,version,developer,pubtime,filesize,support,introduction) values(%s,%s,%s,%s,%s,%s,%s)"


def _connect():
    """Open the MySQL connection; print the error and exit with status 1 on failure."""
    try:
        # NOTE(review): credentials are hard-coded; consider moving them to
        # configuration before sharing or deploying this script.
        conn = MySQLdb.connect(host='localhost', user='root', passwd='123456',
                               db='googlemarket', charset="utf8")
        conn.query("set names utf8")
    except MySQLdb.Error as e:
        print(e)
        # Non-zero status so callers can tell the run failed.
        sys.exit(1)
    return conn


def _fetch(url, headers=None):
    """Return the response body for *url*, or None when the request fails.

    A failed request is printed and skipped so that one unreachable host
    does not abort the whole crawl.
    """
    try:
        response = requests.get(url, headers=headers, timeout=_REQUEST_TIMEOUT)
    except requests.exceptions.RequestException as e:
        print(e)
        return None
    return response.content


def _prettified(html):
    """Return *html* re-serialised by BeautifulSoup as UTF-8 text."""
    return BeautifulSoup(html).prettify("utf-8")


def _first(pattern, page):
    """Return the first capture of *pattern* in *page*, stripped, or None."""
    match = pattern.search(page)
    if match is None:
        return None
    return match.group(1).strip()


def _developer(page):
    """Return the developer line of *page*, or None if it cannot be located."""
    match = _DEVELOPER_RE.search(page)
    if match is None:
        return None
    lines = match.group(1).split("\n")
    if len(lines) <= _DEVELOPER_LINE:
        return None
    return lines[_DEVELOPER_LINE].strip()


def _parse_app(page):
    """Extract the app fields from a prettified details page.

    Returns a dict whose scalar values are None when the corresponding
    markup is not found; 'introductions' is a (possibly empty) list of
    descriptions with <br> and <p> tags replaced by spaces.
    """
    return {
        'name': _first(_NAME_RE, page),                # name
        'developer': _developer(page),                 # manufacturer
        'version': _first(_VERSION_RE, page),          # version
        'pubtime': _first(_PUBTIME_RE, page),          # update time
        'filesize': _first(_FILESIZE_RE, page),        # file size
        'support': _first(_SUPPORT_RE, page),          # supported firmware
        'introductions': [_MARKUP_RE.sub(' ', text).strip()   # description
                          for text in _DESCRIPTION_RE.findall(page)],
    }


def _show(app):
    """Print the fields that were found, followed by each description."""
    for key in ('name', 'developer', 'version', 'pubtime', 'filesize', 'support'):
        if app[key] is not None:
            print(app[key])
    for text in app['introductions']:
        print(text)


def _store(conn, app):
    """Insert one row per description of *app*; skip apps without a name."""
    if app['name'] is None:
        return
    cursor = conn.cursor()
    try:
        for text in app['introductions']:
            # Values follow the column order of _INSERT_SQL.  Missing fields
            # are passed as None so the driver writes SQL NULL rather than
            # the literal string 'NULL'.
            cursor.execute(_INSERT_SQL, (
                app['name'], app['version'], app['developer'],
                app['pubtime'], app['filesize'], app['support'], text))
        conn.commit()
    finally:
        cursor.close()


def _save_covers(page):
    """Download every cover image referenced by *page* into _IMAGE_DIR."""
    global num
    if not os.path.isdir(_IMAGE_DIR):
        os.makedirs(_IMAGE_DIR)
    for src in _COVER_RE.findall(page):
        print(src)
        # The capture still carries the attribute's surrounding quotes; strip
        # exactly those (slicing [1:-2] also cut the URL's last character).
        image = _fetch(src.strip('"'), headers=_IMAGE_HEADERS)
        if image is None:
            continue
        # Binary mode, and closed deterministically by the with-block.
        with open(os.path.join(_IMAGE_DIR, str(num)), "wb") as out:
            out.write(image)
        num = num + 1
        print(num)


def main(store=False):
    """Crawl the Google Play category pages and print each app's details.

    For every category in _CATEGORIES the listing page is fetched, each
    linked app page is parsed and printed, and its cover images are saved
    under _IMAGE_DIR using the module-level counter ``num`` as file name.

    store -- when true, also insert each app into the ``address`` table.
             The default keeps the insert disabled, as it was in the
             script; the database is only opened when it will be used.
    """
    conn = _connect() if store else None
    try:
        for category in _CATEGORIES:
            listing = _fetch(_STORE_URL + _CATEGORY_PATH + category)
            if listing is None:
                continue
            # A set drops links that appear more than once on the listing.
            for link in set(_APP_LINK_RE.findall(_prettified(listing))):
                url = _STORE_URL + link
                print(url)
                body = _fetch(url)
                if body is None:
                    continue
                page = _prettified(body)
                app = _parse_app(page)
                _show(app)
                if conn is not None:
                    _store(conn, app)
                _save_covers(page)
    finally:
        if conn is not None:
            conn.close()


if __name__ == "__main__":
    main()

<type 'str'>
Traceback (most recent call last):
  File "crawler0729.py", line 103, in <module>
    main()
  File "crawler0729.py", line 91, in main
    temp=requests.get(j[1:-2], headers=headers)
  File "/usr/local/lib/python2.7/dist-packages/requests/api.py", line 55, in get
    return request('get', url, **kwargs)
  File "/usr/local/lib/python2.7/dist-packages/requests/api.py", line 44, in request
    return session.request(method=method, url=url, **kwargs)
  File "/usr/local/lib/python2.7/dist-packages/requests/sessions.py", line 335, in request
    resp = self.send(prep, **send_kwargs)
  File "/usr/local/lib/python2.7/dist-packages/requests/sessions.py", line 438, in send
    r = adapter.send(request, **kwargs)
  File "/usr/local/lib/python2.7/dist-packages/requests/adapters.py", line 327, in send
    raise ConnectionError(e)
requests.exceptions.ConnectionError: HTTPSConnectionPool(host='lh3.ggpht.com', port=443): Max retries exceeded with url: /RBld17rLw4Ik0JtOaKk4bZB2RiGJ2R8H5Q8Rjw3Hh6BAM694fOzzKj1TJFr7R02ZS_40=w30 (Caused by <class 'socket.error'>: [Errno 101] Network is unreachable)

python google play的更多相关文章

  1. Python+Google Geocoding

    本文主要介绍使用Python调用Google Geocoding API进行地址到地理坐标的转换. Google Geocoding参考https://developers.google.com/ma ...

  2. 吴裕雄--天生自然python Google深度学习框架:Tensorflow实现迁移学习

    import glob import os.path import numpy as np import tensorflow as tf from tensorflow.python.platfor ...

  3. 详解Python Google Protocol Buffer

    为什么要使用PB? PB(Protocol Buffer)是 Google 开发的用于结构化数据交换格式,作为腾讯云日志服务标准写入格式.因此用于写入日志数据前,需要将日志原始数据序列化为 PB 数据 ...

  4. 吴裕雄--天生自然python Google深度学习框架:经典卷积神经网络模型

    import tensorflow as tf INPUT_NODE = 784 OUTPUT_NODE = 10 IMAGE_SIZE = 28 NUM_CHANNELS = 1 NUM_LABEL ...

  5. 吴裕雄--天生自然python Google深度学习框架:图像识别与卷积神经网络

  6. 吴裕雄--天生自然python Google深度学习框架:MNIST数字识别问题

    import tensorflow as tf from tensorflow.examples.tutorials.mnist import input_data INPUT_NODE = 784 ...

  7. 吴裕雄--天生自然python Google深度学习框架:深度学习与深层神经网络

  8. 吴裕雄--天生自然python Google深度学习框架:TensorFlow实现神经网络

    http://playground.tensorflow.org/

  9. 吴裕雄--天生自然python Google深度学习框架:Tensorflow基础应用

    import tensorflow as tf a = tf.constant([1.0, 2.0], name="a") b = tf.constant([2.0, 3.0], ...

随机推荐

  1. C/C++之Exercise

    一.C/C++之初学Demo---C++调用C.h文件使用实例: 工程结构: exercise.h code: #ifndef _EXERCISE_H_ #define _EXERCISE_H_ #i ...

  2. Android学习4—短信发送器的实现

    界面预览: 由图中可以看出,此APP需要的组件有:两个TextView,一个用于显示手机号码的标题,另一个用于显示短信内容的标题.                                    ...

  3. Django练习项目之搭建博客

    背景:自从今年回家过年后,来到公司给我转了试用,我的学习效率感觉不如从前,而且刚步入社会我总是想要怎么想明白想清楚一些事,这通常会花掉,消耗我大量的精力,因为我想把我的生活管理规划好了,而在it技术学 ...

  4. centos 6.5 openfire安装

    1.下载:http://igniterealtime.org/downloads/download-landing.jsp?file=openfire/openfire-3.9.3-1.i386.rp ...

  5. [python]文本处理1.2

    1.0初步完成了文本截取需要信息的处理 1.1 修复了格式所造成的遗漏字符 1.2 去除了遗漏字符中的多余字符 bug-文本test14 有遗漏字符 bug-修复的遗漏字符中含有\n 未被识别为换行符

  6. UFLDL教程(五)之self-taught learning

    这里所谓的自学习,其实就是利用稀疏自编码器对无标签样本学习其特征 该自学习程序包括两部分: 稀疏自编码器学习图像特征(实现自学习)---用到无标签的样本集 softmax回归对样本分类---用到有标签 ...

  7. Maven镜像配置

    镜像是为了提供更快的服务 如图:X就认为是Y的一个镜像. 编辑settings.xml配置中央仓库镜像: <settings> ... <mirrors> <mirror ...

  8. Delphi在win7/vista下写注册表等需要管理员权限的解决方案

    看到论坛好多人问win7下写注册表的问题,我结合自己的理解写了一点东西,首先声明一下,本人初学Delphi,水平有限,大家见笑了,有什么不对之处请老鸟多指点. [背景]win7/Vista提供的UAC ...

  9. java 学习连接

    @Repository.@Service.@Controller 和 @Component   注解:http://blog.csdn.net/ye1992/article/details/19971 ...

  10. lazyman学习

    1.安装: gem install lazyman 2.建立工程: cd到工程目录下 lazyman new 工程名 3.打开调试命令 lazyman c lazyman调用selenium-webd ...