Python web spider: spiders_97A-04B
import urllib
import urllib.request
import urllib.error
import time
from bs4 import BeautifulSoup as bs
import re
import os

# year = '97A'
# ss = "./data/%s/" % year
'''
Adapted to scrape the 95B-96B pages as well.
'''
'''
Working around failed page requests by retrying until one succeeds:
resp = None
while resp is None:
    try:
        resp = urllib.request.urlopen("http://baidu.com")
    except:
        pass
'''
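# A bounded variant of the retry idea above (a sketch of my own, not called
# by the script below, which keeps its retry loops inline): cap the number
# of attempts and sleep between them instead of spinning on failures.
def fetch_with_retry(url, attempts=5, delay=2.0):
    for _ in range(attempts):
        try:
            return urllib.request.urlopen(url, timeout=10)
        except urllib.error.URLError:
            time.sleep(delay)
    raise RuntimeError("could not fetch %s" % url)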
def b0_trmd(year, ss):
    if not os.path.exists(ss):
        os.makedirs(ss)
    p1 = r"^([A-Z]{6})"
    url = "http://www.stylusstudio.com/edifact/D%s/messages.htm" % year
    resp = None
    while resp is None:
        try:
            resp = urllib.request.urlopen(url)
        except:
            time.sleep(1)   # retry until the request succeeds
    data = resp.read().decode('cp852')
    soup = bs(data, 'html.parser')
    segment11 = soup.find_all('table')   # ResultSet
    # The index picks which table on the page; [1:] drops the header cell.
    segment1 = segment11[0].find_all('td')[1:]
    f2 = open(ss + 'trmd1%s.txt' % year, 'a', encoding='utf-8')
    f3 = open(ss + 'b0%s.txt' % year, 'a', encoding='utf-8')
    f4 = open(ss + 'trmd%s.txt' % year, 'a', encoding='utf-8')
    pattern1 = re.compile(p1)
    tag_list = []
    for item in segment1:
        # .string returns the tag's text when it contains nothing but text
        # (or a single nested tag); for mixed content it is None.
        str1 = item.get_text()
        if item.string is None:
            break
        matcher1 = re.findall(pattern1, str1)
        if matcher1:
            f3.write(matcher1[0] + ',' + year + '\n')
            tag_list.append(matcher1[0])
            f4.write(matcher1[0] + ',')
        else:
            f4.write(str1 + '\n')
        f2.write(str1 + '\n')
    f2.close()
    f3.close()
    f4.close()
    return tag_list
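# Usage sketch (the codes shown are illustrative; the real list depends on
# the message directory being scraped):
# >>> tags = b0_trmd('95B', './data/95B/')
# >>> tags[:2]
# ['APERAK', 'AUTHOR']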
def test1(code_tag, year, ss):
    url = "http://www.stylusstudio.com/edifact/D%s/%s.htm" % (year, code_tag)
    resp = None
    while resp is None:
        try:
            resp = urllib.request.urlopen(url)
        except:
            time.sleep(1)
    data = resp.read().decode('UTF-8')
    soup = bs(data, 'html.parser')
    segment11 = soup.find_all('table')
    # The 7th table on the page holds the segment tree.
    segment1 = segment11[6].find_all('tr')
    f2 = open(ss + 'text1%s%s.txt' % (year, code_tag), 'a', encoding='cp852')
    for item in segment1:
        '''
        A typical row:
        <tr class="FrameTreeFont"><td><span class="FrameDrawFont">│
        <span class="FrameHideFont">─</span>│<span class="FrameHideFont">─</span>├─</span>
        <a class="FrameItemFont" href="DAM_.htm" target="classFrame" title="Damage">DAM</a>
        Damage</td><td align="right"><span class="FrameDetailFont"> ×1
        </span></td><td><span class="FrameDetailFont">(M)</span></td></tr>

        item.get_text() flattens it to:
        │─│─├─DAM Damage ×1 (M)

        and item.stripped_strings would give the list:
        ['│', '─', '│', '─', '├─', 'DAM', 'Damage', '×1', '(M)']
        '''
        str12 = item.get_text()
        f2.write(str12 + '\n')
    f2.close()
def test2(code_tag, year, ss):
    # Each nesting level in the tree dump is drawn with two non-word glyphs
    # (│ ─ ├ └), so \W{2k} anchors a line at depth k.
    p1 = r"^\W{2}(\w.+)\n"
    p2 = r"^\W{4}(\w.+)\n"
    p3 = r"^\W{6}(\w.+)\n"
    p4 = r"^\W{8}(\w.+)\n"
    p5 = r"^\W{10}(\w.+)\n"
    p6 = r"^\W{12}(\w.+)\n"
    p7 = r"^\W{14}(\w.+)\n"
    p8 = r"^\W{16}(\w.+)\n"
    p9 = r"Segment\sGroup\s(?:([0-9]|[0-9][0-9]))"
    pattern1 = re.compile(p1)
    pattern2 = re.compile(p2)
    pattern3 = re.compile(p3)
    pattern4 = re.compile(p4)
    pattern5 = re.compile(p5)
    pattern6 = re.compile(p6)
    pattern7 = re.compile(p7)
    pattern8 = re.compile(p8)
    pattern9 = re.compile(p9)
    f1 = open(ss + 'text1%s%s.txt' % (year, code_tag), 'r', encoding='cp852')
    f2 = open(ss + 'text2%s%s.txt' % (year, code_tag), 'a', encoding='utf-8')
    # listp[k] records the segment-group number that parents depth k; nine
    # slots are needed because depth-8 lines set listp[8].
    listp = [0] * 9
    for line in f1.readlines():
        matcher1 = re.findall(pattern1, line)
        matcher2 = re.findall(pattern2, line)
        matcher3 = re.findall(pattern3, line)
        matcher4 = re.findall(pattern4, line)
        matcher5 = re.findall(pattern5, line)
        matcher6 = re.findall(pattern6, line)
        matcher7 = re.findall(pattern7, line)
        matcher8 = re.findall(pattern8, line)
        matcher9 = re.findall(pattern9, line)
        if matcher1:
            f2.write('SG' + str(listp[0]) + ' ' + matcher1[0] + '\n')
            if matcher9:
                listp[1] = matcher9[0]
        if matcher2:
            f2.write('SG' + str(listp[1]) + ' ' + matcher2[0] + '\n')
            if matcher9:
                listp[2] = matcher9[0]
        if matcher3:
            f2.write('SG' + str(listp[2]) + ' ' + matcher3[0] + '\n')
            if matcher9:
                listp[3] = matcher9[0]
        if matcher4:
            f2.write('SG' + str(listp[3]) + ' ' + matcher4[0] + '\n')
            if matcher9:
                listp[4] = matcher9[0]
        if matcher5:
            f2.write('SG' + str(listp[4]) + ' ' + matcher5[0] + '\n')
            if matcher9:
                listp[5] = matcher9[0]
        if matcher6:
            f2.write('SG' + str(listp[5]) + ' ' + matcher6[0] + '\n')
            if matcher9:
                listp[6] = matcher9[0]
        if matcher7:
            f2.write('SG' + str(listp[6]) + ' ' + matcher7[0] + '\n')
            if matcher9:
                listp[7] = matcher9[0]
        if matcher8:
            f2.write('SG' + str(listp[7]) + ' ' + matcher8[0] + '\n')
            if matcher9:
                listp[8] = matcher9[0]
    f2.close()
    f1.close()
    # Second pass: abbreviate "Segment Group N" to "SGN".
    f3 = open(ss + 'text3%s%s.txt' % (year, code_tag), 'w', encoding='utf-8')
    f4 = open(ss + 'text2%s%s.txt' % (year, code_tag), 'r', encoding='utf-8')
    for line1 in f4.readlines():
        f3.write(line1.replace("Segment Group ", "SG"))
    f4.close()
    f3.close()
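# Checking the depth patterns against the sample row recorded in test1
# (a quick interactive sketch):
# >>> re.findall(r"^\W{6}(\w.+)\n", "│─│─├─DAM Damage ×1 (M)\n")
# ['DAM Damage ×1 (M)']
# Six leading non-word glyphs, so the DAM segment sits at nesting depth 3.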
def test3(code_tag, year, ss):
    f5 = open(ss + 'text4%s%s.txt' % (year, code_tag), 'a', encoding='utf-8')
    f6 = open(ss + 'text3%s%s.txt' % (year, code_tag), 'r', encoding='utf-8')
    # Captures the group, the segment code, the repeat count after '×',
    # and the trailing (M)/(C) flag.
    p10 = r"(^\w{3})\s(\w{3}).+×([0-9]{1,5})\s\((\w)\)$"
    pattern10 = re.compile(p10)
    i = 0
    for line2 in f6.readlines():
        i = i + 1
        matcher10 = re.findall(pattern10, line2)
        if matcher10:
            f5.write(str(matcher10[0]) + '\n')
    f5.close()
    f6.close()
    return i
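# On a text3 line such as "SG3 DAM Damage ×1 (M)" (an illustrative value),
# pattern10 yields ('SG3', 'DAM', '1', 'M'); text4 therefore holds one
# stringified tuple per matching line, which join() re-parses below.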
def test4(code_tag, year, ss):
    url = "http://www.stylusstudio.com/edifact/D%s/%s.htm" % (year, code_tag)
    resp = None
    while resp is None:
        try:
            resp = urllib.request.urlopen(url)
        except:
            time.sleep(1)
    data = resp.read().decode('UTF-8')
    soup = bs(data, 'html.parser')
    # This time collect the <p> elements, which hold the per-segment notes.
    segment11 = soup.find_all('p')
    f2 = open(ss + 'text5%s%s.txt' % (year, code_tag), 'a', encoding='utf-8')
    for item in segment11:
        str12 = item.get_text()
        f2.write(str12 + '\n')
    f2.close()
def test5(code_tag, num, year, ss):
    f7 = open(ss + 'text6%s%s.txt' % (year, code_tag), 'a', encoding='utf-8')
    f8 = open(ss + 'text5%s%s.txt' % (year, code_tag), 'r', encoding='utf-8')
    # The notes start at the first "A service segment ..." paragraph; from
    # there, keep every paragraph that is not one of the known filler lines.
    p1 = r"(^A\sservice\ssegment.+\n)"
    p2 = r"(^(?!Information.+\:|Note|It\sis\srecommended\sthat\swhere|ID\sshould\sbe\sspecified|All\sother\ssegments|A\sgroup\sof\ssegments\sthat\scontains\sa\sline\sitem\sand\sits\srelated\sinformation.+should\sbe\sconsigned.).+\n)"
    pattern1 = re.compile(p1)
    pattern2 = re.compile(p2)
    flag = 0
    i = num   # num = line count from test3: stop once that many notes are kept
    for line3 in f8.readlines():
        matcher1 = re.findall(pattern1, line3)
        matcher2 = re.findall(pattern2, line3)
        if matcher1 and flag == 0:
            f7.write(matcher1[0])
            flag = 1
            i = i - 1
            if i == 0:
                break
            continue
        if matcher2 and (flag == 1 or flag == 2):
            f7.write(matcher2[0])
            flag = 2
            i = i - 1
            continue
    f7.close()
    f8.close()
def join(code_tag, year, ss):
    f1 = open(ss + 'text6%s%s.txt' % (year, code_tag), 'r', encoding='utf-8')
    f2 = open(ss + 'text4%s%s.txt' % (year, code_tag), 'r', encoding='utf-8')
    list_note = []
    for line1 in f1:
        list_note.append(line1)
    f1.close()
    # Re-parse the stringified tuples written by test3,
    # e.g. ('SG3', 'DAM', '1', 'M').
    p11 = r"^\W{2}(\w{3}).+\n"
    p12 = r"^\W{2}\w{3}\W{2}\s\W(\w{3}).+\n"
    p13 = r"^\W{2}\w{3}\W{2}\s\W\w{3}\W{2}\s\W([0-9]{1,5})\W.+\n"
    p14 = r"\W{2}\w{3}\W{2}\s\W\w{3}\W{2}\s\W.+(C|M)"
    f2_w = open(ss + 'b1%s%s.txt' % (year, code_tag), 'a', encoding='utf-8')
    f3_w = open(ss + 'b1%s.csv' % year, 'a', encoding='utf-8')
    i = 0
    pattern11 = re.compile(p11)
    pattern12 = re.compile(p12)
    pattern13 = re.compile(p13)
    pattern14 = re.compile(p14)
    # Placeholder for the first CSV column: one empty slot per output row.
    pos = [''] * 385
    for line2 in f2:
        matcher11 = re.findall(pattern11, line2)
        matcher12 = re.findall(pattern12, line2)
        matcher13 = re.findall(pattern13, line2)
        matcher14 = re.findall(pattern14, line2)
        try:
            str11 = "%s,%s,%s,%s,%s,%s,%s,\"%s\"\n" % (
                pos[i], code_tag, matcher12[0], matcher11[0], year,
                matcher14[0], matcher13[0], list_note[i].strip('\n'))
            i = i + 1
            f2_w.write(str11)
            f3_w.write(str11)
        except:
            print("---error---")
            break
    f2_w.close()
    f3_w.close()
    f2.close()
def test():   # used to fetch one page and save pieces of it locally (scratch/demo)
    filename = './codeco.txt'
    url = "http://www.stylusstudio.com/edifact/D95B/CODECO.htm"
    resp = urllib.request.urlopen(url)
    data = resp.read().decode('UTF-8')
    # f1 = open(filename, 'w')
    # f1.write(data)
    f2 = open('./text.txt', 'a')
    soup = bs(data, 'html.parser')
    segment1 = soup.find_all('h4')
    segment2 = soup.find_all('p')
    for item in segment1:
        f2.write(str([text for text in item.stripped_strings]) + '\n')
    for item in segment2:
        f2.write(str([text for text in item.stripped_strings]) + '\n')
    f2.close()
    # Sketch for a GET request with query parameters:
    # data = {'word': 'Jecvay Notes'}
    # url_values = urllib.parse.urlencode(data)
    # full_url = "http://www.baidu.com/s?" + url_values
    # data = urllib.request.urlopen(full_url).read().decode('UTF-8')
if __name__ == '__main__':
    # Earlier runs covered '97A','97B','98A','98B','99A','99B'.
    year1 = ['00A', '00B', '01A', '01B', '01C', '02A', '02B', '03A', '03B', '04A', '04B']
    for year in year1:
        ss = "./data/%s/" % year
        tag = b0_trmd(year, ss)
        print(tag)
        for i in range(len(tag)):
            test1(tag[i], year, ss)
            test2(tag[i], year, ss)
            num = test3(tag[i], year, ss)
            test4(tag[i], year, ss)
            test5(tag[i], num, year, ss)
            join(tag[i], year, ss)
            print("------%s-----ok" % i)
    # str1 = 'APERAK'
    # join(str1)
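Each run leaves one b1<year>.csv per data directory. A minimal sketch for reading it back with the csv module (the column labels are my own reading of the row that join() assembles; the script writes no header line):

import csv
with open('./data/00A/b100A.csv', encoding='utf-8') as fh:
    for pos, message, segment, group, year, status, count, note in csv.reader(fh):
        print(message, group, segment, status, count)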