蜘蛛页面 获取一个网站的全部url 乐观代码
蜘蛛页面
# Spider page, revision 1: crawl every URL of one site, "optimistic" style
# (insert first, deduplicate afterwards, instead of checking before each insert).
from selenium import webdriver
import time
import random
from bs4 import *
import pymysql

# MySQL connection settings.
# NOTE(review): the scrape dropped every numeric literal in this revision
# (port, sleep ranges, randint bounds, slice indices); they are restored
# below from the later revisions of this same script — confirm against the
# original source.
h, pt, u, p, db = 'localhost', 3306, 'root', 'root', 'test'


def mysql_fetch(sql, res_type='tuple'):
    """Execute *sql* and return every row.

    res_type 'dic' returns dict rows (DictCursor); anything else returns
    plain tuples.  Returns an empty tuple when the connection fails.
    """
    global h, pt, u, p, db
    try:
        conn = pymysql.connect(host=h, port=pt, user=u, passwd=p, db=db, charset='utf8')
    except Exception as e:
        print(e)
        return ()
    if res_type == 'dic':
        cursor = conn.cursor(pymysql.cursors.DictCursor)
    else:
        cursor = conn.cursor()
    cursor.execute(sql)
    conn.commit()
    # Bug fix: fetch the rows BEFORE closing the cursor/connection; the
    # original called fetchall() after close().
    rows = cursor.fetchall()
    cursor.close()
    conn.close()
    return rows


def mysql_write(sql):
    """Execute a write statement and commit; connection errors are printed."""
    global h, pt, u, p, db
    try:
        conn = pymysql.connect(host=h, port=pt, user=u, passwd=p, db=db, charset='utf8')
    except Exception as e:
        print(e)
        return
    cursor = conn.cursor()
    cursor.execute(sql)
    conn.commit()
    cursor.close()
    conn.close()
    return


# Warm up the browser on a random search engine before jumping to the target.
browser = webdriver.Chrome()
f_url_l = ['https://www.baidu.com/', 'https://www.so.com/']
f_url_l_a = f_url_l[int(time.time()) % len(f_url_l)]
browser.get(f_url_l_a)
time.sleep(random.randint(1, 2))

# Seed page.
url = 'https://so.gushiwen.org/shiwenv_5a36a4613434.aspx'
js = 'window.location.href="{}";'.format(url)
browser.execute_script(js)
# img_url = 'https://img.gushiwen.org/authorImg/daishulun.jpg'

# Snapshot the rendered page to a temp file, then parse the snapshot.
myhtml = 'D:\\myhtml\\{}gushiwen.tmp.html'.format(random.randint(123, 999))
with open(myhtml, 'w', encoding='utf-8') as fw:
    fw.write(browser.page_source)

# NOTE(review): SQL is assembled by string formatting; quotes in titles or
# urls will break the statement (injection risk).  Parameterised queries
# would be safer.
sql = 'INSERT INTO parent_url (page_title,page_url,children_url) VALUES '
with open(myhtml, 'r', encoding='utf-8') as myhtml_o:
    bs = BeautifulSoup(myhtml_o, 'html.parser')
    url_l = [i.attrs['href'] for i in bs.find_all('a')]
res_l = []
sql_l = []
for i in url_l:
    # Keep only absolute links that stay on gushiwen.org.
    if 'gushiwen.org' in i and 'javascript' not in i and 'http' in i and i[0:4] == 'http':
        if i not in res_l:
            if i == url:
                continue
            res_l.append(i)
            s = '("{}","{}","{}")'.format(browser.title, url, i)
            sql_l.append(s)
if len(sql_l) > 0:
    sql = '{}{}'.format(sql, ','.join(sql_l))
    print(sql)
    mysql_write(sql)

# "Optimistic" dedup: insert freely above, then delete one duplicate of
# each (page_url, children_url) pair.
sql_del = 'DELETE FROM parent_url WHERE id IN ( SELECT min_id FROM ( SELECT MIN(id) AS min_id, COUNT(1) AS c FROM parent_url GROUP BY CONCAT(page_url,children_url) ) AS tab WHERE c>1 )'
print(sql_del)
mysql_write(sql_del)

# Crawl loop: keep following not-yet-spidered children urls.
while True:
    sql_ori = 'SELECT page_url,children_url FROM parent_url WHERE if_spider=0 ORDER BY id DESC '
    res = mysql_fetch(sql_ori, 'dic')
    for d in res:
        page_url, children_url = d['page_url'], d['children_url']
        url = children_url
        js = 'window.location.href="{}";'.format(url)
        browser.execute_script(js)
        time.sleep(1)
        browser.refresh()
        myhtml = 'D:\\myhtml\\{}gushiwen.tmp.html'.format(random.randint(123, 999))
        with open(myhtml, 'w', encoding='utf-8') as fw:
            fw.write(browser.page_source)
        sql = 'INSERT INTO parent_url (page_title,page_url,children_url) VALUES '
        with open(myhtml, 'r', encoding='utf-8') as myhtml_o:
            bs = BeautifulSoup(myhtml_o, 'html.parser')
            try:
                url_l = [i.attrs['href'] for i in bs.find_all('a')]
            except Exception as e:
                print(e)
                continue
        res_l = []
        sql_l = []
        for i in url_l:
            # e.g. /user/findpwd.aspx?from=http://so.gushiwen.org/user/collect.aspx
            if 'gushiwen.org' in i and 'javascript' not in i and 'http' in i and i[0:4] == 'http':
                if i not in res_l:
                    if i == url:
                        continue
                    res_l.append(i)
                    s = '("{}","{}","{}")'.format(browser.title, url, i)
                    sql_l.append(s)
        if len(sql_l) > 0:
            sql = '{}{}'.format(sql, ','.join(sql_l))
            print(sql)
            mysql_write(sql)
        sql_del = 'DELETE FROM parent_url WHERE id IN ( SELECT min_id FROM ( SELECT MIN(id) AS min_id, COUNT(1) AS c FROM parent_url GROUP BY CONCAT(page_url,children_url) ) AS tab WHERE c>1 )'
        print(sql_del)
        mysql_write(sql_del)
        # Mark this (parent, child) pair as spidered.
        sql_udp = 'UPDATE parent_url SET if_spider=1 WHERE page_url="{}" AND children_url="{}"'.format(page_url, url)
        mysql_write(sql_udp)
        print(sql_udp)
        time.sleep(3)

# Table DDL kept for reference (the scrape left a dangling `dd =` here;
# reconstructed as a string literal so the module stays valid).
dd = '''
CREATE TABLE `parent_url` (
  `id` int(11) NOT NULL AUTO_INCREMENT,
  `page_title` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci DEFAULT NULL,
  `page_url` text,
  `children_url` text,
  `if_spider` tinyint(4) DEFAULT '0',
  PRIMARY KEY (`id`)
) ENGINE=MyISAM AUTO_INCREMENT=5328 DEFAULT CHARSET=latin1;
'''
先写入、后删除：避免在每次写入前都做一次存在性检查，从而节省时间。
获取一个网站的全部url
修复逻辑错误
支持 多进程 脚本多开
# Spider revision 2: logic fixes; supports running several copies of the
# script in parallel (each worker starts at a random offset in the queue).
from selenium import webdriver
import time
import random
from bs4 import *
import pymysql

h, pt, u, p, db = 'localhost', 3306, 'root', 'root', 'test'


def mysql_fetch(sql, res_type='tuple'):
    """Execute *sql* and return every row ('dic' -> DictCursor rows).

    Returns an empty tuple when the connection fails.
    """
    global h, pt, u, p, db
    try:
        conn = pymysql.connect(host=h, port=pt, user=u, passwd=p, db=db, charset='utf8')
    except Exception as e:
        print(e)
        return ()
    if res_type == 'dic':
        cursor = conn.cursor(pymysql.cursors.DictCursor)
    else:
        cursor = conn.cursor()
    cursor.execute(sql)
    conn.commit()
    # Bug fix: fetch rows before closing; the original fetched after close().
    rows = cursor.fetchall()
    cursor.close()
    conn.close()
    return rows


def mysql_write(sql):
    """Execute a write statement; return 0 on success, 1 on connect failure."""
    global h, pt, u, p, db
    try:
        conn = pymysql.connect(host=h, port=pt, user=u, passwd=p, db=db, charset='utf8')
    except Exception as e:
        print(e)
        return 1
    cursor = conn.cursor()
    cursor.execute(sql)
    conn.commit()
    cursor.close()
    conn.close()
    return 0


# Warm up the browser on a random search engine before the target site.
browser = webdriver.Chrome()
f_url_l = ['https://www.baidu.com/', 'https://www.so.com/']
f_url_l_a = f_url_l[int(time.time()) % len(f_url_l)]
browser.get(f_url_l_a)
time.sleep(random.randint(1, 2))

# Seed page.
url = 'https://so.gushiwen.org/shiwenv_5a36a4613434.aspx'
js = 'window.location.href="{}";'.format(url)
browser.execute_script(js)
# img_url = 'https://img.gushiwen.org/authorImg/daishulun.jpg'
myhtml = 'D:\\myhtml\\{}gushiwen.tmp.html'.format(random.randint(123, 999))
with open(myhtml, 'w', encoding='utf-8') as fw:
    fw.write(browser.page_source)

# NOTE(review): SQL is assembled by string formatting; quotes in titles or
# urls will break the statement (injection risk).
sql = 'INSERT INTO parent_url (page_title,page_url,children_url) VALUES '
with open(myhtml, 'r', encoding='utf-8') as myhtml_o:
    bs = BeautifulSoup(myhtml_o, 'html.parser')
    url_l = [i.attrs['href'] for i in bs.find_all('a')]
res_l = []
sql_l = []
for i in url_l:
    # Keep only absolute on-site links.
    if 'gushiwen.org' in i and 'javascript' not in i and 'http' in i and i[0:4] == 'http':
        if i not in res_l:
            if i == url:
                continue
            res_l.append(i)
            s = '("{}","{}","{}")'.format(browser.title, url, i)
            sql_l.append(s)
if len(sql_l) > 0:
    sql = '{}{}'.format(sql, ','.join(sql_l))
    print(sql)
    mysql_write(sql)

# Optimistic dedup: delete the newest duplicate of each pair.
sql_del = 'DELETE FROM parent_url WHERE id IN ( SELECT max_id FROM ( SELECT MAX(id) AS max_id, COUNT(1) AS c FROM parent_url GROUP BY CONCAT(page_url,children_url) ) AS tab WHERE c>1 )'
print(sql_del)
mysql_write(sql_del)

# Urls to skip, e.g.:
# https://www.gushiwen.org/FileNotFound.htm?aspxerrorpath=/user/findpwd.aspx
# https://so.gushiwen.org/user/findpwd.aspx?from=http://so.gushiwen.org/user/collect.aspx
# https://so.gushiwen.org/app/
url_kw_filter_l = ['FileNotFound', 'findpwd', '/app/']
while True:
    # Purge password-related rows that already slipped into the table.
    sql_filter = "DELETE FROM parent_url WHERE id IN ( SELECT id FROM ( SELECT id FROM parent_url WHERE INSTR(page_title,'密码')>0 OR INSTR(UPPER(page_url),'PWD')>0 OR INSTR(UPPER(children_url),'PWD')>0) AS t);"
    mysql_write(sql_filter)
    sql_ori = 'SELECT page_url,children_url FROM parent_url WHERE if_spider=0'
    res = mysql_fetch(sql_ori, 'dic')
    # Multi-process support: each worker skips a random prefix of the queue
    # so concurrent copies tend to work on different rows.
    jump_c, jump_s = 0, random.randint(0, max(0, len(res) - 10))
    for d in res:
        jump_c += 1
        if jump_c < jump_s:
            continue
        page_url, children_url = d['page_url'], d['children_url']
        url = children_url
        # Skip filtered urls.
        continue_ = False
        for fl in url_kw_filter_l:
            if fl in url:
                continue_ = True
                break
        if continue_:
            continue
        js = 'window.location.href="{}";'.format(url)
        browser.execute_script(js)
        time.sleep(1)
        browser.refresh()
        myhtml = 'D:\\myhtml\\{}gushiwen.tmp.html'.format(random.randint(123, 999))
        with open(myhtml, 'w', encoding='utf-8') as fw:
            fw.write(browser.page_source)
        sql = 'INSERT INTO parent_url (page_title,page_url,children_url) VALUES '
        with open(myhtml, 'r', encoding='utf-8') as myhtml_o:
            bs = BeautifulSoup(myhtml_o, 'html.parser')
            try:
                url_l = [i.attrs['href'] for i in bs.find_all('a')]
            except Exception as e:
                print(e)
                continue
        res_l = []
        sql_l = []
        for i in url_l:
            # e.g. /user/findpwd.aspx?from=http://so.gushiwen.org/user/collect.aspx
            if 'gushiwen.org' in i and 'javascript' not in i and 'http' in i and i[0:4] == 'http':
                if i not in res_l:
                    if i == url:
                        continue
                    res_l.append(i)
                    s = '("{}","{}","{}")'.format(browser.title, url, i)
                    sql_l.append(s)
        if len(sql_l) > 0:
            sql = '{}{}'.format(sql, ','.join(sql_l))
            print(sql)
            mysql_write(sql)
        sql_del = 'DELETE FROM parent_url WHERE id IN ( SELECT max_id FROM ( SELECT MAX(id) AS max_id, COUNT(1) AS c FROM parent_url GROUP BY CONCAT(page_url,children_url) ) AS tab WHERE c>1 )'
        print(sql_del)
        mysql_write(sql_del)
        # Mark this (parent, child) pair as spidered.
        sql_udp = 'UPDATE parent_url SET if_spider=1 WHERE page_url="{}" AND children_url="{}"'.format(page_url, url)
        mysql_write(sql_udp)
        print(sql_udp)
        time.sleep(3)

dd = 0
代码的每一个功能点的模块化
# Spider revision 3: each feature point pulled into its own step; the
# keyword purge SQL is now generated from the filter list.
from selenium import webdriver
import time
import random
from bs4 import *
import pymysql

h, pt, u, p, db = 'localhost', 3306, 'root', 'root', 'test'


def mysql_fetch(sql, res_type='tuple'):
    """Execute *sql* and return every row ('dic' -> DictCursor rows).

    Returns an empty tuple when the connection fails.
    """
    global h, pt, u, p, db
    try:
        conn = pymysql.connect(host=h, port=pt, user=u, passwd=p, db=db, charset='utf8')
    except Exception as e:
        print(e)
        return ()
    if res_type == 'dic':
        cursor = conn.cursor(pymysql.cursors.DictCursor)
    else:
        cursor = conn.cursor()
    cursor.execute(sql)
    conn.commit()
    # Bug fix: fetch rows before closing; the original fetched after close().
    rows = cursor.fetchall()
    cursor.close()
    conn.close()
    return rows


def mysql_write(sql):
    """Execute a write statement; return 0 on success, 1 on connect failure."""
    global h, pt, u, p, db
    try:
        conn = pymysql.connect(host=h, port=pt, user=u, passwd=p, db=db, charset='utf8')
    except Exception as e:
        print(e)
        return 1
    cursor = conn.cursor()
    cursor.execute(sql)
    conn.commit()
    cursor.close()
    conn.close()
    return 0


# Warm up the browser on a random search engine before the target site.
browser = webdriver.Chrome()
f_url_l = ['https://www.baidu.com/', 'https://www.so.com/']
f_url_l_a = f_url_l[int(time.time()) % len(f_url_l)]
browser.get(f_url_l_a)
time.sleep(random.randint(1, 2))

# Seed page.
url = 'https://so.gushiwen.org/shiwenv_5a36a4613434.aspx'
js = 'window.location.href="{}";'.format(url)
browser.execute_script(js)
# img_url = 'https://img.gushiwen.org/authorImg/daishulun.jpg'
myhtml = 'D:\\myhtml\\{}gushiwen.tmp.html'.format(random.randint(123, 999))
with open(myhtml, 'w', encoding='utf-8') as fw:
    fw.write(browser.page_source)

# NOTE(review): SQL is assembled by string formatting; quotes in titles or
# urls will break the statement (injection risk).
sql = 'INSERT INTO parent_url (page_title,page_url,children_url) VALUES '
with open(myhtml, 'r', encoding='utf-8') as myhtml_o:
    bs = BeautifulSoup(myhtml_o, 'html.parser')
    url_l = [i.attrs['href'] for i in bs.find_all('a')]
res_l = []
sql_l = []
for i in url_l:
    # Keep only absolute on-site links.
    if 'gushiwen.org' in i and 'javascript' not in i and 'http' in i and i[0:4] == 'http':
        if i not in res_l:
            if i == url:
                continue
            res_l.append(i)
            s = '("{}","{}","{}")'.format(browser.title, url, i)
            sql_l.append(s)
if len(sql_l) > 0:
    sql = '{}{}'.format(sql, ','.join(sql_l))
    print(sql)
    mysql_write(sql)

# Optimistic dedup: delete the newest duplicate of each pair.
sql_del = 'DELETE FROM parent_url WHERE id IN ( SELECT max_id FROM ( SELECT MAX(id) AS max_id, COUNT(1) AS c FROM parent_url GROUP BY CONCAT(page_url,children_url) ) AS tab WHERE c>1 )'
print(sql_del)
mysql_write(sql_del)

# Urls to skip, e.g.:
# https://www.gushiwen.org/FileNotFound.htm?aspxerrorpath=/user/findpwd.aspx
# https://so.gushiwen.org/user/findpwd.aspx?from=http://so.gushiwen.org/user/collect.aspx
# https://so.gushiwen.org/app/
# https://so.gushiwen.org/jiucuo.aspx?u=
url_kw_filter_l = ['FileNotFound', 'findpwd', '/app/', '/jiucuo.aspx']

# Build the purge statement once from the keyword list.  (A hard-coded
# '密码'/PWD variant in the original was dead code — immediately
# overwritten by this template — and has been dropped.)
sql_filter = "DELETE FROM parent_url WHERE id IN ( SELECT id FROM ( SELECT id FROM parent_url WHERE MYWHERE ) AS t);"
print(sql_filter)
sql_s_l = []
for i in url_kw_filter_l:
    ii = i.upper()
    s = " INSTR(UPPER(page_url),'{}')>0 OR INSTR(UPPER(children_url),'{}')>0 ".format(ii, ii)
    sql_s_l.append(s)
sql_filter = sql_filter.replace('MYWHERE', ' OR '.join(sql_s_l))

while True:
    # Purge rows matching any filter keyword.
    mysql_write(sql_filter)
    sql_ori = 'SELECT page_url,children_url FROM parent_url WHERE if_spider=0'
    res = mysql_fetch(sql_ori, 'dic')
    # Multi-process support: each worker skips a random prefix of the queue.
    jump_c, jump_s = 0, random.randint(0, max(0, len(res) - 10))
    for d in res:
        jump_c += 1
        if jump_c < jump_s:
            continue
        page_url, children_url = d['page_url'], d['children_url']
        url = children_url
        js = 'window.location.href="{}";'.format(url)
        browser.execute_script(js)
        time.sleep(1)
        browser.refresh()
        myhtml = 'D:\\myhtml\\{}gushiwen.tmp.html'.format(random.randint(123, 999))
        with open(myhtml, 'w', encoding='utf-8') as fw:
            fw.write(browser.page_source)
        sql = 'INSERT INTO parent_url (page_title,page_url,children_url) VALUES '
        with open(myhtml, 'r', encoding='utf-8') as myhtml_o:
            bs = BeautifulSoup(myhtml_o, 'html.parser')
            try:
                url_l = [i.attrs['href'] for i in bs.find_all('a')]
            except Exception as e:
                print(e)
                continue
        res_l = []
        sql_l = []
        for i in url_l:
            # e.g. /user/findpwd.aspx?from=http://so.gushiwen.org/user/collect.aspx
            if 'gushiwen.org' in i and 'javascript' not in i and 'http' in i and i[0:4] == 'http':
                if i not in res_l:
                    if i == url:
                        continue
                    res_l.append(i)
                    s = '("{}","{}","{}")'.format(browser.title, url, i)
                    sql_l.append(s)
        if len(sql_l) > 0:
            sql = '{}{}'.format(sql, ','.join(sql_l))
            print(sql)
            mysql_write(sql)
        sql_del = 'DELETE FROM parent_url WHERE id IN ( SELECT max_id FROM ( SELECT MAX(id) AS max_id, COUNT(1) AS c FROM parent_url GROUP BY CONCAT(page_url,children_url) ) AS tab WHERE c>1 )'
        print(sql_del)
        mysql_write(sql_del)
        # Mark this (parent, child) pair as spidered.
        sql_udp = 'UPDATE parent_url SET if_spider=1 WHERE page_url="{}" AND children_url="{}"'.format(page_url, url)
        mysql_write(sql_udp)
        print(sql_udp)
        time.sleep(3)

dd = 0
# Spider revision 4: dedup moved to once-per-sweep; children already
# crawled as a parent page are excluded from the queue ("optimistic"
# filtering); pages are scrolled so lazy content renders.
from selenium import webdriver
import time
import random
from bs4 import *
import pymysql

h, pt, u, p, db = 'localhost', 3306, 'root', 'root', 'test'


def mysql_fetch(sql, res_type='tuple'):
    """Execute *sql* and return every row ('dic' -> DictCursor rows).

    Returns an empty tuple when the connection fails.
    """
    global h, pt, u, p, db
    try:
        conn = pymysql.connect(host=h, port=pt, user=u, passwd=p, db=db, charset='utf8')
    except Exception as e:
        print(e)
        return ()
    if res_type == 'dic':
        cursor = conn.cursor(pymysql.cursors.DictCursor)
    else:
        cursor = conn.cursor()
    cursor.execute(sql)
    conn.commit()
    # Bug fix: fetch rows before closing; the original fetched after close().
    rows = cursor.fetchall()
    cursor.close()
    conn.close()
    return rows


def mysql_write(sql):
    """Execute a write statement; return 0 on success, 1 on connect failure."""
    global h, pt, u, p, db
    try:
        conn = pymysql.connect(host=h, port=pt, user=u, passwd=p, db=db, charset='utf8')
    except Exception as e:
        print(e)
        return 1
    cursor = conn.cursor()
    cursor.execute(sql)
    conn.commit()
    cursor.close()
    conn.close()
    return 0


# Warm up the browser on a random search engine before the target site.
browser = webdriver.Chrome()
f_url_l = ['https://www.baidu.com/', 'https://www.so.com/']
f_url_l_a = f_url_l[int(time.time()) % len(f_url_l)]
browser.get(f_url_l_a)
time.sleep(random.randint(1, 2))

# Seed page.
url = 'https://so.gushiwen.org/shiwenv_5a36a4613434.aspx'
js = 'window.location.href="{}";'.format(url)
browser.execute_script(js)
# img_url = 'https://img.gushiwen.org/authorImg/daishulun.jpg'
myhtml = 'D:\\myhtml\\{}gushiwen.tmp.html'.format(random.randint(123, 999))
with open(myhtml, 'w', encoding='utf-8') as fw:
    fw.write(browser.page_source)

# NOTE(review): SQL is assembled by string formatting; quotes in titles or
# urls will break the statement (injection risk).
sql = 'INSERT INTO parent_url (page_title,page_url,children_url) VALUES '
with open(myhtml, 'r', encoding='utf-8') as myhtml_o:
    bs = BeautifulSoup(myhtml_o, 'html.parser')
    url_l = [i.attrs['href'] for i in bs.find_all('a')]
res_l = []
sql_l = []
for i in url_l:
    # Keep only absolute on-site links.
    if 'gushiwen.org' in i and 'javascript' not in i and 'http' in i and i[0:4] == 'http':
        if i not in res_l:
            if i == url:
                continue
            res_l.append(i)
            s = '("{}","{}","{}")'.format(browser.title, url, i)
            sql_l.append(s)
if len(sql_l) > 0:
    sql = '{}{}'.format(sql, ','.join(sql_l))
    print(sql)
    mysql_write(sql)

# Optimistic dedup after the seed insert.
sql_del = 'DELETE FROM parent_url WHERE id IN ( SELECT max_id FROM ( SELECT MAX(id) AS max_id, COUNT(1) AS c FROM parent_url GROUP BY CONCAT(page_url,children_url) ) AS tab WHERE c>1 )'
print(sql_del)
mysql_write(sql_del)

# Urls to skip, e.g.:
# https://www.gushiwen.org/FileNotFound.htm?aspxerrorpath=/user/findpwd.aspx
# https://so.gushiwen.org/user/findpwd.aspx?from=http://so.gushiwen.org/user/collect.aspx
# https://so.gushiwen.org/app/
# https://so.gushiwen.org/jiucuo.aspx?u=
url_kw_filter_l = ['FileNotFound', 'findpwd', '/app/', '/jiucuo.aspx']

# Build the purge statement once from the keyword list.
sql_filter = "DELETE FROM parent_url WHERE id IN ( SELECT id FROM ( SELECT id FROM parent_url WHERE MYWHERE ) AS t);"
sql_s_l = []
for i in url_kw_filter_l:
    ii = i.upper()
    s = " INSTR(UPPER(page_url),'{}')>0 OR INSTR(UPPER(children_url),'{}')>0 ".format(ii, ii)
    sql_s_l.append(s)
sql_filter = sql_filter.replace('MYWHERE', ' OR '.join(sql_s_l))

while True:
    # Purge filtered rows, then dedup, once per sweep.
    mysql_write(sql_filter)
    print(sql_filter)
    sql_del = 'DELETE FROM parent_url WHERE id IN ( SELECT max_id FROM ( SELECT MAX(id) AS max_id, COUNT(1) AS c FROM parent_url GROUP BY CONCAT(page_url,children_url) ) AS tab WHERE c>1 )'
    print(sql_del)
    mysql_write(sql_del)
    # Children that already appear as a parent page were crawled already.
    # NOTE(review): if parent_url were empty this would render 'NOT IN ()',
    # which is invalid SQL; the seed insert above populates the table first.
    sql_pass = 'SELECT DISTINCT(page_url) FROM parent_url'
    url_pass = ['"{}"'.format(i[0]) for i in mysql_fetch(sql_pass, res_type='tuple')]
    sql_ori = 'SELECT page_url,children_url FROM parent_url WHERE if_spider=0 AND children_url NOT IN ({})'.format(
        ','.join(url_pass))
    res = mysql_fetch(sql_ori, 'dic')
    # Multi-process support: each worker skips a random prefix of the queue.
    jump_c, jump_s = 0, random.randint(0, max(0, len(res) - 10))
    for d in res:
        jump_c += 1
        if jump_c < jump_s:
            continue
        page_url, children_url = d['page_url'], d['children_url']
        url = children_url
        js = 'window.location.href="{}";'.format(url)
        browser.execute_script(js)
        # time.sleep(1)
        # browser.refresh()
        try:
            # Scroll to the bottom so lazily-loaded content is rendered.
            for isc in range(1):
                time.sleep(1)
                js = 'window.scrollTo(0,document.body.scrollHeight)'
                browser.execute_script(js)
        except Exception as e:
            print('window.scrollTo-->', e)
        myhtml = 'D:\\myhtml\\{}gushiwen.tmp.html'.format(random.randint(123, 999))
        with open(myhtml, 'w', encoding='utf-8') as fw:
            fw.write(browser.page_source)
        sql = 'INSERT INTO parent_url (page_title,page_url,children_url) VALUES '
        with open(myhtml, 'r', encoding='utf-8') as myhtml_o:
            bs = BeautifulSoup(myhtml_o, 'html.parser')
            try:
                url_l = [i.attrs['href'] for i in bs.find_all('a')]
            except Exception as e:
                print(e)
                continue
        res_l = []
        sql_l = []
        for i in url_l:
            # e.g. /user/findpwd.aspx?from=http://so.gushiwen.org/user/collect.aspx
            if 'gushiwen.org' in i and 'javascript' not in i and 'http' in i and i[0:4] == 'http':
                if i not in res_l:
                    if i == url:
                        continue
                    # Drop links matching a filter keyword.
                    continue_ = False
                    for fi in url_kw_filter_l:
                        ii = fi.upper()
                        # Bug fix: compare the UPPER-cased keyword (ii) with the
                        # UPPER-cased url; the original tested `fi in i.upper()`,
                        # so mixed-case keywords like 'FileNotFound' never matched.
                        if ii in i.upper():
                            continue_ = True
                            break
                    if continue_:
                        continue
                    res_l.append(i)
                    s = '("{}","{}","{}")'.format(browser.title, url, i)
                    sql_l.append(s)
        if len(sql_l) > 0:
            sql = '{}{}'.format(sql, ','.join(sql_l))
            print(sql)
            mysql_write(sql)
        # Mark this (parent, child) pair as spidered.
        sql_udp = 'UPDATE parent_url SET if_spider=1 WHERE page_url="{}" AND children_url="{}"'.format(page_url, url)
        mysql_write(sql_udp)
        print(sql_udp)

dd = 0
# Spider revision 5: same as revision 4, except the seed-page link
# extraction is deliberately disabled (the table is assumed to be
# populated from earlier runs).
from selenium import webdriver
import time
import random
from bs4 import *
import pymysql

h, pt, u, p, db = 'localhost', 3306, 'root', 'root', 'test'


def mysql_fetch(sql, res_type='tuple'):
    """Execute *sql* and return every row ('dic' -> DictCursor rows).

    Returns an empty tuple when the connection fails.
    """
    global h, pt, u, p, db
    try:
        conn = pymysql.connect(host=h, port=pt, user=u, passwd=p, db=db, charset='utf8')
    except Exception as e:
        print(e)
        return ()
    if res_type == 'dic':
        cursor = conn.cursor(pymysql.cursors.DictCursor)
    else:
        cursor = conn.cursor()
    cursor.execute(sql)
    conn.commit()
    # Bug fix: fetch rows before closing; the original fetched after close().
    rows = cursor.fetchall()
    cursor.close()
    conn.close()
    return rows


def mysql_write(sql):
    """Execute a write statement; return 0 on success, 1 on connect failure."""
    global h, pt, u, p, db
    try:
        conn = pymysql.connect(host=h, port=pt, user=u, passwd=p, db=db, charset='utf8')
    except Exception as e:
        print(e)
        return 1
    cursor = conn.cursor()
    cursor.execute(sql)
    conn.commit()
    cursor.close()
    conn.close()
    return 0


# Warm up the browser on a random search engine before the target site.
browser = webdriver.Chrome()
f_url_l = ['https://www.baidu.com/', 'https://www.so.com/']
f_url_l_a = f_url_l[int(time.time()) % len(f_url_l)]
browser.get(f_url_l_a)
time.sleep(random.randint(1, 2))

# Seed page.
url = 'https://so.gushiwen.org/shiwenv_5a36a4613434.aspx'
js = 'window.location.href="{}";'.format(url)
browser.execute_script(js)
# img_url = 'https://img.gushiwen.org/authorImg/daishulun.jpg'
myhtml = 'D:\\myhtml\\{}gushiwen.tmp.html'.format(random.randint(123, 999))
with open(myhtml, 'w', encoding='utf-8') as fw:
    fw.write(browser.page_source)

# NOTE(review): SQL is assembled by string formatting; quotes in titles or
# urls will break the statement (injection risk).
sql = 'INSERT INTO parent_url (page_title,page_url,children_url) VALUES '
with open(myhtml, 'r', encoding='utf-8') as myhtml_o:
    bs = BeautifulSoup(myhtml_o, 'html.parser')
    url_l = [i.attrs['href'] for i in bs.find_all('a')]
res_l = []
sql_l = []
for i in url_l:
    # Deliberately disabled in this revision: the seed page's links are
    # already in the table, so skip straight to the crawl loop.
    break
    if 'gushiwen.org' in i and 'javascript' not in i and 'http' in i and i[0:4] == 'http':
        if i not in res_l:
            if i == url:
                continue
            res_l.append(i)
            s = '("{}","{}","{}")'.format(browser.title, url, i)
            sql_l.append(s)
if len(sql_l) > 0:
    sql = '{}{}'.format(sql, ','.join(sql_l))
    print(sql)
    mysql_write(sql)

# Optimistic dedup.
sql_del = 'DELETE FROM parent_url WHERE id IN ( SELECT max_id FROM ( SELECT MAX(id) AS max_id, COUNT(1) AS c FROM parent_url GROUP BY CONCAT(page_url,children_url) ) AS tab WHERE c>1 )'
print(sql_del)
mysql_write(sql_del)

# Urls to skip, e.g.:
# https://www.gushiwen.org/FileNotFound.htm?aspxerrorpath=/user/findpwd.aspx
# https://so.gushiwen.org/user/findpwd.aspx?from=http://so.gushiwen.org/user/collect.aspx
# https://so.gushiwen.org/app/
# https://so.gushiwen.org/jiucuo.aspx?u=
url_kw_filter_l = ['FileNotFound', 'findpwd', '/app/', '/jiucuo.aspx']

# Build the purge statement once from the keyword list.
sql_filter = "DELETE FROM parent_url WHERE id IN ( SELECT id FROM ( SELECT id FROM parent_url WHERE MYWHERE ) AS t);"
sql_s_l = []
for i in url_kw_filter_l:
    ii = i.upper()
    s = " INSTR(UPPER(page_url),'{}')>0 OR INSTR(UPPER(children_url),'{}')>0 ".format(ii, ii)
    sql_s_l.append(s)
sql_filter = sql_filter.replace('MYWHERE', ' OR '.join(sql_s_l))

while True:
    # Purge filtered rows, then dedup, once per sweep.
    mysql_write(sql_filter)
    print(sql_filter)
    sql_del = 'DELETE FROM parent_url WHERE id IN ( SELECT max_id FROM ( SELECT MAX(id) AS max_id, COUNT(1) AS c FROM parent_url GROUP BY CONCAT(page_url,children_url) ) AS tab WHERE c>1 )'
    print(sql_del)
    mysql_write(sql_del)
    # Children that already appear as a parent page were crawled already.
    # NOTE(review): if parent_url were empty this would render 'NOT IN ()',
    # which is invalid SQL; this revision assumes a pre-populated table.
    sql_pass = 'SELECT DISTINCT(page_url) FROM parent_url'
    url_pass = ['"{}"'.format(i[0]) for i in mysql_fetch(sql_pass, res_type='tuple')]
    sql_ori = 'SELECT page_url,children_url FROM parent_url WHERE if_spider=0 AND children_url NOT IN ({})'.format(
        ','.join(url_pass))
    res = mysql_fetch(sql_ori, 'dic')
    # Multi-process support: each worker skips a random prefix of the queue.
    jump_c, jump_s = 0, random.randint(0, max(0, len(res) - 10))
    for d in res:
        jump_c += 1
        if jump_c < jump_s:
            continue
        page_url, children_url = d['page_url'], d['children_url']
        url = children_url
        js = 'window.location.href="{}";'.format(url)
        browser.execute_script(js)
        # time.sleep(1)
        # browser.refresh()
        try:
            # Scroll to the bottom so lazily-loaded content is rendered.
            for isc in range(1):
                time.sleep(1)
                js = 'window.scrollTo(0,document.body.scrollHeight)'
                browser.execute_script(js)
        except Exception as e:
            print('window.scrollTo-->', e)
        myhtml = 'D:\\myhtml\\{}gushiwen.tmp.html'.format(random.randint(123, 999))
        with open(myhtml, 'w', encoding='utf-8') as fw:
            fw.write(browser.page_source)
        sql = 'INSERT INTO parent_url (page_title,page_url,children_url) VALUES '
        with open(myhtml, 'r', encoding='utf-8') as myhtml_o:
            bs = BeautifulSoup(myhtml_o, 'html.parser')
            try:
                url_l = [i.attrs['href'] for i in bs.find_all('a')]
            except Exception as e:
                print(e)
                continue
        res_l = []
        sql_l = []
        for i in url_l:
            # e.g. /user/findpwd.aspx?from=http://so.gushiwen.org/user/collect.aspx
            if 'gushiwen.org' in i and 'javascript' not in i and 'http' in i and i[0:4] == 'http':
                if i not in res_l:
                    if i == url:
                        continue
                    # Drop links matching a filter keyword.
                    continue_ = False
                    for fi in url_kw_filter_l:
                        ii = fi.upper()
                        # Bug fix: compare the UPPER-cased keyword (ii) with the
                        # UPPER-cased url; the original tested `fi in i.upper()`,
                        # so mixed-case keywords like 'FileNotFound' never matched.
                        if ii in i.upper():
                            continue_ = True
                            break
                    if continue_:
                        continue
                    res_l.append(i)
                    s = '("{}","{}","{}")'.format(browser.title, url, i)
                    sql_l.append(s)
        if len(sql_l) > 0:
            sql = '{}{}'.format(sql, ','.join(sql_l))
            print(sql)
            mysql_write(sql)
        # Mark this (parent, child) pair as spidered.
        sql_udp = 'UPDATE parent_url SET if_spider=1 WHERE page_url="{}" AND children_url="{}"'.format(page_url, url)
        mysql_write(sql_udp)
        print(sql_udp)

dd = 0
(父,子)url有序二元组
蜘蛛页面 获取一个网站的全部url 乐观代码的更多相关文章
- jsp页面获取参数的方法(url解析、el表达式赋值、session取值)【原创】
最近使用myEclispse做网站,使用jsp+js+css做页面,网站中常用到从列表进入详情页面的跳转,下面对详情页面的值填充方式做一个简单总结: 1.url中使用request获取参数 jsp上方 ...
- 用JavaScript获取一个超链接的绝对URL地址
对于Web程序员来说,处理简单的URL格式也许会成为一场噩梦.试想一下,一个网址里有很多组成部分都会影响你对它的解析方法: 是否以/字符开头 是否以//开头 是否以?号开头 是否以#号开头 …等等 当 ...
- 通过cookies跳过验证码登陆页面,直接访问网站的其它URL
我每次手动访问去NN网的一家酒店,就不需要登陆,一旦我用脚本打开就会让我登陆,而登陆页面又有验证码,不想识别验证码,所以就想:“通过cookies跳过验证码登陆页面,直接访问网站的其它URL” 转 ...
- Python 网络爬虫 009 (编程) 通过正则表达式来获取一个网页中的所有的URL链接,并下载这些URL链接的源代码
通过 正则表达式 来获取一个网页中的所有的 URL链接,并下载这些 URL链接 的源代码 使用的系统:Windows 10 64位 Python 语言版本:Python 2.7.10 V 使用的编程 ...
- songtaste网站歌曲真实URL获取
个人挺喜欢songtaste网站的歌曲的,下载方法也层出不穷,可是作为程序员如果不知其中原理的方法真是羞愧.首先简单点的方法当然有google插件这样的嗅探器了,不过这种工具的原理还不是很了解.今天先 ...
- 获取一个 app 的 URL Scheme 的方法:
获取一个 app 的 URL Scheme 的方法: 上这个网站 URL Schemes 查一下相应的 app 的 URL Scheme 是否有被收录 第一种方法没找到的话,把相应的 app 的 ip ...
- ASP.NET 获取来源网站的网址,获取上一网页的网址,获取来源网页的URL,获取上一网页的URL
ASP.NET 获取来源网站的网址,获取上一网页的网址,获取来源网页的URL, 获取上一网页的URL Uri Url = HttpContext.Current.Request.UrlReferrer ...
- 多域名环境,页面获取url的一种方案
因为系统是分布式部署的.而且有多个域名,所以常常涉及到获取url的问题. 这是系统框架层面须要提供的能力.否则每一个模块都须要自己去想办法获取ip,就会非常混乱.上线也easy发生bug 主要须要解决 ...
- 通过Iframe在A网站页面内嵌入空白页面的方式,跨域获取B网站的数据返回给A网站!
以下代码只是为演示该方法具体是如何操作的,实际的意义并不大. 其实这个方法还可以解决很多方面的跨域操作,以下两点为我工作中遇到的情况! 比如A系统中打开B系统页面的时候,获取B系统页面高度,A系统中可 ...
随机推荐
- java中mkdir()和mkdirs()区别
mkdirs()可以建立多级文件夹 mkdir()只会建立一级的文件夹 例如: new File("/file/one/two").mkdirs(); 可建立/file/one/t ...
- 在springBoot的控制台打印sql语句
在springBoot+Mybatis日志显示SQL的执行情况的最简单方法就是在properties新增: properties形式 logging.level.com.eth.wallet.mapp ...
- Yii 2.0 query模式语法
项目使用Yii 2.0版本开发,个人一直喜好使用(new \yii\db\Query())模式操作数据,把增.删.查.改这4种情况的写法整理出来,方便查阅和记忆. 增加 - insert use Yi ...
- win10 专业版 安装tornado 的步骤
win10 专业版 安装tornado 的步骤: 1.下载tornado源码压缩包 下载网址:https://github.com/tornadoweb/tornado 若是没有github 账号可以 ...
- jsp include html 乱码问题解决
方法一: 在被包含的html中,在首行加上 <%@ page language="java" import="java.util.*" pageEncod ...
- Automation 的 Wait 工具
public static WebDriverWait createWait(WebDriver driver) { return new WebDriverWait(driver, Environm ...
- CodeForces - 425E Sereja and Sets 题解
题目大意: 我们有一个集合 S,其中包含了 m 个不完全相同的区间[l1,r1],[l2,r2]…[lm,rm] (1≤li≤ri≤n,li,ri 都为整数). 定义 f(S)=k,表示集合 S 中能 ...
- [BZOJ1179] [Apio2009]Atm(tarjan缩点 + spfa)
传送门 题意 N个点M条边的有向图 每个点有点权 从某一个结点出发 问能获得的最大点权和 一个点的点权最多被计算一次 N<=500000 M<=500000 思路 先tarjan缩点,然后 ...
- bzoj4553 [Tjoi2016&Heoi2016]序列 树状数组(区间最大值)+cqd
[Tjoi2016&Heoi2016]序列 Time Limit: 20 Sec Memory Limit: 128 MBSubmit: 1006 Solved: 464[Submit][ ...
- 【存储过程】MySQL存储过程/存储过程与自定义函数的区别
---------------------------存储过程-------------------- 语法: 创建存储过程: CREATE [definer = {user|current_user ...