- import requests
- from lxml import etree
- from urllib import parse
- import os, time
def get_page_html(url):
    """Send a GET request to ``url`` and return the response on HTTP 200.

    Uses the module-level ``session``, ``headers`` and ``timeout`` values.

    Args:
        url: Address of the page to fetch.

    Returns:
        The response object when the status code is 200; ``None`` when the
        request fails (timeout, connection error, malformed URL, ...) or the
        server answers with any other status code.
    """
    try:
        # The network call is the only statement that can raise, so it is the
        # one that must live inside the ``try``.
        response = session.get(url, headers=headers, timeout=timeout)
    except requests.RequestException:
        # Callers treat ``None`` as "page unavailable" and stop crawling.
        return None
    if response.status_code == 200:
        return response
    return None
def get_next_url(resoponse):
    """Return the absolute URL of the next chapter, or ``None``.

    Args:
        resoponse: Response of the current chapter page; a falsy value
            yields ``None`` without any parsing.

    Returns:
        The ``href`` of the ``<a id="j_chapterNext">`` element resolved
        against ``resoponse.url``, or ``None`` when no such link is present.
    """
    if not resoponse:
        return None
    try:
        tree = etree.HTML(resoponse.text)
        # Indexing the empty result list raises IndexError when the page has
        # no "next chapter" anchor.
        href = tree.xpath("//a[@id='j_chapterNext']/@href")[0]
        return parse.urljoin(resoponse.url, href)
    except IndexError:
        return None
def xs_content(resoponse):
    """Extract the chapter title and its paragraphs from a chapter page.

    Args:
        resoponse: Response of the chapter page; a falsy value yields
            ``None`` without any parsing.

    Returns:
        A ``(title, paragraphs)`` tuple, where ``title`` is the text of the
        first ``<h3 class="j_chapterName">`` element and ``paragraphs`` is
        the list of text nodes of the ``<p>`` elements inside the
        ``read-content``/``j_readContent`` container. ``None`` when there is
        no response or the page has no chapter title.
    """
    if not resoponse:
        return None
    selector = etree.HTML(resoponse.text)
    titles = selector.xpath("//h3[@class='j_chapterName']/text()")
    if not titles:
        # A page without a title heading is not a chapter page; report "no
        # content" instead of letting an IndexError escape to the caller.
        return None
    paragraphs = selector.xpath(
        "//div[contains(@class,'read-content') and contains(@class,'j_readContent')]//p/text()")
    return titles[0], paragraphs
def write_to_txt(info_tuple: tuple):
    """Write one chapter to ``<BASE_PATH>/<title>.txt``.

    Args:
        info_tuple: ``(title, lines)`` pair; ``title`` becomes the file name
            and every item of ``lines`` is written on its own line. A falsy
            value makes the call a no-op.

    Returns:
        None. A chapter whose file already exists is left untouched.
    """
    if not info_tuple:
        return
    title, lines = info_tuple[0], info_tuple[1]
    # Check the very file that gets written (with the ".txt" suffix) so that
    # chapters saved by an earlier run are actually detected.
    path = os.path.join(BASE_PATH, title + ".txt")
    if os.path.exists(path):
        return
    # Create the target directory on first use instead of failing in open().
    os.makedirs(BASE_PATH, exist_ok=True)
    with open(path, "wt", encoding="utf-8") as f:
        # The file is flushed when the ``with`` block closes it.
        f.writelines(line + "\n" for line in lines)
def run(url):
    """Crawl chapters starting at ``url`` and follow "next chapter" links.

    Each page is fetched, its content is written to disk, and the crawl
    moves on to the next chapter until a page cannot be fetched or parsed,
    or no next-chapter link is found.

    Args:
        url: URL of the first chapter to download. An empty value ends the
            crawl immediately.

    Returns:
        None.
    """
    # Iterate instead of recursing once per chapter: a long novel would
    # otherwise exceed Python's recursion limit.
    while url:
        html = get_page_html(url)
        next_url = get_next_url(html)
        info_tuple = xs_content(html)
        if not info_tuple:
            break
        # Write whenever content was parsed, so the final chapter (which has
        # no "next" link) is saved as well.
        print("正在写入")
        write_to_txt(info_tuple)
        print("正在爬取%s" % info_tuple[0])
        if not next_url:
            break
        # Delay between requests to reduce the load on the server.
        time.sleep(sleep_time)
        print("正在爬取%s" % next_url)
        url = next_url
if __name__ == '__main__':
    # Crawl settings; the functions above read these as module-level names.
    BASE_PATH = r"D:\图片\LSZJ"  # directory where the chapter files are stored
    sleep_time = 5  # value passed to time.sleep() between two requests
    timeout = 5  # timeout passed to every request
    headers = {
        "Referer": "",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36"
    }
    # URL of the first chapter of the novel to crawl (the original note says
    # it was the first chapter of "Doupo Cangqiong").
    url = ""
    session = requests.Session()

    print('开始运行爬虫')
    run(url)
