发布于2020-04-11 15:33 阅读(1760) 评论(0) 点赞(11) 收藏(5)
- import re
- import requests
-
def _extract_poems(text):
    """Build poem records from the HTML text of a listing page.

    Four regular expressions are run over ``text`` independently, and their
    results are paired up by position with ``zip``. If the lists differ in
    length, the extra entries of the longer ones are dropped.

    NOTE(review): positional pairing assumes every poem on the page yields
    exactly one match per pattern; verify against the live markup.

    Args:
        text: HTML source of the page as a string.

    Returns:
        A list of dicts with the keys ``"title"``, ``"dynasty"``,
        ``"author"`` and ``"content"``. Empty when nothing matches.
    """
    titles = re.findall(r'<div\sclass="cont">.*?<b>(.*?)</b>', text, re.DOTALL)
    dynasties = re.findall(r'<p\sclass="source">.*?<a.*?>(.*?)</a>', text, re.DOTALL)
    authors = re.findall(r'<p\sclass="source">.*?<a.*?><a.*?>(.*?)</a>', text, re.DOTALL)
    raw_bodies = re.findall(r'<div class="contson" .*?>(.*?)</div>', text, re.DOTALL)

    # Replace each inline tag inside a poem body with a space, then trim the
    # surrounding whitespace.
    contents = [re.sub(r'<.*?>', " ", body).strip() for body in raw_bodies]

    return [
        {
            "title": title,
            "dynasty": dynasty,
            "author": author,
            "content": content,
        }
        for title, dynasty, author, content in zip(titles, dynasties, authors, contents)
    ]


def parse_url(url, timeout=10):
    """Download one listing page and print every poem record found on it.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before ``requests`` gives up.
            Without a timeout a stalled connection would block forever.

    Raises:
        requests.exceptions.RequestException: If the request fails or times
            out.
    """
    # Browser-like User-Agent sent with the request.
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36"}
    response = requests.get(url, headers=headers, timeout=timeout)

    for record in _extract_poems(response.text):
        print(record)
-
-
- #print(authors)
def main():
    """Run ``parse_url`` on the ten listing pages numbered 0 through 9."""
    page_urls = [
        "https://www.gushiwen.org/default_%s.aspx" % page_number
        for page_number in range(10)
    ]
    for page_url in page_urls:
        parse_url(page_url)
-
-
-
# Start scraping only when this file is executed as a script, so importing
# the module does not fire off network requests as a side effect.
if __name__ == "__main__":
    main()
原文链接:https://blog.csdn.net/devilangel2/article/details/105441537
作者:飞龙出海
链接:https://www.pythonheidong.com/blog/article/324872/a664bb730d3943649508/
来源:python黑洞网
任何形式的转载都请注明出处。如有侵权,一经发现,必将追究其法律责任。
昵称:
评论内容:(最多支持255个字符)
---无人问津也好,技不如人也罢,你都要试着安静下来,去做自己该做的事,而不是让内心的烦躁、焦虑,坏掉你本来就不多的热情和定力
Copyright © 2018-2021 python黑洞网 All Rights Reserved 版权所有,并保留所有权利。 京ICP备18063182号-1
投诉与举报,广告合作请联系vgs_info@163.com或QQ3083709327
免责声明:网站文章均由用户上传,仅供读者学习交流使用,禁止用作商业用途。若文章涉及色情、反动、侵权等违法信息,请向我们举报,一经核实我们会立即删除!