发布于2020-02-16 20:08 阅读(1064) 评论(0) 点赞(30) 收藏(3)
import re
import requests
def handler_url(url):
    """Fetch one listing page and print every poem record found on it.

    The HTML is scanned with four regular expressions. Their results are
    combined position by position into dicts with the keys ``title``,
    ``year``, ``autor`` and ``content``, and each dict is printed.

    Args:
        url: Address of the listing page to download.

    Returns:
        None. The extracted records are only printed.
    """
    header = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.87 Safari/537.36"
    }
    # The dict must be passed as ``headers=``: the second positional
    # parameter of requests.get() is ``params``, which would put the
    # User-Agent into the query string instead of the request headers.
    reponse = requests.get(url, headers=header)
    text = reponse.text

    # First <b>...</b> following each <div class="cont">.
    titles = re.findall(r'<div\sclass="cont">.*?<b>(.*?)</b>', text, re.DOTALL)
    # Text of the first link inside each <p class="source">
    # (presumably the dynasty; confirm against the page markup).
    years = re.findall(r'<p\sclass="source"><a.*?>(.*?)</a>', text, re.DOTALL)
    # Text of the second link inside each <p class="source">.
    autors = re.findall(r'<p\sclass="source">.*?<a.*?>.*?<a.*?>(.*?)</a>', text, re.DOTALL)
    # Raw inner HTML of the "contson" div that follows the source paragraph.
    contents_tmp = re.findall(
        r'<p\sclass="source">.*?<div\sclass="contson"\s.*?>(.*?)</div>',
        text,
        re.DOTALL,
    )

    # Remove inline markup (tags contained on a single line) and
    # surrounding whitespace from each poem body.
    contents = [re.sub(r'<.*?>', '', c).strip() for c in contents_tmp]

    # zip() stops at the shortest list, so a page on which one of the
    # patterns matches fewer times yields fewer records.
    poems = [
        {
            "title": title,
            "year": year,
            "autor": autor,
            "content": content,
        }
        for title, year, autor, content in zip(titles, years, autors, contents)
    ]
    for a in poems:
        print(a)
def main():
    """Crawl listing pages 1 through 6 and hand each URL to handler_url."""
    for page_no in range(1, 7):
        handler_url("https://www.gushiwen.org/default_{}.aspx".format(page_no))
# Start the crawl only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()
import re
import requests
# Request headers shared by every HTTP call in this script; only the
# User-Agent is set.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.87 Safari/537.36",
}
def get_detail_page(url):
    """Download a listing page and return the link targets found on it.

    Args:
        url: Address of the listing page.

    Returns:
        A list with the ``href`` value of the first ``<a href="...">`` that
        follows each ``<div class="main-bd">`` in the page HTML.
    """
    listing_html = requests.get(url, headers=HEADERS).text
    return re.findall(
        r'<div\sclass="main-bd">.*?<a\shref="(.*?)">.*?</a>',
        listing_html,
        re.DOTALL,
    )
def handle_detail_page(url):
    """Download one detail page and print its title(s) and cleaned body text.

    Args:
        url: Address of the detail page.

    Returns:
        None. A dict with the keys ``title`` and ``article`` (both lists of
        strings) is printed.
    """
    page_html = requests.get(url, headers=HEADERS).text

    # Text of the first <span> following <div class="article">.
    headings = re.findall(
        r'<div\sclass="article">.*?<span.*?>(.*?)</span>',
        page_html,
        re.DOTALL,
    )
    # No capture group, so each match is the whole span from the
    # "link-report" div onward; the greedy ``.*`` runs to the last </p>
    # in the page.
    raw_bodies = re.findall(
        r'<div\sid="link-report">.*?<p>.*</p>',
        page_html,
        re.DOTALL,
    )

    cleaned_bodies = []
    for raw in raw_bodies:
        # First pass: drop tags that open and close on the same line
        # (no DOTALL here, so ``.`` does not cross line breaks).
        without_tags = re.sub(r'<.*?>', '', raw)
        single_line = without_tags.replace('\r', '').replace('\n', '')
        # Second pass: with line breaks gone, <div ...> tags that were
        # split across lines can now be matched and removed.
        without_divs = re.sub(r'<div\s.*?>', '', single_line)
        cleaned_bodies.append(without_divs.replace(' ', ''))

    print({
        'title': headings,
        'article': cleaned_bodies,
    })
def main():
    """Walk the listing pages at offsets 0, 20 and 40 and process every link.

    Each listing URL is passed to get_detail_page, and every URL it returns
    is passed to handle_detail_page.
    """
    listing_template = "https://book.douban.com/review/best/?start={}"
    listing_urls = [listing_template.format(offset) for offset in range(0, 41, 20)]
    for listing_url in listing_urls:
        for review_url in get_detail_page(listing_url):
            handle_detail_page(review_url)
# Start the crawl only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()
这两个小的爬虫项目所针对的网站都没有进行反爬处理,所以对于我们爬取页面信息来说还是比较简单的,关键在于数据怎么解析。最近一直在看正则表达式,所以找了两个小的项目练一练。这两个小项目也可以使用BeautifulSoup或者lxml库进行解析,找时间再试试用别的方法解析数据吧~
作者:哇哇
链接:https://www.pythonheidong.com/blog/article/231644/185ca4e3384c5226fcc7/
来源:python黑洞网
任何形式的转载都请注明出处,如有侵权 一经发现 必将追究其法律责任
昵称:
评论内容:(最多支持255个字符)
---无人问津也好,技不如人也罢,你都要试着安静下来,去做自己该做的事,而不是让内心的烦躁、焦虑,坏掉你本来就不多的热情和定力
Copyright © 2018-2021 python黑洞网 All Rights Reserved 版权所有,并保留所有权利。 京ICP备18063182号-1
投诉与举报,广告合作请联系vgs_info@163.com或QQ3083709327
免责声明:网站文章均由用户上传,仅供读者学习交流使用,禁止用做商业用途。若文章涉及色情,反动,侵权等违法信息,请向我们举报,一经核实我们会立即删除!