发布于2020-02-25 00:50 阅读(454) 评论(0) 点赞(13) 收藏(2)
import requests
def get_url(url, timeout=10):
    """Download a page and return its HTML text.

    Args:
        url: Address of the page to request.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout a stalled connection would block the crawl forever.

    Returns:
        The decoded response body when the server answers with HTTP 200,
        otherwise None.

    Raises:
        Exceptions raised by ``requests.get`` (connection errors, timeouts)
        are not caught here and propagate to the caller.
    """
    header = {
        'User-Agent': 'XXXX'  # placeholder: replace with your own User-Agent
    }
    response = requests.get(url, headers=header, timeout=timeout)
    # Check the status first so the body is only decoded for successful replies.
    if response.status_code != 200:
        return None
    return response.text
def main():
    """Fetch the first board page and print whatever get_url() returns."""
    # Step 1: crawl the front page only (a single page, offset 0).
    page_offsets = [page * 10 for page in range(1)]
    for offset in page_offsets:
        page_source = get_url("https://maoyan.com/board/4?offset={}".format(offset))
        print(page_source)


if __name__ == "__main__":
    main()
# Pull each field out of the page with its own pattern and print the result.
# `content` is the page HTML and must already be defined by the code that runs
# before this snippet. The patterns are raw strings so that `\d` reaches the
# regex engine unchanged instead of being read as an invalid string escape.
# re.S lets `.` match newlines, so a pattern can span several lines of HTML.
ranks = re.findall(r'<dd>.*?(\d+)</i>', content, re.S)
print(ranks)
file_names = re.findall(r'<p class="name">.*?<a href.*?title="(.*?)"', content, re.S)
print(file_names)
file_stars = re.findall(r'<p class="star">(.*?)</p>', content, re.S)
print(file_stars)
file_releasetimes = re.findall(r'releasetime.*?(\d+.*?)</p>', content, re.S)
print(file_releasetimes)
# The score is split across two tags: `(\d.)` captures a digit plus the
# character that follows it, `(\d)` captures the single fraction digit.
file_scores_integer = re.findall(r'<p.*?score.*?integer.*?(\d.)</i>', content, re.S)
file_scores_fraction = re.findall(r'<i.*?fraction.*?(\d)</i>', content, re.S)
print(file_scores_integer)
print(file_scores_fraction)
def parse_html(html):
    """Extract the film records from one board page.

    Args:
        html: Page source as a string, or None when the download failed.

    Returns:
        A list of 6-tuples of strings, one per ``<dd>`` entry, in pattern
        order: (rank, title, stars, release time, score integer part,
        score fraction digit). The list is empty when ``html`` is empty or
        None, or when nothing matches.
    """
    # get_url() returns None for a non-200 reply; treat that as "no records"
    # instead of letting the regex call fail with a TypeError.
    if not html:
        return []
    # Raw strings keep `\d` intact for the regex engine; re.S lets `.` cross
    # line breaks because each record spans many lines of markup.
    pattern = re.compile(
        r'<dd>.*?(\d+)</i>'
        r'.*?<p class="name">.*?<a href.*?title="(.*?)"'
        r'.*?<p class="star">(.*?)</p>'
        r'.*?releasetime.*?(\d+.*?)</p>'
        r'.*?<p.*?score.*?integer.*?(\d.)</i>'
        r'.*?<i.*?fraction.*?(\d)</i>',
        re.S,
    )
    return pattern.findall(html)
def write_cvs(items):
    """Append film records to ``mao.csv`` in the current working directory.

    Args:
        items: Iterable of indexable records with at least six string
            fields. Fields 0, 1 and 3 are written as-is, field 2 is
            stripped of surrounding whitespace, and fields 4 and 5 are
            joined into one score column.

    Nothing is written (and no file is created) when ``items`` is empty.
    """
    rows = [
        [item[0], item[1], item[2].strip(), item[3], str(item[4]) + str(item[5])]
        for item in items
    ]
    if not rows:
        return
    # Open the file once for the whole batch instead of once per row.
    # newline='' is what the csv module expects; without it every row is
    # followed by a blank line on Windows.
    with open('mao.csv', 'a', encoding='utf-8', newline='') as csvfile:
        csv.writer(csvfile).writerows(rows)
import requests
import re
import csv
# Fetch the HTML source of each board page and append its films to a CSV file.
header = {
    'User-Agent' : 'xxx'
}

# NOTE(review): the original looped over 100 pages. Each page advances the
# offset by 10 and main() elsewhere in this file crawls 10 pages of the same
# board, so the board presumably holds 100 films - confirm before widening.
PAGE_COUNT = 10

for page in range(PAGE_COUNT):
    url = "https://maoyan.com/board/4?offset={}".format(page * 10)
    # A timeout keeps one stalled connection from hanging the whole crawl.
    response = requests.get(url, headers=header, timeout=10)
    content = response.text

    # Patterns are passed to re.findall() as strings together with re.S;
    # a pre-compiled pattern cannot be combined with a flags argument.
    # Raw strings keep `\d` from being read as an invalid string escape.
    ranks = re.findall(r'<dd>.*?(\d+)</i>', content, re.S)
    file_names = re.findall(r'<p class="name">.*?<a href.*?title="(.*?)"', content, re.S)
    file_stars = re.findall(r'<p class="star">(.*?)</p>', content, re.S)
    file_releasetimes = re.findall(r'releasetime.*?(\d+.*?)</p>', content, re.S)
    file_scores_integer = re.findall(r'<p.*?score.*?integer.*?(\d.)</i>', content, re.S)
    file_scores_fraction = re.findall(r'<i.*?fraction.*?(\d)</i>', content, re.S)

    # zip() walks the field lists in lockstep and stops at the shortest one,
    # so a page where one pattern matched fewer times cannot raise IndexError.
    rows = [
        [rank, name, whole + fraction, stars.strip(), released]
        for rank, name, stars, released, whole, fraction in zip(
            ranks,
            file_names,
            file_stars,
            file_releasetimes,
            file_scores_integer,
            file_scores_fraction,
        )
    ]
    if rows:
        # NOTE(review): the extension is spelled "cvs"; "csv" was presumably
        # intended - left unchanged so existing output paths keep working.
        # newline='' is what the csv module expects (avoids blank rows on
        # Windows); the file is opened once per page, not once per row.
        with open('maoyan.cvs', 'a+', newline='') as csvfile:
            csv.writer(csvfile).writerows(rows)
import requests
import re
import csv
def get_url(url, timeout=10):
    """Request ``url`` and return the page source.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server. The default keeps a
            stalled connection from blocking the crawl indefinitely.

    Returns:
        The response text for an HTTP 200 reply, or None for any other
        status code.

    Raises:
        Errors raised by ``requests.get`` (for example connection failures
        or timeouts) are not handled here and reach the caller.
    """
    header = {
        'User-Agent': 'XXXX'
    }
    response = requests.get(url, headers=header, timeout=timeout)
    if response.status_code == 200:
        # Decode the body only once the reply is known to be successful.
        return response.text
    return None
# One match per <dd> entry. Compiled once at import time because main() calls
# parse_html() for every page. Raw strings keep `\d` intact for the regex
# engine; re.S lets `.` span the line breaks inside a record.
_BOARD_ITEM_RE = re.compile(
    r'<dd>.*?(\d+)</i>'
    r'.*?<p class="name">.*?<a href.*?title="(.*?)"'
    r'.*?<p class="star">(.*?)</p>'
    r'.*?releasetime.*?(\d+.*?)</p>'
    r'.*?<p.*?score.*?integer.*?(\d.)</i>'
    r'.*?<i.*?fraction.*?(\d)</i>',
    re.S,
)


def parse_html(html):
    """Return every film record found in one page of board HTML.

    Args:
        html: Page source as a string, or None when the download failed.

    Returns:
        A list of 6-tuples of strings in capture order: (rank, title,
        stars, release time, score integer part, score fraction digit).
        An empty list is returned for empty or None input and when no
        record matches.
    """
    if not html:
        # get_url() hands back None for non-200 replies; report "no records"
        # rather than raising TypeError inside the regex engine.
        return []
    return _BOARD_ITEM_RE.findall(html)
def write_cvs(items):
    """Append one CSV row per record to ``mao.csv`` in the working directory.

    Args:
        items: Iterable of indexable records holding at least six string
            fields. Each row is written as: field 0, field 1, field 2 with
            surrounding whitespace stripped, field 3, then fields 4 and 5
            concatenated into a single score column.

    When ``items`` is empty no file is opened or created.
    """
    rows = []
    for item in items:
        score = str(item[4]) + str(item[5])
        rows.append([item[0], item[1], item[2].strip(), item[3], score])
    if not rows:
        return
    # A single open per call replaces the original open-per-row. newline=''
    # is what the csv module expects; it prevents blank rows on Windows.
    with open('mao.csv', 'a', encoding='utf-8', newline='') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerows(rows)
def main():
    """Crawl ten board pages and pass each page's records to write_cvs()."""
    # Pages are addressed by offset: 0, 10, ..., 90.
    for page in range(10):
        url = "https://maoyan.com/board/4?offset={}".format(page * 10)
        html = get_url(url)
        if html is None:
            # get_url() returns None for a non-200 reply. Skip that page
            # rather than passing None to the regex parser, which would
            # abort the whole crawl with a TypeError.
            continue
        items = parse_html(html)
        write_cvs(items)


if __name__ == "__main__":
    main()
作者:骏马
链接:https://www.pythonheidong.com/blog/article/233018/14a9879222ce1f579ff6/
来源:python黑洞网
任何形式的转载都请注明出处;如有侵权,一经发现,必将追究其法律责任。
昵称:
评论内容:(最多支持255个字符)
---无人问津也好,技不如人也罢,你都要试着安静下来,去做自己该做的事,而不是让内心的烦躁、焦虑,坏掉你本来就不多的热情和定力
Copyright © 2018-2021 python黑洞网 All Rights Reserved 版权所有,并保留所有权利。 京ICP备18063182号-1
投诉与举报,广告合作请联系vgs_info@163.com或QQ3083709327
免责声明:网站文章均由用户上传,仅供读者学习交流使用,禁止用作商业用途。若文章涉及色情、反动、侵权等违法信息,请向我们举报,一经核实我们会立即删除!