debug(0)
发布于2019-09-06 11:30 阅读(1283) 评论(0) 点赞(7) 收藏(3)
Traceback (most recent call last):
File "e:\python27\lib\site-packages\scrapy\utils\defer.py", line 102, in iter_errback
yield next(it)
File "e:\python27\lib\site-packages\scrapy\spidermiddlewares\offsite.py", line 29, in process_spider_output
for x in result:
File "e:\python27\lib\site-packages\scrapy\spidermiddlewares\referer.py", line 22, in <genexpr>
return (_set_referer(r) for r in result or ())
File "e:\python27\lib\site-packages\scrapy\spidermiddlewares\urllength.py", line 37, in <genexpr>
return (r for r in result or () if _filter(r))
File "e:\python27\lib\site-packages\scrapy\spidermiddlewares\depth.py", line 58, in <genexpr>
return (r for r in result or () if _filter(r))
File "E:\pypro\dingdian\dingdian\spiders\dingdian.py", line 69, in get_chapter
rets = Sql.select_chapter(chapterurl)
File "E:\pypro\dingdian\dingdian\mysqlpipelines\sql.py", line 57, in select_chapter
cur.execute(sql, value)
File "e:\python27\lib\site-packages\mysql\connector\cursor.py", line 551, in execute
self._handle_result(self._connection.cmd_query(stmt))
File "e:\python27\lib\site-packages\mysql\connector\connection.py", line 490, in cmd_query
result = self._handle_result(self._send_cmd(ServerCmd.QUERY, query))
File "e:\python27\lib\site-packages\mysql\connector\connection.py", line 405, in _handle_result
self._socket.recv(), self.python_charset)
File "e:\python27\lib\site-packages\mysql\connector\protocol.py", line 239, in parse_column
(packet, _) = utils.read_lc_string(packet) # db
File "e:\python27\lib\site-packages\mysql\connector\utils.py", line 199, in read_lc_string
if buf[0] == 251: # \xfb
IndexError: bytearray index out of range
在 Python 2.7、Scrapy 1.3 版本下运行爬虫时,出现了上面的错误。
下面是出错的dingdian.py 与 sql.py 的代码文件
dingdian.py
# -*- coding:utf-8 -*-
import re
import scrapy
from bs4 import BeautifulSoup
from scrapy.http import Request
from items import DingdianItem, DcontentItem
from scrapy.selector import Selector
from ..mysqlpipelines.sql import Sql
import sys
# Python 2-only hack: site.py deletes sys.setdefaultencoding at startup, so
# reload(sys) is needed to re-expose it.  Forcing the process-wide default
# codec to UTF-8 makes implicit str()/unicode conversions of the crawled
# Chinese text succeed (e.g. str(response.meta['name']) below).
# NOTE(review): this masks encoding bugs globally; prefer explicit
# .encode('utf-8')/.decode('utf-8') at the I/O boundaries.
reload(sys)
sys.setdefaultencoding("utf-8")
class DingdianSpider(scrapy.Spider):
    """Crawl novel listings, chapter indexes and chapter bodies from 23us.com.

    Flow: start_requests -> parse (pagination) -> get_name (novel links)
    -> get_chapterurl (novel metadata + DingdianItem) -> get_chapter
    (chapter links, deduplicated against MySQL) -> get_chaptercontent
    (DcontentItem).
    """
    name = 'dingdianspider'
    # BUG FIX: the attribute must be called `allowed_domains` for Scrapy's
    # OffsiteMiddleware to filter requests; `allowed` was silently ignored.
    allowed_domains = ['23us.com']
    bash_url = 'http://www.23us.com/class/'
    bashurl = '.html'
    # BUG FIX: the original pattern used `(.*)?` (an *optional greedy* group);
    # `(.*?)` (non-greedy) is what was intended and prevents over-matching
    # when several chapter links share one line.  Compiled once, not per page.
    _chapter_link_re = re.compile(r'<td class="L"><a href="(.*?)">(.*?)</a></td>')

    def start_requests(self):
        """Yield the first listing page of each of the 10 categories."""
        for i in range(1, 11):
            url = self.bash_url + str(i) + '_1' + self.bashurl
            yield Request(url, self.parse)

    def parse(self, response):
        """Read the pagination control and queue every page of one category."""
        sel = Selector(response)
        max_num = sel.xpath("//a[@class='last']/text()").extract()
        # Robustness: skip pages whose pagination link is missing instead of
        # crashing with IndexError on max_num[0].
        if not max_num:
            return
        bashurl = str(response.url)[:-7]  # strip trailing '_N.html'
        for i in range(1, int(max_num[0]) + 1):
            url = bashurl + '_' + str(i) + self.bashurl
            yield Request(url, self.get_name)

    def get_name(self, response):
        """Extract (novel name, novel url) pairs from one listing page."""
        tds = BeautifulSoup(response.text, 'lxml').find_all('tr', bgcolor='#FFFFFF')
        for td in tds:
            novelname = td.find('a', target='_blank').get_text()
            novelurl = td.find('a')['href']
            yield Request(novelurl, self.get_chapterurl,
                          meta={'name': novelname, 'url': novelurl})

    def get_chapterurl(self, response):
        """Scrape novel metadata into a DingdianItem and queue its index page."""
        item = DingdianItem()
        item['name'] = str(response.meta['name']).replace('\xa0', '')
        item['novelurl'] = response.meta['url']
        soup = BeautifulSoup(response.text, 'lxml')
        category = soup.find('table').find('a').get_text()
        author = soup.find('table').find_all('td')[1].get_text()
        bash_url = soup.find('a', {'class': 'read'})['href']
        # The numeric novel id is embedded in the index URL's last path segment.
        name_id = str(bash_url)[-6:-1].replace('/', '')
        item['name_id'] = name_id
        item['category'] = str(category).replace('/', '')
        item['author'] = str(author).replace('/', '')
        yield item
        yield Request(bash_url, self.get_chapter, meta={'name_id': name_id})

    def get_chapter(self, response):
        """Queue every chapter page that is not already stored in MySQL."""
        urls = self._chapter_link_re.findall(response.text)
        for num, url in enumerate(urls, 1):
            # BUG FIX: `response.url + url[0]` only joins correctly when the
            # base URL ends with '/'; Response.urljoin resolves relative
            # links per RFC 3986 in every case.
            chapterurl = response.urljoin(url[0])
            chaptername = url[1]
            rets = Sql.select_chapter(chapterurl)
            if rets[0] == 1:
                print(u'这一节已经存在')
            else:
                yield Request(chapterurl, self.get_chaptercontent, meta={
                    'num': num, 'name_id': response.meta['name_id'],
                    'chaptername': chaptername, 'chapterurl': chapterurl
                })

    def get_chaptercontent(self, response):
        """Build a DcontentItem from one chapter page's body text."""
        item = DcontentItem()
        item['num'] = response.meta['num']
        item['id_name'] = response.meta['name_id']
        item['chaptername'] = str(response.meta['chaptername']).replace('\xa0', '')
        item['chapterurl'] = response.meta['chapterurl']
        content = BeautifulSoup(response.text, 'lxml').find('dd', {'id': 'contents'}).get_text()
        item['chaptercontent'] = str(content).replace('\xa0', '')
        yield item
sql.py
# -*- coding:utf-8 -*-
import mysql.connector
from dingdian import settings
import sys
# Python 2-only hack: re-expose sys.setdefaultencoding (removed by site.py)
# and force the default codec to UTF-8 so the Chinese chapter text survives
# implicit str()/unicode conversions before being written to MySQL.
# NOTE(review): this hides encoding bugs process-wide; explicit
# encode/decode at the DB boundary would be safer.
reload(sys)
sys.setdefaultencoding("utf-8")
# Connection parameters pulled from the Scrapy project settings module.
MYSQL_HOST = settings.MYSQL_HOST
MYSQL_USER = settings.MYSQL_USER
MYSQL_PASSWORD = settings.MYSQL_PASSWORD
MYSQL_PORT = settings.MYSQL_PORT
MYSQL_DB = settings.MYSQL_DB
# BUG FIX: MYSQL_PORT was read from settings but never passed to connect(),
# so a non-default port was silently ignored.  charset/use_unicode are
# pinned explicitly because the crawled pages contain Chinese text.
cnx = mysql.connector.connect(host=MYSQL_HOST, port=MYSQL_PORT,
                              user=MYSQL_USER, password=MYSQL_PASSWORD,
                              database=MYSQL_DB, charset='utf8',
                              use_unicode=True)
# NOTE(review): one module-level connection/cursor is shared by every spider
# callback and pipeline for the whole crawl.  If the server drops the
# connection mid-crawl (wait_timeout, oversized packet), Connector/Python can
# desynchronise on the wire and raise errors like
# "IndexError: bytearray index out of range" from utils.read_lc_string —
# this matches the traceback above.  Consider cnx.ping(reconnect=True)
# before each query, or a connection per operation — TODO confirm.
cur = cnx.cursor(buffered=True)
class Sql:
    """Data-access helpers bound to the module-level `cnx`/`cur` objects.

    All methods are classmethods operating on the single shared cursor;
    inserts commit immediately, EXISTS-selects return the first row as a
    1-tuple of 0/1.
    """

    @classmethod
    def insert_dd_name(cls, xs_name, xs_author, category, name_id):
        """Persist one novel record (name, author, category, numeric id)."""
        sql = (
            'INSERT INTO dd_name(`xs_name`, `xs_author`, `category`, `name_id`)'
            ' VALUES (%(xs_name)s, %(xs_author)s, %(category)s, %(name_id)s)'
        )
        cur.execute(sql, {
            'xs_name': xs_name,
            'xs_author': xs_author,
            'category': category,
            'name_id': name_id,
        })
        cnx.commit()

    @classmethod
    def select_name(cls, name_id):
        """Return (1,) if a novel with this name_id exists, else (0,)."""
        cur.execute(
            "SELECT EXISTS(SELECT 1 FROM dd_name WHERE name_id=%(name_id)s)",
            {'name_id': name_id},
        )
        return cur.fetchall()[0]

    @classmethod
    def insert_dd_chaptername(cls, xs_chaptername, xs_content, id_name, num_id, url):
        """Persist one chapter (title, body, owning novel id, ordinal, url)."""
        sql = (
            'INSERT INTO dd_chaptername(`xs_chaptername`, `xs_content`, `id_name`, `num_id`, `url`)'
            ' VALUES (%(xs_chaptername)s, %(xs_content)s, %(id_name)s, %(num_id)s, %(url)s)'
        )
        cur.execute(sql, {
            'xs_chaptername': xs_chaptername,
            'xs_content': xs_content,
            'id_name': id_name,
            'num_id': num_id,
            'url': url,
        })
        cnx.commit()

    @classmethod
    def select_chapter(cls, url):
        """Return (1,) if a chapter with this url exists, else (0,)."""
        cur.execute(
            'SELECT EXISTS(SELECT 1 FROM dd_chaptername WHERE url=%(url)s)',
            {'url': url},
        )
        return cur.fetchall()[0]
这个错误提示也不够明显,不知道是哪里出了问题?谢谢。
作者:慧雅
链接:https://www.pythonheidong.com/blog/article/98012/45b485b03353f23ad2ac/
来源:python黑洞网
任何形式的转载都请注明出处,如有侵权 一经发现 必将追究其法律责任
昵称:
评论内容:(最多支持255个字符)
---无人问津也好,技不如人也罢,你都要试着安静下来,去做自己该做的事,而不是让内心的烦躁、焦虑,坏掉你本来就不多的热情和定力
Copyright © 2018-2021 python黑洞网 All Rights Reserved 版权所有,并保留所有权利。 京ICP备18063182号-1
投诉与举报,广告合作请联系vgs_info@163.com或QQ3083709327
免责声明:网站文章均由用户上传,仅供读者学习交流使用,禁止用做商业用途。若文章涉及色情,反动,侵权等违法信息,请向我们举报,一经核实我们会立即删除!