程序员最近都爱上了这个网站  程序员们快来瞅瞅吧!  it98k网:it98k.com

本站消息

站长简介/公众号

  出租广告位,需要合作请联系站长

+关注
已关注

分类  

暂无分类

标签  

暂无标签

日期归档  

2024-11(2)

带cookie爬取图片

发布于2020-03-18 12:02     阅读(1448)     评论(0)     点赞(29)     收藏(4)


因业务需要,自己写了一个爬取某宝、某猫商户图片的脚本,省得每次进网站还要复制粘贴半天图片。

自己记录一下。

import requests,re,os,time
from lxml import etree


class TBZT():
    """Download the description images of a Taobao item page.

    Constructing an instance prompts on stdin for a cookie and an item URL
    and fetches the item page. ``desc_Url`` then extracts the description
    images and saves them into a directory named after the item title.

    Example item URL:
    https://item.taobao.com/item.htm?spm=a230r.1.999.51.3de1523cBN9zex&id=606684613235&ns=1#detail
    """

    # Placeholder sent when the user just presses Enter at the cookie prompt.
    DEFAULT_COOKIE = '你的cookie'
    USER_AGENT = (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/78.0.3904.44 Safari/537.36'
    )

    def __init__(self):
        """Prompt for cookie and URL, then fetch the item page into ``self.req``.

        On a request failure the error is printed and ``self.req`` is left
        unset, as in the original script.
        """
        self.headers = self.cookiE()
        try:
            url = str(input('请输入淘宝的网址:'))
            # Must be a keyword argument: the second positional parameter of
            # requests.get() is ``params``, so passing the dict positionally
            # would put the cookie in the query string instead of the headers.
            self.req = requests.get(url, headers=self.headers)
        except Exception as e:
            print('网址错误', e)
            time.sleep(10)

    @staticmethod
    def _build_headers(cookie):
        """Return the request headers for *cookie* ('' selects the placeholder)."""
        return {
            'cookie': cookie if cookie != '' else TBZT.DEFAULT_COOKIE,
            'User-Agent': TBZT.USER_AGENT,
        }

    @staticmethod
    def _absolute_url(url):
        """Give a protocol-relative URL (``//host/path``) an ``https:`` scheme."""
        return 'https:' + url if url.startswith('//') else url

    def cookiE(self):
        """Prompt for a cookie string and return the request header dict.

        An empty answer selects the built-in placeholder cookie.
        """
        cookie = str(input('请输入淘宝的cookie:'))
        print('默认cookie' if cookie == '' else '自选cookie')
        # The pause used to sit after ``return`` in the default branch and
        # never ran; both branches now pause consistently.
        time.sleep(1)
        return self._build_headers(cookie)

    def desc_Url(self):
        """Locate the item's description page and download every image in it.

        Creates a directory named after the item title (the ``data-title``
        attribute of the ``#J_Title`` heading) and stores the images there as
        ``0.jpg``, ``1.jpg``, ...

        Raises:
            IndexError: if the title or the description URL is not found in
                the fetched page.
            SystemExit: with status 1 if the description has no image URLs.
        """
        page = self.req.text
        tree = etree.HTML(page)
        documentName = tree.xpath('//*[@id="J_Title"]/h3/@data-title')[0]
        os.makedirs(documentName, exist_ok=True)

        # The description URL is embedded in an inline script as the
        # non-http branch of a ``location.protocol`` ternary.
        rule = re.compile(r"location.protocol==='http:' \? '(.*?)' :")
        desc = re.findall(rule, page)[0]
        descUrl = 'https:{}'.format(desc)
        descHtml = requests.get(descUrl, headers=self.headers).text
        imgUrls = re.findall(re.compile(r'src="(.*?)" '), descHtml)
        if not imgUrls:
            print('noSrc!')
            time.sleep(2)
            # ``exit`` is injected by the ``site`` module and may be missing;
            # raising SystemExit is the equivalent that always works.
            raise SystemExit(1)
        for Index, imgurl in enumerate(imgUrls):
            self.download_Img(str(Index), imgurl, documentName)
            print(Index, imgurl)

    def download_Img(self, Index, imgurl, doucumentName):
        """Download *imgurl* and save it as ``<doucumentName>/<Index>.jpg``.

        Failures are reported on stdout and do not abort the overall run.
        """
        try:
            response = requests.get(self._absolute_url(imgurl))
            # os.path.join keeps the backslash on Windows and makes the
            # script usable on other platforms too.
            target = os.path.join(doucumentName, '{}.jpg'.format(Index))
            with open(target, 'wb') as f:
                f.write(response.content)
        except Exception as e:
            print('downloadError:', e)
            time.sleep(3)

class TMZT():
    """Download the description images of a Tmall item page.

    Constructing an instance prompts on stdin for a cookie and an item URL
    and fetches the item page. ``desc_Url`` then extracts the description
    images and saves them into a directory named after the item title.

    Example item URL:
    https://detail.tmall.com/item.htm?spm=a230r.1.999.191.599b523cjYsg4e&id=589451048907&ns=1
    """

    # Placeholder sent when the user just presses Enter at the cookie prompt.
    DEFAULT_COOKIE = '你的cookie'
    # NOTE(review): the original sends a different User-Agent with the
    # placeholder cookie than with a user-supplied one; kept as-is.
    DEFAULT_USER_AGENT = (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36 Edge/18.17763'
    )
    CUSTOM_USER_AGENT = (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/78.0.3904.44 Safari/537.36'
    )

    def __init__(self):
        """Prompt for cookie and URL, then fetch the item page into ``self.req``.

        On a request failure the error is printed and ``self.req`` is left
        unset, as in the original script.
        """
        self.headers = self.cookiE()
        try:
            url = str(input('请输入天猫的网址:'))
            # Must be a keyword argument: the second positional parameter of
            # requests.get() is ``params``, so passing the dict positionally
            # would put the cookie in the query string instead of the headers.
            self.req = requests.get(url, headers=self.headers)
        except Exception as e:
            print('网址错误', e)
            time.sleep(10)

    @staticmethod
    def _build_headers(cookie):
        """Return the request headers for *cookie* ('' selects the placeholder)."""
        if cookie == '':
            return {
                'cookie': TMZT.DEFAULT_COOKIE,
                'User-Agent': TMZT.DEFAULT_USER_AGENT,
            }
        return {
            'cookie': cookie,
            'User-Agent': TMZT.CUSTOM_USER_AGENT,
        }

    @staticmethod
    def _absolute_url(url):
        """Give a protocol-relative URL (``//host/path``) an ``https:`` scheme."""
        return 'https:' + url if url.startswith('//') else url

    def cookiE(self):
        """Prompt for a cookie string and return the request header dict.

        An empty answer selects the built-in placeholder cookie.
        """
        cookie = str(input('请输入天猫的cookie:'))
        print('默认cookie' if cookie == '' else '自选cookie')
        # The pause used to sit after ``return`` in the default branch and
        # never ran; both branches now pause consistently.
        time.sleep(1)
        return self._build_headers(cookie)

    def desc_Url(self):
        """Locate the item's description page and download every image in it.

        Creates a directory named after the item title (the ``h1`` text under
        ``#J_DetailMeta``) and stores the images there as ``0.jpg``,
        ``1.jpg``, ...

        Raises:
            IndexError: if the title or the ``descUrl`` field is not found in
                the fetched page.
            SystemExit: with status 1 if the description has no image URLs.
        """
        page = self.req.text
        tree = etree.HTML(page)
        documentName = tree.xpath(
            '//*[@id="J_DetailMeta"]/div[1]/div[1]/div/div[1]/h1/text()'
        )[0].strip()
        os.makedirs(documentName, exist_ok=True)

        # The description URL is taken from the "descUrl" field embedded in
        # the page source.
        rule = re.compile(r'"descUrl":"(.*?)",')
        desc = re.findall(rule, page)[0]
        descUrl = 'https:{}'.format(desc)
        descHtml = requests.get(descUrl, headers=self.headers).text
        imgUrls = re.findall(re.compile(r'img src="(.*?)" '), descHtml)
        if not imgUrls:
            print('noSrc!')
            time.sleep(2)
            # ``exit`` is injected by the ``site`` module and may be missing;
            # raising SystemExit is the equivalent that always works.
            raise SystemExit(1)
        for Index, imgurl in enumerate(imgUrls):
            self.download_Img(str(Index), imgurl, documentName)
            print(Index, imgurl)

    def download_Img(self, Index, imgurl, doucumentName):
        """Download *imgurl* and save it as ``<doucumentName>/<Index>.jpg``.

        Failures are reported on stdout and do not abort the overall run.
        """
        try:
            response = requests.get(self._absolute_url(imgurl))
            # os.path.join keeps the backslash on Windows and makes the
            # script usable on other platforms too.
            target = os.path.join(doucumentName, '{}.jpg'.format(Index))
            with open(target, 'wb') as f:
                f.write(response.content)
        except Exception as e:
            print('downloadError:', e)
            time.sleep(3)




if __name__ == '__main__':
    # Script entry point: ask which shop type to scrape and run the matching
    # downloader. Each mode accepts a mnemonic ('tb' / 'tm') or a
    # single-letter shortcut ('a' / 'b').
    model = input('淘宝找图输入"tb或a",天猫找图输入"tm或b": ')
    if model in ('tb', 'a'):
        print('淘宝模式')
        tbzt = TBZT()
        tbzt.desc_Url()
    elif model in ('tm', 'b'):
        print('天猫模式')
        tmzt = TMZT()
        tmzt.desc_Url()
    else:
        print('输入错误,关闭')
        # Leave the message on screen briefly before exiting.
        time.sleep(5)
        # ``exit`` comes from the ``site`` module and is not always present;
        # SystemExit is the equivalent that always works.
        raise SystemExit(1)


所属网站分类: 技术文章 > 博客

作者:战天

链接:https://www.pythonheidong.com/blog/article/265485/6659c2cd42787fa4f267/

来源:python黑洞网

任何形式的转载都请注明出处,如有侵权 一经发现 必将追究其法律责任

29 0
收藏该文
已收藏

评论内容:(最多支持255个字符)