带cookie爬取图片-python黑洞网

本站消息

站长简介/公众号

出租广告位,需要合作请联系站长

战天

746

文章

624167

访问

+关注

分类

暂无分类

日期归档

2024-11(2)

带cookie爬取图片

发布于2020-03-18 12:02 阅读(1572) 评论(0) 点赞(29) 收藏(4)

业务需要，自己做了个爬某宝、某猫商户图片的脚本，省得进个网站还要复制粘贴半天的图片。

自己记录一下。

import requests,re,os,time
from lxml import etree


class TBZT():
    """Download the detail-description images of a Taobao item page.

    Construction is interactive: it prompts for a cookie, then for the item
    URL (e.g. https://item.taobao.com/item.htm?id=606684613235), and fetches
    the page. Call ``desc_Url()`` afterwards to save the images into a
    directory named after the item title.
    """

    USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.44 Safari/537.36'

    def __init__(self):
        self.headers = self.cookiE()
        # Stays None when the page request fails, so desc_Url() can return
        # early instead of failing on a missing attribute.
        self.req = None
        try:
            url = str(input('请输入淘宝的网址：'))
            # headers must be passed by keyword: the second positional
            # argument of requests.get() is ``params`` (the query string).
            self.req = requests.get(url, headers=self.headers)
        except Exception as e:
            print('网址错误', e)
            time.sleep(10)

    def cookiE(self):
        """Prompt for a cookie and return the request headers dict.

        An empty answer selects the built-in placeholder cookie; any other
        answer is used verbatim as the ``cookie`` header.
        """
        cookie = str(input('请输入淘宝的cookie：'))
        if cookie == '':
            print('默认cookie')
            return {
                'cookie': '你的cookie',
                'User-Agent': self.USER_AGENT,
            }
        print('自选cookie')
        time.sleep(1)
        return {
            'cookie': cookie,
            'User-Agent': self.USER_AGENT,
        }

    def desc_Url(self):
        """Locate the item's description page and download every image in it.

        Creates a directory named after the item title (if missing) and
        stores the images there as ``0.jpg``, ``1.jpg``, ...

        Raises:
            SystemExit: with code 1 when the title, the description URL or
                the image list cannot be extracted from the fetched pages.
        """
        if self.req is None:
            # The page request already failed and was reported in __init__.
            return

        req = self.req.text
        text = etree.HTML(req)
        titles = text.xpath('//*[@id="J_Title"]/h3/@data-title')
        if not titles:
            print('noTitle!')
            time.sleep(2)
            raise SystemExit(1)
        documentName = titles[0]
        os.makedirs(documentName, exist_ok=True)

        # The item page embeds the protocol-relative description URL in an
        # inline script of the form: location.protocol==='http:' ? '...' :
        rule = re.compile(r"location.protocol==='http:' \? '(.*?)' :")
        descs = re.findall(rule, req)
        if not descs:
            print('noDescUrl!')
            time.sleep(2)
            raise SystemExit(1)
        descUrl = 'https:{}'.format(descs[0])

        descUrls = requests.get(descUrl, headers=self.headers).text
        rule = re.compile(r'src="(.*?)" ')
        imgUrls = re.findall(rule, descUrls)
        if not imgUrls:
            print('noSrc!')
            time.sleep(2)
            # exit() is a helper injected by the site module and may be
            # missing outside the interactive interpreter; raise directly.
            raise SystemExit(1)

        for Index, imgurl in enumerate(imgUrls):
            self.download_Img(str(Index), imgurl, documentName)
            print(Index, imgurl)

    def download_Img(self, Index, imgurl, doucumentName):
        """Fetch ``imgurl`` and save it as ``<doucumentName>/<Index>.jpg``.

        Args:
            Index: file name stem, as a string.
            imgurl: URL of the image to download.
            doucumentName: existing directory that receives the file.

        Download or write failures are reported on stdout and do not abort
        the remaining downloads.
        """
        try:
            imgUrl = requests.get(imgurl)
            # Do not save an HTTP error page under a .jpg name.
            imgUrl.raise_for_status()
            # os.path.join picks the right separator on every platform; the
            # previous hard-coded backslash only worked on Windows.
            with open(os.path.join(doucumentName, Index + '.jpg'), 'wb') as f:
                f.write(imgUrl.content)
        except Exception as e:
            print('downloadError：', e)
            time.sleep(3)

class TMZT():
    """Download the detail-description images of a Tmall item page.

    Construction is interactive: it prompts for a cookie, then for the item
    URL (e.g. https://detail.tmall.com/item.htm?id=589451048907), and fetches
    the page. Call ``desc_Url()`` afterwards to save the images into a
    directory named after the item title.
    """

    def __init__(self):
        self.headers = self.cookiE()
        # Stays None when the page request fails, so desc_Url() can return
        # early instead of failing on a missing attribute.
        self.req = None
        try:
            url = str(input('请输入天猫的网址：'))
            # headers must be passed by keyword: the second positional
            # argument of requests.get() is ``params`` (the query string).
            self.req = requests.get(url, headers=self.headers)
        except Exception as e:
            print('网址错误', e)
            time.sleep(10)

    def cookiE(self):
        """Prompt for a cookie and return the request headers dict.

        An empty answer selects the built-in placeholder cookie; any other
        answer is used verbatim as the ``cookie`` header. The two cases use
        different User-Agent strings.
        """
        cookie = str(input('请输入天猫的cookie：'))
        if cookie == '':
            print('默认cookie')
            return {
                'cookie': '你的cookie',
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36 Edge/18.17763',
            }
        print('自选cookie')
        time.sleep(1)
        return {
            'cookie': cookie,
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.44 Safari/537.36',
        }

    def desc_Url(self):
        """Locate the item's description page and download every image in it.

        Creates a directory named after the item title (if missing) and
        stores the images there as ``0.jpg``, ``1.jpg``, ...

        Raises:
            SystemExit: with code 1 when the title, the description URL or
                the image list cannot be extracted from the fetched pages.
        """
        if self.req is None:
            # The page request already failed and was reported in __init__.
            return

        req = self.req.text
        text = etree.HTML(req)
        titles = text.xpath('//*[@id="J_DetailMeta"]/div[1]/div[1]/div/div[1]/h1/text()')
        if not titles:
            print('noTitle!')
            time.sleep(2)
            raise SystemExit(1)
        documentName = titles[0].strip()
        os.makedirs(documentName, exist_ok=True)

        # The item page carries the protocol-relative description URL in a
        # JSON fragment of the form: "descUrl":"...",
        rule = re.compile(r'"descUrl":"(.*?)",')
        descs = re.findall(rule, req)
        if not descs:
            print('noDescUrl!')
            time.sleep(2)
            raise SystemExit(1)
        descUrl = 'https:{}'.format(descs[0])

        descUrls = requests.get(descUrl, headers=self.headers).text
        rule = re.compile(r'img src="(.*?)" ')
        imgUrls = re.findall(rule, descUrls)
        if not imgUrls:
            print('noSrc!')
            time.sleep(2)
            # exit() is a helper injected by the site module and may be
            # missing outside the interactive interpreter; raise directly.
            raise SystemExit(1)

        for Index, imgurl in enumerate(imgUrls):
            self.download_Img(str(Index), imgurl, documentName)
            print(Index, imgurl)

    def download_Img(self, Index, imgurl, doucumentName):
        """Fetch ``imgurl`` and save it as ``<doucumentName>/<Index>.jpg``.

        Args:
            Index: file name stem, as a string.
            imgurl: URL of the image to download.
            doucumentName: existing directory that receives the file.

        Download or write failures are reported on stdout and do not abort
        the remaining downloads.
        """
        try:
            imgUrl = requests.get(imgurl)
            # Do not save an HTTP error page under a .jpg name.
            imgUrl.raise_for_status()
            # os.path.join picks the right separator on every platform; the
            # previous hard-coded backslash only worked on Windows.
            with open(os.path.join(doucumentName, Index + '.jpg'), 'wb') as f:
                f.write(imgUrl.content)
        except Exception as e:
            print('downloadError：', e)
            time.sleep(3)




if __name__ == '__main__':
    # Interactive entry point: ask which site to scrape, then run the
    # matching scraper. 'tb'/'a' select Taobao, 'tm'/'b' select Tmall.
    model = input('淘宝找图输入"tb或a",天猫找图输入"tm或b": ')
    if model in ('tb', 'a'):
        print('淘宝模式')
        tbzt = TBZT()
        tbzt.desc_Url()
    elif model in ('tm', 'b'):
        print('天猫模式')
        tmzt = TMZT()
        tmzt.desc_Url()
    else:
        print('输入错误,关闭')
        # Pause before exiting so the message can be read.
        time.sleep(5)
        # exit() is a helper injected by the site module and may be missing
        # outside the interactive interpreter; raise SystemExit directly.
        raise SystemExit(1)