
Crawling Douyin user fans, followings, and videos with Scrapy (x-gorgon and stub signature algorithms included)

Published on 2020-06-04 22:45


Straight to the code.

UserInfoSpider.py

# -*- coding: utf-8 -*-

import json
import jieba
import random
import scrapy
from urllib import parse
from douyin.items import DouyinUserInfoItem
from douyin.spiders.Verification import getXGon, encryption, x_gorgon, str_to_byte, time_stamp
from douyin.devices_data import DevicesInfo


class UserInfoSpider(scrapy.Spider):
    name = 'userinfo'
    allowed_domains = []

    tag_list = [
        '游戏', '美食', '汽车', '电视剧', '美女', '国际', '摄影', '音乐',
        '数码', '体育', '搞笑', '军事', '科普', '养生', '减肥', '宠物',
        '健康', '艺术', '公益', '王者荣耀', '吃鸡', '绝地求生', '舞蹈',
        '健身', '爱情', '恋爱', '感情', '日常', '艺术', '宝宝', '风景',
        '国防', '模特', '平面', '设计', '护士', '印度', '拍照', '摄影',
        '交通', '女司机',
    ]
    tag = ''
    jieba_data = []

    count = 10
    cursor = 0

    cookies = ""
    x_tt_token = ""
    device_data = DevicesInfo.oppo_R17

    def headers(self, params, cookies='', x_tt_token=''):
        ts = time_stamp()                      # Unix timestamp, sent as X-Khronos
        stub = encryption(params)              # request-body digest, sent as X-SS-STUB
        s = getXGon(params, stub)              # build the string to be signed
        gorgon = x_gorgon(ts, str_to_byte(s))  # compute the X-Gorgon signature
        header = {
            "Accept-Encoding": "gzip",
            "Connection": "Keep-Alive",
            'X-SS-STUB': stub,
            'X-SS-REQ-TICKET': time_stamp(1000),
            'sdk-version': "1",
            'X-Gorgon': gorgon,
            "Cookie": cookies,
            "X-tt-token": x_tt_token,
            'X-Khronos': ts,
            'Content-Type': "application/x-www-form-urlencoded; charset=UTF-8",
            'Host': "api.amemv.com",
            'User-Agent': "okhttp/3.10.0.1",
            'cache-control': "no-cache",
        }
        return header

    def data(self, tag):
        data = {
            "cursor": str(self.cursor),
            "keyword": tag,
            "count": str(self.count),
            "type": "1", "is_pull_refresh": "1", "hot_search": "0",
            "search_source": "", "search_id": "", "query_correct_type": "1"
        }
        return data

    def start_requests(self):
        """ spider启动执行,只会执行者一次 """
        url = "https://aweme.snssdk.com/aweme/v1/discover/search/?" + self.device_data
        headers = self.headers(params=self.device_data, cookies=self.cookies, x_tt_token=self.x_tt_token)
        self.tag = random.choice(self.tag_list)
        formdata = self.data(self.tag)
        yield scrapy.FormRequest(url=url, headers=headers, formdata=formdata, callback=self.parse_sec_id,
                                 dont_filter=True)
        # time.sleep(random.randint(1, 3))

    def parse_sec_id(self, response):
        """ Extract each user's sec_uid from the user list and generate new requests. """
        response_json = json.loads(response.text)  # json.loads dropped the encoding kwarg in Python 3.9
        print(response_json)
        user_info = response_json['user_list']
        # A non-empty user list means this keyword still has users left to fetch
        if user_info:
            for i in user_info:
                sec_uid = i['user_info']['sec_uid']
                params = {
                    'sec_user_id': sec_uid,
                    'address_book_access': "1",
                    'retry_type': 'no_retry',
                    'iid': '2576896581967031',
                    'device_id': '3491689742481495',
                    'ac': 'wifi',
                    'channel': 'tengxun_new',
                    'aid': '1128',
                    'app_name': 'aweme',
                    'version_code': '830',
                    'version_name': '8.3.0',
                    'device_platform': 'android',
                    'ssmix': 'a',
                    'device_type': 'MI+5s',
                    'device_brand': 'Xiaomi',
                    'language': 'zh',
                    'os_api': '23',
                    'os_version': '6.0.1',
                    'uuid': '300000000087236',
                    'openudid': '2ca3305f4119dd89',
                    'manifest_version_code': '830',
                    'resolution': '810*1440',
                    'dpi': '270',
                    'update_version_code': '8302',
                    '_rticket': '1590995035033',
                    'mcc_mnc': '46003',
                    'ts': '1590995033',
                    'app_type': 'normal'
                }
                url_encode = parse.urlencode(params)
                url = "https://aweme-eagle.snssdk.com/aweme/v1/user/?" + url_encode
                headers = self.headers(params=url_encode, cookies=self.cookies, x_tt_token=self.x_tt_token)
                # Request the user-info endpoint with this sec_uid and hand the response to parse_user_info
                yield scrapy.Request(
                    url=url, headers=headers,
                    body=url_encode, callback=self.parse_user_info,
                )
            # After this page is processed, advance the cursor and request the next page
            # of the user list; the cursor is the running total of users fetched so far
            self.cursor += len(user_info)

        else:
            # This keyword's user list is exhausted; pick a new keyword at random
            # from the jieba tokens (fall back to tag_list while none are collected yet)
            self.tag = random.choice(self.jieba_data or self.tag_list)
            self.logger.info("Switching keyword: " + self.tag)
            self.cursor = 0
        url = "https://aweme.snssdk.com/aweme/v1/discover/search/?" + self.device_data
        headers = self.headers(params=self.device_data, cookies=self.cookies, x_tt_token=self.x_tt_token)
        formdata = self.data(self.tag)
        # Re-issue the search request with the updated cursor and parse it with parse_sec_id again
        yield scrapy.FormRequest(url=url, headers=headers, formdata=formdata, callback=self.parse_sec_id)

    def parse_following(self, response):
        """ Parse a user's following list """
        pass

    def parse_fans(self, response):
        """ Parse a user's fan list """
        print("Fan-list response:", response.text)

    def parse_user_info(self, response):
        """ Extract the user's profile fields """
        # If jieba_data has grown to 100+ tokens, drop the oldest 50
        if len(self.jieba_data) >= 100:
            self.jieba_data = self.jieba_data[50:]

        user_info_item = DouyinUserInfoItem()
        response_json = json.loads(response.text)  # encoding kwarg removed in Python 3.9
        user_info_data = response_json['user']

        user_info_item['uid'] = str(user_info_data['uid'])  # uid
        user_info_item['sec_uid'] = user_info_data['sec_uid']  # sec_uid
        user_info_item['nickname'] = user_info_data['nickname']  # nickname

        user_info_item['province'] = 'null'
        if user_info_data['province']:
            user_info_item['province'] = user_info_data['province']

        # Check whether birthday data exists
        user_info_item['birthday'] = 'null'  # birthday
        if user_info_data['birthday']:
            user_info_item['birthday'] = user_info_data['birthday']

        user_info_item['city'] = 'null'
        if user_info_data['city']:
            user_info_item['city'] = user_info_data['city']

        user_info_item['location'] = 'null'
        # Only keep the location if the user has not hidden it
        if not user_info_data['hide_location']:
            user_info_item['location'] = user_info_data['location']

        user_info_item['fans_count'] = str(user_info_data['mplatform_followers_count'])  # fan count
        user_info_item['following_count'] = str(user_info_data['following_count'])  # following count
        user_info_item['total_favorited'] = str(user_info_data['total_favorited'])  # total likes received
        user_info_item['aweme_count'] = str(user_info_data['aweme_count'])  # video count
        user_info_item['avatar_thumb'] = user_info_data['avatar_thumb']["url_list"][0]  # avatar
        user_info_item['signature'] = 'null'
        if user_info_data['signature']:
            user_info_item['signature'] = user_info_data['signature']  # signature
            # Tokenize the signature with jieba and keep the tokens as
            # candidate keywords for later search requests
            m = user_info_item['signature']
            d = jieba.lcut(m)
            for i in d:
                self.jieba_data.append(i)
        user_info_item['classify'] = parse.unquote(self.tag)  # the keyword this user was found under
        yield user_info_item


if __name__ == '__main__':
    from scrapy import cmdline

    cmdline.execute("scrapy crawl userinfo".split())
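
The post only includes the spider itself; two of its imports can be reconstructed from how they are used above. First, a minimal sketch of douyin/items.py, assuming the item carries exactly the fields parse_user_info fills in:

# douyin/items.py (minimal sketch)
import scrapy


class DouyinUserInfoItem(scrapy.Item):
    uid = scrapy.Field()
    sec_uid = scrapy.Field()
    nickname = scrapy.Field()
    province = scrapy.Field()
    birthday = scrapy.Field()
    city = scrapy.Field()
    location = scrapy.Field()
    fans_count = scrapy.Field()
    following_count = scrapy.Field()
    total_favorited = scrapy.Field()
    aweme_count = scrapy.Field()
    avatar_thumb = scrapy.Field()
    classify = scrapy.Field()
    signature = scrapy.Field()

Second, devices_data.py is not shown either. Judging by how DevicesInfo.oppo_R17 is both appended to the search URL and signed as the request parameters, it is presumably a module of ready-made device query strings; a hypothetical shape, with the captured values elided:

# douyin/devices_data.py (hypothetical shape; real values must come from a capture)
class DevicesInfo:
    # Each attribute holds a url-encoded query string of device parameters
    # (iid, device_id, openudid, ...) captured from a real phone.
    oppo_R17 = "iid=...&device_id=...&ac=wifi&aid=1128&app_name=aweme&..."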

If you need the full signature algorithm, add WeChat xiaobom9 with the note 抖音 (Douyin).
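
The signing module douyin/spiders/Verification.py is the piece distributed separately (see the contact above). getXGon and x_gorgon cannot be reconstructed from this post, but the simpler helpers can be sketched under common assumptions: X-SS-STUB is typically the uppercase hex MD5 of the raw POST body, and X-Khronos a Unix timestamp in seconds. Treat these as placeholders, not the author's implementation:

# douyin/spiders/Verification.py (partial, hypothetical sketch)
import hashlib
import time


def time_stamp(scale=1):
    # X-Khronos appears to be a Unix timestamp in seconds;
    # time_stamp(1000) then yields milliseconds for X-SS-REQ-TICKET.
    return str(int(time.time() * scale))


def encryption(params):
    # Assumed stub algorithm: uppercase hex MD5 of the raw request body.
    return hashlib.md5(params.encode('utf-8')).hexdigest().upper()


def str_to_byte(s):
    # Assumed to be a plain UTF-8 encode of the string handed to x_gorgon.
    return s.encode('utf-8')

# getXGon(params, stub) and x_gorgon(ts, data) are intentionally omitted.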

For study and reference only; contact for removal in case of infringement.

Original article: https://blog.csdn.net/weixin_41257505/article/details/106527835


