易妖游戏网
您的当前位置:首页 > requests爬百度,抖音,微博热榜

requests爬百度,抖音,微博热榜

来源:易妖游戏网

requests爬百度,抖音,微博热榜

百度热榜

def get_baidu_hot():
    """Scrape Baidu's real-time hot board and save it to ``baidu.json``.

    The board page is fetched with ``requests``, parsed with ``lxml`` and
    each card under the board container is turned into a dictionary with
    the keys ``ranking``, ``url``, ``text`` and ``hot_heat``.

    Returns:
        The list of dictionaries that was written to ``baidu.json``.

    Raises:
        requests.RequestException: If the request fails, times out or the
            server answers with an HTTP error status.
    """
    url = "https://top.baidu.com/board?tab=realtime"
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.92 Safari/537.36'
    }

    # Without a timeout a stalled connection would block the caller forever.
    response = requests.get(url, headers=headers, timeout=10)
    # Fail fast instead of parsing an error page into an empty result.
    response.raise_for_status()
    html = etree.HTML(response.text)
    content = html.xpath('//*[@id="sanRoot"]/main/div[2]/div/div[2]/div')

    # Compiled once, outside the loop.
    # NOTE(review): the title filter keeps CJK characters only, so Latin
    # letters and digits inside a title are dropped - confirm this is intended.
    pattern_chinese = re.compile(r'[\u4e00-\u9fff]+')
    # Digits only: the heat value is converted with int(), so any stray
    # label text must not be kept.
    pattern_digits = re.compile(r'[0-9]+')

    data = []
    for con in content:
        title_nodes = con.xpath('div[2]/a/div[1]/text()')
        link_nodes = con.xpath('a/@href')
        if not title_nodes or not link_nodes:
            # Skip cards that do not have the expected title/link structure
            # rather than aborting the whole scrape with an IndexError.
            continue

        chinese_text = ''.join(pattern_chinese.findall(title_nodes[0]))
        hot_url = link_nodes[0]

        heat_nodes = con.xpath('div[1]/div[2]/text()')
        heat_digits = ''.join(pattern_digits.findall(heat_nodes[0])) if heat_nodes else ''
        hot_heat = int(heat_digits) if heat_digits else 0

        # One dictionary per board entry; ranking is the 1-based position
        # among the entries that were actually extracted.
        data.append({
            "ranking": len(data) + 1,
            "url": hot_url,
            "text": chinese_text,
            "hot_heat": hot_heat
        })

    with open('baidu.json', 'w', encoding='utf-8') as file:
        json.dump(data, file, ensure_ascii=False, indent=4)
    return data

抖音热榜

def get_douyin_hot(cookie=""):
    """Fetch Douyin's hot-search list and save it to ``douyin.json``.

    The hot-search JSON endpoint is queried with a fixed set of browser-like
    query parameters and headers. Every entry of ``data.word_list`` in the
    response becomes a dictionary with the keys ``ranking``, ``url``,
    ``text`` and ``hot_heat``.

    Args:
        cookie: Optional value for the ``cookie`` request header. When empty
            (the default) no cookie header is sent.

    Returns:
        The list of dictionaries that was written to ``douyin.json``.

    Raises:
        requests.RequestException: If the request fails, times out or the
            server answers with an HTTP error status.
        json.JSONDecodeError: If the response body is not valid JSON.
        KeyError: If the response has no ``data.word_list`` entry.
    """
    from urllib.parse import quote

    url = "https://www.douyin.com/aweme/v1/web/hot/search/list/"
    params = {
        "device_platform": "webapp",
        "aid": "6383",
        "channel": "channel_pc_web",
        "detail_list": "1",
        "source": "6",
        "main_billboard_count": "5",
        "pc_client_type": "1",
        "version_code": "170400",
        "version_name": "17.4.0",
        "cookie_enabled": "true",
        "screen_width": "1920",
        "screen_height": "1080",
        "browser_language": "zh-CN",
        "browser_platform": "Win32",
        "browser_name": "Chrome",
        "browser_version": "119.0.0.0",
        "browser_online": "true",
        "engine_name": "Blink",
        "engine_version": "119.0.0.0",
        "os_name": "Windows",
        "os_version": "10",
        "cpu_core_num": "8",
        "device_memory": "8",
        "platform": "PC",
        "downlink": "10",
        "effective_type": "4g",
        "round_trip_time": "50",
        "webid": "72990928502883875",
        "msToken": "rPSaiuDrWOdu4pFm29n4Ql_FAFDDP_-WTqt7n2A1yztcaA0bXBMyEGOebIG25Bj48gfrOiYTu_TpOT5D2qjY6epk57pJF0jCIrkInSGXW3up82Jc7WVJNa1HfNnVZ2M=",
        "X-Bogus": "DFSzswVL0EtANxxXtulm9YXAIQRw"
    }
    headers = {
        "authority": "www.douyin.com",
        "accept": "application/json, text/plain, */*",
        "accept-language": "zh-CN,zh;q=0.9",
        "referer": "https://www.douyin.com/discover",
        "sec-ch-ua": '"Google Chrome";v="119", "Chromium";v="119", "Not?A_Brand";v="24"',
        "sec-ch-ua-mobile": "?0",
        "sec-ch-ua-platform": '"Windows"',
        "sec-fetch-dest": "empty",
        "sec-fetch-mode": "cors",
        "sec-fetch-site": "same-origin",
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win; x) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36"
    }
    if cookie:
        # Supply your own cookie through the parameter instead of editing
        # the header table.
        headers["cookie"] = cookie

    # Without a timeout a stalled connection would block the caller forever.
    response = requests.get(url, headers=headers, params=params, timeout=10)
    # Fail fast on HTTP errors instead of trying to decode an error page.
    response.raise_for_status()
    word_list = json.loads(response.text)['data']['word_list']

    data = []
    for index, work in enumerate(word_list, start=1):
        work_text = work["word"]
        data.append({
            "ranking": index,
            # Percent-encode the topic so spaces, '#', '?' and similar
            # characters cannot break the URL path.
            "url": 'https://www.douyin.com/hot/15080/' + quote(work_text, safe=''),
            "text": work_text,
            "hot_heat": work["hot_value"]
        })

    with open('douyin.json', 'w', encoding='utf-8') as file:
        json.dump(data, file, ensure_ascii=False, indent=4)
    return data

微博热榜

def get_weibo_hot(cookie=""):
    """Scrape Weibo's real-time hot-search page and save it to ``weibo.json``.

    The summary page is fetched with ``requests`` and scanned with a regular
    expression for links of the form ``<a href="/weibo?...">title</a>``,
    each optionally followed by a ``<span>`` holding the heat text. Every
    match becomes a dictionary with the keys ``ranking``, ``url``, ``text``
    and ``hot_heat``.

    Args:
        cookie: Optional value for the ``cookie`` request header. When empty
            (the default) no cookie header is sent.

    Returns:
        The list of dictionaries that was written to ``weibo.json``.
        ``hot_heat`` is the raw text of the entry's ``<span>``, or the
        integer ``0`` when the entry has no such span.

    Raises:
        requests.RequestException: If the request fails, times out or the
            server answers with an HTTP error status.
    """
    from urllib.parse import quote

    url = 'https://s.weibo.com/top/summary?cate=realtimehot'
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win; x) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36'
    }
    if cookie:
        # Supply your own cookie through the parameter instead of editing
        # the header table.
        headers['cookie'] = cookie

    # Without a timeout a stalled connection would block the caller forever.
    response = requests.get(url, headers=headers, timeout=10)
    # Fail fast on HTTP errors instead of scanning an error page.
    response.raise_for_status()
    rep_text = response.text

    # Title and heat are captured by ONE pattern so they can never get out
    # of step: the heat <span> is optional and belongs to the link it
    # directly follows.
    pattern = re.compile(
        r'<a href="/weibo\?.*?>(.*?)</a>(?:\s*<span>\s*(.*?)\s*</span>)?'
    )

    data = []
    for index, match in enumerate(pattern.finditer(rep_text), start=1):
        host_text, heat = match.groups()
        data.append({
            "ranking": index,
            # Percent-encode the topic so spaces, '&', '#' and similar
            # characters cannot break the query string.
            "url": 'https://s.weibo.com/weibo?q=%23' + quote(host_text, safe=''),
            "text": host_text,
            # Entries without a heat span report 0.
            "hot_heat": heat if heat is not None else 0
        })

    with open('weibo.json', 'w', encoding='utf-8') as file:
        json.dump(data, file, ensure_ascii=False, indent=4)
    return data

因篇幅问题不能全部显示,请点此查看更多更全内容