# 使用 requests 爬取百度、抖音、微博热榜
# 百度热榜
def get_baidu_hot(timeout=10):
    """Scrape Baidu's realtime hot-search board and save it to ``baidu.json``.

    Args:
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        A list of dicts, one per board card that has a title, a link and a
        numeric heat value. Keys: ``ranking`` (1-based position of the card
        on the page), ``url``, ``text`` (only the CJK characters of the
        title) and ``hot_heat`` (int). The same list is written to
        ``baidu.json`` as UTF-8 JSON.

    Raises:
        requests.RequestException: On timeout, connection failure or a
            non-2xx HTTP status.
    """
    url = "https://top.baidu.com/board?tab=realtime"
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.92 Safari/537.36'
    }
    # Without a timeout requests may block indefinitely on a stalled server.
    response = requests.get(url, headers=headers, timeout=timeout)
    # Fail loudly instead of overwriting baidu.json with an empty list
    # parsed from an error page.
    response.raise_for_status()

    html = etree.HTML(response.text)
    # etree.HTML returns None for an empty document.
    if html is None:
        content = []
    else:
        content = html.xpath('//*[@id="sanRoot"]/main/div[2]/div/div[2]/div')

    # Compiled once; both patterns are loop-invariant.
    title_pattern = re.compile(r'[\u4e00-\u9fff]+')
    # Digits only: the value is passed to int(), so letting CJK characters
    # through (as the previous pattern did) could only end in ValueError.
    heat_pattern = re.compile(r'[0-9]+')

    data = []
    for ranking, con in enumerate(content, start=1):
        titles = con.xpath('div[2]/a/div[1]/text()')
        links = con.xpath('a/@href')
        heats = con.xpath('div[1]/div[2]/text()')
        # A card missing any field is skipped rather than aborting the whole
        # scrape with IndexError.
        if not (titles and links and heats):
            continue
        heat_digits = ''.join(heat_pattern.findall(heats[0]))
        if not heat_digits:
            continue
        # NOTE(review): the title keeps CJK characters only, so digits and
        # Latin letters in a headline are dropped. Kept as-is because
        # consumers may rely on it; confirm this is intended.
        data.append({
            "ranking": ranking,
            "url": links[0],
            "text": ''.join(title_pattern.findall(titles[0])),
            "hot_heat": int(heat_digits),
        })

    with open('baidu.json', 'w', encoding='utf-8') as file:
        json.dump(data, file, ensure_ascii=False, indent=4)
    return data
# 抖音热榜
def get_douyin_hot(timeout=10):
    """Fetch Douyin's hot-search list and save it to ``douyin.json``.

    Args:
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        A list of dicts in API order with keys ``ranking`` (1-based),
        ``url``, ``text`` (the entry's ``word``) and ``hot_heat`` (the
        entry's ``hot_value``). The same list is written to ``douyin.json``
        as UTF-8 JSON.

    Raises:
        requests.RequestException: On timeout, connection failure or a
            non-2xx HTTP status.
        json.JSONDecodeError: If the response body is not valid JSON.
        KeyError: If the payload lacks ``data.word_list`` or an entry lacks
            ``word`` / ``hot_value``.
    """
    url = "https://www.douyin.com/aweme/v1/web/hot/search/list/"
    # NOTE(review): webid, msToken and X-Bogus are hard-coded values that
    # look like session/signature tokens; presumably they expire and must be
    # refreshed from a live browser session. Confirm before relying on this.
    params = {
        "device_platform": "webapp",
        "aid": "6383",
        "channel": "channel_pc_web",
        "detail_list": "1",
        "source": "6",
        "main_billboard_count": "5",
        "pc_client_type": "1",
        "version_code": "170400",
        "version_name": "17.4.0",
        "cookie_enabled": "true",
        "screen_width": "1920",
        "screen_height": "1080",
        "browser_language": "zh-CN",
        "browser_platform": "Win32",
        "browser_name": "Chrome",
        "browser_version": "119.0.0.0",
        "browser_online": "true",
        "engine_name": "Blink",
        "engine_version": "119.0.0.0",
        "os_name": "Windows",
        "os_version": "10",
        "cpu_core_num": "8",
        "device_memory": "8",
        "platform": "PC",
        "downlink": "10",
        "effective_type": "4g",
        "round_trip_time": "50",
        "webid": "72990928502883875",
        "msToken": "rPSaiuDrWOdu4pFm29n4Ql_FAFDDP_-WTqt7n2A1yztcaA0bXBMyEGOebIG25Bj48gfrOiYTu_TpOT5D2qjY6epk57pJF0jCIrkInSGXW3up82Jc7WVJNa1HfNnVZ2M=",
        "X-Bogus": "DFSzswVL0EtANxxXtulm9YXAIQRw"
    }
    headers = {
        "authority": "www.douyin.com",
        "accept": "application/json, text/plain, */*",
        "accept-language": "zh-CN,zh;q=0.9",
        "referer": "https://www.douyin.com/discover",
        "sec-ch-ua": '"Google Chrome";v="119", "Chromium";v="119", "Not?A_Brand";v="24"',
        "sec-ch-ua-mobile": "?0",
        "sec-ch-ua-platform": '"Windows"',
        "sec-fetch-dest": "empty",
        "sec-fetch-mode": "cors",
        "sec-fetch-site": "same-origin",
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win; x) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36"
    }
    # Without a timeout requests may block indefinitely on a stalled server.
    response = requests.get(url, headers=headers, params=params, timeout=timeout)
    # Surface HTTP errors directly rather than as a confusing JSON/KeyError.
    response.raise_for_status()

    word_list = json.loads(response.text)['data']['word_list']
    # NOTE(review): every URL uses the same hard-coded `15080` path segment
    # followed by the raw (un-encoded) word; confirm this is the intended
    # link format.
    data = [
        {
            "ranking": ranking,
            "url": 'https://www.douyin.com/hot/15080/' + work["word"],
            "text": work["word"],
            "hot_heat": work["hot_value"],
        }
        for ranking, work in enumerate(word_list, start=1)
    ]

    with open('douyin.json', 'w', encoding='utf-8') as file:
        json.dump(data, file, ensure_ascii=False, indent=4)
    return data
# 微博热榜
def get_weibo_hot(cookie='', timeout=10):
    """Scrape Weibo's realtime hot-search page and save it to ``weibo.json``.

    Args:
        cookie: Value for the ``cookie`` request header. The header is only
            sent when this is non-empty.
            NOTE(review): the original left the cookie value blank, which
            suggests the page needs a logged-in/visitor cookie; confirm.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        A list of dicts in page order with keys ``ranking`` (1-based),
        ``url``, ``text`` (inner text of each ``/weibo?`` link) and
        ``hot_heat``. ``hot_heat`` is the stripped text of the ``<span>``
        that directly follows the link (a str), or the int ``0`` when the
        link has no such ``<span>``. The same list is written to
        ``weibo.json`` as UTF-8 JSON.

    Raises:
        requests.RequestException: On timeout, connection failure or a
            non-2xx HTTP status.
    """
    url = 'https://s.weibo.com/top/summary?cate=realtimehot'
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win; x) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36'
    }
    if cookie:
        headers['cookie'] = cookie

    # Without a timeout requests may block indefinitely on a stalled server.
    response = requests.get(url, headers=headers, timeout=timeout)
    response.raise_for_status()
    rep_text = response.text

    # One pattern captures the title and, optionally, the heat <span> right
    # after the link. Previously titles and heats were collected by two
    # separate regexes and paired by position after prepending a single 0,
    # so any additional link without a <span> shifted every later heat onto
    # the wrong title.
    entry_pattern = re.compile(
        r'<a href="/weibo\?.*?>(.*?)</a>'
        r'(?:\s*<span>\s*(.*?)\s*</span>)?'
    )

    data = []
    for ranking, match in enumerate(entry_pattern.finditer(rep_text), start=1):
        title = match.group(1)
        heat = match.group(2)
        data.append({
            "ranking": ranking,
            "url": 'https://s.weibo.com/weibo?q=%23' + title,
            "text": title,
            # group(2) is None only when the optional <span> part did not
            # match; keep the original's 0 placeholder for that case.
            "hot_heat": 0 if heat is None else heat,
        })

    with open('weibo.json', 'w', encoding='utf-8') as file:
        json.dump(data, file, ensure_ascii=False, indent=4)
    return data