# Select a category to start scraping...
# -*- coding: utf-8 -*-
import requests
import os
# API URL
# https://www.douyu.com/gapi/rknc/directory/yzRec/1
def get_html(url, timeout=10):
    """Fetch ``url`` and return its body decoded as JSON.

    Despite the name, the endpoint used by this script answers with JSON,
    so the return value is the parsed JSON object rather than HTML text.

    Args:
        url: Address of the API endpoint to query.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests`` may block indefinitely.

    Returns:
        The decoded JSON payload of the response.

    Raises:
        requests.RequestException: On connection errors, timeouts, or a
            4xx/5xx HTTP status.
        ValueError: If the response body is not valid JSON.
    """
    # A browser-like User-Agent is sent with the request.
    header = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'
    }
    response = requests.get(url=url, headers=header, timeout=timeout)
    # Fail loudly on HTTP errors instead of trying to parse an error page.
    response.raise_for_status()
    return response.json()
def parse_html(html):
    """Extract per-streamer records from the decoded API payload.

    The room list is read from ``html['data']['rl']``. Each room entry is
    mapped to a dict with these keys:

    * ``img_url``  - avatar URL (source field ``rs1``)
    * ``nickname`` - streamer nickname (``nn``)
    * ``room_id``  - room number (``rid``)
    * ``title``    - room title (``rn``)
    * ``viewers``  - online viewer count (``ol``)

    Args:
        html: Decoded JSON payload as returned by ``get_html``.

    Returns:
        A list of dicts, one per room, in payload order. An empty list is
        returned when the payload has no ``data``/``rl`` section (for
        example an error response or an empty category).

    Raises:
        KeyError: If a room entry lacks one of the expected fields.
    """
    # Tolerate payloads without a usable room list instead of crashing.
    data = html.get('data') if isinstance(html, dict) else None
    rl_list = data.get('rl') if isinstance(data, dict) else None
    if not rl_list:
        return []

    return [
        {
            'img_url': rl['rs1'],
            'nickname': rl['nn'],
            'room_id': rl['rid'],
            'title': rl['rn'],
            'viewers': rl['ol'],
        }
        for rl in rl_list
    ]
def save_to_images(img_info_list):
    """Download every streamer's avatar into the ``douyu_streamers`` folder.

    Each image is stored as ``<nickname>.jpg``. Characters that are path
    separators or are rejected by common filesystems are replaced with
    ``_`` so that a nickname can neither break ``open()`` nor escape the
    target directory. A download that fails is reported and skipped so one
    bad avatar does not abort the whole run.

    Args:
        img_info_list: Iterable of dicts with at least the keys
            ``nickname`` and ``img_url`` (``room_id`` is used as a
            fallback file name when present).

    Returns:
        None. Files are written as a side effect.
    """
    dir_path = 'douyu_streamers'
    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(dir_path, exist_ok=True)

    forbidden = set('\\/:*?"<>|')
    for img_info in img_info_list:
        raw_name = str(img_info['nickname'])
        safe_name = ''.join(
            '_' if ch in forbidden or ord(ch) < 32 else ch
            for ch in raw_name
        )
        # Names such as "", "." or ".." are not usable as file names.
        if not safe_name.strip('. '):
            safe_name = str(img_info.get('room_id', 'unnamed'))
        img_path = os.path.join(dir_path, f"{safe_name}.jpg")

        try:
            res = requests.get(img_info['img_url'], timeout=10)
            # Do not store an HTTP error page as if it were an image.
            res.raise_for_status()
        except requests.RequestException as exc:
            print(f"头像下载失败,已跳过: {raw_name} ({exc})")
            continue

        with open(img_path, 'wb') as f:
            f.write(res.content)
def save_to_csv(img_info_list):
    """Write the streamer records to ``douyu_streamers.csv``.

    The file is created (or overwritten) in the current working directory,
    encoded as UTF-8, with a header row followed by one row per record.
    Only the columns ``room_id``, ``nickname``, ``title`` and ``viewers``
    are written.

    Args:
        img_info_list: Iterable of dicts containing at least the four
            column keys above. Additional keys (such as ``img_url``) are
            ignored.

    Returns:
        None. The CSV file is written as a side effect.
    """
    import csv

    with open('douyu_streamers.csv', 'w', encoding='utf-8', newline='') as f:
        # The records also carry 'img_url'; without extrasaction='ignore'
        # DictWriter raises ValueError on any key missing from fieldnames.
        writer = csv.DictWriter(
            f,
            fieldnames=['room_id', 'nickname', 'title', 'viewers'],
            extrasaction='ignore',
        )
        writer.writeheader()
        writer.writerows(img_info_list)
if __name__ == '__main__':
    # Script entry point: fetch the "Yanzhi" streamer category, store the
    # avatars and a CSV summary, then print a short report.
    category_url = 'https://www.douyu.com/gapi/rknc/directory/yzRec/1'
    streamers = parse_html(get_html(category_url))

    # Persist the avatars first, then the CSV.
    save_to_images(streamers)
    save_to_csv(streamers)

    total = len(streamers)
    print(f"采集完成,共 {total} 位主播")
    for info in streamers:
        nickname, title, viewers = info['nickname'], info['title'], info['viewers']
        print(f"{nickname} - {title} - {viewers}在线")
# Usage:
#   pip install requests
#   Save this script as douyu_spider.py, then run:
#   python douyu_spider.py