百度贴吧图片爬虫

📌 百度贴吧图片爬虫

📋 功能说明

爬取指定贴吧的帖子图片
支持分页爬取
自动保存图片到本地（当前目录下的 img 文件夹）

💻 源代码

import os
from urllib.parse import urljoin, urlsplit

import requests
from lxml import etree

def _fetch(url, timeout, params=None):
    """Return the response for a GET of *url*, or None if the request failed.

    A failure is a network error, a timeout, or an HTTP error status; it is
    reported on stdout so that one bad URL does not abort the whole crawl.
    """
    try:
        response = requests.get(url, params=params, timeout=timeout)
        response.raise_for_status()
    except requests.RequestException as exc:
        print(f"请求失败, 已跳过: {url} ({exc})")
        return None
    return response


def _parse_html(text):
    """Parse *text* as HTML and return the root element, or None if unusable."""
    if not text or not text.strip():
        return None
    # NOTE(review): etree.HTML is commonly reported to yield None for input it
    # cannot build a tree from; callers treat None as "nothing to scrape".
    return etree.HTML(text)


def _sanitize(text):
    """Replace every character that is unsafe in a file name with '_'."""
    cleaned = ''.join(ch if ch.isalnum() or ch in '._-' else '_' for ch in text)
    # Leading dots would produce hidden files or the special names '.'/'..'.
    return cleaned.lstrip('.')


def _image_filename(img_url):
    """Derive a file name from the last path segment of *img_url*.

    The query string is ignored so that the extension survives, and the
    result never contains a path separator.
    """
    last_segment = urlsplit(img_url).path.rsplit('/', 1)[-1]
    return _sanitize(last_segment) or _sanitize(img_url[-10:]) or 'image'


def _save_thread_images(thread_url, save_dir, timeout):
    """Download every ``BDE_Image`` picture found on the page at *thread_url*."""
    thread_resp = _fetch(thread_url, timeout)
    if thread_resp is None:
        return
    thread_tree = _parse_html(thread_resp.text)
    if thread_tree is None:
        return

    for src in thread_tree.xpath('//img[@class="BDE_Image"]/@src'):
        # Resolve protocol-relative ("//host/...") and relative sources.
        img_url = urljoin(thread_url, src)
        img_response = _fetch(img_url, timeout)
        if img_response is None:
            continue
        img_name = _image_filename(img_url)
        with open(os.path.join(save_dir, img_name), 'wb') as f:
            f.write(img_response.content)
        print(f"保存图片: {img_name}")


def crawl_tieba(tieba_name, start_page, end_page, save_dir='img', timeout=10):
    """Download the images posted in the threads of a Baidu Tieba forum.

    For every listing page in ``start_page..end_page`` (both inclusive) the
    thread links are collected, each thread page is fetched, and every
    ``<img class="BDE_Image">`` it contains is written into *save_dir*.
    Requests that fail are reported on stdout and skipped.

    Args:
        tieba_name: Forum name, sent as the ``kw`` query parameter.
        start_page: First listing page to crawl; page 1 maps to ``pn=0``.
        end_page: Last listing page to crawl. Nothing is fetched when it is
            smaller than *start_page*.
        save_dir: Directory the images are written to. It is created,
            including missing parent directories, if it does not exist.
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        None. Images are written to disk and progress is printed.
    """
    url = 'https://tieba.baidu.com/f?'

    # exist_ok avoids the check-then-create race of exists() + mkdir().
    os.makedirs(save_dir, exist_ok=True)

    for page in range(start_page, end_page + 1):
        # The listing is addressed by offset in steps of 50: 0, 50, 100, ...
        pn = (page - 1) * 50
        listing = _fetch(url, timeout, params={'kw': tieba_name, 'pn': pn})
        if listing is None:
            continue
        content = _parse_html(listing.text)
        if content is None:
            continue

        # The trailing space in the class value is intentional: XPath compares
        # the attribute string exactly.
        link_list = content.xpath('//a[@class="j_th_tit "]/@href')

        for link in link_list:
            # urljoin copes with absolute hrefs and hrefs lacking a leading '/'.
            fulllink = urljoin('https://tieba.baidu.com', link)
            _save_thread_images(fulllink, save_dir, timeout)

# Example: download the images from listing pages 1-3 of the "美女" tieba.
# Guarded so that merely importing this module does not start a crawl.
if __name__ == '__main__':
    crawl_tieba('美女', 1, 3)

📦 运行环境

pip install requests lxml

参数说明

tieba_name: 贴吧名称
start_page: 开始页码（从 1 开始，包含该页）
end_page: 结束页码（包含该页）