💻 源代码
import os
from urllib.parse import urljoin, urlsplit

import requests
from lxml import etree
def _safe_image_name(img_url):
    """Derive a filesystem-safe file name from an image URL."""
    # Use the last component of the URL *path* so query strings and fragments
    # never leak into the name; fall back to the tail of the raw URL when the
    # path has no final component.
    candidate = os.path.basename(urlsplit(img_url).path) or img_url[-10:]
    # The URL is scraped from remote HTML, so replace anything that could act
    # as a path separator or is otherwise unusual in a file name, and strip
    # leading dots so the result can never be "." or "..".
    cleaned = ''.join(
        ch if ch.isalnum() or ch in '._-' else '_' for ch in candidate
    ).lstrip('.')
    return cleaned or 'image'


def crawl_tieba(tieba_name, start_page, end_page, *, save_dir='img', timeout=10):
    """Download the post images found on a range of Baidu Tieba list pages.

    For every list page in ``start_page..end_page`` (inclusive) the thread
    links are collected, each thread page is fetched, and every
    ``<img class="BDE_Image">`` it contains is saved into ``save_dir``.

    Args:
        tieba_name: Forum name, sent as the ``kw`` query parameter.
        start_page: First list page to crawl (1-based).
        end_page: Last list page to crawl, inclusive. If it is smaller than
            ``start_page`` nothing is fetched.
        save_dir: Directory the images are written to; created if missing.
        timeout: Seconds to wait on each HTTP request before giving up.

    Returns:
        None. Progress is reported on stdout.

    Raises:
        requests.RequestException: A request failed or timed out.
        OSError: ``save_dir`` or an image file could not be written.
    """
    site_root = 'https://tieba.baidu.com'
    list_url = 'https://tieba.baidu.com/f?'

    # exist_ok avoids the check-then-create race and also allows nested paths.
    os.makedirs(save_dir, exist_ok=True)

    for page in range(start_page, end_page + 1):
        # The `pn` query parameter is an offset that advances by 50 per page.
        pn = (page - 1) * 50
        response = requests.get(
            list_url, params={'kw': tieba_name, 'pn': pn}, timeout=timeout
        )
        page_tree = etree.HTML(response.text)

        # Collect the thread links.
        # NOTE(review): the class value is matched verbatim, including its
        # trailing space - presumably this mirrors the page markup; confirm.
        thread_links = page_tree.xpath('//a[@class="j_th_tit "]/@href')
        for link in thread_links:
            # urljoin copes with relative, absolute and protocol-relative hrefs.
            thread_url = urljoin(site_root, link)
            thread_resp = requests.get(thread_url, timeout=timeout)
            thread_tree = etree.HTML(thread_resp.text)

            # Collect the images embedded in the thread.
            img_srcs = thread_tree.xpath('//img[@class="BDE_Image"]/@src')
            for img_src in img_srcs:
                img_url = urljoin(thread_url, img_src)
                img_response = requests.get(img_url, timeout=timeout)
                if not img_response.ok:
                    # Do not store an HTTP error body as if it were an image.
                    print(f"跳过图片 (HTTP {img_response.status_code}): {img_url}")
                    continue
                img_name = _safe_image_name(img_url)
                with open(os.path.join(save_dir, img_name), 'wb') as f:
                    f.write(img_response.content)
                print(f"保存图片: {img_name}")
# Example: crawl the images on list pages 1-3 of the "美女" Tieba forum.
crawl_tieba(tieba_name='美女', start_page=1, end_page=3)