# Enter the post URL to start scraping...
# -*- coding: utf-8 -*-
import requests
from lxml import etree
from fontTools.ttLib import TTFont
# Thread page to scrape; "帖子ID" is a placeholder for the real post ID.
url = "https://club.autohome.com.cn/bbs/thread/帖子ID"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.70 Safari/537.36"
}

# Plain characters that the obfuscated glyphs stand for, paired by position
# with the font's glyph order.
# NOTE(review): the trailing `...` is a placeholder kept from the original
# script; the remaining characters must be filled in for the font in use.
word_list = ['一', '二', '三', '四', '五', '六', '七', '八', '九', '十', ...]

_HEX_DIGITS = frozenset("0123456789abcdefABCDEF")


def _glyph_name_to_char(glyph_name):
    """Return the character named by a ``uniXXXX`` glyph name, or ``None``.

    Only names made of ``uni`` followed by exactly four hex digits are
    accepted; anything else (for example ``.notdef``) yields ``None``.
    """
    if not isinstance(glyph_name, str) or not glyph_name.startswith("uni"):
        return None
    hex_part = glyph_name[3:]
    if len(hex_part) != 4 or not _HEX_DIGITS.issuperset(hex_part):
        return None
    return chr(int(hex_part, 16))


def build_glyph_map(glyph_names, words):
    """Build a ``str.translate`` table from glyph names to plain characters.

    Args:
        glyph_names: Glyph names in font order (e.g. from ``getGlyphOrder()``).
        words: Replacement characters, paired with ``glyph_names`` by position.

    Returns:
        A dict mapping code points to replacement strings. Pairs are skipped
        when the glyph name is not of the ``uniXXXX`` form or when the
        replacement is not a string (such as the ``...`` placeholder).
        Pairing stops at the shorter of the two sequences.
    """
    table = {}
    for glyph_name, word in zip(glyph_names, words):
        char = _glyph_name_to_char(glyph_name)
        if char is None or not isinstance(word, str):
            continue
        table[ord(char)] = word
    return table


def decode_text(text, glyph_names, words):
    """Return ``text`` with every mapped obfuscated character replaced.

    Characters without an entry in the table built by ``build_glyph_map``
    are left unchanged.
    """
    return text.translate(build_glyph_map(glyph_names, words))


def main():
    """Fetch the thread page, extract the post text and print it decoded."""
    # Fetch the page source; the timeout keeps a stalled connection from
    # blocking the script forever.
    res = requests.get(url=url, headers=headers, timeout=10)
    res_html = res.text
    # Parse the HTML and collect every text node inside the post paragraphs.
    html = etree.HTML(res_html)
    content_list = html.xpath('//div[@class="tz-paragraph"]//text()')
    # Join the text fragments into one string.
    content_str = "".join(content_list)
    # Handle the font-based anti-scraping obfuscation.
    # 1. Load the font file and dump it as XML for manual inspection.
    # NOTE(review): the font is read from a local "autohome.ttf"; nothing here
    # downloads it, so presumably it has to be saved beforehand.
    font = TTFont("autohome.ttf")
    font.saveXML("fonts.xml")
    # 2. Build the mapping from the font's glyph order.
    uniList = font.getGlyphOrder()
    # 3. Replace the obfuscated characters.
    # NOTE(review): assumes the page encodes them as the code points named by
    # the uniXXXX glyph names - confirm against fonts.xml.
    content_str = decode_text(content_str, uniList, word_list)
    print(content_str)


if __name__ == "__main__":
    main()
# Setup: pip install requests lxml fonttools
# File:  autohome.py
# Run:   python autohome.py