📝 简书文章爬虫

📋 功能说明
  • 爬取指定用户的所有文章
  • 批量下载文章内容
  • 保存为Markdown格式
💻 源代码
import requests
import re
from lxml import etree

def get_jianshu_user_id(username, timeout=10):
    """Look up a Jianshu user's ID from their profile page.

    Fetches ``https://www.jianshu.com/u/<username>`` and reads the
    ``data-user-id`` attribute of ``div`` elements whose class is exactly
    ``_1u3F7``.

    Args:
        username: Value substituted into the profile URL path.
        timeout: Seconds to wait for the server before ``requests`` gives
            up. Without a timeout the call can block indefinitely.

    Returns:
        The first matching ``data-user-id`` value, or ``None`` when the
        page contains no such element or yields no parsed tree.

    Raises:
        requests.exceptions.RequestException: On connection failure or
            when the timeout elapses.
    """
    url = f'https://www.jianshu.com/u/{username}'
    headers = {'User-Agent': 'Mozilla/5.0'}
    res = requests.get(url, headers=headers, timeout=timeout)
    html = etree.HTML(res.text)
    if html is None:
        # etree.HTML may return None for an empty body (depends on the
        # lxml version); treat it the same as "element not found".
        return None
    user_id = html.xpath('//div[@class="_1u3F7"]/@data-user-id')
    return user_id[0] if user_id else None

def get_user_articles(user_id, page=1, timeout=10):
    """Fetch one page of a user's timeline and summarize each entry.

    Requests ``https://www.jianshu.com/users/<user_id>/timeline`` with a
    ``page`` query parameter and decodes the response body as JSON. The
    decoded value is iterated, and every item is expected to carry
    ``item['object']['data']`` with the keys ``title``, ``slug``,
    ``publish_time`` and ``likes_count``.

    Args:
        user_id: Value substituted into the timeline URL path.
        page: Page number sent as the ``page`` query parameter.
        timeout: Seconds to wait for the server before ``requests`` gives
            up. Without a timeout the call can block indefinitely.

    Returns:
        A list of dicts, one per item, each with the keys ``title``,
        ``slug``, ``publish_time`` and ``likes_count``.

    Raises:
        requests.exceptions.RequestException: On connection failure or
            when the timeout elapses.
        ValueError: If the response body is not valid JSON.
        KeyError: If an item lacks any of the expected fields.
    """
    url = f'https://www.jianshu.com/users/{user_id}/timeline'
    params = {'page': page}
    headers = {'User-Agent': 'Mozilla/5.0'}

    res = requests.get(
        url, params=params, headers=headers, timeout=timeout
    ).json()
    articles = []

    for item in res:
        # Resolve the nested payload once instead of once per field.
        data = item['object']['data']
        articles.append({
            'title': data['title'],
            'slug': data['slug'],
            'publish_time': data['publish_time'],
            'likes_count': data['likes_count'],
        })

    return articles

def get_article_content(slug, timeout=10):
    """Download an article page and return its text content.

    Fetches ``https://www.jianshu.com/p/<slug>`` and concatenates every
    text node found beneath ``div`` elements whose class is exactly
    ``article``.

    Args:
        slug: Value substituted into the article URL path.
        timeout: Seconds to wait for the server before ``requests`` gives
            up. Without a timeout the call can block indefinitely.

    Returns:
        The joined text nodes as a single string; an empty string when no
        matching element exists or the page yields no parsed tree.

    Raises:
        requests.exceptions.RequestException: On connection failure or
            when the timeout elapses.
    """
    url = f'https://www.jianshu.com/p/{slug}'
    headers = {'User-Agent': 'Mozilla/5.0'}
    res = requests.get(url, headers=headers, timeout=timeout)
    html = etree.HTML(res.text)
    if html is None:
        # etree.HTML may return None for an empty body (depends on the
        # lxml version); report it as "no content" rather than crashing.
        return ''

    content = html.xpath('//div[@class="article"]//text()')
    return ''.join(content)

# Example usage: resolve a user ID, then print the titles on the first
# timeline page.
user_id = get_jianshu_user_id('username')
if user_id is None:
    # get_jianshu_user_id returns None when no ID is found on the profile
    # page; do not request a timeline for a non-existent ID.
    articles = []
    print("未找到该用户")
else:
    articles = get_user_articles(user_id)
for article in articles:
    print(f"标题: {article['title']}")
📦 运行环境
pip install requests lxml
✨ 功能特点
  • ✅ 用户文章列表
  • ✅ 批量下载
  • ✅ Markdown格式