🛡️ 图片内容反爬

📋 功能说明
  • 识别图片中的文字内容
  • 破解图片反爬机制
  • 提取图片中的关键信息
💻 源代码
import cv2
import numpy as np
from PIL import Image
import pytesseract

def preprocess_image(image_path):
    """Load an image and prepare it for OCR.

    The image is converted to grayscale, binarized with Otsu's method and
    then passed through non-local-means denoising.

    Args:
        image_path: Path of the image file to load.

    Returns:
        The processed single-channel image produced by
        ``cv2.fastNlMeansDenoising``.

    Raises:
        ValueError: If OpenCV could not read an image from ``image_path``.
    """
    img = cv2.imread(image_path)
    # cv2.imread reports failure by returning None rather than raising, so
    # fail here with a clear message instead of an opaque cvtColor error.
    if img is None:
        raise ValueError(f"Could not read image: {image_path!r}")

    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    # Binarize. With THRESH_OTSU the threshold is chosen automatically, so the
    # explicit threshold argument (0) is only a placeholder.
    _, binary = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)

    # Denoise: filter strength h=10, template window 7, search window 21.
    # NOTE(review): denoising runs on the already-binarized image, so the
    # result may no longer be strictly two-valued -- confirm this order is
    # intended.
    denoised = cv2.fastNlMeansDenoising(binary, None, 10, 7, 21)

    return denoised

def ocr_image(image_path):
    """Run OCR on an image file and return the recognized text.

    Args:
        image_path: Path of the image file to recognize.

    Returns:
        The text recognized by Tesseract using the ``chi_sim`` and ``eng``
        language data.
    """
    # Preprocess (grayscale, binarize, denoise) before recognition.
    processed = preprocess_image(image_path)

    # Hand the in-memory array straight to pytesseract. Writing it to a fixed
    # 'temp.png' in the working directory left a stray file behind, let
    # concurrent calls overwrite each other's input, and failed when the
    # directory was not writable.
    return pytesseract.image_to_string(processed, lang='chi_sim+eng')

def extract_info(image_path):
    """Extract digit runs and e-mail-like strings from an image's OCR text.

    Args:
        image_path: Path of the image file to analyze.

    Returns:
        A dict with three keys: ``'text'`` (the full OCR output),
        ``'numbers'`` (every run of digits found in it, as strings) and
        ``'emails'`` (every substring matching the e-mail pattern).
    """
    import re

    digit_pattern = re.compile(r'\d+')
    email_pattern = re.compile(r'[\w\.-]+@[\w\.-]+\.\w+')

    recognized = ocr_image(image_path)

    # Build the result in the same key order: text, numbers, emails.
    result = {'text': recognized}
    result['numbers'] = digit_pattern.findall(recognized)
    result['emails'] = email_pattern.findall(recognized)
    return result

# Example: run the extraction on a sample image and print the recognized
# text and the numbers found in it, one per line.
info = extract_info('captcha.png')
print(f"识别文字: {info['text']}", f"数字: {info['numbers']}", sep="\n")
📦 运行环境
pip install opencv-python pillow pytesseract
依赖安装
  • Tesseract-OCR引擎(需单独安装,pip 不会安装它;并确保 tesseract 可执行文件在 PATH 中)
  • 中文语言包(chi_sim,代码中使用 lang='chi_sim+eng')