# 转word.py - converts a saved HTML message page into a Word (.docx) document.

import os
import requests
from bs4 import BeautifulSoup
from docx import Document
from docx.shared import Inches

# Hard-coded file locations used by this script.
# Word document that the script writes.
output_word_path = 'D:/Desktop/temp/软创/代码/aidetect/output.docx'
# Directory in which message images are looked up and stored.
output_dir = 'D:/Desktop/temp/软创/代码/aidetect/精华消息_files'
# HTML page that is parsed for message records.
html_file_path = 'D:/Desktop/temp/软创/代码/aidetect/精华消息.html'

# Start from an empty Word document; the rest of the script appends to it.
document = Document()

# Read the HTML page as UTF-8 and build a parse tree with the built-in parser.
with open(html_file_path, mode='r', encoding='utf-8') as html_file:
    soup = BeautifulSoup(html_file, 'html.parser')

# Every <div class="block"> element is treated as one message record.
messages = soup.find_all('div', attrs={'class': 'block'})

# Images are looked up in (and downloaded to) output_dir, so try to make sure
# the directory exists before the first write. Failure is reported but not
# fatal: the text of the messages can still be exported without images.
try:
    os.makedirs(output_dir, exist_ok=True)
except OSError as e:
    print(f"Could not create image directory: {output_dir}, error: {e}")

# Append every message record to the Word document: sender as a heading,
# followed by the time, the text content, the digest note and any images.
for message in messages:
    sender_nick = message.find('div', class_='sender_nick')
    sender_time = message.find('div', class_='sender_time')
    content_div = message.find('div', class_='short')
    add_digest = message.find('div', class_='add_digest')

    # Fall back to a placeholder whenever a field is absent from the record.
    sender_nick_text = sender_nick.text.strip() if sender_nick else '未知发送者'
    sender_time_text = sender_time.text.strip() if sender_time else '未知时间'
    content_text = content_div.text.strip() if content_div else '无内容'
    add_digest_text = add_digest.text.strip() if add_digest else '无附加信息'

    document.add_heading(sender_nick_text, level=2)
    document.add_paragraph(sender_time_text)
    document.add_paragraph(content_text)
    document.add_paragraph(add_digest_text)

    # Only images inside the content <div> are considered.
    images = content_div.find_all('img') if content_div else []
    for img in images:
        # img['src'] would raise KeyError for an <img> without a src and
        # abort the whole export, so look it up defensively instead.
        img_url = img.get('src')
        if not img_url:
            print("Skipping image without a src attribute")
            continue

        # Drop the query string before taking the last path segment: '?' is
        # not a valid file-name character on Windows, and a '/' inside the
        # query would otherwise be mistaken for a path separator.
        img_name = img_url.split('?', 1)[0].split('/')[-1]
        if not img_name:
            # A URL ending in '/' has no file name; joining it would yield
            # the directory itself rather than an image file.
            print(f"Skipping image without a file name: {img_url}")
            continue
        img_path = os.path.join(output_dir, img_name)

        print(f"Processing image: {img_url}")

        # Download only when the image is not already present locally.
        if not os.path.exists(img_path):
            # Write to a temporary name and rename on success, so that an
            # interrupted download never leaves a truncated file that a
            # later run would mistake for a valid cached image.
            tmp_path = img_path + '.part'
            try:
                # The timeout keeps one unresponsive server from hanging
                # the whole export.
                response = requests.get(img_url, timeout=30)
                # Reject HTTP error responses instead of saving the error
                # page body as if it were image data.
                response.raise_for_status()
                with open(tmp_path, 'wb') as img_file:
                    img_file.write(response.content)
                os.replace(tmp_path, img_path)
                print(f"Image downloaded: {img_path}")
            except (requests.RequestException, OSError, ValueError) as e:
                # Best effort: report the failure and move on to the next
                # image. ValueError is included for malformed URLs.
                print(f"Failed to download image: {img_url}, error: {e}")
                continue

        # Insert the image at a fixed width of 2 inches. The broad except is
        # deliberate: a file that cannot be embedded (unsupported or corrupt
        # format, unreadable path) must not abort the export.
        try:
            document.add_picture(img_path, width=Inches(2))
            print(f"Image added to document: {img_path}")
        except Exception as e:
            print(f"Failed to add image to document: {img_path}, error: {e}")

# Write the assembled Word document to the configured output path.
document.save(output_word_path)

# Report where the document was saved.
print(f'Word 文档已保存到: {output_word_path}')