-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path转word.py
71 lines (58 loc) · 2.52 KB
/
转word.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
import os
import requests
from bs4 import BeautifulSoup
from docx import Document
from docx.shared import Inches
# Export the message records of a saved HTML page into a Word document.
#
# For every <div class="block"> in the HTML file this script writes the sender
# nick as a level-2 heading, then the time, the text content and the digest
# note as paragraphs, followed by any images found inside the content element.
# Images are cached in `output_dir`; a file that is already present there is
# reused instead of being downloaded again.

# Input / output locations.
html_file_path = 'D:/Desktop/temp/软创/代码/aidetect/精华消息.html'
output_dir = 'D:/Desktop/temp/软创/代码/aidetect/精华消息_files'
output_word_path = 'D:/Desktop/temp/软创/代码/aidetect/output.docx'

# Make sure the image cache directory exists; otherwise every download below
# would fail when trying to open the target file for writing.
os.makedirs(output_dir, exist_ok=True)

# Create the Word document.
document = Document()

# Parse the HTML file.
with open(html_file_path, 'r', encoding='utf-8') as file:
    soup = BeautifulSoup(file, 'html.parser')

# Find all message records.
messages = soup.find_all('div', class_='block')

for message in messages:
    sender_nick = message.find('div', class_='sender_nick')
    sender_time = message.find('div', class_='sender_time')
    content_div = message.find('div', class_='short')
    add_digest = message.find('div', class_='add_digest')

    # Fall back to placeholder text when an element is missing.
    sender_nick_text = sender_nick.text.strip() if sender_nick else '未知发送者'
    sender_time_text = sender_time.text.strip() if sender_time else '未知时间'
    content_text = content_div.text.strip() if content_div else '无内容'
    add_digest_text = add_digest.text.strip() if add_digest else '无附加信息'

    # Append the message record to the Word document.
    document.add_heading(sender_nick_text, level=2)
    document.add_paragraph(sender_time_text)
    document.add_paragraph(content_text)
    document.add_paragraph(add_digest_text)

    # Collect the images embedded in the message content.
    images = content_div.find_all('img') if content_div else []
    for img in images:
        # An <img> without a src attribute must not abort the whole export.
        img_url = img.get('src')
        if not img_url:
            continue

        # Log the URL being handled so broken links are easy to spot.
        print(f"Processing image: {img_url}")

        # Derive the local file name from the last path segment, dropping any
        # query string / fragment because '?' is not a valid file-name
        # character on Windows.
        # NOTE(review): URLs that share the same last segment map to the same
        # cached file - confirm this is acceptable for the input pages.
        img_name = img_url.split('#')[0].split('?')[0].split('/')[-1]
        if not img_name:
            # A URL ending in '/' would make img_path point at output_dir itself.
            print(f"Failed to download image: {img_url}, error: no file name in URL")
            continue
        img_path = os.path.join(output_dir, img_name)

        # Download the image unless it is already cached.
        if not os.path.exists(img_path):
            try:
                # The timeout keeps one unresponsive server from hanging the run.
                response = requests.get(img_url, timeout=30)
                # Reject HTTP error responses so an error page is never stored
                # (and later reused from the cache) as if it were an image.
                response.raise_for_status()
                with open(img_path, 'wb') as img_file:
                    img_file.write(response.content)
                print(f"Image downloaded: {img_path}")
            except (requests.RequestException, OSError) as e:
                print(f"Failed to download image: {img_url}, error: {e}")
                continue

        # Insert the image into the Word document.
        try:
            document.add_picture(img_path, width=Inches(2))
            print(f"Image added to document: {img_path}")
        except Exception as e:
            # Deliberately best-effort: one unreadable or unsupported image
            # file should not prevent the rest of the document being written.
            print(f"Failed to add image to document: {img_path}, error: {e}")

# Save the Word document.
document.save(output_word_path)
print(f'Word 文档已保存到: {output_word_path}')