-
Notifications
You must be signed in to change notification settings - Fork 6
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
1 changed file
with
141 additions
and
141 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,152 +1,152 @@ | ||
import pandas as pd | ||
import os | ||
import re | ||
|
||
|
||
def parse_course_block(content):
    """Split Markdown text into blocks, starting a new block at each ``## `` line.

    Args:
        content: Markdown text to split.

    Returns:
        A list of strings, one per block, in input order. Blank and
        whitespace-only lines are dropped, and the remaining lines of each
        block are joined with ``"\\n"``. Non-blank text that appears before
        the first ``## `` heading forms its own leading block.
    """
    grouped = []  # one list of kept lines per block
    for raw in content.split("\n"):
        if not raw.strip():
            # Blank lines never contribute to a block.
            continue
        # A heading always opens a new group; so does the very first kept line.
        if raw.startswith("## ") or not grouped:
            grouped.append([])
        grouped[-1].append(raw)
    return ["\n".join(chunk) for chunk in grouped]
|
||
def parse_markdown_file(content): | ||
"""解析markdown文件内容,返回课程、校区、老师的层级结构""" | ||
structure = {} | ||
current_course = None | ||
current_district = None | ||
current_teacher = None | ||
|
||
def get_course_title(block):
    """Return the course title taken from the first ``## `` heading in *block*.

    Args:
        block: Markdown text of a single course block.

    Returns:
        The heading text with the ``## `` marker and surrounding whitespace
        removed, or ``None`` when no line of *block* starts with ``## ``.
    """
    # Only the block that was passed in is scanned; the function does not
    # depend on any name outside its own parameter.
    for line in block.split("\n"):
        if line.startswith("## "):
            return line[3:].strip()
    return None
|
||
|
||
def get_teacher_name(block):
    """Return the teacher name taken from the first ``#### `` heading in *block*.

    Args:
        block: Markdown text of a single course block.

    Returns:
        The heading text after the ``#### `` marker with surrounding
        whitespace removed, or ``None`` when no line starts with ``#### ``.
    """
    marker = "#### "
    candidates = (
        row[len(marker):].strip()
        for row in block.split("\n")
        if row.startswith(marker)
    )
    # The first matching heading wins; fall back to None when there is none.
    return next(candidates, None)
|
||
|
||
def merge_content(target_content, new_block): | ||
"""在目标内容中合并新的课程块""" | ||
# 获取新块的课程标题和教师名称 | ||
new_course_title = get_course_title(new_block) | ||
teacher_name = get_teacher_name(new_block) | ||
|
||
if not new_course_title or not teacher_name: | ||
return target_content, new_block | ||
|
||
# 在目标内容中查找对应的课程部分 | ||
course_pattern = f"## {new_course_title}[\\s\\S]*?(?=\\n## |$)" | ||
course_match = re.search(course_pattern, target_content) | ||
|
||
if course_match: | ||
course_section = course_match.group(0) | ||
# 在课程部分中查找对应教师的部分 | ||
teacher_pattern = f"#### {teacher_name}[\\s\\S]*?(?=\\n#### |$)" | ||
teacher_match = re.search(teacher_pattern, course_section) | ||
|
||
if teacher_match: | ||
# 找到对应教师的部分 | ||
teacher_section = teacher_match.group(0) | ||
|
||
# 提取新评价内容 | ||
new_content = [] | ||
found_teacher = False | ||
for line in new_block.split("\n"): | ||
if line.startswith("#### "): | ||
found_teacher = True | ||
current_course = line[3:].strip() | ||
structure[current_course] = {} | ||
current_district = None | ||
current_teacher = None | ||
elif line.startswith("### "): | ||
current_district = line[4:].strip() | ||
if current_course: | ||
structure[current_course][current_district] = {} | ||
current_teacher = None | ||
elif line.startswith("#### "): | ||
current_teacher = line[5:].strip() | ||
if current_course and current_district: | ||
structure[current_course][current_district][current_teacher] = [] | ||
|
||
return structure | ||
|
||
|
||
def find_insertion_point(content, course, district, teacher):
    """Find the line index at which new text for a teacher should be inserted.

    The Markdown is read as a hierarchy: ``## course`` contains
    ``### district`` which contains ``#### teacher``. A teacher heading only
    counts when it sits under the requested district of the requested course.

    Args:
        content: Full Markdown text of the file.
        course: Course title to look for (compared with the ``## `` heading text).
        district: District name to look for (``### `` heading text).
        teacher: Teacher name to look for (``#### `` heading text).

    Returns:
        The index (into ``content.split("\\n")``) of the first line after the
        teacher heading that starts with ``##`` (which covers any deeper
        heading level as well), or ``len(lines)`` when the teacher section
        runs to the end of the file. Returns ``-1`` when the
        course/district/teacher path is not present.
    """
    lines = content.split("\n")
    # Heading text is compared exactly (after stripping) rather than by
    # prefix, so that course "A" does not match a heading "## AB".
    wanted_course = str(course).strip()
    wanted_district = str(district).strip()
    wanted_teacher = str(teacher).strip()

    in_course = False
    in_district = False
    for i, line in enumerate(lines):
        if line.startswith("## "):
            # Entering any course resets the district state, so a district
            # seen under a previous course cannot leak into this one.
            in_course = line[3:].strip() == wanted_course
            in_district = False
        elif line.startswith("### "):
            in_district = in_course and line[4:].strip() == wanted_district
        elif (
            in_district
            and line.startswith("#### ")
            and line[5:].strip() == wanted_teacher
        ):
            # The section ends at the next heading of level 2 or deeper.
            for j in range(i + 1, len(lines)):
                if lines[j].startswith("##"):
                    return j
            # No later heading (including the case where the teacher heading
            # is the very last line): insert at the end of the file.
            return len(lines)

    return -1
|
||
|
||
def process_data(excel_file, markdown_dir): | ||
# 读取Excel文件 | ||
df = pd.read_excel(excel_file, header=None) | ||
df.columns = ["course", "teacher", "district", "year", "description", "submitter"] | ||
|
||
rows_to_delete = [] | ||
unmatched_rows = set() # 使用集合来存储未匹配的行 | ||
|
||
# 遍历markdown文件 | ||
for filename in os.listdir(markdown_dir): | ||
if filename.endswith(".md"): | ||
file_path = os.path.join(markdown_dir, filename) | ||
with open(file_path, "r", encoding="utf-8") as f: | ||
content = f.read() | ||
|
||
# 解析文件结构 | ||
structure = parse_markdown_file(content) | ||
content_modified = False | ||
|
||
# 处理每一行Excel数据 | ||
for index, row in df.iterrows(): | ||
if index in rows_to_delete: # 跳过已经处理过的行 | ||
continue | ||
if found_teacher and line.strip(): | ||
if not line.startswith("##"): | ||
new_content.append(line) | ||
|
||
# 只有在有新内容时才合并 | ||
if new_content: | ||
# 合并内容,确保评价和引用之间有空行 | ||
merged_teacher_section = ( | ||
teacher_section + "\n\n" + "\n".join(new_content) | ||
) | ||
|
||
# 更新课程部分 | ||
updated_course_section = course_section.replace( | ||
teacher_section, merged_teacher_section | ||
) | ||
|
||
# 更新整个内容 | ||
return ( | ||
target_content.replace(course_section, updated_course_section), | ||
None, | ||
) | ||
|
||
return target_content, new_block | ||
|
||
|
||
def process_files(source_file, target_dir):
    """Merge course blocks from *source_file* into the Markdown files in *target_dir*.

    The source file is split into blocks with ``parse_course_block``. Each
    block is offered to every ``.md`` file in *target_dir* through
    ``merge_content``; a target file is rewritten only when at least one block
    was merged into it. Afterwards the source file is rewritten so that it
    contains only the blocks that were not merged into any target file.

    Args:
        source_file: Path of the Markdown file holding the blocks to merge.
        target_dir: Directory whose ``.md`` files are merge targets.

    Returns:
        None. The function works through file writes only.
    """
    with open(source_file, "r", encoding="utf-8") as f:
        source_content = f.read()

    course_blocks = parse_course_block(source_content)
    # Track merged blocks by position, not by course title: two blocks can
    # share a title (same course, different teacher) and must not shadow each
    # other, and blocks without a title must not be lost.
    merged_indices = set()

    # sorted() makes the processing order deterministic; os.listdir() returns
    # entries in arbitrary order.
    for filename in sorted(os.listdir(target_dir)):
        if not filename.endswith(".md"):
            continue

        file_path = os.path.join(target_dir, filename)
        with open(file_path, "r", encoding="utf-8") as f:
            target_content = f.read()

        modified = False
        for idx, block in enumerate(course_blocks):
            # merge_content returns the block back as its second value when
            # it could not be merged, and a falsy value when it was merged.
            target_content, unmerged = merge_content(target_content, block)
            if not unmerged:
                merged_indices.add(idx)
                modified = True

        if modified:
            with open(file_path, "w", encoding="utf-8") as f:
                f.write(target_content)

    if not merged_indices:
        # Nothing was merged anywhere (for example, no .md targets exist):
        # leave the source file untouched instead of truncating it.
        return

    # Each remaining block is written back exactly once, regardless of how
    # many target files rejected it. An empty list yields an empty file.
    remaining = [
        block
        for idx, block in enumerate(course_blocks)
        if idx not in merged_indices
    ]
    with open(source_file, "w", encoding="utf-8") as f:
        f.write("\n\n".join(remaining))
|
||
course = row["course"] | ||
district = row["district"] | ||
teacher = row["teacher"] | ||
|
||
# 检查是否存在对应的结构 | ||
if ( | ||
course in structure | ||
and district in structure[course] | ||
and teacher in structure[course][district] | ||
): | ||
|
||
# 找到插入点 | ||
insert_point = find_insertion_point( | ||
content, course, district, teacher | ||
) | ||
if insert_point != -1: | ||
# 构建新内容 | ||
new_content = f"\n{row['description']}\n\n> {row['submitter']}({row['year']}年)\n" | ||
|
||
# 插入新内容 | ||
content_lines = content.split("\n") | ||
content_lines.insert(insert_point, new_content) | ||
content = "\n".join(content_lines) | ||
|
||
# 标记要删除的行 | ||
rows_to_delete.append(index) | ||
# 如果之前被标记为未匹配,现在移除 | ||
unmatched_rows.discard(index) | ||
content_modified = True | ||
print( | ||
f"✅ 成功添加: {course} - {district} - {teacher} - {row['submitter']}" | ||
) | ||
else: | ||
# 只有当这行数据还没有被成功处理时,才标记为未匹配 | ||
if index not in rows_to_delete: | ||
unmatched_rows.add(index) | ||
|
||
# 只有在文件被修改时才写回 | ||
if content_modified: | ||
with open(file_path, "w", encoding="utf-8") as f: | ||
f.write(content) | ||
|
||
# 打印未匹配的数据 | ||
if unmatched_rows: | ||
print("\n❌ 以下数据未找到匹配位置:") | ||
for index in sorted(unmatched_rows): | ||
row = df.iloc[index] | ||
print( | ||
f"- {row['course']} - {row['district']} - {row['teacher']} - {row['submitter']}" | ||
) | ||
|
||
# 删除已处理的行 | ||
if rows_to_delete: | ||
df = df.drop(rows_to_delete) | ||
df.to_excel(excel_file, index=False, header=False) | ||
print(f"\n📊 统计信息:") | ||
print(f"- 成功处理:{len(rows_to_delete)} 条数据") | ||
print(f"- 未能匹配:{len(unmatched_rows)} 条数据") | ||
print(f"- 总数据量:{len(df) + len(rows_to_delete)} 条数据") | ||
|
||
|
||
# Usage example: merge the blocks of output.md into the Markdown files under
# "example", then apply the rows of data.xlsx to the same directory.
# NOTE: both calls run whenever this module is executed or imported.
process_files("output.md", "example")
process_data("data.xlsx", "example")