Skip to content

Commit

Permalink
feat: 改为从Excel直接统计到markdown中
Browse files Browse the repository at this point in the history
  • Loading branch information
W1ndys committed Jan 30, 2025
1 parent 3297601 commit 3cee72a
Showing 1 changed file with 141 additions and 141 deletions.
282 changes: 141 additions & 141 deletions Easy-QFNU-Tools/Excel2md/AddNewData.py
Original file line number Diff line number Diff line change
@@ -1,152 +1,152 @@
import pandas as pd
import os
import re


def parse_course_block(content):
    """Split markdown text into course blocks.

    A new block starts at every level-2 heading ("## "); blank lines are
    dropped from the output. Returns a list of block strings.
    """
    blocks = []
    pending = []

    def flush():
        # Emit the accumulated lines as one block (no-op when empty).
        if pending:
            blocks.append("\n".join(pending))
            pending.clear()

    for raw in content.split("\n"):
        if raw.startswith("## "):
            flush()
        if raw.strip():
            pending.append(raw)

    flush()
    return blocks

def parse_markdown_file(content):
# NOTE(review): this definition is garbled by an unmarked diff merge — the
# rest of its body appears as a displaced fragment later in the file.
"""Parse markdown content into a course -> campus -> teacher hierarchy."""
structure = {}
current_course = None
current_district = None
current_teacher = None

def get_course_title(block):
"""Return the title from the first level-2 ("## ") heading, or None."""
lines = block.split("\n")
# NOTE(review): the next line immediately overwrites the previous one and
# reads the enclosing `content` instead of `block` — this looks like a diff
# artifact (old and new lines both kept); confirm which one is intended.
lines = content.split("\n")
for line in lines:
if line.startswith("## "):
return line[3:].strip()
return None


def get_teacher_name(block):
    """Return the teacher name from a course block.

    The name is taken from the first level-4 heading ("#### ") in the
    block; returns None when no such heading exists.
    """
    marker = "#### "
    return next(
        (
            entry[len(marker):].strip()
            for entry in block.split("\n")
            if entry.startswith(marker)
        ),
        None,
    )


def merge_content(target_content, new_block):
"""Merge a new course block into the target content."""
# Get the course title and teacher name of the new block.
new_course_title = get_course_title(new_block)
teacher_name = get_teacher_name(new_block)

if not new_course_title or not teacher_name:
return target_content, new_block

# Find the matching course section in the target content
# (non-greedy match up to the next "## " heading or end of text).
course_pattern = f"## {new_course_title}[\\s\\S]*?(?=\\n## |$)"
course_match = re.search(course_pattern, target_content)

if course_match:
course_section = course_match.group(0)
# Find the matching teacher section inside the course section.
teacher_pattern = f"#### {teacher_name}[\\s\\S]*?(?=\\n#### |$)"
teacher_match = re.search(teacher_pattern, course_section)

if teacher_match:
# Found the section for this teacher.
teacher_section = teacher_match.group(0)

# Extract the new review lines.
new_content = []
found_teacher = False
for line in new_block.split("\n"):
if line.startswith("#### "):
found_teacher = True
# NOTE(review): the lines below belong to `parse_markdown_file` — an
# unmarked diff merge spliced them in here (a "#### " heading sliced
# with [3:], and `structure` is not defined in this scope). The real
# continuation of this loop appears later in the file.
current_course = line[3:].strip()
structure[current_course] = {}
current_district = None
current_teacher = None
elif line.startswith("### "):
current_district = line[4:].strip()
if current_course:
structure[current_course][current_district] = {}
current_teacher = None
elif line.startswith("#### "):
current_teacher = line[5:].strip()
if current_course and current_district:
structure[current_course][current_district][current_teacher] = []

return structure


def find_insertion_point(content, course, district, teacher):
    """Locate the line index at which a new comment should be inserted.

    Scans ``content`` line by line for the course ("## "), campus ("### ")
    and teacher ("#### ") headings, in that order.  Once the teacher
    heading is found, returns the index of the next heading line (the new
    comment is inserted just before it), or ``len(lines)`` when the
    teacher section runs to the end of the file.  Returns -1 when no
    matching section exists.

    NOTE(review): matching uses ``startswith``, so course "高数" would
    also match a heading "## 高数II" — confirm headings are unambiguous
    in the data before tightening this to an exact comparison.
    """
    lines = content.split("\n")
    course_found = False
    district_found = False

    for i, line in enumerate(lines):
        if line.startswith(f"## {course}"):
            course_found = True
        elif course_found and line.startswith(f"### {district}"):
            district_found = True
        elif district_found and line.startswith(f"#### {teacher}"):
            # Teacher section found: insert before the next heading.  A
            # bare "##" prefix already covers "###" and "####" headings.
            for j in range(i + 1, len(lines)):
                if lines[j].startswith("##"):
                    return j
            # BUG FIX: the original fell through to ``return -1`` when the
            # teacher heading was the last line (empty scan range), losing
            # a valid match — append at end-of-file instead.
            return len(lines)

    return -1


def process_data(excel_file, markdown_dir):
"""Fold review rows from the Excel sheet into the markdown files under markdown_dir."""
# NOTE(review): this function body is interrupted by a displaced diff
# fragment below; its continuation resumes further down the file.
# Read the Excel file (no header row; fixed column order).
df = pd.read_excel(excel_file, header=None)
df.columns = ["course", "teacher", "district", "year", "description", "submitter"]

rows_to_delete = []
unmatched_rows = set()  # row indices that failed to match any section

# Walk the markdown files.
for filename in os.listdir(markdown_dir):
if filename.endswith(".md"):
file_path = os.path.join(markdown_dir, filename)
with open(file_path, "r", encoding="utf-8") as f:
content = f.read()

# Parse the file's heading structure.
structure = parse_markdown_file(content)
content_modified = False

# Process each Excel row.
for index, row in df.iterrows():
if index in rows_to_delete:  # skip rows already handled
continue
# NOTE(review): displaced continuation of `merge_content` (unmarked diff
# merge) — `found_teacher`, `new_content`, `teacher_section`,
# `course_section`, `target_content` and `new_block` are its locals.
if found_teacher and line.strip():
if not line.startswith("##"):
new_content.append(line)

# Merge only when there is new content.
if new_content:
# Join with a blank line so the review and its citation stay separated.
merged_teacher_section = (
teacher_section + "\n\n" + "\n".join(new_content)
)

# Update the course section.
updated_course_section = course_section.replace(
teacher_section, merged_teacher_section
)

# Update the whole content.
return (
target_content.replace(course_section, updated_course_section),
None,
)

return target_content, new_block


def process_files(source_file, target_dir):
    """Merge course blocks from *source_file* into the markdown files under *target_dir*.

    Every block parsed out of the source file is offered to each target
    ``.md`` file via ``merge_content``.  Files that absorbed at least one
    block are rewritten in place.  Blocks that merged into no file are
    written back to the source file; when everything merged, the source
    file is emptied.
    """
    # Read the source file.
    with open(source_file, "r", encoding="utf-8") as f:
        source_content = f.read()

    # Parse the source file into course blocks.
    course_blocks = parse_course_block(source_content)
    unmerged_blocks = []
    merged_blocks = set()  # titles of blocks merged into at least one file

    # Walk every markdown file in the target directory.
    for filename in os.listdir(target_dir):
        if not filename.endswith(".md"):
            continue

        file_path = os.path.join(target_dir, filename)
        with open(file_path, "r", encoding="utf-8") as f:
            target_content = f.read()

        modified = False
        # Try to merge each course block into this file.
        for block in course_blocks:
            target_content, unmerged = merge_content(target_content, block)
            if unmerged:
                unmerged_blocks.append(unmerged)
            else:
                modified = True
                # Remember the title of the successfully merged block.
                title = get_course_title(block)
                if title:
                    merged_blocks.add(title)

        # Rewrite the file only when something was merged into it.
        if modified:
            with open(file_path, "w", encoding="utf-8") as f:
                f.write(target_content)

    # Keep only blocks that merged into no file at all.
    # BUG FIX: a block that fails to merge is appended once per target
    # file, so the rewritten source file used to contain duplicates —
    # deduplicate by block content while preserving order.
    final_unmerged = []
    seen = set()
    for block in unmerged_blocks:
        if block in seen:
            continue
        seen.add(block)
        title = get_course_title(block)
        if title and title not in merged_blocks:
            final_unmerged.append(block)

    # Write the leftover blocks back to the source file.
    if final_unmerged:
        with open(source_file, "w", encoding="utf-8") as f:
            f.write("\n\n".join(final_unmerged))
    else:
        # Everything merged: truncate the source file.
        open(source_file, "w", encoding="utf-8").close()

# NOTE(review): displaced continuation of `process_data` (unmarked diff
# merge) — `row`, `structure`, `content`, `df`, `rows_to_delete`,
# `unmatched_rows` and `content_modified` are its locals.
course = row["course"]
district = row["district"]
teacher = row["teacher"]

# Check that the course/campus/teacher hierarchy exists.
if (
course in structure
and district in structure[course]
and teacher in structure[course][district]
):

# Find the insertion point.
insert_point = find_insertion_point(
content, course, district, teacher
)
if insert_point != -1:
# Build the new entry (review text plus a quoted submitter line).
new_content = f"\n{row['description']}\n\n> {row['submitter']}({row['year']}年)\n"

# Insert the new entry.
content_lines = content.split("\n")
content_lines.insert(insert_point, new_content)
content = "\n".join(content_lines)

# Mark this Excel row for deletion.
rows_to_delete.append(index)
# Drop any earlier "unmatched" mark for this row.
unmatched_rows.discard(index)
content_modified = True
print(
f"✅ 成功添加: {course} - {district} - {teacher} - {row['submitter']}"
)
else:
# Mark as unmatched only if the row was never successfully handled.
if index not in rows_to_delete:
unmatched_rows.add(index)

# Write the file back only when it was modified.
if content_modified:
with open(file_path, "w", encoding="utf-8") as f:
f.write(content)

# Report the rows that found no matching section.
if unmatched_rows:
print("\n❌ 以下数据未找到匹配位置:")
for index in sorted(unmatched_rows):
row = df.iloc[index]
print(
f"- {row['course']} - {row['district']} - {row['teacher']} - {row['submitter']}"
)

# Drop the processed rows and rewrite the Excel file.
if rows_to_delete:
df = df.drop(rows_to_delete)
df.to_excel(excel_file, index=False, header=False)
print(f"\n📊 统计信息:")
print(f"- 成功处理:{len(rows_to_delete)} 条数据")
print(f"- 未能匹配:{len(unmatched_rows)} 条数据")
print(f"- 总数据量:{len(df) + len(rows_to_delete)} 条数据")


# Usage example.
# Guarded so that importing this module does not immediately run the
# file-processing pipeline against the working directory.
if __name__ == "__main__":
    process_files("output.md", "example")
    process_data("data.xlsx", "example")

0 comments on commit 3cee72a

Please sign in to comment.