From 3cee72acb843e907c357e6d3aabceb77ce7c8d0d Mon Sep 17 00:00:00 2001
From: W1ndys
Date: Thu, 30 Jan 2025 20:48:58 +0800
Subject: [PATCH] feat: pull entries from Excel directly into the markdown files
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 Easy-QFNU-Tools/Excel2md/AddNewData.py | 282 ++++++++++++-------------
 1 file changed, 141 insertions(+), 141 deletions(-)

diff --git a/Easy-QFNU-Tools/Excel2md/AddNewData.py b/Easy-QFNU-Tools/Excel2md/AddNewData.py
index ee410a76..28db713e 100644
--- a/Easy-QFNU-Tools/Excel2md/AddNewData.py
+++ b/Easy-QFNU-Tools/Excel2md/AddNewData.py
@@ -1,152 +1,152 @@
+import pandas as pd
 import os
 import re
 
 
-def parse_course_block(content):
-    """Split the source content into individual course blocks."""
-    blocks = []
-    current_block = []
-    lines = content.split("\n")
-
-    for line in lines:
-        if line.startswith("## ") and current_block:
-            blocks.append("\n".join(current_block))
-            current_block = []
-        if line.strip():
-            current_block.append(line)
-
-    if current_block:
-        blocks.append("\n".join(current_block))
-
-    return blocks
-
-
-def get_course_title(block):
-    """Return the course title of a block."""
-    lines = block.split("\n")
-    for line in lines:
-        if line.startswith("## "):
-            return line[3:].strip()
-    return None
-
-
-def get_teacher_name(block):
-    """Return the teacher name of a block."""
-    lines = block.split("\n")
-    for line in lines:
-        if line.startswith("#### "):
-            return line[5:].strip()
-    return None
-
-
-def merge_content(target_content, new_block):
-    """Merge a new course block into the target content."""
-    # Get the course title and teacher name of the new block
-    new_course_title = get_course_title(new_block)
-    teacher_name = get_teacher_name(new_block)
-
-    if not new_course_title or not teacher_name:
-        return target_content, new_block
-
-    # Find the matching course section in the target content
-    course_pattern = f"## {new_course_title}[\\s\\S]*?(?=\\n## |$)"
-    course_match = re.search(course_pattern, target_content)
-
-    if course_match:
-        course_section = course_match.group(0)
-        # Find the matching teacher section within the course section
-        teacher_pattern = f"#### {teacher_name}[\\s\\S]*?(?=\\n#### |$)"
-        teacher_match = re.search(teacher_pattern, course_section)
-
-        if teacher_match:
-            # Found the teacher section
-            teacher_section = teacher_match.group(0)
-
-            # Extract the new review text
-            new_content = []
-            found_teacher = False
-            for line in new_block.split("\n"):
-                if line.startswith("#### "):
-                    found_teacher = True
-                    continue
-                if found_teacher and line.strip():
-                    if not line.startswith("##"):
-                        new_content.append(line)
-
-            # Merge only when there is new content
-            if new_content:
-                # Merge, keeping a blank line between the review and the quote
-                merged_teacher_section = (
-                    teacher_section + "\n\n" + "\n".join(new_content)
-                )
-
-                # Update the course section
-                updated_course_section = course_section.replace(
-                    teacher_section, merged_teacher_section
-                )
-
-                # Update the full content
-                return (
-                    target_content.replace(course_section, updated_course_section),
-                    None,
-                )
-
-    return target_content, new_block
-
-
-def process_files(source_file, target_dir):
-    """Merge the source file into the target markdown files."""
-    # Read the source file
-    with open(source_file, "r", encoding="utf-8") as f:
-        source_content = f.read()
-
-    # Parse the course blocks in the source file
-    course_blocks = parse_course_block(source_content)
-    unmerged_blocks = []
-    merged_blocks = set()  # Track blocks that have already been merged
-
-    # Iterate over all markdown files in the target directory
-    for filename in os.listdir(target_dir):
-        if not filename.endswith(".md"):
-            continue
-
-        file_path = os.path.join(target_dir, filename)
-        with open(file_path, "r", encoding="utf-8") as f:
-            target_content = f.read()
-
-        modified = False
-        # Try to merge every course block
-        for block in course_blocks:
-            target_content, unmerged = merge_content(target_content, block)
-            if unmerged:
-                unmerged_blocks.append(unmerged)
-            else:
-                modified = True
-                # Record the title of the successfully merged block
-                title = get_course_title(block)
-                if title:
-                    merged_blocks.add(title)
-
-        # Write the file back if it was modified
-        if modified:
-            with open(file_path, "w", encoding="utf-8") as f:
-                f.write(target_content)
-
-    # Keep only the blocks that were not merged
-    final_unmerged = []
-    for block in unmerged_blocks:
-        title = get_course_title(block)
-        if title and title not in merged_blocks:
-            final_unmerged.append(block)
-
-    # Rewrite the source file with only the unmerged content
-    if final_unmerged:
-        with open(source_file, "w", encoding="utf-8") as f:
-            f.write("\n\n".join(final_unmerged))
-    else:
-        # Clear the source file once everything has been merged
-        open(source_file, "w", encoding="utf-8").close()
+def parse_markdown_file(content):
+    """Parse markdown content into a course -> campus -> teacher hierarchy."""
+    structure = {}
+    current_course = None
+    current_district = None
+    current_teacher = None
+
+    lines = content.split("\n")
+    for line in lines:
+        if line.startswith("## "):
+            current_course = line[3:].strip()
+            structure[current_course] = {}
+            current_district = None
+            current_teacher = None
+        elif line.startswith("### "):
+            current_district = line[4:].strip()
+            if current_course:
+                structure[current_course][current_district] = {}
+            current_teacher = None
+        elif line.startswith("#### "):
+            current_teacher = line[5:].strip()
+            if current_course and current_district:
+                structure[current_course][current_district][current_teacher] = []
+
+    return structure
+
+
+def find_insertion_point(content, course, district, teacher):
+    """Find the line index at which the new entry should be inserted."""
+    lines = content.split("\n")
+    course_pattern = f"## {course}"
+    district_pattern = f"### {district}"
+    teacher_pattern = f"#### {teacher}"
+
+    # Locate the course, campus and teacher headings
+    course_found = False
+    district_found = False
+    teacher_found = False
+
+    for i, line in enumerate(lines):
+        if line.startswith(course_pattern):
+            course_found = True
+        elif course_found and line.startswith(district_pattern):
+            district_found = True
+        elif district_found and line.startswith(teacher_pattern):
+            teacher_found = True
+            # After the teacher heading, scan down past the last existing comment
+            for j in range(i + 1, len(lines)):
+                if (
+                    lines[j].startswith("##")
+                    or lines[j].startswith("###")
+                    or lines[j].startswith("####")
+                ):
+                    return j
+                if j == len(lines) - 1:
+                    return j + 1
+
+    return -1
+
+
+def process_data(excel_file, markdown_dir):
+    # Read the Excel file
+    df = pd.read_excel(excel_file, header=None)
+    df.columns = ["course", "teacher", "district", "year", "description", "submitter"]
+
+    rows_to_delete = []
+    unmatched_rows = set()  # Use a set to store unmatched rows
+
+    # Iterate over the markdown files
+    for filename in os.listdir(markdown_dir):
+        if filename.endswith(".md"):
+            file_path = os.path.join(markdown_dir, filename)
+            with open(file_path, "r", encoding="utf-8") as f:
+                content = f.read()
+
+            # Parse the file structure
+            structure = parse_markdown_file(content)
+            content_modified = False
+
+            # Process every row of the Excel data
+            for index, row in df.iterrows():
+                if index in rows_to_delete:  # Skip rows that were already handled
+                    continue
+
+                course = row["course"]
+                district = row["district"]
+                teacher = row["teacher"]
+
+                # Check whether the matching structure exists
+                if (
+                    course in structure
+                    and district in structure[course]
+                    and teacher in structure[course][district]
+                ):
+
+                    # Find the insertion point
+                    insert_point = find_insertion_point(
+                        content, course, district, teacher
+                    )
+                    if insert_point != -1:
+                        # Build the new entry
+                        new_content = f"\n{row['description']}\n\n> {row['submitter']}({row['year']}年)\n"
+
+                        # Insert the new entry
+                        content_lines = content.split("\n")
+                        content_lines.insert(insert_point, new_content)
+                        content = "\n".join(content_lines)
+
+                        # Mark the row for deletion
+                        rows_to_delete.append(index)
+                        # If it was previously marked as unmatched, remove it
+                        unmatched_rows.discard(index)
+                        content_modified = True
+                        print(
+                            f"✅ Added: {course} - {district} - {teacher} - {row['submitter']}"
+                        )
+                else:
+                    # Only mark the row as unmatched if it has not been processed yet
+                    if index not in rows_to_delete:
+                        unmatched_rows.add(index)
+
+            # Write the file back only if it was modified
+            if content_modified:
+                with open(file_path, "w", encoding="utf-8") as f:
+                    f.write(content)
+
+    # Print the rows that could not be matched
+    if unmatched_rows:
+        print("\n❌ No matching location was found for the following rows:")
+        for index in sorted(unmatched_rows):
+            row = df.iloc[index]
+            print(
+                f"- {row['course']} - {row['district']} - {row['teacher']} - {row['submitter']}"
+            )
+
+    # Drop the rows that have been processed
+    if rows_to_delete:
+        df = df.drop(rows_to_delete)
+        df.to_excel(excel_file, index=False, header=False)
+
+    print("\n📊 Summary:")
+    print(f"- Processed: {len(rows_to_delete)} rows")
+    print(f"- Unmatched: {len(unmatched_rows)} rows")
+    print(f"- Total rows: {len(df) + len(rows_to_delete)}")
 
 
 # Example usage
-process_files("output.md", "example")
+process_data("data.xlsx", "example")
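
For context, below is a minimal sketch (not part of the patch or the repository) of the inputs the rewritten script appears to assume: an Excel workbook with no header row, columns in the order set by df.columns (course, teacher, district, year, description, submitter), and markdown files whose headings follow the "## course" / "### campus" / "#### teacher" hierarchy that parse_markdown_file() walks. The file names data.xlsx and example/ mirror the call at the end of AddNewData.py; the sample row values and the courses.md file name are invented placeholders.

# Sketch of the assumed inputs; run this before process_data("data.xlsx", "example").
import os

import pandas as pd

os.makedirs("example", exist_ok=True)

# One review row, no header, in the column order that process_data() assigns
# via df.columns: course, teacher, district, year, description, submitter.
# The values are placeholders, not real data.
pd.DataFrame(
    [["高等数学", "张三", "曲阜校区", 2024, "讲得很清楚", "W1ndys"]]
).to_excel("data.xlsx", index=False, header=False)

# A markdown file whose headings match the row above, so the entry would be
# appended under "#### 张三" by find_insertion_point().
with open(os.path.join("example", "courses.md"), "w", encoding="utf-8") as f:
    f.write("## 高等数学\n\n### 曲阜校区\n\n#### 张三\n")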