feat 改为从Excel直接统计到markdown中

W1ndys · Jan 30, 2025 · 3cee72a · 3cee72a
1 parent 3297601
commit 3cee72a
Showing 1 changed file with 141 additions and 141 deletions.
diff --git a/Easy-QFNU-Tools/Excel2md/AddNewData.py b/Easy-QFNU-Tools/Excel2md/AddNewData.py
@@ -1,152 +1,152 @@
+import pandas as pd
 import os
 import re
 
 
-def parse_course_block(content):
-    """解析单个课程块的内容"""
-    blocks = []
-    current_block = []
-    lines = content.split("\n")
-
-    for line in lines:
-        if line.startswith("## ") and current_block:
-            blocks.append("\n".join(current_block))
-            current_block = []
-        if line.strip():
-            current_block.append(line)
-
-    if current_block:
-        blocks.append("\n".join(current_block))
-
-    return blocks
-
+def parse_markdown_file(content):
+    """解析markdown文件内容，返回课程、校区、老师的层级结构"""
+    structure = {}
+    current_course = None
+    current_district = None
+    current_teacher = None
 
-def get_course_title(block):
-    """获取课程块的标题"""
-    lines = block.split("\n")
+    lines = content.split("\n")
     for line in lines:
         if line.startswith("## "):
-            return line[3:].strip()
-    return None
-
-
-def get_teacher_name(block):
-    """获取教师名称"""
-    lines = block.split("\n")
-    for line in lines:
-        if line.startswith("#### "):
-            return line[5:].strip()
-    return None
-
-
-def merge_content(target_content, new_block):
-    """在目标内容中合并新的课程块"""
-    # 获取新块的课程标题和教师名称
-    new_course_title = get_course_title(new_block)
-    teacher_name = get_teacher_name(new_block)
-
-    if not new_course_title or not teacher_name:
-        return target_content, new_block
-
-    # 在目标内容中查找对应的课程部分
-    course_pattern = f"## {new_course_title}[\\s\\S]*?(?=\\n## |$)"
-    course_match = re.search(course_pattern, target_content)
-
-    if course_match:
-        course_section = course_match.group(0)
-        # 在课程部分中查找对应教师的部分
-        teacher_pattern = f"#### {teacher_name}[\\s\\S]*?(?=\\n#### |$)"
-        teacher_match = re.search(teacher_pattern, course_section)
-
-        if teacher_match:
-            # 找到对应教师的部分
-            teacher_section = teacher_match.group(0)
-
-            # 提取新评价内容
-            new_content = []
-            found_teacher = False
-            for line in new_block.split("\n"):
-                if line.startswith("#### "):
-                    found_teacher = True
+            current_course = line[3:].strip()
+            structure[current_course] = {}
+            current_district = None
+            current_teacher = None
+        elif line.startswith("### "):
+            current_district = line[4:].strip()
+            if current_course:
+                structure[current_course][current_district] = {}
+            current_teacher = None
+        elif line.startswith("#### "):
+            current_teacher = line[5:].strip()
+            if current_course and current_district:
+                structure[current_course][current_district][current_teacher] = []
+
+    return structure
+
+
+def find_insertion_point(content, course, district, teacher):
+    """Return the line index where a new review for ``teacher`` is inserted.
+
+    Headings are compared exactly (``## course`` > ``### district`` >
+    ``#### teacher``) and each level is scoped to its parent, so a teacher
+    of the same name under another course or district is never selected.
+
+    Returns the index of the first heading line after the teacher's
+    section (``len(lines)`` when the section runs to the end of the
+    content), or -1 when no matching section exists.
+    """
+    lines = content.split("\n")
+    course, district, teacher = (str(v).strip() for v in (course, district, teacher))
+    in_course = in_district = False
+    for i, line in enumerate(lines):
+        if line.startswith("## "):
+            # A new course starts: drop the previous course/district scope.
+            in_course = line[3:].strip() == course
+            in_district = False
+        elif line.startswith("### "):
+            in_district = in_course and line[4:].strip() == district
+        elif in_district and line.startswith("#### "):
+            if line[5:].strip() != teacher:
+                continue
+            # The section ends at the next heading of level 2 or deeper.
+            for j in range(i + 1, len(lines)):
+                if lines[j].startswith("##"):
+                    return j
+            # Also covers a teacher heading on the very last line.
+            return len(lines)
+    return -1
+
+
+def process_data(excel_file, markdown_dir):
+    # 读取Excel文件
+    df = pd.read_excel(excel_file, header=None)
+    df.columns = ["course", "teacher", "district", "year", "description", "submitter"]
+
+    rows_to_delete = []
+    unmatched_rows = set()  # 使用集合来存储未匹配的行
+
+    # 遍历markdown文件
+    for filename in os.listdir(markdown_dir):
+        if filename.endswith(".md"):
+            file_path = os.path.join(markdown_dir, filename)
+            with open(file_path, "r", encoding="utf-8") as f:
+                content = f.read()
+
+            # 解析文件结构
+            structure = parse_markdown_file(content)
+            content_modified = False
+
+            # 处理每一行Excel数据
+            for index, row in df.iterrows():
+                if index in rows_to_delete:  # 跳过已经处理过的行
                     continue
-                if found_teacher and line.strip():
-                    if not line.startswith("##"):
-                        new_content.append(line)
-
-            # 只有在有新内容时才合并
-            if new_content:
-                # 合并内容，确保评价和引用之间有空行
-                merged_teacher_section = (
-                    teacher_section + "\n\n" + "\n".join(new_content)
-                )
-
-                # 更新课程部分
-                updated_course_section = course_section.replace(
-                    teacher_section, merged_teacher_section
-                )
-
-                # 更新整个内容
-                return (
-                    target_content.replace(course_section, updated_course_section),
-                    None,
-                )
-
-    return target_content, new_block
-
-
-def process_files(source_file, target_dir):
-    """处理文件合并"""
-    # 读取源文件
-    with open(source_file, "r", encoding="utf-8") as f:
-        source_content = f.read()
-
-    # 解析源文件中的课程块
-    course_blocks = parse_course_block(source_content)
-    unmerged_blocks = []
-    merged_blocks = set()  # 用于跟踪已合并的块
-
-    # 遍历目标目录中的所有markdown文件
-    for filename in os.listdir(target_dir):
-        if not filename.endswith(".md"):
-            continue
-
-        file_path = os.path.join(target_dir, filename)
-        with open(file_path, "r", encoding="utf-8") as f:
-            target_content = f.read()
-
-        modified = False
-        # 尝试合并每个课程块
-        for block in course_blocks:
-            target_content, unmerged = merge_content(target_content, block)
-            if unmerged:
-                unmerged_blocks.append(unmerged)
-            else:
-                modified = True
-                # 记录已成功合并的块的标题
-                title = get_course_title(block)
-                if title:
-                    merged_blocks.add(title)
-
-        # 如果有修改，写入文件
-        if modified:
-            with open(file_path, "w", encoding="utf-8") as f:
-                f.write(target_content)
-
-    # 只保留未合并的块
-    final_unmerged = []
-    for block in unmerged_blocks:
-        title = get_course_title(block)
-        if title and title not in merged_blocks:
-            final_unmerged.append(block)
-
-    # 更新源文件，只包含未合并的内容
-    if final_unmerged:
-        with open(source_file, "w", encoding="utf-8") as f:
-            f.write("\n\n".join(final_unmerged))
-    else:
-        # 如果所有内容都已合并，清空源文件
-        open(source_file, "w", encoding="utf-8").close()
+
+                course = row["course"]
+                district = row["district"]
+                teacher = row["teacher"]
+
+                # 检查是否存在对应的结构
+                if (
+                    course in structure
+                    and district in structure[course]
+                    and teacher in structure[course][district]
+                ):
+
+                    # 找到插入点
+                    insert_point = find_insertion_point(
+                        content, course, district, teacher
+                    )
+                    if insert_point != -1:
+                        # 构建新内容
+                        new_content = f"\n{row['description']}\n\n> {row['submitter']}({row['year']}年)\n"
+
+                        # 插入新内容
+                        content_lines = content.split("\n")
+                        content_lines.insert(insert_point, new_content)
+                        content = "\n".join(content_lines)
+
+                        # 标记要删除的行
+                        rows_to_delete.append(index)
+                        # 如果之前被标记为未匹配，现在移除
+                        unmatched_rows.discard(index)
+                        content_modified = True
+                        print(
+                            f"✅ 成功添加: {course} - {district} - {teacher} - {row['submitter']}"
+                        )
+                else:
+                    # 只有当这行数据还没有被成功处理时，才标记为未匹配
+                    if index not in rows_to_delete:
+                        unmatched_rows.add(index)
+
+            # 只有在文件被修改时才写回
+            if content_modified:
+                with open(file_path, "w", encoding="utf-8") as f:
+                    f.write(content)
+
+    # 打印未匹配的数据
+    if unmatched_rows:
+        print("\n❌ 以下数据未找到匹配位置：")
+        for index in sorted(unmatched_rows):
+            row = df.iloc[index]
+            print(
+                f"- {row['course']} - {row['district']} - {row['teacher']} - {row['submitter']}"
+            )
+
+    # 删除已处理的行
+    if rows_to_delete:
+        df = df.drop(rows_to_delete)
+        df.to_excel(excel_file, index=False, header=False)
+        print(f"\n📊 统计信息：")
+        print(f"- 成功处理：{len(rows_to_delete)} 条数据")
+        print(f"- 未能匹配：{len(unmatched_rows)} 条数据")
+        print(f"- 总数据量：{len(df) + len(rows_to_delete)} 条数据")
 
 
 # 使用示例
-process_files("output.md", "example")
+process_data("data.xlsx", "example")