Merge branch 'feature/transcriptions-webhook-endpoint' of github.com:ls1intum/Pyris into feature/transcriptions-webhook-endpoint
ls1intum · Feb 11, 2025 · c81c09e
2 parents 5036396 + 5bd5e0d
commit c81c09e
Show file tree

Hide file tree

Showing 5 changed files with 46 additions and 39 deletions.
diff --git a/app/domain/data/metrics/transcription_dto.py b/app/domain/data/metrics/transcription_dto.py
@@ -4,22 +4,23 @@
 
 
 class TranscriptionSegmentDTO(BaseModel):
-    start_time: float = Field(default=0.0, alias="startTime")
-    end_time: float = Field(default=0.0, alias="endTime")
-    text: str = Field(default="", alias="text")
+    start_time: float = Field(..., alias="startTime")
+    end_time: float = Field(..., alias="endTime")
+    text: str = Field(..., alias="text")
     slide_number: int = Field(default=0, alias="slideNumber")
-    lecture_unit_id: int = Field(default=0, alias="lectureUnitId")
+    # lecture_unit_id: int = Field(..., alias="lectureUnitId")
 
 
 class TranscriptionDTO(BaseModel):
     language: str = Field(default="en", alias="language")
-    segments: List[TranscriptionSegmentDTO] = Field(default=[], alias="segments")
+    segments: List[TranscriptionSegmentDTO] = Field(..., alias="segments")
 
 
 class TranscriptionWebhookDTO(BaseModel):
-    transcription: TranscriptionDTO = Field(alias="transcription")
-    lecture_id: int = Field(alias="lectureId")
-    lecture_name: str = Field(default="", alias="lectureName")
-    course_id: int = Field(alias="courseId")
-    course_name: str = Field(default="", alias="courseName")
-    course_description: str = Field(default="", alias="courseDescription")
+    transcription: TranscriptionDTO = Field(..., alias="transcription")
+    lecture_id: int = Field(..., alias="lectureId")
+    lecture_name: str = Field(..., alias="lectureName")
+    course_id: int = Field(..., alias="courseId")
+    course_name: str = Field(..., alias="courseName")
+    lecture_unit_id: int = Field(..., alias="lectureUnitId")
+    # course_description: str = Field(..., alias="courseDescription")
diff --git a/app/pipeline/prompts/transcription_ingestion_prompts.py b/app/pipeline/prompts/transcription_ingestion_prompts.py
@@ -1,6 +1,12 @@
 def transcription_summary_prompt(lecture_name: str, chunk_content: str):
     return f"""
-        You are a helpful assistant. A snippet of the spoken content of one lecture of the lecture {lecture_name} will be given to you, summarize the information without adding details and return only the summary nothing more.
+        You are an excellent tutor with deep expertise in computer science and practical applications,
+        teaching at the university level.
+        A snippet of the spoken content of one lecture of the lecture {lecture_name} will be given to you.
+        Please accurately follow the instructions below.
+        1. Summarize the information in a clear and accurate manner.
+        2. Do not add additional information.
+        3. Only answer in complete sentences.
         This is the text you should summarize:
         {chunk_content}
     """
diff --git a/app/pipeline/transcription_ingestion_pipeline.py b/app/pipeline/transcription_ingestion_pipeline.py
@@ -1,4 +1,3 @@
-import threading
 from functools import reduce
 from typing import Optional, List, Dict, Any
 
@@ -25,6 +24,7 @@
 )
 from app.llm.langchain import IrisLangchainChatModel
 from app.pipeline import Pipeline
+from app.pipeline.faq_ingestion_pipeline import batch_update_lock
 from app.pipeline.prompts.transcription_ingestion_prompts import (
     transcription_summary_prompt,
 )
@@ -34,7 +34,7 @@
 )
 from app.web.status.transcription_ingestion_callback import TranscriptionIngestionStatus
 
-batch_insert_lock = threading.Lock()
+batch_insert_lock = batch_update_lock
 
 CHUNK_SEPARATOR_CHAR = "\31"
 
@@ -75,7 +75,7 @@ def __call__(self) -> None:
         try:
             self.callback.in_progress("Chunking transcriptions")
             chunks = self.chunk_transcriptions(self.dto.transcriptions)
-
+            logger.info("chunked data")
             self.callback.in_progress("Summarizing transcriptions")
             chunks = self.summarize_chunks(chunks)
 
@@ -118,19 +118,19 @@ def chunk_transcriptions(
         for transcription in transcriptions:
             slide_chunks = {}
             for segment in transcription.transcription.segments:
-                slide_key = f"{transcription.lecture_id}_{segment.lecture_unit_id}_{segment.slide_number}"
+                slide_key = f"{transcription.lecture_id}_{transcription.lecture_unit_id}_{segment.slide_number}"
 
                 if slide_key not in slide_chunks:
                     chunk = {
                         LectureTranscriptionSchema.COURSE_ID.value: transcription.course_id,
                         LectureTranscriptionSchema.COURSE_NAME.value: transcription.course_name,
                         LectureTranscriptionSchema.LECTURE_ID.value: transcription.lecture_id,
                         LectureTranscriptionSchema.LECTURE_NAME.value: transcription.lecture_name,
+                        LectureTranscriptionSchema.LECTURE_UNIT_ID.value: transcription.lecture_unit_id,
                         LectureTranscriptionSchema.LANGUAGE.value: transcription.transcription.language,
-                        LectureTranscriptionSchema.SEGMENT_START.value: segment.start_time,
-                        LectureTranscriptionSchema.SEGMENT_END.value: segment.end_time,
+                        LectureTranscriptionSchema.SEGMENT_START_TIME.value: segment.start_time,
+                        LectureTranscriptionSchema.SEGMENT_END_TIME.value: segment.end_time,
                         LectureTranscriptionSchema.SEGMENT_TEXT.value: segment.text,
-                        LectureTranscriptionSchema.SEGMENT_LECTURE_UNIT_SLIDES_ID.value: segment.lecture_unit_id,
                         LectureTranscriptionSchema.SEGMENT_LECTURE_UNIT_SLIDE_NUMBER.value: segment.slide_number,
                     }
 
@@ -140,14 +140,14 @@ def chunk_transcriptions(
                         LectureTranscriptionSchema.SEGMENT_TEXT.value
                     ] += (CHUNK_SEPARATOR_CHAR + segment.text)
                     slide_chunks[slide_key][
-                        LectureTranscriptionSchema.SEGMENT_END.value
+                        LectureTranscriptionSchema.SEGMENT_END_TIME.value
                     ] = segment.end_time
 
             for i, segment in enumerate(slide_chunks.values()):
                 # If the segment is shorter than 1200 characters, we can just add it as is
                 if len(segment[LectureTranscriptionSchema.SEGMENT_TEXT.value]) < 1200:
                     # Add the segment to the chunks list and replace the chunk separator character with a space
-                    segment[LectureTranscriptionSchema.SEGMENT_TEXT.value] = self.replace_seperator_char(segment[
+                    segment[LectureTranscriptionSchema.SEGMENT_TEXT.value] = self.replace_separator_char(segment[
                         LectureTranscriptionSchema.SEGMENT_TEXT.value
                     ])
                     chunks.append(segment)
@@ -163,7 +163,7 @@ def chunk_transcriptions(
                 # Calculate the offset of the current slide chunk to the start of the transcript
                 offset_slide_chunk = reduce(
                     lambda acc, txt: acc
-                                     + len(self.remove_seperator_char(txt)),
+                                     + len(self.remove_separator_char(txt)),
                     map(
                         lambda seg: seg[
                             LectureTranscriptionSchema.SEGMENT_TEXT.value
@@ -175,7 +175,7 @@ def chunk_transcriptions(
                 offset_start = offset_slide_chunk
                 for j, chunk in enumerate(semantic_chunks):
                     offset_end = offset_start + len(
-                        self.remove_seperator_char(chunk)
+                        self.remove_separator_char(chunk)
                     )
 
                     start_time = self.get_transcription_segment_of_char_position(
@@ -188,9 +188,9 @@ def chunk_transcriptions(
                     chunks.append(
                         {
                             **segment,
-                            LectureTranscriptionSchema.SEGMENT_START.value: start_time,
-                            LectureTranscriptionSchema.SEGMENT_END.value: end_time,
-                            LectureTranscriptionSchema.SEGMENT_TEXT.value: self.cleanup_chunk(self.replace_seperator_char(chunk)),
+                            LectureTranscriptionSchema.SEGMENT_START_TIME.value: start_time,
+                            LectureTranscriptionSchema.SEGMENT_END_TIME.value: end_time,
+                            LectureTranscriptionSchema.SEGMENT_TEXT.value: self.cleanup_chunk(self.replace_separator_char(chunk)),
                         }
                     )
                     offset_start = offset_end + 1
@@ -216,11 +216,11 @@ def cleanup_chunk(text: str):
         return text.replace("  ", " ").strip()
 
     @staticmethod
-    def replace_seperator_char(text: str, replace_with: str = " ") -> str:
+    def replace_separator_char(text: str, replace_with: str = " ") -> str:
         return text.replace(CHUNK_SEPARATOR_CHAR, replace_with)
 
-    def remove_seperator_char(self, text: str) -> str:
-        return self.replace_seperator_char(text, "")
+    def remove_separator_char(self, text: str) -> str:
+        return self.replace_separator_char(text, "")
 
     def summarize_chunks(self, chunks):
         chunks_with_summaries = []

diff --git a/app/vector_database/lecture_transcription_schema.py b/app/vector_database/lecture_transcription_schema.py
@@ -16,11 +16,11 @@ class LectureTranscriptionSchema(Enum):
     COURSE_NAME = "course_name"
     LECTURE_ID = "lecture_id"
     LECTURE_NAME = "lecture_name"
+    LECTURE_UNIT_ID = "lecture_unit_id"
     LANGUAGE = "language"
-    SEGMENT_START = "segment_start"
-    SEGMENT_END = "segment_end"
+    SEGMENT_START_TIME = "segment_start_time"
+    SEGMENT_END_TIME = "segment_end_time"
     SEGMENT_TEXT = "segment_text"
-    SEGMENT_LECTURE_UNIT_SLIDES_ID = "segment_lecture_unit_slides_id"
     SEGMENT_LECTURE_UNIT_SLIDE_NUMBER = "segment_lecture_unit_slide_number"
     SEGMENT_SUMMARY = "segment_summary"
 
@@ -67,14 +67,14 @@ def init_lecture_transcription_schema(client: WeaviateClient) -> Collection:
                 index_searchable=False,
             ),
             Property(
-                name=LectureTranscriptionSchema.SEGMENT_START.value,
-                description="The start of the segment",
+                name=LectureTranscriptionSchema.SEGMENT_START_TIME.value,
+                description="The start time of the segment",
                 data_type=DataType.NUMBER,
                 index_searchable=False,
             ),
             Property(
-                name=LectureTranscriptionSchema.SEGMENT_END.value,
-                description="The end of the segment",
+                name=LectureTranscriptionSchema.SEGMENT_END_TIME.value,
+                description="The end time of the segment",
                 data_type=DataType.NUMBER,
                 index_searchable=False,
             ),
@@ -85,8 +85,8 @@ def init_lecture_transcription_schema(client: WeaviateClient) -> Collection:
                 index_searchable=True,
             ),
             Property(
-                name=LectureTranscriptionSchema.SEGMENT_LECTURE_UNIT_SLIDES_ID.value,
-                description="The id of the lecture unit slides of the segment",
+                name=LectureTranscriptionSchema.LECTURE_UNIT_ID.value,
+                description="The id of the lecture unit of the transcription",
                 data_type=DataType.INT,
                 index_searchable=False,
             ),

diff --git a/app/web/routers/webhooks.py b/app/web/routers/webhooks.py
@@ -191,7 +191,7 @@ def transcription_ingestion_webhook(dto: TranscriptionIngestionPipelineExecution
     """
     Webhook endpoint to trigger the lecture transcription ingestion pipeline
     """
-    print(f"transcription ingestion got DTO {dto}")
+    logger.info(f"transcription ingestion got DTO {dto}")
     thread = Thread(target=run_transcription_ingestion_pipeline_worker, args=(dto,))
     thread.start()