Skip to content

Commit

Permalink
Merge branch 'feature/transcriptions-webhook-endpoint' of github.com:ls1intum/Pyris into feature/transcriptions-webhook-endpoint
Browse files Browse the repository at this point in the history
  • Loading branch information
sebastianloose committed Feb 11, 2025
2 parents 5036396 + 5bd5e0d commit c81c09e
Show file tree
Hide file tree
Showing 5 changed files with 46 additions and 39 deletions.
23 changes: 12 additions & 11 deletions app/domain/data/metrics/transcription_dto.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,22 +4,23 @@


class TranscriptionSegmentDTO(BaseModel):
start_time: float = Field(default=0.0, alias="startTime")
end_time: float = Field(default=0.0, alias="endTime")
text: str = Field(default="", alias="text")
start_time: float = Field(..., alias="startTime")
end_time: float = Field(..., alias="endTime")
text: str = Field(..., alias="text")
slide_number: int = Field(default=0, alias="slideNumber")
lecture_unit_id: int = Field(default=0, alias="lectureUnitId")
# lecture_unit_id: int = Field(..., alias="lectureUnitId")


class TranscriptionDTO(BaseModel):
language: str = Field(default="en", alias="language")
segments: List[TranscriptionSegmentDTO] = Field(default=[], alias="segments")
segments: List[TranscriptionSegmentDTO] = Field(..., alias="segments")


class TranscriptionWebhookDTO(BaseModel):
transcription: TranscriptionDTO = Field(alias="transcription")
lecture_id: int = Field(alias="lectureId")
lecture_name: str = Field(default="", alias="lectureName")
course_id: int = Field(alias="courseId")
course_name: str = Field(default="", alias="courseName")
course_description: str = Field(default="", alias="courseDescription")
transcription: TranscriptionDTO = Field(..., alias="transcription")
lecture_id: int = Field(..., alias="lectureId")
lecture_name: str = Field(..., alias="lectureName")
course_id: int = Field(..., alias="courseId")
course_name: str = Field(..., alias="courseName")
lecture_unit_id: int = Field(..., alias="lectureUnitId")
# course_description: str = Field(..., alias="courseDescription")
8 changes: 7 additions & 1 deletion app/pipeline/prompts/transcription_ingestion_prompts.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,12 @@
def transcription_summary_prompt(lecture_name: str, chunk_content: str):
return f"""
You are a helpful assistant. A snippet of the spoken content of one lecture of the lecture {lecture_name} will be given to you, summarize the information without adding details and return only the summary nothing more.
You are an excellent tutor with deep expertise in computer science and practical applications,
teaching at the university level.
A snippet of the spoken content of one lecture of the lecture {lecture_name} will be given to you.
Please accurately follow the instructions below.
1. Summarize the information in a clear and accurate manner.
2. Do not add additional information.
3. Only answer in complete sentences.
This is the text you should summarize:
{chunk_content}
"""
34 changes: 17 additions & 17 deletions app/pipeline/transcription_ingestion_pipeline.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
import threading
from functools import reduce
from typing import Optional, List, Dict, Any

Expand All @@ -25,6 +24,7 @@
)
from app.llm.langchain import IrisLangchainChatModel
from app.pipeline import Pipeline
from app.pipeline.faq_ingestion_pipeline import batch_update_lock
from app.pipeline.prompts.transcription_ingestion_prompts import (
transcription_summary_prompt,
)
Expand All @@ -34,7 +34,7 @@
)
from app.web.status.transcription_ingestion_callback import TranscriptionIngestionStatus

batch_insert_lock = threading.Lock()
batch_insert_lock = batch_update_lock

CHUNK_SEPARATOR_CHAR = "\31"

Expand Down Expand Up @@ -75,7 +75,7 @@ def __call__(self) -> None:
try:
self.callback.in_progress("Chunking transcriptions")
chunks = self.chunk_transcriptions(self.dto.transcriptions)

logger.info("chunked data")
self.callback.in_progress("Summarizing transcriptions")
chunks = self.summarize_chunks(chunks)

Expand Down Expand Up @@ -118,19 +118,19 @@ def chunk_transcriptions(
for transcription in transcriptions:
slide_chunks = {}
for segment in transcription.transcription.segments:
slide_key = f"{transcription.lecture_id}_{segment.lecture_unit_id}_{segment.slide_number}"
slide_key = f"{transcription.lecture_id}_{transcription.lecture_unit_id}_{segment.slide_number}"

if slide_key not in slide_chunks:
chunk = {
LectureTranscriptionSchema.COURSE_ID.value: transcription.course_id,
LectureTranscriptionSchema.COURSE_NAME.value: transcription.course_name,
LectureTranscriptionSchema.LECTURE_ID.value: transcription.lecture_id,
LectureTranscriptionSchema.LECTURE_NAME.value: transcription.lecture_name,
LectureTranscriptionSchema.LECTURE_UNIT_ID.value: transcription.lecture_unit_id,
LectureTranscriptionSchema.LANGUAGE.value: transcription.transcription.language,
LectureTranscriptionSchema.SEGMENT_START.value: segment.start_time,
LectureTranscriptionSchema.SEGMENT_END.value: segment.end_time,
LectureTranscriptionSchema.SEGMENT_START_TIME.value: segment.start_time,
LectureTranscriptionSchema.SEGMENT_END_TIME.value: segment.end_time,
LectureTranscriptionSchema.SEGMENT_TEXT.value: segment.text,
LectureTranscriptionSchema.SEGMENT_LECTURE_UNIT_SLIDES_ID.value: segment.lecture_unit_id,
LectureTranscriptionSchema.SEGMENT_LECTURE_UNIT_SLIDE_NUMBER.value: segment.slide_number,
}

Expand All @@ -140,14 +140,14 @@ def chunk_transcriptions(
LectureTranscriptionSchema.SEGMENT_TEXT.value
] += (CHUNK_SEPARATOR_CHAR + segment.text)
slide_chunks[slide_key][
LectureTranscriptionSchema.SEGMENT_END.value
LectureTranscriptionSchema.SEGMENT_END_TIME.value
] = segment.end_time

for i, segment in enumerate(slide_chunks.values()):
# If the segment is shorter than 1200 characters, we can just add it as is
if len(segment[LectureTranscriptionSchema.SEGMENT_TEXT.value]) < 1200:
# Add the segment to the chunks list and replace the chunk separator character with a space
segment[LectureTranscriptionSchema.SEGMENT_TEXT.value] = self.replace_seperator_char(segment[
segment[LectureTranscriptionSchema.SEGMENT_TEXT.value] = self.replace_separator_char(segment[
LectureTranscriptionSchema.SEGMENT_TEXT.value
])
chunks.append(segment)
Expand All @@ -163,7 +163,7 @@ def chunk_transcriptions(
# Calculate the offset of the current slide chunk to the start of the transcript
offset_slide_chunk = reduce(
lambda acc, txt: acc
+ len(self.remove_seperator_char(txt)),
+ len(self.remove_separator_char(txt)),
map(
lambda seg: seg[
LectureTranscriptionSchema.SEGMENT_TEXT.value
Expand All @@ -175,7 +175,7 @@ def chunk_transcriptions(
offset_start = offset_slide_chunk
for j, chunk in enumerate(semantic_chunks):
offset_end = offset_start + len(
self.remove_seperator_char(chunk)
self.remove_separator_char(chunk)
)

start_time = self.get_transcription_segment_of_char_position(
Expand All @@ -188,9 +188,9 @@ def chunk_transcriptions(
chunks.append(
{
**segment,
LectureTranscriptionSchema.SEGMENT_START.value: start_time,
LectureTranscriptionSchema.SEGMENT_END.value: end_time,
LectureTranscriptionSchema.SEGMENT_TEXT.value: self.cleanup_chunk(self.replace_seperator_char(chunk)),
LectureTranscriptionSchema.SEGMENT_START_TIME.value: start_time,
LectureTranscriptionSchema.SEGMENT_END_TIME.value: end_time,
LectureTranscriptionSchema.SEGMENT_TEXT.value: self.cleanup_chunk(self.replace_separator_char(chunk)),
}
)
offset_start = offset_end + 1
Expand All @@ -216,11 +216,11 @@ def cleanup_chunk(text: str):
return text.replace(" ", " ").strip()

@staticmethod
def replace_seperator_char(text: str, replace_with: str = " ") -> str:
def replace_separator_char(text: str, replace_with: str = " ") -> str:
return text.replace(CHUNK_SEPARATOR_CHAR, replace_with)

def remove_seperator_char(self, text: str) -> str:
return self.replace_seperator_char(text, "")
def remove_separator_char(self, text: str) -> str:
return self.replace_separator_char(text, "")

def summarize_chunks(self, chunks):
chunks_with_summaries = []
Expand Down
18 changes: 9 additions & 9 deletions app/vector_database/lecture_transcription_schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,11 +16,11 @@ class LectureTranscriptionSchema(Enum):
COURSE_NAME = "course_name"
LECTURE_ID = "lecture_id"
LECTURE_NAME = "lecture_name"
LECTURE_UNIT_ID = "lecture_unit_id"
LANGUAGE = "language"
SEGMENT_START = "segment_start"
SEGMENT_END = "segment_end"
SEGMENT_START_TIME = "segment_start_time"
SEGMENT_END_TIME = "segment_end_time"
SEGMENT_TEXT = "segment_text"
SEGMENT_LECTURE_UNIT_SLIDES_ID = "segment_lecture_unit_slides_id"
SEGMENT_LECTURE_UNIT_SLIDE_NUMBER = "segment_lecture_unit_slide_number"
SEGMENT_SUMMARY = "segment_summary"

Expand Down Expand Up @@ -67,14 +67,14 @@ def init_lecture_transcription_schema(client: WeaviateClient) -> Collection:
index_searchable=False,
),
Property(
name=LectureTranscriptionSchema.SEGMENT_START.value,
description="The start of the segment",
name=LectureTranscriptionSchema.SEGMENT_START_TIME.value,
description="The start time of the segment",
data_type=DataType.NUMBER,
index_searchable=False,
),
Property(
name=LectureTranscriptionSchema.SEGMENT_END.value,
description="The end of the segment",
name=LectureTranscriptionSchema.SEGMENT_END_TIME.value,
description="The end time of the segment",
data_type=DataType.NUMBER,
index_searchable=False,
),
Expand All @@ -85,8 +85,8 @@ def init_lecture_transcription_schema(client: WeaviateClient) -> Collection:
index_searchable=True,
),
Property(
name=LectureTranscriptionSchema.SEGMENT_LECTURE_UNIT_SLIDES_ID.value,
description="The id of the lecture unit slides of the segment",
name=LectureTranscriptionSchema.LECTURE_UNIT_ID.value,
description="The id of the lecture unit of the transcription",
data_type=DataType.INT,
index_searchable=False,
),
Expand Down
2 changes: 1 addition & 1 deletion app/web/routers/webhooks.py
Original file line number Diff line number Diff line change
Expand Up @@ -191,7 +191,7 @@ def transcription_ingestion_webhook(dto: TranscriptionIngestionPipelineExecution
"""
Webhook endpoint to trigger the lecture transcription ingestion pipeline
"""
print(f"transcription ingestion got DTO {dto}")
logger.info(f"transcription ingestion got DTO {dto}")
thread = Thread(target=run_transcription_ingestion_pipeline_worker, args=(dto,))
thread.start()

Expand Down

0 comments on commit c81c09e

Please sign in to comment.