# utils.py
import os

import cv2
import mediapipe as mp
import tensorflow as tf

# Vocabulary the model predicts over: letters, a few symbols, digits, and space.
vocab = [x for x in "abcdefghijklmnopqrstuvwxyz'?!123456789 "]
# Mapping characters to integers.
char_to_num = tf.keras.layers.StringLookup(vocabulary=vocab, oov_token="")
# Mapping integers back to original characters
num_to_char = tf.keras.layers.StringLookup(
    vocabulary=char_to_num.get_vocabulary(), oov_token="", invert=True
)
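
# Example (a minimal sketch): round-tripping text through the two lookups.
#   nums = char_to_num(tf.strings.unicode_split("hello", input_encoding='UTF-8'))
#   tf.strings.reduce_join(num_to_char(nums))  # -> tf.Tensor(b'hello', shape=(), dtype=string)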


def detect_mouth(video_path):
    # Initialize the Mediapipe face detection model.
    mp_face_detection = mp.solutions.face_detection.FaceDetection(
        model_selection=1, min_detection_confidence=0.5
    )
    # Open the video file.
    cap = cv2.VideoCapture(video_path)
    mouth_bounding_boxes = []
    while cap.isOpened():
        # Read a frame from the video.
        ret, frame = cap.read()
        if not ret:
            break
        # Convert the frame to RGB (Mediapipe requires RGB images).
        frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        # Process the frame with the face detection model.
        results = mp_face_detection.process(frame_rgb)
        # Check if a face is detected in the frame.
        if results.detections:
            # Get the bounding box of the first detected face.
            face_bounding_box = results.detections[0].location_data.relative_bounding_box
            ih, iw, _ = frame.shape
            # Approximate the mouth as the central 80% of the face width and
            # the lower 40% of the face height, converted to pixel coordinates.
            mouth_left_x = int((face_bounding_box.xmin + 0.1 * face_bounding_box.width) * iw)
            mouth_right_x = int((face_bounding_box.xmin + 0.9 * face_bounding_box.width) * iw)
            mouth_top_y = int((face_bounding_box.ymin + 0.6 * face_bounding_box.height) * ih)
            mouth_bottom_y = int((face_bounding_box.ymin + face_bounding_box.height) * ih)
            mouth_bounding_boxes.append((mouth_left_x, mouth_top_y, mouth_right_x, mouth_bottom_y))
    # Release the video capture object and the detector, and close any OpenCV windows.
    cap.release()
    mp_face_detection.close()
    cv2.destroyAllWindows()
    return mouth_bounding_boxes
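
# Example (a minimal sketch; 'sample.mpg' is a hypothetical path):
#   boxes = detect_mouth('sample.mpg')
#   # One (left_x, top_y, right_x, bottom_y) tuple, in pixels, for each frame
#   # in which a face was detected; frames with no detection are skipped.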


# Earlier variant of load_video that crops using the detected mouth boxes.
# Kept for reference; superseded by the fixed-crop version below.
# def load_video(path: str) -> tf.Tensor:
#     # Detect mouth bounding box coordinates.
#     mouth_bounding_boxes = detect_mouth(path)
#     desired_height = 46   # Replace with your desired height
#     desired_width = 140   # Replace with your desired width
#     # Open the video file.
#     cap = cv2.VideoCapture(path)
#     frames = []
#     for _ in range(int(cap.get(cv2.CAP_PROP_FRAME_COUNT))):
#         ret, frame = cap.read()
#         if not ret:
#             break
#         # Crop and resize the frame using the mouth bounding box coordinates.
#         for mouth_box in mouth_bounding_boxes:
#             mouth_left_x, mouth_top_y, mouth_right_x, mouth_bottom_y = mouth_box
#             frame_cropped_resized = frame[mouth_top_y:mouth_bottom_y, mouth_left_x:mouth_right_x, :]
#             frame_cropped_resized = tf.image.rgb_to_grayscale(frame_cropped_resized)
#             frame_cropped_resized = tf.image.resize(frame_cropped_resized, (desired_height, desired_width))
#             frame_cropped_resized = tf.reshape(frame_cropped_resized, (desired_height, desired_width, 1))
#             frames.append(frame_cropped_resized)
#     cap.release()
#     mean = tf.math.reduce_mean(frames)
#     std = tf.math.reduce_std(tf.cast(frames, tf.float32))
#     processed_frames = tf.cast((frames - mean), tf.float32) / std
#     # Convert processed_frames back to uint8 (optional):
#     # processed_frames = tf.clip_by_value(processed_frames, -1, 1)
#     # processed_frames = tf.cast(((processed_frames + 1) * 127.5), tf.uint8)
#     # Pad or truncate processed_frames to exactly 75 frames.
#     if len(processed_frames) < 75:
#         processed_frames = tf.pad(processed_frames, [[0, 75 - len(processed_frames)], [0, 0], [0, 0], [0, 0]])
#     elif len(processed_frames) > 75:
#         processed_frames = processed_frames[:75]
#     return processed_frames


def load_video(path: str) -> tf.Tensor:
    cap = cv2.VideoCapture(path)
    frames = []
    for _ in range(int(cap.get(cv2.CAP_PROP_FRAME_COUNT))):
        ret, frame = cap.read()
        if not ret:
            break
        frame = tf.image.rgb_to_grayscale(frame)
        # Fixed crop around the mouth region (hard-coded for this dataset's
        # framing); yields 46x140 single-channel frames.
        frames.append(frame[190:236, 80:220, :])
    cap.release()
    # Normalise the frames to zero mean and unit variance.
    mean = tf.math.reduce_mean(frames)
    std = tf.math.reduce_std(tf.cast(frames, tf.float32))
    return tf.cast((frames - mean), tf.float32) / std
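
# Example (a minimal sketch; the file name is hypothetical):
#   frames = load_video(os.path.join('..', 'data', 's1', 'bbal6n.mpg'))
#   frames.shape  # -> (num_frames, 46, 140, 1) given the crop above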


def load_alignments(path: str) -> tf.Tensor:
    with open(path, 'r') as f:
        lines = f.readlines()
    tokens = []
    for line in lines:
        line = line.split()
        # Skip silence markers; keep the spoken words, separated by spaces.
        if line[2] != 'sil':
            tokens = [*tokens, ' ', line[2]]
    # Split the tokens into characters, encode them as integers, and drop the
    # leading space.
    return char_to_num(tf.reshape(tf.strings.unicode_split(tokens, input_encoding='UTF-8'), (-1)))[1:]
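
# Example (a minimal sketch; the file name is hypothetical):
#   nums = load_alignments(os.path.join('..', 'data', 'alignments', 's1', 'bbal6n.align'))
#   tf.strings.reduce_join(num_to_char(nums))  # -> the transcript as a byte string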


def load_data(path: str):
    path = bytes.decode(path.numpy())
    # Extract the file name without its extension; normalising the separators
    # first makes this work for both POSIX and Windows style paths.
    file_name = path.replace('\\', '/').split('/')[-1].split('.')[0]
    video_path = os.path.join('..', 'data', 's1', f'{file_name}.mpg')
    alignment_path = os.path.join('..', 'data', 'alignments', 's1', f'{file_name}.align')
    # Decode the video on the CPU.
    with tf.device('/cpu:0'):
        frames = load_video(video_path)
    # alignments = load_alignments(alignment_path)
    return frames
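

# A minimal usage sketch, assuming the '../data' layout used above. load_data
# takes a tf.string tensor, so it is wrapped in tf.py_function when mapped
# over a tf.data pipeline.
if __name__ == "__main__":
    dataset = tf.data.Dataset.list_files(os.path.join('..', 'data', 's1', '*.mpg'))
    dataset = dataset.map(lambda p: tf.py_function(load_data, [p], tf.float32))
    for frames in dataset.take(1):
        print(frames.shape)  # e.g. (75, 46, 140, 1), depending on the clip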