Showing 15 changed files with 758 additions and 0 deletions.
@@ -0,0 +1,16 @@
import wave

# Concatenate the two recorded audio clips into a single wav file.
infiles = ["3/audiop1.wav", "3/audiop2.wav"]
outfile = "3/fullAudio.wav"

data = []
for infile in infiles:
    w = wave.open(infile, 'rb')
    data.append([w.getparams(), w.readframes(w.getnframes())])
    w.close()

# Reuse the first clip's parameters and append both clips' frames in order.
output = wave.open(outfile, 'wb')
output.setparams(data[0][0])
output.writeframes(data[0][1])
output.writeframes(data[1][1])
output.close()
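One caveat with the snippet above: setparams copies the first clip's channel count, sample width, and frame rate onto the output, so the concatenation is only correct if every input shares those parameters. A minimal sketch of the same idea with that check made explicit (the helper name is illustrative, not part of the commit):

import wave

def concat_wavs(infiles, outfile):
    clips = []
    for path in infiles:
        w = wave.open(path, 'rb')
        clips.append((w.getparams(), w.readframes(w.getnframes())))
        w.close()
    # nchannels, sampwidth and framerate must match, otherwise the frames of
    # the later clips would be misinterpreted on playback.
    first = clips[0][0]
    for params, _ in clips[1:]:
        assert params[:3] == first[:3], "wav parameter mismatch"
    out = wave.open(outfile, 'wb')
    out.setparams(first)
    for _, frames in clips:
        out.writeframes(frames)
    out.close()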
@@ -0,0 +1,11 @@
import numpy as np
from scipy import misc
import random
import math

# Crop a fixed 360x512 region out of the first 20 extracted video frames.
for i in range(0, 20):
    strIndex = str(i)
    while len(strIndex) < 6:
        strIndex = "0" + strIndex
    arr = misc.imread('2/origImages/frame' + strIndex + '.jpg')
    misc.imsave('2/croppedImages/frame' + strIndex + '.jpg', arr[180:540, 384:896])
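The manual zero-padding loop above (and the similar ones in the later scripts) can also be written with the standard str.zfill, which pads a string with leading zeros to a given width:

strIndex = str(i).zfill(6)   # e.g. 7 -> "000007", equivalent to the while loop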
@@ -0,0 +1,52 @@
import numpy as np
from scipy import misc

INSPEC_WIDTH = 240  # 2 seconds
INSPEC_HEIGHT = 368


def readAndClipImage(i):
    # Out-of-range indices return a black (all-zero) spectrogram image.
    if i < 0 or i > 90:
        return np.zeros((INSPEC_HEIGHT, INSPEC_WIDTH, 1))
    arr = misc.imread('2/audioSnippets/' + str(i) + '.jpg')
    # The first image has no preceding overlap to trim; all others drop the
    # first 120 pixel columns.
    if i == 0:
        return arr[:, :]
    elif i == 90:
        return arr[:, 120:]
    else:
        return arr[:, 120:]


def getSpecAtFrame(f, w):
    specIndex = f // 300  # each spectrogram image covers 300 video frames

    arr = np.zeros((INSPEC_HEIGHT, INSPEC_WIDTH))

    specImageFile = readAndClipImage(specIndex)
    prevSpecImageFile = readAndClipImage(specIndex - 1)

    mod = f % 300  # frame offset within the current spectrogram image

    if mod < w:  # The previous 2 seconds is going to bleed into the previous section
        seamSpot = (w - mod) * 4
        arr[:, seamSpot:] = specImageFile[:, 0:mod * 4, 0]
        arr[:, :seamSpot] = prevSpecImageFile[:, 1200 - seamSpot:1200, 0]
        # 60-pixel smoothing between one portion and the next, cuz I'm fancy.
        for col in range(seamSpot, min(seamSpot + w, INSPEC_WIDTH)):
            sFrom = prevSpecImageFile[:, 1200 + col - seamSpot, 0]
            sTo = specImageFile[:, col - seamSpot, 0]
            prog = (col - seamSpot) / 60.0
            arr[:, col] = sFrom + (sTo - sFrom) * prog
    else:
        arr = specImageFile[:, (mod - w) * 4:mod * 4, 0]
    return np.asarray(arr) / 255.0


def getInSpecAtFrame(f):
    return getSpecAtFrame(f, 60)


def getOutSpecAtFrame(f):
    return getSpecAtFrame(f + 2, 2)


frameIndex = 5125

misc.imsave('dump9.png', getOutSpecAtFrame(frameIndex))
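The indexing above implies a fixed mapping between video frames and spectrogram pixels: specIndex = f // 300 means each spectrogram image spans 300 frames, and the *4 factors mean each frame occupies 4 pixel columns, so the 240-column window labelled "2 seconds" covers 60 frames (which suggests 30 fps video, an inference from the code rather than something stated in the commit). A small worked example of that arithmetic:

f = 5125
specIndex = f // 300   # -> 17: which spectrogram image the frame falls in
mod = f % 300          # -> 25: frame offset inside that image
print(specIndex, mod, mod * 4)   # 17 25 100  (pixel column of the frame)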
@@ -0,0 +1,13 @@
import face_recognition
import subprocess
from PIL import Image

image = face_recognition.load_image_file("2/origImages/frame0000.jpg")

face_locations = face_recognition.face_locations(image)

if len(face_locations) == 1:
    top, right, bottom, left = face_locations[0]
    # The original snippet references faceFolderName and frame without defining
    # them; plausible definitions are assumed here so the script runs end to end.
    faceFolderName = "2/faceImages"    # assumed output folder
    frame = Image.fromarray(image)     # PIL view of the loaded frame
    faceFilename = faceFolderName + "/" + "frame{:04d}.jpg".format(0)
    height = top - bottom              # negative: top < bottom in pixel coordinates
    # Because height is negative, the crop extends roughly 30% below the detected box.
    faceFrame = frame.crop((left, top, right, bottom - height * 0.3))
    faceFrame.save(faceFilename)
@@ -0,0 +1,5 @@
import subprocess

# Extract the audio track from the source video: 160 kbps, stereo, 44.1 kHz, no video (-vn).
command = "ffmpeg -i 3/IMG_4700.MOV -ab 160k -ac 2 -ar 44100 -vn 3/audiop2.wav"

subprocess.call(command, shell=True)
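The same extraction can be run without shell=True by passing ffmpeg an argument list, which sidesteps shell quoting issues with paths; a minimal sketch, assuming ffmpeg is on the PATH (the helper name is illustrative, not part of the commit):

import subprocess

def extract_audio(video_path, wav_path, rate=44100):
    # -vn drops the video stream; -ac and -ar set channel count and sample rate.
    subprocess.call([
        "ffmpeg", "-i", video_path,
        "-ab", "160k", "-ac", "2", "-ar", str(rate),
        "-vn", wav_path,
    ])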
@@ -0,0 +1,11 @@
from PIL import Image
from PIL import ImageFont
from PIL import ImageDraw

img = Image.open("sample_in.jpg")
draw = ImageDraw.Draw(img)
# font = ImageFont.truetype(<font-file>, <font-size>)
font = ImageFont.truetype("sans-serif.ttf", 16)
# draw.text((x, y),"Sample Text",(r,g,b))
draw.text((0, 0), "Sample Text", (255, 255, 255), font=font)
img.save('sample-out.jpg')
@@ -0,0 +1,45 @@
import face_recognition
from scipy import misc

margin = 25
maxWidth = 0
maxHeight = 0

# frames with my hand in front of my mouth
# 3197 - 3224

for i in range(0, 131663):
    strIndex = str(i)
    while len(strIndex) < 4:
        strIndex = "0" + strIndex

    image = face_recognition.load_image_file("/media/rob/Ma Book1/CS 230/videoToVoice/3/origImages/frame" + strIndex + ".jpg")
    face_landmarks_list = face_recognition.face_landmarks(image)

    if len(face_landmarks_list) >= 1:
        # Bounding box around the lips of the first detected face.
        xMin = 999999
        xMax = -999999
        yMin = 999999
        yMax = -999999

        points = face_landmarks_list[0]['bottom_lip'] + face_landmarks_list[0]['top_lip']

        for point in points:
            if point[0] < xMin:
                xMin = point[0]
            if point[0] > xMax:
                xMax = point[0]
            if point[1] < yMin:
                yMin = point[1]
            if point[1] > yMax:
                yMax = point[1]

        if yMax - yMin > maxHeight:
            maxHeight = yMax - yMin

        if xMax - xMin > maxWidth:
            maxWidth = xMax - xMin

        # Save the mouth crop (with a margin) for this frame.
        arr = misc.imread("3/origImages/frame" + strIndex + ".jpg")
        misc.imsave("3/mouthImages/frame" + strIndex + ".jpg", arr[yMin - margin:yMax + margin, xMin - margin:xMax + margin])

    print("FINISHED IMAGE #" + str(i) + ". Also, the maximum dimensions are " + str(maxWidth) + " x " + str(maxHeight))
@@ -0,0 +1,160 @@
import tensorflow as tf
import numpy as np
from scipy import misc
import random
import math
import os

# Per-frame phoneme labels, one integer class per line.
phoframeFile = open("/media/rob/Ma Book1/CS 230/videoToVoice/3/phoframes.txt", "r")

phoframes = phoframeFile.read().split("\n")

FOLDER_SAVE_NAME = "phoframe41"

if not os.path.exists(FOLDER_SAVE_NAME):
    os.makedirs(FOLDER_SAVE_NAME)

if not os.path.exists(FOLDER_SAVE_NAME + "/samples"):
    os.makedirs(FOLDER_SAVE_NAME + "/samples")

if not os.path.exists(FOLDER_SAVE_NAME + "/models"):
    os.makedirs(FOLDER_SAVE_NAME + "/models")


def weight_variable(shape):
    initial = tf.truncated_normal(shape, stddev=0.1)
    return tf.Variable(initial)


def bias_variable(shape):
    initial = tf.constant(0.1, shape=shape)
    return tf.Variable(initial)


def getRandomFrame():
    # return 5+int(math.floor(random.randrange(0, 60)))
    f = int(math.floor(random.randrange(14, 120000)))
    while not isFValid(f):  # Exclude portions of the video with no visible mouth
        f = int(math.floor(random.randrange(14, 120000)))
    return f


def isFValid(f):
    # A frame is valid only if all 29 mouth images centered on it exist.
    for nearF in range(f - 14, f + 15):
        strIndex = str(nearF)
        while len(strIndex) < 4:
            strIndex = "0" + strIndex
        if not os.path.exists('3/mouthImages/frame' + strIndex + '.jpg'):
            return False
    return True  # As of now, I can't remember where the invalid frames are.


def getInVidsAtFrame(f):
    # Stack the 29 mouth crops around frame f into one HxWx87 input tensor,
    # centering smaller crops inside the fixed canvas.
    arr = np.zeros([1, INVID_HEIGHT, INVID_WIDTH, INVID_DEPTH])
    for imageIndex in range(0, 29):
        strIndex = str(f - 14 + imageIndex)
        while len(strIndex) < 4:
            strIndex = "0" + strIndex
        newImage = misc.imread('3/mouthImages/frame' + strIndex + '.jpg')

        if newImage.shape[0] > INVID_HEIGHT:
            extraMargin = (newImage.shape[0] - INVID_HEIGHT) // 2
            newImage = newImage[extraMargin:extraMargin + INVID_HEIGHT, :, :]
        if newImage.shape[1] > INVID_WIDTH:
            extraMargin = (newImage.shape[1] - INVID_WIDTH) // 2
            newImage = newImage[:, extraMargin:extraMargin + INVID_WIDTH, :]

        h = newImage.shape[0]
        w = newImage.shape[1]
        yStart = (INVID_HEIGHT - h) // 2
        xStart = (INVID_WIDTH - w) // 2
        arr[:, yStart:yStart + h, xStart:xStart + w, imageIndex * 3:(imageIndex + 1) * 3] = newImage
    return np.asarray(arr) / 255.0


def getLabelsAtFrame(f):
    return int(phoframes[f])


INVID_WIDTH = 256   # mouth width
INVID_HEIGHT = 256  # mouth height
INVID_DEPTH = 87    # 29 images of R, G, B

PHONEME_CATEGORIES = 41

learning_rate = 0.0002

invids_ = tf.placeholder(tf.float32, (None, INVID_HEIGHT, INVID_WIDTH, INVID_DEPTH), name='invids')
labels_ = tf.placeholder(tf.int32, (None), name='labels')

### Encode the invids
conv1 = tf.layers.conv2d(inputs=invids_, filters=40, kernel_size=(5,5), strides=(2,2), padding='same', activation=tf.nn.relu)
# Now 128x128x40
maxpool1 = tf.layers.max_pooling2d(conv1, pool_size=2, strides=(2,2), padding='same')
# Now 64x64x40
conv2 = tf.layers.conv2d(inputs=maxpool1, filters=70, kernel_size=(5,5), padding='same', activation=tf.nn.relu)
# Now 64x64x70
maxpool2 = tf.layers.max_pooling2d(conv2, pool_size=2, strides=(2,2), padding='same')
# Now 32x32x70
conv3 = tf.layers.conv2d(inputs=maxpool2, filters=100, kernel_size=(5,5), padding='same', activation=tf.nn.relu)
# Now 32x32x100
maxpool3 = tf.layers.max_pooling2d(conv3, pool_size=2, strides=(2,2), padding='same')
# Now 16x16x100
conv4 = tf.layers.conv2d(inputs=maxpool3, filters=130, kernel_size=(5,5), padding='same', activation=tf.nn.relu)
# Now 16x16x130
maxpool4 = tf.layers.max_pooling2d(conv4, pool_size=4, strides=(4,4), padding='same')
# Now 4x4x130 (flatten to 2080)

maxpool4_flat = tf.reshape(maxpool4, [-1, 4*4*130])
# Now 2080

W_fc1 = weight_variable([2080, 1000])
b_fc1 = bias_variable([1000])
fc1 = tf.nn.relu(tf.matmul(maxpool4_flat, W_fc1) + b_fc1)

W_fc2 = weight_variable([1000, 300])
b_fc2 = bias_variable([300])
fc2 = tf.nn.relu(tf.matmul(fc1, W_fc2) + b_fc2)

W_fc3 = weight_variable([300, PHONEME_CATEGORIES])
b_fc3 = bias_variable([PHONEME_CATEGORIES])
logits = tf.matmul(fc2, W_fc3) + b_fc3
# Now 41 (one logit per phoneme category)
onehot_labels = tf.one_hot(indices=labels_, depth=PHONEME_CATEGORIES)  # unused; the sparse loss below takes raw indices
loss = tf.losses.sparse_softmax_cross_entropy(labels=labels_, logits=logits)

output = tf.nn.softmax(logits, name=None)

# Get cost and define the optimizer
cost = tf.reduce_mean(loss)
opt = tf.train.AdamOptimizer(learning_rate).minimize(cost)


print("made it here! :D")
sess = tf.Session()
RANGE_START = 120030
RANGE_END = 131030
epochs = 2000000
batch_size = 50
MODEL_SAVE_EVERY = 50
SAVE_FILE_START_POINT = 5750

saver = tf.train.Saver()

sess.run(tf.global_variables_initializer())

if SAVE_FILE_START_POINT >= 1:
    saver.restore(sess, FOLDER_SAVE_NAME + "/models/model" + str(SAVE_FILE_START_POINT) + ".ckpt")

print("about to start...")

# Run the restored model over the held-out frame range and dump the softmax
# probabilities, one tab-separated row per frame.
f = open(FOLDER_SAVE_NAME + '/outputted.txt', 'w')
for frame in range(RANGE_START, RANGE_END):
    invids = np.empty([0, INVID_HEIGHT, INVID_WIDTH, INVID_DEPTH])
    labels = np.empty(0)

    invids = np.vstack((invids, getInVidsAtFrame(frame)))
    labels = np.append(labels, getLabelsAtFrame(frame))

    _output, batch_cost, _logits = sess.run([output, cost, logits],
                                            feed_dict={invids_: invids, labels_: labels})

    for i in _output[0]:
        f.write(str(i) + "\t")
    f.write("\n")
    print("Done with " + str(frame - RANGE_START) + " / " + str(RANGE_END - RANGE_START))
f.close()
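The script writes one softmax row per frame to outputted.txt. A hypothetical post-processing sketch (not part of the commit) that reads those rows back and recovers the most likely phoneme class per frame via argmax:

import numpy as np

probs = np.loadtxt("phoframe41/outputted.txt")   # shape: (frames, 41)
predicted = probs.argmax(axis=1)                 # class index in [0, 40] per frame
print(predicted[:10])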