carykh authored Mar 23, 2018
1 parent 94589c3 commit e61158c
Showing 15 changed files with 758 additions and 0 deletions.
16 changes: 16 additions & 0 deletions audioStitcher.py
@@ -0,0 +1,16 @@
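# Stitch two WAV snippets into a single file: reuse the first clip's wave
# params and write both clips' frames back to back (assumes matching formats).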
import wave

infiles = ["3/audiop1.wav", "3/audiop2.wav"]
outfile = "3/fullAudio.wav"

data = []
for infile in infiles:
    w = wave.open(infile, 'rb')
    data.append([w.getparams(), w.readframes(w.getnframes())])
    w.close()

output = wave.open(outfile, 'wb')
output.setparams(data[0][0])
output.writeframes(data[0][1])
output.writeframes(data[1][1])
output.close()
11 changes: 11 additions & 0 deletions cropper.py
@@ -0,0 +1,11 @@
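# Crop the first 20 source frames down to a fixed 360x512 region
# (rows 180-540, columns 384-896) and save them to 2/croppedImages.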
import numpy as np
from scipy import misc
import random
import math

for i in range(0, 20):
    strIndex = str(i)
    while len(strIndex) < 6:
        strIndex = "0" + strIndex
    arr = misc.imread('2/origImages/frame' + strIndex + '.jpg')
    misc.imsave('2/croppedImages/frame' + strIndex + '.jpg', arr[180:540, 384:896])
52 changes: 52 additions & 0 deletions dump.py
@@ -0,0 +1,52 @@
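# Debug dump: rebuild the spectrogram window around one frame (blending the
# current snippet image into the previous one at the seam) and save it as a PNG.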
import numpy as np
from scipy import misc

INSPEC_WIDTH = 240 # 2 seconds
INSPEC_HEIGHT = 368

def readAndClipImage(i):
    if i < 0 or i > 90:
        return np.zeros((INSPEC_HEIGHT, INSPEC_WIDTH, 1))
    arr = misc.imread('2/audioSnippets/' + str(i) + '.jpg')
    if i == 0:
        return arr[:, :]
    elif i == 90:
        return arr[:, 120:]
    else:
        return arr[:, 120:]


def getSpecAtFrame(f, w):
    specIndex = (f // 300)

    arr = np.zeros((INSPEC_HEIGHT, INSPEC_WIDTH))

    specImageFile = readAndClipImage(specIndex)
    prevSpecImageFile = readAndClipImage(specIndex - 1)

    mod = f % 300

    if mod < w:  # The previous 2 seconds is going to bleed into the previous section
        seamSpot = (w - mod) * 4
        arr[:, seamSpot:] = specImageFile[:, 0:mod * 4, 0]
        arr[:, :seamSpot] = prevSpecImageFile[:, 1200 - seamSpot:1200, 0]
        for col in range(seamSpot, min(seamSpot + w, INSPEC_WIDTH)):  # 60-pixel smoothing between one portion and the next, cuz I'm fancy.
            sFrom = prevSpecImageFile[:, 1200 + col - seamSpot, 0]
            sTo = specImageFile[:, col - seamSpot, 0]
            prog = (col - seamSpot) / 60.0
            arr[:, col] = sFrom + (sTo - sFrom) * prog
    else:
        arr = specImageFile[:, (mod - w) * 4:mod * 4, 0]
    return np.asarray(arr) / 255.0

def getInSpecAtFrame(f):
    return getSpecAtFrame(f, 60)

def getOutSpecAtFrame(f):
    return getSpecAtFrame(f + 2, 2)


frameIndex = 5125

misc.imsave('dump9.png',getOutSpecAtFrame(frameIndex))

13 changes: 13 additions & 0 deletions faceReadTest.py
@@ -0,0 +1,13 @@
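# Single-image face-detection test: find the one face in frame0000.jpg and save
# a crop of it. faceFolderName below is a placeholder output path.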
import face_recognition
import subprocess
from PIL import Image

faceFolderName = "2/faceImages"  # hypothetical output folder (not defined in this snippet)

image = face_recognition.load_image_file("2/origImages/frame0000.jpg")

face_locations = face_recognition.face_locations(image)

if len(face_locations) == 1:
    top, right, bottom, left = face_locations[0]
    faceFilename = faceFolderName + "/" + "frame{:04d}.jpg".format(0)
    height = top - bottom
    frame = Image.fromarray(image)  # wrap the numpy array so PIL can crop it
    faceFrame = frame.crop((left, top, right, bottom - height * 0.3))
    faceFrame.save(faceFilename)
5 changes: 5 additions & 0 deletions getAudio.py
@@ -0,0 +1,5 @@
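# Extract the audio track from the source video with ffmpeg:
# -ab 160k bitrate, -ac 2 stereo, -ar 44100 sample rate, -vn drops the video stream.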
import subprocess

command = "ffmpeg -i 3/IMG_4700.MOV -ab 160k -ac 2 -ar 44100 -vn 3/audiop2.wav"

subprocess.call(command, shell=True)
11 changes: 11 additions & 0 deletions imageTest.py
@@ -0,0 +1,11 @@
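# PIL text-overlay test: draw "Sample Text" in white at the top-left corner of
# an image and save the result.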
from PIL import Image
from PIL import ImageFont
from PIL import ImageDraw

img = Image.open("sample_in.jpg")
draw = ImageDraw.Draw(img)
# font = ImageFont.truetype(<font-file>, <font-size>)
font = ImageFont.truetype("sans-serif.ttf", 16)
# draw.text((x, y),"Sample Text",(r,g,b))
draw.text((0, 0),"Sample Text",(255,255,255),font=font)
img.save('sample-out.jpg')
45 changes: 45 additions & 0 deletions lipTester.py
@@ -0,0 +1,45 @@
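# Scan every frame, locate the lip landmarks with face_recognition, and save a
# mouth crop (with a 25-pixel margin) while tracking the largest crop dimensions seen.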
import face_recognition
from scipy import misc
margin = 25
maxWidth = 0
maxHeight = 0

# frames with my hand in front of my mouth
# 3197 - 3224


for i in range(0, 131663):
    strIndex = str(i)
    while len(strIndex) < 4:
        strIndex = "0" + strIndex

    image = face_recognition.load_image_file("/media/rob/Ma Book1/CS 230/videoToVoice/3/origImages/frame" + strIndex + ".jpg")
    face_landmarks_list = face_recognition.face_landmarks(image)

    if len(face_landmarks_list) >= 1:
        xMin = 999999
        xMax = -999999
        yMin = 999999
        yMax = -999999

        points = face_landmarks_list[0]['bottom_lip'] + face_landmarks_list[0]['top_lip']

        for point in points:
            if point[0] < xMin:
                xMin = point[0]
            if point[0] > xMax:
                xMax = point[0]
            if point[1] < yMin:
                yMin = point[1]
            if point[1] > yMax:
                yMax = point[1]

        if yMax - yMin > maxHeight:
            maxHeight = yMax - yMin

        if xMax - xMin > maxWidth:
            maxWidth = xMax - xMin

        arr = misc.imread("3/origImages/frame" + strIndex + ".jpg")
        misc.imsave("3/mouthImages/frame" + strIndex + ".jpg", arr[yMin - margin:yMax + margin, xMin - margin:xMax + margin])
    print("FINISHED IMAGE #" + str(i) + ". Also, the maximum dimensions are " + str(maxWidth) + " x " + str(maxHeight))
160 changes: 160 additions & 0 deletions phoframeTest.py
@@ -0,0 +1,160 @@
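# Phoneme-classification test: a CNN maps a stack of 29 consecutive 256x256
# mouth crops (RGB channels concatenated to depth 87) to one of 41 phoneme
# categories. This script restores a saved checkpoint and writes the softmax
# output for frames 120030-131030 to outputted.txt.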
import tensorflow as tf
import numpy as np
from scipy import misc
import random
import math
import os

phoframeFile = open("/media/rob/Ma Book1/CS 230/videoToVoice/3/phoframes.txt","r")

phoframes = phoframeFile.read().split("\n")

FOLDER_SAVE_NAME = "phoframe41"

if not os.path.exists(FOLDER_SAVE_NAME):
    os.makedirs(FOLDER_SAVE_NAME)

if not os.path.exists(FOLDER_SAVE_NAME + "/samples"):
    os.makedirs(FOLDER_SAVE_NAME + "/samples")

if not os.path.exists(FOLDER_SAVE_NAME + "/models"):
    os.makedirs(FOLDER_SAVE_NAME + "/models")

def weight_variable(shape):
    initial = tf.truncated_normal(shape, stddev=0.1)
    return tf.Variable(initial)

def bias_variable(shape):
    initial = tf.constant(0.1, shape=shape)
    return tf.Variable(initial)

def getRandomFrame():
    #return 5+int(math.floor(random.randrange(0, 60)))
    f = int(math.floor(random.randrange(14, 120000)))
    while not isFValid(f):  # Exclude portions of the video with no visible mouth
        f = int(math.floor(random.randrange(14, 120000)))
    return f

def isFValid(f):
    for nearF in range(f - 14, f + 15):
        strIndex = str(nearF)
        while len(strIndex) < 4:
            strIndex = "0" + strIndex
        if not os.path.exists('3/mouthImages/frame' + strIndex + '.jpg'):
            return False
    return True  # As of now, I can't remember where the invalid frames are.

def getInVidsAtFrame(f):
    arr = np.zeros([1, INVID_HEIGHT, INVID_WIDTH, INVID_DEPTH])
    for imageIndex in range(0, 29):
        strIndex = str(f - 14 + imageIndex)
        while len(strIndex) < 4:
            strIndex = "0" + strIndex
        newImage = misc.imread('3/mouthImages/frame' + strIndex + '.jpg')

        if newImage.shape[0] > INVID_HEIGHT:
            extraMargin = (newImage.shape[0] - INVID_HEIGHT) // 2
            newImage = newImage[extraMargin:extraMargin + INVID_HEIGHT, :, :]
        if newImage.shape[1] > INVID_WIDTH:
            extraMargin = (newImage.shape[1] - INVID_WIDTH) // 2
            newImage = newImage[:, extraMargin:extraMargin + INVID_WIDTH, :]

        h = newImage.shape[0]
        w = newImage.shape[1]
        yStart = (INVID_HEIGHT - h) // 2
        xStart = (INVID_WIDTH - w) // 2
        arr[:, yStart:yStart + h, xStart:xStart + w, imageIndex * 3:(imageIndex + 1) * 3] = newImage
    return np.asarray(arr) / 255.0

def getLabelsAtFrame(f):
    return int(phoframes[f])

INVID_WIDTH = 256 # mouth width
INVID_HEIGHT = 256 # mouth height
INVID_DEPTH = 87 # 29 images of R, G, B

PHONEME_CATEGORIES = 41

learning_rate = 0.0002

invids_ = tf.placeholder(tf.float32, (None, INVID_HEIGHT, INVID_WIDTH, INVID_DEPTH), name='invids')
labels_ = tf.placeholder(tf.int32, (None), name='labels')

### Encode the invids
conv1 = tf.layers.conv2d(inputs=invids_, filters=40, kernel_size=(5,5), strides=(2,2), padding='same', activation=tf.nn.relu)
# Now 128x128x40
maxpool1 = tf.layers.max_pooling2d(conv1, pool_size=2, strides=(2,2), padding='same')
# Now 64x64x40
conv2 = tf.layers.conv2d(inputs=maxpool1, filters=70, kernel_size=(5,5), padding='same', activation=tf.nn.relu)
# Now 64x64x70
maxpool2 = tf.layers.max_pooling2d(conv2, pool_size=2, strides=(2,2), padding='same')
# Now 32x32x70
conv3 = tf.layers.conv2d(inputs=maxpool2, filters=100, kernel_size=(5,5), padding='same', activation=tf.nn.relu)
# Now 32x32x100
maxpool3 = tf.layers.max_pooling2d(conv3, pool_size=2, strides=(2,2), padding='same')
# Now 16x16x100
conv4 = tf.layers.conv2d(inputs=maxpool3, filters=130, kernel_size=(5,5), padding='same', activation=tf.nn.relu)
# Now 16x16x130
maxpool4 = tf.layers.max_pooling2d(conv4, pool_size=4, strides=(4,4), padding='same')
# Now 4x4x130 (flatten to 2080)

maxpool4_flat = tf.reshape(maxpool4, [-1,4*4*130])
# Now 2080

W_fc1 = weight_variable([2080, 1000])
b_fc1 = bias_variable([1000])
fc1 = tf.nn.relu(tf.matmul(maxpool4_flat, W_fc1) + b_fc1)

W_fc2 = weight_variable([1000, 300])
b_fc2 = bias_variable([300])
fc2 = tf.nn.relu(tf.matmul(fc1, W_fc2) + b_fc2)

W_fc3 = weight_variable([300, PHONEME_CATEGORIES])
b_fc3 = bias_variable([PHONEME_CATEGORIES])
logits = tf.matmul(fc2, W_fc3) + b_fc3
# Now 41 (one logit per phoneme category)
onehot_labels = tf.one_hot(indices=labels_, depth=PHONEME_CATEGORIES)  # unused: the sparse loss below takes integer labels directly
loss = tf.losses.sparse_softmax_cross_entropy(labels=labels_, logits=logits)

output = tf.nn.softmax(logits,name=None)

# Get cost and define the optimizer
cost = tf.reduce_mean(loss)
opt = tf.train.AdamOptimizer(learning_rate).minimize(cost)



print("made it here! :D")
sess = tf.Session()
RANGE_START = 120030
RANGE_END = 131030
epochs = 2000000
batch_size = 50
MODEL_SAVE_EVERY = 50
SAVE_FILE_START_POINT = 5750

saver = tf.train.Saver()

sess.run(tf.global_variables_initializer())

if SAVE_FILE_START_POINT >= 1:
    saver.restore(sess, FOLDER_SAVE_NAME + "/models/model" + str(SAVE_FILE_START_POINT) + ".ckpt")

print("about to start...")

f = open(FOLDER_SAVE_NAME+'/outputted.txt','w')
for frame in range(RANGE_START, RANGE_END):
    invids = np.empty([0, INVID_HEIGHT, INVID_WIDTH, INVID_DEPTH])
    labels = np.empty(0)

    invids = np.vstack((invids, getInVidsAtFrame(frame)))
    labels = np.append(labels, getLabelsAtFrame(frame))

    _output, batch_cost, _logits = sess.run([output, cost, logits],
                                            feed_dict={invids_: invids, labels_: labels})

    for i in _output[0]:
        f.write(str(i) + "\t")
    f.write("\n")
    print("Done with " + str(frame - RANGE_START) + " / " + str(RANGE_END - RANGE_START))
f.close()