Showing 15 changed files with 758 additions and 0 deletions.
@@ -0,0 +1,16 @@
import wave

# Concatenate the two recorded audio clips into a single wav file.
infiles = ["3/audiop1.wav", "3/audiop2.wav"]
outfile = "3/fullAudio.wav"

data = []
for infile in infiles:
    w = wave.open(infile, 'rb')
    data.append([w.getparams(), w.readframes(w.getnframes())])
    w.close()

# Reuse the first clip's parameters and append both clips' frames in order.
output = wave.open(outfile, 'wb')
output.setparams(data[0][0])
output.writeframes(data[0][1])
output.writeframes(data[1][1])
output.close()
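One caveat with the snippet above: setparams copies the first clip's channel count, sample width, and frame rate onto the output, so the concatenation is only correct if every input shares those parameters. A minimal sketch of the same idea with that check made explicit (the helper name is illustrative, not part of the commit):

import wave

def concat_wavs(infiles, outfile):
    clips = []
    for path in infiles:
        w = wave.open(path, 'rb')
        clips.append((w.getparams(), w.readframes(w.getnframes())))
        w.close()
    # nchannels, sampwidth and framerate must match, otherwise the frames of
    # the later clips would be misinterpreted on playback.
    first = clips[0][0]
    for params, _ in clips[1:]:
        assert params[:3] == first[:3], "wav parameter mismatch"
    out = wave.open(outfile, 'wb')
    out.setparams(first)
    for _, frames in clips:
        out.writeframes(frames)
    out.close()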
@@ -0,0 +1,11 @@
import numpy as np
from scipy import misc
import random
import math

# Crop a fixed 360x512 region out of the first 20 extracted video frames.
for i in range(0, 20):
    strIndex = str(i)
    while len(strIndex) < 6:
        strIndex = "0" + strIndex
    arr = misc.imread('2/origImages/frame' + strIndex + '.jpg')
    misc.imsave('2/croppedImages/frame' + strIndex + '.jpg', arr[180:540, 384:896])
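The manual zero-padding loop above (and the similar ones in the later scripts) can also be written with the standard str.zfill, which pads a string with leading zeros to a given width:

strIndex = str(i).zfill(6)   # e.g. 7 -> "000007", equivalent to the while loop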
@@ -0,0 +1,52 @@
import numpy as np
from scipy import misc

INSPEC_WIDTH = 240  # 2 seconds
INSPEC_HEIGHT = 368


def readAndClipImage(i):
    # Out-of-range indices return a black (all-zero) spectrogram image.
    if i < 0 or i > 90:
        return np.zeros((INSPEC_HEIGHT, INSPEC_WIDTH, 1))
    arr = misc.imread('2/audioSnippets/' + str(i) + '.jpg')
    # The first image has no preceding overlap to trim; all others drop the
    # first 120 pixel columns.
    if i == 0:
        return arr[:, :]
    elif i == 90:
        return arr[:, 120:]
    else:
        return arr[:, 120:]


def getSpecAtFrame(f, w):
    specIndex = f // 300  # each spectrogram image covers 300 video frames

    arr = np.zeros((INSPEC_HEIGHT, INSPEC_WIDTH))

    specImageFile = readAndClipImage(specIndex)
    prevSpecImageFile = readAndClipImage(specIndex - 1)

    mod = f % 300  # frame offset within the current spectrogram image

    if mod < w:  # The previous 2 seconds is going to bleed into the previous section
        seamSpot = (w - mod) * 4
        arr[:, seamSpot:] = specImageFile[:, 0:mod * 4, 0]
        arr[:, :seamSpot] = prevSpecImageFile[:, 1200 - seamSpot:1200, 0]
        # 60-pixel smoothing between one portion and the next, cuz I'm fancy.
        for col in range(seamSpot, min(seamSpot + w, INSPEC_WIDTH)):
            sFrom = prevSpecImageFile[:, 1200 + col - seamSpot, 0]
            sTo = specImageFile[:, col - seamSpot, 0]
            prog = (col - seamSpot) / 60.0
            arr[:, col] = sFrom + (sTo - sFrom) * prog
    else:
        arr = specImageFile[:, (mod - w) * 4:mod * 4, 0]
    return np.asarray(arr) / 255.0


def getInSpecAtFrame(f):
    return getSpecAtFrame(f, 60)


def getOutSpecAtFrame(f):
    return getSpecAtFrame(f + 2, 2)


frameIndex = 5125

misc.imsave('dump9.png', getOutSpecAtFrame(frameIndex))
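The indexing above implies a fixed mapping between video frames and spectrogram pixels: specIndex = f // 300 means each spectrogram image spans 300 frames, and the *4 factors mean each frame occupies 4 pixel columns, so the 240-column window labelled "2 seconds" covers 60 frames (which suggests 30 fps video, an inference from the code rather than something stated in the commit). A small worked example of that arithmetic:

f = 5125
specIndex = f // 300   # -> 17: which spectrogram image the frame falls in
mod = f % 300          # -> 25: frame offset inside that image
print(specIndex, mod, mod * 4)   # 17 25 100  (pixel column of the frame)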
@@ -0,0 +1,13 @@
import face_recognition
import subprocess
from PIL import Image

image = face_recognition.load_image_file("2/origImages/frame0000.jpg")

face_locations = face_recognition.face_locations(image)

if len(face_locations) == 1:
    top, right, bottom, left = face_locations[0]
    # The original snippet references faceFolderName and frame without defining
    # them; plausible definitions are assumed here so the script runs end to end.
    faceFolderName = "2/faceImages"    # assumed output folder
    frame = Image.fromarray(image)     # PIL view of the loaded frame
    faceFilename = faceFolderName + "/" + "frame{:04d}.jpg".format(0)
    height = top - bottom              # negative: top < bottom in pixel coordinates
    # Because height is negative, the crop extends roughly 30% below the detected box.
    faceFrame = frame.crop((left, top, right, bottom - height * 0.3))
    faceFrame.save(faceFilename)
@@ -0,0 +1,5 @@
import subprocess

# Extract the audio track from the source video: 160 kbps, stereo, 44.1 kHz, no video (-vn).
command = "ffmpeg -i 3/IMG_4700.MOV -ab 160k -ac 2 -ar 44100 -vn 3/audiop2.wav"

subprocess.call(command, shell=True)
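The same extraction can be run without shell=True by passing ffmpeg an argument list, which sidesteps shell quoting issues with paths; a minimal sketch, assuming ffmpeg is on the PATH (the helper name is illustrative, not part of the commit):

import subprocess

def extract_audio(video_path, wav_path, rate=44100):
    # -vn drops the video stream; -ac and -ar set channel count and sample rate.
    subprocess.call([
        "ffmpeg", "-i", video_path,
        "-ab", "160k", "-ac", "2", "-ar", str(rate),
        "-vn", wav_path,
    ])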
@@ -0,0 +1,11 @@
from PIL import Image
from PIL import ImageFont
from PIL import ImageDraw

img = Image.open("sample_in.jpg")
draw = ImageDraw.Draw(img)
# font = ImageFont.truetype(<font-file>, <font-size>)
font = ImageFont.truetype("sans-serif.ttf", 16)
# draw.text((x, y),"Sample Text",(r,g,b))
draw.text((0, 0), "Sample Text", (255, 255, 255), font=font)
img.save('sample-out.jpg')
@@ -0,0 +1,45 @@
import face_recognition
from scipy import misc

margin = 25
maxWidth = 0
maxHeight = 0

# frames with my hand in front of my mouth
# 3197 - 3224

for i in range(0, 131663):
    strIndex = str(i)
    while len(strIndex) < 4:
        strIndex = "0" + strIndex

    image = face_recognition.load_image_file("/media/rob/Ma Book1/CS 230/videoToVoice/3/origImages/frame" + strIndex + ".jpg")
    face_landmarks_list = face_recognition.face_landmarks(image)

    if len(face_landmarks_list) >= 1:
        # Bounding box around the lips of the first detected face.
        xMin = 999999
        xMax = -999999
        yMin = 999999
        yMax = -999999

        points = face_landmarks_list[0]['bottom_lip'] + face_landmarks_list[0]['top_lip']

        for point in points:
            if point[0] < xMin:
                xMin = point[0]
            if point[0] > xMax:
                xMax = point[0]
            if point[1] < yMin:
                yMin = point[1]
            if point[1] > yMax:
                yMax = point[1]

        if yMax - yMin > maxHeight:
            maxHeight = yMax - yMin

        if xMax - xMin > maxWidth:
            maxWidth = xMax - xMin

        # Save the mouth crop (with a margin) for this frame.
        arr = misc.imread("3/origImages/frame" + strIndex + ".jpg")
        misc.imsave("3/mouthImages/frame" + strIndex + ".jpg", arr[yMin - margin:yMax + margin, xMin - margin:xMax + margin])

    print("FINISHED IMAGE #" + str(i) + ". Also, the maximum dimensions are " + str(maxWidth) + " x " + str(maxHeight))
@@ -0,0 +1,160 @@
import tensorflow as tf
import numpy as np
from scipy import misc
import random
import math
import os

# Per-frame phoneme labels, one integer class per line.
phoframeFile = open("/media/rob/Ma Book1/CS 230/videoToVoice/3/phoframes.txt", "r")

phoframes = phoframeFile.read().split("\n")

FOLDER_SAVE_NAME = "phoframe41"

if not os.path.exists(FOLDER_SAVE_NAME):
    os.makedirs(FOLDER_SAVE_NAME)

if not os.path.exists(FOLDER_SAVE_NAME + "/samples"):
    os.makedirs(FOLDER_SAVE_NAME + "/samples")

if not os.path.exists(FOLDER_SAVE_NAME + "/models"):
    os.makedirs(FOLDER_SAVE_NAME + "/models")


def weight_variable(shape):
    initial = tf.truncated_normal(shape, stddev=0.1)
    return tf.Variable(initial)


def bias_variable(shape):
    initial = tf.constant(0.1, shape=shape)
    return tf.Variable(initial)


def getRandomFrame():
    # return 5+int(math.floor(random.randrange(0, 60)))
    f = int(math.floor(random.randrange(14, 120000)))
    while not isFValid(f):  # Exclude portions of the video with no visible mouth
        f = int(math.floor(random.randrange(14, 120000)))
    return f


def isFValid(f):
    # A frame is valid only if all 29 mouth images centered on it exist.
    for nearF in range(f - 14, f + 15):
        strIndex = str(nearF)
        while len(strIndex) < 4:
            strIndex = "0" + strIndex
        if not os.path.exists('3/mouthImages/frame' + strIndex + '.jpg'):
            return False
    return True  # As of now, I can't remember where the invalid frames are.


def getInVidsAtFrame(f):
    # Stack the 29 mouth crops around frame f into one HxWx87 input tensor,
    # centering smaller crops inside the fixed canvas.
    arr = np.zeros([1, INVID_HEIGHT, INVID_WIDTH, INVID_DEPTH])
    for imageIndex in range(0, 29):
        strIndex = str(f - 14 + imageIndex)
        while len(strIndex) < 4:
            strIndex = "0" + strIndex
        newImage = misc.imread('3/mouthImages/frame' + strIndex + '.jpg')

        if newImage.shape[0] > INVID_HEIGHT:
            extraMargin = (newImage.shape[0] - INVID_HEIGHT) // 2
            newImage = newImage[extraMargin:extraMargin + INVID_HEIGHT, :, :]
        if newImage.shape[1] > INVID_WIDTH:
            extraMargin = (newImage.shape[1] - INVID_WIDTH) // 2
            newImage = newImage[:, extraMargin:extraMargin + INVID_WIDTH, :]

        h = newImage.shape[0]
        w = newImage.shape[1]
        yStart = (INVID_HEIGHT - h) // 2
        xStart = (INVID_WIDTH - w) // 2
        arr[:, yStart:yStart + h, xStart:xStart + w, imageIndex * 3:(imageIndex + 1) * 3] = newImage
    return np.asarray(arr) / 255.0


def getLabelsAtFrame(f):
    return int(phoframes[f])


INVID_WIDTH = 256   # mouth width
INVID_HEIGHT = 256  # mouth height
INVID_DEPTH = 87    # 29 images of R, G, B

PHONEME_CATEGORIES = 41

learning_rate = 0.0002

invids_ = tf.placeholder(tf.float32, (None, INVID_HEIGHT, INVID_WIDTH, INVID_DEPTH), name='invids')
labels_ = tf.placeholder(tf.int32, (None), name='labels')

### Encode the invids
conv1 = tf.layers.conv2d(inputs=invids_, filters=40, kernel_size=(5,5), strides=(2,2), padding='same', activation=tf.nn.relu)
# Now 128x128x40
maxpool1 = tf.layers.max_pooling2d(conv1, pool_size=2, strides=(2,2), padding='same')
# Now 64x64x40
conv2 = tf.layers.conv2d(inputs=maxpool1, filters=70, kernel_size=(5,5), padding='same', activation=tf.nn.relu)
# Now 64x64x70
maxpool2 = tf.layers.max_pooling2d(conv2, pool_size=2, strides=(2,2), padding='same')
# Now 32x32x70
conv3 = tf.layers.conv2d(inputs=maxpool2, filters=100, kernel_size=(5,5), padding='same', activation=tf.nn.relu)
# Now 32x32x100
maxpool3 = tf.layers.max_pooling2d(conv3, pool_size=2, strides=(2,2), padding='same')
# Now 16x16x100
conv4 = tf.layers.conv2d(inputs=maxpool3, filters=130, kernel_size=(5,5), padding='same', activation=tf.nn.relu)
# Now 16x16x130
maxpool4 = tf.layers.max_pooling2d(conv4, pool_size=4, strides=(4,4), padding='same')
# Now 4x4x130 (flatten to 2080)

maxpool4_flat = tf.reshape(maxpool4, [-1, 4*4*130])
# Now 2080

W_fc1 = weight_variable([2080, 1000])
b_fc1 = bias_variable([1000])
fc1 = tf.nn.relu(tf.matmul(maxpool4_flat, W_fc1) + b_fc1)

W_fc2 = weight_variable([1000, 300])
b_fc2 = bias_variable([300])
fc2 = tf.nn.relu(tf.matmul(fc1, W_fc2) + b_fc2)

W_fc3 = weight_variable([300, PHONEME_CATEGORIES])
b_fc3 = bias_variable([PHONEME_CATEGORIES])
logits = tf.matmul(fc2, W_fc3) + b_fc3
# Now 41 (one logit per phoneme category)
onehot_labels = tf.one_hot(indices=labels_, depth=PHONEME_CATEGORIES)  # unused; the sparse loss below takes raw indices
loss = tf.losses.sparse_softmax_cross_entropy(labels=labels_, logits=logits)

output = tf.nn.softmax(logits, name=None)

# Get cost and define the optimizer
cost = tf.reduce_mean(loss)
opt = tf.train.AdamOptimizer(learning_rate).minimize(cost)


print("made it here! :D")
sess = tf.Session()
RANGE_START = 120030
RANGE_END = 131030
epochs = 2000000
batch_size = 50
MODEL_SAVE_EVERY = 50
SAVE_FILE_START_POINT = 5750

saver = tf.train.Saver()

sess.run(tf.global_variables_initializer())

if SAVE_FILE_START_POINT >= 1:
    saver.restore(sess, FOLDER_SAVE_NAME + "/models/model" + str(SAVE_FILE_START_POINT) + ".ckpt")

print("about to start...")

# Run the restored model over the held-out frame range and dump the softmax
# probabilities, one tab-separated row per frame.
f = open(FOLDER_SAVE_NAME + '/outputted.txt', 'w')
for frame in range(RANGE_START, RANGE_END):
    invids = np.empty([0, INVID_HEIGHT, INVID_WIDTH, INVID_DEPTH])
    labels = np.empty(0)

    invids = np.vstack((invids, getInVidsAtFrame(frame)))
    labels = np.append(labels, getLabelsAtFrame(frame))

    _output, batch_cost, _logits = sess.run([output, cost, logits],
                                            feed_dict={invids_: invids, labels_: labels})

    for i in _output[0]:
        f.write(str(i) + "\t")
    f.write("\n")
    print("Done with " + str(frame - RANGE_START) + " / " + str(RANGE_END - RANGE_START))
f.close()
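The script writes one softmax row per frame to outputted.txt. A hypothetical post-processing sketch (not part of the commit) that reads those rows back and recovers the most likely phoneme class per frame via argmax:

import numpy as np

probs = np.loadtxt("phoframe41/outputted.txt")   # shape: (frames, 41)
predicted = probs.argmax(axis=1)                 # class index in [0, 40] per frame
print(predicted[:10])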