-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathslide-extractor.py
89 lines (76 loc) · 3.72 KB
/
slide-extractor.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
import os, glob
import subprocess
from tqdm import tqdm
from PIL import Image
import imagehash
import cv2
import PyPDF2
CHECK_PER_FRAMES = 30 # check per 30 frames (i.e. 1 frame per sec for 30 fps video)
DIFF_THRESHOLD = 3
def main():
for filename in tqdm(sorted(glob.glob('**/*.mp4', recursive=True))):
basename = os.path.basename(filename)
tqdm.write('Extracting key frames from ' + basename + ', may take a minute...')
cap = cv2.VideoCapture(filename)
success, cv2_im = cap.read()
count = 0
idx = 0
im_hash = "0123456789abcdef" # just an arbitrary hash
while success:
for i in range(CHECK_PER_FRAMES): # skip frames
success, cv2_im = cap.read()
if not success:
break
cv2_im = cv2.cvtColor(cv2_im, cv2.COLOR_BGR2RGB)
pil_im = Image.fromarray(cv2_im)
# compare current frame to previous frame, save frame if sufficiently different
prev_im_hash = im_hash
im_hash = imagehash.phash(pil_im)
# tqdm.write(str(count) + ' ' + str(im_hash)) # print current frame & its image hash
if idx == 0 or im_hash - prev_im_hash > DIFF_THRESHOLD:
# tqdm.write('|------------|\n| save frame |\n|------------|')
pil_im.save('frame{}.png'.format(str(idx).zfill(3)), dpi=(72, 72))
idx += 1
count += 1
# bash script: img2pdf, tesseract, ghostscript
FNULL = open(os.devnull, 'w')
tqdm.write('Generating image-only PDF...')
# subprocess.call(['bash', '-c', 'convert frame*.png combine-img.pdf'], stdout=FNULL, stderr=subprocess.STDOUT) # imagicmagick
subprocess.call(['bash', '-c', 'img2pdf frame*.png -o combine-img.pdf'], stdout=FNULL, stderr=subprocess.STDOUT)
tqdm.write('Running OCR...')
subprocess.call(['bash', '-c', 'for i in frame*.png; do tesseract -c textonly_pdf=1 $i $i pdf; done;'], stdout=FNULL, stderr=subprocess.STDOUT)
tqdm.write('Generating text-only PDF...')
subprocess.call(['bash', '-c', 'gs -dNOPAUSE -sDEVICE=pdfwrite -sOUTPUTFILE=combine-text.pdf -dBATCH frame*.pdf;'], stdout=FNULL, stderr=subprocess.STDOUT)
# merge pdf
tqdm.write('Merging image-only & text-only PDF...')
merge('combine-text.pdf', 'combine-img.pdf', filename + '.pdf') # save file where video file is
tqdm.write('Temporary files removed\n')
subprocess.call(['bash', '-c', 'rm -f frame*.png frame*.pdf combine-*.pdf'])
def distance(s1, s2):
# return distance of two hashes
if len(s1) != len(s2):
raise ValueError("Undefined for sequences of unequal length")
return sum(abs(int(ch1, 16)-int(ch2, 16)) for ch1, ch2 in zip(s1, s2))
# Function Author: https://github.com/gsauthof
# copied from pdfmerge.py (https://github.com/gsauthof/utility/blob/master/pdfmerge.py)
def merge(textonlyPDF, imageonlyPDF, ofilename):
'''
Merge text-only and image-only PDFs into one
e.g. text, images, merged
cf. https://github.com/tesseract-ocr/tesseract/issues/660#issuecomment-273629726
'''
with open(textonlyPDF, 'rb') as f1, open(imageonlyPDF, 'rb') as f2:
# PdfReader isn't usable as context-manager
pdf1, pdf2 = (PyPDF2.PdfReader(x) for x in (f1, f2))
opdf = PyPDF2.PdfWriter()
for page1, page2 in zip(pdf1.pages, pdf2.pages):
page1.merge_page(page2)
opdf.add_page(page1)
n1, n2 = len(pdf1.pages), len(pdf2.pages)
if n1 != n2:
for page in (pdf2.pages[n1:] if n1 < n2 else pdf1.pages[n2:]):
opdf.add_page(page)
with open(ofilename, 'wb') as g:
opdf.write(g)
if __name__ == '__main__':
main()