-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathocr-checker.py
executable file
·56 lines (43 loc) · 1.57 KB
/
ocr-checker.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri May 15 10:12:51 2020
@author: briancroxall
This script will check which objects have been downloaded and which haven't.
"""
from glob import glob
# Paths used by the checker, all relative to the current working directory.
DATA_DICTIONARY = 'data-dictionary.tsv'
OCR_GLOB = 'ocr-txt/*.txt'
DUPLICATES_LOG = 'ocr-checker.txt'


def expected_filename(line):
    """Return the OCR text path that one data-dictionary row should produce.

    The row must hold six tab-separated fields: newspaper, location, date,
    image number, link and OCR link. Only the first four are used. The
    newspaper and location are lower-cased with spaces turned into hyphens
    (periods are also dropped from the newspaper), and the ``seq-`` prefix
    is removed from the image number.

    Args:
        line: One raw line of the data dictionary, with or without its
            trailing newline.

    Returns:
        A path of the form ``ocr-txt/{newspaper}_{location}_{date}_p{n}.txt``.

    Raises:
        ValueError: If the row does not have exactly six fields.
    """
    fields = line.rstrip('\n').split('\t')
    if len(fields) != 6:
        raise ValueError(
            f'expected 6 tab-separated fields, got {len(fields)}: {line!r}')
    newspaper, location, date, img_num, _link, _ocr_link = fields
    newspaper = newspaper.lower().replace(' ', '-').replace('.', '')
    location = location.lower().replace(' ', '-')
    img_num = img_num.replace('seq-', '')  # remove seq- from each page number
    return f'ocr-txt/{newspaper}_{location}_{date}_p{img_num}.txt'


def main():
    """Compare the files listed in the data dictionary with those on disk.

    Reads ``data-dictionary.tsv``, builds the set of expected OCR filenames,
    writes every repeated filename to ``ocr-checker.txt`` (one line per
    repeat), and prints the expected-but-missing files plus summary counts.
    """
    # Corpus
    expected = set()
    downloaded = set(glob(OCR_GLOB))

    # Counters
    duplicates = 0

    # The log is opened first and in 'w' mode so that a stale log from an
    # earlier run is always discarded, even if the data dictionary turns out
    # to be missing. It stays open for the whole pass instead of being
    # reopened for every duplicate.
    with open(DUPLICATES_LOG, 'w') as output, \
            open(DATA_DICTIONARY) as data_file:
        for counter, line in enumerate(data_file):
            if counter % 100 == 0:
                print('.', end='', flush=True)  # progress indicator
            if not line.strip():
                # A blank (typically trailing) line carries no row to check.
                continue
            filename = expected_filename(line)
            if filename in expected:
                duplicates += 1
                print(filename, file=output)
            else:
                expected.add(filename)

    print()
    print('Results of comparing sets: ', expected.difference(downloaded))
    print('Number of expected files: ', len(expected))
    print('Number of downloaded files: ', len(downloaded))
    print('Number of duplicates in data-dictionary.tsv found during set '
          'creation:', duplicates)

    # Cross-check: count the lines that actually reached the log file.
    with open(DUPLICATES_LOG) as file:
        duplicates_logged = sum(1 for _ in file)
    print('Number of duplicates in ocr-checker.txt:', duplicates_logged)


if __name__ == '__main__':
    main()