diff --git a/htmlFileToCsv.py b/htmlFileToCsv.py deleted file mode 100644 index c1e1241..0000000 --- a/htmlFileToCsv.py +++ /dev/null @@ -1,1135 +0,0 @@ -# import this library to automatically download and install the rest of the libraries if they do not exist -import tkinter -from tkinter import filedialog, ttk -from math import floor -from time import sleep -import re -import json -import logging - -# if opencv isnt installed, it'll install it for you -from sys import argv -import os -try: - import numpy as nm - import cv2 -except ImportError: - if(os.system("pip install opencv-python")): - os.system("pip install --user opencv-python") - import numpy as nm - import cv2 -try: - from PIL import Image, ImageTk -except ModuleNotFoundError: - if(os.system("pip install pillow")): - os.system("pip install --user pillow") - from PIL import Image, ImageTk -except ImportError: - import Image - import ImageTk - -# if tesseract isnt installed, itll install it for you -try: - import pytesseract as tess -except ImportError: - if(os.system("pip install pytesseract")): - os.system("pip install --user pytesseract") - import pytesseract as tess -# installing pdf to image libraries -try: - from pdf2image import convert_from_path -except ImportError: - if(os.system("pip install pdf2image")): - os.system("pip install --user pdf2image") - from pdf2image import convert_from_path - -# Checking that external software is installed and ready to use - - -def installError(name, URL, filename): - def download(): - import webbrowser - webbrowser.open_new_tab(URL) - - def navigate(): - path = filedialog.askopenfilename( - filetypes=((name, filename), (name, filename))) - if(os.getenv("path")[-1] != ";"): - path = ";" + path - path = path.replace("/", "\\").replace("\\" + filename, "") - if(len(os.getenv("path") + path) >= 1024): - info0.configure( - text="Error: we could not add the file to your path for you. 
You will have to do this manually.") - if os.getenv("userprofile") in path: - if(os.system("setx PATH \"%path%" + path + "\"")): - print("Failed to do command") - else: - if(os.system("setx PATH /M \"%path%" + path + "\"")): - print("failed to do command") - - ie = tkinter.Tk(baseName="Missing Software") - ie.title("Missing Software") - ie.geometry("438x478") - ie.minsize(120, 1) - ie.maxsize(1370, 749) - ie.resizable(1, 1) - ie.configure(background="#d9d9d9") - font11 = "-family {Segoe UI} -size 18 -weight bold" - font13 = "-family {Segoe UI} -size 16 -weight bold" - Header = tkinter.Label(ie, text="Software Not Installed") - Header.place(relx=0.16, rely=0.042, height=61, width=294) - Header.configure(font=font11, activeforeground="#372fd7", background="#d9d9d9", - disabledforeground="#a3a3a3", foreground="#2432d9") - info0 = tkinter.Label( - ie, text="Warning: Youre missing {name}. it is a required software to make this tool run. To fix this issue, please follow the instructions below.".format(name=name)) - info0.place(relx=0.16, rely=0.167, height=151, width=294) - info0.configure(font="-family {Segoe UI} -size 14", background="#ffffff", - disabledforeground="#a3a3a3", foreground="#000000", wraplength="294") - info1 = tkinter.Label( - ie, text="If you havent already installed this software, please follow the download link.") - info1.place(relx=0.16, rely=0.523, height=31, width=294) - info1.configure(background="#eeeeee", disabledforeground="#a3a3a3", - foreground="#000000", wraplength="294") - tor = tkinter.Label(ie, text="Or") - tor.place(relx=0.457, rely=0.69, height=36, width=40) - tor.configure(font="-family {Segoe UI} -size 16 -weight bold", - background="#d9d9d9", disabledforeground="#a3a3a3", foreground="#29c1dc") - info2 = tkinter.Label( - ie, text="If you've already installed the software, please lead us to where it is as we cannot find it.") - info2.place(relx=0.16, rely=0.774, height=41, width=294) - info2.configure(background="#eeeeee", 
wraplength="294", - disabledforeground="#a3a3a3", foreground="#000000") - download = tkinter.Button( - ie, text="Download {name}".format(name=name), command=download) - download.place(relx=0.16, rely=0.607, height=34, width=297) - download.configure(font=font11, activebackground="#ececec", activeforeground="#000000", background="#48d250", - disabledforeground="#a3a3a3", foreground="#000000", highlightbackground="#d9d9d9", highlightcolor="black", pady="0") - navigate = tkinter.Button( - ie, text="Navigate to {name}".format(name=name), command=navigate) - navigate.place(relx=0.16, rely=0.879, height=34, width=297) - navigate.configure(font=font13, activebackground="#ececec", activeforeground="#000000", background="#eaecec", - disabledforeground="#a3a3a3", foreground="#000000", highlightbackground="#d9d9d9", highlightcolor="black", pady="0") - - ie.mainloop() - os.sys.exit(1) - - -# check if tesseract exists -if os.system("tesseract --help"): - if os.path.exists("C:\\Program Files\\Tesseract-OCR\\tesseract.exe"): - tess.pytesseract.tesseract_cmd = r'C:\\Program Files\\Tesseract-OCR\\tesseract' - else: - installError( - "Tesseract", "https://github.com/UB-Mannheim/tesseract/wiki", "tesseract.exe") -# check if poppler exists -if os.system("pdfimages -help"): - installError("Poppler", "https://blog.alivate.com.au/poppler-windows/", - "pdfimages.exe") -del installError - - -# Functions - - -logging.getLogger().setLevel(logging.WARNING) -if "info" in os.sys.argv: - logging.basicConfig(format="%(asctime)s: INFO %(message)s", - datefmt="%H:%M:%S", level=logging.INFO) -elif "debug" in os.sys.argv: - logging.basicConfig(format="%(asctime)s: DEBUG %(message)s", - datefmt="%H:%M:%S", level=logging.DEBUG) - if not os.path.exists("debugOutput/."): - os.makedirs("debugOutput/dictionary", exist_ok=True) - os.makedirs("debugOutput/scrapper", exist_ok=True) - else: - os.system("del /s debugOutput\\*.jpg") - -JSONFile = open("./aliases.json", "r") -JSON = json.load(JSONFile) 
-JSONFile.close() -JSONChange = False # this is only used when the database is updated - - -def debug(label: str, content: list): - logging.debug("%s:", label) - if(logging.getLogger().level <= logging.DEBUG): - for i in content: - print(i) - - -def debugImageDictionary(diction): - if (logging.getLogger().level <= logging.INFO): - debugOutput = "Sheet | SheetLen | TableRow | TableCol\n" - for sheet in range(len(diction)): - debugOutput += "{ind: 5d} | {slen: 8d} | {trow: 8d} | {tcol: 8d}\n".format(ind=sheet, slen=len( - diction[sheet]), trow=len(diction[sheet][-1]), tcol=len(diction[sheet][-1][0])) - logging.info(debugOutput) - exportToFile("debugOutput/dictionaryStats.txt", debugOutput) - for sheet in range(len(diction)): - for dates in range(len(diction[sheet][:-1])): - cv2.imwrite("debugOutput/dictionary/sheet{sheet}date{date}.jpg".format( - sheet=sheet, date=dates), diction[sheet][dates]) - for row in range(len(diction[sheet][-1])): - for col in range(len(diction[sheet][-1][row])): - cv2.imwrite("debugOutput/dictionary/sheet{sheet}table{row}{col}.jpg".format( - sheet=sheet, row=row, col=col), diction[sheet][-1][row][col]) - - -def exportToFile(dir, content): - open(dir, "w").write(content) - - -def appendToFile(dir, content): - try: - inside = open(dir, "r").read() - open(dir, "w").write(inside + content) - except: - open(dir, "w").write(content) - - -def collectContours(image): - """ Sub function used by scrapper.\n - @param image: an opencv image\n - @return returns an ordered list of contours found in the image.\n - This function was heavily influenced by its source.\n - @source: https://medium.com/coinmonks/a-box-detection-algorithm-for-any-image-containing-boxes-756c15d7ed26 - """ - debugIndex = 0 - # Grab absolute thresh of image - image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) - thresh = cv2.threshold( - image, 128, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)[1] - invert = 255 - thresh - - if (logging.getLogger().level <= logging.DEBUG): - 
while(os.path.exists("debugOutput/scrapper/{ind}1invert.jpg".format(ind=debugIndex))): - debugIndex += 1 - cv2.imwrite( - "debugOutput/scrapper/{ind}1invert.jpg".format(ind=debugIndex), invert) - ####################################### - # Defining kernels for line detection # - ####################################### - kernel_length = nm.array(image).shape[1]//80 - verticle_kernel = cv2.getStructuringElement( - cv2.MORPH_RECT, (1, kernel_length)) # kernel for finding all verticle lines - # kernel for finding all horizontal lines - hori_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (kernel_length, 1)) - kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (3, 3)) # 3x3 kernel - - # Collecting Verticle Lines - verticleLines = cv2.erode(invert, verticle_kernel, iterations=3) - verticleLines = cv2.dilate(verticleLines, verticle_kernel, iterations=3) - verticleLines = cv2.threshold( - verticleLines, 128, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)[1] - - if (logging.getLogger().level <= logging.DEBUG): - cv2.imwrite( - "debugOutput/scrapper/{ind}2verticleLines.jpg".format(ind=debugIndex), verticleLines) - - # Collecting Horizontal Lines - horizontalLines = cv2.erode(invert, hori_kernel, iterations=3) - horizontalLines = cv2.dilate(horizontalLines, hori_kernel, iterations=3) - horizontalLines = cv2.threshold( - horizontalLines, 128, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)[1] - - if (logging.getLogger().level <= logging.DEBUG): - cv2.imwrite( - "debugOutput/scrapper/{ind}3horizontalLines.jpg".format(ind=debugIndex), horizontalLines) - - # Weighting parameters, this will decide the quantity of an image to be added to make a new image. - alpha = 0.5 - beta = 1.0 - alpha - - # combining verticle and horizontal lines. 
This gives us an empty table so that letters dont become boxes - blankTable = cv2.addWeighted( - verticleLines, alpha, horizontalLines, beta, 0.0) - blankTable = cv2.erode(~blankTable, kernel, iterations=2) - blankTable = cv2.threshold(blankTable, 128, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)[ - 1] # sharpening new table - - if (logging.getLogger().level <= logging.DEBUG): - cv2.imwrite( - "debugOutput/scrapper/{ind}4blankTable.jpg".format(ind=debugIndex), blankTable) - # Detecting all contours, which gives me all box positions - contours = cv2.findContours( - blankTable, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)[0] - - # Organizing contours - # we got our boxes, but its mostly to sort the contours - bboxes = [cv2.boundingRect(c) for c in contours] - # Sort all the contours in ascending order - contours, bboxes = zip( - *sorted(zip(contours, bboxes), key=lambda b: b[1][1], reverse=False)) - return contours - -# Generator -# PHASE 1: manipulate image to clearly show tabs - - -def imageScraper(file, outputArray=None): - """This function if phase 1 of the process. It starts by taking the image/pdf - of the signin sheet and breaks the table apart to isolate each value in the exact - order that they came in.\n - @param file: the image/pdf that needs to be scraped into its values.\n - @param outputArray: a parameter passed by reference due to the nature - of tkinters buttons. If the param is not filled, it will just return the result.\n - @return a multidimension array of images that containes the values of all the slots in the table. 
- """ - images = [] - sheets = [] # an array with each index containing the output per page - debugIndex = 0 - if not (file.split(".")[1] in ["jpg", "jpeg", "png", "pdf"]): - return - elif not (os.path.exists(file)): - raise FileNotFoundError("File given does not exist.") - if file.split(".")[1] == "pdf": - for image in convert_from_path(file): - image = nm.array(image) - image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR) - images.append(image) - else: - # , cv2.IMREAD_GRAYSCALE) - images.append(cv2.imread(file, cv2.COLOR_RGB2BGR)) - - for image in images: - contours = collectContours(image) - # // This is to tell which boxes correlate to the date - # Phase 1: Finding Main Boxes ## // and which big box is the signin table - ################################# - mainBoxes = [] - for c in contours: - x, y, w, h = cv2.boundingRect(c) - if((h, w, 3) == image.shape): - continue - for m in mainBoxes: - if (x > m[0] and w < m[2]) or (y > m[1] and h < m[3]): - break - elif(x <= m[0] and w >= m[2] and y <= m[1] and h >= m[3]): - mainBoxes.remove(m) - mainBoxes.append([x, y, w, h]) - else: - mainBoxes.append([x, y, w, h]) - - table = mainBoxes[0] # img that contains whole table - - for x, y, w, h in mainBoxes: - if((w - x > table[2] - table[0]) or (h - y > table[3] - table[1])): - table = [x, y, w, h] - mainBoxes.remove(table) - - # making images for date and day - sheets.append([]) - for x, y, w, h in mainBoxes: - sheets[-1].append(image[y:y+h, x:x+w]) - - ######################################### - # Phase 2: Collecting pairs for mapping # - ######################################### - - # Collecting contours collected from table - table = image[table[1]-5:table[1]+table[3] + - 5, table[0]-5:table[0]+table[2]+5] - - if (logging.getLogger().level <= logging.DEBUG): - cv2.imwrite( - "debugOutput/scrapper/mainTable{image}.jpg".format(image=debugIndex), table) - debugIndex += 1 - - # Grabbing verticle and horizontal images of table for better scraping - tableCompute = 
cv2.cvtColor(table, cv2.COLOR_BGR2GRAY) - tableCompute = cv2.threshold( - tableCompute, 128, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)[1] - tableInvert = 255 - tableCompute - tKernelLength = nm.array(tableCompute).shape[1]//80 - tKernel = cv2.getStructuringElement(cv2.MORPH_RECT, (3, 3)) - - ############################# - # Collecting Verticle Pairs # - ############################# - verticlePairs = [] - # Creating verticle kernel lines - tKernelVerticle = cv2.getStructuringElement( - cv2.MORPH_RECT, (1, tKernelLength)) - tVerticleLines = cv2.erode(tableInvert, tKernelVerticle, iterations=3) - tVerticleLines = cv2.dilate( - tVerticleLines, tKernelVerticle, iterations=3) - tVerticleLines = cv2.threshold( - tVerticleLines, 128, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)[1] - if (logging.getLogger().level <= logging.DEBUG): - cv2.imwrite( - "debugOutput/scrapper/table{}VertLines.jpg".format(debugIndex), tVerticleLines) - # Added this line because it needs a white background rather than black background - tVerticleLines = 255 - tVerticleLines - # Adding edge lines for contour collection - cv2.line(tVerticleLines, (0, floor(tVerticleLines.shape[0] * 0.01)), ( - tVerticleLines.shape[1], floor(tVerticleLines.shape[0] * 0.01)), (0, 0, 0), 5) - cv2.line(tVerticleLines, (0, floor(tVerticleLines.shape[0] * 0.99)), ( - tVerticleLines.shape[1], floor(tVerticleLines.shape[0] * 0.99)), (0, 0, 0), 5) - # Collecting verticle contours - contours = cv2.findContours( - tVerticleLines, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)[0] - # Figuring out the length that relates to the majority of the table, (aka, longer lengths relates to length of table rather than random lines) - maxLength = 0 - tableHeightPair = () # empty tuple for checking later - for c in contours: - x, y, w, h = cv2.boundingRect(c) - # if the height of the contour is at least 90% as long as the whole table, its safe to assume that that belongs to the whole table - if(h >= table.shape[0] * 0.9): - tableHeightPair = (y, h) 
- break - elif(h > maxLength): # if the height isnt a significant size, then the best choice is the longest length - maxlength = h - tableHeightPair = (y, h) - for c in contours: - x, y, w, h = cv2.boundingRect(c) - if((y, h) == tableHeightPair): - verticlePairs.append((x, x + w)) - verticlePairs.sort() - - logging.debug("VerticlePairs: %s", verticlePairs) - - # Fixing overlapping of some pairs - for v in range(len(verticlePairs) - 1): - # if the tail end of a pair overlaps the beginning of the next pair, then swap positions. itll make it slightly smaller, but it will miss table walls - if(verticlePairs[v][1] > verticlePairs[v + 1][0]): - temp = verticlePairs[v][1] - verticlePairs[v] = (verticlePairs[v][0], - verticlePairs[v + 1][0]) - verticlePairs[v + 1] = (temp, verticlePairs[v + 1][1]) - - # this is the gap before the table from the left side - verticlePairs.pop(0) - # this is the gap after the table from the right side - verticlePairs.pop(-1) - - if (logging.getLogger().level <= logging.DEBUG): - logging.debug("VerticlePairs: %s", verticlePairs) - debugimg = cv2.cvtColor(tVerticleLines, cv2.COLOR_GRAY2BGR) - for v in verticlePairs: - cv2.line(debugimg, (v[0], 0), - (v[0], debugimg.shape[0]), (0, 0, 255)) - cv2.line(debugimg, (v[1], 0), - (v[1], debugimg.shape[0]), (0, 0, 255)) - cv2.imwrite( - "debugOutput/scrapper/table{}VertContours.jpg".format(debugIndex), debugimg) - - ############################### - # Collecting Horizontal Pairs # - ############################### - horizontalPairs = [] - # Creating horizontal kernel lines - tKernelHorizontal = cv2.getStructuringElement( - cv2.MORPH_RECT, (tKernelLength, 1)) - tHorizontalLines = cv2.erode( - tableInvert, tKernelHorizontal, iterations=3) - tHorizontalLines = cv2.dilate( - tHorizontalLines, tKernelHorizontal, iterations=3) - tHorizontalLines = cv2.threshold( - tHorizontalLines, 128, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)[1] - if (logging.getLogger().level <= logging.DEBUG): - cv2.imwrite( - 
"debugOutput/scrapper/table{}HorLines.jpg".format(debugIndex), tHorizontalLines) - # Added this line because it needs a white background rather than black background - tHorizontalLines = 255 - tHorizontalLines - # Adding edge lines for contour collection - cv2.line(tHorizontalLines, (floor(tHorizontalLines.shape[1] * 0.01), 0), (floor( - tHorizontalLines.shape[1] * 0.01), tHorizontalLines.shape[0]), (0, 0, 0), 5) - cv2.line(tHorizontalLines, (floor(tHorizontalLines.shape[1] * 0.99), 0), (floor( - tHorizontalLines.shape[1] * 0.99), tHorizontalLines.shape[0]), (0, 0, 0), 5) - # Collecting Horizontal contours - contours = cv2.findContours( - tHorizontalLines, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)[0] - - # Figuring out the length that relates to the majority of the table, (aka, longer lengths relates to length of table rather than random lines) - maxLength = 0 - tableWidthPair = () # empty tuple for checking later - for c in contours: - x, y, w, h = cv2.boundingRect(c) - # if the width of the contour is at least 90% as long as the whole table, its safe to assume that that belongs to the whole table - if(w >= tHorizontalLines.shape[1] * 0.9): - tableWidthPair = (x, w) - break - elif(w > maxLength): # if the width isnt a significant size, then the best choice is the longest length - maxLength = w - tableWidthPair = (x, w) - for c in contours: - x, y, w, h = cv2.boundingRect(c) - if((x, w) == tableWidthPair): - horizontalPairs.append((y, y + h)) - horizontalPairs.sort() - - logging.debug("HorizontalPairs: %s", horizontalPairs) - - # Fixing overlapping of some pairs - for h in range(len(horizontalPairs) - 1): - # if the tail end of a pair overlaps the beginning of the next pair, then swap positions. 
itll make it slightly smaller, but it will miss table walls - if(horizontalPairs[h][1] > horizontalPairs[h + 1][0]): - temp = horizontalPairs[h][1] - horizontalPairs[h] = ( - horizontalPairs[h][0], horizontalPairs[h + 1][0]) - horizontalPairs[h + 1] = (temp, horizontalPairs[h + 1][1]) - - # this is the gap before the table from the left side - horizontalPairs.pop(0) - # this is the gap after the table from the right side - horizontalPairs.pop(-1) - - if (logging.getLogger().level <= logging.DEBUG): - logging.debug("HorizontalPairs: %s", horizontalPairs) - debugimg = cv2.cvtColor(tHorizontalLines, cv2.COLOR_GRAY2BGR) - for h in horizontalPairs: - cv2.line(debugimg, (0, h[0]), - (debugimg.shape[1], h[0]), (0, 0, 255)) - cv2.line(debugimg, (0, h[1]), - (debugimg.shape[1], h[1]), (0, 0, 255)) - cv2.imwrite( - "debugOutput/scrapper/table{}HorContours.jpg".format(debugIndex), debugimg) - - ##################################### - # Phase 3: Time for actual Scraping # - ##################################### - sheets[-1].append([]) - # the dictionary thatll hold all our information - dictionary = sheets[-1][-1] - dictRow = 0 - for row in horizontalPairs: - dictionary.append([]) - for col in verticlePairs: - dictionary[dictRow].append(table[row[0]:row[1], col[0]:col[1]]) - if (logging.getLogger().level <= logging.DEBUG): - cv2.imwrite("debugOutput/dictionary/raw/table{}{}.jpg".format(dictRow, - col[1]-col[0]), table[row[0]:row[1], col[0]:col[1]]) - dictRow += 1 - - if(outputArray == None): - return sheets - else: - globals()[outputArray] = sheets.copy() - return - - -def compareKnownAliases(id, col=1): - """Uses a dictionary of known valid aliases to find the most accurate guess for a name.\n - @param id: The string that you want a guess as what name it closest resembles.\n - @param col: the column of the string thats being checked. 
This is important as it clarifies - whether its a name being searched or a purpose.\n - @return: it returns the name it believes closest resembles the string given and it will return - the number of characters the string has in common with it. If the string matches with nothing, - it will return ("", 0) but this is rare. - """ - id = id.lower() - closestMatch = "" - mostMatches = 0 - matches = 0 - if (col == 1 and id.count(" ") == 1): - for alias in JSON["names"]["1"]: - matches = 0 - for i in range(min(alias.find(" "), id.find(" "))): - if(id[i] == alias[i]): - matches += 1 - lalias = alias.find(" ") + 1 - lid = id.find(" ") + 1 - for i in range(min(len(alias) - lalias, len(id) - lid)): - if(id[lid + i] == alias[lalias + i]): - matches += 1 - if (matches > mostMatches): - closestMatch = alias - mostMatches = matches - else: - for alias in JSON["names"][str(col)]: - matches = 0 - for i in range(min(len(id), len(alias))): - if(id[i] == alias[i]): - matches += 1 - if (matches > mostMatches): - closestMatch = alias - mostMatches = matches - return closestMatch, mostMatches - - -def correctValue(image, column, threshold=0.3): - """This function is how we get accurate values from the images in each dictionary.\n - @param {cvimg} image: The image that is being transcribed.\n - @param {int} column: The column in the table that the image is in. This is very important as its part of how the translator corrects the outputs.\n - @param {double} threshold: Optional variable. Changes the percentage of characters that need to match the origional of it to return. Higher threshholds mean more strict requirements and higher chance of getting nothing. 
Lower threshholds mean higher chance to get a value that may or may not be incorrect.\n - @returns: It will return the name that closest resembles the image, or it will return \"RequestCorrection:\" if no name could be accepted.\n - It works by taking an image and running tesseract to get the value from the unchanges color image, then it grabs the ocr output from the same image with different effects, such as greyscale, thresholds, and contrast increase.\n - The next step for it is to take each unique value make, then run it through another function that creates a new string with the characters in it resembling what should be in there (no numbers or symbols in names, no chars in numbers, etc.) and adds it to the pile of strings.\n - The last step is for it take all the new unique strings and run them through another function to see which names the strings closest resemble. The name with the most conclusions is considered the best guess.\n - However, the best guess may not be accepted if the name doesnt share enough characters in common with all the guesses, then its scrapped and nothing is returned. - """ - - # Running initial checks to see if cell is empty - # were creating an inverted thresh of the image for counting pixels, removes 8px border in case it includes external lines or table borders - invert = cv2.cvtColor(image[8: -8, 8: -8], cv2.COLOR_BGR2GRAY) - invert = cv2.threshold( - invert, 128, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)[1] - invert = 255 - invert - # countnonzero only counts white pixels, so i need to invert to turn black pixels white - pixelCount = cv2.countNonZero(invert) - pixelTotal = invert.shape[0] * invert.shape[1] - - logging.debug("blankPercent: %s", pixelCount/pixelTotal) - # will only consider empty if image used less than 1% of pixels. 
yes, that small - if(pixelCount/pixelTotal <= 0.01): - logging.info("It's Blank") - return "" # Skipping ahead if its already looking like theres nothing - del invert, pixelCount, pixelTotal - - outputs = [] - # Get normal results - outputs.append(tess.image_to_string(image)) - - # Get black and white results - temp = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) - outputs.append(tess.image_to_string(temp)) - - # get thresh results - temp = cv2.threshold( - temp, 128, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)[1] - outputs.append(tess.image_to_string(temp)) - - # quick check incase box is looking empty; will only skip if 2/3 or more are blank - if(outputs.count("") >= len(outputs)*0.5): - logging.info("we couldnt read it") - # if theres enough pixels to describe a possbile image, then it isnt empty, but it cant read it - return "RequestCorrection:NaN" - - # Using contrast for more values - for i in range(50): - temp = cv2.addWeighted(image, (1 + i/100), image, 0, 0) - outputs.append(tess.image_to_string(temp)) - outputs.sort() - for i in range(len(outputs)-1, 1, -1): - if(outputs[i] == outputs[i-1]): - outputs.pop(i) - - ########################## - ## APPLYING CORRECTIONS ## - ########################## - - if column in [1, 5]: - ####################################### - ## Corrections for names and purpose ## - ####################################### - alphaCorrections = { - "A": ["^"], # A - "B": ["8", "|3", "/3", "\\3", "13", "&", "6"], # B - "C": ["(", "<", "{", "[", "¢", "©"], # C G - "G": ["(", "<", "{", "[", "¢", "©"], - # "D":["|]", "|)"], - # "d":["c|", "c/", "c\\"], # D d - "E": ["3", "€"], # E - "g": ["9"], # g - # "H":["|-|", "+-+", "++", "4"], # H - "I": ["1", "/", "\\", "|", "]", "["], # I l - "l": ["1", "/", "\\", "|", "]", "["], - # "K":["|<", "|(", "/<", "/(", "\\<", "\\(", "1<", "1("], # K - "O": ["0"], # O - "S": ["5", "$"], # S - "T": ["7"], # T - # "W":["VV"], # W - # "X":["><", ")("], # X - "Z": ["2"] # Z - } - - template = "" - additions = [] - for 
word in outputs: - template = "" - for char in range(len(word)): - for i in alphaCorrections: - if word[char] in alphaCorrections[i]: - template += i[0] - break - else: - template += word[char] - additions.append(template) - outputs.extend(additions) - outputs.sort() - - for string in outputs: # Remove duplicate entries - for copies in range(outputs.count(string) - 1): - outputs.remove(string) - - # Removing blank entries. it wasnt considered blank, so it shouldnt be there - for blanks in range(outputs.count("")): - outputs.remove("") - - logging.debug("Words[outputs]: %s", outputs) - largest = len(max(set(outputs), key=len)) - bestGuess = "" # variable that determines result - closestMatch = 0 # the number of times best guess occurs in our guesses - accuracy = 0 # the max number of characters that matches with the best guess - score = 0 # temp var for accuracy - count = 0 # temp variable for closestMatch - guesses = [] - for i in outputs: - guesses.append(compareKnownAliases(i, column)) - guesses.sort() - guesses.append(("", 0)) # full stop to make searcher read last item - logging.debug("Words[Guesses]: %s", guesses) - check = guesses[0][0] - - for i in guesses: - if(i[0] != check): - # print(check, accuracy, score, count) - # if the name occurs more often than previous string or the number of accurate characters is more than the length of previous string - if((count > closestMatch and accuracy <= len(check)) or score > len(bestGuess)): - closestMatch = count - accuracy = score - bestGuess = check - score = count = 0 - check = i[0] - score = max(score, i[1]) - count += 1 - - logging.debug("Words[accuracy]: %s", accuracy) - logging.info("Words[bestGuess]: %s", bestGuess) - if (bestGuess == ""): - # if we did our job correctly, the name/purpose should never be blank - return "RequestCorrection:NaN" - elif(accuracy >= len(bestGuess)*threshold and (len(bestGuess) <= largest or threshold == 0)): - return bestGuess - else: - return "RequestCorrection:" + bestGuess - - 
elif column in [2, 3, 4]: - #################################### - ## Corrections to Dates and Hours ## - #################################### - digitCorrections = { - "0": ["o", "O", "Q", "C", "c"], # 0 - "1": ["I", "l", "/", "\\", "|", "[", "]", "(", ")", "j"], # 1 - "2": ["z", "Z"], # 2 - "3": ["E"], # 3 - "4": ["h", "H", "y", "A"], # 4 - "5": ["s", "S"], # 5 - "6": ["b", "e"], # 6 - "7": ["t", ")", "}"], # 7 - "8": ["B", "&"], # 8 - "9": ["g", "q"], # 9 - ":": ["'", ".", ","] - } - - template = "" - correctFormat = [] # the array that will only take in outputs that fit formatting - - logging.debug("outputs[nums]: %s", outputs) - if column in [2, 3]: - # Source for regex string http://regexlib.com/DisplayPatterns.aspx?cattabindex=4&categoryId=5&AspxAutoDetectCookieSupport=1 - timeFilter = re.compile( - r'^((([0]?[1-9]|1[0-2])(:|\.)[0-5][0-9]((:|\.)[0-5][0-9])?( )?(AM|am|aM|Am|PM|pm|pM|Pm))|(([0]?[0-9]|1[0-9]|2[0-3])(:|\.)[0-5][0-9]((:|\.)[0-5][0-9])?))$') - - # Removing outputs either too big or too small to be plausible time. - colonSet = set(digitCorrections[":"]) - colonSet.add(":") - for i in range(len(outputs) - 1, -1, -1): - if ((len(outputs[i]) < 3 + bool(set(outputs[i]) & colonSet)) or (len(outputs[i]) > 4 + bool(set(outputs[i]) & colonSet))): - outputs.pop(i) - - logging.debug("time[outputs]: %s", outputs) - # Doing translations - # by using a while loop, I allow the program to keep checkign until the entire array is gone, assuring no out of place characters - while(0 < len(outputs)): - # checking if item is already time or digit incase we can skip it - # if the string matches a time, sends it straight to correct values - if(bool(timeFilter.match(outputs[0]))): - for e in range(outputs.count(outputs[0])): - correctFormat.append(outputs[0]) - # If its a number, then it will turn the number into a time and put it into resulting check if its a proper time. 
- elif (outputs[0].isdigit() or outputs[0].isdecimal()): - # make template word so that it can be molded into a time. - template = outputs[0] - for e in range(len(template) - 2, 0, -2): - template = template[:e] + ":" + template[e:] - # if the time is legit, then add all repeating similiar strings - if(bool(timeFilter.match(template))): - for e in range(outputs.count(outputs[0])): - correctFormat.append(template) - else: - for digit, sets in digitCorrections.items(): # iterate through entire translation dictionary - # iterates only between the characters that can be replaced. - for elem in set(sets).intersection(set(outputs[0])): - for e in range(outputs.count(outputs[0])): - outputs.append( - outputs[0].replace(elem, digit)) - # once added additional lines or added legit guesses, removed all of string to avoid checking it again. - template = outputs[0] - for e in range(outputs.count(outputs[0])): - outputs.remove(template) - - elif(column == 4): - while(0 < len(outputs)): - if (outputs[0].isdigit() or outputs[0].isdecimal()): - # if the number discovered is less than 12 hours, because no one is expected to be there the entire day. 
- if (int(outputs[0]) < 12): - for e in range(outputs.count(outputs[0])): - correctFormat.append(outputs[0]) - else: # if the string has alpha letters in it: attempt to translate - for digit, sets in digitCorrections.items(): - for elem in set(sets).intersection(set(outputs[0])): - for e in range(outputs.count(outputs[0])): - outputs.append( - outputs[0].replace(elem, digit)) - template = outputs[0] - for e in range(outputs.count(outputs[0])): - outputs.remove(template) - if (len(correctFormat) == 0): - return "RequestCorrection:NaN" - else: - bestGuess = max(set(correctFormat), key=correctFormat.count) - if (threshold == 0): - return bestGuess - if column in [2, 3]: - logging.info("time[bestguess]: %s", bestGuess) - logging.debug("time[correctFormat]: %s", correctFormat) - if(bool(timeFilter.match(bestGuess))): - return bestGuess - else: - return "RequestCorrection:" + str(bestGuess) - elif(column == 4): - logging.info("hours[bestguess]: %s", bestGuess) - if(bestGuess.isdigit() or bestGuess.isdecimal()): - # will only return the hours if theyre a valid number - return bestGuess - else: - return "" # This is the one exception to the errors The reason why is because we can calculate the hours if we have two valid times - return "RequestCorrection:" - - -def requestCorrection(displayImage, col, guess=""): - """This is the function used when a string doesnt confidently match a name.\n - @param displayImage: The image placed on the display for user to see.\n - @param {int} col: The column number that the image was found in. This is needed for placing the AI's guess.\n - @param {string} guess: This is to straight up overwrite the AI's guess with the string. This can be helpful so that the AI doesnt have to process the image again.\n - @return: the users answer. 
- """ - global labelImage - global errorLabel - global confidenceDescription - global AIGuess - global guessButton - global orLabel - global correctionEntry - global submitButton - - result = "" # the string to be returned for final answer - - # Setting up image to place in GUI - image = Image.fromarray(displayImage) - if(displayImage.shape[1] > labelImage.winfo_width()): - hgt, wth = displayImage.shape[:2] - ratio = labelImage.winfo_width()/wth - image = image.resize( - (floor(wth * ratio), floor(hgt * ratio)), Image.ANTIALIAS) - image = ImageTk.PhotoImage(image) - - # setting values to labels in gui - labelImage.configure(image=image) - labelImage.image = image - errorLabel.configure( - text="Uh oh. It looks like we couldnt condifently decide who or what this is. We need you to either confirm our guess or type in the correct value") - confidenceDescription.configure(text="Were not confident, but is it:") - AIGuess.configure(text=guess) - orLabel.configure(text="or") - - # basically waits till user presses a button and changes variable scope - root.update_idletasks() - root.wait_variable(decision) - result = correctionEntry.get() - - # Resetting changes made - labelImage.configure(image=None) - labelImage.image = None - errorLabel.configure(text="") - confidenceDescription.configure(text="") - AIGuess.configure(text="") - orLabel.configure(text="") - correctionEntry.delete(0, "end") - root.update_idletasks() - sleep(1) - decision.set(0) - - if(guessButton): - guessButton = False - submitButton = False - return guess - elif(submitButton): - guessButton = False - submitButton = False - return result - - -def TranslateDictionary(sheetsDict, gui=False, outputDict=None): - """ Phase two of plan. 
This function goes through the image dictionary passed - to it and creates a matrix of the dictionary in text.\n - @param sheetsDict: a matrix of images made from a table.\n - @param gui: whether to switch on global gui manipulation for the progress bar.\n - @param outputDict: a variable passed by reference instead of using return.\n - @return a matrix of strings that represents the text in the image dictionary. - """ - global JSON - global JSONChange - results = [] - # GUI widgets to manipulate while in middle of function - if(gui): - global sheetStatus - global rowStatus - global progressBar - sheetMax = len(sheetsDict) - sheetInd = 0 - rowInd = 0 - progressMax = 1 - - # Getting max for progress bar - for sheet in sheetsDict: - progressMax += len(sheet[-1]) - 1 - progressBar.configure(mode="determinate", maximum=progressMax) - for sheet in sheetsDict: - results.append([]) - if gui: - sheetInd += 1 - rowMax = len(sheet[-1]) - 1 - sheetStatus.configure( - text="Sheet: " + str(sheetInd) + " of " + str(sheetMax)) - - # Collecting dates on page first - dates = [] - dformat = re.compile(r'\d{1,2}\/\d{1,2}\/(\d{4}|\d{2})') - dstr = "" - for date in sheet[:-1]: - dstr = tess.image_to_string(date).replace( - "\n", "").replace(" ", "") - if (bool(dformat.match(dstr))): - dates.insert(0, dstr) - else: - dates.append(dstr) - - # | Full name | Time in | Time out | hours (possibly blank) | purpose | date | day (possibly blank) | - for row in sheet[-1][1:]: # skips first row which is dummy - if gui: - rowInd += 1 - progressBar.step() - rowStatus.configure( - text="Row: " + str(rowInd) + " of " + str(rowMax)) - root.update_idletasks() - results[-1].append([]) - for col in range(1, len(row)): # skip first col which is dummy - logging.info("Sheet[%d]: [%d, %d]", int(sheetInd), int(rowInd), int(col)) - temp = correctValue(row[col], col) - if(temp == None): # the correction failed. 
the user must return the correction - temp = "RequestCorrection" - results[-1][-1].append(temp) - if(results[-1][-1].count("") == len(results[-1][-1])): - results[-1].pop(-1) - else: - results[-1][-1].extend(dates) - if (logging.getLogger().level <= logging.DEBUG): - for e in range(len(results)): - debug("Results Sheet[" + str(e) + "]", results[e]) - # Iterating through results to see where errors occured - for row in range(len(results[-1])): - for col in range(len(results[-1][row][:-len(dates)])): - if (results[-1][row][col][0:18] == "RequestCorrection:"): - results[-1][row][col] = requestCorrection( - sheet[-1][row + 1][col + 1], col + 1, results[-1][row][col][18:]) - if (col + 1 in [1, 5]): - for entry in JSON["names"][str(col + 1)]: - if (results[-1][row][col].lower() == entry): - break - else: - JSONChange = True - # if the name possibly entered in by the user doesnt exist in the database, add it - JSON["names"][str( - col + 1)].append(results[-1][row][col].lower()) - if(outputDict == None): - return results - else: - globals()[outputDict] = results.copy() - return - - -def arrayToCsv(directory): - """takes a matrix and returns a string in CSV format. - var directory: a string[][] matrix that contains the information of people at the center. - returns: a string that contains all the information in CSV format. 
- """ - cvarray = '' - for i in range(len(directory)): - for e in range(len(directory[i])-1): - cvarray += (directory[i][e]+",") - cvarray += (directory[i][-1]+"\n") - logging.debug("cvarray:\n%s", cvarray) - return (cvarray+"\n") - - -# Gui Variables -signinsheet = "" -outputCSV = os.getenv("userprofile").replace( - "\\", "/") + "/Documents/signinSheetOutput.csv" -guessButton = False -submitButton = False -# Gui Functions - - -def reconfigOutput(): - global outputCSV - global outputFile - outputCSV = filedialog.askopenfilename(filetypes=( - ("Comma Style Values", "*.csv"), ("Comma Style Values", "*.csv"))) - if(outputCSV != ""): - outputFile.configure(text=outputCSV.split("/")[-1]) - - -def guessSwitch(): - global guessButton - guessButton = True - decision.set(1) - - -def submitSwitch(event=None): - global submitButton - if(event != None and correctionEntry.get() == ""): - return - submitButton = True - decision.set(1) - - -def popupTag(title, text, color="#000000"): - # Popup box for errors and completion - def end(): - popupBox.destroy() - root.destroy() - popupBox = tkinter.Toplevel() - popupBox.geometry("335x181+475+267") - popupBox.minsize(120, 1) - popupBox.maxsize(1370, 749) - popupBox.resizable(1, 1) - popupBox.configure(background="#d9d9d9") - popupBox.title = title - - popupDescription = tkinter.Text(popupBox) - popupDescription.insert("end", text) - popupDescription.configure(foreground=color, wrap="word", state="disabled", background="#FFFFFF", font="TkTextFont", highlightbackground="#d9d9d9", highlightcolor="black", insertbackground="black", - selectbackground="#c4c4c4", selectforeground="black") - popupDescription.place(relx=0.03, rely=0.055, height=91, width=314) - popupOK = tkinter.Button(popupBox, text="OK", command=end) - popupOK.configure(activebackground="#ececec", activeforeground="#000000", background="#ebebeb", - disabledforeground="#a3a3a3", foreground="#000000", highlightbackground="#d9d9d9", highlightcolor="black", pady="0") - 
popupOK.place(relx=0.328, rely=0.663, height=34, width=117) - popupBox.mainloop() - - -def main(): - ########################################## - ## Phase 3: Hooking everything together ## - ########################################## - global signinsheet - global inputFile - global errorLabel - - try: - signinsheet = filedialog.askopenfilename(filetypes=( - ("PDF Files", "*.pdf"), ("Jpeg Files", "*.jpg"), ("Png Files", "*.png"))) - inputFile.configure(text=signinsheet.split("/")[-1]) - imageDictionary = imageScraper(signinsheet) - debugImageDictionary(imageDictionary) - textDictionary = TranslateDictionary(imageDictionary, gui=True) - csvString = "" - for sheet in textDictionary: - csvString += arrayToCsv(sheet) - exportToFile(outputCSV, csvString) - errorLabel.configure(text="All finished.") - except BaseException: - import traceback - popupTag("Error", "Looks like something went wrong.\n" + - str(os.sys.exc_info())+"\n"+str(traceback.format_exc()), "#ff0000") - raise - popupTag( - "Done", "Congrats! 
its all finished.\nLook at your csv and see if it looks alright.") - if (JSONChange): - JSON["names"]["1"].sort() # Sorting new libraries for optimization - JSON["names"]["5"].sort() - JSONFile = open("aliases.json", "w") - json.dump(JSON, JSONFile, indent=4, separators=( - ",", ": "), ensure_ascii=True, sort_keys=True) - JSONFile.close() - return - - -if __name__ == "__main__": - root = tkinter.Tk(screenName="OCR To CSV Interpreter") - root.title("OCR To CSV Interpreter") - root.geometry("600x450+401+150") - root.configure(background="#d9d9d9") - root.minsize(120, 1) - root.maxsize(1370, 749) - root.resizable(1, 1) - - decision = tkinter.BooleanVar() - - inputFile = tkinter.Button(root, text="Select Signin Sheet", command=main) - inputFile.configure(activebackground="#ececec", activeforeground="#000000", background="#d9d9d9", - disabledforeground="#a3a3a3", foreground="#000000", highlightbackground="#d9d9d9", highlightcolor="black", pady="0") - inputFile.place(relx=0.033, rely=0.044, height=34, width=157) - - outputFile = tkinter.Button( - root, text=outputCSV.split("/")[-1], command=reconfigOutput) - outputFile.configure(activebackground="#ececec", activeforeground="#000000", background="#d9d9d9", - disabledforeground="#a3a3a3", foreground="#000000", highlightbackground="#d9d9d9", highlightcolor="black", pady="0") - outputFile.place(relx=0.033, rely=0.156, height=34, width=157) - - labelImage = tkinter.Label(root, text="No corrections required yet.") - labelImage.configure(background="#e6e6e6", - disabledforeground="#a3a3a3", foreground="#000000") - labelImage.place(relx=0.417, rely=0.022, height=221, width=314) - - errorLabel = tkinter.Label(root) - errorLabel.configure(wraplength=224, activebackground="#f9f9f9", activeforeground="black", background="#e1e1e1", - disabledforeground="#a3a3a3", foreground="#ff0000", highlightbackground="#d9d9d9", highlightcolor="black") - errorLabel.place(relx=0.017, rely=0.267, height=111, width=224) - - confidenceDescription = 
tkinter.Label(root) - confidenceDescription.configure(activebackground="#f9f9f9", activeforeground="black", background="#d9d9d9", - disabledforeground="#a3a3a3", foreground="#000000", highlightbackground="#d9d9d9", highlightcolor="black") - confidenceDescription.place(relx=0.267, rely=0.556, height=31, width=164) - - AIGuess = tkinter.Button(root, text="No guesses yet.", command=guessSwitch) - AIGuess.configure(activebackground="#ececec", activeforeground="#000000", background="#d9d9d9", - disabledforeground="#a3a3a3", foreground="#000000", highlightbackground="#d9d9d9", highlightcolor="black", pady="0") - AIGuess.place(relx=0.55, rely=0.556, height=34, width=227) - - orLabel = tkinter.Label(root) - orLabel.configure(activebackground="#f9f9f9", activeforeground="black", background="#d9d9d9", - disabledforeground="#a3a3a3", foreground="#000000", highlightbackground="#d9d9d9", highlightcolor="black") - orLabel.place(relx=0.017, rely=0.689, height=31, width=64) - - correctionEntry = tkinter.Entry(root) - correctionEntry.configure(background="white", disabledforeground="#a3a3a3", font="TkFixedFont", foreground="#000000", - highlightbackground="#d9d9d9", highlightcolor="black", insertbackground="black", selectbackground="#c4c4c4", selectforeground="black") - correctionEntry.place(relx=0.133, rely=0.689, height=30, relwidth=0.557) - - submit = tkinter.Button(root, text="Submit", command=submitSwitch) - submit.configure(activebackground="#ececec", activeforeground="#000000", background="#d9d9d9", - disabledforeground="#a3a3a3", foreground="#000000", highlightbackground="#d9d9d9", highlightcolor="black", pady="0") - submit.place(relx=0.717, rely=0.689, height=34, width=127) - root.bind("", submitSwitch) - - # Status bars - sheetStatus = tkinter.Label(root, text="Sheet: 0 of 0") - sheetStatus.configure(activebackground="#f9f9f9", activeforeground="black", background="#d9d9d9", - disabledforeground="#a3a3a3", foreground="#000000", highlightbackground="#d9d9d9", 
highlightcolor="black") - sheetStatus.place(relx=0.017, rely=0.844, height=21, width=94) - - rowStatus = tkinter.Label(root, text="Row: 0 of 0") - rowStatus.configure(activebackground="#f9f9f9", activeforeground="black", background="#d9d9d9", - disabledforeground="#a3a3a3", foreground="#000000", highlightbackground="#d9d9d9", highlightcolor="black") - rowStatus.place(relx=0.217, rely=0.844, height=21, width=64) - - progressBar = ttk.Progressbar(root) - progressBar.place(relx=0.017, rely=0.911, relwidth=0.95, - relheight=0.0, height=22) - - # Run main program - root.mainloop() diff --git a/main.py b/main.py new file mode 100644 index 0000000..99a4119 --- /dev/null +++ b/main.py @@ -0,0 +1,270 @@ +import json +import logging +import os +import re + +from modules.corrections import JSON, connectDict, correctValue +from modules.gui import InstallError, PopupTag, mainGUI +from modules.imageScraper import imageScraper +from modules.sanity import checkBlankRow, sanityName + +# if opencv isnt installed, it'll install it for you +try: + import numpy as nm + import cv2 +except ImportError: + if(os.system("pip install opencv-python")): + os.system("pip install --user opencv-python") +try: + from PIL import Image, ImageTk +except ModuleNotFoundError: + if(os.system("pip install pillow")): + os.system("pip install --user pillow") +except ImportError: + import Image + import ImageTk + +# if tesseract isnt installed, itll install it for you +try: + import pytesseract as tess +except ImportError: + if(os.system("pip install pytesseract")): + os.system("pip install --user pytesseract") + import pytesseract as tess +# installing pdf to image libraries +try: + from pdf2image import convert_from_path +except ImportError: + if(os.system("pip install pdf2image")): + os.system("pip install --user pdf2image") + from pdf2image import convert_from_path + +# Checking that external software is installed and ready to use +# check if tesseract exists +if os.system("tesseract --help"): + if 
def debug(label: str, content: list):
    """Emit a debug header and dump a collection to stdout when debugging.\n
    @param label: text logged (followed by a colon) at DEBUG level.\n
    @param content: iterable whose items are printed one per line, but only
    when the root logger's level is DEBUG or lower.\n
    @return None
    """
    logging.debug("%s:", label)
    # Nothing is printed unless the root logger itself is set to DEBUG or below.
    if logging.getLogger().level > logging.DEBUG:
        return
    for entry in content:
        print(entry)
def exportToFile(dir, content):
    """Write text to a file, replacing whatever the file held before.\n
    @param dir: path of the file to write (name kept for existing callers even
    though it shadows the builtin).\n
    @param content: the string to store.\n
    @return None
    """
    # The context manager closes (and therefore flushes) the handle
    # deterministically instead of leaving that to garbage collection.
    with open(dir, "w") as handle:
        handle.write(content)


def appendToFile(dir, content):
    """Add text to the end of a file, creating the file when it is missing.\n
    @param dir: path of the file to extend.\n
    @param content: the string to append.\n
    @return None
    """
    # Append mode creates a missing file and keeps existing contents, so the
    # old read-everything-then-rewrite sequence and its bare ``except`` (which
    # also swallowed unrelated errors) are no longer needed.
    with open(dir, "a") as handle:
        handle.write(content)
True)) + + # | Full name | Time in | Time out | hours (possibly blank) | purpose | date | day (possibly blank) | + # skips first row which is dummy + for row in range(1, len(sheetsDict[sheet][1])): + if gui: + rowInd += 1 + mainDisplay.progressBar.step() + mainDisplay.sheetStatus.configure( + text=textScan.format(sInd=sheetInd, sMax=sheetMax, rInd=rowInd, rMax=rowMax)) + mainDisplay.root.update_idletasks() + results[sheet].append([None for x in range(5)]) # array of 5 slots + # skip first col which is dummy + for col in range(1, len(sheetsDict[sheet][1][row])): + logging.info("Sheet[%d]: [%d, %d]", int( + sheetInd), int(rowInd), int(col)) + results[sheet][row - 1][col - + 1] = correctValue(sheetsDict[sheet][1][row][col], col) + results[sheet][-1].extend(dates) + if (logging.getLogger().level <= logging.DEBUG): + for e in range(len(results)): + debug("Results Sheet[" + str(e) + "]", results[e]) + + # Checking names for repetitions + results = sanityName(results) + + # Analysis + for sheet in range(len(results)): + # Iterating through results to see where errors occured + for row in range(len(results[sheet])): + for col in range(len(results[sheet][row][:-len(dates)])): + mainDisplay.sheetStatus.configure( + text=textSanitize.format(sInd=sheet + 1, sMax=len(results), rInd=row + 1, rMax=len(results[sheet]))) + if (results[sheet][row][col][2] == False): + results[sheet][row][col] = mainDisplay.requestCorrection( + sheetsDict[sheet][1][row + 1][col + 1], results[sheet][row][col][0]) + if (col + 1 in [1, 5]): + for entry in JSON["names"][str(col + 1)]: + if (results[sheet][row][col][0].lower() == entry): + break + else: + JSONChange = True + # if the name possibly entered in by the user doesnt exist in the database, add it + JSON["names"][str( + col + 1)].append(results[sheet][row][col][0].lower()) + + # Checking if any rows are blank + for row in range(len(results[sheet])-1, -1, -1): + if checkBlankRow(results[sheet][row]): + results[sheet].pop(row) + + if(outputDict == 
def arrayToCsv(directory):
    """Serialise one sheet of results as CSV text.\n
    @param directory: a list of rows. Each row is a list of cells and only the
    first item of every cell (``cell[0]``, the text) is written out.\n
    @return a string with one CSV line per row, followed by one extra newline
    that separates this sheet from the next one in the exported file.
    """
    # Local imports keep this block self-contained; the file already imports
    # inside functions elsewhere.
    import csv
    import io

    buffer = io.StringIO()
    # csv.writer quotes fields containing commas, quotes or newlines, which
    # plain concatenation silently turned into extra/broken columns. It also
    # copes with empty rows, which previously raised IndexError.
    writer = csv.writer(buffer, lineterminator="\n")
    for row in directory:
        writer.writerow([cell[0] for cell in row])
    cvarray = buffer.getvalue()
    logging.debug("cvarray:\n%s", cvarray)
    return cvarray + "\n"
def connectDict(mainJSON: dict):
    """Copy every entry of ``mainJSON`` into the module-level ``JSON`` store.\n
    @param mainJSON: mapping whose key/value pairs are loaded; keys already
    present in ``JSON`` are overwritten.\n
    @return None
    """
    JSON.update(mainJSON.items())
    # Also publish the shared store under the module global name "mainJSON",
    # exactly as before.
    globals()["mainJSON"] = JSON
+timeFilter = re.compile( + r'^(1[0-2]|[1-9]):?([0-5][0-9])$') + + +def parseHocr(html): + """ Scraped string of hOCR html text to get the outputs organized into nested lists and dictionaries\n + @param html: a string containing the hOCR content\n + @return a list of lists and dictionary based on the results of the hOCR + """ + results = [] + words = [] + chars = [] + + # Remove XML Namespace for my own sanity + try: + html = html.replace( + b'', b'') + except TypeError: + html = html.replace( + "", "") + + root = ET.fromstring(html) + + try: + # body.div.div.p.span#line_1_1 + base = root[1][0][0][0][0] # Ends at span#line_1_1 + except IndexError: + base = root.find("body/div/div/p/span") + if base == None: + logging.error("Error: couldnt follow tree properly") + return [[]] + + # Allocating space for all words + words = [a for a in base if "id" in a.attrib and "word_" in a.attrib["id"]] + results = [None] * len(words) + + # Populating space of words + for word in range(len(words)): + chars = [a for a in words[word] + if "id" in a.attrib and "lstm_choices_" in a.attrib["id"]] + results[word] = [None] * len(chars) + + # Populating char dicts in each space + for char in range(len(chars)): + results[word][char] = {} + # placing words into each char + for cprob in chars[char]: # getting elements themselves for dict + results[word][char][cprob.text] = max( + float(cprob.attrib["title"][8:]), 1) / 100 + + # Checking on letter headers + charHeader = [a for a in words[word] if not "id" in a.attrib] + if (len(charHeader) == len(chars)): + for char in range(len(charHeader)): + if charHeader[char].text in results[word][char]: + if float(charHeader[char].attrib["title"][charHeader[ + char].attrib["title"].find("x_conf") + 7:])/100 > results[ + word][char][charHeader[char].text]: + results[word][char][charHeader[char].text] = float( + charHeader[char].attrib["title"][charHeader[char].attrib[ + "title"].find("x_conf") + 7:])/100 + else: + 
def addMissing(resultArr, key):
    """Add look-alike characters to every character slot of a parsed hOCR result.\n
    For each slot, every character of the chosen correction table that is not
    yet a candidate is added when at least one of its look-alikes is present;
    it receives the highest confidence found among those look-alikes.\n
    @param resultArr: list of words, each a list of {character: confidence}
    dicts. The dicts are modified in place.\n
    @param key: which table of ``corrections`` to use:\n
    "a" represents alphabet\n
    "d" represents digits\n
    @return the same ``resultArr`` object, now including the added candidates.
    """
    for wordSlots in resultArr:
        for slot in wordSlots:
            # The table is looked up per slot so an empty result never touches it.
            for candidate, lookalikes in corrections[key].items():
                if candidate in slot:
                    continue  # never overwrite a character the OCR already proposed
                present = set(slot).intersection(lookalikes)
                # Seeded with 0 so that "no look-alike found" yields 0.
                best = max([0] + [slot[ch] for ch in present])
                if best != 0:
                    slot[candidate] = best
    return resultArr
def matchName(outputs: list, threshold=0.0):
    """Pick the known name that best fits the OCR readings of one name cell.\n
    Every reading is first widened with look-alike letters (addMissing) and
    lower-cased (adjustResult). A reading split into more than two words is
    stitched back into two when its longest word is the first or the last one.
    Each name in JSON["names"]["1"] is then scored against each reading by
    summing per-character confidences, and the highest score wins.\n
    @param {list} outputs: parsed hOCR readings of the same cell; each reading is a list of words and each word a list of {character: confidence} dicts. The list is modified in place.\n
    @param {double} threshold: minimum average score per character (spaces excluded) the winning name needs for the match to be flagged as trusted.\n
    @returns: {tuple} (name, score, passed). name is "Nan" and passed is False when no name scored above zero.
    """
    for i in range(len(outputs)):  # Iterating through all outputs
        outputs[i] = adjustResult(addMissing(outputs[i], "a"))
        words = outputs[i]
        if not words:
            continue

        # Attempting to separate first and last name: the largest portion is
        # likely either the first or the last name.
        largestWord = max(words, key=len)
        if largestWord == words[0]:
            # Example: Firstname La stN ame -> Firstname LastName
            while len(words) > 2:
                words[1].extend(words.pop(2))
        elif largestWord == words[-1]:
            # Example: Fir stNa me Lastname -> FirstName Lastname
            while len(words) > 2:
                words[0].extend(words.pop(1))
        # if the largest portion is in the middle there is no way to guess how
        # to stitch the parts together, so the reading is left alone.

    ####################################
    ## CALCULATING NAME PROBABILITIES ##
    ####################################

    bestName = "Nan"
    bestProb = 0
    for name in JSON["names"]["1"]:
        for output in outputs:
            probability = 0

            # Calculation for single words
            if len(output) == 1:
                slots = output[0]
                for char in range(min(len(name), len(slots))):
                    letter = name[char]
                    hasNext = char < len(slots) - 1
                    # if the character is in the same position
                    if letter in slots[char]:
                        probability += slots[char][letter]
                    # if the character is in next position and this slot may be a blank
                    elif None in slots[char] and hasNext and letter in slots[char + 1]:
                        probability += slots[char + 1][letter]
                    # if the character is in next pos
                    elif hasNext and letter in slots[char + 1]:
                        probability += slots[char + 1][letter] * 0.75
                    # if character is in previous position
                    elif char > 0 and letter in slots[char - 1]:
                        probability += slots[char - 1][letter] * 0.5

            # Calculation for exactly two words:
            # separate first and last name and evaluate
            elif len(output) == 2:
                if " " not in name:
                    continue  # skip any names with no spaces
                namep = name.split(" ", 2)
                for word in range(2):
                    slots = output[word]
                    for char in range(min(len(namep[word]), len(slots))):
                        letter = namep[word][char]
                        if letter in slots[char]:
                            probability += slots[char][letter]
                        # Fixed: this used to test the whole word
                        # (namep[word]) against the next slot, so the
                        # next-position credit could never be earned.
                        elif char < len(slots) - 1 and letter in slots[char + 1]:
                            probability += slots[char + 1][letter] * 0.75
                        elif char > 0 and letter in slots[char - 1]:
                            probability += slots[char - 1][letter] * 0.5

            # its more than 1 or 2 words. strip all words down to one line and evaluate
            else:
                tempName = name.replace(" ", "")
                # Fixed: rebuilt for every reading. The list used to be created
                # once and only ever extended, so every comparison after the
                # first ran against stale slots from earlier readings/names.
                tempList = [slot for li in output for slot in li]

                for pos in range(min(len(tempName), len(tempList))):
                    letter = tempName[pos]
                    if letter in tempList[pos]:
                        probability += tempList[pos][letter]
                    elif letter.upper() in tempList[pos]:
                        probability += tempList[pos][letter.upper()]
                    elif pos < len(tempList) - 1 and letter in tempList[pos + 1]:
                        probability += tempList[pos + 1][letter] * 0.75
                    elif pos > 0 and letter in tempList[pos - 1]:
                        probability += tempList[pos - 1][letter] * 0.5

            logging.debug("MatchName %s: %lf", name, probability)
            if probability > bestProb:
                bestName = name
                bestProb = probability
    logging.info("MatchName Best: %s %lf", bestName, bestProb)
    if bestName == "Nan":
        return (bestName, bestProb, False)
    # Average score per character (spaces ignored) decides whether to trust it.
    if bestProb / len(bestName.replace(" ", "")) >= threshold:
        return (bestName, bestProb, True)
    return (bestName, bestProb, False)
This one in particular tries to match output to a specific time.\n + @param {list} outputs: The list object that comes from parsing the hOCR output of 3 objects.\n + @param {double} threshold: Optional variable. Changes the percentage of characters that need to match the origional of it to return. Higher threshholds mean more strict requirements and higher chance of getting nothing. Lower threshholds mean higher chance to get a value that may or may not be incorrect.\n + @returns: {tuple} it returns a tuple containing the expected name, the probability of that name being true, and a bool discussing whether it passed the threshold. + """ + #################### + ## Enriching Data ## + #################### + time = "" + timeAlt = "" + probability = 0 + probabilityAddition = 0 + bestTime = "Nan" + bestProb = 0 + bestAltProb = 0 + + # Adding alternatives + for i in range(2, -1, -1): + outputs[i] = addMissing(outputs[i], "d") # adds alternate digits + + # Checking if size constraints are correct + if ((len(outputs[i]) > 1) or ( + # if its a size less than 3, then itll never be a time + len(outputs[i][0]) < 3) or ( + # if colon in middle, then + len(outputs[i][0]) < 4 and ":" in outputs[i][0][-3]) or ( + len(outputs[i][0]) > 4 and not ":" in outputs[i][0][-3]) or ( + len(outputs[i][0]) > 5)): + print("Failed size") + for char in outputs[i][0]: + print(char) + print(len(outputs[i][0])) + outputs.pop(i) + + for i in range(len(outputs)): + # Removing any letters in dictionary + for slot in range(len(outputs[i][0])): + for char in list(outputs[i][0][slot].keys()): + if not char.isdigit() and char != ":": + # if the key isnt a number or a colon, then remove it + del outputs[i][0][slot][char] + + ########################### + # Calculating Probability # + ########################### + + # Permutating through all combos in the list + for timed in product(*outputs[i][0]): + # building the time string + time = "".join(timed) + probability = 0 + probabilityAddition = 0 + # 
print(timed) + for cnum in range(len(timed)): + probability += outputs[i][0][cnum][timed[cnum]] + + logging.debug("Time: %s - %lf", time, probability) + if bool(timeFilter.match(time)): + # BOOSTING PROBABILITY FROM OTHER OUTPUTS + + probAdd = 0 # additional probability to add based on other two outputs + probAddAlt = 0 # addition probability if alternate is better + + # Creating alternate with/out colon + if not ":" in time: + timeAlt = time[:-2] + ":" + time[-2:] + else: + timeAlt = time.replace(":", "") + + logging.info("Probability: %lf", probability) + logging.info("Time: %s", time) + logging.info("Time Alternate: %s", timeAlt) + + for j in range(len(outputs)): + if j == i: # preventing iterating through itself + continue + + for slot in range(min(len(time), len(outputs[j][0]))): + # if the char in the outputs dict, itll add its value + # itll only add values if it could fit it + if time[slot] in outputs[j][0][slot].keys(): + probAdd += outputs[j][0][slot][time[slot]] + else: + probAdd = 0 # do this to remove probability if it doesnt perfectly fit in match + break + for slot in range(min(len(timeAlt), len(outputs[j][0]))): + if timeAlt[slot] in outputs[j][0][slot].keys(): + probAddAlt += outputs[j][0][slot][timeAlt[slot]] + else: + probAddAlt = 0 + break + + # Double assurance for non values + if ":" in time or probAdd > probAddAlt: + probabilityAddition += probAdd + else: + probabilityAddition += probAddAlt + # probAdd = max(probAdd, 0) + # probAddAlt = max(probAddAlt, 0) + + # probabilityAddition += probAdd + probAddAlt + logging.info("Time Probability: %s, %s, %lf, %lf, %lf", + time, timeAlt, probability, probabilityAddition, probability + probabilityAddition) + + # Deciding best decision + if (probability + probabilityAddition >= bestProb + bestAltProb and probability > bestProb): + if ":" in time: + bestTime = time + else: + bestTime = timeAlt + bestProb = probability + bestAltProb = probabilityAddition + + # To decide if time probability is past its 
def matchHour(outputs: list, threshold=0.3):
    """Pick the most likely hour count from several OCR readings of one cell.

    Each reading is passed through ``addMissing`` with the "d" flag and
    stripped of every non-numeric candidate character. The first word of each
    reading is then expanded into all the digit strings it can spell. A
    string is scored with its own character weights plus a boost from every
    other reading that can spell the same digits position by position.

    @param {list} outputs: The list object that comes from parsing the hOCR output. It is modified in place.
    @param {double} threshold: Optional variable. The combined score has to exceed
    bestProb * len(outputs) * threshold for the result to be accepted. Higher
    thresholds are stricter, lower thresholds accept less certain values.
    @returns: {tuple} the best hour string ("" when nothing matched), its
    combined probability, and a bool telling whether it passed the threshold.
    """
    bestHour = ""
    bestProb = 0
    bestAlt = 0

    # Refining the selection
    for i in range(len(outputs)):
        outputs[i] = addMissing(outputs[i], "d")
        if not outputs[i]:
            continue  # nothing was read, so there is no first word to clean

        # Removing non int values
        for slot in outputs[i][0]:
            for char in list(slot.keys()):
                if not (isinstance(char, str) and (char.isdigit() or char.isdecimal())):
                    del slot[char]

    # Only the first word of every non-empty reading takes part in scoring
    readings = [output[0] for output in outputs if output]

    # Calculations
    for i, slots in enumerate(readings):
        for hourd in product(*slots):
            hour = "".join(hourd)
            if not (hour.isdigit() or hour.isdecimal()):
                continue

            probability = 0
            for pos in range(len(hourd)):
                probability += slots[pos][hourd[pos]]

            altProb = 0
            for j, other in enumerate(readings):
                if i == j:
                    continue
                temp = 0
                for pos in range(min(len(hour), len(other))):
                    # Look up the digit itself. The position index is never a
                    # key of the slot dictionary, so testing it (as the code
                    # used to) made this boost always zero.
                    if hour[pos] in other[pos]:
                        temp += other[pos][hour[pos]]
                    else:
                        temp = 0  # it has to fit perfectly to count
                        break
                altProb += temp

            logging.info("Hour %s: %lf %lf = %lf", hour,
                         probability, altProb, probability + altProb)
            # Deciding best one
            if (probability + altProb > bestProb + bestAlt and probability > bestProb):
                bestHour = hour
                bestProb = probability
                bestAlt = altProb

    if (bestProb + bestAlt > bestProb * len(outputs) * threshold):
        return (bestHour, bestProb + bestAlt, True)
    return (bestHour, bestProb + bestAlt, False)
def matchPurpose(outputs: list, threshold=0.3):
    """Pick the known purpose that best matches several OCR readings of one cell.

    Each reading is passed through ``addMissing`` with the "a" flag and then
    through ``adjustResult``. Every purpose listed under JSON["names"]["5"]
    is scored against every reading: a character found in the expected slot
    earns its full weight, one found in the following slot earns 75% (100%
    when the expected slot also holds a ``None`` candidate, single-word
    readings only) and one found in the previous slot earns 50%.
    Readings made of several words are flattened into one run of slots and
    compared with the purpose stripped of spaces.

    @param {list} outputs: The list object that comes from parsing the hOCR output. It is modified in place.
    @param {double} threshold: Optional variable. Minimum average score per
    character (spaces excluded) that the best purpose needs to be accepted.
    @returns: {tuple} the best purpose ("Nan" when nothing matched), its
    probability, and a bool telling whether it passed the threshold.
    """
    bestPurpose = "Nan"
    bestProb = 0

    for i in range(len(outputs)):
        outputs[i] = addMissing(outputs[i], "a")
        outputs[i] = adjustResult(outputs[i])

    # iterating through possible results
    for output in outputs:
        # Flatten this reading once. The list used to be shared by every
        # reading and every purpose and was never cleared, so everything
        # after the first comparison was scored against stale slots.
        flatSlots = []
        for wordSlots in output:
            flatSlots.extend(wordSlots)

        for purpose in JSON["names"]["5"]:
            probability = 0

            # Checking for 1 word purposes
            if (len(output) == 1):
                if " " in purpose:  # skip any purpose that isnt exactly one word
                    continue
                word = output[0]
                for slot in range(min(len(purpose), len(word))):
                    char = purpose[slot]
                    hasNext = slot < len(word) - 1
                    # if the character is in the same position
                    if char in word[slot]:
                        probability += word[slot][char]
                    # if the character is in next position and none is currently available
                    elif None in word[slot] and hasNext and char in word[slot + 1]:
                        probability += word[slot + 1][char]
                    # if the character is in next pos
                    elif hasNext and char in word[slot + 1]:
                        probability += word[slot + 1][char] * 0.75
                    # if character is in previous position
                    elif slot > 0 and char in word[slot - 1]:
                        probability += word[slot - 1][char] * 0.5
                    # otherwise the character just doesnt exist: adds nothing
            else:  # for literally any other value
                compact = purpose.replace(" ", "")
                for pos in range(min(len(compact), len(flatSlots))):
                    char = compact[pos]
                    if char in flatSlots[pos]:
                        probability += flatSlots[pos][char]
                    elif char.upper() in flatSlots[pos]:
                        probability += flatSlots[pos][char.upper()]
                    elif pos < len(flatSlots) - 1 and char in flatSlots[pos + 1]:
                        probability += flatSlots[pos + 1][char] * 0.75
                    elif pos > 0 and char in flatSlots[pos - 1]:
                        probability += flatSlots[pos - 1][char] * 0.5

            logging.debug("MatchPurpose %s: %lf", purpose, probability)
            if(probability > bestProb):
                bestPurpose = purpose
                bestProb = probability
    logging.info("MatchPurpose Best: %s %lf", bestPurpose, bestProb)

    if (bestPurpose == "Nan"):
        return (bestPurpose, bestProb, False)
    if (bestProb/len(bestPurpose.replace(" ", "")) >= threshold):
        return (bestPurpose, bestProb, True)
    return (bestPurpose, bestProb, False)
def correctValue(image, column, threshold=-1):
    """Transcribe one table cell and correct the result for its column.

    The cell is first checked for ink: when 1% or less of its pixels (ignoring
    an 8px border that may contain table lines) are dark, it is reported as
    blank without running OCR. Otherwise tesseract is run on three versions
    of the image (colour, greyscale and Otsu threshold), each hOCR output is
    parsed with ``parseHocr`` and the three readings are handed to the matcher
    of the column: 1 -> matchName, 2 and 3 -> matchTime, 4 -> matchHour,
    5 -> matchPurpose.

    @param {cvimg} image: The image that is being transcribed.
    @param {int} column: The column in the table that the image is in. It selects the tesseract settings and the matcher.
    @param {double} threshold: Optional variable. Passed on to the matcher; -1 (the default) means 0.
    @returns: {tuple} (value, probability, accepted). A blank cell gives
    ("", 0, True). ("Nan", 0, False) is returned when tesseract fails and
    ("NaN", 0, False) when nothing could be read or the column has no matcher.
    """
    # Default settings for threshold
    thr = 0 if threshold == -1 else threshold

    # Running initial checks to see if cell is empty
    height, width = image.shape[:2]
    if height == 0 or width == 0:
        logging.info("It's Blank")
        return ("", 0, True)

    # were creating an inverted thresh of the image for counting pixels, removes 8px border in case it includes external lines or table borders.
    # Cells too small to survive the crop are measured whole; cropping them
    # would leave an empty image that cvtColor rejects.
    border = 8
    if height > 2 * border and width > 2 * border:
        cell = image[border:-border, border:-border]
    else:
        cell = image
    invert = cv2.cvtColor(cell, cv2.COLOR_BGR2GRAY)
    invert = cv2.threshold(
        invert, 128, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)[1]
    invert = 255 - invert
    # countnonzero only counts white pixels, so i need to invert to turn black pixels white
    blankPercent = cv2.countNonZero(invert) / (invert.shape[0] * invert.shape[1])

    logging.debug("blankPercent: %s", blankPercent)
    # will only consider empty if image used less than 1% of pixels. yes, that small
    if(blankPercent <= 0.01):
        logging.info("It's Blank")
        # Skipping ahead if its already looking like theres nothing
        return ("", 0, True)
    del invert

    if column in (1, 5):
        conf = "--dpi 300 -c lstm_choice_mode=2"
    elif column in (2, 3, 4):
        conf = "--dpi 300 -c lstm_choice_mode=2 -c hocr_char_boxes=1 --psm 8"
    else:
        # No matcher exists for this column, so the three OCR passes below
        # could only ever end in the same answer.
        return ("NaN", 0, False)

    outputs = [None] * 3

    # Safety net incase tesseract breaks for no reason
    try:
        # Get normal results
        outputs[0] = parseHocr(tess.image_to_pdf_or_hocr(
            image, lang="eng", extension="hocr", config=conf))

        # Get black and white results
        temp = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
        outputs[1] = parseHocr(tess.image_to_pdf_or_hocr(
            temp, lang="eng", extension="hocr", config=conf))

        # get thresh results
        temp = cv2.threshold(
            temp, 128, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)[1]
        outputs[2] = parseHocr(tess.image_to_pdf_or_hocr(
            temp, lang="eng", extension="hocr", config=conf))
    except tess.pytesseract.TesseractError:
        logging.error("Tesseract Error")
        return ("Nan", 0, False)

    # quick check incase box is looking empty; only gives up when all three readings are blank
    if not any(outputs):
        logging.info("we couldnt read it")
        # if theres enough pixels to describe a possbile image, then it isnt empty, but it cant read it
        return ("NaN", 0, False)

    ##########################
    ## APPLYING CORRECTIONS ##
    ##########################

    if column == 1:
        return matchName(outputs, threshold=thr)
    if column in (2, 3):
        return matchTime(outputs, threshold=thr)
    if column == 4:
        return matchHour(outputs, threshold=thr)
    return matchPurpose(outputs, threshold=thr)  # column 5
yes, that small + if(pixelCount/pixelTotal <= 0.01): + logging.info("It's Blank") + # Skipping ahead if its already looking like theres nothing + return ("", 0, True) + del invert, pixelCount, pixelTotal + + outputs = [None] * 3 + + conf = "" + if column in [1, 5]: + conf = "--dpi 300 -c lstm_choice_mode=2" + elif column in [2, 3, 4]: + conf = "--dpi 300 -c lstm_choice_mode=2 -c hocr_char_boxes=1 --psm 8" + + # Safety net incase tesseract breaks for no reason + try: + # Get normal results + outputs[0] = parseHocr(tess.image_to_pdf_or_hocr( + image, lang="eng", extension="hocr", config=conf)) + + # Get black and white results + temp = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) + outputs[1] = parseHocr(tess.image_to_pdf_or_hocr( + temp, lang="eng", extension="hocr", config=conf)) + + # get thresh results + temp = cv2.threshold( + temp, 128, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)[1] + outputs[2] = parseHocr(tess.image_to_pdf_or_hocr( + temp, lang="eng", extension="hocr", config=conf)) + except tess.pytesseract.TesseractError: + logging.error("Tesseract Error") + return ("Nan", 0, False) + + # quick check incase box is looking empty; will only skip if 2/3 or more are blank + if(not (bool(outputs[0]) or bool(outputs[1]) or bool(outputs[2]))): + logging.info("we couldnt read it") + # if theres enough pixels to describe a possbile image, then it isnt empty, but it cant read it + return ("NaN", 0, False) + + ########################## + ## APPLYING CORRECTIONS ## + ########################## + + if (column == 1): + return matchName(outputs, threshold=thr) + elif(column == 2 or column == 3): + return matchTime(outputs, threshold=thr) + elif (column == 4): + return matchHour(outputs, threshold=thr) + elif(column == 5): + return matchPurpose(outputs, threshold=thr) + return ("NaN", 0, False) diff --git a/modules/gui.py b/modules/gui.py new file mode 100644 index 0000000..7713bec --- /dev/null +++ b/modules/gui.py @@ -0,0 +1,409 @@ +import tkinter +from math import floor +from 
class mainGUI:
    """ This is the GUI that is used for the entire program. Everything is based
    off of this GUI. Any changes here will reflect directly on the program.
    """
    # Gui Variables
    signinsheet = ""  # path of the sign-in sheet picked by the user
    outputCSV = normpath(expanduser("~\\Documents\\signinSheetOutput.csv"))
    guessButton = False  # set when the user accepted the guess
    submitButton = False  # set when the user submitted a typed correction

    # GUI Components
    root = None
    decision = None
    inputFile = None
    outputFile = None
    start = None
    labelImage = None
    errorLabel = None
    confidenceDescription = None
    AIGuess = None
    orLabel = None
    correctionEntry = None
    submit = None

    # Status bars
    sheetStatus = None
    progressBar = None

    def __init__(self, cmd=None):
        """ Builds every widget of the main window.\n
        @param cmd: callback run when the Start button is pressed.
        """
        # Init GUI Components
        # The window title is set through title() below. Tk's screenName
        # argument is the X display to connect to, not a caption, so it is
        # not passed here.
        self.root = tkinter.Tk()
        self.decision = tkinter.BooleanVar(self.root)
        self.inputFile = tkinter.Button(self.root)
        self.outputFile = tkinter.Button(self.root)
        self.start = tkinter.Button(self.root)
        self.labelImage = tkinter.Label(self.root)
        self.errorLabel = tkinter.Label(self.root)
        self.confidenceDescription = tkinter.Label(self.root)
        self.AIGuess = tkinter.Button(self.root)
        self.orLabel = tkinter.Label(self.root)
        self.correctionEntry = tkinter.Entry(self.root)
        self.submit = tkinter.Button(self.root)

        # Status bars
        self.sheetStatus = tkinter.Label(self.root)
        self.progressBar = ttk.Progressbar(self.root)

        self.root.title("OCR To CSV Interpreter")
        self.root.geometry("600x450+401+150")
        self.root.configure(background="#d9d9d9")
        self.root.minsize(120, 1)
        self.root.maxsize(1370, 749)
        self.root.resizable(1, 1)

        # Looks shared by the grey buttons and the plain labels
        buttonStyle = dict(
            activebackground="#ececec", activeforeground="#000000", background="#d9d9d9",
            disabledforeground="#a3a3a3", foreground="#000000",
            highlightbackground="#d9d9d9", highlightcolor="black", pady="0")
        labelStyle = dict(
            activebackground="#f9f9f9", activeforeground="black", background="#d9d9d9",
            disabledforeground="#a3a3a3", foreground="#000000",
            highlightbackground="#d9d9d9", highlightcolor="black")

        self.inputFile.configure(
            text="Select Signin Sheet", command=self.reconfigInput)
        self.inputFile.configure(**buttonStyle)
        self.inputFile.place(relx=0.033, rely=0.044, height=34, width=157)

        self.outputFile.configure(
            text=basename(self.outputCSV), command=self.reconfigOutput)
        self.outputFile.configure(**buttonStyle)
        self.outputFile.place(relx=0.033, rely=0.156, height=34, width=157)

        self.start.configure(text="Start", command=cmd)
        self.start.configure(**buttonStyle)
        self.start.configure(background="#17a252", foreground="#ffffff")
        self.start.place(relx=0.033, rely=0.256, height=34, width=157)

        self.labelImage.configure(text="No corrections required yet.")
        self.labelImage.configure(
            background="#e6e6e6", disabledforeground="#a3a3a3", foreground="#000000")
        self.labelImage.place(relx=0.417, rely=0.022, height=221, width=314)

        self.errorLabel.configure(**labelStyle)
        self.errorLabel.configure(
            wraplength=224, background="#e1e1e1", foreground="#ff0000")
        self.errorLabel.place(relx=0.017, rely=0.356, height=71, width=224)

        self.confidenceDescription.configure(**labelStyle)
        self.confidenceDescription.place(
            relx=0.267, rely=0.556, height=31, width=164)

        self.AIGuess.configure(text="No guesses yet.")
        self.AIGuess.configure(command=self.guessSwitch, **buttonStyle)
        self.AIGuess.place(relx=0.55, rely=0.556, height=34, width=227)

        self.orLabel.configure(**labelStyle)
        self.orLabel.place(relx=0.017, rely=0.689, height=31, width=64)

        self.correctionEntry.configure(
            background="white", disabledforeground="#a3a3a3", font="TkFixedFont", foreground="#000000",
            highlightbackground="#d9d9d9", highlightcolor="black", insertbackground="black",
            selectbackground="#c4c4c4", selectforeground="black")
        self.correctionEntry.place(
            relx=0.133, rely=0.689, height=30, relwidth=0.557)

        self.submit.configure(text="Submit")
        self.submit.configure(command=self.submitSwitch, **buttonStyle)
        self.submit.place(relx=0.717, rely=0.689, height=34, width=127)
        # Pressing Enter submits the typed correction. An empty event
        # sequence is rejected by Tk, so the key has to be named.
        self.root.bind("<Return>", self.submitSwitch)

        self.sheetStatus.configure(anchor="w", **labelStyle)
        self.sheetStatus.place(relx=0.033, rely=0.844, height=21, width=554)

        self.progressBar.place(relx=0.017, rely=0.911, relwidth=0.95,
                               relheight=0.0, height=22)

    # GUI Functions

    def run(self):
        """ This starts the GUI """
        self.root.mainloop()

    def guessSwitch(self):
        """ Switches decision switch towards the guess """
        self.guessButton = True
        self.decision.set(1)

    def submitSwitch(self, event=None):
        """ Switches the decision switch towards what is submitted.\n
        @param event: the key event when triggered from the keyboard. A key
        press with an empty entry is ignored; the Submit button always counts.
        """
        if event is not None and self.correctionEntry.get() == "":
            return
        self.submitButton = True
        self.decision.set(1)

    def reconfigOutput(self):
        """ Changes the output CSV used """
        outputCSV = filedialog.askopenfilename(filetypes=(
            ("Comma Style Values", "*.csv"), ("Comma Style Values", "*.csv")))
        # a cancelled dialog gives back an empty string or an empty tuple
        if outputCSV:
            self.outputFile.configure(text=basename(outputCSV))
            self.outputCSV = outputCSV

    def reconfigInput(self):
        """ Changes the input PDF used """
        signinsheet = filedialog.askopenfilename(filetypes=(
            ("PDF Files", "*.pdf"), ("Jpeg Files", "*.jpg"), ("Png Files", "*.png")))
        # a cancelled dialog gives back an empty string or an empty tuple
        if signinsheet:
            self.inputFile.configure(text=basename(signinsheet))
            self.signinsheet = signinsheet

    def requestCorrection(self, displayImage, guess=""):
        """This is the function used when a string doesnt confidently match a name.\n
        It shows the image and the guess, then blocks until the user either
        presses the guess button or submits a typed value.\n
        @param displayImage: The image placed on the display for user to see. It
        is read through its ``shape`` and converted with Image.fromarray.\n
        @param {string} guess: The text shown on the guess button and returned
        when the user accepts it.\n
        @return: {tuple} (answer, 100, True) where answer is the guess when the
        guess button was pressed and the typed text otherwise.
        """
        # Setting up image to place in GUI
        image = Image.fromarray(displayImage)
        labelWidth = self.labelImage.winfo_width()
        if(displayImage.shape[1] > labelWidth):
            hgt, wth = displayImage.shape[:2]
            ratio = labelWidth/wth
            # LANCZOS is the filter ANTIALIAS used to alias (the alias was
            # removed in Pillow 10). Sizes are kept at 1px or more because
            # resize rejects empty targets.
            image = image.resize(
                (max(1, floor(wth * ratio)), max(1, floor(hgt * ratio))), Image.LANCZOS)
        image = ImageTk.PhotoImage(image)

        # setting values to labels in gui
        self.labelImage.configure(image=image)
        self.labelImage.image = image  # keeps the photo alive while displayed
        self.errorLabel.configure(
            text="Uh oh. It looks like we couldnt condifently decide who or what this is. We need you to either confirm our guess or type in the correct value")
        self.confidenceDescription.configure(
            text="Were not confident, but is it:")
        self.AIGuess.configure(text=guess)
        self.orLabel.configure(text="or")

        # basically waits till user presses a button and changes variable scope
        self.root.update_idletasks()
        self.root.wait_variable(self.decision)
        result = self.correctionEntry.get()  # read before the entry is wiped

        # Resetting changes made
        self.labelImage.configure(image="")  # empty string detaches the photo
        self.labelImage.image = None
        self.errorLabel.configure(text="")
        self.confidenceDescription.configure(text="")
        self.AIGuess.configure(text="")
        self.orLabel.configure(text="")
        self.correctionEntry.delete(0, "end")
        self.root.update_idletasks()
        sleep(1)
        self.decision.set(0)

        usedGuess = self.guessButton
        self.guessButton = False
        self.submitButton = False
        return (guess if usedGuess else result, 100, True)
class PopupTag:
    """ This GUI is designed to popup for the main GUI and give various information.
    It is sometimes used for displaying Errors to the user. It is also used for
    confirming completion of work for the user.
    """
    # GUI Components
    popupBox = None
    popupDescription = None
    popupOK = None
    top = None  # the owning GUI; its ``root`` window is closed together with the popup

    def __init__(self, top, title, text, color="#000000"):
        """ Builds the popup window.\n
        @param top: the GUI object that owns the popup. Its ``root`` attribute is destroyed by end().\n
        @param {string} title: caption of the popup window.\n
        @param {string} text: message shown in the read-only text box.\n
        @param {string} color: colour of the message text.
        """
        self.top = top

        self.popupBox = tkinter.Toplevel()
        self.popupDescription = tkinter.Text(self.popupBox)
        self.popupOK = tkinter.Button(self.popupBox)

        self.popupBox.geometry("335x181+475+267")
        self.popupBox.minsize(120, 1)
        self.popupBox.maxsize(1370, 749)
        self.popupBox.resizable(1, 1)
        self.popupBox.configure(background="#d9d9d9")
        # title is a method. Assigning to it only replaced the method on this
        # object and left the window without its caption.
        self.popupBox.title(title)

        # the text has to go in before the widget is switched to "disabled"
        self.popupDescription.insert("end", text)
        self.popupDescription.configure(
            foreground=color, wrap="word", state="disabled", background="#FFFFFF",
            font="TkTextFont", highlightbackground="#d9d9d9", highlightcolor="black",
            insertbackground="black", selectbackground="#c4c4c4", selectforeground="black")
        self.popupDescription.place(
            relx=0.03, rely=0.055, height=91, width=314)

        self.popupOK.configure(text="OK", command=self.end)
        self.popupOK.configure(
            activebackground="#ececec", activeforeground="#000000", background="#ebebeb",
            disabledforeground="#a3a3a3", foreground="#000000", highlightbackground="#d9d9d9",
            highlightcolor="black", pady="0")
        self.popupOK.place(relx=0.328, rely=0.663, height=34, width=117)

    def run(self):
        """ This starts the GUI """
        self.popupBox.mainloop()

    def end(self):
        """ This closes the popup and the GUI tied to it """
        self.popupBox.destroy()
        self.top.root.destroy()
class InstallError:
    """ This GUI is only useful for aknowledging errors due to a necessary
    program not being installed. This takes the name of the program as well
    as the download URL for the program in order to provide it to the user.
    """
    # Variables
    name = None
    URL = None
    fileName = None

    # Fonts
    font11 = "-family {Segoe UI} -size 18 -weight bold"
    font13 = "-family {Segoe UI} -size 16 -weight bold"

    # Text
    text_description = "Warning: Youre missing {name}. it is a required software to make this tool run. To fix this issue, please follow the instructions below."

    # GUI Components
    root = None
    header = None
    description = None
    link = None
    orLabel = None
    location = None
    downloadLabel = None
    navigateLabel = None

    def __init__(self, name, URL, filename):
        """ Builds the window.\n
        @param {string} name: name of the missing program, shown in the texts and buttons.\n
        @param {string} URL: download page opened by the download button.\n
        @param {string} filename: file pattern offered by the file dialog of the navigate button.
        """
        self.name = name
        self.URL = URL
        self.fileName = filename

        self.root = tkinter.Tk(baseName="Missing Software")
        self.header = tkinter.Label(self.root)
        self.description = tkinter.Label(self.root)
        self.link = tkinter.Label(self.root)
        self.orLabel = tkinter.Label(self.root)
        self.location = tkinter.Label(self.root)
        self.downloadLabel = tkinter.Button(self.root)
        self.navigateLabel = tkinter.Button(self.root)

        self.root.title("Missing Software")
        self.root.geometry("438x478")
        self.root.minsize(120, 1)
        self.root.maxsize(1370, 749)
        self.root.resizable(1, 1)
        self.root.configure(background="#d9d9d9")

        self.header.configure(text="Software Not Installed")
        self.header.configure(
            font=self.font11, activeforeground="#372fd7", background="#d9d9d9",
            disabledforeground="#a3a3a3", foreground="#2432d9")
        self.header.place(relx=0.16, rely=0.042, height=61, width=294)

        # The warning belongs in the description box. It used to be written
        # to the header, replacing the headline and leaving this box empty.
        self.description.configure(
            text=self.text_description.format(name=name))
        self.description.configure(
            font="-family {Segoe UI} -size 14", background="#ffffff",
            disabledforeground="#a3a3a3", foreground="#000000", wraplength="294")
        self.description.place(relx=0.16, rely=0.167, height=151, width=294)

        self.link.configure(
            text="If you havent already installed this software, please follow the download link.")
        self.link.configure(
            background="#eeeeee", disabledforeground="#a3a3a3", foreground="#000000",
            wraplength="294")
        self.link.place(relx=0.16, rely=0.523, height=31, width=294)

        self.orLabel.configure(text="Or")
        self.orLabel.configure(
            font="-family {Segoe UI} -size 16 -weight bold",
            background="#d9d9d9", disabledforeground="#a3a3a3", foreground="#29c1dc")
        self.orLabel.place(relx=0.457, rely=0.69, height=36, width=40)

        self.location.configure(
            text="If you've already installed the software, please lead us to where it is as we cannot find it.")
        self.location.configure(
            background="#eeeeee", wraplength="294",
            disabledforeground="#a3a3a3", foreground="#000000")
        self.location.place(relx=0.16, rely=0.774, height=41, width=294)

        self.downloadLabel.configure(
            text="Download {name}".format(name=name), command=self.download)
        self.downloadLabel.configure(
            font=self.font11, activebackground="#ececec", activeforeground="#000000",
            background="#48d250", disabledforeground="#a3a3a3", foreground="#000000",
            highlightbackground="#d9d9d9", highlightcolor="black", pady="0")
        self.downloadLabel.place(relx=0.16, rely=0.607, height=34, width=297)

        self.navigateLabel.configure(
            text="Navigate to {name}".format(name=name), command=self.navigate)
        self.navigateLabel.configure(
            font=self.font13, activebackground="#ececec", activeforeground="#000000",
            background="#eaecec", disabledforeground="#a3a3a3", foreground="#000000",
            highlightbackground="#d9d9d9", highlightcolor="black", pady="0")
        self.navigateLabel.place(relx=0.16, rely=0.879, height=34, width=297)

    def run(self):
        """ This starts the GUI """
        self.root.mainloop()

    def download(self):
        """ Runs command to open URL in users web browser """
        import webbrowser
        webbrowser.open(self.URL, autoraise=True)

    def navigate(self):
        """ If the user has already installed the program, but the program
        doesnt see it, it'll attempt to add the program to the user's path
        so that the user doesnt have to explain where the program is every
        time it is open. Nothing is changed when the dialog is cancelled or
        when the resulting path would reach 1024 characters.
        """
        from os import getenv, system
        chosen = filedialog.askopenfilename(
            filetypes=((self.name, self.fileName), (self.name, self.fileName)))
        if not chosen:
            return  # dialog cancelled, there is no folder to add

        currentPath = getenv("path") or ""
        path = dirname(normpath(chosen))
        if currentPath and currentPath[-1] != ";":
            path = ";" + path
        if(len(currentPath + path) >= 1024):
            self.description.configure(
                text="Error: we could not add the file to your path for you. You will have to do this manually.")
            # Stop here: the user has just been told to do it by hand, and
            # running setx anyway would still rewrite PATH.
            return

        # NOTE(review): the command line is assembled from the chosen folder
        # name and handed to the shell - confirm this is acceptable.
        userProfile = getenv("userprofile")
        if userProfile and userProfile in path:
            if(system("setx PATH \"%path%" + path + "\"")):
                print("Failed to do command")
        else:
            if(system("setx PATH /M \"%path%" + path + "\"")):
                print("failed to do command")
def collectContours(image):
    """ Sub function used by scrapper.\n
    Isolates the vertical and horizontal lines of the image, merges them into
    an empty table (so that letters dont become boxes) and detects the
    contours of that table.\n
    @param image: an opencv image in BGR colour\n
    @return returns a tuple of the contours found in the image, ordered by the
    top edge of their bounding boxes (empty when nothing was found).\n
    This function was heavily influenced by its source.\n
    @source: https://medium.com/coinmonks/a-box-detection-algorithm-for-any-image-containing-boxes-756c15d7ed26
    """
    debugging = logging.getLogger().level <= logging.DEBUG
    debugIndex = 0

    # Grab absolute thresh of image
    image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    thresh = cv2.threshold(
        image, 128, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)[1]
    invert = 255 - thresh

    if debugging:
        # first index that has not been used by an earlier debug dump
        while(exists("debugOutput/scrapper/{ind}1invert.jpg".format(ind=debugIndex))):
            debugIndex += 1
        cv2.imwrite(
            "debugOutput/scrapper/{ind}1invert.jpg".format(ind=debugIndex), invert)

    #######################################
    # Defining kernels for line detection #
    #######################################
    # at least 1px, a structuring element cannot have a zero-length side
    kernel_length = max(1, image.shape[1]//80)
    verticle_kernel = cv2.getStructuringElement(
        cv2.MORPH_RECT, (1, kernel_length))  # kernel for finding all verticle lines
    # kernel for finding all horizontal lines
    hori_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (kernel_length, 1))
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (3, 3))  # 3x3 kernel

    # Collecting Verticle Lines
    verticleLines = cv2.erode(invert, verticle_kernel, iterations=3)
    verticleLines = cv2.dilate(verticleLines, verticle_kernel, iterations=3)
    verticleLines = cv2.threshold(
        verticleLines, 128, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)[1]

    if debugging:
        cv2.imwrite(
            "debugOutput/scrapper/{ind}2verticleLines.jpg".format(ind=debugIndex), verticleLines)

    # Collecting Horizontal Lines
    horizontalLines = cv2.erode(invert, hori_kernel, iterations=3)
    horizontalLines = cv2.dilate(horizontalLines, hori_kernel, iterations=3)
    horizontalLines = cv2.threshold(
        horizontalLines, 128, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)[1]

    if debugging:
        cv2.imwrite(
            "debugOutput/scrapper/{ind}3horizontalLines.jpg".format(ind=debugIndex), horizontalLines)

    # Weighting parameters, this will decide the quantity of an image to be added to make a new image.
    alpha = 0.5
    beta = 1.0 - alpha

    # combining verticle and horizontal lines. This gives us an empty table so that letters dont become boxes
    blankTable = cv2.addWeighted(
        verticleLines, alpha, horizontalLines, beta, 0.0)
    blankTable = cv2.erode(~blankTable, kernel, iterations=2)
    blankTable = cv2.threshold(
        blankTable, 128, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)[1]  # sharpening new table

    if debugging:
        cv2.imwrite(
            "debugOutput/scrapper/{ind}4blankTable.jpg".format(ind=debugIndex), blankTable)

    # Detecting all contours, which gives me all box positions.
    # findContours returns (contours, hierarchy) on OpenCV 4 and
    # (image, contours, hierarchy) on OpenCV 3, so the contours are always
    # the second element from the end.
    contours = cv2.findContours(
        blankTable, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)[-2]
    if len(contours) == 0:
        return ()

    # Sort all the contours in ascending order of the top of their bounding box
    return tuple(sorted(contours, key=lambda c: cv2.boundingRect(c)[1]))
This gives us an empty table so that letters dont become boxes + blankTable = cv2.addWeighted( + verticleLines, alpha, horizontalLines, beta, 0.0) + blankTable = cv2.erode(~blankTable, kernel, iterations=2) + blankTable = cv2.threshold(blankTable, 128, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)[ + 1] # sharpening new table + + if (logging.getLogger().level <= logging.DEBUG): + cv2.imwrite( + "debugOutput/scrapper/{ind}4blankTable.jpg".format(ind=debugIndex), blankTable) + # Detecting all contours, which gives me all box positions + contours = cv2.findContours( + blankTable, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)[0] + + # Organizing contours + # we got our boxes, but its mostly to sort the contours + bboxes = [cv2.boundingRect(c) for c in contours] + # Sort all the contours in ascending order + contours, bboxes = zip( + *sorted(zip(contours, bboxes), key=lambda b: b[1][1], reverse=False)) + return contours + +# Generator +# PHASE 1: manipulate image to clearly show tabs + + +def imageScraper(file, outputArray=None): + """This function if phase 1 of the process. It starts by taking the image/pdf + of the signin sheet and breaks the table apart to isolate each value in the exact + order that they came in.\n + @param file: the image/pdf that needs to be scraped into its values.\n + @param outputArray: a parameter passed by reference due to the nature + of tkinters buttons. If the param is not filled, it will just return the result.\n + @return a multidimension array of images that containes the values of all the slots in the table. 
+ """ + images = [] + sheets = [] # an array with each index containing the output per page + debugIndex = 0 + if not (file.split(".")[1] in ["jpg", "jpeg", "png", "pdf"]): + return + elif not (exists(file)): + raise FileNotFoundError("File given does not exist.") + if file.split(".")[1] == "pdf": + for image in convert_from_path(file): + image = nm.array(image) + image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR) + images.append(image) + else: + # , cv2.IMREAD_GRAYSCALE) + images.append(cv2.imread(file, cv2.COLOR_RGB2BGR)) + + for image in images: + contours = collectContours(image) + # // This is to tell which boxes correlate to the date + # Phase 1: Finding Main Boxes ## // and which big box is the signin table + ################################# + mainBoxes = [] + for c in contours: + x, y, w, h = cv2.boundingRect(c) + if((h, w, 3) == image.shape): + continue + for m in mainBoxes: + if (x > m[0] and w < m[2]) or (y > m[1] and h < m[3]): + break + elif(x <= m[0] and w >= m[2] and y <= m[1] and h >= m[3]): + mainBoxes.remove(m) + mainBoxes.append([x, y, w, h]) + else: + mainBoxes.append([x, y, w, h]) + + table = mainBoxes[0] # img that contains whole table + + for x, y, w, h in mainBoxes: + if((w - x > table[2] - table[0]) or (h - y > table[3] - table[1])): + table = [x, y, w, h] + mainBoxes.remove(table) + + # making images for date and day + sheets.append([[], []]) + for x, y, w, h in mainBoxes: + sheets[-1][0].append(image[y:y+h, x:x+w]) + + # Checking if dates are text and not random images + for i in range(len(sheets[-1][0]) - 1, -1, -1): + date = sheets[-1][0][i] + tempDate = cv2.cvtColor(date, cv2.COLOR_BGR2GRAY) + tempDate = cv2.threshold( + tempDate, 230, 255, cv2.THRESH_BINARY_INV)[1] + blackPixel = cv2.countNonZero(tempDate) + totalPixel = tempDate.shape[0] * tempDate.shape[1] + # if the space filled is not between 1%-20%, then its a dud + if(blackPixel/totalPixel <= 0.01 or blackPixel/totalPixel >= 0.20): + sheets[-1][0].pop(i) + + 
######################################### + # Phase 2: Collecting pairs for mapping # + ######################################### + + # Collecting contours collected from table + table = image[table[1]-5:table[1]+table[3] + + 5, table[0]-5:table[0]+table[2]+5] + + if (logging.getLogger().level <= logging.DEBUG): + cv2.imwrite( + "debugOutput/scrapper/mainTable{image}.jpg".format(image=debugIndex), table) + debugIndex += 1 + + # Grabbing verticle and horizontal images of table for better scraping + tableCompute = cv2.cvtColor(table, cv2.COLOR_BGR2GRAY) + tableCompute = cv2.threshold( + tableCompute, 128, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)[1] + tableInvert = 255 - tableCompute + tKernelLength = nm.array(tableCompute).shape[1]//80 + tKernel = cv2.getStructuringElement(cv2.MORPH_RECT, (3, 3)) + + ############################# + # Collecting Verticle Pairs # + ############################# + verticlePoints = [] + verticlePairs = [] + # Creating verticle kernel lines + tKernelVerticle = cv2.getStructuringElement( + cv2.MORPH_RECT, (1, tKernelLength)) + tVerticleLines = cv2.erode(tableInvert, tKernelVerticle, iterations=3) + tVerticleLines = cv2.dilate( + tVerticleLines, tKernelVerticle, iterations=3) + tVerticleLines = cv2.threshold( + tVerticleLines, 128, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)[1] + if (logging.getLogger().level <= logging.DEBUG): + cv2.imwrite( + "debugOutput/scrapper/table{}VertLines.jpg".format(debugIndex), tVerticleLines) + # Collecting verticle contours + contours = cv2.findContours( + tVerticleLines, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)[0] + # Figuring out the length that relates to the majority of the table, (aka, longer lengths relates to length of table rather than random lines) + maxLength = 0 + tableHeightPair = () # empty tuple for checking later + for c in contours: + x, y, w, h = cv2.boundingRect(c) + if(h >= table.shape[0] * 0.9): # (y, h) == tableHeightPair): + verticlePoints.append(x) + verticlePoints.append(x + w) + 
verticlePoints.sort() + + # this is the gap before the table from the left side + verticlePoints.pop(0) + # this is the gap after the table from the right side + verticlePoints.pop(-1) + + # taking points and making pairs + for i in range(0, len(verticlePoints), 2): + verticlePairs.append((verticlePoints[i], verticlePoints[i + 1])) + logging.debug("VerticlePairs: %s", verticlePairs) + + if (logging.getLogger().level <= logging.DEBUG): + debugimg = cv2.cvtColor(tVerticleLines, cv2.COLOR_GRAY2BGR) + for v in verticlePairs: + cv2.line(debugimg, (v[0], 0), + (v[0], debugimg.shape[0]), (0, 0, 255)) + cv2.line(debugimg, (v[1], 0), + (v[1], debugimg.shape[0]), (0, 0, 255)) + cv2.imwrite( + "debugOutput/scrapper/table{}VertContours.jpg".format(debugIndex), debugimg) + + ############################### + # Collecting Horizontal Pairs # + ############################### + horizontalPairs = [] + horizontalPoints = [] + # Creating horizontal kernel lines + tKernelHorizontal = cv2.getStructuringElement( + cv2.MORPH_RECT, (tKernelLength, 1)) + tHorizontalLines = cv2.erode( + tableInvert, tKernelHorizontal, iterations=3) + tHorizontalLines = cv2.dilate( + tHorizontalLines, tKernelHorizontal, iterations=3) + tHorizontalLines = cv2.threshold( + tHorizontalLines, 128, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)[1] + if (logging.getLogger().level <= logging.DEBUG): + cv2.imwrite( + "debugOutput/scrapper/table{}HorLines.jpg".format(debugIndex), tHorizontalLines) + # Collecting Horizontal contours + contours = cv2.findContours( + tHorizontalLines, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)[0] + + # Figuring out the length that relates to the majority of the table, (aka, longer lengths relates to length of table rather than random lines) + maxLength = 0 + tableWidthPair = () # empty tuple for checking later + for c in contours: + x, y, w, h = cv2.boundingRect(c) + # (x, w) == tableWidthPair or w >= tHorizontalLines.shape[1] * 0.9): + if(w >= tHorizontalLines.shape[1] * 0.9): + 
def sanityName(results: list):
    """ Resolve duplicate names on every page of the database.

    For each confirmed, non-empty name the rest of the page is searched for
    other confirmed entries with the same name. The entry with the highest
    probability stays confirmed (the earliest one wins a tie); every other
    duplicate has its confirmed flag set to False so it can be scanned again
    or corrected by the user.\n
    Each row's first cell is read as (name, probability, confirmed).\n
    @param results: The entire database (a list of pages, each a list of
    rows) to be checked for duplicate names. It is modified in place.\n
    @return the same database object that was passed in.
    """
    for page in results:
        for i, row in enumerate(page):
            nameCell = row[0]
            # Only confirmed, non-empty names can start a duplicate group.
            if not nameCell[2] or nameCell[0] == "":
                continue

            bestChoice = i          # index of the most probable duplicate
            bestProb = nameCell[1]  # probability of that best choice
            dupeNames = [i]

            # Looking for the same name further down the page
            for j in range(i + 1, len(page)):
                candidate = page[j][0]
                if not candidate[2] or candidate[0] != nameCell[0]:
                    continue  # already rejected, or a different name
                # Strict comparison keeps the earliest entry on a tie.
                if candidate[1] > bestProb:
                    bestProb = candidate[1]
                    bestChoice = j
                dupeNames.append(j)

            # Every duplicate except the best candidate is marked as failed.
            for d in dupeNames:
                if d == bestChoice:
                    continue
                loser = page[d][0]
                page[d][0] = (loser[0], loser[1], False)

    # The original published the database under the module-level name
    # "results"; that side effect is preserved. The bare try/except that
    # wrapped it was dropped because a dictionary assignment cannot fail.
    globals()["results"] = results
    return results


def sanityTime(personRow: list):
    """ Placeholder: not implemented yet, currently does nothing and
    returns None.\n
    @param personRow: unused for now.
    """
    pass


def checkSanity(results: list):
    """ Placeholder: not implemented yet, currently does nothing and
    returns None.\n
    @param results: unused for now.
    """
    pass


def checkBlankRow(personRow: list):
    """ This function takes a name row and sees if there is enough information
    to determine that the row has no information in it.\n
    Only the first four cells are inspected. The row counts as blank when
    every one of them is confirmed (cell[2] is truthy) and holds an empty
    string (cell[0] == '').\n
    @param personRow the row to be determined whether or not it is blank.\n
    @return True if the row is blank and can be removed, False otherwise.
    """
    # all() replaces the original multiply-by-flags trick and yields a real
    # bool, as documented, instead of an int product.
    return all(bool(cell[2]) and cell[0] == '' for cell in personRow[:4])
+ top is the toplevel containing window.''' + _bgcolor = '#d9d9d9' # X11 color: 'gray85' + _fgcolor = '#000000' # X11 color: 'black' + _compcolor = '#d9d9d9' # X11 color: 'gray85' + _ana1color = '#d9d9d9' # X11 color: 'gray85' + _ana2color = '#ececec' # Closest X11 color: 'gray92' + self.style = ttk.Style() + if sys.platform == "win32": + self.style.theme_use('winnative') + self.style.configure('.',background=_bgcolor) + self.style.configure('.',foreground=_fgcolor) + self.style.map('.',background= + [('selected', _compcolor), ('active',_ana2color)]) + + top.geometry("600x450+401+150") + top.minsize(120, 1) + top.maxsize(1370, 749) + top.resizable(1, 1) + top.title("New Toplevel") + top.configure(background="#d9d9d9") + top.configure(highlightbackground="#d9d9d9") + top.configure(highlightcolor="black") + + self.inputSheet = tk.Button(top) + self.inputSheet.place(relx=0.033, rely=0.044, height=34, width=157) + self.inputSheet.configure(activebackground="#ececec") + self.inputSheet.configure(activeforeground="#000000") + self.inputSheet.configure(background="#d9d9d9") + self.inputSheet.configure(disabledforeground="#a3a3a3") + self.inputSheet.configure(foreground="#000000") + self.inputSheet.configure(highlightbackground="#d9d9d9") + self.inputSheet.configure(highlightcolor="black") + self.inputSheet.configure(pady="0") + self.inputSheet.configure(text='''Select Signin sheet''') + + self.outputCSV = tk.Button(top) + self.outputCSV.place(relx=0.033, rely=0.156, height=34, width=157) + self.outputCSV.configure(activebackground="#ececec") + self.outputCSV.configure(activeforeground="#000000") + self.outputCSV.configure(background="#d9d9d9") + self.outputCSV.configure(disabledforeground="#a3a3a3") + self.outputCSV.configure(foreground="#000000") + self.outputCSV.configure(highlightbackground="#d9d9d9") + self.outputCSV.configure(highlightcolor="black") + self.outputCSV.configure(pady="0") + self.outputCSV.configure(text='''Select output csv''') + + self.imagePortrait = 
tk.Label(top) + self.imagePortrait.place(relx=0.417, rely=0.022, height=221, width=314) + self.imagePortrait.configure(activebackground="#f9f9f9") + self.imagePortrait.configure(activeforeground="black") + self.imagePortrait.configure(background="#e6e6e6") + self.imagePortrait.configure(disabledforeground="#a3a3a3") + self.imagePortrait.configure(foreground="#000000") + self.imagePortrait.configure(highlightbackground="#d9d9d9") + self.imagePortrait.configure(highlightcolor="black") + self.imagePortrait.configure(text='''Label''') + + self.errorLabel = tk.Label(top) + self.errorLabel.place(relx=0.017, rely=0.356, height=71, width=224) + self.errorLabel.configure(activebackground="#f9f9f9") + self.errorLabel.configure(activeforeground="black") + self.errorLabel.configure(background="#e1e1e1") + self.errorLabel.configure(disabledforeground="#a3a3a3") + self.errorLabel.configure(foreground="#ff0000") + self.errorLabel.configure(highlightbackground="#d9d9d9") + self.errorLabel.configure(highlightcolor="black") + self.errorLabel.configure(text='''Uh oh. It looks like we couldnt condifently decide who or what this is. 
We need you to either confirm our guess or type in the correct value''') + + self.correctionEntry = tk.Entry(top) + self.correctionEntry.place(relx=0.133, rely=0.689, height=30 + , relwidth=0.557) + self.correctionEntry.configure(background="white") + self.correctionEntry.configure(disabledforeground="#a3a3a3") + self.correctionEntry.configure(font="TkFixedFont") + self.correctionEntry.configure(foreground="#000000") + self.correctionEntry.configure(highlightbackground="#d9d9d9") + self.correctionEntry.configure(highlightcolor="black") + self.correctionEntry.configure(insertbackground="black") + self.correctionEntry.configure(selectbackground="#c4c4c4") + self.correctionEntry.configure(selectforeground="black") + + self.AIGuess = tk.Button(top) + self.AIGuess.place(relx=0.55, rely=0.556, height=34, width=227) + self.AIGuess.configure(activebackground="#ececec") + self.AIGuess.configure(activeforeground="#000000") + self.AIGuess.configure(background="#d9d9d9") + self.AIGuess.configure(disabledforeground="#a3a3a3") + self.AIGuess.configure(foreground="#000000") + self.AIGuess.configure(highlightbackground="#d9d9d9") + self.AIGuess.configure(highlightcolor="black") + self.AIGuess.configure(pady="0") + self.AIGuess.configure(text='''Button''') + + self.submit = tk.Button(top) + self.submit.place(relx=0.717, rely=0.689, height=34, width=127) + self.submit.configure(activebackground="#ececec") + self.submit.configure(activeforeground="#000000") + self.submit.configure(background="#d9d9d9") + self.submit.configure(disabledforeground="#a3a3a3") + self.submit.configure(foreground="#000000") + self.submit.configure(highlightbackground="#d9d9d9") + self.submit.configure(highlightcolor="black") + self.submit.configure(pady="0") + self.submit.configure(text='''Submit''') + + self.confidenceDescription = tk.Label(top) + self.confidenceDescription.place(relx=0.267, rely=0.556, height=31 + , width=164) + self.confidenceDescription.configure(activebackground="#f9f9f9") + 
self.confidenceDescription.configure(activeforeground="black") + self.confidenceDescription.configure(background="#d9d9d9") + self.confidenceDescription.configure(disabledforeground="#a3a3a3") + self.confidenceDescription.configure(foreground="#000000") + self.confidenceDescription.configure(highlightbackground="#d9d9d9") + self.confidenceDescription.configure(highlightcolor="black") + self.confidenceDescription.configure(justify='right') + self.confidenceDescription.configure(text='''Were not confident, but is it:''') + + self.orLabel = tk.Label(top) + self.orLabel.place(relx=0.017, rely=0.689, height=31, width=64) + self.orLabel.configure(activebackground="#f9f9f9") + self.orLabel.configure(activeforeground="black") + self.orLabel.configure(background="#d9d9d9") + self.orLabel.configure(disabledforeground="#a3a3a3") + self.orLabel.configure(foreground="#000000") + self.orLabel.configure(highlightbackground="#d9d9d9") + self.orLabel.configure(highlightcolor="black") + self.orLabel.configure(text='''Or''') + + self.TranslationProgress = ttk.Progressbar(top) + self.TranslationProgress.place(relx=0.017, rely=0.911, relwidth=0.95 + , relheight=0.0, height=22) + + self.SheetStatus = tk.Label(top) + self.SheetStatus.place(relx=0.033, rely=0.844, height=21, width=554) + self.SheetStatus.configure(activebackground="#f9f9f9") + self.SheetStatus.configure(activeforeground="black") + self.SheetStatus.configure(anchor='w') + self.SheetStatus.configure(background="#d9d9d9") + self.SheetStatus.configure(disabledforeground="#a3a3a3") + self.SheetStatus.configure(foreground="#000000") + self.SheetStatus.configure(highlightbackground="#d9d9d9") + self.SheetStatus.configure(highlightcolor="black") + self.SheetStatus.configure(text='''Sheet: x of y''') + + self.start = tk.Button(top) + self.start.place(relx=0.033, rely=0.256, height=34, width=157) + self.start.configure(activebackground="#ececec") + self.start.configure(activeforeground="#000000") + 
self.start.configure(background="#17a252") + self.start.configure(disabledforeground="#a3a3a3") + self.start.configure(foreground="#ffffff") + self.start.configure(highlightbackground="#d9d9d9") + self.start.configure(highlightcolor="black") + self.start.configure(pady="0") + self.start.configure(text='''Start''') + +if __name__ == '__main__': + vp_start_gui() + + + + + diff --git a/sources/guiOutlines/mainGuiRevamp.tcl b/sources/guiOutlines/mainGuiRevamp.tcl new file mode 100644 index 0000000..078ef99 --- /dev/null +++ b/sources/guiOutlines/mainGuiRevamp.tcl @@ -0,0 +1,201 @@ +############################################################################# +# Generated by PAGE version 4.26 +# in conjunction with Tcl version 8.6 +# Mar 29, 2020 09:50:27 PM EDT platform: Windows NT +set vTcl(timestamp) "" + + +if {!$vTcl(borrow) && !$vTcl(template)} { + +set vTcl(actual_gui_bg) #d9d9d9 +set vTcl(actual_gui_fg) #000000 +set vTcl(actual_gui_analog) #ececec +set vTcl(actual_gui_menu_analog) #ececec +set vTcl(actual_gui_menu_bg) #d9d9d9 +set vTcl(actual_gui_menu_fg) #000000 +set vTcl(complement_color) #d9d9d9 +set vTcl(analog_color_p) #d9d9d9 +set vTcl(analog_color_m) #ececec +set vTcl(active_fg) #000000 +set vTcl(actual_gui_menu_active_bg) #ececec +set vTcl(active_menu_fg) #000000 +} + + + + +proc vTclWindow.top42 {base} { + global vTcl + if {$base == ""} { + set base .top42 + } + if {[winfo exists $base]} { + wm deiconify $base; return + } + set top $base + ################### + # CREATING WIDGETS + ################### + vTcl::widgets::core::toplevel::createCmd $top -class Toplevel \ + -menu {{}} -background $vTcl(actual_gui_bg) \ + -highlightbackground $vTcl(actual_gui_bg) -highlightcolor black + wm focusmodel $top passive + wm geometry $top 600x450+401+150 + update + # set in toplevel.wgt. 
+ global vTcl + global img_list + set vTcl(save,dflt,origin) 0 + wm maxsize $top 1370 749 + wm minsize $top 120 1 + wm overrideredirect $top 0 + wm resizable $top 1 1 + wm deiconify $top + wm title $top "New Toplevel" + vTcl:DefineAlias "$top" "Toplevel1" vTcl:Toplevel:WidgetProc "" 1 + button $top.but43 \ + -activebackground $vTcl(analog_color_m) -activeforeground #000000 \ + -background $vTcl(actual_gui_bg) -disabledforeground #a3a3a3 \ + -font TkDefaultFont -foreground $vTcl(actual_gui_fg) \ + -highlightbackground $vTcl(actual_gui_bg) -highlightcolor black \ + -pady 0 -text {Select Signin sheet} + vTcl:DefineAlias "$top.but43" "inputSheet" vTcl:WidgetProc "Toplevel1" 1 + vTcl:copy_lock $top.but43 + button $top.but44 \ + -activebackground $vTcl(analog_color_m) -activeforeground #000000 \ + -background $vTcl(actual_gui_bg) -disabledforeground #a3a3a3 \ + -font TkDefaultFont -foreground $vTcl(actual_gui_fg) \ + -highlightbackground $vTcl(actual_gui_bg) -highlightcolor black \ + -pady 0 -text {Select output csv} + vTcl:DefineAlias "$top.but44" "outputCSV" vTcl:WidgetProc "Toplevel1" 1 + vTcl:copy_lock $top.but44 + label $top.lab45 \ + -activebackground #f9f9f9 -activeforeground black -background #e6e6e6 \ + -disabledforeground #a3a3a3 -font TkDefaultFont \ + -foreground $vTcl(actual_gui_fg) \ + -highlightbackground $vTcl(actual_gui_bg) -highlightcolor black \ + -text Label + vTcl:DefineAlias "$top.lab45" "imagePortrait" vTcl:WidgetProc "Toplevel1" 1 + vTcl:copy_lock $top.lab45 + label $top.lab46 \ + -activebackground #f9f9f9 -activeforeground black -background #e1e1e1 \ + -disabledforeground #a3a3a3 -font TkDefaultFont -foreground #ff0000 \ + -highlightbackground $vTcl(actual_gui_bg) -highlightcolor black \ + -text {Uh oh. It looks like we couldnt condifently decide who or what this is. 
We need you to either confirm our guess or type in the correct value} + vTcl:DefineAlias "$top.lab46" "errorLabel" vTcl:WidgetProc "Toplevel1" 1 + vTcl:copy_lock $top.lab46 + entry $top.ent47 \ + -background white -disabledforeground #a3a3a3 -font TkFixedFont \ + -foreground $vTcl(actual_gui_fg) \ + -highlightbackground $vTcl(actual_gui_bg) -highlightcolor black \ + -insertbackground black -selectbackground #c4c4c4 \ + -selectforeground black + vTcl:DefineAlias "$top.ent47" "correctionEntry" vTcl:WidgetProc "Toplevel1" 1 + vTcl:copy_lock $top.ent47 + button $top.but45 \ + -activebackground $vTcl(analog_color_m) -activeforeground #000000 \ + -background $vTcl(actual_gui_bg) -disabledforeground #a3a3a3 \ + -font TkDefaultFont -foreground $vTcl(actual_gui_fg) \ + -highlightbackground $vTcl(actual_gui_bg) -highlightcolor black \ + -pady 0 -text Button + vTcl:DefineAlias "$top.but45" "AIGuess" vTcl:WidgetProc "Toplevel1" 1 + vTcl:copy_lock $top.but45 + button $top.but46 \ + -activebackground $vTcl(analog_color_m) -activeforeground #000000 \ + -background $vTcl(actual_gui_bg) -disabledforeground #a3a3a3 \ + -font TkDefaultFont -foreground $vTcl(actual_gui_fg) \ + -highlightbackground $vTcl(actual_gui_bg) -highlightcolor black \ + -pady 0 -text Submit + vTcl:DefineAlias "$top.but46" "submit" vTcl:WidgetProc "Toplevel1" 1 + vTcl:copy_lock $top.but46 + label $top.lab47 \ + -activebackground #f9f9f9 -activeforeground black \ + -background $vTcl(actual_gui_bg) -disabledforeground #a3a3a3 \ + -font TkDefaultFont -foreground $vTcl(actual_gui_fg) \ + -highlightbackground $vTcl(actual_gui_bg) -highlightcolor black \ + -justify right -text {Were not confident, but is it:} + vTcl:DefineAlias "$top.lab47" "confidenceDescription" vTcl:WidgetProc "Toplevel1" 1 + vTcl:copy_lock $top.lab47 + label $top.lab48 \ + -activebackground #f9f9f9 -activeforeground black \ + -background $vTcl(actual_gui_bg) -disabledforeground #a3a3a3 \ + -font TkDefaultFont -foreground $vTcl(actual_gui_fg) \ + 
-highlightbackground $vTcl(actual_gui_bg) -highlightcolor black \ + -text Or + vTcl:DefineAlias "$top.lab48" "orLabel" vTcl:WidgetProc "Toplevel1" 1 + vTcl:copy_lock $top.lab48 + ttk::progressbar $top.tPr49 + vTcl:DefineAlias "$top.tPr49" "TranslationProgress" vTcl:WidgetProc "Toplevel1" 1 + vTcl:copy_lock $top.tPr49 + label $top.lab50 \ + -activebackground #f9f9f9 -activeforeground black -anchor w \ + -background $vTcl(actual_gui_bg) -disabledforeground #a3a3a3 \ + -font TkDefaultFont -foreground $vTcl(actual_gui_fg) \ + -highlightbackground $vTcl(actual_gui_bg) -highlightcolor black \ + -text {Sheet: x of y} + vTcl:DefineAlias "$top.lab50" "SheetStatus" vTcl:WidgetProc "Toplevel1" 1 + vTcl:copy_lock $top.lab50 + button $top.but47 \ + -activebackground $vTcl(analog_color_m) -activeforeground #000000 \ + -background #17a252 -disabledforeground #a3a3a3 -font TkDefaultFont \ + -foreground #ffffff -highlightbackground $vTcl(actual_gui_bg) \ + -highlightcolor black -pady 0 -text Start + vTcl:DefineAlias "$top.but47" "start" vTcl:WidgetProc "Toplevel1" 1 + vTcl:copy_lock $top.but47 + ################### + # SETTING GEOMETRY + ################### + place $top.but43 \ + -in $top -x 20 -y 20 -width 157 -relwidth 0 -height 34 -relheight 0 \ + -anchor nw -bordermode ignore + place $top.but44 \ + -in $top -x 20 -y 70 -width 157 -relwidth 0 -height 34 -relheight 0 \ + -anchor nw -bordermode ignore + place $top.lab45 \ + -in $top -x 250 -y 10 -width 314 -relwidth 0 -height 221 -relheight 0 \ + -anchor nw -bordermode ignore + place $top.lab46 \ + -in $top -x 10 -y 160 -width 224 -relwidth 0 -height 71 -relheight 0 \ + -anchor nw -bordermode ignore + place $top.ent47 \ + -in $top -x 80 -y 310 -width 334 -relwidth 0 -height 30 -relheight 0 \ + -anchor nw -bordermode ignore + place $top.but45 \ + -in $top -x 330 -y 250 -width 227 -relwidth 0 -height 34 -relheight 0 \ + -anchor nw -bordermode ignore + place $top.but46 \ + -in $top -x 430 -y 310 -width 127 -relwidth 0 -height 34 
-relheight 0 \ + -anchor nw -bordermode ignore + place $top.lab47 \ + -in $top -x 160 -y 250 -width 164 -relwidth 0 -height 31 -relheight 0 \ + -anchor nw -bordermode ignore + place $top.lab48 \ + -in $top -x 10 -y 310 -width 64 -relwidth 0 -height 31 -relheight 0 \ + -anchor nw -bordermode ignore + place $top.tPr49 \ + -in $top -x 10 -y 410 -width 570 -relwidth 0 -height 22 -relheight 0 \ + -anchor nw -bordermode ignore + place $top.lab50 \ + -in $top -x 20 -y 380 -width 554 -relwidth 0 -height 21 -relheight 0 \ + -anchor nw -bordermode ignore + place $top.but47 \ + -in $top -x 20 -y 115 -width 157 -relwidth 0 -height 34 -relheight 0 \ + -anchor nw -bordermode ignore + + vTcl:FireEvent $base <> +} + +set btop "" +if {$vTcl(borrow)} { + set btop .bor[expr int([expr rand() * 100])] + while {[lsearch $btop $vTcl(tops)] != -1} { + set btop .bor[expr int([expr rand() * 100])] + } +} +set vTcl(btop) $btop +Window show . +Window show .top42 $btop +if {$vTcl(borrow)} { + $btop configure -background plum +} +