Skip to content

Commit

Permalink
validate currency, land and city
Browse files Browse the repository at this point in the history
  • Loading branch information
mrzaizai2k committed Sep 29, 2024
1 parent 40cfe23 commit 87f425a
Show file tree
Hide file tree
Showing 5 changed files with 110 additions and 41 deletions.
32 changes: 29 additions & 3 deletions src/Utils/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@
from collections import Counter
from io import BytesIO
import openpyxl
from fuzzywuzzy import fuzz


from dotenv import load_dotenv
load_dotenv()
Expand Down Expand Up @@ -439,7 +441,29 @@ def get_land_and_city_list(file_path:str = "config/travel_expenses-2024.xlsx",
# Convert sets back to lists before returning
return list(lands), list(cities)

def find_best_match_fuzzy(string_list: list[str], text: str):
    """
    Find the entry of ``string_list`` that is most similar to ``text``.

    Both ``text`` and every candidate are lower-cased before being compared
    with ``fuzz.ratio``. When several candidates share the highest score, the
    one that appears first in ``string_list`` wins.

    :param string_list: Candidate strings to search through.
    :param text: String to look up (possibly misspelled, e.g. OCR output).
    :return: Tuple ``(best_idx, best_match, best_score)`` where ``best_idx``
        is the position of the winner in ``string_list``, ``best_match`` is
        the candidate exactly as stored in the list, and ``best_score`` is
        its similarity score. ``(None, None, 0)`` is returned when
        ``string_list`` is empty or no candidate scores above 0.
    """
    needle = text.lower()

    best_idx, best_score = None, 0
    for idx, item in enumerate(string_list):
        score = fuzz.ratio(needle, item.lower())
        # Strict comparison keeps the earliest candidate on ties.
        if score > best_score:
            best_idx, best_score = idx, score

    # Nothing matched (empty list, or every score was 0): report "no match"
    # instead of indexing the list with None, which would raise TypeError.
    if best_idx is None:
        return None, None, 0

    return best_idx, string_list[best_idx], best_score

if __name__ == "__main__":
config_path = "config/config.yaml"
config = read_config(config_path)
Expand All @@ -449,9 +473,11 @@ def get_land_and_city_list(file_path:str = "config/travel_expenses-2024.xlsx",
# Define the Berlin time zone
# Get the list of all currencies
currencies = get_currencies_from_txt(file_path=config['currencies_path'])
print(currencies)
# print(currencies)
lands, cities = get_land_and_city_list(file_path=config['country_and_city']['file_path'],
sheet_name=config['country_and_city']['sheet_name'])
print("countries",lands)
print("cities",cities)
# print("countries",lands)
# print("cities",cities)

print(find_best_match_fuzzy(string_list=cities, text = "Tokioo"))

28 changes: 8 additions & 20 deletions src/excel_export.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import openpyxl
from fuzzywuzzy import fuzz

from src.Utils.utils import read_config
from src.Utils.utils import read_config, find_best_match_fuzzy

class ExcelProcessor:
def __init__(self, config_path:str = None, config:dict = None):
Expand Down Expand Up @@ -139,26 +139,14 @@ def find_best_match(self, ocr_output):
:param ocr_output: OCR string (possibly incorrect)
:return: Position of best matching name in the original list, best matching name, and highest similarity score.
"""
# Preprocess the OCR output
ocr_output = ocr_output.lower()

# Extract the closest match using fuzzy matching (search over both last-first and first-last formats)
best_match, best_score = None, 0
best_idx = None

for name, idx in self.canonical_names:
score = fuzz.ratio(ocr_output, name)
if score > best_score:
best_match = name
best_score = score
best_idx = idx

name_list = [name for name, _ in self.canonical_names]
best_idx, _, best_score = find_best_match_fuzzy(string_list=name_list,
text=ocr_output)
# Return the index of the original name in the list, the best match, and the score
if best_match is not None:
original_name = self.names[best_idx]
return best_idx, original_name, best_score
else:
return None, None, 0
(_, idx) = self.canonical_names[best_idx]
original_name = self.names[idx]
return best_idx, original_name, best_score


def get_full_name(name_tuple):
"""
Expand Down
4 changes: 2 additions & 2 deletions src/invoice_extraction.py
Original file line number Diff line number Diff line change
Expand Up @@ -145,7 +145,7 @@ def validate_invoice(invoice_info:dict, invoice_type:str, config:dict) ->dict:
full_invoice = Invoice2(invoice_info=valid_invoice['invoice_info'])

elif invoice_type == "invoice 3":
valid_invoice = validate_invoice_3(invoice_data=invoice_info)
valid_invoice = validate_invoice_3(invoice_data=invoice_info, config=config)
full_invoice = Invoice3(invoice_info=valid_invoice['invoice_info'])

full_invoice_dict = full_invoice.model_dump(exclude_unset=False)
Expand All @@ -159,7 +159,7 @@ def validate_invoice(invoice_info:dict, invoice_type:str, config:dict) ->dict:
ocr_reader = OcrReader(config_path=config_path, translator=GoogleTranslator())
invoice_extractor = OpenAIExtractor(config_path=config_path)
# img_path = "fr_1.png"
img_path = "test/images/007_2.png"
img_path = "test/images/009_1.png"
base64_img = convert_img_path_to_base64(img_path)
result = extract_invoice_info(base64_img=base64_img, ocr_reader=ocr_reader,
invoice_extractor=invoice_extractor, config=config)
Expand Down
8 changes: 4 additions & 4 deletions src/test_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -138,20 +138,20 @@ def test_get_frontend_defines(root_url):
config = read_config(path=config_path)

root_url = f"http://{config['IES_host']}:{config['IES_port']}"
# root_url = f"http://46.137.228.37" # aws
# root_url = f"http://46.137.228.37/api" # aws

img_path = "test/images/007_2.png"
img_path = "test/images/009_1.png"
# user_uuid = "gauss"
user_uuid = "2111_1111_1111_1111"
invoice_uuid = "66f3d0eb898e7aaf3dd6e00b"
invoice_info = {"amount": "1111",}

# test_upload_invoice(img_path=img_path, user_uuid=user_uuid)
test_upload_invoice(img_path=img_path, user_uuid=user_uuid)
# test_get_invoices(user_uuid=user_uuid, invoice_type=None, created_at='desc', invoice_uuid=invoice_uuid)
# test_get_invoices(user_uuid=user_uuid, invoice_type=None, created_at='desc', status="not extracted")
# test_modify_invoice(invoice_uuid=invoice_uuid, user_uuid=user_uuid, new_invoice_info=invoice_info)
# test_delete_invoice(invoice_uuid=invoice_uuid, user_uuid=user_uuid)
test_get_frontend_defines(root_url=root_url)
# test_get_frontend_defines(root_url=root_url)



Expand Down
79 changes: 67 additions & 12 deletions src/validate_invoice.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,8 @@
from typing import List, Optional, Any, Union
from datetime import date, datetime
from src.excel_export import FuzzyNameMatcher, ExcelProcessor, get_full_name
from src.Utils.utils import read_config
from src.Utils.utils import (read_config, get_currencies_from_txt,
get_land_and_city_list, find_best_match_fuzzy)


def preprocess_name(name: str) -> str:
Expand Down Expand Up @@ -67,14 +68,45 @@ def normalize_time(value):
except Exception:
return "" # Return an empty string if splitting fails


def normalize_currency(value):
    """Return ``value`` as an upper-case three-letter code, or ``None``.

    Only strings whose whitespace-stripped length is exactly 3 are accepted;
    any other input (wrong length, non-string) yields ``None``.
    """
    if not isinstance(value, str):
        return None

    code = value.strip()
    return code.upper() if len(code) == 3 else None
def validate_currency(currency_text: str, config: dict) -> str:
    """Map a raw currency string onto the closest known currency.

    :param currency_text: Currency text to validate (possibly misspelled).
    :param config: Configuration dict; ``config['currencies_path']`` is passed
        to ``get_currencies_from_txt`` to load the candidate currencies.
    :return: ``""`` for falsy input; otherwise the candidate picked by
        ``find_best_match_fuzzy``. No minimum score is enforced here, so the
        best-scoring candidate is returned even for a weak match.
    """
    if not currency_text:
        return ""

    known_currencies = get_currencies_from_txt(file_path=config['currencies_path'])
    _, matched_currency, _ = find_best_match_fuzzy(string_list=known_currencies,
                                                   text=currency_text)
    return matched_currency

def validate_land(land_text: str, config: dict) -> str:
    """Map a raw land (country) string onto the closest known land.

    :param land_text: Land text to validate (possibly misspelled).
    :param config: Configuration dict; ``config['country_and_city']`` supplies
        the ``file_path`` and ``sheet_name`` passed to
        ``get_land_and_city_list``.
    :return: ``""`` for falsy input; otherwise the land picked by
        ``find_best_match_fuzzy``. No minimum score is enforced here.
    """
    if not land_text:
        return ""

    source = config['country_and_city']
    known_lands, _ = get_land_and_city_list(file_path=source['file_path'],
                                            sheet_name=source['sheet_name'])

    _, matched_land, _ = find_best_match_fuzzy(string_list=known_lands,
                                               text=land_text)
    return matched_land


def validate_city(city_text: str, config: dict) -> str:
    """Map a raw city string onto the closest known city, or ``"Other"``.

    :param city_text: City text to validate (possibly misspelled).
    :param config: Configuration dict; ``config['country_and_city']`` supplies
        the ``file_path`` and ``sheet_name`` passed to
        ``get_land_and_city_list``.
    :return: ``"Other"`` for falsy input or when the best score is 50 or
        lower; otherwise the city picked by ``find_best_match_fuzzy``.
    """
    if not city_text:
        return "Other"

    source = config['country_and_city']
    _, known_cities = get_land_and_city_list(file_path=source['file_path'],
                                             sheet_name=source['sheet_name'])

    _, matched_city, score = find_best_match_fuzzy(string_list=known_cities,
                                                   text=city_text)
    # Weak matches are treated as unknown cities.
    return "Other" if score <= 50 else matched_city


def validate_invoice_3(invoice_data: dict) -> dict:

def validate_invoice_3(invoice_data: dict, config:dict) -> dict:

# Normalize payment card number by removing all non-digit characters
def normalize_payment_card_number(value):
Expand Down Expand Up @@ -116,7 +148,7 @@ def validate_and_normalize(data):
data[key] = normalize_phone_number(data[key])

if 'currency' in key:
data[key] = normalize_currency(data[key])
data[key] = validate_currency(data[key], config=config)

# Recursively normalize nested dictionaries or lists
if isinstance(value, dict) or isinstance(value, list):
Expand Down Expand Up @@ -158,6 +190,12 @@ def validate_and_normalize(data: Any, reference_year=None):
if key == 'name':
data[key] = map_name(value, config)

if key == 'land':
data[key] = validate_land(value, config)

if key == 'city':
data[key] = validate_city(value, config)

# Recursively normalize nested dictionaries or lists
if isinstance(value, dict) or isinstance(value, list):
data[key] = validate_and_normalize(value, reference_year)
Expand Down Expand Up @@ -225,9 +263,13 @@ def validate_and_normalize(data: Any, reference_year=None):
if 'payment_method' in line:
line['payment_method'] = normalize_payment_method(line['payment_method'])

if key == 'currency':
data[key] = validate_currency(value, config=config)

# Recursively normalize nested dictionaries or lists
if isinstance(value, dict) or isinstance(value, list):
data[key] = validate_and_normalize(value, reference_year)


elif isinstance(data, list):
data = [validate_and_normalize(item, reference_year) for item in data]
Expand Down Expand Up @@ -549,8 +591,8 @@ class Invoice3(BaseModel):
'name': 'Tümmler Dirk',
'project_number': 'V240045',
'customer': 'Magua',
'city': 'Salzgitter',
'land': 'DE',
'city': 'Othe',
'land': 'Vietna',
'lines': [
{
'date': '07/08/2024',
Expand Down Expand Up @@ -581,7 +623,7 @@ class Invoice3(BaseModel):
'name': 'Schmidt, Timo',
'project_number': 'V123023',
'is_in_egw': True,
'currency': 'EUR',
'currency': 'EURk',
'lines': [
{'title': 'Hotel', 'amount': 504.0},
{'title': 'Fuel', 'amount': 24.6},
Expand All @@ -592,6 +634,19 @@ class Invoice3(BaseModel):
'has_employee_signature': True
}
}
data3 = {
'invoice_info': {

'currency': 'EURk',

}
}

data = validate_invoice_1(data1, config=config)
print("\ndata1", data)

data = validate_invoice_2(data2, config=config)
print("\ndata2", data)

data = validate_invoice_3(data3, config=config)
print("\ndata3", data)

0 comments on commit 87f425a

Please sign in to comment.