validate currency, land and city

mrzaizai2k · Sep 29, 2024 · 87f425a · 87f425a
1 parent 40cfe23
commit 87f425a
Show file tree

Hide file tree

Showing 5 changed files with 110 additions and 41 deletions.
diff --git a/src/Utils/utils.py b/src/Utils/utils.py
@@ -20,6 +20,8 @@
 from collections import Counter
 from io import BytesIO
 import openpyxl
+from fuzzywuzzy import fuzz
+
 
 from dotenv import load_dotenv
 load_dotenv()
@@ -439,7 +441,29 @@ def get_land_and_city_list(file_path:str = "config/travel_expenses-2024.xlsx",
     # Convert sets back to lists before returning
     return list(lands), list(cities) 
 
+def find_best_match_fuzzy(string_list: list[str], text:str):
+    """
+    Find the entry of string_list that is closest to text using fuzzy matching.
+
+    Both sides are lower-cased before they are scored with fuzz.ratio, so the
+    comparison is case-insensitive.
+
+    :param string_list: List of candidate strings to search
+    :param text: string text (possibly incorrect)
+    :return: Position of the best match in string_list, the best matching string
+             (original casing), and the highest similarity score. Returns
+             (None, None, 0) when string_list is empty or no candidate scores above 0.
+    """
+    # Lower-case the query once so the comparison is case-insensitive
+    text = text.lower()
+
+    best_idx, best_score = None, 0
 
+    for idx, item in enumerate(string_list):
+        score = fuzz.ratio(text, item.lower())
+        # Strict '>' keeps the earliest candidate when scores tie
+        if score > best_score:
+            best_score = score
+            best_idx = idx
+
+    # best_idx stays None when nothing scored above 0; indexing the list with
+    # None would raise TypeError, so report "no match" instead
+    if best_idx is None:
+        return None, None, 0
+
+    return best_idx, string_list[best_idx], best_score
+
 if __name__ == "__main__":
     config_path = "config/config.yaml"
     config = read_config(config_path)
@@ -449,9 +473,11 @@ def get_land_and_city_list(file_path:str = "config/travel_expenses-2024.xlsx",
     # Define the Berlin time zone
     # Get the list of all currencies
     currencies = get_currencies_from_txt(file_path=config['currencies_path'])
-    print(currencies)
+    # print(currencies)
     lands, cities = get_land_and_city_list(file_path=config['country_and_city']['file_path'],
                                                   sheet_name=config['country_and_city']['sheet_name'])
-    print("countries",lands)
-    print("cities",cities)
+    # print("countries",lands)
+    # print("cities",cities)
+
+    print(find_best_match_fuzzy(string_list=cities, text = "Tokioo"))
 
diff --git a/src/excel_export.py b/src/excel_export.py
@@ -4,7 +4,7 @@
 import openpyxl
 from fuzzywuzzy import fuzz
 
-from src.Utils.utils import read_config
+from src.Utils.utils import read_config, find_best_match_fuzzy
 
 class ExcelProcessor:
     def __init__(self, config_path:str = None, config:dict = None):
@@ -139,26 +139,14 @@ def find_best_match(self, ocr_output):
         :param ocr_output: OCR string (possibly incorrect)
         :return: Position of best matching name in the original list, best matching name, and highest similarity score.
         """
-        # Preprocess the OCR output
-        ocr_output = ocr_output.lower()
-
-        # Extract the closest match using fuzzy matching (search over both last-first and first-last formats)
-        best_match, best_score = None, 0
-        best_idx = None
-
-        for name, idx in self.canonical_names:
-            score = fuzz.ratio(ocr_output, name)
-            if score > best_score:
-                best_match = name
-                best_score = score
-                best_idx = idx
-
+        name_list = [name for name, _ in self.canonical_names]
+        best_idx, _, best_score = find_best_match_fuzzy(string_list=name_list, 
+                                                           text=ocr_output)
         # Return the index of the original name in the list, the best match, and the score
-        if best_match is not None:
-            original_name = self.names[best_idx]
-            return best_idx, original_name, best_score
-        else:
-            return None, None, 0
+        (_, idx) = self.canonical_names[best_idx]
+        original_name = self.names[idx]
+        return best_idx, original_name, best_score
+
 
 def get_full_name(name_tuple):
     """

diff --git a/src/invoice_extraction.py b/src/invoice_extraction.py
@@ -145,7 +145,7 @@ def validate_invoice(invoice_info:dict, invoice_type:str, config:dict) ->dict:
         full_invoice = Invoice2(invoice_info=valid_invoice['invoice_info'])
 
     elif invoice_type == "invoice 3":
-        valid_invoice = validate_invoice_3(invoice_data=invoice_info)
+        valid_invoice = validate_invoice_3(invoice_data=invoice_info, config=config)
         full_invoice = Invoice3(invoice_info=valid_invoice['invoice_info'])
 
     full_invoice_dict = full_invoice.model_dump(exclude_unset=False)
@@ -159,7 +159,7 @@ def validate_invoice(invoice_info:dict, invoice_type:str, config:dict) ->dict:
     ocr_reader = OcrReader(config_path=config_path, translator=GoogleTranslator())
     invoice_extractor = OpenAIExtractor(config_path=config_path)
     # img_path = "fr_1.png"
-    img_path = "test/images/007_2.png"
+    img_path = "test/images/009_1.png"
     base64_img = convert_img_path_to_base64(img_path)
     result = extract_invoice_info(base64_img=base64_img, ocr_reader=ocr_reader,
                                         invoice_extractor=invoice_extractor, config=config)

diff --git a/src/test_api.py b/src/test_api.py
@@ -138,20 +138,20 @@ def test_get_frontend_defines(root_url):
     config = read_config(path=config_path)
 
     root_url = f"http://{config['IES_host']}:{config['IES_port']}"
-    # root_url = f"http://46.137.228.37" # aws
+    # root_url = f"http://46.137.228.37/api" # aws
 
-    img_path = "test/images/007_2.png"
+    img_path = "test/images/009_1.png"
     # user_uuid = "gauss"
     user_uuid = "2111_1111_1111_1111"
     invoice_uuid = "66f3d0eb898e7aaf3dd6e00b"
     invoice_info = {"amount": "1111",} 
 
-    # test_upload_invoice(img_path=img_path, user_uuid=user_uuid)
+    test_upload_invoice(img_path=img_path, user_uuid=user_uuid)
     # test_get_invoices(user_uuid=user_uuid, invoice_type=None, created_at='desc', invoice_uuid=invoice_uuid)
     # test_get_invoices(user_uuid=user_uuid, invoice_type=None, created_at='desc', status="not extracted")
     # test_modify_invoice(invoice_uuid=invoice_uuid, user_uuid=user_uuid, new_invoice_info=invoice_info)
     # test_delete_invoice(invoice_uuid=invoice_uuid, user_uuid=user_uuid)
-    test_get_frontend_defines(root_url=root_url)
+    # test_get_frontend_defines(root_url=root_url)
 
 
 

diff --git a/src/validate_invoice.py b/src/validate_invoice.py
@@ -5,7 +5,8 @@
 from typing import List, Optional, Any, Union
 from datetime import date, datetime
 from src.excel_export import FuzzyNameMatcher, ExcelProcessor, get_full_name
-from src.Utils.utils import read_config
+from src.Utils.utils import (read_config, get_currencies_from_txt,
+                              get_land_and_city_list, find_best_match_fuzzy)
 
 
 def preprocess_name(name: str) -> str:
@@ -67,14 +68,45 @@ def normalize_time(value):
     except Exception:
         return ""  # Return an empty string if splitting fails
 
-
- # Normalize currency to uppercase three-letter code
-def normalize_currency(value):
-    if isinstance(value, str) and len(value.strip()) == 3:
-        return value.strip().upper()
-    return None
+def validate_currency(currency_text: str, config:dict) -> str:
+    """
+    Map a currency string onto the closest entry of the configured currency list.
+
+    :param currency_text: Currency text to validate
+    :param config: Configuration dict; config['currencies_path'] is passed to
+                   get_currencies_from_txt to load the candidate currencies
+    :return: "" when currency_text is empty or None, otherwise the best fuzzy match;
+             the input text is returned unchanged if the matcher reports no match (None)
+    """
+    # `not currency_text` already covers both None and ""
+    if not currency_text:
+        return ""
+
+    currencies = get_currencies_from_txt(file_path=config['currencies_path'])
+    # Only the matched string is needed; the index and score are ignored
+    _, currency, _ = find_best_match_fuzzy(string_list=currencies, text=currency_text)
+    # Return the best matching currency or the original currency if no match is found
+    return currency if currency is not None else currency_text
+
+def validate_land(land_text: str, config:dict) -> str:
+    """
+    Map a land (country) string onto the closest entry of the configured land list.
+
+    :param land_text: Land text to validate
+    :param config: Configuration dict; config['country_and_city']['file_path'] and
+                   config['country_and_city']['sheet_name'] are passed to
+                   get_land_and_city_list to load the candidate lands
+    :return: "" when land_text is empty or None, otherwise the best fuzzy match;
+             the input text is returned unchanged if the matcher reports no match (None)
+    """
+    # `not land_text` already covers both None and ""
+    if not land_text:
+        return ""
+
+    # The helper returns (lands, cities); only the lands are needed here
+    lands, _ = get_land_and_city_list(file_path=config['country_and_city']['file_path'],
+                                      sheet_name=config['country_and_city']['sheet_name'])
+
+    _, land, _ = find_best_match_fuzzy(string_list=lands, text=land_text)
+    # Return the best matching land or the original land if no match is found
+    return land if land is not None else land_text
+
+
+def validate_city(city_text: str, config:dict) -> str:
+    """
+    Map a city string onto the closest entry of the configured city list.
+
+    :param city_text: City text to validate
+    :param config: Configuration dict; config['country_and_city']['file_path'] and
+                   config['country_and_city']['sheet_name'] are passed to
+                   get_land_and_city_list to load the candidate cities
+    :return: "Other" when city_text is empty or None or when the best similarity
+             score is 50 or lower, otherwise the best matching city
+    """
+    # `not city_text` already covers both None and ""
+    if not city_text:
+        return "Other"
+
+    # The helper returns (lands, cities); only the cities are needed here
+    _, cities = get_land_and_city_list(file_path=config['country_and_city']['file_path'],
+                                       sheet_name=config['country_and_city']['sheet_name'])
+
+    _, city, best_score = find_best_match_fuzzy(string_list=cities, text=city_text)
+    # A score of 50 or lower is treated as "no known city"
+    if best_score <= 50:
+        return "Other"
+
+    return city
+
 
-def validate_invoice_3(invoice_data: dict) -> dict:
+
+def validate_invoice_3(invoice_data: dict, config:dict) -> dict:
 
     # Normalize payment card number by removing all non-digit characters
     def normalize_payment_card_number(value):
@@ -116,7 +148,7 @@ def validate_and_normalize(data):
                     data[key] = normalize_phone_number(data[key])
 
                 if 'currency' in key:
-                    data[key] = normalize_currency(data[key])
+                    data[key] = validate_currency(data[key], config=config)
 
                 # Recursively normalize nested dictionaries or lists
                 if isinstance(value, dict) or isinstance(value, list):
@@ -158,6 +190,12 @@ def validate_and_normalize(data: Any, reference_year=None):
                 if key == 'name':
                     data[key] = map_name(value, config)
 
+                if key == 'land':
+                    data[key] = validate_land(value, config)
+
+                if key == 'city':
+                    data[key] = validate_city(value, config)
+
                 # Recursively normalize nested dictionaries or lists
                 if isinstance(value, dict) or isinstance(value, list):
                     data[key] = validate_and_normalize(value, reference_year)
@@ -225,9 +263,13 @@ def validate_and_normalize(data: Any, reference_year=None):
                         if 'payment_method' in line:
                             line['payment_method'] = normalize_payment_method(line['payment_method'])
 
+                if key == 'currency':
+                    data[key] = validate_currency(value, config=config)
+
                 # Recursively normalize nested dictionaries or lists
                 if isinstance(value, dict) or isinstance(value, list):
                     data[key] = validate_and_normalize(value, reference_year)
+
 
         elif isinstance(data, list):
             data = [validate_and_normalize(item, reference_year) for item in data]
@@ -549,8 +591,8 @@ class Invoice3(BaseModel):
             'name': 'Tümmler Dirk',
             'project_number': 'V240045',
             'customer': 'Magua',
-            'city': 'Salzgitter',
-            'land': 'DE',
+            'city': 'Othe',
+            'land': 'Vietna',
             'lines': [
                 {
                     'date': '07/08/2024',
@@ -581,7 +623,7 @@ class Invoice3(BaseModel):
             'name': 'Schmidt, Timo',
             'project_number': 'V123023',
             'is_in_egw': True,
-            'currency': 'EUR',
+            'currency': 'EURk',
             'lines': [
                 {'title': 'Hotel', 'amount': 504.0},
                 {'title': 'Fuel', 'amount': 24.6},
@@ -592,6 +634,19 @@ class Invoice3(BaseModel):
             'has_employee_signature': True
         }
     }
+    data3 = {
+        'invoice_info': {
+
+            'currency': 'EURk',
+
+        }
+    }
 
     data = validate_invoice_1(data1, config=config)
     print("\ndata1", data)
+
+    data = validate_invoice_2(data2, config=config)
+    print("\ndata2", data)
+
+    data = validate_invoice_3(data3, config=config)
+    print("\ndata3", data)