Skip to content
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.

Commit 75f56f4

Browse files
committed Jul 31, 2024 ·
[fix] microsoft financial parser ValidationError previous_unpaid_balance
1 parent 397824d commit 75f56f4

File tree

1 file changed

+135
-58
lines changed

1 file changed

+135
-58
lines changed
 

‎edenai_apis/apis/microsoft/microsoft_helpers.py

Lines changed: 135 additions & 58 deletions
Original file line numberDiff line numberDiff line change
@@ -313,6 +313,7 @@ def miscrosoft_normalize_face_detection_response(response, img_size):
313313
)
314314
return deepcopy(faces_list)
315315

316+
316317
def _get_page_val(
317318
fields: dict,
318319
page_num: int,
@@ -326,6 +327,7 @@ def _get_page_val(
326327
value = extract(fields, path)
327328
return value
328329

330+
329331
def normalize_invoice_result(response):
330332
"""normalize the original response of the provider api"""
331333
invoices = []
@@ -339,42 +341,74 @@ def normalize_invoice_result(response):
339341

340342
page_num = idx + 1
341343

342-
customer_name = _get_page_val(fields, page_num, ["CustomerName", "value" ])
343-
customer_id = _get_page_val(fields, page_num, ["CustomerId", "value" ])
344-
customer_tax_id = _get_page_val(fields, page_num, ["CustomerTaxId", "value" ])
345-
customer_address = _get_page_val(fields, page_num, ["CustomerAddress", "content" ])
346-
customer_mailing_address = _get_page_val(fields, page_num, ["CustomerAddress", "content" ])
347-
customer_billing_address = _get_page_val(fields, page_num, ["BillingAddress", "content" ])
348-
customer_shipping_address = _get_page_val(fields, page_num, ["ShippingAddress", "content" ])
349-
customer_service_address = _get_page_val(fields, page_num, ["ServiceAddress", "content" ])
350-
customer_remittance_address = _get_page_val(fields, page_num, ["RemittanceAddress", "content" ])
351-
merchant_address = _get_page_val(fields, page_num, ["VendorAddress", "content" ])
352-
merchant_name = _get_page_val(fields, page_num, ["VendorName", "value" ])
353-
merchant_tax_id = _get_page_val(fields, page_num, ["VendorTaxId", "value" ])
354-
purchase_order = _get_page_val(fields, page_num, ["PurchaseOrder", "value" ])
355-
payment_term = _get_page_val(fields, page_num, ["PaymentTerm", "value" ])
356-
invoice_total = _get_page_val(fields, page_num, ["InvoiceTotal", "value", "amount"])
357-
invoice_subtotal = _get_page_val(fields, page_num, ["SubTotal", "value", "amount"])
358-
invoice_number = _get_page_val(fields, page_num, ["InvoiceId", "value" ])
359-
invoice_date = _get_page_val(fields, page_num, ["InvoiceDate", "value" ])
360-
invoice_time = _get_page_val(fields, page_num, ["InvoiceTime", "value" ])
361-
due_date = _get_page_val(fields, page_num, ["DueDate", "value" ])
362-
tax = _get_page_val(fields, page_num, ["TotalTax", "value", "amount"])
363-
amount_due = _get_page_val(fields, page_num, ["AmountDue", "value", "amount"])
364-
previous_unpaid_balance = _get_page_val(fields, page_num, ["PreviousUnpaidBalance", "value", "amount"])
344+
customer_name = _get_page_val(fields, page_num, ["CustomerName", "value"])
345+
customer_id = _get_page_val(fields, page_num, ["CustomerId", "value"])
346+
customer_tax_id = _get_page_val(
347+
fields, page_num, ["CustomerTaxId", "value"]
348+
)
349+
customer_address = _get_page_val(
350+
fields, page_num, ["CustomerAddress", "content"]
351+
)
352+
customer_mailing_address = _get_page_val(
353+
fields, page_num, ["CustomerAddress", "content"]
354+
)
355+
customer_billing_address = _get_page_val(
356+
fields, page_num, ["BillingAddress", "content"]
357+
)
358+
customer_shipping_address = _get_page_val(
359+
fields, page_num, ["ShippingAddress", "content"]
360+
)
361+
customer_service_address = _get_page_val(
362+
fields, page_num, ["ServiceAddress", "content"]
363+
)
364+
customer_remittance_address = _get_page_val(
365+
fields, page_num, ["RemittanceAddress", "content"]
366+
)
367+
merchant_address = _get_page_val(
368+
fields, page_num, ["VendorAddress", "content"]
369+
)
370+
merchant_name = _get_page_val(fields, page_num, ["VendorName", "value"])
371+
merchant_tax_id = _get_page_val(fields, page_num, ["VendorTaxId", "value"])
372+
purchase_order = _get_page_val(fields, page_num, ["PurchaseOrder", "value"])
373+
payment_term = _get_page_val(fields, page_num, ["PaymentTerm", "value"])
374+
invoice_total = _get_page_val(
375+
fields, page_num, ["InvoiceTotal", "value", "amount"]
376+
)
377+
invoice_subtotal = _get_page_val(
378+
fields, page_num, ["SubTotal", "value", "amount"]
379+
)
380+
invoice_number = _get_page_val(fields, page_num, ["InvoiceId", "value"])
381+
invoice_date = _get_page_val(fields, page_num, ["InvoiceDate", "value"])
382+
invoice_time = _get_page_val(fields, page_num, ["InvoiceTime", "value"])
383+
due_date = _get_page_val(fields, page_num, ["DueDate", "value"])
384+
tax = _get_page_val(fields, page_num, ["TotalTax", "value", "amount"])
385+
amount_due = _get_page_val(
386+
fields, page_num, ["AmountDue", "value", "amount"]
387+
)
388+
previous_unpaid_balance = _get_page_val(
389+
fields, page_num, ["PreviousUnpaidBalance", "value", "amount"]
390+
)
365391

366392
# Items line
367393
items = extract(fields, ["Items", "value"], [])
368394
item_lines: Sequence[ItemLinesInvoice] = []
369395
for item in items:
370396
if line := item.get("value"):
371-
amount = _get_page_val(line, page_num, ["Amount", "value", "amount"])
372-
description = _get_page_val(line, page_num, ["Description", "value" ])
373-
quantity = _get_page_val(line, page_num, ["Quantity", "value" ])
374-
unit_price = _get_page_val(line, page_num, ["UnitPrice", "value", "amount"])
375-
product_code = _get_page_val(line, page_num, ["ProductCode", "value" ])
376-
date_item = _get_page_val(line, page_num, ["Date", "value" ])
377-
tax_item = _get_page_val(line, page_num, ["Tax", "value", "amount"])
397+
amount = _get_page_val(
398+
line, page_num, ["Amount", "value", "amount"]
399+
)
400+
description = _get_page_val(
401+
line, page_num, ["Description", "value"]
402+
)
403+
quantity = _get_page_val(line, page_num, ["Quantity", "value"])
404+
unit_price = _get_page_val(
405+
line, page_num, ["UnitPrice", "value", "amount"]
406+
)
407+
product_code = _get_page_val(
408+
line, page_num, ["ProductCode", "value"]
409+
)
410+
date_item = _get_page_val(line, page_num, ["Date", "value"])
411+
tax_item = _get_page_val(line, page_num, ["Tax", "value", "amount"])
378412

379413
item_lines.append(
380414
ItemLinesInvoice(
@@ -427,7 +461,9 @@ def normalize_invoice_result(response):
427461
payment_term=payment_term,
428462
amount_due=amount_due,
429463
previous_unpaid_balance=previous_unpaid_balance,
430-
date=combine_date_with_time(format_date(invoice_date), invoice_time),
464+
date=combine_date_with_time(
465+
format_date(invoice_date), invoice_time
466+
),
431467
due_date=format_date(due_date),
432468
purchase_order=purchase_order,
433469
taxes=[TaxesInvoice(value=tax, rate=None)],
@@ -518,6 +554,7 @@ def get_right_audio_support_and_sampling_rate(
518554
)
519555
return extension, right_audio_format
520556

557+
521558
def microsoft_ocr_tables_standardize_response(
522559
original_response: dict,
523560
) -> OcrTablesAsyncDataClass:
@@ -531,7 +568,10 @@ def microsoft_ocr_tables_standardize_response(
531568

532569
return OcrTablesAsyncDataClass(pages=pages, num_pages=num_pages)
533570

534-
def _ocr_tables_standardize_table(table: dict, original_response: dict, page_index: int) -> Table:
571+
572+
def _ocr_tables_standardize_table(
573+
table: dict, original_response: dict, page_index: int
574+
) -> Table:
535575
num_rows = table.get("rowCount", 0)
536576
rows = [Row() for _ in range(num_rows)]
537577

@@ -545,7 +585,10 @@ def _ocr_tables_standardize_table(table: dict, original_response: dict, page_ind
545585
)
546586
return std_table
547587

548-
def _ocr_tables_standardize_cell(cell: dict, original_response: dict, page_index: int) -> Cell:
588+
589+
def _ocr_tables_standardize_cell(
590+
cell: dict, original_response: dict, page_index: int
591+
) -> Cell:
549592
current_page_num = cell["boundingRegions"][0]["pageNumber"]
550593
width = original_response["pages"][current_page_num - 1]["width"]
551594
height = original_response["pages"][current_page_num - 1]["height"]
@@ -572,20 +615,38 @@ def _ocr_tables_standardize_cell(cell: dict, original_response: dict, page_index
572615
confidence=cell_confidence,
573616
)
574617

618+
575619
def _calculate_cell_confidence(words: List[Dict], bounding_box: List[float]) -> float:
    """Return the mean confidence of the words that overlap a table cell.

    Args:
        words: Word entries; each is read through its ``"polygon"`` and
            ``"confidence"`` keys.
        bounding_box: Flat coordinate list describing the cell, passed as-is
            to ``_is_word_in_bounding_box``.

    Returns:
        The mean of the overlapping words' confidences, or ``1.0`` when no
        word overlaps the cell.
    """
    # First pass: keep only the words whose polygon overlaps the cell.
    overlapping = []
    for entry in words:
        if _is_word_in_bounding_box(entry["polygon"], bounding_box):
            overlapping.append(entry)

    # Second pass: average the confidences of the retained words.
    if overlapping:
        return mean([entry["confidence"] for entry in overlapping])

    # No word overlaps the cell: the original reports full confidence.
    return 1.0
584629

630+
585631
def _is_word_in_bounding_box(word_box: List[float], cell_box: List[float]) -> bool:
586-
word_left, word_top, word_right, word_bottom = word_box[0], word_box[1], word_box[4], word_box[5]
587-
cell_left, cell_top, cell_right, cell_bottom = cell_box[0], cell_box[1], cell_box[4], cell_box[5]
588-
return not (word_right < cell_left or word_left > cell_right or word_bottom < cell_top or word_top > cell_bottom)
632+
word_left, word_top, word_right, word_bottom = (
633+
word_box[0],
634+
word_box[1],
635+
word_box[4],
636+
word_box[5],
637+
)
638+
cell_left, cell_top, cell_right, cell_bottom = (
639+
cell_box[0],
640+
cell_box[1],
641+
cell_box[4],
642+
cell_box[5],
643+
)
644+
return not (
645+
word_right < cell_left
646+
or word_left > cell_right
647+
or word_bottom < cell_top
648+
or word_top > cell_bottom
649+
)
589650

590651

591652
def _create_ocr_async_bounding_box(polygon, height, width):
@@ -707,27 +768,37 @@ def microsoft_financial_parser_formatter(
707768
shipping_address=extract(page_document, ["ShippingAddress", "content"]),
708769
remittance_address=extract(page_document, ["RemittanceAddress", "content"]),
709770
service_address=extract(page_document, ["ServiceAddress", "content"]),
710-
remit_to_name=extract(page_document, ["CustomerAddressRecipient", "content"]),
771+
remit_to_name=extract(
772+
page_document, ["CustomerAddressRecipient", "content"]
773+
),
711774
)
712775

713776
# Merchant information
714777
merchant_information = FinancialMerchantInformation(
715778
phone=extract(page_document, ["MerchantPhoneNumber", "value"]),
716779
tax_id=extract(page_document, ["VendorTaxId", "value"]),
717-
house_number=extract(page_document, ["MerchantAddress", "value", "house_number"]),
718-
street_name=extract(page_document, ["MerchantAddress", "value", "street_address"]),
780+
house_number=extract(
781+
page_document, ["MerchantAddress", "value", "house_number"]
782+
),
783+
street_name=extract(
784+
page_document, ["MerchantAddress", "value", "street_address"]
785+
),
719786
city=extract(page_document, ["MerchantAddress", "value", "city_district"]),
720-
zip_code=extract(page_document, ["MerchantAddress", "value", "postal_code"]),
721-
province=extract(page_document, ["MerchantAddress", "value", "state_district"]),
787+
zip_code=extract(
788+
page_document, ["MerchantAddress", "value", "postal_code"]
789+
),
790+
province=extract(
791+
page_document, ["MerchantAddress", "value", "state_district"]
792+
),
722793
name=extract(
723794
obj=page_document,
724-
path=['VendorName', 'value'],
725-
fallback=extract(page_document, ["MerchantName", "value"])
795+
path=["VendorName", "value"],
796+
fallback=extract(page_document, ["MerchantName", "value"]),
726797
),
727798
address=extract(
728799
obj=page_document,
729-
path=["VendorAddress", 'content'],
730-
fallback=extract(page_document, ['MerchantAddress', "content"])
800+
path=["VendorAddress", "content"],
801+
fallback=extract(page_document, ["MerchantAddress", "content"]),
731802
),
732803
)
733804

@@ -737,13 +808,15 @@ def microsoft_financial_parser_formatter(
737808
subtotal=extract(page_document, ["SubTotal", "value", "amount"]),
738809
payment_terms=extract(page_document, ["PaymentTerm", "value"]),
739810
amount_due=extract(page_document, ["AmountDue", "value", "amount"]),
740-
previous_unpaid_balance=extract_amount(page_document, ["PreviousUnpaidBalance", "value"]),
811+
previous_unpaid_balance=extract_amount(
812+
page_document, ["PreviousUnpaidBalance", "value", "amount"]
813+
),
741814
discount=extract(page_document, ["TotalDiscount", "value", "amount"]),
742-
total_tax = extract(
815+
total_tax=extract(
743816
obj=page_document,
744817
path=["TotalTax", "value"],
745818
type_validator=float,
746-
fallback=extract(page_document, ["TotalTax", "value", "amount"])
819+
fallback=extract(page_document, ["TotalTax", "value", "amount"]),
747820
),
748821
)
749822

@@ -769,14 +842,18 @@ def microsoft_financial_parser_formatter(
769842
)
770843

771844
# Bank information
772-
payment_details = extract(page_document, ["PaymentDetails", "value"], fallback=[])
845+
payment_details = extract(
846+
page_document, ["PaymentDetails", "value"], fallback=[]
847+
)
773848
payment_items = []
774849
for obj in payment_details:
775850
if line := obj.get("value"):
776-
payment_items.append({
777-
"iban": extract(line, ["IBAN", "content"]),
778-
"swift": extract(line, ["SWIFT", "content"]),
779-
})
851+
payment_items.append(
852+
{
853+
"iban": extract(line, ["IBAN", "content"]),
854+
"swift": extract(line, ["SWIFT", "content"]),
855+
}
856+
)
780857
bank = FinancialBankInformation(
781858
swift=extract(payment_items, [0, "swift"]),
782859
iban=extract(payment_items, [0, "iban"]),
@@ -798,12 +875,12 @@ def microsoft_financial_parser_formatter(
798875
items = page_document.get("items") or []
799876
item_lines = []
800877
for item in items:
801-
page_item=extract(item, ["bounding_regions", 0, "page_number"])
878+
page_item = extract(item, ["bounding_regions", 0, "page_number"])
802879
line = item.get("value")
803880
if line and page_item == (page_idx + 1):
804881
# Amount Line
805882

806-
date = extract(line, ['Date', 'value'])
883+
date = extract(line, ["Date", "value"])
807884
if isinstance(date, datetime.date):
808885
date = date.isoformat()
809886

@@ -822,8 +899,8 @@ def microsoft_financial_parser_formatter(
822899
tax=extract(line, ["Tax", "value", "amount"]),
823900
tax_rate=convert_string_to_number(
824901
string_number=extract(line, ["TaxRate", "value"]),
825-
val_type=float
826-
)
902+
val_type=float,
903+
),
827904
)
828905
)
829906
extracted_data.append(

0 commit comments

Comments
 (0)
Please sign in to comment.