@@ -313,6 +313,7 @@ def miscrosoft_normalize_face_detection_response(response, img_size):
313
313
)
314
314
return deepcopy (faces_list )
315
315
316
+
316
317
def _get_page_val (
317
318
fields : dict ,
318
319
page_num : int ,
@@ -326,6 +327,7 @@ def _get_page_val(
326
327
value = extract (fields , path )
327
328
return value
328
329
330
+
329
331
def normalize_invoice_result (response ):
330
332
"""normalize the original response of the provider api"""
331
333
invoices = []
@@ -339,42 +341,74 @@ def normalize_invoice_result(response):
339
341
340
342
page_num = idx + 1
341
343
342
- customer_name = _get_page_val (fields , page_num , ["CustomerName" , "value" ])
343
- customer_id = _get_page_val (fields , page_num , ["CustomerId" , "value" ])
344
- customer_tax_id = _get_page_val (fields , page_num , ["CustomerTaxId" , "value" ])
345
- customer_address = _get_page_val (fields , page_num , ["CustomerAddress" , "content" ])
346
- customer_mailing_address = _get_page_val (fields , page_num , ["CustomerAddress" , "content" ])
347
- customer_billing_address = _get_page_val (fields , page_num , ["BillingAddress" , "content" ])
348
- customer_shipping_address = _get_page_val (fields , page_num , ["ShippingAddress" , "content" ])
349
- customer_service_address = _get_page_val (fields , page_num , ["ServiceAddress" , "content" ])
350
- customer_remittance_address = _get_page_val (fields , page_num , ["RemittanceAddress" , "content" ])
351
- merchant_address = _get_page_val (fields , page_num , ["VendorAddress" , "content" ])
352
- merchant_name = _get_page_val (fields , page_num , ["VendorName" , "value" ])
353
- merchant_tax_id = _get_page_val (fields , page_num , ["VendorTaxId" , "value" ])
354
- purchase_order = _get_page_val (fields , page_num , ["PurchaseOrder" , "value" ])
355
- payment_term = _get_page_val (fields , page_num , ["PaymentTerm" , "value" ])
356
- invoice_total = _get_page_val (fields , page_num , ["InvoiceTotal" , "value" , "amount" ])
357
- invoice_subtotal = _get_page_val (fields , page_num , ["SubTotal" , "value" , "amount" ])
358
- invoice_number = _get_page_val (fields , page_num , ["InvoiceId" , "value" ])
359
- invoice_date = _get_page_val (fields , page_num , ["InvoiceDate" , "value" ])
360
- invoice_time = _get_page_val (fields , page_num , ["InvoiceTime" , "value" ])
361
- due_date = _get_page_val (fields , page_num , ["DueDate" , "value" ])
362
- tax = _get_page_val (fields , page_num , ["TotalTax" , "value" , "amount" ])
363
- amount_due = _get_page_val (fields , page_num , ["AmountDue" , "value" , "amount" ])
364
- previous_unpaid_balance = _get_page_val (fields , page_num , ["PreviousUnpaidBalance" , "value" , "amount" ])
344
+ customer_name = _get_page_val (fields , page_num , ["CustomerName" , "value" ])
345
+ customer_id = _get_page_val (fields , page_num , ["CustomerId" , "value" ])
346
+ customer_tax_id = _get_page_val (
347
+ fields , page_num , ["CustomerTaxId" , "value" ]
348
+ )
349
+ customer_address = _get_page_val (
350
+ fields , page_num , ["CustomerAddress" , "content" ]
351
+ )
352
+ customer_mailing_address = _get_page_val (
353
+ fields , page_num , ["CustomerAddress" , "content" ]
354
+ )
355
+ customer_billing_address = _get_page_val (
356
+ fields , page_num , ["BillingAddress" , "content" ]
357
+ )
358
+ customer_shipping_address = _get_page_val (
359
+ fields , page_num , ["ShippingAddress" , "content" ]
360
+ )
361
+ customer_service_address = _get_page_val (
362
+ fields , page_num , ["ServiceAddress" , "content" ]
363
+ )
364
+ customer_remittance_address = _get_page_val (
365
+ fields , page_num , ["RemittanceAddress" , "content" ]
366
+ )
367
+ merchant_address = _get_page_val (
368
+ fields , page_num , ["VendorAddress" , "content" ]
369
+ )
370
+ merchant_name = _get_page_val (fields , page_num , ["VendorName" , "value" ])
371
+ merchant_tax_id = _get_page_val (fields , page_num , ["VendorTaxId" , "value" ])
372
+ purchase_order = _get_page_val (fields , page_num , ["PurchaseOrder" , "value" ])
373
+ payment_term = _get_page_val (fields , page_num , ["PaymentTerm" , "value" ])
374
+ invoice_total = _get_page_val (
375
+ fields , page_num , ["InvoiceTotal" , "value" , "amount" ]
376
+ )
377
+ invoice_subtotal = _get_page_val (
378
+ fields , page_num , ["SubTotal" , "value" , "amount" ]
379
+ )
380
+ invoice_number = _get_page_val (fields , page_num , ["InvoiceId" , "value" ])
381
+ invoice_date = _get_page_val (fields , page_num , ["InvoiceDate" , "value" ])
382
+ invoice_time = _get_page_val (fields , page_num , ["InvoiceTime" , "value" ])
383
+ due_date = _get_page_val (fields , page_num , ["DueDate" , "value" ])
384
+ tax = _get_page_val (fields , page_num , ["TotalTax" , "value" , "amount" ])
385
+ amount_due = _get_page_val (
386
+ fields , page_num , ["AmountDue" , "value" , "amount" ]
387
+ )
388
+ previous_unpaid_balance = _get_page_val (
389
+ fields , page_num , ["PreviousUnpaidBalance" , "value" , "amount" ]
390
+ )
365
391
366
392
# Items line
367
393
items = extract (fields , ["Items" , "value" ], [])
368
394
item_lines : Sequence [ItemLinesInvoice ] = []
369
395
for item in items :
370
396
if line := item .get ("value" ):
371
- amount = _get_page_val (line , page_num , ["Amount" , "value" , "amount" ])
372
- description = _get_page_val (line , page_num , ["Description" , "value" ])
373
- quantity = _get_page_val (line , page_num , ["Quantity" , "value" ])
374
- unit_price = _get_page_val (line , page_num , ["UnitPrice" , "value" , "amount" ])
375
- product_code = _get_page_val (line , page_num , ["ProductCode" , "value" ])
376
- date_item = _get_page_val (line , page_num , ["Date" , "value" ])
377
- tax_item = _get_page_val (line , page_num , ["Tax" , "value" , "amount" ])
397
+ amount = _get_page_val (
398
+ line , page_num , ["Amount" , "value" , "amount" ]
399
+ )
400
+ description = _get_page_val (
401
+ line , page_num , ["Description" , "value" ]
402
+ )
403
+ quantity = _get_page_val (line , page_num , ["Quantity" , "value" ])
404
+ unit_price = _get_page_val (
405
+ line , page_num , ["UnitPrice" , "value" , "amount" ]
406
+ )
407
+ product_code = _get_page_val (
408
+ line , page_num , ["ProductCode" , "value" ]
409
+ )
410
+ date_item = _get_page_val (line , page_num , ["Date" , "value" ])
411
+ tax_item = _get_page_val (line , page_num , ["Tax" , "value" , "amount" ])
378
412
379
413
item_lines .append (
380
414
ItemLinesInvoice (
@@ -427,7 +461,9 @@ def normalize_invoice_result(response):
427
461
payment_term = payment_term ,
428
462
amount_due = amount_due ,
429
463
previous_unpaid_balance = previous_unpaid_balance ,
430
- date = combine_date_with_time (format_date (invoice_date ), invoice_time ),
464
+ date = combine_date_with_time (
465
+ format_date (invoice_date ), invoice_time
466
+ ),
431
467
due_date = format_date (due_date ),
432
468
purchase_order = purchase_order ,
433
469
taxes = [TaxesInvoice (value = tax , rate = None )],
@@ -518,6 +554,7 @@ def get_right_audio_support_and_sampling_rate(
518
554
)
519
555
return extension , right_audio_format
520
556
557
+
521
558
def microsoft_ocr_tables_standardize_response (
522
559
original_response : dict ,
523
560
) -> OcrTablesAsyncDataClass :
@@ -531,7 +568,10 @@ def microsoft_ocr_tables_standardize_response(
531
568
532
569
return OcrTablesAsyncDataClass (pages = pages , num_pages = num_pages )
533
570
534
- def _ocr_tables_standardize_table (table : dict , original_response : dict , page_index : int ) -> Table :
571
+
572
+ def _ocr_tables_standardize_table (
573
+ table : dict , original_response : dict , page_index : int
574
+ ) -> Table :
535
575
num_rows = table .get ("rowCount" , 0 )
536
576
rows = [Row () for _ in range (num_rows )]
537
577
@@ -545,7 +585,10 @@ def _ocr_tables_standardize_table(table: dict, original_response: dict, page_ind
545
585
)
546
586
return std_table
547
587
548
- def _ocr_tables_standardize_cell (cell : dict , original_response : dict , page_index : int ) -> Cell :
588
+
589
+ def _ocr_tables_standardize_cell (
590
+ cell : dict , original_response : dict , page_index : int
591
+ ) -> Cell :
549
592
current_page_num = cell ["boundingRegions" ][0 ]["pageNumber" ]
550
593
width = original_response ["pages" ][current_page_num - 1 ]["width" ]
551
594
height = original_response ["pages" ][current_page_num - 1 ]["height" ]
@@ -572,20 +615,38 @@ def _ocr_tables_standardize_cell(cell: dict, original_response: dict, page_index
572
615
confidence = cell_confidence ,
573
616
)
574
617
618
+
575
619
def _calculate_cell_confidence (words : List [Dict ], bounding_box : List [float ]) -> float :
576
620
cell_words = [
577
- word for word in words
621
+ word
622
+ for word in words
578
623
if _is_word_in_bounding_box (word ["polygon" ], bounding_box )
579
624
]
580
625
if not cell_words :
581
626
return 1.0
582
627
confidences = [word ["confidence" ] for word in cell_words ]
583
628
return mean (confidences )
584
629
630
+
585
631
def _is_word_in_bounding_box (word_box : List [float ], cell_box : List [float ]) -> bool :
586
- word_left , word_top , word_right , word_bottom = word_box [0 ], word_box [1 ], word_box [4 ], word_box [5 ]
587
- cell_left , cell_top , cell_right , cell_bottom = cell_box [0 ], cell_box [1 ], cell_box [4 ], cell_box [5 ]
588
- return not (word_right < cell_left or word_left > cell_right or word_bottom < cell_top or word_top > cell_bottom )
632
+ word_left , word_top , word_right , word_bottom = (
633
+ word_box [0 ],
634
+ word_box [1 ],
635
+ word_box [4 ],
636
+ word_box [5 ],
637
+ )
638
+ cell_left , cell_top , cell_right , cell_bottom = (
639
+ cell_box [0 ],
640
+ cell_box [1 ],
641
+ cell_box [4 ],
642
+ cell_box [5 ],
643
+ )
644
+ return not (
645
+ word_right < cell_left
646
+ or word_left > cell_right
647
+ or word_bottom < cell_top
648
+ or word_top > cell_bottom
649
+ )
589
650
590
651
591
652
def _create_ocr_async_bounding_box (polygon , height , width ):
@@ -707,27 +768,37 @@ def microsoft_financial_parser_formatter(
707
768
shipping_address = extract (page_document , ["ShippingAddress" , "content" ]),
708
769
remittance_address = extract (page_document , ["RemittanceAddress" , "content" ]),
709
770
service_address = extract (page_document , ["ServiceAddress" , "content" ]),
710
- remit_to_name = extract (page_document , ["CustomerAddressRecipient" , "content" ]),
771
+ remit_to_name = extract (
772
+ page_document , ["CustomerAddressRecipient" , "content" ]
773
+ ),
711
774
)
712
775
713
776
# Merchant information
714
777
merchant_information = FinancialMerchantInformation (
715
778
phone = extract (page_document , ["MerchantPhoneNumber" , "value" ]),
716
779
tax_id = extract (page_document , ["VendorTaxId" , "value" ]),
717
- house_number = extract (page_document , ["MerchantAddress" , "value" , "house_number" ]),
718
- street_name = extract (page_document , ["MerchantAddress" , "value" , "street_address" ]),
780
+ house_number = extract (
781
+ page_document , ["MerchantAddress" , "value" , "house_number" ]
782
+ ),
783
+ street_name = extract (
784
+ page_document , ["MerchantAddress" , "value" , "street_address" ]
785
+ ),
719
786
city = extract (page_document , ["MerchantAddress" , "value" , "city_district" ]),
720
- zip_code = extract (page_document , ["MerchantAddress" , "value" , "postal_code" ]),
721
- province = extract (page_document , ["MerchantAddress" , "value" , "state_district" ]),
787
+ zip_code = extract (
788
+ page_document , ["MerchantAddress" , "value" , "postal_code" ]
789
+ ),
790
+ province = extract (
791
+ page_document , ["MerchantAddress" , "value" , "state_district" ]
792
+ ),
722
793
name = extract (
723
794
obj = page_document ,
724
- path = [' VendorName' , ' value' ],
725
- fallback = extract (page_document , ["MerchantName" , "value" ])
795
+ path = [" VendorName" , " value" ],
796
+ fallback = extract (page_document , ["MerchantName" , "value" ]),
726
797
),
727
798
address = extract (
728
799
obj = page_document ,
729
- path = ["VendorAddress" , ' content' ],
730
- fallback = extract (page_document , [' MerchantAddress' , "content" ])
800
+ path = ["VendorAddress" , " content" ],
801
+ fallback = extract (page_document , [" MerchantAddress" , "content" ]),
731
802
),
732
803
)
733
804
@@ -737,13 +808,15 @@ def microsoft_financial_parser_formatter(
737
808
subtotal = extract (page_document , ["SubTotal" , "value" , "amount" ]),
738
809
payment_terms = extract (page_document , ["PaymentTerm" , "value" ]),
739
810
amount_due = extract (page_document , ["AmountDue" , "value" , "amount" ]),
740
- previous_unpaid_balance = extract_amount (page_document , ["PreviousUnpaidBalance" , "value" ]),
811
+ previous_unpaid_balance = extract_amount (
812
+ page_document , ["PreviousUnpaidBalance" , "value" , "amount" ]
813
+ ),
741
814
discount = extract (page_document , ["TotalDiscount" , "value" , "amount" ]),
742
- total_tax = extract (
815
+ total_tax = extract (
743
816
obj = page_document ,
744
817
path = ["TotalTax" , "value" ],
745
818
type_validator = float ,
746
- fallback = extract (page_document , ["TotalTax" , "value" , "amount" ])
819
+ fallback = extract (page_document , ["TotalTax" , "value" , "amount" ]),
747
820
),
748
821
)
749
822
@@ -769,14 +842,18 @@ def microsoft_financial_parser_formatter(
769
842
)
770
843
771
844
# Bank information
772
- payment_details = extract (page_document , ["PaymentDetails" , "value" ], fallback = [])
845
+ payment_details = extract (
846
+ page_document , ["PaymentDetails" , "value" ], fallback = []
847
+ )
773
848
payment_items = []
774
849
for obj in payment_details :
775
850
if line := obj .get ("value" ):
776
- payment_items .append ({
777
- "iban" : extract (line , ["IBAN" , "content" ]),
778
- "swift" : extract (line , ["SWIFT" , "content" ]),
779
- })
851
+ payment_items .append (
852
+ {
853
+ "iban" : extract (line , ["IBAN" , "content" ]),
854
+ "swift" : extract (line , ["SWIFT" , "content" ]),
855
+ }
856
+ )
780
857
bank = FinancialBankInformation (
781
858
swift = extract (payment_items , [0 , "swift" ]),
782
859
iban = extract (payment_items , [0 , "iban" ]),
@@ -798,12 +875,12 @@ def microsoft_financial_parser_formatter(
798
875
items = page_document .get ("items" ) or []
799
876
item_lines = []
800
877
for item in items :
801
- page_item = extract (item , ["bounding_regions" , 0 , "page_number" ])
878
+ page_item = extract (item , ["bounding_regions" , 0 , "page_number" ])
802
879
line = item .get ("value" )
803
880
if line and page_item == (page_idx + 1 ):
804
881
# Amount Line
805
882
806
- date = extract (line , [' Date' , ' value' ])
883
+ date = extract (line , [" Date" , " value" ])
807
884
if isinstance (date , datetime .date ):
808
885
date = date .isoformat ()
809
886
@@ -822,8 +899,8 @@ def microsoft_financial_parser_formatter(
822
899
tax = extract (line , ["Tax" , "value" , "amount" ]),
823
900
tax_rate = convert_string_to_number (
824
901
string_number = extract (line , ["TaxRate" , "value" ]),
825
- val_type = float
826
- )
902
+ val_type = float ,
903
+ ),
827
904
)
828
905
)
829
906
extracted_data .append (
0 commit comments