Update app.py
This commit is contained in:
174
app.py
174
app.py
@@ -50,14 +50,14 @@ def get_db_connection():
|
|||||||
)
|
)
|
||||||
|
|
||||||
class AHKParser:
|
class AHKParser:
|
||||||
lab = "AHK"
|
lab = "AHK"
|
||||||
|
|
||||||
def _clean_value(self, value):
|
def _clean_value(self, value):
|
||||||
"""Nettoie la valeur en supprimant les espaces inutiles"""
|
"""Nettoie la valeur en supprimant les espaces inutiles"""
|
||||||
if value:
|
if value:
|
||||||
return value.strip()
|
return value.strip()
|
||||||
return value
|
return value
|
||||||
|
|
||||||
def parse(self, text):
|
def parse(self, text):
|
||||||
"""Parse le texte et retourne un dictionnaire structuré"""
|
"""Parse le texte et retourne un dictionnaire structuré"""
|
||||||
result = {
|
result = {
|
||||||
@@ -70,7 +70,7 @@ class AHKParser:
|
|||||||
}
|
}
|
||||||
self.data = result
|
self.data = result
|
||||||
return result
|
return result
|
||||||
|
|
||||||
def _extract_report_info(self, text):
|
def _extract_report_info(self, text):
|
||||||
"""Extrait les informations du rapport"""
|
"""Extrait les informations du rapport"""
|
||||||
report_info = {
|
report_info = {
|
||||||
@@ -78,24 +78,24 @@ class AHKParser:
|
|||||||
"file_no": None,
|
"file_no": None,
|
||||||
"date": None
|
"date": None
|
||||||
}
|
}
|
||||||
|
|
||||||
# Recherche de la référence client - plus précise
|
# Recherche de la référence client - plus précise
|
||||||
ref_match = re.search(r'Client\s+Reference:\s*(S-\d+\s*/\s*INV\s*\d+)', text)
|
ref_match = re.search(r'Client\s+Reference:\s*(S-\d+\s*/\s*INV\s*\d+)', text)
|
||||||
if ref_match:
|
if ref_match:
|
||||||
report_info["reference"] = self._clean_value(ref_match.group(1))
|
report_info["reference"] = self._clean_value(ref_match.group(1))
|
||||||
|
|
||||||
# Recherche du numéro de fichier AHK
|
# Recherche du numéro de fichier AHK
|
||||||
file_no_match = re.search(r'AHK\s+S/([\w/]+)', text)
|
file_no_match = re.search(r'AHK\s+S/([\w/]+)', text)
|
||||||
if file_no_match:
|
if file_no_match:
|
||||||
report_info["file_no"] = self._clean_value(file_no_match.group(1))
|
report_info["file_no"] = self._clean_value(file_no_match.group(1))
|
||||||
|
|
||||||
# Recherche de la date du rapport
|
# Recherche de la date du rapport
|
||||||
date_match = re.search(r'Signed\s+on\s*(\d{1,2}-[A-Za-z]{3}-\d{4})', text)
|
date_match = re.search(r'Signed\s+on\s*(\d{1,2}-[A-Za-z]{3}-\d{4})', text)
|
||||||
if date_match:
|
if date_match:
|
||||||
report_info["date"] = self._clean_value(date_match.group(1))
|
report_info["date"] = self._clean_value(date_match.group(1))
|
||||||
|
|
||||||
return report_info
|
return report_info
|
||||||
|
|
||||||
def _extract_contract_info(self, text):
|
def _extract_contract_info(self, text):
|
||||||
"""Extrait les informations du contrat"""
|
"""Extrait les informations du contrat"""
|
||||||
contract_info = {
|
contract_info = {
|
||||||
@@ -105,7 +105,7 @@ class AHKParser:
|
|||||||
"origin": None,
|
"origin": None,
|
||||||
"commodity": None
|
"commodity": None
|
||||||
}
|
}
|
||||||
|
|
||||||
# Extraction de la référence client
|
# Extraction de la référence client
|
||||||
ref_match = re.search(r'Client\s+Ref\s+No\.\s*:\s*([^\n]+)', text)
|
ref_match = re.search(r'Client\s+Ref\s+No\.\s*:\s*([^\n]+)', text)
|
||||||
if ref_match:
|
if ref_match:
|
||||||
@@ -117,7 +117,7 @@ class AHKParser:
|
|||||||
contract_info["contract_no"] = part.strip()
|
contract_info["contract_no"] = part.strip()
|
||||||
elif part.startswith('INV'):
|
elif part.startswith('INV'):
|
||||||
contract_info["invoice_no"] = part.strip()
|
contract_info["invoice_no"] = part.strip()
|
||||||
|
|
||||||
# Extraction de l'origine et de la marchandise - regex plus précise
|
# Extraction de l'origine et de la marchandise - regex plus précise
|
||||||
growth_match = re.search(r'Growth\s*:\s*([A-Z\s]+?)(?=\s*(?:Vessel|$))', text)
|
growth_match = re.search(r'Growth\s*:\s*([A-Z\s]+?)(?=\s*(?:Vessel|$))', text)
|
||||||
if growth_match:
|
if growth_match:
|
||||||
@@ -125,9 +125,9 @@ class AHKParser:
|
|||||||
if "AUSTRALIAN" in origin_text.upper():
|
if "AUSTRALIAN" in origin_text.upper():
|
||||||
contract_info["origin"] = "AUSTRALIA"
|
contract_info["origin"] = "AUSTRALIA"
|
||||||
contract_info["commodity"] = "RAW COTTON"
|
contract_info["commodity"] = "RAW COTTON"
|
||||||
|
|
||||||
return contract_info
|
return contract_info
|
||||||
|
|
||||||
def _extract_parties_info(self, text):
|
def _extract_parties_info(self, text):
|
||||||
"""Extrait les informations sur les parties"""
|
"""Extrait les informations sur les parties"""
|
||||||
parties_info = {
|
parties_info = {
|
||||||
@@ -135,24 +135,24 @@ class AHKParser:
|
|||||||
"buyer": None,
|
"buyer": None,
|
||||||
"carrier": None
|
"carrier": None
|
||||||
}
|
}
|
||||||
|
|
||||||
# Extraction du vendeur (Client) - regex plus précise
|
# Extraction du vendeur (Client) - regex plus précise
|
||||||
seller_match = re.search(r'Client\s*:\s*([^\n:]+?)(?=\s*(?:Client Ref|Buyer|$))', text)
|
seller_match = re.search(r'Client\s*:\s*([^\n:]+?)(?=\s*(?:Client Ref|Buyer|$))', text)
|
||||||
if seller_match:
|
if seller_match:
|
||||||
parties_info["seller"] = self._clean_value(seller_match.group(1))
|
parties_info["seller"] = self._clean_value(seller_match.group(1))
|
||||||
|
|
||||||
# Extraction de l'acheteur (Buyer) - regex plus précise
|
# Extraction de l'acheteur (Buyer) - regex plus précise
|
||||||
buyer_match = re.search(r'Buyer\s*:\s*([^\n:]+?)(?=\s*(?:Total Bales|$))', text)
|
buyer_match = re.search(r'Buyer\s*:\s*([^\n:]+?)(?=\s*(?:Total Bales|$))', text)
|
||||||
if buyer_match:
|
if buyer_match:
|
||||||
parties_info["buyer"] = self._clean_value(buyer_match.group(1))
|
parties_info["buyer"] = self._clean_value(buyer_match.group(1))
|
||||||
|
|
||||||
# Extraction du transporteur (nom du navire seulement)
|
# Extraction du transporteur (nom du navire seulement)
|
||||||
vessel_match = re.search(r'Vessel\s*:\s*([A-Z\s]+?)(?=\s*(?:Arrival|Voy|$))', text)
|
vessel_match = re.search(r'Vessel\s*:\s*([A-Z\s]+?)(?=\s*(?:Arrival|Voy|$))', text)
|
||||||
if vessel_match:
|
if vessel_match:
|
||||||
parties_info["carrier"] = self._clean_value(vessel_match.group(1))
|
parties_info["carrier"] = self._clean_value(vessel_match.group(1))
|
||||||
|
|
||||||
return parties_info
|
return parties_info
|
||||||
|
|
||||||
def _extract_shipment_info(self, text):
|
def _extract_shipment_info(self, text):
|
||||||
"""Extrait les informations d'expédition"""
|
"""Extrait les informations d'expédition"""
|
||||||
shipment_info = {
|
shipment_info = {
|
||||||
@@ -166,37 +166,37 @@ class AHKParser:
|
|||||||
"weighing_method": None,
|
"weighing_method": None,
|
||||||
"bales": None
|
"bales": None
|
||||||
}
|
}
|
||||||
|
|
||||||
# Extraction du navire (nom seulement)
|
# Extraction du navire (nom seulement)
|
||||||
vessel_match = re.search(r'Vessel\s*:\s*([A-Z\s]+?)(?=\s*(?:Arrival|Voy|$))', text)
|
vessel_match = re.search(r'Vessel\s*:\s*([A-Z\s]+?)(?=\s*(?:Arrival|Voy|$))', text)
|
||||||
if vessel_match:
|
if vessel_match:
|
||||||
shipment_info["vessel"] = self._clean_value(vessel_match.group(1))
|
shipment_info["vessel"] = self._clean_value(vessel_match.group(1))
|
||||||
|
|
||||||
# Extraction du numéro de connaissement (seulement le numéro)
|
# Extraction du numéro de connaissement (seulement le numéro)
|
||||||
bl_no_match = re.search(r'B/L\s+No\.\s*:\s*([A-Z0-9]+)(?=\s|$)', text)
|
bl_no_match = re.search(r'B/L\s+No\.\s*:\s*([A-Z0-9]+)(?=\s|$)', text)
|
||||||
if bl_no_match:
|
if bl_no_match:
|
||||||
shipment_info["bl_no"] = self._clean_value(bl_no_match.group(1))
|
shipment_info["bl_no"] = self._clean_value(bl_no_match.group(1))
|
||||||
|
|
||||||
# Extraction de la date du connaissement
|
# Extraction de la date du connaissement
|
||||||
bl_date_match = re.search(r'B/L\s+Date\s*:\s*(\d{1,2}-[A-Za-z]{3}-\d{4})(?=\s|$)', text)
|
bl_date_match = re.search(r'B/L\s+Date\s*:\s*(\d{1,2}-[A-Za-z]{3}-\d{4})(?=\s|$)', text)
|
||||||
if bl_date_match:
|
if bl_date_match:
|
||||||
shipment_info["bl_date"] = self._clean_value(bl_date_match.group(1))
|
shipment_info["bl_date"] = self._clean_value(bl_date_match.group(1))
|
||||||
|
|
||||||
# Extraction du port de destination (sans le "Tare")
|
# Extraction du port de destination (sans le "Tare")
|
||||||
dest_match = re.search(r'Destination\s*:\s*([A-Z,\s]+?)(?=\s*(?:Tare|$))', text)
|
dest_match = re.search(r'Destination\s*:\s*([A-Z,\s]+?)(?=\s*(?:Tare|$))', text)
|
||||||
if dest_match:
|
if dest_match:
|
||||||
shipment_info["port_destination"] = self._clean_value(dest_match.group(1))
|
shipment_info["port_destination"] = self._clean_value(dest_match.group(1))
|
||||||
|
|
||||||
# Extraction de la date d'arrivée
|
# Extraction de la date d'arrivée
|
||||||
arrival_match = re.search(r'Arrival\s+Date\s*:\s*(\d{1,2}-[A-Za-z]{3}-\d{4})(?=\s|$)', text)
|
arrival_match = re.search(r'Arrival\s+Date\s*:\s*(\d{1,2}-[A-Za-z]{3}-\d{4})(?=\s|$)', text)
|
||||||
if arrival_match:
|
if arrival_match:
|
||||||
shipment_info["arrival_date"] = self._clean_value(arrival_match.group(1))
|
shipment_info["arrival_date"] = self._clean_value(arrival_match.group(1))
|
||||||
|
|
||||||
# Extraction de la méthode de pesée
|
# Extraction de la méthode de pesée
|
||||||
weighing_method_match = re.search(r'Weighing\s+method\s*:\s*([^\n]+?)(?=\s*(?:Tare|$))', text)
|
weighing_method_match = re.search(r'Weighing\s+method\s*:\s*([^\n]+?)(?=\s*(?:Tare|$))', text)
|
||||||
if weighing_method_match:
|
if weighing_method_match:
|
||||||
shipment_info["weighing_method"] = self._clean_value(weighing_method_match.group(1))
|
shipment_info["weighing_method"] = self._clean_value(weighing_method_match.group(1))
|
||||||
|
|
||||||
# Extraction du nombre de balles
|
# Extraction du nombre de balles
|
||||||
bales_match = re.search(r'Total\s+Bales\s*:\s*(\d+)(?=\s|$)', text)
|
bales_match = re.search(r'Total\s+Bales\s*:\s*(\d+)(?=\s|$)', text)
|
||||||
if bales_match:
|
if bales_match:
|
||||||
@@ -204,9 +204,9 @@ class AHKParser:
|
|||||||
shipment_info["bales"] = int(bales_match.group(1).strip())
|
shipment_info["bales"] = int(bales_match.group(1).strip())
|
||||||
except ValueError:
|
except ValueError:
|
||||||
shipment_info["bales"] = None
|
shipment_info["bales"] = None
|
||||||
|
|
||||||
return shipment_info
|
return shipment_info
|
||||||
|
|
||||||
def _extract_weights_info(self, text):
|
def _extract_weights_info(self, text):
|
||||||
"""Extrait les informations de poids"""
|
"""Extrait les informations de poids"""
|
||||||
weights_info = {
|
weights_info = {
|
||||||
@@ -217,7 +217,7 @@ class AHKParser:
|
|||||||
"gain_loss_kg": None,
|
"gain_loss_kg": None,
|
||||||
"gain_loss_percent": None
|
"gain_loss_percent": None
|
||||||
}
|
}
|
||||||
|
|
||||||
# Extraction du poids brut débarqué (corrigé - doit être 100580 kg)
|
# Extraction du poids brut débarqué (corrigé - doit être 100580 kg)
|
||||||
gross_landed_match = re.search(r'LANDED WEIGHTS[\s\S]*?Gross\s*:\s*([\d.,]+)\s*kg', text)
|
gross_landed_match = re.search(r'LANDED WEIGHTS[\s\S]*?Gross\s*:\s*([\d.,]+)\s*kg', text)
|
||||||
if gross_landed_match:
|
if gross_landed_match:
|
||||||
@@ -225,7 +225,7 @@ class AHKParser:
|
|||||||
weights_info["gross_landed_kg"] = float(gross_landed_match.group(1).replace(',', '').strip())
|
weights_info["gross_landed_kg"] = float(gross_landed_match.group(1).replace(',', '').strip())
|
||||||
except ValueError:
|
except ValueError:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
# Extraction du poids de tare
|
# Extraction du poids de tare
|
||||||
tare_match = re.search(r'Tare\s*:\s*([\d.,]+)\s*kg', text)
|
tare_match = re.search(r'Tare\s*:\s*([\d.,]+)\s*kg', text)
|
||||||
if tare_match:
|
if tare_match:
|
||||||
@@ -233,7 +233,7 @@ class AHKParser:
|
|||||||
weights_info["tare_kg"] = float(tare_match.group(1).replace(',', '').strip())
|
weights_info["tare_kg"] = float(tare_match.group(1).replace(',', '').strip())
|
||||||
except ValueError:
|
except ValueError:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
# Extraction du poids net débarqué (corrigé - doit être 100078.40 kg)
|
# Extraction du poids net débarqué (corrigé - doit être 100078.40 kg)
|
||||||
net_landed_match = re.search(r'LANDED WEIGHTS[\s\S]*?Net\s*:\s*([\d.,]+)\s*kg', text)
|
net_landed_match = re.search(r'LANDED WEIGHTS[\s\S]*?Net\s*:\s*([\d.,]+)\s*kg', text)
|
||||||
if net_landed_match:
|
if net_landed_match:
|
||||||
@@ -241,7 +241,7 @@ class AHKParser:
|
|||||||
weights_info["net_landed_kg"] = float(net_landed_match.group(1).replace(',', '').strip())
|
weights_info["net_landed_kg"] = float(net_landed_match.group(1).replace(',', '').strip())
|
||||||
except ValueError:
|
except ValueError:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
# Extraction du poids net facturé (101299 kg)
|
# Extraction du poids net facturé (101299 kg)
|
||||||
invoice_net_match = re.search(r'INVOICE WEIGHTS[\s\S]*?Net\s*:\s*([\d.,]+)\s*kg', text)
|
invoice_net_match = re.search(r'INVOICE WEIGHTS[\s\S]*?Net\s*:\s*([\d.,]+)\s*kg', text)
|
||||||
if invoice_net_match:
|
if invoice_net_match:
|
||||||
@@ -249,7 +249,7 @@ class AHKParser:
|
|||||||
weights_info["invoice_net_kg"] = float(invoice_net_match.group(1).replace(',', '').strip())
|
weights_info["invoice_net_kg"] = float(invoice_net_match.group(1).replace(',', '').strip())
|
||||||
except ValueError:
|
except ValueError:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
# Extraction de la perte en kg
|
# Extraction de la perte en kg
|
||||||
loss_match = re.search(r'LOSS\s*:\s*-\s*([\d.,]+)\s*kg', text)
|
loss_match = re.search(r'LOSS\s*:\s*-\s*([\d.,]+)\s*kg', text)
|
||||||
if loss_match:
|
if loss_match:
|
||||||
@@ -257,7 +257,7 @@ class AHKParser:
|
|||||||
weights_info["gain_loss_kg"] = -float(loss_match.group(1).replace(',', '').strip())
|
weights_info["gain_loss_kg"] = -float(loss_match.group(1).replace(',', '').strip())
|
||||||
except ValueError:
|
except ValueError:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
# Extraction du pourcentage de perte
|
# Extraction du pourcentage de perte
|
||||||
percent_match = re.search(r'Percentage\s*:\s*-\s*([\d.,]+)%', text)
|
percent_match = re.search(r'Percentage\s*:\s*-\s*([\d.,]+)%', text)
|
||||||
if percent_match:
|
if percent_match:
|
||||||
@@ -265,20 +265,20 @@ class AHKParser:
|
|||||||
weights_info["gain_loss_percent"] = -float(percent_match.group(1).replace(',', '').strip())
|
weights_info["gain_loss_percent"] = -float(percent_match.group(1).replace(',', '').strip())
|
||||||
except ValueError:
|
except ValueError:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
return weights_info
|
return weights_info
|
||||||
|
|
||||||
import re
|
import re
|
||||||
|
|
||||||
class IntertekParser:
|
class IntertekParser:
|
||||||
lab = "Intertek"
|
lab = "Intertek"
|
||||||
|
|
||||||
def _clean_value(self, value):
|
def _clean_value(self, value):
|
||||||
"""Nettoie la valeur en supprimant les espaces inutiles"""
|
"""Nettoie la valeur en supprimant les espaces inutiles"""
|
||||||
if value:
|
if value:
|
||||||
return value.strip()
|
return value.strip()
|
||||||
return value
|
return value
|
||||||
|
|
||||||
def _extract_number(self, text, pattern, is_int=False):
|
def _extract_number(self, text, pattern, is_int=False):
|
||||||
"""Extrait un nombre (int ou float) du texte selon un pattern regex"""
|
"""Extrait un nombre (int ou float) du texte selon un pattern regex"""
|
||||||
match = re.search(pattern, text)
|
match = re.search(pattern, text)
|
||||||
@@ -293,7 +293,7 @@ class IntertekParser:
|
|||||||
except (ValueError, AttributeError):
|
except (ValueError, AttributeError):
|
||||||
return None
|
return None
|
||||||
return None
|
return None
|
||||||
|
|
||||||
def parse(self, text):
|
def parse(self, text):
|
||||||
"""Parse le texte et retourne un dictionnaire structuré"""
|
"""Parse le texte et retourne un dictionnaire structuré"""
|
||||||
result = {
|
result = {
|
||||||
@@ -305,7 +305,7 @@ class IntertekParser:
|
|||||||
"weights": self._extract_weights_info(text)
|
"weights": self._extract_weights_info(text)
|
||||||
}
|
}
|
||||||
return result
|
return result
|
||||||
|
|
||||||
def _extract_report_info(self, text):
|
def _extract_report_info(self, text):
|
||||||
"""Extrait les informations du rapport"""
|
"""Extrait les informations du rapport"""
|
||||||
report_info = {
|
report_info = {
|
||||||
@@ -313,24 +313,24 @@ class IntertekParser:
|
|||||||
"file_no": None,
|
"file_no": None,
|
||||||
"date": None
|
"date": None
|
||||||
}
|
}
|
||||||
|
|
||||||
# Recherche de la référence globale
|
# Recherche de la référence globale
|
||||||
ref_match = re.search(r'Global Ref\s*:\s*(GLO-\d+-[A-Z]+)', text)
|
ref_match = re.search(r'Global Ref\s*:\s*(GLO-\d+-[A-Z]+)', text)
|
||||||
if ref_match:
|
if ref_match:
|
||||||
report_info["reference"] = self._clean_value(ref_match.group(1))
|
report_info["reference"] = self._clean_value(ref_match.group(1))
|
||||||
|
|
||||||
# Recherche du numéro de fichier
|
# Recherche du numéro de fichier
|
||||||
file_no_match = re.search(r'Report\s*/\s*File No\s*:\s*([A-Z]+-AGR\d+-?)', text)
|
file_no_match = re.search(r'Report\s*/\s*File No\s*:\s*([A-Z]+-AGR\d+-?)', text)
|
||||||
if file_no_match:
|
if file_no_match:
|
||||||
report_info["file_no"] = self._clean_value(file_no_match.group(1))
|
report_info["file_no"] = self._clean_value(file_no_match.group(1))
|
||||||
|
|
||||||
# Recherche de la date du rapport
|
# Recherche de la date du rapport
|
||||||
date_match = re.search(r'Dated\s*:\s*(\d{1,2}\.\d{1,2}\.\d{4})', text)
|
date_match = re.search(r'Dated\s*:\s*(\d{1,2}\.\d{1,2}\.\d{4})', text)
|
||||||
if date_match:
|
if date_match:
|
||||||
report_info["date"] = self._clean_value(date_match.group(1))
|
report_info["date"] = self._clean_value(date_match.group(1))
|
||||||
|
|
||||||
return report_info
|
return report_info
|
||||||
|
|
||||||
def _extract_contract_info(self, text):
|
def _extract_contract_info(self, text):
|
||||||
"""Extrait les informations du contrat"""
|
"""Extrait les informations du contrat"""
|
||||||
contract_info = {
|
contract_info = {
|
||||||
@@ -340,17 +340,17 @@ class IntertekParser:
|
|||||||
"origin": None,
|
"origin": None,
|
||||||
"commodity": None
|
"commodity": None
|
||||||
}
|
}
|
||||||
|
|
||||||
# Extraction du numéro de contrat
|
# Extraction du numéro de contrat
|
||||||
contract_match = re.search(r'Contract No\s*:\s*([A-Z]?-\d+)', text)
|
contract_match = re.search(r'Contract No\s*:\s*([A-Z]?-\d+)', text)
|
||||||
if contract_match:
|
if contract_match:
|
||||||
contract_info["contract_no"] = self._clean_value(contract_match.group(1))
|
contract_info["contract_no"] = self._clean_value(contract_match.group(1))
|
||||||
|
|
||||||
# Extraction du numéro de facture
|
# Extraction du numéro de facture
|
||||||
invoice_match = re.search(r'Invoice No\s*:\s*(\d+)', text)
|
invoice_match = re.search(r'Invoice No\s*:\s*(\d+)', text)
|
||||||
if invoice_match:
|
if invoice_match:
|
||||||
contract_info["invoice_no"] = self._clean_value(invoice_match.group(1))
|
contract_info["invoice_no"] = self._clean_value(invoice_match.group(1))
|
||||||
|
|
||||||
# Extraction de l'origine et de la marchandise
|
# Extraction de l'origine et de la marchandise
|
||||||
growth_match = re.search(r'Growth\s*:\s*([A-Z\s]+)(?=\s*Shipper|\n|$)', text)
|
growth_match = re.search(r'Growth\s*:\s*([A-Z\s]+)(?=\s*Shipper|\n|$)', text)
|
||||||
if growth_match:
|
if growth_match:
|
||||||
@@ -358,9 +358,9 @@ class IntertekParser:
|
|||||||
if "GREECE" in origin_text.upper():
|
if "GREECE" in origin_text.upper():
|
||||||
contract_info["origin"] = "GREECE"
|
contract_info["origin"] = "GREECE"
|
||||||
contract_info["commodity"] = "RAW COTTON"
|
contract_info["commodity"] = "RAW COTTON"
|
||||||
|
|
||||||
return contract_info
|
return contract_info
|
||||||
|
|
||||||
def _extract_parties_info(self, text):
|
def _extract_parties_info(self, text):
|
||||||
"""Extrait les informations sur les parties"""
|
"""Extrait les informations sur les parties"""
|
||||||
parties_info = {
|
parties_info = {
|
||||||
@@ -368,24 +368,24 @@ class IntertekParser:
|
|||||||
"buyer": None,
|
"buyer": None,
|
||||||
"carrier": None
|
"carrier": None
|
||||||
}
|
}
|
||||||
|
|
||||||
# Extraction du vendeur (Shipper)
|
# Extraction du vendeur (Shipper)
|
||||||
seller_match = re.search(r'Shipper\s*:\s*([^\n]+?)(?=\s*(?:Buyer|$))', text)
|
seller_match = re.search(r'Shipper\s*:\s*([^\n]+?)(?=\s*(?:Buyer|$))', text)
|
||||||
if seller_match:
|
if seller_match:
|
||||||
parties_info["seller"] = self._clean_value(seller_match.group(1))
|
parties_info["seller"] = self._clean_value(seller_match.group(1))
|
||||||
|
|
||||||
# Extraction de l'acheteur (Buyer)
|
# Extraction de l'acheteur (Buyer)
|
||||||
buyer_match = re.search(r'Buyer\s*:\s*([^\n]+?)(?=\s*(?:CONTAINER|TOTAL|$))', text)
|
buyer_match = re.search(r'Buyer\s*:\s*([^\n]+?)(?=\s*(?:CONTAINER|TOTAL|$))', text)
|
||||||
if buyer_match:
|
if buyer_match:
|
||||||
parties_info["buyer"] = self._clean_value(buyer_match.group(1))
|
parties_info["buyer"] = self._clean_value(buyer_match.group(1))
|
||||||
|
|
||||||
# Extraction du transporteur (nom du navire seulement)
|
# Extraction du transporteur (nom du navire seulement)
|
||||||
vessel_match = re.search(r'Vessel\s*:\s*([A-Z\s]+?)(?=\s*(?:Arrival|$))', text)
|
vessel_match = re.search(r'Vessel\s*:\s*([A-Z\s]+?)(?=\s*(?:Arrival|$))', text)
|
||||||
if vessel_match:
|
if vessel_match:
|
||||||
parties_info["carrier"] = self._clean_value(vessel_match.group(1))
|
parties_info["carrier"] = self._clean_value(vessel_match.group(1))
|
||||||
|
|
||||||
return parties_info
|
return parties_info
|
||||||
|
|
||||||
def _extract_shipment_info(self, text):
|
def _extract_shipment_info(self, text):
|
||||||
"""Extrait les informations d'expédition"""
|
"""Extrait les informations d'expédition"""
|
||||||
shipment_info = {
|
shipment_info = {
|
||||||
@@ -399,27 +399,27 @@ class IntertekParser:
|
|||||||
"weighing_method": None,
|
"weighing_method": None,
|
||||||
"bales": None
|
"bales": None
|
||||||
}
|
}
|
||||||
|
|
||||||
# Extraction du navire
|
# Extraction du navire
|
||||||
vessel_match = re.search(r'Vessel\s*:\s*([A-Z\s]+?)(?=\s*(?:Arrival|$))', text)
|
vessel_match = re.search(r'Vessel\s*:\s*([A-Z\s]+?)(?=\s*(?:Arrival|$))', text)
|
||||||
if vessel_match:
|
if vessel_match:
|
||||||
shipment_info["vessel"] = self._clean_value(vessel_match.group(1))
|
shipment_info["vessel"] = self._clean_value(vessel_match.group(1))
|
||||||
|
|
||||||
# Extraction du numéro de connaissement
|
# Extraction du numéro de connaissement
|
||||||
bl_no_match = re.search(r'B/L\s+No\.\s*:\s*([A-Z0-9]+)', text)
|
bl_no_match = re.search(r'B/L\s+No\.\s*:\s*([A-Z0-9]+)', text)
|
||||||
if bl_no_match:
|
if bl_no_match:
|
||||||
shipment_info["bl_no"] = self._clean_value(bl_no_match.group(1))
|
shipment_info["bl_no"] = self._clean_value(bl_no_match.group(1))
|
||||||
|
|
||||||
# Extraction de la date d'arrivée
|
# Extraction de la date d'arrivée
|
||||||
arrival_match = re.search(r'Arrival Date\s*:\s*(\d{1,2}\.\d{1,2}\.\d{4})', text)
|
arrival_match = re.search(r'Arrival Date\s*:\s*(\d{1,2}\.\d{1,2}\.\d{4})', text)
|
||||||
if arrival_match:
|
if arrival_match:
|
||||||
shipment_info["arrival_date"] = self._clean_value(arrival_match.group(1))
|
shipment_info["arrival_date"] = self._clean_value(arrival_match.group(1))
|
||||||
|
|
||||||
# Extraction du lieu de pesée
|
# Extraction du lieu de pesée
|
||||||
weighing_place_match = re.search(r'Weighed at\s*:\s*([^\n]+?)(?=\s*(?:Vessel|$))', text)
|
weighing_place_match = re.search(r'Weighed at\s*:\s*([^\n]+?)(?=\s*(?:Vessel|$))', text)
|
||||||
if weighing_place_match:
|
if weighing_place_match:
|
||||||
shipment_info["weighing_place"] = self._clean_value(weighing_place_match.group(1))
|
shipment_info["weighing_place"] = self._clean_value(weighing_place_match.group(1))
|
||||||
|
|
||||||
# Extraction de la méthode de pesée
|
# Extraction de la méthode de pesée
|
||||||
# Recherche dans les remarques
|
# Recherche dans les remarques
|
||||||
remarks_section = re.search(r'REMARKS\s*(.+?)(?=ISSUED BY|$)', text, re.DOTALL | re.IGNORECASE)
|
remarks_section = re.search(r'REMARKS\s*(.+?)(?=ISSUED BY|$)', text, re.DOTALL | re.IGNORECASE)
|
||||||
@@ -427,22 +427,22 @@ class IntertekParser:
|
|||||||
remarks_text = remarks_section.group(1)
|
remarks_text = remarks_section.group(1)
|
||||||
if "weighbridge" in remarks_text.lower():
|
if "weighbridge" in remarks_text.lower():
|
||||||
shipment_info["weighing_method"] = "Weighbridge weighing by empty/full truck"
|
shipment_info["weighing_method"] = "Weighbridge weighing by empty/full truck"
|
||||||
|
|
||||||
# Extraction du nombre de balles (à partir du total)
|
# Extraction du nombre de balles (à partir du total)
|
||||||
bales_match = re.search(r'TOTAL\s+(\d{1,4}(?:,\d{3})?)\s+[\d,]+\.\d{2}', text)
|
bales_match = re.search(r'TOTAL\s+(\d{1,4}(?:,\d{3})?)\s+[\d,]+\.\d{2}', text)
|
||||||
if not bales_match:
|
if not bales_match:
|
||||||
# Essayons une autre approche
|
# Essayons une autre approche
|
||||||
bales_match = re.search(r'Invoice Quantity\s*:\s*(\d+)\s+Bales', text)
|
bales_match = re.search(r'Invoice Quantity\s*:\s*(\d+)\s+Bales', text)
|
||||||
|
|
||||||
if bales_match:
|
if bales_match:
|
||||||
try:
|
try:
|
||||||
bales_str = bales_match.group(1).replace(',', '').strip()
|
bales_str = bales_match.group(1).replace(',', '').strip()
|
||||||
shipment_info["bales"] = int(bales_str)
|
shipment_info["bales"] = int(bales_str)
|
||||||
except ValueError:
|
except ValueError:
|
||||||
shipment_info["bales"] = None
|
shipment_info["bales"] = None
|
||||||
|
|
||||||
return shipment_info
|
return shipment_info
|
||||||
|
|
||||||
def _extract_weights_info(self, text):
|
def _extract_weights_info(self, text):
|
||||||
"""Extrait les informations de poids"""
|
"""Extrait les informations de poids"""
|
||||||
weights_info = {
|
weights_info = {
|
||||||
@@ -453,32 +453,32 @@ class IntertekParser:
|
|||||||
"gain_loss_kg": None,
|
"gain_loss_kg": None,
|
||||||
"gain_loss_percent": None
|
"gain_loss_percent": None
|
||||||
}
|
}
|
||||||
|
|
||||||
# Extraction du poids brut débarqué
|
# Extraction du poids brut débarqué
|
||||||
gross_match = re.search(r'Gross Landed Weight\s*:\s*([\d,]+\.\d{2})\s*kgs', text)
|
gross_match = re.search(r'Gross Landed Weight\s*:\s*([\d,]+\.\d{2})\s*kgs', text)
|
||||||
if gross_match:
|
if gross_match:
|
||||||
weights_info["gross_landed_kg"] = float(gross_match.group(1).replace(',', ''))
|
weights_info["gross_landed_kg"] = float(gross_match.group(1).replace(',', ''))
|
||||||
|
|
||||||
# Extraction du poids de tare
|
# Extraction du poids de tare
|
||||||
tare_match = re.search(r'Invoice Tare\s*:\s*([\d,]+\.\d{2})\s*Kgs', text)
|
tare_match = re.search(r'Invoice Tare\s*:\s*([\d,]+\.\d{2})\s*Kgs', text)
|
||||||
if tare_match:
|
if tare_match:
|
||||||
weights_info["tare_kg"] = float(tare_match.group(1).replace(',', ''))
|
weights_info["tare_kg"] = float(tare_match.group(1).replace(',', ''))
|
||||||
|
|
||||||
# Extraction du poids net débarqué
|
# Extraction du poids net débarqué
|
||||||
net_landed_match = re.search(r'Net Landed Weight\s*:\s*([\d,]+\.\d{2})\s*Kgs', text)
|
net_landed_match = re.search(r'Net Landed Weight\s*:\s*([\d,]+\.\d{2})\s*Kgs', text)
|
||||||
if net_landed_match:
|
if net_landed_match:
|
||||||
weights_info["net_landed_kg"] = float(net_landed_match.group(1).replace(',', ''))
|
weights_info["net_landed_kg"] = float(net_landed_match.group(1).replace(',', ''))
|
||||||
|
|
||||||
# Extraction du poids net facturé
|
# Extraction du poids net facturé
|
||||||
invoice_net_match = re.search(r'Net Invoice Weight\s*:\s*([\d,]+\.\d{2})\s*Kgs', text)
|
invoice_net_match = re.search(r'Net Invoice Weight\s*:\s*([\d,]+\.\d{2})\s*Kgs', text)
|
||||||
if invoice_net_match:
|
if invoice_net_match:
|
||||||
weights_info["invoice_net_kg"] = float(invoice_net_match.group(1).replace(',', ''))
|
weights_info["invoice_net_kg"] = float(invoice_net_match.group(1).replace(',', ''))
|
||||||
|
|
||||||
# Extraction du gain en kg
|
# Extraction du gain en kg
|
||||||
gain_match = re.search(r'Gain\s+([\d,]+\.\d{2})\s*Kgs', text)
|
gain_match = re.search(r'Gain\s+([\d,]+\.\d{2})\s*Kgs', text)
|
||||||
if gain_match:
|
if gain_match:
|
||||||
weights_info["gain_loss_kg"] = float(gain_match.group(1).replace(',', ''))
|
weights_info["gain_loss_kg"] = float(gain_match.group(1).replace(',', ''))
|
||||||
|
|
||||||
# Extraction du pourcentage de gain (0.4% dans le tableau)
|
# Extraction du pourcentage de gain (0.4% dans le tableau)
|
||||||
percent_match = re.search(r'TOTAL\s+\d+\s+[\d,]+\.\d{2}\s+([\d.]+)%', text)
|
percent_match = re.search(r'TOTAL\s+\d+\s+[\d,]+\.\d{2}\s+([\d.]+)%', text)
|
||||||
if percent_match:
|
if percent_match:
|
||||||
@@ -486,7 +486,7 @@ class IntertekParser:
|
|||||||
weights_info["gain_loss_percent"] = float(percent_match.group(1))
|
weights_info["gain_loss_percent"] = float(percent_match.group(1))
|
||||||
except ValueError:
|
except ValueError:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
return weights_info
|
return weights_info
|
||||||
|
|
||||||
# Configure root logger explicitly
|
# Configure root logger explicitly
|
||||||
@@ -639,43 +639,43 @@ async def ocr(file: UploadFile):
|
|||||||
Smart PDF processing optimized for cotton landing reports
|
Smart PDF processing optimized for cotton landing reports
|
||||||
"""
|
"""
|
||||||
logger.info(f"Smart OCR request: {file.filename}")
|
logger.info(f"Smart OCR request: {file.filename}")
|
||||||
|
|
||||||
try:
|
try:
|
||||||
file_data = await file.read()
|
file_data = await file.read()
|
||||||
|
|
||||||
# Strategy 1: Try pdfplumber (best for digital PDFs)
|
# Strategy 1: Try pdfplumber (best for digital PDFs)
|
||||||
try:
|
try:
|
||||||
with pdfplumber.open(io.BytesIO(file_data)) as pdf:
|
with pdfplumber.open(io.BytesIO(file_data)) as pdf:
|
||||||
text_parts = []
|
text_parts = []
|
||||||
tables_found = []
|
tables_found = []
|
||||||
|
|
||||||
for page in pdf.pages:
|
for page in pdf.pages:
|
||||||
# Extract text
|
# Extract text
|
||||||
page_text = page.extract_text(x_tolerance=2, y_tolerance=2)
|
page_text = page.extract_text(x_tolerance=2, y_tolerance=2)
|
||||||
if page_text:
|
if page_text:
|
||||||
text_parts.append(page_text)
|
text_parts.append(page_text)
|
||||||
|
|
||||||
# Look for tables (common in landing reports)
|
# Look for tables (common in landing reports)
|
||||||
tables = page.extract_tables({
|
tables = page.extract_tables({
|
||||||
"vertical_strategy": "text",
|
"vertical_strategy": "text",
|
||||||
"horizontal_strategy": "text",
|
"horizontal_strategy": "text",
|
||||||
"snap_tolerance": 5,
|
"snap_tolerance": 5,
|
||||||
})
|
})
|
||||||
|
|
||||||
for table in tables:
|
for table in tables:
|
||||||
if table and len(table) > 1:
|
if table and len(table) > 1:
|
||||||
tables_found.append(table)
|
tables_found.append(table)
|
||||||
|
|
||||||
combined_text = "\n".join(text_parts)
|
combined_text = "\n".join(text_parts)
|
||||||
return {"ocr_text": combined_text}
|
return {"ocr_text": combined_text}
|
||||||
# if combined_text.strip():
|
# if combined_text.strip():
|
||||||
# logger.info(f"pdfplumber extracted {len(combined_text)} chars")
|
# logger.info(f"pdfplumber extracted {len(combined_text)} chars")
|
||||||
|
|
||||||
# # Try parsing structured data
|
# # Try parsing structured data
|
||||||
# structured_data = parse_cotton_report(combined_text)
|
# structured_data = parse_cotton_report(combined_text)
|
||||||
|
|
||||||
# # Check if we got key fields
|
# # Check if we got key fields
|
||||||
# if (structured_data.get("shipment", {}).get("bales") and
|
# if (structured_data.get("shipment", {}).get("bales") and
|
||||||
# structured_data.get("weights", {}).get("net_landed_kg")):
|
# structured_data.get("weights", {}).get("net_landed_kg")):
|
||||||
# logger.info("Successfully parsed structured data from pdfplumber")
|
# logger.info("Successfully parsed structured data from pdfplumber")
|
||||||
# return {
|
# return {
|
||||||
@@ -683,13 +683,13 @@ async def ocr(file: UploadFile):
|
|||||||
# "structured_data": structured_data,
|
# "structured_data": structured_data,
|
||||||
# "raw_text_sample": combined_text[:500]
|
# "raw_text_sample": combined_text[:500]
|
||||||
# }
|
# }
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.warning(f"pdfplumber attempt: {e}")
|
logger.warning(f"pdfplumber attempt: {e}")
|
||||||
|
|
||||||
# from pdf2image import convert_from_bytes
|
# from pdf2image import convert_from_bytes
|
||||||
# images = convert_from_bytes(file_data, dpi=200)
|
# images = convert_from_bytes(file_data, dpi=200)
|
||||||
|
|
||||||
# ocr_results = []
|
# ocr_results = []
|
||||||
# for img in images:
|
# for img in images:
|
||||||
# text = pytesseract.image_to_string(
|
# text = pytesseract.image_to_string(
|
||||||
@@ -697,15 +697,15 @@ async def ocr(file: UploadFile):
|
|||||||
# config='--psm 6 -c preserve_interword_spaces=1'
|
# config='--psm 6 -c preserve_interword_spaces=1'
|
||||||
# )
|
# )
|
||||||
# ocr_results.append(text)
|
# ocr_results.append(text)
|
||||||
|
|
||||||
# ocr_text = "\n".join(ocr_results)
|
# ocr_text = "\n".join(ocr_results)
|
||||||
|
|
||||||
# return {
|
# return {
|
||||||
# "method": "tesseract_ocr",
|
# "method": "tesseract_ocr",
|
||||||
# "structured_data": ocr_text,
|
# "structured_data": ocr_text,
|
||||||
# "raw_text_sample": ocr_text[:500]
|
# "raw_text_sample": ocr_text[:500]
|
||||||
# }
|
# }
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"Smart OCR failed: {e}", exc_info=True)
|
logger.error(f"Smart OCR failed: {e}", exc_info=True)
|
||||||
return {
|
return {
|
||||||
@@ -928,10 +928,10 @@ def detect_template(text):
|
|||||||
|
|
||||||
if "robertson international" in t or "ri ref no" in t:
|
if "robertson international" in t or "ri ref no" in t:
|
||||||
return "ROBERTSON"
|
return "ROBERTSON"
|
||||||
|
|
||||||
if "landing report" in t and "carcon cargo" in t:
|
if "landing report" in t and "carcon cargo" in t:
|
||||||
return "CARGO CONTROL"
|
return "CARGO CONTROL"
|
||||||
|
|
||||||
if "pacific inspection company" in t or "picl-bd.com" in t:
|
if "pacific inspection company" in t or "picl-bd.com" in t:
|
||||||
return "PICL"
|
return "PICL"
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user