diff --git a/app.py b/app.py index 2a10004..d72fe55 100644 --- a/app.py +++ b/app.py @@ -243,326 +243,227 @@ class AHKParser: pass return weights_info -# class AHKParser: -# lab="AHK" - -# def parse(self, text): -# """Parse le texte et retourne un dictionnaire structuré""" -# result = { -# "lab": self.lab, -# "report": self._extract_report_info(text), -# "contract": self._extract_contract_info(text), -# "parties": self._extract_parties_info(text), -# "shipment": self._extract_shipment_info(text), -# "weights": self._extract_weights_info(text) -# } -# self.data = result -# return result - -# def _extract_report_info(self, text): -# """Extrait les informations du rapport""" -# report_info = { -# "reference": None, -# "file_no": None, -# "date": None -# } - -# # Recherche de la référence client -# ref_match = re.search(r'Client Reference:\s*(S-\d+/\s*INV\s*\d+)', text) -# if ref_match: -# report_info["reference"] = ref_match.group(1).strip() - -# # Recherche du numéro de fichier AHK -# file_no_match = re.search(r'AHK\s*S/([\w/]+)', text) -# if file_no_match: -# report_info["file_no"] = file_no_match.group(1).strip() - -# # Recherche de la date du rapport -# date_match = re.search(r'Signed on\s*(\d{1,2}-[A-Za-z]{3}-\d{4})', text) -# if date_match: -# report_info["date"] = date_match.group(1).strip() - -# return report_info - -# def _extract_contract_info(self, text): -# """Extrait les informations du contrat""" -# contract_info = { -# "contract_no": None, -# "invoice_no": None, -# "lc_no": None, -# "origin": None, -# "commodity": None -# } - -# # Extraction de la référence client (peut servir comme numéro de contrat) -# ref_match = re.search(r'Client Reference:\s*(S-\d+/\s*INV\s*\d+)', text) -# if ref_match: -# ref_parts = ref_match.group(1).split('/') -# if len(ref_parts) >= 2: -# contract_info["contract_no"] = ref_parts[0].strip() -# contract_info["invoice_no"] = ref_parts[1].strip() - -# # Extraction de l'origine et de la marchandise -# origin_match = 
re.search(r'Growth\s*:\s*([\w\s]+)', text) -# if origin_match: -# origin_text = origin_match.group(1).strip() -# if "AUSTRALIAN" in origin_text.upper(): -# contract_info["origin"] = "AUSTRALIA" -# # La marchandise est généralement "RAW COTTON" -# contract_info["commodity"] = "RAW COTTON" - -# return contract_info - -# def _extract_parties_info(self, text): -# """Extrait les informations sur les parties""" -# parties_info = { -# "seller": None, -# "buyer": None, -# "carrier": None -# } - -# # Extraction du vendeur (Client) -# seller_match = re.search(r'Client\s*:\s*([^\n]+)', text) -# if seller_match: -# parties_info["seller"] = seller_match.group(1).strip() - -# # Extraction de l'acheteur (Buyer) -# buyer_match = re.search(r'Buyer\s*:\s*([^\n]+)', text) -# if buyer_match: -# parties_info["buyer"] = buyer_match.group(1).strip() - -# # Extraction du transporteur (Vessel) -# vessel_match = re.search(r'Vessel\s*:\s*([^\n]+)', text) -# if vessel_match: -# # On considère le nom du navire comme transporteur -# parties_info["carrier"] = vessel_match.group(1).strip() - -# return parties_info - -# def _extract_shipment_info(self, text): -# """Extrait les informations d'expédition""" -# shipment_info = { -# "vessel": None, -# "bl_no": None, -# "bl_date": None, -# "port_loading": None, # Non spécifié dans le texte -# "port_destination": None, -# "arrival_date": None, -# "weighing_place": None, # Non spécifié dans le texte -# "weighing_method": None, -# "bales": None -# } - -# # Extraction du navire -# vessel_match = re.search(r'Vessel\s*:\s*([^\n]+)', text) -# if vessel_match: -# shipment_info["vessel"] = vessel_match.group(1).strip() - -# # Extraction du numéro de connaissement -# bl_no_match = re.search(r'B/L No\.\s*:\s*([^\n]+)', text) -# if bl_no_match: -# shipment_info["bl_no"] = bl_no_match.group(1).strip() - -# # Extraction de la date du connaissement -# bl_date_match = re.search(r'B/L Date\s*:\s*(\d{1,2}-[A-Za-z]{3}-\d{4})', text) -# if bl_date_match: -# 
shipment_info["bl_date"] = bl_date_match.group(1).strip() - -# # Extraction du port de destination -# dest_match = re.search(r'Destination\s*:\s*([^\n]+)', text) -# if dest_match: -# shipment_info["port_destination"] = dest_match.group(1).strip() - -# # Extraction de la date d'arrivée -# arrival_match = re.search(r'Arrival Date\s*:\s*(\d{1,2}-[A-Za-z]{3}-\d{4})', text) -# if arrival_match: -# shipment_info["arrival_date"] = arrival_match.group(1).strip() - -# # Extraction de la méthode de pesée -# weighing_method_match = re.search(r'Weighing method\s*:\s*([^\n]+)', text) -# if weighing_method_match: -# shipment_info["weighing_method"] = weighing_method_match.group(1).strip() - -# # Extraction du nombre de balles -# bales_match = re.search(r'Total Bales\s*:\s*(\d+)', text) -# if bales_match: -# shipment_info["bales"] = int(bales_match.group(1).strip()) - -# return shipment_info - -# def _extract_weights_info(self, text): -# """Extrait les informations de poids""" -# weights_info = { -# "gross_landed_kg": None, -# "tare_kg": None, -# "net_landed_kg": None, -# "invoice_net_kg": None, -# "gain_loss_kg": None, -# "gain_loss_percent": None -# } - -# # Extraction du poids brut débarqué -# gross_landed_match = re.search(r'LANDED WEIGHTS[\s\S]*?Gross\s*:\s*([\d.]+)\s*kg', text) -# if gross_landed_match: -# weights_info["gross_landed_kg"] = float(gross_landed_match.group(1).strip()) - -# # Extraction du poids de tare -# tare_match = re.search(r'Tare\s*:\s*([\d.]+)\s*kg', text) -# if tare_match: -# weights_info["tare_kg"] = float(tare_match.group(1).strip()) - -# # Extraction du poids net débarqué -# net_landed_match = re.search(r'LANDED WEIGHTS[\s\S]*?Net\s*:\s*([\d.]+)\s*kg', text) -# if net_landed_match: -# weights_info["net_landed_kg"] = float(net_landed_match.group(1).strip()) - -# # Extraction du poids net facturé -# invoice_net_match = re.search(r'INVOICE WEIGHTS[\s\S]*?Net\s*:\s*([\d.]+)\s*kg', text) -# if invoice_net_match: -# weights_info["invoice_net_kg"] = 
import re


class IntertekParser:
    """Parse the plain text of an Intertek weight report into a structured dict.

    ``parse`` returns a dictionary with the keys ``lab``, ``report``,
    ``contract``, ``parties``, ``shipment`` and ``weights``. Every field that
    cannot be located in the text is left as ``None``.
    """

    lab = "Intertek"

    # Bale count as printed in the TOTAL row: plain digits or with thousands
    # separators ("1200" or "1,200"). Shared by the bale and the percentage
    # extraction so both accept the same rows.
    _BALE_COUNT = r'\d+(?:,\d{3})*'

    # The vessel name feeds both parties["carrier"] and shipment["vessel"].
    # MULTILINE lets ``$`` end the value at the end of its own line instead of
    # only at the very end of the text.
    _VESSEL_RE = re.compile(
        r'Vessel\s*:\s*([A-Z\s]+?)(?=\s*(?:Arrival|$))', re.MULTILINE
    )

    def _clean_value(self, value):
        """Return ``value`` stripped of surrounding whitespace.

        Falsy values (``None`` or the empty string) are returned unchanged.
        """
        return value.strip() if value else value

    def _extract_number(self, text, pattern, is_int=False, flags=0):
        """Extract the number captured by group 1 of ``pattern`` in ``text``.

        Args:
            text: Text to search.
            pattern: Regex whose first group captures the numeric string.
            is_int: Convert to ``int`` when true, otherwise to ``float``.
            flags: Optional ``re`` flags forwarded to ``re.search``.

        Returns:
            The parsed number, or ``None`` when the pattern does not match or
            the captured text is not a valid number.
        """
        match = re.search(pattern, text, flags)
        if not match:
            return None
        try:
            # Drop thousands separators and stray spaces before converting.
            num_str = match.group(1).replace(',', '').replace(' ', '').strip()
            return int(num_str) if is_int else float(num_str)
        except (ValueError, AttributeError):
            # AttributeError: group 1 was optional and did not participate.
            return None

    def _search_value(self, text, pattern, flags=0):
        """Return the cleaned group 1 of ``pattern`` in ``text``, or ``None``."""
        match = re.search(pattern, text, flags)
        return self._clean_value(match.group(1)) if match else None

    def _extract_vessel(self, text):
        """Return the vessel name found after the ``Vessel :`` label, or ``None``."""
        match = self._VESSEL_RE.search(text)
        return self._clean_value(match.group(1)) if match else None

    def parse(self, text):
        """Parse ``text`` and return the structured report dictionary."""
        return {
            "lab": self.lab,
            "report": self._extract_report_info(text),
            "contract": self._extract_contract_info(text),
            "parties": self._extract_parties_info(text),
            "shipment": self._extract_shipment_info(text),
            "weights": self._extract_weights_info(text),
        }

    def _extract_report_info(self, text):
        """Extract the report reference, file number and date."""
        return {
            # Global reference, e.g. "GLO-<digits>-<LETTERS>".
            "reference": self._search_value(
                text, r'Global Ref\s*:\s*(GLO-\d+-[A-Z]+)'
            ),
            "file_no": self._search_value(
                text, r'Report\s*/\s*File No\s*:\s*([A-Z]+-AGR\d+-?)'
            ),
            # Report date in dd.mm.yyyy form.
            "date": self._search_value(
                text, r'Dated\s*:\s*(\d{1,2}\.\d{1,2}\.\d{4})'
            ),
        }

    def _extract_contract_info(self, text):
        """Extract contract number, invoice number, origin and commodity."""
        contract_info = {
            "contract_no": self._search_value(
                text, r'Contract No\s*:\s*([A-Z]?-\d+)'
            ),
            "invoice_no": self._search_value(text, r'Invoice No\s*:\s*(\d+)'),
            "lc_no": None,  # not extracted by this parser
            "origin": None,
            "commodity": None,
        }

        growth_match = re.search(
            r'Growth\s*:\s*([A-Z\s]+)(?=\s*Shipper|\n|$)', text
        )
        if growth_match:
            origin_text = growth_match.group(1).strip()
            # NOTE(review): only Greek growth is recognised as an origin; any
            # other growth leaves "origin" as None.
            if "GREECE" in origin_text.upper():
                contract_info["origin"] = "GREECE"
            # A Growth line is taken to mean the commodity is raw cotton.
            contract_info["commodity"] = "RAW COTTON"

        return contract_info

    def _extract_parties_info(self, text):
        """Extract seller (Shipper), buyer and carrier (vessel name).

        Each value ends at the next known label on the same line or at the
        end of its line (``re.MULTILINE``), so a value is still captured when
        the following line starts with an unexpected label.
        """
        return {
            "seller": self._search_value(
                text,
                r'Shipper\s*:\s*([^\n]+?)(?=\s*(?:Buyer|$))',
                re.MULTILINE,
            ),
            "buyer": self._search_value(
                text,
                r'Buyer\s*:\s*([^\n]+?)(?=\s*(?:CONTAINER|TOTAL|$))',
                re.MULTILINE,
            ),
            # The vessel name is reported as the carrier.
            "carrier": self._extract_vessel(text),
        }

    def _extract_shipment_info(self, text):
        """Extract vessel, B/L number, arrival date, weighing details and bales."""
        shipment_info = {
            "vessel": self._extract_vessel(text),
            "bl_no": self._search_value(text, r'B/L\s+No\.\s*:\s*([A-Z0-9]+)'),
            "bl_date": None,  # not extracted by this parser
            "port_loading": None,  # not extracted by this parser
            "port_destination": None,  # not extracted by this parser
            "arrival_date": self._search_value(
                text, r'Arrival Date\s*:\s*(\d{1,2}\.\d{1,2}\.\d{4})'
            ),
            "weighing_place": self._search_value(
                text,
                r'Weighed at\s*:\s*([^\n]+?)(?=\s*(?:Vessel|$))',
                re.MULTILINE,
            ),
            "weighing_method": None,
            "bales": None,
        }

        # The weighing method is inferred from the REMARKS section.
        remarks_section = re.search(
            r'REMARKS\s*(.+?)(?=ISSUED BY|$)', text, re.DOTALL | re.IGNORECASE
        )
        if remarks_section and "weighbridge" in remarks_section.group(1).lower():
            shipment_info["weighing_method"] = (
                "Weighbridge weighing by empty/full truck"
            )

        # Bale count: prefer the TOTAL row, fall back to "Invoice Quantity".
        bales = self._extract_number(
            text,
            r'TOTAL\s+(' + self._BALE_COUNT + r')\s+[\d,]+\.\d{2}',
            is_int=True,
        )
        if bales is None:
            bales = self._extract_number(
                text, r'Invoice Quantity\s*:\s*(\d+)\s+Bales', is_int=True
            )
        shipment_info["bales"] = bales

        return shipment_info

    def _extract_weights_info(self, text):
        """Extract landed/invoice weights and the gain or loss.

        Units are matched case-insensitively (``kgs``/``Kgs``/``KGS``). A
        ``Loss`` line is reported as a negative ``gain_loss_kg`` and the
        percentage is given the same sign.
        """
        weight = r'\s*:\s*([\d,]+\.\d{2})\s*kgs'
        weights_info = {
            "gross_landed_kg": self._extract_number(
                text, r'Gross Landed Weight' + weight, flags=re.IGNORECASE
            ),
            "tare_kg": self._extract_number(
                text, r'Invoice Tare' + weight, flags=re.IGNORECASE
            ),
            "net_landed_kg": self._extract_number(
                text, r'Net Landed Weight' + weight, flags=re.IGNORECASE
            ),
            "invoice_net_kg": self._extract_number(
                text, r'Net Invoice Weight' + weight, flags=re.IGNORECASE
            ),
            "gain_loss_kg": None,
            "gain_loss_percent": None,
        }

        # Gain is positive, loss is negative.
        is_loss = False
        gain_loss = re.search(
            r'\b(Gain|Loss)\s+([\d,]+\.\d{2})\s*Kgs', text, re.IGNORECASE
        )
        if gain_loss:
            amount = float(gain_loss.group(2).replace(',', ''))
            is_loss = gain_loss.group(1).lower() == "loss"
            weights_info["gain_loss_kg"] = -amount if is_loss else amount

        # Percentage printed at the end of the TOTAL row of the table.
        percent = self._extract_number(
            text,
            r'TOTAL\s+' + self._BALE_COUNT
            + r'\s+[\d,]+\.\d{2}\s+([-+]?[\d.]+)\s*%',
        )
        if percent is not None and is_loss and percent > 0:
            # Keep the percentage sign consistent with the kg figure.
            percent = -percent
        weights_info["gain_loss_percent"] = percent

        return weights_info
"INTERTEK": IntertekParser() } def empty_weight_report(lab):