diff --git a/app.py b/app.py index f1f6d83..2a10004 100644 --- a/app.py +++ b/app.py @@ -11,6 +11,7 @@ import logging import io from logging.handlers import RotatingFileHandler import re +from datetime import datetime LOG_PATH = "/var/log/automation-service.log" @@ -24,11 +25,14 @@ file_handler.setFormatter(logging.Formatter( "%(asctime)s - %(levelname)s - %(name)s - %(message)s" )) -import re -from datetime import datetime - class AHKParser: - lab="AHK" + lab = "AHK" + + def _clean_value(self, value): + """Nettoie la valeur en supprimant les espaces inutiles""" + if value: + return value.strip() + return value def parse(self, text): """Parse le texte et retourne un dictionnaire structuré""" @@ -51,20 +55,20 @@ class AHKParser: "date": None } - # Recherche de la référence client - ref_match = re.search(r'Client Reference:\s*(S-\d+/\s*INV\s*\d+)', text) + # Recherche de la référence client - plus précise + ref_match = re.search(r'Client\s+Reference:\s*(S-\d+\s*/\s*INV\s*\d+)', text) if ref_match: - report_info["reference"] = ref_match.group(1).strip() + report_info["reference"] = self._clean_value(ref_match.group(1)) # Recherche du numéro de fichier AHK - file_no_match = re.search(r'AHK\s*S/([\w/]+)', text) + file_no_match = re.search(r'AHK\s+S/([\w/]+)', text) if file_no_match: - report_info["file_no"] = file_no_match.group(1).strip() + report_info["file_no"] = self._clean_value(file_no_match.group(1)) # Recherche de la date du rapport - date_match = re.search(r'Signed on\s*(\d{1,2}-[A-Za-z]{3}-\d{4})', text) + date_match = re.search(r'Signed\s+on\s*(\d{1,2}-[A-Za-z]{3}-\d{4})', text) if date_match: - report_info["date"] = date_match.group(1).strip() + report_info["date"] = self._clean_value(date_match.group(1)) return report_info @@ -78,21 +82,24 @@ class AHKParser: "commodity": None } - # Extraction de la référence client (peut servir comme numéro de contrat) - ref_match = re.search(r'Client Reference:\s*(S-\d+/\s*INV\s*\d+)', text) + # Extraction de la référence client + ref_match = re.search(r'Client\s+Ref\s+No\.\s*:\s*([^\n]+)', text) if ref_match: - ref_parts = ref_match.group(1).split('/') - if len(ref_parts) >= 2: - contract_info["contract_no"] = ref_parts[0].strip() - contract_info["invoice_no"] = ref_parts[1].strip() + ref_text = ref_match.group(1).strip() + # Sépare S-3488 et INV 4013 + parts = re.split(r'[/\s]+', ref_text) + for part in parts: + if part.startswith('S-'): + contract_info["contract_no"] = part.strip() + elif part.startswith('INV'): + contract_info["invoice_no"] = part.strip() - # Extraction de l'origine et de la marchandise - origin_match = re.search(r'Growth\s*:\s*([\w\s]+)', text) - if origin_match: - origin_text = origin_match.group(1).strip() + # Extraction de l'origine et de la marchandise - regex plus précise + growth_match = re.search(r'Growth\s*:\s*([A-Z\s]+?)(?=\s*(?:Vessel|$))', text) + if growth_match: + origin_text = growth_match.group(1).strip() if "AUSTRALIAN" in origin_text.upper(): contract_info["origin"] = "AUSTRALIA" - # La marchandise est généralement "RAW COTTON" contract_info["commodity"] = "RAW COTTON" return contract_info @@ -105,21 +112,20 @@ class AHKParser: "carrier": None } - # Extraction du vendeur (Client) - seller_match = re.search(r'Client\s*:\s*([^\n]+)', text) + # Extraction du vendeur (Client) - regex plus précise + seller_match = re.search(r'Client\s*:\s*([^\n:]+?)(?=\s*(?:Client Ref|Buyer|$))', text) if seller_match: - parties_info["seller"] = seller_match.group(1).strip() + parties_info["seller"] = self._clean_value(seller_match.group(1)) - # Extraction de l'acheteur (Buyer) - buyer_match = re.search(r'Buyer\s*:\s*([^\n]+)', text) + # Extraction de l'acheteur (Buyer) - regex plus précise + buyer_match = re.search(r'Buyer\s*:\s*([^\n:]+?)(?=\s*(?:Total Bales|$))', text) if buyer_match: - parties_info["buyer"] = buyer_match.group(1).strip() + parties_info["buyer"] = self._clean_value(buyer_match.group(1)) - # Extraction du transporteur (Vessel) - vessel_match = re.search(r'Vessel\s*:\s*([^\n]+)', text) + # Extraction du transporteur (nom du navire seulement) + vessel_match = re.search(r'Vessel\s*:\s*([A-Z\s]+?)(?=\s*(?:Arrival|Voy|$))', text) if vessel_match: - # On considère le nom du navire comme transporteur - parties_info["carrier"] = vessel_match.group(1).strip() + parties_info["carrier"] = self._clean_value(vessel_match.group(1)) return parties_info @@ -129,48 +135,51 @@ class AHKParser: "vessel": None, "bl_no": None, "bl_date": None, - "port_loading": None, # Non spécifié dans le texte + "port_loading": None, "port_destination": None, "arrival_date": None, - "weighing_place": None, # Non spécifié dans le texte + "weighing_place": None, "weighing_method": None, "bales": None } - # Extraction du navire - vessel_match = re.search(r'Vessel\s*:\s*([^\n]+)', text) + # Extraction du navire (nom seulement) + vessel_match = re.search(r'Vessel\s*:\s*([A-Z\s]+?)(?=\s*(?:Arrival|Voy|$))', text) if vessel_match: - shipment_info["vessel"] = vessel_match.group(1).strip() + shipment_info["vessel"] = self._clean_value(vessel_match.group(1)) - # Extraction du numéro de connaissement - bl_no_match = re.search(r'B/L No\.\s*:\s*([^\n]+)', text) + # Extraction du numéro de connaissement (seulement le numéro) + bl_no_match = re.search(r'B/L\s+No\.\s*:\s*([A-Z0-9]+)(?=\s|$)', text) if bl_no_match: - shipment_info["bl_no"] = bl_no_match.group(1).strip() + shipment_info["bl_no"] = self._clean_value(bl_no_match.group(1)) # Extraction de la date du connaissement - bl_date_match = re.search(r'B/L Date\s*:\s*(\d{1,2}-[A-Za-z]{3}-\d{4})', text) + bl_date_match = re.search(r'B/L\s+Date\s*:\s*(\d{1,2}-[A-Za-z]{3}-\d{4})(?=\s|$)', text) if bl_date_match: - shipment_info["bl_date"] = bl_date_match.group(1).strip() + shipment_info["bl_date"] = self._clean_value(bl_date_match.group(1)) - # Extraction du port de destination - dest_match = re.search(r'Destination\s*:\s*([^\n]+)', text) + # Extraction du port de destination (sans le "Tare") + dest_match = re.search(r'Destination\s*:\s*([A-Z,\s]+?)(?=\s*(?:Tare|$))', text) if dest_match: - shipment_info["port_destination"] = dest_match.group(1).strip() + shipment_info["port_destination"] = self._clean_value(dest_match.group(1)) # Extraction de la date d'arrivée - arrival_match = re.search(r'Arrival Date\s*:\s*(\d{1,2}-[A-Za-z]{3}-\d{4})', text) + arrival_match = re.search(r'Arrival\s+Date\s*:\s*(\d{1,2}-[A-Za-z]{3}-\d{4})(?=\s|$)', text) if arrival_match: - shipment_info["arrival_date"] = arrival_match.group(1).strip() + shipment_info["arrival_date"] = self._clean_value(arrival_match.group(1)) # Extraction de la méthode de pesée - weighing_method_match = re.search(r'Weighing method\s*:\s*([^\n]+)', text) + weighing_method_match = re.search(r'Weighing\s+method\s*:\s*([^\n]+?)(?=\s*(?:Tare|$))', text) if weighing_method_match: - shipment_info["weighing_method"] = weighing_method_match.group(1).strip() + shipment_info["weighing_method"] = self._clean_value(weighing_method_match.group(1)) # Extraction du nombre de balles - bales_match = re.search(r'Total Bales\s*:\s*(\d+)', text) + bales_match = re.search(r'Total\s+Bales\s*:\s*(\d+)(?=\s|$)', text) if bales_match: - shipment_info["bales"] = int(bales_match.group(1).strip()) + try: + shipment_info["bales"] = int(bales_match.group(1).strip()) + except ValueError: + shipment_info["bales"] = None return shipment_info @@ -185,113 +194,245 @@ class AHKParser: "gain_loss_percent": None } - # Extraction du poids brut débarqué - gross_landed_match = re.search(r'LANDED WEIGHTS[\s\S]*?Gross\s*:\s*([\d.]+)\s*kg', text) + # Extraction du poids brut débarqué (corrigé - doit être 100580 kg) + gross_landed_match = re.search(r'LANDED WEIGHTS[\s\S]*?Gross\s*:\s*([\d.,]+)\s*kg', text) if gross_landed_match: - weights_info["gross_landed_kg"] = float(gross_landed_match.group(1).strip()) + try: + weights_info["gross_landed_kg"] = float(gross_landed_match.group(1).replace(',', '').strip()) + except ValueError: + pass # Extraction du poids de tare - tare_match = re.search(r'Tare\s*:\s*([\d.]+)\s*kg', text) + tare_match = re.search(r'Tare\s*:\s*([\d.,]+)\s*kg', text) if tare_match: - weights_info["tare_kg"] = float(tare_match.group(1).strip()) + try: + weights_info["tare_kg"] = float(tare_match.group(1).replace(',', '').strip()) + except ValueError: + pass - # Extraction du poids net débarqué - net_landed_match = re.search(r'LANDED WEIGHTS[\s\S]*?Net\s*:\s*([\d.]+)\s*kg', text) + # Extraction du poids net débarqué (corrigé - doit être 100078.40 kg) + net_landed_match = re.search(r'LANDED WEIGHTS[\s\S]*?Net\s*:\s*([\d.,]+)\s*kg', text) if net_landed_match: - weights_info["net_landed_kg"] = float(net_landed_match.group(1).strip()) + try: + weights_info["net_landed_kg"] = float(net_landed_match.group(1).replace(',', '').strip()) + except ValueError: + pass - # Extraction du poids net facturé - invoice_net_match = re.search(r'INVOICE WEIGHTS[\s\S]*?Net\s*:\s*([\d.]+)\s*kg', text) + # Extraction du poids net facturé (101299 kg) + invoice_net_match = re.search(r'INVOICE WEIGHTS[\s\S]*?Net\s*:\s*([\d.,]+)\s*kg', text) if invoice_net_match: - weights_info["invoice_net_kg"] = float(invoice_net_match.group(1).strip()) + try: + weights_info["invoice_net_kg"] = float(invoice_net_match.group(1).replace(',', '').strip()) + except ValueError: + pass # Extraction de la perte en kg - loss_match = re.search(r'LOSS\s*:\s*-\s*([\d.]+)\s*kg', text) + loss_match = re.search(r'LOSS\s*:\s*-\s*([\d.,]+)\s*kg', text) if loss_match: - weights_info["gain_loss_kg"] = -float(loss_match.group(1).strip()) + try: + weights_info["gain_loss_kg"] = -float(loss_match.group(1).replace(',', '').strip()) + except ValueError: + pass # Extraction du pourcentage de perte - percent_match = re.search(r'Percentage\s*:\s*-\s*([\d.]+)%', text) + percent_match = re.search(r'Percentage\s*:\s*-\s*([\d.,]+)%', text) if percent_match: - weights_info["gain_loss_percent"] = -float(percent_match.group(1).strip()) + try: + weights_info["gain_loss_percent"] = -float(percent_match.group(1).replace(',', '').strip()) + except ValueError: + pass return weights_info - # class AHKParser: -# lab = "AHK" - -# def _lines(self, text): -# return [l.strip() for l in text.splitlines() if l.strip()] - -# def _col_block(self, lines, labels, max_scan=30): -# idx = [i for i,l in enumerate(lines) if l in labels] -# if not idx: -# return {} # << empêche le crash -# start = max(idx) + 1 -# vals = [] -# for l in lines[start:start+max_scan]: -# if l.startswith(":"): -# v = l[1:].replace("kg","").strip() -# vals.append(v) -# if len(vals) == len(labels): -# break -# return dict(zip(labels, vals)) - +# lab="AHK" + # def parse(self, text): -# L = self._lines(text) -# r = empty_weight_report("AHK") - -# # report -# r["report"]["reference"] = safe_search(r"(AHK\s*/[A-Z0-9/]+)", text) -# r["report"]["date"] = safe_search(r"Produced On\s*([0-9A-Za-z ]+)", text) - -# # contract -# r["contract"]["invoice_no"] = safe_search(r"Client Reference:\s*([A-Z0-9\- /]+)", text) -# r["contract"]["commodity"] = "Raw Cotton" - -# # buyer -# r["parties"]["buyer"] = safe_search(r"Buyer:\s*([A-Z0-9 ().,-]+)", text) - -# # shipment block 1 -# ship1 = self._col_block(L, [ -# "Total Bales","Vessel","Voy. No.","B/L No.","B/L Date","Destination" -# ]) - -# # shipment block 2 -# ship2 = self._col_block(L, [ -# "Growth","Arrival Date","First date of weighing", -# "Last Date of Weighing","Weighing method","Tare" -# ]) - -# r["shipment"]["bales"] = to_float(ship1.get("Total Bales")) -# r["shipment"]["vessel"] = ship1.get("Vessel") -# r["shipment"]["bl_no"] = ship1.get("B/L No.") -# r["shipment"]["port_destination"] = ship1.get("Destination") -# r["shipment"]["arrival_date"] = ship2.get("Arrival Date") -# r["shipment"]["weighing_method"] = ship2.get("Weighing method") -# r["contract"]["origin"] = ship2.get("Growth") - -# # invoice weights -# inv = self._col_block(L, ["Bales","Gross","Tare","Net"]) -# r["weights"]["invoice_net_kg"] = to_float(inv.get("Net")) - -# # landed weights -# land = self._col_block( -# self._lines(section(text,"Bales Weighed","Outturn")), -# ["Bales","Gross","Tare","Net"] -# ) - -# r["weights"]["gross_landed_kg"] = to_float(land.get("Gross")) -# r["weights"]["tare_kg"] = to_float(land.get("Tare")) -# r["weights"]["net_landed_kg"] = to_float(land.get("Net")) - -# # loss -# loss = section(text,"LOSS","Invoice average") -# r["weights"]["gain_loss_kg"] = to_float(safe_search(r"(-?\d+\.?\d*)\s*kg", loss)) -# r["weights"]["gain_loss_percent"] = to_float(safe_search(r"Percentage\s*:\s*(-?\d+\.?\d*)", loss)) - -# return r - +# """Parse le texte et retourne un dictionnaire structuré""" +# result = { +# "lab": self.lab, +# "report": self._extract_report_info(text), +# "contract": self._extract_contract_info(text), +# "parties": self._extract_parties_info(text), +# "shipment": self._extract_shipment_info(text), +# "weights": self._extract_weights_info(text) +# } +# self.data = result +# return result + +# def _extract_report_info(self, text): +# """Extrait les informations du rapport""" +# report_info = { +# "reference": None, +# "file_no": None, +# "date": None +# } + +# # Recherche de la référence client +# ref_match = re.search(r'Client Reference:\s*(S-\d+/\s*INV\s*\d+)', text) +# if ref_match: +# report_info["reference"] = ref_match.group(1).strip() + +# # Recherche du numéro de fichier AHK +# file_no_match = re.search(r'AHK\s*S/([\w/]+)', text) +# if file_no_match: +# report_info["file_no"] = file_no_match.group(1).strip() + +# # Recherche de la date du rapport +# date_match = re.search(r'Signed on\s*(\d{1,2}-[A-Za-z]{3}-\d{4})', text) +# if date_match: +# report_info["date"] = date_match.group(1).strip() + +# return report_info + +# def _extract_contract_info(self, text): +# """Extrait les informations du contrat""" +# contract_info = { +# "contract_no": None, +# "invoice_no": None, +# "lc_no": None, +# "origin": None, +# "commodity": None +# } + +# # Extraction de la référence client (peut servir comme numéro de contrat) +# ref_match = re.search(r'Client Reference:\s*(S-\d+/\s*INV\s*\d+)', text) +# if ref_match: +# ref_parts = ref_match.group(1).split('/') +# if len(ref_parts) >= 2: +# contract_info["contract_no"] = ref_parts[0].strip() +# contract_info["invoice_no"] = ref_parts[1].strip() + +# # Extraction de l'origine et de la marchandise +# origin_match = re.search(r'Growth\s*:\s*([\w\s]+)', text) +# if origin_match: +# origin_text = origin_match.group(1).strip() +# if "AUSTRALIAN" in origin_text.upper(): +# contract_info["origin"] = "AUSTRALIA" +# # La marchandise est généralement "RAW COTTON" +# contract_info["commodity"] = "RAW COTTON" + +# return contract_info + +# def _extract_parties_info(self, text): +# """Extrait les informations sur les parties""" +# parties_info = { +# "seller": None, +# "buyer": None, +# "carrier": None +# } + +# # Extraction du vendeur (Client) +# seller_match = re.search(r'Client\s*:\s*([^\n]+)', text) +# if seller_match: +# parties_info["seller"] = seller_match.group(1).strip() + +# # Extraction de l'acheteur (Buyer) +# buyer_match = re.search(r'Buyer\s*:\s*([^\n]+)', text) +# if buyer_match: +# parties_info["buyer"] = buyer_match.group(1).strip() + +# # Extraction du transporteur (Vessel) +# vessel_match = re.search(r'Vessel\s*:\s*([^\n]+)', text) +# if vessel_match: +# # On considère le nom du navire comme transporteur +# parties_info["carrier"] = vessel_match.group(1).strip() + +# return parties_info + +# def _extract_shipment_info(self, text): +# """Extrait les informations d'expédition""" +# shipment_info = { +# "vessel": None, +# "bl_no": None, +# "bl_date": None, +# "port_loading": None, # Non spécifié dans le texte +# "port_destination": None, +# "arrival_date": None, +# "weighing_place": None, # Non spécifié dans le texte +# "weighing_method": None, +# "bales": None +# } + +# # Extraction du navire +# vessel_match = re.search(r'Vessel\s*:\s*([^\n]+)', text) +# if vessel_match: +# shipment_info["vessel"] = vessel_match.group(1).strip() + +# # Extraction du numéro de connaissement +# bl_no_match = re.search(r'B/L No\.\s*:\s*([^\n]+)', text) +# if bl_no_match: +# shipment_info["bl_no"] = bl_no_match.group(1).strip() + +# # Extraction de la date du connaissement +# bl_date_match = re.search(r'B/L Date\s*:\s*(\d{1,2}-[A-Za-z]{3}-\d{4})', text) +# if bl_date_match: +# shipment_info["bl_date"] = bl_date_match.group(1).strip() + +# # Extraction du port de destination +# dest_match = re.search(r'Destination\s*:\s*([^\n]+)', text) +# if dest_match: +# shipment_info["port_destination"] = dest_match.group(1).strip() + +# # Extraction de la date d'arrivée +# arrival_match = re.search(r'Arrival Date\s*:\s*(\d{1,2}-[A-Za-z]{3}-\d{4})', text) +# if arrival_match: +# shipment_info["arrival_date"] = arrival_match.group(1).strip() + +# # Extraction de la méthode de pesée +# weighing_method_match = re.search(r'Weighing method\s*:\s*([^\n]+)', text) +# if weighing_method_match: +# shipment_info["weighing_method"] = weighing_method_match.group(1).strip() + +# # Extraction du nombre de balles +# bales_match = re.search(r'Total Bales\s*:\s*(\d+)', text) +# if bales_match: +# shipment_info["bales"] = int(bales_match.group(1).strip()) + +# return shipment_info + +# def _extract_weights_info(self, text): +# """Extrait les informations de poids""" +# weights_info = { +# "gross_landed_kg": None, +# "tare_kg": None, +# "net_landed_kg": None, +# "invoice_net_kg": None, +# "gain_loss_kg": None, +# "gain_loss_percent": None +# } + +# # Extraction du poids brut débarqué +# gross_landed_match = re.search(r'LANDED WEIGHTS[\s\S]*?Gross\s*:\s*([\d.]+)\s*kg', text) +# if gross_landed_match: +# weights_info["gross_landed_kg"] = float(gross_landed_match.group(1).strip()) + +# # Extraction du poids de tare +# tare_match = re.search(r'Tare\s*:\s*([\d.]+)\s*kg', text) +# if tare_match: +# weights_info["tare_kg"] = float(tare_match.group(1).strip()) + +# # Extraction du poids net débarqué +# net_landed_match = re.search(r'LANDED WEIGHTS[\s\S]*?Net\s*:\s*([\d.]+)\s*kg', text) +# if net_landed_match: +# weights_info["net_landed_kg"] = float(net_landed_match.group(1).strip()) + +# # Extraction du poids net facturé +# invoice_net_match = re.search(r'INVOICE WEIGHTS[\s\S]*?Net\s*:\s*([\d.]+)\s*kg', text) +# if invoice_net_match: +# weights_info["invoice_net_kg"] = float(invoice_net_match.group(1).strip()) + +# # Extraction de la perte en kg +# loss_match = re.search(r'LOSS\s*:\s*-\s*([\d.]+)\s*kg', text) +# if loss_match: +# weights_info["gain_loss_kg"] = -float(loss_match.group(1).strip()) + +# # Extraction du pourcentage de perte +# percent_match = re.search(r'Percentage\s*:\s*-\s*([\d.]+)%', text) +# if percent_match: +# weights_info["gain_loss_percent"] = -float(percent_match.group(1).strip()) + +# return weights_info + class IntertekParser: lab="INTERTEK" def parse(self,text):