"""FastAPI automation service.

Endpoints for OCR / structure / table extraction of cotton landing reports,
template-specific report parsing (AHK, Intertek), weight-report insertion
into SQL Server, and outbound mail with attachments.
"""

import base64
import io
import logging
import os
import re
import smtplib
import tempfile
from datetime import datetime
from email.message import EmailMessage
from io import BytesIO
from logging.handlers import RotatingFileHandler

import camelot
import pdfplumber
import pyodbc
import pytesseract
import requests
import spacy
from doctr.io import DocumentFile
from doctr.models import ocr_predictor
from fastapi import Body, FastAPI, HTTPException, Request, UploadFile
from PIL import Image
from PyPDF2 import PdfReader

# ---------------------------------------------------------------------------
# Logging
# ---------------------------------------------------------------------------
LOG_PATH = "/var/log/automation-service.log"

file_handler = RotatingFileHandler(
    LOG_PATH, maxBytes=10 * 1024 * 1024, backupCount=5, encoding="utf-8"
)
file_handler.setFormatter(logging.Formatter(
    "%(asctime)s - %(levelname)s - %(name)s - %(message)s"
))

# ---------------------------------------------------------------------------
# SMTP configuration
# SECURITY NOTE(review): credentials were hard-coded in source. They are now
# read from the environment with the original values as fallback so runtime
# behaviour is unchanged — rotate these secrets and drop the fallbacks.
# ---------------------------------------------------------------------------
SMTP_SERVER = "smtp.gmail.com"
SMTP_PORT = 587
EMAIL_ACCOUNT = os.environ.get("EMAIL_ACCOUNT", "faircotbot@gmail.com")
EMAIL_PASSWORD = os.environ.get("EMAIL_PASSWORD", "zmaqjfrvjpyvcrlg")


def get_db_connection():
    """Open a new pyodbc connection to the Faircot-Test SQL Server database.

    SECURITY NOTE(review): connection credentials are hard-coded; consider
    moving them to the environment as well.
    """
    return pyodbc.connect(
        "DRIVER={ODBC Driver 18 for SQL Server};"
        "SERVER=VPS88.DATACENTER.CSTI;"
        "DATABASE=Faircot-Test;"
        "UID=SINGA_META;"
        "PWD=Start.123;"
        "TrustServerCertificate=yes;"
    )


class AHKParser:
    """Parser for Alfred H. Knight (AHK) cotton landing reports."""

    lab = "AHK"

    def _clean_value(self, value):
        """Strip surrounding whitespace; pass falsy values through unchanged."""
        if value:
            return value.strip()
        return value

    def parse(self, text):
        """Parse the report text and return a structured dictionary."""
        result = {
            "lab": self.lab,
            "report": self._extract_report_info(text),
            "contract": self._extract_contract_info(text),
            "parties": self._extract_parties_info(text),
            "shipment": self._extract_shipment_info(text),
            "weights": self._extract_weights_info(text),
        }
        self.data = result
        return result

    def _extract_report_info(self, text):
        """Extract report-level information (reference, file number, date)."""
        report_info = {"reference": None, "file_no": None, "date": None}
        # Client reference, e.g. "S-3488 / INV 4013"
        ref_match = re.search(r'Client\s+Reference:\s*(S-\d+\s*/\s*INV\s*\d+)', text)
        if ref_match:
            report_info["reference"] = self._clean_value(ref_match.group(1))
        # AHK file number
        file_no_match = re.search(r'AHK\s+S/([\w/]+)', text)
        if file_no_match:
            report_info["file_no"] = self._clean_value(file_no_match.group(1))
        # Report (signature) date
        date_match = re.search(r'Signed\s+on\s*(\d{1,2}-[A-Za-z]{3}-\d{4})', text)
        if date_match:
            report_info["date"] = self._clean_value(date_match.group(1))
        return report_info

    def _extract_contract_info(self, text):
        """Extract contract information (contract/invoice numbers, origin)."""
        contract_info = {
            "contract_no": None,
            "invoice_no": None,
            "lc_no": None,
            "origin": None,
            "commodity": None,
        }
        # Client reference line contains both contract and invoice numbers,
        # e.g. "S-3488 / INV 4013" — split and dispatch by prefix.
        ref_match = re.search(r'Client\s+Ref\s+No\.\s*:\s*([^\n]+)', text)
        if ref_match:
            ref_text = ref_match.group(1).strip()
            parts = re.split(r'[/\s]+', ref_text)
            for part in parts:
                if part.startswith('S-'):
                    contract_info["contract_no"] = part.strip()
                elif part.startswith('INV'):
                    contract_info["invoice_no"] = part.strip()
        # Origin and commodity from the "Growth" field
        growth_match = re.search(r'Growth\s*:\s*([A-Z\s]+?)(?=\s*(?:Vessel|$))', text)
        if growth_match:
            origin_text = growth_match.group(1).strip()
            if "AUSTRALIAN" in origin_text.upper():
                contract_info["origin"] = "AUSTRALIA"
                contract_info["commodity"] = "RAW COTTON"
        return contract_info

    def _extract_parties_info(self, text):
        """Extract the parties involved (seller, buyer, carrier)."""
        parties_info = {"seller": None, "buyer": None, "carrier": None}
        # Seller ("Client" field)
        seller_match = re.search(
            r'Client\s*:\s*([^\n:]+?)(?=\s*(?:Client Ref|Buyer|$))', text)
        if seller_match:
            parties_info["seller"] = self._clean_value(seller_match.group(1))
        # Buyer
        buyer_match = re.search(
            r'Buyer\s*:\s*([^\n:]+?)(?=\s*(?:Total Bales|$))', text)
        if buyer_match:
            parties_info["buyer"] = self._clean_value(buyer_match.group(1))
        # Carrier (vessel name only)
        vessel_match = re.search(
            r'Vessel\s*:\s*([A-Z\s]+?)(?=\s*(?:Arrival|Voy|$))', text)
        if vessel_match:
            parties_info["carrier"] = self._clean_value(vessel_match.group(1))
        return parties_info

    def _extract_shipment_info(self, text):
        """Extract shipment information (vessel, B/L, ports, dates, bales)."""
        shipment_info = {
            "vessel": None,
            "bl_no": None,
            "bl_date": None,
            "port_loading": None,
            "port_destination": None,
            "arrival_date": None,
            "weighing_place": None,
            "weighing_method": None,
            "bales": None,
        }
        # Vessel (name only)
        vessel_match = re.search(
            r'Vessel\s*:\s*([A-Z\s]+?)(?=\s*(?:Arrival|Voy|$))', text)
        if vessel_match:
            shipment_info["vessel"] = self._clean_value(vessel_match.group(1))
        # Bill-of-lading number (number only)
        bl_no_match = re.search(r'B/L\s+No\.\s*:\s*([A-Z0-9]+)(?=\s|$)', text)
        if bl_no_match:
            shipment_info["bl_no"] = self._clean_value(bl_no_match.group(1))
        # Bill-of-lading date
        bl_date_match = re.search(
            r'B/L\s+Date\s*:\s*(\d{1,2}-[A-Za-z]{3}-\d{4})(?=\s|$)', text)
        if bl_date_match:
            shipment_info["bl_date"] = self._clean_value(bl_date_match.group(1))
        # Destination port (lookahead stops before the "Tare" field)
        dest_match = re.search(
            r'Destination\s*:\s*([A-Z,\s]+?)(?=\s*(?:Tare|$))', text)
        if dest_match:
            shipment_info["port_destination"] = self._clean_value(dest_match.group(1))
        # Arrival date
        arrival_match = re.search(
            r'Arrival\s+Date\s*:\s*(\d{1,2}-[A-Za-z]{3}-\d{4})(?=\s|$)', text)
        if arrival_match:
            shipment_info["arrival_date"] = self._clean_value(arrival_match.group(1))
        # Weighing method
        weighing_method_match = re.search(
            r'Weighing\s+method\s*:\s*([^\n]+?)(?=\s*(?:Tare|$))', text)
        if weighing_method_match:
            shipment_info["weighing_method"] = self._clean_value(
                weighing_method_match.group(1))
        # Bale count
        bales_match = re.search(r'Total\s+Bales\s*:\s*(\d+)(?=\s|$)', text)
        if bales_match:
            try:
                shipment_info["bales"] = int(bales_match.group(1).strip())
            except ValueError:
                shipment_info["bales"] = None
        return shipment_info

    def _extract_weights_info(self, text):
        """Extract weight information (landed/invoice weights, loss)."""
        weights_info = {
            "gross_landed_kg": None,
            "tare_kg": None,
            "net_landed_kg": None,
            "invoice_net_kg": None,
            "gain_loss_kg": None,
            "gain_loss_percent": None,
        }
        # Gross landed weight — anchored to the "LANDED WEIGHTS" section
        gross_landed_match = re.search(
            r'LANDED WEIGHTS[\s\S]*?Gross\s*:\s*([\d.,]+)\s*kg', text)
        if gross_landed_match:
            try:
                weights_info["gross_landed_kg"] = float(
                    gross_landed_match.group(1).replace(',', '').strip())
            except ValueError:
                pass
        # Tare weight
        tare_match = re.search(r'Tare\s*:\s*([\d.,]+)\s*kg', text)
        if tare_match:
            try:
                weights_info["tare_kg"] = float(
                    tare_match.group(1).replace(',', '').strip())
            except ValueError:
                pass
        # Net landed weight — anchored to the "LANDED WEIGHTS" section
        net_landed_match = re.search(
            r'LANDED WEIGHTS[\s\S]*?Net\s*:\s*([\d.,]+)\s*kg', text)
        if net_landed_match:
            try:
                weights_info["net_landed_kg"] = float(
                    net_landed_match.group(1).replace(',', '').strip())
            except ValueError:
                pass
        # Net invoice weight — anchored to the "INVOICE WEIGHTS" section
        invoice_net_match = re.search(
            r'INVOICE WEIGHTS[\s\S]*?Net\s*:\s*([\d.,]+)\s*kg', text)
        if invoice_net_match:
            try:
                weights_info["invoice_net_kg"] = float(
                    invoice_net_match.group(1).replace(',', '').strip())
            except ValueError:
                pass
        # Loss in kg (reported as a positive number; stored negative)
        loss_match = re.search(r'LOSS\s*:\s*-\s*([\d.,]+)\s*kg', text)
        if loss_match:
            try:
                weights_info["gain_loss_kg"] = -float(
                    loss_match.group(1).replace(',', '').strip())
            except ValueError:
                pass
        # Loss percentage (stored negative)
        percent_match = re.search(r'Percentage\s*:\s*-\s*([\d.,]+)%', text)
        if percent_match:
            try:
                weights_info["gain_loss_percent"] = -float(
                    percent_match.group(1).replace(',', '').strip())
            except ValueError:
                pass
        return weights_info


class IntertekParser:
    """Parser for Intertek cotton landing reports."""

    lab = "Intertek"

    def _clean_value(self, value):
        """Strip surrounding whitespace; pass falsy values through unchanged."""
        if value:
            return value.strip()
        return value

    def _extract_number(self, text, pattern, is_int=False):
        """Extract a number (int or float) from *text* using *pattern*."""
        match = re.search(pattern, text)
        if match:
            try:
                # Normalize the numeric string before conversion
                num_str = match.group(1).replace(',', '').replace(' ', '').strip()
                if is_int:
                    return int(num_str)
                return float(num_str)
            except (ValueError, AttributeError):
                return None
        return None

    def parse(self, text):
        """Parse the report text and return a structured dictionary."""
        return {
            "lab": self.lab,
            "report": self._extract_report_info(text),
            "contract": self._extract_contract_info(text),
            "parties": self._extract_parties_info(text),
            "shipment": self._extract_shipment_info(text),
            "weights": self._extract_weights_info(text),
        }

    def _extract_report_info(self, text):
        """Extract report-level information (reference, file number, date)."""
        report_info = {"reference": None, "file_no": None, "date": None}
        # Global reference
        ref_match = re.search(r'Global Ref\s*:\s*(GLO-\d+-[A-Z]+)', text)
        if ref_match:
            report_info["reference"] = self._clean_value(ref_match.group(1))
        # Report / file number
        file_no_match = re.search(r'Report\s*/\s*File No\s*:\s*([A-Z]+-AGR\d+-?)', text)
        if file_no_match:
            report_info["file_no"] = self._clean_value(file_no_match.group(1))
        # Report date
        date_match = re.search(r'Dated\s*:\s*(\d{1,2}\.\d{1,2}\.\d{4})', text)
        if date_match:
            report_info["date"] = self._clean_value(date_match.group(1))
        return report_info

    def _extract_contract_info(self, text):
        """Extract contract information (contract/invoice numbers, origin)."""
        contract_info = {
            "contract_no": None,
            "invoice_no": None,
            "lc_no": None,  # not present in this report type
            "origin": None,
            "commodity": None,
        }
        # Contract number
        contract_match = re.search(r'Contract No\s*:\s*([A-Z]?-\d+)', text)
        if contract_match:
            contract_info["contract_no"] = self._clean_value(contract_match.group(1))
        # Invoice number
        invoice_match = re.search(r'Invoice No\s*:\s*(\d+)', text)
        if invoice_match:
            contract_info["invoice_no"] = self._clean_value(invoice_match.group(1))
        # Origin and commodity from the "Growth" field
        growth_match = re.search(r'Growth\s*:\s*([A-Z\s]+)(?=\s*Shipper|\n|$)', text)
        if growth_match:
            origin_text = growth_match.group(1).strip()
            if "GREECE" in origin_text.upper():
                contract_info["origin"] = "GREECE"
                contract_info["commodity"] = "RAW COTTON"
        return contract_info

    def _extract_parties_info(self, text):
        """Extract the parties involved (seller, buyer, carrier)."""
        parties_info = {"seller": None, "buyer": None, "carrier": None}
        # Seller ("Shipper" field)
        seller_match = re.search(r'Shipper\s*:\s*([^\n]+?)(?=\s*(?:Buyer|$))', text)
        if seller_match:
            parties_info["seller"] = self._clean_value(seller_match.group(1))
        # Buyer
        buyer_match = re.search(
            r'Buyer\s*:\s*([^\n]+?)(?=\s*(?:CONTAINER|TOTAL|$))', text)
        if buyer_match:
            parties_info["buyer"] = self._clean_value(buyer_match.group(1))
        # Carrier (vessel name only)
        vessel_match = re.search(
            r'Vessel\s*:\s*([A-Z\s]+?)(?=\s*(?:Arrival|$))', text)
        if vessel_match:
            parties_info["carrier"] = self._clean_value(vessel_match.group(1))
        return parties_info

    def _extract_shipment_info(self, text):
        """Extract shipment information (vessel, B/L, dates, bales)."""
        shipment_info = {
            "vessel": None,
            "bl_no": None,
            "bl_date": None,          # not present in this report type
            "port_loading": None,     # not present in this report type
            "port_destination": None, # not present in this report type
            "arrival_date": None,
            "weighing_place": None,
            "weighing_method": None,
            "bales": None,
        }
        # Vessel
        vessel_match = re.search(
            r'Vessel\s*:\s*([A-Z\s]+?)(?=\s*(?:Arrival|$))', text)
        if vessel_match:
            shipment_info["vessel"] = self._clean_value(vessel_match.group(1))
        # Bill-of-lading number
        bl_no_match = re.search(r'B/L\s+No\.\s*:\s*([A-Z0-9]+)', text)
        if bl_no_match:
            shipment_info["bl_no"] = self._clean_value(bl_no_match.group(1))
        # Arrival date
        arrival_match = re.search(
            r'Arrival Date\s*:\s*(\d{1,2}\.\d{1,2}\.\d{4})', text)
        if arrival_match:
            shipment_info["arrival_date"] = self._clean_value(arrival_match.group(1))
        # Weighing place
        weighing_place_match = re.search(
            r'Weighed at\s*:\s*([^\n]+?)(?=\s*(?:Vessel|$))', text)
        if weighing_place_match:
            shipment_info["weighing_place"] = self._clean_value(
                weighing_place_match.group(1))
        # Weighing method — inferred from the REMARKS section
        remarks_section = re.search(
            r'REMARKS\s*(.+?)(?=ISSUED BY|$)', text, re.DOTALL | re.IGNORECASE)
        if remarks_section:
            remarks_text = remarks_section.group(1)
            if "weighbridge" in remarks_text.lower():
                shipment_info["weighing_method"] = (
                    "Weighbridge weighing by empty/full truck")
        # Bale count — first from the totals row, then from "Invoice Quantity"
        bales_match = re.search(r'TOTAL\s+(\d{1,4}(?:,\d{3})?)\s+[\d,]+\.\d{2}', text)
        if not bales_match:
            bales_match = re.search(r'Invoice Quantity\s*:\s*(\d+)\s+Bales', text)
        if bales_match:
            try:
                bales_str = bales_match.group(1).replace(',', '').strip()
                shipment_info["bales"] = int(bales_str)
            except ValueError:
                shipment_info["bales"] = None
        return shipment_info

    def _extract_weights_info(self, text):
        """Extract weight information (landed/invoice weights, gain)."""
        weights_info = {
            "gross_landed_kg": None,
            "tare_kg": None,
            "net_landed_kg": None,
            "invoice_net_kg": None,
            "gain_loss_kg": None,
            "gain_loss_percent": None,
        }
        # Gross landed weight
        gross_match = re.search(
            r'Gross Landed Weight\s*:\s*([\d,]+\.\d{2})\s*kgs', text)
        if gross_match:
            weights_info["gross_landed_kg"] = float(
                gross_match.group(1).replace(',', ''))
        # Tare weight
        tare_match = re.search(r'Invoice Tare\s*:\s*([\d,]+\.\d{2})\s*Kgs', text)
        if tare_match:
            weights_info["tare_kg"] = float(tare_match.group(1).replace(',', ''))
        # Net landed weight
        net_landed_match = re.search(
            r'Net Landed Weight\s*:\s*([\d,]+\.\d{2})\s*Kgs', text)
        if net_landed_match:
            weights_info["net_landed_kg"] = float(
                net_landed_match.group(1).replace(',', ''))
        # Net invoice weight
        invoice_net_match = re.search(
            r'Net Invoice Weight\s*:\s*([\d,]+\.\d{2})\s*Kgs', text)
        if invoice_net_match:
            weights_info["invoice_net_kg"] = float(
                invoice_net_match.group(1).replace(',', ''))
        # Gain in kg (stored positive)
        gain_match = re.search(r'Gain\s+([\d,]+\.\d{2})\s*Kgs', text)
        if gain_match:
            weights_info["gain_loss_kg"] = float(gain_match.group(1).replace(',', ''))
        # Gain percentage from the totals row
        percent_match = re.search(r'TOTAL\s+\d+\s+[\d,]+\.\d{2}\s+([\d.]+)%', text)
        if percent_match:
            try:
                weights_info["gain_loss_percent"] = float(percent_match.group(1))
            except ValueError:
                pass
        return weights_info


# ---------------------------------------------------------------------------
# Root logger / application setup
# ---------------------------------------------------------------------------
root = logging.getLogger()
root.setLevel(logging.INFO)
root.addHandler(file_handler)
root.addHandler(logging.StreamHandler())

logger = logging.getLogger(__name__)

app = FastAPI()

logger.info("Loading models...")
nlp = spacy.load("en_core_web_sm")
predictor = ocr_predictor(pretrained=True)
logger.info("Models loaded successfully.")


@app.post("/weight-report")
def create_weight_report(payload: dict = Body(...)):
    """Insert a weight report via the InsertWeightReport stored procedure.

    Expects a JSON payload with chunk_key, gross_weight, net_weight,
    tare_total, bags, surveyor_code, place_key and report_date.
    Returns the new WEIGHT_REPORT_KEY produced by the procedure.
    """
    logger.info("Create weight report called")

    # -------- Minimal validation --------
    required_fields = [
        "chunk_key", "gross_weight", "net_weight", "tare_total",
        "bags", "surveyor_code", "place_key", "report_date",
    ]
    missing = [f for f in required_fields if f not in payload]
    if missing:
        raise HTTPException(
            status_code=400,
            detail=f"Missing fields: {', '.join(missing)}"
        )

    try:
        chunk_key = int(payload["chunk_key"])
        gross_weight = float(payload["gross_weight"])
        net_weight = float(payload["net_weight"])
        tare_total = float(payload["tare_total"])
        bags = int(payload["bags"])
        surveyor_code = int(payload["surveyor_code"])
        place_key = int(payload["place_key"])
        report_date = int(payload["report_date"])
    except Exception as e:
        raise HTTPException(
            status_code=400,
            detail=f"Invalid payload types: {e}"
        )

    conn = None  # ensure the name exists for the finally block
    try:
        conn = get_db_connection()
        cursor = conn.cursor()
        # Declare the OUTPUT variable @OUT_WEIGHT_REPORT_KEY and select it
        # back in the same batch so fetchone() returns the new key.
        cursor.execute("""
            DECLARE @OUT_WEIGHT_REPORT_KEY INT;
            EXEC dbo.sp_Singa_Automation_InsertWeightReport
                @CHUNK_KEY = ?,
                @BAGS_SOUND_AND_FULL = ?,
                @BAGS_SOUND_AND_SLACK = 0,
                @BAGS_DAMAGED_AND_FULL = 0,
                @BAGS_DAMAGED_AND_SLACK = 0,
                @BAGS_SHORT_LANDED = 0,
                @GROSS_SOUND_AND_FULL = ?,
                @GROSS_SOUND_AND_SLACK = 0,
                @GROSS_DAMAGED_AND_FULL = 0,
                @GROSS_DAMAGED_AND_SLACK = 0,
                @GROSS_SAMPLES = 0,
                @WEIGHING_DATE = ?,
                @REPORT_DATE = ?,
                @DATE_RECEIVED = ?,
                @NET_WEIGHT = ?,
                @TARE_TOTAL = ?,
                @TARE_FOR_TEN_BAGS = 0,
                @SURVEYOR_CODE = ?,
                @PLACE_KEY = ?,
                @SAMPLE_AFTER_WEIGHING = 'N',
                @MODIFIED_BY = 'FAIRCOTBOT',
                @MODIFY_DATE = ?,
                @VERSION_NB = 1,
                @FORWARDER_REF = 'API-TRYTON',
                @INSURED_VALUE = '0',
                @CREATED_BY = 1424,
                @UPDATED_BY = 1424,
                @BUY_INVOICE_AMOUNT = 0,
                @BUY_CURR_KEY = 0,
                @SEL_INVOICE_AMOUNT = 0,
                @SEL_CURR_KEY = 0,
                @CONSISTENCY = 'N',
                @FINALIZED = 'N',
                @MOISTURE_VALUE = NULL,
                @REPORT_TYPE = 0,
                @WET_WEIGHT = NULL,
                @WSMD_LOCATION = 0,
                @OUT_WEIGHT_REPORT_KEY = @OUT_WEIGHT_REPORT_KEY OUTPUT;
            SELECT @OUT_WEIGHT_REPORT_KEY AS OUT_WEIGHT_REPORT_KEY;
            """,
            chunk_key,
            bags,
            gross_weight,
            report_date,
            report_date,
            report_date,
            net_weight,
            tare_total,
            surveyor_code,
            place_key,
            report_date,
        )

        # Fetch the OUTPUT value selected by the batch
        row = cursor.fetchone()
        conn.commit()

        if not row:
            raise HTTPException(
                status_code=500,
                detail="Stored procedure returned no data"
            )

        logger.info("Columns returned: %s",
                    [column[0] for column in cursor.description])
        return {
            "success": True,
            "weight_report_key": row[0]
        }
    except HTTPException:
        # Don't re-wrap HTTPExceptions raised above (e.g. the no-row case)
        raise
    except Exception as e:
        logger.exception("Weight report creation failed")
        raise HTTPException(status_code=500, detail=str(e))
    finally:
        if conn is not None:
            try:
                conn.close()
            except Exception:
                pass


@app.post("/ocr")
async def ocr(file: UploadFile):
    """Smart PDF text extraction optimized for cotton landing reports.

    Uses pdfplumber (best for digital PDFs) and returns the combined text.
    On failure, returns an {"error": ..., "success": False} payload instead
    of an implicit null response.
    """
    logger.info("Smart OCR request: %s", file.filename)
    try:
        file_data = await file.read()
        try:
            with pdfplumber.open(io.BytesIO(file_data)) as pdf:
                text_parts = []
                for page in pdf.pages:
                    page_text = page.extract_text(x_tolerance=2, y_tolerance=2)
                    if page_text:
                        text_parts.append(page_text)
            combined_text = "\n".join(text_parts)
            return {"ocr_text": combined_text}
        except Exception as e:
            # Previously fell through to an implicit None response; report
            # the failure explicitly instead.
            logger.warning("pdfplumber attempt: %s", e)
            return {"error": f"pdfplumber failed: {e}", "success": False}
    except Exception as e:
        logger.error("Smart OCR failed: %s", e, exc_info=True)
        return {"error": str(e), "success": False}


# =============================
# 🧱 Structure / Layout
# =============================
@app.post("/structure")
async def structure(file: UploadFile):
    """Run the doctr OCR predictor on a PDF or image and return its result."""
    logger.info(f"Received structure request: {file.filename}")
    try:
        file_data = await file.read()
        name_lower = file.filename.lower()
        if name_lower.endswith(".pdf"):
            doc = DocumentFile.from_pdf(file_data)
            logger.info(f"Structure prediction on PDF ({len(doc)} pages)")
        else:
            img = Image.open(io.BytesIO(file_data)).convert("RGB")
            doc = DocumentFile.from_images([img])
            logger.info("Structure prediction on image")
        res = predictor(doc)
        return {"structure": str(res)}
    except Exception as e:
        logger.error(f"Structure extraction failed: {e}", exc_info=True)
        raise HTTPException(status_code=500, detail=str(e))


# =============================
# 📊 Tables extraction (PDF only)
# =============================
@app.post("/tables")
async def tables(file: UploadFile):
    """Extract tables from a PDF upload with camelot."""
    logger.info(f"Received table extraction request: {file.filename}")
    try:
        file_data = await file.read()
        # camelot.read_pdf expects a file path, not a file-like object —
        # spool the upload to a temporary file first.
        with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp:
            tmp.write(file_data)
            tmp_path = tmp.name
        try:
            found = camelot.read_pdf(tmp_path)
            logger.info(f"Found {len(found)} tables")
            return {"tables": [t.df.to_dict() for t in found]}
        finally:
            os.unlink(tmp_path)
    except Exception as e:
        logger.error(f"Table extraction failed: {e}", exc_info=True)
        raise HTTPException(status_code=500, detail=str(e))


def safe_search(pattern, text, default=None, group_index=1, context=""):
    """Regex search that logs and returns *default* when nothing matches."""
    m = re.search(pattern, text, re.I | re.S)
    if not m:
        logger.warning("Pattern not found for %s: %s", context, pattern)
        return default
    try:
        return m.group(group_index).strip()
    except IndexError:
        logger.warning("Group index %d not found for %s: %s",
                       group_index, context, pattern)
        return default


def to_float(s):
    """Convert a weight/percentage string to float, or None on failure."""
    if not s:
        return None
    s = s.replace(",", "").replace("Kgs", "").replace("kg", "").replace("%", "")
    s = s.replace("lbs", "").replace("LBS", "")
    s = s.strip()
    try:
        return float(s)
    except (TypeError, ValueError):
        return None


def section(text, start, end=None):
    """Extract a block of text between two headings, safely."""
    pattern_start = re.escape(start)
    if end:
        pattern_end = re.escape(end)
        reg = re.compile(pattern_start + r"(.*?)" + pattern_end, re.S | re.I)
    else:
        reg = re.compile(pattern_start + r"(.*)", re.S | re.I)
    m = reg.search(text)
    if not m:
        logger.warning("Section not found: start='%s', end='%s'", start, end)
        return ""
    return m.group(1).strip()


def extract_field(text, label, default=None):
    """Extract a line of the form 'Label: value', safely."""
    pattern = rf"{re.escape(label)}\s*:?[\s]+([^\n]+)"
    return safe_search(pattern, text, default=default, context=f"field '{label}'")


def extract(label, text, default=None):
    """Robust extraction for OCR/PDF text.

    Works with::

        Label: Value
        Label Value
        Label .... Value
    """
    if not text:
        return default
    patterns = [
        rf"{re.escape(label)}\s*[:\-]?\s*([^\n\r]+)",
        rf"{re.escape(label)}\s+([^\n\r]+)",
    ]
    for p in patterns:
        m = re.search(p, text, re.I)
        if m:
            return m.group(1).strip()
    return default


def extract_report_metadata(text):
    """Extract structured metadata from an AHK-style landing report text."""
    logger.info("Starting metadata extraction, text length=%d", len(text))
    try:
        # ----------- SECTIONS -----------
        order_details = section(text, "Order details", "Weights")
        invoice_section = section(text, "INVOICE WEIGHTS", "Bales Weighed")
        landed_section = section(text, "Bales Weighed", "Outturn")
        loss_section = section(text, "LOSS", "Invoice average")
        avg_section = section(text, "Invoice average", "Comments")
        signature_block = section(text, "Signed on")

        # ----------- TOP INFO -----------
        top_info = {
            "produced_on": extract_field(text, "Produced On"),
            "printed_date": extract_field(text, "Printed Date"),
            "client_reference": extract_field(text, "Client Reference"),
            "report_number": safe_search(r"(AHK\S+)", text, default="",
                                         context="report_number", group_index=1),
        }

        # ----------- ORDER DETAILS -----------
        parties = {
            "client": extract_field(order_details, "Client"),
            "client_ref_no": extract_field(order_details, "Client Ref No"),
            "buyer": extract_field(order_details, "Buyer"),
            "destination": extract_field(order_details, "Destination"),
        }
        shipment = {
            "total_bales": extract_field(order_details, "Total Bales"),
            "vessel": extract_field(order_details, "Vessel"),
            "voyage_no": extract_field(order_details, "Voy. No"),
            "bl_no": extract_field(order_details, "B/L No"),
            "bl_date": extract_field(order_details, "B/L Date"),
            "growth": extract_field(order_details, "Growth"),
            "arrival_date": extract_field(order_details, "Arrival Date"),
            "first_weighing_date": extract_field(order_details,
                                                 "First date of weighing"),
            "last_weighing_date": extract_field(order_details,
                                                "Last Date of Weighing"),
            "weighing_method": extract_field(order_details, "Weighing method"),
            "tare_basis": extract_field(order_details, "Tare"),
        }

        # ----------- INVOICE SECTION -----------
        invoice = {
            "bales": extract_field(invoice_section, "Bales"),
            "gross": extract_field(invoice_section, "Gross"),
            "tare": extract_field(invoice_section, "Tare"),
            "net": extract_field(invoice_section, "Net"),
        }

        # ----------- LANDED SECTION -----------
        landed = {
            "bales": extract_field(landed_section, "Bales"),
            "gross": extract_field(landed_section, "Gross"),
            "tare": extract_field(landed_section, "Tare"),
            "net": extract_field(landed_section, "Net"),
        }

        # ----------- LOSS SECTION -----------
        loss = {
            "kg": extract_field(loss_section, "kg"),
            "lb": extract_field(loss_section, "lb"),
            "percent": extract_field(loss_section, "Percentage"),
        }

        # ----------- AVERAGES SECTION -----------
        averages = {
            "invoice_gross_per_bale": extract_field(avg_section, "Invoice average"),
            "landed_gross_per_bale": extract_field(avg_section, "Landed average"),
        }

        # ----------- SIGNATURE -----------
        signature = {
            "signed_on": extract_field(signature_block, "Signed on"),
            "signed_by": safe_search(r"\n([A-Za-z ]+)\nClient Services",
                                     signature_block, default="",
                                     context="signed_by"),
            "role": "Client Services Coordinator",
            "company": "Alfred H. Knight International Limited",
        }

        logger.info("Metadata extraction completed successfully")
        return {
            "report": top_info,
            "parties": parties,
            "shipment": shipment,
            "weights": {
                "invoice": invoice,
                "landed": landed,
                "loss": loss,
                "averages": averages,
            },
            "signature": signature,
        }
    except Exception as e:
        logger.exception("Unexpected error during metadata extraction")
        raise HTTPException(status_code=500,
                            detail=f"Metadata extraction failed: {e}")


def detect_template(text):
    """Identify the surveyor template from marker phrases in the text."""
    t = text.lower()
    if "alfred h. knight" in t and "cotton landing report" in t:
        return "AHK"
    if "intertek" in t and "landing report" in t:
        return "INTERTEK"
    if "robertson international" in t or "ri ref no" in t:
        return "ROBERTSON"
    if "landing report" in t and "carcon cargo" in t:
        return "CARGO CONTROL"
    if "pacific inspection company" in t or "picl-bd.com" in t:
        return "PICL"
    return "UNKNOWN"


@app.post("/metadata")
async def metadata(text: str = Body(..., embed=True)):
    """Extract structured metadata from raw report text."""
    return extract_report_metadata(text)


# Extractor service endpoint — overridable via environment for testing
EXTRACTOR_URL = os.environ.get("EXTRACTOR_URL", "http://62.72.36.116:8090/extract")


def call_extractor(text: str, lab: str = "AHK"):
    """Send *text* to the external extractor service and return its JSON."""
    params = {"lab": lab}
    fake_file = BytesIO(text.encode("utf-8"))
    files = {
        "file": ("document.txt", fake_file, "text/plain")
    }
    response = requests.post(EXTRACTOR_URL, params=params, files=files, timeout=60)
    response.raise_for_status()
    return response.json()


def parse_report(text):
    """Return the detected template label for *text*."""
    return detect_template(text)


@app.post("/parse")
async def parse_endpoint(text: str = Body(..., embed=True)):
    """Detect the report template, then delegate parsing to the extractor."""
    lab = parse_report(text)
    result = call_extractor(text, lab=lab)
    return result


PARSERS = {
    "AHK": AHKParser(),
    "INTERTEK": IntertekParser(),
}


def empty_weight_report(lab):
    """Return an empty weight-report skeleton for the given lab."""
    return {
        "lab": lab,
        "report": {"reference": None, "file_no": None, "date": None},
        "contract": {"contract_no": None, "invoice_no": None, "lc_no": None,
                     "origin": None, "commodity": None},
        "parties": {"seller": None, "buyer": None, "carrier": None},
        "shipment": {
            "vessel": None,
            "bl_no": None,
            "bl_date": None,
            "port_loading": None,
            "port_destination": None,
            "arrival_date": None,
            "weighing_place": None,
            "weighing_method": None,
            "bales": None,
        },
        "weights": {
            "gross_landed_kg": None,
            "tare_kg": None,
            "net_landed_kg": None,
            "invoice_net_kg": None,
            "gain_loss_kg": None,
            "gain_loss_percent": None,
        },
    }


@app.post("/mail")
async def send_mail(request: Request):
    """Send an e-mail with optional base64-encoded attachments.

    JSON body: ``to`` (list or single address), ``subject``, ``body``
    (required); ``cc`` and ``attachments`` (optional). Each attachment
    needs ``filename`` and base64 ``content``; ``content_type`` defaults
    to application/octet-stream.
    """
    try:
        payload = await request.json()
    except Exception:
        raise HTTPException(status_code=400, detail="Invalid JSON")

    # Required fields
    to = payload.get("to")
    subject = payload.get("subject")
    body = payload.get("body")
    if not to or not subject or not body:
        raise HTTPException(
            status_code=400,
            detail="Missing required fields: to, subject, body"
        )

    # Accept a single address as well as a list — ", ".join on a plain
    # string would otherwise join its characters.
    if isinstance(to, str):
        to = [to]
    cc = payload.get("cc", [])
    if isinstance(cc, str):
        cc = [cc]
    attachments = payload.get("attachments", [])

    # Build the message
    msg = EmailMessage()
    msg["From"] = EMAIL_ACCOUNT
    msg["To"] = ", ".join(to)
    if cc:
        msg["Cc"] = ", ".join(cc)
    msg["Subject"] = subject
    msg.set_content(body)

    # Attachments (base64)
    for att in attachments:
        filename = att.get("filename")
        content = att.get("content")
        content_type = att.get("content_type", "application/octet-stream")
        if not filename or not content:
            raise HTTPException(
                status_code=400,
                detail="Attachment must contain filename and content"
            )
        try:
            file_bytes = base64.b64decode(content)
        except Exception:
            raise HTTPException(
                status_code=400,
                detail=f"Invalid base64 for attachment {filename}"
            )
        # Guard against malformed content types (no "/" would raise a 500)
        if "/" in content_type:
            maintype, subtype = content_type.split("/", 1)
        else:
            maintype, subtype = "application", "octet-stream"
        msg.add_attachment(
            file_bytes,
            maintype=maintype,
            subtype=subtype,
            filename=filename
        )

    # SMTP delivery
    try:
        with smtplib.SMTP(SMTP_SERVER, SMTP_PORT) as server:
            server.starttls()
            server.login(EMAIL_ACCOUNT, EMAIL_PASSWORD)
            server.send_message(msg)
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))

    return {
        "status": "sent",
        "to": to,
        "attachments": len(attachments)
    }