# automation-service/app.py

from fastapi import FastAPI, UploadFile, HTTPException, Body
from PIL import Image
import pytesseract
from doctr.models import ocr_predictor
from doctr.io import DocumentFile
from PyPDF2 import PdfReader
import pdfplumber
import camelot
import spacy
import logging
import io
from logging.handlers import RotatingFileHandler
import re
from datetime import datetime

# Path of the service log file written by the rotating handler below.
LOG_PATH = "/var/log/automation-service.log"

# Rotate once the file reaches 10 MiB and keep five rotated backups.
file_handler = RotatingFileHandler(
    LOG_PATH, maxBytes=10 * 1024 * 1024, backupCount=5, encoding="utf-8"
)
file_handler.setFormatter(
    logging.Formatter("%(asctime)s - %(levelname)s - %(name)s - %(message)s")
)

class AHKParser:
    """Extract structured fields from the text of an AHK landing report.

    Every field is located with a regular expression applied to the full
    report text; a field whose pattern does not match is reported as ``None``.

    NOTE(review): several patterns use ``$`` inside a lookahead without
    ``re.MULTILINE``, so that alternative only matches at the very end of the
    text, not at the end of each line -- confirm this is intended before
    relying on it as an end-of-line fallback.
    """

    lab = "AHK"

    # Vessel name only; shared by the "carrier" and "vessel" fields.
    _VESSEL_PATTERN = r'Vessel\s*:\s*([A-Z\s]+?)(?=\s*(?:Arrival|Voy|$))'

    def _clean_value(self, value):
        """Return ``value`` without surrounding whitespace; falsy values pass through."""
        if value:
            return value.strip()
        return value

    def _search_group(self, pattern, text):
        """Return the cleaned first capture group of ``pattern``, or ``None`` if no match."""
        match = re.search(pattern, text)
        if match is None:
            return None
        return self._clean_value(match.group(1))

    def _search_float(self, pattern, text):
        """Return the first capture group of ``pattern`` as a float, or ``None``.

        Commas are treated as thousands separators and removed before the
        conversion; a value that still cannot be converted yields ``None``.
        """
        match = re.search(pattern, text)
        if match is None:
            return None
        try:
            return float(match.group(1).replace(',', '').strip())
        except ValueError:
            return None

    def parse(self, text):
        """Parse ``text`` and return the structured report dictionary.

        The result is also stored on ``self.data`` (kept for backward
        compatibility with callers that read it after parsing).
        """
        result = {
            "lab": self.lab,
            "report": self._extract_report_info(text),
            "contract": self._extract_contract_info(text),
            "parties": self._extract_parties_info(text),
            "shipment": self._extract_shipment_info(text),
            "weights": self._extract_weights_info(text)
        }
        self.data = result
        return result

    def _extract_report_info(self, text):
        """Return the report reference, AHK file number and signature date."""
        return {
            # Client reference, e.g. "S-3488 / INV 4013".
            "reference": self._search_group(
                r'Client\s+Reference:\s*(S-\d+\s*/\s*INV\s*\d+)', text),
            # Part of the AHK file number that follows "AHK S/".
            "file_no": self._search_group(r'AHK\s+S/([\w/]+)', text),
            # Date that follows "Signed on", e.g. "20-Feb-2024".
            "date": self._search_group(
                r'Signed\s+on\s*(\d{1,2}-[A-Za-z]{3}-\d{4})', text),
        }

    def _extract_contract_info(self, text):
        """Return contract number, invoice number, origin and commodity.

        ``lc_no`` is never populated by this parser.
        """
        contract_info = {
            "contract_no": None,
            "invoice_no": None,
            "lc_no": None,
            "origin": None,
            "commodity": None
        }

        ref_match = re.search(r'Client\s+Ref\s+No\.\s*:\s*([^\n]+)', text)
        if ref_match:
            # The reference looks like "S-3488 / INV 4013": tokenise it on
            # slashes and whitespace, then classify each token by its prefix.
            tokens = re.split(r'[/\s]+', ref_match.group(1).strip())
            for index, token in enumerate(tokens):
                if token.startswith('S-'):
                    contract_info["contract_no"] = token.strip()
                elif token.startswith('INV'):
                    invoice_no = token.strip()
                    # "INV 4013" is split into "INV" and "4013" by the
                    # tokeniser; re-attach the number so it is not lost.
                    following = tokens[index + 1] if index + 1 < len(tokens) else ""
                    if invoice_no == "INV" and following.isdigit():
                        invoice_no = f"INV {following}"
                    contract_info["invoice_no"] = invoice_no

        # Origin and commodity are only set for Australian growth.
        growth_match = re.search(r'Growth\s*:\s*([A-Z\s]+?)(?=\s*(?:Vessel|$))', text)
        if growth_match:
            origin_text = growth_match.group(1).strip()
            if "AUSTRALIAN" in origin_text.upper():
                contract_info["origin"] = "AUSTRALIA"
                contract_info["commodity"] = "RAW COTTON"

        return contract_info

    def _extract_parties_info(self, text):
        """Return seller (the "Client" field), buyer and carrier (vessel name)."""
        return {
            "seller": self._search_group(
                r'Client\s*:\s*([^\n:]+?)(?=\s*(?:Client Ref|Buyer|$))', text),
            "buyer": self._search_group(
                r'Buyer\s*:\s*([^\n:]+?)(?=\s*(?:Total Bales|$))', text),
            "carrier": self._search_group(self._VESSEL_PATTERN, text),
        }

    def _extract_shipment_info(self, text):
        """Return the shipment details.

        ``port_loading`` and ``weighing_place`` are never populated by this
        parser and are always ``None``.
        """
        bales = None
        bales_match = re.search(r'Total\s+Bales\s*:\s*(\d+)(?=\s|$)', text)
        if bales_match:
            try:
                bales = int(bales_match.group(1).strip())
            except ValueError:
                bales = None

        return {
            "vessel": self._search_group(self._VESSEL_PATTERN, text),
            # Bill of lading number only (uppercase letters and digits).
            "bl_no": self._search_group(
                r'B/L\s+No\.\s*:\s*([A-Z0-9]+)(?=\s|$)', text),
            "bl_date": self._search_group(
                r'B/L\s+Date\s*:\s*(\d{1,2}-[A-Za-z]{3}-\d{4})(?=\s|$)', text),
            "port_loading": None,
            # Destination, stopping before a following "Tare" label.
            "port_destination": self._search_group(
                r'Destination\s*:\s*([A-Z,\s]+?)(?=\s*(?:Tare|$))', text),
            "arrival_date": self._search_group(
                r'Arrival\s+Date\s*:\s*(\d{1,2}-[A-Za-z]{3}-\d{4})(?=\s|$)', text),
            "weighing_place": None,
            "weighing_method": self._search_group(
                r'Weighing\s+method\s*:\s*([^\n]+?)(?=\s*(?:Tare|$))', text),
            "bales": bales,
        }

    def _extract_weights_info(self, text):
        """Return landed/invoice weights and the weight loss.

        Only losses written with an explicit minus sign are recognised; they
        are stored as negative numbers in ``gain_loss_kg`` and
        ``gain_loss_percent``.
        """
        loss_kg = self._search_float(r'LOSS\s*:\s*-\s*([\d.,]+)\s*kg', text)
        loss_percent = self._search_float(r'Percentage\s*:\s*-\s*([\d.,]+)%', text)

        return {
            # First "Gross" figure after the "LANDED WEIGHTS" heading.
            "gross_landed_kg": self._search_float(
                r'LANDED WEIGHTS[\s\S]*?Gross\s*:\s*([\d.,]+)\s*kg', text),
            # First "Tare : <number> kg" anywhere in the text.
            "tare_kg": self._search_float(r'Tare\s*:\s*([\d.,]+)\s*kg', text),
            # First "Net" figure after the "LANDED WEIGHTS" heading.
            "net_landed_kg": self._search_float(
                r'LANDED WEIGHTS[\s\S]*?Net\s*:\s*([\d.,]+)\s*kg', text),
            # First "Net" figure after the "INVOICE WEIGHTS" heading.
            "invoice_net_kg": self._search_float(
                r'INVOICE WEIGHTS[\s\S]*?Net\s*:\s*([\d.,]+)\s*kg', text),
            "gain_loss_kg": None if loss_kg is None else -loss_kg,
            "gain_loss_percent": None if loss_percent is None else -loss_percent,
        }

import re

class IntertekParser:
    """Extract structured fields from the text of an Intertek landing report.

    Every field is located with a regular expression applied to the full
    report text; a field whose pattern does not match is reported as ``None``.

    NOTE(review): several patterns use ``$`` inside a lookahead without
    ``re.MULTILINE``, so that alternative only matches at the very end of the
    text, not at the end of each line -- confirm this is intended.
    """

    lab = "Intertek"

    # Vessel name only; shared by the "carrier" and "vessel" fields.
    _VESSEL_PATTERN = r'Vessel\s*:\s*([A-Z\s]+?)(?=\s*(?:Arrival|$))'

    def _clean_value(self, value):
        """Return ``value`` without surrounding whitespace; falsy values pass through."""
        if value:
            return value.strip()
        return value

    def _search_group(self, pattern, text):
        """Return the cleaned first capture group of ``pattern``, or ``None`` if no match."""
        match = re.search(pattern, text)
        if match is None:
            return None
        return self._clean_value(match.group(1))

    def _extract_number(self, text, pattern, is_int=False):
        """Return the first capture group of ``pattern`` as a number, or ``None``.

        Commas and spaces are removed before conversion. The result is an
        ``int`` when ``is_int`` is true and a ``float`` otherwise; a value
        that cannot be converted yields ``None``.
        """
        match = re.search(pattern, text)
        if match:
            try:
                num_str = match.group(1).replace(',', '').replace(' ', '').strip()
                if is_int:
                    return int(num_str)
                else:
                    return float(num_str)
            except (ValueError, AttributeError):
                return None
        return None

    def parse(self, text):
        """Parse ``text`` and return the structured report dictionary."""
        result = {
            "lab": self.lab,
            "report": self._extract_report_info(text),
            "contract": self._extract_contract_info(text),
            "parties": self._extract_parties_info(text),
            "shipment": self._extract_shipment_info(text),
            "weights": self._extract_weights_info(text)
        }
        return result

    def _extract_report_info(self, text):
        """Return the global reference, file number and report date."""
        return {
            "reference": self._search_group(
                r'Global Ref\s*:\s*(GLO-\d+-[A-Z]+)', text),
            "file_no": self._search_group(
                r'Report\s*/\s*File No\s*:\s*([A-Z]+-AGR\d+-?)', text),
            # Date in dotted form, e.g. "05.03.2024".
            "date": self._search_group(
                r'Dated\s*:\s*(\d{1,2}\.\d{1,2}\.\d{4})', text),
        }

    def _extract_contract_info(self, text):
        """Return contract number, invoice number, origin and commodity.

        ``lc_no`` is never populated by this parser.
        """
        contract_info = {
            "contract_no": self._search_group(
                r'Contract No\s*:\s*([A-Z]?-\d+)', text),
            "invoice_no": self._search_group(r'Invoice No\s*:\s*(\d+)', text),
            "lc_no": None,
            "origin": None,
            "commodity": None
        }

        # Origin and commodity are only set for Greek growth.
        growth_match = re.search(r'Growth\s*:\s*([A-Z\s]+)(?=\s*Shipper|\n|$)', text)
        if growth_match:
            origin_text = growth_match.group(1).strip()
            if "GREECE" in origin_text.upper():
                contract_info["origin"] = "GREECE"
                contract_info["commodity"] = "RAW COTTON"

        return contract_info

    def _extract_parties_info(self, text):
        """Return seller (the "Shipper" field), buyer and carrier (vessel name)."""
        return {
            "seller": self._search_group(
                r'Shipper\s*:\s*([^\n]+?)(?=\s*(?:Buyer|$))', text),
            "buyer": self._search_group(
                r'Buyer\s*:\s*([^\n]+?)(?=\s*(?:CONTAINER|TOTAL|$))', text),
            "carrier": self._search_group(self._VESSEL_PATTERN, text),
        }

    def _extract_shipment_info(self, text):
        """Return the shipment details.

        ``bl_date``, ``port_loading`` and ``port_destination`` are never
        populated by this parser and are always ``None``.
        """
        # The weighing method is a fixed label, set when the remarks section
        # (text between "REMARKS" and "ISSUED BY") mentions a weighbridge.
        weighing_method = None
        remarks_section = re.search(
            r'REMARKS\s*(.+?)(?=ISSUED BY|$)', text, re.DOTALL | re.IGNORECASE)
        if remarks_section and "weighbridge" in remarks_section.group(1).lower():
            weighing_method = "Weighbridge weighing by empty/full truck"

        # Bale count: first from the TOTAL row, otherwise from "Invoice Quantity".
        bales = None
        bales_match = re.search(r'TOTAL\s+(\d{1,4}(?:,\d{3})?)\s+[\d,]+\.\d{2}', text)
        if not bales_match:
            bales_match = re.search(r'Invoice Quantity\s*:\s*(\d+)\s+Bales', text)
        if bales_match:
            try:
                bales = int(bales_match.group(1).replace(',', '').strip())
            except ValueError:
                bales = None

        return {
            "vessel": self._search_group(self._VESSEL_PATTERN, text),
            "bl_no": self._search_group(r'B/L\s+No\.\s*:\s*([A-Z0-9]+)', text),
            "bl_date": None,
            "port_loading": None,
            "port_destination": None,
            "arrival_date": self._search_group(
                r'Arrival Date\s*:\s*(\d{1,2}\.\d{1,2}\.\d{4})', text),
            "weighing_place": self._search_group(
                r'Weighed at\s*:\s*([^\n]+?)(?=\s*(?:Vessel|$))', text),
            "weighing_method": weighing_method,
            "bales": bales,
        }

    def _extract_weights_info(self, text):
        """Return landed/invoice weights and the weight gain.

        The "Kgs" unit is matched case-insensitively. Only a "Gain" line is
        recognised for ``gain_loss_kg``. The percentage is read from the
        TOTAL row, whose bale count may contain thousands separators (the
        same form the bale-count extraction accepts).
        """
        return {
            "gross_landed_kg": self._extract_number(
                text, r'Gross Landed Weight\s*:\s*([\d,]+\.\d{2})\s*(?i:kgs)'),
            "tare_kg": self._extract_number(
                text, r'Invoice Tare\s*:\s*([\d,]+\.\d{2})\s*(?i:kgs)'),
            "net_landed_kg": self._extract_number(
                text, r'Net Landed Weight\s*:\s*([\d,]+\.\d{2})\s*(?i:kgs)'),
            "invoice_net_kg": self._extract_number(
                text, r'Net Invoice Weight\s*:\s*([\d,]+\.\d{2})\s*(?i:kgs)'),
            "gain_loss_kg": self._extract_number(
                text, r'Gain\s+([\d,]+\.\d{2})\s*(?i:kgs)'),
            "gain_loss_percent": self._extract_number(
                text, r'TOTAL\s+\d+(?:,\d{3})*\s+[\d,]+\.\d{2}\s+([\d.]+)%'),
        }

# Configure the root logger explicitly: INFO level, written both to the
# rotating log file and to a stream handler (stderr by default).
root = logging.getLogger()
root.setLevel(logging.INFO)
root.addHandler(file_handler)
root.addHandler(logging.StreamHandler())

# Module-level logger; it has no handlers of its own, so its records
# propagate to the root logger configured above.
logger = logging.getLogger(__name__)

# FastAPI application on which the endpoints below are registered.
app = FastAPI()
logger.info("Loading models...")

# Models are loaded once, at import time. `predictor` is used by the
# /structure endpoint; `nlp` is not referenced elsewhere in the code shown here.
nlp = spacy.load("en_core_web_sm")
predictor = ocr_predictor(pretrained=True)

logger.info("Models loaded successfully.")

# =============================
# 🧠 Smart OCR
# =============================
# @app.post("/ocr")
# async def ocr(file: UploadFile):
#     logger.info(f"Received OCR request: {file.filename}")
#     try:
#         file_data = await file.read()
#         ext = file.filename.lower()

#         # --------- PDF with native text ---------
#         if ext.endswith(".pdf"):
#             logger.info("PDF detected → Extracting native text first")
#             reader = PdfReader(io.BytesIO(file_data))
#             direct_text = "".join(
#                 page.extract_text() or "" for page in reader.pages
#             )

#             if direct_text.strip():
#                 logger.info("Native PDF text found → No OCR needed")
#                 return {"ocr_text": direct_text}

#             # -------- Fallback: scanned PDF OCR --------
#             logger.info("No native text → PDF treated as scanned → OCR")
#             from pdf2image import convert_from_bytes
#             images = convert_from_bytes(file_data)
#             text = ""
#             for i, img in enumerate(images):
#                 logger.info(f"OCR page {i+1}/{len(images)}")
#                 text += pytesseract.image_to_string(img) + "\n"

#             return {"ocr_text": text}

#         # --------- Image file OCR ---------
#         logger.info("Image detected → Running OCR")
#         img = Image.open(io.BytesIO(file_data))
#         text = pytesseract.image_to_string(img)
#         return {"ocr_text": text}

#     except Exception as e:
#         logger.error(f"OCR failed: {e}", exc_info=True)
#         raise HTTPException(status_code=500, detail=str(e))
@app.post("/ocr")
async def ocr(file: UploadFile):
    """Extract the text layer of an uploaded PDF with pdfplumber.

    Returns ``{"ocr_text": <text>}`` where the text of every page that has
    any is joined with newlines. If the file cannot be read or processed,
    returns ``{"error": <message>, "success": False}`` instead of falling
    through with no response body.

    NOTE: despite the route name, no optical character recognition is run
    here; a PDF without a text layer yields an empty ``ocr_text``. An image
    based fallback (pdf2image + pytesseract) was sketched previously but is
    not implemented.
    """
    logger.info(f"Smart OCR request: {file.filename}")

    try:
        file_data = await file.read()

        with pdfplumber.open(io.BytesIO(file_data)) as pdf:
            page_texts = [
                page.extract_text(x_tolerance=2, y_tolerance=2)
                for page in pdf.pages
            ]

        # Pages without extractable text return None/"" and are skipped.
        combined_text = "\n".join(text for text in page_texts if text)
        return {"ocr_text": combined_text}

    except Exception as e:
        logger.error(f"Smart OCR failed: {e}", exc_info=True)
        return {
            "error": str(e),
            "success": False
        }
# =============================
# 🧱 Structure / Layout
# =============================
@app.post("/structure")
async def structure(file: UploadFile):
    """Run the docTR OCR predictor on an uploaded PDF or image.

    A file whose name ends with ".pdf" is loaded as a PDF; anything else is
    treated as an image. Returns ``{"structure": <str(result)>}``.

    Raises:
        HTTPException: status 500 when loading or prediction fails.
    """
    logger.info(f"Received structure request: {file.filename}")
    try:
        file_data = await file.read()
        # The filename may be missing on an upload; treat that as "not a PDF".
        filename = (file.filename or "").lower()

        if filename.endswith(".pdf"):
            doc = DocumentFile.from_pdf(file_data)
            logger.info(f"Structure prediction on PDF ({len(doc)} pages)")
        else:
            # DocumentFile.from_images takes paths or raw bytes, not PIL
            # images. Decode with PIL (normalising to RGB), then hand the
            # image over as PNG-encoded bytes.
            img = Image.open(io.BytesIO(file_data)).convert("RGB")
            png_buffer = io.BytesIO()
            img.save(png_buffer, format="PNG")
            doc = DocumentFile.from_images([png_buffer.getvalue()])
            logger.info("Structure prediction on image")

        res = predictor(doc)
        return {"structure": str(res)}

    except Exception as e:
        logger.error(f"Structure extraction failed: {e}", exc_info=True)
        raise HTTPException(status_code=500, detail=str(e))

# =============================
# 📊 Tables extraction (PDF only)
# =============================
@app.post("/tables")
async def tables(file: UploadFile):
    """Extract tables from an uploaded PDF with camelot (default options).

    Returns ``{"tables": [...]}`` with one ``DataFrame.to_dict()`` result per
    table found.

    Raises:
        HTTPException: status 500 when reading or extraction fails.
    """
    logger.info(f"Received table extraction request: {file.filename}")
    try:
        pdf_stream = io.BytesIO(await file.read())
        found = camelot.read_pdf(pdf_stream)
        logger.info(f"Found {len(found)} tables")
        return {"tables": [table.df.to_dict() for table in found]}

    except Exception as e:
        logger.error(f"Table extraction failed: {e}", exc_info=True)
        raise HTTPException(status_code=500, detail=str(e))

def safe_search(pattern, text, default=None, group_index=1, context=""):
    """Search ``text`` for ``pattern`` and return one stripped capture group.

    The search is case-insensitive and ``.`` also matches newlines.

    Args:
        pattern: Regular expression to search for.
        text: Text to search; ``None`` is treated as "no match".
        default: Value returned when nothing usable is found.
        group_index: Index (or name) of the capture group to return.
        context: Short label included in the warning log messages.

    Returns:
        The stripped group text, or ``default`` when the pattern does not
        match, the group does not exist, or the group did not participate
        in the match. Each of these cases is logged as a warning.
    """
    match = re.search(pattern, text, re.I | re.S) if text is not None else None
    if match is None:
        logger.warning("Pattern not found for %s: %s", context, pattern)
        return default
    try:
        value = match.group(group_index)
    except IndexError:
        logger.warning("Group index %s not found for %s: %s", group_index, context, pattern)
        return default
    if value is None:
        # An optional group that took no part in the match yields None.
        logger.warning("Group index %s is empty for %s: %s", group_index, context, pattern)
        return default
    return value.strip()

def to_float(s):
    """Convert a weight or percentage string such as "1,234.50 Kgs" to a float.

    Thousands-separator commas, the ``%`` sign and the units kg/kgs/lb/lbs
    (in any letter case) are removed before conversion.

    Args:
        s: The value to convert; normally a string.

    Returns:
        The parsed float, or ``None`` when ``s`` is falsy or cannot be
        converted.
    """
    if not s:
        return None
    if isinstance(s, str):
        # One case-insensitive pass, so "kgs" does not leave a stray "s"
        # behind the way sequential case-sensitive replaces did.
        s = re.sub(r"(?i)kgs?|lbs?|[,%]", "", s).strip()
    try:
        return float(s)
    except (TypeError, ValueError):
        return None

def section(text, start, end=None):
    """Return the stripped text that follows the ``start`` heading.

    With ``end`` given, the result stops before the first ``end`` heading
    after ``start``; otherwise it runs to the end of ``text``. Headings are
    matched literally and case-insensitively and are not part of the result.
    When nothing matches, a warning is logged and "" is returned.
    """
    head = re.escape(start)
    if end:
        tail = r"(.*?)" + re.escape(end)
    else:
        tail = r"(.*)"

    found = re.search(head + tail, text, re.S | re.I)
    if found is None:
        logger.warning("Section not found: start='%s', end='%s'", start, end)
        return ""
    return found.group(1).strip()

def extract_field(text, label, default=None):
    """Return the rest of the line following ``label`` (optionally "label:").

    The label is matched literally; at least one whitespace character must
    separate it (or its colon) from the value. The lookup is delegated to
    ``safe_search``, which returns ``default`` when nothing is found.
    """
    return safe_search(
        rf"{re.escape(label)}\s*:?[\s]+([^\n]+)",
        text,
        default=default,
        context=f"field '{label}'",
    )

def extract(label, text, default=None):
    """Return the remainder of the line that follows ``label`` in ``text``.

    The label is matched literally and case-insensitively and may be
    followed by optional whitespace and a single ":" or "-" separator.
    Returns ``default`` when ``text`` is empty or the label is not found.
    """
    if not text:
        return default

    escaped = re.escape(label)
    candidates = (
        rf"{escaped}\s*[:\-]?\s*([^\n\r]+)",
        rf"{escaped}\s+([^\n\r]+)",
    )

    # Patterns are tried lazily, in order; the first hit wins.
    hits = (re.search(candidate, text, re.I) for candidate in candidates)
    return next((hit.group(1).strip() for hit in hits if hit), default)

def extract_report_metadata(text):
    """Extract report metadata from landing-report text as nested dicts.

    The text is cut into sections with ``section`` and individual
    "Label: value" lines are read with ``extract_field``; values are returned
    as raw strings (``None`` or "" when not found).

    Args:
        text: Full text of the report.

    Returns:
        A dict with the keys "report", "parties", "shipment", "weights"
        (sub-keys "invoice", "landed", "loss", "averages") and "signature".

    Raises:
        HTTPException: status 500 when an unexpected error occurs.
    """
    logger.info("Starting metadata extraction, text length=%d", len(text))

    try:
        # ----------- SECTIONS -----------
        # NOTE: section() returns the text *after* its start heading, so the
        # heading itself is not part of the returned block.
        order_details = section(text, "Order details", "Weights")
        invoice_section = section(text, "INVOICE WEIGHTS", "Bales Weighed")
        landed_section = section(text, "Bales Weighed", "Outturn")
        loss_section = section(text, "LOSS", "Invoice average")
        avg_section = section(text, "Invoice average", "Comments")
        signature_block = section(text, "Signed on")

        # ----------- TOP INFO -----------
        top_info = {
            "produced_on": extract_field(text, "Produced On"),
            "printed_date": extract_field(text, "Printed Date"),
            "client_reference": extract_field(text, "Client Reference"),
            "report_number": safe_search(r"(AHK\S+)", text, default="", context="report_number", group_index=1),
        }

        # ----------- ORDER DETAILS -----------
        parties = {
            "client": extract_field(order_details, "Client"),
            "client_ref_no": extract_field(order_details, "Client Ref No"),
            "buyer": extract_field(order_details, "Buyer"),
            "destination": extract_field(order_details, "Destination"),
        }

        shipment = {
            "total_bales": extract_field(order_details, "Total Bales"),
            "vessel": extract_field(order_details, "Vessel"),
            "voyage_no": extract_field(order_details, "Voy. No"),
            "bl_no": extract_field(order_details, "B/L No"),
            "bl_date": extract_field(order_details, "B/L Date"),
            "growth": extract_field(order_details, "Growth"),
            "arrival_date": extract_field(order_details, "Arrival Date"),
            "first_weighing_date": extract_field(order_details, "First date of weighing"),
            "last_weighing_date": extract_field(order_details, "Last Date of Weighing"),
            "weighing_method": extract_field(order_details, "Weighing method"),
            "tare_basis": extract_field(order_details, "Tare"),
        }

        # ----------- INVOICE SECTION -----------
        invoice = {
            "bales": extract_field(invoice_section, "Bales"),
            "gross": extract_field(invoice_section, "Gross"),
            "tare": extract_field(invoice_section, "Tare"),
            "net": extract_field(invoice_section, "Net"),
        }

        # ----------- LANDED SECTION -----------
        landed = {
            "bales": extract_field(landed_section, "Bales"),
            "gross": extract_field(landed_section, "Gross"),
            "tare": extract_field(landed_section, "Tare"),
            "net": extract_field(landed_section, "Net"),
        }

        # ----------- LOSS SECTION -----------
        loss = {
            "kg": extract_field(loss_section, "kg"),
            "lb": extract_field(loss_section, "lb"),
            "percent": extract_field(loss_section, "Percentage"),
        }

        # ----------- AVERAGES SECTION -----------
        averages = {
            # "Invoice average" is the start heading of avg_section and is
            # therefore absent from it; read this field from the full text.
            "invoice_gross_per_bale": extract_field(text, "Invoice average"),
            "landed_gross_per_bale": extract_field(avg_section, "Landed average"),
        }

        # ----------- SIGNATURE -----------
        signature = {
            # Same reason as above: "Signed on" is the start heading of
            # signature_block, so it has to be looked up in the full text.
            "signed_on": extract_field(text, "Signed on"),
            "signed_by": safe_search(r"\n([A-Za-z ]+)\nClient Services", signature_block, default="", context="signed_by"),
            "role": "Client Services Coordinator",
            "company": "Alfred H. Knight International Limited"
        }

        logger.info("Metadata extraction completed successfully")
        return {
            "report": top_info,
            "parties": parties,
            "shipment": shipment,
            "weights": {
                "invoice": invoice,
                "landed": landed,
                "loss": loss,
                "averages": averages
            },
            "signature": signature
        }

    except Exception as e:
        logger.exception("Unexpected error during metadata extraction")
        raise HTTPException(status_code=500, detail=f"Metadata extraction failed: {e}") from e

def detect_template(text):
    """Identify which lab's report layout ``text`` belongs to.

    The text is lower-cased and checked against marker phrases, rule by
    rule in a fixed order; the first rule that is satisfied wins. Returns
    "AHK", "INTERTEK", "ROBERTSON", "SGS", "PICL", or "UNKNOWN" when no
    rule applies.
    """
    haystack = text.lower()

    # (template name, combinator, marker phrases): `all` requires every
    # phrase to be present, `any` requires at least one.
    rules = (
        ("AHK", all, ("alfred h. knight", "cotton landing report")),
        ("INTERTEK", all, ("intertek", "landing report")),
        ("ROBERTSON", any, ("robertson international", "ri ref no")),
        ("SGS", all, ("landing report", "carcon cargo")),
        ("PICL", any, ("pacific inspection company", "picl-bd.com")),
    )

    for name, combine, phrases in rules:
        if combine(phrase in haystack for phrase in phrases):
            return name

    return "UNKNOWN"

@app.post("/metadata")
async def metadata(text: str = Body(..., embed=True)):
    """Return the metadata extracted from the ``text`` field of the JSON body."""
    extracted = extract_report_metadata(text)
    return extracted

@app.post("/parse")
async def parse_endpoint(text: str = Body(..., embed=True)):
    """Return the structured report parsed from the ``text`` field of the JSON body."""
    parsed = parse_report(text)
    return parsed

# Parser instances keyed by the template name that detect_template() returns.
# Templates without an entry here are not parsed by parse_report().
PARSERS = dict(
    AHK=AHKParser(),
    INTERTEK=IntertekParser(),
)

def empty_weight_report(lab):
    """Return a weight-report skeleton for ``lab`` with every field set to None.

    The layout mirrors the dictionaries produced by the parsers' ``parse``
    methods: "lab", "report", "contract", "parties", "shipment", "weights".
    """
    def blank(*keys):
        # Fresh dict mapping each key to None, in the given order.
        return dict.fromkeys(keys)

    return {
        "lab": lab,
        "report": blank("reference", "file_no", "date"),
        "contract": blank("contract_no", "invoice_no", "lc_no", "origin", "commodity"),
        "parties": blank("seller", "buyer", "carrier"),
        "shipment": blank(
            "vessel", "bl_no", "bl_date", "port_loading",
            "port_destination", "arrival_date",
            "weighing_place", "weighing_method",
            "bales",
        ),
        "weights": blank(
            "gross_landed_kg", "tare_kg",
            "net_landed_kg", "invoice_net_kg",
            "gain_loss_kg", "gain_loss_percent",
        ),
    }

def parse_report(text):
    """Parse ``text`` with the parser registered for its detected template.

    Returns the parser's structured dictionary, or ``{"template": "UNKNOWN"}``
    when no parser is registered for the detected template.
    """
    parser = PARSERS.get(detect_template(text))
    if parser is None:
        return {"template":"UNKNOWN"}
    return parser.parse(text)