Files
automation-service/app.py
2026-01-13 16:00:29 +01:00

888 lines
33 KiB
Python

from fastapi import FastAPI, UploadFile, HTTPException, Body
from PIL import Image
import pytesseract
from doctr.models import ocr_predictor
from doctr.io import DocumentFile
from PyPDF2 import PdfReader
import pdfplumber
import camelot
import spacy
import logging
import io
from logging.handlers import RotatingFileHandler
import re
from datetime import datetime
from io import BytesIO
import requests
# Rotating log file: 10 MiB per file, keep 5 rotated backups.
LOG_PATH = "/var/log/automation-service.log"
_LOG_FORMAT = "%(asctime)s - %(levelname)s - %(name)s - %(message)s"

file_handler = RotatingFileHandler(
    LOG_PATH, maxBytes=10 * 1024 * 1024, backupCount=5, encoding="utf-8"
)
file_handler.setFormatter(logging.Formatter(_LOG_FORMAT))
class AHKParser:
    """Parser for Alfred H. Knight (AHK) cotton landing reports.

    parse() runs a set of targeted regexes over the raw report text and
    returns a nested dict with report / contract / parties / shipment /
    weights sub-sections. Fields that cannot be found stay None.
    """

    lab = "AHK"

    def _clean_value(self, value):
        """Strip surrounding whitespace; falsy values pass through unchanged."""
        if value:
            return value.strip()
        return value

    def _to_float(self, raw):
        """Parse a matched number string (thousands commas allowed) to float.

        Returns None when the string is not a valid number, mirroring the
        previously duplicated inline try/except blocks this helper replaces.
        """
        try:
            return float(raw.replace(',', '').strip())
        except ValueError:
            return None

    def parse(self, text):
        """Parse the report text and return the structured result dict."""
        result = {
            "lab": self.lab,
            "report": self._extract_report_info(text),
            "contract": self._extract_contract_info(text),
            "parties": self._extract_parties_info(text),
            "shipment": self._extract_shipment_info(text),
            "weights": self._extract_weights_info(text)
        }
        # Kept for backward compatibility: callers may read .data afterwards.
        self.data = result
        return result

    def _extract_report_info(self, text):
        """Extract report-level identifiers (reference, file number, date)."""
        report_info = {
            "reference": None,
            "file_no": None,
            "date": None
        }
        # Client reference, e.g. "S-3488 / INV 4013"
        ref_match = re.search(r'Client\s+Reference:\s*(S-\d+\s*/\s*INV\s*\d+)', text)
        if ref_match:
            report_info["reference"] = self._clean_value(ref_match.group(1))
        # AHK file number, e.g. "AHK S/2024/001" -> "2024/001"
        file_no_match = re.search(r'AHK\s+S/([\w/]+)', text)
        if file_no_match:
            report_info["file_no"] = self._clean_value(file_no_match.group(1))
        # Report signature date, e.g. "Signed on 05-Jan-2024"
        date_match = re.search(r'Signed\s+on\s*(\d{1,2}-[A-Za-z]{3}-\d{4})', text)
        if date_match:
            report_info["date"] = self._clean_value(date_match.group(1))
        return report_info

    def _extract_contract_info(self, text):
        """Extract contract number, invoice number, origin and commodity."""
        contract_info = {
            "contract_no": None,
            "invoice_no": None,
            "lc_no": None,
            "origin": None,
            "commodity": None
        }
        # Client reference line, e.g. "S-3488 / INV 4013"
        ref_match = re.search(r'Client\s+Ref\s+No\.\s*:\s*([^\n]+)', text)
        if ref_match:
            ref_text = ref_match.group(1).strip()
            # BUG FIX: the old split on '[/\s]+' truncated the invoice number
            # to just "INV". Split on '/' only so "INV 4013" survives intact.
            parts = [p.strip() for p in ref_text.split('/')]
            for part in parts:
                if part.startswith('S-'):
                    contract_info["contract_no"] = part
                elif part.startswith('INV'):
                    contract_info["invoice_no"] = part
        # Origin / commodity derived from the "Growth" field
        growth_match = re.search(r'Growth\s*:\s*([A-Z\s]+?)(?=\s*(?:Vessel|$))', text)
        if growth_match:
            origin_text = growth_match.group(1).strip()
            if "AUSTRALIAN" in origin_text.upper():
                contract_info["origin"] = "AUSTRALIA"
                contract_info["commodity"] = "RAW COTTON"
        return contract_info

    def _extract_parties_info(self, text):
        """Extract seller (client), buyer and carrier (vessel name)."""
        parties_info = {
            "seller": None,
            "buyer": None,
            "carrier": None
        }
        # Seller = "Client :" line (stops before the next label)
        seller_match = re.search(r'Client\s*:\s*([^\n:]+?)(?=\s*(?:Client Ref|Buyer|$))', text)
        if seller_match:
            parties_info["seller"] = self._clean_value(seller_match.group(1))
        # Buyer (stops before the "Total Bales" label)
        buyer_match = re.search(r'Buyer\s*:\s*([^\n:]+?)(?=\s*(?:Total Bales|$))', text)
        if buyer_match:
            parties_info["buyer"] = self._clean_value(buyer_match.group(1))
        # Carrier = vessel name only
        vessel_match = re.search(r'Vessel\s*:\s*([A-Z\s]+?)(?=\s*(?:Arrival|Voy|$))', text)
        if vessel_match:
            parties_info["carrier"] = self._clean_value(vessel_match.group(1))
        return parties_info

    def _extract_shipment_info(self, text):
        """Extract shipment data (vessel, B/L, ports, dates, bale count)."""
        shipment_info = {
            "vessel": None,
            "bl_no": None,
            "bl_date": None,
            "port_loading": None,
            "port_destination": None,
            "arrival_date": None,
            "weighing_place": None,
            "weighing_method": None,
            "bales": None
        }
        # Vessel name only
        vessel_match = re.search(r'Vessel\s*:\s*([A-Z\s]+?)(?=\s*(?:Arrival|Voy|$))', text)
        if vessel_match:
            shipment_info["vessel"] = self._clean_value(vessel_match.group(1))
        # Bill-of-lading number (just the alphanumeric code)
        bl_no_match = re.search(r'B/L\s+No\.\s*:\s*([A-Z0-9]+)(?=\s|$)', text)
        if bl_no_match:
            shipment_info["bl_no"] = self._clean_value(bl_no_match.group(1))
        # Bill-of-lading date
        bl_date_match = re.search(r'B/L\s+Date\s*:\s*(\d{1,2}-[A-Za-z]{3}-\d{4})(?=\s|$)', text)
        if bl_date_match:
            shipment_info["bl_date"] = self._clean_value(bl_date_match.group(1))
        # Destination port (stops before the "Tare" label)
        dest_match = re.search(r'Destination\s*:\s*([A-Z,\s]+?)(?=\s*(?:Tare|$))', text)
        if dest_match:
            shipment_info["port_destination"] = self._clean_value(dest_match.group(1))
        # Arrival date
        arrival_match = re.search(r'Arrival\s+Date\s*:\s*(\d{1,2}-[A-Za-z]{3}-\d{4})(?=\s|$)', text)
        if arrival_match:
            shipment_info["arrival_date"] = self._clean_value(arrival_match.group(1))
        # Weighing method
        weighing_method_match = re.search(r'Weighing\s+method\s*:\s*([^\n]+?)(?=\s*(?:Tare|$))', text)
        if weighing_method_match:
            shipment_info["weighing_method"] = self._clean_value(weighing_method_match.group(1))
        # Number of bales
        bales_match = re.search(r'Total\s+Bales\s*:\s*(\d+)(?=\s|$)', text)
        if bales_match:
            try:
                shipment_info["bales"] = int(bales_match.group(1).strip())
            except ValueError:
                shipment_info["bales"] = None
        return shipment_info

    def _extract_weights_info(self, text):
        """Extract landed/invoice weights and the gain/loss figures (kg)."""
        weights_info = {
            "gross_landed_kg": None,
            "tare_kg": None,
            "net_landed_kg": None,
            "invoice_net_kg": None,
            "gain_loss_kg": None,
            "gain_loss_percent": None
        }
        # Gross landed weight (from the LANDED WEIGHTS section)
        gross_landed_match = re.search(r'LANDED WEIGHTS[\s\S]*?Gross\s*:\s*([\d.,]+)\s*kg', text)
        if gross_landed_match:
            weights_info["gross_landed_kg"] = self._to_float(gross_landed_match.group(1))
        # Tare weight (first "Tare : N kg" occurrence)
        tare_match = re.search(r'Tare\s*:\s*([\d.,]+)\s*kg', text)
        if tare_match:
            weights_info["tare_kg"] = self._to_float(tare_match.group(1))
        # Net landed weight (from the LANDED WEIGHTS section)
        net_landed_match = re.search(r'LANDED WEIGHTS[\s\S]*?Net\s*:\s*([\d.,]+)\s*kg', text)
        if net_landed_match:
            weights_info["net_landed_kg"] = self._to_float(net_landed_match.group(1))
        # Invoice net weight (from the INVOICE WEIGHTS section)
        invoice_net_match = re.search(r'INVOICE WEIGHTS[\s\S]*?Net\s*:\s*([\d.,]+)\s*kg', text)
        if invoice_net_match:
            weights_info["invoice_net_kg"] = self._to_float(invoice_net_match.group(1))
        # Loss in kg (reported as a positive number, stored negative)
        loss_match = re.search(r'LOSS\s*:\s*-\s*([\d.,]+)\s*kg', text)
        if loss_match:
            loss_kg = self._to_float(loss_match.group(1))
            if loss_kg is not None:
                weights_info["gain_loss_kg"] = -loss_kg
        # Loss percentage (reported positive, stored negative)
        percent_match = re.search(r'Percentage\s*:\s*-\s*([\d.,]+)%', text)
        if percent_match:
            loss_pct = self._to_float(percent_match.group(1))
            if loss_pct is not None:
                weights_info["gain_loss_percent"] = -loss_pct
        return weights_info
import re
class IntertekParser:
    """Parser for Intertek landing reports.

    Same output contract as AHKParser.parse(): a nested dict with report /
    contract / parties / shipment / weights sub-sections, regex-extracted
    from the raw report text. Missing fields stay None.
    """

    lab = "Intertek"

    def _clean_value(self, value):
        """Strip surrounding whitespace; falsy values pass through unchanged."""
        if value:
            return value.strip()
        return value

    def _extract_number(self, text, pattern, is_int=False):
        """Extract group 1 of *pattern* from *text* as int or float.

        Commas and spaces are removed before conversion; returns None when
        the pattern does not match or the value is not numeric.
        """
        match = re.search(pattern, text)
        if match:
            try:
                num_str = match.group(1).replace(',', '').replace(' ', '').strip()
                if is_int:
                    return int(num_str)
                else:
                    return float(num_str)
            except (ValueError, AttributeError):
                return None
        return None

    def parse(self, text):
        """Parse the report text and return the structured result dict."""
        result = {
            "lab": self.lab,
            "report": self._extract_report_info(text),
            "contract": self._extract_contract_info(text),
            "parties": self._extract_parties_info(text),
            "shipment": self._extract_shipment_info(text),
            "weights": self._extract_weights_info(text)
        }
        return result

    def _extract_report_info(self, text):
        """Extract report-level identifiers (global ref, file number, date)."""
        report_info = {
            "reference": None,
            "file_no": None,
            "date": None
        }
        # Global reference, e.g. "GLO-12345-GR"
        ref_match = re.search(r'Global Ref\s*:\s*(GLO-\d+-[A-Z]+)', text)
        if ref_match:
            report_info["reference"] = self._clean_value(ref_match.group(1))
        # Report / file number
        file_no_match = re.search(r'Report\s*/\s*File No\s*:\s*([A-Z]+-AGR\d+-?)', text)
        if file_no_match:
            report_info["file_no"] = self._clean_value(file_no_match.group(1))
        # Report date in dotted format, e.g. "15.1.2024"
        date_match = re.search(r'Dated\s*:\s*(\d{1,2}\.\d{1,2}\.\d{4})', text)
        if date_match:
            report_info["date"] = self._clean_value(date_match.group(1))
        return report_info

    def _extract_contract_info(self, text):
        """Extract contract/invoice numbers plus origin and commodity."""
        contract_info = {
            "contract_no": None,
            "invoice_no": None,
            "lc_no": None,  # not present in this report type
            "origin": None,
            "commodity": None
        }
        # Contract number, e.g. "S-3488"
        contract_match = re.search(r'Contract No\s*:\s*([A-Z]?-\d+)', text)
        if contract_match:
            contract_info["contract_no"] = self._clean_value(contract_match.group(1))
        # Invoice number (digits only)
        invoice_match = re.search(r'Invoice No\s*:\s*(\d+)', text)
        if invoice_match:
            contract_info["invoice_no"] = self._clean_value(invoice_match.group(1))
        # Origin / commodity derived from the "Growth" field
        growth_match = re.search(r'Growth\s*:\s*([A-Z\s]+)(?=\s*Shipper|\n|$)', text)
        if growth_match:
            origin_text = growth_match.group(1).strip()
            if "GREECE" in origin_text.upper():
                contract_info["origin"] = "GREECE"
                contract_info["commodity"] = "RAW COTTON"
        return contract_info

    def _extract_parties_info(self, text):
        """Extract seller (shipper), buyer and carrier (vessel name)."""
        parties_info = {
            "seller": None,
            "buyer": None,
            "carrier": None
        }
        # Seller = "Shipper :" line (stops before the "Buyer" label)
        seller_match = re.search(r'Shipper\s*:\s*([^\n]+?)(?=\s*(?:Buyer|$))', text)
        if seller_match:
            parties_info["seller"] = self._clean_value(seller_match.group(1))
        # Buyer (stops before the container/total table)
        buyer_match = re.search(r'Buyer\s*:\s*([^\n]+?)(?=\s*(?:CONTAINER|TOTAL|$))', text)
        if buyer_match:
            parties_info["buyer"] = self._clean_value(buyer_match.group(1))
        # Carrier = vessel name only
        vessel_match = re.search(r'Vessel\s*:\s*([A-Z\s]+?)(?=\s*(?:Arrival|$))', text)
        if vessel_match:
            parties_info["carrier"] = self._clean_value(vessel_match.group(1))
        return parties_info

    def _extract_shipment_info(self, text):
        """Extract shipment data (vessel, B/L, dates, weighing info, bales)."""
        shipment_info = {
            "vessel": None,
            "bl_no": None,
            "bl_date": None,           # not present in this report type
            "port_loading": None,      # not present in this report type
            "port_destination": None,  # not present in this report type
            "arrival_date": None,
            "weighing_place": None,
            "weighing_method": None,
            "bales": None
        }
        # Vessel name
        vessel_match = re.search(r'Vessel\s*:\s*([A-Z\s]+?)(?=\s*(?:Arrival|$))', text)
        if vessel_match:
            shipment_info["vessel"] = self._clean_value(vessel_match.group(1))
        # Bill-of-lading number
        bl_no_match = re.search(r'B/L\s+No\.\s*:\s*([A-Z0-9]+)', text)
        if bl_no_match:
            shipment_info["bl_no"] = self._clean_value(bl_no_match.group(1))
        # Arrival date in dotted format
        arrival_match = re.search(r'Arrival Date\s*:\s*(\d{1,2}\.\d{1,2}\.\d{4})', text)
        if arrival_match:
            shipment_info["arrival_date"] = self._clean_value(arrival_match.group(1))
        # Weighing place
        weighing_place_match = re.search(r'Weighed at\s*:\s*([^\n]+?)(?=\s*(?:Vessel|$))', text)
        if weighing_place_match:
            shipment_info["weighing_place"] = self._clean_value(weighing_place_match.group(1))
        # Weighing method is only described in the REMARKS section.
        remarks_section = re.search(r'REMARKS\s*(.+?)(?=ISSUED BY|$)', text, re.DOTALL | re.IGNORECASE)
        if remarks_section:
            remarks_text = remarks_section.group(1)
            if "weighbridge" in remarks_text.lower():
                shipment_info["weighing_method"] = "Weighbridge weighing by empty/full truck"
        # Bale count: prefer the TOTAL row of the container table, then fall
        # back to the "Invoice Quantity" line.
        bales_match = re.search(r'TOTAL\s+(\d{1,4}(?:,\d{3})?)\s+[\d,]+\.\d{2}', text)
        if not bales_match:
            bales_match = re.search(r'Invoice Quantity\s*:\s*(\d+)\s+Bales', text)
        if bales_match:
            try:
                bales_str = bales_match.group(1).replace(',', '').strip()
                shipment_info["bales"] = int(bales_str)
            except ValueError:
                shipment_info["bales"] = None
        return shipment_info

    def _extract_weights_info(self, text):
        """Extract landed/invoice weights and the gain figures (kg).

        CONSISTENCY FIX: all values are now routed through _extract_number(),
        which was previously defined but unused while the identical parsing
        logic was duplicated inline for each field.
        """
        weights_info = {
            "gross_landed_kg": self._extract_number(text, r'Gross Landed Weight\s*:\s*([\d,]+\.\d{2})\s*kgs'),
            "tare_kg": self._extract_number(text, r'Invoice Tare\s*:\s*([\d,]+\.\d{2})\s*Kgs'),
            "net_landed_kg": self._extract_number(text, r'Net Landed Weight\s*:\s*([\d,]+\.\d{2})\s*Kgs'),
            "invoice_net_kg": self._extract_number(text, r'Net Invoice Weight\s*:\s*([\d,]+\.\d{2})\s*Kgs'),
            "gain_loss_kg": self._extract_number(text, r'Gain\s+([\d,]+\.\d{2})\s*Kgs'),
            # Percentage comes from the TOTAL row of the table (e.g. "0.4%")
            "gain_loss_percent": self._extract_number(text, r'TOTAL\s+\d+\s+[\d,]+\.\d{2}\s+([\d.]+)%'),
        }
        return weights_info
# Configure root logger explicitly
root = logging.getLogger()
root.setLevel(logging.INFO)
root.addHandler(file_handler)
# Also echo log records to the console alongside the rotating file.
root.addHandler(logging.StreamHandler())
# Use root logger for your app
logger = logging.getLogger(__name__)
# FastAPI application instance used by every route decorator below.
app = FastAPI()
# Load the spaCy and doctr models once at import time so individual
# requests do not pay the (slow) model-loading cost.
logger.info("Loading models...")
nlp = spacy.load("en_core_web_sm")
predictor = ocr_predictor(pretrained=True)
logger.info("Models loaded successfully.")
# =============================
# 🧠 Smart OCR
# =============================
# @app.post("/ocr")
# async def ocr(file: UploadFile):
# logger.info(f"Received OCR request: {file.filename}")
# try:
# file_data = await file.read()
# ext = file.filename.lower()
# # --------- PDF with native text ---------
# if ext.endswith(".pdf"):
# logger.info("PDF detected → Extracting native text first")
# reader = PdfReader(io.BytesIO(file_data))
# direct_text = "".join(
# page.extract_text() or "" for page in reader.pages
# )
# if direct_text.strip():
# logger.info("Native PDF text found → No OCR needed")
# return {"ocr_text": direct_text}
# # -------- Fallback: scanned PDF OCR --------
# logger.info("No native text → PDF treated as scanned → OCR")
# from pdf2image import convert_from_bytes
# images = convert_from_bytes(file_data)
# text = ""
# for i, img in enumerate(images):
# logger.info(f"OCR page {i+1}/{len(images)}")
# text += pytesseract.image_to_string(img) + "\n"
# return {"ocr_text": text}
# # --------- Image file OCR ---------
# logger.info("Image detected → Running OCR")
# img = Image.open(io.BytesIO(file_data))
# text = pytesseract.image_to_string(img)
# return {"ocr_text": text}
# except Exception as e:
# logger.error(f"OCR failed: {e}", exc_info=True)
# raise HTTPException(status_code=500, detail=str(e))
@app.post("/ocr")
async def ocr(file: UploadFile):
    """
    Smart PDF text extraction optimized for cotton landing reports.

    Uses pdfplumber, which works well on digital PDFs with a native text
    layer, and returns {"ocr_text": ...}. Any failure returns a JSON
    error payload {"error": ..., "success": False}.

    BUG FIX: previously, when pdfplumber raised, the function logged a
    warning and then fell off the end, silently returning None (null to
    the client). The failure is now reported explicitly. The per-page
    extract_tables() scan whose results were collected but never used has
    been dropped (dead work), along with large commented-out code blocks.
    """
    logger.info(f"Smart OCR request: {file.filename}")
    try:
        file_data = await file.read()
        try:
            with pdfplumber.open(io.BytesIO(file_data)) as pdf:
                text_parts = []
                for page in pdf.pages:
                    # Small tolerances merge characters that sit on almost
                    # the same baseline/column in the PDF layout.
                    page_text = page.extract_text(x_tolerance=2, y_tolerance=2)
                    if page_text:
                        text_parts.append(page_text)
                combined_text = "\n".join(text_parts)
                return {"ocr_text": combined_text}
        except Exception as e:
            logger.warning(f"pdfplumber attempt: {e}")
            return {"error": str(e), "success": False}
    except Exception as e:
        logger.error(f"Smart OCR failed: {e}", exc_info=True)
        return {
            "error": str(e),
            "success": False
        }
# =============================
# 🧱 Structure / Layout
# =============================
@app.post("/structure")
async def structure(file: UploadFile):
    """Run doctr layout/structure prediction on an uploaded PDF or image."""
    logger.info(f"Received structure request: {file.filename}")
    try:
        payload = await file.read()
        if file.filename.lower().endswith(".pdf"):
            doc = DocumentFile.from_pdf(payload)
            logger.info(f"Structure prediction on PDF ({len(doc)} pages)")
        else:
            img = Image.open(io.BytesIO(payload)).convert("RGB")
            doc = DocumentFile.from_images([img])
            logger.info("Structure prediction on image")
        return {"structure": str(predictor(doc))}
    except Exception as e:
        logger.error(f"Structure extraction failed: {e}", exc_info=True)
        raise HTTPException(status_code=500, detail=str(e))
# =============================
# 📊 Tables extraction (PDF only)
# =============================
@app.post("/tables")
async def tables(file: UploadFile):
    """
    Extract tables from an uploaded PDF using camelot.

    BUG FIX: camelot.read_pdf() expects a filesystem path (or URL), not a
    file-like object, so passing io.BytesIO always failed. The upload is
    now spooled to a temporary .pdf file, which is removed afterwards.
    """
    logger.info(f"Received table extraction request: {file.filename}")
    try:
        # Local imports keep this fix self-contained.
        import os
        import tempfile

        file_data = await file.read()
        tmp_path = None
        try:
            with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp:
                tmp.write(file_data)
                tmp_path = tmp.name
            tables = camelot.read_pdf(tmp_path)
            logger.info(f"Found {len(tables)} tables")
            return {"tables": [t.df.to_dict() for t in tables]}
        finally:
            # Always clean up the temp file, even when camelot raises.
            if tmp_path:
                os.unlink(tmp_path)
    except Exception as e:
        logger.error(f"Table extraction failed: {e}", exc_info=True)
        raise HTTPException(status_code=500, detail=str(e))
def safe_search(pattern, text, default=None, group_index=1, context=""):
    """Case-insensitive, dotall regex search that logs instead of raising.

    Returns the stripped contents of *group_index*, or *default* when the
    pattern does not match or the group does not exist.
    """
    match = re.search(pattern, text, re.I | re.S)
    if match:
        try:
            return match.group(group_index).strip()
        except IndexError:
            logger.warning("Group index %d not found for %s: %s", group_index, context, pattern)
            return default
    logger.warning("Pattern not found for %s: %s", context, pattern)
    return default
def to_float(s):
    """Parse a report number string such as "1,234.56 Kgs" into a float.

    Strips thousands separators and common unit suffixes (Kgs/kg, lbs/LBS,
    %) before conversion. Returns None for falsy input or anything that
    still is not a valid number.

    BUG FIX: the bare ``except:`` (which also swallowed KeyboardInterrupt
    and SystemExit) is narrowed to ValueError — the only error float() can
    raise for a str input.
    """
    if not s:
        return None
    # Same removal order as before: separators first, then unit suffixes.
    for token in (",", "Kgs", "kg", "%", "lbs", "LBS"):
        s = s.replace(token, "")
    s = s.strip()
    try:
        return float(s)
    except ValueError:
        return None
def section(text, start, end=None):
    """Return the stripped text between two headings ("" when not found).

    With *end* omitted, everything after *start* is returned. Matching is
    case-insensitive and spans newlines; headings are escaped literally.
    """
    head = re.escape(start)
    tail = r"(.*?)" + re.escape(end) if end else r"(.*)"
    match = re.compile(head + tail, re.S | re.I).search(text)
    if match is None:
        logger.warning("Section not found: start='%s', end='%s'", start, end)
        return ""
    return match.group(1).strip()
def extract_field(text, label, default=None):
    """Extract the value of a 'Label: value' line (colon optional).

    Delegates to safe_search(), so a missing field is logged and *default*
    is returned instead of raising.
    """
    escaped = re.escape(label)
    return safe_search(
        rf"{escaped}\s*:?[\s]+([^\n]+)",
        text,
        default=default,
        context=f"field '{label}'",
    )
def extract(label, text, default=None):
    """
    Robust label/value extraction for OCR/PDF text.

    Handles all of:
        Label: Value
        Label Value
        Label .... Value

    Returns the stripped value, or *default* when *text* is falsy or the
    label is not found. Matching is case-insensitive.

    CLEANUP: the former second fallback pattern (label followed by
    whitespace) was dead code — every string it matched was already
    matched by the primary pattern, whose separator part
    ``\\s*[:\\-]?\\s*`` accepts plain whitespace too.
    """
    if not text:
        return default
    m = re.search(rf"{re.escape(label)}\s*[:\-]?\s*([^\n\r]+)", text, re.I)
    if m:
        return m.group(1).strip()
    return default
def extract_report_metadata(text):
    """
    Extract structured metadata from an AHK-style landing report text.

    The text is first cut into labelled sections with section(), then
    individual 'Label: value' fields are pulled from each. Values are raw
    strings (or the extractor default when missing).

    Raises:
        HTTPException: 500 on any unexpected failure.
    """
    logger.info("Starting metadata extraction, text length=%d", len(text))
    try:
        # ----------- SECTIONS -----------
        order_details = section(text, "Order details", "Weights")
        invoice_section = section(text, "INVOICE WEIGHTS", "Bales Weighed")
        landed_section = section(text, "Bales Weighed", "Outturn")
        loss_section = section(text, "LOSS", "Invoice average")
        avg_section = section(text, "Invoice average", "Comments")
        signature_block = section(text, "Signed on")
        # ----------- TOP INFO -----------
        top_info = {
            "produced_on": extract_field(text, "Produced On"),
            "printed_date": extract_field(text, "Printed Date"),
            "client_reference": extract_field(text, "Client Reference"),
            "report_number": safe_search(r"(AHK\S+)", text, default="", context="report_number", group_index=1),
        }
        # ----------- ORDER DETAILS -----------
        parties = {
            "client": extract_field(order_details, "Client"),
            "client_ref_no": extract_field(order_details, "Client Ref No"),
            "buyer": extract_field(order_details, "Buyer"),
            "destination": extract_field(order_details, "Destination"),
        }
        shipment = {
            "total_bales": extract_field(order_details, "Total Bales"),
            "vessel": extract_field(order_details, "Vessel"),
            "voyage_no": extract_field(order_details, "Voy. No"),
            "bl_no": extract_field(order_details, "B/L No"),
            "bl_date": extract_field(order_details, "B/L Date"),
            "growth": extract_field(order_details, "Growth"),
            "arrival_date": extract_field(order_details, "Arrival Date"),
            "first_weighing_date": extract_field(order_details, "First date of weighing"),
            "last_weighing_date": extract_field(order_details, "Last Date of Weighing"),
            "weighing_method": extract_field(order_details, "Weighing method"),
            "tare_basis": extract_field(order_details, "Tare"),
        }
        # ----------- INVOICE SECTION -----------
        invoice = {
            "bales": extract_field(invoice_section, "Bales"),
            "gross": extract_field(invoice_section, "Gross"),
            "tare": extract_field(invoice_section, "Tare"),
            "net": extract_field(invoice_section, "Net"),
        }
        # ----------- LANDED SECTION -----------
        landed = {
            "bales": extract_field(landed_section, "Bales"),
            "gross": extract_field(landed_section, "Gross"),
            "tare": extract_field(landed_section, "Tare"),
            "net": extract_field(landed_section, "Net"),
        }
        # ----------- LOSS SECTION -----------
        loss = {
            "kg": extract_field(loss_section, "kg"),
            "lb": extract_field(loss_section, "lb"),
            "percent": extract_field(loss_section, "Percentage"),
        }
        # ----------- AVERAGES SECTION -----------
        averages = {
            # BUG FIX: avg_section starts AFTER the words "Invoice average"
            # (section() consumes the start heading), so searching for that
            # label inside it always returned None. Search the full text.
            "invoice_gross_per_bale": extract_field(text, "Invoice average"),
            "landed_gross_per_bale": extract_field(avg_section, "Landed average"),
        }
        # ----------- SIGNATURE -----------
        signature = {
            # BUG FIX: same issue — signature_block starts after "Signed on",
            # so the old lookup inside it always returned None.
            "signed_on": extract_field(text, "Signed on"),
            "signed_by": safe_search(r"\n([A-Za-z ]+)\nClient Services", signature_block, default="", context="signed_by"),
            "role": "Client Services Coordinator",
            "company": "Alfred H. Knight International Limited"
        }
        logger.info("Metadata extraction completed successfully")
        return {
            "report": top_info,
            "parties": parties,
            "shipment": shipment,
            "weights": {
                "invoice": invoice,
                "landed": landed,
                "loss": loss,
                "averages": averages
            },
            "signature": signature
        }
    except Exception as e:
        logger.exception("Unexpected error during metadata extraction")
        raise HTTPException(status_code=500, detail=f"Metadata extraction failed: {e}")
def detect_template(text):
    """Identify the issuing lab from report text.

    Checks lab-specific marker phrases in a fixed priority order and
    returns the matching template code, or "UNKNOWN" when none apply.
    """
    lowered = text.lower()
    checks = (
        ("AHK", lambda t: "alfred h. knight" in t and "cotton landing report" in t),
        ("INTERTEK", lambda t: "intertek" in t and "landing report" in t),
        ("ROBERTSON", lambda t: "robertson international" in t or "ri ref no" in t),
        ("SGS", lambda t: "landing report" in t and "carcon cargo" in t),
        ("PICL", lambda t: "pacific inspection company" in t or "picl-bd.com" in t),
    )
    for template, matches in checks:
        if matches(lowered):
            return template
    return "UNKNOWN"
@app.post("/metadata")
async def metadata(text: str = Body(..., embed=True)):
    # Thin wrapper: expects JSON {"text": "..."} and returns the structured
    # metadata produced by extract_report_metadata().
    return extract_report_metadata(text)
def call_extractor(text: str, lab: str = "AHK",
                   url: str = "http://62.72.36.116:8090/extract",
                   timeout: int = 60):
    """
    POST *text* to the external extractor service as a plain-text upload.

    Args:
        text: document text to send.
        lab: lab/template code passed as a query parameter.
        url: extractor endpoint (generalized from the previously
            hard-coded address; default preserves old behavior).
        timeout: request timeout in seconds (previously hard-coded 60).

    Returns:
        The service's JSON response body.

    Raises:
        requests.HTTPError: on a non-2xx response.
    """
    fake_file = BytesIO(text.encode("utf-8"))
    files = {
        "file": ("document.txt", fake_file, "text/plain")
    }
    response = requests.post(url, params={"lab": lab}, files=files, timeout=timeout)
    response.raise_for_status()
    return response.json()
@app.post("/parse")
async def parse_endpoint(text: str = Body(..., embed=True)):
    # Detect which lab template the text belongs to (parse_report returns a
    # template code such as "AHK"), then delegate the actual field
    # extraction to the external extractor service.
    lab = parse_report(text)
    result = call_extractor(text, lab=lab)
    return result
# Registry of locally-available parsers keyed by template code (as returned
# by detect_template). Shared singleton instances.
PARSERS = {
    "AHK": AHKParser(),
    "INTERTEK": IntertekParser()
}
def empty_weight_report(lab):
    """Return a weight-report skeleton for *lab* with every field None."""
    shipment_fields = (
        "vessel", "bl_no", "bl_date", "port_loading",
        "port_destination", "arrival_date",
        "weighing_place", "weighing_method", "bales",
    )
    weight_fields = (
        "gross_landed_kg", "tare_kg",
        "net_landed_kg", "invoice_net_kg",
        "gain_loss_kg", "gain_loss_percent",
    )
    return {
        "lab": lab,
        "report": dict.fromkeys(("reference", "file_no", "date")),
        "contract": dict.fromkeys(("contract_no", "invoice_no", "lc_no", "origin", "commodity")),
        "parties": dict.fromkeys(("seller", "buyer", "carrier")),
        "shipment": dict.fromkeys(shipment_fields),
        "weights": dict.fromkeys(weight_fields),
    }
def parse_report(text):
    """Return the detected template/lab code for *text* (e.g. "AHK").

    Despite the name, this currently only performs template detection;
    actual parsing is done elsewhere (see PARSERS / /parse endpoint).
    """
    return detect_template(text)