Files
automation-service/app.py
2026-01-11 18:12:06 +01:00

874 lines
34 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
from fastapi import FastAPI, UploadFile, HTTPException, Body
from PIL import Image
import pytesseract
from doctr.models import ocr_predictor
from doctr.io import DocumentFile
from PyPDF2 import PdfReader
import camelot
import spacy
import logging
import io
from logging.handlers import RotatingFileHandler
import re
# Rotating application log: 5 backups of 10 MiB each, UTF-8 encoded.
# NOTE(review): /var/log usually needs root — confirm the service user can write here.
LOG_PATH = "/var/log/automation-service.log"
file_handler = RotatingFileHandler(
LOG_PATH,
maxBytes=10*1024*1024,
backupCount=5,
encoding="utf-8"
)
# Timestamped format shared by file and console output.
file_handler.setFormatter(logging.Formatter(
"%(asctime)s - %(levelname)s - %(name)s - %(message)s"
))
# class AHKParser:
# lab="AHK"
# def parse(self,text):
# r=empty_weight_report("AHK")
# inv=section(text,"INVOICE WEIGHTS","Bales Weighed")
# land=section(text,"Bales Weighed","Outturn")
# loss=section(text,"LOSS","Invoice average")
# r["report"]["reference"]=safe_search(r"(AHK\s*/\S+)",text)
# r["report"]["date"]=extract("Produced On",text)
# r["contract"]["invoice_no"]=extract("Client Reference",text)
# r["contract"]["origin"]=extract("Growth",text)
# r["contract"]["commodity"]="Raw Cotton"
# r["parties"]["seller"]=extract("Client",text)
# r["parties"]["buyer"]=extract("Buyer",text)
# r["shipment"]["vessel"]=extract("Vessel",text)
# r["shipment"]["bl_no"]=extract("B/L No",text)
# r["shipment"]["port_destination"]=extract("Destination",text)
# r["shipment"]["arrival_date"]=extract("Arrival Date",text)
# r["shipment"]["weighing_method"]=extract("Weighing method",text)
# r["shipment"]["bales"]=to_float(extract("Total Bales",text))
# r["weights"]["gross_landed_kg"]=to_float(extract("Gross",land))
# r["weights"]["tare_kg"]=to_float(extract("Tare",land))
# r["weights"]["net_landed_kg"]=to_float(extract("Net",land))
# r["weights"]["invoice_net_kg"]=to_float(extract("Net",inv))
# r["weights"]["gain_loss_kg"]=to_float(extract("kg",loss))
# r["weights"]["gain_loss_percent"]=to_float(extract("Percentage",loss))
# return r
# class AHKParser:
# lab = "AHK"
# def extract_table(self, text, headers):
# lines = [l.strip() for l in text.splitlines() if l.strip()]
# out = {}
# for h in headers:
# for i,l in enumerate(lines):
# if l == h:
# for j in range(i+1, i+8):
# if j < len(lines) and lines[j].startswith(":"):
# out[h] = lines[j][1:].strip()
# break
# return out
# def extract_weights(self, text):
# lines = [l.strip() for l in text.splitlines() if l.strip()]
# res = {}
# for i,l in enumerate(lines):
# if l == "Bales Weighed":
# headers = ["Bales","Gross","Tare","Net"]
# for h in headers:
# for j in range(i, i+20):
# if j < len(lines) and lines[j].startswith(":"):
# res[h] = lines[j][1:].replace("kg","").strip()
# break
# return res
# def parse(self, text):
# r = empty_weight_report("AHK")
# # report
# r["report"]["reference"] = safe_search(r"(AHK\s*/[A-Z0-9/]+)", text)
# r["report"]["date"] = safe_search(r"Produced On\s*([0-9A-Za-z ]+)", text)
# # contract
# r["contract"]["invoice_no"] = safe_search(r"Client Reference:\s*([A-Z0-9\- /]+)", text)
# r["contract"]["commodity"] = "Raw Cotton"
# # buyer
# r["parties"]["buyer"] = safe_search(r"Buyer:\s*([A-Z0-9 ().,-]+)", text)
# # shipment tables
# ship = self.extract_table(text, [
# "Total Bales","Vessel","Voy. No.","B/L No.","B/L Date","Destination"
# ])
# ship2 = self.extract_table(text, [
# "Growth","Arrival Date","First date of weighing",
# "Last Date of Weighing","Weighing method","Tare"
# ])
# r["shipment"]["bales"] = to_float(ship.get("Total Bales"))
# r["shipment"]["vessel"] = ship.get("Vessel")
# r["shipment"]["bl_no"] = ship.get("B/L No.")
# r["shipment"]["port_destination"] = ship.get("Destination")
# r["shipment"]["arrival_date"] = ship2.get("Arrival Date")
# r["shipment"]["weighing_method"] = ship2.get("Weighing method")
# r["contract"]["origin"] = ship2.get("Growth")
# # weights
# inv = self.extract_table(text, ["Bales","Gross","Tare","Net"])
# land = self.extract_weights(text)
# r["weights"]["invoice_net_kg"] = to_float(inv.get("Net"))
# r["weights"]["gross_landed_kg"] = to_float(land.get("Gross"))
# r["weights"]["tare_kg"] = to_float(land.get("Tare"))
# r["weights"]["net_landed_kg"] = to_float(land.get("Net"))
# # loss
# loss = section(text,"LOSS","Invoice average")
# r["weights"]["gain_loss_kg"] = to_float(safe_search(r"(-?\d+\.?\d*)\s*kg", loss))
# r["weights"]["gain_loss_percent"] = to_float(safe_search(r"Percentage\s*:\s*(-?\d+\.?\d*)", loss))
# return r
class AHKParser:
    """Parser for Alfred H. Knight (AHK) cotton landing reports."""
    lab = "AHK"

    def _lines(self, text):
        """Return the non-empty, stripped lines of *text*."""
        return [l.strip() for l in text.splitlines() if l.strip()]

    def _col_block(self, lines, labels, max_scan=25):
        """Map column *labels* to the ':'-prefixed values that follow them.

        AHK reports render label/value tables as a run of label lines
        followed by a run of ": value" lines.  Find the last line that
        equals any label, then collect up to *max_scan* following
        ": value" lines and zip them with the labels in order.

        Returns an empty dict when none of the labels appear (the
        original called max() on an empty generator and raised
        ValueError in that case).
        """
        hits = [i for i, l in enumerate(lines) if l in labels]
        if not hits:
            return {}
        last = hits[-1]
        vals = []
        for l in lines[last + 1:last + 1 + max_scan]:
            if l.startswith(":"):
                vals.append(l[1:].strip())
                if len(vals) == len(labels):
                    break
        # zip() truncates if fewer values than labels were found.
        return dict(zip(labels, vals))

    def parse(self, text):
        """Build a normalized weight report dict from AHK report text."""
        L = self._lines(text)
        r = empty_weight_report("AHK")
        # report
        r["report"]["reference"] = safe_search(r"(AHK\s*/[A-Z0-9/]+)", text)
        r["report"]["date"] = safe_search(r"Produced On\s*([0-9A-Za-z ]+)", text)
        # contract
        r["contract"]["invoice_no"] = safe_search(r"Client Reference:\s*([A-Z0-9\- /]+)", text)
        r["contract"]["commodity"] = "Raw Cotton"
        # buyer
        r["parties"]["buyer"] = safe_search(r"Buyer:\s*([A-Z0-9 ().,-]+)", text)
        # shipment block 1
        ship1 = self._col_block(L, [
            "Total Bales", "Vessel", "Voy. No.", "B/L No.", "B/L Date", "Destination"
        ])
        # shipment block 2
        ship2 = self._col_block(L, [
            "Growth", "Arrival Date", "First date of weighing",
            "Last Date of Weighing", "Weighing method", "Tare"
        ])
        r["shipment"]["bales"] = to_float(ship1.get("Total Bales"))
        r["shipment"]["vessel"] = ship1.get("Vessel")
        r["shipment"]["bl_no"] = ship1.get("B/L No.")
        r["shipment"]["port_destination"] = ship1.get("Destination")
        r["shipment"]["arrival_date"] = ship2.get("Arrival Date")
        r["shipment"]["weighing_method"] = ship2.get("Weighing method")
        r["contract"]["origin"] = ship2.get("Growth")
        # invoice weights (whole-document scan picks up the invoice columns)
        inv = self._col_block(L, ["Bales", "Gross", "Tare", "Net"])
        r["weights"]["invoice_net_kg"] = to_float(inv.get("Net"))
        # landed weights, restricted to the "Bales Weighed" section
        land = self._col_block(self._lines(section(text, "Bales Weighed", "Outturn")),
                               ["Bales", "Gross", "Tare", "Net"])
        r["weights"]["gross_landed_kg"] = to_float(land.get("Gross"))
        r["weights"]["tare_kg"] = to_float(land.get("Tare"))
        r["weights"]["net_landed_kg"] = to_float(land.get("Net"))
        # loss
        loss = section(text, "LOSS", "Invoice average")
        r["weights"]["gain_loss_kg"] = to_float(safe_search(r"(-?\d+\.?\d*)\s*kg", loss))
        r["weights"]["gain_loss_percent"] = to_float(safe_search(r"Percentage\s*:\s*(-?\d+\.?\d*)", loss))
        return r
class IntertekParser:
    """Maps the labelled fields of an Intertek landing report onto the
    normalized weight-report structure."""
    lab = "INTERTEK"

    def parse(self, text):
        """Parse Intertek report *text* into a normalized report dict."""
        report = empty_weight_report("INTERTEK")
        pct = safe_search(r"([0-9.]+)\s*%", text)
        report["report"].update(
            reference=extract("Global Ref", text),
            file_no=extract("Report / File No", text),
            date=extract("Dated", text),
        )
        report["contract"].update(
            contract_no=extract("Contract No", text),
            invoice_no=extract("Invoice No", text),
            origin=extract("Growth", text),
            commodity="Raw Cotton",
        )
        report["parties"]["buyer"] = extract("Buyer", text)
        report["shipment"].update(
            vessel=extract("Vessel", text),
            bl_no=extract("B/L No", text),
            arrival_date=extract("Arrival Date", text),
            weighing_place=extract("Weighed at", text),
            bales=to_float(extract("Invoice Quantity", text)),
        )
        report["weights"].update(
            gross_landed_kg=to_float(extract("Gross", text)),
            tare_kg=to_float(extract("Invoice Tare", text)),
            net_landed_kg=to_float(extract("Landed Weight", text)),
            invoice_net_kg=to_float(extract("Invoice Weight", text)),
            gain_loss_kg=to_float(extract("Gain", text)),
            gain_loss_percent=to_float(pct),
        )
        return report
class RobertsonParser:
    """Maps the labelled fields of a Robertson International landing
    report onto the normalized weight-report structure."""
    lab = "ROBERTSON"

    def parse(self, text):
        """Parse Robertson report *text* into a normalized report dict."""
        report = empty_weight_report("ROBERTSON")
        pct = safe_search(r"([0-9.]+)\s*%", text)
        report["report"].update(
            reference=extract("OUR REF", text),
            date=extract("DATE", text),
        )
        report["contract"].update(
            contract_no=extract("CONTRACT NO", text),
            invoice_no=extract("INVOICE NO", text),
            lc_no=extract("LIC NO", text),
            commodity="Raw Cotton",
        )
        report["parties"].update(
            seller=extract("SELLER", text),
            buyer=extract("BUYER", text),
        )
        report["shipment"].update(
            vessel=extract("NAME OF VESSEL", text),
            port_loading=extract("SAILED FROM", text),
            port_destination=extract("ARRIVED AT", text),
            arrival_date=extract("DATE OF ARRIVAL", text),
            weighing_place=extract("PLACE OF CONTROL", text),
            bales=to_float(extract("CONSIGNMENT", text)),
        )
        report["weights"].update(
            gross_landed_kg=to_float(extract("GROSS", text)),
            tare_kg=to_float(extract("TARE", text)),
            net_landed_kg=to_float(extract("LANDED NET", text)),
            invoice_net_kg=to_float(extract("INVOICE NET", text)),
            gain_loss_kg=to_float(extract("GAIN", text)),
            gain_loss_percent=to_float(pct),
        )
        return report
class SGSParser:
    """Maps the labelled fields of an SGS landing report onto the
    normalized weight-report structure."""
    lab = "SGS"

    def parse(self, text):
        """Parse SGS report *text* into a normalized report dict."""
        report = empty_weight_report("SGS")
        report["report"].update(
            reference=extract("LANDING REPORT No", text),
            file_no=extract("FILE NO.", text),
            date=extract("DATE", text),
        )
        report["contract"].update(
            contract_no=extract("CONTRACT NO.", text),
            invoice_no=extract("INVOICE NO.", text),
            origin=extract("ORIGIN", text),
            commodity=extract("PRODUCT", text),
        )
        report["parties"].update(
            seller=extract("Seller", text),
            buyer=extract("Buyer", text),
            carrier=extract("Carrier", text),
        )
        report["shipment"].update(
            bl_no=extract("B/L no.", text),
            port_loading=extract("Port of loading", text),
            port_destination=extract("Port of destination", text),
            arrival_date=extract("Vessel arrival date", text),
            weighing_place=extract("Place of weighing", text),
            weighing_method=extract("Weighing mode", text),
            bales=to_float(extract("Quantity arrived", text)),
        )
        report["weights"].update(
            gross_landed_kg=to_float(extract("Gross landed", text)),
            tare_kg=to_float(extract("Tare", text)),
            net_landed_kg=to_float(extract("Net landed", text)),
            invoice_net_kg=to_float(extract("Net invoiced", text)),
            # Gain/loss figures sit in free text rather than a labelled column.
            gain_loss_kg=to_float(safe_search(r"Gain.*?([0-9.,]+)\s*kgs", text)),
            gain_loss_percent=to_float(safe_search(r"Gain\s*\+?\s*([0-9.,]+)\s*%", text)),
        )
        return report
class PICLParser:
    """Maps the labelled fields of a Pacific Inspection Company (PICL)
    landing report onto the normalized weight-report structure.

    Several label strings ("Invoice ilo & Date", "Date of Anival & LDL",
    "Total;", "o/o") deliberately match recurring OCR misreads in these
    documents — do not "correct" them.
    """
    lab = "PICL"

    def parse(self, text):
        """Parse PICL report *text* into a normalized report dict."""
        report = empty_weight_report("PICL")
        report["report"].update(
            reference=safe_search(r"No[:\s]+([A-Z0-9\-]+)", text),
            # Capture the "Month DD, YYYY" part after the weekday name.
            date=safe_search(r"(Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sunday),?\s*([A-Za-z]+\s+[0-9]{1,2},\s*[0-9]{4})", text, group_index=2),
        )
        report["contract"].update(
            contract_no=extract("Contract/Pl No & Date", text),
            invoice_no=extract("Invoice ilo & Date", text),
            lc_no=extract("L/C No & Date", text),
            origin=extract("Country of Origin", text),
            commodity=extract("Commodity", text),
        )
        report["parties"].update(
            seller=extract("FAIRCOT SA", text),
            buyer=extract("M/S.", text),
            carrier=extract("Shipping Agent", text),
        )
        report["shipment"].update(
            vessel=extract("Shipped Per Vessel", text),
            bl_no=extract("B/L No & Date", text),
            port_loading=extract("Port of Loading", text),
            port_destination=extract("Port of Discharge", text),
            arrival_date=extract("Date of Anival & LDL", text),
            weighing_place=extract("Place & Date of Weighment", text),
            weighing_method=extract("Method of Weighment", text),
            bales=to_float(extract("Grand Total", text)),
        )
        report["weights"].update(
            gross_landed_kg=to_float(extract("Total;", text)),
            tare_kg=to_float(extract("Tare Weight", text)),
            net_landed_kg=to_float(extract("Grand Total", text)),
            invoice_net_kg=to_float(extract("Invoice weight", text)),
            gain_loss_kg=to_float(safe_search(r"(-[0-9.,]+)\s*KGS", text)),
            # "o/o" is how the percent sign is usually OCR'd in these reports.
            gain_loss_percent=to_float(safe_search(r"\(\s*([0-9.,]+)\s*o/o\s*\)", text)),
        )
        return report
# Configure the root logger explicitly: everything propagates here and is
# written both to the rotating log file and to the console.
root = logging.getLogger()
root.setLevel(logging.INFO)
root.addHandler(file_handler)
root.addHandler(logging.StreamHandler())
# Module-level logger used throughout this service.
logger = logging.getLogger(__name__)
app = FastAPI()
# Heavy models are loaded once at import time so individual requests
# don't pay the startup cost.
logger.info("Loading models...")
nlp = spacy.load("en_core_web_sm")
predictor = ocr_predictor(pretrained=True)
logger.info("Models loaded successfully.")
import io
import re
from datetime import datetime
from typing import Dict, Any
import pytesseract
from pdf2image import convert_from_bytes
from PIL import Image
from PyPDF2 import PdfReader
import json
def parse_cotton_report(ocr_text: str) -> Dict[str, Any]:
    """Parse structured data from cotton landing report OCR text.

    Pattern-matches an Alfred H. Knight style landing report and returns a
    nested dict (report / contract / parties / shipment / weights).  Any
    field that cannot be located stays None.
    """
    result = {
        "lab": "ALFRED H KNIGHT",
        "report": {"reference": None, "file_no": None, "date": None},
        "contract": {"contract_no": None, "invoice_no": None, "lc_no": None,
                     "origin": None, "commodity": None},
        "parties": {"seller": None, "buyer": None, "carrier": None},
        "shipment": {
            "vessel": None, "bl_no": None, "port_loading": None,
            "port_destination": None, "arrival_date": None,
            "weighing_place": None, "weighing_method": None,
            "bales": None
        },
        "weights": {
            "gross_landed_kg": None, "tare_kg": None,
            "net_landed_kg": None, "invoice_net_kg": None,
            "gain_loss_kg": None, "gain_loss_percent": None
        }
    }
    # Lower-cased copy for the date patterns written against lower-case text.
    text = ocr_text.lower()

    # 1. Report reference and file number
    ref_match = re.search(r'client reference:\s*([^\n]+)', ocr_text, re.IGNORECASE)
    if ref_match:
        result["report"]["reference"] = ref_match.group(1).strip()
    # File number is embedded in the AHK reference, e.g. "AHK S/123/...".
    ahk_match = re.search(r'ahk s/(\d+)/', ocr_text, re.IGNORECASE)
    if ahk_match:
        result["report"]["file_no"] = ahk_match.group(1)

    # 2. Dates
    date_match = re.search(r'printed date:\s*(\d{1,2}-[a-z]+-\d{4})', text, re.IGNORECASE)
    if date_match:
        result["report"]["date"] = date_match.group(1).title()

    # 3. Contract information: origin/growth, commodity, invoice number
    growth_match = re.search(r'growth\s*:?\s*([^\n:]+(?:raw cotton)?)', ocr_text, re.IGNORECASE)
    if growth_match:
        result["contract"]["origin"] = growth_match.group(1).strip()
        result["contract"]["commodity"] = "COTTON"
    # Invoice number is read out of the client reference, e.g. "INV 12345".
    if result["report"]["reference"]:
        inv_match = re.search(r'inv\s*(\d+)', result["report"]["reference"], re.IGNORECASE)
        if inv_match:
            result["contract"]["invoice_no"] = inv_match.group(1)

    # 4. Parties.  The startswith() guards skip matches where OCR duplicated
    # the label into the captured value.
    seller_match = re.search(r'client\s*:?\s*([^\n]+)', ocr_text, re.IGNORECASE)
    if seller_match:
        seller_text = seller_match.group(1).strip()
        if not seller_text.lower().startswith('client'):
            result["parties"]["seller"] = seller_text
    buyer_match = re.search(r'buyer\s*:?\s*([^\n]+)', ocr_text, re.IGNORECASE)
    if buyer_match:
        buyer_text = buyer_match.group(1).strip()
        if not buyer_text.lower().startswith('buyer'):
            result["parties"]["buyer"] = buyer_text

    # 5. Shipment details
    vessel_match = re.search(r'vessel\s*:?\s*([^\n]+)', ocr_text, re.IGNORECASE)
    if vessel_match:
        vessel_text = vessel_match.group(1).strip()
        if not vessel_text.lower().startswith('vessel'):
            result["shipment"]["vessel"] = vessel_text
    bl_match = re.search(r'b/l no\.?\s*:?\s*([^\n]+)', ocr_text, re.IGNORECASE)
    if bl_match:
        result["shipment"]["bl_no"] = bl_match.group(1).strip()
    dest_match = re.search(r'destination\s*:?\s*([^\n]+)', ocr_text, re.IGNORECASE)
    if dest_match:
        dest_text = dest_match.group(1).strip()
        if not dest_text.lower().startswith('destination'):
            result["shipment"]["port_destination"] = dest_text
    arrival_match = re.search(r'arrival date\s*:?\s*(\d{1,2}-[a-z]+-\d{4})', text, re.IGNORECASE)
    if arrival_match:
        result["shipment"]["arrival_date"] = arrival_match.group(1).title()
    weigh_match = re.search(r'weighing method\s*:?\s*([^\n]+)', ocr_text, re.IGNORECASE)
    if weigh_match:
        method_text = weigh_match.group(1).strip()
        if not method_text.lower().startswith('weighing'):
            result["shipment"]["weighing_method"] = method_text
    bales_match = re.search(r'total bales\s*:?\s*(\d+)', ocr_text, re.IGNORECASE)
    if bales_match:
        result["shipment"]["bales"] = int(bales_match.group(1))

    # 6. Weights (critical section).  "Gross"/"Net" appear twice: first in
    # the invoice block, then in the landed block, so use findall and index.
    # FIX: re.IGNORECASE was missing on these patterns, so title-cased
    # labels ("Gross", "Tare", "Net") were never matched in the raw text.
    all_gross = re.findall(r'gross\s*:?\s*([\d,\.]+)\s*kg', ocr_text, re.IGNORECASE)
    if len(all_gross) >= 2:
        result["weights"]["gross_landed_kg"] = float(all_gross[1].replace(',', ''))
    # Tare weight (same value in both blocks, first match is enough).
    tare_match = re.search(r'tare\s*:?\s*([\d,\.]+)\s*kg', ocr_text, re.IGNORECASE)
    if tare_match:
        result["weights"]["tare_kg"] = float(tare_match.group(1).replace(',', ''))
    net_matches = re.findall(r'net\s*:?\s*([\d,\.]+)\s*kg', ocr_text, re.IGNORECASE)
    if len(net_matches) >= 2:
        result["weights"]["invoice_net_kg"] = float(net_matches[0].replace(',', ''))
        result["weights"]["net_landed_kg"] = float(net_matches[1].replace(',', ''))

    # Loss/gain: a value under the "Loss" label is a loss, so negate it
    # unless an explicit minus sign ('-' or Unicode '−') is already present.
    # FIX: the original guard tested `'' not in match` which is always
    # False (the empty string is in every string), so the value was never
    # negated.
    loss_match = re.search(r'loss\s*:?\s*[-]?\s*([\d,\.]+)\s*kg', ocr_text, re.IGNORECASE)
    if loss_match:
        loss_value = float(loss_match.group(1).replace(',', ''))
        if '-' not in loss_match.group(0) and '\u2212' not in loss_match.group(0):
            loss_value = -loss_value
        result["weights"]["gain_loss_kg"] = loss_value
    percent_match = re.search(r'percentage\s*:?\s*[-]?\s*([\d,\.]+)%', ocr_text, re.IGNORECASE)
    if percent_match:
        percent_value = float(percent_match.group(1).replace(',', ''))
        if '-' not in percent_match.group(0) and '\u2212' not in percent_match.group(0):
            percent_value = -percent_value
        result["weights"]["gain_loss_percent"] = percent_value
    return result
@app.post("/ocr")
async def ocr(file: UploadFile):
    """OCR an uploaded PDF or image and return structured report data.

    PDFs: the native text layer is preferred; scanned PDFs fall back to
    page-by-page Tesseract OCR.  Any other file is treated as an image.
    On failure a {"success": False, ...} payload is returned instead of
    an HTTP error, so callers always get JSON.
    """
    logger.info(f"Received structured OCR request: {file.filename}")
    try:
        file_data = await file.read()
        # FIX: UploadFile.filename can be None on some multipart uploads;
        # the original called .lower() on it directly and crashed.
        name = (file.filename or "").lower()
        ocr_text = ""
        if name.endswith(".pdf"):
            # Prefer the PDF's native text layer: free and exact.
            reader = PdfReader(io.BytesIO(file_data))
            direct_text = "".join(page.extract_text() or "" for page in reader.pages)
            if direct_text.strip():
                logger.info("Using native PDF text")
                ocr_text = direct_text
            else:
                # No text layer -> scanned PDF: rasterize and OCR each page.
                logger.info("Using OCR for scanned PDF")
                images = convert_from_bytes(file_data)
                for i, img in enumerate(images):
                    logger.info(f"OCR page {i+1}/{len(images)}")
                    ocr_text += pytesseract.image_to_string(img) + "\n"
        else:
            # Anything that is not a PDF is handed to PIL as an image.
            img = Image.open(io.BytesIO(file_data))
            ocr_text = pytesseract.image_to_string(img)
        # Parse structured data out of whatever text we obtained.
        structured_data = parse_cotton_report(ocr_text)
        return {
            "success": True,
            # Echo of the raw text is truncated to keep the payload small.
            "ocr_text": ocr_text[:1000] + "..." if len(ocr_text) > 1000 else ocr_text,
            "structured_data": structured_data,
            "json": json.dumps(structured_data, indent=2, ensure_ascii=False)
        }
    except Exception as e:
        logger.error(f"Structured OCR failed: {e}", exc_info=True)
        return {
            "success": False,
            "error": str(e),
            "raw_text": "",
            "structured_data": {}
        }
# =============================
# 🧠 Smart OCR
# =============================
# @app.post("/ocr")
# async def ocr(file: UploadFile):
# logger.info(f"Received OCR request: {file.filename}")
# try:
# file_data = await file.read()
# ext = file.filename.lower()
# # --------- PDF with native text ---------
# if ext.endswith(".pdf"):
# logger.info("PDF detected → Extracting native text first")
# reader = PdfReader(io.BytesIO(file_data))
# direct_text = "".join(
# page.extract_text() or "" for page in reader.pages
# )
# if direct_text.strip():
# logger.info("Native PDF text found → No OCR needed")
# return {"ocr_text": direct_text}
# # -------- Fallback: scanned PDF OCR --------
# logger.info("No native text → PDF treated as scanned → OCR")
# from pdf2image import convert_from_bytes
# images = convert_from_bytes(file_data)
# text = ""
# for i, img in enumerate(images):
# logger.info(f"OCR page {i+1}/{len(images)}")
# text += pytesseract.image_to_string(img) + "\n"
# return {"ocr_text": text}
# # --------- Image file OCR ---------
# logger.info("Image detected → Running OCR")
# img = Image.open(io.BytesIO(file_data))
# text = pytesseract.image_to_string(img)
# return {"ocr_text": text}
# except Exception as e:
# logger.error(f"OCR failed: {e}", exc_info=True)
# raise HTTPException(status_code=500, detail=str(e))
# =============================
# 🧱 Structure / Layout
# =============================
@app.post("/structure")
async def structure(file: UploadFile):
    """Run docTR layout prediction on an uploaded PDF or image and return
    the raw prediction rendered as a string."""
    logger.info(f"Received structure request: {file.filename}")
    try:
        file_data = await file.read()
        if file.filename.lower().endswith(".pdf"):
            doc = DocumentFile.from_pdf(file_data)
            logger.info(f"Structure prediction on PDF ({len(doc)} pages)")
        else:
            rgb = Image.open(io.BytesIO(file_data)).convert("RGB")
            doc = DocumentFile.from_images([rgb])
            logger.info("Structure prediction on image")
        return {"structure": str(predictor(doc))}
    except Exception as e:
        logger.error(f"Structure extraction failed: {e}", exc_info=True)
        raise HTTPException(status_code=500, detail=str(e))
# =============================
# 📊 Tables extraction (PDF only)
# =============================
@app.post("/tables")
async def tables(file: UploadFile):
    """Extract tables from an uploaded PDF using Camelot.

    camelot.read_pdf() accepts a filesystem path, not a file-like object,
    so the upload is spilled to a temporary .pdf file first (the original
    passed a BytesIO, which Camelot cannot read).  The temp file is always
    removed afterwards.
    """
    import os
    import tempfile
    logger.info(f"Received table extraction request: {file.filename}")
    tmp_path = None
    try:
        file_data = await file.read()
        # delete=False so the file survives the close() that Camelot needs
        # to be able to reopen it by path (esp. on Windows).
        with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp:
            tmp_path = tmp.name
            tmp.write(file_data)
        tables = camelot.read_pdf(tmp_path)
        logger.info(f"Found {len(tables)} tables")
        return {"tables": [t.df.to_dict() for t in tables]}
    except Exception as e:
        logger.error(f"Table extraction failed: {e}", exc_info=True)
        raise HTTPException(status_code=500, detail=str(e))
    finally:
        if tmp_path and os.path.exists(tmp_path):
            os.unlink(tmp_path)
def safe_search(pattern, text, default=None, group_index=1, context=""):
    """Search *pattern* in *text* (case-insensitive, dot-all) and return
    the requested capture group, stripped.

    Falls back to *default* — logging a warning — when the pattern does
    not match or the requested group index does not exist.
    """
    match = re.search(pattern, text, re.I | re.S)
    if match is None:
        logger.warning("Pattern not found for %s: %s", context, pattern)
        return default
    try:
        return match.group(group_index).strip()
    except IndexError:
        logger.warning("Group index %d not found for %s: %s", group_index, context, pattern)
        return default
def to_float(s):
    """Convert an extracted weight/percentage string to a float.

    Strips thousands separators and the unit suffixes the reports use
    ("Kgs", "kg", "lbs", "LBS", "%").  Returns None for falsy input or
    anything that still does not parse as a number.
    """
    if not s:
        return None
    try:
        s = s.replace(",", "").replace("Kgs", "").replace("kg", "").replace("%", "")
        s = s.replace("lbs", "").replace("LBS", "")
        return float(s.strip())
    except (ValueError, AttributeError):
        # Was a bare `except:` — keep the lenient "None on junk" contract
        # (including non-string input, which fails at .replace) without
        # swallowing KeyboardInterrupt/SystemExit.
        return None
def section(text, start, end=None):
    """Return the stripped text between headings *start* and *end*.

    Matching is case-insensitive and spans newlines; omitting *end* means
    "to the end of the text".  Returns "" (and logs a warning) when the
    section cannot be found.
    """
    tail = r"(.*?)" + re.escape(end) if end else r"(.*)"
    m = re.search(re.escape(start) + tail, text, re.S | re.I)
    if m:
        return m.group(1).strip()
    logger.warning("Section not found: start='%s', end='%s'", start, end)
    return ""
def extract_field(text, label, default=None):
    """Return the value following 'Label:' (or 'Label   value') on a single line."""
    return safe_search(
        rf"{re.escape(label)}\s*:?[\s]+([^\n]+)",
        text,
        default=default,
        context=f"field '{label}'",
    )
def extract(label, text, default=None):
    """Pull the value that follows *label* in OCR/PDF text.

    Handles the common layouts:
        Label: Value
        Label - Value
        Label Value
    Returns *default* when *text* is falsy or no pattern matches.
    """
    if not text:
        return default
    escaped = re.escape(label)
    for pattern in (
        rf"{escaped}\s*[:\-]?\s*([^\n\r]+)",
        rf"{escaped}\s+([^\n\r]+)",
    ):
        m = re.search(pattern, text, re.I)
        if m:
            return m.group(1).strip()
    return default
def extract_report_metadata(text):
    """Extract structured metadata from AHK landing-report OCR text.

    The text is first sliced into the report's named sections, then
    individual 'Label: value' fields are pulled from each section.
    Fields that cannot be found come back as the helpers' defaults
    (None or "").

    Raises:
        HTTPException: 500 on any unexpected extraction error.
    """
    logger.info("Starting metadata extraction, text length=%d", len(text))
    try:
        # ----------- SECTIONS -----------
        # Slice the raw text into report sections by their headings.
        order_details = section(text, "Order details", "Weights")
        invoice_section = section(text, "INVOICE WEIGHTS", "Bales Weighed")
        landed_section = section(text, "Bales Weighed", "Outturn")
        loss_section = section(text, "LOSS", "Invoice average")
        avg_section = section(text, "Invoice average", "Comments")
        signature_block = section(text, "Signed on")
        # ----------- TOP INFO -----------
        top_info = {
            "produced_on": extract_field(text, "Produced On"),
            "printed_date": extract_field(text, "Printed Date"),
            "client_reference": extract_field(text, "Client Reference"),
            # The first "AHK..." token in the document is taken as the
            # report number.
            "report_number": safe_search(r"(AHK\S+)", text, default="", context="report_number", group_index=1),
        }
        # ----------- ORDER DETAILS -----------
        parties = {
            "client": extract_field(order_details, "Client"),
            "client_ref_no": extract_field(order_details, "Client Ref No"),
            "buyer": extract_field(order_details, "Buyer"),
            "destination": extract_field(order_details, "Destination"),
        }
        shipment = {
            "total_bales": extract_field(order_details, "Total Bales"),
            "vessel": extract_field(order_details, "Vessel"),
            "voyage_no": extract_field(order_details, "Voy. No"),
            "bl_no": extract_field(order_details, "B/L No"),
            "bl_date": extract_field(order_details, "B/L Date"),
            "growth": extract_field(order_details, "Growth"),
            "arrival_date": extract_field(order_details, "Arrival Date"),
            "first_weighing_date": extract_field(order_details, "First date of weighing"),
            "last_weighing_date": extract_field(order_details, "Last Date of Weighing"),
            "weighing_method": extract_field(order_details, "Weighing method"),
            "tare_basis": extract_field(order_details, "Tare"),
        }
        # ----------- INVOICE SECTION -----------
        invoice = {
            "bales": extract_field(invoice_section, "Bales"),
            "gross": extract_field(invoice_section, "Gross"),
            "tare": extract_field(invoice_section, "Tare"),
            "net": extract_field(invoice_section, "Net"),
        }
        # ----------- LANDED SECTION -----------
        landed = {
            "bales": extract_field(landed_section, "Bales"),
            "gross": extract_field(landed_section, "Gross"),
            "tare": extract_field(landed_section, "Tare"),
            "net": extract_field(landed_section, "Net"),
        }
        # ----------- LOSS SECTION -----------
        loss = {
            "kg": extract_field(loss_section, "kg"),
            "lb": extract_field(loss_section, "lb"),
            "percent": extract_field(loss_section, "Percentage"),
        }
        # ----------- AVERAGES SECTION -----------
        averages = {
            "invoice_gross_per_bale": extract_field(avg_section, "Invoice average"),
            "landed_gross_per_bale": extract_field(avg_section, "Landed average"),
        }
        # ----------- SIGNATURE -----------
        signature = {
            "signed_on": extract_field(signature_block, "Signed on"),
            # Signer's name is expected on its own line just above the
            # "Client Services" line.
            "signed_by": safe_search(r"\n([A-Za-z ]+)\nClient Services", signature_block, default="", context="signed_by"),
            # Fixed values: AHK reports are always signed in this role.
            "role": "Client Services Coordinator",
            "company": "Alfred H. Knight International Limited"
        }
        logger.info("Metadata extraction completed successfully")
        return {
            "report": top_info,
            "parties": parties,
            "shipment": shipment,
            "weights": {
                "invoice": invoice,
                "landed": landed,
                "loss": loss,
                "averages": averages
            },
            "signature": signature
        }
    except Exception as e:
        logger.exception("Unexpected error during metadata extraction")
        raise HTTPException(status_code=500, detail=f"Metadata extraction failed: {e}")
def detect_template(text):
    """Identify which surveyor's report template *text* came from.

    Returns one of the PARSERS keys ("AHK", "INTERTEK", "ROBERTSON",
    "SGS", "PICL") or "UNKNOWN".  Checks run in a fixed priority order.
    """
    t = text.lower()
    checks = (
        ("AHK", "alfred h. knight" in t and "cotton landing report" in t),
        ("INTERTEK", "intertek" in t and "landing report" in t),
        ("ROBERTSON", "robertson international" in t or "ri ref no" in t),
        ("SGS", "landing report" in t and "carcon cargo" in t),
        ("PICL", "pacific inspection company" in t or "picl-bd.com" in t),
    )
    for name, hit in checks:
        if hit:
            return name
    return "UNKNOWN"
@app.post("/metadata")
async def metadata(text: str = Body(..., embed=True)):
    """Extract AHK report metadata from OCR text posted as {"text": "..."}."""
    return extract_report_metadata(text)
@app.post("/parse")
async def parse_endpoint(text: str = Body(..., embed=True)):
    """Detect the report template in the posted text and return the parsed report."""
    return parse_report(text)
# Registry mapping detect_template() labels to their parser instances.
PARSERS = {
"AHK": AHKParser(),
"INTERTEK": IntertekParser(),
"ROBERTSON": RobertsonParser(),
"SGS": SGSParser(),
"PICL": PICLParser()
}
def empty_weight_report(lab):
    """Return a weight-report skeleton for *lab* with every field set to None."""
    return {
        "lab": lab,
        "report": dict.fromkeys(("reference", "file_no", "date")),
        "contract": dict.fromkeys(("contract_no", "invoice_no", "lc_no", "origin", "commodity")),
        "parties": dict.fromkeys(("seller", "buyer", "carrier")),
        "shipment": dict.fromkeys((
            "vessel", "bl_no", "port_loading", "port_destination",
            "arrival_date", "weighing_place", "weighing_method", "bales",
        )),
        "weights": dict.fromkeys((
            "gross_landed_kg", "tare_kg", "net_landed_kg",
            "invoice_net_kg", "gain_loss_kg", "gain_loss_percent",
        )),
    }
def parse_report(text):
    """Detect the report template and dispatch to the matching parser.

    Returns {"template": "UNKNOWN"} when no parser claims the text.
    """
    parser = PARSERS.get(detect_template(text))
    if parser is None:
        return {"template": "UNKNOWN"}
    return parser.parse(text)