09.01.26

2026-01-09 19:27:10 +01:00
parent 377ff3a613
commit e6e0d98593
1 changed files with 253 additions and 0 deletions
--- a/app.py
+++ b/app.py
@@ -23,6 +23,176 @@ file_handler.setFormatter(logging.Formatter(
    "%(asctime)s - %(levelname)s - %(name)s - %(message)s"
 ))
 class AHKParser:
    """Parser for landing reports detected as the AHK template."""

    lab = "AHK"

    def parse(self, text):
        """Build the report / shipment / weights dictionary from AHK report text.

        The weight figures are not searched in the whole text: each one is
        looked up inside a sub-block cut out with ``section`` between the
        headings named below.
        """
        # Blocks of the report the weight fields are read from.
        invoice_part = section(text, "INVOICE WEIGHTS", "Bales Weighed")
        landed_part = section(text, "Bales Weighed", "Outturn")
        loss_part = section(text, "LOSS", "Invoice average")

        report_info = {
            "lab": "AHK",
            "reference": safe_search(r"(AHK\s*/\S+)", text, default=None, context="AHK reference"),
            "date": extract("Produced On", text),
        }
        shipment_info = {
            "bales": to_float(extract("Total Bales", text)),
            "vessel": extract("Vessel", text),
            "bl": extract("B/L No", text),
            "arrival_date": extract("Arrival Date", text),
        }
        weight_info = {
            "invoice_kg": to_float(extract("Net", invoice_part)),
            "landed_kg": to_float(extract("Net", landed_part)),
            "gain_loss_kg": to_float(extract("kg", loss_part)),
            "gain_loss_percent": to_float(extract("Percentage", loss_part)),
        }
        return {
            "report": report_info,
            "shipment": shipment_info,
            "weights": weight_info,
        }
 class IntertekParser:
    """Parser for landing reports detected as the INTERTEK template."""

    lab = "INTERTEK"

    def parse(self, text):
        """Build the report / shipment / weights dictionary from Intertek report text."""
        # The percentage is taken from the first "<digits/dots> %" occurrence
        # anywhere in the text; a sign in front of the number is not captured.
        pct_match = re.search(r"([0-9.]+)\s*%", text)
        if pct_match:
            percent = pct_match.group(1)
        else:
            percent = None

        report_info = {
            "lab": "INTERTEK",
            "reference": extract("Global Ref", text),
            "date": extract("Dated", text),
        }
        shipment_info = {
            "bales": to_float(extract("Invoice Quantity", text)),
            "vessel": extract("Vessel", text),
            "bl": extract("B/L No", text),
            "arrival_date": extract("Arrival Date", text),
        }
        weight_info = {
            "invoice_kg": to_float(extract("Invoice Weight", text)),
            "landed_kg": to_float(extract("Landed Weight", text)),
            "gain_loss_kg": to_float(extract("Gain", text)),
            "gain_loss_percent": to_float(percent),
        }
        return {
            "report": report_info,
            "shipment": shipment_info,
            "weights": weight_info,
        }
 class RobertsonParser:
    """Parser for landing reports detected as the ROBERTSON template."""

    lab = "ROBERTSON"

    def parse(self, text):
        """Build the report / shipment / weights dictionary from Robertson report text."""
        # The percentage is taken from the first "<digits/dots> %" occurrence
        # anywhere in the text; a sign in front of the number is not captured.
        pct_match = re.search(r"([0-9.]+)\s*%", text)
        if pct_match:
            percent = pct_match.group(1)
        else:
            percent = None

        report_info = {
            "lab": "ROBERTSON",
            "reference": extract("RI REF NO.", text),
            "date": extract("DATED", text),
        }
        shipment_info = {
            "bales": to_float(extract("QUANTITY", text)),
            "vessel": extract("VESSEL", text),
            "bl": extract("B/L NO.", text),
            "arrival_date": extract("ARRIVAL DATE", text),
        }
        weight_info = {
            "invoice_kg": to_float(extract("NET INVOICE WEIGHT", text)),
            "landed_kg": to_float(extract("NET LANDED WEIGHT", text)),
            "gain_loss_kg": to_float(extract("LOSS", text)),
            "gain_loss_percent": to_float(percent),
        }
        return {
            "report": report_info,
            "shipment": shipment_info,
            "weights": weight_info,
        }
 class SGSParser:
    """Parser for landing reports detected as the SGS template."""

    lab = "SGS"

    def parse(self, text):
        """Build the report / contract / parties / shipment / weights dictionary."""
        report_info = {
            "lab": "SGS",
            "reference": extract("LANDING REPORT No", text),
            "file_no": extract("FILE NO.", text),
            "date": extract("DATE", text),
        }
        contract_info = {
            "contract_no": extract("CONTRACT NO.", text),
            "invoice_no": extract("INVOICE NO.", text),
            "origin": extract("ORIGIN", text),
            "product": extract("PRODUCT", text),
        }
        party_info = {
            "seller": extract("Seller", text),
            "buyer": extract("Buyer", text),
            "carrier": extract("Carrier", text),
        }
        shipment_info = {
            "bl": extract("B/L no.", text),
            "port_loading": extract("Port of loading", text),
            "port_destination": extract("Port of destination", text),
            "arrival_date": extract("Vessel arrival date", text),
            "devanning_date": extract("Container devanning date", text),
            "weighing_date": extract("Weighing date", text),
            "weighing_mode": extract("Weighing mode", text),
            "quantity_bales": to_float(extract("Quantity arrived", text)),
        }

        # The gain figures are pulled with dedicated patterns instead of a
        # label lookup: a number followed by "%" and a number followed by "kgs".
        gain_percent_raw = safe_search(r"Gain\s*\+?\s*([0-9.,]+)\s*%", text)
        gain_kg_raw = safe_search(r"Gain.*?([0-9.,]+)\s*kgs", text)
        weight_info = {
            "gross_landed_kg": to_float(extract("Gross landed", text)),
            "tare_kg": to_float(extract("Tare", text)),
            "net_landed_kg": to_float(extract("Net landed", text)),
            "net_invoiced_kg": to_float(extract("Net invoiced", text)),
            "gain_percent": to_float(gain_percent_raw),
            "gain_kg": to_float(gain_kg_raw),
        }
        return {
            "report": report_info,
            "contract": contract_info,
            "parties": party_info,
            "shipment": shipment_info,
            "weights": weight_info,
        }
 class PICLParser:
    """Parser for landing reports detected as the PICL template."""

    lab = "PICL"

    # Weekday name, optional comma, then a "Month D, YYYY" date (captured).
    # The weekday names are grouped so the capture group applies to every
    # weekday; without the group, "|" splits the whole pattern and only the
    # last alternative carries the date capture.
    _DATE_PATTERN = (
        r"(?:Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sunday),?"
        r"\s*([A-Za-z]+\s+[0-9]{1,2},\s*[0-9]{4})"
    )

    def parse(self, text):
        """Build the report / parties / shipment / contract / weights dictionary.

        Text fields are returned as found by ``extract`` / ``safe_search``;
        numeric fields go through ``to_float``.
        """
        # NOTE(review): several labels below ("Date of Anival & LDL",
        # "Invoice ilo & Date", "Total;", "o/o") look like OCR misreadings and
        # presumably mirror the raw OCR output on purpose - confirm before
        # "correcting" them.
        # NOTE(review): "bales" and "net_landed_kg" both read the "Grand Total"
        # label - verify this is intended.
        return {
            "report": {
                "lab": "PICL",
                "reference": safe_search(r"No[:\s]+([A-Z0-9\-]+)", text),
                "date": safe_search(self._DATE_PATTERN, text)
            },
            "parties": {
                "seller": extract("FAIRCOT SA", text),
                "buyer": extract("M/S.", text)
            },
            "shipment": {
                "bales": to_float(extract("Grand Total", text)),
                "vessel": extract("Shipped Per Vessel", text),
                "feeder": extract("Feeder", text),
                "port_loading": extract("Port of Loading", text),
                "port_discharge": extract("Port of Discharge", text),
                "arrival_date": extract("Date of Anival & LDL", text),
                "weighing_place": extract("Place & Date of Weighment", text)
            },
            "contract": {
                "contract_no": extract("Contract/Pl No & Date", text),
                "invoice_no": extract("Invoice ilo & Date", text),
                "bl": extract("B/L No & Date", text),
                "origin": extract("Country of Origin", text),
                "commodity": extract("Commodity", text)
            },
            "weights": {
                "gross_landed_kg": to_float(extract("Total;", text)),
                "tare_kg": to_float(extract("Tare Weight", text)),
                "net_landed_kg": to_float(extract("Grand Total", text)),
                "invoice_weight_kg": to_float(extract("Invoice weight", text)),
                "loss_kg": to_float(safe_search(r"(-[0-9.,]+)\s*KGS", text)),
                "loss_percent": to_float(safe_search(r"\(\s*([0-9.,]+)\s*o/o\s*\)", text))
            }
        }
 # Configure root logger explicitly
 root = logging.getLogger()
 root.setLevel(logging.INFO)
@@ -138,6 +308,17 @@ def safe_search(pattern, text, default=None, group_index=1, context=""):
        logger.warning("Group index %d not found for %s: %s", group_index, context, pattern)
        return default
 def to_float(s):
    """Convert a numeric string with optional units into a float.

    Thousands separators (","), the unit markers "kg"/"kgs"/"lb"/"lbs" in any
    letter case, and "%" are removed before conversion.

    Returns None when ``s`` is empty/None or when what remains is not a
    valid float literal.
    """
    if not s:
        return None
    cleaned = str(s).lower()
    # Longer unit spellings go first: removing "kg" before "kgs" would leave
    # a stray "s" behind and make the conversion fail.
    for token in (",", "kgs", "kg", "lbs", "lb", "%"):
        cleaned = cleaned.replace(token, "")
    try:
        return float(cleaned.strip())
    except ValueError:
        return None
 def section(text, start, end=None):
    """Extract a block of text between two headings, safely."""
    pattern_start = re.escape(start)
@@ -157,6 +338,29 @@ def extract_field(text, label, default=None):
    pattern = rf"{re.escape(label)}\s*:?[\s]+([^\n]+)"
    return safe_search(pattern, text, default=default, context=f"field '{label}'")
 def extract(label, text, default=None):
    """
    Robust extraction for OCR/PDF text.

    Finds the first case-insensitive occurrence of ``label`` that is followed
    by a value and returns that value, stripped, up to the end of its line.
    Works with:
      Label: Value
      Label - Value
      Label Value
      Label .... Value   (a run of two or more leader dots is skipped)

    The whitespace between label and value may include line breaks, so a
    label that ends its line picks up the next non-blank line.

    Returns ``default`` when ``text`` is empty or no such occurrence exists.
    """
    if not text:
        return default
    # NOTE(review): a single "-" directly after the label is consumed as a
    # separator, so the minus sign of a negative value is dropped.
    separator = r"\s*(?:\.{2,}|[:\-])?\s*"
    pattern = re.escape(label) + separator + r"([^\n\r]+)"
    found = re.search(pattern, text, re.I)
    if found is None:
        return default
    return found.group(1).strip()
 def extract_report_metadata(text):
    logger.info("Starting metadata extraction, text length=%d", len(text))
@@ -254,6 +458,55 @@ def extract_report_metadata(text):
        logger.exception("Unexpected error during metadata extraction")
        raise HTTPException(status_code=500, detail=f"Metadata extraction failed: {e}")
 def detect_template(text):
    """Return the template name whose marker phrases occur in ``text``.

    Matching is case-insensitive. Rules are tried in the order listed and
    the first hit wins; "UNKNOWN" is returned when none applies.
    """
    lowered = text.lower()
    # (template name, how the phrases combine, marker phrases)
    rules = (
        ("AHK", all, ("alfred h. knight", "cotton landing report")),
        ("INTERTEK", all, ("intertek", "landing report")),
        ("ROBERTSON", any, ("robertson international", "ri ref no")),
        ("SGS", all, ("landing report", "carcon cargo")),
        ("PICL", any, ("pacific inspection company", "picl-bd.com")),
    )
    for name, combine, phrases in rules:
        if combine(phrase in lowered for phrase in phrases):
            return name
    return "UNKNOWN"
@app.post("/metadata")
 async def metadata(text: str = Body(..., embed=True)):
    """POST /metadata: return ``extract_report_metadata`` applied to the body's ``text``.

    ``text`` is a required body parameter declared with ``embed=True``.
    """
    return extract_report_metadata(text)
@app.post("/parse")
 async def parse_endpoint(text: str = Body(..., embed=True)):
    """POST /parse: return ``parse_report`` applied to the body's ``text``.

    ``text`` is a required body parameter declared with ``embed=True``.
    """
    return parse_report(text)
 # Registry of parser instances keyed by the template name each one declares
 # in its ``lab`` attribute.
 PARSERS = {
    parser.lab: parser
    for parser in (
        AHKParser(),
        IntertekParser(),
        RobertsonParser(),
        SGSParser(),
        PICLParser(),
    )
 }
 def parse_report(text):
    """Detect the report template and parse ``text`` with the matching parser.

    Returns ``{"template": <name>, "data": <parser output>}`` when a parser is
    registered in ``PARSERS`` for the detected template. Otherwise returns
    ``{"template": "UNKNOWN", "raw_text": <first 5000 characters of text>}``.
    """
    template = detect_template(text)
    # Lazy %-style arguments, like the other logging calls in this module.
    logger.info("Detected template: %s", template)
    parser = PARSERS.get(template)
    if parser is None:
        return {
            "template": "UNKNOWN",
            "raw_text": text[:5000]
        }
    return {
        "template": template,
        "data": parser.parse(text)
    }