# NOTE: this span was a unified diff of app.py (index 9f0704b..8cba8d7) whose
# line breaks had been collapsed. The code added by the patch is reproduced
# below as regular Python; unchanged diff context that is only partially
# visible is summarised in "[diff context]" comments instead of being guessed.

# [diff context, hunk -23,6 +23,176] The parser classes are inserted after:
#     file_handler.setFormatter(logging.Formatter(
#         "%(asctime)s - %(levelname)s - %(name)s - %(message)s"
#     ))


def _first_percent(text):
    """Return the numeric part of the first ``<number> %`` token in *text*.

    Returns None when no such token exists.
    """
    # NOTE(review): this is the first percentage anywhere in the report, not
    # necessarily the gain/loss figure -- confirm against real samples.
    match = re.search(r"([0-9.]+)\s*%", text)
    return match.group(1) if match else None


class AHKParser:
    """Field extractor for the report template identified as ``"AHK"``."""

    lab = "AHK"

    def parse(self, text):
        """Parse report *text* into ``report`` / ``shipment`` / ``weights`` dicts.

        Weight figures are read from sub-blocks cut out with ``section``.
        Fields that cannot be located fall back to the helpers' defaults.
        """
        invoice_block = section(text, "INVOICE WEIGHTS", "Bales Weighed")
        landed_block = section(text, "Bales Weighed", "Outturn")
        loss_block = section(text, "LOSS", "Invoice average")

        return {
            "report": {
                "lab": self.lab,
                "reference": safe_search(
                    r"(AHK\s*/\S+)", text, default=None, context="AHK reference"
                ),
                "date": extract("Produced On", text),
            },
            "shipment": {
                "bales": to_float(extract("Total Bales", text)),
                "vessel": extract("Vessel", text),
                "bl": extract("B/L No", text),
                "arrival_date": extract("Arrival Date", text),
            },
            "weights": {
                "invoice_kg": to_float(extract("Net", invoice_block)),
                "landed_kg": to_float(extract("Net", landed_block)),
                # NOTE(review): extract() returns the text *after* the label,
                # so this reads whatever follows the first "kg" in the LOSS
                # block -- confirm that matches the report layout.
                "gain_loss_kg": to_float(extract("kg", loss_block)),
                "gain_loss_percent": to_float(extract("Percentage", loss_block)),
            },
        }


class IntertekParser:
    """Field extractor for the report template identified as ``"INTERTEK"``."""

    lab = "INTERTEK"

    def parse(self, text):
        """Parse report *text* into ``report`` / ``shipment`` / ``weights`` dicts."""
        percent = _first_percent(text)

        return {
            "report": {
                "lab": self.lab,
                "reference": extract("Global Ref", text),
                "date": extract("Dated", text),
            },
            "shipment": {
                "bales": to_float(extract("Invoice Quantity", text)),
                "vessel": extract("Vessel", text),
                "bl": extract("B/L No", text),
                "arrival_date": extract("Arrival Date", text),
            },
            "weights": {
                "invoice_kg": to_float(extract("Invoice Weight", text)),
                "landed_kg": to_float(extract("Landed Weight", text)),
                "gain_loss_kg": to_float(extract("Gain", text)),
                "gain_loss_percent": to_float(percent),
            },
        }


class RobertsonParser:
    """Field extractor for the report template identified as ``"ROBERTSON"``."""

    lab = "ROBERTSON"

    def parse(self, text):
        """Parse report *text* into ``report`` / ``shipment`` / ``weights`` dicts."""
        percent = _first_percent(text)

        return {
            "report": {
                "lab": self.lab,
                "reference": extract("RI REF NO.", text),
                "date": extract("DATED", text),
            },
            "shipment": {
                "bales": to_float(extract("QUANTITY", text)),
                "vessel": extract("VESSEL", text),
                "bl": extract("B/L NO.", text),
                "arrival_date": extract("ARRIVAL DATE", text),
            },
            "weights": {
                "invoice_kg": to_float(extract("NET INVOICE WEIGHT", text)),
                "landed_kg": to_float(extract("NET LANDED WEIGHT", text)),
                "gain_loss_kg": to_float(extract("LOSS", text)),
                "gain_loss_percent": to_float(percent),
            },
        }


class SGSParser:
    """Field extractor for the report template identified as ``"SGS"``."""

    lab = "SGS"

    def parse(self, text):
        """Parse report *text* into report/contract/parties/shipment/weights dicts."""
        return {
            "report": {
                "lab": self.lab,
                "reference": extract("LANDING REPORT No", text),
                "file_no": extract("FILE NO.", text),
                "date": extract("DATE", text),
            },
            "contract": {
                "contract_no": extract("CONTRACT NO.", text),
                "invoice_no": extract("INVOICE NO.", text),
                "origin": extract("ORIGIN", text),
                "product": extract("PRODUCT", text),
            },
            "parties": {
                "seller": extract("Seller", text),
                "buyer": extract("Buyer", text),
                "carrier": extract("Carrier", text),
            },
            "shipment": {
                "bl": extract("B/L no.", text),
                "port_loading": extract("Port of loading", text),
                "port_destination": extract("Port of destination", text),
                "arrival_date": extract("Vessel arrival date", text),
                "devanning_date": extract("Container devanning date", text),
                "weighing_date": extract("Weighing date", text),
                "weighing_mode": extract("Weighing mode", text),
                "quantity_bales": to_float(extract("Quantity arrived", text)),
            },
            "weights": {
                "gross_landed_kg": to_float(extract("Gross landed", text)),
                "tare_kg": to_float(extract("Tare", text)),
                "net_landed_kg": to_float(extract("Net landed", text)),
                "net_invoiced_kg": to_float(extract("Net invoiced", text)),
                "gain_percent": to_float(
                    safe_search(r"Gain\s*\+?\s*([0-9.,]+)\s*%", text)
                ),
                "gain_kg": to_float(safe_search(r"Gain.*?([0-9.,]+)\s*kgs", text)),
            },
        }


class PICLParser:
    """Field extractor for the report template identified as ``"PICL"``."""

    lab = "PICL"

    # Weekday name, optional comma, then "<Month> <day>, <year>" (captured).
    # The weekday alternatives are wrapped in a non-capturing group so the
    # date is captured for every weekday; previously the capture group was
    # attached to the "Sunday" alternative only.
    DATE_PATTERN = (
        r"(?:Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sunday),?\s*"
        r"([A-Za-z]+\s+[0-9]{1,2},\s*[0-9]{4})"
    )

    def parse(self, text):
        """Parse report *text* into report/parties/shipment/contract/weights dicts."""
        # NOTE(review): several labels below ("Date of Anival & LDL",
        # "Invoice ilo & Date", "Total;", "o/o") look like deliberate matches
        # for OCR misreadings -- keep them in sync with real OCR output.
        return {
            "report": {
                "lab": self.lab,
                "reference": safe_search(r"No[:\s]+([A-Z0-9\-]+)", text),
                "date": safe_search(self.DATE_PATTERN, text),
            },
            "parties": {
                "seller": extract("FAIRCOT SA", text),
                "buyer": extract("M/S.", text),
            },
            "shipment": {
                "bales": to_float(extract("Grand Total", text)),
                "vessel": extract("Shipped Per Vessel", text),
                "feeder": extract("Feeder", text),
                "port_loading": extract("Port of Loading", text),
                "port_discharge": extract("Port of Discharge", text),
                "arrival_date": extract("Date of Anival & LDL", text),
                "weighing_place": extract("Place & Date of Weighment", text),
            },
            "contract": {
                "contract_no": extract("Contract/Pl No & Date", text),
                "invoice_no": extract("Invoice ilo & Date", text),
                "bl": extract("B/L No & Date", text),
                "origin": extract("Country of Origin", text),
                "commodity": extract("Commodity", text),
            },
            "weights": {
                "gross_landed_kg": to_float(extract("Total;", text)),
                "tare_kg": to_float(extract("Tare Weight", text)),
                # NOTE(review): same "Grand Total" label as shipment.bales
                # above, so both fields receive the same value -- confirm.
                "net_landed_kg": to_float(extract("Grand Total", text)),
                "invoice_weight_kg": to_float(extract("Invoice weight", text)),
                "loss_kg": to_float(safe_search(r"(-[0-9.,]+)\s*KGS", text)),
                "loss_percent": to_float(
                    safe_search(r"\(\s*([0-9.,]+)\s*o/o\s*\)", text)
                ),
            },
        }


# [diff context] Unchanged lines following the classes (logging setup that
# continues beyond the visible hunk):
#     # Configure root logger explicitly
#     root = logging.getLogger()
#     root.setLevel(logging.INFO)
#
# [diff context, hunk -138,6 +308,17] to_float() is inserted after the end of
#     def safe_search(pattern, text, default=None, group_index=1, context=""):
# whose visible tail logs "Group index %d not found for %s: %s" and returns
# `default`.

# Thousands separators, percent signs and weight units in any letter case.
_NUMBER_NOISE_RE = re.compile(r"kgs?|lbs?|[,%]", re.IGNORECASE)


def to_float(s):
    """Convert a numeric string with separators/units to ``float``.

    Commas, ``%`` and the units ``kg``/``kgs``/``lb``/``lbs`` (any case) are
    removed before conversion, e.g. ``"1,234.50 Kgs"`` -> ``1234.5``.

    Args:
        s: The raw value, typically a string produced by ``extract`` or
            ``safe_search``; may be None.

    Returns:
        The parsed float, or None when *s* is None/empty or is not a number
        once the noise has been stripped.
    """
    if s is None:
        return None
    if isinstance(s, str):
        s = _NUMBER_NOISE_RE.sub("", s).strip()
        if not s:
            return None
    try:
        return float(s)
    except (TypeError, ValueError, OverflowError):
        # Best-effort conversion: unparseable values are reported as missing.
        return None


# [diff context] to_float() is followed by the unchanged helper
#     def section(text, start, end=None):
#         """Extract a block of text between two headings, safely."""
# whose body continues beyond this span and is not reproduced here.
# NOTE: this span was part of a unified diff of app.py whose line breaks had
# been collapsed. The code added by the patch is reproduced below as regular
# Python; unchanged diff context that is only partially visible is summarised
# in "[diff context]" comments instead of being guessed.

# [diff context, hunk -157,6 +338,29] extract() is inserted after the end of
#     def extract_field(text, label, default=None):
# whose visible tail builds rf"{re.escape(label)}\s*:?[\s]+([^\n]+)" and
# delegates to safe_search(..., context=f"field '{label}'").


def extract(label, text, default=None):
    """Return the text that follows *label* in OCR/PDF *text*.

    Supported layouts::

        Label: Value
        Label Value
        Label - Value
        Label .... Value

    The label is matched case-insensitively and is not anchored to word
    boundaries, so it can also match inside a longer word. Whitespace between
    label and value may include a line break, in which case the value is
    taken from the following line.

    Args:
        label: Literal label text to look for (regex-escaped internally).
        text: Text to search; a falsy value yields *default*.
        default: Value returned when the label is not found.

    Returns:
        The stripped remainder of the line after the label and separator, or
        *default* when there is no match.
    """
    if not text:
        return default

    pattern = (
        rf"{re.escape(label)}"
        r"\s*(?:\.{2,}\s*)?"   # optional dotted leader ("Label .... Value")
        r"(?::|-(?![\d.]))?"   # ':' or '-' separator; keep a value's minus sign
        r"\s*([^\n\r]+)"
    )
    match = re.search(pattern, text, re.I)
    if match:
        return match.group(1).strip()
    return default


# [diff context] extract() is followed by the unchanged function
#     def extract_report_metadata(text):
# which is only partially visible (it starts by logging the text length and
# ends by logging the exception and raising HTTPException(status_code=500)).
# [diff context, hunk -254,6 +458,55] detect_template() is inserted after it.


def detect_template(text):
    """Identify which report template *text* belongs to.

    Matching is done on lower-cased text using fixed marker phrases. The
    checks run in order, so the INTERTEK test takes precedence over the SGS
    test (both require "landing report").

    Returns:
        One of "AHK", "INTERTEK", "ROBERTSON", "SGS", "PICL", or "UNKNOWN"
        when no marker matches or *text* is empty/None.
    """
    if not text:
        return "UNKNOWN"

    t = text.lower()

    if "alfred h. knight" in t and "cotton landing report" in t:
        return "AHK"

    if "intertek" in t and "landing report" in t:
        return "INTERTEK"

    if "robertson international" in t or "ri ref no" in t:
        return "ROBERTSON"

    if "landing report" in t and "carcon cargo" in t:
        return "SGS"

    if "pacific inspection company" in t or "picl-bd.com" in t:
        return "PICL"

    return "UNKNOWN"


@app.post("/metadata")
async def metadata(text: str = Body(..., embed=True)):
    """POST /metadata: run ``extract_report_metadata`` on the embedded ``text``."""
    return extract_report_metadata(text)


# Template key (as returned by detect_template) -> parser instance.
PARSERS = {
    "AHK": AHKParser(),
    "INTERTEK": IntertekParser(),
    "ROBERTSON": RobertsonParser(),
    "SGS": SGSParser(),
    "PICL": PICLParser(),
}

# Maximum number of characters of the input echoed back for unknown templates.
_RAW_TEXT_PREVIEW_CHARS = 5000


def parse_report(text):
    """Detect the report template and parse *text* with the matching parser.

    Returns:
        ``{"template": <key>, "data": <parser output>}`` for a known
        template, otherwise ``{"template": "UNKNOWN", "raw_text": <preview>}``
        where the preview is the first 5000 characters of *text*.
    """
    template = detect_template(text)
    logger.info("Detected template: %s", template)

    parser = PARSERS.get(template)
    if parser is None:
        return {
            "template": "UNKNOWN",
            "raw_text": (text or "")[:_RAW_TEXT_PREVIEW_CHARS],
        }

    return {
        "template": template,
        "data": parser.parse(text),
    }


@app.post("/parse")
async def parse_endpoint(text: str = Body(..., embed=True)):
    """POST /parse: detect the template of the embedded ``text`` and parse it."""
    return parse_report(text)