From a55a956b61c03265eb9069727800aae5689820e7 Mon Sep 17 00:00:00 2001
From: laurentbarontini <l.barontini@open-squared.ch>
Date: Sun, 11 Jan 2026 17:52:26 +0100
Subject: [PATCH] 11.01.26

---
 app.py | 282 ++++++++++++++++++++++++++++++++++++++++++++++++++-------
 1 file changed, 248 insertions(+), 34 deletions(-)

diff --git a/app.py b/app.py
index 9e7e99b..0b6edc0 100644
--- a/app.py
+++ b/app.py
@@ -426,7 +426,6 @@ class PICLParser:
         r["weights"]["gain_loss_percent"]=to_float(safe_search(r"\(\s*([0-9.,]+)\s*o/o\s*\)",text))
         return r
 
-
 # Configure root logger explicitly
 root = logging.getLogger()
 root.setLevel(logging.INFO)
@@ -444,48 +443,263 @@ predictor = ocr_predictor(pretrained=True)
 
 logger.info("Models loaded successfully.")
 
-# =============================
-# 🧠 Smart OCR
-# =============================
+import io
+import re
+from datetime import datetime
+from typing import Dict, Any
+import pytesseract
+from pdf2image import convert_from_bytes
+from PIL import Image
+from PyPDF2 import PdfReader
+import json
+
+def parse_cotton_report(ocr_text: str) -> Dict[str, Any]:
+    """
+    Parse structured data from cotton landing report OCR text.
+
+    Every field is located with a case-insensitive regular expression and
+    stays ``None`` when its label is missing or its value cannot be parsed.
+
+    Args:
+        ocr_text: Raw report text (OCR output or native PDF text). ``None``
+            or an empty string yields the skeleton with every field unset.
+
+    Returns:
+        A dict with the keys ``lab``, ``report``, ``contract``, ``parties``,
+        ``shipment`` and ``weights``. Weights are floats; ``gain_loss_kg``
+        and ``gain_loss_percent`` are always stored as negative numbers.
+
+    Dates keep the report's day-month-year layout (e.g. ``05-Jan-2026``).
+    """
+    # Tolerate a missing OCR result instead of failing on None
+    ocr_text = ocr_text or ""
+    flags = re.IGNORECASE
+
+    def _number(token):
+        """Turn '1,234.50' into 1234.5; return None for digit-less OCR noise."""
+        try:
+            return float(token.replace(',', ''))
+        except ValueError:
+            return None
+
+    def _labelled(pattern, label):
+        """Return the stripped capture, or None if absent or an echoed label."""
+        match = re.search(pattern, ocr_text, flags)
+        if not match:
+            return None
+        value = match.group(1).strip()
+        return None if value.lower().startswith(label) else value
+
+    # Output skeleton: every field stays None until a pattern matches
+    result = {
+        "lab": "ALFRED H KNIGHT",
+        "report": {"reference": None, "file_no": None, "date": None},
+        "contract": {
+            "contract_no": None, "invoice_no": None, "lc_no": None,
+            "origin": None, "commodity": None,
+        },
+        "parties": {"seller": None, "buyer": None, "carrier": None},
+        "shipment": {
+            "vessel": None, "bl_no": None, "port_loading": None,
+            "port_destination": None, "arrival_date": None,
+            "weighing_place": None, "weighing_method": None,
+            "bales": None,
+        },
+        "weights": {
+            "gross_landed_kg": None, "tare_kg": None,
+            "net_landed_kg": None, "invoice_net_kg": None,
+            "gain_loss_kg": None, "gain_loss_percent": None,
+        },
+    }
+    report = result["report"]
+    contract = result["contract"]
+    parties = result["parties"]
+    shipment = result["shipment"]
+    weights = result["weights"]
+
+    # 1. Report reference and file number
+    ref_match = re.search(r'client reference:\s*([^\n]+)', ocr_text, flags)
+    if ref_match:
+        report["reference"] = ref_match.group(1).strip()
+
+    # The file number is embedded in the AHK reference ("AHK S/<digits>/")
+    ahk_match = re.search(r'ahk s/(\d+)/', ocr_text, flags)
+    if ahk_match:
+        report["file_no"] = ahk_match.group(1)
+
+    # 2. Printed date
+    date_match = re.search(r'printed date:\s*(\d{1,2}-[a-z]+-\d{4})', ocr_text, flags)
+    if date_match:
+        report["date"] = date_match.group(1).title()
+
+    # 3. Contract information
+    # Origin/Growth
+    growth_match = re.search(r'growth\s*:?\s*([^\n:]+)', ocr_text, flags)
+    if growth_match:
+        contract["origin"] = growth_match.group(1).strip()
+        contract["commodity"] = "COTTON"
+
+    # The invoice number is quoted inside the client reference ("INV 123")
+    if report["reference"]:
+        inv_match = re.search(r'inv\s*(\d+)', report["reference"], flags)
+        if inv_match:
+            contract["invoice_no"] = inv_match.group(1)
+
+    # 4. Parties. The lookahead keeps the "Client Reference: ..." line
+    # from being mistaken for the seller line.
+    parties["seller"] = _labelled(r'client(?!\s*reference)\s*:?\s*([^\n]+)', 'client')
+    parties["buyer"] = _labelled(r'buyer\s*:?\s*([^\n]+)', 'buyer')
+
+    # 5. Shipment details
+    # (values that merely echo their own label are dropped by _labelled)
+    shipment["vessel"] = _labelled(r'vessel\s*:?\s*([^\n]+)', 'vessel')
+    shipment["port_destination"] = _labelled(r'destination\s*:?\s*([^\n]+)', 'destination')
+    shipment["weighing_method"] = _labelled(r'weighing method\s*:?\s*([^\n]+)', 'weighing')
+
+    # B/L number
+    bl_match = re.search(r'b/l no\.?\s*:?\s*([^\n]+)', ocr_text, flags)
+    if bl_match:
+        shipment["bl_no"] = bl_match.group(1).strip()
+
+    # Arrival date
+    arrival_match = re.search(r'arrival date\s*:?\s*(\d{1,2}-[a-z]+-\d{4})', ocr_text, flags)
+    if arrival_match:
+        shipment["arrival_date"] = arrival_match.group(1).title()
+
+    # Bales count
+    bales_match = re.search(r'total bales\s*:?\s*(\d+)', ocr_text, flags)
+    if bales_match:
+        shipment["bales"] = int(bales_match.group(1))
+
+    # 6. Weights. Labels are matched case-insensitively like every other
+    # field, so "Gross"/"GROSS" are found as well as "gross".
+    # The first "gross" figure is the invoice weight, the second is landed.
+    gross_values = re.findall(r'gross\s*:?\s*([\d,\.]+)\s*kg', ocr_text, flags)
+    if len(gross_values) >= 2:
+        weights["gross_landed_kg"] = _number(gross_values[1])
+
+    # Tare weight
+    tare_match = re.search(r'tare\s*:?\s*([\d,\.]+)\s*kg', ocr_text, flags)
+    if tare_match:
+        weights["tare_kg"] = _number(tare_match.group(1))
+
+    # Net weights follow the same order: invoice first, landed second
+    net_values = re.findall(r'net\s*:?\s*([\d,\.]+)\s*kg', ocr_text, flags)
+    if len(net_values) >= 2:
+        weights["invoice_net_kg"] = _number(net_values[0])
+        weights["net_landed_kg"] = _number(net_values[1])
+
+    # Loss/Gain: the regex captures the magnitude only (the optional sign
+    # sits outside the group), so negate unconditionally; "Loss: -100 kg"
+    # and "Loss: 100 kg" both yield -100.0.
+    loss_match = re.search(r'loss\s*:?\s*[-–]?\s*([\d,\.]+)\s*kg', ocr_text, flags)
+    if loss_match:
+        magnitude = _number(loss_match.group(1))
+        weights["gain_loss_kg"] = None if magnitude is None else -magnitude
+
+    # Percentage (same sign convention as the loss)
+    percent_match = re.search(r'percentage\s*:?\s*[-–]?\s*([\d,\.]+)%', ocr_text, flags)
+    if percent_match:
+        magnitude = _number(percent_match.group(1))
+        weights["gain_loss_percent"] = None if magnitude is None else -magnitude
+
+    return result
+
 @app.post("/ocr")
 async def ocr(file: UploadFile):
-    logger.info(f"Received OCR request: {file.filename}")
+    """
+    OCR endpoint: extracts text from an uploaded PDF or image and returns it with the parsed report fields
+    """
+    logger.info(f"Received structured OCR request: {file.filename}")
+    
     try:
         file_data = await file.read()
         ext = file.filename.lower()
-
-        # --------- PDF with native text ---------
+        
+        ocr_text = ""
+        
+        # Process PDF
         if ext.endswith(".pdf"):
-            logger.info("PDF detected → Extracting native text first")
+            # Try native text extraction first
             reader = PdfReader(io.BytesIO(file_data))
-            direct_text = "".join(
-                page.extract_text() or "" for page in reader.pages
-            )
-
+            direct_text = "".join(page.extract_text() or "" for page in reader.pages)
+            
             if direct_text.strip():
-                logger.info("Native PDF text found → No OCR needed")
-                return {"ocr_text": direct_text}
-
-            # -------- Fallback: scanned PDF OCR --------
-            logger.info("No native text → PDF treated as scanned → OCR")
-            from pdf2image import convert_from_bytes
-            images = convert_from_bytes(file_data)
-            text = ""
-            for i, img in enumerate(images):
-                logger.info(f"OCR page {i+1}/{len(images)}")
-                text += pytesseract.image_to_string(img) + "\n"
-
-            return {"ocr_text": text}
-
-        # --------- Image file OCR ---------
-        logger.info("Image detected → Running OCR")
-        img = Image.open(io.BytesIO(file_data))
-        text = pytesseract.image_to_string(img)
-        return {"ocr_text": text}
-
+                logger.info("Using native PDF text")
+                ocr_text = direct_text
+            else:
+                # Fallback to OCR
+                logger.info("Using OCR for scanned PDF")
+                images = convert_from_bytes(file_data)
+                for i, img in enumerate(images):
+                    logger.info(f"OCR page {i+1}/{len(images)}")
+                    ocr_text += pytesseract.image_to_string(img) + "\n"
+        else:
+            # Process image
+            img = Image.open(io.BytesIO(file_data))
+            ocr_text = pytesseract.image_to_string(img)
+        
+        # Parse structured data
+        structured_data = parse_cotton_report(ocr_text)
+        
+        return {
+            "success": True,
+            "raw_text": ocr_text[:1000] + "..." if len(ocr_text) > 1000 else ocr_text,
+            "structured_data": structured_data,
+            "json": json.dumps(structured_data, indent=2, ensure_ascii=False)
+        }
+        
     except Exception as e:
-        logger.error(f"OCR failed: {e}", exc_info=True)
-        raise HTTPException(status_code=500, detail=str(e))
+        logger.error(f"Structured OCR failed: {e}", exc_info=True)
+        return {
+            "success": False,
+            "error": str(e),
+            "raw_text": "",
+            "structured_data": {}
+        }
+
+# =============================
+# 🧠 Smart OCR
+# =============================
+# @app.post("/ocr")
+# async def ocr(file: UploadFile):
+#     logger.info(f"Received OCR request: {file.filename}")
+#     try:
+#         file_data = await file.read()
+#         ext = file.filename.lower()
+
+#         # --------- PDF with native text ---------
+#         if ext.endswith(".pdf"):
+#             logger.info("PDF detected → Extracting native text first")
+#             reader = PdfReader(io.BytesIO(file_data))
+#             direct_text = "".join(
+#                 page.extract_text() or "" for page in reader.pages
+#             )
+
+#             if direct_text.strip():
+#                 logger.info("Native PDF text found → No OCR needed")
+#                 return {"ocr_text": direct_text}
+
+#             # -------- Fallback: scanned PDF OCR --------
+#             logger.info("No native text → PDF treated as scanned → OCR")
+#             from pdf2image import convert_from_bytes
+#             images = convert_from_bytes(file_data)
+#             text = ""
+#             for i, img in enumerate(images):
+#                 logger.info(f"OCR page {i+1}/{len(images)}")
+#                 text += pytesseract.image_to_string(img) + "\n"
+
+#             return {"ocr_text": text}
+
+#         # --------- Image file OCR ---------
+#         logger.info("Image detected → Running OCR")
+#         img = Image.open(io.BytesIO(file_data))
+#         text = pytesseract.image_to_string(img)
+#         return {"ocr_text": text}
+
+#     except Exception as e:
+#         logger.error(f"OCR failed: {e}", exc_info=True)
+#         raise HTTPException(status_code=500, detail=str(e))
 
 # =============================
 # 🧱 Structure / Layout