"""OCR / document-parsing microservice for cotton landing reports.

Exposes FastAPI endpoints:
  POST /ocr       - OCR (native PDF text, scanned-PDF fallback, images) + structured parse
  POST /structure - docTR layout prediction
  POST /tables    - camelot table extraction (PDF only)
  POST /metadata  - AHK-specific metadata extraction from raw text
  POST /parse     - template detection + lab-specific structured parse
"""

import io
import json
import logging
import os
import re
import tempfile
from datetime import datetime
from logging.handlers import RotatingFileHandler
from typing import Any, Dict

import camelot
import pytesseract
import spacy
from doctr.io import DocumentFile
from doctr.models import ocr_predictor
from fastapi import Body, FastAPI, HTTPException, UploadFile
from pdf2image import convert_from_bytes
from PIL import Image
from PyPDF2 import PdfReader

LOG_PATH = "/var/log/automation-service.log"

# Rotating file log: 5 backups of 10 MiB each.
file_handler = RotatingFileHandler(
    LOG_PATH, maxBytes=10 * 1024 * 1024, backupCount=5, encoding="utf-8"
)
file_handler.setFormatter(logging.Formatter(
    "%(asctime)s - %(levelname)s - %(name)s - %(message)s"
))


class AHKParser:
    """Parser for Alfred H. Knight (AHK) cotton landing reports.

    AHK reports lay label columns and ":"-prefixed value columns out
    vertically, so values are matched positionally via _col_block().
    """

    lab = "AHK"

    def _lines(self, text):
        """Split text into stripped, non-empty lines."""
        return [l.strip() for l in text.splitlines() if l.strip()]

    def _col_block(self, lines, labels, max_scan=30):
        """Map *labels* to the ":"-prefixed values that follow them.

        Scans up to *max_scan* lines after the LAST label occurrence and
        pairs labels with values in order. Returns {} when no label is
        present (guard: prevents a crash on max() of an empty list).
        """
        idx = [i for i, l in enumerate(lines) if l in labels]
        if not idx:
            return {}
        start = max(idx) + 1
        vals = []
        for l in lines[start:start + max_scan]:
            if l.startswith(":"):
                v = l[1:].replace("kg", "").strip()
                vals.append(v)
                if len(vals) == len(labels):
                    break
        return dict(zip(labels, vals))

    def parse(self, text):
        """Return a normalized weight-report dict extracted from *text*."""
        L = self._lines(text)
        r = empty_weight_report("AHK")
        # report
        r["report"]["reference"] = safe_search(r"(AHK\s*/[A-Z0-9/]+)", text)
        r["report"]["date"] = safe_search(r"Produced On\s*([0-9A-Za-z ]+)", text)
        # contract
        r["contract"]["invoice_no"] = safe_search(r"Client Reference:\s*([A-Z0-9\- /]+)", text)
        r["contract"]["commodity"] = "Raw Cotton"
        # buyer
        r["parties"]["buyer"] = safe_search(r"Buyer:\s*([A-Z0-9 ().,-]+)", text)
        # shipment block 1
        ship1 = self._col_block(L, [
            "Total Bales", "Vessel", "Voy. No.", "B/L No.", "B/L Date", "Destination"
        ])
        # shipment block 2
        ship2 = self._col_block(L, [
            "Growth", "Arrival Date", "First date of weighing",
            "Last Date of Weighing", "Weighing method", "Tare"
        ])
        r["shipment"]["bales"] = to_float(ship1.get("Total Bales"))
        r["shipment"]["vessel"] = ship1.get("Vessel")
        r["shipment"]["bl_no"] = ship1.get("B/L No.")
        r["shipment"]["port_destination"] = ship1.get("Destination")
        r["shipment"]["arrival_date"] = ship2.get("Arrival Date")
        r["shipment"]["weighing_method"] = ship2.get("Weighing method")
        r["contract"]["origin"] = ship2.get("Growth")
        # invoice weights
        inv = self._col_block(L, ["Bales", "Gross", "Tare", "Net"])
        r["weights"]["invoice_net_kg"] = to_float(inv.get("Net"))
        # landed weights (restricted to the "Bales Weighed" section)
        land = self._col_block(
            self._lines(section(text, "Bales Weighed", "Outturn")),
            ["Bales", "Gross", "Tare", "Net"]
        )
        r["weights"]["gross_landed_kg"] = to_float(land.get("Gross"))
        r["weights"]["tare_kg"] = to_float(land.get("Tare"))
        r["weights"]["net_landed_kg"] = to_float(land.get("Net"))
        # loss
        loss = section(text, "LOSS", "Invoice average")
        r["weights"]["gain_loss_kg"] = to_float(safe_search(r"(-?\d+\.?\d*)\s*kg", loss))
        r["weights"]["gain_loss_percent"] = to_float(safe_search(r"Percentage\s*:\s*(-?\d+\.?\d*)", loss))
        return r


class IntertekParser:
    """Parser for Intertek landing reports (simple label/value layout)."""

    lab = "INTERTEK"

    def parse(self, text):
        """Return a normalized weight-report dict extracted from *text*."""
        r = empty_weight_report("INTERTEK")
        pct = safe_search(r"([0-9.]+)\s*%", text)
        r["report"]["reference"] = extract("Global Ref", text)
        r["report"]["file_no"] = extract("Report / File No", text)
        r["report"]["date"] = extract("Dated", text)
        r["contract"]["contract_no"] = extract("Contract No", text)
        r["contract"]["invoice_no"] = extract("Invoice No", text)
        r["contract"]["origin"] = extract("Growth", text)
        r["contract"]["commodity"] = "Raw Cotton"
        r["parties"]["buyer"] = extract("Buyer", text)
        r["shipment"]["vessel"] = extract("Vessel", text)
        r["shipment"]["bl_no"] = extract("B/L No", text)
        r["shipment"]["arrival_date"] = extract("Arrival Date", text)
        r["shipment"]["weighing_place"] = extract("Weighed at", text)
        r["shipment"]["bales"] = to_float(extract("Invoice Quantity", text))
        r["weights"]["gross_landed_kg"] = to_float(extract("Gross", text))
        r["weights"]["tare_kg"] = to_float(extract("Invoice Tare", text))
        r["weights"]["net_landed_kg"] = to_float(extract("Landed Weight", text))
        r["weights"]["invoice_net_kg"] = to_float(extract("Invoice Weight", text))
        r["weights"]["gain_loss_kg"] = to_float(extract("Gain", text))
        r["weights"]["gain_loss_percent"] = to_float(pct)
        return r


class RobertsonParser:
    """Parser for Robertson International landing reports (ALL-CAPS labels)."""

    lab = "ROBERTSON"

    def parse(self, text):
        """Return a normalized weight-report dict extracted from *text*."""
        r = empty_weight_report("ROBERTSON")
        pct = safe_search(r"([0-9.]+)\s*%", text)
        r["report"]["reference"] = extract("OUR REF", text)
        r["report"]["date"] = extract("DATE", text)
        r["contract"]["contract_no"] = extract("CONTRACT NO", text)
        r["contract"]["invoice_no"] = extract("INVOICE NO", text)
        r["contract"]["lc_no"] = extract("LIC NO", text)
        r["contract"]["commodity"] = "Raw Cotton"
        r["parties"]["seller"] = extract("SELLER", text)
        r["parties"]["buyer"] = extract("BUYER", text)
        r["shipment"]["vessel"] = extract("NAME OF VESSEL", text)
        r["shipment"]["port_loading"] = extract("SAILED FROM", text)
        r["shipment"]["port_destination"] = extract("ARRIVED AT", text)
        r["shipment"]["arrival_date"] = extract("DATE OF ARRIVAL", text)
        r["shipment"]["weighing_place"] = extract("PLACE OF CONTROL", text)
        r["shipment"]["bales"] = to_float(extract("CONSIGNMENT", text))
        r["weights"]["gross_landed_kg"] = to_float(extract("GROSS", text))
        r["weights"]["tare_kg"] = to_float(extract("TARE", text))
        r["weights"]["net_landed_kg"] = to_float(extract("LANDED NET", text))
        r["weights"]["invoice_net_kg"] = to_float(extract("INVOICE NET", text))
        r["weights"]["gain_loss_kg"] = to_float(extract("GAIN", text))
        r["weights"]["gain_loss_percent"] = to_float(pct)
        return r


class SGSParser:
    """Parser for SGS landing reports (mixed-case labels, 'Gain ... kgs/%')."""

    lab = "SGS"

    def parse(self, text):
        """Return a normalized weight-report dict extracted from *text*."""
        r = empty_weight_report("SGS")
        r["report"]["reference"] = extract("LANDING REPORT No", text)
        r["report"]["file_no"] = extract("FILE NO.", text)
        r["report"]["date"] = extract("DATE", text)
        r["contract"]["contract_no"] = extract("CONTRACT NO.", text)
        r["contract"]["invoice_no"] = extract("INVOICE NO.", text)
        r["contract"]["origin"] = extract("ORIGIN", text)
        r["contract"]["commodity"] = extract("PRODUCT", text)
        r["parties"]["seller"] = extract("Seller", text)
        r["parties"]["buyer"] = extract("Buyer", text)
        r["parties"]["carrier"] = extract("Carrier", text)
        r["shipment"]["bl_no"] = extract("B/L no.", text)
        r["shipment"]["port_loading"] = extract("Port of loading", text)
        r["shipment"]["port_destination"] = extract("Port of destination", text)
        r["shipment"]["arrival_date"] = extract("Vessel arrival date", text)
        r["shipment"]["weighing_place"] = extract("Place of weighing", text)
        r["shipment"]["weighing_method"] = extract("Weighing mode", text)
        r["shipment"]["bales"] = to_float(extract("Quantity arrived", text))
        r["weights"]["gross_landed_kg"] = to_float(extract("Gross landed", text))
        r["weights"]["tare_kg"] = to_float(extract("Tare", text))
        r["weights"]["net_landed_kg"] = to_float(extract("Net landed", text))
        r["weights"]["invoice_net_kg"] = to_float(extract("Net invoiced", text))
        r["weights"]["gain_loss_kg"] = to_float(safe_search(r"Gain.*?([0-9.,]+)\s*kgs", text))
        r["weights"]["gain_loss_percent"] = to_float(safe_search(r"Gain\s*\+?\s*([0-9.,]+)\s*%", text))
        return r


class PICLParser:
    """Parser for Pacific Inspection Company (PICL) reports.

    NOTE(review): several labels ("Invoice ilo & Date", "Date of Anival & LDL",
    "o/o") deliberately match recurring OCR misreads in these documents.
    """

    lab = "PICL"

    def parse(self, text):
        """Return a normalized weight-report dict extracted from *text*."""
        r = empty_weight_report("PICL")
        r["report"]["reference"] = safe_search(r"No[:\s]+([A-Z0-9\-]+)", text)
        # Date follows a spelled-out weekday; the date itself is group 2.
        r["report"]["date"] = safe_search(
            r"(Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sunday),?\s*([A-Za-z]+\s+[0-9]{1,2},\s*[0-9]{4})",
            text, group_index=2)
        r["contract"]["contract_no"] = extract("Contract/Pl No & Date", text)
        r["contract"]["invoice_no"] = extract("Invoice ilo & Date", text)
        r["contract"]["lc_no"] = extract("L/C No & Date", text)
        r["contract"]["origin"] = extract("Country of Origin", text)
        r["contract"]["commodity"] = extract("Commodity", text)
        r["parties"]["seller"] = extract("FAIRCOT SA", text)
        r["parties"]["buyer"] = extract("M/S.", text)
        r["parties"]["carrier"] = extract("Shipping Agent", text)
        r["shipment"]["vessel"] = extract("Shipped Per Vessel", text)
        r["shipment"]["bl_no"] = extract("B/L No & Date", text)
        r["shipment"]["port_loading"] = extract("Port of Loading", text)
        r["shipment"]["port_destination"] = extract("Port of Discharge", text)
        r["shipment"]["arrival_date"] = extract("Date of Anival & LDL", text)
        r["shipment"]["weighing_place"] = extract("Place & Date of Weighment", text)
        r["shipment"]["weighing_method"] = extract("Method of Weighment", text)
        r["shipment"]["bales"] = to_float(extract("Grand Total", text))
        r["weights"]["gross_landed_kg"] = to_float(extract("Total;", text))
        r["weights"]["tare_kg"] = to_float(extract("Tare Weight", text))
        r["weights"]["net_landed_kg"] = to_float(extract("Grand Total", text))
        r["weights"]["invoice_net_kg"] = to_float(extract("Invoice weight", text))
        r["weights"]["gain_loss_kg"] = to_float(safe_search(r"(-[0-9.,]+)\s*KGS", text))
        r["weights"]["gain_loss_percent"] = to_float(safe_search(r"\(\s*([0-9.,]+)\s*o/o\s*\)", text))
        return r


# Configure root logger explicitly: file + console.
root = logging.getLogger()
root.setLevel(logging.INFO)
root.addHandler(file_handler)
root.addHandler(logging.StreamHandler())

logger = logging.getLogger(__name__)

app = FastAPI()

# Models are loaded once at import time (slow; keep the service warm).
logger.info("Loading models...")
nlp = spacy.load("en_core_web_sm")
predictor = ocr_predictor(pretrained=True)
logger.info("Models loaded successfully.")


def parse_cotton_report(ocr_text: str) -> Dict[str, Any]:
    """
    Parse structured data from cotton landing report OCR text
    """
    result = {
        "lab": "ALFRED H KNIGHT",
        "report": {"reference": None, "file_no": None, "date": None},
        "contract": {"contract_no": None, "invoice_no": None, "lc_no": None,
                     "origin": None, "commodity": None},
        "parties": {"seller": None, "buyer": None, "carrier": None},
        "shipment": {
            "vessel": None, "bl_no": None, "port_loading": None,
            "port_destination": None, "arrival_date": None,
            "weighing_place": None, "weighing_method": None, "bales": None
        },
        "weights": {
            "gross_landed_kg": None, "tare_kg": None, "net_landed_kg": None,
            "invoice_net_kg": None, "gain_loss_kg": None, "gain_loss_percent": None
        }
    }

    # Lower-cased copy used by patterns written in lower case.
    text = ocr_text.lower()

    # 1. Extract report reference and file number
    ref_match = re.search(r'client reference:\s*([^\n]+)', ocr_text, re.IGNORECASE)
    if ref_match:
        result["report"]["reference"] = ref_match.group(1).strip()

    # Try to get file number from AHK reference
    ahk_match = re.search(r'ahk s/(\d+)/', ocr_text, re.IGNORECASE)
    if ahk_match:
        result["report"]["file_no"] = ahk_match.group(1)

    # 2. Extract dates
    date_match = re.search(r'printed date:\s*(\d{1,2}-[a-z]+-\d{4})', text, re.IGNORECASE)
    if date_match:
        result["report"]["date"] = date_match.group(1).title()

    # 3. Extract contract information — Origin/Growth
    growth_match = re.search(r'growth\s*:?\s*([^\n:]+(?:raw cotton)?)', ocr_text, re.IGNORECASE)
    if growth_match:
        origin = growth_match.group(1).strip()
        result["contract"]["origin"] = origin
        result["contract"]["commodity"] = "COTTON"

    # Invoice number from reference
    if result["report"]["reference"]:
        inv_match = re.search(r'inv\s*(\d+)', result["report"]["reference"], re.IGNORECASE)
        if inv_match:
            result["contract"]["invoice_no"] = inv_match.group(1)

    # 4. Extract parties — Seller
    seller_match = re.search(r'client\s*:?\s*([^\n]+)', ocr_text, re.IGNORECASE)
    if seller_match:
        # Skip the "Client" label if present
        seller_text = seller_match.group(1).strip()
        if not seller_text.lower().startswith('client'):
            result["parties"]["seller"] = seller_text

    # Buyer
    buyer_match = re.search(r'buyer\s*:?\s*([^\n]+)', ocr_text, re.IGNORECASE)
    if buyer_match:
        buyer_text = buyer_match.group(1).strip()
        if not buyer_text.lower().startswith('buyer'):
            result["parties"]["buyer"] = buyer_text

    # 5. Extract shipment details — Vessel
    vessel_match = re.search(r'vessel\s*:?\s*([^\n]+)', ocr_text, re.IGNORECASE)
    if vessel_match:
        vessel_text = vessel_match.group(1).strip()
        if not vessel_text.lower().startswith('vessel'):
            result["shipment"]["vessel"] = vessel_text

    # B/L Number
    bl_match = re.search(r'b/l no\.?\s*:?\s*([^\n]+)', ocr_text, re.IGNORECASE)
    if bl_match:
        bl_text = bl_match.group(1).strip()
        result["shipment"]["bl_no"] = bl_text

    # Destination
    dest_match = re.search(r'destination\s*:?\s*([^\n]+)', ocr_text, re.IGNORECASE)
    if dest_match:
        dest_text = dest_match.group(1).strip()
        if not dest_text.lower().startswith('destination'):
            result["shipment"]["port_destination"] = dest_text

    # Arrival Date
    arrival_match = re.search(r'arrival date\s*:?\s*(\d{1,2}-[a-z]+-\d{4})', text, re.IGNORECASE)
    if arrival_match:
        result["shipment"]["arrival_date"] = arrival_match.group(1).title()

    # Weighing method
    weigh_match = re.search(r'weighing method\s*:?\s*([^\n]+)', ocr_text, re.IGNORECASE)
    if weigh_match:
        method_text = weigh_match.group(1).strip()
        if not method_text.lower().startswith('weighing'):
            result["shipment"]["weighing_method"] = method_text

    # Bales count
    bales_match = re.search(r'total bales\s*:?\s*(\d+)', ocr_text, re.IGNORECASE)
    if bales_match:
        result["shipment"]["bales"] = int(bales_match.group(1))

    # 6. Extract weights (critical section)
    # BUGFIX: these patterns are lower-case but previously searched the
    # original-case text WITHOUT re.IGNORECASE, so "Gross/Tare/Net: ... kg"
    # never matched. The flag is now applied consistently.
    # Gross Landed Weight — the second occurrence is the landed weight.
    all_gross = re.findall(r'gross\s*:?\s*([\d,\.]+)\s*kg', ocr_text, re.IGNORECASE)
    if len(all_gross) >= 2:
        result["weights"]["gross_landed_kg"] = float(all_gross[1].replace(',', ''))

    # Tare weight (should be same in both)
    tare_match = re.search(r'tare\s*:?\s*([\d,\.]+)\s*kg', ocr_text, re.IGNORECASE)
    if tare_match:
        result["weights"]["tare_kg"] = float(tare_match.group(1).replace(',', ''))

    # Net weights: first = invoice, second = landed.
    net_matches = re.findall(r'net\s*:?\s*([\d,\.]+)\s*kg', ocr_text, re.IGNORECASE)
    if len(net_matches) >= 2:
        result["weights"]["invoice_net_kg"] = float(net_matches[0].replace(',', ''))
        result["weights"]["net_landed_kg"] = float(net_matches[1].replace(',', ''))

    # Loss/Gain
    loss_match = re.search(r'loss\s*:?\s*[-–]?\s*([\d,\.]+)\s*kg', ocr_text, re.IGNORECASE)
    if loss_match:
        loss_value = float(loss_match.group(1).replace(',', ''))
        # Make it negative if not already indicated
        if '-' not in loss_match.group(0) and '–' not in loss_match.group(0):
            loss_value = -loss_value
        result["weights"]["gain_loss_kg"] = loss_value

    # Percentage
    percent_match = re.search(r'percentage\s*:?\s*[-–]?\s*([\d,\.]+)%', ocr_text, re.IGNORECASE)
    if percent_match:
        percent_value = float(percent_match.group(1).replace(',', ''))
        if '-' not in percent_match.group(0) and '–' not in percent_match.group(0):
            percent_value = -percent_value
        result["weights"]["gain_loss_percent"] = percent_value

    return result


@app.post("/ocr")
async def ocr(file: UploadFile):
    """
    Enhanced OCR endpoint that returns structured data
    """
    logger.info(f"Received structured OCR request: {file.filename}")
    try:
        file_data = await file.read()
        fname = file.filename.lower()  # full lowercase filename, checked by suffix
        ocr_text = ""
        if fname.endswith(".pdf"):
            # Try native text extraction first
            reader = PdfReader(io.BytesIO(file_data))
            direct_text = "".join(page.extract_text() or "" for page in reader.pages)
            if direct_text.strip():
                logger.info("Using native PDF text")
                ocr_text = direct_text
            else:
                # Fallback to OCR for scanned PDFs
                logger.info("Using OCR for scanned PDF")
                images = convert_from_bytes(file_data)
                for i, img in enumerate(images):
                    logger.info(f"OCR page {i+1}/{len(images)}")
                    ocr_text += pytesseract.image_to_string(img) + "\n"
        else:
            # Process image
            img = Image.open(io.BytesIO(file_data))
            ocr_text = pytesseract.image_to_string(img)

        # Parse structured data
        structured_data = parse_cotton_report(ocr_text)
        return {
            "success": True,
            "ocr_text": ocr_text[:1000] + "..." if len(ocr_text) > 1000 else ocr_text,
            "structured_data": structured_data,
            "json": json.dumps(structured_data, indent=2, ensure_ascii=False)
        }
    except Exception as e:
        # Boundary handler: report failure in the response body instead of 500.
        logger.error(f"Structured OCR failed: {e}", exc_info=True)
        return {
            "success": False,
            "error": str(e),
            "raw_text": "",
            "structured_data": {}
        }


# =============================
# 🧱 Structure / Layout
# =============================
@app.post("/structure")
async def structure(file: UploadFile):
    """Run the docTR layout predictor on a PDF or image upload."""
    logger.info(f"Received structure request: {file.filename}")
    try:
        file_data = await file.read()
        ext = file.filename.lower()
        if ext.endswith(".pdf"):
            doc = DocumentFile.from_pdf(file_data)
            logger.info(f"Structure prediction on PDF ({len(doc)} pages)")
        else:
            # NOTE(review): from_images is given a PIL Image here — confirm the
            # installed doctr version accepts that (older ones expect paths/bytes).
            img = Image.open(io.BytesIO(file_data)).convert("RGB")
            doc = DocumentFile.from_images([img])
            logger.info("Structure prediction on image")
        res = predictor(doc)
        return {"structure": str(res)}
    except Exception as e:
        logger.error(f"Structure extraction failed: {e}", exc_info=True)
        raise HTTPException(status_code=500, detail=str(e))


# =============================
# 📊 Tables extraction (PDF only)
# =============================
@app.post("/tables")
async def tables(file: UploadFile):
    """Extract tables from an uploaded PDF via camelot."""
    logger.info(f"Received table extraction request: {file.filename}")
    try:
        file_data = await file.read()
        # BUGFIX: camelot.read_pdf only accepts a filepath, not a BytesIO —
        # spool the upload to a temp file and clean it up afterwards.
        tmp = tempfile.NamedTemporaryFile(suffix=".pdf", delete=False)
        try:
            tmp.write(file_data)
            tmp.close()
            tables = camelot.read_pdf(tmp.name)
            logger.info(f"Found {len(tables)} tables")
            return {"tables": [t.df.to_dict() for t in tables]}
        finally:
            os.unlink(tmp.name)
    except Exception as e:
        logger.error(f"Table extraction failed: {e}", exc_info=True)
        raise HTTPException(status_code=500, detail=str(e))


def safe_search(pattern, text, default=None, group_index=1, context=""):
    """Safe regex search (case-insensitive, dot-all) that logs misses.

    Returns the stripped capture group *group_index*, or *default* when the
    pattern or the group is absent.
    """
    m = re.search(pattern, text, re.I | re.S)
    if not m:
        logger.warning("Pattern not found for %s: %s", context, pattern)
        return default
    try:
        return m.group(group_index).strip()
    except IndexError:
        logger.warning("Group index %d not found for %s: %s", group_index, context, pattern)
        return default


def to_float(s):
    """Parse an OCR numeric string ("1,234.5 Kgs", "2.1 %") into a float.

    Returns None for falsy input or anything that still isn't numeric after
    stripping thousands separators and unit suffixes.
    """
    if not s:
        return None
    s = s.replace(",", "").replace("Kgs", "").replace("kg", "").replace("%", "")
    s = s.replace("lbs", "").replace("LBS", "")
    s = s.strip()
    try:
        return float(s)
    except (TypeError, ValueError):  # narrowed from bare except
        return None


def section(text, start, end=None):
    """Extract a block of text between two headings, safely."""
    pattern_start = re.escape(start)
    if end:
        pattern_end = re.escape(end)
        reg = re.compile(pattern_start + r"(.*?)" + pattern_end, re.S | re.I)
    else:
        # Open-ended: everything after *start*.
        reg = re.compile(pattern_start + r"(.*)", re.S | re.I)
    m = reg.search(text)
    if not m:
        logger.warning("Section not found: start='%s', end='%s'", start, end)
        return ""
    return m.group(1).strip()


def extract_field(text, label, default=None):
    """Extract a line of the form 'Label: value', safely."""
    pattern = rf"{re.escape(label)}\s*:?[\s]+([^\n]+)"
    return safe_search(pattern, text, default=default, context=f"field '{label}'")


def extract(label, text, default=None):
    """
    Robust extraction for OCR/PDF text. Works with:
        Label: Value
        Label Value
        Label .... Value
    """
    if not text:
        return default
    patterns = [
        rf"{re.escape(label)}\s*[:\-]?\s*([^\n\r]+)",
        rf"{re.escape(label)}\s+([^\n\r]+)"
    ]
    for p in patterns:
        m = re.search(p, text, re.I)
        if m:
            return m.group(1).strip()
    return default


def extract_report_metadata(text):
    """Extract AHK report metadata (parties, shipment, weights, signature)."""
    logger.info("Starting metadata extraction, text length=%d", len(text))
    try:
        # ----------- SECTIONS -----------
        order_details = section(text, "Order details", "Weights")
        invoice_section = section(text, "INVOICE WEIGHTS", "Bales Weighed")
        landed_section = section(text, "Bales Weighed", "Outturn")
        loss_section = section(text, "LOSS", "Invoice average")
        avg_section = section(text, "Invoice average", "Comments")
        signature_block = section(text, "Signed on")

        # ----------- TOP INFO -----------
        top_info = {
            "produced_on": extract_field(text, "Produced On"),
            "printed_date": extract_field(text, "Printed Date"),
            "client_reference": extract_field(text, "Client Reference"),
            "report_number": safe_search(r"(AHK\S+)", text, default="",
                                         context="report_number", group_index=1),
        }

        # ----------- ORDER DETAILS -----------
        parties = {
            "client": extract_field(order_details, "Client"),
            "client_ref_no": extract_field(order_details, "Client Ref No"),
            "buyer": extract_field(order_details, "Buyer"),
            "destination": extract_field(order_details, "Destination"),
        }
        shipment = {
            "total_bales": extract_field(order_details, "Total Bales"),
            "vessel": extract_field(order_details, "Vessel"),
            "voyage_no": extract_field(order_details, "Voy. No"),
            "bl_no": extract_field(order_details, "B/L No"),
            "bl_date": extract_field(order_details, "B/L Date"),
            "growth": extract_field(order_details, "Growth"),
            "arrival_date": extract_field(order_details, "Arrival Date"),
            "first_weighing_date": extract_field(order_details, "First date of weighing"),
            "last_weighing_date": extract_field(order_details, "Last Date of Weighing"),
            "weighing_method": extract_field(order_details, "Weighing method"),
            "tare_basis": extract_field(order_details, "Tare"),
        }

        # ----------- INVOICE SECTION -----------
        invoice = {
            "bales": extract_field(invoice_section, "Bales"),
            "gross": extract_field(invoice_section, "Gross"),
            "tare": extract_field(invoice_section, "Tare"),
            "net": extract_field(invoice_section, "Net"),
        }

        # ----------- LANDED SECTION -----------
        landed = {
            "bales": extract_field(landed_section, "Bales"),
            "gross": extract_field(landed_section, "Gross"),
            "tare": extract_field(landed_section, "Tare"),
            "net": extract_field(landed_section, "Net"),
        }

        # ----------- LOSS SECTION -----------
        loss = {
            "kg": extract_field(loss_section, "kg"),
            "lb": extract_field(loss_section, "lb"),
            "percent": extract_field(loss_section, "Percentage"),
        }

        # ----------- AVERAGES SECTION -----------
        averages = {
            "invoice_gross_per_bale": extract_field(avg_section, "Invoice average"),
            "landed_gross_per_bale": extract_field(avg_section, "Landed average"),
        }

        # ----------- SIGNATURE -----------
        signature = {
            "signed_on": extract_field(signature_block, "Signed on"),
            "signed_by": safe_search(r"\n([A-Za-z ]+)\nClient Services",
                                     signature_block, default="",
                                     context="signed_by"),
            "role": "Client Services Coordinator",
            "company": "Alfred H. Knight International Limited"
        }

        logger.info("Metadata extraction completed successfully")
        return {
            "report": top_info,
            "parties": parties,
            "shipment": shipment,
            "weights": {
                "invoice": invoice,
                "landed": landed,
                "loss": loss,
                "averages": averages
            },
            "signature": signature
        }
    except Exception as e:
        logger.exception("Unexpected error during metadata extraction")
        raise HTTPException(status_code=500, detail=f"Metadata extraction failed: {e}")


def detect_template(text):
    """Identify which lab produced the report from telltale phrases."""
    t = text.lower()
    if "alfred h. knight" in t and "cotton landing report" in t:
        return "AHK"
    if "intertek" in t and "landing report" in t:
        return "INTERTEK"
    if "robertson international" in t or "ri ref no" in t:
        return "ROBERTSON"
    if "landing report" in t and "carcon cargo" in t:
        return "SGS"
    if "pacific inspection company" in t or "picl-bd.com" in t:
        return "PICL"
    return "UNKNOWN"


@app.post("/metadata")
async def metadata(text: str = Body(..., embed=True)):
    """AHK-specific metadata extraction from raw report text."""
    return extract_report_metadata(text)


@app.post("/parse")
async def parse_endpoint(text: str = Body(..., embed=True)):
    """Template-detecting structured parse of raw report text."""
    return parse_report(text)


# Registry mapping detected template name -> parser instance.
PARSERS = {
    "AHK": AHKParser(),
    "INTERTEK": IntertekParser(),
    "ROBERTSON": RobertsonParser(),
    "SGS": SGSParser(),
    "PICL": PICLParser()
}


def empty_weight_report(lab):
    """Return a fresh, all-None weight-report skeleton tagged with *lab*."""
    return {
        "lab": lab,
        "report": {"reference": None, "file_no": None, "date": None},
        "contract": {"contract_no": None, "invoice_no": None, "lc_no": None,
                     "origin": None, "commodity": None},
        "parties": {"seller": None, "buyer": None, "carrier": None},
        "shipment": {
            "vessel": None, "bl_no": None, "port_loading": None,
            "port_destination": None, "arrival_date": None,
            "weighing_place": None, "weighing_method": None, "bales": None
        },
        "weights": {
            "gross_landed_kg": None, "tare_kg": None, "net_landed_kg": None,
            "invoice_net_kg": None, "gain_loss_kg": None, "gain_loss_percent": None
        }
    }


def parse_report(text):
    """Detect the report template and dispatch to the matching parser."""
    template = detect_template(text)
    if template not in PARSERS:
        return {"template": "UNKNOWN"}
    return PARSERS[template].parse(text)