diff --git a/app.py b/app.py index da09da0..42d2693 100644 --- a/app.py +++ b/app.py @@ -23,113 +23,6 @@ file_handler.setFormatter(logging.Formatter( "%(asctime)s - %(levelname)s - %(name)s - %(message)s" )) -# class AHKParser: -# lab="AHK" -# def parse(self,text): -# r=empty_weight_report("AHK") -# inv=section(text,"INVOICE WEIGHTS","Bales Weighed") -# land=section(text,"Bales Weighed","Outturn") -# loss=section(text,"LOSS","Invoice average") - -# r["report"]["reference"]=safe_search(r"(AHK\s*/\S+)",text) -# r["report"]["date"]=extract("Produced On",text) - -# r["contract"]["invoice_no"]=extract("Client Reference",text) -# r["contract"]["origin"]=extract("Growth",text) -# r["contract"]["commodity"]="Raw Cotton" - -# r["parties"]["seller"]=extract("Client",text) -# r["parties"]["buyer"]=extract("Buyer",text) - -# r["shipment"]["vessel"]=extract("Vessel",text) -# r["shipment"]["bl_no"]=extract("B/L No",text) -# r["shipment"]["port_destination"]=extract("Destination",text) -# r["shipment"]["arrival_date"]=extract("Arrival Date",text) -# r["shipment"]["weighing_method"]=extract("Weighing method",text) -# r["shipment"]["bales"]=to_float(extract("Total Bales",text)) - -# r["weights"]["gross_landed_kg"]=to_float(extract("Gross",land)) -# r["weights"]["tare_kg"]=to_float(extract("Tare",land)) -# r["weights"]["net_landed_kg"]=to_float(extract("Net",land)) -# r["weights"]["invoice_net_kg"]=to_float(extract("Net",inv)) -# r["weights"]["gain_loss_kg"]=to_float(extract("kg",loss)) -# r["weights"]["gain_loss_percent"]=to_float(extract("Percentage",loss)) -# return r -# class AHKParser: -# lab = "AHK" - -# def extract_table(self, text, headers): -# lines = [l.strip() for l in text.splitlines() if l.strip()] -# out = {} -# for h in headers: -# for i,l in enumerate(lines): -# if l == h: -# for j in range(i+1, i+8): -# if j < len(lines) and lines[j].startswith(":"): -# out[h] = lines[j][1:].strip() -# break -# return out - -# def extract_weights(self, text): -# lines = [l.strip() 
for l in text.splitlines() if l.strip()] -# res = {} -# for i,l in enumerate(lines): -# if l == "Bales Weighed": -# headers = ["Bales","Gross","Tare","Net"] -# for h in headers: -# for j in range(i, i+20): -# if j < len(lines) and lines[j].startswith(":"): -# res[h] = lines[j][1:].replace("kg","").strip() -# break -# return res - -# def parse(self, text): -# r = empty_weight_report("AHK") - -# # report -# r["report"]["reference"] = safe_search(r"(AHK\s*/[A-Z0-9/]+)", text) -# r["report"]["date"] = safe_search(r"Produced On\s*([0-9A-Za-z ]+)", text) - -# # contract -# r["contract"]["invoice_no"] = safe_search(r"Client Reference:\s*([A-Z0-9\- /]+)", text) -# r["contract"]["commodity"] = "Raw Cotton" - -# # buyer -# r["parties"]["buyer"] = safe_search(r"Buyer:\s*([A-Z0-9 ().,-]+)", text) - -# # shipment tables -# ship = self.extract_table(text, [ -# "Total Bales","Vessel","Voy. No.","B/L No.","B/L Date","Destination" -# ]) -# ship2 = self.extract_table(text, [ -# "Growth","Arrival Date","First date of weighing", -# "Last Date of Weighing","Weighing method","Tare" -# ]) - -# r["shipment"]["bales"] = to_float(ship.get("Total Bales")) -# r["shipment"]["vessel"] = ship.get("Vessel") -# r["shipment"]["bl_no"] = ship.get("B/L No.") -# r["shipment"]["port_destination"] = ship.get("Destination") -# r["shipment"]["arrival_date"] = ship2.get("Arrival Date") -# r["shipment"]["weighing_method"] = ship2.get("Weighing method") -# r["contract"]["origin"] = ship2.get("Growth") - -# # weights -# inv = self.extract_table(text, ["Bales","Gross","Tare","Net"]) -# land = self.extract_weights(text) - -# r["weights"]["invoice_net_kg"] = to_float(inv.get("Net")) -# r["weights"]["gross_landed_kg"] = to_float(land.get("Gross")) -# r["weights"]["tare_kg"] = to_float(land.get("Tare")) -# r["weights"]["net_landed_kg"] = to_float(land.get("Net")) - -# # loss -# loss = section(text,"LOSS","Invoice average") -# r["weights"]["gain_loss_kg"] = to_float(safe_search(r"(-?\d+\.?\d*)\s*kg", loss)) -# 
import re
from typing import Any, Dict, Optional


def _cotton_number(raw: str) -> Optional[float]:
    """Convert an OCR'd figure such as ``"21,900.50"`` to a float, or None if malformed."""
    try:
        return float(raw.replace(',', ''))
    except ValueError:
        # The capture class [\d,\.]+ also accepts artefacts like "." or "1.2.3".
        return None


def _cotton_label_value(label_regex: str, ocr_text: str,
                        reject_prefix: Optional[str] = None) -> Optional[str]:
    """Return the text following ``label_regex`` (optional colon), or None.

    ``reject_prefix`` discards a capture that is just the label repeated,
    which happens when OCR emits the label twice in a row.
    """
    match = re.search(label_regex + r'\s*:?\s*([^\n]+)', ocr_text, re.IGNORECASE)
    if not match:
        return None
    value = match.group(1).strip()
    if reject_prefix and value.lower().startswith(reject_prefix):
        return None
    return value


def _cotton_loss(pattern: str, ocr_text: str) -> Optional[float]:
    """Return the figure captured by ``pattern`` as a non-positive number, or None."""
    match = re.search(pattern, ocr_text, re.IGNORECASE)
    if not match:
        return None
    magnitude = _cotton_number(match.group(1))
    if magnitude is None:
        return None
    # The capture group never contains the sign, so the value is a loss
    # whether or not the report printed a leading minus/en dash.
    return -magnitude if magnitude else 0.0


def parse_cotton_report(ocr_text: str) -> Dict[str, Any]:
    """
    Parse structured data from cotton landing report OCR text.

    Args:
        ocr_text: Raw text of the report (native PDF text or OCR output).

    Returns:
        A dict with the sections ``report``, ``contract``, ``parties``,
        ``shipment`` and ``weights`` (plus the fixed ``lab`` name). Every
        field that cannot be found, or whose number is malformed, is None.
    """
    ocr_text = ocr_text or ""

    result = {
        "lab": "ALFRED H KNIGHT",
        "report": {"reference": None, "file_no": None, "date": None},
        "contract": {"contract_no": None, "invoice_no": None, "lc_no": None,
                     "origin": None, "commodity": None},
        "parties": {"seller": None, "buyer": None, "carrier": None},
        "shipment": {
            "vessel": None, "bl_no": None, "port_loading": None,
            "port_destination": None, "arrival_date": None,
            "weighing_place": None, "weighing_method": None,
            "bales": None
        },
        "weights": {
            "gross_landed_kg": None, "tare_kg": None,
            "net_landed_kg": None, "invoice_net_kg": None,
            "gain_loss_kg": None, "gain_loss_percent": None
        }
    }
    report = result["report"]
    contract = result["contract"]
    parties = result["parties"]
    shipment = result["shipment"]
    weights = result["weights"]

    # 1. Report reference and file number
    ref_match = re.search(r'client reference:\s*([^\n]+)', ocr_text, re.IGNORECASE)
    if ref_match:
        report["reference"] = ref_match.group(1).strip()

    ahk_match = re.search(r'ahk s/(\d+)/', ocr_text, re.IGNORECASE)
    if ahk_match:
        report["file_no"] = ahk_match.group(1)

    # 2. Dates (normalised to e.g. "05-Mar-2024")
    date_match = re.search(r'printed date:\s*(\d{1,2}-[a-z]+-\d{4})', ocr_text, re.IGNORECASE)
    if date_match:
        report["date"] = date_match.group(1).title()

    # 3. Contract information
    growth_match = re.search(r'growth\s*:?\s*([^\n:]+(?:raw cotton)?)', ocr_text, re.IGNORECASE)
    if growth_match:
        contract["origin"] = growth_match.group(1).strip()
        contract["commodity"] = "COTTON"

    # The invoice number is embedded in the client reference ("INV 12345").
    if report["reference"]:
        inv_match = re.search(r'inv\s*(\d+)', report["reference"], re.IGNORECASE)
        if inv_match:
            contract["invoice_no"] = inv_match.group(1)

    # 4. Parties
    # The lookahead skips the "Client Reference:" line, which would otherwise
    # be picked up as the seller ("Reference: ...").
    parties["seller"] = _cotton_label_value(r'client(?!\s*reference)', ocr_text, 'client')
    parties["buyer"] = _cotton_label_value(r'buyer', ocr_text, 'buyer')

    # 5. Shipment details
    shipment["vessel"] = _cotton_label_value(r'vessel', ocr_text, 'vessel')
    shipment["bl_no"] = _cotton_label_value(r'b/l no\.?', ocr_text)
    shipment["port_destination"] = _cotton_label_value(r'destination', ocr_text, 'destination')

    arrival_match = re.search(r'arrival date\s*:?\s*(\d{1,2}-[a-z]+-\d{4})', ocr_text, re.IGNORECASE)
    if arrival_match:
        shipment["arrival_date"] = arrival_match.group(1).title()

    shipment["weighing_method"] = _cotton_label_value(r'weighing method', ocr_text, 'weighing')

    bales_match = re.search(r'total bales\s*:?\s*(\d+)', ocr_text, re.IGNORECASE)
    if bales_match:
        shipment["bales"] = int(bales_match.group(1))

    # 6. Weights. The labels are matched case-insensitively like every other
    # field; a lower-case-only pattern never matches "Gross:" / "Tare:" / "Net:".
    gross_values = re.findall(r'\bgross\s*:?\s*([\d,\.]+)\s*kg', ocr_text, re.IGNORECASE)
    if len(gross_values) >= 2:
        # First occurrence is the invoice weight, second is the landed weight.
        weights["gross_landed_kg"] = _cotton_number(gross_values[1])

    tare_match = re.search(r'\btare\s*:?\s*([\d,\.]+)\s*kg', ocr_text, re.IGNORECASE)
    if tare_match:
        weights["tare_kg"] = _cotton_number(tare_match.group(1))

    net_values = re.findall(r'\bnet\s*:?\s*([\d,\.]+)\s*kg', ocr_text, re.IGNORECASE)
    if len(net_values) >= 2:
        weights["invoice_net_kg"] = _cotton_number(net_values[0])
        weights["net_landed_kg"] = _cotton_number(net_values[1])

    weights["gain_loss_kg"] = _cotton_loss(
        r'loss\s*:?\s*[-–]?\s*([\d,\.]+)\s*kg', ocr_text)
    weights["gain_loss_percent"] = _cotton_loss(
        r'percentage\s*:?\s*[-–]?\s*([\d,\.]+)%', ocr_text)

    return result
@app.post("/ocr")
async def ocr(file: UploadFile):
    """
    Smart PDF processing optimized for cotton landing reports.

    Strategy 1 reads the embedded text layer with pdfplumber. Strategy 2
    rasterises the pages with pdf2image and recognises them with Tesseract;
    it runs when pdfplumber raises *or* yields no text, so a PDF that opens
    cleanly but carries no text layer still reaches the OCR fallback.

    Args:
        file: The uploaded document.

    Returns:
        ``{"ocr_text": ...}`` when pdfplumber finds text;
        ``{"method", "ocr_text", "structured_data", "raw_text_sample"}``
        after the OCR fallback;
        ``{"error", "success": False}`` when processing fails.
    """
    logger.info(f"Smart OCR request: {file.filename}")

    try:
        # Function-scope imports: the module-level ``import io`` and
        # ``import pytesseract`` lived next to the previous /ocr endpoint and
        # are removed together with it.
        import io
        import pytesseract
        from pdf2image import convert_from_bytes

        file_data = await file.read()

        # Strategy 1: Try pdfplumber (best for digital PDFs)
        combined_text = ""
        try:
            with pdfplumber.open(io.BytesIO(file_data)) as pdf:
                text_parts = []
                for page in pdf.pages:
                    page_text = page.extract_text(x_tolerance=2, y_tolerance=2)
                    if page_text:
                        text_parts.append(page_text)
            combined_text = "\n".join(text_parts)
        except Exception as e:
            # Best effort: any pdfplumber failure just means "try OCR instead".
            logger.warning(f"pdfplumber attempt: {e}")

        # Only short-circuit when there is real text; an empty result must
        # fall through to OCR instead of being returned as "".
        if combined_text.strip():
            logger.info(f"pdfplumber extracted {len(combined_text)} chars")
            return {"ocr_text": combined_text}

        # Strategy 2: Fallback to OCR for scanned PDFs
        logger.info("Falling back to OCR...")
        images = convert_from_bytes(file_data, dpi=200)
        ocr_text = "\n".join(
            pytesseract.image_to_string(
                img,
                config='--psm 6 -c preserve_interword_spaces=1'
            )
            for img in images
        )

        # parse_cotton_report is deleted from this module by the same change
        # that adds this endpoint. Resolve it lazily so a missing parser
        # degrades to text-only output rather than failing the request.
        # NOTE(review): restore or import parse_cotton_report if structured
        # data is required.
        parser = globals().get("parse_cotton_report")
        if callable(parser):
            structured_data = parser(ocr_text)
        else:
            logger.warning("parse_cotton_report is not available; returning text only")
            structured_data = {}

        return {
            "method": "tesseract_ocr",
            # Same key as the pdfplumber path so clients can read one field.
            "ocr_text": ocr_text,
            "structured_data": structured_data,
            "raw_text_sample": ocr_text[:500]
        }

    except Exception as e:
        logger.error(f"Smart OCR failed: {e}", exc_info=True)
        return {
            "error": str(e),
            "success": False
        }