11.01.26
This commit is contained in:
268
app.py
268
app.py
@@ -426,7 +426,6 @@ class PICLParser:
|
||||
r["weights"]["gain_loss_percent"]=to_float(safe_search(r"\(\s*([0-9.,]+)\s*o/o\s*\)",text))
|
||||
return r
|
||||
|
||||
|
||||
# Configure the root logger explicitly so module-level loggers created
# elsewhere in the app inherit the INFO threshold.
root = logging.getLogger()
root.setLevel(logging.INFO)
@@ -444,48 +443,263 @@ predictor = ocr_predictor(pretrained=True)
|
||||
|
||||
logger.info("Models loaded successfully.")
|
||||
|
||||
# =============================
|
||||
# 🧠 Smart OCR
|
||||
# =============================
|
||||
import io
|
||||
import re
|
||||
from datetime import datetime
|
||||
from typing import Dict, Any
|
||||
import pytesseract
|
||||
from pdf2image import convert_from_bytes
|
||||
from PIL import Image
|
||||
from PyPDF2 import PdfReader
|
||||
import json
|
||||
|
||||
def parse_cotton_report(ocr_text: str) -> Dict[str, Any]:
    """
    Parse structured data from cotton landing report OCR text.

    Extracts report metadata, contract details, parties, shipment data and
    landed-weight figures from the free-form OCR output of a cotton landing
    report (Alfred H Knight format — presumably; confirm against samples).

    Args:
        ocr_text: Raw OCR text of the report, newline-separated, any casing.

    Returns:
        Nested dict with "lab", "report", "contract", "parties", "shipment"
        and "weights" sections. Fields that cannot be located stay None.
    """
    result = {
        "lab": "ALFRED H KNIGHT",
        "report": {"reference": None, "file_no": None, "date": None},
        "contract": {"contract_no": None, "invoice_no": None, "lc_no": None,
                     "origin": None, "commodity": None},
        "parties": {"seller": None, "buyer": None, "carrier": None},
        "shipment": {
            "vessel": None, "bl_no": None, "port_loading": None,
            "port_destination": None, "arrival_date": None,
            "weighing_place": None, "weighing_method": None,
            "bales": None
        },
        "weights": {
            "gross_landed_kg": None, "tare_kg": None,
            "net_landed_kg": None, "invoice_net_kg": None,
            "gain_loss_kg": None, "gain_loss_percent": None
        }
    }

    # Lower-cased copy used only by the date patterns, whose [a-z] month
    # classes require lower case.
    text = ocr_text.lower()

    # 1. Report reference and file number
    ref_match = re.search(r'client reference:\s*([^\n]+)', ocr_text, re.IGNORECASE)
    if ref_match:
        result["report"]["reference"] = ref_match.group(1).strip()

    # File number is embedded in the AHK reference, e.g. "AHK S/678/2024".
    ahk_match = re.search(r'ahk s/(\d+)/', ocr_text, re.IGNORECASE)
    if ahk_match:
        result["report"]["file_no"] = ahk_match.group(1)

    # 2. Dates (searched in the lower-cased copy; .title() restores
    # "05-January-2024" style capitalisation)
    date_match = re.search(r'printed date:\s*(\d{1,2}-[a-z]+-\d{4})', text)
    if date_match:
        result["report"]["date"] = date_match.group(1).title()

    # 3. Contract information
    # Origin / growth, e.g. "Growth: BENIN RAW COTTON"
    growth_match = re.search(r'growth\s*:?\s*([^\n:]+(?:raw cotton)?)', ocr_text, re.IGNORECASE)
    if growth_match:
        result["contract"]["origin"] = growth_match.group(1).strip()
        result["contract"]["commodity"] = "COTTON"

    # Invoice number is embedded in the client reference ("... INV 12345 ...").
    if result["report"]["reference"]:
        inv_match = re.search(r'inv\s*(\d+)', result["report"]["reference"], re.IGNORECASE)
        if inv_match:
            result["contract"]["invoice_no"] = inv_match.group(1)

    # 4. Parties
    # Seller: first "Client:" value. Bug fix: the old single re.search captured
    # the "Client Reference:" header line as the seller; skip label-echo and
    # "Reference..." candidates and take the first real value.
    for m in re.finditer(r'client\s*:?\s*([^\n]+)', ocr_text, re.IGNORECASE):
        candidate = m.group(1).strip()
        if not candidate.lower().startswith(('client', 'reference')):
            result["parties"]["seller"] = candidate
            break

    buyer_match = re.search(r'buyer\s*:?\s*([^\n]+)', ocr_text, re.IGNORECASE)
    if buyer_match:
        buyer_text = buyer_match.group(1).strip()
        # Skip a repeated "Buyer" label captured as the value.
        if not buyer_text.lower().startswith('buyer'):
            result["parties"]["buyer"] = buyer_text

    # 5. Shipment details
    vessel_match = re.search(r'vessel\s*:?\s*([^\n]+)', ocr_text, re.IGNORECASE)
    if vessel_match:
        vessel_text = vessel_match.group(1).strip()
        if not vessel_text.lower().startswith('vessel'):
            result["shipment"]["vessel"] = vessel_text

    bl_match = re.search(r'b/l no\.?\s*:?\s*([^\n]+)', ocr_text, re.IGNORECASE)
    if bl_match:
        result["shipment"]["bl_no"] = bl_match.group(1).strip()

    dest_match = re.search(r'destination\s*:?\s*([^\n]+)', ocr_text, re.IGNORECASE)
    if dest_match:
        dest_text = dest_match.group(1).strip()
        if not dest_text.lower().startswith('destination'):
            result["shipment"]["port_destination"] = dest_text

    arrival_match = re.search(r'arrival date\s*:?\s*(\d{1,2}-[a-z]+-\d{4})', text)
    if arrival_match:
        result["shipment"]["arrival_date"] = arrival_match.group(1).title()

    weigh_match = re.search(r'weighing method\s*:?\s*([^\n]+)', ocr_text, re.IGNORECASE)
    if weigh_match:
        method_text = weigh_match.group(1).strip()
        if not method_text.lower().startswith('weighing'):
            result["shipment"]["weighing_method"] = method_text

    bales_match = re.search(r'total bales\s*:?\s*(\d+)', ocr_text, re.IGNORECASE)
    if bales_match:
        result["shipment"]["bales"] = int(bales_match.group(1))

    # 6. Weights (critical section). The report lists the invoice figures
    # first and the landed figures second, so index 0 = invoice, 1 = landed.
    # Bug fix: these three patterns previously ran WITHOUT re.IGNORECASE
    # against the original-case text, so "Gross:"/"Tare:"/"Net:" labels never
    # matched (the loss/percentage patterns below already used the flag).
    all_gross = re.findall(r'gross\s*:?\s*([\d,\.]+)\s*kg', ocr_text, re.IGNORECASE)
    if len(all_gross) >= 2:
        result["weights"]["gross_landed_kg"] = float(all_gross[1].replace(',', ''))

    tare_match = re.search(r'tare\s*:?\s*([\d,\.]+)\s*kg', ocr_text, re.IGNORECASE)
    if tare_match:
        # Tare is expected to be identical in both columns; first is enough.
        result["weights"]["tare_kg"] = float(tare_match.group(1).replace(',', ''))

    net_matches = re.findall(r'net\s*:?\s*([\d,\.]+)\s*kg', ocr_text, re.IGNORECASE)
    if len(net_matches) >= 2:
        result["weights"]["invoice_net_kg"] = float(net_matches[0].replace(',', ''))
        result["weights"]["net_landed_kg"] = float(net_matches[1].replace(',', ''))

    # Loss / gain. The group captures the magnitude only, and a "Loss" line is
    # always a negative movement, so negate unconditionally. Bug fix: the old
    # sign check was inverted — it left the value positive exactly when OCR
    # *did* include the minus sign.
    loss_match = re.search(r'loss\s*:?\s*[-–]?\s*([\d,\.]+)\s*kg', ocr_text, re.IGNORECASE)
    if loss_match:
        result["weights"]["gain_loss_kg"] = -float(loss_match.group(1).replace(',', ''))

    percent_match = re.search(r'percentage\s*:?\s*[-–]?\s*([\d,\.]+)%', ocr_text, re.IGNORECASE)
    if percent_match:
        result["weights"]["gain_loss_percent"] = -float(percent_match.group(1).replace(',', ''))

    return result
|
||||
|
||||
@app.post("/ocr")
async def ocr(file: UploadFile):
    """
    Enhanced OCR endpoint that returns structured data.

    For PDFs, native (embedded) text is preferred; only scanned PDFs fall
    back to page-by-page Tesseract OCR. Non-PDF uploads are treated as
    images and OCR'd directly. The extracted text is then parsed into the
    cotton-landing-report structure by parse_cotton_report().

    Returns a dict with: success flag, raw_text (truncated to 1000 chars),
    structured_data, and a pretty-printed JSON string of the same data.
    Failures are reported as {"success": False, ...} rather than raised.

    NOTE(review): the original block was a merge-conflict mangle of two diff
    sides (duplicate extraction, an early return that skipped parsing, a
    dangling else, and `img` used before assignment); this is the coherent
    "structured" version reconstructed from the added side.
    """
    logger.info(f"Received structured OCR request: {file.filename}")

    try:
        file_data = await file.read()
        ext = file.filename.lower()

        if ext.endswith(".pdf"):
            logger.info("PDF detected → Extracting native text first")
            # Try native text extraction first — much faster and more
            # accurate than OCR when the PDF has an embedded text layer.
            reader = PdfReader(io.BytesIO(file_data))
            direct_text = "".join(
                page.extract_text() or "" for page in reader.pages
            )

            if direct_text.strip():
                logger.info("Using native PDF text")
                ocr_text = direct_text
            else:
                # Fallback: no text layer → treat the PDF as scanned pages.
                logger.info("No native text → PDF treated as scanned → OCR")
                images = convert_from_bytes(file_data)
                pages = []
                for i, img in enumerate(images):
                    logger.info(f"OCR page {i+1}/{len(images)}")
                    pages.append(pytesseract.image_to_string(img) + "\n")
                ocr_text = "".join(pages)
        else:
            # Non-PDF upload: assume it is an image and OCR it directly.
            logger.info("Image detected → Running OCR")
            img = Image.open(io.BytesIO(file_data))
            ocr_text = pytesseract.image_to_string(img)

        # Parse structured data out of whatever text we obtained above.
        structured_data = parse_cotton_report(ocr_text)

        return {
            "success": True,
            # Truncate the echoed raw text so the response stays small.
            "raw_text": ocr_text[:1000] + "..." if len(ocr_text) > 1000 else ocr_text,
            "structured_data": structured_data,
            "json": json.dumps(structured_data, indent=2, ensure_ascii=False)
        }

    except Exception as e:
        logger.error(f"Structured OCR failed: {e}", exc_info=True)
        # Best-effort endpoint: report the failure in-band instead of a 500,
        # so callers always receive the same response shape.
        return {
            "success": False,
            "error": str(e),
            "raw_text": "",
            "structured_data": {}
        }
|
||||
|
||||
# =============================
|
||||
# 🧠 Smart OCR
|
||||
# =============================
|
||||
# @app.post("/ocr")
|
||||
# async def ocr(file: UploadFile):
|
||||
# logger.info(f"Received OCR request: {file.filename}")
|
||||
# try:
|
||||
# file_data = await file.read()
|
||||
# ext = file.filename.lower()
|
||||
|
||||
# # --------- PDF with native text ---------
|
||||
# if ext.endswith(".pdf"):
|
||||
# logger.info("PDF detected → Extracting native text first")
|
||||
# reader = PdfReader(io.BytesIO(file_data))
|
||||
# direct_text = "".join(
|
||||
# page.extract_text() or "" for page in reader.pages
|
||||
# )
|
||||
|
||||
# if direct_text.strip():
|
||||
# logger.info("Native PDF text found → No OCR needed")
|
||||
# return {"ocr_text": direct_text}
|
||||
|
||||
# # -------- Fallback: scanned PDF OCR --------
|
||||
# logger.info("No native text → PDF treated as scanned → OCR")
|
||||
# from pdf2image import convert_from_bytes
|
||||
# images = convert_from_bytes(file_data)
|
||||
# text = ""
|
||||
# for i, img in enumerate(images):
|
||||
# logger.info(f"OCR page {i+1}/{len(images)}")
|
||||
# text += pytesseract.image_to_string(img) + "\n"
|
||||
|
||||
# return {"ocr_text": text}
|
||||
|
||||
# # --------- Image file OCR ---------
|
||||
# logger.info("Image detected → Running OCR")
|
||||
# img = Image.open(io.BytesIO(file_data))
|
||||
# text = pytesseract.image_to_string(img)
|
||||
# return {"ocr_text": text}
|
||||
|
||||
# except Exception as e:
|
||||
# logger.error(f"OCR failed: {e}", exc_info=True)
|
||||
# raise HTTPException(status_code=500, detail=str(e))
|
||||
|
||||
# =============================
|
||||
# 🧱 Structure / Layout
|
||||
|
||||
Reference in New Issue
Block a user