This commit is contained in:
2026-01-11 19:28:20 +01:00
parent e288d4f2dd
commit b7335d330d

409
app.py
View File

@@ -23,113 +23,6 @@ file_handler.setFormatter(logging.Formatter(
"%(asctime)s - %(levelname)s - %(name)s - %(message)s" "%(asctime)s - %(levelname)s - %(name)s - %(message)s"
)) ))
# class AHKParser:
# lab="AHK"
# def parse(self,text):
# r=empty_weight_report("AHK")
# inv=section(text,"INVOICE WEIGHTS","Bales Weighed")
# land=section(text,"Bales Weighed","Outturn")
# loss=section(text,"LOSS","Invoice average")
# r["report"]["reference"]=safe_search(r"(AHK\s*/\S+)",text)
# r["report"]["date"]=extract("Produced On",text)
# r["contract"]["invoice_no"]=extract("Client Reference",text)
# r["contract"]["origin"]=extract("Growth",text)
# r["contract"]["commodity"]="Raw Cotton"
# r["parties"]["seller"]=extract("Client",text)
# r["parties"]["buyer"]=extract("Buyer",text)
# r["shipment"]["vessel"]=extract("Vessel",text)
# r["shipment"]["bl_no"]=extract("B/L No",text)
# r["shipment"]["port_destination"]=extract("Destination",text)
# r["shipment"]["arrival_date"]=extract("Arrival Date",text)
# r["shipment"]["weighing_method"]=extract("Weighing method",text)
# r["shipment"]["bales"]=to_float(extract("Total Bales",text))
# r["weights"]["gross_landed_kg"]=to_float(extract("Gross",land))
# r["weights"]["tare_kg"]=to_float(extract("Tare",land))
# r["weights"]["net_landed_kg"]=to_float(extract("Net",land))
# r["weights"]["invoice_net_kg"]=to_float(extract("Net",inv))
# r["weights"]["gain_loss_kg"]=to_float(extract("kg",loss))
# r["weights"]["gain_loss_percent"]=to_float(extract("Percentage",loss))
# return r
# class AHKParser:
# lab = "AHK"
# def extract_table(self, text, headers):
# lines = [l.strip() for l in text.splitlines() if l.strip()]
# out = {}
# for h in headers:
# for i,l in enumerate(lines):
# if l == h:
# for j in range(i+1, i+8):
# if j < len(lines) and lines[j].startswith(":"):
# out[h] = lines[j][1:].strip()
# break
# return out
# def extract_weights(self, text):
# lines = [l.strip() for l in text.splitlines() if l.strip()]
# res = {}
# for i,l in enumerate(lines):
# if l == "Bales Weighed":
# headers = ["Bales","Gross","Tare","Net"]
# for h in headers:
# for j in range(i, i+20):
# if j < len(lines) and lines[j].startswith(":"):
# res[h] = lines[j][1:].replace("kg","").strip()
# break
# return res
# def parse(self, text):
# r = empty_weight_report("AHK")
# # report
# r["report"]["reference"] = safe_search(r"(AHK\s*/[A-Z0-9/]+)", text)
# r["report"]["date"] = safe_search(r"Produced On\s*([0-9A-Za-z ]+)", text)
# # contract
# r["contract"]["invoice_no"] = safe_search(r"Client Reference:\s*([A-Z0-9\- /]+)", text)
# r["contract"]["commodity"] = "Raw Cotton"
# # buyer
# r["parties"]["buyer"] = safe_search(r"Buyer:\s*([A-Z0-9 ().,-]+)", text)
# # shipment tables
# ship = self.extract_table(text, [
# "Total Bales","Vessel","Voy. No.","B/L No.","B/L Date","Destination"
# ])
# ship2 = self.extract_table(text, [
# "Growth","Arrival Date","First date of weighing",
# "Last Date of Weighing","Weighing method","Tare"
# ])
# r["shipment"]["bales"] = to_float(ship.get("Total Bales"))
# r["shipment"]["vessel"] = ship.get("Vessel")
# r["shipment"]["bl_no"] = ship.get("B/L No.")
# r["shipment"]["port_destination"] = ship.get("Destination")
# r["shipment"]["arrival_date"] = ship2.get("Arrival Date")
# r["shipment"]["weighing_method"] = ship2.get("Weighing method")
# r["contract"]["origin"] = ship2.get("Growth")
# # weights
# inv = self.extract_table(text, ["Bales","Gross","Tare","Net"])
# land = self.extract_weights(text)
# r["weights"]["invoice_net_kg"] = to_float(inv.get("Net"))
# r["weights"]["gross_landed_kg"] = to_float(land.get("Gross"))
# r["weights"]["tare_kg"] = to_float(land.get("Tare"))
# r["weights"]["net_landed_kg"] = to_float(land.get("Net"))
# # loss
# loss = section(text,"LOSS","Invoice average")
# r["weights"]["gain_loss_kg"] = to_float(safe_search(r"(-?\d+\.?\d*)\s*kg", loss))
# r["weights"]["gain_loss_percent"] = to_float(safe_search(r"Percentage\s*:\s*(-?\d+\.?\d*)", loss))
# return r
class AHKParser: class AHKParser:
lab = "AHK" lab = "AHK"
@@ -205,7 +98,6 @@ class AHKParser:
return r return r
class IntertekParser: class IntertekParser:
lab="INTERTEK" lab="INTERTEK"
def parse(self,text): def parse(self,text):
@@ -354,221 +246,6 @@ predictor = ocr_predictor(pretrained=True)
logger.info("Models loaded successfully.") logger.info("Models loaded successfully.")
import io
import re
from datetime import datetime
from typing import Dict, Any
import pytesseract
from pdf2image import convert_from_bytes
from PIL import Image
from PyPDF2 import PdfReader
import json
def parse_cotton_report(ocr_text: str) -> Dict[str, Any]:
"""
Parse structured data from cotton landing report OCR text
"""
result = {
"lab": "ALFRED H KNIGHT",
"report": {"reference": None, "file_no": None, "date": None},
"contract": {"contract_no": None, "invoice_no": None, "lc_no": None,
"origin": None, "commodity": None},
"parties": {"seller": None, "buyer": None, "carrier": None},
"shipment": {
"vessel": None, "bl_no": None, "port_loading": None,
"port_destination": None, "arrival_date": None,
"weighing_place": None, "weighing_method": None,
"bales": None
},
"weights": {
"gross_landed_kg": None, "tare_kg": None,
"net_landed_kg": None, "invoice_net_kg": None,
"gain_loss_kg": None, "gain_loss_percent": None
}
}
# Clean the text
lines = ocr_text.split('\n')
clean_lines = [line.strip() for line in lines if line.strip()]
# Extract using patterns
text = ocr_text.lower()
# 1. Extract report reference and file number
ref_match = re.search(r'client reference:\s*([^\n]+)', ocr_text, re.IGNORECASE)
if ref_match:
result["report"]["reference"] = ref_match.group(1).strip()
# Try to get file number from AHK reference
ahk_match = re.search(r'ahk s/(\d+)/', ocr_text, re.IGNORECASE)
if ahk_match:
result["report"]["file_no"] = ahk_match.group(1)
# 2. Extract dates
date_match = re.search(r'printed date:\s*(\d{1,2}-[a-z]+-\d{4})', text, re.IGNORECASE)
if date_match:
result["report"]["date"] = date_match.group(1).title()
# 3. Extract contract information
# Origin/Growth
growth_match = re.search(r'growth\s*:?\s*([^\n:]+(?:raw cotton)?)', ocr_text, re.IGNORECASE)
if growth_match:
origin = growth_match.group(1).strip()
result["contract"]["origin"] = origin
result["contract"]["commodity"] = "COTTON"
# Invoice number from reference
if result["report"]["reference"]:
inv_match = re.search(r'inv\s*(\d+)', result["report"]["reference"], re.IGNORECASE)
if inv_match:
result["contract"]["invoice_no"] = inv_match.group(1)
# 4. Extract parties
# Seller
seller_match = re.search(r'client\s*:?\s*([^\n]+)', ocr_text, re.IGNORECASE)
if seller_match:
# Skip the "Client" label if present
seller_text = seller_match.group(1).strip()
if not seller_text.lower().startswith('client'):
result["parties"]["seller"] = seller_text
# Buyer
buyer_match = re.search(r'buyer\s*:?\s*([^\n]+)', ocr_text, re.IGNORECASE)
if buyer_match:
buyer_text = buyer_match.group(1).strip()
if not buyer_text.lower().startswith('buyer'):
result["parties"]["buyer"] = buyer_text
# 5. Extract shipment details
# Vessel
vessel_match = re.search(r'vessel\s*:?\s*([^\n]+)', ocr_text, re.IGNORECASE)
if vessel_match:
vessel_text = vessel_match.group(1).strip()
if not vessel_text.lower().startswith('vessel'):
result["shipment"]["vessel"] = vessel_text
# B/L Number
bl_match = re.search(r'b/l no\.?\s*:?\s*([^\n]+)', ocr_text, re.IGNORECASE)
if bl_match:
bl_text = bl_match.group(1).strip()
result["shipment"]["bl_no"] = bl_text
# Destination
dest_match = re.search(r'destination\s*:?\s*([^\n]+)', ocr_text, re.IGNORECASE)
if dest_match:
dest_text = dest_match.group(1).strip()
if not dest_text.lower().startswith('destination'):
result["shipment"]["port_destination"] = dest_text
# Arrival Date
arrival_match = re.search(r'arrival date\s*:?\s*(\d{1,2}-[a-z]+-\d{4})', text, re.IGNORECASE)
if arrival_match:
result["shipment"]["arrival_date"] = arrival_match.group(1).title()
# Weighing method
weigh_match = re.search(r'weighing method\s*:?\s*([^\n]+)', ocr_text, re.IGNORECASE)
if weigh_match:
method_text = weigh_match.group(1).strip()
if not method_text.lower().startswith('weighing'):
result["shipment"]["weighing_method"] = method_text
# Bales count
bales_match = re.search(r'total bales\s*:?\s*(\d+)', ocr_text, re.IGNORECASE)
if bales_match:
result["shipment"]["bales"] = int(bales_match.group(1))
# 6. Extract weights (critical section)
# Gross Landed Weight
gross_match = re.search(r'gross\s*:?\s*([\d,\.]+)\s*kg', ocr_text)
if gross_match:
# We need the second occurrence (landed weight)
all_gross = re.findall(r'gross\s*:?\s*([\d,\.]+)\s*kg', ocr_text)
if len(all_gross) >= 2:
result["weights"]["gross_landed_kg"] = float(all_gross[1].replace(',', ''))
# Tare weight (should be same in both)
tare_match = re.search(r'tare\s*:?\s*([\d,\.]+)\s*kg', ocr_text)
if tare_match:
result["weights"]["tare_kg"] = float(tare_match.group(1).replace(',', ''))
# Net weights
net_matches = re.findall(r'net\s*:?\s*([\d,\.]+)\s*kg', ocr_text)
if len(net_matches) >= 2:
result["weights"]["invoice_net_kg"] = float(net_matches[0].replace(',', ''))
result["weights"]["net_landed_kg"] = float(net_matches[1].replace(',', ''))
# Loss/Gain
loss_match = re.search(r'loss\s*:?\s*[-]?\s*([\d,\.]+)\s*kg', ocr_text, re.IGNORECASE)
if loss_match:
loss_value = float(loss_match.group(1).replace(',', ''))
# Make it negative if not already indicated
if '-' not in loss_match.group(0) and '' not in loss_match.group(0):
loss_value = -loss_value
result["weights"]["gain_loss_kg"] = loss_value
# Percentage
percent_match = re.search(r'percentage\s*:?\s*[-]?\s*([\d,\.]+)%', ocr_text, re.IGNORECASE)
if percent_match:
percent_value = float(percent_match.group(1).replace(',', ''))
if '-' not in percent_match.group(0) and '' not in percent_match.group(0):
percent_value = -percent_value
result["weights"]["gain_loss_percent"] = percent_value
return result
@app.post("/ocr")
async def ocr(file: UploadFile):
    """
    Enhanced OCR endpoint that returns structured data.

    Reads the uploaded file, extracts text (native PDF text when present,
    OCR fallback for scanned PDFs, direct OCR for images) and parses it
    into a cotton landing report structure via parse_cotton_report().
    """
    logger.info(f"Received structured OCR request: {file.filename}")
    try:
        file_data = await file.read()
        # BUGFIX: filename may be None for some clients; guard before
        # calling .lower(). (Renamed from "ext" — this is the whole
        # lower-cased filename, not just the extension.)
        filename = (file.filename or "").lower()
        ocr_text = ""
        if filename.endswith(".pdf"):
            # Try native text extraction first — faster and more accurate
            # than OCR when the PDF carries a real text layer.
            reader = PdfReader(io.BytesIO(file_data))
            direct_text = "".join(page.extract_text() or "" for page in reader.pages)
            if direct_text.strip():
                logger.info("Using native PDF text")
                ocr_text = direct_text
            else:
                # Scanned PDF: rasterize each page and OCR it.
                logger.info("Using OCR for scanned PDF")
                images = convert_from_bytes(file_data)
                for i, img in enumerate(images):
                    logger.info(f"OCR page {i+1}/{len(images)}")
                    ocr_text += pytesseract.image_to_string(img) + "\n"
        else:
            # Anything that is not a PDF is treated as an image.
            img = Image.open(io.BytesIO(file_data))
            ocr_text = pytesseract.image_to_string(img)
        # Parse structured data from whatever text we obtained.
        structured_data = parse_cotton_report(ocr_text)
        return {
            "success": True,
            # Truncate the raw text in the response to keep payloads small.
            "ocr_text": ocr_text[:1000] + "..." if len(ocr_text) > 1000 else ocr_text,
            "structured_data": structured_data,
            "json": json.dumps(structured_data, indent=2, ensure_ascii=False)
        }
    except Exception as e:
        # Boundary handler: log and report the failure instead of raising,
        # so the client always receives a JSON body.
        logger.error(f"Structured OCR failed: {e}", exc_info=True)
        return {
            "success": False,
            "error": str(e),
            "raw_text": "",
            "structured_data": {}
        }
# ============================= # =============================
# 🧠 Smart OCR # 🧠 Smart OCR
# ============================= # =============================
@@ -611,7 +288,91 @@ async def ocr(file: UploadFile):
# except Exception as e: # except Exception as e:
# logger.error(f"OCR failed: {e}", exc_info=True) # logger.error(f"OCR failed: {e}", exc_info=True)
# raise HTTPException(status_code=500, detail=str(e)) # raise HTTPException(status_code=500, detail=str(e))
@app.post("/ocr")
async def ocr(file: UploadFile):
    """
    Smart PDF processing optimized for cotton landing reports.

    Strategy 1 extracts embedded text with pdfplumber (digital PDFs);
    if that yields nothing, Strategy 2 rasterizes the pages and runs
    Tesseract OCR, then parses the result with parse_cotton_report().
    """
    # NOTE(review): this route path duplicates the earlier /ocr endpoint in
    # this file — only one handler is served per path/method; confirm intent.
    logger.info(f"Smart OCR request: {file.filename}")
    try:
        file_data = await file.read()
        # Strategy 1: pdfplumber (best for digital PDFs)
        try:
            with pdfplumber.open(io.BytesIO(file_data)) as pdf:
                text_parts = []
                tables_found = []  # NOTE(review): collected but currently unused
                for page in pdf.pages:
                    page_text = page.extract_text(x_tolerance=2, y_tolerance=2)
                    if page_text:
                        text_parts.append(page_text)
                    # Landing reports commonly carry their data in tables.
                    tables = page.extract_tables({
                        "vertical_strategy": "text",
                        "horizontal_strategy": "text",
                        "snap_tolerance": 5,
                    })
                    for table in tables:
                        if table and len(table) > 1:
                            tables_found.append(table)
                combined_text = "\n".join(text_parts)
                # BUGFIX: previously returned unconditionally, so scanned
                # PDFs (no embedded text) answered with empty text and the
                # OCR fallback below was unreachable. Only return when the
                # digital-text path actually produced something.
                if combined_text.strip():
                    return {"ocr_text": combined_text}
        except Exception as e:
            logger.warning(f"pdfplumber attempt: {e}")
        # Strategy 2: Fallback to OCR for scanned PDFs
        logger.info("Falling back to OCR...")
        # Convert PDF pages to images
        from pdf2image import convert_from_bytes
        images = convert_from_bytes(file_data, dpi=200)
        ocr_results = []
        for img in images:
            # psm 6 = assume a single uniform block of text; preserving
            # inter-word spaces helps downstream column parsing.
            text = pytesseract.image_to_string(
                img,
                config='--psm 6 -c preserve_interword_spaces=1'
            )
            ocr_results.append(text)
        ocr_text = "\n".join(ocr_results)
        structured_data = parse_cotton_report(ocr_text)
        return {
            "method": "tesseract_ocr",
            "structured_data": structured_data,
            "raw_text_sample": ocr_text[:500]
        }
    except Exception as e:
        # Boundary handler: always answer with JSON rather than raising.
        logger.error(f"Smart OCR failed: {e}", exc_info=True)
        return {
            "error": str(e),
            "success": False
        }
# ============================= # =============================
# 🧱 Structure / Layout # 🧱 Structure / Layout
# ============================= # =============================