11.01.26
This commit is contained in:
409
app.py
409
app.py
@@ -23,113 +23,6 @@ file_handler.setFormatter(logging.Formatter(
|
|||||||
"%(asctime)s - %(levelname)s - %(name)s - %(message)s"
|
"%(asctime)s - %(levelname)s - %(name)s - %(message)s"
|
||||||
))
|
))
|
||||||
|
|
||||||
# class AHKParser:
|
|
||||||
# lab="AHK"
|
|
||||||
# def parse(self,text):
|
|
||||||
# r=empty_weight_report("AHK")
|
|
||||||
# inv=section(text,"INVOICE WEIGHTS","Bales Weighed")
|
|
||||||
# land=section(text,"Bales Weighed","Outturn")
|
|
||||||
# loss=section(text,"LOSS","Invoice average")
|
|
||||||
|
|
||||||
# r["report"]["reference"]=safe_search(r"(AHK\s*/\S+)",text)
|
|
||||||
# r["report"]["date"]=extract("Produced On",text)
|
|
||||||
|
|
||||||
# r["contract"]["invoice_no"]=extract("Client Reference",text)
|
|
||||||
# r["contract"]["origin"]=extract("Growth",text)
|
|
||||||
# r["contract"]["commodity"]="Raw Cotton"
|
|
||||||
|
|
||||||
# r["parties"]["seller"]=extract("Client",text)
|
|
||||||
# r["parties"]["buyer"]=extract("Buyer",text)
|
|
||||||
|
|
||||||
# r["shipment"]["vessel"]=extract("Vessel",text)
|
|
||||||
# r["shipment"]["bl_no"]=extract("B/L No",text)
|
|
||||||
# r["shipment"]["port_destination"]=extract("Destination",text)
|
|
||||||
# r["shipment"]["arrival_date"]=extract("Arrival Date",text)
|
|
||||||
# r["shipment"]["weighing_method"]=extract("Weighing method",text)
|
|
||||||
# r["shipment"]["bales"]=to_float(extract("Total Bales",text))
|
|
||||||
|
|
||||||
# r["weights"]["gross_landed_kg"]=to_float(extract("Gross",land))
|
|
||||||
# r["weights"]["tare_kg"]=to_float(extract("Tare",land))
|
|
||||||
# r["weights"]["net_landed_kg"]=to_float(extract("Net",land))
|
|
||||||
# r["weights"]["invoice_net_kg"]=to_float(extract("Net",inv))
|
|
||||||
# r["weights"]["gain_loss_kg"]=to_float(extract("kg",loss))
|
|
||||||
# r["weights"]["gain_loss_percent"]=to_float(extract("Percentage",loss))
|
|
||||||
# return r
|
|
||||||
# class AHKParser:
|
|
||||||
# lab = "AHK"
|
|
||||||
|
|
||||||
# def extract_table(self, text, headers):
|
|
||||||
# lines = [l.strip() for l in text.splitlines() if l.strip()]
|
|
||||||
# out = {}
|
|
||||||
# for h in headers:
|
|
||||||
# for i,l in enumerate(lines):
|
|
||||||
# if l == h:
|
|
||||||
# for j in range(i+1, i+8):
|
|
||||||
# if j < len(lines) and lines[j].startswith(":"):
|
|
||||||
# out[h] = lines[j][1:].strip()
|
|
||||||
# break
|
|
||||||
# return out
|
|
||||||
|
|
||||||
# def extract_weights(self, text):
|
|
||||||
# lines = [l.strip() for l in text.splitlines() if l.strip()]
|
|
||||||
# res = {}
|
|
||||||
# for i,l in enumerate(lines):
|
|
||||||
# if l == "Bales Weighed":
|
|
||||||
# headers = ["Bales","Gross","Tare","Net"]
|
|
||||||
# for h in headers:
|
|
||||||
# for j in range(i, i+20):
|
|
||||||
# if j < len(lines) and lines[j].startswith(":"):
|
|
||||||
# res[h] = lines[j][1:].replace("kg","").strip()
|
|
||||||
# break
|
|
||||||
# return res
|
|
||||||
|
|
||||||
# def parse(self, text):
|
|
||||||
# r = empty_weight_report("AHK")
|
|
||||||
|
|
||||||
# # report
|
|
||||||
# r["report"]["reference"] = safe_search(r"(AHK\s*/[A-Z0-9/]+)", text)
|
|
||||||
# r["report"]["date"] = safe_search(r"Produced On\s*([0-9A-Za-z ]+)", text)
|
|
||||||
|
|
||||||
# # contract
|
|
||||||
# r["contract"]["invoice_no"] = safe_search(r"Client Reference:\s*([A-Z0-9\- /]+)", text)
|
|
||||||
# r["contract"]["commodity"] = "Raw Cotton"
|
|
||||||
|
|
||||||
# # buyer
|
|
||||||
# r["parties"]["buyer"] = safe_search(r"Buyer:\s*([A-Z0-9 ().,-]+)", text)
|
|
||||||
|
|
||||||
# # shipment tables
|
|
||||||
# ship = self.extract_table(text, [
|
|
||||||
# "Total Bales","Vessel","Voy. No.","B/L No.","B/L Date","Destination"
|
|
||||||
# ])
|
|
||||||
# ship2 = self.extract_table(text, [
|
|
||||||
# "Growth","Arrival Date","First date of weighing",
|
|
||||||
# "Last Date of Weighing","Weighing method","Tare"
|
|
||||||
# ])
|
|
||||||
|
|
||||||
# r["shipment"]["bales"] = to_float(ship.get("Total Bales"))
|
|
||||||
# r["shipment"]["vessel"] = ship.get("Vessel")
|
|
||||||
# r["shipment"]["bl_no"] = ship.get("B/L No.")
|
|
||||||
# r["shipment"]["port_destination"] = ship.get("Destination")
|
|
||||||
# r["shipment"]["arrival_date"] = ship2.get("Arrival Date")
|
|
||||||
# r["shipment"]["weighing_method"] = ship2.get("Weighing method")
|
|
||||||
# r["contract"]["origin"] = ship2.get("Growth")
|
|
||||||
|
|
||||||
# # weights
|
|
||||||
# inv = self.extract_table(text, ["Bales","Gross","Tare","Net"])
|
|
||||||
# land = self.extract_weights(text)
|
|
||||||
|
|
||||||
# r["weights"]["invoice_net_kg"] = to_float(inv.get("Net"))
|
|
||||||
# r["weights"]["gross_landed_kg"] = to_float(land.get("Gross"))
|
|
||||||
# r["weights"]["tare_kg"] = to_float(land.get("Tare"))
|
|
||||||
# r["weights"]["net_landed_kg"] = to_float(land.get("Net"))
|
|
||||||
|
|
||||||
# # loss
|
|
||||||
# loss = section(text,"LOSS","Invoice average")
|
|
||||||
# r["weights"]["gain_loss_kg"] = to_float(safe_search(r"(-?\d+\.?\d*)\s*kg", loss))
|
|
||||||
# r["weights"]["gain_loss_percent"] = to_float(safe_search(r"Percentage\s*:\s*(-?\d+\.?\d*)", loss))
|
|
||||||
|
|
||||||
# return r
|
|
||||||
|
|
||||||
class AHKParser:
|
class AHKParser:
|
||||||
lab = "AHK"
|
lab = "AHK"
|
||||||
|
|
||||||
@@ -205,7 +98,6 @@ class AHKParser:
|
|||||||
|
|
||||||
return r
|
return r
|
||||||
|
|
||||||
|
|
||||||
class IntertekParser:
|
class IntertekParser:
|
||||||
lab="INTERTEK"
|
lab="INTERTEK"
|
||||||
def parse(self,text):
|
def parse(self,text):
|
||||||
@@ -354,221 +246,6 @@ predictor = ocr_predictor(pretrained=True)
|
|||||||
|
|
||||||
logger.info("Models loaded successfully.")
|
logger.info("Models loaded successfully.")
|
||||||
|
|
||||||
import io
|
|
||||||
import re
|
|
||||||
from datetime import datetime
|
|
||||||
from typing import Dict, Any
|
|
||||||
import pytesseract
|
|
||||||
from pdf2image import convert_from_bytes
|
|
||||||
from PIL import Image
|
|
||||||
from PyPDF2 import PdfReader
|
|
||||||
import json
|
|
||||||
|
|
||||||
def parse_cotton_report(ocr_text: str) -> Dict[str, Any]:
    """
    Parse structured data from cotton landing report OCR text.

    Args:
        ocr_text: Raw text produced by OCR (or native PDF extraction) of an
            Alfred H Knight cotton landing report.

    Returns:
        Nested dict with "lab", "report", "contract", "parties", "shipment"
        and "weights" sections. Any field that cannot be located in the
        text is left as None.
    """
    result: Dict[str, Any] = {
        "lab": "ALFRED H KNIGHT",
        "report": {"reference": None, "file_no": None, "date": None},
        "contract": {"contract_no": None, "invoice_no": None, "lc_no": None,
                     "origin": None, "commodity": None},
        "parties": {"seller": None, "buyer": None, "carrier": None},
        "shipment": {
            "vessel": None, "bl_no": None, "port_loading": None,
            "port_destination": None, "arrival_date": None,
            "weighing_place": None, "weighing_method": None,
            "bales": None
        },
        "weights": {
            "gross_landed_kg": None, "tare_kg": None,
            "net_landed_kg": None, "invoice_net_kg": None,
            "gain_loss_kg": None, "gain_loss_percent": None
        }
    }

    # Lower-cased copy for patterns that want normalized casing.
    text = ocr_text.lower()

    # 1. Report reference and file number
    ref_match = re.search(r'client reference:\s*([^\n]+)', ocr_text, re.IGNORECASE)
    if ref_match:
        result["report"]["reference"] = ref_match.group(1).strip()

    # File number is embedded in the AHK reference, e.g. "AHK S/123/...".
    ahk_match = re.search(r'ahk s/(\d+)/', ocr_text, re.IGNORECASE)
    if ahk_match:
        result["report"]["file_no"] = ahk_match.group(1)

    # 2. Dates (e.g. "Printed Date: 15-Jan-2024"); .title() restores casing
    # lost by matching against the lowered text.
    date_match = re.search(r'printed date:\s*(\d{1,2}-[a-z]+-\d{4})', text, re.IGNORECASE)
    if date_match:
        result["report"]["date"] = date_match.group(1).title()

    # 3. Contract information: origin/growth and invoice number.
    growth_match = re.search(r'growth\s*:?\s*([^\n:]+(?:raw cotton)?)', ocr_text, re.IGNORECASE)
    if growth_match:
        result["contract"]["origin"] = growth_match.group(1).strip()
        result["contract"]["commodity"] = "COTTON"

    # Invoice number is embedded in the client reference, e.g. "... INV 123".
    if result["report"]["reference"]:
        inv_match = re.search(r'inv\s*(\d+)', result["report"]["reference"], re.IGNORECASE)
        if inv_match:
            result["contract"]["invoice_no"] = inv_match.group(1)

    # 4. Parties. The startswith() guards reject OCR artifacts where the
    # label itself bled into the captured value.
    seller_match = re.search(r'client\s*:?\s*([^\n]+)', ocr_text, re.IGNORECASE)
    if seller_match:
        seller_text = seller_match.group(1).strip()
        if not seller_text.lower().startswith('client'):
            result["parties"]["seller"] = seller_text

    buyer_match = re.search(r'buyer\s*:?\s*([^\n]+)', ocr_text, re.IGNORECASE)
    if buyer_match:
        buyer_text = buyer_match.group(1).strip()
        if not buyer_text.lower().startswith('buyer'):
            result["parties"]["buyer"] = buyer_text

    # 5. Shipment details
    vessel_match = re.search(r'vessel\s*:?\s*([^\n]+)', ocr_text, re.IGNORECASE)
    if vessel_match:
        vessel_text = vessel_match.group(1).strip()
        if not vessel_text.lower().startswith('vessel'):
            result["shipment"]["vessel"] = vessel_text

    bl_match = re.search(r'b/l no\.?\s*:?\s*([^\n]+)', ocr_text, re.IGNORECASE)
    if bl_match:
        result["shipment"]["bl_no"] = bl_match.group(1).strip()

    dest_match = re.search(r'destination\s*:?\s*([^\n]+)', ocr_text, re.IGNORECASE)
    if dest_match:
        dest_text = dest_match.group(1).strip()
        if not dest_text.lower().startswith('destination'):
            result["shipment"]["port_destination"] = dest_text

    arrival_match = re.search(r'arrival date\s*:?\s*(\d{1,2}-[a-z]+-\d{4})', text, re.IGNORECASE)
    if arrival_match:
        result["shipment"]["arrival_date"] = arrival_match.group(1).title()

    weigh_match = re.search(r'weighing method\s*:?\s*([^\n]+)', ocr_text, re.IGNORECASE)
    if weigh_match:
        method_text = weigh_match.group(1).strip()
        if not method_text.lower().startswith('weighing'):
            result["shipment"]["weighing_method"] = method_text

    bales_match = re.search(r'total bales\s*:?\s*(\d+)', ocr_text, re.IGNORECASE)
    if bales_match:
        result["shipment"]["bales"] = int(bales_match.group(1))

    # 6. Weights (critical section). The report prints each label twice:
    # first in the invoice block, then in the landed block. NOTE: these
    # patterns must be case-insensitive — the report prints "Gross"/"Tare"/
    # "Net" capitalized, so a bare lowercase search would never match.
    all_gross = re.findall(r'gross\s*:?\s*([\d,\.]+)\s*kg', ocr_text, re.IGNORECASE)
    if len(all_gross) >= 2:
        # Second occurrence is the landed weight.
        result["weights"]["gross_landed_kg"] = float(all_gross[1].replace(',', ''))

    # Tare is the same in both blocks, so the first match suffices.
    tare_match = re.search(r'tare\s*:?\s*([\d,\.]+)\s*kg', ocr_text, re.IGNORECASE)
    if tare_match:
        result["weights"]["tare_kg"] = float(tare_match.group(1).replace(',', ''))

    net_matches = re.findall(r'net\s*:?\s*([\d,\.]+)\s*kg', ocr_text, re.IGNORECASE)
    if len(net_matches) >= 2:
        result["weights"]["invoice_net_kg"] = float(net_matches[0].replace(',', ''))
        result["weights"]["net_landed_kg"] = float(net_matches[1].replace(',', ''))

    # Loss/gain: a loss is reported positive in the text, so negate it
    # unless an explicit minus (ASCII or en-dash) was already present.
    loss_match = re.search(r'loss\s*:?\s*[-–]?\s*([\d,\.]+)\s*kg', ocr_text, re.IGNORECASE)
    if loss_match:
        loss_value = float(loss_match.group(1).replace(',', ''))
        if '-' not in loss_match.group(0) and '–' not in loss_match.group(0):
            loss_value = -loss_value
        result["weights"]["gain_loss_kg"] = loss_value

    percent_match = re.search(r'percentage\s*:?\s*[-–]?\s*([\d,\.]+)%', ocr_text, re.IGNORECASE)
    if percent_match:
        percent_value = float(percent_match.group(1).replace(',', ''))
        if '-' not in percent_match.group(0) and '–' not in percent_match.group(0):
            percent_value = -percent_value
        result["weights"]["gain_loss_percent"] = percent_value

    return result
|
|
||||||
|
|
||||||
@app.post("/ocr")
async def ocr(file: UploadFile):
    """
    Enhanced OCR endpoint that returns structured data
    """
    logger.info(f"Received structured OCR request: {file.filename}")

    try:
        payload = await file.read()
        lowered_name = file.filename.lower()
        ocr_text = ""

        if not lowered_name.endswith(".pdf"):
            # Plain image upload: OCR it directly.
            ocr_text = pytesseract.image_to_string(Image.open(io.BytesIO(payload)))
        else:
            # PDF upload: prefer the embedded text layer; fall back to OCR
            # only when the PDF is a pure scan with no extractable text.
            reader = PdfReader(io.BytesIO(payload))
            direct_text = "".join(page.extract_text() or "" for page in reader.pages)

            if direct_text.strip():
                logger.info("Using native PDF text")
                ocr_text = direct_text
            else:
                logger.info("Using OCR for scanned PDF")
                page_images = convert_from_bytes(payload)
                total = len(page_images)
                chunks = []
                for index, page_image in enumerate(page_images):
                    logger.info(f"OCR page {index + 1}/{total}")
                    chunks.append(pytesseract.image_to_string(page_image) + "\n")
                ocr_text = "".join(chunks)

        # Parse the recognized text into the structured report schema.
        structured_data = parse_cotton_report(ocr_text)

        # Truncate long OCR dumps in the response body.
        preview = ocr_text if len(ocr_text) <= 1000 else ocr_text[:1000] + "..."
        return {
            "success": True,
            "ocr_text": preview,
            "structured_data": structured_data,
            "json": json.dumps(structured_data, indent=2, ensure_ascii=False)
        }

    except Exception as e:
        logger.error(f"Structured OCR failed: {e}", exc_info=True)
        return {
            "success": False,
            "error": str(e),
            "raw_text": "",
            "structured_data": {}
        }
|
|
||||||
|
|
||||||
# =============================
|
# =============================
|
||||||
# 🧠 Smart OCR
|
# 🧠 Smart OCR
|
||||||
# =============================
|
# =============================
|
||||||
@@ -611,7 +288,91 @@ async def ocr(file: UploadFile):
|
|||||||
# except Exception as e:
|
# except Exception as e:
|
||||||
# logger.error(f"OCR failed: {e}", exc_info=True)
|
# logger.error(f"OCR failed: {e}", exc_info=True)
|
||||||
# raise HTTPException(status_code=500, detail=str(e))
|
# raise HTTPException(status_code=500, detail=str(e))
|
||||||
|
@app.post("/ocr")
async def ocr(file: UploadFile):
    """
    Smart PDF processing optimized for cotton landing reports
    """
    logger.info(f"Smart OCR request: {file.filename}")

    try:
        payload = await file.read()

        # Strategy 1: Try pdfplumber (best for digital PDFs)
        try:
            with pdfplumber.open(io.BytesIO(payload)) as pdf:
                collected_text = []
                detected_tables = []

                for page in pdf.pages:
                    extracted = page.extract_text(x_tolerance=2, y_tolerance=2)
                    if extracted:
                        collected_text.append(extracted)

                    # Landing reports commonly lay out their weights in
                    # tables; collect any multi-row tables found.
                    for candidate in page.extract_tables({
                        "vertical_strategy": "text",
                        "horizontal_strategy": "text",
                        "snap_tolerance": 5,
                    }):
                        if candidate and len(candidate) > 1:
                            detected_tables.append(candidate)

                return {"ocr_text": "\n".join(collected_text)}
        except Exception as e:
            logger.warning(f"pdfplumber attempt: {e}")

        # Strategy 2: Fallback to OCR for scanned PDFs
        logger.info("Falling back to OCR...")

        from pdf2image import convert_from_bytes
        page_images = convert_from_bytes(payload, dpi=200)

        # psm 6 treats each page as a single uniform block of text, which
        # suits report layouts; preserved spacing keeps columns aligned.
        recognized = [
            pytesseract.image_to_string(
                page_image,
                config='--psm 6 -c preserve_interword_spaces=1'
            )
            for page_image in page_images
        ]

        ocr_text = "\n".join(recognized)
        structured_data = parse_cotton_report(ocr_text)

        return {
            "method": "tesseract_ocr",
            "structured_data": structured_data,
            "raw_text_sample": ocr_text[:500]
        }

    except Exception as e:
        logger.error(f"Smart OCR failed: {e}", exc_info=True)
        return {
            "error": str(e),
            "success": False
        }
|
||||||
# =============================
|
# =============================
|
||||||
# 🧱 Structure / Layout
|
# 🧱 Structure / Layout
|
||||||
# =============================
|
# =============================
|
||||||
|
|||||||
Reference in New Issue
Block a user