Files
automation-service/app.py
2026-01-11 18:12:06 +01:00

874 lines
34 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
from fastapi import FastAPI, UploadFile, HTTPException, Body
from PIL import Image
import pytesseract
from doctr.models import ocr_predictor
from doctr.io import DocumentFile
from PyPDF2 import PdfReader
import camelot
import spacy
import logging
import io
from logging.handlers import RotatingFileHandler
import re
# Rotating application log: 5 backups of 10 MiB each, UTF-8 encoded.
# NOTE(review): /var/log usually needs root — confirm the service user can write here.
LOG_PATH = "/var/log/automation-service.log"
file_handler = RotatingFileHandler(
LOG_PATH,
maxBytes=10*1024*1024,
backupCount=5,
encoding="utf-8"
)
# Timestamped format shared by file and console output.
file_handler.setFormatter(logging.Formatter(
"%(asctime)s - %(levelname)s - %(name)s - %(message)s"
))
# class AHKParser:
# lab="AHK"
# def parse(self,text):
# r=empty_weight_report("AHK")
# inv=section(text,"INVOICE WEIGHTS","Bales Weighed")
# land=section(text,"Bales Weighed","Outturn")
# loss=section(text,"LOSS","Invoice average")
# r["report"]["reference"]=safe_search(r"(AHK\s*/\S+)",text)
# r["report"]["date"]=extract("Produced On",text)
# r["contract"]["invoice_no"]=extract("Client Reference",text)
# r["contract"]["origin"]=extract("Growth",text)
# r["contract"]["commodity"]="Raw Cotton"
# r["parties"]["seller"]=extract("Client",text)
# r["parties"]["buyer"]=extract("Buyer",text)
# r["shipment"]["vessel"]=extract("Vessel",text)
# r["shipment"]["bl_no"]=extract("B/L No",text)
# r["shipment"]["port_destination"]=extract("Destination",text)
# r["shipment"]["arrival_date"]=extract("Arrival Date",text)
# r["shipment"]["weighing_method"]=extract("Weighing method",text)
# r["shipment"]["bales"]=to_float(extract("Total Bales",text))
# r["weights"]["gross_landed_kg"]=to_float(extract("Gross",land))
# r["weights"]["tare_kg"]=to_float(extract("Tare",land))
# r["weights"]["net_landed_kg"]=to_float(extract("Net",land))
# r["weights"]["invoice_net_kg"]=to_float(extract("Net",inv))
# r["weights"]["gain_loss_kg"]=to_float(extract("kg",loss))
# r["weights"]["gain_loss_percent"]=to_float(extract("Percentage",loss))
# return r
# class AHKParser:
# lab = "AHK"
# def extract_table(self, text, headers):
# lines = [l.strip() for l in text.splitlines() if l.strip()]
# out = {}
# for h in headers:
# for i,l in enumerate(lines):
# if l == h:
# for j in range(i+1, i+8):
# if j < len(lines) and lines[j].startswith(":"):
# out[h] = lines[j][1:].strip()
# break
# return out
# def extract_weights(self, text):
# lines = [l.strip() for l in text.splitlines() if l.strip()]
# res = {}
# for i,l in enumerate(lines):
# if l == "Bales Weighed":
# headers = ["Bales","Gross","Tare","Net"]
# for h in headers:
# for j in range(i, i+20):
# if j < len(lines) and lines[j].startswith(":"):
# res[h] = lines[j][1:].replace("kg","").strip()
# break
# return res
# def parse(self, text):
# r = empty_weight_report("AHK")
# # report
# r["report"]["reference"] = safe_search(r"(AHK\s*/[A-Z0-9/]+)", text)
# r["report"]["date"] = safe_search(r"Produced On\s*([0-9A-Za-z ]+)", text)
# # contract
# r["contract"]["invoice_no"] = safe_search(r"Client Reference:\s*([A-Z0-9\- /]+)", text)
# r["contract"]["commodity"] = "Raw Cotton"
# # buyer
# r["parties"]["buyer"] = safe_search(r"Buyer:\s*([A-Z0-9 ().,-]+)", text)
# # shipment tables
# ship = self.extract_table(text, [
# "Total Bales","Vessel","Voy. No.","B/L No.","B/L Date","Destination"
# ])
# ship2 = self.extract_table(text, [
# "Growth","Arrival Date","First date of weighing",
# "Last Date of Weighing","Weighing method","Tare"
# ])
# r["shipment"]["bales"] = to_float(ship.get("Total Bales"))
# r["shipment"]["vessel"] = ship.get("Vessel")
# r["shipment"]["bl_no"] = ship.get("B/L No.")
# r["shipment"]["port_destination"] = ship.get("Destination")
# r["shipment"]["arrival_date"] = ship2.get("Arrival Date")
# r["shipment"]["weighing_method"] = ship2.get("Weighing method")
# r["contract"]["origin"] = ship2.get("Growth")
# # weights
# inv = self.extract_table(text, ["Bales","Gross","Tare","Net"])
# land = self.extract_weights(text)
# r["weights"]["invoice_net_kg"] = to_float(inv.get("Net"))
# r["weights"]["gross_landed_kg"] = to_float(land.get("Gross"))
# r["weights"]["tare_kg"] = to_float(land.get("Tare"))
# r["weights"]["net_landed_kg"] = to_float(land.get("Net"))
# # loss
# loss = section(text,"LOSS","Invoice average")
# r["weights"]["gain_loss_kg"] = to_float(safe_search(r"(-?\d+\.?\d*)\s*kg", loss))
# r["weights"]["gain_loss_percent"] = to_float(safe_search(r"Percentage\s*:\s*(-?\d+\.?\d*)", loss))
# return r
class AHKParser:
    """Parser for Alfred H. Knight (AHK) cotton landing reports."""
    lab = "AHK"

    def _lines(self, text):
        """Return the non-empty, stripped lines of *text*."""
        return [l.strip() for l in text.splitlines() if l.strip()]

    def _col_block(self, lines, labels, max_scan=25):
        """Map column *labels* to the ':'-prefixed values that follow them.

        AHK reports render label/value tables as a run of label lines
        followed by a run of ": value" lines.  Find the last line that
        equals any label, then collect up to *max_scan* following
        ": value" lines and zip them with the labels in order.

        Returns an empty dict when none of the labels appear (the
        original called max() on an empty generator and raised
        ValueError in that case).
        """
        hits = [i for i, l in enumerate(lines) if l in labels]
        if not hits:
            return {}
        last = hits[-1]
        vals = []
        for l in lines[last + 1:last + 1 + max_scan]:
            if l.startswith(":"):
                vals.append(l[1:].strip())
                if len(vals) == len(labels):
                    break
        # zip() truncates if fewer values than labels were found.
        return dict(zip(labels, vals))

    def parse(self, text):
        """Build a normalized weight report dict from AHK report text."""
        L = self._lines(text)
        r = empty_weight_report("AHK")
        # report
        r["report"]["reference"] = safe_search(r"(AHK\s*/[A-Z0-9/]+)", text)
        r["report"]["date"] = safe_search(r"Produced On\s*([0-9A-Za-z ]+)", text)
        # contract
        r["contract"]["invoice_no"] = safe_search(r"Client Reference:\s*([A-Z0-9\- /]+)", text)
        r["contract"]["commodity"] = "Raw Cotton"
        # buyer
        r["parties"]["buyer"] = safe_search(r"Buyer:\s*([A-Z0-9 ().,-]+)", text)
        # shipment block 1
        ship1 = self._col_block(L, [
            "Total Bales", "Vessel", "Voy. No.", "B/L No.", "B/L Date", "Destination"
        ])
        # shipment block 2
        ship2 = self._col_block(L, [
            "Growth", "Arrival Date", "First date of weighing",
            "Last Date of Weighing", "Weighing method", "Tare"
        ])
        r["shipment"]["bales"] = to_float(ship1.get("Total Bales"))
        r["shipment"]["vessel"] = ship1.get("Vessel")
        r["shipment"]["bl_no"] = ship1.get("B/L No.")
        r["shipment"]["port_destination"] = ship1.get("Destination")
        r["shipment"]["arrival_date"] = ship2.get("Arrival Date")
        r["shipment"]["weighing_method"] = ship2.get("Weighing method")
        r["contract"]["origin"] = ship2.get("Growth")
        # invoice weights (whole-document scan picks up the invoice columns)
        inv = self._col_block(L, ["Bales", "Gross", "Tare", "Net"])
        r["weights"]["invoice_net_kg"] = to_float(inv.get("Net"))
        # landed weights, restricted to the "Bales Weighed" section
        land = self._col_block(self._lines(section(text, "Bales Weighed", "Outturn")),
                               ["Bales", "Gross", "Tare", "Net"])
        r["weights"]["gross_landed_kg"] = to_float(land.get("Gross"))
        r["weights"]["tare_kg"] = to_float(land.get("Tare"))
        r["weights"]["net_landed_kg"] = to_float(land.get("Net"))
        # loss
        loss = section(text, "LOSS", "Invoice average")
        r["weights"]["gain_loss_kg"] = to_float(safe_search(r"(-?\d+\.?\d*)\s*kg", loss))
        r["weights"]["gain_loss_percent"] = to_float(safe_search(r"Percentage\s*:\s*(-?\d+\.?\d*)", loss))
        return r
class IntertekParser:
    """Maps the labelled fields of an Intertek landing report onto the
    normalized weight-report structure."""
    lab = "INTERTEK"

    def parse(self, text):
        """Parse Intertek report *text* into a normalized report dict."""
        report = empty_weight_report("INTERTEK")
        pct = safe_search(r"([0-9.]+)\s*%", text)
        report["report"].update(
            reference=extract("Global Ref", text),
            file_no=extract("Report / File No", text),
            date=extract("Dated", text),
        )
        report["contract"].update(
            contract_no=extract("Contract No", text),
            invoice_no=extract("Invoice No", text),
            origin=extract("Growth", text),
            commodity="Raw Cotton",
        )
        report["parties"]["buyer"] = extract("Buyer", text)
        report["shipment"].update(
            vessel=extract("Vessel", text),
            bl_no=extract("B/L No", text),
            arrival_date=extract("Arrival Date", text),
            weighing_place=extract("Weighed at", text),
            bales=to_float(extract("Invoice Quantity", text)),
        )
        report["weights"].update(
            gross_landed_kg=to_float(extract("Gross", text)),
            tare_kg=to_float(extract("Invoice Tare", text)),
            net_landed_kg=to_float(extract("Landed Weight", text)),
            invoice_net_kg=to_float(extract("Invoice Weight", text)),
            gain_loss_kg=to_float(extract("Gain", text)),
            gain_loss_percent=to_float(pct),
        )
        return report
class RobertsonParser:
    """Maps the labelled fields of a Robertson International landing
    report onto the normalized weight-report structure."""
    lab = "ROBERTSON"

    def parse(self, text):
        """Parse Robertson report *text* into a normalized report dict."""
        report = empty_weight_report("ROBERTSON")
        pct = safe_search(r"([0-9.]+)\s*%", text)
        report["report"].update(
            reference=extract("OUR REF", text),
            date=extract("DATE", text),
        )
        report["contract"].update(
            contract_no=extract("CONTRACT NO", text),
            invoice_no=extract("INVOICE NO", text),
            lc_no=extract("LIC NO", text),
            commodity="Raw Cotton",
        )
        report["parties"].update(
            seller=extract("SELLER", text),
            buyer=extract("BUYER", text),
        )
        report["shipment"].update(
            vessel=extract("NAME OF VESSEL", text),
            port_loading=extract("SAILED FROM", text),
            port_destination=extract("ARRIVED AT", text),
            arrival_date=extract("DATE OF ARRIVAL", text),
            weighing_place=extract("PLACE OF CONTROL", text),
            bales=to_float(extract("CONSIGNMENT", text)),
        )
        report["weights"].update(
            gross_landed_kg=to_float(extract("GROSS", text)),
            tare_kg=to_float(extract("TARE", text)),
            net_landed_kg=to_float(extract("LANDED NET", text)),
            invoice_net_kg=to_float(extract("INVOICE NET", text)),
            gain_loss_kg=to_float(extract("GAIN", text)),
            gain_loss_percent=to_float(pct),
        )
        return report
class SGSParser:
    """Maps the labelled fields of an SGS landing report onto the
    normalized weight-report structure."""
    lab = "SGS"

    def parse(self, text):
        """Parse SGS report *text* into a normalized report dict."""
        report = empty_weight_report("SGS")
        report["report"].update(
            reference=extract("LANDING REPORT No", text),
            file_no=extract("FILE NO.", text),
            date=extract("DATE", text),
        )
        report["contract"].update(
            contract_no=extract("CONTRACT NO.", text),
            invoice_no=extract("INVOICE NO.", text),
            origin=extract("ORIGIN", text),
            commodity=extract("PRODUCT", text),
        )
        report["parties"].update(
            seller=extract("Seller", text),
            buyer=extract("Buyer", text),
            carrier=extract("Carrier", text),
        )
        report["shipment"].update(
            bl_no=extract("B/L no.", text),
            port_loading=extract("Port of loading", text),
            port_destination=extract("Port of destination", text),
            arrival_date=extract("Vessel arrival date", text),
            weighing_place=extract("Place of weighing", text),
            weighing_method=extract("Weighing mode", text),
            bales=to_float(extract("Quantity arrived", text)),
        )
        report["weights"].update(
            gross_landed_kg=to_float(extract("Gross landed", text)),
            tare_kg=to_float(extract("Tare", text)),
            net_landed_kg=to_float(extract("Net landed", text)),
            invoice_net_kg=to_float(extract("Net invoiced", text)),
            # Gain/loss figures sit in free text rather than a labelled column.
            gain_loss_kg=to_float(safe_search(r"Gain.*?([0-9.,]+)\s*kgs", text)),
            gain_loss_percent=to_float(safe_search(r"Gain\s*\+?\s*([0-9.,]+)\s*%", text)),
        )
        return report
class PICLParser:
    """Maps the labelled fields of a Pacific Inspection Company (PICL)
    landing report onto the normalized weight-report structure.

    Several label strings ("Invoice ilo & Date", "Date of Anival & LDL",
    "Total;", "o/o") deliberately match recurring OCR misreads in these
    documents — do not "correct" them.
    """
    lab = "PICL"

    def parse(self, text):
        """Parse PICL report *text* into a normalized report dict."""
        report = empty_weight_report("PICL")
        report["report"].update(
            reference=safe_search(r"No[:\s]+([A-Z0-9\-]+)", text),
            # Capture the "Month DD, YYYY" part after the weekday name.
            date=safe_search(r"(Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sunday),?\s*([A-Za-z]+\s+[0-9]{1,2},\s*[0-9]{4})", text, group_index=2),
        )
        report["contract"].update(
            contract_no=extract("Contract/Pl No & Date", text),
            invoice_no=extract("Invoice ilo & Date", text),
            lc_no=extract("L/C No & Date", text),
            origin=extract("Country of Origin", text),
            commodity=extract("Commodity", text),
        )
        report["parties"].update(
            seller=extract("FAIRCOT SA", text),
            buyer=extract("M/S.", text),
            carrier=extract("Shipping Agent", text),
        )
        report["shipment"].update(
            vessel=extract("Shipped Per Vessel", text),
            bl_no=extract("B/L No & Date", text),
            port_loading=extract("Port of Loading", text),
            port_destination=extract("Port of Discharge", text),
            arrival_date=extract("Date of Anival & LDL", text),
            weighing_place=extract("Place & Date of Weighment", text),
            weighing_method=extract("Method of Weighment", text),
            bales=to_float(extract("Grand Total", text)),
        )
        report["weights"].update(
            gross_landed_kg=to_float(extract("Total;", text)),
            tare_kg=to_float(extract("Tare Weight", text)),
            net_landed_kg=to_float(extract("Grand Total", text)),
            invoice_net_kg=to_float(extract("Invoice weight", text)),
            gain_loss_kg=to_float(safe_search(r"(-[0-9.,]+)\s*KGS", text)),
            # "o/o" is how the percent sign is usually OCR'd in these reports.
            gain_loss_percent=to_float(safe_search(r"\(\s*([0-9.,]+)\s*o/o\s*\)", text)),
        )
        return report
# Configure the root logger explicitly: everything propagates here and is
# written both to the rotating log file and to the console.
root = logging.getLogger()
root.setLevel(logging.INFO)
root.addHandler(file_handler)
root.addHandler(logging.StreamHandler())
# Module-level logger used throughout this service.
logger = logging.getLogger(__name__)
app = FastAPI()
# Heavy models are loaded once at import time so individual requests
# don't pay the startup cost.
logger.info("Loading models...")
nlp = spacy.load("en_core_web_sm")
predictor = ocr_predictor(pretrained=True)
logger.info("Models loaded successfully.")
import io
import re
from datetime import datetime
from typing import Dict, Any
import pytesseract
from pdf2image import convert_from_bytes
from PIL import Image
from PyPDF2 import PdfReader
import json
def parse_cotton_report(ocr_text: str) -> Dict[str, Any]:
    """Parse structured data from cotton landing report OCR text.

    Pattern-matches an Alfred H. Knight style landing report and returns a
    nested dict (report / contract / parties / shipment / weights).  Any
    field that cannot be located stays None.
    """
    result = {
        "lab": "ALFRED H KNIGHT",
        "report": {"reference": None, "file_no": None, "date": None},
        "contract": {"contract_no": None, "invoice_no": None, "lc_no": None,
                     "origin": None, "commodity": None},
        "parties": {"seller": None, "buyer": None, "carrier": None},
        "shipment": {
            "vessel": None, "bl_no": None, "port_loading": None,
            "port_destination": None, "arrival_date": None,
            "weighing_place": None, "weighing_method": None,
            "bales": None
        },
        "weights": {
            "gross_landed_kg": None, "tare_kg": None,
            "net_landed_kg": None, "invoice_net_kg": None,
            "gain_loss_kg": None, "gain_loss_percent": None
        }
    }
    # Lower-cased copy for the date patterns written against lower-case text.
    text = ocr_text.lower()

    # 1. Report reference and file number
    ref_match = re.search(r'client reference:\s*([^\n]+)', ocr_text, re.IGNORECASE)
    if ref_match:
        result["report"]["reference"] = ref_match.group(1).strip()
    # File number is embedded in the AHK reference, e.g. "AHK S/123/...".
    ahk_match = re.search(r'ahk s/(\d+)/', ocr_text, re.IGNORECASE)
    if ahk_match:
        result["report"]["file_no"] = ahk_match.group(1)

    # 2. Dates
    date_match = re.search(r'printed date:\s*(\d{1,2}-[a-z]+-\d{4})', text, re.IGNORECASE)
    if date_match:
        result["report"]["date"] = date_match.group(1).title()

    # 3. Contract information: origin/growth, commodity, invoice number
    growth_match = re.search(r'growth\s*:?\s*([^\n:]+(?:raw cotton)?)', ocr_text, re.IGNORECASE)
    if growth_match:
        result["contract"]["origin"] = growth_match.group(1).strip()
        result["contract"]["commodity"] = "COTTON"
    # Invoice number is read out of the client reference, e.g. "INV 12345".
    if result["report"]["reference"]:
        inv_match = re.search(r'inv\s*(\d+)', result["report"]["reference"], re.IGNORECASE)
        if inv_match:
            result["contract"]["invoice_no"] = inv_match.group(1)

    # 4. Parties.  The startswith() guards skip matches where OCR duplicated
    # the label into the captured value.
    seller_match = re.search(r'client\s*:?\s*([^\n]+)', ocr_text, re.IGNORECASE)
    if seller_match:
        seller_text = seller_match.group(1).strip()
        if not seller_text.lower().startswith('client'):
            result["parties"]["seller"] = seller_text
    buyer_match = re.search(r'buyer\s*:?\s*([^\n]+)', ocr_text, re.IGNORECASE)
    if buyer_match:
        buyer_text = buyer_match.group(1).strip()
        if not buyer_text.lower().startswith('buyer'):
            result["parties"]["buyer"] = buyer_text

    # 5. Shipment details
    vessel_match = re.search(r'vessel\s*:?\s*([^\n]+)', ocr_text, re.IGNORECASE)
    if vessel_match:
        vessel_text = vessel_match.group(1).strip()
        if not vessel_text.lower().startswith('vessel'):
            result["shipment"]["vessel"] = vessel_text
    bl_match = re.search(r'b/l no\.?\s*:?\s*([^\n]+)', ocr_text, re.IGNORECASE)
    if bl_match:
        result["shipment"]["bl_no"] = bl_match.group(1).strip()
    dest_match = re.search(r'destination\s*:?\s*([^\n]+)', ocr_text, re.IGNORECASE)
    if dest_match:
        dest_text = dest_match.group(1).strip()
        if not dest_text.lower().startswith('destination'):
            result["shipment"]["port_destination"] = dest_text
    arrival_match = re.search(r'arrival date\s*:?\s*(\d{1,2}-[a-z]+-\d{4})', text, re.IGNORECASE)
    if arrival_match:
        result["shipment"]["arrival_date"] = arrival_match.group(1).title()
    weigh_match = re.search(r'weighing method\s*:?\s*([^\n]+)', ocr_text, re.IGNORECASE)
    if weigh_match:
        method_text = weigh_match.group(1).strip()
        if not method_text.lower().startswith('weighing'):
            result["shipment"]["weighing_method"] = method_text
    bales_match = re.search(r'total bales\s*:?\s*(\d+)', ocr_text, re.IGNORECASE)
    if bales_match:
        result["shipment"]["bales"] = int(bales_match.group(1))

    # 6. Weights (critical section).  "Gross"/"Net" appear twice: first in
    # the invoice block, then in the landed block, so use findall and index.
    # FIX: re.IGNORECASE was missing on these patterns, so title-cased
    # labels ("Gross", "Tare", "Net") were never matched in the raw text.
    all_gross = re.findall(r'gross\s*:?\s*([\d,\.]+)\s*kg', ocr_text, re.IGNORECASE)
    if len(all_gross) >= 2:
        result["weights"]["gross_landed_kg"] = float(all_gross[1].replace(',', ''))
    # Tare weight (same value in both blocks, first match is enough).
    tare_match = re.search(r'tare\s*:?\s*([\d,\.]+)\s*kg', ocr_text, re.IGNORECASE)
    if tare_match:
        result["weights"]["tare_kg"] = float(tare_match.group(1).replace(',', ''))
    net_matches = re.findall(r'net\s*:?\s*([\d,\.]+)\s*kg', ocr_text, re.IGNORECASE)
    if len(net_matches) >= 2:
        result["weights"]["invoice_net_kg"] = float(net_matches[0].replace(',', ''))
        result["weights"]["net_landed_kg"] = float(net_matches[1].replace(',', ''))

    # Loss/gain: a value under the "Loss" label is a loss, so negate it
    # unless an explicit minus sign ('-' or Unicode '−') is already present.
    # FIX: the original guard tested `'' not in match` which is always
    # False (the empty string is in every string), so the value was never
    # negated.
    loss_match = re.search(r'loss\s*:?\s*[-]?\s*([\d,\.]+)\s*kg', ocr_text, re.IGNORECASE)
    if loss_match:
        loss_value = float(loss_match.group(1).replace(',', ''))
        if '-' not in loss_match.group(0) and '\u2212' not in loss_match.group(0):
            loss_value = -loss_value
        result["weights"]["gain_loss_kg"] = loss_value
    percent_match = re.search(r'percentage\s*:?\s*[-]?\s*([\d,\.]+)%', ocr_text, re.IGNORECASE)
    if percent_match:
        percent_value = float(percent_match.group(1).replace(',', ''))
        if '-' not in percent_match.group(0) and '\u2212' not in percent_match.group(0):
            percent_value = -percent_value
        result["weights"]["gain_loss_percent"] = percent_value
    return result
@app.post("/ocr")
async def ocr(file: UploadFile):
    """OCR an uploaded PDF or image and return structured report data.

    PDFs: the native text layer is preferred; scanned PDFs fall back to
    page-by-page Tesseract OCR.  Any other file is treated as an image.
    On failure a {"success": False, ...} payload is returned instead of
    an HTTP error, so callers always get JSON.
    """
    logger.info(f"Received structured OCR request: {file.filename}")
    try:
        file_data = await file.read()
        # FIX: UploadFile.filename can be None on some multipart uploads;
        # the original called .lower() on it directly and crashed.
        name = (file.filename or "").lower()
        ocr_text = ""
        if name.endswith(".pdf"):
            # Prefer the PDF's native text layer: free and exact.
            reader = PdfReader(io.BytesIO(file_data))
            direct_text = "".join(page.extract_text() or "" for page in reader.pages)
            if direct_text.strip():
                logger.info("Using native PDF text")
                ocr_text = direct_text
            else:
                # No text layer -> scanned PDF: rasterize and OCR each page.
                logger.info("Using OCR for scanned PDF")
                images = convert_from_bytes(file_data)
                for i, img in enumerate(images):
                    logger.info(f"OCR page {i+1}/{len(images)}")
                    ocr_text += pytesseract.image_to_string(img) + "\n"
        else:
            # Anything that is not a PDF is handed to PIL as an image.
            img = Image.open(io.BytesIO(file_data))
            ocr_text = pytesseract.image_to_string(img)
        # Parse structured data out of whatever text we obtained.
        structured_data = parse_cotton_report(ocr_text)
        return {
            "success": True,
            # Echo of the raw text is truncated to keep the payload small.
            "ocr_text": ocr_text[:1000] + "..." if len(ocr_text) > 1000 else ocr_text,
            "structured_data": structured_data,
            "json": json.dumps(structured_data, indent=2, ensure_ascii=False)
        }
    except Exception as e:
        logger.error(f"Structured OCR failed: {e}", exc_info=True)
        return {
            "success": False,
            "error": str(e),
            "raw_text": "",
            "structured_data": {}
        }
# =============================
# 🧠 Smart OCR
# =============================
# @app.post("/ocr")
# async def ocr(file: UploadFile):
# logger.info(f"Received OCR request: {file.filename}")
# try:
# file_data = await file.read()
# ext = file.filename.lower()
# # --------- PDF with native text ---------
# if ext.endswith(".pdf"):
# logger.info("PDF detected → Extracting native text first")
# reader = PdfReader(io.BytesIO(file_data))
# direct_text = "".join(
# page.extract_text() or "" for page in reader.pages
# )
# if direct_text.strip():
# logger.info("Native PDF text found → No OCR needed")
# return {"ocr_text": direct_text}
# # -------- Fallback: scanned PDF OCR --------
# logger.info("No native text → PDF treated as scanned → OCR")
# from pdf2image import convert_from_bytes
# images = convert_from_bytes(file_data)
# text = ""
# for i, img in enumerate(images):
# logger.info(f"OCR page {i+1}/{len(images)}")
# text += pytesseract.image_to_string(img) + "\n"
# return {"ocr_text": text}
# # --------- Image file OCR ---------
# logger.info("Image detected → Running OCR")
# img = Image.open(io.BytesIO(file_data))
# text = pytesseract.image_to_string(img)
# return {"ocr_text": text}
# except Exception as e:
# logger.error(f"OCR failed: {e}", exc_info=True)
# raise HTTPException(status_code=500, detail=str(e))
# =============================
# 🧱 Structure / Layout
# =============================
@app.post("/structure")
async def structure(file: UploadFile):
    """Run docTR layout prediction on an uploaded PDF or image and return
    the raw prediction rendered as a string."""
    logger.info(f"Received structure request: {file.filename}")
    try:
        file_data = await file.read()
        if file.filename.lower().endswith(".pdf"):
            doc = DocumentFile.from_pdf(file_data)
            logger.info(f"Structure prediction on PDF ({len(doc)} pages)")
        else:
            rgb = Image.open(io.BytesIO(file_data)).convert("RGB")
            doc = DocumentFile.from_images([rgb])
            logger.info("Structure prediction on image")
        return {"structure": str(predictor(doc))}
    except Exception as e:
        logger.error(f"Structure extraction failed: {e}", exc_info=True)
        raise HTTPException(status_code=500, detail=str(e))
# =============================
# 📊 Tables extraction (PDF only)
# =============================
@app.post("/tables")
async def tables(file: UploadFile):
    """Extract tables from an uploaded PDF using Camelot.

    camelot.read_pdf() accepts a filesystem path, not a file-like object,
    so the upload is spilled to a temporary .pdf file first (the original
    passed a BytesIO, which Camelot cannot read).  The temp file is always
    removed afterwards.
    """
    import os
    import tempfile
    logger.info(f"Received table extraction request: {file.filename}")
    tmp_path = None
    try:
        file_data = await file.read()
        # delete=False so the file survives the close() that Camelot needs
        # to be able to reopen it by path (esp. on Windows).
        with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp:
            tmp_path = tmp.name
            tmp.write(file_data)
        tables = camelot.read_pdf(tmp_path)
        logger.info(f"Found {len(tables)} tables")
        return {"tables": [t.df.to_dict() for t in tables]}
    except Exception as e:
        logger.error(f"Table extraction failed: {e}", exc_info=True)
        raise HTTPException(status_code=500, detail=str(e))
    finally:
        if tmp_path and os.path.exists(tmp_path):
            os.unlink(tmp_path)
def safe_search(pattern, text, default=None, group_index=1, context=""):
    """Search *pattern* in *text* (case-insensitive, dot-all) and return
    the requested capture group, stripped.

    Falls back to *default* — logging a warning — when the pattern does
    not match or the requested group index does not exist.
    """
    match = re.search(pattern, text, re.I | re.S)
    if match is None:
        logger.warning("Pattern not found for %s: %s", context, pattern)
        return default
    try:
        return match.group(group_index).strip()
    except IndexError:
        logger.warning("Group index %d not found for %s: %s", group_index, context, pattern)
        return default
def to_float(s):
    """Convert an extracted weight/percentage string to a float.

    Strips thousands separators and the unit suffixes the reports use
    ("Kgs", "kg", "lbs", "LBS", "%").  Returns None for falsy input or
    anything that still does not parse as a number.
    """
    if not s:
        return None
    try:
        s = s.replace(",", "").replace("Kgs", "").replace("kg", "").replace("%", "")
        s = s.replace("lbs", "").replace("LBS", "")
        return float(s.strip())
    except (ValueError, AttributeError):
        # Was a bare `except:` — keep the lenient "None on junk" contract
        # (including non-string input, which fails at .replace) without
        # swallowing KeyboardInterrupt/SystemExit.
        return None
def section(text, start, end=None):
    """Return the stripped text between headings *start* and *end*.

    Matching is case-insensitive and spans newlines; omitting *end* means
    "to the end of the text".  Returns "" (and logs a warning) when the
    section cannot be found.
    """
    tail = r"(.*?)" + re.escape(end) if end else r"(.*)"
    m = re.search(re.escape(start) + tail, text, re.S | re.I)
    if m:
        return m.group(1).strip()
    logger.warning("Section not found: start='%s', end='%s'", start, end)
    return ""
def extract_field(text, label, default=None):
    """Return the value following 'Label:' (or 'Label   value') on a single line."""
    return safe_search(
        rf"{re.escape(label)}\s*:?[\s]+([^\n]+)",
        text,
        default=default,
        context=f"field '{label}'",
    )
def extract(label, text, default=None):
    """Pull the value that follows *label* in OCR/PDF text.

    Handles the common layouts:
        Label: Value
        Label - Value
        Label Value
    Returns *default* when *text* is falsy or no pattern matches.
    """
    if not text:
        return default
    escaped = re.escape(label)
    for pattern in (
        rf"{escaped}\s*[:\-]?\s*([^\n\r]+)",
        rf"{escaped}\s+([^\n\r]+)",
    ):
        m = re.search(pattern, text, re.I)
        if m:
            return m.group(1).strip()
    return default
def extract_report_metadata(text):
    """Extract structured metadata from AHK landing-report OCR text.

    The text is first sliced into the report's named sections, then
    individual 'Label: value' fields are pulled from each section.
    Fields that cannot be found come back as the helpers' defaults
    (None or "").

    Raises:
        HTTPException: 500 on any unexpected extraction error.
    """
    logger.info("Starting metadata extraction, text length=%d", len(text))
    try:
        # ----------- SECTIONS -----------
        # Slice the raw text into report sections by their headings.
        order_details = section(text, "Order details", "Weights")
        invoice_section = section(text, "INVOICE WEIGHTS", "Bales Weighed")
        landed_section = section(text, "Bales Weighed", "Outturn")
        loss_section = section(text, "LOSS", "Invoice average")
        avg_section = section(text, "Invoice average", "Comments")
        signature_block = section(text, "Signed on")
        # ----------- TOP INFO -----------
        top_info = {
            "produced_on": extract_field(text, "Produced On"),
            "printed_date": extract_field(text, "Printed Date"),
            "client_reference": extract_field(text, "Client Reference"),
            # The first "AHK..." token in the document is taken as the
            # report number.
            "report_number": safe_search(r"(AHK\S+)", text, default="", context="report_number", group_index=1),
        }
        # ----------- ORDER DETAILS -----------
        parties = {
            "client": extract_field(order_details, "Client"),
            "client_ref_no": extract_field(order_details, "Client Ref No"),
            "buyer": extract_field(order_details, "Buyer"),
            "destination": extract_field(order_details, "Destination"),
        }
        shipment = {
            "total_bales": extract_field(order_details, "Total Bales"),
            "vessel": extract_field(order_details, "Vessel"),
            "voyage_no": extract_field(order_details, "Voy. No"),
            "bl_no": extract_field(order_details, "B/L No"),
            "bl_date": extract_field(order_details, "B/L Date"),
            "growth": extract_field(order_details, "Growth"),
            "arrival_date": extract_field(order_details, "Arrival Date"),
            "first_weighing_date": extract_field(order_details, "First date of weighing"),
            "last_weighing_date": extract_field(order_details, "Last Date of Weighing"),
            "weighing_method": extract_field(order_details, "Weighing method"),
            "tare_basis": extract_field(order_details, "Tare"),
        }
        # ----------- INVOICE SECTION -----------
        invoice = {
            "bales": extract_field(invoice_section, "Bales"),
            "gross": extract_field(invoice_section, "Gross"),
            "tare": extract_field(invoice_section, "Tare"),
            "net": extract_field(invoice_section, "Net"),
        }
        # ----------- LANDED SECTION -----------
        landed = {
            "bales": extract_field(landed_section, "Bales"),
            "gross": extract_field(landed_section, "Gross"),
            "tare": extract_field(landed_section, "Tare"),
            "net": extract_field(landed_section, "Net"),
        }
        # ----------- LOSS SECTION -----------
        loss = {
            "kg": extract_field(loss_section, "kg"),
            "lb": extract_field(loss_section, "lb"),
            "percent": extract_field(loss_section, "Percentage"),
        }
        # ----------- AVERAGES SECTION -----------
        averages = {
            "invoice_gross_per_bale": extract_field(avg_section, "Invoice average"),
            "landed_gross_per_bale": extract_field(avg_section, "Landed average"),
        }
        # ----------- SIGNATURE -----------
        signature = {
            "signed_on": extract_field(signature_block, "Signed on"),
            # Signer's name is expected on its own line just above the
            # "Client Services" line.
            "signed_by": safe_search(r"\n([A-Za-z ]+)\nClient Services", signature_block, default="", context="signed_by"),
            # Fixed values: AHK reports are always signed in this role.
            "role": "Client Services Coordinator",
            "company": "Alfred H. Knight International Limited"
        }
        logger.info("Metadata extraction completed successfully")
        return {
            "report": top_info,
            "parties": parties,
            "shipment": shipment,
            "weights": {
                "invoice": invoice,
                "landed": landed,
                "loss": loss,
                "averages": averages
            },
            "signature": signature
        }
    except Exception as e:
        logger.exception("Unexpected error during metadata extraction")
        raise HTTPException(status_code=500, detail=f"Metadata extraction failed: {e}")
def detect_template(text):
    """Identify which surveyor's report template *text* came from.

    Returns one of the PARSERS keys ("AHK", "INTERTEK", "ROBERTSON",
    "SGS", "PICL") or "UNKNOWN".  Checks run in a fixed priority order.
    """
    t = text.lower()
    checks = (
        ("AHK", "alfred h. knight" in t and "cotton landing report" in t),
        ("INTERTEK", "intertek" in t and "landing report" in t),
        ("ROBERTSON", "robertson international" in t or "ri ref no" in t),
        ("SGS", "landing report" in t and "carcon cargo" in t),
        ("PICL", "pacific inspection company" in t or "picl-bd.com" in t),
    )
    for name, hit in checks:
        if hit:
            return name
    return "UNKNOWN"
@app.post("/metadata")
async def metadata(text: str = Body(..., embed=True)):
    """Extract AHK report metadata from OCR text posted as {"text": "..."}."""
    return extract_report_metadata(text)
@app.post("/parse")
async def parse_endpoint(text: str = Body(..., embed=True)):
    """Detect the report template in the posted text and return the parsed report."""
    return parse_report(text)
# Registry mapping detect_template() labels to their parser instances.
PARSERS = {
"AHK": AHKParser(),
"INTERTEK": IntertekParser(),
"ROBERTSON": RobertsonParser(),
"SGS": SGSParser(),
"PICL": PICLParser()
}
def empty_weight_report(lab):
    """Return a weight-report skeleton for *lab* with every field set to None."""
    return {
        "lab": lab,
        "report": dict.fromkeys(("reference", "file_no", "date")),
        "contract": dict.fromkeys(("contract_no", "invoice_no", "lc_no", "origin", "commodity")),
        "parties": dict.fromkeys(("seller", "buyer", "carrier")),
        "shipment": dict.fromkeys((
            "vessel", "bl_no", "port_loading", "port_destination",
            "arrival_date", "weighing_place", "weighing_method", "bales",
        )),
        "weights": dict.fromkeys((
            "gross_landed_kg", "tare_kg", "net_landed_kg",
            "invoice_net_kg", "gain_loss_kg", "gain_loss_percent",
        )),
    }
def parse_report(text):
    """Detect the report template and dispatch to the matching parser.

    Returns {"template": "UNKNOWN"} when no parser claims the text.
    """
    parser = PARSERS.get(detect_template(text))
    if parser is None:
        return {"template": "UNKNOWN"}
    return parser.parse(text)