# Automation service (FastAPI): OCR, layout, table extraction and landing-report parsing.
# Listing metadata: 513 lines, 18 KiB, Python.
from fastapi import FastAPI, UploadFile, HTTPException, Body
|
|
from PIL import Image
|
|
import pytesseract
|
|
from doctr.models import ocr_predictor
|
|
from doctr.io import DocumentFile
|
|
from PyPDF2 import PdfReader
|
|
import camelot
|
|
import spacy
|
|
import logging
|
|
import io
|
|
from logging.handlers import RotatingFileHandler
|
|
import re
|
|
|
|
# Destination of the service log file.
LOG_PATH = "/var/log/automation-service.log"

# Size-based rotation: 10 MiB per file, five backups kept, UTF-8 encoded.
file_handler = RotatingFileHandler(
    LOG_PATH, maxBytes=10 * 1024 * 1024, backupCount=5, encoding="utf-8"
)
file_handler.setFormatter(
    logging.Formatter(fmt="%(asctime)s - %(levelname)s - %(name)s - %(message)s")
)
|
|
|
|
class AHKParser:
    """Parser for report text identified as the AHK template."""

    lab = "AHK"

    def parse(self, text):
        """Extract report, shipment and weight fields from AHK report text.

        Args:
            text: Full text of the report.

        Returns:
            dict with ``report``, ``shipment`` and ``weights`` sub-dicts.
            Numeric fields go through ``to_float``; values that cannot be
            located are returned as whatever the lookup helpers default to.
        """
        # Weight figures are looked up inside the blocks bounded by these headings.
        invoice_part = section(text, "INVOICE WEIGHTS", "Bales Weighed")
        landed_part = section(text, "Bales Weighed", "Outturn")
        loss_part = section(text, "LOSS", "Invoice average")

        report = {
            "lab": "AHK",
            "reference": safe_search(
                r"(AHK\s*/\S+)", text, default=None, context="AHK reference"
            ),
            "date": extract("Produced On", text),
        }

        shipment = {
            "bales": to_float(extract("Total Bales", text)),
            "vessel": extract("Vessel", text),
            "bl": extract("B/L No", text),
            "arrival_date": extract("Arrival Date", text),
        }

        weights = {
            "invoice_kg": to_float(extract("Net", invoice_part)),
            "landed_kg": to_float(extract("Net", landed_part)),
            "gain_loss_kg": to_float(extract("kg", loss_part)),
            "gain_loss_percent": to_float(extract("Percentage", loss_part)),
        }

        return {"report": report, "shipment": shipment, "weights": weights}
|
|
|
|
class IntertekParser:
    """Parser for report text identified as the INTERTEK template."""

    lab = "INTERTEK"

    def parse(self, text):
        """Extract report, shipment and weight fields from Intertek report text.

        Args:
            text: Full text of the report.

        Returns:
            dict with ``report``, ``shipment`` and ``weights`` sub-dicts.
        """
        # The gain/loss percentage is taken from the first "<number> %" in the text.
        pct_match = re.search(r"([0-9.]+)\s*%", text)
        if pct_match:
            percent = pct_match.group(1)
        else:
            percent = None

        report = {
            "lab": "INTERTEK",
            "reference": extract("Global Ref", text),
            "date": extract("Dated", text),
        }

        shipment = {
            "bales": to_float(extract("Invoice Quantity", text)),
            "vessel": extract("Vessel", text),
            "bl": extract("B/L No", text),
            "arrival_date": extract("Arrival Date", text),
        }

        weights = {
            "invoice_kg": to_float(extract("Invoice Weight", text)),
            "landed_kg": to_float(extract("Landed Weight", text)),
            "gain_loss_kg": to_float(extract("Gain", text)),
            "gain_loss_percent": to_float(percent),
        }

        return {"report": report, "shipment": shipment, "weights": weights}
|
|
|
|
class RobertsonParser:
    """Parser for report text identified as the ROBERTSON template."""

    lab = "ROBERTSON"

    def parse(self, text):
        """Extract report, shipment and weight fields from Robertson report text.

        Args:
            text: Full text of the report.

        Returns:
            dict with ``report``, ``shipment`` and ``weights`` sub-dicts.
        """
        # The gain/loss percentage is taken from the first "<number> %" in the text.
        pct_match = re.search(r"([0-9.]+)\s*%", text)
        if pct_match:
            percent = pct_match.group(1)
        else:
            percent = None

        report = {
            "lab": "ROBERTSON",
            "reference": extract("RI REF NO.", text),
            "date": extract("DATED", text),
        }

        shipment = {
            "bales": to_float(extract("QUANTITY", text)),
            "vessel": extract("VESSEL", text),
            "bl": extract("B/L NO.", text),
            "arrival_date": extract("ARRIVAL DATE", text),
        }

        weights = {
            "invoice_kg": to_float(extract("NET INVOICE WEIGHT", text)),
            "landed_kg": to_float(extract("NET LANDED WEIGHT", text)),
            "gain_loss_kg": to_float(extract("LOSS", text)),
            "gain_loss_percent": to_float(percent),
        }

        return {"report": report, "shipment": shipment, "weights": weights}
|
|
|
|
class SGSParser:
    """Parser for report text identified as the SGS template."""

    lab = "SGS"

    def parse(self, text):
        """Extract report, contract, party, shipment and weight fields.

        Args:
            text: Full text of the report.

        Returns:
            dict with ``report``, ``contract``, ``parties``, ``shipment`` and
            ``weights`` sub-dicts.
        """
        report = {
            "lab": "SGS",
            "reference": extract("LANDING REPORT No", text),
            "file_no": extract("FILE NO.", text),
            "date": extract("DATE", text),
        }

        contract = {
            "contract_no": extract("CONTRACT NO.", text),
            "invoice_no": extract("INVOICE NO.", text),
            "origin": extract("ORIGIN", text),
            "product": extract("PRODUCT", text),
        }

        parties = {
            "seller": extract("Seller", text),
            "buyer": extract("Buyer", text),
            "carrier": extract("Carrier", text),
        }

        shipment = {
            "bl": extract("B/L no.", text),
            "port_loading": extract("Port of loading", text),
            "port_destination": extract("Port of destination", text),
            "arrival_date": extract("Vessel arrival date", text),
            "devanning_date": extract("Container devanning date", text),
            "weighing_date": extract("Weighing date", text),
            "weighing_mode": extract("Weighing mode", text),
            "quantity_bales": to_float(extract("Quantity arrived", text)),
        }

        # Gain figures need dedicated patterns: one ends in "%", the other in "kgs".
        weights = {
            "gross_landed_kg": to_float(extract("Gross landed", text)),
            "tare_kg": to_float(extract("Tare", text)),
            "net_landed_kg": to_float(extract("Net landed", text)),
            "net_invoiced_kg": to_float(extract("Net invoiced", text)),
            "gain_percent": to_float(
                safe_search(r"Gain\s*\+?\s*([0-9.,]+)\s*%", text)
            ),
            "gain_kg": to_float(safe_search(r"Gain.*?([0-9.,]+)\s*kgs", text)),
        }

        return {
            "report": report,
            "contract": contract,
            "parties": parties,
            "shipment": shipment,
            "weights": weights,
        }
|
|
|
|
class PICLParser:
    """Parser for report text identified as the PICL template."""

    lab = "PICL"

    # Weekday name (comma optional) followed by "Month D, YYYY"; group 1 is the date.
    # The weekday alternatives are wrapped in a non-capturing group: without it
    # a bare "Monday," satisfies the pattern on its own and leaves group 1 unset.
    _DATE_PATTERN = (
        r"(?:Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sunday),?"
        r"\s*([A-Za-z]+\s+[0-9]{1,2},\s*[0-9]{4})"
    )

    def parse(self, text):
        """Extract report, party, shipment, contract and weight fields.

        Args:
            text: Full text of the report.

        Returns:
            dict with ``report``, ``parties``, ``shipment``, ``contract`` and
            ``weights`` sub-dicts.
        """
        report = {
            "lab": "PICL",
            "reference": safe_search(r"No[:\s]+([A-Z0-9\-]+)", text),
            "date": safe_search(PICLParser._DATE_PATTERN, text),
        }

        parties = {
            "seller": extract("FAIRCOT SA", text),
            "buyer": extract("M/S.", text),
        }

        # NOTE(review): labels such as "Date of Anival & LDL" and
        # "Invoice ilo & Date" presumably mirror OCR misreadings of the
        # printed form - confirm against sample reports before "correcting".
        shipment = {
            "bales": to_float(extract("Grand Total", text)),
            "vessel": extract("Shipped Per Vessel", text),
            "feeder": extract("Feeder", text),
            "port_loading": extract("Port of Loading", text),
            "port_discharge": extract("Port of Discharge", text),
            "arrival_date": extract("Date of Anival & LDL", text),
            "weighing_place": extract("Place & Date of Weighment", text),
        }

        contract = {
            "contract_no": extract("Contract/Pl No & Date", text),
            "invoice_no": extract("Invoice ilo & Date", text),
            "bl": extract("B/L No & Date", text),
            "origin": extract("Country of Origin", text),
            "commodity": extract("Commodity", text),
        }

        weights = {
            "gross_landed_kg": to_float(extract("Total;", text)),
            "tare_kg": to_float(extract("Tare Weight", text)),
            "net_landed_kg": to_float(extract("Grand Total", text)),
            "invoice_weight_kg": to_float(extract("Invoice weight", text)),
            "loss_kg": to_float(safe_search(r"(-[0-9.,]+)\s*KGS", text)),
            "loss_percent": to_float(
                safe_search(r"\(\s*([0-9.,]+)\s*o/o\s*\)", text)
            ),
        }

        return {
            "report": report,
            "parties": parties,
            "shipment": shipment,
            "contract": contract,
            "weights": weights,
        }
|
|
|
|
# Configure the root logger explicitly: INFO level, rotating file plus stream output.
root = logging.getLogger()
root.setLevel(logging.INFO)
for _handler in (file_handler, logging.StreamHandler()):
    root.addHandler(_handler)

# Module-level logger used throughout the app.
logger = logging.getLogger(__name__)

app = FastAPI()

# Both models are loaded once, at import time.
logger.info("Loading models...")
nlp = spacy.load("en_core_web_sm")
predictor = ocr_predictor(pretrained=True)
logger.info("Models loaded successfully.")
|
|
|
|
# =============================
|
|
# 🧠 Smart OCR
|
|
# =============================
|
|
@app.post("/ocr")
async def ocr(file: UploadFile):
    """Return the text content of an uploaded PDF or image.

    A PDF is first read for embedded (native) text; only when none is found
    are its pages rasterised and run through Tesseract. Any other upload is
    opened as an image and OCR'd directly.

    Args:
        file: The uploaded document.

    Returns:
        ``{"ocr_text": <str>}``.

    Raises:
        HTTPException: status 500 carrying the underlying error message.
    """
    logger.info("Received OCR request: %s", file.filename)
    try:
        file_data = await file.read()
        # The filename may be absent on an upload; treat that as a non-PDF
        # instead of failing on None.lower().
        ext = (file.filename or "").lower()

        # --------- PDF with native text ---------
        if ext.endswith(".pdf"):
            logger.info("PDF detected → Extracting native text first")
            reader = PdfReader(io.BytesIO(file_data))
            # Pages are separated by a newline (as in the OCR fallback below) so
            # the last line of one page is not fused with the first of the next.
            direct_text = "\n".join(
                page.extract_text() or "" for page in reader.pages
            )

            if direct_text.strip():
                logger.info("Native PDF text found → No OCR needed")
                return {"ocr_text": direct_text}

            # -------- Fallback: scanned PDF OCR --------
            logger.info("No native text → PDF treated as scanned → OCR")
            from pdf2image import convert_from_bytes

            images = convert_from_bytes(file_data)
            page_texts = []
            for page_no, img in enumerate(images, start=1):
                logger.info("OCR page %d/%d", page_no, len(images))
                page_texts.append(pytesseract.image_to_string(img) + "\n")

            return {"ocr_text": "".join(page_texts)}

        # --------- Image file OCR ---------
        logger.info("Image detected → Running OCR")
        # Context manager releases the decoder resources once OCR is done.
        with Image.open(io.BytesIO(file_data)) as img:
            text = pytesseract.image_to_string(img)
        return {"ocr_text": text}

    except Exception as e:
        logger.error("OCR failed: %s", e, exc_info=True)
        raise HTTPException(status_code=500, detail=str(e)) from e
|
|
|
|
# =============================
|
|
# 🧱 Structure / Layout
|
|
# =============================
|
|
@app.post("/structure")
async def structure(file: UploadFile):
    """Run the layout/OCR predictor on an uploaded PDF or image.

    Args:
        file: The uploaded document.

    Returns:
        ``{"structure": <str>}`` - the string form of the predictor result.

    Raises:
        HTTPException: status 500 carrying the underlying error message.
    """
    logger.info("Received structure request: %s", file.filename)
    try:
        file_data = await file.read()
        # The filename may be absent on an upload; treat that as a non-PDF.
        ext = (file.filename or "").lower()

        if ext.endswith(".pdf"):
            doc = DocumentFile.from_pdf(file_data)
            logger.info("Structure prediction on PDF (%d pages)", len(doc))
        else:
            # DocumentFile.from_images accepts paths or encoded bytes, not PIL
            # objects, so the RGB-normalised image is re-encoded as PNG bytes.
            with Image.open(io.BytesIO(file_data)) as img:
                png_buffer = io.BytesIO()
                img.convert("RGB").save(png_buffer, format="PNG")
            doc = DocumentFile.from_images([png_buffer.getvalue()])
            logger.info("Structure prediction on image")

        res = predictor(doc)
        return {"structure": str(res)}

    except Exception as e:
        logger.error("Structure extraction failed: %s", e, exc_info=True)
        raise HTTPException(status_code=500, detail=str(e)) from e
|
|
|
|
# =============================
|
|
# 📊 Tables extraction (PDF only)
|
|
# =============================
|
|
@app.post("/tables")
async def tables(file: UploadFile):
    """Extract tables from an uploaded PDF with Camelot.

    Args:
        file: The uploaded PDF.

    Returns:
        ``{"tables": [...]}`` with one ``DataFrame.to_dict()`` per table found.

    Raises:
        HTTPException: status 500 carrying the underlying error message.
    """
    import os
    import tempfile

    logger.info("Received table extraction request: %s", file.filename)
    try:
        file_data = await file.read()

        # camelot.read_pdf is documented as taking a file path, so the upload
        # is spooled to a temporary .pdf that is removed when the block exits.
        with tempfile.TemporaryDirectory() as tmp_dir:
            pdf_path = os.path.join(tmp_dir, "upload.pdf")
            with open(pdf_path, "wb") as pdf_file:
                pdf_file.write(file_data)

            found = camelot.read_pdf(pdf_path)
            logger.info("Found %d tables", len(found))
            # Materialise the result while the temporary file still exists.
            payload = [table.df.to_dict() for table in found]

        return {"tables": payload}

    except Exception as e:
        logger.error("Table extraction failed: %s", e, exc_info=True)
        raise HTTPException(status_code=500, detail=str(e)) from e
|
|
|
|
def safe_search(pattern, text, default=None, group_index=1, context=""):
    """Search *text* for *pattern* and return one capture group, stripped.

    The search is case-insensitive and ``.`` also matches newlines
    (``re.I | re.S``). Every failure mode is logged as a warning and yields
    *default* instead of raising.

    Args:
        pattern: Regular expression to search for.
        text: Text to search; ``None`` is treated as empty text.
        default: Value returned when nothing usable is found.
        group_index: Capture group to return.
        context: Short label included in the warning messages.

    Returns:
        The stripped group text, or *default* when the pattern does not
        match, the group does not exist, or the group took no part in the
        match.
    """
    m = re.search(pattern, text or "", re.I | re.S)
    if not m:
        logger.warning("Pattern not found for %s: %s", context, pattern)
        return default

    try:
        value = m.group(group_index)
    except IndexError:
        logger.warning("Group index %d not found for %s: %s", group_index, context, pattern)
        return default

    # A group inside an unmatched alternative or optional part is None;
    # calling .strip() on it would raise AttributeError.
    if value is None:
        logger.warning("Group index %d is empty for %s: %s", group_index, context, pattern)
        return default

    return value.strip()
|
|
|
|
def to_float(s):
    """Convert a textual quantity such as ``"1,234.5 kg"`` to ``float``.

    Thousands separators (``,``), percent signs and weight units
    (``kg``/``kgs``/``lb``/``lbs`` in any letter case) are removed before
    conversion.

    Args:
        s: String to convert. A non-zero ``int``/``float`` is also accepted
            and returned as ``float``.

    Returns:
        The parsed number, or ``None`` when *s* is falsy, not a string or
        number, or not numeric once cleaned.
    """
    if not s:
        return None
    if isinstance(s, (int, float)):
        return float(s)
    if not isinstance(s, str):
        return None

    cleaned = s.replace(",", "")
    # Case-insensitive so "KGS", "kgs" and "Lbs" are stripped like "Kgs"/"kg".
    cleaned = re.sub(r"kgs?|lbs?|%", "", cleaned, flags=re.I).strip()
    try:
        return float(cleaned)
    except ValueError:
        return None
|
|
|
|
def section(text, start, end=None):
    """Return the stripped text that follows *start* and precedes *end*.

    Both headings are matched literally and case-insensitively, across
    lines. The headings themselves are not part of the result. Without
    *end*, everything after *start* is returned.

    Returns:
        The block between the headings, or ``""`` (after logging a warning)
        when no such block is found.
    """
    head = re.escape(start)
    if end:
        # Non-greedy: stop at the first occurrence of the end heading.
        body_pattern = head + r"(.*?)" + re.escape(end)
    else:
        body_pattern = head + r"(.*)"

    found = re.search(body_pattern, text, re.S | re.I)
    if found is None:
        logger.warning("Section not found: start='%s', end='%s'", start, end)
        return ""
    return found.group(1).strip()
|
|
|
|
def extract_field(text, label, default=None):
    """Return the rest of the line that follows *label* in *text*.

    The label is matched literally; an optional colon and at least one
    whitespace character must separate it from the value. The lookup is
    delegated to ``safe_search``, so *default* is returned when nothing
    matches.
    """
    escaped_label = re.escape(label)
    line_pattern = escaped_label + r"\s*:?[\s]+([^\n]+)"
    field_context = f"field '{label}'"
    return safe_search(line_pattern, text, default=default, context=field_context)
|
|
|
|
def extract(label, text, default=None):
    """Return the value that follows *label* on its line in OCR/PDF text.

    Works with:
        Label: Value
        Label - Value
        Label Value
        Label .... Value   (a run of two or more dots is treated as a leader)

    The label is matched literally and case-insensitively.

    Args:
        label: Field label to look for.
        text: Text to search.
        default: Value returned when *text* is empty or the label is absent.

    Returns:
        The stripped remainder of the line after the label and its
        separator, or *default*.
    """
    if not text:
        return default

    # One pattern covers every documented layout: optional whitespace, an
    # optional separator (dot leader, colon or dash), optional whitespace,
    # then the value up to the end of the line.
    pattern = re.escape(label) + r"\s*(?:\.{2,}|[:\-])?\s*([^\n\r]+)"
    m = re.search(pattern, text, re.I)
    if m:
        return m.group(1).strip()

    return default
|
|
|
|
def extract_report_metadata(text):
    """Extract structured metadata from the text of an AHK landing report.

    The text is split into sections by their headings and each section is
    scanned for labelled fields.

    Args:
        text: Full text of the report.

    Returns:
        dict with ``report``, ``parties``, ``shipment``, ``weights``
        (``invoice``, ``landed``, ``loss``, ``averages``) and ``signature``
        entries. Fields that are not found are ``None``, except
        ``report_number`` and ``signed_by`` which default to ``""``.

    Raises:
        HTTPException: status 500 when extraction fails unexpectedly.
    """
    logger.info("Starting metadata extraction, text length=%d", len(text))

    try:
        # ----------- SECTIONS -----------
        order_details = section(text, "Order details", "Weights")
        invoice_section = section(text, "INVOICE WEIGHTS", "Bales Weighed")
        landed_section = section(text, "Bales Weighed", "Outturn")
        loss_section = section(text, "LOSS", "Invoice average")
        avg_section = section(text, "Invoice average", "Comments")
        signature_block = section(text, "Signed on")

        # ----------- TOP INFO -----------
        top_info = _extract_fields(text, {
            "produced_on": "Produced On",
            "printed_date": "Printed Date",
            "client_reference": "Client Reference",
        })
        top_info["report_number"] = safe_search(
            r"(AHK\S+)", text, default="", context="report_number", group_index=1
        )

        # ----------- ORDER DETAILS -----------
        parties = _extract_fields(order_details, {
            "client": "Client",
            "client_ref_no": "Client Ref No",
            "buyer": "Buyer",
            "destination": "Destination",
        })

        shipment = _extract_fields(order_details, {
            "total_bales": "Total Bales",
            "vessel": "Vessel",
            "voyage_no": "Voy. No",
            "bl_no": "B/L No",
            "bl_date": "B/L Date",
            "growth": "Growth",
            "arrival_date": "Arrival Date",
            "first_weighing_date": "First date of weighing",
            "last_weighing_date": "Last Date of Weighing",
            "weighing_method": "Weighing method",
            "tare_basis": "Tare",
        })

        # ----------- INVOICE / LANDED SECTIONS -----------
        weight_labels = {
            "bales": "Bales",
            "gross": "Gross",
            "tare": "Tare",
            "net": "Net",
        }
        invoice = _extract_fields(invoice_section, weight_labels)
        landed = _extract_fields(landed_section, weight_labels)

        # ----------- LOSS SECTION -----------
        loss = _extract_fields(loss_section, {
            "kg": "kg",
            "lb": "lb",
            "percent": "Percentage",
        })

        # ----------- AVERAGES SECTION -----------
        # section() drops its start heading, so "Invoice average" is never
        # inside avg_section; look it up in the full text instead.
        averages = {
            "invoice_gross_per_bale": extract_field(text, "Invoice average"),
            "landed_gross_per_bale": extract_field(avg_section, "Landed average"),
        }

        # ----------- SIGNATURE -----------
        # Same reasoning: the "Signed on" label precedes signature_block, so
        # the date has to be read from the full text.
        signature = {
            "signed_on": extract_field(text, "Signed on"),
            "signed_by": safe_search(
                r"\n([A-Za-z ]+)\nClient Services",
                signature_block,
                default="",
                context="signed_by",
            ),
            "role": "Client Services Coordinator",
            "company": "Alfred H. Knight International Limited",
        }

        logger.info("Metadata extraction completed successfully")
        return {
            "report": top_info,
            "parties": parties,
            "shipment": shipment,
            "weights": {
                "invoice": invoice,
                "landed": landed,
                "loss": loss,
                "averages": averages,
            },
            "signature": signature,
        }

    except Exception as e:
        logger.exception("Unexpected error during metadata extraction")
        raise HTTPException(
            status_code=500, detail=f"Metadata extraction failed: {e}"
        ) from e


def _extract_fields(block, labels):
    """Look up each label of *labels* in *block*, keyed by its output name."""
    return {key: extract_field(block, label) for key, label in labels.items()}
|
|
|
|
def detect_template(text):
    """Identify which report template *text* belongs to.

    Matching is case-insensitive and based on marker phrases; the checks run
    in a fixed order and the first hit wins.

    Returns:
        One of ``"AHK"``, ``"INTERTEK"``, ``"ROBERTSON"``, ``"SGS"``,
        ``"PICL"`` or ``"UNKNOWN"``.
    """
    lowered = text.lower()

    def has(phrase):
        return phrase in lowered

    if has("alfred h. knight") and has("cotton landing report"):
        template = "AHK"
    elif has("intertek") and has("landing report"):
        template = "INTERTEK"
    elif has("robertson international") or has("ri ref no"):
        template = "ROBERTSON"
    elif has("landing report") and has("carcon cargo"):
        template = "SGS"
    elif has("pacific inspection company") or has("picl-bd.com"):
        template = "PICL"
    else:
        template = "UNKNOWN"

    return template
|
|
|
|
@app.post("/metadata")
async def metadata(text: str = Body(..., embed=True)):
    """Run the metadata extractor on the ``text`` field of the request body."""
    extracted = extract_report_metadata(text)
    return extracted
|
|
|
|
@app.post("/parse")
async def parse_endpoint(text: str = Body(..., embed=True)):
    """Detect the report template of the ``text`` field and parse it."""
    parsed = parse_report(text)
    return parsed
|
|
|
|
# Registry of parser instances keyed by the template name each one declares in ``lab``.
PARSERS = {
    parser.lab: parser
    for parser in (
        AHKParser(),
        IntertekParser(),
        RobertsonParser(),
        SGSParser(),
        PICLParser(),
    )
}
|
|
|
|
def parse_report(text):
    """Detect the template of *text* and run the matching parser.

    Returns:
        ``{"template": <name>, "data": <parsed dict>}`` when a parser is
        registered for the detected template; otherwise
        ``{"template": "UNKNOWN", "raw_text": <first 5000 characters>}``.
    """
    template = detect_template(text)
    logger.info("Detected template: %s", template)

    parser = PARSERS.get(template)
    if parser is None:
        # No parser registered: hand back a bounded excerpt of the text.
        return {"template": "UNKNOWN", "raw_text": text[:5000]}

    return {"template": template, "data": parser.parse(text)}
|