Files
automation-service/app.py
2026-01-11 21:04:10 +01:00

969 lines
38 KiB
Python

from fastapi import FastAPI, UploadFile, HTTPException, Body
from PIL import Image
import pytesseract
from doctr.models import ocr_predictor
from doctr.io import DocumentFile
from PyPDF2 import PdfReader
import pdfplumber
import camelot
import spacy
import logging
import io
from logging.handlers import RotatingFileHandler
import re
from datetime import datetime
# Size-rotated service log: each file is capped at 10 MiB and five backups
# are kept next to the active file.
LOG_PATH = "/var/log/automation-service.log"
file_handler = RotatingFileHandler(
    filename=LOG_PATH,
    maxBytes=10 * 1024 * 1024,
    backupCount=5,
    encoding="utf-8",
)
file_handler.setFormatter(
    logging.Formatter(fmt="%(asctime)s - %(levelname)s - %(name)s - %(message)s")
)
class AHKParser:
    """Regex-based parser for AHK cotton landing reports.

    ``parse`` returns a nested dictionary with the sections ``report``,
    ``contract``, ``parties``, ``shipment`` and ``weights``. Every field whose
    pattern is not found in the text is left as ``None``.
    """

    lab = "AHK"

    # The vessel name feeds both ``shipment.vessel`` and ``parties.carrier``.
    _VESSEL_PATTERN = r'Vessel\s*:\s*([A-Z\s]+?)(?=\s*(?:Arrival|Voy|$))'

    def _clean_value(self, value):
        """Strip surrounding whitespace; falsy values are returned unchanged."""
        if value:
            return value.strip()
        return value

    def _find_text(self, pattern, text):
        """Return the cleaned first capture group of ``pattern``, or ``None``."""
        match = re.search(pattern, text)
        if match is None:
            return None
        return self._clean_value(match.group(1))

    def _find_number(self, pattern, text, negate=False):
        """Return the first capture group of ``pattern`` as a float, or ``None``.

        Commas are treated as thousands separators and removed. ``negate``
        flips the sign, for patterns that match the minus sign outside the
        capture group.
        """
        match = re.search(pattern, text)
        if match is None:
            return None
        try:
            number = float(match.group(1).replace(',', '').strip())
        except ValueError:
            # The capture only looked numeric (e.g. "." or "1.2.3").
            return None
        return -number if negate else number

    def parse(self, text):
        """Parse ``text`` and return the structured report dictionary.

        The result is also stored on ``self.data``.
        """
        result = {
            "lab": self.lab,
            "report": self._extract_report_info(text),
            "contract": self._extract_contract_info(text),
            "parties": self._extract_parties_info(text),
            "shipment": self._extract_shipment_info(text),
            "weights": self._extract_weights_info(text)
        }
        self.data = result
        return result

    def _extract_report_info(self, text):
        """Extract the client reference, AHK file number and signature date."""
        return {
            "reference": self._find_text(
                r'Client\s+Reference:\s*(S-\d+\s*/\s*INV\s*\d+)', text
            ),
            "file_no": self._find_text(r'AHK\s+S/([\w/]+)', text),
            "date": self._find_text(
                r'Signed\s+on\s*(\d{1,2}-[A-Za-z]{3}-\d{4})', text
            ),
        }

    def _extract_contract_info(self, text):
        """Extract contract number, invoice number, origin and commodity.

        ``lc_no`` is never populated by this parser.
        """
        contract_info = {
            "contract_no": None,
            "invoice_no": None,
            "lc_no": None,
            "origin": None,
            "commodity": None
        }
        ref_match = re.search(r'Client\s+Ref\s+No\.\s*:\s*([^\n]+)', text)
        if ref_match:
            ref_text = ref_match.group(1).strip()
            tokens = [tok for tok in re.split(r'[/\s]+', ref_text) if tok]
            for idx, token in enumerate(tokens):
                if token.startswith('S-'):
                    contract_info["contract_no"] = token
                elif token.startswith('INV'):
                    # "INV 4013" is split into two tokens by the whitespace
                    # split; re-attach the number so the invoice reference is
                    # not reduced to the bare "INV" prefix.
                    following = tokens[idx + 1] if idx + 1 < len(tokens) else ""
                    if token == 'INV' and following[:1].isdigit():
                        token = f"INV {following}"
                    contract_info["invoice_no"] = token
        growth_match = re.search(
            r'Growth\s*:\s*([A-Z\s]+?)(?=\s*(?:Vessel|$))', text
        )
        if growth_match:
            origin_text = growth_match.group(1).strip()
            if "AUSTRALIAN" in origin_text.upper():
                contract_info["origin"] = "AUSTRALIA"
            # NOTE(review): the commodity is filled in whenever a Growth line
            # is found, whatever the origin - confirm this is the intended scope.
            contract_info["commodity"] = "RAW COTTON"
        return contract_info

    def _extract_parties_info(self, text):
        """Extract seller (the report's "Client"), buyer and carrier.

        The carrier is the vessel name.
        """
        return {
            "seller": self._find_text(
                r'Client\s*:\s*([^\n:]+?)(?=\s*(?:Client Ref|Buyer|$))', text
            ),
            "buyer": self._find_text(
                r'Buyer\s*:\s*([^\n:]+?)(?=\s*(?:Total Bales|$))', text
            ),
            "carrier": self._find_text(self._VESSEL_PATTERN, text),
        }

    def _extract_shipment_info(self, text):
        """Extract vessel, bill of lading, destination, arrival and bale count.

        ``port_loading`` and ``weighing_place`` are never populated by this
        parser.
        """
        shipment_info = {
            "vessel": self._find_text(self._VESSEL_PATTERN, text),
            "bl_no": self._find_text(
                r'B/L\s+No\.\s*:\s*([A-Z0-9]+)(?=\s|$)', text
            ),
            "bl_date": self._find_text(
                r'B/L\s+Date\s*:\s*(\d{1,2}-[A-Za-z]{3}-\d{4})(?=\s|$)', text
            ),
            "port_loading": None,
            "port_destination": self._find_text(
                r'Destination\s*:\s*([A-Z,\s]+?)(?=\s*(?:Tare|$))', text
            ),
            "arrival_date": self._find_text(
                r'Arrival\s+Date\s*:\s*(\d{1,2}-[A-Za-z]{3}-\d{4})(?=\s|$)', text
            ),
            "weighing_place": None,
            "weighing_method": self._find_text(
                r'Weighing\s+method\s*:\s*([^\n]+?)(?=\s*(?:Tare|$))', text
            ),
            "bales": None
        }
        bales_match = re.search(r'Total\s+Bales\s*:\s*(\d+)(?=\s|$)', text)
        if bales_match:
            # The capture is digits only, so the conversion cannot fail.
            shipment_info["bales"] = int(bales_match.group(1))
        return shipment_info

    def _extract_weights_info(self, text):
        """Extract landed/invoice weights and the loss figures as floats.

        Loss values are returned as negative numbers; only entries written
        with an explicit minus sign are recognised.
        """
        return {
            "gross_landed_kg": self._find_number(
                r'LANDED WEIGHTS[\s\S]*?Gross\s*:\s*([\d.,]+)\s*kg', text
            ),
            # First "Tare : <number> kg" anywhere in the text.
            "tare_kg": self._find_number(r'Tare\s*:\s*([\d.,]+)\s*kg', text),
            "net_landed_kg": self._find_number(
                r'LANDED WEIGHTS[\s\S]*?Net\s*:\s*([\d.,]+)\s*kg', text
            ),
            "invoice_net_kg": self._find_number(
                r'INVOICE WEIGHTS[\s\S]*?Net\s*:\s*([\d.,]+)\s*kg', text
            ),
            "gain_loss_kg": self._find_number(
                r'LOSS\s*:\s*-\s*([\d.,]+)\s*kg', text, negate=True
            ),
            "gain_loss_percent": self._find_number(
                r'Percentage\s*:\s*-\s*([\d.,]+)%', text, negate=True
            ),
        }
# class AHKParser:
# lab="AHK"
# def parse(self, text):
# """Parse le texte et retourne un dictionnaire structuré"""
# result = {
# "lab": self.lab,
# "report": self._extract_report_info(text),
# "contract": self._extract_contract_info(text),
# "parties": self._extract_parties_info(text),
# "shipment": self._extract_shipment_info(text),
# "weights": self._extract_weights_info(text)
# }
# self.data = result
# return result
# def _extract_report_info(self, text):
# """Extrait les informations du rapport"""
# report_info = {
# "reference": None,
# "file_no": None,
# "date": None
# }
# # Recherche de la référence client
# ref_match = re.search(r'Client Reference:\s*(S-\d+/\s*INV\s*\d+)', text)
# if ref_match:
# report_info["reference"] = ref_match.group(1).strip()
# # Recherche du numéro de fichier AHK
# file_no_match = re.search(r'AHK\s*S/([\w/]+)', text)
# if file_no_match:
# report_info["file_no"] = file_no_match.group(1).strip()
# # Recherche de la date du rapport
# date_match = re.search(r'Signed on\s*(\d{1,2}-[A-Za-z]{3}-\d{4})', text)
# if date_match:
# report_info["date"] = date_match.group(1).strip()
# return report_info
# def _extract_contract_info(self, text):
# """Extrait les informations du contrat"""
# contract_info = {
# "contract_no": None,
# "invoice_no": None,
# "lc_no": None,
# "origin": None,
# "commodity": None
# }
# # Extraction de la référence client (peut servir comme numéro de contrat)
# ref_match = re.search(r'Client Reference:\s*(S-\d+/\s*INV\s*\d+)', text)
# if ref_match:
# ref_parts = ref_match.group(1).split('/')
# if len(ref_parts) >= 2:
# contract_info["contract_no"] = ref_parts[0].strip()
# contract_info["invoice_no"] = ref_parts[1].strip()
# # Extraction de l'origine et de la marchandise
# origin_match = re.search(r'Growth\s*:\s*([\w\s]+)', text)
# if origin_match:
# origin_text = origin_match.group(1).strip()
# if "AUSTRALIAN" in origin_text.upper():
# contract_info["origin"] = "AUSTRALIA"
# # La marchandise est généralement "RAW COTTON"
# contract_info["commodity"] = "RAW COTTON"
# return contract_info
# def _extract_parties_info(self, text):
# """Extrait les informations sur les parties"""
# parties_info = {
# "seller": None,
# "buyer": None,
# "carrier": None
# }
# # Extraction du vendeur (Client)
# seller_match = re.search(r'Client\s*:\s*([^\n]+)', text)
# if seller_match:
# parties_info["seller"] = seller_match.group(1).strip()
# # Extraction de l'acheteur (Buyer)
# buyer_match = re.search(r'Buyer\s*:\s*([^\n]+)', text)
# if buyer_match:
# parties_info["buyer"] = buyer_match.group(1).strip()
# # Extraction du transporteur (Vessel)
# vessel_match = re.search(r'Vessel\s*:\s*([^\n]+)', text)
# if vessel_match:
# # On considère le nom du navire comme transporteur
# parties_info["carrier"] = vessel_match.group(1).strip()
# return parties_info
# def _extract_shipment_info(self, text):
# """Extrait les informations d'expédition"""
# shipment_info = {
# "vessel": None,
# "bl_no": None,
# "bl_date": None,
# "port_loading": None, # Non spécifié dans le texte
# "port_destination": None,
# "arrival_date": None,
# "weighing_place": None, # Non spécifié dans le texte
# "weighing_method": None,
# "bales": None
# }
# # Extraction du navire
# vessel_match = re.search(r'Vessel\s*:\s*([^\n]+)', text)
# if vessel_match:
# shipment_info["vessel"] = vessel_match.group(1).strip()
# # Extraction du numéro de connaissement
# bl_no_match = re.search(r'B/L No\.\s*:\s*([^\n]+)', text)
# if bl_no_match:
# shipment_info["bl_no"] = bl_no_match.group(1).strip()
# # Extraction de la date du connaissement
# bl_date_match = re.search(r'B/L Date\s*:\s*(\d{1,2}-[A-Za-z]{3}-\d{4})', text)
# if bl_date_match:
# shipment_info["bl_date"] = bl_date_match.group(1).strip()
# # Extraction du port de destination
# dest_match = re.search(r'Destination\s*:\s*([^\n]+)', text)
# if dest_match:
# shipment_info["port_destination"] = dest_match.group(1).strip()
# # Extraction de la date d'arrivée
# arrival_match = re.search(r'Arrival Date\s*:\s*(\d{1,2}-[A-Za-z]{3}-\d{4})', text)
# if arrival_match:
# shipment_info["arrival_date"] = arrival_match.group(1).strip()
# # Extraction de la méthode de pesée
# weighing_method_match = re.search(r'Weighing method\s*:\s*([^\n]+)', text)
# if weighing_method_match:
# shipment_info["weighing_method"] = weighing_method_match.group(1).strip()
# # Extraction du nombre de balles
# bales_match = re.search(r'Total Bales\s*:\s*(\d+)', text)
# if bales_match:
# shipment_info["bales"] = int(bales_match.group(1).strip())
# return shipment_info
# def _extract_weights_info(self, text):
# """Extrait les informations de poids"""
# weights_info = {
# "gross_landed_kg": None,
# "tare_kg": None,
# "net_landed_kg": None,
# "invoice_net_kg": None,
# "gain_loss_kg": None,
# "gain_loss_percent": None
# }
# # Extraction du poids brut débarqué
# gross_landed_match = re.search(r'LANDED WEIGHTS[\s\S]*?Gross\s*:\s*([\d.]+)\s*kg', text)
# if gross_landed_match:
# weights_info["gross_landed_kg"] = float(gross_landed_match.group(1).strip())
# # Extraction du poids de tare
# tare_match = re.search(r'Tare\s*:\s*([\d.]+)\s*kg', text)
# if tare_match:
# weights_info["tare_kg"] = float(tare_match.group(1).strip())
# # Extraction du poids net débarqué
# net_landed_match = re.search(r'LANDED WEIGHTS[\s\S]*?Net\s*:\s*([\d.]+)\s*kg', text)
# if net_landed_match:
# weights_info["net_landed_kg"] = float(net_landed_match.group(1).strip())
# # Extraction du poids net facturé
# invoice_net_match = re.search(r'INVOICE WEIGHTS[\s\S]*?Net\s*:\s*([\d.]+)\s*kg', text)
# if invoice_net_match:
# weights_info["invoice_net_kg"] = float(invoice_net_match.group(1).strip())
# # Extraction de la perte en kg
# loss_match = re.search(r'LOSS\s*:\s*-\s*([\d.]+)\s*kg', text)
# if loss_match:
# weights_info["gain_loss_kg"] = -float(loss_match.group(1).strip())
# # Extraction du pourcentage de perte
# percent_match = re.search(r'Percentage\s*:\s*-\s*([\d.]+)%', text)
# if percent_match:
# weights_info["gain_loss_percent"] = -float(percent_match.group(1).strip())
# return weights_info
class IntertekParser:
    """Maps labelled lines of an Intertek landing report onto the report dict."""

    lab = "INTERTEK"

    def parse(self, text):
        """Fill an empty weight report from ``text`` and return it.

        Text fields hold whatever follows their label; numeric fields go
        through ``to_float`` and are ``None`` when missing or not parsable.
        """
        report = empty_weight_report("INTERTEK")
        # First percentage figure found anywhere in the text.
        percent_text = safe_search(r"([0-9.]+)\s*%", text)

        report["report"].update(
            reference=extract("Global Ref", text),
            file_no=extract("Report / File No", text),
            date=extract("Dated", text),
        )
        report["contract"].update(
            contract_no=extract("Contract No", text),
            invoice_no=extract("Invoice No", text),
            origin=extract("Growth", text),
            commodity="Raw Cotton",
        )
        report["parties"].update(buyer=extract("Buyer", text))
        report["shipment"].update(
            vessel=extract("Vessel", text),
            bl_no=extract("B/L No", text),
            arrival_date=extract("Arrival Date", text),
            weighing_place=extract("Weighed at", text),
            bales=to_float(extract("Invoice Quantity", text)),
        )
        report["weights"].update(
            gross_landed_kg=to_float(extract("Gross", text)),
            tare_kg=to_float(extract("Invoice Tare", text)),
            net_landed_kg=to_float(extract("Landed Weight", text)),
            invoice_net_kg=to_float(extract("Invoice Weight", text)),
            gain_loss_kg=to_float(extract("Gain", text)),
            gain_loss_percent=to_float(percent_text),
        )
        return report
class RobertsonParser:
    """Maps labelled lines of a Robertson landing report onto the report dict."""

    lab = "ROBERTSON"

    def parse(self, text):
        """Fill an empty weight report from ``text`` and return it.

        Text fields hold whatever follows their label; numeric fields go
        through ``to_float`` and are ``None`` when missing or not parsable.
        """
        report = empty_weight_report("ROBERTSON")
        # First percentage figure found anywhere in the text.
        percent_text = safe_search(r"([0-9.]+)\s*%", text)

        report["report"].update(
            reference=extract("OUR REF", text),
            date=extract("DATE", text),
        )
        report["contract"].update(
            contract_no=extract("CONTRACT NO", text),
            invoice_no=extract("INVOICE NO", text),
            lc_no=extract("LIC NO", text),
            commodity="Raw Cotton",
        )
        report["parties"].update(
            seller=extract("SELLER", text),
            buyer=extract("BUYER", text),
        )
        report["shipment"].update(
            vessel=extract("NAME OF VESSEL", text),
            port_loading=extract("SAILED FROM", text),
            port_destination=extract("ARRIVED AT", text),
            arrival_date=extract("DATE OF ARRIVAL", text),
            weighing_place=extract("PLACE OF CONTROL", text),
            bales=to_float(extract("CONSIGNMENT", text)),
        )
        report["weights"].update(
            gross_landed_kg=to_float(extract("GROSS", text)),
            tare_kg=to_float(extract("TARE", text)),
            net_landed_kg=to_float(extract("LANDED NET", text)),
            invoice_net_kg=to_float(extract("INVOICE NET", text)),
            gain_loss_kg=to_float(extract("GAIN", text)),
            gain_loss_percent=to_float(percent_text),
        )
        return report
class SGSParser:
    """Maps labelled lines of an SGS landing report onto the report dict."""

    lab = "SGS"

    def parse(self, text):
        """Fill an empty weight report from ``text`` and return it.

        Text fields hold whatever follows their label; numeric fields go
        through ``to_float`` and are ``None`` when missing or not parsable.
        """
        report = empty_weight_report("SGS")

        report["report"].update(
            reference=extract("LANDING REPORT No", text),
            file_no=extract("FILE NO.", text),
            date=extract("DATE", text),
        )
        report["contract"].update(
            contract_no=extract("CONTRACT NO.", text),
            invoice_no=extract("INVOICE NO.", text),
            origin=extract("ORIGIN", text),
            commodity=extract("PRODUCT", text),
        )
        report["parties"].update(
            seller=extract("Seller", text),
            buyer=extract("Buyer", text),
            carrier=extract("Carrier", text),
        )
        report["shipment"].update(
            bl_no=extract("B/L no.", text),
            port_loading=extract("Port of loading", text),
            port_destination=extract("Port of destination", text),
            arrival_date=extract("Vessel arrival date", text),
            weighing_place=extract("Place of weighing", text),
            weighing_method=extract("Weighing mode", text),
            bales=to_float(extract("Quantity arrived", text)),
        )
        report["weights"].update(
            gross_landed_kg=to_float(extract("Gross landed", text)),
            tare_kg=to_float(extract("Tare", text)),
            net_landed_kg=to_float(extract("Net landed", text)),
            invoice_net_kg=to_float(extract("Net invoiced", text)),
            # Only lines worded as a gain are matched by these two patterns.
            gain_loss_kg=to_float(
                safe_search(r"Gain.*?([0-9.,]+)\s*kgs", text)
            ),
            gain_loss_percent=to_float(
                safe_search(r"Gain\s*\+?\s*([0-9.,]+)\s*%", text)
            ),
        )
        return report
class PICLParser:
    """Maps labelled lines of a PICL landing report onto the report dict."""

    lab = "PICL"

    def parse(self, text):
        """Fill an empty weight report from ``text`` and return it.

        Text fields hold whatever follows their label; numeric fields go
        through ``to_float`` and are ``None`` when missing or not parsable.
        """
        # NOTE(review): labels such as "Invoice ilo & Date", "Date of Anival & LDL"
        # and "Total;" presumably mirror OCR misreads of the source documents -
        # confirm against real extractions before "correcting" them.
        report = empty_weight_report("PICL")

        report["report"].update(
            reference=safe_search(r"No[:\s]+([A-Z0-9\-]+)", text),
            # Group 2 is the date that follows the weekday name.
            date=safe_search(
                r"(Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sunday),?\s*([A-Za-z]+\s+[0-9]{1,2},\s*[0-9]{4})",
                text,
                group_index=2,
            ),
        )
        report["contract"].update(
            contract_no=extract("Contract/Pl No & Date", text),
            invoice_no=extract("Invoice ilo & Date", text),
            lc_no=extract("L/C No & Date", text),
            origin=extract("Country of Origin", text),
            commodity=extract("Commodity", text),
        )
        report["parties"].update(
            seller=extract("FAIRCOT SA", text),
            buyer=extract("M/S.", text),
            carrier=extract("Shipping Agent", text),
        )
        report["shipment"].update(
            vessel=extract("Shipped Per Vessel", text),
            bl_no=extract("B/L No & Date", text),
            port_loading=extract("Port of Loading", text),
            port_destination=extract("Port of Discharge", text),
            arrival_date=extract("Date of Anival & LDL", text),
            weighing_place=extract("Place & Date of Weighment", text),
            weighing_method=extract("Method of Weighment", text),
            bales=to_float(extract("Grand Total", text)),
        )
        report["weights"].update(
            gross_landed_kg=to_float(extract("Total;", text)),
            tare_kg=to_float(extract("Tare Weight", text)),
            net_landed_kg=to_float(extract("Grand Total", text)),
            invoice_net_kg=to_float(extract("Invoice weight", text)),
            gain_loss_kg=to_float(safe_search(r"(-[0-9.,]+)\s*KGS", text)),
            gain_loss_percent=to_float(
                safe_search(r"\(\s*([0-9.,]+)\s*o/o\s*\)", text)
            ),
        )
        return report
# Configure the root logger explicitly: INFO level, with the rotating file
# handler plus a default StreamHandler attached.
root = logging.getLogger()
root.setLevel(logging.INFO)
root.addHandler(file_handler)
root.addHandler(logging.StreamHandler())
# Module-level logger (named after this module, not the root logger itself);
# its records reach the handlers above through normal logger propagation.
logger = logging.getLogger(__name__)
app = FastAPI()
# The spaCy pipeline and the docTR OCR predictor are loaded once, at import
# time, and kept in module globals.
logger.info("Loading models...")
nlp = spacy.load("en_core_web_sm")
predictor = ocr_predictor(pretrained=True)
logger.info("Models loaded successfully.")
# =============================
# 🧠 Smart OCR
# =============================
# @app.post("/ocr")
# async def ocr(file: UploadFile):
# logger.info(f"Received OCR request: {file.filename}")
# try:
# file_data = await file.read()
# ext = file.filename.lower()
# # --------- PDF with native text ---------
# if ext.endswith(".pdf"):
# logger.info("PDF detected → Extracting native text first")
# reader = PdfReader(io.BytesIO(file_data))
# direct_text = "".join(
# page.extract_text() or "" for page in reader.pages
# )
# if direct_text.strip():
# logger.info("Native PDF text found → No OCR needed")
# return {"ocr_text": direct_text}
# # -------- Fallback: scanned PDF OCR --------
# logger.info("No native text → PDF treated as scanned → OCR")
# from pdf2image import convert_from_bytes
# images = convert_from_bytes(file_data)
# text = ""
# for i, img in enumerate(images):
# logger.info(f"OCR page {i+1}/{len(images)}")
# text += pytesseract.image_to_string(img) + "\n"
# return {"ocr_text": text}
# # --------- Image file OCR ---------
# logger.info("Image detected → Running OCR")
# img = Image.open(io.BytesIO(file_data))
# text = pytesseract.image_to_string(img)
# return {"ocr_text": text}
# except Exception as e:
# logger.error(f"OCR failed: {e}", exc_info=True)
# raise HTTPException(status_code=500, detail=str(e))
@app.post("/ocr")
async def ocr(file: UploadFile):
    """Extract the embedded text of an uploaded PDF with pdfplumber.

    Returns:
        ``{"ocr_text": <text>}`` where the text of all pages is joined with
        newlines (pages without extractable text are skipped), or
        ``{"error": <message>, "success": False}`` when the upload cannot be
        read or pdfplumber cannot open/parse it.
    """
    logger.info(f"Smart OCR request: {file.filename}")
    try:
        file_data = await file.read()
    except Exception as e:
        logger.error(f"Smart OCR failed: {e}", exc_info=True)
        return {
            "error": str(e),
            "success": False
        }

    # TODO: only embedded (digital) text is extracted; there is no image-OCR
    # fallback for scanned PDFs yet.
    try:
        with pdfplumber.open(io.BytesIO(file_data)) as pdf:
            text_parts = []
            for page in pdf.pages:
                page_text = page.extract_text(x_tolerance=2, y_tolerance=2)
                if page_text:
                    text_parts.append(page_text)
    except Exception as e:
        logger.warning(f"pdfplumber attempt: {e}")
        # Report the failure explicitly instead of falling through and
        # answering with an empty (null) body.
        return {
            "error": str(e),
            "success": False
        }
    return {"ocr_text": "\n".join(text_parts)}
# =============================
# 🧱 Structure / Layout
# =============================
@app.post("/structure")
async def structure(file: UploadFile):
    """Run the docTR predictor on an uploaded PDF or image.

    Uploads whose filename ends with ``.pdf`` are loaded as PDFs; anything
    else is decoded as an image.

    Returns:
        ``{"structure": <str(prediction)>}``.

    Raises:
        HTTPException: status 500 with the error message when loading or
            prediction fails.
    """
    logger.info(f"Received structure request: {file.filename}")
    try:
        file_data = await file.read()
        # The filename is optional on uploads; a missing name is handled as
        # a non-PDF instead of failing on ``None.lower()``.
        ext = (file.filename or "").lower()
        if ext.endswith(".pdf"):
            doc = DocumentFile.from_pdf(file_data)
            logger.info(f"Structure prediction on PDF ({len(doc)} pages)")
        else:
            img = Image.open(io.BytesIO(file_data)).convert("RGB")
            # DocumentFile.from_images takes file paths or encoded image
            # bytes, not PIL images, so the normalised RGB image is
            # re-encoded to PNG bytes first.
            png_buffer = io.BytesIO()
            img.save(png_buffer, format="PNG")
            doc = DocumentFile.from_images([png_buffer.getvalue()])
            logger.info("Structure prediction on image")
        res = predictor(doc)
        return {"structure": str(res)}
    except Exception as e:
        logger.error(f"Structure extraction failed: {e}", exc_info=True)
        raise HTTPException(status_code=500, detail=str(e))
# =============================
# 📊 Tables extraction (PDF only)
# =============================
@app.post("/tables")
async def tables(file: UploadFile):
    """Extract tables from an uploaded PDF with camelot.

    Returns:
        ``{"tables": [...]}`` with one ``DataFrame.to_dict()`` mapping per
        detected table.

    Raises:
        HTTPException: status 500 with the error message when extraction fails.
    """
    logger.info(f"Received table extraction request: {file.filename}")
    try:
        pdf_stream = io.BytesIO(await file.read())
        found = camelot.read_pdf(pdf_stream)
        logger.info(f"Found {len(found)} tables")
        payload = []
        for table in found:
            payload.append(table.df.to_dict())
        return {"tables": payload}
    except Exception as e:
        logger.error(f"Table extraction failed: {e}", exc_info=True)
        raise HTTPException(status_code=500, detail=str(e))
def safe_search(pattern, text, default=None, group_index=1, context=""):
    """Search ``text`` for ``pattern`` and return one stripped capture group.

    The search is case-insensitive and ``.`` also matches newlines.

    Args:
        pattern: Regular expression to search for.
        text: Text to search in.
        default: Value returned when nothing usable is found.
        group_index: Capture group to return.
        context: Short description used in the warning messages.

    Returns:
        The requested group with surrounding whitespace removed, or
        ``default`` (after logging a warning) when the pattern does not
        match, the group does not exist, or the group took no part in the
        match.
    """
    m = re.search(pattern, text, re.I | re.S)
    if not m:
        logger.warning("Pattern not found for %s: %s", context, pattern)
        return default
    try:
        value = m.group(group_index)
    except IndexError:
        logger.warning("Group index %d not found for %s: %s", group_index, context, pattern)
        return default
    if value is None:
        # An optional/alternative group that did not participate yields None,
        # which has no .strip(); treat it like a missing group.
        logger.warning("Group index %d did not participate for %s: %s", group_index, context, pattern)
        return default
    return value.strip()
def to_float(s):
    """Convert a numeric string with optional unit decoration to ``float``.

    Thousands separators (``,``), percent signs and ``kg``/``kgs``/``lb``/
    ``lbs`` unit suffixes in any letter case are removed before conversion.

    Args:
        s: Value to convert; non-string values are converted with ``str``.

    Returns:
        The parsed float, or ``None`` when ``s`` is falsy or the remaining
        text is not a valid number.
    """
    if not s:
        return None
    cleaned = re.sub(r"kgs?|lbs?|[,%]", "", str(s), flags=re.I).strip()
    try:
        return float(cleaned)
    except ValueError:
        return None
def section(text, start, end=None):
    """Return the text found after heading ``start``, up to heading ``end``.

    Both headings are matched literally and case-insensitively, and neither
    is part of the result. Without ``end`` the rest of the text is returned.
    A warning is logged and ``""`` returned when the headings are not found.
    """
    tail = r"(.*?)" + re.escape(end) if end else r"(.*)"
    found = re.search(re.escape(start) + tail, text, re.S | re.I)
    if found is None:
        logger.warning("Section not found: start='%s', end='%s'", start, end)
        return ""
    return found.group(1).strip()
def extract_field(text, label, default=None):
    """Return the value that follows ``label`` (optionally ``label:``) in ``text``.

    The label must be followed by whitespace. Because that whitespace may
    include a line break, the value can come from the line after the label.
    The lookup itself is delegated to ``safe_search``.
    """
    escaped_label = re.escape(label)
    line_pattern = escaped_label + r"\s*:?[\s]+([^\n]+)"
    return safe_search(
        line_pattern,
        text,
        default=default,
        context=f"field '{label}'",
    )
def extract(label, text, default=None):
    """Return the value that follows ``label`` in OCR/PDF text.

    The label is matched literally and case-insensitively. Supported layouts:

        Label: Value
        Label - Value
        Label Value
        Label .... Value

    Args:
        label: Field label to look for.
        text: Text to search; ``None`` or ``""`` yields ``default``.
        default: Value returned when the label is not found.

    Returns:
        The rest of the line after the label and its separator, stripped of
        surrounding whitespace and of leading dot leaders, or ``default``.
    """
    if not text:
        return default
    m = re.search(rf"{re.escape(label)}\s*[:\-]?\s*([^\n\r]+)", text, re.I)
    if not m:
        return default
    value = m.group(1).strip()
    # Drop dot leaders ("....", ". . .") and a colon that may follow them, so
    # the "Label .... Value" layout yields just the value.
    return re.sub(r"^(?:\.\s*){2,}:?\s*", "", value)
def extract_report_metadata(text):
    """Extract AHK landing-report metadata from plain text.

    The text is cut into sections by their headings and each field is read
    as a "Label: value" line from the relevant section.

    Args:
        text: Full text of the report.

    Returns:
        A dict with the keys ``report``, ``parties``, ``shipment``,
        ``weights`` (``invoice``, ``landed``, ``loss``, ``averages``) and
        ``signature``. Field values are strings, or ``None`` when not found.

    Raises:
        HTTPException: status 500 when an unexpected error occurs.
    """
    logger.info("Starting metadata extraction, text length=%d", len(text))
    try:
        # ----------- SECTIONS -----------
        # section() returns the text AFTER the start heading, so the heading
        # itself can no longer be looked up inside the returned block.
        order_details = section(text, "Order details", "Weights")
        invoice_section = section(text, "INVOICE WEIGHTS", "Bales Weighed")
        landed_section = section(text, "Bales Weighed", "Outturn")
        loss_section = section(text, "LOSS", "Invoice average")
        avg_section = section(text, "Invoice average", "Comments")
        signature_block = section(text, "Signed on")
        # ----------- TOP INFO -----------
        top_info = {
            "produced_on": extract_field(text, "Produced On"),
            "printed_date": extract_field(text, "Printed Date"),
            "client_reference": extract_field(text, "Client Reference"),
            "report_number": safe_search(r"(AHK\S+)", text, default="", context="report_number", group_index=1),
        }
        # ----------- ORDER DETAILS -----------
        parties = {
            "client": extract_field(order_details, "Client"),
            "client_ref_no": extract_field(order_details, "Client Ref No"),
            "buyer": extract_field(order_details, "Buyer"),
            "destination": extract_field(order_details, "Destination"),
        }
        shipment = {
            "total_bales": extract_field(order_details, "Total Bales"),
            "vessel": extract_field(order_details, "Vessel"),
            "voyage_no": extract_field(order_details, "Voy. No"),
            "bl_no": extract_field(order_details, "B/L No"),
            "bl_date": extract_field(order_details, "B/L Date"),
            "growth": extract_field(order_details, "Growth"),
            "arrival_date": extract_field(order_details, "Arrival Date"),
            "first_weighing_date": extract_field(order_details, "First date of weighing"),
            "last_weighing_date": extract_field(order_details, "Last Date of Weighing"),
            "weighing_method": extract_field(order_details, "Weighing method"),
            "tare_basis": extract_field(order_details, "Tare"),
        }
        # ----------- INVOICE SECTION -----------
        invoice = {
            "bales": extract_field(invoice_section, "Bales"),
            "gross": extract_field(invoice_section, "Gross"),
            "tare": extract_field(invoice_section, "Tare"),
            "net": extract_field(invoice_section, "Net"),
        }
        # ----------- LANDED SECTION -----------
        # NOTE(review): this section starts after the "Bales Weighed" heading,
        # so "bales" is only found if another "Bales" label follows - confirm
        # against a real report.
        landed = {
            "bales": extract_field(landed_section, "Bales"),
            "gross": extract_field(landed_section, "Gross"),
            "tare": extract_field(landed_section, "Tare"),
            "net": extract_field(landed_section, "Net"),
        }
        # ----------- LOSS SECTION -----------
        loss = {
            "kg": extract_field(loss_section, "kg"),
            "lb": extract_field(loss_section, "lb"),
            "percent": extract_field(loss_section, "Percentage"),
        }
        # ----------- AVERAGES SECTION -----------
        averages = {
            # "Invoice average" is the heading that opens avg_section and is
            # therefore not contained in it; read it from the full text.
            "invoice_gross_per_bale": extract_field(text, "Invoice average"),
            "landed_gross_per_bale": extract_field(avg_section, "Landed average"),
        }
        # ----------- SIGNATURE -----------
        signature = {
            # Same reason: "Signed on" opens signature_block, so the label is
            # only present in the full text.
            "signed_on": extract_field(text, "Signed on"),
            "signed_by": safe_search(r"\n([A-Za-z ]+)\nClient Services", signature_block, default="", context="signed_by"),
            "role": "Client Services Coordinator",
            "company": "Alfred H. Knight International Limited"
        }
        logger.info("Metadata extraction completed successfully")
        return {
            "report": top_info,
            "parties": parties,
            "shipment": shipment,
            "weights": {
                "invoice": invoice,
                "landed": landed,
                "loss": loss,
                "averages": averages
            },
            "signature": signature
        }
    except Exception as e:
        logger.exception("Unexpected error during metadata extraction")
        raise HTTPException(status_code=500, detail=f"Metadata extraction failed: {e}")
def detect_template(text):
    """Identify which laboratory issued a report from marker phrases.

    Matching is case-insensitive and the rules are tried in a fixed order;
    the first one that applies wins.

    Returns:
        ``"AHK"``, ``"INTERTEK"``, ``"ROBERTSON"``, ``"SGS"``, ``"PICL"`` or
        ``"UNKNOWN"`` when no rule applies.
    """
    haystack = text.lower()

    def has(*needles):
        # True when every needle occurs somewhere in the lowercased text.
        return all(needle in haystack for needle in needles)

    if has("alfred h. knight", "cotton landing report"):
        return "AHK"
    if has("intertek", "landing report"):
        return "INTERTEK"
    if has("robertson international") or has("ri ref no"):
        return "ROBERTSON"
    if has("landing report", "carcon cargo"):
        return "SGS"
    if has("pacific inspection company") or has("picl-bd.com"):
        return "PICL"
    return "UNKNOWN"
@app.post("/metadata")
async def metadata(text: str = Body(..., embed=True)):
    """Return the metadata extracted from the ``text`` field of the JSON body."""
    extracted = extract_report_metadata(text)
    return extracted
@app.post("/parse")
async def parse_endpoint(text: str = Body(..., embed=True)):
    """Parse the ``text`` field of the JSON body with the matching lab parser."""
    parsed = parse_report(text)
    return parsed
# One parser instance per laboratory, keyed by each parser's ``lab`` name.
PARSERS = {
    parser_cls.lab: parser_cls()
    for parser_cls in (
        AHKParser,
        IntertekParser,
        RobertsonParser,
        SGSParser,
        PICLParser,
    )
}
def empty_weight_report(lab):
    """Return a fresh weight-report skeleton for ``lab`` with every field ``None``."""
    layout = {
        "report": ("reference", "file_no", "date"),
        "contract": ("contract_no", "invoice_no", "lc_no", "origin", "commodity"),
        "parties": ("seller", "buyer", "carrier"),
        "shipment": (
            "vessel", "bl_no", "bl_date", "port_loading",
            "port_destination", "arrival_date",
            "weighing_place", "weighing_method",
            "bales",
        ),
        "weights": (
            "gross_landed_kg", "tare_kg",
            "net_landed_kg", "invoice_net_kg",
            "gain_loss_kg", "gain_loss_percent",
        ),
    }
    skeleton = {"lab": lab}
    for section_name, field_names in layout.items():
        # A new dict per section, so callers can fill it in place.
        skeleton[section_name] = dict.fromkeys(field_names)
    return skeleton
def parse_report(text):
    """Detect the report template and parse ``text`` with the matching parser.

    Returns ``{"template": "UNKNOWN"}`` when no parser is registered for the
    detected template.
    """
    parser = PARSERS.get(detect_template(text))
    if parser is None:
        return {"template": "UNKNOWN"}
    return parser.parse(text)