Files
automation-service/app.py
2026-01-13 16:00:29 +01:00

888 lines
33 KiB
Python

from fastapi import FastAPI, UploadFile, HTTPException, Body
from PIL import Image
import pytesseract
from doctr.models import ocr_predictor
from doctr.io import DocumentFile
from PyPDF2 import PdfReader
import pdfplumber
import camelot
import spacy
import logging
import io
from logging.handlers import RotatingFileHandler
import re
from datetime import datetime
from io import BytesIO
import requests
# Rotating log file: 10 MiB per file, keep 5 rotated backups.
LOG_PATH = "/var/log/automation-service.log"
_LOG_FORMAT = "%(asctime)s - %(levelname)s - %(name)s - %(message)s"

file_handler = RotatingFileHandler(
    LOG_PATH, maxBytes=10 * 1024 * 1024, backupCount=5, encoding="utf-8"
)
file_handler.setFormatter(logging.Formatter(_LOG_FORMAT))
class AHKParser:
    """Parser for Alfred H. Knight (AHK) cotton landing reports.

    parse() runs a set of targeted regexes over the raw report text and
    returns a nested dict with report / contract / parties / shipment /
    weights sub-sections. Fields that cannot be found stay None.
    """

    lab = "AHK"

    def _clean_value(self, value):
        """Strip surrounding whitespace; falsy values pass through unchanged."""
        if value:
            return value.strip()
        return value

    def _to_float(self, raw):
        """Parse a matched number string (thousands commas allowed) to float.

        Returns None when the string is not a valid number, mirroring the
        previously duplicated inline try/except blocks this helper replaces.
        """
        try:
            return float(raw.replace(',', '').strip())
        except ValueError:
            return None

    def parse(self, text):
        """Parse the report text and return the structured result dict."""
        result = {
            "lab": self.lab,
            "report": self._extract_report_info(text),
            "contract": self._extract_contract_info(text),
            "parties": self._extract_parties_info(text),
            "shipment": self._extract_shipment_info(text),
            "weights": self._extract_weights_info(text)
        }
        # Kept for backward compatibility: callers may read .data afterwards.
        self.data = result
        return result

    def _extract_report_info(self, text):
        """Extract report-level identifiers (reference, file number, date)."""
        report_info = {
            "reference": None,
            "file_no": None,
            "date": None
        }
        # Client reference, e.g. "S-3488 / INV 4013"
        ref_match = re.search(r'Client\s+Reference:\s*(S-\d+\s*/\s*INV\s*\d+)', text)
        if ref_match:
            report_info["reference"] = self._clean_value(ref_match.group(1))
        # AHK file number, e.g. "AHK S/2024/001" -> "2024/001"
        file_no_match = re.search(r'AHK\s+S/([\w/]+)', text)
        if file_no_match:
            report_info["file_no"] = self._clean_value(file_no_match.group(1))
        # Report signature date, e.g. "Signed on 05-Jan-2024"
        date_match = re.search(r'Signed\s+on\s*(\d{1,2}-[A-Za-z]{3}-\d{4})', text)
        if date_match:
            report_info["date"] = self._clean_value(date_match.group(1))
        return report_info

    def _extract_contract_info(self, text):
        """Extract contract number, invoice number, origin and commodity."""
        contract_info = {
            "contract_no": None,
            "invoice_no": None,
            "lc_no": None,
            "origin": None,
            "commodity": None
        }
        # Client reference line, e.g. "S-3488 / INV 4013"
        ref_match = re.search(r'Client\s+Ref\s+No\.\s*:\s*([^\n]+)', text)
        if ref_match:
            ref_text = ref_match.group(1).strip()
            # BUG FIX: the old split on '[/\s]+' truncated the invoice number
            # to just "INV". Split on '/' only so "INV 4013" survives intact.
            parts = [p.strip() for p in ref_text.split('/')]
            for part in parts:
                if part.startswith('S-'):
                    contract_info["contract_no"] = part
                elif part.startswith('INV'):
                    contract_info["invoice_no"] = part
        # Origin / commodity derived from the "Growth" field
        growth_match = re.search(r'Growth\s*:\s*([A-Z\s]+?)(?=\s*(?:Vessel|$))', text)
        if growth_match:
            origin_text = growth_match.group(1).strip()
            if "AUSTRALIAN" in origin_text.upper():
                contract_info["origin"] = "AUSTRALIA"
                contract_info["commodity"] = "RAW COTTON"
        return contract_info

    def _extract_parties_info(self, text):
        """Extract seller (client), buyer and carrier (vessel name)."""
        parties_info = {
            "seller": None,
            "buyer": None,
            "carrier": None
        }
        # Seller = "Client :" line (stops before the next label)
        seller_match = re.search(r'Client\s*:\s*([^\n:]+?)(?=\s*(?:Client Ref|Buyer|$))', text)
        if seller_match:
            parties_info["seller"] = self._clean_value(seller_match.group(1))
        # Buyer (stops before the "Total Bales" label)
        buyer_match = re.search(r'Buyer\s*:\s*([^\n:]+?)(?=\s*(?:Total Bales|$))', text)
        if buyer_match:
            parties_info["buyer"] = self._clean_value(buyer_match.group(1))
        # Carrier = vessel name only
        vessel_match = re.search(r'Vessel\s*:\s*([A-Z\s]+?)(?=\s*(?:Arrival|Voy|$))', text)
        if vessel_match:
            parties_info["carrier"] = self._clean_value(vessel_match.group(1))
        return parties_info

    def _extract_shipment_info(self, text):
        """Extract shipment data (vessel, B/L, ports, dates, bale count)."""
        shipment_info = {
            "vessel": None,
            "bl_no": None,
            "bl_date": None,
            "port_loading": None,
            "port_destination": None,
            "arrival_date": None,
            "weighing_place": None,
            "weighing_method": None,
            "bales": None
        }
        # Vessel name only
        vessel_match = re.search(r'Vessel\s*:\s*([A-Z\s]+?)(?=\s*(?:Arrival|Voy|$))', text)
        if vessel_match:
            shipment_info["vessel"] = self._clean_value(vessel_match.group(1))
        # Bill-of-lading number (just the alphanumeric code)
        bl_no_match = re.search(r'B/L\s+No\.\s*:\s*([A-Z0-9]+)(?=\s|$)', text)
        if bl_no_match:
            shipment_info["bl_no"] = self._clean_value(bl_no_match.group(1))
        # Bill-of-lading date
        bl_date_match = re.search(r'B/L\s+Date\s*:\s*(\d{1,2}-[A-Za-z]{3}-\d{4})(?=\s|$)', text)
        if bl_date_match:
            shipment_info["bl_date"] = self._clean_value(bl_date_match.group(1))
        # Destination port (stops before the "Tare" label)
        dest_match = re.search(r'Destination\s*:\s*([A-Z,\s]+?)(?=\s*(?:Tare|$))', text)
        if dest_match:
            shipment_info["port_destination"] = self._clean_value(dest_match.group(1))
        # Arrival date
        arrival_match = re.search(r'Arrival\s+Date\s*:\s*(\d{1,2}-[A-Za-z]{3}-\d{4})(?=\s|$)', text)
        if arrival_match:
            shipment_info["arrival_date"] = self._clean_value(arrival_match.group(1))
        # Weighing method
        weighing_method_match = re.search(r'Weighing\s+method\s*:\s*([^\n]+?)(?=\s*(?:Tare|$))', text)
        if weighing_method_match:
            shipment_info["weighing_method"] = self._clean_value(weighing_method_match.group(1))
        # Number of bales
        bales_match = re.search(r'Total\s+Bales\s*:\s*(\d+)(?=\s|$)', text)
        if bales_match:
            try:
                shipment_info["bales"] = int(bales_match.group(1).strip())
            except ValueError:
                shipment_info["bales"] = None
        return shipment_info

    def _extract_weights_info(self, text):
        """Extract landed/invoice weights and the gain/loss figures (kg)."""
        weights_info = {
            "gross_landed_kg": None,
            "tare_kg": None,
            "net_landed_kg": None,
            "invoice_net_kg": None,
            "gain_loss_kg": None,
            "gain_loss_percent": None
        }
        # Gross landed weight (from the LANDED WEIGHTS section)
        gross_landed_match = re.search(r'LANDED WEIGHTS[\s\S]*?Gross\s*:\s*([\d.,]+)\s*kg', text)
        if gross_landed_match:
            weights_info["gross_landed_kg"] = self._to_float(gross_landed_match.group(1))
        # Tare weight (first "Tare : N kg" occurrence)
        tare_match = re.search(r'Tare\s*:\s*([\d.,]+)\s*kg', text)
        if tare_match:
            weights_info["tare_kg"] = self._to_float(tare_match.group(1))
        # Net landed weight (from the LANDED WEIGHTS section)
        net_landed_match = re.search(r'LANDED WEIGHTS[\s\S]*?Net\s*:\s*([\d.,]+)\s*kg', text)
        if net_landed_match:
            weights_info["net_landed_kg"] = self._to_float(net_landed_match.group(1))
        # Invoice net weight (from the INVOICE WEIGHTS section)
        invoice_net_match = re.search(r'INVOICE WEIGHTS[\s\S]*?Net\s*:\s*([\d.,]+)\s*kg', text)
        if invoice_net_match:
            weights_info["invoice_net_kg"] = self._to_float(invoice_net_match.group(1))
        # Loss in kg (reported as a positive number, stored negative)
        loss_match = re.search(r'LOSS\s*:\s*-\s*([\d.,]+)\s*kg', text)
        if loss_match:
            loss_kg = self._to_float(loss_match.group(1))
            if loss_kg is not None:
                weights_info["gain_loss_kg"] = -loss_kg
        # Loss percentage (reported positive, stored negative)
        percent_match = re.search(r'Percentage\s*:\s*-\s*([\d.,]+)%', text)
        if percent_match:
            loss_pct = self._to_float(percent_match.group(1))
            if loss_pct is not None:
                weights_info["gain_loss_percent"] = -loss_pct
        return weights_info
import re
class IntertekParser:
    """Parser for Intertek landing reports.

    Same output contract as AHKParser.parse(): a nested dict with report /
    contract / parties / shipment / weights sub-sections, regex-extracted
    from the raw report text. Missing fields stay None.
    """

    lab = "Intertek"

    def _clean_value(self, value):
        """Strip surrounding whitespace; falsy values pass through unchanged."""
        if value:
            return value.strip()
        return value

    def _extract_number(self, text, pattern, is_int=False):
        """Extract group 1 of *pattern* from *text* as int or float.

        Commas and spaces are removed before conversion; returns None when
        the pattern does not match or the value is not numeric.
        """
        match = re.search(pattern, text)
        if match:
            try:
                num_str = match.group(1).replace(',', '').replace(' ', '').strip()
                if is_int:
                    return int(num_str)
                else:
                    return float(num_str)
            except (ValueError, AttributeError):
                return None
        return None

    def parse(self, text):
        """Parse the report text and return the structured result dict."""
        result = {
            "lab": self.lab,
            "report": self._extract_report_info(text),
            "contract": self._extract_contract_info(text),
            "parties": self._extract_parties_info(text),
            "shipment": self._extract_shipment_info(text),
            "weights": self._extract_weights_info(text)
        }
        return result

    def _extract_report_info(self, text):
        """Extract report-level identifiers (global ref, file number, date)."""
        report_info = {
            "reference": None,
            "file_no": None,
            "date": None
        }
        # Global reference, e.g. "GLO-12345-GR"
        ref_match = re.search(r'Global Ref\s*:\s*(GLO-\d+-[A-Z]+)', text)
        if ref_match:
            report_info["reference"] = self._clean_value(ref_match.group(1))
        # Report / file number
        file_no_match = re.search(r'Report\s*/\s*File No\s*:\s*([A-Z]+-AGR\d+-?)', text)
        if file_no_match:
            report_info["file_no"] = self._clean_value(file_no_match.group(1))
        # Report date in dotted format, e.g. "15.1.2024"
        date_match = re.search(r'Dated\s*:\s*(\d{1,2}\.\d{1,2}\.\d{4})', text)
        if date_match:
            report_info["date"] = self._clean_value(date_match.group(1))
        return report_info

    def _extract_contract_info(self, text):
        """Extract contract/invoice numbers plus origin and commodity."""
        contract_info = {
            "contract_no": None,
            "invoice_no": None,
            "lc_no": None,  # not present in this report type
            "origin": None,
            "commodity": None
        }
        # Contract number, e.g. "S-3488"
        contract_match = re.search(r'Contract No\s*:\s*([A-Z]?-\d+)', text)
        if contract_match:
            contract_info["contract_no"] = self._clean_value(contract_match.group(1))
        # Invoice number (digits only)
        invoice_match = re.search(r'Invoice No\s*:\s*(\d+)', text)
        if invoice_match:
            contract_info["invoice_no"] = self._clean_value(invoice_match.group(1))
        # Origin / commodity derived from the "Growth" field
        growth_match = re.search(r'Growth\s*:\s*([A-Z\s]+)(?=\s*Shipper|\n|$)', text)
        if growth_match:
            origin_text = growth_match.group(1).strip()
            if "GREECE" in origin_text.upper():
                contract_info["origin"] = "GREECE"
                contract_info["commodity"] = "RAW COTTON"
        return contract_info

    def _extract_parties_info(self, text):
        """Extract seller (shipper), buyer and carrier (vessel name)."""
        parties_info = {
            "seller": None,
            "buyer": None,
            "carrier": None
        }
        # Seller = "Shipper :" line (stops before the "Buyer" label)
        seller_match = re.search(r'Shipper\s*:\s*([^\n]+?)(?=\s*(?:Buyer|$))', text)
        if seller_match:
            parties_info["seller"] = self._clean_value(seller_match.group(1))
        # Buyer (stops before the container/total table)
        buyer_match = re.search(r'Buyer\s*:\s*([^\n]+?)(?=\s*(?:CONTAINER|TOTAL|$))', text)
        if buyer_match:
            parties_info["buyer"] = self._clean_value(buyer_match.group(1))
        # Carrier = vessel name only
        vessel_match = re.search(r'Vessel\s*:\s*([A-Z\s]+?)(?=\s*(?:Arrival|$))', text)
        if vessel_match:
            parties_info["carrier"] = self._clean_value(vessel_match.group(1))
        return parties_info

    def _extract_shipment_info(self, text):
        """Extract shipment data (vessel, B/L, dates, weighing info, bales)."""
        shipment_info = {
            "vessel": None,
            "bl_no": None,
            "bl_date": None,           # not present in this report type
            "port_loading": None,      # not present in this report type
            "port_destination": None,  # not present in this report type
            "arrival_date": None,
            "weighing_place": None,
            "weighing_method": None,
            "bales": None
        }
        # Vessel name
        vessel_match = re.search(r'Vessel\s*:\s*([A-Z\s]+?)(?=\s*(?:Arrival|$))', text)
        if vessel_match:
            shipment_info["vessel"] = self._clean_value(vessel_match.group(1))
        # Bill-of-lading number
        bl_no_match = re.search(r'B/L\s+No\.\s*:\s*([A-Z0-9]+)', text)
        if bl_no_match:
            shipment_info["bl_no"] = self._clean_value(bl_no_match.group(1))
        # Arrival date in dotted format
        arrival_match = re.search(r'Arrival Date\s*:\s*(\d{1,2}\.\d{1,2}\.\d{4})', text)
        if arrival_match:
            shipment_info["arrival_date"] = self._clean_value(arrival_match.group(1))
        # Weighing place
        weighing_place_match = re.search(r'Weighed at\s*:\s*([^\n]+?)(?=\s*(?:Vessel|$))', text)
        if weighing_place_match:
            shipment_info["weighing_place"] = self._clean_value(weighing_place_match.group(1))
        # Weighing method is only described in the REMARKS section.
        remarks_section = re.search(r'REMARKS\s*(.+?)(?=ISSUED BY|$)', text, re.DOTALL | re.IGNORECASE)
        if remarks_section:
            remarks_text = remarks_section.group(1)
            if "weighbridge" in remarks_text.lower():
                shipment_info["weighing_method"] = "Weighbridge weighing by empty/full truck"
        # Bale count: prefer the TOTAL row of the container table, then fall
        # back to the "Invoice Quantity" line.
        bales_match = re.search(r'TOTAL\s+(\d{1,4}(?:,\d{3})?)\s+[\d,]+\.\d{2}', text)
        if not bales_match:
            bales_match = re.search(r'Invoice Quantity\s*:\s*(\d+)\s+Bales', text)
        if bales_match:
            try:
                bales_str = bales_match.group(1).replace(',', '').strip()
                shipment_info["bales"] = int(bales_str)
            except ValueError:
                shipment_info["bales"] = None
        return shipment_info

    def _extract_weights_info(self, text):
        """Extract landed/invoice weights and the gain figures (kg).

        CONSISTENCY FIX: all values are now routed through _extract_number(),
        which was previously defined but unused while the identical parsing
        logic was duplicated inline for each field.
        """
        weights_info = {
            "gross_landed_kg": self._extract_number(text, r'Gross Landed Weight\s*:\s*([\d,]+\.\d{2})\s*kgs'),
            "tare_kg": self._extract_number(text, r'Invoice Tare\s*:\s*([\d,]+\.\d{2})\s*Kgs'),
            "net_landed_kg": self._extract_number(text, r'Net Landed Weight\s*:\s*([\d,]+\.\d{2})\s*Kgs'),
            "invoice_net_kg": self._extract_number(text, r'Net Invoice Weight\s*:\s*([\d,]+\.\d{2})\s*Kgs'),
            "gain_loss_kg": self._extract_number(text, r'Gain\s+([\d,]+\.\d{2})\s*Kgs'),
            # Percentage comes from the TOTAL row of the table (e.g. "0.4%")
            "gain_loss_percent": self._extract_number(text, r'TOTAL\s+\d+\s+[\d,]+\.\d{2}\s+([\d.]+)%'),
        }
        return weights_info
# Configure root logger explicitly
root = logging.getLogger()
root.setLevel(logging.INFO)
root.addHandler(file_handler)
# Also echo log records to the console alongside the rotating file.
root.addHandler(logging.StreamHandler())
# Use root logger for your app
logger = logging.getLogger(__name__)
# FastAPI application instance used by every route decorator below.
app = FastAPI()
# Load the spaCy and doctr models once at import time so individual
# requests do not pay the (slow) model-loading cost.
logger.info("Loading models...")
nlp = spacy.load("en_core_web_sm")
predictor = ocr_predictor(pretrained=True)
logger.info("Models loaded successfully.")
# =============================
# 🧠 Smart OCR
# =============================
# @app.post("/ocr")
# async def ocr(file: UploadFile):
# logger.info(f"Received OCR request: {file.filename}")
# try:
# file_data = await file.read()
# ext = file.filename.lower()
# # --------- PDF with native text ---------
# if ext.endswith(".pdf"):
# logger.info("PDF detected → Extracting native text first")
# reader = PdfReader(io.BytesIO(file_data))
# direct_text = "".join(
# page.extract_text() or "" for page in reader.pages
# )
# if direct_text.strip():
# logger.info("Native PDF text found → No OCR needed")
# return {"ocr_text": direct_text}
# # -------- Fallback: scanned PDF OCR --------
# logger.info("No native text → PDF treated as scanned → OCR")
# from pdf2image import convert_from_bytes
# images = convert_from_bytes(file_data)
# text = ""
# for i, img in enumerate(images):
# logger.info(f"OCR page {i+1}/{len(images)}")
# text += pytesseract.image_to_string(img) + "\n"
# return {"ocr_text": text}
# # --------- Image file OCR ---------
# logger.info("Image detected → Running OCR")
# img = Image.open(io.BytesIO(file_data))
# text = pytesseract.image_to_string(img)
# return {"ocr_text": text}
# except Exception as e:
# logger.error(f"OCR failed: {e}", exc_info=True)
# raise HTTPException(status_code=500, detail=str(e))
@app.post("/ocr")
async def ocr(file: UploadFile):
    """
    Smart PDF text extraction optimized for cotton landing reports.

    Uses pdfplumber, which works well on digital PDFs with a native text
    layer, and returns {"ocr_text": ...}. Any failure returns a JSON
    error payload {"error": ..., "success": False}.

    BUG FIX: previously, when pdfplumber raised, the function logged a
    warning and then fell off the end, silently returning None (null to
    the client). The failure is now reported explicitly. The per-page
    extract_tables() scan whose results were collected but never used has
    been dropped (dead work), along with large commented-out code blocks.
    """
    logger.info(f"Smart OCR request: {file.filename}")
    try:
        file_data = await file.read()
        try:
            with pdfplumber.open(io.BytesIO(file_data)) as pdf:
                text_parts = []
                for page in pdf.pages:
                    # Small tolerances merge characters that sit on almost
                    # the same baseline/column in the PDF layout.
                    page_text = page.extract_text(x_tolerance=2, y_tolerance=2)
                    if page_text:
                        text_parts.append(page_text)
                combined_text = "\n".join(text_parts)
                return {"ocr_text": combined_text}
        except Exception as e:
            logger.warning(f"pdfplumber attempt: {e}")
            return {"error": str(e), "success": False}
    except Exception as e:
        logger.error(f"Smart OCR failed: {e}", exc_info=True)
        return {
            "error": str(e),
            "success": False
        }
# =============================
# 🧱 Structure / Layout
# =============================
@app.post("/structure")
async def structure(file: UploadFile):
    """Run doctr layout/structure prediction on an uploaded PDF or image."""
    logger.info(f"Received structure request: {file.filename}")
    try:
        payload = await file.read()
        if file.filename.lower().endswith(".pdf"):
            doc = DocumentFile.from_pdf(payload)
            logger.info(f"Structure prediction on PDF ({len(doc)} pages)")
        else:
            img = Image.open(io.BytesIO(payload)).convert("RGB")
            doc = DocumentFile.from_images([img])
            logger.info("Structure prediction on image")
        return {"structure": str(predictor(doc))}
    except Exception as e:
        logger.error(f"Structure extraction failed: {e}", exc_info=True)
        raise HTTPException(status_code=500, detail=str(e))
# =============================
# 📊 Tables extraction (PDF only)
# =============================
@app.post("/tables")
async def tables(file: UploadFile):
    """
    Extract tables from an uploaded PDF using camelot.

    BUG FIX: camelot.read_pdf() expects a filesystem path (or URL), not a
    file-like object, so passing io.BytesIO always failed. The upload is
    now spooled to a temporary .pdf file, which is removed afterwards.
    """
    logger.info(f"Received table extraction request: {file.filename}")
    try:
        # Local imports keep this fix self-contained.
        import os
        import tempfile

        file_data = await file.read()
        tmp_path = None
        try:
            with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp:
                tmp.write(file_data)
                tmp_path = tmp.name
            tables = camelot.read_pdf(tmp_path)
            logger.info(f"Found {len(tables)} tables")
            return {"tables": [t.df.to_dict() for t in tables]}
        finally:
            # Always clean up the temp file, even when camelot raises.
            if tmp_path:
                os.unlink(tmp_path)
    except Exception as e:
        logger.error(f"Table extraction failed: {e}", exc_info=True)
        raise HTTPException(status_code=500, detail=str(e))
def safe_search(pattern, text, default=None, group_index=1, context=""):
    """Case-insensitive, dotall regex search that logs instead of raising.

    Returns the stripped contents of *group_index*, or *default* when the
    pattern does not match or the group does not exist.
    """
    match = re.search(pattern, text, re.I | re.S)
    if match:
        try:
            return match.group(group_index).strip()
        except IndexError:
            logger.warning("Group index %d not found for %s: %s", group_index, context, pattern)
            return default
    logger.warning("Pattern not found for %s: %s", context, pattern)
    return default
def to_float(s):
    """Parse a report number string such as "1,234.56 Kgs" into a float.

    Strips thousands separators and common unit suffixes (Kgs/kg, lbs/LBS,
    %) before conversion. Returns None for falsy input or anything that
    still is not a valid number.

    BUG FIX: the bare ``except:`` (which also swallowed KeyboardInterrupt
    and SystemExit) is narrowed to ValueError — the only error float() can
    raise for a str input.
    """
    if not s:
        return None
    # Same removal order as before: separators first, then unit suffixes.
    for token in (",", "Kgs", "kg", "%", "lbs", "LBS"):
        s = s.replace(token, "")
    s = s.strip()
    try:
        return float(s)
    except ValueError:
        return None
def section(text, start, end=None):
    """Return the stripped text between two headings ("" when not found).

    With *end* omitted, everything after *start* is returned. Matching is
    case-insensitive and spans newlines; headings are escaped literally.
    """
    head = re.escape(start)
    tail = r"(.*?)" + re.escape(end) if end else r"(.*)"
    match = re.compile(head + tail, re.S | re.I).search(text)
    if match is None:
        logger.warning("Section not found: start='%s', end='%s'", start, end)
        return ""
    return match.group(1).strip()
def extract_field(text, label, default=None):
    """Extract the value of a 'Label: value' line (colon optional).

    Delegates to safe_search(), so a missing field is logged and *default*
    is returned instead of raising.
    """
    escaped = re.escape(label)
    return safe_search(
        rf"{escaped}\s*:?[\s]+([^\n]+)",
        text,
        default=default,
        context=f"field '{label}'",
    )
def extract(label, text, default=None):
    """
    Robust label/value extraction for OCR/PDF text.

    Handles all of:
        Label: Value
        Label Value
        Label .... Value

    Returns the stripped value, or *default* when *text* is falsy or the
    label is not found. Matching is case-insensitive.

    CLEANUP: the former second fallback pattern (label followed by
    whitespace) was dead code — every string it matched was already
    matched by the primary pattern, whose separator part
    ``\\s*[:\\-]?\\s*`` accepts plain whitespace too.
    """
    if not text:
        return default
    m = re.search(rf"{re.escape(label)}\s*[:\-]?\s*([^\n\r]+)", text, re.I)
    if m:
        return m.group(1).strip()
    return default
def extract_report_metadata(text):
    """
    Extract structured metadata from an AHK-style landing report text.

    The text is first cut into labelled sections with section(), then
    individual 'Label: value' fields are pulled from each. Values are raw
    strings (or the extractor default when missing).

    Raises:
        HTTPException: 500 on any unexpected failure.
    """
    logger.info("Starting metadata extraction, text length=%d", len(text))
    try:
        # ----------- SECTIONS -----------
        order_details = section(text, "Order details", "Weights")
        invoice_section = section(text, "INVOICE WEIGHTS", "Bales Weighed")
        landed_section = section(text, "Bales Weighed", "Outturn")
        loss_section = section(text, "LOSS", "Invoice average")
        avg_section = section(text, "Invoice average", "Comments")
        signature_block = section(text, "Signed on")
        # ----------- TOP INFO -----------
        top_info = {
            "produced_on": extract_field(text, "Produced On"),
            "printed_date": extract_field(text, "Printed Date"),
            "client_reference": extract_field(text, "Client Reference"),
            "report_number": safe_search(r"(AHK\S+)", text, default="", context="report_number", group_index=1),
        }
        # ----------- ORDER DETAILS -----------
        parties = {
            "client": extract_field(order_details, "Client"),
            "client_ref_no": extract_field(order_details, "Client Ref No"),
            "buyer": extract_field(order_details, "Buyer"),
            "destination": extract_field(order_details, "Destination"),
        }
        shipment = {
            "total_bales": extract_field(order_details, "Total Bales"),
            "vessel": extract_field(order_details, "Vessel"),
            "voyage_no": extract_field(order_details, "Voy. No"),
            "bl_no": extract_field(order_details, "B/L No"),
            "bl_date": extract_field(order_details, "B/L Date"),
            "growth": extract_field(order_details, "Growth"),
            "arrival_date": extract_field(order_details, "Arrival Date"),
            "first_weighing_date": extract_field(order_details, "First date of weighing"),
            "last_weighing_date": extract_field(order_details, "Last Date of Weighing"),
            "weighing_method": extract_field(order_details, "Weighing method"),
            "tare_basis": extract_field(order_details, "Tare"),
        }
        # ----------- INVOICE SECTION -----------
        invoice = {
            "bales": extract_field(invoice_section, "Bales"),
            "gross": extract_field(invoice_section, "Gross"),
            "tare": extract_field(invoice_section, "Tare"),
            "net": extract_field(invoice_section, "Net"),
        }
        # ----------- LANDED SECTION -----------
        landed = {
            "bales": extract_field(landed_section, "Bales"),
            "gross": extract_field(landed_section, "Gross"),
            "tare": extract_field(landed_section, "Tare"),
            "net": extract_field(landed_section, "Net"),
        }
        # ----------- LOSS SECTION -----------
        loss = {
            "kg": extract_field(loss_section, "kg"),
            "lb": extract_field(loss_section, "lb"),
            "percent": extract_field(loss_section, "Percentage"),
        }
        # ----------- AVERAGES SECTION -----------
        averages = {
            # BUG FIX: avg_section starts AFTER the words "Invoice average"
            # (section() consumes the start heading), so searching for that
            # label inside it always returned None. Search the full text.
            "invoice_gross_per_bale": extract_field(text, "Invoice average"),
            "landed_gross_per_bale": extract_field(avg_section, "Landed average"),
        }
        # ----------- SIGNATURE -----------
        signature = {
            # BUG FIX: same issue — signature_block starts after "Signed on",
            # so the old lookup inside it always returned None.
            "signed_on": extract_field(text, "Signed on"),
            "signed_by": safe_search(r"\n([A-Za-z ]+)\nClient Services", signature_block, default="", context="signed_by"),
            "role": "Client Services Coordinator",
            "company": "Alfred H. Knight International Limited"
        }
        logger.info("Metadata extraction completed successfully")
        return {
            "report": top_info,
            "parties": parties,
            "shipment": shipment,
            "weights": {
                "invoice": invoice,
                "landed": landed,
                "loss": loss,
                "averages": averages
            },
            "signature": signature
        }
    except Exception as e:
        logger.exception("Unexpected error during metadata extraction")
        raise HTTPException(status_code=500, detail=f"Metadata extraction failed: {e}")
def detect_template(text):
    """Identify the issuing lab from report text.

    Checks lab-specific marker phrases in a fixed priority order and
    returns the matching template code, or "UNKNOWN" when none apply.
    """
    lowered = text.lower()
    checks = (
        ("AHK", lambda t: "alfred h. knight" in t and "cotton landing report" in t),
        ("INTERTEK", lambda t: "intertek" in t and "landing report" in t),
        ("ROBERTSON", lambda t: "robertson international" in t or "ri ref no" in t),
        ("SGS", lambda t: "landing report" in t and "carcon cargo" in t),
        ("PICL", lambda t: "pacific inspection company" in t or "picl-bd.com" in t),
    )
    for template, matches in checks:
        if matches(lowered):
            return template
    return "UNKNOWN"
@app.post("/metadata")
async def metadata(text: str = Body(..., embed=True)):
    # Thin wrapper: expects JSON {"text": "..."} and returns the structured
    # metadata produced by extract_report_metadata().
    return extract_report_metadata(text)
def call_extractor(text: str, lab: str = "AHK",
                   url: str = "http://62.72.36.116:8090/extract",
                   timeout: int = 60):
    """
    POST *text* to the external extractor service as a plain-text upload.

    Args:
        text: document text to send.
        lab: lab/template code passed as a query parameter.
        url: extractor endpoint (generalized from the previously
            hard-coded address; default preserves old behavior).
        timeout: request timeout in seconds (previously hard-coded 60).

    Returns:
        The service's JSON response body.

    Raises:
        requests.HTTPError: on a non-2xx response.
    """
    fake_file = BytesIO(text.encode("utf-8"))
    files = {
        "file": ("document.txt", fake_file, "text/plain")
    }
    response = requests.post(url, params={"lab": lab}, files=files, timeout=timeout)
    response.raise_for_status()
    return response.json()
@app.post("/parse")
async def parse_endpoint(text: str = Body(..., embed=True)):
    # Detect which lab template the text belongs to (parse_report returns a
    # template code such as "AHK"), then delegate the actual field
    # extraction to the external extractor service.
    lab = parse_report(text)
    result = call_extractor(text, lab=lab)
    return result
# Registry of locally-available parsers keyed by template code (as returned
# by detect_template). Shared singleton instances.
PARSERS = {
    "AHK": AHKParser(),
    "INTERTEK": IntertekParser()
}
def empty_weight_report(lab):
    """Return a weight-report skeleton for *lab* with every field None."""
    shipment_fields = (
        "vessel", "bl_no", "bl_date", "port_loading",
        "port_destination", "arrival_date",
        "weighing_place", "weighing_method", "bales",
    )
    weight_fields = (
        "gross_landed_kg", "tare_kg",
        "net_landed_kg", "invoice_net_kg",
        "gain_loss_kg", "gain_loss_percent",
    )
    return {
        "lab": lab,
        "report": dict.fromkeys(("reference", "file_no", "date")),
        "contract": dict.fromkeys(("contract_no", "invoice_no", "lc_no", "origin", "commodity")),
        "parties": dict.fromkeys(("seller", "buyer", "carrier")),
        "shipment": dict.fromkeys(shipment_fields),
        "weights": dict.fromkeys(weight_fields),
    }
def parse_report(text):
    """Return the detected template/lab code for *text* (e.g. "AHK").

    Despite the name, this currently only performs template detection;
    actual parsing is done elsewhere (see PARSERS / /parse endpoint).
    """
    return detect_template(text)