commit 377ff3a613e1b48f1ccea21831ea7976a6b5ee1b
Author: root
Date: Sun Dec 28 16:48:23 2025 +0000

    Initial import

diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..e08f7a7
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,27 @@
+FROM python:3.11-slim
+
+# System dependencies: tesseract for OCR, poppler-utils for pdf2image,
+# ghostscript for camelot, libgl1/libglib for opencv-headless.
+# Use apt-get (not apt): apt has no stable CLI interface for scripts.
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    tesseract-ocr \
+    libmagic1 \
+    ghostscript \
+    libgl1 \
+    libglib2.0-0 \
+    libsm6 \
+    libxext6 \
+    libxrender1 \
+    poppler-utils \
+    && rm -rf /var/lib/apt/lists/*
+
+WORKDIR /app
+
+# Install dependencies first (cache optimization)
+COPY requirements.txt requirements.txt
+RUN pip install --no-cache-dir -r requirements.txt
+
+# Copy application
+# NOTE(review): this commit also adds __pycache__/app.cpython-311.pyc to
+# the repo (see below) — add a .gitignore/.dockerignore for __pycache__.
+COPY . .
+
+EXPOSE 8006
+
+CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "8006"]
diff --git a/__pycache__/app.cpython-311.pyc b/__pycache__/app.cpython-311.pyc
new file mode 100644
index 0000000..1356bef
Binary files /dev/null and b/__pycache__/app.cpython-311.pyc differ
diff --git a/app.py b/app.py
new file mode 100644
index 0000000..9f0704b
--- /dev/null
+++ b/app.py
@@ -0,0 +1,259 @@
+from fastapi import FastAPI, UploadFile, HTTPException, Body
+from PIL import Image
+import pytesseract
+from doctr.models import ocr_predictor
+from doctr.io import DocumentFile
+from PyPDF2 import PdfReader
+import camelot
+import spacy
+import logging
+import io
+from logging.handlers import RotatingFileHandler
+import re
+
+# Log destination inside the container; the directory must be writable
+# by the uvicorn process or RotatingFileHandler raises at import time.
+LOG_PATH = "/var/log/automation-service.log"
+
+# Rotate at 10 MiB, keep 5 backups.
+file_handler = RotatingFileHandler(
+    LOG_PATH,
+    maxBytes=10*1024*1024,
+    backupCount=5,
+    encoding="utf-8"
+)
+file_handler.setFormatter(logging.Formatter(
+    "%(asctime)s - %(levelname)s - %(name)s - %(message)s"
+))
+
+# Configure the root logger explicitly: file + stderr.
+root = logging.getLogger()
+root.setLevel(logging.INFO)
+root.addHandler(file_handler)
+root.addHandler(logging.StreamHandler())
+
+# Module logger for the app (propagates to the root handlers above).
+logger = logging.getLogger(__name__)
+
+app = FastAPI()
+logger.info("Loading models...")
+nlp = spacy.load("en_core_web_sm") +predictor = ocr_predictor(pretrained=True) + +logger.info("Models loaded successfully.") + +# ============================= +# 🧠 Smart OCR +# ============================= +@app.post("/ocr") +async def ocr(file: UploadFile): + logger.info(f"Received OCR request: {file.filename}") + try: + file_data = await file.read() + ext = file.filename.lower() + + # --------- PDF with native text --------- + if ext.endswith(".pdf"): + logger.info("PDF detected → Extracting native text first") + reader = PdfReader(io.BytesIO(file_data)) + direct_text = "".join( + page.extract_text() or "" for page in reader.pages + ) + + if direct_text.strip(): + logger.info("Native PDF text found → No OCR needed") + return {"ocr_text": direct_text} + + # -------- Fallback: scanned PDF OCR -------- + logger.info("No native text → PDF treated as scanned → OCR") + from pdf2image import convert_from_bytes + images = convert_from_bytes(file_data) + text = "" + for i, img in enumerate(images): + logger.info(f"OCR page {i+1}/{len(images)}") + text += pytesseract.image_to_string(img) + "\n" + + return {"ocr_text": text} + + # --------- Image file OCR --------- + logger.info("Image detected → Running OCR") + img = Image.open(io.BytesIO(file_data)) + text = pytesseract.image_to_string(img) + return {"ocr_text": text} + + except Exception as e: + logger.error(f"OCR failed: {e}", exc_info=True) + raise HTTPException(status_code=500, detail=str(e)) + +# ============================= +# 🧱 Structure / Layout +# ============================= +@app.post("/structure") +async def structure(file: UploadFile): + logger.info(f"Received structure request: {file.filename}") + try: + file_data = await file.read() + ext = file.filename.lower() + + if ext.endswith(".pdf"): + doc = DocumentFile.from_pdf(file_data) + logger.info(f"Structure prediction on PDF ({len(doc)} pages)") + else: + img = Image.open(io.BytesIO(file_data)).convert("RGB") + doc = DocumentFile.from_images([img]) + 
logger.info("Structure prediction on image") + + res = predictor(doc) + return {"structure": str(res)} + + except Exception as e: + logger.error(f"Structure extraction failed: {e}", exc_info=True) + raise HTTPException(status_code=500, detail=str(e)) + +# ============================= +# 📊 Tables extraction (PDF only) +# ============================= +@app.post("/tables") +async def tables(file: UploadFile): + logger.info(f"Received table extraction request: {file.filename}") + try: + file_data = await file.read() + buffer = io.BytesIO(file_data) + + tables = camelot.read_pdf(buffer) + logger.info(f"Found {len(tables)} tables") + return {"tables": [t.df.to_dict() for t in tables]} + + except Exception as e: + logger.error(f"Table extraction failed: {e}", exc_info=True) + raise HTTPException(status_code=500, detail=str(e)) + +def safe_search(pattern, text, default=None, group_index=1, context=""): + """Recherche sécurisée avec logging en cas d'absence de correspondance.""" + m = re.search(pattern, text, re.I | re.S) + if not m: + logger.warning("Pattern not found for %s: %s", context, pattern) + return default + try: + return m.group(group_index).strip() + except IndexError: + logger.warning("Group index %d not found for %s: %s", group_index, context, pattern) + return default + +def section(text, start, end=None): + """Extract a block of text between two headings, safely.""" + pattern_start = re.escape(start) + if end: + pattern_end = re.escape(end) + reg = re.compile(pattern_start + r"(.*?)" + pattern_end, re.S | re.I) + else: + reg = re.compile(pattern_start + r"(.*)", re.S | re.I) + m = reg.search(text) + if not m: + logger.warning("Section not found: start='%s', end='%s'", start, end) + return "" + return m.group(1).strip() + +def extract_field(text, label, default=None): + """Extract a line of the form 'Label: value', safely.""" + pattern = rf"{re.escape(label)}\s*:?[\s]+([^\n]+)" + return safe_search(pattern, text, default=default, context=f"field '{label}'") 
+ +def extract_report_metadata(text): + logger.info("Starting metadata extraction, text length=%d", len(text)) + + try: + # ----------- SECTIONS ----------- + order_details = section(text, "Order details", "Weights") + invoice_section = section(text, "INVOICE WEIGHTS", "Bales Weighed") + landed_section = section(text, "Bales Weighed", "Outturn") + loss_section = section(text, "LOSS", "Invoice average") + avg_section = section(text, "Invoice average", "Comments") + signature_block = section(text, "Signed on") + + # ----------- TOP INFO ----------- + top_info = { + "produced_on": extract_field(text, "Produced On"), + "printed_date": extract_field(text, "Printed Date"), + "client_reference": extract_field(text, "Client Reference"), + "report_number": safe_search(r"(AHK\S+)", text, default="", context="report_number", group_index=1), + } + + # ----------- ORDER DETAILS ----------- + parties = { + "client": extract_field(order_details, "Client"), + "client_ref_no": extract_field(order_details, "Client Ref No"), + "buyer": extract_field(order_details, "Buyer"), + "destination": extract_field(order_details, "Destination"), + } + + shipment = { + "total_bales": extract_field(order_details, "Total Bales"), + "vessel": extract_field(order_details, "Vessel"), + "voyage_no": extract_field(order_details, "Voy. 
No"), + "bl_no": extract_field(order_details, "B/L No"), + "bl_date": extract_field(order_details, "B/L Date"), + "growth": extract_field(order_details, "Growth"), + "arrival_date": extract_field(order_details, "Arrival Date"), + "first_weighing_date": extract_field(order_details, "First date of weighing"), + "last_weighing_date": extract_field(order_details, "Last Date of Weighing"), + "weighing_method": extract_field(order_details, "Weighing method"), + "tare_basis": extract_field(order_details, "Tare"), + } + + # ----------- INVOICE SECTION ----------- + invoice = { + "bales": extract_field(invoice_section, "Bales"), + "gross": extract_field(invoice_section, "Gross"), + "tare": extract_field(invoice_section, "Tare"), + "net": extract_field(invoice_section, "Net"), + } + + # ----------- LANDED SECTION ----------- + landed = { + "bales": extract_field(landed_section, "Bales"), + "gross": extract_field(landed_section, "Gross"), + "tare": extract_field(landed_section, "Tare"), + "net": extract_field(landed_section, "Net"), + } + + # ----------- LOSS SECTION ----------- + loss = { + "kg": extract_field(loss_section, "kg"), + "lb": extract_field(loss_section, "lb"), + "percent": extract_field(loss_section, "Percentage"), + } + + # ----------- AVERAGES SECTION ----------- + averages = { + "invoice_gross_per_bale": extract_field(avg_section, "Invoice average"), + "landed_gross_per_bale": extract_field(avg_section, "Landed average"), + } + + # ----------- SIGNATURE ----------- + signature = { + "signed_on": extract_field(signature_block, "Signed on"), + "signed_by": safe_search(r"\n([A-Za-z ]+)\nClient Services", signature_block, default="", context="signed_by"), + "role": "Client Services Coordinator", + "company": "Alfred H. 
Knight International Limited" + } + + logger.info("Metadata extraction completed successfully") + return { + "report": top_info, + "parties": parties, + "shipment": shipment, + "weights": { + "invoice": invoice, + "landed": landed, + "loss": loss, + "averages": averages + }, + "signature": signature + } + + except Exception as e: + logger.exception("Unexpected error during metadata extraction") + raise HTTPException(status_code=500, detail=f"Metadata extraction failed: {e}") + +@app.post("/metadata") +async def metadata(text: str = Body(..., embed=True)): + return extract_report_metadata(text) diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..56555e2 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,22 @@ +fastapi +uvicorn[standard] +python-multipart + +pytesseract +Pillow + +opencv-python-headless==4.9.0.80 + +python-doctr[torch]==0.8.1 +torch==2.2.0 +torchvision==0.17.0 + +camelot-py[cv] + +spacy +en-core-web-sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.6.0/en_core_web_sm-3.6.0-py3-none-any.whl + +# ➕ Added for PDF text + OCR fallback +PyPDF2 +pdf2image +pypdf