From e6824fea9d0e45d82d882a6d0a3e3afa941bd3ed Mon Sep 17 00:00:00 2001 From: laurentbarontini Date: Sun, 11 Jan 2026 19:54:30 +0100 Subject: [PATCH] 11.01.26 --- app.py | 37 ++++++++++++++++--------------------- 1 file changed, 16 insertions(+), 21 deletions(-) diff --git a/app.py b/app.py index 42d2693..bb379ac 100644 --- a/app.py +++ b/app.py @@ -4,6 +4,7 @@ import pytesseract from doctr.models import ocr_predictor from doctr.io import DocumentFile from PyPDF2 import PdfReader +import pdfplumber import camelot import spacy import logging @@ -342,30 +343,24 @@ async def ocr(file: UploadFile): except Exception as e: logger.warning(f"pdfplumber attempt: {e}") - # Strategy 2: Fallback to OCR for scanned PDFs - logger.info("Falling back to OCR...") + # from pdf2image import convert_from_bytes + # images = convert_from_bytes(file_data, dpi=200) - # Convert PDF to images - from pdf2image import convert_from_bytes - images = convert_from_bytes(file_data, dpi=200) + # ocr_results = [] + # for img in images: + # text = pytesseract.image_to_string( + # img, + # config='--psm 6 -c preserve_interword_spaces=1' + # ) + # ocr_results.append(text) - ocr_results = [] - for img in images: - # Use pytesseract with optimized settings - text = pytesseract.image_to_string( - img, - config='--psm 6 -c preserve_interword_spaces=1' - ) - ocr_results.append(text) + # ocr_text = "\n".join(ocr_results) - ocr_text = "\n".join(ocr_results) - structured_data = parse_cotton_report(ocr_text) - - return { - "method": "tesseract_ocr", - "structured_data": structured_data, - "raw_text_sample": ocr_text[:500] - } + # return { + # "method": "tesseract_ocr", + # "structured_data": ocr_text, + # "raw_text_sample": ocr_text[:500] + # } except Exception as e: logger.error(f"Smart OCR failed: {e}", exc_info=True)