From e6824fea9d0e45d82d882a6d0a3e3afa941bd3ed Mon Sep 17 00:00:00 2001
From: laurentbarontini <l.barontini@open-squared.ch>
Date: Sun, 11 Jan 2026 19:54:30 +0100
Subject: [PATCH] 11.01.26: import pdfplumber, comment out Tesseract OCR fallback in ocr()

---
 app.py | 37 ++++++++++++++++---------------------
 1 file changed, 16 insertions(+), 21 deletions(-)

diff --git a/app.py b/app.py
index 42d2693..bb379ac 100644
--- a/app.py
+++ b/app.py
@@ -4,6 +4,7 @@ import pytesseract
 from doctr.models import ocr_predictor
 from doctr.io import DocumentFile
 from PyPDF2 import PdfReader
+import pdfplumber
 import camelot
 import spacy
 import logging
@@ -342,30 +343,24 @@ async def ocr(file: UploadFile):
         except Exception as e:
             logger.warning(f"pdfplumber attempt: {e}")
         
-        # Strategy 2: Fallback to OCR for scanned PDFs
-        logger.info("Falling back to OCR...")
+        # from pdf2image import convert_from_bytes
+        # images = convert_from_bytes(file_data, dpi=200)
         
-        # Convert PDF to images
-        from pdf2image import convert_from_bytes
-        images = convert_from_bytes(file_data, dpi=200)
+        # ocr_results = []
+        # for img in images:
+        #     text = pytesseract.image_to_string(
+        #         img,
+        #         config='--psm 6 -c preserve_interword_spaces=1'
+        #     )
+        #     ocr_results.append(text)
         
-        ocr_results = []
-        for img in images:
-            # Use pytesseract with optimized settings
-            text = pytesseract.image_to_string(
-                img,
-                config='--psm 6 -c preserve_interword_spaces=1'
-            )
-            ocr_results.append(text)
+        # ocr_text = "\n".join(ocr_results)
         
-        ocr_text = "\n".join(ocr_results)
-        structured_data = parse_cotton_report(ocr_text)
-        
-        return {
-            "method": "tesseract_ocr",
-            "structured_data": structured_data,
-            "raw_text_sample": ocr_text[:500]
-        }
+        # return {
+        #     "method": "tesseract_ocr",
+        #     "structured_data": ocr_text,
+        #     "raw_text_sample": ocr_text[:500]
+        # }
         
     except Exception as e:
         logger.error(f"Smart OCR failed: {e}", exc_info=True)