11.01.26

2026-01-11 19:54:30 +01:00
parent b7335d330d
commit e6824fea9d
1 changed file with 16 additions and 21 deletions
--- a/app.py
+++ b/app.py
@@ -4,6 +4,7 @@ import pytesseract
 from doctr.models import ocr_predictor
 from doctr.io import DocumentFile
 from PyPDF2 import PdfReader
+import pdfplumber
 import camelot
 import spacy
 import logging
@@ -342,30 +343,24 @@ async def ocr(file: UploadFile):
        except Exception as e:
            logger.warning(f"pdfplumber attempt: {e}")
        
-        # Strategy 2: Fallback to OCR for scanned PDFs
-        logger.info("Falling back to OCR...")
+        # from pdf2image import convert_from_bytes
+        # images = convert_from_bytes(file_data, dpi=200)
        
-        # Convert PDF to images
-        from pdf2image import convert_from_bytes
-        images = convert_from_bytes(file_data, dpi=200)
+        # ocr_results = []
+        # for img in images:
+        #     text = pytesseract.image_to_string(
+        #         img,
+        #         config='--psm 6 -c preserve_interword_spaces=1'
+        #     )
+        #     ocr_results.append(text)
        
-        ocr_results = []
-        for img in images:
-            # Use pytesseract with optimized settings
-            text = pytesseract.image_to_string(
-                img,
-                config='--psm 6 -c preserve_interword_spaces=1'
-            )
-            ocr_results.append(text)
+        # ocr_text = "\n".join(ocr_results)
        
-        ocr_text = "\n".join(ocr_results)
-        structured_data = parse_cotton_report(ocr_text)
-        
-        return {
-            "method": "tesseract_ocr",
-            "structured_data": structured_data,
-            "raw_text_sample": ocr_text[:500]
-        }
+        # return {
+        #     "method": "tesseract_ocr",
+        #     "structured_data": ocr_text,
+        #     "raw_text_sample": ocr_text[:500]
+        # }
        
    except Exception as e:
        logger.error(f"Smart OCR failed: {e}", exc_info=True)