11.01.26

2026-01-11 19:54:30 +01:00
parent b7335d330d
commit e6824fea9d
1 changed file with 16 additions and 21 deletions
--- a/app.py
+++ b/app.py
@@ -4,6 +4,7 @@ import pytesseract
 from doctr.models import ocr_predictor
 from doctr.io import DocumentFile
 from PyPDF2 import PdfReader
 import pdfplumber
 import camelot
 import spacy
 import logging
@@ -342,30 +343,24 @@ async def ocr(file: UploadFile):
        except Exception as e:
            logger.warning(f"pdfplumber attempt: {e}")
-        # Strategy 2: Fallback to OCR for scanned PDFs
+        # from pdf2image import convert_from_bytes
-        logger.info("Falling back to OCR...")
+        # images = convert_from_bytes(file_data, dpi=200)
-        # Convert PDF to images
+        # ocr_results = []
-        from pdf2image import convert_from_bytes
+        # for img in images:
-        images = convert_from_bytes(file_data, dpi=200)
+        #     text = pytesseract.image_to_string(
        #         img,
        #         config='--psm 6 -c preserve_interword_spaces=1'
        #     )
        #     ocr_results.append(text)
-        ocr_results = []
+        # ocr_text = "\n".join(ocr_results)
        for img in images:
            # Use pytesseract with optimized settings
            text = pytesseract.image_to_string(
                img,
                config='--psm 6 -c preserve_interword_spaces=1'
            )
            ocr_results.append(text)
-        ocr_text = "\n".join(ocr_results)
+        # return {
-        structured_data = parse_cotton_report(ocr_text)
+        #     "method": "tesseract_ocr",
-        
+        #     "structured_data": ocr_text,
-        return {
+        #     "raw_text_sample": ocr_text[:500]
-            "method": "tesseract_ocr",
+        # }
            "structured_data": structured_data,
            "raw_text_sample": ocr_text[:500]
        }
    except Exception as e:
        logger.error(f"Smart OCR failed: {e}", exc_info=True)