This commit is contained in:
2026-01-11 19:54:30 +01:00
parent b7335d330d
commit e6824fea9d

37
app.py
View File

@@ -4,6 +4,7 @@ import pytesseract
from doctr.models import ocr_predictor
from doctr.io import DocumentFile
from PyPDF2 import PdfReader
import pdfplumber
import camelot
import spacy
import logging
@@ -342,30 +343,24 @@ async def ocr(file: UploadFile):
except Exception as e:
logger.warning(f"pdfplumber attempt: {e}")
# Strategy 2: Fallback to OCR for scanned PDFs
logger.info("Falling back to OCR...")
# from pdf2image import convert_from_bytes
# images = convert_from_bytes(file_data, dpi=200)
# Convert PDF to images
from pdf2image import convert_from_bytes
images = convert_from_bytes(file_data, dpi=200)
# ocr_results = []
# for img in images:
# text = pytesseract.image_to_string(
# img,
# config='--psm 6 -c preserve_interword_spaces=1'
# )
# ocr_results.append(text)
ocr_results = []
for img in images:
# Use pytesseract with optimized settings
text = pytesseract.image_to_string(
img,
config='--psm 6 -c preserve_interword_spaces=1'
)
ocr_results.append(text)
# ocr_text = "\n".join(ocr_results)
ocr_text = "\n".join(ocr_results)
structured_data = parse_cotton_report(ocr_text)
return {
"method": "tesseract_ocr",
"structured_data": structured_data,
"raw_text_sample": ocr_text[:500]
}
# return {
# "method": "tesseract_ocr",
# "structured_data": ocr_text,
# "raw_text_sample": ocr_text[:500]
# }
except Exception as e:
logger.error(f"Smart OCR failed: {e}", exc_info=True)