This commit is contained in:
2026-01-11 19:54:30 +01:00
parent b7335d330d
commit e6824fea9d

37
app.py
View File

@@ -4,6 +4,7 @@ import pytesseract
from doctr.models import ocr_predictor from doctr.models import ocr_predictor
from doctr.io import DocumentFile from doctr.io import DocumentFile
from PyPDF2 import PdfReader from PyPDF2 import PdfReader
import pdfplumber
import camelot import camelot
import spacy import spacy
import logging import logging
@@ -342,30 +343,24 @@ async def ocr(file: UploadFile):
except Exception as e: except Exception as e:
logger.warning(f"pdfplumber attempt: {e}") logger.warning(f"pdfplumber attempt: {e}")
# Strategy 2: Fallback to OCR for scanned PDFs # from pdf2image import convert_from_bytes
logger.info("Falling back to OCR...") # images = convert_from_bytes(file_data, dpi=200)
# Convert PDF to images # ocr_results = []
from pdf2image import convert_from_bytes # for img in images:
images = convert_from_bytes(file_data, dpi=200) # text = pytesseract.image_to_string(
# img,
# config='--psm 6 -c preserve_interword_spaces=1'
# )
# ocr_results.append(text)
ocr_results = [] # ocr_text = "\n".join(ocr_results)
for img in images:
# Use pytesseract with optimized settings
text = pytesseract.image_to_string(
img,
config='--psm 6 -c preserve_interword_spaces=1'
)
ocr_results.append(text)
ocr_text = "\n".join(ocr_results) # return {
structured_data = parse_cotton_report(ocr_text) # "method": "tesseract_ocr",
# "structured_data": ocr_text,
return { # "raw_text_sample": ocr_text[:500]
"method": "tesseract_ocr", # }
"structured_data": structured_data,
"raw_text_sample": ocr_text[:500]
}
except Exception as e: except Exception as e:
logger.error(f"Smart OCR failed: {e}", exc_info=True) logger.error(f"Smart OCR failed: {e}", exc_info=True)