11.01.26
This commit is contained in:
37
app.py
37
app.py
@@ -4,6 +4,7 @@ import pytesseract
|
||||
from doctr.models import ocr_predictor
|
||||
from doctr.io import DocumentFile
|
||||
from PyPDF2 import PdfReader
|
||||
import pdfplumber
|
||||
import camelot
|
||||
import spacy
|
||||
import logging
|
||||
@@ -342,30 +343,24 @@ async def ocr(file: UploadFile):
|
||||
except Exception as e:
|
||||
logger.warning(f"pdfplumber attempt: {e}")
|
||||
|
||||
# Strategy 2: Fallback to OCR for scanned PDFs
|
||||
logger.info("Falling back to OCR...")
|
||||
# from pdf2image import convert_from_bytes
|
||||
# images = convert_from_bytes(file_data, dpi=200)
|
||||
|
||||
# Convert PDF to images
|
||||
from pdf2image import convert_from_bytes
|
||||
images = convert_from_bytes(file_data, dpi=200)
|
||||
# ocr_results = []
|
||||
# for img in images:
|
||||
# text = pytesseract.image_to_string(
|
||||
# img,
|
||||
# config='--psm 6 -c preserve_interword_spaces=1'
|
||||
# )
|
||||
# ocr_results.append(text)
|
||||
|
||||
ocr_results = []
|
||||
for img in images:
|
||||
# Use pytesseract with optimized settings
|
||||
text = pytesseract.image_to_string(
|
||||
img,
|
||||
config='--psm 6 -c preserve_interword_spaces=1'
|
||||
)
|
||||
ocr_results.append(text)
|
||||
# ocr_text = "\n".join(ocr_results)
|
||||
|
||||
ocr_text = "\n".join(ocr_results)
|
||||
structured_data = parse_cotton_report(ocr_text)
|
||||
|
||||
return {
|
||||
"method": "tesseract_ocr",
|
||||
"structured_data": structured_data,
|
||||
"raw_text_sample": ocr_text[:500]
|
||||
}
|
||||
# return {
|
||||
# "method": "tesseract_ocr",
|
||||
# "structured_data": ocr_text,
|
||||
# "raw_text_sample": ocr_text[:500]
|
||||
# }
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Smart OCR failed: {e}", exc_info=True)
|
||||
|
||||
Reference in New Issue
Block a user