11.01.26
This commit is contained in:
37
app.py
37
app.py
@@ -4,6 +4,7 @@ import pytesseract
|
|||||||
from doctr.models import ocr_predictor
|
from doctr.models import ocr_predictor
|
||||||
from doctr.io import DocumentFile
|
from doctr.io import DocumentFile
|
||||||
from PyPDF2 import PdfReader
|
from PyPDF2 import PdfReader
|
||||||
|
import pdfplumber
|
||||||
import camelot
|
import camelot
|
||||||
import spacy
|
import spacy
|
||||||
import logging
|
import logging
|
||||||
@@ -342,30 +343,24 @@ async def ocr(file: UploadFile):
|
|||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.warning(f"pdfplumber attempt: {e}")
|
logger.warning(f"pdfplumber attempt: {e}")
|
||||||
|
|
||||||
# Strategy 2: Fallback to OCR for scanned PDFs
|
# from pdf2image import convert_from_bytes
|
||||||
logger.info("Falling back to OCR...")
|
# images = convert_from_bytes(file_data, dpi=200)
|
||||||
|
|
||||||
# Convert PDF to images
|
# ocr_results = []
|
||||||
from pdf2image import convert_from_bytes
|
# for img in images:
|
||||||
images = convert_from_bytes(file_data, dpi=200)
|
# text = pytesseract.image_to_string(
|
||||||
|
# img,
|
||||||
|
# config='--psm 6 -c preserve_interword_spaces=1'
|
||||||
|
# )
|
||||||
|
# ocr_results.append(text)
|
||||||
|
|
||||||
ocr_results = []
|
# ocr_text = "\n".join(ocr_results)
|
||||||
for img in images:
|
|
||||||
# Use pytesseract with optimized settings
|
|
||||||
text = pytesseract.image_to_string(
|
|
||||||
img,
|
|
||||||
config='--psm 6 -c preserve_interword_spaces=1'
|
|
||||||
)
|
|
||||||
ocr_results.append(text)
|
|
||||||
|
|
||||||
ocr_text = "\n".join(ocr_results)
|
# return {
|
||||||
structured_data = parse_cotton_report(ocr_text)
|
# "method": "tesseract_ocr",
|
||||||
|
# "structured_data": ocr_text,
|
||||||
return {
|
# "raw_text_sample": ocr_text[:500]
|
||||||
"method": "tesseract_ocr",
|
# }
|
||||||
"structured_data": structured_data,
|
|
||||||
"raw_text_sample": ocr_text[:500]
|
|
||||||
}
|
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"Smart OCR failed: {e}", exc_info=True)
|
logger.error(f"Smart OCR failed: {e}", exc_info=True)
|
||||||
|
|||||||
Reference in New Issue
Block a user