From 377ff3a613e1b48f1ccea21831ea7976a6b5ee1b Mon Sep 17 00:00:00 2001 From: root Date: Sun, 28 Dec 2025 16:48:23 +0000 Subject: [PATCH] Initial import --- Dockerfile | 27 ++++ __pycache__/app.cpython-311.pyc | Bin 0 -> 14390 bytes app.py | 259 ++++++++++++++++++++++++++++++++ requirements.txt | 22 +++ 4 files changed, 308 insertions(+) create mode 100644 Dockerfile create mode 100644 __pycache__/app.cpython-311.pyc create mode 100644 app.py create mode 100644 requirements.txt diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..e08f7a7 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,27 @@ +FROM python:3.11-slim + +# Dépendencies système +RUN apt update && apt install -y --no-install-recommends \ + tesseract-ocr \ + libmagic1 \ + ghostscript \ + libgl1 \ + libglib2.0-0 \ + libsm6 \ + libxext6 \ + libxrender1 \ + poppler-utils \ + && rm -rf /var/lib/apt/lists/* + +WORKDIR /app + +# Install dependencies first (cache optimization) +COPY requirements.txt requirements.txt +RUN pip install --no-cache-dir -r requirements.txt + +# Copy application +COPY . . + +EXPOSE 8006 + +CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "8006"] diff --git a/__pycache__/app.cpython-311.pyc b/__pycache__/app.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1356beff04cb012f59213b7bc4376d14e74e1fdb GIT binary patch literal 14390 zcmdrzTWs9cmE@2!!+DQJGkT09i+XD$Te4;OCCireuq@fKtypfHD0a0Yk&;Il&1gx= zvZtf0ngoq%coDY<5T~`#>0+BIjSHxYbm1;qAZe3rfOY}WAm9Ok0gPQ3*!(GQ0|efm zJ(rvpDO>Rt3oNh{=a9Vj+;h);o!7nkxyxn8;QHe)_D8;K!m$59AGynteR=#(1ctqf zF&KkKu?2h*UocIY7Klk=ft)09lx~We7c7$&^__@XC#{f1MyUncq)kmTN9_xaNr(Dw zi8>ctlddeTd(sWKMAS3sQJ>yPulg*REPRTwCEY0HjC;bdH88BgNDH~vHjGb}Z zb-qtx*oW}buE{dCELEQSDtk))Lpk_qm;MXKRP4%pF1&o0`@klHY%o=o z|15fe8me=jMv73TCYM+IWXqlxQ{`z#uI&5pQsBu*L(e~3tIwaLAs^RMn5<*zRDF(m z@sn}uORK%(nQzG9FdlvWWFt_wG2Xi+hV-Uq*5K1);uu?(YR=Iweln$cOvcZeW=%}s zP14YsmS<8h^mmp*`4$ST&!kZCR30j~PzW(iO!b@iWE<1W)WEYHo;%>Vlc{|Zo80wG z`m~-t^d`yFL4Vf2Z-Oz0pLWgSOat2it#5qOY-oKaLrh?ymM;)uh*Ye@VP5DTA64w% zT8zfS%y1;iDz1_9=f{scBODM@p~`K6f|pfz-R7C731a-L9QmT`TV&47`tQA&j1|ACb1Mq z;jNc`|Nq}S%Q@Qsft}mniS|J!Rh5C5Q#hBm3lEl2K%~0kLrf-{p4w)oVjx9tWL0^j7@f@wR z=xKwtvL?w0KJh#wPi-j+dJSpF6*H2vNwcv(RxAv;Vik&XNzzdN5O(S8MVL)1)Qasn zxOXHe#+iVY{}^LjLT#Z>jC5gMl16TmR-vv?V!kBzp0w#P88@mS0i(ZSy)h2`V|~v4 zG3XW?6TnKzUGt^6x4&vG*`AO06>E;)225zp)ret@7&owD*Y$9qkXv|Tyh%G+(@e=- zpI(APYjzdkl?t7OucTeygLnPEFNiL;Cnt5O^R?K{UodKwn{LfxRlw zvk{IL6ig@FkO0~XzsCarG~t%SE(Jl*;3-Jidg&L_BCrgi3CFP;RSYbTkcmSbhIEY+ zszh8&p}IpR#niD|As0b%^GGbx^b<#CcyST^Iu@@zvGvZeG4|>rcQjE3Vz{F>8lMSA z`J)~A^g~dJ9|G{7@VAO>Vy1ex3ep{2H^~Ryy8GTb$=e`%8$@p?#cdL~`~qg-?J>~7 zmRbOu(K-nAq1H*DnzUX#6IXjY#-m5S?n7InHxfn&pH5W+mo&xB*3 za@6sFf#Xo4p;vq}@z@oX6Q*Xl_`=jQQi=&18d%gb^cP5%Ji$Ak;PxeC7OIL{Kep$V zVlgzZP$uC-1!u#cu$UhD{0BH_)T=X7k=Shfi@yM+4@`=K2MxZ&gHB{v#R8KAkV6%U z1r+rh3(5}RDil(ANW&^5@XsjbXnYwmK|M44azwbyAqlLQB5}p4YVpyt3OOH-#1ykS zoQjzjxX7YngZbDML8bKECW-|y4m>-U2nXvx9Vm?pi&_z{ zkZ9}_JfaYhg*gRhd88`n!o>yEI$)H!yN$5wR_u6l_i~c6Vy=F_heK!^!xa;n_>m+xB>~2^iHZ7Rb z^LFp`-dh&QQ7b!Yzr_5u=8V(#_NnWq#EKJ=b3}HIh|ZBrV8-}_Nio%dg&#qk#e;H?V1Rk_zrZBm%G>Vb#8@1Z46z3i!n zqOLs|Z`r$!A3DV9XcZ%TlDV!%7VE{HHJ<}&N10JCOXH`&TdhE zrvoiwpk>`A1-j%w7w}dZSR2X&8&bnkuw4$eqneK48vKFfCVq2ozS_g<_rQB8sviK{8;xsDi8>%t2Sn;X z#$B~e{Ce;n{;T1ShDG;b$$eOM9~P;@U&FO&22$|kn_*VHKUjI975fw$@WA6=z5V6z z_+zVmq?`C-*O8G<;`2^3yeI6PFx{7Cge92I#TF)7C#p4S2hsii*Ba%H72B2bsMYGV z#dace+Pb)>az!8(A>^$X3@l2$76u!MykkhivDjrHZ;4=7!3GtIA`$b>Top#H^fjc4 zEaY4X3}eiCtQAt%Vs{QRNzNJU=auJav%!J{yQJ6-G_-fc%;>DXq=7eN*@2~0pW`WO zDOq%Vj|B6Ui=mPh)l#zEwd<`N;0Wp8?8P{WEjOc`3NaxN#RWfEwUjKcS&A$r*O^3h z-crghor1l@`Jtvn%|vcd(OEYb+*cZIq`g}cR#gr2A%!@1V&K`xs!CC8`C5uGLCdJN z7tqXWW)5{6*N6Zzbwhp**_>Pmfi?u%5!i_UvgbJD4S`lh76Ry0ZXZfRD}XzQ08&jH zGAk8p)|gNnDsEQO+42rPcNn1}!-?xb0IAWCRn?$eFG@H9Kp_}VdV}g4$MPI9v3OKl zRyV4OksE;o6}*hH8CYiq#dvgGxVs{HkHRgHeKOf6l6{+|lD|eVK6JQ?)#4^*u^r0z zcE~=YV0>M&uWM~EuO>%-N8B6y4fR_}tQgg9$$3(Co)p#lk+1xnwmau;PeRF>F1e;l z^6iy^_MSMSY zBeq7Qy;XN6Ql;;`bo(XI(=1ZW+bN8L8!w1~gK&S`jTFX+OpS=t$nz--6y1D($TPGb z`*gp3*hGAW9XWo4`0R)o-V@eNAv_&r`KRlLhcHI}*ZM&y^6Tf(>{BnJv|KUiBws<+ z8zL0XCHG9(vYsKeNP{uHkd=Bxa!vIQNuAWoM<;TvLr=z%Bvq+ry-VpO7f8J# z4Z_%jd_)>Flff4^N=#Mik*|?OQqOTF(U6yV>TIBk1W?{tOjw3hw*~~9Hf0pcQ>G3H zoFk?8_!xk9u{j7(U8+K!;5>Y5SL&$>R__!3z;vT}0{Z}bLB+((Dps)ZQM?gcLo}Ne zT*x|rs$h9y?~O)yVJ5z?cr+2zm@0Dq@_FBZqC66#tC$$<(eCx|tv$E9f71J--uI5) zKAKPFP{2M>c^<8e;@)_sICec%92bjO@QtEz2v{O7qac#?f>jl^A#*{xDwgS`*;#O4 z#(^JAtfD2Lid9Xhsu?PPbpF2p7=-Ha4d$Z-(R&K#G;U_sZ>8ME@S`mZ&b7>Jq80w6{X* z(T)A^l!84nbh}5jXtyzryEjV2KsVg?1SIQ*WNJvHhW-b#uD@rX4*PwbeQ+1?`}QM) zt;8Q%&G614<3$ZE%;d!gq##j&A{dWP!n}mAY|-qT&%Y(1Z|oF&DXlDCkQj4P|jGX_`R6UUSNlM=W|Vnxm<__$xv(gfLZO z%v;C@u~ZL&<(v64B?*;7Sm}igHpJnK>1AvJ%ZHtarD32Bg5@&H!H?$u@WIRy7l8*$ zhp(_R5I~=sLs1AeM$bU7gJ}s*^AOEvAyz>{{2lCw#dwSfLrO=YdOR!$EEl6=afm^w za%eWr(M^00{1Oc(xcJf{9f>jQRl13JDpJU%m^g@E@N5{^QSea(pHSDVVqFe%C>)}g z=MWjL3)oidqlnmX6gdc46zif^J=c%&tPDFFUWy8eLoEQo5LV7X3MSapZWgL|4Y+d@~2f2wM(XUt@fuKPUJk6 zqqXL~?iS1UZ7hFm&8z>kr)TU?N&i97L(^=4Gtqz(@ZdF@IP430v`NfN=QuSH!d z(ogE8u27&;a0ph+vCe1KV*1~p59{}-YaZ4+^jl%udBu{nfUL;7h9E1Rx}q5(i6>2Z zIRo1a5H@uFYE=#;0$Z=yfqfWG#t($2FgX4%5>RR(+L@&uX#GX;xTfVF?S% zar!c-HdsjfP8!X3v>#?}?P%)@B}jOL5-k%DRztp0ei8T35L@JgBP~sQ%T5pkjQXa^ zXjNK=?A$BBykY?ZHN428HMXz_Q9JJ2kjh;`;35L82uvaHHwd5=3_gmwSRt?oH9&+f z#wf1rGKP2|N^|5sQ69mubKEQx+~T(=R$k+qTZHVtM=1XS09LNQ?48kDtmJQ${f(=q z(^mT?Mwbx3aN6>#wn???=#=0{(m$5nf#-vuxf9LzH|5xpj`>5j!S``F_uho(GPq`;IEfQ6oEQM6yOTZglGx%@uY1 zqJDke^$S4{{Ei6>>Y_Ofx&br+V@l$X3uf|NQhyKw=8`$lRhUar*iu!e5-11|=nHhq zSa_O^_Rv?t(IpnzBH9|_G^F!L7$7mN329f>>YY-_a!Zi;V}ykqTo`Ph|DDh+ z@b>CuU%lwwuidLBGLEvF%PDewPV9T(^8kSC?Kmen&dH8*B6&{hZ&=~^3k7D>{%!+J z@qAmsx?VQO3!{5r!-OJvk%WUd45612E4Tpz+!q6e$$(*IECvk1fMI1Q0|sfppctD0 z!)(AX1b9{~#qJ-p#G!AiA=eP%G32@oWs5d?88^`OFkW>t4(9WR`8my$-bNd%MNViE zXAGNehTMRm9(~!eZF9@@xl9GrP8sM|>TwLY!EJM^w#}{HHn(Qm+#TEI)^3|iZ<|}U zZLZrCz+6*Lql%cwz=*4TxREOV#TSWU!lo^+STC@Vxyu5d@Qj{$ z@$Bf}ar(mX(Gw%*ClaoKFhl`qH6M1Wt!I}6*fxUmUt?z{CICHlB@TzPkmJU|=Bz>u z#uw0$FW7jVh|Hlq>Ub>S80X^5(hPi@jU}ApuqBK3v4>#uHsKwNMgW~Y$3idzIe7`H zwSVN)#fkQiYV9d5j$MqymNsnnPD8i@x$2lq9BhlyYPTtt7tyA4G+}))eysz@CoBV@Z=eP)1H(I}x z4WsiEG-?1cgO<_oz}O}NV{kN2MM25xsGwv|?hOt87Nc++M}NK$ zh*`63hT|7%BNZ}=I+V)x3Li3~a|~!Hp`Zl_3YZTRe>t*LVrcQ4hl*)SLvffNh+xn4vol?z7x#nahP@g)o(IN$U z7>#(v4)Bl~+YrF5!xqe&_~D3>18wa|ws9NjpOWgRFKK7c~UMziEUB>N8`3OhCe zQt1J?^Z-!sp+552*DTP>kmX!wv`xp1ueRs|dg9_nuT*|aE}xhCmpdhMOrrW_s$Zn~AC~P%m+wed&=0HX z(?L32QJt=Cgi;l7#ts1|7HrrII1rl}u-U)K1_5{h~N=9znR>7bN!u*?mEzF67S0yo95eCE6wW#mTl)0qjo!`z-c-&3l;~XiXPo2DDuzWfZyXr0NG8@A|KuuvxD6#9F8oi ziys~+_QWIfY;b|fRmq{)5e_-LA6~M1HIDjJx8!J*9jzkSs)h?f#F@f=1@{|34eeH} zv!IH@ixCd-#*qjhKb34_QFFys^g721W7%_qTpVqO0p=JwsIQQtXNJ!z_AtX_|4l$~!9fc)ypa2( zrl!yV!>K7PY)`hCmGE$Z(y=3Z6*(wK8q{VPhbt*U=%Y`3Vt(^xj^p=0MN_0UrZOstIM*rLb3*BYjBlJW8_<|*IjFKH=+_&C1X`0R)x|$uX}DfC9FoqYDBCin<9Ff zZdzq;ljLoYy)6>fDr2o8)(XcO?Nw{#vMqS)pk!;5ZH=pzhZfiB+;<~yMplV5W_io; zxAwyoM8=8ABZ z1U>f|X;bk&jmac8MC8P$|1rg4Ls;5IYK|qC6aI3WNuD&4# Qp~+>NrW}fztgfB^24oD_TmS$7 literal 0 HcmV?d00001 diff --git a/app.py b/app.py new file mode 100644 index 0000000..9f0704b --- /dev/null +++ b/app.py @@ -0,0 +1,259 @@ +from fastapi import FastAPI, UploadFile, HTTPException, Body +from PIL import Image +import pytesseract +from doctr.models import ocr_predictor +from doctr.io import DocumentFile +from PyPDF2 import PdfReader +import camelot +import spacy +import logging +import io +from logging.handlers import RotatingFileHandler +import re + +LOG_PATH = "/var/log/automation-service.log" + +file_handler = RotatingFileHandler( + LOG_PATH, + maxBytes=10*1024*1024, + backupCount=5, + encoding="utf-8" +) +file_handler.setFormatter(logging.Formatter( + "%(asctime)s - %(levelname)s - %(name)s - %(message)s" +)) + +# Configure root logger explicitly +root = logging.getLogger() +root.setLevel(logging.INFO) +root.addHandler(file_handler) +root.addHandler(logging.StreamHandler()) + +# Use root logger for your app +logger = logging.getLogger(__name__) + +app = FastAPI() +logger.info("Loading models...") + +nlp = spacy.load("en_core_web_sm") +predictor = ocr_predictor(pretrained=True) + +logger.info("Models loaded successfully.") + +# ============================= +# 🧠 Smart OCR +# ============================= +@app.post("/ocr") +async def ocr(file: UploadFile): + logger.info(f"Received OCR request: {file.filename}") + try: + file_data = await file.read() + ext = file.filename.lower() + + # --------- PDF with native text --------- + if ext.endswith(".pdf"): + logger.info("PDF detected → Extracting native text first") + reader = PdfReader(io.BytesIO(file_data)) + direct_text = "".join( + page.extract_text() or "" for page in reader.pages + ) + + if direct_text.strip(): + logger.info("Native PDF text found → No OCR needed") + return {"ocr_text": direct_text} + + # -------- Fallback: scanned PDF OCR -------- + logger.info("No native text → PDF treated as scanned → OCR") + from pdf2image import convert_from_bytes + images = convert_from_bytes(file_data) + text = "" + for i, img in enumerate(images): + logger.info(f"OCR page {i+1}/{len(images)}") + text += pytesseract.image_to_string(img) + "\n" + + return {"ocr_text": text} + + # --------- Image file OCR --------- + logger.info("Image detected → Running OCR") + img = Image.open(io.BytesIO(file_data)) + text = pytesseract.image_to_string(img) + return {"ocr_text": text} + + except Exception as e: + logger.error(f"OCR failed: {e}", exc_info=True) + raise HTTPException(status_code=500, detail=str(e)) + +# ============================= +# 🧱 Structure / Layout +# ============================= +@app.post("/structure") +async def structure(file: UploadFile): + logger.info(f"Received structure request: {file.filename}") + try: + file_data = await file.read() + ext = file.filename.lower() + + if ext.endswith(".pdf"): + doc = DocumentFile.from_pdf(file_data) + logger.info(f"Structure prediction on PDF ({len(doc)} pages)") + else: + img = Image.open(io.BytesIO(file_data)).convert("RGB") + doc = DocumentFile.from_images([img]) + logger.info("Structure prediction on image") + + res = predictor(doc) + return {"structure": str(res)} + + except Exception as e: + logger.error(f"Structure extraction failed: {e}", exc_info=True) + raise HTTPException(status_code=500, detail=str(e)) + +# ============================= +# 📊 Tables extraction (PDF only) +# ============================= +@app.post("/tables") +async def tables(file: UploadFile): + logger.info(f"Received table extraction request: {file.filename}") + try: + file_data = await file.read() + buffer = io.BytesIO(file_data) + + tables = camelot.read_pdf(buffer) + logger.info(f"Found {len(tables)} tables") + return {"tables": [t.df.to_dict() for t in tables]} + + except Exception as e: + logger.error(f"Table extraction failed: {e}", exc_info=True) + raise HTTPException(status_code=500, detail=str(e)) + +def safe_search(pattern, text, default=None, group_index=1, context=""): + """Recherche sécurisée avec logging en cas d'absence de correspondance.""" + m = re.search(pattern, text, re.I | re.S) + if not m: + logger.warning("Pattern not found for %s: %s", context, pattern) + return default + try: + return m.group(group_index).strip() + except IndexError: + logger.warning("Group index %d not found for %s: %s", group_index, context, pattern) + return default + +def section(text, start, end=None): + """Extract a block of text between two headings, safely.""" + pattern_start = re.escape(start) + if end: + pattern_end = re.escape(end) + reg = re.compile(pattern_start + r"(.*?)" + pattern_end, re.S | re.I) + else: + reg = re.compile(pattern_start + r"(.*)", re.S | re.I) + m = reg.search(text) + if not m: + logger.warning("Section not found: start='%s', end='%s'", start, end) + return "" + return m.group(1).strip() + +def extract_field(text, label, default=None): + """Extract a line of the form 'Label: value', safely.""" + pattern = rf"{re.escape(label)}\s*:?[\s]+([^\n]+)" + return safe_search(pattern, text, default=default, context=f"field '{label}'") + +def extract_report_metadata(text): + logger.info("Starting metadata extraction, text length=%d", len(text)) + + try: + # ----------- SECTIONS ----------- + order_details = section(text, "Order details", "Weights") + invoice_section = section(text, "INVOICE WEIGHTS", "Bales Weighed") + landed_section = section(text, "Bales Weighed", "Outturn") + loss_section = section(text, "LOSS", "Invoice average") + avg_section = section(text, "Invoice average", "Comments") + signature_block = section(text, "Signed on") + + # ----------- TOP INFO ----------- + top_info = { + "produced_on": extract_field(text, "Produced On"), + "printed_date": extract_field(text, "Printed Date"), + "client_reference": extract_field(text, "Client Reference"), + "report_number": safe_search(r"(AHK\S+)", text, default="", context="report_number", group_index=1), + } + + # ----------- ORDER DETAILS ----------- + parties = { + "client": extract_field(order_details, "Client"), + "client_ref_no": extract_field(order_details, "Client Ref No"), + "buyer": extract_field(order_details, "Buyer"), + "destination": extract_field(order_details, "Destination"), + } + + shipment = { + "total_bales": extract_field(order_details, "Total Bales"), + "vessel": extract_field(order_details, "Vessel"), + "voyage_no": extract_field(order_details, "Voy. No"), + "bl_no": extract_field(order_details, "B/L No"), + "bl_date": extract_field(order_details, "B/L Date"), + "growth": extract_field(order_details, "Growth"), + "arrival_date": extract_field(order_details, "Arrival Date"), + "first_weighing_date": extract_field(order_details, "First date of weighing"), + "last_weighing_date": extract_field(order_details, "Last Date of Weighing"), + "weighing_method": extract_field(order_details, "Weighing method"), + "tare_basis": extract_field(order_details, "Tare"), + } + + # ----------- INVOICE SECTION ----------- + invoice = { + "bales": extract_field(invoice_section, "Bales"), + "gross": extract_field(invoice_section, "Gross"), + "tare": extract_field(invoice_section, "Tare"), + "net": extract_field(invoice_section, "Net"), + } + + # ----------- LANDED SECTION ----------- + landed = { + "bales": extract_field(landed_section, "Bales"), + "gross": extract_field(landed_section, "Gross"), + "tare": extract_field(landed_section, "Tare"), + "net": extract_field(landed_section, "Net"), + } + + # ----------- LOSS SECTION ----------- + loss = { + "kg": extract_field(loss_section, "kg"), + "lb": extract_field(loss_section, "lb"), + "percent": extract_field(loss_section, "Percentage"), + } + + # ----------- AVERAGES SECTION ----------- + averages = { + "invoice_gross_per_bale": extract_field(avg_section, "Invoice average"), + "landed_gross_per_bale": extract_field(avg_section, "Landed average"), + } + + # ----------- SIGNATURE ----------- + signature = { + "signed_on": extract_field(signature_block, "Signed on"), + "signed_by": safe_search(r"\n([A-Za-z ]+)\nClient Services", signature_block, default="", context="signed_by"), + "role": "Client Services Coordinator", + "company": "Alfred H. Knight International Limited" + } + + logger.info("Metadata extraction completed successfully") + return { + "report": top_info, + "parties": parties, + "shipment": shipment, + "weights": { + "invoice": invoice, + "landed": landed, + "loss": loss, + "averages": averages + }, + "signature": signature + } + + except Exception as e: + logger.exception("Unexpected error during metadata extraction") + raise HTTPException(status_code=500, detail=f"Metadata extraction failed: {e}") + +@app.post("/metadata") +async def metadata(text: str = Body(..., embed=True)): + return extract_report_metadata(text) diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..56555e2 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,22 @@ +fastapi +uvicorn[standard] +python-multipart + +pytesseract +Pillow + +opencv-python-headless==4.9.0.80 + +python-doctr[torch]==0.8.1 +torch==2.2.0 +torchvision==0.17.0 + +camelot-py[cv] + +spacy +en-core-web-sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.6.0/en_core_web_sm-3.6.0-py3-none-any.whl + +# ➕ Added for PDF text + OCR fallback +PyPDF2 +pdf2image +pypdf