From baa02ade7d5bdf0c00030040c621f565e74d0507 Mon Sep 17 00:00:00 2001 From: admin Date: Tue, 24 Feb 2026 13:47:13 +0000 Subject: [PATCH] Update app.py --- app.py | 174 ++++++++++++++++++++++++++++----------------------------- 1 file changed, 87 insertions(+), 87 deletions(-) diff --git a/app.py b/app.py index bd288eb..c4f6b40 100644 --- a/app.py +++ b/app.py @@ -50,14 +50,14 @@ def get_db_connection(): ) class AHKParser: - lab = "AHK" - + lab = "AHK" + def _clean_value(self, value): """Nettoie la valeur en supprimant les espaces inutiles""" if value: return value.strip() return value - + def parse(self, text): """Parse le texte et retourne un dictionnaire structuré""" result = { @@ -70,7 +70,7 @@ class AHKParser: } self.data = result return result - + def _extract_report_info(self, text): """Extrait les informations du rapport""" report_info = { @@ -78,24 +78,24 @@ class AHKParser: "file_no": None, "date": None } - + # Recherche de la référence client - plus précise ref_match = re.search(r'Client\s+Reference:\s*(S-\d+\s*/\s*INV\s*\d+)', text) if ref_match: report_info["reference"] = self._clean_value(ref_match.group(1)) - + # Recherche du numéro de fichier AHK file_no_match = re.search(r'AHK\s+S/([\w/]+)', text) if file_no_match: report_info["file_no"] = self._clean_value(file_no_match.group(1)) - + # Recherche de la date du rapport date_match = re.search(r'Signed\s+on\s*(\d{1,2}-[A-Za-z]{3}-\d{4})', text) if date_match: report_info["date"] = self._clean_value(date_match.group(1)) - + return report_info - + def _extract_contract_info(self, text): """Extrait les informations du contrat""" contract_info = { @@ -105,7 +105,7 @@ class AHKParser: "origin": None, "commodity": None } - + # Extraction de la référence client ref_match = re.search(r'Client\s+Ref\s+No\.\s*:\s*([^\n]+)', text) if ref_match: @@ -117,7 +117,7 @@ class AHKParser: contract_info["contract_no"] = part.strip() elif part.startswith('INV'): contract_info["invoice_no"] = part.strip() - + # Extraction de l'origine et de la marchandise - regex plus précise growth_match = re.search(r'Growth\s*:\s*([A-Z\s]+?)(?=\s*(?:Vessel|$))', text) if growth_match: @@ -125,9 +125,9 @@ class AHKParser: if "AUSTRALIAN" in origin_text.upper(): contract_info["origin"] = "AUSTRALIA" contract_info["commodity"] = "RAW COTTON" - + return contract_info - + def _extract_parties_info(self, text): """Extrait les informations sur les parties""" parties_info = { @@ -135,24 +135,24 @@ class AHKParser: "buyer": None, "carrier": None } - + # Extraction du vendeur (Client) - regex plus précise seller_match = re.search(r'Client\s*:\s*([^\n:]+?)(?=\s*(?:Client Ref|Buyer|$))', text) if seller_match: parties_info["seller"] = self._clean_value(seller_match.group(1)) - + # Extraction de l'acheteur (Buyer) - regex plus précise buyer_match = re.search(r'Buyer\s*:\s*([^\n:]+?)(?=\s*(?:Total Bales|$))', text) if buyer_match: parties_info["buyer"] = self._clean_value(buyer_match.group(1)) - + # Extraction du transporteur (nom du navire seulement) vessel_match = re.search(r'Vessel\s*:\s*([A-Z\s]+?)(?=\s*(?:Arrival|Voy|$))', text) if vessel_match: parties_info["carrier"] = self._clean_value(vessel_match.group(1)) - + return parties_info - + def _extract_shipment_info(self, text): """Extrait les informations d'expédition""" shipment_info = { @@ -166,37 +166,37 @@ class AHKParser: "weighing_method": None, "bales": None } - + # Extraction du navire (nom seulement) vessel_match = re.search(r'Vessel\s*:\s*([A-Z\s]+?)(?=\s*(?:Arrival|Voy|$))', text) if vessel_match: shipment_info["vessel"] = self._clean_value(vessel_match.group(1)) - + # Extraction du numéro de connaissement (seulement le numéro) bl_no_match = re.search(r'B/L\s+No\.\s*:\s*([A-Z0-9]+)(?=\s|$)', text) if bl_no_match: shipment_info["bl_no"] = self._clean_value(bl_no_match.group(1)) - + # Extraction de la date du connaissement bl_date_match = re.search(r'B/L\s+Date\s*:\s*(\d{1,2}-[A-Za-z]{3}-\d{4})(?=\s|$)', text) if bl_date_match: shipment_info["bl_date"] = self._clean_value(bl_date_match.group(1)) - + # Extraction du port de destination (sans le "Tare") dest_match = re.search(r'Destination\s*:\s*([A-Z,\s]+?)(?=\s*(?:Tare|$))', text) if dest_match: shipment_info["port_destination"] = self._clean_value(dest_match.group(1)) - + # Extraction de la date d'arrivée arrival_match = re.search(r'Arrival\s+Date\s*:\s*(\d{1,2}-[A-Za-z]{3}-\d{4})(?=\s|$)', text) if arrival_match: shipment_info["arrival_date"] = self._clean_value(arrival_match.group(1)) - + # Extraction de la méthode de pesée weighing_method_match = re.search(r'Weighing\s+method\s*:\s*([^\n]+?)(?=\s*(?:Tare|$))', text) if weighing_method_match: shipment_info["weighing_method"] = self._clean_value(weighing_method_match.group(1)) - + # Extraction du nombre de balles bales_match = re.search(r'Total\s+Bales\s*:\s*(\d+)(?=\s|$)', text) if bales_match: @@ -204,9 +204,9 @@ class AHKParser: shipment_info["bales"] = int(bales_match.group(1).strip()) except ValueError: shipment_info["bales"] = None - + return shipment_info - + def _extract_weights_info(self, text): """Extrait les informations de poids""" weights_info = { @@ -217,7 +217,7 @@ class AHKParser: "gain_loss_kg": None, "gain_loss_percent": None } - + # Extraction du poids brut débarqué (corrigé - doit être 100580 kg) gross_landed_match = re.search(r'LANDED WEIGHTS[\s\S]*?Gross\s*:\s*([\d.,]+)\s*kg', text) if gross_landed_match: @@ -225,7 +225,7 @@ class AHKParser: weights_info["gross_landed_kg"] = float(gross_landed_match.group(1).replace(',', '').strip()) except ValueError: pass - + # Extraction du poids de tare tare_match = re.search(r'Tare\s*:\s*([\d.,]+)\s*kg', text) if tare_match: @@ -233,7 +233,7 @@ class AHKParser: weights_info["tare_kg"] = float(tare_match.group(1).replace(',', '').strip()) except ValueError: pass - + # Extraction du poids net débarqué (corrigé - doit être 100078.40 kg) net_landed_match = re.search(r'LANDED WEIGHTS[\s\S]*?Net\s*:\s*([\d.,]+)\s*kg', text) if net_landed_match: @@ -241,7 +241,7 @@ class AHKParser: weights_info["net_landed_kg"] = float(net_landed_match.group(1).replace(',', '').strip()) except ValueError: pass - + # Extraction du poids net facturé (101299 kg) invoice_net_match = re.search(r'INVOICE WEIGHTS[\s\S]*?Net\s*:\s*([\d.,]+)\s*kg', text) if invoice_net_match: @@ -249,7 +249,7 @@ class AHKParser: weights_info["invoice_net_kg"] = float(invoice_net_match.group(1).replace(',', '').strip()) except ValueError: pass - + # Extraction de la perte en kg loss_match = re.search(r'LOSS\s*:\s*-\s*([\d.,]+)\s*kg', text) if loss_match: @@ -257,7 +257,7 @@ class AHKParser: weights_info["gain_loss_kg"] = -float(loss_match.group(1).replace(',', '').strip()) except ValueError: pass - + # Extraction du pourcentage de perte percent_match = re.search(r'Percentage\s*:\s*-\s*([\d.,]+)%', text) if percent_match: @@ -265,20 +265,20 @@ class AHKParser: weights_info["gain_loss_percent"] = -float(percent_match.group(1).replace(',', '').strip()) except ValueError: pass - + return weights_info - + import re class IntertekParser: lab = "Intertek" - + def _clean_value(self, value): """Nettoie la valeur en supprimant les espaces inutiles""" if value: return value.strip() return value - + def _extract_number(self, text, pattern, is_int=False): """Extrait un nombre (int ou float) du texte selon un pattern regex""" match = re.search(pattern, text) @@ -293,7 +293,7 @@ class IntertekParser: except (ValueError, AttributeError): return None return None - + def parse(self, text): """Parse le texte et retourne un dictionnaire structuré""" result = { @@ -305,7 +305,7 @@ class IntertekParser: "weights": self._extract_weights_info(text) } return result - + def _extract_report_info(self, text): """Extrait les informations du rapport""" report_info = { @@ -313,24 +313,24 @@ class IntertekParser: "file_no": None, "date": None } - + # Recherche de la référence globale ref_match = re.search(r'Global Ref\s*:\s*(GLO-\d+-[A-Z]+)', text) if ref_match: report_info["reference"] = self._clean_value(ref_match.group(1)) - + # Recherche du numéro de fichier file_no_match = re.search(r'Report\s*/\s*File No\s*:\s*([A-Z]+-AGR\d+-?)', text) if file_no_match: report_info["file_no"] = self._clean_value(file_no_match.group(1)) - + # Recherche de la date du rapport date_match = re.search(r'Dated\s*:\s*(\d{1,2}\.\d{1,2}\.\d{4})', text) if date_match: report_info["date"] = self._clean_value(date_match.group(1)) - + return report_info - + def _extract_contract_info(self, text): """Extrait les informations du contrat""" contract_info = { @@ -340,17 +340,17 @@ class IntertekParser: "origin": None, "commodity": None } - + # Extraction du numéro de contrat contract_match = re.search(r'Contract No\s*:\s*([A-Z]?-\d+)', text) if contract_match: contract_info["contract_no"] = self._clean_value(contract_match.group(1)) - + # Extraction du numéro de facture invoice_match = re.search(r'Invoice No\s*:\s*(\d+)', text) if invoice_match: contract_info["invoice_no"] = self._clean_value(invoice_match.group(1)) - + # Extraction de l'origine et de la marchandise growth_match = re.search(r'Growth\s*:\s*([A-Z\s]+)(?=\s*Shipper|\n|$)', text) if growth_match: @@ -358,9 +358,9 @@ class IntertekParser: if "GREECE" in origin_text.upper(): contract_info["origin"] = "GREECE" contract_info["commodity"] = "RAW COTTON" - + return contract_info - + def _extract_parties_info(self, text): """Extrait les informations sur les parties""" parties_info = { @@ -368,24 +368,24 @@ class IntertekParser: "buyer": None, "carrier": None } - + # Extraction du vendeur (Shipper) seller_match = re.search(r'Shipper\s*:\s*([^\n]+?)(?=\s*(?:Buyer|$))', text) if seller_match: parties_info["seller"] = self._clean_value(seller_match.group(1)) - + # Extraction de l'acheteur (Buyer) buyer_match = re.search(r'Buyer\s*:\s*([^\n]+?)(?=\s*(?:CONTAINER|TOTAL|$))', text) if buyer_match: parties_info["buyer"] = self._clean_value(buyer_match.group(1)) - + # Extraction du transporteur (nom du navire seulement) vessel_match = re.search(r'Vessel\s*:\s*([A-Z\s]+?)(?=\s*(?:Arrival|$))', text) if vessel_match: parties_info["carrier"] = self._clean_value(vessel_match.group(1)) - + return parties_info - + def _extract_shipment_info(self, text): """Extrait les informations d'expédition""" shipment_info = { @@ -399,27 +399,27 @@ class IntertekParser: "weighing_method": None, "bales": None } - + # Extraction du navire vessel_match = re.search(r'Vessel\s*:\s*([A-Z\s]+?)(?=\s*(?:Arrival|$))', text) if vessel_match: shipment_info["vessel"] = self._clean_value(vessel_match.group(1)) - + # Extraction du numéro de connaissement bl_no_match = re.search(r'B/L\s+No\.\s*:\s*([A-Z0-9]+)', text) if bl_no_match: shipment_info["bl_no"] = self._clean_value(bl_no_match.group(1)) - + # Extraction de la date d'arrivée arrival_match = re.search(r'Arrival Date\s*:\s*(\d{1,2}\.\d{1,2}\.\d{4})', text) if arrival_match: shipment_info["arrival_date"] = self._clean_value(arrival_match.group(1)) - + # Extraction du lieu de pesée weighing_place_match = re.search(r'Weighed at\s*:\s*([^\n]+?)(?=\s*(?:Vessel|$))', text) if weighing_place_match: shipment_info["weighing_place"] = self._clean_value(weighing_place_match.group(1)) - + # Extraction de la méthode de pesée # Recherche dans les remarques remarks_section = re.search(r'REMARKS\s*(.+?)(?=ISSUED BY|$)', text, re.DOTALL | re.IGNORECASE) @@ -427,22 +427,22 @@ class IntertekParser: remarks_text = remarks_section.group(1) if "weighbridge" in remarks_text.lower(): shipment_info["weighing_method"] = "Weighbridge weighing by empty/full truck" - + # Extraction du nombre de balles (à partir du total) bales_match = re.search(r'TOTAL\s+(\d{1,4}(?:,\d{3})?)\s+[\d,]+\.\d{2}', text) if not bales_match: # Essayons une autre approche bales_match = re.search(r'Invoice Quantity\s*:\s*(\d+)\s+Bales', text) - + if bales_match: try: bales_str = bales_match.group(1).replace(',', '').strip() shipment_info["bales"] = int(bales_str) except ValueError: shipment_info["bales"] = None - + return shipment_info - + def _extract_weights_info(self, text): """Extrait les informations de poids""" weights_info = { @@ -453,32 +453,32 @@ class IntertekParser: "gain_loss_kg": None, "gain_loss_percent": None } - + # Extraction du poids brut débarqué gross_match = re.search(r'Gross Landed Weight\s*:\s*([\d,]+\.\d{2})\s*kgs', text) if gross_match: weights_info["gross_landed_kg"] = float(gross_match.group(1).replace(',', '')) - + # Extraction du poids de tare tare_match = re.search(r'Invoice Tare\s*:\s*([\d,]+\.\d{2})\s*Kgs', text) if tare_match: weights_info["tare_kg"] = float(tare_match.group(1).replace(',', '')) - + # Extraction du poids net débarqué net_landed_match = re.search(r'Net Landed Weight\s*:\s*([\d,]+\.\d{2})\s*Kgs', text) if net_landed_match: weights_info["net_landed_kg"] = float(net_landed_match.group(1).replace(',', '')) - + # Extraction du poids net facturé invoice_net_match = re.search(r'Net Invoice Weight\s*:\s*([\d,]+\.\d{2})\s*Kgs', text) if invoice_net_match: weights_info["invoice_net_kg"] = float(invoice_net_match.group(1).replace(',', '')) - + # Extraction du gain en kg gain_match = re.search(r'Gain\s+([\d,]+\.\d{2})\s*Kgs', text) if gain_match: weights_info["gain_loss_kg"] = float(gain_match.group(1).replace(',', '')) - + # Extraction du pourcentage de gain (0.4% dans le tableau) percent_match = re.search(r'TOTAL\s+\d+\s+[\d,]+\.\d{2}\s+([\d.]+)%', text) if percent_match: @@ -486,7 +486,7 @@ class IntertekParser: weights_info["gain_loss_percent"] = float(percent_match.group(1)) except ValueError: pass - + return weights_info # Configure root logger explicitly @@ -639,43 +639,43 @@ async def ocr(file: UploadFile): Smart PDF processing optimized for cotton landing reports """ logger.info(f"Smart OCR request: {file.filename}") - + try: file_data = await file.read() - + # Strategy 1: Try pdfplumber (best for digital PDFs) try: with pdfplumber.open(io.BytesIO(file_data)) as pdf: text_parts = [] tables_found = [] - + for page in pdf.pages: # Extract text page_text = page.extract_text(x_tolerance=2, y_tolerance=2) if page_text: text_parts.append(page_text) - + # Look for tables (common in landing reports) tables = page.extract_tables({ "vertical_strategy": "text", "horizontal_strategy": "text", "snap_tolerance": 5, }) - + for table in tables: if table and len(table) > 1: tables_found.append(table) - + combined_text = "\n".join(text_parts) return {"ocr_text": combined_text} # if combined_text.strip(): # logger.info(f"pdfplumber extracted {len(combined_text)} chars") - + # # Try parsing structured data # structured_data = parse_cotton_report(combined_text) - + # # Check if we got key fields - # if (structured_data.get("shipment", {}).get("bales") and + # if (structured_data.get("shipment", {}).get("bales") and # structured_data.get("weights", {}).get("net_landed_kg")): # logger.info("Successfully parsed structured data from pdfplumber") # return { @@ -683,13 +683,13 @@ async def ocr(file: UploadFile): # "structured_data": structured_data, # "raw_text_sample": combined_text[:500] # } - + except Exception as e: logger.warning(f"pdfplumber attempt: {e}") - + # from pdf2image import convert_from_bytes # images = convert_from_bytes(file_data, dpi=200) - + # ocr_results = [] # for img in images: # text = pytesseract.image_to_string( @@ -697,15 +697,15 @@ async def ocr(file: UploadFile): # config='--psm 6 -c preserve_interword_spaces=1' # ) # ocr_results.append(text) - + # ocr_text = "\n".join(ocr_results) - + # return { # "method": "tesseract_ocr", # "structured_data": ocr_text, # "raw_text_sample": ocr_text[:500] # } - + except Exception as e: logger.error(f"Smart OCR failed: {e}", exc_info=True) return { @@ -928,10 +928,10 @@ def detect_template(text): if "robertson international" in t or "ri ref no" in t: return "ROBERTSON" - + if "landing report" in t and "carcon cargo" in t: return "CARGO CONTROL" - + if "pacific inspection company" in t or "picl-bd.com" in t: return "PICL"