import re
from typing import Dict
from typing import List
from typing import Optional


class AHKParser:
    """Parse the plain text of an AHK weight report into a nested dict.

    The parser expects "grouped" tables: one line carrying several column
    headers, followed by the values, each introduced by a ``:``
    (for example ``: 100 : MV TEST : 12E``).
    """

    lab = "AHK"

    # One ':'-introduced value; several may share a line (": A : B : C").
    _COLON_VALUE_RE = re.compile(r":\s*([^:]+?)(?=\s*(?::|$))")
    # Weight unit, matched case-insensitively ("kg", "KG", "Kg").
    _KG_RE = re.compile(r"kg", re.I)

    # ---------- Helpers ----------
    def _norm(self, text: str) -> str:
        """Return *text* with spacing artefacts normalised.

        Non-breaking spaces (character or ``&nbsp;`` entity), ``**`` markers
        and tabs become plain spaces; runs of spaces are collapsed. Newlines
        are preserved because the table extraction is line based.
        """
        if not text:
            return ""
        cleaned = (text.replace("\u00a0", " ")
                       .replace("&nbsp;", " ")
                       .replace("**", " ")
                       .replace("\t", " "))
        cleaned = re.sub(r" {2,}", " ", cleaned)
        return cleaned.strip()

    def _safe_search(self, pat: str, text: str, flags=0) -> Optional[str]:
        """Return the stripped first capture group of *pat* in *text*, or None."""
        m = re.search(pat, text, flags)
        return m.group(1).strip() if m else None

    def _to_float(self, s: Optional[str]) -> Optional[float]:
        """Convert a numeric string such as ``"- 1,234.50 kg"`` to a float.

        Thousands separators, a ``kg`` unit (any case) and ``%`` are removed.
        Returns None for empty or non-numeric input.
        """
        if not s:
            return None
        cleaned = self._KG_RE.sub("", s.replace(",", "")).replace("%", "").strip()
        # Drop whitespace between the sign and the digits ("- 12" -> "-12").
        cleaned = re.sub(r"^([+\-])\s+", r"\1", cleaned)
        try:
            return float(cleaned)
        except ValueError:
            return None

    def _split_lines(self, text: str) -> List[str]:
        """Split *text* into stripped, non-empty lines (\\n, \\r\\n or \\r)."""
        return [
            line.strip()
            for line in re.split(r"\r\n|\r|\n", text)
            if line.strip()
        ]

    def _take_next_colon_values(self, lines: List[str], start_idx: int,
                                count: int) -> List[str]:
        """Collect up to *count* values that follow a ``:`` after *start_idx*.

        Scanning starts on the line after *start_idx* and continues until
        enough values are found or the lines run out. Several values on one
        line are supported, e.g. ``: A : B : C``.
        """
        vals: List[str] = []
        j = start_idx + 1
        while j < len(lines) and len(vals) < count:
            for found in self._COLON_VALUE_RE.findall(lines[j]):
                if len(vals) >= count:
                    break
                vals.append(found.strip())
            j += 1
        return vals

    def _extract_group_by_headers(
        self,
        text: str,
        headers: List[str],
        anchor_regex: Optional[str] = None,
    ) -> Dict[str, Optional[str]]:
        """Map *headers* to the ``:`` values that follow their table row.

        When *anchor_regex* is given, the first line matching it
        (case-insensitively) is the starting point. Otherwise, or when the
        anchor is not found, the first line containing all headers in order
        is used. Returns ``{}`` when no starting line exists; headers without
        a value map to None.
        """
        if not headers:
            return {}
        lines = self._split_lines(self._norm(text))

        start_idx = None
        if anchor_regex:
            # The anchor must win over an earlier header row: a report can
            # contain the same header row for a different block first.
            anchor = re.compile(anchor_regex, re.I)
            start_idx = next(
                (i for i, line in enumerate(lines) if anchor.search(line)),
                None,
            )
        if start_idx is None:
            # Lookarounds instead of \b so headers ending in punctuation
            # ("Voy. No.") still match at the end of a line.
            hdr = re.compile(
                r"(?<!\w)"
                + r"\s+".join(re.escape(h) for h in headers)
                + r"(?!\w)"
            )
            start_idx = next(
                (i for i, line in enumerate(lines) if hdr.search(line)),
                None,
            )
        if start_idx is None:
            return {}

        values = self._take_next_colon_values(lines, start_idx, len(headers))
        return {
            h: (values[idx] if idx < len(values) else None)
            for idx, h in enumerate(headers)
        }

    # ---------- Public extraction API ----------
    def extract_table(self, text: str,
                      headers: List[str]) -> Dict[str, Optional[str]]:
        """Return the values of the grouped table whose row lists *headers*."""
        return self._extract_group_by_headers(text, headers)

    def extract_weights(self, text: str,
                        anchor: Optional[str] = None) -> Dict[str, Optional[str]]:
        """Extract a Bales/Gross/Tare/Net block with the ``kg`` unit removed.

        If *anchor* (a regex such as ``r"\\bBales Weighed\\b"``) is given, the
        block following that anchor is read; otherwise the first
        ``Bales Gross Tare Net`` header row is used.
        """
        headers = ["Bales", "Gross", "Tare", "Net"]
        block = self._extract_group_by_headers(text, headers,
                                               anchor_regex=anchor)
        return {
            key: None if val is None else self._KG_RE.sub("", val).strip()
            for key, val in block.items()
        }

    # ---------- Main parse ----------
    def parse(self, text: str) -> dict:
        """Parse a full report and return its fields.

        The result has the sections ``report``, ``contract``, ``parties``,
        ``shipment`` and ``weights``; fields that cannot be found are None.
        """
        r = {
            "report": {},
            "contract": {},
            "parties": {},
            "shipment": {},
            "weights": {},
        }

        T = self._norm(text)

        # Report reference, e.g. "AHK S/790329/161112/PK". Prefer a
        # slash-separated reference so a plain word after "AHK" is not
        # mistaken for it; fall back to the permissive pattern.
        r["report"]["reference"] = (
            self._safe_search(r"\b(AHK\s*/?\s*[A-Z0-9]+(?:/[A-Z0-9]+)+)", T)
            or self._safe_search(r"(AHK\s+[A-Z0-9/]+)", T)
        )
        r["report"]["date"] = self._safe_search(
            r"Produced On\s*([0-9A-Za-z ]+)", T)

        # Order details: header row "Client Client Ref No. Buyer", then values.
        order = self.extract_table(T, ["Client", "Client Ref No.", "Buyer"])
        r["contract"]["invoice_no"] = (
            order.get("Client Ref No.")
            or self._safe_search(r"Client Reference:\s*([A-Z0-9\- /]+)", T)
        )
        # The previous version of this parser always set the commodity.
        r["contract"]["commodity"] = "Raw Cotton"
        r["parties"]["client"] = order.get("Client")
        r["parties"]["buyer"] = (
            order.get("Buyer")
            or self._safe_search(r"Buyer:\s*([A-Z0-9 ().,-]+)", T)
        )

        # Shipment information (two grouped tables).
        ship = self.extract_table(T, [
            "Total Bales", "Vessel", "Voy. No.",
            "B/L No.", "B/L Date", "Destination",
        ])
        ship2 = self.extract_table(T, [
            "Growth", "Arrival Date", "First date of weighing",
            "Last Date of Weighing", "Weighing method", "Tare",
        ])

        r["shipment"]["bales"] = self._to_float(ship.get("Total Bales"))
        r["shipment"]["vessel"] = ship.get("Vessel")
        r["shipment"]["voyage_no"] = ship.get("Voy. No.")
        r["shipment"]["bl_no"] = ship.get("B/L No.")
        r["shipment"]["bl_date"] = ship.get("B/L Date")
        r["shipment"]["port_destination"] = ship.get("Destination")

        r["contract"]["origin"] = ship2.get("Growth")
        r["shipment"]["arrival_date"] = ship2.get("Arrival Date")
        r["shipment"]["first_weighing_date"] = ship2.get("First date of weighing")
        r["shipment"]["last_weighing_date"] = ship2.get("Last Date of Weighing")
        r["shipment"]["weighing_method"] = ship2.get("Weighing method")
        # In this table "Tare" (e.g. "Invoice") is the tare basis, not a weight.
        r["shipment"]["tare_basis"] = ship2.get("Tare")

        # Weights: the invoice block is the first "Bales Gross Tare Net" row,
        # the landed block is the one anchored on "Bales Weighed".
        inv = self.extract_weights(T)
        land = self.extract_weights(T, anchor=r"\bBales Weighed\b")

        r["weights"]["invoice_bales"] = self._to_float(inv.get("Bales"))
        r["weights"]["invoice_gross_kg"] = self._to_float(inv.get("Gross"))
        r["weights"]["invoice_tare_kg"] = self._to_float(inv.get("Tare"))
        r["weights"]["invoice_net_kg"] = self._to_float(inv.get("Net"))

        r["weights"]["landed_bales"] = self._to_float(land.get("Bales"))
        r["weights"]["gross_landed_kg"] = self._to_float(land.get("Gross"))
        r["weights"]["tare_kg"] = self._to_float(land.get("Tare"))
        r["weights"]["net_landed_kg"] = self._to_float(land.get("Net"))

        # Loss / outturn: first "<number> kg" after "LOSS". Commas are
        # accepted so "1,100.00 kg" is not truncated to "100.00".
        r["weights"]["gain_loss_kg"] = self._to_float(self._safe_search(
            r"LOSS.*?([\-+]?\s*\d[\d,]*\.?\d*)\s*kg", T, flags=re.S))
        r["weights"]["gain_loss_percent"] = self._to_float(self._safe_search(
            r"Percentage\s*:\s*([\-+]?\s*\d+\.?\d*)", T))

        return r