class AHKParser:
    """Regex-based parser for AHK weight-report text.

    Each field is pulled out of the raw report text with a single regular
    expression; fields whose pattern does not match are left as ``None``.
    """

    lab = "AHK"

    def g(self, pat, txt):
        """Return the stripped first capture group of *pat* in *txt*.

        The search is case-insensitive and ``.`` also matches newlines
        (``re.I | re.S``), so a pattern may span several lines.

        Non-breaking spaces (``\\xa0``) are converted to ordinary spaces
        before searching.  Without this, patterns containing a literal
        space (in a label such as ``Total Bales`` or in a character class
        such as ``[A-Z0-9 ]+``) either fail to match or truncate the
        captured value at the first non-breaking space.

        Args:
            pat: Regular expression with at least one capture group.
            txt: Text to search; may be ``None`` or empty.

        Returns:
            The stripped text of group 1, or ``None`` when *txt* is empty
            or the pattern does not match.
        """
        if not txt:
            return None
        m = re.search(pat, txt.replace("\xa0", " "), re.I | re.S)
        return m.group(1).strip() if m else None

    def parse(self, text):
        """Extract the AHK report fields from *text*.

        Args:
            text: Full text of the report.

        Returns:
            The dict produced by ``empty_weight_report("AHK")`` with its
            ``report``, ``contract``, ``parties``, ``shipment`` and
            ``weights`` sections filled in.  Numeric fields are passed
            through ``to_float`` (defined elsewhere in this file;
            presumably tolerant of ``None`` - confirm).
        """
        r = empty_weight_report("AHK")

        # ---------- report ----------
        r["report"]["reference"] = self.g(r"(AHK\s*/[A-Z0-9/]+)", text)
        r["report"]["date"] = self.g(
            r"Produced On\s*([0-9]{1,2}\s+[A-Za-z]+\s+20\d{2})", text
        )

        # ---------- contract ----------
        r["contract"]["invoice_no"] = self.g(
            r"Client Reference:\s*([A-Z0-9\- /]+)", text
        )
        r["contract"]["commodity"] = "Raw Cotton"
        # The value runs (possibly across lines) up to the next label.
        r["contract"]["origin"] = self.g(
            r"Growth\s*:\s*([A-Z ].+?)(?:Arrival Date|First)", text
        )

        # ---------- parties ----------
        r["parties"]["buyer"] = self.g(r"Buyer\s*:\s*([A-Z0-9 ().,-]+)", text)

        # ---------- shipment ----------
        r["shipment"]["bales"] = to_float(
            self.g(r"Total Bales\s*:\s*(\d+)", text)
        )
        r["shipment"]["vessel"] = self.g(r"Vessel\s*:\s*([A-Z0-9 ]+)", text)
        r["shipment"]["bl_no"] = self.g(r"B/L No\.\s*:\s*([A-Z0-9]+)", text)
        r["shipment"]["port_destination"] = self.g(
            r"Destination\s*:\s*([A-Z ,]+)", text
        )
        r["shipment"]["arrival_date"] = self.g(
            r"Arrival Date\s*:\s*([0-9A-Za-z-]+)", text
        )
        r["shipment"]["weighing_method"] = self.g(
            r"Weighing method\s*:\s*([A-Za-z ]+)", text
        )

        # ---------- invoice weights ----------
        inv = self.g(r"INVOICE WEIGHTS.*?Net\s*:\s*([\d.]+)\s*kg", text)
        r["weights"]["invoice_net_kg"] = to_float(inv)

        # ---------- landed weights ----------
        # NOTE(review): these searches are anchored only on the section
        # start.  If a field is missing from the landed section, the lazy
        # ``.*?`` keeps scanning and will pick up the same label from a
        # later section.  Consider bounding the search to the section.
        land = self.g(r"Bales Weighed.*?Net\s*:\s*([\d.]+)\s*kg", text)
        r["weights"]["net_landed_kg"] = to_float(land)

        r["weights"]["gross_landed_kg"] = to_float(
            self.g(r"Bales Weighed.*?Gross\s*:\s*([\d.]+)\s*kg", text)
        )
        r["weights"]["tare_kg"] = to_float(
            self.g(r"Bales Weighed.*?Tare\s*:\s*([\d.]+)\s*kg", text)
        )

        # ---------- loss ----------
        # NOTE(review): both patterns search the whole text; the first
        # "<number> kg" after any case-insensitive "loss" wins, and the
        # first "Percentage :" anywhere in the report is used.
        r["weights"]["gain_loss_kg"] = to_float(
            self.g(r"LOSS.*?(-?\d+\.?\d*)\s*kg", text)
        )
        r["weights"]["gain_loss_percent"] = to_float(
            self.g(r"Percentage\s*:\s*(-?\d+\.?\d*)", text)
        )

        return r