diff --git a/app.py b/app.py index 7ced0cb..01565be 100644 --- a/app.py +++ b/app.py @@ -59,31 +59,33 @@ class AHKParser: lab = "AHK" # ---------- helpers ---------- - def clean(self, t): - return " ".join(t.replace("\xa0", " ").split()) + def norm(self, s): + return s.replace("\xa0", " ").strip() def find(self, pattern, text): m = re.search(pattern, text, re.I) - return self.clean(m.group(1)) if m else None + return self.norm(m.group(1)) if m else None + + def lines(self, text): + return [self.norm(l) for l in text.splitlines() if self.norm(l)] def block(self, text, labels): - lines = [self.clean(l) for l in text.splitlines() if self.clean(l)] - idx = [i for i,l in enumerate(lines) if l in labels] + L = self.lines(text) + idx = [i for i,l in enumerate(L) if l in labels] values = [] for i in range(len(idx)): start = idx[i] - end = idx[i+1] if i+1 < len(idx) else len(lines) + end = idx[i+1] if i+1 < len(idx) else len(L) for j in range(start, end): - if lines[j].startswith(":"): - values.append(lines[j].lstrip(":").strip()) + if L[j].startswith(":"): + values.append(L[j][1:].strip()) break return dict(zip(labels, values)) # ---------- parser ---------- def parse(self, text): r = empty_weight_report("AHK") - text = self.clean(text) # ---------- report ---------- r["report"]["reference"] = self.find(r"(AHK\s*/\S+)", text) @@ -94,10 +96,9 @@ class AHKParser: r["contract"]["commodity"] = "Raw Cotton" # ---------- parties ---------- - r["parties"]["buyer"] = self.find(r"Buyer\s*:\s*(.+)", text) - r["parties"]["seller"] = self.find(r"Client\s*Ref No\.\s*:\s*(.+)", text) + r["parties"]["buyer"] = self.find(r"Buyer\s*:\s*(.+)", text) - # ---------- shipment block ---------- + # ---------- shipment ---------- ship = self.block(text, [ "Total Bales","Vessel","Voy. No.","B/L No.","B/L Date","Destination", "Growth","Arrival Date","First date of weighing","Last Date of Weighing",