09.01.26
This commit is contained in:
253
app.py
253
app.py
@@ -23,6 +23,176 @@ file_handler.setFormatter(logging.Formatter(
|
||||
"%(asctime)s - %(levelname)s - %(name)s - %(message)s"
|
||||
))
|
||||
|
||||
class AHKParser:
    """Parser for reports that `detect_template` classifies as "AHK"."""

    lab = "AHK"

    def parse(self, text):
        """Turn raw AHK report text into a nested dict.

        The result has three groups: ``report``, ``shipment`` and
        ``weights``. Text fields are whatever `extract` / `safe_search`
        return (``None`` when the label is not found); numeric fields go
        through `to_float`, which yields ``None`` when conversion fails.
        """
        # The weight figures are read from separate headed blocks so the
        # generic "Net" label resolves to the right figure in each one.
        invoice_part = section(text, "INVOICE WEIGHTS", "Bales Weighed")
        landed_part = section(text, "Bales Weighed", "Outturn")
        loss_part = section(text, "LOSS", "Invoice average")

        report_info = {
            "lab": "AHK",
            "reference": safe_search(
                r"(AHK\s*/\S+)", text, default=None, context="AHK reference"
            ),
            "date": extract("Produced On", text),
        }

        shipment_info = {
            "bales": to_float(extract("Total Bales", text)),
            "vessel": extract("Vessel", text),
            "bl": extract("B/L No", text),
            "arrival_date": extract("Arrival Date", text),
        }

        weight_info = {
            "invoice_kg": to_float(extract("Net", invoice_part)),
            "landed_kg": to_float(extract("Net", landed_part)),
            "gain_loss_kg": to_float(extract("kg", loss_part)),
            "gain_loss_percent": to_float(extract("Percentage", loss_part)),
        }

        return {
            "report": report_info,
            "shipment": shipment_info,
            "weights": weight_info,
        }
|
||||
|
||||
class IntertekParser:
    """Parser for reports that `detect_template` classifies as "INTERTEK"."""

    lab = "INTERTEK"

    def parse(self, text):
        """Turn raw Intertek report text into a nested dict.

        Returns a dict with ``report``, ``shipment`` and ``weights``
        groups. Missing labels give ``None``; numeric fields are passed
        through `to_float`.
        """
        # The gain/loss percentage is taken from the first number that is
        # followed by a '%' sign anywhere in the text.
        percent_match = re.search(r"([0-9.]+)\s*%", text)
        if percent_match:
            percent = percent_match.group(1)
        else:
            percent = None

        report_info = {
            "lab": "INTERTEK",
            "reference": extract("Global Ref", text),
            "date": extract("Dated", text),
        }

        shipment_info = {
            "bales": to_float(extract("Invoice Quantity", text)),
            "vessel": extract("Vessel", text),
            "bl": extract("B/L No", text),
            "arrival_date": extract("Arrival Date", text),
        }

        weight_info = {
            "invoice_kg": to_float(extract("Invoice Weight", text)),
            "landed_kg": to_float(extract("Landed Weight", text)),
            "gain_loss_kg": to_float(extract("Gain", text)),
            "gain_loss_percent": to_float(percent),
        }

        return {
            "report": report_info,
            "shipment": shipment_info,
            "weights": weight_info,
        }
|
||||
|
||||
class RobertsonParser:
    """Parser for reports that `detect_template` classifies as "ROBERTSON"."""

    lab = "ROBERTSON"

    def parse(self, text):
        """Turn raw Robertson report text into a nested dict.

        Returns a dict with ``report``, ``shipment`` and ``weights``
        groups. Missing labels give ``None``; numeric fields are passed
        through `to_float`.
        """
        # First "<number> %" occurrence in the text is used as the
        # gain/loss percentage.
        found = re.search(r"([0-9.]+)\s*%", text)
        percent = None if not found else found.group(1)

        result = {}

        result["report"] = {
            "lab": "ROBERTSON",
            "reference": extract("RI REF NO.", text),
            "date": extract("DATED", text),
        }

        result["shipment"] = {
            "bales": to_float(extract("QUANTITY", text)),
            "vessel": extract("VESSEL", text),
            "bl": extract("B/L NO.", text),
            "arrival_date": extract("ARRIVAL DATE", text),
        }

        result["weights"] = {
            "invoice_kg": to_float(extract("NET INVOICE WEIGHT", text)),
            "landed_kg": to_float(extract("NET LANDED WEIGHT", text)),
            "gain_loss_kg": to_float(extract("LOSS", text)),
            "gain_loss_percent": to_float(percent),
        }

        return result
|
||||
|
||||
class SGSParser:
    """Parser for reports that `detect_template` classifies as "SGS"."""

    lab = "SGS"

    def parse(self, text):
        """Turn raw SGS landing-report text into a nested dict.

        The result has ``report``, ``contract``, ``parties``,
        ``shipment`` and ``weights`` groups. Missing labels give
        ``None``; numeric fields are passed through `to_float`.
        """
        report_info = {
            "lab": "SGS",
            "reference": extract("LANDING REPORT No", text),
            "file_no": extract("FILE NO.", text),
            "date": extract("DATE", text),
        }

        contract_info = {
            "contract_no": extract("CONTRACT NO.", text),
            "invoice_no": extract("INVOICE NO.", text),
            "origin": extract("ORIGIN", text),
            "product": extract("PRODUCT", text),
        }

        party_info = {
            "seller": extract("Seller", text),
            "buyer": extract("Buyer", text),
            "carrier": extract("Carrier", text),
        }

        shipment_info = {
            "bl": extract("B/L no.", text),
            "port_loading": extract("Port of loading", text),
            "port_destination": extract("Port of destination", text),
            "arrival_date": extract("Vessel arrival date", text),
            "devanning_date": extract("Container devanning date", text),
            "weighing_date": extract("Weighing date", text),
            "weighing_mode": extract("Weighing mode", text),
            "quantity_bales": to_float(extract("Quantity arrived", text)),
        }

        # Gain figures are pulled with dedicated patterns rather than a
        # plain label lookup: one anchored on the '%' sign, one on 'kgs'.
        gain_percent_raw = safe_search(r"Gain\s*\+?\s*([0-9.,]+)\s*%", text)
        gain_kg_raw = safe_search(r"Gain.*?([0-9.,]+)\s*kgs", text)

        weight_info = {
            "gross_landed_kg": to_float(extract("Gross landed", text)),
            "tare_kg": to_float(extract("Tare", text)),
            "net_landed_kg": to_float(extract("Net landed", text)),
            "net_invoiced_kg": to_float(extract("Net invoiced", text)),
            "gain_percent": to_float(gain_percent_raw),
            "gain_kg": to_float(gain_kg_raw),
        }

        return {
            "report": report_info,
            "contract": contract_info,
            "parties": party_info,
            "shipment": shipment_info,
            "weights": weight_info,
        }
|
||||
|
||||
class PICLParser:
    """Parser for reports that `detect_template` classifies as "PICL"."""

    lab = "PICL"

    # Weekday name, optional comma, then a "Month D, YYYY" date captured as
    # group 1. The weekday names are wrapped in a non-capturing group so
    # the date capture applies to every weekday; without the grouping, '|'
    # binds looser than concatenation and only the last alternative
    # ("Sunday") would carry the capture group.
    _DATE_PATTERN = (
        r"(?:Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sunday),?\s*"
        r"([A-Za-z]+\s+[0-9]{1,2},\s*[0-9]{4})"
    )

    def parse(self, text):
        """Turn raw PICL report text into a nested dict.

        The result has ``report``, ``parties``, ``shipment``,
        ``contract`` and ``weights`` groups. Missing labels give
        ``None``; numeric fields are passed through `to_float`.
        """
        # NOTE(review): labels such as "Date of Anival & LDL",
        # "Invoice ilo & Date", "Total;" and the "o/o" percent marker look
        # like OCR misreadings kept on purpose to match scanned text —
        # confirm against real samples before "correcting" them.
        return {
            "report": {
                "lab": "PICL",
                "reference": safe_search(r"No[:\s]+([A-Z0-9\-]+)", text),
                "date": safe_search(self._DATE_PATTERN, text),
            },
            "parties": {
                "seller": extract("FAIRCOT SA", text),
                "buyer": extract("M/S.", text),
            },
            "shipment": {
                # NOTE(review): "bales" and "net_landed_kg" both read the
                # "Grand Total" label and therefore get the same value —
                # verify this is intended.
                "bales": to_float(extract("Grand Total", text)),
                "vessel": extract("Shipped Per Vessel", text),
                "feeder": extract("Feeder", text),
                "port_loading": extract("Port of Loading", text),
                "port_discharge": extract("Port of Discharge", text),
                "arrival_date": extract("Date of Anival & LDL", text),
                "weighing_place": extract("Place & Date of Weighment", text),
            },
            "contract": {
                "contract_no": extract("Contract/Pl No & Date", text),
                "invoice_no": extract("Invoice ilo & Date", text),
                "bl": extract("B/L No & Date", text),
                "origin": extract("Country of Origin", text),
                "commodity": extract("Commodity", text),
            },
            "weights": {
                "gross_landed_kg": to_float(extract("Total;", text)),
                "tare_kg": to_float(extract("Tare Weight", text)),
                "net_landed_kg": to_float(extract("Grand Total", text)),
                "invoice_weight_kg": to_float(extract("Invoice weight", text)),
                "loss_kg": to_float(safe_search(r"(-[0-9.,]+)\s*KGS", text)),
                "loss_percent": to_float(
                    safe_search(r"\(\s*([0-9.,]+)\s*o/o\s*\)", text)
                ),
            },
        }
|
||||
|
||||
# Configure root logger explicitly
|
||||
root = logging.getLogger()
|
||||
root.setLevel(logging.INFO)
|
||||
@@ -138,6 +308,17 @@ def safe_search(pattern, text, default=None, group_index=1, context=""):
|
||||
logger.warning("Group index %d not found for %s: %s", group_index, context, pattern)
|
||||
return default
|
||||
|
||||
def to_float(s):
    """Convert a numeric string with optional units to ``float``.

    Thousands separators (``,``), weight units (``kg``/``kgs``,
    ``lb``/``lbs`` in any letter case) and ``%`` are removed before
    conversion.

    Args:
        s: Text such as ``"1,234.5 Kgs"`` or ``"0.45 %"``; ``None`` and
            other falsy values are accepted.

    Returns:
        The parsed float, or ``None`` when ``s`` is falsy or what remains
        after stripping units is not a valid number.

    >>> to_float("1,234 kgs")
    1234.0
    >>> to_float("n/a") is None
    True
    """
    if not s:
        return None

    if not isinstance(s, str):
        # Already-numeric input: convert directly instead of failing on
        # the string methods below.
        try:
            return float(s)
        except (TypeError, ValueError):
            return None

    cleaned = s.replace(",", "")
    # One case-insensitive pass so "kgs", "KGS", "Kg" etc. are removed
    # whole; chained case-sensitive replaces left a stray "s" behind for
    # "kgs" and ignored upper-case units entirely.
    cleaned = re.sub(r"kgs?|lbs?|%", "", cleaned, flags=re.IGNORECASE)
    cleaned = cleaned.strip()

    try:
        return float(cleaned)
    except ValueError:
        return None
|
||||
|
||||
def section(text, start, end=None):
|
||||
"""Extract a block of text between two headings, safely."""
|
||||
pattern_start = re.escape(start)
|
||||
@@ -157,6 +338,29 @@ def extract_field(text, label, default=None):
|
||||
pattern = rf"{re.escape(label)}\s*:?[\s]+([^\n]+)"
|
||||
return safe_search(pattern, text, default=default, context=f"field '{label}'")
|
||||
|
||||
def extract(label, text, default=None):
    """Return the rest of the line that follows ``label`` in ``text``.

    The label is matched literally and case-insensitively; the first
    occurrence in ``text`` wins. Supported layouts:

        Label: Value
        Label Value
        Label - Value
        Label .... Value

    A ``-`` that is preceded by whitespace and directly followed by a
    digit is treated as a minus sign, not as a separator, so
    ``"Gain -123"`` yields ``"-123"``.

    Args:
        label: Literal label text to look for.
        text: Text to search; falsy values return ``default``.
        default: Value returned when nothing is found.

    Returns:
        The stripped value string, or ``default``.
    """
    if not text:
        return default

    # Optional separator after the label:
    #   ':'            -> "Label: Value"
    #   '\.{2,}'       -> dot leaders, "Label .... Value"
    #   '(?<!\s)-'     -> dash glued to the label, "Label-Value"
    #   '-(?!\.?\d)'   -> spaced dash not starting a number, "Label - Value"
    # A single pattern suffices: the plain "Label Value" layout is the
    # case where the optional separator is absent.
    pattern = (
        rf"{re.escape(label)}\s*"
        rf"(?::|\.{{2,}}|(?<!\s)-|-(?!\.?\d))?"
        rf"\s*([^\n\r]+)"
    )

    found = re.search(pattern, text, re.I)
    if found:
        return found.group(1).strip()

    return default
|
||||
|
||||
def extract_report_metadata(text):
|
||||
logger.info("Starting metadata extraction, text length=%d", len(text))
|
||||
|
||||
@@ -254,6 +458,55 @@ def extract_report_metadata(text):
|
||||
logger.exception("Unexpected error during metadata extraction")
|
||||
raise HTTPException(status_code=500, detail=f"Metadata extraction failed: {e}")
|
||||
|
||||
def detect_template(text):
    """Identify which report template ``text`` belongs to.

    Matching is case-insensitive substring search. Rules are tried in
    order and the first hit wins; ``"UNKNOWN"`` is returned when none
    applies.
    """
    lowered = text.lower()

    # (template name, alternatives); an alternative is a group of phrases
    # that must ALL be present, and any one alternative is sufficient.
    rules = (
        ("AHK", (("alfred h. knight", "cotton landing report"),)),
        ("INTERTEK", (("intertek", "landing report"),)),
        ("ROBERTSON", (("robertson international",), ("ri ref no",))),
        ("SGS", (("landing report", "carcon cargo"),)),
        ("PICL", (("pacific inspection company",), ("picl-bd.com",))),
    )

    for name, alternatives in rules:
        for phrases in alternatives:
            if all(phrase in lowered for phrase in phrases):
                return name

    return "UNKNOWN"
|
||||
|
||||
@app.post("/metadata")
async def metadata(text: str = Body(..., embed=True)):
    # POST /metadata: the body carries a required, embedded "text" field;
    # the response is whatever extract_report_metadata produces for it.
    extracted = extract_report_metadata(text)
    return extracted
|
||||
|
||||
@app.post("/parse")
async def parse_endpoint(text: str = Body(..., embed=True)):
    # POST /parse: the body carries a required, embedded "text" field;
    # template detection and parsing are delegated to parse_report.
    parsed = parse_report(text)
    return parsed
|
||||
|
||||
# Template name -> parser instance. Each parser's ``lab`` attribute is the
# same string that detect_template returns for its template, so it serves
# as the registry key.
PARSERS = {
    parser.lab: parser
    for parser in (
        AHKParser(),
        IntertekParser(),
        RobertsonParser(),
        SGSParser(),
        PICLParser(),
    )
}
|
||||
|
||||
def parse_report(text):
    """Detect the report template and parse ``text`` with its parser.

    Args:
        text: Raw report text.

    Returns:
        ``{"template": <name>, "data": <parser output>}`` when a parser
        is registered for the detected template; otherwise
        ``{"template": "UNKNOWN", "raw_text": <first 5000 chars>}``.
    """
    template = detect_template(text)
    # Lazy %-style arguments, matching the other logging calls in this
    # file; the emitted message is unchanged.
    logger.info("Detected template: %s", template)

    parser = PARSERS.get(template)
    if parser is None:
        # Return a bounded slice of the input so the caller can inspect
        # what could not be classified.
        return {
            "template": "UNKNOWN",
            "raw_text": text[:5000],
        }

    return {
        "template": template,
        "data": parser.parse(text),
    }
|
||||
|
||||
Reference in New Issue
Block a user