From 3db75bbc8197b2e093e19ed7a7679dbc0f4a846a Mon Sep 17 00:00:00 2001 From: Balaji S Date: Wed, 3 Dec 2025 06:36:55 +0530 Subject: [PATCH] fix: handle first transaction when description merges with Opening Unit Balance Some PDFs have their first transaction's description text extracted on the same line as "Opening Unit Balance" due to PDF internal structure differences. This caused the first transaction to be silently dropped since the parser couldn't match a line with date/amounts but no description. Example of problematic extraction: Line 1: 'NFO Purchase Opening Unit Balance: 0.000' <- description merged Line 2: '28-Apr-2020 2,000.00 200.000 10.0000 200.000' <- no description Fix: - Modified OPEN_UNITS_RE to capture optional description before "Opening Unit Balance" - Added TRANSACTION_RE5 to match transaction lines without description - Store pending description and apply it to the next transaction line Fixes orphan stamp duty issue where the corresponding purchase was missing. --- casparser/process/cas_detailed.py | 20 +++++++++++++++++--- casparser/process/regex.py | 5 ++++- 2 files changed, 21 insertions(+), 4 deletions(-) diff --git a/casparser/process/cas_detailed.py b/casparser/process/cas_detailed.py index e496a97..98a9ac9 100644 --- a/casparser/process/cas_detailed.py +++ b/casparser/process/cas_detailed.py @@ -35,6 +35,7 @@ TRANSACTION_RE2, TRANSACTION_RE3, TRANSACTION_RE4, + TRANSACTION_RE5, VALUATION_RE, ) from .utils import isin_search @@ -128,7 +129,7 @@ def get_parsed_scheme_name(scheme) -> str: return re.sub(r"[^a-zA-Z0-9_)]+$", "", scheme).strip() -def parse_transaction(line) -> Optional[ParsedTransaction]: +def parse_transaction(line, pending_description: Optional[str] = None) -> Optional[ParsedTransaction]: for regex in (TRANSACTION_RE1, TRANSACTION_RE2, TRANSACTION_RE3, TRANSACTION_RE4): if m := re.search(regex, line, re.DOTALL | re.MULTILINE | re.I): groups = m.groups() @@ -149,6 +150,12 @@ def parse_transaction(line) -> Optional[ParsedTransaction]: if date is not None: return ParsedTransaction(date, description, amount, units, nav, balance) + # Try TRANSACTION_RE5 for first transaction without description + # (description is on previous line with "Opening Unit Balance") + if pending_description and (m := re.search(TRANSACTION_RE5, line, re.DOTALL | re.MULTILINE | re.I)): + date, amount, units, nav, balance = m.groups() + return ParsedTransaction(date, pending_description, amount, units, nav, balance) + def process_detailed_text(text): """ @@ -163,6 +170,7 @@ def process_detailed_text(text): current_folio = None current_amc = None curr_scheme_data: Optional[Scheme] = None + pending_first_txn_desc: Optional[str] = None # For first transaction whose description is on Opening Unit Balance line lines = text.split("\u2029") for idx, line in enumerate(lines): # Parse schemes with long names (single line) effectively pushing @@ -237,7 +245,11 @@ def process_detailed_text(text): if m := re.search(NOMINEE_RE, line, re.I | re.DOTALL): curr_scheme_data.nominees.extend([x.strip() for x in m.groups() if x.strip()]) if m := re.search(OPEN_UNITS_RE, line): - curr_scheme_data.open = Decimal(m.group(1).replace(",", "_")) + # group(1) is optional description before "Opening Unit Balance" (for first transaction) + # group(2) is the opening unit balance value + if m.group(1): + pending_first_txn_desc = m.group(1).strip() + curr_scheme_data.open = Decimal(m.group(2).replace(",", "_")) curr_scheme_data.close_calculated = curr_scheme_data.open continue if m := re.search(CLOSE_UNITS_RE, line): @@ -255,7 +267,7 @@ def process_detailed_text(text): if m := re.search(DESCRIPTION_TAIL_RE, line): description_tail = m.group(1).strip() line = line.replace(m.group(1), "") - if parsed_txn := parse_transaction(line): + if parsed_txn := parse_transaction(line, pending_first_txn_desc): date = date_parser.parse(parsed_txn.date).date() desc = parsed_txn.description.strip() if description_tail != "": @@ -279,6 +291,8 @@ def process_detailed_text(text): dividend_rate=dividend_rate, ) ) + # Clear pending description after first transaction is parsed + pending_first_txn_desc = None if curr_scheme_data: folios[current_folio].schemes.append(curr_scheme_data) return ProcessedCASData( diff --git a/casparser/process/regex.py b/casparser/process/regex.py index 373611c..4528699 100644 --- a/casparser/process/regex.py +++ b/casparser/process/regex.py @@ -29,7 +29,8 @@ SCHEME_KV_RE = r"""(\w+)\s*:\s*([-\w]+)""" REGISTRAR_RE = r"^\s*Registrar\s*:\s*(.*)\s*$" -OPEN_UNITS_RE = r"Opening\s+Unit\s+Balance.+?([\d,.]+)" +# Captures optional description before "Opening Unit Balance" (for first transaction) +OPEN_UNITS_RE = r"(?:(.+?)\t\t)?Opening\s+Unit\s+Balance.+?([\d,.]+)" CLOSE_UNITS_RE = r"Closing\s+Unit\s+Balance.+?([\d,.]+)" COST_RE = r"Total\s+Cost\s+Value\s*:.+?[INR\s]*([\d,.]+)" VALUATION_RE = ( @@ -45,6 +46,8 @@ TRANSACTION_RE3 = rf"{date_re}\t\t([^0-9].*)\t\t{amt_re}\t\t{amt_re}(?:\t\t{amt_re}\t\t{amt_re})*" # Tax transactions TRANSACTION_RE4 = rf"{date_re}\t\t([^0-9].*)\t\t{amt_re}(?:\t\t{amt_re}\t\t{amt_re}\t\t{amt_re})*" +# First transaction without description (description is on previous line with Opening Unit Balance) +TRANSACTION_RE5 = rf"{date_re}\t\t{amt_re}\t\t{amt_re}\t\t{amt_re}\t\t{amt_re}" DESCRIPTION_TAIL_RE = r"(\n.+?)(\t\t|$)" DIVIDEND_RE = r"(?:div\.|dividend|idcw).+?(reinvest)*.*?@\s*Rs\.\s*([\d\.]+)(?:\s+per\s+unit)?" SCHEME_TAIL_RE = r"(\n.+?)(?:\t\t|$)"