From fca1baf08823c57a4a4348c4811b176aff73f074 Mon Sep 17 00:00:00 2001 From: Sandeep Somasekharan Date: Wed, 20 May 2026 11:58:14 +1000 Subject: [PATCH 1/4] feat(parsers): atom-based pypdfium2 backend + bonds + section routing Rebuilds the parsing layer for v1.0 on top of pypdfium2 (Apache-2.0 / BSD-3) so casparser ships pure MIT end-to-end; the prior pdfminer.six + PyMuPDF dependencies are dropped along with the entire `casparser/process/` regex-tokenisation pipeline they fed. Engine (casparser/parsers/extract.py, pageobj.py) ================================================= `extract.py` walks PDF page objects (one atom per text-show op), maps glyphs to their parent atom via PDFium's `FPDFText_GetTextObject`, deduplicates same-font overlapping atoms, then emits `Char`/`Line`/`Page` shaped output that downstream parsers consume. Atom-level dedup replaces all per-character overlay heuristics: when two atoms share a font, overlap in x by >=50% of the narrower atom's width, and sit 0.05-3.0pt apart in y, we drop the one further from the row's median baseline. That handles the date-twin artefact (same date column rendered twice with a small y-offset, glyphs interleaving by x to produce garbage like `2020 -> 22002200`) without the multi-stage sub-cluster filters earlier prototypes used. `pageobj.py` exposes the atoms + their column/block grouping that the NSDL/CDSL parsers operate on directly. The same Atom primitive backs the investor extractor. Per-issuer parsers ================== - `cams_detailed.py` / `cams_summary.py` consume the Line stream for CAMS + KFin DETAILED and SUMMARY templates. - `nsdl.py` reads the page-2 account roster, walks per-account holdings sections (equities + mutual funds + corporate bonds in both summary and detailed forms). Section-aware routing handles the case where multiple holding types share the same 18-cell detailed table header by tracking `cur_section` from the preceding marker block. 
The page-2 roster accepts both the 4-cell (broker + DP/Client joined) and 5-cell (broker, then DP/Client) variants. - `cdsl.py` mirrors NSDL's structure for the CDSL CAS template. Types ===== - Adds `Bond` model with optional coupon_rate / coupon_frequency / maturity_date / face_value / market_price; required fields are isin, num_bonds, value. Surfaces on `DematAccount.bonds`. - `investor_info` is now required on `CASData` and `NSDLCASData`. Performance =========== The dispatcher opens the PDF document exactly once per `read_cas_pdf` call and threads the handle through detect / parser / investor extractor via an `_doc=` kwarg. NSDL/CDSL additionally share the extracted atoms between the holdings parser and the investor extractor. --- casparser/__init__.py | 2 +- casparser/cli.py | 6 +- casparser/parsers/__init__.py | 196 +++++-- casparser/parsers/_classify.py | 114 ++++ casparser/parsers/_investor.py | 192 ++++++ casparser/parsers/_isin.py | 40 ++ casparser/parsers/cams_detailed.py | 546 +++++++++++++++++ casparser/parsers/cams_summary.py | 436 ++++++++++++++ casparser/parsers/cdsl.py | 576 ++++++++++++++++++ casparser/parsers/detect.py | 98 ++++ casparser/parsers/extract.py | 394 +++++++++++++ casparser/parsers/mupdf.py | 317 ---------- casparser/parsers/nsdl.py | 879 ++++++++++++++++++++++++++++ casparser/parsers/pageobj.py | 435 ++++++++++++++ casparser/parsers/pdfminer.py | 263 --------- casparser/process/__init__.py | 38 -- casparser/process/cas_detailed.py | 288 --------- casparser/process/cas_summary.py | 98 ---- casparser/process/cdsl_statement.py | 301 ---------- casparser/process/nsdl_statement.py | 213 ------- casparser/process/regex.py | 77 --- casparser/process/utils.py | 25 - casparser/types.py | 60 +- 23 files changed, 3898 insertions(+), 1696 deletions(-) create mode 100644 casparser/parsers/_classify.py create mode 100644 casparser/parsers/_investor.py create mode 100644 casparser/parsers/_isin.py create mode 100644 
casparser/parsers/cams_detailed.py create mode 100644 casparser/parsers/cams_summary.py create mode 100644 casparser/parsers/cdsl.py create mode 100644 casparser/parsers/detect.py create mode 100644 casparser/parsers/extract.py delete mode 100644 casparser/parsers/mupdf.py create mode 100644 casparser/parsers/nsdl.py create mode 100644 casparser/parsers/pageobj.py delete mode 100644 casparser/parsers/pdfminer.py delete mode 100644 casparser/process/__init__.py delete mode 100644 casparser/process/cas_detailed.py delete mode 100644 casparser/process/cas_summary.py delete mode 100644 casparser/process/cdsl_statement.py delete mode 100644 casparser/process/nsdl_statement.py delete mode 100644 casparser/process/regex.py delete mode 100644 casparser/process/utils.py diff --git a/casparser/__init__.py b/casparser/__init__.py index ba4bf29..a7615e5 100644 --- a/casparser/__init__.py +++ b/casparser/__init__.py @@ -9,4 +9,4 @@ "CapitalGainsReport", ] -__version__ = "0.8.1" +__version__ = "1.0.0" diff --git a/casparser/cli.py b/casparser/cli.py index 9762422..63cce9f 100644 --- a/casparser/cli.py +++ b/casparser/cli.py @@ -37,7 +37,7 @@ def formatINR(number): else: last3 = int_part[-3:] rest = int_part[:-3] - groups = [rest[max(0, i - 2):i or None] for i in range(len(rest), 0, -2)][::-1] + groups = [rest[max(0, i - 2) : i or None] for i in range(len(rest), 0, -2)][::-1] if groups and groups[0]: r = ",".join(groups + [last3]) else: @@ -82,7 +82,7 @@ def print_nsdl(parsed_data: NSDLCASData): ) summary_table.add_row(Padding("File Type :", spacing), f"[bold]{data['file_type']}[/]") # summary_table.add_row(Padding("CAS Type :", spacing), f"[bold]{data['cas_type']}[/]") - for key, value in data["investor_info"].items(): + for key, value in (data.get("investor_info") or {}).items(): summary_table.add_row( Padding(f"{key.capitalize()} :", spacing), re.sub(r"[^\S\r\n]+", " ", value) ) @@ -208,7 +208,7 @@ def print_summary(parsed_data: CASData, output_filename=None, 
include_zero_folio summary_table.add_row(Padding("File Type :", spacing), f"[bold]{data['file_type']}[/]") summary_table.add_row(Padding("CAS Type :", spacing), f"[bold]{data['cas_type']}[/]") - for key, value in data["investor_info"].items(): + for key, value in (data.get("investor_info") or {}).items(): summary_table.add_row( Padding(f"{key.capitalize()} :", spacing), re.sub(r"[^\S\r\n]+", " ", value) ) diff --git a/casparser/parsers/__init__.py b/casparser/parsers/__init__.py index efdbf3e..a56ad4c 100644 --- a/casparser/parsers/__init__.py +++ b/casparser/parsers/__init__.py @@ -1,72 +1,156 @@ +"""Top-level dispatcher for `casparser.read_cas_pdf`. + +v1.0 reorganisation: pdfminer.six and PyMuPDF are gone. Everything +runs on pypdfium2 with parsers that consume structured page-object +data directly (no text-rendering / regex round-trip for NSDL+CDSL, +column-aware layout reading for CAMS+KFin). + +The four issuer-specific parsers live alongside this file: + + cams_detailed.py → CAMS / KFin DETAILED statements + cams_summary.py → CAMS / KFin SUMMARY statements + nsdl.py → NSDL Consolidated Account Statement + cdsl.py → CDSL Consolidated Account Statement + +`read_cas_pdf` sniffs the issuer + statement variant from the PDF's +first page, dispatches to the right parser, optionally sorts +transactions chronologically, and returns either `CASData` (CAMS/KFin) +or `NSDLCASData` (NSDL/CDSL). 
+""" + +from __future__ import annotations + import io +import warnings from typing import Union -from casparser.process import process_cas_text -from casparser.types import CASData, NSDLCASData, ProcessedCASData +from casparser.enums import CASFileType, FileType +from casparser.exceptions import CASParseError +from casparser.types import CASData, NSDLCASData +from .detect import _open_document, detect_cas_type, detect_file_type from .utils import cas2csv, cas2json +def _sort_transactions(data: CASData) -> CASData: + """For each scheme, sort transactions by date and re-compute the + running balance from the opening balance.""" + for folio in data.folios: + for idx, scheme in enumerate(folio.schemes): + dates = [x.date for x in scheme.transactions] + if dates == sorted(dates): + continue + sorted_txns = [] + balance = scheme.open + for txn in sorted(scheme.transactions, key=lambda x: x.date): + balance += txn.units or 0 + txn.balance = balance + sorted_txns.append(txn) + scheme.transactions = sorted_txns + folio.schemes[idx] = scheme + return data + + def read_cas_pdf( filename: Union[str, io.IOBase], - password, - output="dict", - sort_transactions=True, - force_pdfminer=False, + password: str, + output: str = "dict", + sort_transactions: bool = True, + force_pdfminer: bool = False, ): - """ - Parse CAS pdf and returns line data. + """Parse a Consolidated Account Statement PDF. - :param filename: CAS pdf file (CAMS or Kfintech) - :param password: CAS pdf password - :param output: Output format (json,dict) [default: dict] - :param sort_transactions: Sort transactions by date and re-compute balances. - :param force_pdfminer: Force pdfminer parser even if mupdf is detected + :param filename: path to the CAS PDF (or an open file-like object). + :param password: PDF password (most CAS PDFs are encrypted with the + investor's PAN). 
+ :param output: `"dict"` (default) returns the typed model directly, + `"json"` returns its JSON serialisation, `"csv"` + returns a CSV string of transactions or holdings. + :param sort_transactions: For CAMS / KFin DETAILED statements, sort + each scheme's transactions by date and + re-compute the running balance. Default + `True`. + :param force_pdfminer: **Deprecated.** v1.0 dropped pdfminer in + favour of pypdfium2. Setting this to True + emits a `DeprecationWarning` and is otherwise + ignored. + :return: `CASData` for CAMS/KFin issuers, `NSDLCASData` for + NSDL/CDSL issuers, or a serialised form of either when + `output` is `"json"` / `"csv"`. """ if force_pdfminer: - from .pdfminer import cas_pdf_to_text - else: - try: - from .mupdf import cas_pdf_to_text - except (ImportError, ModuleNotFoundError): - from .pdfminer import cas_pdf_to_text - - partial_cas_data = cas_pdf_to_text(filename, password) - processed_data = process_cas_text( - "\u2029".join(partial_cas_data.lines), partial_cas_data.file_type - ) - if isinstance(processed_data, ProcessedCASData): - if sort_transactions: - for folio in processed_data.folios: - for idx, scheme in enumerate(folio.schemes): - dates = [x.date for x in scheme.transactions] - sorted_dates = list(sorted(dates)) - if dates != sorted_dates: - sorted_transactions = [] - balance = scheme.open - for transaction in sorted(scheme.transactions, key=lambda x: x.date): - balance += transaction.units or 0 - transaction.balance = balance - sorted_transactions.append(transaction) - scheme.transactions = sorted_transactions - folio.schemes[idx] = scheme - - final_data = CASData( - statement_period=processed_data.statement_period, - folios=processed_data.folios, - investor_info=partial_cas_data.investor_info, - cas_type=processed_data.cas_type, - file_type=partial_cas_data.file_type, + warnings.warn( + "force_pdfminer is deprecated in casparser 1.0 — pdfminer " + "is no longer a supported backend.", + DeprecationWarning, + stacklevel=2, ) - 
else: - final_data = NSDLCASData( - statement_period=processed_data.statement_period, - accounts=processed_data.accounts, - investor_info=partial_cas_data.investor_info, - file_type=partial_cas_data.file_type, + + # Open the PDF exactly once and thread it through the detect / + # parser / investor extractor calls — every pypdfium2 open re-runs + # the password decrypt + content-stream parse, so the savings on + # multi-page detailed statements are significant. + doc = _open_document(filename, password) + + file_type = detect_file_type(filename, password, _doc=doc) + if file_type == FileType.UNKNOWN: + raise CASParseError( + "Could not identify the CAS issuer. Supported issuers are " + "CAMS, KFintech, NSDL, and CDSL." ) + + if file_type in (FileType.CAMS, FileType.KFINTECH): + cas_type = detect_cas_type(filename, password, _doc=doc) + if cas_type == CASFileType.DETAILED: + from . import cams_detailed + + data: Union[CASData, NSDLCASData] = cams_detailed.parse( + filename, + password, + file_type=file_type, + _doc=doc, + ) + elif cas_type == CASFileType.SUMMARY: + from . import cams_summary + + data = cams_summary.parse( + filename, + password, + file_type=file_type, + _doc=doc, + ) + else: + raise CASParseError( + "Could not identify whether this is a DETAILED or " "SUMMARY CAMS / KFin statement." + ) + if sort_transactions and isinstance(data, CASData): + data = _sort_transactions(data) + elif file_type == FileType.NSDL: + from . import nsdl + + data = nsdl.parse_nsdl( + filename, + password, + file_type=FileType.NSDL, + _doc=doc, + ) + elif file_type == FileType.CDSL: + from . 
import cdsl + + data = cdsl.parse_cdsl( + filename, + password, + file_type=FileType.CDSL, + _doc=doc, + ) + else: # pragma: no cover — handled above + raise CASParseError(f"Unsupported file type: {file_type}") + if output == "dict": - return final_data - elif output == "csv": - return cas2csv(final_data) - return cas2json(final_data) + return data + if output == "csv": + return cas2csv(data) + return cas2json(data) + + +__all__ = ["read_cas_pdf"] diff --git a/casparser/parsers/_classify.py b/casparser/parsers/_classify.py new file mode 100644 index 0000000..972e11c --- /dev/null +++ b/casparser/parsers/_classify.py @@ -0,0 +1,114 @@ +"""Classification helpers shared across the CAMS / KFin parsers. + +Two pure utilities: + +- `get_transaction_type` maps a transaction description + signed units + count to a `TransactionType` enum, also extracting the dividend rate + for IDCW / dividend lines. +- `get_parsed_scheme_name` normalises a raw scheme name (drops + `(formerly ...)`, `(erstwhile ...)`, `(Demat ...)` trailers, collapses + whitespace). + +These are pulled out of the old `casparser.process.cas_detailed` module +because the pypdfium2 DETAILED parser still needs them but the rest of +that module's text-rendering machinery is now gone. +""" + +from __future__ import annotations + +import re +from decimal import Decimal +from typing import Optional, Tuple + +from casparser.enums import TransactionType + +# Matches an IDCW / dividend transaction description. Captures the +# "reinvest" hint (if present) and the per-unit rupee value. +DIVIDEND_RE = re.compile( + r"(?:div\.|dividend|idcw).+?(reinvest)*.*?@\s*Rs\.\s*([\d\.]+)(?:\s+per\s+unit)?", + re.I | re.DOTALL, +) + + +def get_transaction_type( + description: str, units: Optional[Decimal] +) -> Tuple[TransactionType, Optional[Decimal]]: + """Classify a transaction by its description + units sign. + + Returns `(transaction_type, dividend_rate_or_None)`. 
The dividend + rate is only set for IDCW / dividend transactions. + """ + dividend_rate: Optional[Decimal] = None + description = description.lower() + if div_match := DIVIDEND_RE.search(description): + reinvest_flag, dividend_str = div_match.groups() + dividend_rate = Decimal(dividend_str) + txn_type = ( + TransactionType.DIVIDEND_REINVEST if reinvest_flag else TransactionType.DIVIDEND_PAYOUT + ) + elif units is None: + if "stt" in description: + txn_type = TransactionType.STT_TAX + elif "stamp" in description: + txn_type = TransactionType.STAMP_DUTY_TAX + elif "tds" in description: + txn_type = TransactionType.TDS_TAX + else: + txn_type = TransactionType.MISC + elif units > 0: + if "switch" in description: + txn_type = ( + TransactionType.SWITCH_IN_MERGER + if "merger" in description + else TransactionType.SWITCH_IN + ) + elif "segregat" in description: + txn_type = TransactionType.SEGREGATION + elif ( + "sip" in description + or "systematic" in description + or re.search(r"instal+ment", description, re.I) + or re.search(r"sys.+?invest", description, re.I | re.DOTALL) + ): + txn_type = TransactionType.PURCHASE_SIP + else: + txn_type = TransactionType.PURCHASE + elif units < 0: + if re.search( + r"reversal|rejection|dishonoured|mismatch|insufficient\s+balance", + description, + re.I, + ): + txn_type = TransactionType.REVERSAL + elif "switch" in description: + txn_type = ( + TransactionType.SWITCH_OUT_MERGER + if "merger" in description + else TransactionType.SWITCH_OUT + ) + else: + txn_type = TransactionType.REDEMPTION + else: + txn_type = TransactionType.UNKNOWN + + return txn_type, dividend_rate + + +def get_parsed_scheme_name(scheme: str) -> str: + """Strip `(formerly ...)`, `(erstwhile ...)`, `(Demat ...)`, + `(Non-Demat ...)` trailers; collapse whitespace; trim trailing + punctuation.""" + scheme = re.sub( + r"\((formerly|erstwhile).+?\)", + "", + scheme, + flags=re.I | re.DOTALL, + ).strip() + scheme = re.sub( + r"\((Demat|Non-Demat).*", + "", + scheme, + 
flags=re.I | re.DOTALL, + ).strip() + scheme = re.sub(r"\s+", " ", scheme).strip() + return re.sub(r"[^a-zA-Z0-9_)]+$", "", scheme).strip() diff --git a/casparser/parsers/_investor.py b/casparser/parsers/_investor.py new file mode 100644 index 0000000..e8ec246 --- /dev/null +++ b/casparser/parsers/_investor.py @@ -0,0 +1,192 @@ +"""Investor-info extractors for the four supported CAS issuers. + +Both extractors filter the source PDF's atoms to a top-left column on +a known page (page 1 for CAMS/KFin, page 2 for NSDL/CDSL), then walk +top-down picking out labelled fields. We use page-object atoms rather +than baseline-clustered lines so the right-column disclaimer text +that shares y-baselines with the investor block doesn't contaminate +the result. + +The CAMS/KFin block carries the full quartet (name, email, address, +mobile). The NSDL/CDSL block carries only the name and address — +those CAS variants don't print the investor's email or mobile on the +statement, so those fields come back as empty strings. +""" + +from __future__ import annotations + +import re +from typing import TYPE_CHECKING, List, Optional + +from casparser.exceptions import CASParseError +from casparser.types import InvestorInfo + +from .pageobj import Atom, extract_atoms + +if TYPE_CHECKING: # pragma: no cover + import pypdfium2 as pdfium + + +# Top-left column cutoffs. Everything to the right is the disclaimer +# paragraph (CAMS/KFin) or the cover-page banner (NSDL/CDSL). 200 is +# the conservative right edge that fits all observed templates. 
+_LEFT_COLUMN_X = 200.0 + + +_EMAIL_RE = re.compile(r"Email\s*Id\s*:\s*(\S+@\S+)", re.I) +_MOBILE_RE = re.compile(r"Mobile\s*:\s*([+\d]+)", re.I) +_PHONE_RE = re.compile(r"^\s*Phone\s+Off\s*:", re.I) +_PINCODE_RE = re.compile(r"^\s*(?:Pin\s*code|PINCODE)\s*:\s*\d+", re.I) +_ID_MARKER_RE = re.compile(r"^\s*(?:CAS|NSDL)\s*ID\s*:", re.I) + + +def _left_column_atoms(atoms: List[Atom]) -> List[Atom]: + """Filter to atoms in the top-left column, sorted top-down.""" + filtered = [a for a in atoms if a.x_left < _LEFT_COLUMN_X and a.text.strip()] + filtered.sort(key=lambda a: -a.y_top) + return filtered + + +def extract_cams_kfin_investor( + pdf_path, + password, + *, + _doc: "Optional[pdfium.PdfDocument]" = None, + _atoms: Optional[List[List[Atom]]] = None, +) -> InvestorInfo: + """Read the investor block from the top-left of page 1. + + Layout across CAMS and KFin templates: + + Email Id: + +
+ ... +
+ [Phone Off: ...] ← only on some KFin templates + Mobile: + + We anchor on `Email Id:` (always present on CAMS/KFin), then + everything until `Mobile:` (exclusive) is name + address. The + name is the first non-label line; the rest is address. Stray + `Phone Off:` lines are dropped from the address. + + Every CAS statement carries this block by mandate. If we can't + find it we raise `CASParseError` — a CAS without identifiable + investor is malformed, not a "missing field" case. + + `_doc` / `_atoms`: dispatcher-provided overrides to avoid a + second pypdfium2 open + page-object walk when the caller has + already extracted atoms for the holdings parser. + """ + pages = ( + _atoms + if _atoms is not None + else extract_atoms( + pdf_path, + password, + _doc=_doc, + ) + ) + block = _left_column_atoms(pages[0]) if pages else [] + + email = "" + mobile = "" + name = "" + address_lines: List[str] = [] + seen_email = False + + for atom in block: + text = atom.text.strip() + if m := _EMAIL_RE.match(text): + email = m.group(1).strip() + seen_email = True + continue + if m := _MOBILE_RE.match(text): + mobile = m.group(1).strip() + # Mobile is the last field of the investor block — stop here + # so the transaction table that follows isn't picked up. + break + if not seen_email: + continue + if _PHONE_RE.match(text): + continue + if not name: + name = text + else: + address_lines.append(text) + + if not name: + raise CASParseError( + "Could not extract investor info from CAMS/KFin CAS PDF. " + "Expected an `Email Id:` line followed by name + address + " + "`Mobile:` in the top-left column of page 1." + ) + return InvestorInfo( + name=name, + email=email, + address="\n".join(address_lines), + mobile=mobile, + ) + + +def extract_nsdl_cdsl_investor( + pdf_path, + password, + *, + _doc: "Optional[pdfium.PdfDocument]" = None, + _atoms: Optional[List[List[Atom]]] = None, +) -> InvestorInfo: + """NSDL / CDSL print the investor block on page 2 (after the cover + page). 
The block is delimited by a `CAS ID:` (CDSL) or `NSDL ID:` + (NSDL) marker on top and a `PINCODE:` line on the bottom. Name is + the first line after the marker; everything between is address. + Email and mobile aren't printed in these CAS variants, so they + come back as empty strings. + + Raises `CASParseError` if no investor block is found — a CAS + without identifiable investor is malformed. + + `_doc` / `_atoms`: dispatcher-provided overrides; see + `extract_cams_kfin_investor`. + """ + pages = ( + _atoms + if _atoms is not None + else extract_atoms( + pdf_path, + password, + _doc=_doc, + ) + ) + block = _left_column_atoms(pages[1]) if len(pages) >= 2 else [] + + name = "" + address_lines: List[str] = [] + seen_marker = False + for atom in block: + text = atom.text.strip() + if _ID_MARKER_RE.match(text): + seen_marker = True + continue + if not seen_marker: + continue + if not name: + name = text + continue + address_lines.append(text) + if _PINCODE_RE.match(text): + break + + if not name: + raise CASParseError( + "Could not extract investor info from NSDL/CDSL CAS PDF. " + "Expected a `CAS ID:` / `NSDL ID:` marker followed by name + " + "address in the top-left column of page 2." + ) + return InvestorInfo( + name=name, + email="", + address="\n".join(address_lines), + mobile="", + ) diff --git a/casparser/parsers/_isin.py b/casparser/parsers/_isin.py new file mode 100644 index 0000000..edcc6db --- /dev/null +++ b/casparser/parsers/_isin.py @@ -0,0 +1,40 @@ +from typing import Optional, Tuple + +from casparser_isin import MFISINDb + + +def isin_search( + scheme_name: str, + rta: str, + rta_code: str, + isin: Optional[str] = None, +) -> Tuple[Optional[str], Optional[str], Optional[str]]: + """Look up `(ISIN, AMFI, type)` for a CAS scheme. + + The primary path matches on `(scheme_name, rta, rta_code)`. When + that returns no hit but the caller passed an `isin` (e.g., parsed + inline from the scheme header), fall back to a direct ISIN lookup. 
+ The fallback bypasses RTA mis-detection that can happen when the + `Registrar:` value gets mangled by multi-line rendering on + pypdfium2's char extraction. + + :param scheme_name: Normalised scheme name from the CAS. + :param rta: Registrar (`CAMS` / `KFINTECH` / `FTAMIL` …). + :param rta_code: Scheme's per-RTA code. + :param isin: Optional ISIN hint pulled from the scheme header. + """ + with MFISINDb() as db: + try: + scheme_data = db.isin_lookup(scheme_name, rta, rta_code, isin=isin) + return scheme_data.isin, scheme_data.amfi_code, scheme_data.type + except ValueError: + pass + if isin: + try: + rows = db.direct_isin_lookup(isin) + if rows: + row = rows[0] + return row["isin"], row["amfi_code"], row["type"] + except (ValueError, KeyError, TypeError): + pass + return None, None, None diff --git a/casparser/parsers/cams_detailed.py b/casparser/parsers/cams_detailed.py new file mode 100644 index 0000000..b79d487 --- /dev/null +++ b/casparser/parsers/cams_detailed.py @@ -0,0 +1,546 @@ +"""POC: CAMS DETAILED CAS parser using column-based row reading. + +Produces the same `List[Folio]` shape as the production parser so output can +be diffed directly. ISIN/AMFI enrichment and investor info are deferred — +those passes are orthogonal to the column-reader question. 
+ +Scope of this POC (handles): +- One CAS, possibly multi-page +- One AMC, one folio header per folio, one scheme header per scheme +- Transaction table with 6 standard columns (Date / Transaction / Amount / + Units / Price / Unit Balance) +- "Opening Unit Balance", "Closing Unit Balance", "NAV on", "Valuation on" + labeled rows + +Deferred (TODO markers below): +- Multi-line transaction descriptions (we keep first line only) +- ISIN / AMFI lookup +- Nominees +- Segregated portfolios +- Total Cost Value parsing +- Investor info / statement period +""" + +from __future__ import annotations + +import re +from dataclasses import dataclass +from decimal import Decimal +from typing import List, Optional + +from dateutil import parser as dateparse + +from casparser.enums import CASFileType, FileType +from casparser.types import ( + CASData, + Folio, + Scheme, + SchemeValuation, + StatementPeriod, + TransactionData, +) + +from ._classify import get_parsed_scheme_name, get_transaction_type +from ._investor import extract_cams_kfin_investor +from ._isin import isin_search +from .extract import Char, Line, extract_pages + +# ----------------------------------------------------------------------------- +# Column anchors +# ----------------------------------------------------------------------------- + +# CAMS DETAILED transaction table. The header is two physical rows in the +# PDF: "Date Transaction Amount Units Price Unit" on top, "(INR) (INR) +# Balance" below. We require ≥4 of these labels on one line. +TXN_HEADER_LABELS = {"Date", "Transaction", "Amount", "Units", "Price", "Unit", "Balance", "NAV"} +TXN_MIN_HITS = 4 + +# All numeric columns are right-aligned; Date and Transaction are left-aligned. 
+ALIGN = { + "Date": "left", + "Transaction": "left", + "Amount": "right", + "Units": "right", + "Price": "right", + "Unit Balance": "right", + "NAV": "right", +} + + +@dataclass +class Column: + label: str + x_lo: float # range covering header label width + x_hi: float + alignment: str # 'left' | 'right' + + @property + def x_anchor(self) -> float: + """For right-aligned columns, x_hi is the snap target; for left, + x_lo is.""" + return self.x_hi if self.alignment == "right" else self.x_lo + + +def _words_on_line(line: Line, min_gap: float = 1.5) -> List[tuple[str, float, float]]: + """Return [(text, x0, x1)] words on a line, splitting on x-gap > min_gap.""" + cs = sorted(line.chars, key=lambda c: c.x0) + words = [] + cur, cur_x0, cur_x1 = "", None, None + for c in cs: + if cur and (c.x0 - cur_x1) > min_gap: + words.append((cur, cur_x0, cur_x1)) + cur = "" + if not cur: + cur_x0 = c.x0 + cur += c.text + cur_x1 = c.x1 + if cur: + words.append((cur, cur_x0, cur_x1)) + return words + + +HEADER_WINDOW_Y = 15.0 # vertical span (pts) that constitutes one logical +# header block. CAMS uses 2 baselines spanning ~10pt; KFin uses 4 baselines +# spanning ~11pt (Amount/Price at top, Unit, Date/Transaction/Units, (INR) +# /(INR)/Balance at bottom). + + +def detect_txn_columns(lines: List[Line], start_idx: int) -> Optional[tuple[int, List[Column]]]: + """Find the next transaction-table header at or after start_idx. + + A header is a y-window of consecutive lines (top-down) spanning ≤ HEADER_ + WINDOW_Y points and collectively containing ≥ TXN_MIN_HITS distinct + column labels. We collect labels from the whole window so wraps like + "Unit"/"Balance" stacked over 2 baselines or KFin's 4-baseline split + behave the same. + + Returns (index_of_last_line_in_header, ordered columns). Transaction + parsing should start at index + 1. 
+ """ + for i in range(start_idx, len(lines)): + window = [lines[i]] + for j in range(i + 1, len(lines)): + if lines[i].baseline - lines[j].baseline > HEADER_WINDOW_Y: + break + window.append(lines[j]) + + all_words: List[tuple[str, float, float]] = [] + for line in window: + all_words.extend(_words_on_line(line)) + labels = {w[0] for w in all_words if w[0] in TXN_HEADER_LABELS} + if len(labels) < TXN_MIN_HITS: + continue + last_idx = i + len(window) - 1 + return last_idx, _build_columns(all_words) + return None + + +def _build_columns(words: List[tuple[str, float, float]]) -> List[Column]: + """Map header words to Columns. Merge "Unit"+"Balance" into one column.""" + cols: List[Column] = [] + used = set() + for text, x0, x1 in words: + if text == "Unit" and ("Balance" in (w[0] for w in words)): + # Find "Balance" with overlapping x-range + for w_text, w_x0, w_x1 in words: + if w_text == "Balance" and abs((w_x0 + w_x1) / 2 - (x0 + x1) / 2) < 30: + cols.append(Column("Unit Balance", min(x0, w_x0), max(x1, w_x1), "right")) + used.add(id((text, x0, x1))) + used.add(id((w_text, w_x0, w_x1))) + break + elif text in ALIGN and text not in ("Unit", "Balance"): + cols.append(Column(text, x0, x1, ALIGN[text])) + cols.sort(key=lambda c: c.x_lo) + return cols + + +NUMERIC_ZONE_WIDTH = 55.0 # pts; right-aligned numeric values sit within +# this width to the left of the column's x_hi. Wide enough for any common +# Indian-format amount (e.g. "1,23,45,678.90") but narrow enough to exclude +# wrapped description text that bleeds in from the left. + + +def _column_ranges(columns: List[Column]) -> List[tuple[Column, float, float]]: + """Compute x-range per column. Right-aligned numeric columns get a + fixed-width zone ending at x_hi. Left-aligned columns extend from x_lo to + the start of the next column's zone. 
+ + The fundamental asymmetry: description text (Transaction column) is wide + and naturally extends into the Amount column's x-space, while the actual + amount value is in a narrow zone right-aligned to x_hi. Hence numeric + columns are bounded by content-width, not by midpoint to neighbors. + """ + sorted_cols = sorted(columns, key=lambda c: (c.x_lo + c.x_hi) / 2) + ranges: List[tuple[Column, float, float]] = [] + for i, col in enumerate(sorted_cols): + if col.alignment == "right": + lo = col.x_hi - NUMERIC_ZONE_WIDTH + hi = col.x_hi + 3.0 + else: + lo = col.x_lo - 3.0 + if i + 1 < len(sorted_cols): + nxt = sorted_cols[i + 1] + hi = nxt.x_hi - NUMERIC_ZONE_WIDTH if nxt.alignment == "right" else nxt.x_lo - 3.0 + else: + hi = float("inf") + ranges.append((col, lo, hi)) + return ranges + + +def assign_cells(line: Line, columns: List[Column]) -> dict[str, str]: + """Bucket each char into a column by x-midpoint, then render each cell + text in left-to-right order. Overlay duplicates are already filtered + upstream by ``extract.extract_pages`` at the atom level.""" + ranges = _column_ranges(columns) + cells: dict[str, list[Char]] = {c.label: [] for c in columns} + for ch in line.chars: + x_mid = (ch.x0 + ch.x1) / 2 + for col, lo, hi in ranges: + if lo <= x_mid < hi: + cells[col.label].append(ch) + break + out = {} + for label, chars in cells.items(): + if not chars: + continue + chars.sort(key=lambda c: c.x0) + heights = sorted(c.h for c in chars) + h_med = heights[len(heights) // 2] + gap = max(1.5, 0.6 * h_med) + parts, prev_x1 = [], None + for c in chars: + if prev_x1 is not None and (c.x0 - prev_x1) > gap: + parts.append(" ") + parts.append(c.text) + prev_x1 = c.x1 + out[label] = "".join(parts).strip() + return out + + +# ----------------------------------------------------------------------------- +# Label parsers (folio/scheme/labeled rows) +# ----------------------------------------------------------------------------- + +FOLIO_LINE_RE = re.compile( + # Folio 
format: with optional " / " sub-account + # suffix. Spaces around the slash are common in the source PDF. + # Each of PAN / KYC / PAN-KYC is optional but when present + # appears in this order on the same line. `.*?` lives *inside* + # the optional group so a non-greedy match doesn't skip past it + # and leave the capture empty. + r"Folio\s+No\s*:\s*(\d+(?:\s*/\s*\d+)?)" + r"(?:.*?PAN\s*:\s*([A-Z]{5}\d{4}[A-Z]))?" + r"(?:.*?KYC\s*:\s*(OK|NOT OK))?" + r"(?:.*?PAN\s*:\s*(OK|NOT OK))?", + re.I, +) +SCHEME_HEAD_RE = re.compile( + # `- Registrar:`. The `` chunk may carry + # inline `(Advisor: )` and `- ISIN: ` segments in either + # order — newer KFin templates put `(Advisor:...) - ISIN:...`, + # newer CAMS templates put `- ISIN: ...(Advisor: ...)`. We capture + # everything between code and Registrar as `name` and then strip + # the advisor / ISIN fragments out in a second pass. + r"^(?P[\w\s]+?)-\s*(?P.+?)" r"\s+Registrar\s*:\s*(?P\S+)", + re.I, +) +INLINE_ISIN_RE = re.compile(r"[-\s]*ISIN\s*:\s*([A-Z0-9]+)", re.I) +INLINE_ADVISOR_RE = re.compile(r"[-\s]*\(\s*Advisor\s*:\s*([^)]+?)\)", re.I) +SCHEME_HEAD_RTA_RE = re.compile(r"Registrar\s*:\s*(\S+)", re.I) +OPEN_BAL_RE = re.compile(r"Opening\s+Unit\s+Balance\s*:?\s*([\d,.]+)", re.I) +CLOSE_BAL_RE = re.compile(r"Closing\s+Unit\s+Balance\s*:?\s*([\d,.]+)", re.I) +NAV_RE = re.compile(r"NAV\s+on\s+(\d{2}-[A-Za-z]{3}-\d{4})\s*:\s*INR\s*([\d,.]+)", re.I) +VALUATION_RE = re.compile( + r"(?:Valuation|Market\s+Value)\s+on\s+(\d{2}-[A-Za-z]{3}-\d{4})\s*:\s*INR\s*([\d,.]+)", + re.I, +) +COST_VALUE_RE = re.compile(r"Total\s+Cost\s+Value\s*:?\s*([\d,.]+)", re.I) +# Nominee block on the folio header. Three optional name slots; an +# empty slot ("Nominee 2: ") means no nominee at that position. 
+NOMINEE_RE = re.compile( + r"Nominee\s+1\s*:\s*(?P[^:]*?)\s*(?:Nominee\s+2\s*:\s*(?P[^:]*?)\s*" + r"(?:Nominee\s+3\s*:\s*(?P.*?))?)?$", + re.I, +) +STMT_PERIOD_RE = re.compile( + r"(\d{2}-[A-Za-z]{3}-\d{4})\s+To\s+(\d{2}-[A-Za-z]{3}-\d{4})", + re.I, +) +# AMC header line. Most issuers end in "Mutual Fund" or "MF"; a few +# newer entrants use " Fund House" instead. We anchor on the +# trailing suffix so disclaimer paragraphs that happen to mention an +# AMC name mid-sentence don't get classified as section headers. +AMC_RE = re.compile( + r"^(.+?\s+(?:MF|Mutual\s*Fund|Fund\s*House))$", + re.I, +) +# Extract leading date pattern. Accept "25-Oct-2021", "25 Oct 2021", +# "25Oct2021", etc. Dashes sometimes sit on a different baseline. The +# regex anchors only at start so it survives stray trailing chars +# (e.g. KFin's instalment number "1" leaking from the description column). +DATE_CELL_RE = re.compile(r"^\s*(\d{1,2}[-\s]*[A-Za-z]{3}[-\s]*\d{4})") + + +def _decimal(s: str) -> Optional[Decimal]: + if s is None: + return None + s = s.strip() + if not s: + return None + neg = s.startswith("(") or s.startswith("-") + s = s.lstrip("(").rstrip(")").lstrip("-").replace(",", "") + try: + d = Decimal(s) + return -d if neg else d + except Exception: + return None + + +# ----------------------------------------------------------------------------- +# Top-level parse +# ----------------------------------------------------------------------------- + + +def parse( + pdf_path: str, + password: str, + file_type: FileType = FileType.UNKNOWN, + *, + _doc=None, +) -> CASData: + pages = extract_pages(pdf_path, password, _doc=_doc) + + statement_period: Optional[StatementPeriod] = None + folios: dict[str, Folio] = {} + current_amc: Optional[str] = None + current_folio: Optional[Folio] = None + current_scheme: Optional[Scheme] = None + last_columns: List[Column] = [] # inherited if current page lacks header + + for page in pages: + header_pos = detect_txn_columns(page.lines, 0) + if 
def parse(
    pdf_path: str,
    password: str,
    file_type: FileType = FileType.UNKNOWN,
    *,
    _doc=None,
) -> CASData:
    """Parse a CAMS / KFin DETAILED statement into a ``CASData`` object.

    The extracted pages are walked line by line as a small state machine
    that tracks the current AMC, folio and scheme. Each line is tested,
    in order, as: statement period, AMC header, folio header, scheme
    header, labeled balance / valuation rows and finally a transaction
    row bucketed into the detected table columns.

    Args:
        pdf_path: Path of the PDF; forwarded to ``extract_pages`` and
            ``extract_cams_kfin_investor``.
        password: PDF password; forwarded to the same two helpers.
        file_type: Stored unchanged on the returned ``CASData``.
        _doc: Forwarded as ``_doc=`` to both helpers.
            NOTE(review): presumably an already-open document handle so
            the PDF is opened only once — confirm against the dispatcher.

    Returns:
        A ``CASData`` with ``cas_type=CASFileType.DETAILED``, one
        ``Folio`` per distinct folio number (in first-seen order) and
        the investor info returned by ``extract_cams_kfin_investor``.
    """
    pages = extract_pages(pdf_path, password, _doc=_doc)

    statement_period: Optional[StatementPeriod] = None
    folios: dict[str, Folio] = {}
    current_amc: Optional[str] = None
    current_folio: Optional[Folio] = None
    current_scheme: Optional[Scheme] = None
    last_columns: List[Column] = []  # inherited if current page lacks header

    for page in pages:
        header_pos = detect_txn_columns(page.lines, 0)
        if header_pos:
            header_idx, columns = header_pos
            last_columns = columns
        else:
            # Continuation page — no header. Inherit from previous.
            # header_idx=-1 means transactions can start from line 0.
            header_idx = -1
            columns = last_columns

        for i, line in enumerate(page.lines):
            text = line.text

            # --- statement period ---
            # Taken from the first line on which STMT_PERIOD_RE matches;
            # once set it is never overwritten. No `continue` here, so the
            # same line is still tested by the checks below.
            if statement_period is None:
                if m := STMT_PERIOD_RE.search(text):
                    statement_period = StatementPeriod(from_=m.group(1), to=m.group(2))

            # --- AMC ---
            # The AMC applies to every folio header seen after this line.
            if m := AMC_RE.match(text.strip()):
                current_amc = m.group(0)
                continue

            # --- Folio header ---
            if "Folio No" in text and (m := FOLIO_LINE_RE.search(text)):
                # Preserve internal " / " for compatibility with production
                # parser output format (it keeps "12124203 / 63" style).
                folio_no = m.group(1).strip()
                # A folio number seen before reuses the existing Folio, so
                # schemes listed under a repeated header accumulate on it.
                if folio_no not in folios:
                    folios[folio_no] = Folio(
                        folio=folio_no,
                        amc=current_amc or "UNKNOWN",
                        PAN=m.group(2) or "",
                        KYC=m.group(3) or None,
                        PANKYC=m.group(4) or None,
                        schemes=[],
                    )
                current_folio = folios[folio_no]
                # A new folio header ends the previous scheme's rows.
                current_scheme = None
                continue

            # --- Scheme header ---
            # The scheme block can span up to 3 baselines depending on AMC
            # and statement template: some templates keep the whole
            # "code-name ... Registrar : CAMS" header on one baseline,
            # while others place the "Registrar :" label, the trailing
            # "(Advisor: ...)" fragment or the bare RTA value (e.g.
            # KFINTECH) on a neighbouring baseline.
            # NOTE(review): the original side-by-side layout sketch was
            # lost to whitespace collapsing — verify against sample
            # statements before relying on this description.
            #
            # We stitch up to 2 lines above and 2 lines below the
            # current line (within Y_BAND pts y-distance) if those
            # adjacent lines contain Registrar / Advisor / ISIN markers
            # or look like the standalone RTA value (CAMS / KFINTECH).
            Y_BAND = 5.0
            if current_folio is not None and "-" in text:
                parts_above = []
                parts_below = []
                base_y = page.lines[i].baseline
                for offset in (1, 2):
                    j = i - offset
                    if j < 0:
                        break
                    # Stop at the first line too far above; farther lines
                    # are not examined.
                    if page.lines[j].baseline - base_y > Y_BAND:
                        break
                    t_above = page.lines[j].text.strip()
                    if re.fullmatch(r"Registrar\s*:?", t_above, re.I) or re.search(
                        r"Registrar\s*:|Advisor\s*:|ISIN\s*:", t_above, re.I
                    ):
                        # insert(0, ...) keeps the stitched lines in
                        # top-to-bottom order.
                        parts_above.insert(0, t_above)
                # When the scheme line ENDS with an incomplete trailing
                # marker (e.g. "(Advisor: Registrar :"), take the next
                # baseline below as the value continuation regardless of
                # its content — the value tokens (ARN-XYZ, INAxxxxx,
                # CAMS, KFINTECH) don't all match a fixed pattern.
                trailing_incomplete = bool(
                    re.search(
                        r"(Registrar\s*:|Advisor\s*:|ISIN\s*:|\(\s*Advisor\s*:)\s*$",
                        text.strip(),
                        re.I,
                    )
                )
                for offset in (1, 2):
                    j = i + offset
                    if j >= len(page.lines):
                        break
                    # Mirror of the upward scan: stop at the first line too
                    # far below.
                    if base_y - page.lines[j].baseline > Y_BAND:
                        break
                    t_below = page.lines[j].text.strip()
                    if (
                        re.fullmatch(r"(CAMS|KFINTECH|KFIN)\)?", t_below, re.I)
                        or re.search(r"Registrar\s*:|Advisor\s*:|ISIN\s*:", t_below, re.I)
                        or (offset == 1 and trailing_incomplete)
                    ):
                        parts_below.append(t_below)
                # Scheme line FIRST so SCHEME_HEAD_RE can anchor to `-`.
                # Then append annotations from any direction.
                scheme_text = " ".join([text.strip()] + parts_above + parts_below)
                # Trailing "Registrar :" with value already on the next
                # token after stitching → ensure value present.
                # This ignores Y_BAND: the first token of the next line is
                # taken whatever its vertical distance.
                if scheme_text.endswith("Registrar :") or scheme_text.endswith("Registrar:"):
                    if i + 1 < len(page.lines):
                        toks = page.lines[i + 1].text.split()
                        if toks:
                            scheme_text = scheme_text + " " + toks[0]
                if "Registrar" in scheme_text and (m := SCHEME_HEAD_RE.match(scheme_text)):
                    code = m.group("code").strip()
                    raw_name = m.group("name")
                    # Pull `(Advisor: …)` and `- ISIN: …` out of name
                    # (templates emit them in either order). Capture
                    # values first, then `re.sub` both fragments so we
                    # don't have to track shifted span offsets.
                    isin_m = INLINE_ISIN_RE.search(raw_name)
                    inline_isin = isin_m.group(1).strip() if isin_m else None
                    adv_m = INLINE_ADVISOR_RE.search(raw_name)
                    advisor = adv_m.group(1).strip() if adv_m else None
                    raw_name = INLINE_ISIN_RE.sub("", raw_name)
                    raw_name = INLINE_ADVISOR_RE.sub("", raw_name)
                    name = get_parsed_scheme_name(raw_name)
                    # An empty registrar capture falls back to "CAMS".
                    rta = (m.group("rta") or "").strip() or "CAMS"
                    isin, amfi, scheme_type = isin_search(
                        name,
                        rta,
                        code,
                        isin=inline_isin,
                    )
                    # Balances and valuation start as zero placeholders;
                    # the labeled rows below fill them in when present.
                    current_scheme = Scheme(
                        scheme=name,
                        advisor=advisor,
                        rta=rta,
                        rta_code=code,
                        isin=isin,
                        amfi=amfi,
                        type=scheme_type or "N/A",
                        open=Decimal(0),
                        close=Decimal(0),
                        close_calculated=Decimal(0),
                        valuation=SchemeValuation(
                            date=statement_period.to if statement_period else "1970-01-01",
                            nav=Decimal(0),
                            value=Decimal(0),
                        ),
                        transactions=[],
                    )
                    current_folio.schemes.append(current_scheme)
                    continue
                # No scheme header matched: fall through so the line is
                # still tested as a labeled row / transaction row.

            # Everything below attaches data to a scheme, so lines seen
            # before the first scheme header of a folio are ignored.
            if current_scheme is None:
                continue

            # --- Labeled rows ---
            if m := OPEN_BAL_RE.search(text):
                current_scheme.open = _decimal(m.group(1)) or Decimal(0)
                # The running balance restarts from the opening balance;
                # each transaction's units are added to it further down.
                current_scheme.close_calculated = current_scheme.open
                continue
            # The remaining labels do not `continue`: several of them may
            # be matched on the same line, and the line is still offered
            # to the transaction-row check afterwards.
            if m := CLOSE_BAL_RE.search(text):
                current_scheme.close = _decimal(m.group(1)) or Decimal(0)
            if m := NAV_RE.search(text):
                current_scheme.valuation.date = dateparse.parse(m.group(1)).date()
                current_scheme.valuation.nav = _decimal(m.group(2)) or Decimal(0)
            if m := VALUATION_RE.search(text):
                current_scheme.valuation.date = dateparse.parse(m.group(1)).date()
                current_scheme.valuation.value = _decimal(m.group(2)) or Decimal(0)
            if m := COST_VALUE_RE.search(text):
                current_scheme.valuation.cost = _decimal(m.group(1))
            if m := NOMINEE_RE.search(text):
                noms = [
                    (m.group("n1") or "").strip(),
                    (m.group("n2") or "").strip(),
                    (m.group("n3") or "").strip(),
                ]
                # Empty slots are dropped rather than kept as "".
                current_scheme.nominees = [n for n in noms if n]

            # --- Transaction row (only when we have columns AND we're past
            # the header block on this page) ---
            # NOTE(review): header_idx is assigned on both branches above
            # (-1 on continuation pages), so `is not None` looks always
            # true — confirm detect_txn_columns never yields a None index.
            if columns and header_idx is not None and i > header_idx:
                cells = assign_cells(line, columns)
                date_str = cells.get("Date", "").strip()
                desc = cells.get("Transaction", "").strip()
                m_date = DATE_CELL_RE.match(date_str)
                if not m_date:
                    continue
                if not desc:
                    continue  # row with date but no description: skip
                date_str = m_date.group(1)
                # Normalize: collapse runs of dashes/spaces from overlay
                # bleed-through, e.g. "15--Jan--2021" -> "15-Jan-2021".
                date_str = re.sub(r"[-\s]+", "-", date_str).strip("-")
                amt = _decimal(cells.get("Amount", ""))
                units = _decimal(cells.get("Units", ""))
                # The per-unit price column is labeled either "Price" or "NAV".
                nav = _decimal(cells.get("Price", "") or cells.get("NAV", ""))
                bal = _decimal(cells.get("Unit Balance", ""))
                # A row with no amount AND no units is not a real transaction
                # (usually a stray date in a footnote like "Effective from
                # 01-Apr-2019…"). Skip these.
                if amt is None and units is None:
                    continue
                # Some older CAMS / KFin templates omit the per-row Price
                # column for transactions but always carry Amount + Units.
                # Derive `nav = amount / units` so downstream capital-gains
                # FIFO calculations don't crash on `nav=None`.
                if nav is None and amt is not None and units is not None and units != 0:
                    nav = (amt / units).quantize(Decimal("0.0001"))
                txn_type, dividend_rate = get_transaction_type(desc, units)
                # Rows without units (amount only) leave the running
                # balance untouched but are still recorded below.
                if units is not None:
                    current_scheme.close_calculated += units
                current_scheme.transactions.append(
                    TransactionData(
                        date=dateparse.parse(date_str).date(),
                        description=desc,
                        amount=amt,
                        units=units,
                        nav=nav,
                        balance=bal,
                        type=txn_type.name,
                        dividend_rate=dividend_rate,
                    )
                )

    # NOTE(review): the fallback period is built from the "from" / "to"
    # keys while the matched period above uses `from_=` — confirm that
    # StatementPeriod accepts both spellings.
    return CASData(
        statement_period=statement_period or StatementPeriod(**{"from": "", "to": ""}),
        folios=list(folios.values()),
        investor_info=extract_cams_kfin_investor(pdf_path, password, _doc=_doc),
        cas_type=CASFileType.DETAILED,
        file_type=file_type,
    )