diff --git a/.github/workflows/pypi-publish.yml b/.github/workflows/pypi-publish.yml index 2fed051..5cfa82b 100644 --- a/.github/workflows/pypi-publish.yml +++ b/.github/workflows/pypi-publish.yml @@ -12,13 +12,13 @@ jobs: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v6 - name: Set up Python - uses: actions/setup-python@v5 + uses: actions/setup-python@v6 with: - python-version: '3.11' + python-version: '3.12' - name: Install uv - uses: astral-sh/setup-uv@v5 + uses: astral-sh/setup-uv@v8.1.0 - name: Build run: uv build - name: Publish diff --git a/.github/workflows/run-pytest.yml b/.github/workflows/run-pytest.yml index 8b294f0..68547c4 100644 --- a/.github/workflows/run-pytest.yml +++ b/.github/workflows/run-pytest.yml @@ -18,15 +18,15 @@ jobs: python-version: ['3.11', '3.12', '3.13'] steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v6 - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v5 + uses: actions/setup-python@v6 with: python-version: ${{ matrix.python-version }} - name: Install uv - uses: astral-sh/setup-uv@v5 + uses: astral-sh/setup-uv@v8.1.0 - name: Install dependencies - run: uv sync --all-extras --dev + run: uv sync --dev - name: Extract test files run: ./.github/scripts/extract_files.sh env: @@ -44,8 +44,10 @@ jobs: KFINTECH_CAS_FILE_NEW: ${{ secrets.KFINTECH_CAS_FILE_NEW }} KFINTECH_CAS_PASSWORD: ${{ secrets.KFINTECH_CAS_PASSWORD }} NSDL_CAS_FILE_1: ${{ secrets.NSDL_CAS_FILE_1 }} + CDSL_CAS_FILE_1: ${{ secrets.CDSL_CAS_FILE_1 }} + CDSL_CAS_PASSWORD: ${{ secrets.CDSL_CAS_PASSWORD }} - name: Upload coverage report to codecov - uses: codecov/codecov-action@v5 + uses: codecov/codecov-action@v6 with: files: ./coverage.xml token: ${{ secrets.CODECOV_TOKEN }} diff --git a/.gitignore b/.gitignore index 19b9779..359c00a 100644 --- a/.gitignore +++ b/.gitignore @@ -133,6 +133,7 @@ dmypy.json tests/files/** tests/files.tar tests/files.tar.bz2 +tests/samples/** .DS_Store 
casparser.code-workspace diff --git a/CHANGELOG.md b/CHANGELOG.md index c3a39c0..8fa82ed 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,7 +1,71 @@ # Changelog +## 1.0.0 + +Major release. The parsing backend was rewritten from scratch on +[pypdfium2](https://github.com/pypdfium2-team/pypdfium2) (Apache-2.0 / +BSD-3) and the four supported CAS issuers now each have a dedicated +parser tuned to their template family. + +### Breaking changes + +- **pdfminer.six and PyMuPDF backends removed.** `casparser.read_cas_pdf` + no longer dispatches between them. The `mupdf` / `fast` extras in + `pyproject.toml` are gone. The `--force-pdfminer` CLI flag and the + `force_pdfminer=` kwarg on `read_cas_pdf` are kept as no-ops; the + kwarg emits a `DeprecationWarning` and is otherwise ignored. +- **License simplified to pure MIT.** With the GPL/AGPL-licensed + PyMuPDF dependency gone, the `licenses/` directory of GPL/AGPL + copies has been removed. pypdfium2 is dual Apache-2.0 / BSD-3 and + doesn't impose any copyleft obligation on users of casparser. +- **Minimum Python is now 3.11.** 3.9 / 3.10 classifiers dropped from + `pyproject.toml`. +- **`CASData.investor_info` is now `Optional[InvestorInfo]`** (matches + the `NSDLCASData.investor_info` shape that already existed). It is + populated on every supported issuer, but consumers should still + guard against the `None` case for unfamiliar templates. +- **Internal `casparser.process` package removed.** The two helpers + downstream code still imports from it are now at + `casparser.parsers._classify` (`get_parsed_scheme_name`, + `get_transaction_type`) and `casparser.parsers._isin` (`isin_search`). + +### New + +- **First-class NSDL and CDSL parsers.** Drops the regex-on-text + approach the 0.8 NSDL/CDSL code used; the new parsers consume + structured `Block`/`Cell` records directly from `pypdfium2`. 
Several + bugs the v0.8 NSDL/CDSL code shipped with are no longer in scope + (misplaced-UCC-as-folio on NSDL MF Holdings, space-merged + folio+units cells on CDSL, the silently-dropped NSDL HDFC + subaccount on CDSL multi-account statements, `Optional[Decimal]` + comma-strip miss in the `MutualFund` validator). +- **CAMS / KFin 2026 templates supported** out of the box. The newer + CAMS SUMMARY template added an ISIN column the v0.8 regex didn't + match; v1.0 parses all rows. The newer KFin SUMMARY template emits + zero-balance schemes with single-space-separated trio cells that + the v0.8 regex required `\t\t` between; v1.0 picks them up too. +- **AMC-header detection extended** to include the `Fund House` + suffix. v0.8's regex only matched `Mutual Fund` / `MF` suffixes, + so schemes from a few newer AMCs whose names end in `Fund House` + ended up bucketed under the previous AMC. +- **ISIN / AMFI enrichment has a direct-ISIN fallback** path via + `MFISINDb.direct_isin_lookup` for the case where multi-line + `Registrar:` rendering corrupts the RTA token. + +### Fixed + +- **CAMS SUMMARY `valuation.date` no longer mis-parses to year 201** + (was a column-boundary bug — the NAVDate column treated as + right-aligned with a 42pt width clipped the trailing year digit, + then Pydantic mis-coerced the `01-Jan-201` string). +- **CDSL multi-account statements** (5+ demat accounts on one PDF) are + now parsed correctly. Earlier the page-3+ scan only kicked in from + page 8, dropping holdings sections that landed on pages 4-7. +- **CDSL MF holdings** rows with `DIRECT` (or any non-`ARN-XXXX` + distribution-mode token) now correctly populate `pnl` and `return_`. + ## 0.9.0 - 2026-05-22 -- Add support for CDSL sttements +- Add support for CDSL statements - Drop support for Python 3.9 and 3.10; minimum supported version is now 3.11 - Support PyMuPDF >= 1.25 (1.27.x tested). Older `<1.25` pin removed. 
- Bump `casparser-isin` to `>= 2026.5.1` (new DB format v2 with @@ -11,20 +75,9 @@ field (Python attribute `return_`) also gets the comma-stripping treatment; previously NSDL MF folio rows with a return value of 1 lakh or more would fail Decimal validation. -- Parser robustness fixes for PyMuPDF 1.25+ text extraction quirks: - - Re-emit visual rows as separate blocks for CAMS/KFINTECH so the - table header / folio header no longer get merged when the new - block grouping collapses them into a single PyMuPDF block. - - Recover the registrar value (e.g. `KFINTECH`) when it wraps to the - next line. - - Recover the advisor value when the scheme name wraps before the - advisor closing paren. - - Pull ISIN/Advisor onto the scheme line when long scheme names wrap. - - Tax transactions (`*** Stamp Duty ***`, STT, TDS) no longer absorb - spurious units when an adjacent column wraps onto the same row. - - NSDL holdings: widen the y-band tolerance, drop the strict - multiline `$` anchoring, and accept tab-separated wrapped names so - the regexes match consistently across Python 3.11–3.14. +- Parser robustness fixes for PyMuPDF 1.25+ text extraction quirks + (all superseded in 1.0.0 by the pypdfium2 rewrite, kept here for + the historical record). ## 0.8.1 - 2025-09-21 - NSDL parser bug fixes diff --git a/README.md b/README.md index 7843491..8c53e41 100644 --- a/README.md +++ b/README.md @@ -6,7 +6,8 @@ [![codecov](https://codecov.io/gh/codereverser/casparser/branch/main/graph/badge.svg?token=DYZ7TXWRGI)](https://codecov.io/gh/codereverser/casparser) ![PyPI - Python Version](https://img.shields.io/pypi/pyversions/casparser) -Parse Consolidated Account Statement (CAS) PDF files generated from CAMS/KFINTECH +Parse Consolidated Account Statement (CAS) PDF files generated from +CAMS, KFintech, NSDL, and CDSL. 
`casparser` also includes a command line tool with the following analysis tools - `summary`- print portfolio summary @@ -19,13 +20,8 @@ Parse Consolidated Account Statement (CAS) PDF files generated from CAMS/KFINTEC pip install -U casparser ``` -### with faster PyMuPDF parser -```bash -pip install -U 'casparser[fast]' -``` - -**Note:** Enabling this dependency could result in licensing changes. Check the -[License](#license) section for more details +Since v1.0 the parser is built on [pypdfium2](https://github.com/pypdfium2-team/pypdfium2) +(Apache-2.0 / BSD-3) — no optional PDF backends, no GPL/AGPL dependencies. ## Usage @@ -50,7 +46,7 @@ csv_str = casparser.read_cas_pdf("/path/to/cas/file.pdf", "password", output="cs "from": "YYYY-MMM-DD", "to": "YYYY-MMM-DD" }, - "file_type": "CAMS/KARVY/UNKNOWN", + "file_type": "CAMS/KFINTECH/NSDL/CDSL/UNKNOWN", "cas_type": "DETAILED/SUMMARY", "investor_info": { "email": "string", @@ -122,6 +118,9 @@ Notes: - `MISC` - `dividend_rate` is applicable only for `DIVIDEND_PAYOUT` and `DIVIDEND_REINVESTMENT` transactions. +- NSDL and CDSL statements return a different top-level shape with + `accounts[].equities[]` and `accounts[].mutual_funds[]` instead of + `folios[].schemes[]`. See `casparser.types.NSDLCASData` for details. ### CLI @@ -143,8 +142,6 @@ Usage: casparser [-o output_file.json|output_file.csv] [-p password] [-s] [-a] C --gains-112a ask|FY2020-21 Generate Capital Gains Report - 112A format for a given financial year - Use 'ask' for a prompt from available options (BETA) - --force-pdfminer Force PDFMiner parser even if MuPDF is - detected --version Show the version and exit. -h, --help Show this message and exit. @@ -199,11 +196,16 @@ failing scheme name(s). ## License -CASParser is distributed under MIT license by default. 
However enabling the optional dependency -`mupdf/fast` would imply the use of [PyMuPDF](https://github.com/pymupdf/PyMuPDF) / -[MuPDF](https://mupdf.com/license.html) and hence the licenses GNU GPL v3 and GNU Affero GPL v3 -would apply. Copies of all licenses have been included in this repository. - _IANAL_ +CASParser is distributed under the MIT license. Up to v0.8 the optional +`mupdf` / `fast` extra pulled in [PyMuPDF](https://github.com/pymupdf/PyMuPDF) / +[MuPDF](https://mupdf.com/license.html), which would have caused GNU GPL v3 +and GNU Affero GPL v3 to apply transitively. v1.0 dropped that extra +(the PyMuPDF and pdfminer.six backends are gone; the parser now runs on +[pypdfium2](https://github.com/pypdfium2-team/pypdfium2), which is dual +Apache-2.0 / BSD-3), so casparser is now pure MIT end-to-end. ## Resources 1. [CAS from CAMS](https://www.camsonline.com/Investors/Statements/Consolidated-Account-Statement) 2. [CAS from Karvy/Kfintech](https://mfs.kfintech.com/investor/General/ConsolidatedAccountStatement) +3. [NSDL Consolidated Account Statement](https://nsdlcas.nsdl.com/) +4. 
[CDSL Consolidated Account Statement](https://www.cdslindia.com/Investors/Cas.html) diff --git a/casparser/__init__.py b/casparser/__init__.py index 3684a46..a7615e5 100644 --- a/casparser/__init__.py +++ b/casparser/__init__.py @@ -9,4 +9,4 @@ "CapitalGainsReport", ] -__version__ = "0.9.0" +__version__ = "1.0.0" diff --git a/casparser/cli.py b/casparser/cli.py index 9762422..63cce9f 100644 --- a/casparser/cli.py +++ b/casparser/cli.py @@ -37,7 +37,7 @@ def formatINR(number): else: last3 = int_part[-3:] rest = int_part[:-3] - groups = [rest[max(0, i - 2):i or None] for i in range(len(rest), 0, -2)][::-1] + groups = [rest[max(0, i - 2) : i or None] for i in range(len(rest), 0, -2)][::-1] if groups and groups[0]: r = ",".join(groups + [last3]) else: @@ -82,7 +82,7 @@ def print_nsdl(parsed_data: NSDLCASData): ) summary_table.add_row(Padding("File Type :", spacing), f"[bold]{data['file_type']}[/]") # summary_table.add_row(Padding("CAS Type :", spacing), f"[bold]{data['cas_type']}[/]") - for key, value in data["investor_info"].items(): + for key, value in (data.get("investor_info") or {}).items(): summary_table.add_row( Padding(f"{key.capitalize()} :", spacing), re.sub(r"[^\S\r\n]+", " ", value) ) @@ -208,7 +208,7 @@ def print_summary(parsed_data: CASData, output_filename=None, include_zero_folio summary_table.add_row(Padding("File Type :", spacing), f"[bold]{data['file_type']}[/]") summary_table.add_row(Padding("CAS Type :", spacing), f"[bold]{data['cas_type']}[/]") - for key, value in data["investor_info"].items(): + for key, value in (data.get("investor_info") or {}).items(): summary_table.add_row( Padding(f"{key.capitalize()} :", spacing), re.sub(r"[^\S\r\n]+", " ", value) ) diff --git a/casparser/parsers/__init__.py b/casparser/parsers/__init__.py index efdbf3e..a56ad4c 100644 --- a/casparser/parsers/__init__.py +++ b/casparser/parsers/__init__.py @@ -1,72 +1,156 @@ +"""Top-level dispatcher for `casparser.read_cas_pdf`. 
def _sort_transactions(data: CASData) -> CASData:
    """Put every scheme's transactions into date order, in place.

    Schemes whose transactions are already chronological are left
    untouched. For any other scheme the transaction list is re-ordered
    by ``date`` and each transaction's ``balance`` is overwritten with a
    running total that starts from the scheme's ``open`` balance;
    transactions without units add zero to that total.

    :param data: parsed statement whose folios / schemes are updated.
    :return: the same ``data`` object that was passed in.
    """
    for folio in data.folios:
        for position, scheme in enumerate(folio.schemes):
            txn_dates = [txn.date for txn in scheme.transactions]
            if txn_dates != sorted(txn_dates):
                reordered = sorted(scheme.transactions, key=lambda txn: txn.date)
                running = scheme.open
                for txn in reordered:
                    running += txn.units or 0
                    txn.balance = running
                scheme.transactions = reordered
                folio.schemes[position] = scheme
    return data


def read_cas_pdf(
    filename: Union[str, io.IOBase],
    password: str,
    output: str = "dict",
    sort_transactions: bool = True,
    force_pdfminer: bool = False,
):
    """Parse a Consolidated Account Statement PDF.

    The issuer (and, for CAMS / KFintech, the DETAILED vs SUMMARY
    variant) is detected first, then the matching issuer-specific parser
    is imported lazily and run against a single shared document handle.

    :param filename: path to the CAS PDF, or an open file-like object.
    :param password: PDF password.
    :param output: ``"dict"`` returns the parsed model itself, ``"csv"``
        returns ``cas2csv(...)``; any other value returns
        ``cas2json(...)``.
    :param sort_transactions: when true and the result is a ``CASData``,
        order each scheme's transactions by date and recompute the
        running balances.
    :param force_pdfminer: deprecated no-op; a truthy value only emits a
        ``DeprecationWarning``.
    :raises CASParseError: if the issuer, or the CAMS / KFin statement
        variant, cannot be identified.
    :return: the parsed model, or its CSV / JSON serialisation.
    """
    if force_pdfminer:
        # Warn from this frame so stacklevel=2 points at the caller.
        warnings.warn(
            "force_pdfminer is deprecated in casparser 1.0 — pdfminer is no longer a supported backend.",
            DeprecationWarning,
            stacklevel=2,
        )

    # One open document is shared by detection and parsing so the PDF is
    # not decrypted and re-parsed for every step.
    doc = _open_document(filename, password)

    file_type = detect_file_type(filename, password, _doc=doc)
    if file_type == FileType.UNKNOWN:
        raise CASParseError(
            "Could not identify the CAS issuer. Supported issuers are CAMS, KFintech, NSDL, and CDSL."
        )

    data: Union[CASData, NSDLCASData]
    if file_type == FileType.NSDL:
        from . import nsdl

        data = nsdl.parse_nsdl(filename, password, file_type=FileType.NSDL, _doc=doc)
    elif file_type == FileType.CDSL:
        from . import cdsl

        data = cdsl.parse_cdsl(filename, password, file_type=FileType.CDSL, _doc=doc)
    elif file_type in (FileType.CAMS, FileType.KFINTECH):
        cas_type = detect_cas_type(filename, password, _doc=doc)
        if cas_type == CASFileType.DETAILED:
            from . import cams_detailed as variant_parser
        elif cas_type == CASFileType.SUMMARY:
            from . import cams_summary as variant_parser
        else:
            raise CASParseError(
                "Could not identify whether this is a DETAILED or SUMMARY CAMS / KFin statement."
            )
        data = variant_parser.parse(filename, password, file_type=file_type, _doc=doc)
        if sort_transactions and isinstance(data, CASData):
            data = _sort_transactions(data)
    else:  # pragma: no cover — handled above
        raise CASParseError(f"Unsupported file type: {file_type}")

    if output == "dict":
        return data
    serialise = cas2csv if output == "csv" else cas2json
    return serialise(data)


__all__ = ["read_cas_pdf"]
# Matches an IDCW / dividend transaction description. Captures the
# "reinvest" hint (if present) and the per-unit rupee value.
DIVIDEND_RE = re.compile(
    r"(?:div\.|dividend|idcw).+?(reinvest)*.*?@\s*Rs\.\s*([\d\.]+)(?:\s+per\s+unit)?",
    re.I | re.DOTALL,
)


def _parse_dividend_rate(raw: str) -> Optional[Decimal]:
    """Convert the rate captured by `DIVIDEND_RE` into a `Decimal`.

    The capture group is `[\\d\\.]+`, so it can swallow a sentence-ending
    period ("1.50.") or be made of dots only. Trailing dots are dropped
    before conversion, and anything that still is not a valid number
    yields `None` instead of raising out of the parser.
    """
    from decimal import InvalidOperation

    try:
        return Decimal(raw.rstrip("."))
    except InvalidOperation:
        return None


def get_transaction_type(
    description: str, units: Optional[Decimal]
) -> Tuple[TransactionType, Optional[Decimal]]:
    """Classify a transaction by its description + units sign.

    Dividend / IDCW descriptions (anything `DIVIDEND_RE` matches) win
    over every other rule. Otherwise a missing `units` value means a
    tax / misc line, positive units a purchase-like transaction and
    negative units a redemption-like one; exactly zero units is
    reported as `UNKNOWN`.

    :param description: raw transaction description; matched
        case-insensitively.
    :param units: signed unit count, or `None` when the row has none.
    :return: `(transaction_type, dividend_rate_or_None)`. The dividend
        rate is only set for IDCW / dividend transactions, and is
        `None` there too if the printed rate is not a valid number.
    """
    dividend_rate: Optional[Decimal] = None
    description = description.lower()
    if div_match := DIVIDEND_RE.search(description):
        reinvest_flag, dividend_str = div_match.groups()
        # Tolerant conversion: a captured trailing "." must not abort
        # the whole statement parse.
        dividend_rate = _parse_dividend_rate(dividend_str)
        txn_type = (
            TransactionType.DIVIDEND_REINVEST if reinvest_flag else TransactionType.DIVIDEND_PAYOUT
        )
    elif units is None:
        if "stt" in description:
            txn_type = TransactionType.STT_TAX
        elif "stamp" in description:
            txn_type = TransactionType.STAMP_DUTY_TAX
        elif "tds" in description:
            txn_type = TransactionType.TDS_TAX
        else:
            txn_type = TransactionType.MISC
    elif units > 0:
        if "switch" in description:
            txn_type = (
                TransactionType.SWITCH_IN_MERGER
                if "merger" in description
                else TransactionType.SWITCH_IN
            )
        elif "segregat" in description:
            txn_type = TransactionType.SEGREGATION
        elif (
            "sip" in description
            or "systematic" in description
            or re.search(r"instal+ment", description, re.I)
            or re.search(r"sys.+?invest", description, re.I | re.DOTALL)
        ):
            txn_type = TransactionType.PURCHASE_SIP
        else:
            txn_type = TransactionType.PURCHASE
    elif units < 0:
        if re.search(
            r"reversal|rejection|dishonoured|mismatch|insufficient\s+balance",
            description,
            re.I,
        ):
            txn_type = TransactionType.REVERSAL
        elif "switch" in description:
            txn_type = (
                TransactionType.SWITCH_OUT_MERGER
                if "merger" in description
                else TransactionType.SWITCH_OUT
            )
        else:
            txn_type = TransactionType.REDEMPTION
    else:
        txn_type = TransactionType.UNKNOWN

    return txn_type, dividend_rate


def get_parsed_scheme_name(scheme: str) -> str:
    """Normalise a raw scheme name.

    Removes `(formerly ...)` / `(erstwhile ...)` parentheticals, cuts
    everything from a `(Demat` / `(Non-Demat` marker onwards, collapses
    runs of whitespace to a single space, and finally trims trailing
    characters that are not letters, digits, `_` or `)`.
    """
    scheme = re.sub(
        r"\((formerly|erstwhile).+?\)",
        "",
        scheme,
        flags=re.I | re.DOTALL,
    ).strip()
    scheme = re.sub(
        r"\((Demat|Non-Demat).*",
        "",
        scheme,
        flags=re.I | re.DOTALL,
    ).strip()
    scheme = re.sub(r"\s+", " ", scheme).strip()
    return re.sub(r"[^a-zA-Z0-9_)]+$", "", scheme).strip()
# Top-left column cutoff: only atoms whose left edge is below this
# x-coordinate are considered part of the investor block.
_LEFT_COLUMN_X = 200.0


_EMAIL_RE = re.compile(r"Email\s*Id\s*:\s*(\S+@\S+)", re.I)
# The value is optional on purpose: a `Mobile:` label with a blank or
# non-numeric value must still terminate the investor block, otherwise
# the rest of the page's left column is read as address lines.
_MOBILE_RE = re.compile(r"Mobile\s*:\s*([+\d]*)", re.I)
_PHONE_RE = re.compile(r"^\s*Phone\s+Off\s*:", re.I)
_PINCODE_RE = re.compile(r"^\s*(?:Pin\s*code|PINCODE)\s*:\s*\d+", re.I)
_ID_MARKER_RE = re.compile(r"^\s*(?:CAS|NSDL)\s*ID\s*:", re.I)


def _left_column_atoms(atoms: List[Atom]) -> List[Atom]:
    """Return the non-blank atoms left of `_LEFT_COLUMN_X`, ordered by
    descending `y_top` (the source treats that as top-down)."""
    filtered = [a for a in atoms if a.x_left < _LEFT_COLUMN_X and a.text.strip()]
    filtered.sort(key=lambda a: -a.y_top)
    return filtered


def extract_cams_kfin_investor(
    pdf_path,
    password,
    *,
    _doc: "Optional[pdfium.PdfDocument]" = None,
    _atoms: Optional[List[List[Atom]]] = None,
) -> InvestorInfo:
    """Read the investor block from the top-left of page 1.

    Expected layout (one atom per line, top-down):

        Email Id: EMAIL
        NAME
        ADDRESS LINE 1
        ...
        ADDRESS LINE N
        [Phone Off: ...]      <- only on some KFin templates
        Mobile: MOBILE

    Everything before the `Email Id:` line is ignored. After it, the
    first line is the name and the following lines are the address,
    with `Phone Off:` lines dropped. The `Mobile:` line ends the block;
    its value may be empty, in which case `mobile` is returned as "".

    :param pdf_path: path / file object handed to `extract_atoms` when
        `_atoms` is not supplied.
    :param password: PDF password, forwarded to `extract_atoms`.
    :param _doc: already-open document, forwarded to `extract_atoms`.
    :param _atoms: pre-extracted per-page atoms; when given, the PDF is
        not read again.
    :raises CASParseError: if no name could be found after an
        `Email Id:` line.
    """
    pages = (
        _atoms
        if _atoms is not None
        else extract_atoms(
            pdf_path,
            password,
            _doc=_doc,
        )
    )
    block = _left_column_atoms(pages[0]) if pages else []

    email = ""
    mobile = ""
    name = ""
    address_lines: List[str] = []
    seen_email = False

    for atom in block:
        text = atom.text.strip()
        if m := _EMAIL_RE.match(text):
            email = m.group(1).strip()
            seen_email = True
            continue
        if m := _MOBILE_RE.match(text):
            mobile = m.group(1).strip()
            # Mobile is the last field of the investor block — stop here
            # so the transaction table that follows isn't picked up.
            break
        if not seen_email:
            continue
        if _PHONE_RE.match(text):
            continue
        if not name:
            name = text
        else:
            address_lines.append(text)

    if not name:
        raise CASParseError(
            "Could not extract investor info from CAMS/KFin CAS PDF. "
            "Expected an `Email Id:` line followed by name + address + "
            "`Mobile:` in the top-left column of page 1."
        )
    return InvestorInfo(
        name=name,
        email=email,
        address="\n".join(address_lines),
        mobile=mobile,
    )


def extract_nsdl_cdsl_investor(
    pdf_path,
    password,
    *,
    _doc: "Optional[pdfium.PdfDocument]" = None,
    _atoms: Optional[List[List[Atom]]] = None,
) -> InvestorInfo:
    """Read the investor block from the top-left of page 2.

    The block starts after a `CAS ID:` or `NSDL ID:` marker line. The
    first line after the marker is the name; the following lines are
    the address, up to and including the first `PINCODE:` line. Email
    and mobile are always returned as empty strings.

    :param pdf_path: path / file object handed to `extract_atoms` when
        `_atoms` is not supplied.
    :param password: PDF password, forwarded to `extract_atoms`.
    :param _doc: already-open document, forwarded to `extract_atoms`.
    :param _atoms: pre-extracted per-page atoms; when given, the PDF is
        not read again.
    :raises CASParseError: if the document has fewer than two pages or
        no name follows the ID marker.
    """
    pages = (
        _atoms
        if _atoms is not None
        else extract_atoms(
            pdf_path,
            password,
            _doc=_doc,
        )
    )
    block = _left_column_atoms(pages[1]) if len(pages) >= 2 else []

    name = ""
    address_lines: List[str] = []
    seen_marker = False
    for atom in block:
        text = atom.text.strip()
        if _ID_MARKER_RE.match(text):
            seen_marker = True
            continue
        if not seen_marker:
            continue
        if not name:
            name = text
            continue
        address_lines.append(text)
        # The pincode line is kept as the last address line.
        if _PINCODE_RE.match(text):
            break

    if not name:
        raise CASParseError(
            "Could not extract investor info from NSDL/CDSL CAS PDF. "
            "Expected a `CAS ID:` / `NSDL ID:` marker followed by name + "
            "address in the top-left column of page 2."
        )
    return InvestorInfo(
        name=name,
        email="",
        address="\n".join(address_lines),
        mobile="",
    )


def isin_search(
    scheme_name: str,
    rta: str,
    rta_code: str,
    isin: Optional[str] = None,
) -> Tuple[Optional[str], Optional[str], Optional[str]]:
    """Look up `(ISIN, AMFI, type)` for a CAS scheme.

    The primary path is `MFISINDb.isin_lookup(scheme_name, rta,
    rta_code, isin=isin)`. If that raises `ValueError` and the caller
    supplied an `isin`, the first row of
    `MFISINDb.direct_isin_lookup(isin)` is used instead. Lookup errors
    on the fallback path are deliberately swallowed: enrichment is
    best-effort and must not fail the parse.

    :param scheme_name: Normalised scheme name from the CAS.
    :param rta: Registrar (`CAMS` / `KFINTECH` / `FTAMIL` …).
    :param rta_code: Scheme's per-RTA code.
    :param isin: Optional ISIN hint pulled from the scheme header.
    :return: `(isin, amfi_code, type)`, or `(None, None, None)` when
        neither path finds the scheme.
    """
    with MFISINDb() as db:
        try:
            scheme_data = db.isin_lookup(scheme_name, rta, rta_code, isin=isin)
            return scheme_data.isin, scheme_data.amfi_code, scheme_data.type
        except ValueError:
            pass
        if isin:
            try:
                rows = db.direct_isin_lookup(isin)
                if rows:
                    row = rows[0]
                    return row["isin"], row["amfi_code"], row["type"]
            except (ValueError, KeyError, TypeError):
                pass
    return None, None, None
+ +Scope of this POC (handles): +- One CAS, possibly multi-page +- One AMC, one folio header per folio, one scheme header per scheme +- Transaction table with 6 standard columns (Date / Transaction / Amount / + Units / Price / Unit Balance) +- "Opening Unit Balance", "Closing Unit Balance", "NAV on", "Valuation on" + labeled rows + +Deferred (TODO markers below): +- Multi-line transaction descriptions (we keep first line only) +- ISIN / AMFI lookup +- Nominees +- Segregated portfolios +- Total Cost Value parsing +- Investor info / statement period +""" + +from __future__ import annotations + +import re +from dataclasses import dataclass +from decimal import Decimal +from typing import List, Optional + +from dateutil import parser as dateparse + +from casparser.enums import CASFileType, FileType +from casparser.types import ( + CASData, + Folio, + Scheme, + SchemeValuation, + StatementPeriod, + TransactionData, +) + +from ._classify import get_parsed_scheme_name, get_transaction_type +from ._investor import extract_cams_kfin_investor +from ._isin import isin_search +from .extract import Char, Line, extract_pages + +# ----------------------------------------------------------------------------- +# Column anchors +# ----------------------------------------------------------------------------- + +# CAMS DETAILED transaction table. The header is two physical rows in the +# PDF: "Date Transaction Amount Units Price Unit" on top, "(INR) (INR) +# Balance" below. We require ≥4 of these labels on one line. +TXN_HEADER_LABELS = {"Date", "Transaction", "Amount", "Units", "Price", "Unit", "Balance", "NAV"} +TXN_MIN_HITS = 4 + +# All numeric columns are right-aligned; Date and Transaction are left-aligned. 
ALIGN = {
    "Date": "left",
    "Transaction": "left",
    "Amount": "right",
    "Units": "right",
    "Price": "right",
    "Unit Balance": "right",
    "NAV": "right",
}

# Maximum distance (pts) between the x-centres of a "Unit" header word and a
# "Balance" header word for the pair to be merged into one "Unit Balance"
# column.
UNIT_BALANCE_MAX_CENTER_GAP = 30.0


@dataclass
class Column:
    """One column of the transaction table, located from its header label."""

    label: str
    x_lo: float  # range covering header label width
    x_hi: float
    alignment: str  # 'left' | 'right'

    @property
    def x_anchor(self) -> float:
        """For right-aligned columns, x_hi is the snap target; for left,
        x_lo is."""
        return self.x_hi if self.alignment == "right" else self.x_lo


def _words_on_line(line: Line, min_gap: float = 1.5) -> List[tuple[str, float, float]]:
    """Return [(text, x0, x1)] words on a line, splitting on x-gap > min_gap.

    Chars are visited in ascending ``x0`` order; a new word starts whenever
    the horizontal gap to the previous char exceeds ``min_gap``.
    """
    cs = sorted(line.chars, key=lambda c: c.x0)
    words = []
    cur, cur_x0, cur_x1 = "", None, None
    for c in cs:
        if cur and (c.x0 - cur_x1) > min_gap:
            words.append((cur, cur_x0, cur_x1))
            cur = ""
        if not cur:
            cur_x0 = c.x0
        cur += c.text
        cur_x1 = c.x1
    if cur:
        words.append((cur, cur_x0, cur_x1))
    return words


HEADER_WINDOW_Y = 15.0  # vertical span (pts) that constitutes one logical
# header block. CAMS uses 2 baselines spanning ~10pt; KFin uses 4 baselines
# spanning ~11pt (Amount/Price at top, Unit, Date/Transaction/Units, (INR)
# /(INR)/Balance at bottom).


def detect_txn_columns(lines: List[Line], start_idx: int) -> Optional[tuple[int, List[Column]]]:
    """Find the next transaction-table header at or after start_idx.

    A header is a y-window of consecutive lines (top-down) spanning
    ≤ HEADER_WINDOW_Y points and collectively containing ≥ TXN_MIN_HITS
    distinct column labels. We collect labels from the whole window so wraps
    like "Unit"/"Balance" stacked over 2 baselines or KFin's 4-baseline split
    behave the same.

    :param lines: Lines of one page; the window test assumes baselines
        decrease as the index grows (top-down order).
    :param start_idx: First line index to consider as the top of a header.
    :return: ``(index_of_last_line_in_header, ordered columns)`` or ``None``
        when no window qualifies. Transaction parsing should start at
        index + 1.
    """
    for i in range(start_idx, len(lines)):
        window = [lines[i]]
        for j in range(i + 1, len(lines)):
            if lines[i].baseline - lines[j].baseline > HEADER_WINDOW_Y:
                break
            window.append(lines[j])

        all_words: List[tuple[str, float, float]] = []
        for line in window:
            all_words.extend(_words_on_line(line))
        labels = {w[0] for w in all_words if w[0] in TXN_HEADER_LABELS}
        if len(labels) < TXN_MIN_HITS:
            continue
        last_idx = i + len(window) - 1
        return last_idx, _build_columns(all_words)
    return None


def _build_columns(words: List[tuple[str, float, float]]) -> List[Column]:
    """Map header words to Columns. Merge "Unit"+"Balance" into one column.

    A "Unit" word is paired with the first "Balance" word whose x-centre lies
    within UNIT_BALANCE_MAX_CENTER_GAP of its own; the merged column spans
    both words. Every other word that is a key of ALIGN becomes its own
    column. The result is ordered left to right by ``x_lo``.
    """
    # Collect the "Balance" candidates once instead of rescanning `words`
    # for every header word.
    balances = [w for w in words if w[0] == "Balance"]
    cols: List[Column] = []
    for text, x0, x1 in words:
        if text == "Unit" and balances:
            centre = (x0 + x1) / 2
            for _, b_x0, b_x1 in balances:
                if abs((b_x0 + b_x1) / 2 - centre) < UNIT_BALANCE_MAX_CENTER_GAP:
                    cols.append(Column("Unit Balance", min(x0, b_x0), max(x1, b_x1), "right"))
                    break
        elif text in ALIGN and text not in ("Unit", "Balance"):
            cols.append(Column(text, x0, x1, ALIGN[text]))
    cols.sort(key=lambda c: c.x_lo)
    return cols


NUMERIC_ZONE_WIDTH = 55.0  # pts; right-aligned numeric values sit within
# this width to the left of the column's x_hi. Wide enough for any common
# Indian-format amount (e.g. "1,23,45,678.90") but narrow enough to exclude
# wrapped description text that bleeds in from the left.


def _column_ranges(columns: List[Column]) -> List[tuple[Column, float, float]]:
    """Compute x-range per column. Right-aligned numeric columns get a
    fixed-width zone ending at x_hi. Left-aligned columns extend from x_lo to
    the start of the next column's zone.

    The fundamental asymmetry: description text (Transaction column) is wide
    and naturally extends into the Amount column's x-space, while the actual
    amount value is in a narrow zone right-aligned to x_hi. Hence numeric
    columns are bounded by content-width, not by midpoint to neighbors.

    :return: ``(column, lo, hi)`` triples ordered by column x-centre; a char
        belongs to a column when ``lo <= x_mid < hi``.
    """
    sorted_cols = sorted(columns, key=lambda c: (c.x_lo + c.x_hi) / 2)
    ranges: List[tuple[Column, float, float]] = []
    for i, col in enumerate(sorted_cols):
        if col.alignment == "right":
            lo = col.x_hi - NUMERIC_ZONE_WIDTH
            hi = col.x_hi + 3.0
        else:
            lo = col.x_lo - 3.0
            if i + 1 < len(sorted_cols):
                nxt = sorted_cols[i + 1]
                hi = nxt.x_hi - NUMERIC_ZONE_WIDTH if nxt.alignment == "right" else nxt.x_lo - 3.0
            else:
                # Last column on the row: open-ended to the right.
                hi = float("inf")
        ranges.append((col, lo, hi))
    return ranges


def assign_cells(line: Line, columns: List[Column]) -> dict[str, str]:
    """Bucket each char into a column by x-midpoint, then render each cell
    text in left-to-right order. Overlay duplicates are already filtered
    upstream by ``extract.extract_pages`` at the atom level.

    :return: ``{column label: cell text}``; columns that received no chars
        are omitted. When zones overlap, the first (left-most) zone wins.
    """
    ranges = _column_ranges(columns)
    cells: dict[str, list[Char]] = {c.label: [] for c in columns}
    for ch in line.chars:
        x_mid = (ch.x0 + ch.x1) / 2
        for col, lo, hi in ranges:
            if lo <= x_mid < hi:
                cells[col.label].append(ch)
                break
    out = {}
    for label, chars in cells.items():
        if not chars:
            continue
        chars.sort(key=lambda c: c.x0)
        # A gap wider than ~60% of the median glyph height (never less than
        # 1.5pt) is rendered as a single space between words.
        heights = sorted(c.h for c in chars)
        h_med = heights[len(heights) // 2]
        gap = max(1.5, 0.6 * h_med)
        parts, prev_x1 = [], None
        for c in chars:
            if prev_x1 is not None and (c.x0 - prev_x1) > gap:
                parts.append(" ")
            parts.append(c.text)
            prev_x1 = c.x1
        out[label] = "".join(parts).strip()
    return out


# -----------------------------------------------------------------------------
# Label parsers (folio/scheme/labeled rows)
# -----------------------------------------------------------------------------

FOLIO_LINE_RE = re.compile(
    # Folio format: digits, with an optional " / digits" sub-account
    # suffix. Spaces around the slash are common in the source PDF.
    # Each of PAN / KYC / PAN-KYC is optional but when present
    # appears in this order on the same line. `.*?` lives *inside*
    # the optional group so a non-greedy match doesn't skip past it
    # and leave the capture empty.
    r"Folio\s+No\s*:\s*(\d+(?:\s*/\s*\d+)?)"
    r"(?:.*?PAN\s*:\s*([A-Z]{5}\d{4}[A-Z]))?"
    r"(?:.*?KYC\s*:\s*(OK|NOT OK))?"
    r"(?:.*?PAN\s*:\s*(OK|NOT OK))?",
    re.I,
)
SCHEME_HEAD_RE = re.compile(
    # `CODE-NAME Registrar: RTA`. The NAME chunk may carry inline
    # `(Advisor: ...)` and `- ISIN: ...` segments in either
    # order — newer KFin templates put `(Advisor:...) - ISIN:...`,
    # newer CAMS templates put `- ISIN: ...(Advisor: ...)`. We capture
    # everything between code and Registrar as `name` and then strip
    # the advisor / ISIN fragments out in a second pass.
    r"^(?P<code>[\w\s]+?)-\s*(?P<name>.+?)"
    r"\s+Registrar\s*:\s*(?P<rta>\S+)",
    re.I,
)
INLINE_ISIN_RE = re.compile(r"[-\s]*ISIN\s*:\s*([A-Z0-9]+)", re.I)
INLINE_ADVISOR_RE = re.compile(r"[-\s]*\(\s*Advisor\s*:\s*([^)]+?)\)", re.I)
SCHEME_HEAD_RTA_RE = re.compile(r"Registrar\s*:\s*(\S+)", re.I)
OPEN_BAL_RE = re.compile(r"Opening\s+Unit\s+Balance\s*:?\s*([\d,.]+)", re.I)
CLOSE_BAL_RE = re.compile(r"Closing\s+Unit\s+Balance\s*:?\s*([\d,.]+)", re.I)
NAV_RE = re.compile(r"NAV\s+on\s+(\d{2}-[A-Za-z]{3}-\d{4})\s*:\s*INR\s*([\d,.]+)", re.I)
VALUATION_RE = re.compile(
    r"(?:Valuation|Market\s+Value)\s+on\s+(\d{2}-[A-Za-z]{3}-\d{4})\s*:\s*INR\s*([\d,.]+)",
    re.I,
)
COST_VALUE_RE = re.compile(r"Total\s+Cost\s+Value\s*:?\s*([\d,.]+)", re.I)
# Nominee block on the folio header. Three optional name slots; an
# empty slot ("Nominee 2: ") means no nominee at that position.
NOMINEE_RE = re.compile(
    r"Nominee\s+1\s*:\s*(?P<n1>[^:]*?)\s*(?:Nominee\s+2\s*:\s*(?P<n2>[^:]*?)\s*"
    r"(?:Nominee\s+3\s*:\s*(?P<n3>.*?))?)?$",
    re.I,
)
STMT_PERIOD_RE = re.compile(
    r"(\d{2}-[A-Za-z]{3}-\d{4})\s+To\s+(\d{2}-[A-Za-z]{3}-\d{4})",
    re.I,
)
# AMC header line. Most issuers end in "Mutual Fund" or "MF"; a few
# newer entrants use "NAME Fund House" instead. We anchor on the
# trailing suffix so disclaimer paragraphs that happen to mention an
# AMC name mid-sentence don't get classified as section headers.
AMC_RE = re.compile(
    r"^(.+?\s+(?:MF|Mutual\s*Fund|Fund\s*House))$",
    re.I,
)
# Extract leading date pattern. Accept "25-Oct-2021", "25 Oct 2021",
# "25Oct2021", etc. Dashes sometimes sit on a different baseline. The
# regex anchors only at start so it survives stray trailing chars
# (e.g. KFin's instalment number "1" leaking from the description column).
DATE_CELL_RE = re.compile(r"^\s*(\d{1,2}[-\s]*[A-Za-z]{3}[-\s]*\d{4})")

# Scheme-header stitching. Neighbouring baselines within this many points of
# the scheme line are considered part of the same header block.
SCHEME_STITCH_Y_BAND = 5.0
# A line consisting only of the "Registrar" label (colon optional).
_BARE_REGISTRAR_RE = re.compile(r"Registrar\s*:?", re.I)
# Any of the annotation markers that can wrap onto an adjacent baseline.
_HEADER_MARKER_RE = re.compile(r"Registrar\s*:|Advisor\s*:|ISIN\s*:", re.I)
# A line consisting only of the RTA value (optionally closing a parenthesis).
_RTA_VALUE_RE = re.compile(r"(CAMS|KFINTECH|KFIN)\)?", re.I)
# Scheme line that ends with a marker whose value is on the next baseline.
_TRAILING_MARKER_RE = re.compile(
    r"(Registrar\s*:|Advisor\s*:|ISIN\s*:|\(\s*Advisor\s*:)\s*$",
    re.I,
)
# Runs of dashes / whitespace inside a date cell.
_DATE_SEP_RE = re.compile(r"[-\s]+")


def _decimal(s: Optional[str]) -> Optional[Decimal]:
    """Parse a numeric cell into a ``Decimal``.

    Thousands separators are dropped; a leading ``(`` or ``-`` marks the
    value as negative.

    :param s: Raw cell text, or ``None``.
    :return: The parsed value, or ``None`` when the text is missing, blank,
        not a number, or a non-finite token such as ``NaN`` / ``Infinity``.
    """
    # Function-scope import so the module's import block stays untouched.
    from decimal import InvalidOperation

    if s is None:
        return None
    s = s.strip()
    if not s:
        return None
    neg = s.startswith(("(", "-"))
    s = s.lstrip("(").rstrip(")").lstrip("-").replace(",", "")
    try:
        d = Decimal(s)
    except InvalidOperation:
        return None
    # Decimal() accepts "NaN" / "Infinity"; such a value would poison the
    # running unit balance, so treat it like any other non-numeric cell.
    if not d.is_finite():
        return None
    return -d if neg else d


def _stitch_scheme_header(lines: List[Line], i: int) -> str:
    """Rebuild the logical scheme-header text around line ``i``.

    The scheme block can span up to 3 baselines depending on AMC and
    statement template: the ``Registrar :`` label, the advisor / ISIN
    annotations or the bare RTA value (CAMS / KFINTECH) may sit on a
    baseline just above or below the scheme line. Up to 2 lines above and
    2 lines below (within SCHEME_STITCH_Y_BAND pts) are appended when they
    carry such markers.

    :return: The scheme line first (so SCHEME_HEAD_RE can anchor on the
        code), followed by the annotations found above, then those below.
    """
    text = lines[i].text.strip()
    base_y = lines[i].baseline

    parts_above: List[str] = []
    for offset in (1, 2):
        j = i - offset
        if j < 0 or lines[j].baseline - base_y > SCHEME_STITCH_Y_BAND:
            break
        t_above = lines[j].text.strip()
        if _BARE_REGISTRAR_RE.fullmatch(t_above) or _HEADER_MARKER_RE.search(t_above):
            parts_above.insert(0, t_above)

    # When the scheme line ENDS with an incomplete trailing marker (e.g.
    # "(Advisor: Registrar :"), take the next baseline below as the value
    # continuation regardless of its content — the value tokens (ARN-XYZ,
    # INAxxxxx, CAMS, KFINTECH) don't all match a fixed pattern.
    trailing_incomplete = bool(_TRAILING_MARKER_RE.search(text))

    parts_below: List[str] = []
    for offset in (1, 2):
        j = i + offset
        if j >= len(lines) or base_y - lines[j].baseline > SCHEME_STITCH_Y_BAND:
            break
        t_below = lines[j].text.strip()
        if (
            _RTA_VALUE_RE.fullmatch(t_below)
            or _HEADER_MARKER_RE.search(t_below)
            or (offset == 1 and trailing_incomplete)
        ):
            parts_below.append(t_below)

    scheme_text = " ".join([text, *parts_above, *parts_below])
    # Still ending on a bare "Registrar :" after stitching: borrow the first
    # token of the next line as the RTA value.
    if scheme_text.endswith(("Registrar :", "Registrar:")) and i + 1 < len(lines):
        toks = lines[i + 1].text.split()
        if toks:
            scheme_text = scheme_text + " " + toks[0]
    return scheme_text


# -----------------------------------------------------------------------------
# Top-level parse
# -----------------------------------------------------------------------------


def parse(
    pdf_path: str,
    password: str,
    file_type: FileType = FileType.UNKNOWN,
    *,
    _doc=None,
) -> CASData:
    """Parse a CAMS/KFin DETAILED CAS PDF into a ``CASData`` record.

    Pages are walked line by line. AMC, folio and scheme header lines update
    the current context; labeled rows (opening / closing balance, NAV,
    valuation, cost, nominees) fill in the current scheme; every other line
    after the page's transaction-table header is read as a transaction row
    through the detected columns. A page without a header reuses the columns
    of the previous page.

    :param pdf_path: Passed through to ``extract_pages`` and
        ``extract_cams_kfin_investor``.
    :param password: Passed through together with ``pdf_path``.
    :param file_type: Stored as-is on the returned ``CASData``.
    :param _doc: Forwarded unchanged to the extraction helpers.
    :return: ``CASData`` with ``cas_type`` set to ``CASFileType.DETAILED``.
    """
    pages = extract_pages(pdf_path, password, _doc=_doc)

    statement_period: Optional[StatementPeriod] = None
    folios: dict[str, Folio] = {}
    current_amc: Optional[str] = None
    current_folio: Optional[Folio] = None
    current_scheme: Optional[Scheme] = None
    last_columns: List[Column] = []  # inherited if current page lacks header

    for page in pages:
        header_pos = detect_txn_columns(page.lines, 0)
        if header_pos:
            header_idx, columns = header_pos
            last_columns = columns
        else:
            # Continuation page — no header. Inherit from previous.
            # header_idx=-1 means transactions can start from line 0.
            header_idx = -1
            columns = last_columns

        for i, line in enumerate(page.lines):
            text = line.text

            # --- statement period (first match wins) ---
            if statement_period is None:
                if m := STMT_PERIOD_RE.search(text):
                    statement_period = StatementPeriod(from_=m.group(1), to=m.group(2))

            # --- AMC ---
            if m := AMC_RE.match(text.strip()):
                current_amc = m.group(0)
                continue

            # --- Folio header ---
            if "Folio No" in text and (m := FOLIO_LINE_RE.search(text)):
                # Preserve internal " / " for compatibility with production
                # parser output format (it keeps "12124203 / 63" style).
                folio_no = m.group(1).strip()
                if folio_no not in folios:
                    folios[folio_no] = Folio(
                        folio=folio_no,
                        amc=current_amc or "UNKNOWN",
                        PAN=m.group(2) or "",
                        KYC=m.group(3) or None,
                        PANKYC=m.group(4) or None,
                        schemes=[],
                    )
                current_folio = folios[folio_no]
                current_scheme = None
                continue

            # --- Scheme header ---
            if current_folio is not None and "-" in text:
                scheme_text = _stitch_scheme_header(page.lines, i)
                if "Registrar" in scheme_text and (m := SCHEME_HEAD_RE.match(scheme_text)):
                    code = m.group("code").strip()
                    raw_name = m.group("name")
                    # Pull `(Advisor: …)` and `- ISIN: …` out of name
                    # (templates emit them in either order). Capture
                    # values first, then `re.sub` both fragments so we
                    # don't have to track shifted span offsets.
                    isin_m = INLINE_ISIN_RE.search(raw_name)
                    inline_isin = isin_m.group(1).strip() if isin_m else None
                    adv_m = INLINE_ADVISOR_RE.search(raw_name)
                    advisor = adv_m.group(1).strip() if adv_m else None
                    raw_name = INLINE_ISIN_RE.sub("", raw_name)
                    raw_name = INLINE_ADVISOR_RE.sub("", raw_name)
                    name = get_parsed_scheme_name(raw_name)
                    rta = (m.group("rta") or "").strip() or "CAMS"
                    isin, amfi, scheme_type = isin_search(
                        name,
                        rta,
                        code,
                        isin=inline_isin,
                    )
                    current_scheme = Scheme(
                        scheme=name,
                        advisor=advisor,
                        rta=rta,
                        rta_code=code,
                        isin=isin,
                        amfi=amfi,
                        type=scheme_type or "N/A",
                        open=Decimal(0),
                        close=Decimal(0),
                        close_calculated=Decimal(0),
                        valuation=SchemeValuation(
                            date=statement_period.to if statement_period else "1970-01-01",
                            nav=Decimal(0),
                            value=Decimal(0),
                        ),
                        transactions=[],
                    )
                    current_folio.schemes.append(current_scheme)
                    continue

            if current_scheme is None:
                continue

            # --- Labeled rows ---
            if m := OPEN_BAL_RE.search(text):
                current_scheme.open = _decimal(m.group(1)) or Decimal(0)
                # The running balance starts from the opening balance.
                current_scheme.close_calculated = current_scheme.open
                continue
            if m := CLOSE_BAL_RE.search(text):
                current_scheme.close = _decimal(m.group(1)) or Decimal(0)
            if m := NAV_RE.search(text):
                current_scheme.valuation.date = dateparse.parse(m.group(1)).date()
                current_scheme.valuation.nav = _decimal(m.group(2)) or Decimal(0)
            if m := VALUATION_RE.search(text):
                current_scheme.valuation.date = dateparse.parse(m.group(1)).date()
                current_scheme.valuation.value = _decimal(m.group(2)) or Decimal(0)
            if m := COST_VALUE_RE.search(text):
                current_scheme.valuation.cost = _decimal(m.group(1))
            if m := NOMINEE_RE.search(text):
                noms = [
                    (m.group("n1") or "").strip(),
                    (m.group("n2") or "").strip(),
                    (m.group("n3") or "").strip(),
                ]
                current_scheme.nominees = [n for n in noms if n]

            # --- Transaction row (only when we have columns AND we're past
            # the header block on this page) ---
            if columns and i > header_idx:
                cells = assign_cells(line, columns)
                date_str = cells.get("Date", "").strip()
                desc = cells.get("Transaction", "").strip()
                m_date = DATE_CELL_RE.match(date_str)
                if not m_date:
                    continue
                if not desc:
                    continue  # row with date but no description: skip
                # Normalize: collapse runs of dashes/spaces from overlay
                # bleed-through, e.g. "15--Jan--2021" -> "15-Jan-2021".
                date_str = _DATE_SEP_RE.sub("-", m_date.group(1)).strip("-")
                amt = _decimal(cells.get("Amount", ""))
                units = _decimal(cells.get("Units", ""))
                nav = _decimal(cells.get("Price", "") or cells.get("NAV", ""))
                bal = _decimal(cells.get("Unit Balance", ""))
                # A row with no amount AND no units is not a real transaction
                # (usually a stray date in a footnote like "Effective from
                # 01-Apr-2019…"). Skip these.
                if amt is None and units is None:
                    continue
                # Some older CAMS / KFin templates omit the per-row Price
                # column for transactions but always carry Amount + Units.
                # Derive `nav = amount / units` so downstream capital-gains
                # FIFO calculations don't crash on `nav=None`.
                if nav is None and amt is not None and units is not None and units != 0:
                    nav = (amt / units).quantize(Decimal("0.0001"))
                txn_type, dividend_rate = get_transaction_type(desc, units)
                if units is not None:
                    current_scheme.close_calculated += units
                current_scheme.transactions.append(
                    TransactionData(
                        date=dateparse.parse(date_str).date(),
                        description=desc,
                        amount=amt,
                        units=units,
                        nav=nav,
                        balance=bal,
                        type=txn_type.name,
                        dividend_rate=dividend_rate,
                    )
                )

    return CASData(
        statement_period=statement_period or StatementPeriod(**{"from": "", "to": ""}),
        folios=list(folios.values()),
        investor_info=extract_cams_kfin_investor(pdf_path, password, _doc=_doc),
        cas_type=CASFileType.DETAILED,
        file_type=file_type,
    )