From 0f72364e1949d1da66fc20987c3a5aa133a114d3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Guilherme=20Ara=C3=BAjo?= Date: Fri, 13 Feb 2026 14:54:28 -0300 Subject: [PATCH 1/4] feat: add line numbers support --- README.md | 30 +++++++++++----- include/merve/parser.h | 19 ++++++++-- src/parser.cpp | 73 +++++++++++++++++++++++++------------- tests/real_world_tests.cpp | 55 ++++++++++++++++++++++++++++ 4 files changed, 143 insertions(+), 34 deletions(-) diff --git a/README.md b/README.md index 0b000fd..05a8005 100644 --- a/README.md +++ b/README.md @@ -6,6 +6,7 @@ A fast C++ lexer for extracting named exports from CommonJS modules. This librar - **Fast**: Zero-copy parsing for most exports using `std::string_view` - **Accurate**: Handles complex CommonJS patterns including re-exports, Object.defineProperty, and transpiler output +- **Source Locations**: Each export includes a 1-based line number for tooling integration - **Unicode Support**: Properly unescapes JavaScript string literals including `\u{XXXX}` and surrogate pairs - **Optional SIMD Acceleration**: Can use [simdutf](https://github.com/simdutf/simdutf) for faster string operations - **No Dependencies**: Single-header distribution available (simdutf is optional) @@ -49,10 +50,11 @@ int main() { if (result) { std::cout << "Exports found:" << std::endl; for (const auto& exp : result->exports) { - std::cout << " - " << lexer::get_string_view(exp) << std::endl; + std::cout << " - " << lexer::get_string_view(exp) + << " (line " << exp.line << ")" << std::endl; } } - + return 0; } ``` @@ -60,9 +62,9 @@ int main() { Output: ``` Exports found: - - foo - - bar - - baz + - foo (line 2) + - bar (line 3) + - baz (line 4) ``` ## API Reference @@ -85,11 +87,22 @@ Parses CommonJS source code and extracts export information. ```cpp struct lexer_analysis { - std::vector exports; // Named exports - std::vector re_exports; // Re-exported module specifiers + std::vector exports; // Named exports + std::vector re_exports; // Re-exported module specifiers +}; +``` + +### `lexer::export_entry` + +```cpp +struct export_entry { + export_string name; + uint32_t line; // 1-based line number }; ``` +Each export/re-export entry includes the name and the 1-based line number where it was found in the source. + ### `lexer::export_string` ```cpp @@ -104,9 +117,10 @@ Export names are stored as a variant to avoid unnecessary copies: ```cpp inline std::string_view get_string_view(const export_string& s); +inline std::string_view get_string_view(const export_entry& e); ``` -Helper function to get a `string_view` from either variant type. +Helper functions to get a `string_view` from an `export_string` or `export_entry`. ### `lexer::get_last_error` diff --git a/include/merve/parser.h b/include/merve/parser.h index 2a2f406..26ea2f3 100644 --- a/include/merve/parser.h +++ b/include/merve/parser.h @@ -47,6 +47,14 @@ enum lexer_error { */ using export_string = std::variant; +/** + * @brief An export name together with its 1-based source line number. + */ +struct export_entry { + export_string name; + uint32_t line; // 1-based line number +}; + /** * @brief Result of parsing a CommonJS module. */ @@ -61,7 +69,7 @@ struct lexer_analysis { * - module.exports = { a, b, c } * - Object.defineProperty(exports, 'name', {...}) */ - std::vector exports{}; + std::vector exports{}; /** * @brief Module specifiers from re-export patterns. @@ -72,7 +80,7 @@ struct lexer_analysis { * - __export(require('other')) * - Object.keys(require('other')).forEach(...) */ - std::vector re_exports{}; + std::vector re_exports{}; }; /** @@ -89,6 +97,13 @@ inline std::string_view get_string_view(const export_string& s) { return std::visit([](const auto& v) -> std::string_view { return v; }, s); } +/** + * @brief Get a string_view from an export_entry (delegates to the name field). + */ +inline std::string_view get_string_view(const export_entry& e) { + return get_string_view(e.name); +} + /** * @brief Parse CommonJS source code and extract export information. * diff --git a/src/parser.cpp b/src/parser.cpp index 86e6503..a8ee66f 100644 --- a/src/parser.cpp +++ b/src/parser.cpp @@ -325,6 +325,8 @@ class CJSLexer { uint16_t openTokenDepth; uint16_t templateDepth; + uint32_t line_; + bool lastSlashWasDivision; bool nextBraceIsClass; @@ -335,8 +337,13 @@ class CJSLexer { StarExportBinding* starExportStack; const StarExportBinding* STAR_EXPORT_STACK_END; - std::vector& exports; - std::vector& re_exports; + std::vector& exports; + std::vector& re_exports; + + void countNewline(char ch) { + if (ch == '\n') ++line_; + else if (ch == '\r' && (pos + 1 >= end || *(pos + 1) != '\n')) ++line_; + } // Character classification helpers using lookup tables static bool isBr(char c) { @@ -495,6 +502,8 @@ class CJSLexer { return ch; } else if (!isBrOrWs(ch)) { return ch; + } else { + countNewline(ch); } } while (pos++ < end); return ch; @@ -503,8 +512,10 @@ class CJSLexer { void lineComment() { while (pos++ < end) { char ch = *pos; - if (ch == '\n' || ch == '\r') + if (ch == '\n' || ch == '\r') { + countNewline(ch); return; + } } } @@ -516,6 +527,7 @@ class CJSLexer { pos++; return; } + countNewline(ch); } } @@ -527,8 +539,13 @@ class CJSLexer { if (ch == '\\') { if (pos + 1 >= end) break; ch = *++pos; - if (ch == '\r' && *(pos + 1) == '\n') - pos++; + if (ch == '\r') { + ++line_; + if (*(pos + 1) == '\n') + pos++; + } else if (ch == '\n') { + ++line_; + } } else if (isBr(ch)) break; } @@ -580,8 +597,12 @@ class CJSLexer { } if (ch == '`') return; - if (ch == '\\' && pos + 1 < end) + if (ch == '\\' && pos + 1 < end) { pos++; + countNewline(*pos); + } else { + countNewline(ch); + } } syntaxError(lexer_error::UNTERMINATED_TEMPLATE_STRING); } @@ -614,7 +635,7 @@ class CJSLexer { #endif } - void addExport(std::string_view export_name) { + void addExport(std::string_view export_name, uint32_t at_line) { // Skip surrounding quotes if present if (!export_name.empty() && (export_name.front() == '\'' || export_name.front() == '"')) { export_name.remove_prefix(1); @@ -625,11 +646,11 @@ class CJSLexer { if (!needsUnescaping(export_name)) { // Check if this export already exists (avoid duplicates) for (const auto& existing : exports) { - if (get_string_view(existing) == export_name) { + if (get_string_view(existing.name) == export_name) { return; // Already exists, skip } } - exports.push_back(export_name); + exports.push_back(export_entry{export_name, at_line}); return; } @@ -644,14 +665,14 @@ class CJSLexer { // Check if this export already exists (avoid duplicates) for (const auto& existing : exports) { - if (get_string_view(existing) == name) { + if (get_string_view(existing.name) == name) { return; // Already exists, skip } } - exports.push_back(std::move(unescaped.value())); + exports.push_back(export_entry{std::move(unescaped.value()), at_line}); } - void addReexport(std::string_view reexport_name) { + void addReexport(std::string_view reexport_name, uint32_t at_line) { // Skip surrounding quotes if present if (!reexport_name.empty() && (reexport_name.front() == '\'' || reexport_name.front() == '"')) { reexport_name.remove_prefix(1); @@ -660,7 +681,7 @@ class CJSLexer { // Fast path: no escaping needed, use string_view directly if (!needsUnescaping(reexport_name)) { - re_exports.push_back(reexport_name); + re_exports.push_back(export_entry{reexport_name, at_line}); return; } @@ -670,7 +691,7 @@ class CJSLexer { return; // Skip invalid escape sequences } - re_exports.push_back(std::move(unescaped.value())); + re_exports.push_back(export_entry{std::move(unescaped.value()), at_line}); } bool readExportsOrModuleDotExports(char ch) { @@ -712,7 +733,7 @@ class CJSLexer { switch (requireType) { case RequireType::ExportStar: case RequireType::ExportAssign: - addReexport(std::string_view(reexportStart, reexportEnd - reexportStart)); + addReexport(std::string_view(reexportStart, reexportEnd - reexportStart), line_); return true; default: if (starExportStack < STAR_EXPORT_STACK_END) { @@ -773,7 +794,7 @@ class CJSLexer { return; } } - addExport(std::string_view(startPos, endPos - startPos)); + addExport(std::string_view(startPos, endPos - startPos), line_); } else if (ch == '\'' || ch == '"') { const char* start = pos; stringLiteral(ch); @@ -786,7 +807,7 @@ class CJSLexer { pos = revertPos; return; } - addExport(std::string_view(start, end_pos - start)); + addExport(std::string_view(start, end_pos - start), line_); } } else if (ch == '.' && matchesAt(pos + 1, end, "..")) { pos += 3; @@ -825,7 +846,7 @@ class CJSLexer { const char* endPos = pos; ch = commentWhitespace(); if (ch == '=') { - addExport(std::string_view(startPos, endPos - startPos)); + addExport(std::string_view(startPos, endPos - startPos), line_); return; } } @@ -843,7 +864,7 @@ class CJSLexer { pos++; ch = commentWhitespace(); if (ch != '=') break; - addExport(std::string_view(startPos, endPos - startPos)); + addExport(std::string_view(startPos, endPos - startPos), line_); } break; } @@ -974,7 +995,7 @@ class CJSLexer { ch = commentWhitespace(); if (ch != ':') break; if (exportStart && exportEnd) - addExport(std::string_view(exportStart, exportEnd - exportStart)); + addExport(std::string_view(exportStart, exportEnd - exportStart), line_); pos = revertPos; return; } else if (ch == 'g') { @@ -1042,7 +1063,7 @@ class CJSLexer { ch = commentWhitespace(); if (ch != ')') break; if (exportStart && exportEnd) - addExport(std::string_view(exportStart, exportEnd - exportStart)); + addExport(std::string_view(exportStart, exportEnd - exportStart), line_); return; } break; @@ -1406,7 +1427,7 @@ class CJSLexer { StarExportBinding* curCheckBinding = &starExportStack_[0]; while (curCheckBinding != starExportStack) { if (curCheckBinding->id == id) { - addReexport(curCheckBinding->specifier); + addReexport(curCheckBinding->specifier, line_); pos = revertPos; return; } @@ -1506,9 +1527,10 @@ class CJSLexer { } public: - CJSLexer(std::vector& out_exports, std::vector& out_re_exports) + CJSLexer(std::vector& out_exports, std::vector& out_re_exports) : source(nullptr), pos(nullptr), end(nullptr), lastTokenPos(nullptr), templateStackDepth(0), openTokenDepth(0), templateDepth(0), + line_(1), lastSlashWasDivision(false), nextBraceIsClass(false), templateStack_{}, openTokenPosStack_{}, openClassPosStack{}, starExportStack_{}, starExportStack(nullptr), STAR_EXPORT_STACK_END(nullptr), @@ -1525,6 +1547,7 @@ class CJSLexer { templateStackDepth = 0; openTokenDepth = 0; templateDepth = std::numeric_limits::max(); + line_ = 1; lastSlashWasDivision = false; starExportStack = &starExportStack_[0]; STAR_EXPORT_STACK_END = &starExportStack_[MAX_STAR_EXPORTS - 1]; @@ -1549,8 +1572,10 @@ class CJSLexer { while (pos++ < end) { ch = *pos; - if (ch == ' ' || (ch < 14 && ch > 8)) + if (ch == ' ' || (ch < 14 && ch > 8)) { + countNewline(ch); continue; + } if (openTokenDepth == 0) { switch (ch) { diff --git a/tests/real_world_tests.cpp b/tests/real_world_tests.cpp index 61f0c43..795019e 100644 --- a/tests/real_world_tests.cpp +++ b/tests/real_world_tests.cpp @@ -1132,3 +1132,58 @@ TEST(real_world_tests, exports_shorthand_syntax) { ASSERT_EQ(lexer::get_string_view(result->exports[2]), "c"); SUCCEED(); } + +TEST(real_world_tests, line_numbers_lf) { + auto result = lexer::parse_commonjs( + "// line 1\n" + "exports.a = 1;\n" + "\n" + "exports.b = 2;\n" + ); + ASSERT_TRUE(result.has_value()); + ASSERT_EQ(result->exports.size(), 2); + ASSERT_EQ(lexer::get_string_view(result->exports[0]), "a"); + ASSERT_EQ(result->exports[0].line, 2); + ASSERT_EQ(lexer::get_string_view(result->exports[1]), "b"); + ASSERT_EQ(result->exports[1].line, 4); +} + +TEST(real_world_tests, line_numbers_crlf) { + auto result = lexer::parse_commonjs( + "// line 1\r\n" + "exports.x = 1;\r\n" + "\r\n" + "exports.y = 2;\r\n" + ); + ASSERT_TRUE(result.has_value()); + ASSERT_EQ(result->exports.size(), 2); + ASSERT_EQ(lexer::get_string_view(result->exports[0]), "x"); + ASSERT_EQ(result->exports[0].line, 2); + ASSERT_EQ(lexer::get_string_view(result->exports[1]), "y"); + ASSERT_EQ(result->exports[1].line, 4); +} + +TEST(real_world_tests, line_numbers_reexports) { + auto result = lexer::parse_commonjs( + "// line 1\n" + "module.exports = require('dep1');\n" + ); + ASSERT_TRUE(result.has_value()); + ASSERT_EQ(result->re_exports.size(), 1); + ASSERT_EQ(lexer::get_string_view(result->re_exports[0]), "dep1"); + ASSERT_EQ(result->re_exports[0].line, 2); +} + +TEST(real_world_tests, line_numbers_after_block_comment) { + auto result = lexer::parse_commonjs( + "/*\n" + " * multi-line\n" + " * comment\n" + " */\n" + "exports.after_comment = 1;\n" + ); + ASSERT_TRUE(result.has_value()); + ASSERT_EQ(result->exports.size(), 1); + ASSERT_EQ(lexer::get_string_view(result->exports[0]), "after_comment"); + ASSERT_EQ(result->exports[0].line, 5); +} From 749848633637c9a279ded48f6ab17c6cf5296e1d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Guilherme=20Ara=C3=BAjo?= Date: Fri, 13 Feb 2026 15:06:35 -0300 Subject: [PATCH 2/4] build: add cstdint header --- include/merve/parser.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/merve/parser.h b/include/merve/parser.h index 26ea2f3..8d1ff42 100644 --- a/include/merve/parser.h +++ b/include/merve/parser.h @@ -3,6 +3,7 @@ #include "merve/version.h" +#include #include #include #include From addc57f8ee69125e69e8b79696a3dca0e79df089 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Guilherme=20Ara=C3=BAjo?= Date: Fri, 13 Feb 2026 16:10:44 -0300 Subject: [PATCH 3/4] refactor: remove variable underscore --- src/parser.cpp | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/src/parser.cpp b/src/parser.cpp index a8ee66f..2d4c79a 100644 --- a/src/parser.cpp +++ b/src/parser.cpp @@ -325,7 +325,7 @@ class CJSLexer { uint16_t openTokenDepth; uint16_t templateDepth; - uint32_t line_; + uint32_t line; bool lastSlashWasDivision; bool nextBraceIsClass; @@ -341,8 +341,8 @@ class CJSLexer { std::vector& re_exports; void countNewline(char ch) { - if (ch == '\n') ++line_; - else if (ch == '\r' && (pos + 1 >= end || *(pos + 1) != '\n')) ++line_; + if (ch == '\n') ++line; + else if (ch == '\r' && (pos + 1 >= end || *(pos + 1) != '\n')) ++line; } // Character classification helpers using lookup tables @@ -540,11 +540,11 @@ class CJSLexer { if (pos + 1 >= end) break; ch = *++pos; if (ch == '\r') { - ++line_; + ++line; if (*(pos + 1) == '\n') pos++; } else if (ch == '\n') { - ++line_; + ++line; } } else if (isBr(ch)) break; @@ -733,7 +733,7 @@ class CJSLexer { switch (requireType) { case RequireType::ExportStar: case RequireType::ExportAssign: - addReexport(std::string_view(reexportStart, reexportEnd - reexportStart), line_); + addReexport(std::string_view(reexportStart, reexportEnd - reexportStart), line); return true; default: if (starExportStack < STAR_EXPORT_STACK_END) { @@ -794,7 +794,7 @@ class CJSLexer { return; } } - addExport(std::string_view(startPos, endPos - startPos), line_); + addExport(std::string_view(startPos, endPos - startPos), line); } else if (ch == '\'' || ch == '"') { const char* start = pos; stringLiteral(ch); @@ -807,7 +807,7 @@ class CJSLexer { pos = revertPos; return; } - addExport(std::string_view(start, end_pos - start), line_); + addExport(std::string_view(start, end_pos - start), line); } } else if (ch == '.' && matchesAt(pos + 1, end, "..")) { pos += 3; @@ -846,7 +846,7 @@ class CJSLexer { const char* endPos = pos; ch = commentWhitespace(); if (ch == '=') { - addExport(std::string_view(startPos, endPos - startPos), line_); + addExport(std::string_view(startPos, endPos - startPos), line); return; } } @@ -864,7 +864,7 @@ class CJSLexer { pos++; ch = commentWhitespace(); if (ch != '=') break; - addExport(std::string_view(startPos, endPos - startPos), line_); + addExport(std::string_view(startPos, endPos - startPos), line); } break; } @@ -995,7 +995,7 @@ class CJSLexer { ch = commentWhitespace(); if (ch != ':') break; if (exportStart && exportEnd) - addExport(std::string_view(exportStart, exportEnd - exportStart), line_); + addExport(std::string_view(exportStart, exportEnd - exportStart), line); pos = revertPos; return; } else if (ch == 'g') { @@ -1063,7 +1063,7 @@ class CJSLexer { ch = commentWhitespace(); if (ch != ')') break; if (exportStart && exportEnd) - addExport(std::string_view(exportStart, exportEnd - exportStart), line_); + addExport(std::string_view(exportStart, exportEnd - exportStart), line); return; } break; @@ -1427,7 +1427,7 @@ class CJSLexer { StarExportBinding* curCheckBinding = &starExportStack_[0]; while (curCheckBinding != starExportStack) { if (curCheckBinding->id == id) { - addReexport(curCheckBinding->specifier, line_); + addReexport(curCheckBinding->specifier, line); pos = revertPos; return; } @@ -1530,7 +1530,7 @@ class CJSLexer { CJSLexer(std::vector& out_exports, std::vector& out_re_exports) : source(nullptr), pos(nullptr), end(nullptr), lastTokenPos(nullptr), templateStackDepth(0), openTokenDepth(0), templateDepth(0), - line_(1), + line(1), lastSlashWasDivision(false), nextBraceIsClass(false), templateStack_{}, openTokenPosStack_{}, openClassPosStack{}, starExportStack_{}, starExportStack(nullptr), STAR_EXPORT_STACK_END(nullptr), @@ -1547,7 +1547,7 @@ class CJSLexer { templateStackDepth = 0; openTokenDepth = 0; templateDepth = std::numeric_limits::max(); - line_ = 1; + line = 1; lastSlashWasDivision = false; starExportStack = &starExportStack_[0]; STAR_EXPORT_STACK_END = &starExportStack_[MAX_STAR_EXPORTS - 1]; From 914abcdee916d11d2b330b34efd36fb9d5afc2ad Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Guilherme=20Ara=C3=BAjo?= Date: Fri, 13 Feb 2026 16:27:14 -0300 Subject: [PATCH 4/4] refactor: review --- README.md | 2 +- src/parser.cpp | 7 +++++-- tests/real_world_tests.cpp | 22 ++++++++++++++++++++++ 3 files changed, 28 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 05a8005..b74ecb0 100644 --- a/README.md +++ b/README.md @@ -120,7 +120,7 @@ inline std::string_view get_string_view(const export_string& s); inline std::string_view get_string_view(const export_entry& e); ``` -Helper functions to get a `string_view` from an `export_string` or `export_entry`. +Helper function to get a `string_view` from an `export_string` or `export_entry`. ### `lexer::get_last_error` diff --git a/src/parser.cpp b/src/parser.cpp index 2d4c79a..a899b53 100644 --- a/src/parser.cpp +++ b/src/parser.cpp @@ -340,9 +340,12 @@ class CJSLexer { std::vector& exports; std::vector& re_exports; + // Increments `line` when consuming a line terminator. + // - Counts '\n' as a newline. + // - Counts '\r' as a newline only when it is not part of a CRLF sequence. + // (i.e., the next character is not '\n' or we're at end-of-input.) void countNewline(char ch) { - if (ch == '\n') ++line; - else if (ch == '\r' && (pos + 1 >= end || *(pos + 1) != '\n')) ++line; + line += (ch == '\n') || (ch == '\r' && (pos + 1 >= end || *(pos + 1) != '\n')); } // Character classification helpers using lookup tables diff --git a/tests/real_world_tests.cpp b/tests/real_world_tests.cpp index 795019e..08e54f4 100644 --- a/tests/real_world_tests.cpp +++ b/tests/real_world_tests.cpp @@ -1163,6 +1163,28 @@ TEST(real_world_tests, line_numbers_crlf) { ASSERT_EQ(result->exports[1].line, 4); } +TEST(real_world_tests, line_numbers_lfcr) { + auto result = lexer::parse_commonjs( + "// line 1\n\r" + "exports.z = 1;\n" + ); + ASSERT_TRUE(result.has_value()); + ASSERT_EQ(result->exports.size(), 1); + ASSERT_EQ(lexer::get_string_view(result->exports[0]), "z"); + ASSERT_EQ(result->exports[0].line, 3); +} + +TEST(real_world_tests, line_numbers_cr) { + auto result = lexer::parse_commonjs( + "// line 1\r" + "exports.w = 1;\n" + ); + ASSERT_TRUE(result.has_value()); + ASSERT_EQ(result->exports.size(), 1); + ASSERT_EQ(lexer::get_string_view(result->exports[0]), "w"); + ASSERT_EQ(result->exports[0].line, 2); +} + TEST(real_world_tests, line_numbers_reexports) { auto result = lexer::parse_commonjs( "// line 1\n"