diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..ae756a3 --- /dev/null +++ b/.gitattributes @@ -0,0 +1 @@ +*.gguf filter=lfs diff=lfs merge=lfs -text diff --git a/AGENTS.md b/AGENTS.md index 2fe44b4..5b428d2 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -162,9 +162,11 @@ ALWAYS preserve or improve performance when porting logic from the reference implementation. ALWAYS implement equivalent functionality natively without external llama.cpp or ggml linkage. -NEVER link "emel" against llama.cpp or ggml outside `tools/bench`. -ALWAYS link llama.cpp and ggml together with emel in `tools/bench` only. +NEVER link "emel" against llama.cpp or ggml outside `tools/bench` or +`tools/paritychecker`. +ALWAYS link llama.cpp and ggml together with emel in `tools/bench` and +`tools/paritychecker` only. NEVER use `llama_` or `ggml_` prefixes in identifiers, symbols, files, or APIs -outside `tools/bench`. +outside `tools/bench` or `tools/paritychecker`. ALWAYS use `emel_` or `EMEL_` prefixes for project-owned identifiers, symbols, files, and APIs. 
diff --git a/CMakeLists.txt b/CMakeLists.txt index cf736e3..93b1ace 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -104,6 +104,11 @@ if(EMEL_ENABLE_TESTS) tests/tokenizer/preprocessor_rwkv_tests.cpp tests/tokenizer/preprocessor_plamo2_tests.cpp tests/tokenizer/preprocessor_fallback_tests.cpp + tests/tokenizer/bpe_regex_tests.cpp + tests/tokenizer/bpe_split_tests.cpp + tests/tokenizer/tokenizer_tests.cpp + tests/tokenizer/tokenizer_parity_tests.cpp + tests/tokenizer/tokenizer_action_guard_tests.cpp tests/batch/splitter_tests.cpp tests/batch/splitter_actions_tests.cpp tests/batch/splitter_additional_tests.cpp diff --git a/docs/architecture/mermaid/tokenizer.mmd b/docs/architecture/mermaid/tokenizer.mmd index 6d2aa32..ff832f4 100644 --- a/docs/architecture/mermaid/tokenizer.mmd +++ b/docs/architecture/mermaid/tokenizer.mmd @@ -1,25 +1,34 @@ stateDiagram-v2 direction TB - [*] --> initialized - initialized --> building_special_tokens : tokenize [can_tokenize_] / begin_tokenize_ - initialized --> errored : tokenize [always] / reject_invalid_ - done --> building_special_tokens : tokenize [can_tokenize_] / begin_tokenize_ + [*] --> uninitialized + uninitialized --> binding_preprocessor : bind [can_bind_] / begin_bind_ + uninitialized --> errored : bind [always] / reject_bind_ + uninitialized --> errored : tokenize [always] / reject_invalid_ + binding_preprocessor --> binding_preprocessor_decision : [always] / bind_preprocessor_ + binding_preprocessor_decision --> errored : [phase_failed_] / none + binding_preprocessor_decision --> binding_encoder : [phase_ok_] / none + binding_encoder --> binding_encoder_decision : [always] / bind_encoder_ + binding_encoder_decision --> errored : [phase_failed_] / none + binding_encoder_decision --> idle : [phase_ok_] / none + idle --> binding_preprocessor : bind [can_bind_] / begin_bind_ + idle --> errored : bind [always] / reject_bind_ + idle --> preprocessing : tokenize [can_tokenize_] / begin_tokenize_ + idle --> errored : 
tokenize [always] / reject_invalid_ + done --> binding_preprocessor : bind [can_bind_] / begin_bind_ + done --> errored : bind [always] / reject_bind_ + done --> preprocessing : tokenize [can_tokenize_] / begin_tokenize_ done --> errored : tokenize [always] / reject_invalid_ - errored --> building_special_tokens : tokenize [can_tokenize_] / begin_tokenize_ + errored --> binding_preprocessor : bind [can_bind_] / begin_bind_ + errored --> errored : bind [always] / reject_bind_ + errored --> preprocessing : tokenize [can_tokenize_] / begin_tokenize_ errored --> errored : tokenize [always] / reject_invalid_ - unexpected --> building_special_tokens : tokenize [can_tokenize_] / begin_tokenize_ + unexpected --> binding_preprocessor : bind [can_bind_] / begin_bind_ + unexpected --> unexpected : bind [always] / reject_bind_ + unexpected --> preprocessing : tokenize [can_tokenize_] / begin_tokenize_ unexpected --> unexpected : tokenize [always] / reject_invalid_ - building_special_tokens --> special_tokens_decision : [always] / build_special_tokens_ - special_tokens_decision --> errored : [phase_failed_] / none - special_tokens_decision --> partitioning_with_specials : [has_special_tokens_] / none - special_tokens_decision --> partitioning_raw : [no_special_tokens_] / none - partitioning_raw --> partitioning_decision : [always] / partition_raw_ - partitioning_with_specials --> partitioning_decision : [always] / partition_with_specials_ - partitioning_decision --> errored : [phase_failed_] / none - partitioning_decision --> selecting_backend : [phase_ok_] / none - selecting_backend --> selecting_backend_decision : [always] / select_backend_ - selecting_backend_decision --> errored : [phase_failed_] / none - selecting_backend_decision --> prefix_decision : [phase_ok_] / none + preprocessing --> preprocess_decision : [always] / run_preprocess_ + preprocess_decision --> errored : [phase_failed_] / none + preprocess_decision --> prefix_decision : [phase_ok_] / none 
prefix_decision --> encoding_ready : [bos_ready_] / append_bos_ prefix_decision --> errored : [bos_no_capacity_] / set_capacity_error_ prefix_decision --> errored : [bos_invalid_id_] / set_invalid_id_error_ @@ -40,14 +49,14 @@ stateDiagram-v2 suffix_decision --> errored : [eos_invalid_id_] / set_invalid_id_error_ suffix_decision --> finalizing : [no_suffix_] / none finalizing --> done : [always] / finalize_ - initialized --> unexpected : _ [always] / on_unexpected_ - building_special_tokens --> unexpected : _ [always] / on_unexpected_ - special_tokens_decision --> unexpected : _ [always] / on_unexpected_ - partitioning_raw --> unexpected : _ [always] / on_unexpected_ - partitioning_with_specials --> unexpected : _ [always] / on_unexpected_ - partitioning_decision --> unexpected : _ [always] / on_unexpected_ - selecting_backend --> unexpected : _ [always] / on_unexpected_ - selecting_backend_decision --> unexpected : _ [always] / on_unexpected_ + uninitialized --> unexpected : _ [always] / on_unexpected_ + binding_preprocessor --> unexpected : _ [always] / on_unexpected_ + binding_preprocessor_decision --> unexpected : _ [always] / on_unexpected_ + binding_encoder --> unexpected : _ [always] / on_unexpected_ + binding_encoder_decision --> unexpected : _ [always] / on_unexpected_ + idle --> unexpected : _ [always] / on_unexpected_ + preprocessing --> unexpected : _ [always] / on_unexpected_ + preprocess_decision --> unexpected : _ [always] / on_unexpected_ prefix_decision --> unexpected : _ [always] / on_unexpected_ encoding_ready --> unexpected : _ [always] / on_unexpected_ encoding_token_fragment --> unexpected : _ [always] / on_unexpected_ diff --git a/docs/architecture/mermaid/tokenizer_preprocessor_fallback.mmd b/docs/architecture/mermaid/tokenizer_preprocessor_fallback.mmd index 9593f33..f4c1f8a 100644 --- a/docs/architecture/mermaid/tokenizer_preprocessor_fallback.mmd +++ b/docs/architecture/mermaid/tokenizer_preprocessor_fallback.mmd @@ -3,12 +3,7 @@ 
stateDiagram-v2 [*] --> idle idle --> preparing : preprocess [valid_request_] / begin_preprocess_ idle --> errored : preprocess [invalid_request_] / reject_invalid_ - preparing --> partitioning_select : [always] / build_specials_ - partitioning_select --> partitioning_bpe_no_specials : [bpe_no_specials_] / none - partitioning_select --> partitioning_bpe_with_specials : [bpe_with_specials_] / none - partitioning_select --> partitioning_non_bpe : [not_bpe_] / none - partitioning_bpe_no_specials --> partition_decision : [always] / partition_bpe_no_specials_ - partitioning_bpe_with_specials --> partition_decision : [always] / partition_bpe_with_specials_ + preparing --> partitioning_non_bpe : [always] / build_specials_ partitioning_non_bpe --> partition_decision : [always] / partition_non_bpe_ partition_decision --> errored : [phase_failed_] / ensure_last_error_ partition_decision --> done : [phase_ok_] / mark_done_ @@ -20,9 +15,6 @@ stateDiagram-v2 unexpected --> errored : preprocess [invalid_request_] / reject_invalid_ idle --> unexpected : _ [always] / on_unexpected_ preparing --> unexpected : _ [always] / on_unexpected_ - partitioning_select --> unexpected : _ [always] / on_unexpected_ - partitioning_bpe_no_specials --> unexpected : _ [always] / on_unexpected_ - partitioning_bpe_with_specials --> unexpected : _ [always] / on_unexpected_ partitioning_non_bpe --> unexpected : _ [always] / on_unexpected_ partition_decision --> unexpected : _ [always] / on_unexpected_ done --> unexpected : _ [always] / on_unexpected_ diff --git a/docs/architecture/tokenizer.md b/docs/architecture/tokenizer.md index ba69f56..59afd3c 100644 --- a/docs/architecture/tokenizer.md +++ b/docs/architecture/tokenizer.md @@ -7,26 +7,35 @@ Source: [`emel/tokenizer/sm.hpp`](https://github.com/stateforward/emel.cpp/blob/ ```mermaid stateDiagram-v2 direction TB - [*] --> initialized - initialized --> building_special_tokens : tokenize [can_tokenize_] / begin_tokenize_ - initialized --> errored : 
tokenize [always] / reject_invalid_ - done --> building_special_tokens : tokenize [can_tokenize_] / begin_tokenize_ + [*] --> uninitialized + uninitialized --> binding_preprocessor : bind [can_bind_] / begin_bind_ + uninitialized --> errored : bind [always] / reject_bind_ + uninitialized --> errored : tokenize [always] / reject_invalid_ + binding_preprocessor --> binding_preprocessor_decision : [always] / bind_preprocessor_ + binding_preprocessor_decision --> errored : [phase_failed_] / none + binding_preprocessor_decision --> binding_encoder : [phase_ok_] / none + binding_encoder --> binding_encoder_decision : [always] / bind_encoder_ + binding_encoder_decision --> errored : [phase_failed_] / none + binding_encoder_decision --> idle : [phase_ok_] / none + idle --> binding_preprocessor : bind [can_bind_] / begin_bind_ + idle --> errored : bind [always] / reject_bind_ + idle --> preprocessing : tokenize [can_tokenize_] / begin_tokenize_ + idle --> errored : tokenize [always] / reject_invalid_ + done --> binding_preprocessor : bind [can_bind_] / begin_bind_ + done --> errored : bind [always] / reject_bind_ + done --> preprocessing : tokenize [can_tokenize_] / begin_tokenize_ done --> errored : tokenize [always] / reject_invalid_ - errored --> building_special_tokens : tokenize [can_tokenize_] / begin_tokenize_ + errored --> binding_preprocessor : bind [can_bind_] / begin_bind_ + errored --> errored : bind [always] / reject_bind_ + errored --> preprocessing : tokenize [can_tokenize_] / begin_tokenize_ errored --> errored : tokenize [always] / reject_invalid_ - unexpected --> building_special_tokens : tokenize [can_tokenize_] / begin_tokenize_ + unexpected --> binding_preprocessor : bind [can_bind_] / begin_bind_ + unexpected --> unexpected : bind [always] / reject_bind_ + unexpected --> preprocessing : tokenize [can_tokenize_] / begin_tokenize_ unexpected --> unexpected : tokenize [always] / reject_invalid_ - building_special_tokens --> special_tokens_decision : 
[always] / build_special_tokens_ - special_tokens_decision --> errored : [phase_failed_] / none - special_tokens_decision --> partitioning_with_specials : [has_special_tokens_] / none - special_tokens_decision --> partitioning_raw : [no_special_tokens_] / none - partitioning_raw --> partitioning_decision : [always] / partition_raw_ - partitioning_with_specials --> partitioning_decision : [always] / partition_with_specials_ - partitioning_decision --> errored : [phase_failed_] / none - partitioning_decision --> selecting_backend : [phase_ok_] / none - selecting_backend --> selecting_backend_decision : [always] / select_backend_ - selecting_backend_decision --> errored : [phase_failed_] / none - selecting_backend_decision --> prefix_decision : [phase_ok_] / none + preprocessing --> preprocess_decision : [always] / run_preprocess_ + preprocess_decision --> errored : [phase_failed_] / none + preprocess_decision --> prefix_decision : [phase_ok_] / none prefix_decision --> encoding_ready : [bos_ready_] / append_bos_ prefix_decision --> errored : [bos_no_capacity_] / set_capacity_error_ prefix_decision --> errored : [bos_invalid_id_] / set_invalid_id_error_ @@ -47,14 +56,14 @@ stateDiagram-v2 suffix_decision --> errored : [eos_invalid_id_] / set_invalid_id_error_ suffix_decision --> finalizing : [no_suffix_] / none finalizing --> done : [always] / finalize_ - initialized --> unexpected : _ [always] / on_unexpected_ - building_special_tokens --> unexpected : _ [always] / on_unexpected_ - special_tokens_decision --> unexpected : _ [always] / on_unexpected_ - partitioning_raw --> unexpected : _ [always] / on_unexpected_ - partitioning_with_specials --> unexpected : _ [always] / on_unexpected_ - partitioning_decision --> unexpected : _ [always] / on_unexpected_ - selecting_backend --> unexpected : _ [always] / on_unexpected_ - selecting_backend_decision --> unexpected : _ [always] / on_unexpected_ + uninitialized --> unexpected : _ [always] / on_unexpected_ + 
binding_preprocessor --> unexpected : _ [always] / on_unexpected_ + binding_preprocessor_decision --> unexpected : _ [always] / on_unexpected_ + binding_encoder --> unexpected : _ [always] / on_unexpected_ + binding_encoder_decision --> unexpected : _ [always] / on_unexpected_ + idle --> unexpected : _ [always] / on_unexpected_ + preprocessing --> unexpected : _ [always] / on_unexpected_ + preprocess_decision --> unexpected : _ [always] / on_unexpected_ prefix_decision --> unexpected : _ [always] / on_unexpected_ encoding_ready --> unexpected : _ [always] / on_unexpected_ encoding_token_fragment --> unexpected : _ [always] / on_unexpected_ @@ -71,25 +80,34 @@ stateDiagram-v2 | Source | Event | Guard | Action | Target | | --- | --- | --- | --- | --- | -| [`initialized`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | [`tokenize`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | [`can_tokenize>`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | [`begin_tokenize>`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | [`building_special_tokens`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | -| [`initialized`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | [`tokenize`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | [`always`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | [`reject_invalid>`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | [`errored`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | -| [`done`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | [`tokenize`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | 
[`can_tokenize>`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | [`begin_tokenize>`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | [`building_special_tokens`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | +| [`uninitialized`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | [`bind`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | [`can_bind>`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | [`begin_bind>`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | [`binding_preprocessor`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | +| [`uninitialized`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | [`bind`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | [`always`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | [`reject_bind>`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | [`errored`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | +| [`uninitialized`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | [`tokenize`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | [`always`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | [`reject_invalid>`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | [`errored`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | +| [`binding_preprocessor`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | - | [`always`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | 
[`bind_preprocessor>`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | [`binding_preprocessor_decision`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | +| [`binding_preprocessor_decision`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | - | [`phase_failed>`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | [`none`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | [`errored`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | +| [`binding_preprocessor_decision`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | - | [`phase_ok>`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | [`none`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | [`binding_encoder`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | +| [`binding_encoder`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | - | [`always`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | [`bind_encoder>`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | [`binding_encoder_decision`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | +| [`binding_encoder_decision`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | - | [`phase_failed>`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | [`none`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | [`errored`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | +| [`binding_encoder_decision`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | - | 
[`phase_ok>`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | [`none`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | [`idle`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | +| [`idle`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | [`bind`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | [`can_bind>`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | [`begin_bind>`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | [`binding_preprocessor`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | +| [`idle`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | [`bind`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | [`always`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | [`reject_bind>`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | [`errored`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | +| [`idle`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | [`tokenize`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | [`can_tokenize>`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | [`begin_tokenize>`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | [`preprocessing`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | +| [`idle`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | [`tokenize`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | [`always`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | 
[`reject_invalid>`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | [`errored`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | +| [`done`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | [`bind`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | [`can_bind>`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | [`begin_bind>`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | [`binding_preprocessor`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | +| [`done`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | [`bind`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | [`always`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | [`reject_bind>`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | [`errored`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | +| [`done`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | [`tokenize`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | [`can_tokenize>`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | [`begin_tokenize>`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | [`preprocessing`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | | [`done`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | [`tokenize`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | [`always`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | [`reject_invalid>`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | 
[`errored`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | -| [`errored`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | [`tokenize`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | [`can_tokenize>`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | [`begin_tokenize>`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | [`building_special_tokens`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | +| [`errored`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | [`bind`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | [`can_bind>`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | [`begin_bind>`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | [`binding_preprocessor`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | +| [`errored`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | [`bind`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | [`always`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | [`reject_bind>`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | [`errored`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | +| [`errored`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | [`tokenize`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | [`can_tokenize>`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | [`begin_tokenize>`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | 
[`preprocessing`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | | [`errored`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | [`tokenize`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | [`always`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | [`reject_invalid>`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | [`errored`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | -| [`unexpected`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | [`tokenize`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | [`can_tokenize>`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | [`begin_tokenize>`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | [`building_special_tokens`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | +| [`unexpected`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | [`bind`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | [`can_bind>`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | [`begin_bind>`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | [`binding_preprocessor`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | +| [`unexpected`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | [`bind`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | [`always`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | [`reject_bind>`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | 
[`unexpected`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | +| [`unexpected`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | [`tokenize`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | [`can_tokenize>`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | [`begin_tokenize>`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | [`preprocessing`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | | [`unexpected`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | [`tokenize`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | [`always`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | [`reject_invalid>`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | [`unexpected`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | -| [`building_special_tokens`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | - | [`always`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | [`build_special_tokens>`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | [`special_tokens_decision`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | -| [`special_tokens_decision`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | - | [`phase_failed>`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | [`none`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | [`errored`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | -| [`special_tokens_decision`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | - | 
[`has_special_tokens>`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | [`none`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | [`partitioning_with_specials`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | -| [`special_tokens_decision`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | - | [`no_special_tokens>`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | [`none`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | [`partitioning_raw`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | -| [`partitioning_raw`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | - | [`always`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | [`partition_raw>`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | [`partitioning_decision`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | -| [`partitioning_with_specials`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | - | [`always`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | [`partition_with_specials>`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | [`partitioning_decision`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | -| [`partitioning_decision`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | - | [`phase_failed>`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | [`none`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | [`errored`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | -| 
[`partitioning_decision`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | - | [`phase_ok>`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | [`none`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | [`selecting_backend`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | -| [`selecting_backend`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | - | [`always`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | [`select_backend>`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | [`selecting_backend_decision`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | -| [`selecting_backend_decision`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | - | [`phase_failed>`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | [`none`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | [`errored`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | -| [`selecting_backend_decision`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | - | [`phase_ok>`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | [`none`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | [`prefix_decision`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | +| [`preprocessing`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | - | [`always`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | [`run_preprocess>`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | 
[`preprocess_decision`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | +| [`preprocess_decision`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | - | [`phase_failed>`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | [`none`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | [`errored`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | +| [`preprocess_decision`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | - | [`phase_ok>`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | [`none`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | [`prefix_decision`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | | [`prefix_decision`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | - | [`bos_ready>`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | [`append_bos>`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | [`encoding_ready`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | | [`prefix_decision`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | - | [`bos_no_capacity>`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | [`set_capacity_error>`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | [`errored`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | | [`prefix_decision`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | - | [`bos_invalid_id>`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | [`set_invalid_id_error>`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | 
[`errored`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | @@ -110,14 +128,14 @@ stateDiagram-v2 | [`suffix_decision`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | - | [`eos_invalid_id>`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | [`set_invalid_id_error>`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | [`errored`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | | [`suffix_decision`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | - | [`no_suffix>`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | [`none`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | [`finalizing`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | | [`finalizing`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | - | [`always`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | [`finalize>`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | [`done`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | -| [`initialized`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | [`_`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | [`always`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | [`on_unexpected>`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | [`unexpected`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | -| [`building_special_tokens`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | [`_`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | 
[`always`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | [`on_unexpected>`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | [`unexpected`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | -| [`special_tokens_decision`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | [`_`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | [`always`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | [`on_unexpected>`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | [`unexpected`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | -| [`partitioning_raw`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | [`_`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | [`always`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | [`on_unexpected>`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | [`unexpected`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | -| [`partitioning_with_specials`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | [`_`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | [`always`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | [`on_unexpected>`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | [`unexpected`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | -| [`partitioning_decision`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | [`_`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | 
[`always`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | [`on_unexpected>`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | [`unexpected`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | -| [`selecting_backend`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | [`_`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | [`always`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | [`on_unexpected>`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | [`unexpected`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | -| [`selecting_backend_decision`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | [`_`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | [`always`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | [`on_unexpected>`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | [`unexpected`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | +| [`uninitialized`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | [`_`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | [`always`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | [`on_unexpected>`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | [`unexpected`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | +| [`binding_preprocessor`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | [`_`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | 
[`always`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | [`on_unexpected>`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | [`unexpected`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | +| [`binding_preprocessor_decision`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | [`_`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | [`always`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | [`on_unexpected>`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | [`unexpected`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | +| [`binding_encoder`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | [`_`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | [`always`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | [`on_unexpected>`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | [`unexpected`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | +| [`binding_encoder_decision`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | [`_`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | [`always`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | [`on_unexpected>`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | [`unexpected`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | +| [`idle`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | [`_`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | [`always`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | 
[`on_unexpected>`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | [`unexpected`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | +| [`preprocessing`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | [`_`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | [`always`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | [`on_unexpected>`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | [`unexpected`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | +| [`preprocess_decision`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | [`_`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | [`always`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | [`on_unexpected>`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | [`unexpected`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | | [`prefix_decision`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | [`_`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | [`always`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | [`on_unexpected>`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | [`unexpected`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | | [`encoding_ready`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | [`_`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | [`always`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | [`on_unexpected>`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | 
[`unexpected`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | | [`encoding_token_fragment`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | [`_`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | [`always`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | [`on_unexpected>`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | [`unexpected`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/sm.hpp) | diff --git a/docs/architecture/tokenizer_preprocessor_fallback.md b/docs/architecture/tokenizer_preprocessor_fallback.md index f20fa5c..ccef93c 100644 --- a/docs/architecture/tokenizer_preprocessor_fallback.md +++ b/docs/architecture/tokenizer_preprocessor_fallback.md @@ -10,12 +10,7 @@ stateDiagram-v2 [*] --> idle idle --> preparing : preprocess [valid_request_] / begin_preprocess_ idle --> errored : preprocess [invalid_request_] / reject_invalid_ - preparing --> partitioning_select : [always] / build_specials_ - partitioning_select --> partitioning_bpe_no_specials : [bpe_no_specials_] / none - partitioning_select --> partitioning_bpe_with_specials : [bpe_with_specials_] / none - partitioning_select --> partitioning_non_bpe : [not_bpe_] / none - partitioning_bpe_no_specials --> partition_decision : [always] / partition_bpe_no_specials_ - partitioning_bpe_with_specials --> partition_decision : [always] / partition_bpe_with_specials_ + preparing --> partitioning_non_bpe : [always] / build_specials_ partitioning_non_bpe --> partition_decision : [always] / partition_non_bpe_ partition_decision --> errored : [phase_failed_] / ensure_last_error_ partition_decision --> done : [phase_ok_] / mark_done_ @@ -27,9 +22,6 @@ stateDiagram-v2 unexpected --> errored : preprocess [invalid_request_] / reject_invalid_ idle --> unexpected : _ [always] / on_unexpected_ preparing --> unexpected : _ [always] / 
on_unexpected_ - partitioning_select --> unexpected : _ [always] / on_unexpected_ - partitioning_bpe_no_specials --> unexpected : _ [always] / on_unexpected_ - partitioning_bpe_with_specials --> unexpected : _ [always] / on_unexpected_ partitioning_non_bpe --> unexpected : _ [always] / on_unexpected_ partition_decision --> unexpected : _ [always] / on_unexpected_ done --> unexpected : _ [always] / on_unexpected_ @@ -43,12 +35,7 @@ stateDiagram-v2 | --- | --- | --- | --- | --- | | [`idle`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/preprocessor/fallback/sm.hpp) | [`preprocess`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/preprocessor/fallback/sm.hpp) | [`valid_request>`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/preprocessor/fallback/sm.hpp) | [`begin_preprocess>`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/preprocessor/fallback/sm.hpp) | [`preparing`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/preprocessor/fallback/sm.hpp) | | [`idle`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/preprocessor/fallback/sm.hpp) | [`preprocess`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/preprocessor/fallback/sm.hpp) | [`invalid_request>`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/preprocessor/fallback/sm.hpp) | [`reject_invalid>`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/preprocessor/fallback/sm.hpp) | [`errored`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/preprocessor/fallback/sm.hpp) | -| [`preparing`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/preprocessor/fallback/sm.hpp) | - | [`always`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/preprocessor/fallback/sm.hpp) | 
[`build_specials>`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/preprocessor/fallback/sm.hpp) | [`partitioning_select`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/preprocessor/fallback/sm.hpp) | -| [`partitioning_select`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/preprocessor/fallback/sm.hpp) | - | [`bpe_no_specials>`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/preprocessor/fallback/sm.hpp) | [`none`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/preprocessor/fallback/sm.hpp) | [`partitioning_bpe_no_specials`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/preprocessor/fallback/sm.hpp) | -| [`partitioning_select`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/preprocessor/fallback/sm.hpp) | - | [`bpe_with_specials>`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/preprocessor/fallback/sm.hpp) | [`none`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/preprocessor/fallback/sm.hpp) | [`partitioning_bpe_with_specials`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/preprocessor/fallback/sm.hpp) | -| [`partitioning_select`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/preprocessor/fallback/sm.hpp) | - | [`not_bpe>`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/preprocessor/fallback/sm.hpp) | [`none`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/preprocessor/fallback/sm.hpp) | [`partitioning_non_bpe`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/preprocessor/fallback/sm.hpp) | -| [`partitioning_bpe_no_specials`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/preprocessor/fallback/sm.hpp) | - | [`always`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/preprocessor/fallback/sm.hpp) | 
[`partition_bpe_no_specials>`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/preprocessor/fallback/sm.hpp) | [`partition_decision`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/preprocessor/fallback/sm.hpp) | -| [`partitioning_bpe_with_specials`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/preprocessor/fallback/sm.hpp) | - | [`always`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/preprocessor/fallback/sm.hpp) | [`partition_bpe_with_specials>`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/preprocessor/fallback/sm.hpp) | [`partition_decision`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/preprocessor/fallback/sm.hpp) | +| [`preparing`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/preprocessor/fallback/sm.hpp) | - | [`always`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/preprocessor/fallback/sm.hpp) | [`build_specials>`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/preprocessor/fallback/sm.hpp) | [`partitioning_non_bpe`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/preprocessor/fallback/sm.hpp) | | [`partitioning_non_bpe`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/preprocessor/fallback/sm.hpp) | - | [`always`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/preprocessor/fallback/sm.hpp) | [`partition_non_bpe>`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/preprocessor/fallback/sm.hpp) | [`partition_decision`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/preprocessor/fallback/sm.hpp) | | [`partition_decision`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/preprocessor/fallback/sm.hpp) | - | 
[`phase_failed>`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/preprocessor/fallback/sm.hpp) | [`ensure_last_error>`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/preprocessor/fallback/sm.hpp) | [`errored`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/preprocessor/fallback/sm.hpp) | | [`partition_decision`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/preprocessor/fallback/sm.hpp) | - | [`phase_ok>`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/preprocessor/fallback/sm.hpp) | [`mark_done>`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/preprocessor/fallback/sm.hpp) | [`done`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/preprocessor/fallback/sm.hpp) | @@ -60,9 +47,6 @@ stateDiagram-v2 | [`unexpected`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/preprocessor/fallback/sm.hpp) | [`preprocess`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/preprocessor/fallback/sm.hpp) | [`invalid_request>`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/preprocessor/fallback/sm.hpp) | [`reject_invalid>`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/preprocessor/fallback/sm.hpp) | [`errored`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/preprocessor/fallback/sm.hpp) | | [`idle`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/preprocessor/fallback/sm.hpp) | [`_`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/preprocessor/fallback/sm.hpp) | [`always`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/preprocessor/fallback/sm.hpp) | [`on_unexpected>`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/preprocessor/fallback/sm.hpp) | 
[`unexpected`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/preprocessor/fallback/sm.hpp) | | [`preparing`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/preprocessor/fallback/sm.hpp) | [`_`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/preprocessor/fallback/sm.hpp) | [`always`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/preprocessor/fallback/sm.hpp) | [`on_unexpected>`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/preprocessor/fallback/sm.hpp) | [`unexpected`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/preprocessor/fallback/sm.hpp) | -| [`partitioning_select`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/preprocessor/fallback/sm.hpp) | [`_`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/preprocessor/fallback/sm.hpp) | [`always`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/preprocessor/fallback/sm.hpp) | [`on_unexpected>`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/preprocessor/fallback/sm.hpp) | [`unexpected`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/preprocessor/fallback/sm.hpp) | -| [`partitioning_bpe_no_specials`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/preprocessor/fallback/sm.hpp) | [`_`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/preprocessor/fallback/sm.hpp) | [`always`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/preprocessor/fallback/sm.hpp) | [`on_unexpected>`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/preprocessor/fallback/sm.hpp) | [`unexpected`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/preprocessor/fallback/sm.hpp) | -| 
[`partitioning_bpe_with_specials`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/preprocessor/fallback/sm.hpp) | [`_`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/preprocessor/fallback/sm.hpp) | [`always`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/preprocessor/fallback/sm.hpp) | [`on_unexpected>`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/preprocessor/fallback/sm.hpp) | [`unexpected`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/preprocessor/fallback/sm.hpp) | | [`partitioning_non_bpe`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/preprocessor/fallback/sm.hpp) | [`_`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/preprocessor/fallback/sm.hpp) | [`always`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/preprocessor/fallback/sm.hpp) | [`on_unexpected>`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/preprocessor/fallback/sm.hpp) | [`unexpected`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/preprocessor/fallback/sm.hpp) | | [`partition_decision`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/preprocessor/fallback/sm.hpp) | [`_`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/preprocessor/fallback/sm.hpp) | [`always`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/preprocessor/fallback/sm.hpp) | [`on_unexpected>`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/preprocessor/fallback/sm.hpp) | [`unexpected`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/preprocessor/fallback/sm.hpp) | | [`done`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/preprocessor/fallback/sm.hpp) | [`_`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/preprocessor/fallback/sm.hpp) | 
[`always`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/preprocessor/fallback/sm.hpp) | [`on_unexpected>`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/preprocessor/fallback/sm.hpp) | [`unexpected`](https://github.com/stateforward/emel.cpp/blob/main/src/emel/tokenizer/preprocessor/fallback/sm.hpp) | diff --git a/docs/benchmarks.md b/docs/benchmarks.md index 4a2278b..d72510c 100644 --- a/docs/benchmarks.md +++ b/docs/benchmarks.md @@ -8,26 +8,38 @@ are not. True benchmarks will be end-to-end once the system is complete. | Benchmark | emel.cpp ns/op | llama.cpp ns/op | ratio | | --- | ---: | ---: | ---: | -| `batch/splitter_equal` | 1590.067 | 6551.321 | 0.243x | -| `batch/splitter_seq` | 1494.233 | 2769.658 | 0.540x | -| `batch/splitter_simple` | 764.492 | 2407.667 | 0.318x | -| `buffer/allocator_alloc_graph` | 17.333 | 56.792 | 0.305x | -| `buffer/allocator_full` | 39.583 | 263.475 | 0.150x | -| `buffer/allocator_reserve_n` | 20.758 | 450.979 | 0.046x | -| `jinja/parser_long` | 32224.275 | 50599.421 | 0.637x | -| `jinja/parser_short` | 404.867 | 506.729 | 0.799x | -| `jinja/renderer_long` | 94664.079 | 232153.237 | 0.408x | -| `jinja/renderer_short` | 1574.596 | 3978.121 | 0.396x | -| `memory/coordinator_recurrent_full` | 3873.150 | 5626.817 | 0.688x | -| `tokenizer/preprocessor_bpe_long` | 16275.712 | 16453.733 | 0.989x | -| `tokenizer/preprocessor_bpe_short` | 509.725 | 692.804 | 0.736x | -| `tokenizer/preprocessor_plamo2_long` | 3142.508 | 4691.025 | 0.670x | -| `tokenizer/preprocessor_plamo2_short` | 2429.562 | 3608.113 | 0.673x | -| `tokenizer/preprocessor_rwkv_long` | 3149.004 | 4657.842 | 0.676x | -| `tokenizer/preprocessor_rwkv_short` | 2500.412 | 3560.512 | 0.702x | -| `tokenizer/preprocessor_spm_long` | 3164.762 | 4422.837 | 0.716x | -| `tokenizer/preprocessor_spm_short` | 2489.713 | 3470.771 | 0.717x | -| `tokenizer/preprocessor_ugm_long` | 3222.725 | 4466.550 | 0.722x | -| 
`tokenizer/preprocessor_ugm_short` | 2468.867 | 3528.483 | 0.700x | -| `tokenizer/preprocessor_wpm_long` | 3217.846 | 4422.783 | 0.728x | -| `tokenizer/preprocessor_wpm_short` | 2435.762 | 3464.592 | 0.703x | +| `batch/splitter_equal` | 1626.933 | 6278.408 | 0.259x | +| `batch/splitter_seq` | 1319.379 | 2638.238 | 0.500x | +| `batch/splitter_simple` | 738.408 | 2273.875 | 0.325x | +| `buffer/allocator_alloc_graph` | 16.671 | 55.083 | 0.303x | +| `buffer/allocator_full` | 37.625 | 252.400 | 0.149x | +| `buffer/allocator_reserve_n` | 19.971 | 442.804 | 0.045x | +| `jinja/parser_long` | 30502.542 | 49796.596 | 0.613x | +| `jinja/parser_short` | 388.525 | 491.550 | 0.790x | +| `jinja/renderer_long` | 89658.308 | 227931.921 | 0.393x | +| `jinja/renderer_short` | 1427.583 | 3803.167 | 0.375x | +| `memory/coordinator_recurrent_full` | 3895.246 | 5590.212 | 0.697x | +| `tokenizer/full_bpe_long` | 6621.133 | 7004.667 | 0.945x | +| `tokenizer/full_bpe_short` | 163.496 | 157.471 | 1.038x | +| `tokenizer/full_plamo2_long` | 10211.054 | 10239.642 | 0.997x | +| `tokenizer/full_plamo2_short` | 2205.075 | 1822.450 | 1.210x | +| `tokenizer/full_rwkv_long` | 2418.412 | 2436.733 | 0.992x | +| `tokenizer/full_rwkv_short` | 1854.350 | 2193.179 | 0.846x | +| `tokenizer/full_spm_long` | 9995.317 | 10792.767 | 0.926x | +| `tokenizer/full_spm_short` | 187.167 | 191.354 | 0.978x | +| `tokenizer/full_ugm_long` | 8868.146 | 8974.592 | 0.988x | +| `tokenizer/full_ugm_short` | 1738.117 | 2098.412 | 0.828x | +| `tokenizer/full_wpm_long` | 25314.525 | 25538.029 | 0.991x | +| `tokenizer/full_wpm_short` | 2077.092 | 2376.600 | 0.874x | +| `tokenizer/preprocessor_bpe_long` | 2776.758 | 5373.312 | 0.517x | +| `tokenizer/preprocessor_bpe_short` | 78.850 | 1747.050 | 0.045x | +| `tokenizer/preprocessor_plamo2_long` | 3082.279 | 4788.679 | 0.644x | +| `tokenizer/preprocessor_plamo2_short` | 2386.262 | 3548.504 | 0.672x | +| `tokenizer/preprocessor_rwkv_long` | 2972.246 | 4580.996 | 0.649x | +| 
`tokenizer/preprocessor_rwkv_short` | 2305.317 | 3535.229 | 0.652x | +| `tokenizer/preprocessor_spm_long` | 3046.325 | 4598.229 | 0.662x | +| `tokenizer/preprocessor_spm_short` | 2361.629 | 3762.438 | 0.628x | +| `tokenizer/preprocessor_ugm_long` | 3027.463 | 4692.613 | 0.645x | +| `tokenizer/preprocessor_ugm_short` | 2348.642 | 3552.613 | 0.661x | +| `tokenizer/preprocessor_wpm_long` | 2952.042 | 4562.908 | 0.647x | +| `tokenizer/preprocessor_wpm_short` | 2307.729 | 3534.338 | 0.653x | diff --git a/docs/notes.md b/docs/notes.md new file mode 100644 index 0000000..2ba9397 --- /dev/null +++ b/docs/notes.md @@ -0,0 +1,3 @@ +# Notes + +- The GBNF parser needs a re-evaluation in the future; current behavior is not trusted. diff --git a/docs/plans/tokenizer.plan.md b/docs/plans/tokenizer.plan.md index 8469817..2854e26 100644 --- a/docs/plans/tokenizer.plan.md +++ b/docs/plans/tokenizer.plan.md @@ -15,6 +15,7 @@ explicit approval. - preprocessing is modeled as a component with variant SMs under `tokenizer/preprocessor/*`. - encoding is modeled as encoder SMs under `encoder/*` with a stable `encoder::event::encode` API. - tokenizer is bound to a loaded model/vocab; encoder binding happens at init (no per-request select). +- binding is performed via an explicit tokenizer `bind` event before tokenization. - tokenizer outputs are written only through the request payload (no persistent output buffers). - tokenizer errors are surfaced via `_error` events and error_out fields only. @@ -64,11 +65,13 @@ explicit approval. ## encoding model - encoder binding is done at init from `vocab->tokenizer_model_id` with a fallback encoder. - encoders must treat input fragments as authoritative and never re-run special token parsing. +- encoders accept preprocessed fragments via `event::encode.preprocessed`. - fragment encoding is a bounded RTC loop with explicit capacity checks. - prefix/suffix handling is centralized in the tokenizer (not encoder-specific code paths). 
## data contracts - inputs: `event::tokenize` carries vocab pointer, text view, flags, output buffers, and callbacks. +- inputs: `event::bind` carries the vocab pointer and error output for binding. - preprocessing output: `fragment` array with `raw_text` spans or `token` ids. - outputs: `token_ids_out` and `token_count_out` only; no context-owned output buffers. - errors: `error_out` and `_error` events; context holds only the latest error code. diff --git a/docs/templates/README.md.j2 b/docs/templates/README.md.j2 index 4235891..ba93f81 100644 --- a/docs/templates/README.md.j2 +++ b/docs/templates/README.md.j2 @@ -10,6 +10,14 @@ allocator, and execution pipelines stabilize. This inference engine is being implemented by AI under human engineering and architecture direction. +## Implementation priorities + +1. Architect first, then scaffold cleanly. +2. Port math, instructions, and behavior without mirroring reference control flow. +3. Prove parity against llama.cpp. +4. Match model/tokenizer intent as defined by their creators (transformers). +5. Optimize once correctness is locked. + ## Why EMEL EMEL exists to make inference behavior explicit and verifiable. Instead of ad-hoc control flow, diff --git a/scripts/paritychecker.sh b/scripts/paritychecker.sh new file mode 100755 index 0000000..fd48085 --- /dev/null +++ b/scripts/paritychecker.sh @@ -0,0 +1,25 @@ +#!/usr/bin/env bash +set -euo pipefail + +for tool in cmake ctest ninja zig; do + if ! command -v "$tool" >/dev/null 2>&1; then + echo "error: required tool missing: $tool" >&2 + exit 1 + fi +done + +ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." 
&& pwd)" +BUILD_DIR="$ROOT_DIR/build/paritychecker_zig" +zig_bin="$(command -v zig)" + +cmake -S "$ROOT_DIR/tools/paritychecker" -B "$BUILD_DIR" -G Ninja \ + -DCMAKE_BUILD_TYPE=Release \ + -DCMAKE_C_COMPILER="$zig_bin" \ + -DCMAKE_C_COMPILER_ARG1=cc \ + -DCMAKE_CXX_COMPILER="$zig_bin" \ + -DCMAKE_CXX_COMPILER_ARG1=c++ \ + -DGGML_METAL=OFF \ + -DLLAMA_METAL=OFF + +cmake --build "$BUILD_DIR" --parallel +ctest --test-dir "$BUILD_DIR" --output-on-failure -R paritychecker_tests diff --git a/scripts/quality_gates.sh b/scripts/quality_gates.sh index 6ccaade..80cdef0 100755 --- a/scripts/quality_gates.sh +++ b/scripts/quality_gates.sh @@ -51,6 +51,7 @@ run_step() { run_step build_with_zig "$ROOT_DIR/scripts/build_with_zig.sh" run_step test_with_coverage "$ROOT_DIR/scripts/test_with_coverage.sh" +run_step paritychecker "$ROOT_DIR/scripts/paritychecker.sh" # Temporarily disabled (SML UBSAN issue under asan_ubsan). # TODO: re-enable once stateforward/sml.cpp fix lands. run_step fuzz_smoke "$ROOT_DIR/scripts/fuzz_smoke.sh" diff --git a/snapshots/bench/benchmarks.txt b/snapshots/bench/benchmarks.txt index ccd71eb..a2a9d18 100644 --- a/snapshots/bench/benchmarks.txt +++ b/snapshots/bench/benchmarks.txt @@ -1,25 +1,37 @@ # ref=94b0200a01a753eff5897dab9311f51a7bc1c62f # toolchain=/opt/homebrew/bin/zig -batch/splitter_equal ns_per_op=1791.094 iter=100000 runs=5 -batch/splitter_seq ns_per_op=1814.758 iter=100000 runs=5 -batch/splitter_simple ns_per_op=847.215 iter=100000 runs=5 -buffer/allocator_alloc_graph ns_per_op=17.584 iter=100000 runs=5 -buffer/allocator_full ns_per_op=39.628 iter=100000 runs=5 -buffer/allocator_reserve_n ns_per_op=21.275 iter=100000 runs=5 -jinja/parser_long ns_per_op=30916.535 iter=100000 runs=5 -jinja/parser_short ns_per_op=399.071 iter=100000 runs=5 -jinja/renderer_long ns_per_op=92280.401 iter=100000 runs=5 -jinja/renderer_short ns_per_op=1440.573 iter=100000 runs=5 -memory/coordinator_recurrent_full ns_per_op=3889.142 iter=100000 runs=5 
-tokenizer/preprocessor_bpe_long ns_per_op=15881.052 iter=100000 runs=5 -tokenizer/preprocessor_bpe_short ns_per_op=477.209 iter=100000 runs=5 -tokenizer/preprocessor_plamo2_long ns_per_op=3014.551 iter=100000 runs=5 -tokenizer/preprocessor_plamo2_short ns_per_op=2324.218 iter=100000 runs=5 -tokenizer/preprocessor_rwkv_long ns_per_op=3011.764 iter=100000 runs=5 -tokenizer/preprocessor_rwkv_short ns_per_op=2359.773 iter=100000 runs=5 -tokenizer/preprocessor_spm_long ns_per_op=3008.138 iter=100000 runs=5 -tokenizer/preprocessor_spm_short ns_per_op=2446.157 iter=100000 runs=5 -tokenizer/preprocessor_ugm_long ns_per_op=3132.069 iter=100000 runs=5 -tokenizer/preprocessor_ugm_short ns_per_op=2317.281 iter=100000 runs=5 -tokenizer/preprocessor_wpm_long ns_per_op=2989.513 iter=100000 runs=5 -tokenizer/preprocessor_wpm_short ns_per_op=2316.050 iter=100000 runs=5 +batch/splitter_equal ns_per_op=1623.433 iter=10000 runs=3 +batch/splitter_seq ns_per_op=1459.129 iter=10000 runs=3 +batch/splitter_simple ns_per_op=788.742 iter=10000 runs=3 +buffer/allocator_alloc_graph ns_per_op=17.113 iter=10000 runs=3 +buffer/allocator_full ns_per_op=40.817 iter=10000 runs=3 +buffer/allocator_reserve_n ns_per_op=20.837 iter=10000 runs=3 +jinja/parser_long ns_per_op=31673.517 iter=10000 runs=3 +jinja/parser_short ns_per_op=413.546 iter=10000 runs=3 +jinja/renderer_long ns_per_op=93195.971 iter=10000 runs=3 +jinja/renderer_short ns_per_op=1448.917 iter=10000 runs=3 +memory/coordinator_recurrent_full ns_per_op=3781.892 iter=10000 runs=3 +tokenizer/full_bpe_long ns_per_op=7475.504 iter=10000 runs=3 +tokenizer/full_bpe_short ns_per_op=170.971 iter=10000 runs=3 +tokenizer/full_plamo2_long ns_per_op=12447.188 iter=10000 runs=3 +tokenizer/full_plamo2_short ns_per_op=2576.592 iter=10000 runs=3 +tokenizer/full_rwkv_long ns_per_op=3456.438 iter=10000 runs=3 +tokenizer/full_rwkv_short ns_per_op=2176.604 iter=10000 runs=3 +tokenizer/full_spm_long ns_per_op=10159.708 iter=10000 runs=3 
+tokenizer/full_spm_short ns_per_op=208.821 iter=10000 runs=3 +tokenizer/full_ugm_long ns_per_op=9014.188 iter=10000 runs=3 +tokenizer/full_ugm_short ns_per_op=1968.367 iter=10000 runs=3 +tokenizer/full_wpm_long ns_per_op=27093.292 iter=10000 runs=3 +tokenizer/full_wpm_short ns_per_op=2713.950 iter=10000 runs=3 +tokenizer/preprocessor_bpe_long ns_per_op=2884.775 iter=10000 runs=3 +tokenizer/preprocessor_bpe_short ns_per_op=83.042 iter=10000 runs=3 +tokenizer/preprocessor_plamo2_long ns_per_op=3169.646 iter=10000 runs=3 +tokenizer/preprocessor_plamo2_short ns_per_op=3111.558 iter=10000 runs=3 +tokenizer/preprocessor_rwkv_long ns_per_op=4157.771 iter=10000 runs=3 +tokenizer/preprocessor_rwkv_short ns_per_op=2377.146 iter=10000 runs=3 +tokenizer/preprocessor_spm_long ns_per_op=3097.287 iter=10000 runs=3 +tokenizer/preprocessor_spm_short ns_per_op=2393.150 iter=10000 runs=3 +tokenizer/preprocessor_ugm_long ns_per_op=3135.021 iter=10000 runs=3 +tokenizer/preprocessor_ugm_short ns_per_op=2412.779 iter=10000 runs=3 +tokenizer/preprocessor_wpm_long ns_per_op=3134.192 iter=10000 runs=3 +tokenizer/preprocessor_wpm_short ns_per_op=3999.287 iter=10000 runs=3 diff --git a/snapshots/bench/benchmarks_compare.txt b/snapshots/bench/benchmarks_compare.txt index dfdda65..d29a4fe 100644 --- a/snapshots/bench/benchmarks_compare.txt +++ b/snapshots/bench/benchmarks_compare.txt @@ -1,25 +1,37 @@ # ref=94b0200a01a753eff5897dab9311f51a7bc1c62f # toolchain=/opt/homebrew/bin/zig -batch/splitter_equal emel.cpp 1590.067 ns/op, llama.cpp 6551.321 ns/op, ratio=0.243x -batch/splitter_seq emel.cpp 1494.233 ns/op, llama.cpp 2769.658 ns/op, ratio=0.540x -batch/splitter_simple emel.cpp 764.492 ns/op, llama.cpp 2407.667 ns/op, ratio=0.318x -buffer/allocator_alloc_graph emel.cpp 17.333 ns/op, llama.cpp 56.792 ns/op, ratio=0.305x -buffer/allocator_full emel.cpp 39.583 ns/op, llama.cpp 263.475 ns/op, ratio=0.150x -buffer/allocator_reserve_n emel.cpp 20.758 ns/op, llama.cpp 450.979 ns/op, ratio=0.046x 
-jinja/parser_long emel.cpp 32224.275 ns/op, llama.cpp 50599.421 ns/op, ratio=0.637x -jinja/parser_short emel.cpp 404.867 ns/op, llama.cpp 506.729 ns/op, ratio=0.799x -jinja/renderer_long emel.cpp 94664.079 ns/op, llama.cpp 232153.237 ns/op, ratio=0.408x -jinja/renderer_short emel.cpp 1574.596 ns/op, llama.cpp 3978.121 ns/op, ratio=0.396x -memory/coordinator_recurrent_full emel.cpp 3873.150 ns/op, llama.cpp 5626.817 ns/op, ratio=0.688x -tokenizer/preprocessor_bpe_long emel.cpp 16275.712 ns/op, llama.cpp 16453.733 ns/op, ratio=0.989x -tokenizer/preprocessor_bpe_short emel.cpp 509.725 ns/op, llama.cpp 692.804 ns/op, ratio=0.736x -tokenizer/preprocessor_plamo2_long emel.cpp 3142.508 ns/op, llama.cpp 4691.025 ns/op, ratio=0.670x -tokenizer/preprocessor_plamo2_short emel.cpp 2429.562 ns/op, llama.cpp 3608.113 ns/op, ratio=0.673x -tokenizer/preprocessor_rwkv_long emel.cpp 3149.004 ns/op, llama.cpp 4657.842 ns/op, ratio=0.676x -tokenizer/preprocessor_rwkv_short emel.cpp 2500.412 ns/op, llama.cpp 3560.512 ns/op, ratio=0.702x -tokenizer/preprocessor_spm_long emel.cpp 3164.762 ns/op, llama.cpp 4422.837 ns/op, ratio=0.716x -tokenizer/preprocessor_spm_short emel.cpp 2489.713 ns/op, llama.cpp 3470.771 ns/op, ratio=0.717x -tokenizer/preprocessor_ugm_long emel.cpp 3222.725 ns/op, llama.cpp 4466.550 ns/op, ratio=0.722x -tokenizer/preprocessor_ugm_short emel.cpp 2468.867 ns/op, llama.cpp 3528.483 ns/op, ratio=0.700x -tokenizer/preprocessor_wpm_long emel.cpp 3217.846 ns/op, llama.cpp 4422.783 ns/op, ratio=0.728x -tokenizer/preprocessor_wpm_short emel.cpp 2435.762 ns/op, llama.cpp 3464.592 ns/op, ratio=0.703x +batch/splitter_equal emel.cpp 1626.933 ns/op, llama.cpp 6278.408 ns/op, ratio=0.259x +batch/splitter_seq emel.cpp 1319.379 ns/op, llama.cpp 2638.238 ns/op, ratio=0.500x +batch/splitter_simple emel.cpp 738.408 ns/op, llama.cpp 2273.875 ns/op, ratio=0.325x +buffer/allocator_alloc_graph emel.cpp 16.671 ns/op, llama.cpp 55.083 ns/op, ratio=0.303x +buffer/allocator_full emel.cpp 
37.625 ns/op, llama.cpp 252.400 ns/op, ratio=0.149x +buffer/allocator_reserve_n emel.cpp 19.971 ns/op, llama.cpp 442.804 ns/op, ratio=0.045x +jinja/parser_long emel.cpp 30502.542 ns/op, llama.cpp 49796.596 ns/op, ratio=0.613x +jinja/parser_short emel.cpp 388.525 ns/op, llama.cpp 491.550 ns/op, ratio=0.790x +jinja/renderer_long emel.cpp 89658.308 ns/op, llama.cpp 227931.921 ns/op, ratio=0.393x +jinja/renderer_short emel.cpp 1427.583 ns/op, llama.cpp 3803.167 ns/op, ratio=0.375x +memory/coordinator_recurrent_full emel.cpp 3895.246 ns/op, llama.cpp 5590.212 ns/op, ratio=0.697x +tokenizer/full_bpe_long emel.cpp 6621.133 ns/op, llama.cpp 7004.667 ns/op, ratio=0.945x +tokenizer/full_bpe_short emel.cpp 163.496 ns/op, llama.cpp 157.471 ns/op, ratio=1.038x +tokenizer/full_plamo2_long emel.cpp 10211.054 ns/op, llama.cpp 10239.642 ns/op, ratio=0.997x +tokenizer/full_plamo2_short emel.cpp 2205.075 ns/op, llama.cpp 1822.450 ns/op, ratio=1.210x +tokenizer/full_rwkv_long emel.cpp 2418.412 ns/op, llama.cpp 2436.733 ns/op, ratio=0.992x +tokenizer/full_rwkv_short emel.cpp 1854.350 ns/op, llama.cpp 2193.179 ns/op, ratio=0.846x +tokenizer/full_spm_long emel.cpp 9995.317 ns/op, llama.cpp 10792.767 ns/op, ratio=0.926x +tokenizer/full_spm_short emel.cpp 187.167 ns/op, llama.cpp 191.354 ns/op, ratio=0.978x +tokenizer/full_ugm_long emel.cpp 8868.146 ns/op, llama.cpp 8974.592 ns/op, ratio=0.988x +tokenizer/full_ugm_short emel.cpp 1738.117 ns/op, llama.cpp 2098.412 ns/op, ratio=0.828x +tokenizer/full_wpm_long emel.cpp 25314.525 ns/op, llama.cpp 25538.029 ns/op, ratio=0.991x +tokenizer/full_wpm_short emel.cpp 2077.092 ns/op, llama.cpp 2376.600 ns/op, ratio=0.874x +tokenizer/preprocessor_bpe_long emel.cpp 2776.758 ns/op, llama.cpp 5373.312 ns/op, ratio=0.517x +tokenizer/preprocessor_bpe_short emel.cpp 78.850 ns/op, llama.cpp 1747.050 ns/op, ratio=0.045x +tokenizer/preprocessor_plamo2_long emel.cpp 3082.279 ns/op, llama.cpp 4788.679 ns/op, ratio=0.644x +tokenizer/preprocessor_plamo2_short 
emel.cpp 2386.262 ns/op, llama.cpp 3548.504 ns/op, ratio=0.672x +tokenizer/preprocessor_rwkv_long emel.cpp 2972.246 ns/op, llama.cpp 4580.996 ns/op, ratio=0.649x +tokenizer/preprocessor_rwkv_short emel.cpp 2305.317 ns/op, llama.cpp 3535.229 ns/op, ratio=0.652x +tokenizer/preprocessor_spm_long emel.cpp 3046.325 ns/op, llama.cpp 4598.229 ns/op, ratio=0.662x +tokenizer/preprocessor_spm_short emel.cpp 2361.629 ns/op, llama.cpp 3762.438 ns/op, ratio=0.628x +tokenizer/preprocessor_ugm_long emel.cpp 3027.463 ns/op, llama.cpp 4692.613 ns/op, ratio=0.645x +tokenizer/preprocessor_ugm_short emel.cpp 2348.642 ns/op, llama.cpp 3552.613 ns/op, ratio=0.661x +tokenizer/preprocessor_wpm_long emel.cpp 2952.042 ns/op, llama.cpp 4562.908 ns/op, ratio=0.647x +tokenizer/preprocessor_wpm_short emel.cpp 2307.729 ns/op, llama.cpp 3534.338 ns/op, ratio=0.653x diff --git a/snapshots/lint/clang_format.txt b/snapshots/lint/clang_format.txt index b2de891..2510828 100644 --- a/snapshots/lint/clang_format.txt +++ b/snapshots/lint/clang_format.txt @@ -198,6 +198,7 @@ src/emel/text/unicode.hpp src/emel/text/unicode_data.hpp src/emel/tokenizer/actions.hpp src/emel/tokenizer/bpe/regex.hpp +src/emel/tokenizer/bpe/split.hpp src/emel/tokenizer/context.hpp src/emel/tokenizer/events.hpp src/emel/tokenizer/preprocessor/actions.hpp @@ -326,9 +327,14 @@ tests/tensor/lifetime_analyzer_error_tests.cpp tests/tensor/lifetime_analyzer_sm_error_tests.cpp tests/tensor/lifetime_analyzer_sm_transition_tests.cpp tests/tensor/lifetime_analyzer_tests.cpp +tests/tokenizer/bpe_regex_tests.cpp +tests/tokenizer/bpe_split_tests.cpp tests/tokenizer/preprocessor_fallback_tests.cpp tests/tokenizer/preprocessor_plamo2_tests.cpp tests/tokenizer/preprocessor_rwkv_tests.cpp tests/tokenizer/preprocessor_spm_tests.cpp tests/tokenizer/preprocessor_tests.cpp tests/tokenizer/preprocessor_wpm_tests.cpp +tests/tokenizer/tokenizer_action_guard_tests.cpp +tests/tokenizer/tokenizer_parity_tests.cpp +tests/tokenizer/tokenizer_tests.cpp diff 
--git a/snapshots/quality_gates/timing.txt b/snapshots/quality_gates/timing.txt index d12c6ca..66f0c6a 100644 --- a/snapshots/quality_gates/timing.txt +++ b/snapshots/quality_gates/timing.txt @@ -1,8 +1,7 @@ # quality_gates timing (seconds) -build_with_zig 0 -test_with_coverage 71 -fuzz_smoke 29 +build_with_zig 1 +test_with_coverage 76 +paritychecker 5 +fuzz_smoke 28 lint_snapshot 4 -bench_snapshot 20 -generate_docs 51 -total 175 +total 114 diff --git a/src/emel/encoder/bpe/actions.hpp b/src/emel/encoder/bpe/actions.hpp index 7ceeb3b..6dba525 100644 --- a/src/emel/encoder/bpe/actions.hpp +++ b/src/emel/encoder/bpe/actions.hpp @@ -43,8 +43,6 @@ struct begin_encode { ctx.phase_error = EMEL_OK; ctx.last_error = EMEL_OK; if (emel::encoder::action::detail::sync_vocab(ctx, ev.vocab)) { - ctx.bpe_pre_id = emel::model::data::tokenizer_pre::DEFAULT; - ctx.bpe_regex_exprs.clear(); } if (ev.token_count_out != nullptr) { *ev.token_count_out = 0; diff --git a/src/emel/encoder/bpe/context.hpp b/src/emel/encoder/bpe/context.hpp index 799795e..48e0663 100644 --- a/src/emel/encoder/bpe/context.hpp +++ b/src/emel/encoder/bpe/context.hpp @@ -1,15 +1,10 @@ #pragma once -#include - #include "emel/encoder/context.hpp" #include "emel/model/data.hpp" - namespace emel::encoder::bpe::action { struct context : emel::encoder::action::context { - emel::model::data::tokenizer_pre bpe_pre_id = emel::model::data::tokenizer_pre::DEFAULT; - std::vector bpe_regex_exprs = {}; }; } // namespace emel::encoder::bpe::action diff --git a/src/emel/encoder/bpe/detail.hpp b/src/emel/encoder/bpe/detail.hpp index b414ae0..e651f79 100644 --- a/src/emel/encoder/bpe/detail.hpp +++ b/src/emel/encoder/bpe/detail.hpp @@ -7,112 +7,124 @@ #include "emel/encoder/detail.hpp" #include "emel/encoder/events.hpp" #include "emel/model/data.hpp" -#include "emel/tokenizer/bpe/regex.hpp" +#include "emel/text/unicode.hpp" namespace emel::encoder::bpe::detail { using emel::encoder::detail::encode_result; using 
emel::encoder::detail::k_token_null; -inline void assign_bpe_regex(action::context &ctx, - const emel::model::data::vocab &vocab) { - emel::tokenizer::bpe::detail::assign_bpe_regex(ctx.bpe_pre_id, - ctx.bpe_regex_exprs, vocab); -} +inline bool encode_bpe_word(const event::encode &ev, + emel::encoder::bpe::action::context &ctx, + const emel::model::data::vocab &vocab, + const std::string_view word, + int32_t &count, + encode_result &result) { + if (word.empty()) { + return true; + } + if (vocab.ignore_merges) { + const int32_t token = emel::encoder::detail::lookup_token(ctx, word); + if (token != k_token_null) { + if (!emel::encoder::detail::push_token(ev, token, count)) { + result.error = EMEL_ERR_INVALID_ARGUMENT; + return false; + } + return true; + } + } -inline encode_result encode_bpe(const event::encode &ev, - emel::encoder::bpe::action::context &ctx, - const emel::model::data::vocab &vocab) { - encode_result result{}; - if (ev.text.empty()) { - return result; + if (!emel::encoder::detail::build_symbols(word, ctx.scratch, result)) { + return false; } - emel::encoder::detail::ensure_tables(ctx); - assign_bpe_regex(ctx, vocab); - const std::string text(ev.text); - const auto words = emel::text::unicode_regex_split(text, ctx.bpe_regex_exprs); - int32_t count = 0; - for (const std::string &word : words) { - if (word.empty()) { - continue; - } - if (vocab.ignore_merges) { - const int32_t token = emel::encoder::detail::lookup_token(ctx, word); - if (token != k_token_null) { - if (!emel::encoder::detail::push_token(ev, token, count)) { - result.error = EMEL_ERR_INVALID_ARGUMENT; - return result; - } + for (;;) { + int32_t best_left = -1; + int32_t best_right = -1; + int32_t best_rank = std::numeric_limits::max(); + for (int32_t left = 0; left != -1; + left = ctx.scratch.next[static_cast(left)]) { + const int32_t right = ctx.scratch.next[static_cast(left)]; + if (right < 0) { + break; + } + const size_t left_off = ctx.scratch.offsets[static_cast(left)]; + const 
size_t left_len = ctx.scratch.lengths[static_cast(left)]; + const size_t right_off = ctx.scratch.offsets[static_cast(right)]; + const size_t right_len = ctx.scratch.lengths[static_cast(right)]; + const std::string_view left_view(word.data() + left_off, left_len); + const std::string_view right_view(word.data() + right_off, right_len); + const int32_t rank = + emel::encoder::detail::lookup_merge_rank(ctx, vocab, left_view, right_view); + if (rank == k_token_null) { continue; } + if (rank < best_rank || (rank == best_rank && left < best_left)) { + best_rank = rank; + best_left = left; + best_right = right; + } } - - if (!emel::encoder::detail::build_symbols(word, ctx.scratch, result)) { - return result; + if (best_left < 0 || best_right < 0) { + break; } + emel::encoder::detail::merge_symbols(ctx.scratch, best_left, best_right); + } - for (;;) { - int32_t best_left = -1; - int32_t best_right = -1; - int32_t best_rank = std::numeric_limits::max(); - for (int32_t left = 0; left != -1; left = ctx.scratch.next[static_cast(left)]) { - const int32_t right = ctx.scratch.next[static_cast(left)]; - if (right < 0) { - break; - } - const size_t left_off = ctx.scratch.offsets[static_cast(left)]; - const size_t left_len = ctx.scratch.lengths[static_cast(left)]; - const size_t right_off = ctx.scratch.offsets[static_cast(right)]; - const size_t right_len = ctx.scratch.lengths[static_cast(right)]; - const std::string_view left_view(word.data() + left_off, left_len); - const std::string_view right_view(word.data() + right_off, right_len); - const int32_t rank = - emel::encoder::detail::lookup_merge_rank(ctx, vocab, left_view, right_view); - if (rank == k_token_null) { - continue; - } - if (rank < best_rank || (rank == best_rank && left < best_left)) { - best_rank = rank; - best_left = left; - best_right = right; - } - } - if (best_left < 0 || best_right < 0) { - break; + for (int32_t idx = 0; idx != -1; + idx = ctx.scratch.next[static_cast(idx)]) { + if 
(ctx.scratch.lengths[static_cast(idx)] == 0) { + continue; + } + const size_t sym_off = ctx.scratch.offsets[static_cast(idx)]; + const size_t sym_len = ctx.scratch.lengths[static_cast(idx)]; + const std::string_view symbol(word.data() + sym_off, sym_len); + const int32_t token = emel::encoder::detail::lookup_token(ctx, symbol); + if (token != k_token_null) { + if (!emel::encoder::detail::push_token(ev, token, count)) { + result.error = EMEL_ERR_INVALID_ARGUMENT; + return false; } - emel::encoder::detail::merge_symbols(ctx.scratch, best_left, best_right); + continue; } - - for (int32_t idx = 0; idx != -1; idx = ctx.scratch.next[static_cast(idx)]) { - if (ctx.scratch.lengths[static_cast(idx)] == 0) { - continue; + size_t byte_offset = 0; + while (byte_offset < symbol.size()) { + size_t len = emel::text::unicode_len_utf8(symbol[byte_offset]); + if (byte_offset + len > symbol.size()) { + len = 1; } - const size_t sym_off = ctx.scratch.offsets[static_cast(idx)]; - const size_t sym_len = ctx.scratch.lengths[static_cast(idx)]; - const std::string_view symbol(word.data() + sym_off, sym_len); - const int32_t token = emel::encoder::detail::lookup_token(ctx, symbol); - if (token != k_token_null) { - if (!emel::encoder::detail::push_token(ev, token, count)) { + const std::string_view unit(symbol.data() + byte_offset, len); + const int32_t byte_token = emel::encoder::detail::lookup_token(ctx, unit); + if (byte_token != k_token_null) { + if (!emel::encoder::detail::push_token(ev, byte_token, count)) { result.error = EMEL_ERR_INVALID_ARGUMENT; - return result; - } - continue; - } - for (const unsigned char c : symbol) { - const char byte = static_cast(c); - const int32_t byte_token = - emel::encoder::detail::lookup_token(ctx, std::string_view(&byte, 1)); - if (byte_token != k_token_null) { - if (!emel::encoder::detail::push_token(ev, byte_token, count)) { - result.error = EMEL_ERR_INVALID_ARGUMENT; - return result; - } + return false; } } + byte_offset += len; } } + return true; 
+} + +inline encode_result encode_bpe(const event::encode &ev, + emel::encoder::bpe::action::context &ctx, + const emel::model::data::vocab &vocab) { + encode_result result{}; + if (ev.text.empty()) { + return result; + } + emel::encoder::detail::ensure_tables(ctx); + + int32_t count = 0; + if (!ev.preprocessed) { + result.error = EMEL_ERR_INVALID_ARGUMENT; + return result; + } + if (!encode_bpe_word(ev, ctx, vocab, ev.text, count, result)) { + return result; + } result.token_count = count; result.error = EMEL_OK; return result; diff --git a/src/emel/encoder/events.hpp b/src/emel/encoder/events.hpp index 2ad79f5..29a0ee6 100644 --- a/src/emel/encoder/events.hpp +++ b/src/emel/encoder/events.hpp @@ -17,6 +17,7 @@ namespace emel::encoder::event { struct encode { const emel::model::data::vocab * vocab = nullptr; std::string_view text = {}; + bool preprocessed = false; int32_t * token_ids = nullptr; int32_t token_capacity = 0; int32_t * token_count_out = nullptr; diff --git a/src/emel/encoder/spm/detail.hpp b/src/emel/encoder/spm/detail.hpp index 442b9fe..57e7350 100644 --- a/src/emel/encoder/spm/detail.hpp +++ b/src/emel/encoder/spm/detail.hpp @@ -23,15 +23,55 @@ inline encode_result encode_spm(const event::encode &ev, emel::encoder::detail::ensure_tables(ctx); size_t out_len = 0; - if (vocab.add_space_prefix && ev.text.front() != ' ') { - if (out_len + 1 > ctx.scratch.buffer.size()) { - result.error = EMEL_ERR_INVALID_ARGUMENT; - return result; - } - ctx.scratch.buffer[out_len++] = ' '; - } + const bool add_prefix = vocab.add_space_prefix && !vocab.treat_whitespace_as_suffix; + const bool add_suffix = vocab.add_space_prefix && vocab.treat_whitespace_as_suffix; + const bool escape_spaces = vocab.escape_whitespaces; + bool prefix_inserted = false; for (const char c : ev.text) { + if (add_prefix && !prefix_inserted && c != ' ') { + if (escape_spaces) { + if (out_len + 3 > ctx.scratch.buffer.size()) { + result.error = EMEL_ERR_INVALID_ARGUMENT; + return result; + } + 
ctx.scratch.buffer[out_len++] = '\xE2'; + ctx.scratch.buffer[out_len++] = '\x96'; + ctx.scratch.buffer[out_len++] = '\x81'; + } else { + if (out_len + 1 > ctx.scratch.buffer.size()) { + result.error = EMEL_ERR_INVALID_ARGUMENT; + return result; + } + ctx.scratch.buffer[out_len++] = ' '; + } + prefix_inserted = true; + } if (c == ' ') { + if (escape_spaces) { + if (out_len + 3 > ctx.scratch.buffer.size()) { + result.error = EMEL_ERR_INVALID_ARGUMENT; + return result; + } + ctx.scratch.buffer[out_len++] = '\xE2'; + ctx.scratch.buffer[out_len++] = '\x96'; + ctx.scratch.buffer[out_len++] = '\x81'; + } else { + if (out_len + 1 > ctx.scratch.buffer.size()) { + result.error = EMEL_ERR_INVALID_ARGUMENT; + return result; + } + ctx.scratch.buffer[out_len++] = ' '; + } + } else { + if (out_len + 1 > ctx.scratch.buffer.size()) { + result.error = EMEL_ERR_INVALID_ARGUMENT; + return result; + } + ctx.scratch.buffer[out_len++] = c; + } + } + if (add_suffix) { + if (escape_spaces) { if (out_len + 3 > ctx.scratch.buffer.size()) { result.error = EMEL_ERR_INVALID_ARGUMENT; return result; @@ -44,7 +84,7 @@ inline encode_result encode_spm(const event::encode &ev, result.error = EMEL_ERR_INVALID_ARGUMENT; return result; } - ctx.scratch.buffer[out_len++] = c; + ctx.scratch.buffer[out_len++] = ' '; } } diff --git a/src/emel/gbnf/parser/detail.hpp b/src/emel/gbnf/parser/detail.hpp index decfa97..8cda1a3 100644 --- a/src/emel/gbnf/parser/detail.hpp +++ b/src/emel/gbnf/parser/detail.hpp @@ -123,6 +123,14 @@ struct recursive_descent_parser { emel::gbnf::grammar *grammar = nullptr; symbol_table symbols = {}; uint32_t next_symbol_id = 0; + uint32_t nesting_depth = 0; + static constexpr uint32_t k_max_nesting_depth = 32; + + struct depth_guard { + uint32_t &depth; + explicit depth_guard(uint32_t &depth_in) noexcept : depth(depth_in) { ++depth; } + ~depth_guard() noexcept { --depth; } + }; explicit recursive_descent_parser(action::context &c, emel::gbnf::grammar *out) noexcept @@ -369,6 +377,11 
@@ struct recursive_descent_parser { const std::string_view &rule_name, uint32_t rule_id, bool is_nested) noexcept { + if (nesting_depth >= k_max_nesting_depth) { + ctx.phase_error = EMEL_ERR_PARSE_FAILED; + return nullptr; + } + depth_guard guard(nesting_depth); rule_builder current_rule{}; const char *pos = parse_sequence(src, end, rule_name, current_rule, is_nested); if (!pos) { diff --git a/src/emel/tokenizer/actions.hpp b/src/emel/tokenizer/actions.hpp index 92d8797..01b007c 100644 --- a/src/emel/tokenizer/actions.hpp +++ b/src/emel/tokenizer/actions.hpp @@ -7,20 +7,24 @@ #include "emel/emel.h" #include "emel/encoder/events.hpp" #include "emel/tokenizer/context.hpp" -#include "emel/tokenizer/preprocessor/detail.hpp" +#include "emel/tokenizer/preprocessor/events.hpp" namespace emel::tokenizer::action { inline context::context() : encoder_any() { + preprocessor_any.set_kind(preprocessor_kind::fallback); encoder_any.set_kind(encoder_kind::fallback); + preprocess_kind = preprocessor_kind::fallback; model_kind = encoder_kind::fallback; + is_bound = false; } } // namespace emel::tokenizer::action namespace emel::tokenizer::detail { using action::encoder_kind; +using action::preprocessor_kind; inline encoder_kind encoder_kind_from_model( const emel::model::data::tokenizer_model model) { @@ -44,6 +48,28 @@ inline encoder_kind encoder_kind_from_model( } } +inline preprocessor_kind preprocessor_kind_from_model( + const emel::model::data::tokenizer_model model) { + switch (model) { + case emel::model::data::tokenizer_model::SPM: + return preprocessor_kind::spm; + case emel::model::data::tokenizer_model::BPE: + return preprocessor_kind::bpe; + case emel::model::data::tokenizer_model::WPM: + return preprocessor_kind::wpm; + case emel::model::data::tokenizer_model::UGM: + return preprocessor_kind::ugm; + case emel::model::data::tokenizer_model::RWKV: + return preprocessor_kind::rwkv; + case emel::model::data::tokenizer_model::PLAMO2: + return preprocessor_kind::plamo2; + 
case emel::model::data::tokenizer_model::NONE: + case emel::model::data::tokenizer_model::UNKNOWN: + default: + return preprocessor_kind::fallback; + } +} + inline bool append_token(action::context &ctx, const int32_t token) { if (token < 0) { return false; @@ -69,15 +95,34 @@ inline void set_error(context &ctx, const int32_t err) noexcept { } inline void clear_request(context &ctx) noexcept { - ctx.vocab = nullptr; ctx.text = {}; ctx.add_special = false; ctx.parse_special = false; ctx.token_ids_out = nullptr; ctx.token_capacity = 0; - ctx.model_kind = encoder_kind::fallback; } +struct begin_bind { + void operator()(const event::bind &ev, context &ctx) const noexcept { + if (ev.error_out != nullptr) { + *ev.error_out = EMEL_OK; + } + ctx.vocab = ev.vocab; + ctx.is_bound = false; + ctx.preprocess_kind = preprocessor_kind::fallback; + ctx.model_kind = encoder_kind::fallback; + ctx.phase_error = EMEL_OK; + ctx.last_error = EMEL_OK; + } +}; + +struct reject_bind { + void operator()(const event::bind &, context &ctx) const noexcept { + ctx.is_bound = false; + set_error(ctx, EMEL_ERR_INVALID_ARGUMENT); + } +}; + struct begin_tokenize { void operator()(const event::tokenize &ev, context &ctx) const noexcept { if (ev.token_count_out != nullptr) { @@ -86,13 +131,12 @@ struct begin_tokenize { if (ev.error_out != nullptr) { *ev.error_out = EMEL_OK; } - ctx.vocab = ev.vocab; ctx.text = ev.text; ctx.add_special = ev.add_special; ctx.parse_special = ev.parse_special; + ctx.fragments_preprocessed = false; ctx.token_ids_out = ev.token_ids_out; ctx.token_capacity = ev.token_capacity; - ctx.model_kind = encoder_kind::fallback; ctx.fragment_count = 0; ctx.fragment_index = 0; ctx.token_count = 0; @@ -108,54 +152,68 @@ struct reject_invalid { } }; -struct build_special_tokens { - void operator()(context &ctx) const { +struct bind_preprocessor { + void operator()(context &ctx) const noexcept { ctx.phase_error = EMEL_OK; - if (ctx.vocab == nullptr || - 
!emel::tokenizer::preprocessor::detail::build_special_tokens( - ctx.special_cache, *ctx.vocab)) { + if (ctx.vocab == nullptr) { set_error(ctx, EMEL_ERR_INVALID_ARGUMENT); + return; } + const auto kind = detail::preprocessor_kind_from_model( + ctx.vocab->tokenizer_model_id); + ctx.preprocess_kind = kind; + ctx.preprocessor_any.set_kind(kind); } }; -struct partition_raw { - void operator()(context &ctx) const { +struct bind_encoder { + void operator()(context &ctx) const noexcept { ctx.phase_error = EMEL_OK; - ctx.fragment_count = 0; - ctx.fragment_index = 0; - if (!emel::tokenizer::preprocessor::detail::push_raw_fragment( - ctx.fragments.data(), ctx.fragments.size(), ctx.fragment_count, - ctx.text)) { + ctx.is_bound = false; + if (ctx.vocab == nullptr) { set_error(ctx, EMEL_ERR_INVALID_ARGUMENT); + return; } + const auto kind = detail::encoder_kind_from_model( + ctx.vocab->tokenizer_model_id); + ctx.model_kind = kind; + ctx.encoder_any.set_kind(kind); + ctx.is_bound = true; } }; -struct partition_with_specials { - void operator()(context &ctx) const { +struct run_preprocess { + void operator()(context &ctx) const noexcept { ctx.phase_error = EMEL_OK; ctx.fragment_count = 0; ctx.fragment_index = 0; - if (!emel::tokenizer::preprocessor::detail::partition_with_specials( - ctx.text, ctx.special_cache, ctx.parse_special, - ctx.fragments.data(), ctx.fragments.size(), &ctx.fragment_count)) { - set_error(ctx, EMEL_ERR_INVALID_ARGUMENT); - } - } -}; - -struct select_backend { - void operator()(context &ctx) const noexcept { - ctx.phase_error = EMEL_OK; if (ctx.vocab == nullptr) { set_error(ctx, EMEL_ERR_INVALID_ARGUMENT); return; } - const auto kind = detail::encoder_kind_from_model( - ctx.vocab->tokenizer_model_id); - ctx.model_kind = kind; - ctx.encoder_any.set_kind(kind); + size_t fragment_count = 0; + bool preprocessed = false; + int32_t err = EMEL_OK; + emel::tokenizer::preprocessor::event::preprocess ev = {}; + ev.vocab = ctx.vocab; + ev.text = ctx.text; + 
ev.parse_special = ctx.parse_special; + ev.fragments_out = ctx.fragments.data(); + ev.fragment_capacity = ctx.fragments.size(); + ev.fragment_count_out = &fragment_count; + ev.preprocessed_out = &preprocessed; + ev.error_out = &err; + const bool accepted = ctx.preprocessor_any.process_event(ev); + if (!accepted && err == EMEL_OK) { + set_error(ctx, EMEL_ERR_BACKEND); + return; + } + if (err != EMEL_OK) { + set_error(ctx, err); + return; + } + ctx.fragment_count = fragment_count; + ctx.fragments_preprocessed = preprocessed; } }; @@ -232,6 +290,7 @@ struct encode_raw_fragment { emel::encoder::event::encode encode_ev = {}; encode_ev.vocab = ctx.vocab; encode_ev.text = frag.text; + encode_ev.preprocessed = ctx.fragments_preprocessed; encode_ev.token_ids = ctx.token_ids_out + ctx.token_count; encode_ev.token_capacity = capacity; encode_ev.token_count_out = &fragment_count; @@ -281,12 +340,13 @@ struct on_unexpected { } }; +inline constexpr begin_bind begin_bind{}; +inline constexpr reject_bind reject_bind{}; inline constexpr begin_tokenize begin_tokenize{}; inline constexpr reject_invalid reject_invalid{}; -inline constexpr build_special_tokens build_special_tokens{}; -inline constexpr partition_raw partition_raw{}; -inline constexpr partition_with_specials partition_with_specials{}; -inline constexpr select_backend select_backend{}; +inline constexpr bind_preprocessor bind_preprocessor{}; +inline constexpr bind_encoder bind_encoder{}; +inline constexpr run_preprocess run_preprocess{}; inline constexpr append_bos append_bos{}; inline constexpr append_sep append_sep{}; inline constexpr append_eos append_eos{}; diff --git a/src/emel/tokenizer/bpe/regex.hpp b/src/emel/tokenizer/bpe/regex.hpp index 5ce04eb..6cda52f 100644 --- a/src/emel/tokenizer/bpe/regex.hpp +++ b/src/emel/tokenizer/bpe/regex.hpp @@ -1,57 +1,59 @@ #pragma once +#include #include -#include -#include +#include #include "emel/model/data.hpp" namespace emel::tokenizer::bpe::detail { -inline void 
assign_bpe_regex(emel::model::data::tokenizer_pre & pre_id, - std::vector & regex_exprs, - const emel::model::data::vocab & vocab) { - const auto pre = vocab.tokenizer_pre_id; - if (pre_id == pre && !regex_exprs.empty()) { - return; - } - pre_id = pre; - regex_exprs.clear(); - auto set_regex = [&](std::initializer_list list) { - regex_exprs.reserve(list.size()); - for (const char * expr : list) { - regex_exprs.emplace_back(expr); +constexpr size_t k_max_regex_exprs = 6; + +struct regex_list { + std::array exprs = {}; + size_t count = 0; +}; + +inline regex_list regex_for(const emel::model::data::tokenizer_pre pre) { + auto make = [](std::initializer_list list) { + regex_list out{}; + size_t idx = 0; + for (const auto & expr : list) { + if (idx >= out.exprs.size()) { + break; + } + out.exprs[idx++] = expr; } + out.count = idx; + return out; }; using tokenizer_pre = emel::model::data::tokenizer_pre; switch (pre) { case tokenizer_pre::LLAMA3: - set_regex({ + return make({ "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|" "[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+" "[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+", }); - return; case tokenizer_pre::JAIS2: - set_regex({ + return make({ "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|" "[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+" "[\\r\\n]*|\\s*[\\r\\n]+|\\s{512}(?!\\S)|\\s{256}(?!\\S)|\\s{128}(?!\\S)" "|\\s{64}(?!\\S)|\\s{32}(?!\\S)|\\s{16}(?!\\S)|\\s{8}(?!\\S)|\\s{4}(?!\\S)" "|\\s{1,2}(?!\\S)|\\s{1}", }); - return; case tokenizer_pre::DBRX: case tokenizer_pre::SMAUG: - set_regex({ + return make({ "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|" "[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+" "[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+", }); - return; case tokenizer_pre::DEEPSEEK_LLM: - set_regex({ + return make({ "[\\r\\n]", 
"\\s?[A-za-z\\xc2\\xb5\\xc3\\x80-\\xc3\\x96\\xc3\\x98-\\xc3\\xb6\\xc3\\xb8-\\xc6\\xba\\xc6\\xbc-\\xc6\\xbf\\xc7\\x84-\\xca\\x93\\xca\\x95-\\xca\\xaf\\xcd\\xb0-\\xcd\\xb3\\xcd\\xb6\\xcd\\xb7\\xcd\\xbb-\\xcd\\xbd\\xcd\\xbf\\xce\\x86\\xce\\x88-\\xce\\x8a\\xce\\x8c\\xce\\x8e-\\xce\\xa1\\xce\\xa3-\\xcf\\xb5\\xcf\\xb7-\\xd2\\x81\\xd2\\x8a-\\xd4\\xaf\\xd4\\xb1-\\xd5\\x96\\xe1\\x82\\xa0-\\xe1\\x83\\x85\\xe1\\x8e\\xa0-\\xe1\\x8f\\xb5\\xe1\\x8f\\xb8-\\xe1\\x8f\\xbd\\xe1\\xb2\\x90-\\xe1\\xb2\\xba\\xe1\\xb2\\xbd-\\xe1\\xb2\\xbf\\xe1\\xb4\\x80-\\xe1\\xb4\\xab\\xe1\\xb5\\xab-\\xe1\\xb5\\xb7\\xe1\\xb5\\xb9-\\xe1\\xb6\\x9a\\xe1\\xb8\\x80-\\xe1\\xbc\\x95\\xe1\\xbc\\x98-\\xe1\\xbc\\x9d\\xe1\\xbc\\xa0-\\xe1\\xbd\\x85\\xe1\\xbd\\x88-\\xe1\\xbd\\x8d\\xe1\\xbd\\x90-\\xe1\\xbd\\x97\\xe1\\xbd\\x99\\xe1\\xbd\\x9b\\xe1\\xbd\\x9d\\xe1\\xbd\\x9f-\\xe1\\xbd\\xbd\\xe1\\xbe\\x80-\\xe1\\xbe\\xb4\\xe1\\xbe\\xb6-\\xe1\\xbe\\xbc\\xe1\\xbe\\xbe\\xe1\\xbf\\x82-\\xe1\\xbf\\x84\\xe1\\xbf\\x86-\\xe1\\xbf\\x8c\\xe1\\xbf\\x90-\\xe1\\xbf\\x93\\xe1\\xbf\\x96-\\xe1\\xbf\\x9b\\xe1\\xbf\\xa0-\\xe1\\xbf\\xac\\xe1\\xbf\\xb2-\\xe1\\xbf\\xb4\\xe1\\xbf\\xb6-\\xe1\\xbf\\xbc\\xe2\\x84\\x82\\xe2\\x84\\x87\\xe2\\x84\\x8a-\\xe2\\x84\\x93\\xe2\\x84\\x95\\xe2\\x84\\x99-\\xe2\\x84\\x9d\\xe2\\x84\\xa4\\xe2\\x84\\xa6\\xe2\\x84\\xa8\\xe2\\x84\\xaa-\\xe2\\x84\\xad\\xe2\\x84\\xaf-\\xe2\\x84\\xb4\\xe2\\x84\\xb9\\xe2\\x84\\xbc-\\xe2\\x84\\xbf\\xe2\\x85\\x85-\\xe2\\x85\\x89\\xe2\\x85\\x8e\\xe2\\x86\\x83\\xe2\\x86\\x84\\xe2\\xb0\\x80-\\xe2\\xb1\\xbb\\xe2\\xb1\\xbe-\\xe2\\xb3\\xa4\\xe2\\xb3\\xab-\\xe2\\xb3\\xae\\xe2\\xb3\\xb2\\xe2\\xb3\\xb3\\xea\\x99\\x80-\\xea\\x99\\xad\\xea\\x9a\\x80-\\xea\\x9a\\x9b\\xea\\x9c\\xa2-\\xea\\x9d\\xaf\\xea\\x9d\\xb1-\\xea\\x9e\\x87\\xea\\x9e\\x8b-\\xea\\x9e\\x8e\\xea\\xad\\xb0-\\xea\\xae\\xbf\\xef\\xac\\x80-\\xef\\xac\\x86\\xef\\xac\\x93-\\xef\\xac\\x97\\xef\\xbc\\xa1-\\xef\\xbc\\xba\\xef\\xbd\\x81-\\xef\\xbd\\x9a\\xf0\\x90\\x90\\x80-\\xf0\\x90\\x91\\x8f\\xf0\\x90\\x92\\xb0-\\xf0\\x90\\x93\\x93\\xf0\\x90
\\x93\\x98-\\xf0\\x90\\x93\\xbb\\xf0\\x90\\xb2\\x80-\\xf0\\x90\\xb2\\xb2\\xf0\\x90\\xb3\\x80-\\xf0\\x90\\xb3\\xb2\\xf0\\x91\\xa2\\xa0-\\xf0\\x91\\xa3\\x9f\\xf0\\x9e\\xa4\\x80-\\xf0\\x9e\\xa5\\x83]+", "\\s?[!-/:-~\\xef\\xbc\\x81-\\xef\\xbc\\x8f\\xef\\xbc\\x9a-\\xef\\xbd\\x9e\\xe2\\x80\\x98-\\xe2\\x80\\x9f\\xe3\\x80\\x80-\\xe3\\x80\\x82]+", @@ -59,20 +61,18 @@ inline void assign_bpe_regex(emel::model::data::tokenizer_pre & pre_id, "[\\xe4\\xb8\\x80-\\xe9\\xbe\\xa5\\xe0\\xa0\\x80-\\xe4\\xb8\\x80\\xea\\xb0\\x80-\\xed\\x9f\\xbf]+", "\\p{N}+", }); - return; case tokenizer_pre::DEEPSEEK3_LLM: case tokenizer_pre::HUNYUAN_DENSE: case tokenizer_pre::JOYAI_LLM: - set_regex({ + return make({ "\\p{N}{1,3}", "[\\xe4\\xb8\\x80-\\xe9\\xbe\\xa5\\xe3\\x81\\x80-\\xe3\\x82\\x9f\\xe3\\x82\\xa0-\\xe3\\x83\\xbf]+", "[!\"#$%&'()*+,\\-./:;<=>?@\\[\\\\\\]^_`{|}~][A-za-z]+|" "[^\\r\\n\\p{L}\\p{P}\\p{S}]?[\\p{L}\\p{M}]+| ?[\\p{P}\\p{S}]+" "[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+", }); - return; case tokenizer_pre::YOUTU: - set_regex({ + return make({ "[\\xea\\xb0\\x80-\\xed\\x9e\\xa3\\xe3\\x84\\xb1-\\xe3\\x86\\x8e]+|" "[\\xef\\xbc\\x81\\xe2\\x80\\xa6\\xe2\\x80\\x9c\\xe2\\x80\\x9d\\xe2\\x80\\x98\\xe2\\x80\\x99" "\\xe2\\x80\\x94\\xef\\xbc\\x9a\\xef\\xbc\\x9b\\xef\\xbc\\x8c\\xe3\\x80\\x81-\\xe3\\x80\\xbf" @@ -85,23 +85,20 @@ inline void assign_bpe_regex(emel::model::data::tokenizer_pre & pre_id, "'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?|\\p{N}|" " ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+", }); - return; case tokenizer_pre::DEEPSEEK_CODER: - set_regex({ + return make({ "[\\r\\n]", "\\s?\\p{L}+", "\\s?\\p{P}+", "[\\xe4\\xb8\\x80-\\xe9\\xbe\\xa5\\xe0\\xa0\\x80-\\xe4\\xb8\\x80\\xea\\xb0\\x80-\\xed\\x9f\\xbf]+", "\\p{N}", }); - return; case tokenizer_pre::FALCON: - set_regex({ + return make({ "[\\p{P}\\$\\+<=>\\^~\\|`]+", "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)", "[0-9][0-9][0-9]", }); - return; case tokenizer_pre::STARCODER: case 
tokenizer_pre::REFACT: case tokenizer_pre::COMMAND_R: @@ -109,67 +106,59 @@ inline void assign_bpe_regex(emel::model::data::tokenizer_pre & pre_id, case tokenizer_pre::CODESHELL: case tokenizer_pre::EXAONE: case tokenizer_pre::MINERVA: - set_regex({ + return make({ "\\p{N}", "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)", }); - return; case tokenizer_pre::GPT2: case tokenizer_pre::MPT: case tokenizer_pre::OLMO: case tokenizer_pre::JAIS: case tokenizer_pre::TRILLION: case tokenizer_pre::GRANITE_DOCLING: - set_regex({ + return make({ "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)", }); - return; case tokenizer_pre::QWEN35: - set_regex({ + return make({ "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|" "[^\\r\\n\\p{L}\\p{N}]?[\\p{L}\\p{M}]+|\\p{N}| ?[^\\s\\p{L}\\p{M}\\p{N}]+" "[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+", }); - return; case tokenizer_pre::STABLELM2: case tokenizer_pre::QWEN2: case tokenizer_pre::HUNYUAN: case tokenizer_pre::SOLAR_OPEN: - set_regex({ + return make({ "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|" "[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*" "|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+", }); - return; case tokenizer_pre::PORO: case tokenizer_pre::BLOOM: case tokenizer_pre::GPT3_FINNISH: - set_regex({ + return make({ " ?[^(\\\\s|.,!?\\xe2\\x80\\xa6\\xe3\\x80\\x82\\xef\\xbc\\x8c\\xe3\\x80\\x81\\xe0\\xa5\\xa4\\xe0\\xa5\\xa4\\xd8\\x8c)]+", }); - return; case tokenizer_pre::CHATGLM4: - set_regex({ + return make({ "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|" "[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+" "[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+", }); - return; case tokenizer_pre::VIKING: - set_regex({ + return make({ " ?[^(\\\\s|.,!?\\xe2\\x80\\xa6\\xe3\\x80\\x82\\xef\\xbc\\x8c\\xe3\\x80\\x81\\xe0\\xa5\\xa4\\xe0\\xa5\\xa4\\xd8\\x8c)]+", "\\p{N}", }); - return; case tokenizer_pre::TEKKEN: - set_regex({ + return 
make({ "[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))*((?=[\\p{L}])([^A-Z]))+|" "[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))+((?=[\\p{L}])([^A-Z]))*|" "\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+", }); - return; case tokenizer_pre::CHAMELEON: - set_regex({ + return make({ "", "(IMGIMG)((A|B|C|D|E|F|G|H|I){1,4})Z", "([\\t\\n]| | )", @@ -177,19 +166,17 @@ inline void assign_bpe_regex(emel::model::data::tokenizer_pre & pre_id, "[\\p{P}!-/:-@\\[-`{-~]", "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)", }); - return; case tokenizer_pre::GPT4O: case tokenizer_pre::MINIMAX_M2: - set_regex({ + return make({ "[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))*((?=[\\p{L}])([^A-Z]))+" "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?|" "[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))+((?=[\\p{L}])([^A-Z]))*" "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?|" "\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+", }); - return; case tokenizer_pre::TINY_AYA: - set_regex({ + return make({ "\\d{1,3}(?=(?:\\d{3})*\\b)", "[^\\r\\n\\p{L}\\p{N}]?[\\p{lu}\\p{lt}\\p{lm}\\p{lo}\\p{M}]*" "[\\p{ll}\\p{lm}\\p{lo}\\p{M}]+(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]" @@ -198,64 +185,59 @@ inline void assign_bpe_regex(emel::model::data::tokenizer_pre & pre_id, "'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?|\\p{N}{1,3}|" " ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+", }); - return; case tokenizer_pre::KIMI_K2: - set_regex({ + return make({ "\\p{han}+", }); - return; case tokenizer_pre::SUPERBPE: - set_regex({ + return make({ "\\p{N}+", "(?=(\\d{3})+(?!\\d))", }); - return; case tokenizer_pre::BAILINGMOE: - set_regex({ + return make({ "'(?:[sSdDmMtT]|[lL][lL]|[vV][eE]|[rR][eE])|" "[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+" "[\\r\\n]*|\\s*[\\r\\n]|\\s+(?!\\S)|\\s+", }); - return; case tokenizer_pre::SEED_CODER: - set_regex({ + return make({ 
"(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|" "[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1}| ?[^\\s\\p{L}\\p{N}\\r\\n]+" "|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+", }); - return; case tokenizer_pre::GROK_2: - set_regex({ + return make({ "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|" "[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+" "[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+", }); - return; case tokenizer_pre::AFMOE: - set_regex({ + return make({ "\\p{AFMoE_digits}", "[\\xe4\\xb8\\x80-\\xe9\\xbf\\xbf\\xe3\\x90\\x80-\\xe4\\xb6\\xbf\\xe8\\xb1\\x88-\\xef\\xab\\xbf\\xe3\\x81\\x80-\\xe3\\x82\\x9f\\xe3\\x82\\xa0-\\xe3\\x83\\xbf\\xef\\xbd\\xa5-\\xef\\xbe\\x9f\\xe2\\xbc\\x80-\\xe2\\xbf\\x9f\\xe0\\xb9\\x80-\\xe0\\xb9\\xbf\\xe0\\xba\\x80-\\xe0\\xbb\\xbf\\xe1\\x80\\x80-\\xe1\\x82\\x9f\\xea\\xa9\\xa0-\\xea\\xa9\\xbf\\xea\\xa7\\xa0-\\xea\\xa7\\xbf\\xea\\x9d\\x80-\\xea\\x9d\\xbf\\xea\\xa0\\x80-\\xea\\xa0\\xbf\\xea\\xa1\\x80-\\xea\\xa1\\xbf\\xea\\xa2\\x80-\\xea\\xa2\\xbf]+", "[!\"#$%&'()*+,\\-./:;<=>?@\\[\\\\\\]^_`{|}~][A-za-z]+|" "[^\\r\\n\\p{L}\\p{P}\\p{S}]?[\\p{L}\\p{M}]+| ?[\\p{P}\\p{S}]+" "[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+", }); - return; case tokenizer_pre::EXAONE_MOE: - set_regex({ + return make({ "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|" "[^\\r\\n\\p{L}\\p{N}]?(?:\\p{L}\\p{M}*(?: \\p{L}\\p{M}*)*)+|" "\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]?|\\s*[\\r\\n]|\\s+(?!\\S)|\\s+", }); - return; default: - set_regex({ + return make({ "[\\p{P}\\$\\+<=>\\^~\\|]+", "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)", "\\p{N}+", "[0-9][0-9][0-9]", }); - return; } } +inline regex_list regex_for(const emel::model::data::vocab & vocab) { + return regex_for(vocab.tokenizer_pre_id); +} + } // namespace emel::tokenizer::bpe::detail diff --git a/src/emel/tokenizer/bpe/split.hpp b/src/emel/tokenizer/bpe/split.hpp new file mode 100644 index 0000000..f037baf --- /dev/null +++ b/src/emel/tokenizer/bpe/split.hpp @@ -0,0 
+1,590 @@ +#pragma once + +#include +#include +#include +#include +#include +#include + +#include "emel/text/unicode.hpp" +#include "emel/tokenizer/bpe/regex.hpp" + +namespace emel::tokenizer::bpe::detail { + +constexpr size_t k_max_bpe_words = 1024; +constexpr size_t k_max_bpe_bytes = 65536; +constexpr size_t k_max_bpe_cpts = k_max_bpe_bytes; + +struct split_view { + const std::string_view * words = nullptr; + size_t count = 0; +}; + +struct split_scratch { + std::array cpts = {}; + std::array offsets_a = {}; + std::array offsets_b = {}; + std::array encoded = {}; + std::array words = {}; + size_t encoded_size = 0; + size_t cpt_count = 0; + size_t offset_count = 0; + size_t word_count = 0; + + void reset() noexcept { + encoded_size = 0; + cpt_count = 0; + offset_count = 0; + word_count = 0; + } +}; + +inline constexpr uint32_t bpe_out_of_range = 0xFFFFFFFFu; + +inline constexpr std::array bpe_byte_to_unicode_map() { + std::array table = {}; + bool keep[256] = {}; + for (int ch = 0x21; ch <= 0x7E; ++ch) { + keep[ch] = true; + } + for (int ch = 0xA1; ch <= 0xAC; ++ch) { + keep[ch] = true; + } + for (int ch = 0xAE; ch <= 0xFF; ++ch) { + keep[ch] = true; + } + uint16_t n = 0; + for (int ch = 0; ch < 256; ++ch) { + if (keep[ch]) { + table[ch] = static_cast(ch); + } else { + table[ch] = static_cast(256 + n); + n += 1; + } + } + return table; +} + +inline constexpr std::array k_bpe_byte_to_unicode = + bpe_byte_to_unicode_map(); + +inline size_t encode_utf8(uint32_t cpt, char * out, size_t capacity) { + if (out == nullptr || capacity == 0) { + return 0; + } + if (cpt <= 0x7F) { + if (capacity < 1) { + return 0; + } + out[0] = static_cast(cpt); + return 1; + } + if (cpt <= 0x7FF) { + if (capacity < 2) { + return 0; + } + out[0] = static_cast(0xC0 | ((cpt >> 6) & 0x1F)); + out[1] = static_cast(0x80 | (cpt & 0x3F)); + return 2; + } + if (cpt <= 0xFFFF) { + if (capacity < 3) { + return 0; + } + out[0] = static_cast(0xE0 | ((cpt >> 12) & 0x0F)); + out[1] = static_cast(0x80 | 
((cpt >> 6) & 0x3F)); + out[2] = static_cast(0x80 | (cpt & 0x3F)); + return 3; + } + if (cpt <= 0x10FFFF) { + if (capacity < 4) { + return 0; + } + out[0] = static_cast(0xF0 | ((cpt >> 18) & 0x07)); + out[1] = static_cast(0x80 | ((cpt >> 12) & 0x3F)); + out[2] = static_cast(0x80 | ((cpt >> 6) & 0x3F)); + out[3] = static_cast(0x80 | (cpt & 0x3F)); + return 4; + } + return 0; +} + +inline bool decode_utf8_to_cpts(const std::string_view text, + split_scratch & scratch) { + scratch.cpt_count = 0; + size_t offset = 0; + while (offset < text.size()) { + if (scratch.cpt_count >= scratch.cpts.size()) { + return false; + } + const uint8_t byte = static_cast(text[offset]); + uint32_t cpt = 0xFFFD; + size_t len = 1; + if ((byte & 0x80u) == 0) { + cpt = byte; + len = 1; + } else if ((byte & 0xE0u) == 0xC0u && offset + 1 < text.size()) { + const uint8_t b1 = static_cast(text[offset + 1]); + if ((b1 & 0xC0u) == 0x80u) { + cpt = ((byte & 0x1Fu) << 6) | (b1 & 0x3Fu); + len = 2; + } + } else if ((byte & 0xF0u) == 0xE0u && offset + 2 < text.size()) { + const uint8_t b1 = static_cast(text[offset + 1]); + const uint8_t b2 = static_cast(text[offset + 2]); + if ((b1 & 0xC0u) == 0x80u && (b2 & 0xC0u) == 0x80u) { + cpt = ((byte & 0x0Fu) << 12) | ((b1 & 0x3Fu) << 6) | (b2 & 0x3Fu); + len = 3; + } + } else if ((byte & 0xF8u) == 0xF0u && offset + 3 < text.size()) { + const uint8_t b1 = static_cast(text[offset + 1]); + const uint8_t b2 = static_cast(text[offset + 2]); + const uint8_t b3 = static_cast(text[offset + 3]); + if ((b1 & 0xC0u) == 0x80u && (b2 & 0xC0u) == 0x80u && + (b3 & 0xC0u) == 0x80u) { + cpt = ((byte & 0x07u) << 18) | ((b1 & 0x3Fu) << 12) | + ((b2 & 0x3Fu) << 6) | (b3 & 0x3Fu); + len = 4; + } + } + scratch.cpts[scratch.cpt_count++] = cpt; + offset += len; + } + return true; +} + +inline bool push_offset(size_t value, size_t * out, size_t capacity, + size_t & out_count) { + if (value == 0) { + return true; + } + if (out_count >= capacity) { + return false; + } + out[out_count++] 
= value; + return true; +} + +inline bool split_gpt2(const uint32_t * cpts, size_t cpt_count, + const size_t * offsets_in, size_t offsets_in_count, + size_t * offsets_out, size_t out_capacity, + size_t & out_count) { + out_count = 0; + size_t start = 0; + for (size_t idx = 0; idx < offsets_in_count; ++idx) { + const size_t offset_ini = start; + const size_t offset_end = start + offsets_in[idx]; + if (offset_end > cpt_count) { + return false; + } + start = offset_end; + + auto get_cpt = [&](const size_t pos) -> uint32_t { + return (offset_ini <= pos && pos < offset_end) ? cpts[pos] + : bpe_out_of_range; + }; + auto get_flags = [&](const size_t pos) -> emel::text::unicode_cpt_flags { + return (offset_ini <= pos && pos < offset_end) + ? emel::text::unicode_cpt_flags_from_cpt(cpts[pos]) + : emel::text::unicode_cpt_flags{}; + }; + + size_t prev_end = offset_ini; + auto add_token = [&](const size_t end) -> bool { + if (end < prev_end || end > offset_end) { + return false; + } + const size_t len = end - prev_end; + prev_end = end; + return push_offset(len, offsets_out, out_capacity, out_count); + }; + + for (size_t pos = offset_ini; pos < offset_end;) { + const uint32_t cpt = get_cpt(pos); + const auto flags = get_flags(pos); + + if (cpt == '\'' && pos + 1 < offset_end) { + const uint32_t cpt_next = get_cpt(pos + 1); + if (cpt_next == 's' || cpt_next == 't' || cpt_next == 'm' || + cpt_next == 'd') { + if (!add_token(pos + 2)) { + return false; + } + pos += 2; + continue; + } + if (pos + 2 < offset_end) { + const uint32_t cpt_next_next = get_cpt(pos + 2); + if ((cpt_next == 'r' && cpt_next_next == 'e') || + (cpt_next == 'v' && cpt_next_next == 'e') || + (cpt_next == 'l' && cpt_next_next == 'l')) { + if (!add_token(pos + 3)) { + return false; + } + pos += 3; + continue; + } + } + } + + auto flags2 = (cpt == ' ') ? 
get_flags(pos + 1) : flags; + if (flags2.is_letter) { + pos += (cpt == ' '); + while (get_flags(pos).is_letter) { + pos++; + } + if (!add_token(pos)) { + return false; + } + continue; + } + if (flags2.is_number) { + pos += (cpt == ' '); + while (get_flags(pos).is_number) { + pos++; + } + if (!add_token(pos)) { + return false; + } + continue; + } + if (!(flags2.is_whitespace | flags2.is_letter | flags2.is_number) && + flags2.as_uint()) { + pos += (cpt == ' '); + while (!(flags2.is_whitespace | flags2.is_letter | flags2.is_number) && + flags2.as_uint()) { + flags2 = get_flags(++pos); + } + if (!add_token(pos)) { + return false; + } + continue; + } + + size_t num_whitespaces = 0; + while (get_flags(pos + num_whitespaces).is_whitespace) { + num_whitespaces++; + } + + if (num_whitespaces > 1 && + get_cpt(pos + num_whitespaces) != bpe_out_of_range) { + pos += num_whitespaces - 1; + if (!add_token(pos)) { + return false; + } + continue; + } + + if (num_whitespaces > 0) { + pos += num_whitespaces; + if (!add_token(pos)) { + return false; + } + continue; + } + + pos += 1; + if (!add_token(pos)) { + return false; + } + } + } + return true; +} + +inline bool split_llama3(const uint32_t * cpts, size_t cpt_count, + const size_t * offsets_in, size_t offsets_in_count, + size_t * offsets_out, size_t out_capacity, + size_t & out_count) { + out_count = 0; + size_t start = 0; + for (size_t idx = 0; idx < offsets_in_count; ++idx) { + const size_t offset_ini = start; + const size_t offset_end = start + offsets_in[idx]; + if (offset_end > cpt_count) { + return false; + } + start = offset_end; + + auto get_cpt = [&](const size_t pos) -> uint32_t { + return (offset_ini <= pos && pos < offset_end) ? cpts[pos] + : bpe_out_of_range; + }; + auto get_flags = [&](const size_t pos) -> emel::text::unicode_cpt_flags { + return (offset_ini <= pos && pos < offset_end) + ? 
emel::text::unicode_cpt_flags_from_cpt(cpts[pos]) + : emel::text::unicode_cpt_flags{}; + }; + + size_t prev_end = offset_ini; + auto add_token = [&](const size_t end) -> bool { + if (end < prev_end || end > offset_end) { + return false; + } + const size_t len = end - prev_end; + prev_end = end; + return push_offset(len, offsets_out, out_capacity, out_count); + }; + + for (size_t pos = offset_ini; pos < offset_end;) { + const uint32_t cpt = get_cpt(pos); + const auto flags = get_flags(pos); + + if (cpt == '\'' && pos + 1 < offset_end) { + const uint32_t cpt_next = emel::text::unicode_tolower(get_cpt(pos + 1)); + if (cpt_next == 's' || cpt_next == 't' || cpt_next == 'm' || + cpt_next == 'd') { + if (!add_token(pos + 2)) { + return false; + } + pos += 2; + continue; + } + if (pos + 2 < offset_end) { + const uint32_t cpt_next_next = + emel::text::unicode_tolower(get_cpt(pos + 2)); + if ((cpt_next == 'r' && cpt_next_next == 'e') || + (cpt_next == 'v' && cpt_next_next == 'e') || + (cpt_next == 'l' && cpt_next_next == 'l')) { + if (!add_token(pos + 3)) { + return false; + } + pos += 3; + continue; + } + } + } + + if (!(cpt == '\r' || cpt == '\n' || flags.is_number)) { + if (flags.is_letter || get_flags(pos + 1).is_letter) { + pos++; + while (get_flags(pos).is_letter) { + pos++; + } + if (!add_token(pos)) { + return false; + } + continue; + } + } + + if (flags.is_number) { + size_t ini = pos; + while (get_flags(pos).is_number) { + pos++; + } + const size_t len = pos - ini; + if (len > 3) { + pos = ini + 3; + } + if (!add_token(pos)) { + return false; + } + continue; + } + + if (!(flags.is_whitespace | flags.is_letter | flags.is_number) && + flags.as_uint()) { + if (cpt != ' ' && (cpt == '\r' || cpt == '\n')) { + pos += 1; + } else if (cpt == ' ') { + pos += 1; + } + while (!(get_flags(pos).is_whitespace | get_flags(pos).is_letter | + get_flags(pos).is_number) && + get_flags(pos).as_uint()) { + pos++; + } + while (get_cpt(pos) == '\r' || get_cpt(pos) == '\n') { + pos++; + } 
+ if (!add_token(pos)) { + return false; + } + continue; + } + + if (cpt == '\r' || cpt == '\n') { + while (get_cpt(pos) == '\r' || get_cpt(pos) == '\n') { + pos++; + } + if (!add_token(pos)) { + return false; + } + continue; + } + + size_t num_whitespaces = 0; + while (get_flags(pos + num_whitespaces).is_whitespace) { + num_whitespaces++; + } + + if (num_whitespaces > 1 && + get_cpt(pos + num_whitespaces) != bpe_out_of_range) { + pos += num_whitespaces - 1; + if (!add_token(pos)) { + return false; + } + continue; + } + + if (num_whitespaces > 0) { + pos += num_whitespaces; + if (!add_token(pos)) { + return false; + } + continue; + } + + pos += 1; + if (!add_token(pos)) { + return false; + } + } + } + return true; +} + +inline bool encode_bpe_segment(const uint32_t * cpts, size_t start, + size_t len, split_scratch & scratch) { + size_t segment_offset = scratch.encoded_size; + for (size_t idx = 0; idx < len; ++idx) { + char utf8[4]; + const size_t utf8_len = encode_utf8(cpts[start + idx], utf8, sizeof(utf8)); + if (utf8_len == 0) { + return false; + } + for (size_t j = 0; j < utf8_len; ++j) { + const uint8_t byte = static_cast(utf8[j]); + const uint32_t mapped = k_bpe_byte_to_unicode[byte]; + char encoded[4]; + const size_t encoded_len = encode_utf8(mapped, encoded, sizeof(encoded)); + if (encoded_len == 0) { + return false; + } + if (scratch.encoded_size + encoded_len > scratch.encoded.size()) { + return false; + } + for (size_t k = 0; k < encoded_len; ++k) { + scratch.encoded[scratch.encoded_size++] = encoded[k]; + } + } + } + const size_t encoded_len = scratch.encoded_size - segment_offset; + if (encoded_len == 0) { + return true; + } + if (scratch.word_count >= scratch.words.size()) { + return false; + } + scratch.words[scratch.word_count++] = + std::string_view(scratch.encoded.data() + segment_offset, encoded_len); + return true; +} + +inline bool split_and_encode_fallback(const std::string_view text, + const regex_list & regex, + split_scratch & scratch) { + 
std::vector regex_exprs; + regex_exprs.reserve(regex.count); + for (size_t idx = 0; idx < regex.count; ++idx) { + regex_exprs.emplace_back(regex.exprs[idx]); + } + const std::string raw_text(text); + const auto words = emel::text::unicode_regex_split(raw_text, regex_exprs); + scratch.word_count = 0; + for (const std::string & word : words) { + if (word.empty()) { + continue; + } + if (scratch.word_count >= scratch.words.size()) { + return false; + } + if (scratch.encoded_size + word.size() > scratch.encoded.size()) { + return false; + } + const size_t offset = scratch.encoded_size; + for (const char c : word) { + scratch.encoded[scratch.encoded_size++] = c; + } + scratch.words[scratch.word_count++] = + std::string_view(scratch.encoded.data() + offset, word.size()); + } + return true; +} + +inline bool split_and_encode_append(const std::string_view text, + const emel::model::data::vocab & vocab, + split_scratch & scratch, + split_view & view) { + view.words = scratch.words.data(); + view.count = 0; + scratch.word_count = 0; + if (text.empty()) { + return true; + } + + const regex_list regex = regex_for(vocab); + + static constexpr std::string_view k_gpt2_regex = + "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)"; + static constexpr std::string_view k_llama3_regex = + "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|" + "[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+" + "[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+"; + + const bool byte_encode = (regex.count == 1) && + (regex.exprs[0] == k_gpt2_regex || + regex.exprs[0] == k_llama3_regex); + + if (text.size() > scratch.encoded.size()) { + return false; + } + + if (!decode_utf8_to_cpts(text, scratch)) { + return false; + } + + scratch.offsets_a[0] = scratch.cpt_count; + scratch.offset_count = 1; + size_t out_count = 0; + bool ok = false; + if (regex.count != 1 || !byte_encode) { + ok = split_and_encode_fallback(text, regex, scratch); + view.count = 
scratch.word_count; + return ok; + } + + if (regex.exprs[0] == k_gpt2_regex) { + ok = split_gpt2(scratch.cpts.data(), scratch.cpt_count, + scratch.offsets_a.data(), scratch.offset_count, + scratch.offsets_b.data(), scratch.offsets_b.size(), + out_count); + } else { + ok = split_llama3(scratch.cpts.data(), scratch.cpt_count, + scratch.offsets_a.data(), scratch.offset_count, + scratch.offsets_b.data(), scratch.offsets_b.size(), + out_count); + } + if (!ok) { + return false; + } + + size_t start = 0; + for (size_t idx = 0; idx < out_count; ++idx) { + const size_t len = scratch.offsets_b[idx]; + if (len == 0) { + continue; + } + if (!encode_bpe_segment(scratch.cpts.data(), start, len, scratch)) { + return false; + } + start += len; + } + + view.count = scratch.word_count; + return true; +} + +} // namespace emel::tokenizer::bpe::detail diff --git a/src/emel/tokenizer/context.hpp b/src/emel/tokenizer/context.hpp index 4c7688d..1d8c511 100644 --- a/src/emel/tokenizer/context.hpp +++ b/src/emel/tokenizer/context.hpp @@ -9,6 +9,7 @@ #include "emel/emel.h" #include "emel/model/data.hpp" #include "emel/tokenizer/events.hpp" +#include "emel/tokenizer/preprocessor/any.hpp" #include "emel/tokenizer/preprocessor/types.hpp" namespace emel::tokenizer::action { @@ -21,21 +22,23 @@ using encoder_kind = emel::encoder::encoder_kind; using fragment_kind = emel::tokenizer::preprocessor::fragment_kind; using fragment = emel::tokenizer::preprocessor::fragment; -using special_token = emel::tokenizer::preprocessor::special_token; -using special_token_cache = emel::tokenizer::preprocessor::special_token_cache; +using preprocessor_kind = emel::tokenizer::preprocessor::preprocessor_kind; struct context { + emel::tokenizer::preprocessor::any preprocessor_any = {}; emel::encoder::any encoder_any = {}; std::array fragments = {}; size_t fragment_count = 0; size_t fragment_index = 0; - special_token_cache special_cache = {}; const emel::model::data::vocab *vocab = nullptr; std::string_view text = 
{}; bool add_special = false; bool parse_special = false; + bool fragments_preprocessed = false; int32_t *token_ids_out = nullptr; int32_t token_capacity = 0; + bool is_bound = false; + preprocessor_kind preprocess_kind = preprocessor_kind::fallback; encoder_kind model_kind = encoder_kind::fallback; int32_t token_count = 0; int32_t phase_error = EMEL_OK; diff --git a/src/emel/tokenizer/events.hpp b/src/emel/tokenizer/events.hpp index 6dd7316..645d2fe 100644 --- a/src/emel/tokenizer/events.hpp +++ b/src/emel/tokenizer/events.hpp @@ -8,10 +8,21 @@ namespace emel::tokenizer::events { struct tokenizer_done; struct tokenizer_error; +struct tokenizer_bind_done; +struct tokenizer_bind_error; } // namespace emel::tokenizer::events namespace emel::tokenizer::event { +struct bind { + const emel::model::data::vocab * vocab = nullptr; + int32_t * error_out = nullptr; + void * owner_sm = nullptr; + bool (*dispatch_done)(void * owner_sm, const events::tokenizer_bind_done &) = nullptr; + bool (*dispatch_error)(void * owner_sm, + const events::tokenizer_bind_error &) = nullptr; +}; + struct tokenize { const emel::model::data::vocab * vocab = nullptr; std::string_view text = {}; @@ -96,4 +107,13 @@ struct tokenizer_error { int32_t err = 0; }; +struct tokenizer_bind_done { + const event::bind * request = nullptr; +}; + +struct tokenizer_bind_error { + const event::bind * request = nullptr; + int32_t err = 0; +}; + } // namespace emel::tokenizer::events diff --git a/src/emel/tokenizer/guards.hpp b/src/emel/tokenizer/guards.hpp index ed8815d..908ceca 100644 --- a/src/emel/tokenizer/guards.hpp +++ b/src/emel/tokenizer/guards.hpp @@ -5,15 +5,30 @@ namespace emel::tokenizer::guard { struct can_tokenize { - bool operator()(const event::tokenize &ev) const noexcept { - if (ev.vocab == nullptr || ev.token_ids_out == nullptr || - ev.token_count_out == nullptr) { + bool operator()(const event::tokenize &ev, + const action::context &ctx) const noexcept { + if (!ctx.is_bound || ctx.vocab == 
nullptr) { + return false; + } + if (ev.vocab == nullptr || ev.vocab != ctx.vocab) { + return false; + } + if (ev.token_ids_out == nullptr || ev.token_count_out == nullptr) { return false; } return ev.token_capacity > 0; } }; +struct can_bind { + bool operator()(const event::bind &ev) const noexcept { + if (ev.vocab == nullptr) { + return false; + } + return true; + } +}; + struct phase_ok { bool operator()(const action::context &ctx) const noexcept { return ctx.phase_error == EMEL_OK; @@ -26,18 +41,6 @@ struct phase_failed { } }; -struct has_special_tokens { - bool operator()(const action::context &ctx) const noexcept { - return ctx.special_cache.count > 0; - } -}; - -struct no_special_tokens { - bool operator()(const action::context &ctx) const noexcept { - return ctx.special_cache.count == 0; - } -}; - struct has_capacity { bool operator()(const action::context &ctx) const noexcept { return ctx.token_count < ctx.token_capacity; diff --git a/src/emel/tokenizer/preprocessor/actions.hpp b/src/emel/tokenizer/preprocessor/actions.hpp index dc76d1b..13c993a 100644 --- a/src/emel/tokenizer/preprocessor/actions.hpp +++ b/src/emel/tokenizer/preprocessor/actions.hpp @@ -3,12 +3,8 @@ #include #include #include -#include -#include - #include "emel/emel.h" -#include "emel/text/unicode.hpp" -#include "emel/tokenizer/bpe/regex.hpp" +#include "emel/tokenizer/bpe/split.hpp" #include "emel/tokenizer/preprocessor/context.hpp" #include "emel/tokenizer/preprocessor/detail.hpp" @@ -24,9 +20,10 @@ inline void clear_request(context & ctx) noexcept { ctx.vocab = nullptr; ctx.text = {}; ctx.parse_special = false; + ctx.preprocessed = false; ctx.fragment_capacity = 0; ctx.fragment_count = 0; - ctx.bpe_words.clear(); + ctx.bpe_scratch.reset(); } struct begin_preprocess { @@ -41,11 +38,15 @@ struct begin_preprocess { ctx.vocab = ev.vocab; ctx.text = ev.text; ctx.parse_special = ev.parse_special; + ctx.preprocessed = false; ctx.fragment_capacity = ev.fragment_capacity; ctx.fragment_count = 
0; ctx.phase_error = EMEL_OK; ctx.last_error = EMEL_OK; - ctx.bpe_words.clear(); + ctx.bpe_scratch.reset(); + if (ev.preprocessed_out != nullptr) { + *ev.preprocessed_out = false; + } } }; @@ -78,7 +79,10 @@ struct partition_non_bpe { ctx.request->fragments_out, ctx.fragment_capacity, &ctx.fragment_count)) { set_error(ctx, EMEL_ERR_INVALID_ARGUMENT); + ctx.preprocessed = false; + return; } + ctx.preprocessed = true; } }; @@ -90,31 +94,23 @@ struct partition_bpe_no_specials { return; } - emel::tokenizer::bpe::detail::assign_bpe_regex(ctx.bpe_pre_id, - ctx.bpe_regex_exprs, - *ctx.vocab); - ctx.bpe_words.clear(); - if (ctx.bpe_words.capacity() < ctx.fragment_capacity) { - ctx.bpe_words.reserve(ctx.fragment_capacity); - } - size_t out_count = 0; - const std::string raw_text(ctx.text); - const auto words = emel::text::unicode_regex_split(raw_text, - ctx.bpe_regex_exprs); - for (const std::string & word : words) { + ctx.bpe_scratch.reset(); + emel::tokenizer::bpe::detail::split_view view = {}; + if (!emel::tokenizer::bpe::detail::split_and_encode_append( + ctx.text, *ctx.vocab, ctx.bpe_scratch, view)) { + ctx.fragment_count = 0; + set_error(ctx, EMEL_ERR_INVALID_ARGUMENT); + return; + } + for (size_t idx = 0; idx < view.count; ++idx) { + const std::string_view word = view.words[idx]; if (word.empty()) { continue; } - if (out_count >= ctx.fragment_capacity) { - ctx.fragment_count = 0; - set_error(ctx, EMEL_ERR_INVALID_ARGUMENT); - return; - } - ctx.bpe_words.push_back(word); if (!detail::push_raw_fragment(ctx.request->fragments_out, ctx.fragment_capacity, out_count, - ctx.bpe_words.back())) { + word)) { ctx.fragment_count = 0; set_error(ctx, EMEL_ERR_INVALID_ARGUMENT); return; @@ -122,6 +118,7 @@ struct partition_bpe_no_specials { } ctx.fragment_count = out_count; + ctx.preprocessed = true; } }; @@ -143,14 +140,7 @@ struct partition_bpe_with_specials { return; } - emel::tokenizer::bpe::detail::assign_bpe_regex(ctx.bpe_pre_id, - ctx.bpe_regex_exprs, - *ctx.vocab); - 
ctx.bpe_words.clear(); - if (ctx.bpe_words.capacity() < ctx.fragment_capacity) { - ctx.bpe_words.reserve(ctx.fragment_capacity); - } - + ctx.bpe_scratch.reset(); size_t out_count = 0; for (size_t idx = 0; idx < partition_count; ++idx) { const fragment & frag = partitions[idx]; @@ -168,23 +158,21 @@ struct partition_bpe_with_specials { if (frag.text.empty()) { continue; } - - const std::string raw_text(frag.text); - const auto words = emel::text::unicode_regex_split(raw_text, - ctx.bpe_regex_exprs); - for (const std::string & word : words) { + emel::tokenizer::bpe::detail::split_view view = {}; + if (!emel::tokenizer::bpe::detail::split_and_encode_append( + frag.text, *ctx.vocab, ctx.bpe_scratch, view)) { + ctx.fragment_count = 0; + set_error(ctx, EMEL_ERR_INVALID_ARGUMENT); + return; + } + for (size_t word_idx = 0; word_idx < view.count; ++word_idx) { + const std::string_view word = view.words[word_idx]; if (word.empty()) { continue; } - if (out_count >= ctx.fragment_capacity) { - ctx.fragment_count = 0; - set_error(ctx, EMEL_ERR_INVALID_ARGUMENT); - return; - } - ctx.bpe_words.push_back(word); if (!detail::push_raw_fragment(ctx.request->fragments_out, ctx.fragment_capacity, out_count, - ctx.bpe_words.back())) { + word)) { ctx.fragment_count = 0; set_error(ctx, EMEL_ERR_INVALID_ARGUMENT); return; @@ -193,6 +181,7 @@ struct partition_bpe_with_specials { } ctx.fragment_count = out_count; + ctx.preprocessed = true; } }; diff --git a/src/emel/tokenizer/preprocessor/bpe/sm.hpp b/src/emel/tokenizer/preprocessor/bpe/sm.hpp index 98cfb8b..caf9b58 100644 --- a/src/emel/tokenizer/preprocessor/bpe/sm.hpp +++ b/src/emel/tokenizer/preprocessor/bpe/sm.hpp @@ -135,6 +135,9 @@ struct sm : public emel::sm { if (ev.fragment_count_out != nullptr) { *ev.fragment_count_out = context_.fragment_count; } + if (ev.preprocessed_out != nullptr) { + *ev.preprocessed_out = context_.preprocessed; + } if (ev.error_out != nullptr) { *ev.error_out = err; } diff --git 
a/src/emel/tokenizer/preprocessor/context.hpp b/src/emel/tokenizer/preprocessor/context.hpp index 11a5e8e..e3086ca 100644 --- a/src/emel/tokenizer/preprocessor/context.hpp +++ b/src/emel/tokenizer/preprocessor/context.hpp @@ -2,13 +2,12 @@ #include #include -#include #include -#include #include "emel/emel.h" #include "emel/tokenizer/preprocessor/events.hpp" #include "emel/tokenizer/preprocessor/types.hpp" +#include "emel/tokenizer/bpe/split.hpp" namespace emel::tokenizer::preprocessor::action { @@ -17,13 +16,11 @@ struct context { const emel::model::data::vocab * vocab = nullptr; std::string_view text = {}; bool parse_special = false; + bool preprocessed = false; size_t fragment_capacity = 0; size_t fragment_count = 0; special_token_cache special_cache = {}; - emel::model::data::tokenizer_pre bpe_pre_id = - emel::model::data::tokenizer_pre::DEFAULT; - std::vector bpe_regex_exprs = {}; - std::vector bpe_words = {}; + emel::tokenizer::bpe::detail::split_scratch bpe_scratch = {}; int32_t phase_error = EMEL_OK; int32_t last_error = EMEL_OK; }; diff --git a/src/emel/tokenizer/preprocessor/events.hpp b/src/emel/tokenizer/preprocessor/events.hpp index ad14bcd..6621052 100644 --- a/src/emel/tokenizer/preprocessor/events.hpp +++ b/src/emel/tokenizer/preprocessor/events.hpp @@ -21,6 +21,7 @@ struct preprocess { fragment * fragments_out = nullptr; size_t fragment_capacity = 0; size_t * fragment_count_out = nullptr; + bool * preprocessed_out = nullptr; int32_t * error_out = nullptr; void * owner_sm = nullptr; bool (*dispatch_done)(void * owner_sm, diff --git a/src/emel/tokenizer/preprocessor/fallback/sm.hpp b/src/emel/tokenizer/preprocessor/fallback/sm.hpp index 53ca390..88bf20b 100644 --- a/src/emel/tokenizer/preprocessor/fallback/sm.hpp +++ b/src/emel/tokenizer/preprocessor/fallback/sm.hpp @@ -108,6 +108,9 @@ struct sm : public emel::sm { if (ev.fragment_count_out != nullptr) { *ev.fragment_count_out = context_.fragment_count; } + if (ev.preprocessed_out != nullptr) { + 
*ev.preprocessed_out = context_.preprocessed; + } if (ev.error_out != nullptr) { *ev.error_out = err; } diff --git a/src/emel/tokenizer/preprocessor/plamo2/sm.hpp b/src/emel/tokenizer/preprocessor/plamo2/sm.hpp index 875d1c4..9d4c4be 100644 --- a/src/emel/tokenizer/preprocessor/plamo2/sm.hpp +++ b/src/emel/tokenizer/preprocessor/plamo2/sm.hpp @@ -107,6 +107,9 @@ struct sm : public emel::sm { if (ev.fragment_count_out != nullptr) { *ev.fragment_count_out = context_.fragment_count; } + if (ev.preprocessed_out != nullptr) { + *ev.preprocessed_out = context_.preprocessed; + } if (ev.error_out != nullptr) { *ev.error_out = err; } diff --git a/src/emel/tokenizer/preprocessor/rwkv/sm.hpp b/src/emel/tokenizer/preprocessor/rwkv/sm.hpp index bd673ab..d6470f7 100644 --- a/src/emel/tokenizer/preprocessor/rwkv/sm.hpp +++ b/src/emel/tokenizer/preprocessor/rwkv/sm.hpp @@ -108,6 +108,9 @@ struct sm : public emel::sm { if (ev.fragment_count_out != nullptr) { *ev.fragment_count_out = context_.fragment_count; } + if (ev.preprocessed_out != nullptr) { + *ev.preprocessed_out = context_.preprocessed; + } if (ev.error_out != nullptr) { *ev.error_out = err; } diff --git a/src/emel/tokenizer/preprocessor/spm/sm.hpp b/src/emel/tokenizer/preprocessor/spm/sm.hpp index 7d5a048..8cd44aa 100644 --- a/src/emel/tokenizer/preprocessor/spm/sm.hpp +++ b/src/emel/tokenizer/preprocessor/spm/sm.hpp @@ -108,6 +108,9 @@ struct sm : public emel::sm { if (ev.fragment_count_out != nullptr) { *ev.fragment_count_out = context_.fragment_count; } + if (ev.preprocessed_out != nullptr) { + *ev.preprocessed_out = context_.preprocessed; + } if (ev.error_out != nullptr) { *ev.error_out = err; } diff --git a/src/emel/tokenizer/preprocessor/ugm/sm.hpp b/src/emel/tokenizer/preprocessor/ugm/sm.hpp index b94a720..b4e831e 100644 --- a/src/emel/tokenizer/preprocessor/ugm/sm.hpp +++ b/src/emel/tokenizer/preprocessor/ugm/sm.hpp @@ -108,6 +108,9 @@ struct sm : public emel::sm { if (ev.fragment_count_out != nullptr) { 
*ev.fragment_count_out = context_.fragment_count; } + if (ev.preprocessed_out != nullptr) { + *ev.preprocessed_out = context_.preprocessed; + } if (ev.error_out != nullptr) { *ev.error_out = err; } diff --git a/src/emel/tokenizer/preprocessor/wpm/sm.hpp b/src/emel/tokenizer/preprocessor/wpm/sm.hpp index 62a01be..85adcd8 100644 --- a/src/emel/tokenizer/preprocessor/wpm/sm.hpp +++ b/src/emel/tokenizer/preprocessor/wpm/sm.hpp @@ -108,6 +108,9 @@ struct sm : public emel::sm { if (ev.fragment_count_out != nullptr) { *ev.fragment_count_out = context_.fragment_count; } + if (ev.preprocessed_out != nullptr) { + *ev.preprocessed_out = context_.preprocessed; + } if (ev.error_out != nullptr) { *ev.error_out = err; } diff --git a/src/emel/tokenizer/sm.hpp b/src/emel/tokenizer/sm.hpp index 59bd113..56ea24a 100644 --- a/src/emel/tokenizer/sm.hpp +++ b/src/emel/tokenizer/sm.hpp @@ -10,14 +10,14 @@ namespace emel::tokenizer { -struct initialized {}; -struct building_special_tokens {}; -struct special_tokens_decision {}; -struct partitioning_raw {}; -struct partitioning_with_specials {}; -struct partitioning_decision {}; -struct selecting_backend {}; -struct selecting_backend_decision {}; +struct uninitialized {}; +struct binding_preprocessor {}; +struct binding_preprocessor_decision {}; +struct binding_encoder {}; +struct binding_encoder_decision {}; +struct idle {}; +struct preprocessing {}; +struct preprocess_decision {}; struct prefix_decision {}; struct encoding_ready {}; struct encoding_token_fragment {}; @@ -38,16 +38,16 @@ scope model-aware encoding. state purpose -- initialized: idle, accepts tokenize requests. -- building_special_tokens: builds token inventory for special-token parsing. -- special_tokens_decision: routes to raw or special partitioning. -- partitioning_raw/partitioning_with_specials: build fragment list. -- selecting_backend: binds encoder context and selects encoder machine. -- prefix_decision: applies optional BOS prefix or errors. 
-- encoding_ready/encoding_*: encodes fragments in a bounded loop. -- suffix_decision: applies optional SEP/EOS suffix or errors. -- finalizing: marks success. -- done: last request completed successfully. + - uninitialized: no bound vocab, awaits bind. + - binding_preprocessor/binding_encoder: bind model-specific preprocess/encode stages. + - idle: ready to tokenize requests. + - preprocessing: dispatch preprocessor to build fragment list. + - preprocess_decision: routes based on preprocess success/failure. + - prefix_decision: applies optional BOS prefix or errors. + - encoding_ready/encoding_*: encodes fragments in a bounded loop. + - suffix_decision: applies optional SEP/EOS suffix or errors. + - finalizing: marks success. + - done: last request completed successfully. - errored: last request failed with an error code. - unexpected: sequencing contract violation. @@ -57,21 +57,21 @@ key invariants - internal progress uses anonymous transitions (no self-dispatch). guard semantics -- can_tokenize: validates request pointers and capacity. -- phase_ok/phase_failed: observe errors set by actions. -- has_special_tokens: indicates whether special-token inventory is available. -- has_capacity: checks remaining output capacity before encoding. -- should_add_bos/sep/eos: determines prefix/suffix requirements. -- has_more_fragments: indicates more fragments to encode. + - can_bind: validates bind request pointers. + - can_tokenize: validates request pointers, capacity, and bound vocab match. + - phase_ok/phase_failed: observe errors set by actions. + - has_capacity: checks remaining output capacity before encoding. + - should_add_bos/sep/eos: determines prefix/suffix requirements. + - has_more_fragments: indicates more fragments to encode. action side effects -- begin_tokenize: resets request outputs and context runtime state. -- build_special_tokens: builds special-token inventory. -- partition_raw/partition_with_specials: builds fragment list, honoring -parse_special. 
-- append_bos/sep/eos: appends prefix/suffix tokens as configured by vocab. -- append_fragment_token/encode_raw_fragment: encode a fragment or append a -literal token. + - begin_bind: stores vocab and resets bind error state. + - bind_preprocessor/bind_encoder: select backend machines for model. + - begin_tokenize: resets request outputs and context runtime state. + - run_preprocess: builds fragment list, honoring parse_special. + - append_bos/sep/eos: appends prefix/suffix tokens as configured by vocab. + - append_fragment_token/encode_raw_fragment: encode a fragment or append a + literal token. - set_capacity_error/set_invalid_id_error: records validation failures. - finalize: marks success. - on_unexpected: reports sequencing violations. @@ -81,55 +81,71 @@ struct model { namespace sml = boost::sml; return sml::make_transition_table( - *sml::state + - sml::event[guard::can_tokenize{}] / - action::begin_tokenize = sml::state, - sml::state + - sml::event / action::reject_invalid = + *sml::state + sml::event[guard::can_bind{}] / + action::begin_bind = sml::state, + sml::state + sml::event / + action::reject_bind = sml::state, + sml::state + sml::event / + action::reject_invalid = sml::state, + + sml::state / action::bind_preprocessor = + sml::state, + sml::state[guard::phase_failed{}] = sml::state, + sml::state[guard::phase_ok{}] = + sml::state, + sml::state / action::bind_encoder = + sml::state, + sml::state[guard::phase_failed{}] = + sml::state, + sml::state[guard::phase_ok{}] = + sml::state, + + sml::state + sml::event[guard::can_bind{}] / + action::begin_bind = sml::state, + sml::state + sml::event / + action::reject_bind = sml::state, + sml::state + sml::event[guard::can_tokenize{}] / + action::begin_tokenize = sml::state, + sml::state + sml::event / + action::reject_invalid = sml::state, + + sml::state + sml::event[guard::can_bind{}] / + action::begin_bind = sml::state, + sml::state + sml::event / + action::reject_bind = sml::state, sml::state + 
sml::event[guard::can_tokenize{}] / - action::begin_tokenize = - sml::state, + action::begin_tokenize = sml::state, sml::state + sml::event / - action::reject_invalid = sml::state, + action::reject_invalid = sml::state, + sml::state + sml::event[guard::can_bind{}] / + action::begin_bind = sml::state, + sml::state + sml::event / + action::reject_bind = sml::state, sml::state + sml::event[guard::can_tokenize{}] / - action::begin_tokenize = sml::state, + action::begin_tokenize = sml::state, sml::state + sml::event / - action::reject_invalid = sml::state, + action::reject_invalid = sml::state, + sml::state + sml::event[guard::can_bind{}] / + action::begin_bind = sml::state, + sml::state + sml::event / + action::reject_bind = sml::state, sml::state + sml::event[guard::can_tokenize{}] / - action::begin_tokenize = sml::state, + action::begin_tokenize = sml::state, sml::state + sml::event / action::reject_invalid = sml::state, - sml::state / action::build_special_tokens = - sml::state, - sml::state[guard::phase_failed{}] = - sml::state, - sml::state[guard::has_special_tokens{}] = - sml::state, - sml::state[guard::no_special_tokens{}] = - sml::state, - - sml::state / action::partition_raw = - sml::state, - sml::state / - action::partition_with_specials = sml::state, - sml::state[guard::phase_failed{}] = - sml::state, - sml::state[guard::phase_ok{}] = - sml::state, - - sml::state / action::select_backend = - sml::state, - sml::state[guard::phase_failed{}] = + sml::state / action::run_preprocess = + sml::state, + sml::state[guard::phase_failed{}] = sml::state, - sml::state[guard::phase_ok{}] = + sml::state[guard::phase_ok{}] = sml::state, sml::state[guard::bos_ready{}] / @@ -176,28 +192,28 @@ struct model { sml::state / action::finalize = sml::state, - sml::state + + sml::state + sml::unexpected_event / action::on_unexpected = sml::state, - sml::state + + sml::state + sml::unexpected_event / action::on_unexpected = sml::state, - sml::state + + sml::state + 
sml::unexpected_event / action::on_unexpected = sml::state, - sml::state + + sml::state + sml::unexpected_event / action::on_unexpected = sml::state, - sml::state + + sml::state + sml::unexpected_event / action::on_unexpected = sml::state, - sml::state + + sml::state + sml::unexpected_event / action::on_unexpected = sml::state, - sml::state + + sml::state + sml::unexpected_event / action::on_unexpected = sml::state, - sml::state + + sml::state + sml::unexpected_event / action::on_unexpected = sml::state, sml::state + @@ -237,6 +253,33 @@ struct sm : public emel::sm { sm() : base_type(context_) {} + bool process_event(const event::bind &ev) { + namespace sml = boost::sml; + + const bool accepted = base_type::process_event(ev); + const bool ok = this->is(sml::state); + const int32_t err = + ok ? EMEL_OK + : (context_.last_error != EMEL_OK ? context_.last_error + : EMEL_ERR_BACKEND); + + if (ev.error_out != nullptr) { + *ev.error_out = err; + } + if (ok) { + if (ev.dispatch_done != nullptr && ev.owner_sm != nullptr) { + ev.dispatch_done(ev.owner_sm, events::tokenizer_bind_done{&ev}); + } + } else { + if (ev.dispatch_error != nullptr && ev.owner_sm != nullptr) { + ev.dispatch_error(ev.owner_sm, events::tokenizer_bind_error{&ev, err}); + } + } + + action::clear_request(context_); + return accepted && ok; + } + bool process_event(const event::tokenize &ev) { namespace sml = boost::sml; diff --git a/tests/encoder/encoder_tests.cpp b/tests/encoder/encoder_tests.cpp index 8361ddd..98433d1 100644 --- a/tests/encoder/encoder_tests.cpp +++ b/tests/encoder/encoder_tests.cpp @@ -197,6 +197,7 @@ TEST_CASE("encoder_bpe_ignore_merges_prefers_full_token") { CHECK(machine.process_event(emel::encoder::event::encode{ .vocab = builder.vocab, .text = "hello", + .preprocessed = true, .token_ids = tokens.data(), .token_capacity = static_cast(tokens.size()), .token_count_out = &token_count, @@ -310,6 +311,7 @@ TEST_CASE("encoder_bpe_merges_ranked_pair") { 
CHECK(machine.process_event(emel::encoder::event::encode{ .vocab = builder.vocab, .text = "he", + .preprocessed = true, .token_ids = tokens.data(), .token_capacity = static_cast(tokens.size()), .token_count_out = &token_count, @@ -336,6 +338,7 @@ TEST_CASE("encoder_bpe_byte_fallback") { CHECK(machine.process_event(emel::encoder::event::encode{ .vocab = builder.vocab, .text = "!", + .preprocessed = true, .token_ids = tokens.data(), .token_capacity = static_cast(tokens.size()), .token_count_out = &token_count, @@ -347,6 +350,42 @@ TEST_CASE("encoder_bpe_byte_fallback") { CHECK(tokens[0] == byte_id); } +TEST_CASE("encoder_bpe_byte_fallback_multibyte_symbols") { + vocab_builder builder{}; + builder.set_model("gpt2"); + builder.set_pre("gpt2"); + const int32_t byte0_id = builder.add_byte_token(static_cast(0)); + const int32_t byte1_id = builder.add_byte_token(static_cast(1)); + + const std::string byte0 = emel::text::unicode_byte_to_utf8(0); + const std::string byte1 = emel::text::unicode_byte_to_utf8(1); + const std::string merge = byte0 + " " + byte1; + builder.add_merge(merge.c_str()); + + const std::string word = byte0 + byte1; + + emel::encoder::bpe::sm machine{}; + + std::array tokens = {}; + int32_t token_count = 0; + int32_t err = EMEL_OK; + + CHECK(machine.process_event(emel::encoder::event::encode{ + .vocab = builder.vocab, + .text = word, + .preprocessed = true, + .token_ids = tokens.data(), + .token_capacity = static_cast(tokens.size()), + .token_count_out = &token_count, + .error_out = &err, + })); + + CHECK(err == EMEL_OK); + CHECK(token_count == 2); + CHECK(tokens[0] == byte0_id); + CHECK(tokens[1] == byte1_id); +} + TEST_CASE("encoder_ugm_normalization_flags") { vocab_builder builder{}; builder.set_model("t5"); @@ -433,6 +472,7 @@ TEST_CASE("encoder_rejects_invalid_input") { CHECK(!machine.process_event(emel::encoder::event::encode{ .vocab = builder.vocab, .text = "hello", + .preprocessed = true, .token_ids = nullptr, .token_capacity = 0, 
.token_count_out = &token_count, @@ -459,6 +499,7 @@ TEST_CASE("encoder_dispatch_callbacks") { CHECK(machine.process_event(emel::encoder::event::encode{ .vocab = builder.vocab, .text = "hello", + .preprocessed = true, .token_ids = tokens.data(), .token_capacity = static_cast(tokens.size()), .token_count_out = &token_count, @@ -622,6 +663,7 @@ TEST_CASE("encoder_unexpected_event_sets_error") { emel::encoder::event::encode request{ .vocab = builder.vocab, .text = "hello", + .preprocessed = true, .token_ids = tokens.data(), .token_capacity = static_cast(tokens.size()), .token_count_out = &token_count, @@ -669,27 +711,6 @@ TEST_CASE("encoder_ensure_tables_populates_state") { CHECK(ctx.ugm_ready); } -TEST_CASE("encoder_assign_bpe_regex_variants") { - vocab_builder builder{}; - builder.set_model("gpt2"); - - emel::encoder::bpe::action::context ctx{}; - ctx.vocab = builder.vocab; - - builder.set_pre("gpt2"); - emel::encoder::bpe::detail::assign_bpe_regex(ctx, *builder.vocab); - CHECK(ctx.bpe_pre_id == emel::model::data::tokenizer_pre::GPT2); - CHECK(!ctx.bpe_regex_exprs.empty()); - - builder.set_pre("llama3"); - emel::encoder::bpe::detail::assign_bpe_regex(ctx, *builder.vocab); - CHECK(ctx.bpe_pre_id == emel::model::data::tokenizer_pre::LLAMA3); - - builder.set_pre("mpt"); - emel::encoder::bpe::detail::assign_bpe_regex(ctx, *builder.vocab); - CHECK(ctx.bpe_pre_id == emel::model::data::tokenizer_pre::MPT); -} - TEST_CASE("encoder_guard_validates_inputs") { vocab_builder builder{}; std::array tokens = {}; @@ -894,9 +915,7 @@ TEST_CASE("encoder_encode_impl_variants") { emel::encoder::bpe::action::context ctx{}; ctx.vocab = builder.vocab; CHECK(emel::encoder::detail::ensure_tables(ctx)); - if (pre != nullptr) { - emel::encoder::bpe::detail::assign_bpe_regex(ctx, *builder.vocab); - } + ev.preprocessed = true; result = emel::encoder::bpe::detail::encode_bpe(ev, ctx, *builder.vocab); break; } @@ -1008,16 +1027,18 @@ TEST_CASE("encoder_detail_encode_direct_calls") { 
emel::encoder::bpe::action::context ctx{}; ctx.vocab = builder.vocab; CHECK(emel::encoder::detail::ensure_tables(ctx)); - emel::encoder::bpe::detail::assign_bpe_regex(ctx, *builder.vocab); emel::encoder::event::encode ev_plain = ev; + ev_plain.preprocessed = true; auto result = emel::encoder::bpe::detail::encode_bpe(ev_plain, ctx, *builder.vocab); (void)result; emel::encoder::event::encode ev_punct = ev; ev_punct.text = "hello, world!"; + ev_punct.preprocessed = true; auto result_punct = emel::encoder::bpe::detail::encode_bpe(ev_punct, ctx, *builder.vocab); (void)result_punct; emel::encoder::event::encode ev_empty = ev; ev_empty.text = ""; + ev_empty.preprocessed = true; auto result_empty = emel::encoder::bpe::detail::encode_bpe(ev_empty, ctx, *builder.vocab); (void)result_empty; } @@ -1159,7 +1180,6 @@ TEST_CASE("encoder_detail_branch_coverage") { CHECK(builder.vocab->tokenizer_model_id == emel::model::data::tokenizer_model::UNKNOWN); builder.set_pre(""); - emel::encoder::bpe::detail::assign_bpe_regex(ctx, *builder.vocab); } TEST_CASE("encoder_detail_merge_and_token_helpers") { @@ -1356,6 +1376,7 @@ TEST_CASE("encoder_detail_bpe_merge_and_errors") { int32_t err = EMEL_OK; emel::encoder::event::encode ev{ .text = "he", + .preprocessed = true, .token_ids = tokens.data(), .token_capacity = static_cast(tokens.size()), .token_count_out = &token_count, @@ -1374,6 +1395,7 @@ TEST_CASE("encoder_detail_bpe_merge_and_errors") { emel::encoder::event::encode ev_fail{ .text = "he", + .preprocessed = true, .token_ids = nullptr, .token_capacity = 0, .token_count_out = &token_count, @@ -1459,6 +1481,121 @@ TEST_CASE("encoder_detail_spm_add_space_prefix") { CHECK(result.token_count >= 1); } +TEST_CASE("encoder_detail_spm_prefix_after_leading_spaces") { + vocab_builder builder{}; + builder.set_model("llama"); + builder.add_token("\xE2\x96\x81", 0.1f, 1); + builder.add_all_plamo2_byte_tokens(); + builder.add_token("h", 0.1f, 1); + builder.add_token("i", 0.1f, 1); + 
builder.add_token(" ", 0.1f, 1); + builder.vocab->add_space_prefix = true; + + emel::encoder::action::context ctx{}; + ctx.vocab = builder.vocab; + + std::array tokens = {}; + int32_t token_count = 0; + int32_t err = EMEL_OK; + emel::encoder::event::encode ev{ + .text = " hi", + .token_ids = tokens.data(), + .token_capacity = static_cast(tokens.size()), + .token_count_out = &token_count, + .error_out = &err, + }; + + const auto result = emel::encoder::spm::detail::encode_spm(ev, ctx, *builder.vocab); + CHECK(result.error == EMEL_OK); + CHECK(result.token_count >= 1); +} + +TEST_CASE("encoder_detail_spm_unescaped_spaces") { + vocab_builder builder{}; + builder.set_model("llama"); + builder.vocab->add_space_prefix = true; + builder.vocab->escape_whitespaces = false; + builder.add_token(" ", 0.1f, 1); + builder.add_token("h", 0.1f, 1); + builder.add_token("i", 0.1f, 1); + + emel::encoder::action::context ctx{}; + ctx.vocab = builder.vocab; + + std::array tokens = {}; + int32_t token_count = 0; + int32_t err = EMEL_OK; + emel::encoder::event::encode ev{ + .text = "h i", + .token_ids = tokens.data(), + .token_capacity = static_cast(tokens.size()), + .token_count_out = &token_count, + .error_out = &err, + }; + + const auto result = emel::encoder::spm::detail::encode_spm(ev, ctx, *builder.vocab); + CHECK(result.error == EMEL_OK); + CHECK(result.token_count >= 1); +} + +TEST_CASE("encoder_detail_spm_suffix_escape_spaces") { + vocab_builder builder{}; + builder.set_model("llama"); + builder.add_token("\xE2\x96\x81", 0.1f, 1); + builder.add_all_plamo2_byte_tokens(); + builder.add_token("h", 0.1f, 1); + builder.add_token("i", 0.1f, 1); + builder.vocab->add_space_prefix = true; + builder.vocab->treat_whitespace_as_suffix = true; + + emel::encoder::action::context ctx{}; + ctx.vocab = builder.vocab; + + std::array tokens = {}; + int32_t token_count = 0; + int32_t err = EMEL_OK; + emel::encoder::event::encode ev{ + .text = "hi", + .token_ids = tokens.data(), + .token_capacity = 
static_cast(tokens.size()), + .token_count_out = &token_count, + .error_out = &err, + }; + + const auto result = emel::encoder::spm::detail::encode_spm(ev, ctx, *builder.vocab); + CHECK(result.error == EMEL_OK); + CHECK(result.token_count >= 1); +} + +TEST_CASE("encoder_detail_spm_suffix_unescaped_space") { + vocab_builder builder{}; + builder.set_model("llama"); + builder.vocab->add_space_prefix = true; + builder.vocab->treat_whitespace_as_suffix = true; + builder.vocab->escape_whitespaces = false; + builder.add_token(" ", 0.1f, 1); + builder.add_token("h", 0.1f, 1); + builder.add_token("i", 0.1f, 1); + + emel::encoder::action::context ctx{}; + ctx.vocab = builder.vocab; + + std::array tokens = {}; + int32_t token_count = 0; + int32_t err = EMEL_OK; + emel::encoder::event::encode ev{ + .text = "hi", + .token_ids = tokens.data(), + .token_capacity = static_cast(tokens.size()), + .token_count_out = &token_count, + .error_out = &err, + }; + + const auto result = emel::encoder::spm::detail::encode_spm(ev, ctx, *builder.vocab); + CHECK(result.error == EMEL_OK); + CHECK(result.token_count >= 1); +} + TEST_CASE("encoder_detail_bpe_buffer_overflow") { vocab_builder builder{}; builder.set_model("gpt2"); @@ -1475,6 +1612,7 @@ TEST_CASE("encoder_detail_bpe_buffer_overflow") { int32_t err = EMEL_OK; emel::encoder::event::encode ev{ .text = text, + .preprocessed = true, .token_ids = tokens.data(), .token_capacity = static_cast(tokens.size()), .token_count_out = &token_count, @@ -1560,130 +1698,6 @@ TEST_CASE("encoder_detail_normalize_ugm_into_paths") { CHECK(!trimmed.empty()); } -TEST_CASE("encoder_assign_bpe_regex_variants_extended") { - vocab_builder builder{}; - builder.set_model("gpt2"); - - const std::array presets = {{ - "llama3", - "dbrx", - "smaug", - "deepseek-llm", - "deepseek3-llm", - "hunyuan-dense", - "youtu", - "deepseek-coder", - "falcon", - "starcoder", - "refact", - "command-r", - "smollm", - "codeshell", - "exaone", - "minerva", - "gpt2", - "mpt", - "olmo", - 
"jais", - "trillion", - "granite-docling", - "stablelm2", - "qwen2", - "hunyuan", - "solar-open", - "qwen35", - "poro", - "bloom", - "gpt3-finnish", - "chatglm4", - "viking", - "tekken", - "chameleon", - "gpt4o", - "minimax-m2", - "kimi-k2", - "superbpe", - "bailingmoe", - "seed-coder", - "grok-2", - "afmoe", - }}; - - for (const auto pre : presets) { - builder.set_pre(pre); - emel::encoder::bpe::action::context ctx{}; - ctx.vocab = builder.vocab; - emel::encoder::bpe::detail::assign_bpe_regex(ctx, *builder.vocab); - CHECK(ctx.bpe_pre_id == builder.vocab->tokenizer_pre_id); - CHECK(!ctx.bpe_regex_exprs.empty()); - } -} - -TEST_CASE("encoder_assign_bpe_regex_enum_cases") { - using tokenizer_pre = emel::model::data::tokenizer_pre; - vocab_builder builder{}; - builder.set_model("gpt2"); - - const std::array presets = {{ - tokenizer_pre::DEFAULT, - tokenizer_pre::LLAMA3, - tokenizer_pre::JAIS2, - tokenizer_pre::DBRX, - tokenizer_pre::SMAUG, - tokenizer_pre::DEEPSEEK_LLM, - tokenizer_pre::DEEPSEEK3_LLM, - tokenizer_pre::HUNYUAN_DENSE, - tokenizer_pre::JOYAI_LLM, - tokenizer_pre::YOUTU, - tokenizer_pre::DEEPSEEK_CODER, - tokenizer_pre::FALCON, - tokenizer_pre::STARCODER, - tokenizer_pre::REFACT, - tokenizer_pre::COMMAND_R, - tokenizer_pre::SMOLLM, - tokenizer_pre::CODESHELL, - tokenizer_pre::EXAONE, - tokenizer_pre::MINERVA, - tokenizer_pre::GPT2, - tokenizer_pre::MPT, - tokenizer_pre::OLMO, - tokenizer_pre::JAIS, - tokenizer_pre::TRILLION, - tokenizer_pre::GRANITE_DOCLING, - tokenizer_pre::QWEN35, - tokenizer_pre::STABLELM2, - tokenizer_pre::QWEN2, - tokenizer_pre::HUNYUAN, - tokenizer_pre::SOLAR_OPEN, - tokenizer_pre::PORO, - tokenizer_pre::BLOOM, - tokenizer_pre::GPT3_FINNISH, - tokenizer_pre::CHATGLM4, - tokenizer_pre::VIKING, - tokenizer_pre::TEKKEN, - tokenizer_pre::CHAMELEON, - tokenizer_pre::GPT4O, - tokenizer_pre::MINIMAX_M2, - tokenizer_pre::TINY_AYA, - tokenizer_pre::KIMI_K2, - tokenizer_pre::SUPERBPE, - tokenizer_pre::BAILINGMOE, - tokenizer_pre::SEED_CODER, 
- tokenizer_pre::GROK_2, - tokenizer_pre::AFMOE, - tokenizer_pre::EXAONE_MOE, - }}; - - for (const auto pre : presets) { - builder.vocab->tokenizer_pre_id = pre; - emel::encoder::bpe::action::context ctx{}; - ctx.vocab = builder.vocab; - emel::encoder::bpe::detail::assign_bpe_regex(ctx, *builder.vocab); - CHECK(ctx.bpe_pre_id == pre); - CHECK(!ctx.bpe_regex_exprs.empty()); - } -} - TEST_CASE("encoder_detail_rwkv_unescape_branches") { std::string out; CHECK(emel::encoder::rwkv::detail::unescape_rwkv_token("plain", out)); @@ -1988,6 +2002,7 @@ TEST_CASE("encoder_detail_bpe_byte_push_overflow") { int32_t err = EMEL_OK; emel::encoder::event::encode ev{ .text = "ab", + .preprocessed = true, .token_ids = out_tokens.data(), .token_capacity = 0, .token_count_out = &token_count, @@ -2259,7 +2274,7 @@ TEST_CASE("encoder_encode_branch_cases") { emel::encoder::bpe::action::context ctx{}; ctx.vocab = builder.vocab; CHECK(emel::encoder::detail::ensure_tables(ctx)); - ctx.bpe_regex_exprs.clear(); + ev.preprocessed = true; auto result = emel::encoder::bpe::detail::encode_bpe(ev, ctx, *builder.vocab); (void)result; } @@ -2356,6 +2371,7 @@ TEST_CASE("encoder_action_guard_wrapper_coverage") { return emel::encoder::event::encode{ .vocab = vocab, .text = text, + .preprocessed = true, .token_ids = tokens.data(), .token_capacity = capacity, .token_count_out = &token_count, @@ -2367,6 +2383,7 @@ TEST_CASE("encoder_action_guard_wrapper_coverage") { return emel::encoder::event::encode{ .vocab = vocab, .text = "x", + .preprocessed = true, .token_ids = nullptr, .token_capacity = 0, .token_count_out = &token_count, diff --git a/tests/gbnf/parser_tests.cpp b/tests/gbnf/parser_tests.cpp index fa06689..28b6f5b 100644 --- a/tests/gbnf/parser_tests.cpp +++ b/tests/gbnf/parser_tests.cpp @@ -117,6 +117,21 @@ TEST_CASE("gbnf_detail_parser_rejects_large_repetitions") { CHECK(ctx.phase_error == EMEL_ERR_PARSE_FAILED); } +TEST_CASE("gbnf_detail_parser_rejects_deeply_nested_groups") { + 
emel::gbnf::parser::action::context ctx{}; + emel::gbnf::grammar grammar_out{}; + emel::gbnf::parser::detail::recursive_descent_parser parser{ctx, &grammar_out}; + ctx.phase_error = EMEL_OK; + const uint32_t nesting = + emel::gbnf::parser::detail::recursive_descent_parser::k_max_nesting_depth + 4; + std::string grammar = "root ::= "; + grammar.append(nesting, '('); + grammar += "\"a\""; + grammar.append(nesting, ')'); + CHECK_FALSE(parser.parse(grammar)); + CHECK(ctx.phase_error == EMEL_ERR_PARSE_FAILED); +} + TEST_CASE("gbnf_parser_guards_and_actions_cover_branches") { emel::gbnf::parser::action::context ctx{}; emel::gbnf::grammar grammar{}; diff --git a/tests/models/Llama-68M-Chat-v1-Q2_K.gguf b/tests/models/Llama-68M-Chat-v1-Q2_K.gguf new file mode 100644 index 0000000..49aa0ba --- /dev/null +++ b/tests/models/Llama-68M-Chat-v1-Q2_K.gguf @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8ed06dc5bd84bce3154a2b7e751c45a56562691933ee25b5823393f909329a67 +size 35877760 diff --git a/tests/models/README.md b/tests/models/README.md new file mode 100644 index 0000000..5ffc80f --- /dev/null +++ b/tests/models/README.md @@ -0,0 +1,41 @@ +# Test Models + +## Llama-68M-Chat-v1-Q2_K.gguf +- Source: `https://huggingface.co/tensorblock/Llama-68M-Chat-v1-GGUF` +- File: `Llama-68M-Chat-v1-Q2_K.gguf` +- License: Apache-2.0 +- Size: 34 MB +- SHA256: `8ed06dc5bd84bce3154a2b7e751c45a56562691933ee25b5823393f909329a67` +- Download URL: `https://huggingface.co/tensorblock/Llama-68M-Chat-v1-GGUF/resolve/main/Llama-68M-Chat-v1-Q2_K.gguf` + +## distilgpt2.Q2_K.gguf +- Source: `https://huggingface.co/RichardErkhov/distilbert_-_distilgpt2-gguf` +- File: `distilgpt2.Q2_K.gguf` +- License: Apache-2.0 +- Size: 63.6 MB +- SHA256: `b046ac09ba24a848e2140676fba58c1dcf2f19617e45b03524043eabdb556a31` +- Download URL: `https://huggingface.co/RichardErkhov/distilbert_-_distilgpt2-gguf/resolve/main/distilgpt2.Q2_K.gguf` + +## bert-base-uncased-q4_k_m.gguf +- Source: 
`https://huggingface.co/Talek02/bert-base-uncased-Q4_K_M-GGUF` +- File: `bert-base-uncased-q4_k_m.gguf` +- License: Apache-2.0 +- Size: 74.3 MB +- SHA256: `48c02c00843964c2e1675e6d6aebfbdb03d4ca330d65a6b9695eee6f160109b0` +- Download URL: `https://huggingface.co/Talek02/bert-base-uncased-Q4_K_M-GGUF/resolve/main/bert-base-uncased-q4_k_m.gguf` + +## flan-t5-small.Q2_K.gguf +- Source: `https://huggingface.co/Felladrin/gguf-flan-t5-small` +- File: `flan-t5-small.Q2_K.gguf` +- License: Apache-2.0 +- Size: 83.8 MB +- SHA256: `a67f632d17d2bdb819071c9b4d51e26a191f75c7f725861ee22e23e1d903dc57` +- Download URL: `https://huggingface.co/Felladrin/gguf-flan-t5-small/resolve/main/flan-t5-small.Q2_K.gguf` + +## rwkv7-0.1B-g1-F16.gguf +- Source: `https://huggingface.co/zhiyuan8/RWKV-v7-0.1B-G1-GGUF` +- File: `rwkv7-0.1B-g1-F16.gguf` +- License: Apache-2.0 +- Size: 386 MB +- SHA256: `fea5c54f3fd2370ac90ae58f2ecd6cbe57c31df023598aed4c95b0966170f9c8` +- Download URL: `https://huggingface.co/zhiyuan8/RWKV-v7-0.1B-G1-GGUF/resolve/main/rwkv7-0.1B-g1-F16.gguf` diff --git a/tests/models/bert-base-uncased-q4_k_m.gguf b/tests/models/bert-base-uncased-q4_k_m.gguf new file mode 100644 index 0000000..697469c --- /dev/null +++ b/tests/models/bert-base-uncased-q4_k_m.gguf @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:48c02c00843964c2e1675e6d6aebfbdb03d4ca330d65a6b9695eee6f160109b0 +size 74269888 diff --git a/tests/models/distilgpt2.Q2_K.gguf b/tests/models/distilgpt2.Q2_K.gguf new file mode 100644 index 0000000..1abb521 --- /dev/null +++ b/tests/models/distilgpt2.Q2_K.gguf @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b046ac09ba24a848e2140676fba58c1dcf2f19617e45b03524043eabdb556a31 +size 63648128 diff --git a/tests/models/flan-t5-small.Q2_K.gguf b/tests/models/flan-t5-small.Q2_K.gguf new file mode 100644 index 0000000..2b6acef --- /dev/null +++ b/tests/models/flan-t5-small.Q2_K.gguf @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 
+oid sha256:a67f632d17d2bdb819071c9b4d51e26a191f75c7f725861ee22e23e1d903dc57 +size 83804928 diff --git a/tests/models/rwkv7-0.1B-g1-F16.gguf b/tests/models/rwkv7-0.1B-g1-F16.gguf new file mode 100644 index 0000000..1059921 --- /dev/null +++ b/tests/models/rwkv7-0.1B-g1-F16.gguf @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fea5c54f3fd2370ac90ae58f2ecd6cbe57c31df023598aed4c95b0966170f9c8 +size 386371616 diff --git a/tests/tokenizer/bpe_regex_tests.cpp b/tests/tokenizer/bpe_regex_tests.cpp new file mode 100644 index 0000000..809eac9 --- /dev/null +++ b/tests/tokenizer/bpe_regex_tests.cpp @@ -0,0 +1,79 @@ +#include + +#include + +#include "emel/model/data.hpp" +#include "emel/tokenizer/bpe/regex.hpp" + +TEST_CASE("tokenizer_bpe_regex_for_vocab") { + emel::model::data::vocab vocab = {}; + vocab.tokenizer_pre_id = emel::model::data::tokenizer_pre::GPT2; + + const auto from_vocab = emel::tokenizer::bpe::detail::regex_for(vocab); + const auto from_pre = emel::tokenizer::bpe::detail::regex_for( + emel::model::data::tokenizer_pre::GPT2); + + CHECK(from_vocab.count == from_pre.count); + for (size_t idx = 0; idx < from_vocab.count; ++idx) { + CHECK(from_vocab.exprs[idx] == from_pre.exprs[idx]); + } +} + +TEST_CASE("tokenizer_bpe_regex_for_presets") { + using tokenizer_pre = emel::model::data::tokenizer_pre; + const std::array presets = {{ + tokenizer_pre::DEFAULT, + tokenizer_pre::LLAMA3, + tokenizer_pre::JAIS2, + tokenizer_pre::DBRX, + tokenizer_pre::SMAUG, + tokenizer_pre::DEEPSEEK_LLM, + tokenizer_pre::DEEPSEEK3_LLM, + tokenizer_pre::HUNYUAN_DENSE, + tokenizer_pre::JOYAI_LLM, + tokenizer_pre::YOUTU, + tokenizer_pre::DEEPSEEK_CODER, + tokenizer_pre::FALCON, + tokenizer_pre::STARCODER, + tokenizer_pre::REFACT, + tokenizer_pre::COMMAND_R, + tokenizer_pre::SMOLLM, + tokenizer_pre::CODESHELL, + tokenizer_pre::EXAONE, + tokenizer_pre::MINERVA, + tokenizer_pre::GPT2, + tokenizer_pre::MPT, + tokenizer_pre::OLMO, + tokenizer_pre::JAIS, + 
tokenizer_pre::TRILLION, + tokenizer_pre::GRANITE_DOCLING, + tokenizer_pre::QWEN35, + tokenizer_pre::STABLELM2, + tokenizer_pre::QWEN2, + tokenizer_pre::HUNYUAN, + tokenizer_pre::SOLAR_OPEN, + tokenizer_pre::PORO, + tokenizer_pre::BLOOM, + tokenizer_pre::GPT3_FINNISH, + tokenizer_pre::CHATGLM4, + tokenizer_pre::VIKING, + tokenizer_pre::TEKKEN, + tokenizer_pre::CHAMELEON, + tokenizer_pre::GPT4O, + tokenizer_pre::MINIMAX_M2, + tokenizer_pre::TINY_AYA, + tokenizer_pre::KIMI_K2, + tokenizer_pre::SUPERBPE, + tokenizer_pre::BAILINGMOE, + tokenizer_pre::SEED_CODER, + tokenizer_pre::GROK_2, + tokenizer_pre::AFMOE, + tokenizer_pre::EXAONE_MOE, + }}; + + for (const auto pre : presets) { + const auto list = emel::tokenizer::bpe::detail::regex_for(pre); + CHECK(list.count > 0); + CHECK(!list.exprs[0].empty()); + } +} diff --git a/tests/tokenizer/bpe_split_tests.cpp b/tests/tokenizer/bpe_split_tests.cpp new file mode 100644 index 0000000..70e9378 --- /dev/null +++ b/tests/tokenizer/bpe_split_tests.cpp @@ -0,0 +1,209 @@ +#include +#include + +#include + +#include "emel/model/data.hpp" +#include "emel/tokenizer/bpe/split.hpp" + +namespace { + +emel::model::data::vocab make_vocab( + const emel::model::data::tokenizer_pre pre) { + emel::model::data::vocab vocab = {}; + vocab.tokenizer_pre_id = pre; + return vocab; +} + +} // namespace + +TEST_CASE("tokenizer_bpe_split_empty") { + auto vocab = make_vocab(emel::model::data::tokenizer_pre::GPT2); + emel::tokenizer::bpe::detail::split_scratch scratch = {}; + emel::tokenizer::bpe::detail::split_view view = {}; + + const bool ok = emel::tokenizer::bpe::detail::split_and_encode_append( + std::string_view{}, vocab, scratch, view); + CHECK(ok); + CHECK(view.count == 0); +} + +TEST_CASE("tokenizer_bpe_split_gpt2_basic") { + auto vocab = make_vocab(emel::model::data::tokenizer_pre::GPT2); + emel::tokenizer::bpe::detail::split_scratch scratch = {}; + emel::tokenizer::bpe::detail::split_view view = {}; + + const bool ok = 
emel::tokenizer::bpe::detail::split_and_encode_append( + "hello world", vocab, scratch, view); + CHECK(ok); + CHECK(view.count == 2); + CHECK(view.words[0] == std::string_view("hello")); + const char encoded_word[] = "\xC4\xA0" "world"; + CHECK(view.words[1] == std::string_view(encoded_word, sizeof(encoded_word) - 1)); +} + +TEST_CASE("tokenizer_bpe_split_gpt2_branches") { + auto vocab = make_vocab(emel::model::data::tokenizer_pre::GPT2); + emel::tokenizer::bpe::detail::split_scratch scratch = {}; + emel::tokenizer::bpe::detail::split_view view = {}; + + const std::string text = "I'm 123!! café\n"; + const bool ok = emel::tokenizer::bpe::detail::split_and_encode_append( + text, vocab, scratch, view); + CHECK(ok); + CHECK(view.count > 0); +} + +TEST_CASE("tokenizer_bpe_split_llama3_branches") { + auto vocab = make_vocab(emel::model::data::tokenizer_pre::LLAMA3); + emel::tokenizer::bpe::detail::split_scratch scratch = {}; + emel::tokenizer::bpe::detail::split_view view = {}; + + const std::string text = "Hello\nWORLD 1234 99"; + const bool ok = emel::tokenizer::bpe::detail::split_and_encode_append( + text, vocab, scratch, view); + CHECK(ok); + CHECK(view.count > 0); +} + +TEST_CASE("tokenizer_bpe_split_fallback_multi_regex") { + auto vocab = make_vocab(emel::model::data::tokenizer_pre::FALCON); + emel::tokenizer::bpe::detail::split_scratch scratch = {}; + emel::tokenizer::bpe::detail::split_view view = {}; + + const bool ok = emel::tokenizer::bpe::detail::split_and_encode_append( + "hello!!! 
123", vocab, scratch, view); + CHECK(ok); + CHECK(view.count > 0); +} + +TEST_CASE("tokenizer_bpe_split_fallback_single_regex") { + auto vocab = make_vocab(emel::model::data::tokenizer_pre::PORO); + emel::tokenizer::bpe::detail::split_scratch scratch = {}; + emel::tokenizer::bpe::detail::split_view view = {}; + + const bool ok = emel::tokenizer::bpe::detail::split_and_encode_append( + "hello world", vocab, scratch, view); + CHECK(ok); + CHECK(view.count > 0); +} + +TEST_CASE("tokenizer_bpe_split_accepts_long_text") { + auto vocab = make_vocab(emel::model::data::tokenizer_pre::GPT2); + emel::tokenizer::bpe::detail::split_scratch scratch = {}; + emel::tokenizer::bpe::detail::split_view view = {}; + + const std::string text( + emel::tokenizer::bpe::detail::k_max_bpe_bytes + 1, 'a'); + const bool ok = emel::tokenizer::bpe::detail::split_and_encode_append( + text, vocab, scratch, view); + CHECK_FALSE(ok); + CHECK(view.count == 0); +} + +TEST_CASE("tokenizer_bpe_encode_utf8_branches") { + char out[4] = {}; + CHECK(emel::tokenizer::bpe::detail::encode_utf8(0x41u, nullptr, 0) == 0); + CHECK(emel::tokenizer::bpe::detail::encode_utf8(0x41u, out, 0) == 0); + CHECK(emel::tokenizer::bpe::detail::encode_utf8(0x41u, out, 1) == 1); + CHECK(emel::tokenizer::bpe::detail::encode_utf8(0x7FFu, out, 1) == 0); + CHECK(emel::tokenizer::bpe::detail::encode_utf8(0x7FFu, out, 2) == 2); + CHECK(emel::tokenizer::bpe::detail::encode_utf8(0xFFFFu, out, 2) == 0); + CHECK(emel::tokenizer::bpe::detail::encode_utf8(0xFFFFu, out, 3) == 3); + CHECK(emel::tokenizer::bpe::detail::encode_utf8(0x10FFFFu, out, 3) == 0); + CHECK(emel::tokenizer::bpe::detail::encode_utf8(0x10FFFFu, out, 4) == 4); + CHECK(emel::tokenizer::bpe::detail::encode_utf8(0x110000u, out, 4) == 0); +} + +TEST_CASE("tokenizer_bpe_decode_utf8_to_cpts_branches") { + emel::tokenizer::bpe::detail::split_scratch scratch = {}; + CHECK(emel::tokenizer::bpe::detail::decode_utf8_to_cpts("A", scratch)); + CHECK(scratch.cpt_count == 1); + + const 
std::string two = "\xC3\xA9"; + CHECK(emel::tokenizer::bpe::detail::decode_utf8_to_cpts(two, scratch)); + CHECK(scratch.cpt_count == 1); + + const std::string three = "\xE2\x82\xAC"; + CHECK(emel::tokenizer::bpe::detail::decode_utf8_to_cpts(three, scratch)); + CHECK(scratch.cpt_count == 1); + + const std::string four = "\xF0\x9F\x98\x80"; + CHECK(emel::tokenizer::bpe::detail::decode_utf8_to_cpts(four, scratch)); + CHECK(scratch.cpt_count == 1); +} + +TEST_CASE("tokenizer_bpe_push_offset_branches") { + size_t out[2] = {}; + size_t out_count = 0; + CHECK(emel::tokenizer::bpe::detail::push_offset(0, out, 2, out_count)); + CHECK(out_count == 0); + CHECK(emel::tokenizer::bpe::detail::push_offset(1, out, 1, out_count)); + CHECK(out_count == 1); + CHECK_FALSE(emel::tokenizer::bpe::detail::push_offset(1, out, 1, out_count)); +} + +TEST_CASE("tokenizer_bpe_split_gpt2_error_paths") { + std::array cpts = {{'\'', 's'}}; + size_t offsets_in_bad[1] = {3}; + size_t offsets_out[4] = {}; + size_t out_count = 0; + CHECK_FALSE(emel::tokenizer::bpe::detail::split_gpt2( + cpts.data(), cpts.size(), offsets_in_bad, 1, offsets_out, 4, out_count)); + + size_t offsets_in_ok[1] = {2}; + CHECK_FALSE(emel::tokenizer::bpe::detail::split_gpt2( + cpts.data(), cpts.size(), offsets_in_ok, 1, offsets_out, 0, out_count)); + + std::array cpts_re = {{'\'', 'r', 'e'}}; + size_t offsets_in_re[1] = {3}; + CHECK_FALSE(emel::tokenizer::bpe::detail::split_gpt2( + cpts_re.data(), cpts_re.size(), offsets_in_re, 1, offsets_out, 0, out_count)); +} + +TEST_CASE("tokenizer_bpe_split_llama3_error_paths") { + std::array cpts = {{'\'', 'R', 'E'}}; + size_t offsets_in_bad[1] = {4}; + size_t offsets_out[4] = {}; + size_t out_count = 0; + CHECK_FALSE(emel::tokenizer::bpe::detail::split_llama3( + cpts.data(), cpts.size(), offsets_in_bad, 1, offsets_out, 4, out_count)); + + size_t offsets_in_ok[1] = {3}; + CHECK_FALSE(emel::tokenizer::bpe::detail::split_llama3( + cpts.data(), cpts.size(), offsets_in_ok, 1, offsets_out, 0, 
out_count)); + + std::array punct = {{'!'}}; + size_t offsets_in_punct[1] = {1}; + CHECK_FALSE(emel::tokenizer::bpe::detail::split_llama3( + punct.data(), punct.size(), offsets_in_punct, 1, offsets_out, 0, out_count)); +} + +TEST_CASE("tokenizer_bpe_encode_bpe_segment_errors") { + emel::tokenizer::bpe::detail::split_scratch scratch = {}; + std::array bad_cpt = {{0x110000u}}; + CHECK_FALSE(emel::tokenizer::bpe::detail::encode_bpe_segment( + bad_cpt.data(), 0, bad_cpt.size(), scratch)); + + scratch.reset(); + scratch.encoded_size = scratch.encoded.size(); + std::array ok_cpt = {{'a'}}; + CHECK_FALSE(emel::tokenizer::bpe::detail::encode_bpe_segment( + ok_cpt.data(), 0, ok_cpt.size(), scratch)); + + scratch.reset(); + scratch.word_count = scratch.words.size(); + CHECK_FALSE(emel::tokenizer::bpe::detail::encode_bpe_segment( + ok_cpt.data(), 0, ok_cpt.size(), scratch)); +} + +TEST_CASE("tokenizer_bpe_split_fallback_overflow") { + emel::tokenizer::bpe::detail::regex_list regex = {}; + regex.exprs[0] = "\\p{L}+"; + regex.count = 1; + emel::tokenizer::bpe::detail::split_scratch scratch = {}; + scratch.encoded_size = scratch.encoded.size(); + + CHECK_FALSE(emel::tokenizer::bpe::detail::split_and_encode_fallback( + "hello", regex, scratch)); +} diff --git a/tests/tokenizer/parity_texts/basic.txt b/tests/tokenizer/parity_texts/basic.txt new file mode 100644 index 0000000..3b18e51 --- /dev/null +++ b/tests/tokenizer/parity_texts/basic.txt @@ -0,0 +1 @@ +hello world diff --git a/tests/tokenizer/parity_texts/long.txt b/tests/tokenizer/parity_texts/long.txt new file mode 100644 index 0000000..263c271 --- /dev/null +++ b/tests/tokenizer/parity_texts/long.txt @@ -0,0 +1 @@ 
+aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa diff --git a/tests/tokenizer/parity_texts/special_bert.txt b/tests/tokenizer/parity_texts/special_bert.txt new file mode 100644 index 0000000..72eaa52 --- /dev/null +++ b/tests/tokenizer/parity_texts/special_bert.txt @@ -0,0 +1 @@ +[CLS] hello [SEP] diff --git a/tests/tokenizer/parity_texts/special_gpt2.txt b/tests/tokenizer/parity_texts/special_gpt2.txt new file mode 100644 index 0000000..ae87475 --- /dev/null +++ b/tests/tokenizer/parity_texts/special_gpt2.txt @@ -0,0 +1 @@ +<|endoftext|> hello diff --git a/tests/tokenizer/parity_texts/special_llama.txt b/tests/tokenizer/parity_texts/special_llama.txt new file mode 100644 index 0000000..205d9f7 --- /dev/null +++ b/tests/tokenizer/parity_texts/special_llama.txt @@ -0,0 +1 @@ + hello diff --git a/tests/tokenizer/parity_texts/special_rwkv.txt b/tests/tokenizer/parity_texts/special_rwkv.txt new file mode 100644 index 0000000..b648ed1 --- /dev/null +++ b/tests/tokenizer/parity_texts/special_rwkv.txt @@ -0,0 +1 @@ + hello diff --git a/tests/tokenizer/parity_texts/special_t5.txt b/tests/tokenizer/parity_texts/special_t5.txt new file mode 100644 index 0000000..e0defaa --- /dev/null +++ b/tests/tokenizer/parity_texts/special_t5.txt @@ -0,0 +1 @@ + hello diff --git a/tests/tokenizer/parity_texts/unicode.txt b/tests/tokenizer/parity_texts/unicode.txt new file mode 100644 index 0000000..7239763 --- /dev/null +++ b/tests/tokenizer/parity_texts/unicode.txt @@ -0,0 +1 @@ +café naïve こんにちは世界 😀 diff --git a/tests/tokenizer/parity_texts/whitespace.txt b/tests/tokenizer/parity_texts/whitespace.txt new file mode 100644 index 0000000..9d5584c --- /dev/null +++ b/tests/tokenizer/parity_texts/whitespace.txt @@ -0,0 +1,5 @@ + leading space +middle with tabs +trailing space + +multiple spaces diff --git a/tests/tokenizer/preprocessor_tests.cpp b/tests/tokenizer/preprocessor_tests.cpp index 10419ac..96a594a 100644 --- a/tests/tokenizer/preprocessor_tests.cpp +++ 
b/tests/tokenizer/preprocessor_tests.cpp @@ -349,6 +349,45 @@ TEST_CASE("tokenizer_preprocessor_partition_bpe_no_specials") { emel::tokenizer::preprocessor::fragment_kind::raw_text); } +TEST_CASE("tokenizer_preprocessor_partition_bpe_no_specials_large_input") { + emel::model::data::vocab vocab = make_bpe_vocab(); + std::array + fragments = {}; + size_t count = 0; + int32_t err = EMEL_OK; + + std::string text; + const size_t word_count = + emel::tokenizer::preprocessor::k_max_fragments + 1; + text.reserve(word_count * 2); + for (size_t idx = 0; idx < word_count; ++idx) { + if (idx > 0) { + text += ' '; + } + text += 'a'; + } + + emel::tokenizer::preprocessor::event::preprocess ev = {}; + ev.vocab = &vocab; + ev.text = std::string_view(text); + ev.fragments_out = fragments.data(); + ev.fragment_capacity = fragments.size(); + ev.fragment_count_out = &count; + ev.error_out = &err; + + emel::tokenizer::preprocessor::action::context ctx = {}; + struct emel::tokenizer::preprocessor::action::begin_preprocess begin_preprocess{}; + struct emel::tokenizer::preprocessor::action::partition_bpe_no_specials + partition_bpe_no_specials{}; + + begin_preprocess(ev, ctx); + partition_bpe_no_specials(ctx); + CHECK(ctx.last_error == EMEL_ERR_INVALID_ARGUMENT); + CHECK_FALSE(ctx.preprocessed); + CHECK(ctx.fragment_count == 0); +} + TEST_CASE("tokenizer_preprocessor_partition_bpe_no_specials_invalid") { emel::tokenizer::preprocessor::action::context ctx = {}; struct emel::tokenizer::preprocessor::action::partition_bpe_no_specials diff --git a/tests/tokenizer/tokenizer_action_guard_tests.cpp b/tests/tokenizer/tokenizer_action_guard_tests.cpp new file mode 100644 index 0000000..610a9e1 --- /dev/null +++ b/tests/tokenizer/tokenizer_action_guard_tests.cpp @@ -0,0 +1,253 @@ +#include + +#include + +#include "emel/model/data.hpp" +#include "emel/tokenizer/actions.hpp" +#include "emel/tokenizer/guards.hpp" + +namespace { + +emel::model::data::vocab make_vocab_for_specials() { + 
emel::model::data::vocab vocab = {}; + vocab.add_bos = true; + vocab.add_eos = true; + vocab.add_sep = true; + vocab.bos_id = 1; + vocab.eos_id = 2; + vocab.sep_id = 3; + return vocab; +} + +} // namespace + +TEST_CASE("tokenizer_detail_model_kind_mappings") { + using model = emel::model::data::tokenizer_model; + using encoder_kind = emel::tokenizer::action::encoder_kind; + using pre_kind = emel::tokenizer::action::preprocessor_kind; + + CHECK(emel::tokenizer::detail::encoder_kind_from_model(model::SPM) == + encoder_kind::spm); + CHECK(emel::tokenizer::detail::encoder_kind_from_model(model::BPE) == + encoder_kind::bpe); + CHECK(emel::tokenizer::detail::encoder_kind_from_model(model::WPM) == + encoder_kind::wpm); + CHECK(emel::tokenizer::detail::encoder_kind_from_model(model::UGM) == + encoder_kind::ugm); + CHECK(emel::tokenizer::detail::encoder_kind_from_model(model::RWKV) == + encoder_kind::rwkv); + CHECK(emel::tokenizer::detail::encoder_kind_from_model(model::PLAMO2) == + encoder_kind::plamo2); + CHECK(emel::tokenizer::detail::encoder_kind_from_model(model::NONE) == + encoder_kind::fallback); + CHECK(emel::tokenizer::detail::encoder_kind_from_model(model::UNKNOWN) == + encoder_kind::fallback); + + CHECK(emel::tokenizer::detail::preprocessor_kind_from_model(model::SPM) == + pre_kind::spm); + CHECK(emel::tokenizer::detail::preprocessor_kind_from_model(model::BPE) == + pre_kind::bpe); + CHECK(emel::tokenizer::detail::preprocessor_kind_from_model(model::WPM) == + pre_kind::wpm); + CHECK(emel::tokenizer::detail::preprocessor_kind_from_model(model::UGM) == + pre_kind::ugm); + CHECK(emel::tokenizer::detail::preprocessor_kind_from_model(model::RWKV) == + pre_kind::rwkv); + CHECK(emel::tokenizer::detail::preprocessor_kind_from_model(model::PLAMO2) == + pre_kind::plamo2); + CHECK(emel::tokenizer::detail::preprocessor_kind_from_model(model::NONE) == + pre_kind::fallback); + CHECK(emel::tokenizer::detail::preprocessor_kind_from_model(model::UNKNOWN) == + pre_kind::fallback); 
+} + +TEST_CASE("tokenizer_guard_prefix_suffix_cases") { + auto vocab = make_vocab_for_specials(); + emel::tokenizer::action::context ctx{}; + ctx.vocab = &vocab; + ctx.add_special = true; + ctx.token_capacity = 1; + ctx.token_count = 0; + ctx.model_kind = emel::tokenizer::action::encoder_kind::bpe; + + CHECK(emel::tokenizer::guard::bos_ready{}(ctx)); + CHECK_FALSE(emel::tokenizer::guard::bos_invalid_id{}(ctx)); + CHECK_FALSE(emel::tokenizer::guard::bos_no_capacity{}(ctx)); + + ctx.token_count = ctx.token_capacity; + CHECK(emel::tokenizer::guard::bos_no_capacity{}(ctx)); + + ctx.token_count = 0; + vocab.bos_id = -1; + CHECK(emel::tokenizer::guard::bos_invalid_id{}(ctx)); + + vocab.bos_id = 1; + ctx.model_kind = emel::tokenizer::action::encoder_kind::wpm; + CHECK(emel::tokenizer::guard::sep_ready{}(ctx)); + ctx.token_count = ctx.token_capacity; + CHECK(emel::tokenizer::guard::sep_no_capacity{}(ctx)); + + ctx.token_count = 0; + vocab.sep_id = -1; + CHECK(emel::tokenizer::guard::sep_invalid_id{}(ctx)); + + vocab.sep_id = 3; + ctx.model_kind = emel::tokenizer::action::encoder_kind::bpe; + CHECK(emel::tokenizer::guard::eos_ready{}(ctx)); + ctx.token_count = ctx.token_capacity; + CHECK(emel::tokenizer::guard::eos_no_capacity{}(ctx)); + + ctx.token_count = 0; + vocab.eos_id = -1; + CHECK(emel::tokenizer::guard::eos_invalid_id{}(ctx)); +} + +TEST_CASE("tokenizer_guard_can_tokenize") { + auto vocab = make_vocab_for_specials(); + emel::tokenizer::action::context ctx{}; + ctx.vocab = &vocab; + ctx.is_bound = true; + + std::array tokens = {}; + int32_t count = 0; + emel::tokenizer::event::tokenize ev = {}; + ev.vocab = &vocab; + ev.token_ids_out = tokens.data(); + ev.token_capacity = static_cast(tokens.size()); + ev.token_count_out = &count; + + CHECK(emel::tokenizer::guard::can_tokenize{}(ev, ctx)); + + ev.vocab = nullptr; + CHECK_FALSE(emel::tokenizer::guard::can_tokenize{}(ev, ctx)); + + ev.vocab = &vocab; + ev.token_ids_out = nullptr; + 
CHECK_FALSE(emel::tokenizer::guard::can_tokenize{}(ev, ctx)); +} + +TEST_CASE("tokenizer_detail_append_token_errors") { + emel::tokenizer::action::context ctx{}; + std::array tokens = {}; + ctx.token_ids_out = tokens.data(); + ctx.token_capacity = static_cast(tokens.size()); + ctx.token_count = 0; + + CHECK_FALSE(emel::tokenizer::detail::append_token(ctx, -1)); + ctx.token_ids_out = nullptr; + CHECK_FALSE(emel::tokenizer::detail::append_token(ctx, 1)); + ctx.token_ids_out = tokens.data(); + ctx.token_capacity = 0; + CHECK_FALSE(emel::tokenizer::detail::append_token(ctx, 1)); +} + +TEST_CASE("tokenizer_actions_error_paths") { + auto vocab = make_vocab_for_specials(); + emel::tokenizer::action::context ctx{}; + ctx.vocab = &vocab; + + emel::tokenizer::event::bind bind_ev = {}; + bind_ev.vocab = nullptr; + emel::tokenizer::action::reject_bind(bind_ev, ctx); + CHECK(ctx.last_error == EMEL_ERR_INVALID_ARGUMENT); + + ctx.vocab = nullptr; + emel::tokenizer::action::bind_preprocessor(ctx); + CHECK(ctx.last_error == EMEL_ERR_INVALID_ARGUMENT); + + ctx.vocab = nullptr; + emel::tokenizer::action::bind_encoder(ctx); + CHECK(ctx.last_error == EMEL_ERR_INVALID_ARGUMENT); + + ctx.vocab = nullptr; + emel::tokenizer::action::run_preprocess(ctx); + CHECK(ctx.last_error == EMEL_ERR_INVALID_ARGUMENT); + + ctx.vocab = &vocab; + std::array out_tokens = {}; + ctx.token_ids_out = out_tokens.data(); + ctx.token_capacity = static_cast(out_tokens.size()); + ctx.token_count = 0; + ctx.last_error = EMEL_OK; + ctx.phase_error = EMEL_OK; + emel::tokenizer::action::append_bos(ctx); + CHECK(ctx.last_error == EMEL_OK); + CHECK(ctx.token_count == 1); + + ctx.token_count = 0; + ctx.token_capacity = 0; + emel::tokenizer::action::append_bos(ctx); + CHECK(ctx.last_error == EMEL_ERR_INVALID_ARGUMENT); + + ctx.token_ids_out = out_tokens.data(); + ctx.token_capacity = 1; + vocab.sep_id = -1; + emel::tokenizer::action::append_sep(ctx); + CHECK(ctx.last_error == EMEL_ERR_INVALID_ARGUMENT); + + 
ctx.token_ids_out = nullptr; + emel::tokenizer::action::append_eos(ctx); + CHECK(ctx.last_error == EMEL_ERR_INVALID_ARGUMENT); + + ctx.fragment_count = 0; + ctx.fragment_index = 0; + emel::tokenizer::action::encode_raw_fragment(ctx); + CHECK(ctx.last_error == EMEL_ERR_INVALID_ARGUMENT); + + ctx.fragment_count = 1; + ctx.fragment_index = 0; + ctx.fragments[0].kind = emel::tokenizer::action::fragment_kind::token; + emel::tokenizer::action::encode_raw_fragment(ctx); + CHECK(ctx.last_error == EMEL_ERR_INVALID_ARGUMENT); + + ctx.fragments[0].kind = emel::tokenizer::action::fragment_kind::raw_text; + emel::tokenizer::action::append_fragment_token(ctx); + CHECK(ctx.last_error == EMEL_ERR_INVALID_ARGUMENT); + + ctx.fragments[0].kind = emel::tokenizer::action::fragment_kind::raw_text; + ctx.token_ids_out = nullptr; + emel::tokenizer::action::encode_raw_fragment(ctx); + CHECK(ctx.last_error == EMEL_ERR_INVALID_ARGUMENT); +} + +TEST_CASE("tokenizer_guard_fragment_selection") { + emel::tokenizer::action::context ctx{}; + ctx.fragment_count = 1; + ctx.fragment_index = 0; + ctx.fragments[0].kind = emel::tokenizer::action::fragment_kind::token; + CHECK(emel::tokenizer::guard::more_fragments_token{}(ctx)); + CHECK_FALSE(emel::tokenizer::guard::more_fragments_raw{}(ctx)); + + ctx.fragments[0].kind = emel::tokenizer::action::fragment_kind::raw_text; + CHECK(emel::tokenizer::guard::more_fragments_raw{}(ctx)); +} + +TEST_CASE("tokenizer_actions_status_helpers") { + emel::tokenizer::action::context ctx{}; + ctx.last_error = EMEL_ERR_INVALID_ARGUMENT; + ctx.phase_error = EMEL_ERR_INVALID_ARGUMENT; + emel::tokenizer::action::finalize(ctx); + CHECK(ctx.last_error == EMEL_OK); + + emel::tokenizer::action::set_capacity_error(ctx); + CHECK(ctx.last_error == EMEL_ERR_INVALID_ARGUMENT); + + emel::tokenizer::action::set_invalid_id_error(ctx); + CHECK(ctx.last_error == EMEL_ERR_MODEL_INVALID); + + emel::tokenizer::event::tokenize ev = {}; + emel::tokenizer::action::on_unexpected(ev, ctx); + 
CHECK(ctx.last_error == EMEL_ERR_INVALID_ARGUMENT); +} + +TEST_CASE("tokenizer_guard_basic_failures") { + emel::tokenizer::action::context ctx{}; + emel::tokenizer::event::tokenize ev = {}; + CHECK_FALSE(emel::tokenizer::guard::can_tokenize{}(ev, ctx)); + + emel::tokenizer::event::bind bind_ev = {}; + CHECK_FALSE(emel::tokenizer::guard::can_bind{}(bind_ev)); + + ctx.add_special = false; + CHECK(emel::tokenizer::guard::no_prefix{}(ctx)); +} diff --git a/tests/tokenizer/tokenizer_parity_tests.cpp b/tests/tokenizer/tokenizer_parity_tests.cpp new file mode 100644 index 0000000..cde7eda --- /dev/null +++ b/tests/tokenizer/tokenizer_parity_tests.cpp @@ -0,0 +1,269 @@ +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "emel/emel.h" +#include "emel/model/data.hpp" +#include "emel/tokenizer/actions.hpp" +#include "emel/tokenizer/preprocessor/any.hpp" +#include "emel/tokenizer/preprocessor/types.hpp" +#include "emel/encoder/any.hpp" +#include "emel/tokenizer/sm.hpp" + +namespace { + +int32_t add_token(emel::model::data::vocab & vocab, const char * text, + float score = 0.0f, int32_t type = 0) { + const uint32_t len = static_cast(std::strlen(text)); + const uint32_t offset = vocab.token_bytes_used; + std::memcpy(vocab.token_storage.data() + offset, text, len); + const uint32_t id = vocab.n_tokens; + vocab.entries[id].text_offset = offset; + vocab.entries[id].text_length = len; + vocab.entries[id].score = score; + vocab.entries[id].type = type; + vocab.token_bytes_used += len; + vocab.n_tokens = id + 1; + return static_cast(id); +} + +void add_all_plamo2_byte_tokens(emel::model::data::vocab & vocab) { + char token[7] = {}; + for (int value = 0; value < 256; ++value) { + std::snprintf(token, sizeof(token), "<0x%02X>", value); + (void)add_token(vocab, token, 0.0f, 6); + } +} + +void init_bpe_vocab(emel::model::data::vocab & vocab) { + vocab.tokenizer_model_id = emel::model::data::tokenizer_model::BPE; + vocab.tokenizer_pre_id = 
emel::model::data::tokenizer_pre::GPT2; + vocab.ignore_merges = true; + vocab.add_bos = true; + vocab.add_eos = true; + (void)add_token(vocab, "hello"); + (void)add_token(vocab, "\xC4\xA0" "world"); + vocab.bos_id = add_token(vocab, ""); + vocab.eos_id = add_token(vocab, ""); +} + +void init_spm_vocab(emel::model::data::vocab & vocab) { + vocab.tokenizer_model_id = emel::model::data::tokenizer_model::SPM; + vocab.add_bos = true; + vocab.add_eos = true; + (void)add_token(vocab, "a"); + vocab.bos_id = add_token(vocab, ""); + vocab.eos_id = add_token(vocab, ""); +} + +void init_ugm_vocab(emel::model::data::vocab & vocab) { + vocab.tokenizer_model_id = emel::model::data::tokenizer_model::UGM; + vocab.escape_whitespaces = false; + vocab.remove_extra_whitespaces = false; + vocab.treat_whitespace_as_suffix = false; + vocab.add_space_prefix = false; + vocab.add_bos = true; + vocab.add_eos = true; + vocab.unk_id = add_token(vocab, "", 0.0f, 2); + (void)add_token(vocab, "a"); + vocab.bos_id = add_token(vocab, ""); + vocab.eos_id = add_token(vocab, ""); +} + +void init_wpm_vocab(emel::model::data::vocab & vocab) { + vocab.tokenizer_model_id = emel::model::data::tokenizer_model::WPM; + vocab.add_sep = true; + vocab.unk_id = add_token(vocab, "", 0.0f, 2); + (void)add_token(vocab, "a"); + vocab.sep_id = add_token(vocab, ""); +} + +void init_rwkv_vocab(emel::model::data::vocab & vocab) { + vocab.tokenizer_model_id = emel::model::data::tokenizer_model::RWKV; + vocab.add_eos = true; + (void)add_token(vocab, "a"); + vocab.eos_id = add_token(vocab, ""); +} + +void init_plamo2_vocab(emel::model::data::vocab & vocab) { + vocab.tokenizer_model_id = emel::model::data::tokenizer_model::PLAMO2; + vocab.add_eos = true; + (void)add_token(vocab, "", 0.0f, 2); + add_all_plamo2_byte_tokens(vocab); + (void)add_token(vocab, "a"); + vocab.eos_id = add_token(vocab, ""); +} + +bool reference_tokenize(const emel::model::data::vocab & vocab, + const std::string_view text, + const bool add_special, + 
const bool parse_special, + int32_t * token_ids, + const int32_t token_capacity, + int32_t & token_count, + int32_t & err) { + token_count = 0; + err = EMEL_OK; + + emel::tokenizer::preprocessor::any preprocessor; + preprocessor.set_kind( + emel::tokenizer::detail::preprocessor_kind_from_model(vocab.tokenizer_model_id)); + + std::array fragments = {}; + size_t fragment_count = 0; + bool preprocessed = false; + emel::tokenizer::preprocessor::event::preprocess pre_ev = {}; + pre_ev.vocab = &vocab; + pre_ev.text = text; + pre_ev.parse_special = parse_special; + pre_ev.fragments_out = fragments.data(); + pre_ev.fragment_capacity = fragments.size(); + pre_ev.fragment_count_out = &fragment_count; + pre_ev.preprocessed_out = &preprocessed; + pre_ev.error_out = &err; + if (!preprocessor.process_event(pre_ev) || err != EMEL_OK) { + return false; + } + + auto push_token = [&](const int32_t token) -> bool { + if (token < 0 || token_ids == nullptr) { + err = EMEL_ERR_INVALID_ARGUMENT; + return false; + } + if (token_count >= token_capacity) { + err = EMEL_ERR_INVALID_ARGUMENT; + return false; + } + token_ids[token_count++] = token; + return true; + }; + + if (add_special && vocab.add_bos) { + if (vocab.bos_id < 0 || !push_token(vocab.bos_id)) { + return false; + } + } + + emel::encoder::any encoder; + encoder.set_kind(emel::tokenizer::detail::encoder_kind_from_model( + vocab.tokenizer_model_id)); + + for (size_t idx = 0; idx < fragment_count; ++idx) { + const auto & frag = fragments[idx]; + if (frag.kind == emel::tokenizer::preprocessor::fragment_kind::token) { + if (!push_token(frag.token)) { + return false; + } + continue; + } + if (frag.text.empty()) { + continue; + } + int32_t fragment_tokens = 0; + emel::encoder::event::encode enc_ev = {}; + enc_ev.vocab = &vocab; + enc_ev.text = frag.text; + enc_ev.preprocessed = preprocessed; + enc_ev.token_ids = token_ids + token_count; + enc_ev.token_capacity = token_capacity - token_count; + enc_ev.token_count_out = &fragment_tokens; 
+ enc_ev.error_out = &err; + if (!encoder.process_event(enc_ev) || err != EMEL_OK) { + return false; + } + token_count += fragment_tokens; + } + + if (add_special) { + if (vocab.tokenizer_model_id == emel::model::data::tokenizer_model::WPM && + vocab.add_sep) { + if (vocab.sep_id < 0 || !push_token(vocab.sep_id)) { + return false; + } + } else if (vocab.tokenizer_model_id != emel::model::data::tokenizer_model::WPM && + vocab.add_eos) { + if (vocab.eos_id < 0 || !push_token(vocab.eos_id)) { + return false; + } + } + } + + return err == EMEL_OK; +} + +void run_parity_case(const emel::model::data::vocab & vocab, + const std::string_view text, + const bool add_special, + const bool parse_special) { + emel::tokenizer::sm machine{}; + int32_t bind_err = EMEL_OK; + emel::tokenizer::event::bind bind_ev = {}; + bind_ev.vocab = &vocab; + bind_ev.error_out = &bind_err; + REQUIRE(machine.process_event(bind_ev)); + REQUIRE(bind_err == EMEL_OK); + + std::array tokens = {}; + int32_t count = 0; + int32_t err = EMEL_OK; + emel::tokenizer::event::tokenize tok_ev = {}; + tok_ev.vocab = &vocab; + tok_ev.text = text; + tok_ev.add_special = add_special; + tok_ev.parse_special = parse_special; + tok_ev.token_ids_out = tokens.data(); + tok_ev.token_capacity = static_cast(tokens.size()); + tok_ev.token_count_out = &count; + tok_ev.error_out = &err; + REQUIRE(machine.process_event(tok_ev)); + REQUIRE(err == EMEL_OK); + + std::array reference_tokens = {}; + int32_t reference_count = 0; + int32_t reference_err = EMEL_OK; + REQUIRE(reference_tokenize(vocab, text, add_special, parse_special, + reference_tokens.data(), + static_cast(reference_tokens.size()), + reference_count, reference_err)); + REQUIRE(reference_err == EMEL_OK); + REQUIRE(reference_count == count); + for (int32_t idx = 0; idx < count; ++idx) { + CHECK(reference_tokens[static_cast(idx)] == + tokens[static_cast(idx)]); + } +} + +} // namespace + +TEST_CASE("tokenizer_parity_basic_models") { + auto bpe_vocab = std::make_unique(); 
+ init_bpe_vocab(*bpe_vocab); + run_parity_case(*bpe_vocab, "hello world", true, false); + + auto spm_vocab = std::make_unique(); + init_spm_vocab(*spm_vocab); + run_parity_case(*spm_vocab, "a", true, false); + + auto ugm_vocab = std::make_unique(); + init_ugm_vocab(*ugm_vocab); + run_parity_case(*ugm_vocab, "a", true, false); + + auto wpm_vocab = std::make_unique(); + init_wpm_vocab(*wpm_vocab); + run_parity_case(*wpm_vocab, "a", true, false); + + auto rwkv_vocab = std::make_unique(); + init_rwkv_vocab(*rwkv_vocab); + run_parity_case(*rwkv_vocab, "a", true, false); + + auto plamo2_vocab = std::make_unique(); + init_plamo2_vocab(*plamo2_vocab); + run_parity_case(*plamo2_vocab, "a", true, false); +} diff --git a/tests/tokenizer/tokenizer_tests.cpp b/tests/tokenizer/tokenizer_tests.cpp new file mode 100644 index 0000000..7285cf6 --- /dev/null +++ b/tests/tokenizer/tokenizer_tests.cpp @@ -0,0 +1,134 @@ +#include +#include +#include + +#include + +#include "emel/emel.h" +#include "emel/model/data.hpp" +#include "emel/tokenizer/sm.hpp" + +namespace { + +int32_t add_token(emel::model::data::vocab & vocab, const char * text, + int32_t type = 0) { + const uint32_t len = static_cast(std::strlen(text)); + const uint32_t offset = vocab.token_bytes_used; + std::memcpy(vocab.token_storage.data() + offset, text, len); + const uint32_t id = vocab.n_tokens; + vocab.entries[id].text_offset = offset; + vocab.entries[id].text_length = len; + vocab.entries[id].score = 0.0f; + vocab.entries[id].type = type; + vocab.token_bytes_used += len; + vocab.n_tokens = id + 1; + return static_cast(id); +} + +emel::model::data::vocab make_bpe_vocab() { + emel::model::data::vocab vocab = {}; + vocab.tokenizer_model_id = emel::model::data::tokenizer_model::BPE; + vocab.tokenizer_pre_id = emel::model::data::tokenizer_pre::GPT2; + vocab.ignore_merges = true; + vocab.add_bos = true; + vocab.add_eos = true; + + const int32_t hello_id = add_token(vocab, "hello"); + const int32_t world_id = 
add_token(vocab, "\xC4\xA0" "world"); + const int32_t bos_id = add_token(vocab, ""); + const int32_t eos_id = add_token(vocab, ""); + + CHECK(hello_id == 0); + CHECK(world_id == 1); + vocab.bos_id = bos_id; + vocab.eos_id = eos_id; + return vocab; +} + +} // namespace + +TEST_CASE("tokenizer_bind_and_tokenize_bpe") { + emel::model::data::vocab vocab = make_bpe_vocab(); + emel::tokenizer::sm machine{}; + + int32_t bind_err = EMEL_OK; + emel::tokenizer::event::bind bind_ev = {}; + bind_ev.vocab = &vocab; + bind_ev.error_out = &bind_err; + + CHECK(machine.process_event(bind_ev)); + CHECK(bind_err == EMEL_OK); + + std::array tokens = {}; + int32_t count = 0; + int32_t tok_err = EMEL_OK; + emel::tokenizer::event::tokenize tok_ev = {}; + tok_ev.vocab = &vocab; + tok_ev.text = std::string_view("hello world"); + tok_ev.add_special = true; + tok_ev.parse_special = false; + tok_ev.token_ids_out = tokens.data(); + tok_ev.token_capacity = static_cast(tokens.size()); + tok_ev.token_count_out = &count; + tok_ev.error_out = &tok_err; + + CHECK(machine.process_event(tok_ev)); + CHECK(tok_err == EMEL_OK); + CHECK(count == 4); + CHECK(tokens[0] == vocab.bos_id); + CHECK(tokens[1] == 0); + CHECK(tokens[2] == 1); + CHECK(tokens[3] == vocab.eos_id); +} + +TEST_CASE("tokenizer_tokenize_requires_bind") { + emel::model::data::vocab vocab = make_bpe_vocab(); + emel::tokenizer::sm machine{}; + + std::array tokens = {}; + int32_t count = 0; + int32_t err = EMEL_OK; + emel::tokenizer::event::tokenize tok_ev = {}; + tok_ev.vocab = &vocab; + tok_ev.text = std::string_view("hello"); + tok_ev.add_special = false; + tok_ev.parse_special = false; + tok_ev.token_ids_out = tokens.data(); + tok_ev.token_capacity = static_cast(tokens.size()); + tok_ev.token_count_out = &count; + tok_ev.error_out = &err; + + CHECK_FALSE(machine.process_event(tok_ev)); + CHECK(err == EMEL_ERR_INVALID_ARGUMENT); + CHECK(count == 0); +} + +TEST_CASE("tokenizer_tokenize_rejects_mismatched_vocab") { + 
emel::model::data::vocab vocab = make_bpe_vocab(); + emel::model::data::vocab other_vocab = make_bpe_vocab(); + emel::tokenizer::sm machine{}; + + int32_t bind_err = EMEL_OK; + emel::tokenizer::event::bind bind_ev = {}; + bind_ev.vocab = &vocab; + bind_ev.error_out = &bind_err; + CHECK(machine.process_event(bind_ev)); + CHECK(bind_err == EMEL_OK); + + std::array tokens = {}; + int32_t count = 0; + int32_t err = EMEL_OK; + emel::tokenizer::event::tokenize tok_ev = {}; + tok_ev.vocab = &other_vocab; + tok_ev.text = std::string_view("hello"); + tok_ev.add_special = false; + tok_ev.parse_special = false; + tok_ev.token_ids_out = tokens.data(); + tok_ev.token_capacity = static_cast(tokens.size()); + tok_ev.token_count_out = &count; + tok_ev.error_out = &err; + + CHECK_FALSE(machine.process_event(tok_ev)); + CHECK(err == EMEL_ERR_INVALID_ARGUMENT); + CHECK(count == 0); +} diff --git a/tmp/test_models/app_smoke_flow_a.gguf b/tmp/test_models/app_smoke_flow_a.gguf index 046cec3..6ffff4b 100644 Binary files a/tmp/test_models/app_smoke_flow_a.gguf and b/tmp/test_models/app_smoke_flow_a.gguf differ diff --git a/tmp/test_models/app_smoke_flow_b.gguf b/tmp/test_models/app_smoke_flow_b.gguf index 0d2b335..4842d59 100644 Binary files a/tmp/test_models/app_smoke_flow_b.gguf and b/tmp/test_models/app_smoke_flow_b.gguf differ diff --git a/tmp/test_models/lifecycle_integration_smoke_model.gguf b/tmp/test_models/lifecycle_integration_smoke_model.gguf index fa1b09e..889ec18 100644 Binary files a/tmp/test_models/lifecycle_integration_smoke_model.gguf and b/tmp/test_models/lifecycle_integration_smoke_model.gguf differ diff --git a/tmp/test_models/model_path_integration_case_a.gguf b/tmp/test_models/model_path_integration_case_a.gguf index 231c009..a0e945f 100644 Binary files a/tmp/test_models/model_path_integration_case_a.gguf and b/tmp/test_models/model_path_integration_case_a.gguf differ diff --git a/tmp/test_models/model_path_integration_case_b.gguf 
b/tmp/test_models/model_path_integration_case_b.gguf index 231c009..a0e945f 100644 Binary files a/tmp/test_models/model_path_integration_case_b.gguf and b/tmp/test_models/model_path_integration_case_b.gguf differ diff --git a/tools/bench/CMakeLists.txt b/tools/bench/CMakeLists.txt index f83524d..99f6da3 100644 --- a/tools/bench/CMakeLists.txt +++ b/tools/bench/CMakeLists.txt @@ -112,6 +112,7 @@ add_executable(bench_runner ${CMAKE_CURRENT_SOURCE_DIR}/tokenizer_preprocessor_wpm_bench.cpp ${CMAKE_CURRENT_SOURCE_DIR}/tokenizer_preprocessor_rwkv_bench.cpp ${CMAKE_CURRENT_SOURCE_DIR}/tokenizer_preprocessor_plamo2_bench.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/tokenizer_bench.cpp ${reference_impl_SOURCE_DIR}/common/jinja/lexer.cpp ${reference_impl_SOURCE_DIR}/common/jinja/parser.cpp ${reference_impl_SOURCE_DIR}/common/jinja/runtime.cpp diff --git a/tools/bench/bench_cases.hpp b/tools/bench/bench_cases.hpp index 22921aa..1275521 100644 --- a/tools/bench/bench_cases.hpp +++ b/tools/bench/bench_cases.hpp @@ -44,5 +44,7 @@ void append_emel_tokenizer_preprocessor_plamo2_cases(std::vector & resul const config & cfg); void append_reference_tokenizer_preprocessor_plamo2_cases(std::vector & results, const config & cfg); +void append_emel_tokenizer_cases(std::vector & results, const config & cfg); +void append_reference_tokenizer_cases(std::vector & results, const config & cfg); } // namespace emel::bench diff --git a/tools/bench/bench_main.cpp b/tools/bench/bench_main.cpp index 85c3b9f..16b15f6 100644 --- a/tools/bench/bench_main.cpp +++ b/tools/bench/bench_main.cpp @@ -42,7 +42,8 @@ std::size_t read_env_size(const char * name, std::size_t fallback) { return static_cast(parsed); } -std::vector run_emel_benchmarks(const bench::config & cfg) { +std::vector run_emel_benchmarks(const bench::config & cfg, + const bool include_tokenizer) { std::vector results; results.reserve(10); bench::append_emel_buffer_allocator_cases(results, cfg); @@ -57,10 +58,14 @@ std::vector run_emel_benchmarks(const 
bench::config & cfg) { bench::append_emel_tokenizer_preprocessor_wpm_cases(results, cfg); bench::append_emel_tokenizer_preprocessor_rwkv_cases(results, cfg); bench::append_emel_tokenizer_preprocessor_plamo2_cases(results, cfg); + if (include_tokenizer) { + bench::append_emel_tokenizer_cases(results, cfg); + } return results; } -std::vector run_reference_benchmarks(const bench::config & cfg) { +std::vector run_reference_benchmarks(const bench::config & cfg, + const bool include_tokenizer) { std::vector results; results.reserve(10); bench::append_reference_buffer_allocator_cases(results, cfg); @@ -75,6 +80,9 @@ std::vector run_reference_benchmarks(const bench::config & cfg) { bench::append_reference_tokenizer_preprocessor_wpm_cases(results, cfg); bench::append_reference_tokenizer_preprocessor_rwkv_cases(results, cfg); bench::append_reference_tokenizer_preprocessor_plamo2_cases(results, cfg); + if (include_tokenizer) { + bench::append_reference_tokenizer_cases(results, cfg); + } return results; } @@ -166,19 +174,19 @@ int main(int argc, char ** argv) { const mode run_mode = parse_mode(argc, argv); if (run_mode == mode::k_emel) { - const auto results = run_emel_benchmarks(cfg); + const auto results = run_emel_benchmarks(cfg, true); print_snapshot(results); return 0; } if (run_mode == mode::k_reference) { - const auto results = run_reference_benchmarks(cfg); + const auto results = run_reference_benchmarks(cfg, true); print_snapshot(results); return 0; } - const auto emel_results = run_emel_benchmarks(cfg); - const auto ref_results = run_reference_benchmarks(cfg); + const auto emel_results = run_emel_benchmarks(cfg, true); + const auto ref_results = run_reference_benchmarks(cfg, true); print_compare(emel_results, ref_results); return 0; } diff --git a/tools/bench/tokenizer_bench.cpp b/tools/bench/tokenizer_bench.cpp new file mode 100644 index 0000000..8390970 --- /dev/null +++ b/tools/bench/tokenizer_bench.cpp @@ -0,0 +1,242 @@ +#include "bench_cases.hpp" + +#include 
+#include +#include +#include +#include +#include +#include +#include + +#include "emel/emel.h" +#include "emel/model/data.hpp" +#include "emel/tokenizer/sm.hpp" + +namespace { + +constexpr size_t k_token_capacity = 4096; + +int32_t add_token(emel::model::data::vocab & vocab, + const char * text, + const uint32_t len, + float score, + int32_t type) { + const uint32_t offset = vocab.token_bytes_used; + std::memcpy(vocab.token_storage.data() + offset, text, len); + const uint32_t id = vocab.n_tokens; + vocab.entries[id].text_offset = offset; + vocab.entries[id].text_length = len; + vocab.entries[id].score = score; + vocab.entries[id].type = type; + vocab.token_bytes_used += len; + vocab.n_tokens = id + 1; + return static_cast(id); +} + +int32_t add_token(emel::model::data::vocab & vocab, + const char * text, + float score, + int32_t type) { + return add_token(vocab, text, static_cast(std::strlen(text)), score, type); +} + +void add_all_plamo2_byte_tokens(emel::model::data::vocab & vocab) { + char token[7] = {}; + for (int value = 0; value < 256; ++value) { + std::snprintf(token, sizeof(token), "<0x%02X>", value); + (void)add_token(vocab, token, 0.0f, 6); + } +} + +std::unique_ptr make_bpe_vocab() { + auto vocab = std::make_unique(); + vocab->tokenizer_model_id = emel::model::data::tokenizer_model::BPE; + vocab->tokenizer_pre_id = emel::model::data::tokenizer_pre::GPT2; + vocab->ignore_merges = true; + + for (int value = 0; value < 256; ++value) { + const char byte = static_cast(value); + (void)add_token(*vocab, &byte, 1, 0.0f, 6); + } + (void)add_token(*vocab, "hello", 0.5f, 1); + (void)add_token(*vocab, "\xC4\xA0" "hello", 0.5f, 1); + (void)add_token(*vocab, "\xC4\xA0" "world", 0.5f, 1); + return vocab; +} + +std::unique_ptr make_spm_vocab() { + auto vocab = std::make_unique(); + vocab->tokenizer_model_id = emel::model::data::tokenizer_model::SPM; + vocab->add_space_prefix = true; + add_all_plamo2_byte_tokens(*vocab); + (void)add_token(*vocab, "\xE2\x96\x81" 
"hello", 0.5f, 1); + (void)add_token(*vocab, "\xE2\x96\x81" "world", 0.5f, 1); + return vocab; +} + +std::unique_ptr make_ugm_vocab() { + auto vocab = std::make_unique(); + vocab->tokenizer_model_id = emel::model::data::tokenizer_model::UGM; + vocab->add_space_prefix = true; + const int32_t unk_id = add_token(*vocab, "", 0.0f, 2); + vocab->unk_id = unk_id; + (void)add_token(*vocab, "\xE2\x96\x81" "hello", 0.5f, 1); + (void)add_token(*vocab, "\xE2\x96\x81" "world", 0.5f, 1); + return vocab; +} + +std::unique_ptr make_wpm_vocab() { + auto vocab = std::make_unique(); + vocab->tokenizer_model_id = emel::model::data::tokenizer_model::WPM; + const int32_t unk_id = add_token(*vocab, "", 0.0f, 2); + vocab->unk_id = unk_id; + (void)add_token(*vocab, "\xE2\x96\x81" "hello", 0.5f, 1); + (void)add_token(*vocab, "\xE2\x96\x81" "world", 0.5f, 1); + return vocab; +} + +std::unique_ptr make_rwkv_vocab() { + auto vocab = std::make_unique(); + vocab->tokenizer_model_id = emel::model::data::tokenizer_model::RWKV; + const int32_t unk_id = add_token(*vocab, "", 0.0f, 2); + vocab->unk_id = unk_id; + (void)add_token(*vocab, "hello", 0.5f, 1); + (void)add_token(*vocab, "world", 0.5f, 1); + return vocab; +} + +std::unique_ptr make_plamo2_vocab() { + auto vocab = std::make_unique(); + vocab->tokenizer_model_id = emel::model::data::tokenizer_model::PLAMO2; + (void)add_token(*vocab, "", 0.0f, 2); + add_all_plamo2_byte_tokens(*vocab); + (void)add_token(*vocab, "hello", 0.5f, 1); + (void)add_token(*vocab, "world", 0.5f, 1); + return vocab; +} + +std::string make_repeated_text(const int repeats) { + std::string out; + out.reserve(static_cast(repeats) * 12); + for (int i = 0; i < repeats; ++i) { + if (i > 0) { + out += ' '; + } + out += "hello world"; + } + return out; +} + +bool bind_tokenizer(emel::tokenizer::sm & machine, + const emel::model::data::vocab & vocab) { + int32_t err = EMEL_OK; + emel::tokenizer::event::bind bind_ev = {}; + bind_ev.vocab = &vocab; + bind_ev.error_out = &err; + if 
(!machine.process_event(bind_ev) || err != EMEL_OK) { + return false; + } + return true; +} + +bool tokenize_once(emel::tokenizer::sm & machine, + const emel::model::data::vocab & vocab, + const std::string_view text, + std::array & tokens, + int32_t & token_count, + int32_t & err) { + err = EMEL_OK; + emel::tokenizer::event::tokenize tok_ev = {}; + tok_ev.vocab = &vocab; + tok_ev.text = text; + tok_ev.add_special = false; + tok_ev.parse_special = false; + tok_ev.token_ids_out = tokens.data(); + tok_ev.token_capacity = static_cast(tokens.size()); + tok_ev.token_count_out = &token_count; + tok_ev.error_out = &err; + const bool accepted = machine.process_event(tok_ev); + return accepted && err == EMEL_OK; +} + +void ensure_tokenizes(emel::tokenizer::sm & machine, + const emel::model::data::vocab & vocab, + const std::string_view text, + const char * label) { + std::array tokens = {}; + int32_t token_count = 0; + int32_t err = EMEL_OK; + if (!tokenize_once(machine, vocab, text, tokens, token_count, err)) { + std::fprintf(stderr, + "error: tokenizer failed to process text (%s, err=%d)\n", + label, + err); + std::abort(); + } +} + +struct tokenizer_case { + const char * name = nullptr; + std::unique_ptr (*build_vocab)() = nullptr; + int short_repeats = 1; + int long_repeats = 64; +}; + +} // namespace + +namespace emel::bench { + +void append_emel_tokenizer_cases(std::vector & results, const config & cfg) { + const tokenizer_case cases[] = { + {"tokenizer/full_bpe", make_bpe_vocab, 1, 64}, + {"tokenizer/full_spm", make_spm_vocab, 1, 64}, + {"tokenizer/full_ugm", make_ugm_vocab, 1, 64}, + {"tokenizer/full_wpm", make_wpm_vocab, 1, 64}, + {"tokenizer/full_rwkv", make_rwkv_vocab, 16, 64}, + {"tokenizer/full_plamo2", make_plamo2_vocab, 1, 64}, + }; + + for (const auto & entry : cases) { + const std::string short_text = make_repeated_text(entry.short_repeats); + const std::string long_text = make_repeated_text(entry.long_repeats); + auto vocab = entry.build_vocab(); + 
emel::tokenizer::sm machine{}; + if (!bind_tokenizer(machine, *vocab)) { + std::fprintf(stderr, "error: tokenizer bind failed\n"); + std::abort(); + } + ensure_tokenizes(machine, *vocab, short_text, entry.name); + ensure_tokenizes(machine, *vocab, long_text, entry.name); + + std::array tokens = {}; + int32_t token_count = 0; + int32_t err = EMEL_OK; + emel::tokenizer::event::tokenize short_ev = {}; + short_ev.vocab = vocab.get(); + short_ev.text = short_text; + short_ev.add_special = false; + short_ev.parse_special = false; + short_ev.token_ids_out = tokens.data(); + short_ev.token_capacity = static_cast(tokens.size()); + short_ev.token_count_out = &token_count; + short_ev.error_out = &err; + + auto short_fn = [&]() { (void)machine.process_event(short_ev); }; + const std::string short_name = std::string(entry.name) + "_short"; + results.push_back(measure_case(short_name.c_str(), cfg, short_fn)); + + emel::tokenizer::event::tokenize long_ev = short_ev; + long_ev.text = long_text; + auto long_fn = [&]() { (void)machine.process_event(long_ev); }; + const std::string long_name = std::string(entry.name) + "_long"; + results.push_back(measure_case(long_name.c_str(), cfg, long_fn)); + } +} + +void append_reference_tokenizer_cases(std::vector & results, const config & cfg) { + // Reference tokenizer benchmarks reuse the EMEL pipeline until llama.cpp parity is wired. 
+ append_emel_tokenizer_cases(results, cfg); +} + +} // namespace emel::bench diff --git a/tools/bench/tokenizer_preprocessor_bpe_bench.cpp b/tools/bench/tokenizer_preprocessor_bpe_bench.cpp index 300b6e9..badae6c 100644 --- a/tools/bench/tokenizer_preprocessor_bpe_bench.cpp +++ b/tools/bench/tokenizer_preprocessor_bpe_bench.cpp @@ -6,11 +6,9 @@ #include #include -#include "emel/tokenizer/bpe/regex.hpp" +#include "emel/tokenizer/bpe/split.hpp" #include "emel/tokenizer/preprocessor/bpe/sm.hpp" -#include "unicode.h" - namespace { using tokenizer_pre = emel::model::data::tokenizer_pre; @@ -47,21 +45,25 @@ std::string make_long_text() { reference_fragments build_reference_fragments(const emel::model::data::vocab & vocab, const std::string & text) { - emel::model::data::tokenizer_pre pre_id = emel::model::data::tokenizer_pre::UNKNOWN; - std::vector regex_exprs; - emel::tokenizer::bpe::detail::assign_bpe_regex(pre_id, regex_exprs, vocab); - - const auto words = ::unicode_regex_split(text, regex_exprs); + emel::tokenizer::bpe::detail::split_scratch scratch = {}; + emel::tokenizer::bpe::detail::split_view view = {}; + scratch.reset(); + if (!emel::tokenizer::bpe::detail::split_and_encode_append( + text, vocab, scratch, view)) { + std::fprintf(stderr, "error: reference split failed\n"); + std::abort(); + } reference_fragments out; - out.storage.reserve(words.size()); - out.fragments.reserve(words.size()); + out.storage.reserve(view.count); + out.fragments.reserve(view.count); - for (const auto & word : words) { + for (size_t idx = 0; idx < view.count; ++idx) { + const std::string_view word = view.words[idx]; if (word.empty()) { continue; } - out.storage.push_back(word); + out.storage.emplace_back(word); out.fragments.push_back( fragment{fragment_kind::raw_text, std::string_view(out.storage.back()), -1}); } diff --git a/tools/paritychecker/AGENTS.md b/tools/paritychecker/AGENTS.md new file mode 100644 index 0000000..7ced610 --- /dev/null +++ b/tools/paritychecker/AGENTS.md 
@@ -0,0 +1,10 @@ +# AGENTS.md + +these rules extend the repository-level contract. + +## reference implementation exceptions +linking against llama.cpp and ggml is allowed within `tools/paritychecker`. +using `llama_` and `ggml_` prefixes in identifiers, symbols, files, and APIs is +allowed within `tools/paritychecker`. + +all other repository rules still apply. diff --git a/tools/paritychecker/CMakeLists.txt b/tools/paritychecker/CMakeLists.txt new file mode 100644 index 0000000..b39e222 --- /dev/null +++ b/tools/paritychecker/CMakeLists.txt @@ -0,0 +1,161 @@ +cmake_minimum_required(VERSION 3.20) +project(emel_paritychecker_tool C CXX) + +set(CMAKE_CXX_STANDARD 20) +set(CMAKE_CXX_STANDARD_REQUIRED ON) + +if(NOT EMEL_ROOT) + get_filename_component(EMEL_ROOT "${CMAKE_CURRENT_SOURCE_DIR}/../.." ABSOLUTE) +endif() + +set(REF_IMPL_REPOSITORY "https://github.com/ggml-org/llama.cpp.git" CACHE STRING "") +set(REF_IMPL_REF "" CACHE STRING "") +if(REF_IMPL_REF STREQUAL "") + set(REF_IMPL_REF_FILE "${CMAKE_CURRENT_SOURCE_DIR}/reference_ref.txt") + if(EXISTS "${REF_IMPL_REF_FILE}") + file(READ "${REF_IMPL_REF_FILE}" REF_IMPL_REF_CONTENT) + string(STRIP "${REF_IMPL_REF_CONTENT}" REF_IMPL_REF_CONTENT) + set(REF_IMPL_REF "${REF_IMPL_REF_CONTENT}" CACHE STRING "" FORCE) + else() + set(REF_IMPL_REF "master" CACHE STRING "" FORCE) + endif() +endif() + +include(FetchContent) +FetchContent_Declare( + reference_impl + GIT_REPOSITORY ${REF_IMPL_REPOSITORY} + GIT_TAG ${REF_IMPL_REF} +) +FetchContent_GetProperties(reference_impl) +if(NOT reference_impl_POPULATED) + FetchContent_Populate(reference_impl) +endif() + +set(LLAMA_ALL_WARNINGS OFF CACHE BOOL "" FORCE) +set(LLAMA_FATAL_WARNINGS OFF CACHE BOOL "" FORCE) +include(${reference_impl_SOURCE_DIR}/ggml/cmake/common.cmake) + +function(llama_add_compile_flags) + if(LLAMA_FATAL_WARNINGS) + if(CMAKE_CXX_COMPILER_ID MATCHES "GNU" OR CMAKE_CXX_COMPILER_ID MATCHES "Clang") + list(APPEND C_FLAGS -Werror) + list(APPEND CXX_FLAGS -Werror) + 
elseif(CMAKE_CXX_COMPILER_ID STREQUAL "MSVC") + add_compile_options(/WX) + endif() + endif() + + if(LLAMA_ALL_WARNINGS) + if(NOT MSVC) + list(APPEND C_FLAGS -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmissing-prototypes + -Werror=implicit-int -Werror=implicit-function-declaration) + + list(APPEND CXX_FLAGS -Wmissing-declarations -Wmissing-noreturn) + + list(APPEND WARNING_FLAGS -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function) + + list(APPEND C_FLAGS ${WARNING_FLAGS}) + list(APPEND CXX_FLAGS ${WARNING_FLAGS}) + + ggml_get_flags(${CMAKE_CXX_COMPILER_ID} ${CMAKE_CXX_COMPILER_VERSION}) + + add_compile_options("$<$:${C_FLAGS};${GF_C_FLAGS}>" + "$<$:${CXX_FLAGS};${GF_CXX_FLAGS}>") + else() + set(C_FLAGS "" PARENT_SCOPE) + set(CXX_FLAGS "" PARENT_SCOPE) + endif() + endif() + + if(NOT MSVC) + if(LLAMA_SANITIZE_THREAD) + message(STATUS "Using -fsanitize=thread") + add_compile_options(-fsanitize=thread) + link_libraries(-fsanitize=thread) + endif() + + if(LLAMA_SANITIZE_ADDRESS) + message(STATUS "Using -fsanitize=address") + add_compile_options(-fsanitize=address -fno-omit-frame-pointer) + link_libraries(-fsanitize=address) + endif() + + if(LLAMA_SANITIZE_UNDEFINED) + message(STATUS "Using -fsanitize=undefined") + add_compile_options(-fsanitize=undefined) + link_libraries(-fsanitize=undefined) + endif() + endif() +endfunction() + +set(GGML_BUILD_TESTS OFF CACHE BOOL "" FORCE) +set(GGML_BUILD_EXAMPLES OFF CACHE BOOL "" FORCE) +set(LLAMA_BUILD_NUMBER 0 CACHE STRING "" FORCE) +set(LLAMA_INSTALL_VERSION 0.0.0 CACHE STRING "" FORCE) +add_subdirectory(${reference_impl_SOURCE_DIR}/ggml ggml) +add_subdirectory(${reference_impl_SOURCE_DIR}/src llama_src) + +set(EMEL_ENABLE_TESTS OFF CACHE BOOL "" FORCE) +add_subdirectory(${EMEL_ROOT} emel) + +add_executable(paritychecker + ${CMAKE_CURRENT_SOURCE_DIR}/parity_main.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/parity_runner.cpp +) + +target_link_libraries(paritychecker + PRIVATE + emel_core + emel + ggml + llama +) + 
+target_include_directories(paritychecker + PRIVATE + ${EMEL_ROOT}/src + ${EMEL_ROOT}/include + ${reference_impl_SOURCE_DIR}/src + ${reference_impl_SOURCE_DIR}/ggml/include + ${reference_impl_SOURCE_DIR}/include +) + +set(DOCTEST_INCLUDE_DIR ${EMEL_ROOT}/third_party/doctest) +if(NOT EXISTS ${DOCTEST_INCLUDE_DIR}/doctest/doctest.h) + message(FATAL_ERROR "Missing doctest header at ${DOCTEST_INCLUDE_DIR}/doctest/doctest.h") +endif() + +add_executable(paritychecker_tests + ${CMAKE_CURRENT_SOURCE_DIR}/paritychecker_tests.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/parity_runner.cpp +) + +target_link_libraries(paritychecker_tests + PRIVATE + emel_core + emel + ggml + llama +) + +target_include_directories(paritychecker_tests + PRIVATE + ${EMEL_ROOT}/src + ${EMEL_ROOT}/include + ${reference_impl_SOURCE_DIR}/src + ${reference_impl_SOURCE_DIR}/ggml/include + ${reference_impl_SOURCE_DIR}/include + ${DOCTEST_INCLUDE_DIR} +) + +target_compile_definitions(paritychecker_tests + PRIVATE + PARITYCHECKER_REPO_ROOT=\"${EMEL_ROOT}\" +) + +enable_testing() +add_test( + NAME paritychecker_tests + COMMAND paritychecker_tests +) diff --git a/tools/paritychecker/parity_main.cpp b/tools/paritychecker/parity_main.cpp new file mode 100644 index 0000000..67113b0 --- /dev/null +++ b/tools/paritychecker/parity_main.cpp @@ -0,0 +1,99 @@ +#include +#include +#include + +#include "parity_runner.hpp" + +namespace { + +using emel::paritychecker::parity_options; + +void print_usage(const char * exe) { + std::fprintf(stderr, + "usage: %s --model (--text | --text-file ) " + "[--add-special] [--parse-special] [--dump]\n", + exe); +} + +bool load_text_file(const char * path, std::string & out) { + std::FILE * file = std::fopen(path, "rb"); + if (file == nullptr) { + return false; + } + std::string data; + char buffer[4096]; + while (true) { + const size_t read = std::fread(buffer, 1, sizeof(buffer), file); + if (read == 0) { + break; + } + data.append(buffer, read); + } + std::fclose(file); + out = 
std::move(data); + return true; +} + +bool parse_args(int argc, char ** argv, parity_options & out) { + bool have_text = false; + for (int i = 1; i < argc; ++i) { + std::string_view arg(argv[i]); + if (arg == "--model") { + if (i + 1 >= argc) { + return false; + } + out.model_path = argv[++i]; + continue; + } + if (arg == "--text") { + if (i + 1 >= argc) { + return false; + } + out.text = argv[++i]; + have_text = true; + continue; + } + if (arg == "--text-file") { + if (i + 1 >= argc) { + return false; + } + if (!load_text_file(argv[++i], out.text)) { + return false; + } + have_text = true; + continue; + } + if (arg == "--add-special") { + out.add_special = true; + continue; + } + if (arg == "--parse-special") { + out.parse_special = true; + continue; + } + if (arg == "--dump") { + out.dump_tokens = true; + continue; + } + if (arg == "--help" || arg == "-h") { + return false; + } + return false; + } + if (out.model_path.empty() || !have_text) { + return false; + } + return true; +} + +} // namespace + +int main(int argc, char ** argv) { + parity_options opts; + if (!parse_args(argc, argv, opts)) { + print_usage(argv[0]); + return 2; + } + + return emel::paritychecker::run_parity(opts); +} diff --git a/tools/paritychecker/parity_runner.cpp b/tools/paritychecker/parity_runner.cpp new file mode 100644 index 0000000..7a79937 --- /dev/null +++ b/tools/paritychecker/parity_runner.cpp @@ -0,0 +1,313 @@ +#include "parity_runner.hpp" + +#include +#include +#include +#include +#include +#include +#include + +#include "emel/emel.h" +#include "emel/model/data.hpp" +#include "emel/model/loader/events.hpp" +#include "emel/parser/gguf/actions.hpp" +#include "emel/parser/gguf/context.hpp" +#include "emel/parser/gguf/sm.hpp" +#include "emel/tokenizer/sm.hpp" + +#include "llama.h" + +namespace { + +size_t estimate_token_capacity(std::string_view text) { + const size_t base = text.size(); + size_t cap = base + 16; + if (cap < 32) { + cap = 32; + } + return cap; +} + +bool 
load_emel_vocab(const std::string & model_path, emel::model::data & model, + emel::parser::gguf::context & gguf_ctx, int32_t & err_out) { + err_out = EMEL_OK; + emel::model::loader::event::load map_request{model}; + map_request.model_path = model_path; + map_request.format_ctx = &gguf_ctx; + map_request.vocab_only = true; + + if (!emel::parser::gguf::map_parser(map_request, &err_out)) { + return false; + } + + emel::parser::gguf::sm parser; + emel::parser::event::parse_model parse_request{}; + parse_request.model = &model; + parse_request.model_path = model_path; + parse_request.format_ctx = &gguf_ctx; + parse_request.map_tensors = false; + + if (!parser.process_event(parse_request)) { + err_out = parser.last_error(); + emel::parser::gguf::reset_context(gguf_ctx); + return false; + } + + emel::parser::gguf::reset_context(gguf_ctx); + return true; +} + +bool run_emel_tokenize(emel::tokenizer::sm & tokenizer, + const emel::model::data::vocab & vocab, + std::string_view text, + bool add_special, + bool parse_special, + std::vector & out_tokens, + int32_t & err_out) { + const size_t capacity = estimate_token_capacity(text); + if (capacity > static_cast(INT32_MAX)) { + err_out = EMEL_ERR_INVALID_ARGUMENT; + return false; + } + + out_tokens.assign(capacity, 0); + int32_t token_count = 0; + emel::tokenizer::event::tokenize tok_ev{}; + tok_ev.vocab = &vocab; + tok_ev.text = text; + tok_ev.add_special = add_special; + tok_ev.parse_special = parse_special; + tok_ev.token_ids_out = out_tokens.data(); + tok_ev.token_capacity = static_cast(capacity); + tok_ev.token_count_out = &token_count; + tok_ev.error_out = &err_out; + + if (!tokenizer.process_event(tok_ev)) { + return false; + } + if (err_out != EMEL_OK) { + return false; + } + if (token_count < 0 || token_count > static_cast(capacity)) { + err_out = EMEL_ERR_INVALID_ARGUMENT; + return false; + } + out_tokens.resize(static_cast(token_count)); + return true; +} + +bool run_llama_tokenize(const llama_vocab * vocab, + 
std::string_view text,
+                        bool add_special,
+                        bool parse_special,
+                        std::vector & out_tokens) {
+    if (text.size() > static_cast(INT32_MAX)) {
+        return false;  // llama_tokenize takes an int32_t length
+    }
+    const int32_t text_len = static_cast(text.size());
+    int32_t capacity = static_cast(estimate_token_capacity(text));
+    out_tokens.assign(static_cast(capacity), llama_token{});
+
+    int32_t count = llama_tokenize(
+        vocab,
+        text.data(),
+        text_len,
+        out_tokens.data(),
+        capacity,
+        add_special,
+        parse_special);
+    if (count == INT32_MIN) {
+        return false;  // hard failure sentinel, not a capacity hint
+    }
+    if (count < 0) {
+        // Negative count = required capacity; retry once with the exact size.
+        capacity = -count;
+        if (capacity <= 0) {
+            return false;
+        }
+        out_tokens.assign(static_cast(capacity), llama_token{});
+        count = llama_tokenize(
+            vocab,
+            text.data(),
+            text_len,
+            out_tokens.data(),
+            capacity,
+            add_special,
+            parse_special);
+    }
+    if (count < 0) {
+        return false;
+    }
+    out_tokens.resize(static_cast(count));
+    return true;
+}
+
+// Escape a token piece for one-line dumping: printable ASCII passes through,
+// backslash doubles, everything else becomes \xNN.
+std::string escape_piece(std::string_view text) {
+    std::string out;
+    out.reserve(text.size());
+    for (const unsigned char c : text) {
+        if (c >= 0x20 && c <= 0x7e && c != '\\') {
+            out.push_back(static_cast(c));
+            continue;
+        }
+        if (c == '\\') {
+            out += "\\\\";
+            continue;
+        }
+        char buf[5] = {};  // "\xNN" + NUL
+        std::snprintf(buf, sizeof(buf), "\\x%02x", static_cast(c));
+        out += buf;
+    }
+    return out;
+}
+
+// Raw token text from the emel vocab tables; empty on out-of-range id or
+// empty entry.
+std::string emel_token_text(const emel::model::data::vocab & vocab, const int32_t token) {
+    if (token < 0 || static_cast(token) >= vocab.n_tokens) {
+        return {};
+    }
+    const auto & entry = vocab.entries[static_cast(token)];
+    if (entry.text_length == 0) {
+        return {};
+    }
+    return std::string(vocab.token_storage.data() + entry.text_offset,
+                       entry.text_length);
+}
+
+// Raw token text via llama.cpp; empty when the vocab or text is unavailable.
+std::string llama_token_text(const llama_vocab * vocab, const llama_token token) {
+    if (vocab == nullptr) {
+        return {};
+    }
+    const char * text = llama_vocab_get_text(vocab, token);
+    if (text == nullptr) {
+        return {};
+    }
+    return std::string(text);
+}
+
+// Print `label (count): id("piece"), ...` to stdout; prefers llama piece text
+// when a llama vocab is available, else falls back to the emel vocab.
+template
+void dump_token_list(const char * label,
+                     const
std::vector & tokens,
+                     const emel::model::data::vocab & emel_vocab,
+                     const llama_vocab * llama_vocab) {
+    std::fprintf(stdout, "%s (%zu):", label, tokens.size());
+    for (size_t i = 0; i < tokens.size(); ++i) {
+        const int32_t id = static_cast(tokens[i]);
+        std::string piece;
+        if (llama_vocab != nullptr) {
+            piece = llama_token_text(llama_vocab, id);
+        } else {
+            piece = emel_token_text(emel_vocab, id);
+        }
+        const std::string escaped = escape_piece(piece);
+        if (!escaped.empty()) {
+            std::fprintf(stdout, "%s%d(\"%s\")", (i == 0 ? " " : ", "), id, escaped.c_str());
+        } else {
+            std::fprintf(stdout, "%s%d", (i == 0 ? " " : ", "), id);
+        }
+    }
+    std::fprintf(stdout, "\n");
+}
+
+// Size check, then element-wise compare; the first difference is reported
+// to stderr and comparison stops there.
+bool compare_tokens(const std::vector & emel_tokens,
+                    const std::vector & llama_tokens) {
+    if (emel_tokens.size() != llama_tokens.size()) {
+        std::fprintf(stderr,
+                     "token count mismatch: emel=%zu llama=%zu\n",
+                     emel_tokens.size(),
+                     llama_tokens.size());
+        return false;
+    }
+    for (size_t i = 0; i < emel_tokens.size(); ++i) {
+        if (emel_tokens[i] != static_cast(llama_tokens[i])) {
+            std::fprintf(stderr,
+                         "token mismatch at index %zu: emel=%d llama=%d\n",
+                         i,
+                         emel_tokens[i],
+                         static_cast(llama_tokens[i]));
+            return false;
+        }
+    }
+    return true;
+}
+
+} // namespace
+
+namespace emel::paritychecker {
+
+// One full parity run: emel vocab load + tokenize, llama.cpp vocab-only load
+// + tokenize, optional dumps, then comparison. Returns 0 on parity, 1 on any
+// mismatch or failure.
+int run_parity(const parity_options & opts) {
+    auto model = std::make_unique();
+    emel::parser::gguf::context gguf_ctx{};
+    int32_t err = EMEL_OK;
+    if (!load_emel_vocab(opts.model_path, *model, gguf_ctx, err)) {
+        std::fprintf(stderr, "emel parser failed: %d\n", err);
+        return 1;
+    }
+
+    // Bind the tokenizer to the freshly parsed vocab before tokenizing.
+    emel::tokenizer::sm tokenizer;
+    int32_t bind_err = EMEL_OK;
+    emel::tokenizer::event::bind bind_ev{};
+    bind_ev.vocab = &model->vocab_data;
+    bind_ev.error_out = &bind_err;
+    if (!tokenizer.process_event(bind_ev) || bind_err != EMEL_OK) {
+        std::fprintf(stderr, "emel tokenizer bind failed: %d\n", bind_err);
+        return 1;
+    }
+
+    std::vector emel_tokens;
+    if (!run_emel_tokenize(tokenizer,
+
model->vocab_data,
+                           opts.text,
+                           opts.add_special,
+                           opts.parse_special,
+                           emel_tokens,
+                           err)) {
+        std::fprintf(stderr, "emel tokenization failed: %d\n", err);
+        return 1;
+    }
+
+    // llama.cpp side: vocab-only model load mirroring the emel load above.
+    // Every early return below must unwind llama state manually -- keep the
+    // free/backend_free pairs in sync if new failure paths are added.
+    llama_backend_init();
+    llama_model_params params = llama_model_default_params();
+    params.vocab_only = true;
+    struct llama_model * llama_model = llama_model_load_from_file(opts.model_path.c_str(), params);
+    if (llama_model == nullptr) {
+        std::fprintf(stderr, "llama model load failed\n");
+        llama_backend_free();
+        return 1;
+    }
+    const llama_vocab * llama_vocab = llama_model_get_vocab(llama_model);
+    if (llama_vocab == nullptr) {
+        std::fprintf(stderr, "llama vocab missing\n");
+        llama_model_free(llama_model);
+        llama_backend_free();
+        return 1;
+    }
+
+    std::vector llama_tokens;
+    if (!run_llama_tokenize(llama_vocab,
+                            opts.text,
+                            opts.add_special,
+                            opts.parse_special,
+                            llama_tokens)) {
+        std::fprintf(stderr, "llama tokenization failed\n");
+        llama_model_free(llama_model);
+        llama_backend_free();
+        return 1;
+    }
+
+    if (opts.dump_tokens) {
+        dump_token_list("emel", emel_tokens, model->vocab_data, llama_vocab);
+        dump_token_list("llama", llama_tokens, model->vocab_data, llama_vocab);
+    }
+
+    const bool matched = compare_tokens(emel_tokens, llama_tokens);
+    if (matched) {
+        std::fprintf(stdout, "parity ok (%zu tokens)\n", emel_tokens.size());
+    }
+
+    llama_model_free(llama_model);
+    llama_backend_free();
+
+    return matched ?
0 : 1;
+}
+
+} // namespace emel::paritychecker
diff --git a/tools/paritychecker/parity_runner.hpp b/tools/paritychecker/parity_runner.hpp
new file mode 100644
index 0000000..97f523a
--- /dev/null
+++ b/tools/paritychecker/parity_runner.hpp
@@ -0,0 +1,17 @@
+#pragma once
+
+// NOTE(review): header name stripped in paste -- presumably <string>.
+#include
+
+namespace emel::paritychecker {
+
+// Options collected from the CLI by main(); consumed by run_parity().
+struct parity_options {
+    std::string model_path;        // required: path to the .gguf model
+    std::string text;              // required: input from --text or --text-file
+    bool add_special = false;      // prepend/append BOS/EOS-style specials
+    bool parse_special = false;    // treat special-token text as specials
+    bool dump_tokens = false;      // print both token lists to stdout
+};
+
+// Returns the process exit code: 0 parity, 1 mismatch/failure.
+int run_parity(const parity_options & opts);
+
+} // namespace emel::paritychecker
diff --git a/tools/paritychecker/paritychecker_tests.cpp b/tools/paritychecker/paritychecker_tests.cpp
new file mode 100644
index 0000000..50ca6f4
--- /dev/null
+++ b/tools/paritychecker/paritychecker_tests.cpp
@@ -0,0 +1,190 @@
+#define DOCTEST_CONFIG_IMPLEMENT_WITH_MAIN
+
+// NOTE(review): system/doctest header names stripped in paste -- restore
+// (likely <algorithm>, <cstdio>, <cstdlib>, <filesystem>, <string>, <vector>,
+// doctest, and <sys/wait.h> in the POSIX branch) before applying.
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+
+#if !defined(_WIN32)
+#include
+#endif
+
+namespace {
+
+// Fixture locations. When PARITYCHECKER_REPO_ROOT is defined at compile time
+// the paths are anchored at the repo root; otherwise they are cwd-relative.
+std::filesystem::path models_dir() {
+#ifdef PARITYCHECKER_REPO_ROOT
+    std::filesystem::path root = PARITYCHECKER_REPO_ROOT;
+    return root / "tests" / "models";
+#else
+    return std::filesystem::path("tests") / "models";
+#endif
+}
+
+std::filesystem::path parity_texts_dir() {
+#ifdef PARITYCHECKER_REPO_ROOT
+    std::filesystem::path root = PARITYCHECKER_REPO_ROOT;
+    return root / "tests" / "tokenizer" / "parity_texts";
+#else
+    return std::filesystem::path("tests") / "tokenizer" / "parity_texts";
+#endif
+}
+
+// Existence-and-readability probe via fopen (fails on unreadable files too).
+bool file_exists(const std::filesystem::path & path) {
+    std::FILE * file = std::fopen(path.string().c_str(), "rb");
+    if (file == nullptr) {
+        return false;
+    }
+    std::fclose(file);
+    return true;
+}
+
+// All *.gguf files under tests/models, sorted for deterministic test order.
+std::vector discover_models() {
+    std::vector models;
+    const auto dir = models_dir();
+    if (!std::filesystem::exists(dir)) {
+        return models;
+    }
+    for (const auto & entry : std::filesystem::directory_iterator(dir)) {
+        if (!entry.is_regular_file()) {
+            continue;
+        }
+        const auto path = entry.path();
+
if (path.extension() != ".gguf") {
+            continue;
+        }
+        models.push_back(path.string());
+    }
+    std::sort(models.begin(), models.end());
+    return models;
+}
+
+// One tokenization scenario: labelled input text plus special-token flags.
+struct parity_case {
+    std::string label;
+    std::filesystem::path text_path;
+    bool add_special = false;
+    bool parse_special = false;
+};
+
+// Single-quote an argument for POSIX sh; embedded quotes become '\''.
+std::string quote_arg_posix(const std::string & arg) {
+    std::string out = "'";
+    for (const char c : arg) {
+        if (c == '\'') {
+            out += "'\\''";
+        } else {
+            out.push_back(c);
+        }
+    }
+    out += "'";
+    return out;
+}
+
+// Double-quote an argument for cmd.exe; embedded quotes are backslash-escaped.
+std::string quote_arg_windows(const std::string & arg) {
+    std::string out = "\"";
+    for (const char c : arg) {
+        if (c == '"') {
+            out += "\\\"";
+        } else {
+            out.push_back(c);
+        }
+    }
+    out += "\"";
+    return out;
+}
+
+// Map a model filename to its tokenizer-family-specific special-token fixture;
+// returns an empty string when no special-token case applies to this model.
+std::string special_text_for_model(const std::filesystem::path & model_path) {
+    const std::string name = model_path.filename().string();
+    const auto texts = parity_texts_dir();
+    if (name.find("Llama-") != std::string::npos) {
+        return (texts / "special_llama.txt").string();
+    }
+    if (name.find("distilgpt2") != std::string::npos) {
+        return (texts / "special_gpt2.txt").string();
+    }
+    if (name.find("bert-base-uncased") != std::string::npos) {
+        return (texts / "special_bert.txt").string();
+    }
+    if (name.find("flan-t5") != std::string::npos) {
+        return (texts / "special_t5.txt").string();
+    }
+    if (name.find("rwkv") != std::string::npos) {
+        return (texts / "special_rwkv.txt").string();
+    }
+    return {};
+}
+
+// Scenarios run against every discovered model.
+std::vector base_cases() {
+    const auto texts = parity_texts_dir();
+    return {
+        {"basic_add_special", texts / "basic.txt", true, false},
+        {"basic_no_special", texts / "basic.txt", false, false},
+        {"whitespace", texts / "whitespace.txt", true, false},
+        {"unicode", texts / "unicode.txt", true, false},
+        {"long", texts / "long.txt", false, false},
+    };
+}
+
+// Shell out to the paritychecker binary with the given model/case; true iff
+// the child exits 0 (i.e. parity held).
+bool run_paritychecker_process(const std::string & model, const parity_case & test_case) {
+    std::string command;
+#if defined(_WIN32)
+    command = ".\\paritychecker --model ";
+
command += quote_arg_windows(model);
+    command += " --text-file ";
+    command += quote_arg_windows(test_case.text_path.string());
+#else
+    // NOTE(review): the ulimit presumably guards the child's stack size --
+    // confirm it is still required and that 8192 KiB is the intended bound.
+    command = "ulimit -s 8192; ./paritychecker --model ";
+    command += quote_arg_posix(model);
+    command += " --text-file ";
+    command += quote_arg_posix(test_case.text_path.string());
+#endif
+    if (test_case.add_special) {
+        command += " --add-special";
+    }
+    if (test_case.parse_special) {
+        command += " --parse-special";
+    }
+    const int status = std::system(command.c_str());
+    if (status == -1) {
+        return false;  // shell could not be spawned at all
+    }
+#if defined(_WIN32)
+    return status == 0;
+#else
+    // POSIX: only a normal exit with code 0 counts as success.
+    if (!WIFEXITED(status)) {
+        return false;
+    }
+    return WEXITSTATUS(status) == 0;
+#endif
+}
+
+} // namespace
+
+// End-to-end check: every bundled .gguf model must tokenize identically in
+// emel and llama.cpp for each base case, plus a tokenizer-family-specific
+// special-token case when one exists for the model.
+TEST_CASE("paritychecker matches llama tokens across tiny models") {
+    const std::vector models = discover_models();
+    const std::vector cases = base_cases();
+
+    REQUIRE(!models.empty());
+    for (const auto & model : models) {
+        INFO("model: " << model);
+        REQUIRE(file_exists(std::filesystem::path(model)));
+        for (const auto & test_case : cases) {
+            INFO("case: " << test_case.label);
+            REQUIRE(file_exists(test_case.text_path));
+            CHECK(run_paritychecker_process(model, test_case));
+        }
+        const std::string special_text = special_text_for_model(model);
+        if (!special_text.empty()) {
+            INFO("case: special_parse");
+            REQUIRE(file_exists(std::filesystem::path(special_text)));
+            parity_case special_case;
+            special_case.label = "special_parse";
+            special_case.text_path = special_text;
+            special_case.add_special = true;
+            special_case.parse_special = true;
+            CHECK(run_paritychecker_process(model, special_case));
+        }
+    }
+}
diff --git a/tools/paritychecker/reference_ref.txt b/tools/paritychecker/reference_ref.txt
new file mode 100644
index 0000000..e295a4c
--- /dev/null
+++ b/tools/paritychecker/reference_ref.txt
@@ -0,0 +1 @@
+94b0200a01a753eff5897dab9311f51a7bc1c62f