Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitattributes
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
*.gguf filter=lfs diff=lfs merge=lfs -text
8 changes: 5 additions & 3 deletions AGENTS.md
Original file line number Diff line number Diff line change
Expand Up @@ -162,9 +162,11 @@ ALWAYS preserve or improve performance when porting logic from the reference
implementation.
ALWAYS implement equivalent functionality natively without external llama.cpp or
ggml linkage.
- NEVER link "emel" against llama.cpp or ggml outside `tools/bench`.
- ALWAYS link llama.cpp and ggml together with emel in `tools/bench` only.
+ NEVER link "emel" against llama.cpp or ggml outside `tools/bench` or
+ `tools/paritychecker`.
+ ALWAYS link llama.cpp and ggml together with emel in `tools/bench` and
+ `tools/paritychecker` only.
  NEVER use `llama_` or `ggml_` prefixes in identifiers, symbols, files, or APIs
- outside `tools/bench`.
+ outside `tools/bench` or `tools/paritychecker`.
ALWAYS use `emel_` or `EMEL_` prefixes for project-owned identifiers, symbols,
files, and APIs.
5 changes: 5 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,11 @@ if(EMEL_ENABLE_TESTS)
tests/tokenizer/preprocessor_rwkv_tests.cpp
tests/tokenizer/preprocessor_plamo2_tests.cpp
tests/tokenizer/preprocessor_fallback_tests.cpp
tests/tokenizer/bpe_regex_tests.cpp
tests/tokenizer/bpe_split_tests.cpp
tests/tokenizer/tokenizer_tests.cpp
tests/tokenizer/tokenizer_parity_tests.cpp
tests/tokenizer/tokenizer_action_guard_tests.cpp
tests/batch/splitter_tests.cpp
tests/batch/splitter_actions_tests.cpp
tests/batch/splitter_additional_tests.cpp
Expand Down
59 changes: 34 additions & 25 deletions docs/architecture/mermaid/tokenizer.mmd
Original file line number Diff line number Diff line change
@@ -1,25 +1,34 @@
stateDiagram-v2
direction TB
[*] --> initialized
initialized --> building_special_tokens : tokenize [can_tokenize_] / begin_tokenize_
initialized --> errored : tokenize [always] / reject_invalid_
done --> building_special_tokens : tokenize [can_tokenize_] / begin_tokenize_
[*] --> uninitialized
uninitialized --> binding_preprocessor : bind [can_bind_] / begin_bind_
uninitialized --> errored : bind [always] / reject_bind_
uninitialized --> errored : tokenize [always] / reject_invalid_
binding_preprocessor --> binding_preprocessor_decision : [always] / bind_preprocessor_
binding_preprocessor_decision --> errored : [phase_failed_] / none
binding_preprocessor_decision --> binding_encoder : [phase_ok_] / none
binding_encoder --> binding_encoder_decision : [always] / bind_encoder_
binding_encoder_decision --> errored : [phase_failed_] / none
binding_encoder_decision --> idle : [phase_ok_] / none
idle --> binding_preprocessor : bind [can_bind_] / begin_bind_
idle --> errored : bind [always] / reject_bind_
idle --> preprocessing : tokenize [can_tokenize_] / begin_tokenize_
idle --> errored : tokenize [always] / reject_invalid_
done --> binding_preprocessor : bind [can_bind_] / begin_bind_
done --> errored : bind [always] / reject_bind_
done --> preprocessing : tokenize [can_tokenize_] / begin_tokenize_
done --> errored : tokenize [always] / reject_invalid_
errored --> building_special_tokens : tokenize [can_tokenize_] / begin_tokenize_
errored --> binding_preprocessor : bind [can_bind_] / begin_bind_
errored --> errored : bind [always] / reject_bind_
errored --> preprocessing : tokenize [can_tokenize_] / begin_tokenize_
errored --> errored : tokenize [always] / reject_invalid_
unexpected --> building_special_tokens : tokenize [can_tokenize_] / begin_tokenize_
unexpected --> binding_preprocessor : bind [can_bind_] / begin_bind_
unexpected --> unexpected : bind [always] / reject_bind_
unexpected --> preprocessing : tokenize [can_tokenize_] / begin_tokenize_
unexpected --> unexpected : tokenize [always] / reject_invalid_
building_special_tokens --> special_tokens_decision : [always] / build_special_tokens_
special_tokens_decision --> errored : [phase_failed_] / none
special_tokens_decision --> partitioning_with_specials : [has_special_tokens_] / none
special_tokens_decision --> partitioning_raw : [no_special_tokens_] / none
partitioning_raw --> partitioning_decision : [always] / partition_raw_
partitioning_with_specials --> partitioning_decision : [always] / partition_with_specials_
partitioning_decision --> errored : [phase_failed_] / none
partitioning_decision --> selecting_backend : [phase_ok_] / none
selecting_backend --> selecting_backend_decision : [always] / select_backend_
selecting_backend_decision --> errored : [phase_failed_] / none
selecting_backend_decision --> prefix_decision : [phase_ok_] / none
preprocessing --> preprocess_decision : [always] / run_preprocess_
preprocess_decision --> errored : [phase_failed_] / none
preprocess_decision --> prefix_decision : [phase_ok_] / none
prefix_decision --> encoding_ready : [bos_ready_] / append_bos_
prefix_decision --> errored : [bos_no_capacity_] / set_capacity_error_
prefix_decision --> errored : [bos_invalid_id_] / set_invalid_id_error_
Expand All @@ -40,14 +49,14 @@ stateDiagram-v2
suffix_decision --> errored : [eos_invalid_id_] / set_invalid_id_error_
suffix_decision --> finalizing : [no_suffix_] / none
finalizing --> done : [always] / finalize_
initialized --> unexpected : _ [always] / on_unexpected_
building_special_tokens --> unexpected : _ [always] / on_unexpected_
special_tokens_decision --> unexpected : _ [always] / on_unexpected_
partitioning_raw --> unexpected : _ [always] / on_unexpected_
partitioning_with_specials --> unexpected : _ [always] / on_unexpected_
partitioning_decision --> unexpected : _ [always] / on_unexpected_
selecting_backend --> unexpected : _ [always] / on_unexpected_
selecting_backend_decision --> unexpected : _ [always] / on_unexpected_
uninitialized --> unexpected : _ [always] / on_unexpected_
binding_preprocessor --> unexpected : _ [always] / on_unexpected_
binding_preprocessor_decision --> unexpected : _ [always] / on_unexpected_
binding_encoder --> unexpected : _ [always] / on_unexpected_
binding_encoder_decision --> unexpected : _ [always] / on_unexpected_
idle --> unexpected : _ [always] / on_unexpected_
preprocessing --> unexpected : _ [always] / on_unexpected_
preprocess_decision --> unexpected : _ [always] / on_unexpected_
prefix_decision --> unexpected : _ [always] / on_unexpected_
encoding_ready --> unexpected : _ [always] / on_unexpected_
encoding_token_fragment --> unexpected : _ [always] / on_unexpected_
Expand Down
10 changes: 1 addition & 9 deletions docs/architecture/mermaid/tokenizer_preprocessor_fallback.mmd
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,7 @@ stateDiagram-v2
[*] --> idle
idle --> preparing : preprocess [valid_request_] / begin_preprocess_
idle --> errored : preprocess [invalid_request_] / reject_invalid_
preparing --> partitioning_select : [always] / build_specials_
partitioning_select --> partitioning_bpe_no_specials : [bpe_no_specials_] / none
partitioning_select --> partitioning_bpe_with_specials : [bpe_with_specials_] / none
partitioning_select --> partitioning_non_bpe : [not_bpe_] / none
partitioning_bpe_no_specials --> partition_decision : [always] / partition_bpe_no_specials_
partitioning_bpe_with_specials --> partition_decision : [always] / partition_bpe_with_specials_
preparing --> partitioning_non_bpe : [always] / build_specials_
partitioning_non_bpe --> partition_decision : [always] / partition_non_bpe_
partition_decision --> errored : [phase_failed_] / ensure_last_error_
partition_decision --> done : [phase_ok_] / mark_done_
Expand All @@ -20,9 +15,6 @@ stateDiagram-v2
unexpected --> errored : preprocess [invalid_request_] / reject_invalid_
idle --> unexpected : _ [always] / on_unexpected_
preparing --> unexpected : _ [always] / on_unexpected_
partitioning_select --> unexpected : _ [always] / on_unexpected_
partitioning_bpe_no_specials --> unexpected : _ [always] / on_unexpected_
partitioning_bpe_with_specials --> unexpected : _ [always] / on_unexpected_
partitioning_non_bpe --> unexpected : _ [always] / on_unexpected_
partition_decision --> unexpected : _ [always] / on_unexpected_
done --> unexpected : _ [always] / on_unexpected_
Expand Down
Loading