Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 0 additions & 2 deletions common_settings.bzl
Original file line number Diff line number Diff line change
Expand Up @@ -209,8 +209,6 @@ COMMON_STATIC_TEST_COPTS = select({
"-Wall",
"-Wno-unknown-pragmas",
"-Werror",
# ov::Tensor::data method call results in deprecated warning and we use it in multiple places
"-Wno-deprecated-declarations",
"-Isrc",
"-fconcepts", # for gmock related utils
"-fvisibility=hidden",# Needed for pybind targets
Expand Down
4 changes: 4 additions & 0 deletions demos/benchmark/v3/benchmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -438,4 +438,8 @@ async def limited_request_func(request_func_input, pbar):
print(f"Throughput - Tokens per second: {num_tokens / benchmark_results['duration']:^,.1f}")
print(f"Mean latency: {np.mean(benchmark_results['latencies'])*1000:.2f} ms")
print(f"Median latency: {np.median(benchmark_results['latencies'])*1000:.2f} ms")
# add printing 10 percentiles of latency to better understand latency distribution
percentiles = [10, 25, 50, 75, 90, 95, 99]
for p in percentiles:
print(f"{p}th percentile latency: {np.percentile(benchmark_results['latencies'], p)*1000:.2f} ms")
print(f"Average document length: {num_tokens / len(docs)} tokens")
32 changes: 31 additions & 1 deletion demos/common/export_models/export_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,17 @@ def add_common_arguments(parser):
parser_speech2text.add_argument('--enable_word_timestamps', default=False, action='store_true', help='Load model with word timestamps support.', dest='enable_word_timestamps')
args = vars(parser.parse_args())


def _default_graph_queue_size(task_name):
if task_name == 'image_generation':
return 1
return 'AUTO'


def _prepend_graph_queue_directive(graph_content, task_name):
    """Prefix rendered graph content with the OVMS_GRAPH_QUEUE_SIZE directive.

    The directive is emitted as a comment on the first line of the
    graph.pbtxt content; its value comes from the task's default queue size.
    """
    size = _default_graph_queue_size(task_name)
    directive = "# OVMS_GRAPH_QUEUE_SIZE: {}\n".format(size)
    return directive + graph_content

t2s_graph_template = """
input_stream: "HTTP_REQUEST_PAYLOAD:input"
output_stream: "HTTP_RESPONSE_PAYLOAD:output"
Expand Down Expand Up @@ -488,14 +499,27 @@ def export_text_generation_model(model_repository_path, source_model, model_name
gtemplate = jinja2.Environment(loader=jinja2.BaseLoader).from_string(text_generation_graph_template)
print("task_parameters", task_parameters)
graph_content = gtemplate.render(model_path=model_path, draft_model_dir_name=draft_model_dir_name, **task_parameters)
graph_content = _prepend_graph_queue_directive(graph_content, 'text_generation')
with open(os.path.join(model_repository_path, model_name, 'graph.pbtxt'), 'w') as f:
f.write(graph_content)
print("Created graph {}".format(os.path.join(model_repository_path, model_name, 'graph.pbtxt')))
add_servable_to_config(config_file_path, model_name, os.path.relpath(os.path.join(model_repository_path, model_name), os.path.dirname(config_file_path)))

def export_embeddings_model_ov(model_repository_path, source_model, model_name, precision, task_parameters, config_file_path, truncate=True):
set_max_context_length = ""
destination_path = os.path.join(model_repository_path, model_name)
destination_path = os.path.join(model_repository_path, model_name)
In file included from src/llm/language_model/continuous_batching/../../../logging.hpp:24,
from src/llm/language_model/continuous_batching/servable.cpp:22:
src/llm/language_model/continuous_batching/llm_executor.hpp: In member function 'void ovms::LLMExecutor::printMetrics()':
src/llm/language_model/continuous_batching/llm_executor.hpp:105:104: error: 'struct ov::genai::PipelineMetrics' has no member named 'kv_cache_size_in_bytes'
105 | metrics.requests, metrics.scheduled_requests, formatCacheInfo(metrics.cache_usage, metrics.kv_cache_size_in_bytes, this->isDynamicKVCache));
| ^~~~~~~~~~~~~~~~~~~~~~
Target //src:ovms failed to build
Use --verbose_failures to see the command lines of failed build steps.
INFO: Elapsed time: 9.590s, Critical Path: 8.22s
INFO: 64 processes: 64 internal.
FAILED: Build did NOT complete successfully
root@b6674760ad87:/ovms# bazel build --config mp_on_py_off //src:ovms
Comment on lines +510 to +522
Copy link

Copilot AI Mar 25, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The line assigning destination_path appears to have an entire Bazel build log appended to it, which will make this script syntactically invalid and break model export. Remove the pasted build output and keep the assignment as a valid Python statement (the build log should not be committed into this file).

Suggested change
destination_path = os.path.join(model_repository_path, model_name)ERROR: /ovms/src/llm/BUILD:196:16: Compiling src/llm/language_model/continuous_batching/servable.cpp failed: (Exit 1): gcc failed: error executing command (from target //src/llm:genai_servables) /usr/bin/gcc -U_FORTIFY_SOURCE -fstack-protector -Wall -Wunused-but-set-parameter -Wno-free-nonheap-object -fno-omit-frame-pointer -g0 -O2 '-D_FORTIFY_SOURCE=1' -DNDEBUG -ffunction-sections ... (remaining 156 arguments skipped)
In file included from src/llm/language_model/continuous_batching/../../../logging.hpp:24,
from src/llm/language_model/continuous_batching/servable.cpp:22:
src/llm/language_model/continuous_batching/llm_executor.hpp: In member function 'void ovms::LLMExecutor::printMetrics()':
src/llm/language_model/continuous_batching/llm_executor.hpp:105:104: error: 'struct ov::genai::PipelineMetrics' has no member named 'kv_cache_size_in_bytes'
105 | metrics.requests, metrics.scheduled_requests, formatCacheInfo(metrics.cache_usage, metrics.kv_cache_size_in_bytes, this->isDynamicKVCache));
| ^~~~~~~~~~~~~~~~~~~~~~
Target //src:ovms failed to build
Use --verbose_failures to see the command lines of failed build steps.
INFO: Elapsed time: 9.590s, Critical Path: 8.22s
INFO: 64 processes: 64 internal.
FAILED: Build did NOT complete successfully
root@b6674760ad87:/ovms# bazel build --config mp_on_py_off //src:ovms
destination_path = os.path.join(model_repository_path, model_name)

Copilot uses AI. Check for mistakes.
print("Exporting embeddings model to ",destination_path)
if not os.path.isdir(destination_path) or args['overwrite_models']:
optimum_command = "optimum-cli export openvino --model {} --disable-convert-tokenizer --task feature-extraction --weight-format {} {} --trust-remote-code {}".format(source_model, precision, task_parameters['extra_quantization_params'], destination_path)
Expand All @@ -509,6 +533,7 @@ def export_embeddings_model_ov(model_repository_path, source_model, model_name,
raise ValueError("Failed to export tokenizer model", source_model)
gtemplate = jinja2.Environment(loader=jinja2.BaseLoader).from_string(embedding_graph_ov_template)
graph_content = gtemplate.render(model_path="./", **task_parameters)
graph_content = _prepend_graph_queue_directive(graph_content, 'embeddings_ov')
with open(os.path.join(model_repository_path, model_name, 'graph.pbtxt'), 'w') as f:
f.write(graph_content)
print("Created graph {}".format(os.path.join(model_repository_path, model_name, 'graph.pbtxt')))
Expand All @@ -523,6 +548,7 @@ def export_text2speech_model(model_repository_path, source_model, model_name, pr
raise ValueError("Failed to export text2speech model", source_model)
gtemplate = jinja2.Environment(loader=jinja2.BaseLoader).from_string(t2s_graph_template)
graph_content = gtemplate.render(model_path="./", **task_parameters)
graph_content = _prepend_graph_queue_directive(graph_content, 'text2speech')
with open(os.path.join(model_repository_path, model_name, 'graph.pbtxt'), 'w') as f:
f.write(graph_content)
print("Created graph {}".format(os.path.join(model_repository_path, model_name, 'graph.pbtxt')))
Expand All @@ -537,6 +563,7 @@ def export_speech2text_model(model_repository_path, source_model, model_name, pr
raise ValueError("Failed to export speech2text model", source_model)
gtemplate = jinja2.Environment(loader=jinja2.BaseLoader).from_string(s2t_graph_template)
graph_content = gtemplate.render(model_path="./", **task_parameters)
graph_content = _prepend_graph_queue_directive(graph_content, 'speech2text')
with open(os.path.join(model_repository_path, model_name, 'graph.pbtxt'), 'w') as f:
f.write(graph_content)
print("Created graph {}".format(os.path.join(model_repository_path, model_name, 'graph.pbtxt')))
Expand All @@ -553,6 +580,7 @@ def export_rerank_model_ov(model_repository_path, source_model, model_name, prec
export_rerank_tokenizer(source_model, destination_path, max_doc_length)
gtemplate = jinja2.Environment(loader=jinja2.BaseLoader).from_string(rerank_graph_ov_template)
graph_content = gtemplate.render(model_path="./", **task_parameters)
graph_content = _prepend_graph_queue_directive(graph_content, 'rerank_ov')
with open(os.path.join(model_repository_path, model_name, 'graph.pbtxt'), 'w') as f:
f.write(graph_content)
print("Created graph {}".format(os.path.join(model_repository_path, model_name, 'graph.pbtxt')))
Expand Down Expand Up @@ -589,6 +617,7 @@ def export_rerank_model(model_repository_path, source_model, model_name, precisi
shutil.move(os.path.join(tmpdirname, 'openvino_tokenizer.bin'), os.path.join(tokenizer_path, 'model.bin'))
gtemplate = jinja2.Environment(loader=jinja2.BaseLoader).from_string(rerank_graph_template)
graph_content = gtemplate.render(model_name=model_name, **task_parameters)
graph_content = _prepend_graph_queue_directive(graph_content, 'rerank')
with open(os.path.join(model_repository_path, model_name, 'graph.pbtxt'), 'w') as f:
f.write(graph_content)
print("Created graph {}".format(os.path.join(model_repository_path, model_name, 'graph.pbtxt')))
Expand Down Expand Up @@ -635,6 +664,7 @@ def export_image_generation_model(model_repository_path, source_model, model_nam

gtemplate = jinja2.Environment(loader=jinja2.BaseLoader).from_string(image_generation_graph_template)
graph_content = gtemplate.render(model_path=model_path, **task_parameters)
graph_content = _prepend_graph_queue_directive(graph_content, 'image_generation')
with open(os.path.join(model_repository_path, model_name, 'graph.pbtxt'), 'w') as f:
f.write(graph_content)
print("Created graph {}".format(os.path.join(model_repository_path, model_name, 'graph.pbtxt')))
Expand Down
42 changes: 42 additions & 0 deletions docs/mediapipe.md
Original file line number Diff line number Diff line change
Expand Up @@ -215,6 +215,48 @@ Nodes in the MediaPipe graphs can reference both the models configured in model_

Subconfig file may only contain *model_config_list* section - in the same format as in [models config file](starting_server.md).

### Graph Pool (Pre-initialized Graph Queue)

OpenVINO Model Server can pre-initialize a pool of MediaPipe `CalculatorGraph` instances for a graph definition. Graphs in the pool are started once during server initialization and reused across inference requests, eliminating per-request graph initialization and teardown overhead. This is especially beneficial for graphs that involve expensive setup performed in a calculator's `Open()` method.

#### How it works

Without graph pool (legacy behavior), each incoming request creates a new `CalculatorGraph`, calls `StartRun()` with side packets, processes the request, then tears down the graph via `CloseAllPacketSources()` and `WaitUntilDone()`.

With graph pool enabled, a fixed number of graphs are pre-initialized and kept in a queue. When a request arrives, an idle graph is acquired from the queue. After processing, the graph is returned to the queue for the next request. The graph is never torn down — instead, `WaitUntilIdle()` is called between requests and the internal timestamp is incremented.

#### Configuration

The graph pool size is controlled via a comment directive in the graph `.pbtxt` file:

```
# OVMS_GRAPH_QUEUE_SIZE: AUTO
```

| Value | Behavior |
|:------|:---------|
| `AUTO` | Pool size is set to the number of hardware threads (`std::thread::hardware_concurrency()`), or 16 if detection fails |
| Positive integer (e.g. `4`) | Pool size set to the given value (must not exceed hardware thread count) |
| `0` or `-1` | Graph pool disabled — falls back to per-request graph creation |
Copy link

Copilot AI Mar 25, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The documentation says # OVMS_GRAPH_QUEUE_SIZE: 0 disables the pool, but the current parser rejects 0 as invalid. Update the docs to match the implementation, or accept 0 as a valid 'disabled' value (consistent with the doc and PR description).

Suggested change
| `0` or `-1` | Graph pool disabled — falls back to per-request graph creation |
| `-1` | Graph pool disabled — falls back to per-request graph creation |

Copilot uses AI. Check for mistakes.
| *(directive absent)* | Default: graph pool is disabled |

**Default behavior:** graph pool stays disabled unless `OVMS_GRAPH_QUEUE_SIZE` is explicitly present in `graph.pbtxt`.

**Generated graphs from exporters:**
- `demos/common/export_models/export_model.py` and OVMS `--pull --task ...` graph export emit `OVMS_GRAPH_QUEUE_SIZE` automatically.
- In `export_model.py`: image generation graphs use `1`, and all other graph types use `AUTO`.
- In OVMS `--task ...` graph export: image generation graphs use `1`, and other graph types use `min(physical_cores, rest_workers)` (with OVMS default REST worker calculation when `rest_workers` is not provided explicitly).

#### Important considerations for graph developers

**Stateful calculators:**
Since graphs in the pool are reused across requests, any state held by a calculator between `Process()` calls will persist across requests. If your calculator accumulates state (e.g. counters, buffers, history), that state will carry over to the next request that reuses the same graph instance. Design your calculators to either:
- Be stateless (reset any per-request state at the beginning of each `Process()` call), or
- Explicitly handle the fact that the graph may have already processed prior requests.

**Input side packets from requests are not supported:**
When graph pool is enabled, side packets are set once at pool construction time and cannot be overridden per request. If a client sends request parameters that would normally become input side packets (e.g. KServe request parameters other than `OVMS_MP_TIMESTAMP`), the request will be rejected with an error. If your graph relies on per-request side packets to configure calculator behavior, either disable the graph pool (`# OVMS_GRAPH_QUEUE_SIZE: 0`) or redesign the graph to accept such parameters as regular input stream packets instead of side packets.


## Deployment testing
### Debug logs
Expand Down
37 changes: 37 additions & 0 deletions src/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -150,6 +150,39 @@ ovms_cc_library(
hdrs = ["queue.hpp"],
visibility = ["//visibility:public",],
)
ovms_cc_library(
name = "mediapipe_internal_graph_side_packets",
hdrs = ["mediapipe_internal/graph_side_packets.hpp"],
visibility = ["//visibility:public",],
)
ovms_cc_library(
name = "mediapipe_internal_graph_executor_constants",
hdrs = ["mediapipe_internal/graph_executor_constants.hpp"],
visibility = ["//visibility:public"],
)
ovms_cc_library(
name = "mediapipe_internal_graphqueue",
hdrs = [
"mediapipe_internal/graphqueue.hpp",
"mediapipe_internal/outputstreamobserver.hpp",
], # TODO FIXME
srcs = ["mediapipe_internal/graphqueue.cpp"],
deps = [
"libovms_queue",
"libovmslogging",
"libovms_execution_context",
"libovmstimer",
"libovmsmetrics",
"model_metric_reporter",
"mediapipe_internal_graph_executor_constants",
"mediapipe_internal_graph_side_packets",
"//third_party:openvino",
"@mediapipe//mediapipe/framework:calculator_graph",
"//src/python:libovmspythonmodule", # TODO not split
"//src/llm:genai_servables", # TODO split!
],
visibility = ["//visibility:public",],
)
ovms_cc_library(
name = "libovms_ovinferrequestsqueue",
hdrs = ["ovinferrequestsqueue.hpp"],
Expand Down Expand Up @@ -542,6 +575,7 @@ ovms_cc_library(
"mediapipe_internal/mediapipegraphconfig.cpp",
"mediapipe_internal/mediapipegraphdefinition.cpp",
"mediapipe_internal/mediapipegraphdefinition.hpp",
"mediapipe_internal/outputstreamobserver.hpp",
"mediapipe_internal/mediapipegraphexecutor.cpp",
"mediapipe_internal/mediapipegraphexecutor.hpp",
"mediapipe_internal/packettypes.hpp",
Expand Down Expand Up @@ -682,6 +716,8 @@ ovms_cc_library(
})
+ select({
"//conditions:default": [
"mediapipe_internal_graph_executor_constants",
"mediapipe_internal_graphqueue",
"@mediapipe_calculators//:mediapipe_calculators", # Need this dependencies here because we use ovms/src - cannot add in ovms_dependencies because we copy src directory later in Dockerfile
"@mediapipe//mediapipe/graphs/holistic_tracking:holistic_tracking_to_render_data",
"@mediapipe//mediapipe/graphs/iris_tracking:iris_tracking_cpu_deps",
Expand Down Expand Up @@ -3016,6 +3052,7 @@ cc_library(
":test_test_with_temp_dir",
"//src/graph_export:graph_export",
"//src:libovms_server_settings",
"//src:libovms_systeminfo",
"@com_google_googletest//:gtest",
],
local_defines = COMMON_LOCAL_DEFINES,
Expand Down
1 change: 1 addition & 0 deletions src/capi_frontend/server_settings.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -158,6 +158,7 @@ struct ExportSettings {
std::string modelName = "";
std::string modelPath = "./";
std::string targetDevice = "CPU";
std::optional<uint32_t> restWorkers;
std::optional<std::string> extraQuantizationParams;
std::optional<std::string> vocoder;
std::string precision = "int8";
Expand Down
1 change: 1 addition & 0 deletions src/cli_parser.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -728,6 +728,7 @@ void CLIParser::prepareGraph(ServerSettingsImpl& serverSettings, HFSettingsImpl&
hfSettings.exportSettings.extraQuantizationParams = result->operator[]("extra_quantization_params").as<std::string>();
if (result->count("vocoder"))
hfSettings.exportSettings.vocoder = result->operator[]("vocoder").as<std::string>();
hfSettings.exportSettings.restWorkers = serverSettings.restWorkers;
hfSettings.downloadPath = result->operator[]("model_repository_path").as<std::string>();
if (result->count("task")) {
hfSettings.task = stringToEnum(result->operator[]("task").as<std::string>());
Expand Down
1 change: 1 addition & 0 deletions src/graph_export/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ ovms_cc_library(
"@ovms//src:libovms_module",
"@ovms//src:libovmsfilesystem",
"@ovms//src:libovmslocalfilesystem",
"@ovms//src:libovms_systeminfo",
"@com_github_tencent_rapidjson//:rapidjson",
"@ovms//src:libovmsschema",
"@ovms//src:libovms_version",
Expand Down
Loading
Loading