Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 0 additions & 2 deletions common_settings.bzl
Original file line number Diff line number Diff line change
Expand Up @@ -209,8 +209,6 @@ COMMON_STATIC_TEST_COPTS = select({
"-Wall",
"-Wno-unknown-pragmas",
"-Werror",
# ov::Tensor::data method call results in deprecated warning and we use it in multiple places
"-Wno-deprecated-declarations",
"-Isrc",
"-fconcepts", # for gmock related utils
"-fvisibility=hidden",# Needed for pybind targets
Expand Down
4 changes: 4 additions & 0 deletions demos/benchmark/v3/benchmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -438,4 +438,8 @@ async def limited_request_func(request_func_input, pbar):
print(f"Throughput - Tokens per second: {num_tokens / benchmark_results['duration']:^,.1f}")
print(f"Mean latency: {np.mean(benchmark_results['latencies'])*1000:.2f} ms")
print(f"Median latency: {np.median(benchmark_results['latencies'])*1000:.2f} ms")
# add printing 10 percentiles of latency to better understand latency distribution
percentiles = [10, 25, 50, 75, 90, 95, 99]
for p in percentiles:
print(f"{p}th percentile latency: {np.percentile(benchmark_results['latencies'], p)*1000:.2f} ms")
print(f"Average document length: {num_tokens / len(docs)} tokens")
32 changes: 31 additions & 1 deletion demos/common/export_models/export_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,17 @@ def add_common_arguments(parser):
parser_speech2text.add_argument('--enable_word_timestamps', default=False, action='store_true', help='Load model with word timestamps support.', dest='enable_word_timestamps')
args = vars(parser.parse_args())


def _default_graph_queue_size(task_name):
if task_name == 'image_generation':
return 1
return 'AUTO'


def _prepend_graph_queue_directive(graph_content, task_name):
    """Prefix rendered graph content with the OVMS_GRAPH_QUEUE_SIZE directive.

    The directive is emitted as a comment on the first line of the
    graph.pbtxt content; its value comes from the task's default queue size.
    """
    size = _default_graph_queue_size(task_name)
    directive = "# OVMS_GRAPH_QUEUE_SIZE: {}\n".format(size)
    return directive + graph_content

t2s_graph_template = """
input_stream: "HTTP_REQUEST_PAYLOAD:input"
output_stream: "HTTP_RESPONSE_PAYLOAD:output"
Expand Down Expand Up @@ -488,14 +499,27 @@ def export_text_generation_model(model_repository_path, source_model, model_name
gtemplate = jinja2.Environment(loader=jinja2.BaseLoader).from_string(text_generation_graph_template)
print("task_parameters", task_parameters)
graph_content = gtemplate.render(model_path=model_path, draft_model_dir_name=draft_model_dir_name, **task_parameters)
graph_content = _prepend_graph_queue_directive(graph_content, 'text_generation')
with open(os.path.join(model_repository_path, model_name, 'graph.pbtxt'), 'w') as f:
f.write(graph_content)
print("Created graph {}".format(os.path.join(model_repository_path, model_name, 'graph.pbtxt')))
add_servable_to_config(config_file_path, model_name, os.path.relpath(os.path.join(model_repository_path, model_name), os.path.dirname(config_file_path)))

def export_embeddings_model_ov(model_repository_path, source_model, model_name, precision, task_parameters, config_file_path, truncate=True):
set_max_context_length = ""
destination_path = os.path.join(model_repository_path, model_name)
destination_path = os.path.join(model_repository_path, model_name)
In file included from src/llm/language_model/continuous_batching/../../../logging.hpp:24,
from src/llm/language_model/continuous_batching/servable.cpp:22:
src/llm/language_model/continuous_batching/llm_executor.hpp: In member function 'void ovms::LLMExecutor::printMetrics()':
src/llm/language_model/continuous_batching/llm_executor.hpp:105:104: error: 'struct ov::genai::PipelineMetrics' has no member named 'kv_cache_size_in_bytes'
105 | metrics.requests, metrics.scheduled_requests, formatCacheInfo(metrics.cache_usage, metrics.kv_cache_size_in_bytes, this->isDynamicKVCache));
| ^~~~~~~~~~~~~~~~~~~~~~
Target //src:ovms failed to build
Use --verbose_failures to see the command lines of failed build steps.
INFO: Elapsed time: 9.590s, Critical Path: 8.22s
INFO: 64 processes: 64 internal.
FAILED: Build did NOT complete successfully
root@b6674760ad87:/ovms# bazel build --config mp_on_py_off //src:ovms
Comment on lines +510 to +522
Copy link

Copilot AI Mar 25, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The line assigning destination_path appears to have an entire Bazel build log appended to it, which will make this script syntactically invalid and break model export. Remove the pasted build output and keep the assignment as a valid Python statement (the build log should not be committed into this file).

Suggested change
destination_path = os.path.join(model_repository_path, model_name)ERROR: /ovms/src/llm/BUILD:196:16: Compiling src/llm/language_model/continuous_batching/servable.cpp failed: (Exit 1): gcc failed: error executing command (from target //src/llm:genai_servables) /usr/bin/gcc -U_FORTIFY_SOURCE -fstack-protector -Wall -Wunused-but-set-parameter -Wno-free-nonheap-object -fno-omit-frame-pointer -g0 -O2 '-D_FORTIFY_SOURCE=1' -DNDEBUG -ffunction-sections ... (remaining 156 arguments skipped)
In file included from src/llm/language_model/continuous_batching/../../../logging.hpp:24,
from src/llm/language_model/continuous_batching/servable.cpp:22:
src/llm/language_model/continuous_batching/llm_executor.hpp: In member function 'void ovms::LLMExecutor::printMetrics()':
src/llm/language_model/continuous_batching/llm_executor.hpp:105:104: error: 'struct ov::genai::PipelineMetrics' has no member named 'kv_cache_size_in_bytes'
105 | metrics.requests, metrics.scheduled_requests, formatCacheInfo(metrics.cache_usage, metrics.kv_cache_size_in_bytes, this->isDynamicKVCache));
| ^~~~~~~~~~~~~~~~~~~~~~
Target //src:ovms failed to build
Use --verbose_failures to see the command lines of failed build steps.
INFO: Elapsed time: 9.590s, Critical Path: 8.22s
INFO: 64 processes: 64 internal.
FAILED: Build did NOT complete successfully
root@b6674760ad87:/ovms# bazel build --config mp_on_py_off //src:ovms
destination_path = os.path.join(model_repository_path, model_name)

Copilot uses AI. Check for mistakes.
print("Exporting embeddings model to ",destination_path)
if not os.path.isdir(destination_path) or args['overwrite_models']:
optimum_command = "optimum-cli export openvino --model {} --disable-convert-tokenizer --task feature-extraction --weight-format {} {} --trust-remote-code {}".format(source_model, precision, task_parameters['extra_quantization_params'], destination_path)
Expand All @@ -509,6 +533,7 @@ def export_embeddings_model_ov(model_repository_path, source_model, model_name,
raise ValueError("Failed to export tokenizer model", source_model)
gtemplate = jinja2.Environment(loader=jinja2.BaseLoader).from_string(embedding_graph_ov_template)
graph_content = gtemplate.render(model_path="./", **task_parameters)
graph_content = _prepend_graph_queue_directive(graph_content, 'embeddings_ov')
with open(os.path.join(model_repository_path, model_name, 'graph.pbtxt'), 'w') as f:
f.write(graph_content)
print("Created graph {}".format(os.path.join(model_repository_path, model_name, 'graph.pbtxt')))
Expand All @@ -523,6 +548,7 @@ def export_text2speech_model(model_repository_path, source_model, model_name, pr
raise ValueError("Failed to export text2speech model", source_model)
gtemplate = jinja2.Environment(loader=jinja2.BaseLoader).from_string(t2s_graph_template)
graph_content = gtemplate.render(model_path="./", **task_parameters)
graph_content = _prepend_graph_queue_directive(graph_content, 'text2speech')
with open(os.path.join(model_repository_path, model_name, 'graph.pbtxt'), 'w') as f:
f.write(graph_content)
print("Created graph {}".format(os.path.join(model_repository_path, model_name, 'graph.pbtxt')))
Expand All @@ -537,6 +563,7 @@ def export_speech2text_model(model_repository_path, source_model, model_name, pr
raise ValueError("Failed to export speech2text model", source_model)
gtemplate = jinja2.Environment(loader=jinja2.BaseLoader).from_string(s2t_graph_template)
graph_content = gtemplate.render(model_path="./", **task_parameters)
graph_content = _prepend_graph_queue_directive(graph_content, 'speech2text')
with open(os.path.join(model_repository_path, model_name, 'graph.pbtxt'), 'w') as f:
f.write(graph_content)
print("Created graph {}".format(os.path.join(model_repository_path, model_name, 'graph.pbtxt')))
Expand All @@ -553,6 +580,7 @@ def export_rerank_model_ov(model_repository_path, source_model, model_name, prec
export_rerank_tokenizer(source_model, destination_path, max_doc_length)
gtemplate = jinja2.Environment(loader=jinja2.BaseLoader).from_string(rerank_graph_ov_template)
graph_content = gtemplate.render(model_path="./", **task_parameters)
graph_content = _prepend_graph_queue_directive(graph_content, 'rerank_ov')
with open(os.path.join(model_repository_path, model_name, 'graph.pbtxt'), 'w') as f:
f.write(graph_content)
print("Created graph {}".format(os.path.join(model_repository_path, model_name, 'graph.pbtxt')))
Expand Down Expand Up @@ -589,6 +617,7 @@ def export_rerank_model(model_repository_path, source_model, model_name, precisi
shutil.move(os.path.join(tmpdirname, 'openvino_tokenizer.bin'), os.path.join(tokenizer_path, 'model.bin'))
gtemplate = jinja2.Environment(loader=jinja2.BaseLoader).from_string(rerank_graph_template)
graph_content = gtemplate.render(model_name=model_name, **task_parameters)
graph_content = _prepend_graph_queue_directive(graph_content, 'rerank')
with open(os.path.join(model_repository_path, model_name, 'graph.pbtxt'), 'w') as f:
f.write(graph_content)
print("Created graph {}".format(os.path.join(model_repository_path, model_name, 'graph.pbtxt')))
Expand Down Expand Up @@ -635,6 +664,7 @@ def export_image_generation_model(model_repository_path, source_model, model_nam

gtemplate = jinja2.Environment(loader=jinja2.BaseLoader).from_string(image_generation_graph_template)
graph_content = gtemplate.render(model_path=model_path, **task_parameters)
graph_content = _prepend_graph_queue_directive(graph_content, 'image_generation')
with open(os.path.join(model_repository_path, model_name, 'graph.pbtxt'), 'w') as f:
f.write(graph_content)
print("Created graph {}".format(os.path.join(model_repository_path, model_name, 'graph.pbtxt')))
Expand Down
42 changes: 42 additions & 0 deletions docs/mediapipe.md
Original file line number Diff line number Diff line change
Expand Up @@ -215,6 +215,48 @@ Nodes in the MediaPipe graphs can reference both the models configured in model_

Subconfig file may only contain *model_config_list* section - in the same format as in [models config file](starting_server.md).

### Graph Pool (Pre-initialized Graph Queue)

OpenVINO Model Server can pre-initialize a pool of MediaPipe `CalculatorGraph` instances for a graph definition. Graphs in the pool are started once during server initialization and reused across inference requests, eliminating per-request graph initialization and teardown overhead. This is especially beneficial for graphs that involve expensive setup performed in a calculator's `Open()` method.

#### How it works

Without graph pool (legacy behavior), each incoming request creates a new `CalculatorGraph`, calls `StartRun()` with side packets, processes the request, then tears down the graph via `CloseAllPacketSources()` and `WaitUntilDone()`.

With graph pool enabled, a fixed number of graphs are pre-initialized and kept in a queue. When a request arrives, an idle graph is acquired from the queue. After processing, the graph is returned to the queue for the next request. The graph is never torn down — instead, `WaitUntilIdle()` is called between requests and the internal timestamp is incremented.

#### Configuration

The graph pool size is controlled via a comment directive in the graph `.pbtxt` file:

```
# OVMS_GRAPH_QUEUE_SIZE: AUTO
```

| Value | Behavior |
|:------|:---------|
| `AUTO` | Pool size is set to the number of hardware threads (`std::thread::hardware_concurrency()`), or 16 if detection fails |
| Positive integer (e.g. `4`) | Pool size set to the given value (must not exceed hardware thread count) |
| `0` or `-1` | Graph pool disabled — falls back to per-request graph creation |
Copy link

Copilot AI Mar 25, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The documentation says # OVMS_GRAPH_QUEUE_SIZE: 0 disables the pool, but the current parser rejects 0 as invalid. Update the docs to match the implementation, or accept 0 as a valid 'disabled' value (consistent with the doc and PR description).

Suggested change
| `0` or `-1` | Graph pool disabled — falls back to per-request graph creation |
| `-1` | Graph pool disabled — falls back to per-request graph creation |

Copilot uses AI. Check for mistakes.
| *(directive absent)* | Default: graph pool is disabled |

**Default behavior:** graph pool stays disabled unless `OVMS_GRAPH_QUEUE_SIZE` is explicitly present in `graph.pbtxt`.

**Generated graphs from exporters:**
- `demos/common/export_models/export_model.py` and OVMS `--pull --task ...` graph export emit `OVMS_GRAPH_QUEUE_SIZE` automatically.
- In `export_model.py`: image generation graphs use `1`, and all other graph types use `AUTO`.
- In OVMS `--task ...` graph export: image generation graphs use `1`, and other graph types use `min(physical_cores, rest_workers)` (with OVMS default REST worker calculation when `rest_workers` is not provided explicitly).

#### Important considerations for graph developers

**Stateful calculators:**
Since graphs in the pool are reused across requests, any state held by a calculator between `Process()` calls will persist across requests. If your calculator accumulates state (e.g. counters, buffers, history), that state will carry over to the next request that reuses the same graph instance. Design your calculators to either:
- Be stateless (reset any per-request state at the beginning of each `Process()` call), or
- Explicitly handle the fact that the graph may have already processed prior requests.

**Input side packets from requests are not supported:**
When graph pool is enabled, side packets are set once at pool construction time and cannot be overridden per request. If a client sends request parameters that would normally become input side packets (e.g. KServe request parameters other than `OVMS_MP_TIMESTAMP`), the request will be rejected with an error. If your graph relies on per-request side packets to configure calculator behavior, either disable the graph pool (`# OVMS_GRAPH_QUEUE_SIZE: 0`) or redesign the graph to accept such parameters as regular input stream packets instead of side packets.


## Deployment testing
### Debug logs
Expand Down
37 changes: 37 additions & 0 deletions src/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -150,6 +150,39 @@ ovms_cc_library(
hdrs = ["queue.hpp"],
visibility = ["//visibility:public",],
)
ovms_cc_library(
name = "mediapipe_internal_graph_side_packets",
hdrs = ["mediapipe_internal/graph_side_packets.hpp"],
visibility = ["//visibility:public",],
)
ovms_cc_library(
name = "mediapipe_internal_graph_executor_constants",
hdrs = ["mediapipe_internal/graph_executor_constants.hpp"],
visibility = ["//visibility:public"],
)
ovms_cc_library(
name = "mediapipe_internal_graphqueue",
hdrs = [
"mediapipe_internal/graphqueue.hpp",
"mediapipe_internal/outputstreamobserver.hpp",
], # TODO FIXME
srcs = ["mediapipe_internal/graphqueue.cpp"],
deps = [
"libovms_queue",
"libovmslogging",
"libovms_execution_context",
"libovmstimer",
"libovmsmetrics",
"model_metric_reporter",
"mediapipe_internal_graph_executor_constants",
"mediapipe_internal_graph_side_packets",
"//third_party:openvino",
"@mediapipe//mediapipe/framework:calculator_graph",
"//src/python:libovmspythonmodule", # TODO not split
"//src/llm:genai_servables", # TODO split!
],
visibility = ["//visibility:public",],
)
ovms_cc_library(
name = "libovms_ovinferrequestsqueue",
hdrs = ["ovinferrequestsqueue.hpp"],
Expand Down Expand Up @@ -542,6 +575,7 @@ ovms_cc_library(
"mediapipe_internal/mediapipegraphconfig.cpp",
"mediapipe_internal/mediapipegraphdefinition.cpp",
"mediapipe_internal/mediapipegraphdefinition.hpp",
"mediapipe_internal/outputstreamobserver.hpp",
"mediapipe_internal/mediapipegraphexecutor.cpp",
"mediapipe_internal/mediapipegraphexecutor.hpp",
"mediapipe_internal/packettypes.hpp",
Expand Down Expand Up @@ -682,6 +716,8 @@ ovms_cc_library(
})
+ select({
"//conditions:default": [
"mediapipe_internal_graph_executor_constants",
"mediapipe_internal_graphqueue",
"@mediapipe_calculators//:mediapipe_calculators", # Need this dependencies here because we use ovms/src - cannot add in ovms_dependencies because we copy src directory later in Dockerfile
"@mediapipe//mediapipe/graphs/holistic_tracking:holistic_tracking_to_render_data",
"@mediapipe//mediapipe/graphs/iris_tracking:iris_tracking_cpu_deps",
Expand Down Expand Up @@ -3016,6 +3052,7 @@ cc_library(
":test_test_with_temp_dir",
"//src/graph_export:graph_export",
"//src:libovms_server_settings",
"//src:libovms_systeminfo",
"@com_google_googletest//:gtest",
],
local_defines = COMMON_LOCAL_DEFINES,
Expand Down
1 change: 1 addition & 0 deletions src/capi_frontend/server_settings.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -158,6 +158,7 @@ struct ExportSettings {
std::string modelName = "";
std::string modelPath = "./";
std::string targetDevice = "CPU";
std::optional<uint32_t> restWorkers;
std::optional<std::string> extraQuantizationParams;
std::optional<std::string> vocoder;
std::string precision = "int8";
Expand Down
1 change: 1 addition & 0 deletions src/cli_parser.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -728,6 +728,7 @@ void CLIParser::prepareGraph(ServerSettingsImpl& serverSettings, HFSettingsImpl&
hfSettings.exportSettings.extraQuantizationParams = result->operator[]("extra_quantization_params").as<std::string>();
if (result->count("vocoder"))
hfSettings.exportSettings.vocoder = result->operator[]("vocoder").as<std::string>();
hfSettings.exportSettings.restWorkers = serverSettings.restWorkers;
hfSettings.downloadPath = result->operator[]("model_repository_path").as<std::string>();
if (result->count("task")) {
hfSettings.task = stringToEnum(result->operator[]("task").as<std::string>());
Expand Down
1 change: 1 addition & 0 deletions src/graph_export/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ ovms_cc_library(
"@ovms//src:libovms_module",
"@ovms//src:libovmsfilesystem",
"@ovms//src:libovmslocalfilesystem",
"@ovms//src:libovms_systeminfo",
"@com_github_tencent_rapidjson//:rapidjson",
"@ovms//src:libovmsschema",
"@ovms//src:libovms_version",
Expand Down
Loading
Loading