From c3cac9b19b791569fb7fd13169bc2fc5141e223e Mon Sep 17 00:00:00 2001 From: Saumya Tiwari Date: Wed, 25 Mar 2026 10:45:40 +0000 Subject: [PATCH] Remove deprecated RerankerCalculator and update references --- demos/common/export_models/export_model.py | 94 ---- src/BUILD | 1 - src/rerank/BUILD | 28 +- src/rerank/rerank_calculator.cc | 420 ------------------ src/rerank/rerank_calculator.proto | 32 -- src/test/mediapipeflow_test.cpp | 1 - src/test/rerank/with_params/graph.pbtxt | 47 -- .../rerank/with_params/invalid_graph.pbtxt | 47 -- .../rerank/with_params/invalid_graph_ov.pbtxt | 15 +- yarn.lock | 47 ++ 10 files changed, 50 insertions(+), 682 deletions(-) delete mode 100644 src/rerank/rerank_calculator.cc delete mode 100644 src/rerank/rerank_calculator.proto delete mode 100644 src/test/rerank/with_params/graph.pbtxt delete mode 100644 src/test/rerank/with_params/invalid_graph.pbtxt diff --git a/demos/common/export_models/export_model.py b/demos/common/export_models/export_model.py index 5aa81b0c81..c7785244f9 100644 --- a/demos/common/export_models/export_model.py +++ b/demos/common/export_models/export_model.py @@ -64,12 +64,6 @@ def add_common_arguments(parser): parser_embeddings_ov.add_argument('--truncate', default=False, action='store_true', help='Truncate the prompts to fit to the embeddings model', dest='truncate') parser_embeddings_ov.add_argument('--num_streams', default=1,type=int, help='The number of parallel execution streams to use for the model. Use at least 2 on 2 socket CPU systems.', dest='num_streams') -parser_rerank = subparsers.add_parser('rerank', help='[deprecated] export model for rerank endpoint with models split into separate, versioned directories') -add_common_arguments(parser_rerank) -parser_rerank.add_argument('--num_streams', default=1, type=int, help='The number of parallel execution streams to use for the model. Use at least 2 on 2 socket CPU systems.', dest='num_streams') -parser_rerank.add_argument('--max_doc_length', default=16000, type=int, help='Maximum length of input documents in tokens', dest='max_doc_length') -parser_rerank.add_argument('--version', default="1", help='version of the model', dest='version') - parser_rerank_ov = subparsers.add_parser('rerank_ov', help='export model for rerank endpoint with directory structure aligned with OpenVINO tools') add_common_arguments(parser_rerank_ov) parser_rerank_ov.add_argument('--num_streams', default=1, type=int, help='The number of parallel execution streams to use for the model. Use at least 2 on 2 socket CPU systems.', dest='num_streams') @@ -190,34 +184,6 @@ def add_common_arguments(parser): } """ -rerank_graph_template = """input_stream: "REQUEST_PAYLOAD:input" -output_stream: "RESPONSE_PAYLOAD:output" -node { - calculator: "OpenVINOModelServerSessionCalculator" - output_side_packet: "SESSION:tokenizer" - node_options: { - [type.googleapis.com / mediapipe.OpenVINOModelServerSessionCalculatorOptions]: { - servable_name: "{{model_name}}_tokenizer_model" - } - } -} -node { - calculator: "OpenVINOModelServerSessionCalculator" - output_side_packet: "SESSION:rerank" - node_options: { - [type.googleapis.com / mediapipe.OpenVINOModelServerSessionCalculatorOptions]: { - servable_name: "{{model_name}}_rerank_model" - } - } -} -node { - input_side_packet: "TOKENIZER_SESSION:tokenizer" - input_side_packet: "RERANK_SESSION:rerank" - calculator: "RerankCalculator" - input_stream: "REQUEST_PAYLOAD:input" - output_stream: "RESPONSE_PAYLOAD:output" -} -""" text_generation_graph_template = """input_stream: "HTTP_REQUEST_PAYLOAD:input" output_stream: "HTTP_RESPONSE_PAYLOAD:output" @@ -273,24 +239,6 @@ def add_common_arguments(parser): } }""" -rerank_subconfig_template = """{ - "model_config_list": [ - { "config": - { - "name": "{{model_name}}_tokenizer_model", - "base_path": "tokenizer" - } - }, - { "config": - { - "name": "{{model_name}}_rerank_model", - "base_path": "rerank", - "target_device": "{{target_device|default("CPU", true)}}", - "plugin_config": { "NUM_STREAMS": "{{num_streams|default(1, true)}}" } - } - } - ] -}""" image_generation_graph_template = """input_stream: "HTTP_REQUEST_PAYLOAD:input" output_stream: "HTTP_RESPONSE_PAYLOAD:output" @@ -558,46 +506,6 @@ def export_rerank_model_ov(model_repository_path, source_model, model_name, prec print("Created graph {}".format(os.path.join(model_repository_path, model_name, 'graph.pbtxt'))) add_servable_to_config(config_file_path, model_name, os.path.relpath(os.path.join(model_repository_path, model_name), os.path.dirname(config_file_path))) -def export_rerank_model(model_repository_path, source_model, model_name, precision, task_parameters, version, config_file_path, max_doc_length): - if os.path.isfile(os.path.join(model_name, 'openvino_model.xml')): - print("OV model is source folder. Skipping conversion.") - os.makedirs(os.path.join(model_repository_path, model_name, 'rerank', version), exist_ok=True) - os.makedirs(os.path.join(model_repository_path, model_name, 'tokenizer', version), exist_ok=True) - shutil.move(os.path.join(model_repository_path, model_name, 'openvino_tokenizer.xml'), os.path.join(model_repository_path, model_name, 'tokenizer', version, 'model.xml')) - shutil.move(os.path.join(model_repository_path, model_name, 'openvino_tokenizer.bin'), os.path.join(model_repository_path, model_name, 'tokenizer', version, 'model.bin')) - shutil.move(os.path.join(model_repository_path, model_name, 'openvino_model.xml'), os.path.join(model_repository_path, model_name, 'rerank', version, 'model.xml')) - shutil.move(os.path.join(model_repository_path, model_name, 'openvino_model.bin'), os.path.join(model_repository_path, model_name, 'rerank', version, 'model.bin')) - else: # assume HF model name - with tempfile.TemporaryDirectory() as tmpdirname: - embeddings_path = os.path.join(model_repository_path, model_name, 'rerank', version) - print("Exporting rerank model to ",embeddings_path) - if not os.path.isdir(embeddings_path) or args['overwrite_models']: - optimum_command = "optimum-cli export openvino --disable-convert-tokenizer --model {} --task text-classification --weight-format {} {} --trust-remote-code {}".format(source_model, precision, task_parameters['extra_quantization_params'], tmpdirname) - if os.system(optimum_command): - raise ValueError("Failed to export rerank model", source_model) - set_rt_info(tmpdirname, 'openvino_model.xml', 'config.json') - os.makedirs(embeddings_path, exist_ok=True) - shutil.move(os.path.join(tmpdirname, 'openvino_model.xml'), os.path.join(embeddings_path, 'model.xml')) - shutil.move(os.path.join(tmpdirname, 'openvino_model.bin'), os.path.join(embeddings_path, 'model.bin')) - tokenizer_path = os.path.join(model_repository_path, model_name,'tokenizer', version) - print("Exporting tokenizer to ",tokenizer_path) - if not os.path.isdir(tokenizer_path) or args['overwrite_models']: - export_rerank_tokenizer(source_model, tmpdirname, max_doc_length) - set_rt_info(tmpdirname, 'openvino_tokenizer.xml', 'tokenizer_config.json') - os.makedirs(tokenizer_path, exist_ok=True) - shutil.move(os.path.join(tmpdirname, 'openvino_tokenizer.xml'), os.path.join(tokenizer_path, 'model.xml')) - shutil.move(os.path.join(tmpdirname, 'openvino_tokenizer.bin'), os.path.join(tokenizer_path, 'model.bin')) - gtemplate = jinja2.Environment(loader=jinja2.BaseLoader).from_string(rerank_graph_template) - graph_content = gtemplate.render(model_name=model_name, **task_parameters) - with open(os.path.join(model_repository_path, model_name, 'graph.pbtxt'), 'w') as f: - f.write(graph_content) - print("Created graph {}".format(os.path.join(model_repository_path, model_name, 'graph.pbtxt'))) - stemplate = jinja2.Environment(loader=jinja2.BaseLoader).from_string(rerank_subconfig_template) - subconfig_content = stemplate.render(model_name=model_name, **task_parameters) - with open(os.path.join(model_repository_path, model_name, 'subconfig.json'), 'w') as f: - f.write(subconfig_content) - print("Created subconfig {}".format(os.path.join(model_repository_path, model_name, 'subconfig.json'))) - add_servable_to_config(config_file_path, model_name, os.path.relpath(os.path.join(model_repository_path, model_name), os.path.dirname(config_file_path))) def export_image_generation_model(model_repository_path, source_model, model_name, precision, task_parameters, config_file_path, num_streams): @@ -670,8 +578,6 @@ def export_image_generation_model(model_repository_path, source_model, model_nam elif args['task'] == 'embeddings_ov': export_embeddings_model_ov(args['model_repository_path'], args['source_model'], args['model_name'], args['precision'], template_parameters, args['config_file_path'], args['truncate']) -elif args['task'] == 'rerank': - export_rerank_model(args['model_repository_path'], args['source_model'], args['model_name'] ,args['precision'], template_parameters, str(args['version']), args['config_file_path'], args['max_doc_length']) elif args['task'] == 'rerank_ov': export_rerank_model_ov(args['model_repository_path'], args['source_model'], args['model_name'] ,args['precision'], template_parameters, args['config_file_path'], args['max_doc_length']) diff --git a/src/BUILD b/src/BUILD index ea624f5e59..5f422fc736 100644 --- a/src/BUILD +++ b/src/BUILD @@ -581,7 +581,6 @@ ovms_cc_library( "//src/image_gen:imagegen_init", "//src/llm:openai_completions_api_handler", "//src/embeddings:embeddingscalculator_ov", - "//src/rerank:rerankcalculator", "//src/rerank:rerankcalculator_ov", "//src/llm:llmcalculator",], }) + select({ diff --git a/src/rerank/BUILD b/src/rerank/BUILD index 7f3b1a6ec9..192252efad 100644 --- a/src/rerank/BUILD +++ b/src/rerank/BUILD @@ -17,15 +17,7 @@ load("@mediapipe//mediapipe/framework/port:build_config.bzl", "mediapipe_cc_proto_library", "mediapipe_proto_library") load("//:common_settings.bzl", "ovms_cc_library") -mediapipe_proto_library( - name = "rerank_calculator_proto", # rerank_calculator_cc_proto - just mediapipe stuff with mediapipe_proto_library adding nonvisible target - srcs = ["rerank_calculator.proto"], - visibility = ["//visibility:private"], - deps = [ - "@mediapipe//mediapipe/framework:calculator_options_proto", - "@mediapipe//mediapipe/framework:calculator_proto", - ], -) + ovms_cc_library( name = "rerank_servable", @@ -45,24 +37,6 @@ mediapipe_proto_library( ], ) -ovms_cc_library( - name = "rerankcalculator", - srcs = ["rerank_calculator.cc"], - deps = [ - "@mediapipe//mediapipe/framework:calculator_framework", - "@com_github_tencent_rapidjson//:rapidjson", - "@model_api//:model_api", - "//src:httppayload", - "//src:libhttpclientconnection", - "//src:libovmslogging", - "//src:libovmsprofiler", - "rerank_calculator_cc_proto", - ":rerank_api_handler", - ], - visibility = ["//visibility:public"], - alwayslink = 1, -) - ovms_cc_library( name = "rerankcalculator_ov", srcs = ["rerank_calculator_ov.cc"], diff --git a/src/rerank/rerank_calculator.cc b/src/rerank/rerank_calculator.cc deleted file mode 100644 index bb731690af..0000000000 --- a/src/rerank/rerank_calculator.cc +++ /dev/null @@ -1,420 +0,0 @@ -//***************************************************************************** -// Copyright 2024 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -//***************************************************************************** -#include -#include -#include -#include -#include - -#pragma warning(push) -#pragma warning(disable : 6001 6385 6386 6326 6011 4309 6246 4005 4456) -#include "absl/strings/escaping.h" -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wdeprecated-declarations" -#include "mediapipe/framework/calculator_framework.h" -#include "mediapipe/framework/port/canonical_errors.h" -#include "mediapipe/framework/port/ret_check.h" -#pragma GCC diagnostic pop -#pragma warning(pop) - -#include -#include "src/port/rapidjson_stringbuffer.hpp" -#include "src/port/rapidjson_writer.hpp" - -#include "../http_payload.hpp" -#include "../logging.hpp" -#include "../profiler.hpp" -#include "src/rerank/rerank_calculator.pb.h" -#include "src/rerank/rerank_utils.hpp" - -using namespace rapidjson; -using namespace ovms; - -namespace mediapipe { - -using InputDataType = ovms::HttpPayload; -using OutputDataType = std::string; - -class RerankCalculator : public CalculatorBase { - static const std::string INPUT_TAG_NAME; - static const std::string OUTPUT_TAG_NAME; - static constexpr size_t NUMBER_OF_SPECIAL_TOKENS = 4; - - mediapipe::Timestamp timestamp{0}; - std::chrono::time_point created; - - int64_t bos_token{0}; - int64_t eos_token{0}; - int64_t sep_token{0}; - int64_t pad_token{0}; - - uint64_t max_position_embeddings{512}; - - size_t max_allowed_chunks{0}; // Read from options in ::Open() - -protected: - std::shared_ptr<::InferenceAdapter> tokenizer_session{nullptr}; - std::shared_ptr<::InferenceAdapter> rerank_session{nullptr}; - -public: - static absl::Status GetContract(CalculatorContract* cc) { - RET_CHECK(!cc->Inputs().GetTags().empty()); - RET_CHECK(!cc->Outputs().GetTags().empty()); - cc->Inputs().Tag(INPUT_TAG_NAME).Set(); - cc->Outputs().Tag(OUTPUT_TAG_NAME).Set(); - cc->InputSidePackets().Tag("TOKENIZER_SESSION").Set>(); - cc->InputSidePackets().Tag("RERANK_SESSION").Set>(); - return absl::OkStatus(); - } - - absl::Status Close(CalculatorContext* cc) final { - OVMS_PROFILE_FUNCTION(); - SPDLOG_LOGGER_DEBUG(rerank_calculator_logger, "RerankCalculator [Node: {} ] Close", cc->NodeName()); - return absl::OkStatus(); - } - - absl::Status Open(CalculatorContext* cc) final { - OVMS_PROFILE_FUNCTION(); - SPDLOG_LOGGER_DEBUG(rerank_calculator_logger, "RerankCalculator [Node: {}] Open start", cc->NodeName()); - tokenizer_session = cc->InputSidePackets().Tag("TOKENIZER_SESSION").Get>(); - rerank_session = cc->InputSidePackets().Tag("RERANK_SESSION").Get>(); - - const auto& options = cc->Options(); - this->max_allowed_chunks = options.max_allowed_chunks(); - SPDLOG_LOGGER_DEBUG(rerank_calculator_logger, "Max allowed chunks: {}", this->max_allowed_chunks); - - try { - // special tokens - this->bos_token = rerank_session->getModelConfig().at("bos_token_id").as(); - this->eos_token = rerank_session->getModelConfig().at("eos_token_id").as(); - if (rerank_session->getModelConfig().count("sep_token_id") == 0) { - this->sep_token = this->eos_token; - } else { - this->sep_token = rerank_session->getModelConfig().at("sep_token_id").as(); - } - this->pad_token = rerank_session->getModelConfig().at("pad_token_id").as(); - - // max_position_embeddings - if (options.has_max_position_embeddings()) { - this->max_position_embeddings = options.max_position_embeddings(); - SPDLOG_LOGGER_DEBUG(rerank_calculator_logger, "Options defined max_position_embeddings: {}", this->max_position_embeddings); - } else { - auto maxPositionEmbeddingsIt = rerank_session->getModelConfig().find("max_position_embeddings"); - if (maxPositionEmbeddingsIt != rerank_session->getModelConfig().end()) { - this->max_position_embeddings = maxPositionEmbeddingsIt->second.as(); - SPDLOG_LOGGER_DEBUG(rerank_calculator_logger, "Model max_position_embeddings: {}", this->max_position_embeddings); - } else { - auto maxTrainedPositionsIt = rerank_session->getModelConfig().find("max_trained_positions"); - if (maxTrainedPositionsIt != rerank_session->getModelConfig().end()) { - this->max_position_embeddings = maxTrainedPositionsIt->second.as(); - SPDLOG_LOGGER_DEBUG(rerank_calculator_logger, "Model max_position_embeddings (inherited from max_trained_positions): {}", this->max_position_embeddings); - } else { - SPDLOG_LOGGER_DEBUG(rerank_calculator_logger, "Model missing max_position_embeddings and max_trained_positions in config, using default value: {}", this->max_position_embeddings); - } - } - } - - // post-validation - if (this->max_position_embeddings <= 2 * NUMBER_OF_SPECIAL_TOKENS) { - SPDLOG_LOGGER_ERROR(rerank_calculator_logger, "max_position_embeddings should be larger than 2 * NUMBER_OF_SPECIAL_TOKENS"); - return absl::InvalidArgumentError("max_position_embeddings should be larger than 2 * NUMBER_OF_SPECIAL_TOKENS"); - } - } catch (ov::AssertFailure& e) { - SPDLOG_LOGGER_ERROR(rerank_calculator_logger, "OpenVINO Assert Failure: {}", e.what()); - return absl::InternalError(e.what()); - } catch (std::out_of_range& e) { - SPDLOG_LOGGER_ERROR(rerank_calculator_logger, "{}", e.what()); - return absl::InternalError(e.what()); - } catch (...) { - SPDLOG_LOGGER_ERROR(rerank_calculator_logger, "Unknown error"); - return absl::InternalError("Unknown error"); - } - SPDLOG_LOGGER_DEBUG(rerank_calculator_logger, "RerankCalculator [Node: {}] Open end", cc->NodeName()); - return absl::OkStatus(); - } - - std::vector ComputeTokensForString(std::string str) const { - if (tokenizer_session->getInputNames().size() != 1) - throw std::runtime_error("Tokenizer session should have only one input"); - if (tokenizer_session->getOutputNames().size() != 2) - throw std::runtime_error("Tokenizer session should have only two outputs"); - - auto tokenizer_input_name = tokenizer_session->getInputNames()[0]; - ::InferenceInput tokenizer_input_map; - tokenizer_input_map[tokenizer_input_name] = ov::Tensor(ov::element::string, ov::Shape{1}, &str); - ::InferenceOutput tokenizer_output_map = tokenizer_session->infer(tokenizer_input_map); - - if (tokenizer_output_map.size() != 2) - throw std::runtime_error("Tokenizer session should have only two outputs"); - if (tokenizer_output_map.count("input_ids") != 1) - throw std::runtime_error("Tokenizer session should have input_ids output"); - if (tokenizer_output_map.count("attention_mask") != 1) - throw std::runtime_error("Tokenizer session should have attention_mask output"); - - auto input_ids = tokenizer_output_map.at("input_ids"); - if (input_ids.get_shape().size() != 2) - throw std::runtime_error("input_ids should have 2 dimensions"); - if (input_ids.get_shape()[0] != 1) - throw std::runtime_error("input_ids should have 1 batch size"); - if (input_ids.get_element_type() != ov::element::i64) - throw std::runtime_error("input_ids should have i64 element type"); // TODO: Add support for other precisions? - - int64_t* input_ids_data = reinterpret_cast(input_ids.data()); - return std::vector(input_ids_data, input_ids_data + input_ids.get_shape()[1]); - } - - std::pair ComputeTokensForBatchedString(std::vector strings) const { - if (tokenizer_session->getInputNames().size() != 1) - throw std::runtime_error("Tokenizer session should have only one input"); - if (tokenizer_session->getOutputNames().size() != 2) - throw std::runtime_error("Tokenizer session should have only two outputs"); - - auto tokenizer_input_name = tokenizer_session->getInputNames()[0]; - ::InferenceInput tokenizer_input_map; - tokenizer_input_map[tokenizer_input_name] = ov::Tensor(ov::element::string, ov::Shape{strings.size()}, strings.data()); - SPDLOG_LOGGER_DEBUG(rerank_calculator_logger, "Starting inference tokenizer model"); - ::InferenceOutput tokenizer_output_map = tokenizer_session->infer(tokenizer_input_map); - SPDLOG_LOGGER_DEBUG(rerank_calculator_logger, "Finished inference tokenizer model"); - - if (tokenizer_output_map.size() != 2) - throw std::runtime_error("Tokenizer session should have only two outputs"); - if (tokenizer_output_map.count("input_ids") != 1) - throw std::runtime_error("Tokenizer session should have input_ids output"); - if (tokenizer_output_map.count("attention_mask") != 1) - throw std::runtime_error("Tokenizer session should have attention_mask output"); - - auto input_ids = tokenizer_output_map.at("input_ids"); - if (input_ids.get_shape().size() != 2) - throw std::runtime_error("input_ids should have 2 dimensions"); - if (input_ids.get_shape()[0] != strings.size()) - throw std::runtime_error("input_ids should have batch size equal to number of tokenized strings"); - if (input_ids.get_element_type() != ov::element::i64) - throw std::runtime_error("input_ids should have i64 element type"); - - auto attention_mask = tokenizer_output_map.at("attention_mask"); - if (attention_mask.get_shape().size() != 2) - throw std::runtime_error("attention_mask should have 2 dimensions"); - if (attention_mask.get_shape()[0] != strings.size()) - throw std::runtime_error("attention_mask should have batch size equal to number of tokenized strings"); - if (attention_mask.get_element_type() != ov::element::i64) - throw std::runtime_error("attention_mask should have i64 element type"); // TODO: Add support for other precisions? - - return std::make_pair(input_ids, attention_mask); - } - - std::pair PrepareInputsForRerankModel(const RerankHandler& handler, std::vector& chunk_mapping) const { - // Validate batch size before tokenizing - if (handler.getDocumentsList().size() > this->max_allowed_chunks) - throw std::runtime_error("Number of documents exceeds max_allowed_chunks"); - // TODO: Validate max string length for some arbitrary size - - // Compute Query Tokens - auto query_tokens = ComputeTokensForString(handler.getQuery()); - - // Truncate last tokens if exceeding max_position_embeddings / 2 as mentioned in cohere doc: - // https://docs.cohere.com/v2/docs/reranking-best-practices#queries - const size_t max_query_tokens = this->max_position_embeddings / 2; - if (query_tokens.size() > max_query_tokens) { - query_tokens.resize(max_query_tokens); - SPDLOG_LOGGER_DEBUG(rerank_calculator_logger, "Number of query tokens: {} exceeded half of max_position_embeddings: {}, truncating to {}", query_tokens.size(), this->max_position_embeddings, max_query_tokens); - } else { - SPDLOG_LOGGER_DEBUG(rerank_calculator_logger, "Number of query tokens: {}", query_tokens.size()); - } - - // Compute Document Tokens - auto [doc_input_ids, doc_attention_mask] = ComputeTokensForBatchedString(handler.getDocumentsList()); - SPDLOG_LOGGER_DEBUG(rerank_calculator_logger, "\nMax position embeddings: {}\nQuery tokens: {}\nSpecial tokens: {}\nRemaining space for chunk: {}", - this->max_position_embeddings, query_tokens.size(), NUMBER_OF_SPECIAL_TOKENS, this->max_position_embeddings - query_tokens.size() - NUMBER_OF_SPECIAL_TOKENS); - SPDLOG_LOGGER_DEBUG(rerank_calculator_logger, "Number of documents: {}; with max token count: {} before chunking", doc_input_ids.get_shape()[0], doc_input_ids.get_shape()[1]); - - // max_tokens_per_chunk can never be <= 0 since query_tokens.size() is at max half of max_position_embeddings - // and max_position_embeddings is at least 2 * NUMBER_OF_SPECIAL_TOKENS - size_t max_tokens_per_chunk = this->max_position_embeddings - query_tokens.size() - NUMBER_OF_SPECIAL_TOKENS; - ov::Tensor out_input_ids, out_attention_mask; - auto status = chunkDocuments( - doc_input_ids, - doc_attention_mask, - out_input_ids, out_attention_mask, - chunk_mapping, max_tokens_per_chunk, - this->max_allowed_chunks, this->pad_token); - if (!status.ok()) { - throw std::runtime_error(std::string{"Chunking failed: "} + std::string(status.message())); - } - doc_input_ids = out_input_ids; - doc_attention_mask = out_attention_mask; - - SPDLOG_LOGGER_DEBUG(rerank_calculator_logger, "Number of chunks: {}; with max token count: {} after chunking", doc_input_ids.get_shape()[0], doc_input_ids.get_shape()[1]); - - size_t tokens_count_of_longest_document = doc_input_ids.get_shape()[1]; - if (tokens_count_of_longest_document > max_tokens_per_chunk) - throw std::runtime_error("tokens_count_of_longest_document exceeds max_tokens_per_chunk"); // should never happen - size_t total_tokens_count_per_batch = tokens_count_of_longest_document + NUMBER_OF_SPECIAL_TOKENS + query_tokens.size(); - size_t batch_size = doc_input_ids.get_shape()[0]; - if (batch_size != chunk_mapping.size()) - throw std::runtime_error("error"); // should never happen - - if (total_tokens_count_per_batch > this->max_position_embeddings) - throw std::runtime_error("Query tokens count + special tokens + tokens count of longest document exceeds max_position_embeddings"); - - auto input_ids = ov::Tensor(ov::element::i64, ov::Shape{batch_size, total_tokens_count_per_batch}); - auto attention_mask = ov::Tensor(ov::element::i64, ov::Shape{batch_size, total_tokens_count_per_batch}); - - // Combine query and document tokens - // Schema (tokenizer must be exported without --add_special_tokens flag, we will add it manually) - /* - BOS_TOKEN EOS_TOKEN SEP_TOKEN EOS_TOKEN - BOS_TOKEN EOS_TOKEN SEP_TOKEN EOS_TOKEN - BOS_TOKEN EOS_TOKEN SEP_TOKEN EOS_TOKEN - BOS_TOKEN EOS_TOKEN SEP_TOKEN EOS_TOKEN - */ - - for (size_t i = 0; i < batch_size; i++) { - int64_t* input_ids_data = reinterpret_cast(input_ids.data()) + i * total_tokens_count_per_batch; - int64_t* attention_mask_data = reinterpret_cast(attention_mask.data()) + i * total_tokens_count_per_batch; - - int64_t* doc_input_ids_data = reinterpret_cast(doc_input_ids.data()) + i * tokens_count_of_longest_document; - - // Fill input_ids - input_ids_data[0] = this->bos_token; - std::memcpy(input_ids_data + 1, query_tokens.data(), query_tokens.size() * sizeof(int64_t)); - input_ids_data[query_tokens.size() + 1] = this->eos_token; - input_ids_data[query_tokens.size() + 2] = this->sep_token; - std::memcpy(input_ids_data + 1 + query_tokens.size() + 2, doc_input_ids_data, tokens_count_of_longest_document * sizeof(int64_t)); - - input_ids_data[total_tokens_count_per_batch - 1] = this->pad_token; - - auto it = std::find(doc_input_ids_data, doc_input_ids_data + tokens_count_of_longest_document, this->pad_token); - size_t pad_token_index = (it != doc_input_ids_data + tokens_count_of_longest_document) ? std::distance(doc_input_ids_data, it) : tokens_count_of_longest_document; - - input_ids_data[1 + query_tokens.size() + 2 + pad_token_index] = this->eos_token; - - // Fill attention_mask - std::fill(attention_mask_data, attention_mask_data + total_tokens_count_per_batch, int64_t(0)); - std::fill(attention_mask_data, attention_mask_data + 1 + query_tokens.size() + 2 + pad_token_index + 1, int64_t(1)); - } - - return std::make_pair(input_ids, attention_mask); - } - - std::vector ComputeScoresUsingRerankModel(ov::Tensor input_ids, ov::Tensor attention_mask, const std::vector& chunkMapping, size_t actual_batch_size) const { - if (rerank_session->getInputNames().size() != 2) // TODO: Support 3 inputs with token_type_ids - throw std::runtime_error("Rerank model should have 2 inputs"); - if (rerank_session->getOutputNames().size() != 1) // There should be only one output when exported with --task text-classification - throw std::runtime_error("Rerank model should have 1 output"); - - // Validate input/output names - if (rerank_session->getInputNames()[0] != "input_ids" && rerank_session->getInputNames()[1] != "input_ids") - throw std::runtime_error("Rerank model should have input_ids input"); - if (rerank_session->getInputNames()[0] != "attention_mask" && rerank_session->getInputNames()[1] != "attention_mask") - throw std::runtime_error("Rerank model should have attention_mask input"); - if (rerank_session->getOutputNames()[0] != "logits") - throw std::runtime_error("Rerank model should have logits output"); - - if (input_ids.get_shape()[1] > this->max_position_embeddings) - throw std::runtime_error("exceeding max_position_embeddings"); // should never happen - - ::InferenceInput rerank_input_map; - rerank_input_map["input_ids"] = input_ids; - rerank_input_map["attention_mask"] = attention_mask; - - SPDLOG_LOGGER_DEBUG(rerank_calculator_logger, "Starting inference rerank model"); - ::InferenceOutput rerank_output_map = rerank_session->infer(rerank_input_map); - SPDLOG_LOGGER_DEBUG(rerank_calculator_logger, "Finished inference rerank model"); - if (rerank_output_map.size() != 1) - throw std::runtime_error("Rerank model results should have 1 output"); - if (rerank_output_map.count("logits") != 1) - throw std::runtime_error("Rerank model results should have logits output"); - - auto logits = rerank_output_map.at("logits"); - - if (logits.get_shape().size() != 2) // 2D tensor - throw std::runtime_error("Logits should be 2D tensor"); - if (logits.get_shape()[0] != input_ids.get_shape()[0]) - throw std::runtime_error("Batch size mismatch"); - - std::vector scores; - scores.resize(actual_batch_size, 0); - - size_t logits_dim = logits.get_shape()[1]; - - for (int i = 0; i < input_ids.get_shape()[0]; ++i) { - size_t score_index = chunkMapping[i]; - if (score_index >= actual_batch_size) - throw std::runtime_error("score_index out of bounds"); // should never happen - float logit = logits_dim > 1 ? reinterpret_cast(logits.data())[i * logits_dim + 1] : reinterpret_cast(logits.data())[i]; - float score = 1 / (1 + std::exp(-logit)); - float current_highest_score = scores[score_index]; - scores[score_index] = std::max(current_highest_score, score); - } - - return scores; - } - - absl::Status Process(CalculatorContext* cc) final { - OVMS_PROFILE_FUNCTION(); - RET_CHECK(tokenizer_session != nullptr); - RET_CHECK(rerank_session != nullptr); - if (cc->Inputs().Tag(INPUT_TAG_NAME).IsEmpty()) { - return absl::InvalidArgumentError("Input is empty"); - } - InputDataType payload = cc->Inputs().Tag(INPUT_TAG_NAME).Get(); - SPDLOG_LOGGER_DEBUG(rerank_calculator_logger, "Request body: {}", payload.body); - SPDLOG_LOGGER_DEBUG(rerank_calculator_logger, "Request uri: {}", payload.uri); - RerankHandler handler(*payload.parsedJson); - absl::Status status = handler.parseRequest(); - if (!status.ok()) { - return status; - } - - try { - // Prepare inputs for rerank model - std::vector chunk_mapping; - auto [input_ids, attention_mask] = PrepareInputsForRerankModel(handler, chunk_mapping); - - // Compute scores using rerank model - size_t batch_size = handler.getDocumentsList().size(); - auto scores = ComputeScoresUsingRerankModel( - input_ids, - attention_mask, - chunk_mapping, - batch_size); - - // Serialize scores - StringBuffer buffer; - status = handler.parseResponse(buffer, scores); - if (!status.ok()) { - return status; - } - cc->Outputs().Tag(OUTPUT_TAG_NAME).Add(new std::string(buffer.GetString()), timestamp); - return absl::OkStatus(); - } catch (ov::AssertFailure& e) { - SPDLOG_LOGGER_ERROR(rerank_calculator_logger, "OpenVINO Assert Failure: {}", e.what()); - return absl::InternalError(e.what()); - } catch (std::runtime_error& e) { - SPDLOG_LOGGER_ERROR(rerank_calculator_logger, "runtime_error: {}", e.what()); - return absl::InternalError(e.what()); - } catch (...) { - SPDLOG_LOGGER_ERROR(rerank_calculator_logger, "Unknown error"); - return absl::InternalError("Unknown error"); - } - } -}; -const std::string RerankCalculator::INPUT_TAG_NAME{"REQUEST_PAYLOAD"}; -const std::string RerankCalculator::OUTPUT_TAG_NAME{"RESPONSE_PAYLOAD"}; - -REGISTER_CALCULATOR(RerankCalculator); - -} // namespace mediapipe diff --git a/src/rerank/rerank_calculator.proto b/src/rerank/rerank_calculator.proto deleted file mode 100644 index 321348727d..0000000000 --- a/src/rerank/rerank_calculator.proto +++ /dev/null @@ -1,32 +0,0 @@ -//***************************************************************************** -// Copyright 2024 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -//***************************************************************************** - -syntax = "proto2"; -package mediapipe; - -import "mediapipe/framework/calculator.proto"; - -message RerankCalculatorOptions { - extend mediapipe.CalculatorOptions { - // https://github.com/google/mediapipe/issues/634 have to be unique in app - // no rule to obtain this - optional RerankCalculatorOptions ext = 113473741; - } - - optional uint64 max_allowed_chunks = 1 [default = 10000]; // Default taken from Cohere API documentation - - optional uint64 max_position_embeddings = 2; -} diff --git a/src/test/mediapipeflow_test.cpp b/src/test/mediapipeflow_test.cpp index 55b6ab96ed..1826da8d37 100644 --- a/src/test/mediapipeflow_test.cpp +++ b/src/test/mediapipeflow_test.cpp @@ -3797,7 +3797,6 @@ TEST(WhitelistRegistered, MediapipeCalculatorsList) { "DetectionsToRectsCalculator", "DetectionsToRenderDataCalculator", "EmbeddingsCalculatorOV", - "RerankCalculator", "RerankCalculatorOV", "EmptyLabelCalculator", "EmptyLabelClassificationCalculator", diff --git a/src/test/rerank/with_params/graph.pbtxt b/src/test/rerank/with_params/graph.pbtxt deleted file mode 100644 index d710ba75c4..0000000000 --- a/src/test/rerank/with_params/graph.pbtxt +++ /dev/null @@ -1,47 +0,0 @@ -# Copyright 2025 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -input_stream: "REQUEST_PAYLOAD:input" -output_stream: "RESPONSE_PAYLOAD:output" -node { - calculator: "OpenVINOModelServerSessionCalculator" - output_side_packet: "SESSION:tokenizer" - node_options: { - [type.googleapis.com / mediapipe.OpenVINOModelServerSessionCalculatorOptions]: { - servable_name: "tokenizer_model" - } - } -} -node { - calculator: "OpenVINOModelServerSessionCalculator" - output_side_packet: "SESSION:rerank" - node_options: { - [type.googleapis.com / mediapipe.OpenVINOModelServerSessionCalculatorOptions]: { - servable_name: "rerank_model" - } - } -} -node { - input_side_packet: "TOKENIZER_SESSION:tokenizer" - input_side_packet: "RERANK_SESSION:rerank" - calculator: "RerankCalculator" - input_stream: "REQUEST_PAYLOAD:input" - output_stream: "RESPONSE_PAYLOAD:output" - node_options: { - [type.googleapis.com / mediapipe.RerankCalculatorOptions]: { - max_allowed_chunks: 4 - max_position_embeddings: 12 - } - } -} diff --git a/src/test/rerank/with_params/invalid_graph.pbtxt b/src/test/rerank/with_params/invalid_graph.pbtxt deleted file mode 100644 index 39d3b1be93..0000000000 --- a/src/test/rerank/with_params/invalid_graph.pbtxt +++ /dev/null @@ -1,47 +0,0 @@ -# Copyright 2025 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -input_stream: "REQUEST_PAYLOAD:input" -output_stream: "RESPONSE_PAYLOAD:output" -node { - calculator: "OpenVINOModelServerSessionCalculator" - output_side_packet: "SESSION:tokenizer" - node_options: { - [type.googleapis.com / mediapipe.OpenVINOModelServerSessionCalculatorOptions]: { - servable_name: "tokenizer_model" - } - } -} -node { - calculator: "OpenVINOModelServerSessionCalculator" - output_side_packet: "SESSION:rerank" - node_options: { - [type.googleapis.com / mediapipe.OpenVINOModelServerSessionCalculatorOptions]: { - servable_name: "rerank_model" - } - } -} -node { - input_side_packet: "TOKENIZER_SESSION:tokenizer" - input_side_packet: "RERANK_SESSION:rerank" - calculator: "RerankCalculator" - input_stream: "REQUEST_PAYLOAD:input" - output_stream: "RESPONSE_PAYLOAD:output" - node_options: { - [type.googleapis.com / mediapipe.RerankCalculatorOptions]: { - max_allowed_chunks: 4 - max_position_embeddings: 8 # invalid due to number of special tokens (4) + space for query (4) = 8, no space for document - } - } -} diff --git a/src/test/rerank/with_params/invalid_graph_ov.pbtxt b/src/test/rerank/with_params/invalid_graph_ov.pbtxt index 39d3b1be93..fb9ebfd5e1 100644 --- a/src/test/rerank/with_params/invalid_graph_ov.pbtxt +++ b/src/test/rerank/with_params/invalid_graph_ov.pbtxt @@ -32,16 +32,5 @@ node { } } } -node { - input_side_packet: "TOKENIZER_SESSION:tokenizer" - input_side_packet: "RERANK_SESSION:rerank" - calculator: "RerankCalculator" - input_stream: "REQUEST_PAYLOAD:input" - output_stream: "RESPONSE_PAYLOAD:output" - node_options: { - [type.googleapis.com / mediapipe.RerankCalculatorOptions]: { - max_allowed_chunks: 4 - max_position_embeddings: 8 # invalid due to number of special tokens (4) + space for query (4) = 8, no space for document - } - } -} + + diff --git a/yarn.lock b/yarn.lock index c95abed3b6..964e3ed385 100644 --- a/yarn.lock +++ b/yarn.lock @@ -11,3 +11,50 @@ semver "5.6.0" source-map-support "0.5.9" tsutils "3.21.0" + +"@bazel/worker@5.7.2": + version "5.7.2" + resolved "https://registry.yarnpkg.com/@bazel/worker/-/worker-5.7.2.tgz#43d800dc1b5a3707340a4eb0102da81c53fc6f63" + integrity sha512-H+auDA0QKF4mtZxKkZ2OKJvD7hGXVsVKtvcf4lbb93ur0ldpb5k810PcDxngmIGBcIX5kmyxniNTIiGFNobWTg== + dependencies: + google-protobuf "^3.6.1" + +buffer-from@^1.0.0: + version "1.1.2" + resolved "https://registry.yarnpkg.com/buffer-from/-/buffer-from-1.1.2.tgz#2b146a6fd72e80b4f55d255f35ed59a3a9a41bd5" + integrity sha512-E+XQCRwSbaaiChtv6k6Dwgc+bx+Bs6vuKJHHl5kox/BaKbhiXzqQOwK4cO22yElGp2OCmjwVhT3HmxgyPGnJfQ== + +google-protobuf@^3.6.1: + version "3.21.4" + resolved "https://registry.yarnpkg.com/google-protobuf/-/google-protobuf-3.21.4.tgz#2f933e8b6e5e9f8edde66b7be0024b68f77da6c9" + integrity sha512-MnG7N936zcKTco4Jd2PX2U96Kf9PxygAPKBug+74LHzmHXmceN16MmRcdgZv+DGef/S9YvQAfRsNCn4cjf9yyQ== + +semver@5.6.0: + version "5.6.0" + resolved "https://registry.yarnpkg.com/semver/-/semver-5.6.0.tgz#7e74256fbaa49c75aa7c7a205cc22799cac80004" + integrity sha512-RS9R6R35NYgQn++fkDWaOmqGoj4Ek9gGs+DPxNUZKuwE183xjJroKvyo1IzVFeXvUrvmALy6FWD5xrdJT25gMg== + +source-map-support@0.5.9: + version "0.5.9" + resolved "https://registry.yarnpkg.com/source-map-support/-/source-map-support-0.5.9.tgz#41bc953b2534267ea2d605bccfa7bfa3111ced5f" + integrity sha512-gR6Rw4MvUlYy83vP0vxoVNzM6t8MUXqNuRsuBmBHQDu1Fh6X015FrLdgoDKcNdkwGubozq0P4N0Q37UyFVr1EA== + dependencies: + buffer-from "^1.0.0" + source-map "^0.6.0" + +source-map@^0.6.0: + version "0.6.1" + resolved "https://registry.yarnpkg.com/source-map/-/source-map-0.6.1.tgz#74722af32e9614e9c287a8d0bbde48b5e2f1a263" + integrity sha512-UjgapumWlbMhkBgzT7Ykc5YXUT46F0iKu8SGXq0bcwP5dz/h0Plj6enJqjz1Zbq2l5WaqYnrVbwWOWMyF3F47g== + +tslib@^1.8.1: + version "1.14.1" + resolved "https://registry.yarnpkg.com/tslib/-/tslib-1.14.1.tgz#cf2d38bdc34a134bcaf1091c41f6619e2f672d00" + integrity sha512-Xni35NKzjgMrwevysHTCArtLDpPvye8zV/0E4EyYn43P7/7qvQwPh9BGkHewbMulVntbigmcT7rdX3BNo9wRJg== + +tsutils@3.21.0: + version "3.21.0" + resolved "https://registry.yarnpkg.com/tsutils/-/tsutils-3.21.0.tgz#b48717d394cea6c1e096983eed58e9d61715b623" + integrity sha512-mHKK3iUXL+3UF6xL5k0PEhKRUBKPBCv/+RkEOpjRWxxx27KKRBmmA60A9pgOUvMi8GKhRMPEmjBRPzs2W7O1OA== + dependencies: + tslib "^1.8.1"