Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
221 changes: 219 additions & 2 deletions ggml/src/ggml-openvino/ggml-decoder.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -59,13 +59,19 @@ GgmlOvDecoder::GgmlOvDecoder(ggml_cgraph * cgraph,
}

validate_cgraph();

for (int node_n = 0; node_n < cgraph->n_nodes; node_n++) {
auto * cur_node = cgraph->nodes[node_n];
m_nodes.push_back(cur_node);
set_input_output(cur_node);
}

m_is_full_model = has_inp_tokens && has_output;
if (!m_is_full_model) {
compute_cgraph_dynamic_dims();
add_extra_model_inputs_for_fallback();
add_extra_model_outputs_for_fallback();
}

for (int node_n = 0; node_n < cgraph->n_nodes; node_n++) {
m_node_info_list[node_n].node_op_case = compute_op_case(m_node_info_list[node_n].node);
m_node_info_list[node_n].node_op_type = compute_op_type(m_node_info_list[node_n].node);
Expand Down Expand Up @@ -150,6 +156,10 @@ void GgmlOvDecoder::set_input_output(ggml_tensor * node, bool naive) {
current_node_info.node_inputs[src_name] = src;
current_node_info.node_inputs_names.push_back(src_name);

if (src_name == "inp_tokens") {
has_inp_tokens = true;
}

// Add model inputs
if (!naive && !src->view_src) {
ggml_backend_buffer * buffer = src->buffer;
Expand All @@ -176,6 +186,9 @@ void GgmlOvDecoder::set_input_output(ggml_tensor * node, bool naive) {
if (!naive) {
// Model outputs are tensors with GGML_TENSOR_FLAG_OUTPUT flag and kv_caches
static std::set<std::string> debug_output_names = {};
if (node_output_name.find("output") != std::string::npos) {
has_output = true;
}
// Workaround: the final tensor "result_output" does not have GGML_TENSOR_FLAG_OUTPUT flag set in cgraph
if (node->op == GGML_OP_SET_ROWS || node->flags & GGML_TENSOR_FLAG_OUTPUT ||
node_output_name.find("output") != std::string::npos || debug_output_names.count(node_output_name)) {
Expand Down Expand Up @@ -264,6 +277,9 @@ int GgmlOvDecoder::compute_op_case(const ggml_tensor * node) const {
throw std::runtime_error("Unsupported VIEW case");
}
op_case = 2;
if (!m_is_full_model && m_model_inputs.find(std::string(src->name)) != m_model_inputs.end()) {
op_case = 0;
}
}
break;
}
Expand Down Expand Up @@ -359,7 +375,7 @@ void GgmlOvDecoder::validate_cgraph() const {
}
}

ov::PartialShape GgmlOvDecoder::get_graph_input_shape(const ggml_tensor * op, const ggml_tensor * input) const {
ov::PartialShape GgmlOvDecoder::get_graph_input_shape(const ggml_tensor * op, const ggml_tensor * input, int dynamic_dim_index) const {
auto name = std::string(input->name);
ov::PartialShape input_shape;

Expand Down Expand Up @@ -391,6 +407,9 @@ ov::PartialShape GgmlOvDecoder::get_graph_input_shape(const ggml_tensor * op, co
} else {
input_shape = ov::PartialShape{get_shape(input)};
}
if (dynamic_dim_index != -1) {
input_shape[3-dynamic_dim_index] = -1;
}
return input_shape;
}

Expand Down Expand Up @@ -863,3 +882,201 @@ const std::string & GgmlOvDecoder::get_op_type() const {
static const std::string unknown_op = "UNKNOWN_GGML_OP";
return unknown_op;
}

/**
* @brief Computes the dynamic dimensions for the computation graph nodes to support fallback mechanisms.
*
* This function traverses the computation graph and determines the dynamic dimensions
* for each node based on its operation type and dependencies. The dynamic dimension
* is stored in the `m_node_dynamic_dims` map, where a value of -1 indicates no dynamic
* dimension. Specific operations such as GGML_OP_GET_ROWS, GGML_OP_MUL, GGML_OP_VIEW,
* etc., are handled to compute the dynamic dimension index.
*
* Key behaviors:
* - Nodes with operations like GGML_OP_NONE, GGML_OP_GET_ROWS, GGML_OP_MUL, and others
* are analyzed to determine their dynamic dimensions.
* - Nodes with specific names (e.g., "inp_tokens", "inp_pos", "inp_out_ids") are
* explicitly assigned a dynamic dimension index of 0.
* - For operations like GGML_OP_VIEW and GGML_OP_RESHAPE, the function ensures that
* the dynamic dimension is uniquely determined; otherwise, a warning is printed.
* - Unhandled operations print a message indicating the node name and operation type.
*
* This function is critical for preparing the computation graph for execution, ensuring
* that dynamic dimensions are correctly propagated and resolved.
*/
void GgmlOvDecoder::compute_cgraph_dynamic_dims() {
auto visit_node = [&](auto && self, ggml_tensor * node) -> void {
if (!node) {
return;
}

if (node->op == GGML_OP_CPY) {
m_node_dynamic_dims[node] = -1;
}

if (m_node_dynamic_dims.count(node)) {
return;
}
for (int i = 0; i < GGML_MAX_SRC; i++) {
ggml_tensor * src = node->src[i];
if (src) {
self(self, src);
}
}
switch (node->op) {
case GGML_OP_NONE:
m_node_dynamic_dims[node] = -1;
if (std::string(node->name) == "inp_tokens" || std::string(node->name) == "inp_pos" ||
std::string(node->name) == "inp_out_ids") {
m_node_dynamic_dims[node] = 0;
}
break;
case GGML_OP_GET_ROWS:
m_node_dynamic_dims[node] = -1;
if (m_node_dynamic_dims[node->src[1]] != -1) {
m_node_dynamic_dims[node] = 1;
}
break;
case GGML_OP_MUL:
case GGML_OP_MUL_MAT:
m_node_dynamic_dims[node] = -1;
if (m_node_dynamic_dims[node->src[0]] != -1) {
m_node_dynamic_dims[node] = m_node_dynamic_dims[node->src[0]];
}
if (m_node_dynamic_dims[node->src[1]] != -1) {
m_node_dynamic_dims[node] = m_node_dynamic_dims[node->src[1]];
}
break;
case GGML_OP_VIEW:
case GGML_OP_FLASH_ATTN_EXT:
case GGML_OP_PERMUTE:
case GGML_OP_RESHAPE:
m_node_dynamic_dims[node] = -1;
if (m_node_dynamic_dims[node->src[0]] != -1) {
auto dynamic_dim_idx = m_node_dynamic_dims[node->src[0]];
auto dynamic_dim_value = node->src[0]->ne[dynamic_dim_idx];
int same_dim_count = 0;
for (int i = 0; i < 4; i++) {
if (node->ne[i] == dynamic_dim_value) {
m_node_dynamic_dims[node] = i;
same_dim_count++;
}
}
if (same_dim_count != 1) {
std::cout << "Cannot determine dynamic dim for node: " << node->name << std::endl;
}
}
break;
case GGML_OP_RMS_NORM:
case GGML_OP_ADD:
case GGML_OP_GLU:
case GGML_OP_ROPE:
case GGML_OP_SCALE:
m_node_dynamic_dims[node] = m_node_dynamic_dims[node->src[0]];
break;
case GGML_OP_CPY:
case GGML_OP_SET_ROWS:
m_node_dynamic_dims[node] = -1;
break;
default:
std::cout << "Doesn't handle node name: " << node->name << " op: " << ggml_op_name(node->op) << std::endl;
break;
}
};

for (int i = 0; i < m_cgraph->n_nodes; i++) {
ggml_tensor * node = m_cgraph->nodes[i];
visit_node(visit_node, node);
}
}

/**
* @brief Adds extra model outputs to support fallback mechanisms.
*
* This function ensures that all relevant nodes in the computation graph are included
* as model outputs for fallback scenarios. It creates a mapping of tensor data addresses
* to their corresponding nodes, excluding nodes with the GGML_OP_VIEW operation.
*
* Key behaviors:
* - Iterates through all nodes in the computation graph and maps their data addresses
* to the corresponding tensor nodes, skipping nodes with GGML_OP_VIEW.
* - Adds nodes to the `m_model_outputs` map if they are not already present, using
* the tensor's name as the key.
*
* This function is essential for ensuring that fallback mechanisms have access to all
* necessary model outputs, particularly in scenarios where certain outputs are not
* explicitly defined in the original model configuration.
*/
void GgmlOvDecoder::add_extra_model_outputs_for_fallback() {
std::map<void *, ggml_tensor *> address_map;
for (int i = 0; i < m_cgraph->n_nodes; i++) {
ggml_tensor * node = m_cgraph->nodes[i];
if (node->op == GGML_OP_VIEW) {
continue;
}
address_map[node->data] = node;
}

for (const auto & pair : address_map) {
const std::string & name = pair.second->name;
if (m_model_outputs.find(name) == m_model_outputs.end()) {
m_model_outputs[name] = pair.second;
}
}
}

/**
* @brief Adds extra model inputs to support fallback mechanisms.
*
* This function ensures that all necessary input nodes in the computation graph are
* included as model inputs for fallback scenarios. It iterates through the source nodes
* of each computation graph node and adds them to the `m_model_inputs` map if they meet
* specific criteria.
*
* Key behaviors:
* - Skips source nodes that are already present in `m_model_weights` or `m_model_inputs`.
* - Excludes intermediate nodes that are part of `m_node_info_list`.
* - For eligible source nodes, creates OpenVINO parameter nodes with appropriate types
* and shapes, and assigns them friendly names.
* - Updates the `m_inputs` and `m_model_inputs` maps with the new parameter nodes.
*
* This function is critical for ensuring that fallback mechanisms have access to all
* required model inputs, particularly in scenarios where certain inputs are not
* explicitly defined in the original model configuration.
*/
void GgmlOvDecoder::add_extra_model_inputs_for_fallback() {
for (int i = 0; i < m_cgraph->n_nodes; i++) {
ggml_tensor * node = m_cgraph->nodes[i];
for (int i = 0; i < GGML_MAX_SRC; i++) {
auto * src = node->src[i];
if (src == nullptr) {
continue;
}
std::string src_name = std::string(src->name);
if (m_model_weights.find(src_name) != m_model_weights.end()) {
continue;
}

bool is_intermediate_node = false;
for (const auto & node_info : m_node_info_list) {
if (node_info.node == src) {
is_intermediate_node = true;
break;
}
}
if (is_intermediate_node) {
continue;
}
if (m_model_inputs.find(src_name) != m_model_inputs.end()) {
continue;
}

m_inputs[src_name] = src;
auto param_node = std::make_shared<ov::op::v0::Parameter>(
get_ov_type(src), get_graph_input_shape(node, src, m_node_dynamic_dims[src]));
param_node->set_friendly_name(src_name);
param_node->output(0).get_tensor().set_names({src_name});
m_model_inputs[src_name] = param_node;
}
}
}
16 changes: 15 additions & 1 deletion ggml/src/ggml-openvino/ggml-decoder.h
Original file line number Diff line number Diff line change
Expand Up @@ -175,7 +175,7 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder {

virtual bool is_static() const override { return m_is_static; }

ov::PartialShape get_graph_input_shape(const ggml_tensor * op, const ggml_tensor * input) const;
ov::PartialShape get_graph_input_shape(const ggml_tensor * op, const ggml_tensor * input, int dynamic_dim_index=-1) const;

static void dump_cgraph(const ggml_cgraph * cgraph, std::string & filename);

Expand All @@ -202,8 +202,11 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder {

void set_compute_params(const ComputeParams & compute_params) { m_compute_params = compute_params; }

virtual bool is_full_model() const override {return m_is_full_model; }

bool m_is_static = false;
bool m_is_prefill = false;
bool m_is_full_model = true; // true when the cgraph is the whole model, false when it has been split
int m_prefill_chunk_size = 0;

static std::vector<size_t> get_shape(const ggml_tensor * tensor);
Expand All @@ -216,6 +219,13 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder {
void set_input_output(ggml_tensor * node, bool naive = false);
int compute_op_case(const ggml_tensor * node) const;

// @brief Computes the dynamic dimensions for the computation graph nodes to support fallback mechanisms.
void compute_cgraph_dynamic_dims();
// @brief Adds extra model outputs to support fallback mechanisms.
void add_extra_model_outputs_for_fallback();
// @brief Adds extra model inputs to support fallback mechanisms.
void add_extra_model_inputs_for_fallback();

void validate_cgraph() const;

ggml_cgraph * m_cgraph = nullptr;
Expand All @@ -228,6 +238,10 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder {
std::map<std::string, std::shared_ptr<ov::Node>> m_model_weights;
std::map<std::string, ggml_tensor *> m_model_outputs;
std::vector<NodeInfo> m_node_info_list;
std::map<ggml_tensor *, int> m_node_dynamic_dims; // map from ggml_tensor to its dynamic dimension index, -1 means static

bool has_inp_tokens = false;
bool has_output = false;

ModelParams m_model_params;
ComputeParams m_compute_params;
Expand Down
2 changes: 2 additions & 0 deletions ggml/src/ggml-openvino/openvino/decoder.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,8 @@ class GgmlDecoder : public DecoderBase {

virtual int get_op_case(int node_idx) const = 0;

virtual bool is_full_model() const = 0;

virtual const std::map<std::string, std::shared_ptr<ov::Node>>& get_model_inputs() const = 0;
virtual const std::map<std::string, std::shared_ptr<ov::Node>>& get_model_extra_inputs() const = 0;
virtual const std::map<std::string, std::shared_ptr<ov::Node>>& get_model_weights() const = 0;
Expand Down
2 changes: 1 addition & 1 deletion ggml/src/ggml-openvino/openvino/translate_session.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -190,7 +190,7 @@ std::shared_ptr<Model> TranslateSession::translate_graph(const frontend::InputMo
}
};

if (!m_naive) {
if (!m_naive && ggml_model_decoder->is_full_model()) {
preprocess(*tensor_map, *ggml_model_decoder);
}
ggml_model_decoder->visit_subgraph(node_visitor);
Expand Down
Loading