Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
62 changes: 61 additions & 1 deletion src/rerank/rerank_calculator_ov.cc
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,8 @@ class RerankCalculatorOV : public CalculatorBase {
static const std::string RERANK_MODEL_INPUT_IDS_NAME;
static const std::string RERANK_MODEL_ATTENTION_MASK_NAME;
static const std::string RERANK_MODEL_TOKEN_TYPE_IDS_NAME;
static const std::string RERANK_MODEL_POSITION_IDS_NAME;
static const std::string RERANK_MODEL_BEAM_IDX_NAME;
static constexpr size_t NUMBER_OF_SPECIAL_TOKENS = 4;

mediapipe::Timestamp timestamp{0};
Expand Down Expand Up @@ -151,6 +153,39 @@ class RerankCalculatorOV : public CalculatorBase {
// Validate batch size before tokenizing
if (handler.getDocumentsList().size() > this->max_allowed_chunks)
throw std::runtime_error("Number of documents exceeds max_allowed_chunks");
if (rerank_session->isQwen3) {
// Qwen3 reranker: format each query-document pair using the chat template
// Template from openvino-2026.0-genai/tests/python_tests/utils/qwen3_reranker_utils.py
auto batchSize = handler.getDocumentsList().size();
std::vector<std::string> data(batchSize);

std::string prefix = "<|im_start|>system\nJudge whether the Document meets the requirements "
"based on the Query and the Instruct provided. Note that the answer can only be "
"\"yes\" or \"no\".<|im_end|>\n<|im_start|>user\n"
"<Instruct>: Given a web search query, retrieve relevant passages that answer the query\n"
"<Query>: " + handler.getQuery() + "\n";
std::string suffix = "<|im_end|>\n<|im_start|>assistant\n<think>\n\n</think>\n\n";

for (size_t i = 0; i < batchSize; i++) {
data[i] = prefix + "<Document>: " + handler.getDocumentsList()[i] + suffix;
}

chunk_mapping.resize(batchSize);
std::iota(chunk_mapping.begin(), chunk_mapping.end(), 0);
auto tokens = rerank_session->getTokenizer().encode(data);
if (tokens.input_ids.get_shape().size() != 2) {
throw std::runtime_error("Tokens shape invalid.");
}
if (this->max_position_embeddings < tokens.input_ids.get_shape()[1]) {
std::ostringstream msg;
msg << "Qwen3 rerank request length of " << tokens.input_ids.get_shape()[1]
<< " tokens exceeds the model context of " << max_position_embeddings;
throw std::runtime_error(msg.str());
}
SPDLOG_LOGGER_DEBUG(rerank_calculator_logger, "Qwen3 rerank: {} documents, {} tokens per sequence",
batchSize, tokens.input_ids.get_shape()[1]);
Comment on lines +176 to +186
Copy link

Copilot AI Mar 23, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In the Qwen3 path, the tokenizer outputs are not validated the way the non-Qwen3 path effectively is (via chunkDocuments()), but later code assumes attention_mask is i64 and uses attention_mask.data<int64_t>() to compute position_ids. Please add explicit validation of tokens.input_ids/tokens.attention_mask element types and shapes for the Qwen3 branch to avoid UB if the tokenizer output precision/layout differs.

Suggested change
if (tokens.input_ids.get_shape().size() != 2) {
throw std::runtime_error("Tokens shape invalid.");
}
if (this->max_position_embeddings < tokens.input_ids.get_shape()[1]) {
std::ostringstream msg;
msg << "Qwen3 rerank request length of " << tokens.input_ids.get_shape()[1]
<< " tokens exceeds the model context of " << max_position_embeddings;
throw std::runtime_error(msg.str());
}
SPDLOG_LOGGER_DEBUG(rerank_calculator_logger, "Qwen3 rerank: {} documents, {} tokens per sequence",
batchSize, tokens.input_ids.get_shape()[1]);
const ov::Shape& input_shape = tokens.input_ids.get_shape();
const ov::Shape& mask_shape = tokens.attention_mask.get_shape();
// Basic rank validation
if (input_shape.size() != 2 || mask_shape.size() != 2) {
throw std::runtime_error("Qwen3 tokenizer outputs must be 2D tensors [batch, sequence].");
}
// Ensure attention_mask layout matches input_ids layout
if (input_shape != mask_shape) {
throw std::runtime_error("Qwen3 tokenizer outputs have mismatched shapes for input_ids and attention_mask.");
}
// Ensure element types match the assumptions made later (attention_mask.data<int64_t>())
if (tokens.input_ids.get_element_type() != ov::element::i64) {
throw std::runtime_error("Qwen3 tokenizer input_ids tensor must have i64 element type.");
}
if (tokens.attention_mask.get_element_type() != ov::element::i64) {
throw std::runtime_error("Qwen3 tokenizer attention_mask tensor must have i64 element type.");
}
if (this->max_position_embeddings < input_shape[1]) {
std::ostringstream msg;
msg << "Qwen3 rerank request length of " << input_shape[1]
<< " tokens exceeds the model context of " << max_position_embeddings;
throw std::runtime_error(msg.str());
}
SPDLOG_LOGGER_DEBUG(rerank_calculator_logger, "Qwen3 rerank: {} documents, {} tokens per sequence",
batchSize, input_shape[1]);

Copilot uses AI. Check for mistakes.
return std::make_pair(tokens.input_ids, tokens.attention_mask);
}
if (!rerank_session->addBosToken) {
auto batchSize = handler.getDocumentsList().size();
std::vector<std::string> data(batchSize);
Expand Down Expand Up @@ -257,6 +292,27 @@ class RerankCalculatorOV : public CalculatorBase {
if (typeIds.has_value()) {
inferRequest.set_tensor(RERANK_MODEL_TOKEN_TYPE_IDS_NAME, typeIds.value());
}
// For CausalLM models (e.g. Qwen3 rerankers): set position_ids and beam_idx
if (rerank_session->hasPositionIds) {
size_t batch = input_ids.get_shape()[0];
size_t seq_len = input_ids.get_shape()[1];
auto position_ids = ov::Tensor(ov::element::i64, input_ids.get_shape());
int64_t* pos_data = position_ids.data<int64_t>();
int64_t* attn_data = attention_mask.data<int64_t>();
for (size_t b = 0; b < batch; b++) {
int64_t pos = 0;
for (size_t s = 0; s < seq_len; s++) {
pos_data[b * seq_len + s] = attn_data[b * seq_len + s] ? pos++ : 0;
}
}
inferRequest.set_tensor(RERANK_MODEL_POSITION_IDS_NAME, position_ids);
}
if (rerank_session->hasBeamIdx) {
size_t batch = input_ids.get_shape()[0];
auto beam_idx = ov::Tensor(ov::element::i32, {batch});
std::fill_n(beam_idx.data<int32_t>(), batch, 0);
Comment on lines +299 to +313
Copy link

Copilot AI Mar 23, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

position_ids and beam_idx tensors are created with hard-coded element types (i64 / i32). If the model’s actual input element types differ, set_tensor() will fail at runtime. Please derive the element type from the model/compiled model input (by name) and allocate the tensors with the expected type (or validate and error with a clear message).

Suggested change
auto position_ids = ov::Tensor(ov::element::i64, input_ids.get_shape());
int64_t* pos_data = position_ids.data<int64_t>();
int64_t* attn_data = attention_mask.data<int64_t>();
for (size_t b = 0; b < batch; b++) {
int64_t pos = 0;
for (size_t s = 0; s < seq_len; s++) {
pos_data[b * seq_len + s] = attn_data[b * seq_len + s] ? pos++ : 0;
}
}
inferRequest.set_tensor(RERANK_MODEL_POSITION_IDS_NAME, position_ids);
}
if (rerank_session->hasBeamIdx) {
size_t batch = input_ids.get_shape()[0];
auto beam_idx = ov::Tensor(ov::element::i32, {batch});
std::fill_n(beam_idx.data<int32_t>(), batch, 0);
// Derive element types from the compiled model inputs to avoid dtype mismatches.
const ov::element::Type pos_element_type =
inferRequest.get_compiled_model().input(RERANK_MODEL_POSITION_IDS_NAME).get_element_type();
const ov::element::Type mask_element_type =
inferRequest.get_compiled_model().input(RERANK_MODEL_ATTENTION_MASK_NAME).get_element_type();
ov::Tensor position_ids(pos_element_type, input_ids.get_shape());
if (pos_element_type == ov::element::i64) {
int64_t* pos_data = position_ids.data<int64_t>();
if (mask_element_type == ov::element::i64) {
int64_t* attn_data = attention_mask.data<int64_t>();
for (size_t b = 0; b < batch; b++) {
int64_t pos = 0;
for (size_t s = 0; s < seq_len; s++) {
pos_data[b * seq_len + s] = attn_data[b * seq_len + s] ? pos++ : 0;
}
}
} else if (mask_element_type == ov::element::i32) {
int32_t* attn_data = attention_mask.data<int32_t>();
for (size_t b = 0; b < batch; b++) {
int64_t pos = 0;
for (size_t s = 0; s < seq_len; s++) {
pos_data[b * seq_len + s] = attn_data[b * seq_len + s] ? pos++ : 0;
}
}
} else {
throw std::runtime_error("Unsupported attention_mask element type for position_ids generation");
}
} else if (pos_element_type == ov::element::i32) {
int32_t* pos_data = position_ids.data<int32_t>();
if (mask_element_type == ov::element::i64) {
int64_t* attn_data = attention_mask.data<int64_t>();
for (size_t b = 0; b < batch; b++) {
int32_t pos = 0;
for (size_t s = 0; s < seq_len; s++) {
pos_data[b * seq_len + s] =
attn_data[b * seq_len + s] ? static_cast<int32_t>(pos++) : 0;
}
}
} else if (mask_element_type == ov::element::i32) {
int32_t* attn_data = attention_mask.data<int32_t>();
for (size_t b = 0; b < batch; b++) {
int32_t pos = 0;
for (size_t s = 0; s < seq_len; s++) {
pos_data[b * seq_len + s] = attn_data[b * seq_len + s] ? pos++ : 0;
}
}
} else {
throw std::runtime_error("Unsupported attention_mask element type for position_ids generation");
}
} else {
throw std::runtime_error("Unsupported position_ids element type in compiled model");
}
inferRequest.set_tensor(RERANK_MODEL_POSITION_IDS_NAME, position_ids);
}
if (rerank_session->hasBeamIdx) {
size_t batch = input_ids.get_shape()[0];
const ov::element::Type beam_element_type =
inferRequest.get_compiled_model().input(RERANK_MODEL_BEAM_IDX_NAME).get_element_type();
ov::Tensor beam_idx(beam_element_type, {batch});
if (beam_element_type == ov::element::i32) {
std::fill_n(beam_idx.data<int32_t>(), batch, 0);
} else if (beam_element_type == ov::element::i64) {
std::fill_n(beam_idx.data<int64_t>(), batch, 0);
} else {
throw std::runtime_error("Unsupported beam_idx element type in compiled model");
}

Copilot uses AI. Check for mistakes.
inferRequest.set_tensor(RERANK_MODEL_BEAM_IDX_NAME, beam_idx);
}
Comment on lines +295 to +315
Copy link

Copilot AI Mar 23, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This PR introduces a new Qwen3-specific request formatting path and new model inputs (position_ids, beam_idx) handling, but there are no unit/functional tests covering these behaviors. Since the repo already has rerank-related tests (e.g. src/test/reranknode_test.cpp, src/test/rerank_chunking_test.cpp), please add coverage that at least validates Qwen3 detection from config.json and that the calculator sets the expected extra tensors / produces a [batch, 1] logits output.

Copilot uses AI. Check for mistakes.
inferRequest.start_async();
inferRequest.wait();
auto logits = inferRequest.get_tensor("logits");
Expand Down Expand Up @@ -321,7 +377,9 @@ class RerankCalculatorOV : public CalculatorBase {
std::vector<size_t> chunk_mapping;
auto [input_ids, attention_mask] = PrepareInputsForRerankModel(handler, chunk_mapping);
std::optional<ov::Tensor> typeIds;
if (rerank_session->getNumberOfModelInputs() == 3) {
// Only create token_type_ids for non-Qwen3 models with 3 inputs
// (Qwen3 CausalLM has position_ids as 3rd input, not token_type_ids)
if (!rerank_session->isQwen3 && rerank_session->getNumberOfModelInputs() == 3) {
typeIds = ov::Tensor{ov::element::i64, input_ids.get_shape()};
std::fill_n(typeIds->data<int64_t>(), input_ids.get_size(), 0);
}
Expand Down Expand Up @@ -359,6 +417,8 @@ const std::string RerankCalculatorOV::OUTPUT_TAG_NAME{"RESPONSE_PAYLOAD"};
// Tensor names used to feed the rerank model's inputs via InferRequest::set_tensor().
const std::string RerankCalculatorOV::RERANK_MODEL_INPUT_IDS_NAME{"input_ids"};
const std::string RerankCalculatorOV::RERANK_MODEL_ATTENTION_MASK_NAME{"attention_mask"};
const std::string RerankCalculatorOV::RERANK_MODEL_TOKEN_TYPE_IDS_NAME{"token_type_ids"};
// Extra inputs present on CausalLM exports (e.g. Qwen3 rerankers); only set when the
// servable reports hasPositionIds / hasBeamIdx for the loaded model.
const std::string RerankCalculatorOV::RERANK_MODEL_POSITION_IDS_NAME{"position_ids"};
const std::string RerankCalculatorOV::RERANK_MODEL_BEAM_IDX_NAME{"beam_idx"};

REGISTER_CALCULATOR(RerankCalculatorOV);

Expand Down
122 changes: 122 additions & 0 deletions src/rerank/rerank_servable.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -23,10 +23,18 @@
#include <string>
#include <unordered_map>

#include <openvino/opsets/opset1.hpp>
#include <openvino/opsets/opset8.hpp>
#include <openvino/core/preprocess/pre_post_process.hpp>

namespace ovms {

struct RerankServable : SidepacketServable {
bool addBosToken = true;
bool isQwen3 = false;
bool hasPositionIds = false;
bool hasBeamIdx = false;

RerankServable(const std::string& modelDir, const std::string& targetDevice, const std::string& pluginConfig, const std::string& graphPath) :
SidepacketServable(modelDir, targetDevice, pluginConfig, graphPath) {
std::filesystem::path tokenizerConfigPath = (parsedModelsPath / "tokenizer_config.json");
Expand All @@ -49,6 +57,120 @@ struct RerankServable : SidepacketServable {
addBosToken = false;
}
}

protected:
std::shared_ptr<ov::Model> applyPrePostProcessing(ov::Core& core, std::shared_ptr<ov::Model> model, ov::AnyMap& properties) override {
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would split that logic. Since we only need graph postprocessing for Qwen3 I would extract detection to another isQwen3Model method and go with something like:

if (isQwen3Model)
	applyQwen3GraphPostProcessing()

I think it will be more straightforward at the higher level; as it stands, we have to read into that function to find the early return for non-Qwen3 models.

// Detect Qwen3 model type from config.json
std::filesystem::path configPath = parsedModelsPath / "config.json";
if (std::filesystem::exists(configPath)) {
std::ifstream ifs(configPath.string());
if (ifs.is_open()) {
rapidjson::Document modelConfig;
rapidjson::IStreamWrapper isw(ifs);
rapidjson::ParseResult parseResult = modelConfig.ParseStream(isw);
if (!parseResult.Code()) {
if (modelConfig.HasMember("model_type") && modelConfig["model_type"].IsString()) {
std::string modelType = modelConfig["model_type"].GetString();
if (modelType == "qwen3") {
SPDLOG_INFO("Detected Qwen3 reranker model, applying specialized postprocessing");
isQwen3 = true;
}
}
}
}
}

if (!isQwen3) {
return model;
}

// Check model inputs for position_ids and beam_idx
for (const auto& input : model->inputs()) {
if (input.get_any_name() == "position_ids") {
hasPositionIds = true;
SPDLOG_DEBUG("Qwen3 reranker model has position_ids input");
}
if (input.get_any_name() == "beam_idx") {
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could this be an `else if`?

hasBeamIdx = true;
SPDLOG_DEBUG("Qwen3 reranker model has beam_idx input");
}
}

// Check output shape — only apply postprocessing for CausalLM models (3D output)
ov::PartialShape outputShape = model->get_output_partial_shape(0);
if (outputShape.rank().get_length() == 2) {
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think I would go for the wider check. If we expect 3D, let's check for 3D, so if rank != 3.

// Already a 2D output (text-classification export) — postprocessing won't help
// because the classification head has random weights
SPDLOG_WARN("Qwen3 reranker has 2D output shape (text-classification export). "
"Re-export with --task text-generation for correct scoring.");
return model;
Comment on lines +101 to +106
Copy link

Copilot AI Mar 23, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

outputShape.rank().get_length() will throw/assert if the output rank is dynamic. Consider using outputShape.rank() == 2 (which is safe with dynamic ranks) and also explicitly handling the expected CausalLM case (rank == 3) vs unexpected ranks (log and return/throw).

Suggested change
if (outputShape.rank().get_length() == 2) {
// Already a 2D output (text-classification export) — postprocessing won't help
// because the classification head has random weights
SPDLOG_WARN("Qwen3 reranker has 2D output shape (text-classification export). "
"Re-export with --task text-generation for correct scoring.");
return model;
ov::Rank outputRank = outputShape.rank();
if (outputRank.is_dynamic()) {
SPDLOG_WARN("Qwen3 reranker output rank is dynamic; skipping specialized postprocessing");
return model;
}
std::size_t outputRankLength = outputRank.get_length();
if (outputRankLength == 2) {
// Already a 2D output (text-classification export) — postprocessing won't help
// because the classification head has random weights
SPDLOG_WARN("Qwen3 reranker has 2D output shape (text-classification export). "
"Re-export with --task text-generation for correct scoring.");
return model;
} else if (outputRankLength != 3) {
SPDLOG_WARN("Qwen3 reranker has unexpected output rank {}. Expected 2 (classification) or 3 (CausalLM). "
"Skipping specialized postprocessing.",
outputRankLength);
return model;

Copilot uses AI. Check for mistakes.
}

// Look up yes/no token IDs
int64_t yesTokenId = -1;
int64_t noTokenId = -1;
{
auto yesTokens = tokenizer->encode("yes");
if (yesTokens.input_ids.get_size() == 1 && yesTokens.input_ids.get_element_type() == ov::element::i64) {
yesTokenId = reinterpret_cast<int64_t*>(yesTokens.input_ids.data())[0];
}
auto noTokens = tokenizer->encode("no");
if (noTokens.input_ids.get_size() == 1 && noTokens.input_ids.get_element_type() == ov::element::i64) {
noTokenId = reinterpret_cast<int64_t*>(noTokens.input_ids.data())[0];
}
Comment on lines +113 to +120
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not sure if it's safe to rely on tokenizer->encode as in certain settings it will treat it as an input prompt and add special tokens like <bos><yes_token> and we end up picking wrong tokens.
I can see in GenAI there is a direct vocab read for that:
https://github.com/openvinotoolkit/openvino.genai/blob/716a778fc0ccfa86f1395b186a0cb2ca8ed7ece5/src/cpp/src/rag/text_rerank_pipeline.cpp#L179
I think it would be safer to do it that way.

}

if (yesTokenId < 0 || noTokenId < 0) {
SPDLOG_ERROR("Failed to look up yes/no token IDs for Qwen3 reranker");
return model;
}
SPDLOG_INFO("Qwen3 reranker token IDs: yes={}, no={}", yesTokenId, noTokenId);

// Apply Qwen3 postprocessing to model graph
// Ported from openvino-2026.0-genai text_rerank_pipeline.cpp apply_qwen3_postprocessing()
//
// Input: model output logits [batch, seq_len, vocab_size]
// Output: [batch, 1] tensor containing (yes_logit - no_logit)
// sigmoid of this equals softmax P(yes), so OVMS's existing sigmoid scoring works.
ov::preprocess::PrePostProcessor processor(model);

processor.output().postprocess().custom(
[yesTokenId, noTokenId](const ov::Output<ov::Node>& node) -> std::shared_ptr<ov::Node> {
// Step 1: Slice last token — [batch, seq_len, vocab] → [batch, 1, vocab]
auto start = std::make_shared<ov::op::v0::Constant>(ov::element::i64, ov::Shape{1}, std::vector<int64_t>{-1});
auto stop = std::make_shared<ov::op::v0::Constant>(ov::element::i64, ov::Shape{1}, std::vector<int64_t>{std::numeric_limits<int64_t>::max()});
auto step = std::make_shared<ov::op::v0::Constant>(ov::element::i64, ov::Shape{1}, std::vector<int64_t>{1});
Comment on lines +140 to +142
Copy link

Copilot AI Mar 23, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

std::numeric_limits<int64_t>::max() is used here but <limits> isn’t included in this header. Please include <limits> explicitly to avoid relying on transitive includes (IWYU) and prevent fragile builds.

Copilot uses AI. Check for mistakes.
auto axis1 = std::make_shared<ov::op::v0::Constant>(ov::element::i64, ov::Shape{1}, std::vector<int64_t>{1});
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could this be named just `axis` or `sliceAxis`? There is no `axis2`, so we don't need to differentiate, and the constant's value already tells us which axis we pick.


auto lastTokenSlice = std::make_shared<ov::op::v8::Slice>(node, start, stop, step, axis1);

// Step 2: Squeeze seq_len dim — [batch, 1, vocab] → [batch, vocab]
auto squeezed = std::make_shared<ov::op::v0::Squeeze>(lastTokenSlice, axis1);

// Step 3: Gather yes and no logits — [batch, vocab] → [batch, 2]
auto indices = std::make_shared<ov::op::v0::Constant>(ov::element::i64, ov::Shape{2},
std::vector<int64_t>{noTokenId, yesTokenId});
auto gatherAxis = std::make_shared<ov::op::v0::Constant>(ov::element::i64, ov::Shape{1}, std::vector<int64_t>{1});
auto gathered = std::make_shared<ov::op::v8::Gather>(squeezed, indices, gatherAxis);

// Step 4: Compute yes_logit - no_logit → [batch, 1]
// gathered[:, 0] = no_logit, gathered[:, 1] = yes_logit
auto yesStart = std::make_shared<ov::op::v0::Constant>(ov::element::i64, ov::Shape{1}, std::vector<int64_t>{1});
auto yesStop = std::make_shared<ov::op::v0::Constant>(ov::element::i64, ov::Shape{1}, std::vector<int64_t>{2});
auto yesSlice = std::make_shared<ov::op::v8::Slice>(gathered, yesStart, yesStop, step, gatherAxis);

auto noStart = std::make_shared<ov::op::v0::Constant>(ov::element::i64, ov::Shape{1}, std::vector<int64_t>{0});
auto noStop = std::make_shared<ov::op::v0::Constant>(ov::element::i64, ov::Shape{1}, std::vector<int64_t>{1});
auto noSlice = std::make_shared<ov::op::v8::Slice>(gathered, noStart, noStop, step, gatherAxis);

// yes_logit - no_logit → sigmoid of this = softmax P(yes)
auto diff = std::make_shared<ov::op::v1::Subtract>(yesSlice, noSlice);

return diff; // [batch, 1]
Copy link

Copilot AI Mar 23, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The custom postprocess returns a new node without ensuring the output tensor name and element type match what the rest of the rerank pipeline expects. RerankCalculatorOV later fetches inferRequest.get_tensor("logits") and reads it as float*; if PrePostProcessor drops/changes the output name or leaves the output as f16, inference or scoring will break. Please either (a) set the postprocessed result tensor names back to "logits" and convert the output to f32 in the postprocess graph, or (b) update the calculator to query the compiled model’s output name and handle non-f32 element types.

Suggested change
return diff; // [batch, 1]
// Ensure the final output tensor matches pipeline expectations:
// - element type: f32 (RerankCalculatorOV reads as float*)
// - tensor name: "logits" (queried via inferRequest.get_tensor("logits"))
auto diffF32 = std::make_shared<ov::op::v0::Convert>(diff, ov::element::f32);
diffF32->set_friendly_name("logits");
return diffF32; // [batch, 1] logits in f32

Copilot uses AI. Check for mistakes.
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this might be valid. Setting the name will ensure the output read will work even if something changes due to graph surgery.
Can you confirm the new output node is indeed f32? I wonder if the convert op here is actually necessary.

});

return processor.build();
}
};

using RerankServableMap = std::unordered_map<std::string, std::shared_ptr<RerankServable>>;
Expand Down
Loading