Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions src/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -348,6 +348,7 @@ ovms_cc_library(
"libovms_cliparser",
"libovms_systeminfo",
"ovms_exit_codes",
"//src/utils:env_guard",
],
visibility = ["//visibility:public",],
additional_copts = COPTS_DROGON,
Expand Down
44 changes: 39 additions & 5 deletions src/config.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,8 @@
// limitations under the License.
//*****************************************************************************
#include "config.hpp"

#include <algorithm>
#include <atomic>
#include <filesystem>
#include <limits>
#include <regex>
Expand All @@ -36,19 +37,37 @@
#include "modelconfig.hpp"
#include "stringutils.hpp"
#include "systeminfo.hpp"
#include "utils/env_guard.hpp"

namespace ovms {

// Core count reported by getCoreCount(); evaluated once, when this translation unit's globals are initialized.
const uint32_t AVAILABLE_CORES = getCoreCount();
// Upper bound for grpcWorkers() that validate() checks on Windows builds.
const uint32_t WIN_MAX_GRPC_WORKERS = 1;
// Largest value representable in uint16_t; by its name, the ceiling for port numbers.
const uint32_t MAX_PORT_NUMBER = std::numeric_limits<uint16_t>::max();

// For drogon, we need to minimize the number of default workers since this value is set for both: unary and streaming (making it always double)
const uint64_t DEFAULT_REST_WORKERS = AVAILABLE_CORES;
// Value returned by grpcMaxThreads() when the setting was not provided.
const uint32_t DEFAULT_GRPC_MAX_THREADS = AVAILABLE_CORES * 8.0;
// Value returned by grpcMemoryQuota() when the setting was not provided.
const size_t DEFAULT_GRPC_MEMORY_QUOTA = (size_t)2 * 1024 * 1024 * 1024; // 2GB
// Upper bound for restWorkers() checked by validate().
const uint64_t MAX_REST_WORKERS = 10'000;

// Keep the default number of drogon workers low: the value is applied to both unary and streaming handling, so it is effectively doubled.
// On Linux, the default is additionally limited by the maximum allowed number of open files.
#ifdef __linux__

const uint64_t RESERVED_OPEN_FILES = 15; // we need to reserve some file descriptors for other operations, so we don't want to use all of them for drogon workers
const uint64_t OPEN_FILES_PER_REST_WORKER = 7; // 5x rest_workers to initialize ovms and 2x rest_workers for new connections
uint64_t getDefaultRestWorkers() {
const uint64_t maxOpenFiles = getMaxOpenFilesLimit();
if (maxOpenFiles <= RESERVED_OPEN_FILES) {
return static_cast<uint64_t>(0);
}
return std::min(static_cast<uint64_t>(AVAILABLE_CORES), (maxOpenFiles - RESERVED_OPEN_FILES) / OPEN_FILES_PER_REST_WORKER);
}
#else
// Default REST worker count on non-Linux platforms: simply the available core count.
uint64_t getDefaultRestWorkers() {
    return static_cast<uint64_t>(AVAILABLE_CORES);
}
#endif

Config& Config::parse(int argc, char** argv) {
ovms::CLIParser parser;
ovms::ServerSettingsImpl serverSettings;
Expand All @@ -73,6 +92,14 @@ Config& Config::parse(int argc, char** argv) {
// Stores copies of the supplied server and model settings in this Config, optionally
// raises the OpenVINO log level, and returns the result of validate().
bool Config::parse(ServerSettingsImpl* serverSettings, ModelsSettingsImpl* modelsSettings) {
    this->serverSettings = *serverSettings;
    this->modelsSettings = *modelsSettings;

    // Function-local static, so the guard object outlives this call.
    // NOTE(review): presumably EnvGuard undoes the variable change on destruction - confirm in utils/env_guard.hpp.
    static EnvGuard envGuard;
#if defined(__linux__) || defined(_WIN32)
    const bool debugLoggingRequested = (this->serverSettings.logLevel == "DEBUG");
    if (debugLoggingRequested) {
        envGuard.set("OPENVINO_LOG_LEVEL", "4");
    }
#endif

    return validate();
}

Expand Down Expand Up @@ -297,7 +324,8 @@ bool Config::validate() {
}

// check rest_workers value
if (((restWorkers() > MAX_REST_WORKERS) || (restWorkers() < 2))) {
const uint32_t restWorkersValue = restWorkers(); // Cache to avoid multiple calls
if (((restWorkersValue > MAX_REST_WORKERS) || (restWorkersValue < 2))) {
std::cerr << "rest_workers count should be from 2 to " << MAX_REST_WORKERS << std::endl;
return false;
}
Expand All @@ -306,6 +334,12 @@ bool Config::validate() {
std::cerr << "rest_workers is set but rest_port is not set. rest_port is required to start rest servers" << std::endl;
return false;
}
#ifdef __linux__
if (restWorkersValue > (getMaxOpenFilesLimit() - RESERVED_OPEN_FILES) / 6) {
std::cerr << "rest_workers count cannot be larger than " << (getMaxOpenFilesLimit() - RESERVED_OPEN_FILES) / 6 << " due to open files limit. Current open files limit: " << getMaxOpenFilesLimit() << std::endl;
return false;
}
#endif

#ifdef _WIN32
if (grpcWorkers() > WIN_MAX_GRPC_WORKERS) {
Expand Down Expand Up @@ -368,7 +402,7 @@ const std::string Config::restBindAddress() const { return this->serverSettings.
uint32_t Config::grpcWorkers() const { return this->serverSettings.grpcWorkers; }
uint32_t Config::grpcMaxThreads() const { return this->serverSettings.grpcMaxThreads.value_or(DEFAULT_GRPC_MAX_THREADS); }
size_t Config::grpcMemoryQuota() const { return this->serverSettings.grpcMemoryQuota.value_or(DEFAULT_GRPC_MEMORY_QUOTA); }
// Effective number of REST workers: the configured value, or getDefaultRestWorkers()
// when none was provided, raised to a minimum of 2.
uint32_t Config::restWorkers() const {
    // value_or() yields the optional's own value type; widen it before clamping.
    const uint64_t configuredOrDefault = static_cast<uint64_t>(this->serverSettings.restWorkers.value_or(getDefaultRestWorkers()));
    return static_cast<uint32_t>(std::max(static_cast<uint64_t>(2), configuredOrDefault));
}
const std::string& Config::modelName() const { return this->modelsSettings.modelName; }
const std::string& Config::modelPath() const { return this->modelsSettings.modelPath; }
const std::string& Config::batchSize() const {
Expand Down
2 changes: 2 additions & 0 deletions src/llm/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -283,6 +283,8 @@ ovms_cc_library(
"//src:httppayload",
"//src:libhttpclientconnection",
"//src:sse_utils",
"//src:libovms_systeminfo",
"//src:libovms_config",
"//third_party:genai",] + select({
"//:disable_python": [],
"//:not_disable_python" : [":py_jinja_template_processor"],
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************
#include <algorithm>
#include <fstream>
#include <memory>
#include <stdexcept>
Expand All @@ -32,10 +33,12 @@
#pragma GCC diagnostic pop
#pragma warning(pop)

#include "../../../config.hpp"
#include "../../../json_parser.hpp"
#include "../../../logging.hpp"
#include "../../../mediapipe_internal/mediapipe_utils.hpp"
#include "../../../status.hpp"
#include "../../../systeminfo.hpp"
#include "llm_executor.hpp"
#include "servable.hpp"
#include "servable_initializer.hpp"
Expand Down Expand Up @@ -204,7 +207,10 @@ Status ContinuousBatchingServableInitializer::initialize(std::shared_ptr<GenAiSe
return status;
}

properties->tokenizerPluginConfig = {{"PERFORMANCE_HINT", "THROUGHPUT"}};
const uint32_t numStreams = std::min(static_cast<uint32_t>(Config::instance().restWorkers()), static_cast<uint32_t>(getCoreCount()));
SPDLOG_DEBUG("Setting tokenizer/detokenizer NUM_STREAMS to: {}", numStreams);
properties->tokenizerPluginConfig = {{"NUM_STREAMS", static_cast<int>(numStreams)}, {"PERFORMANCE_HINT", "THROUGHPUT"}};

try {
properties->pipeline = std::make_shared<ov::genai::ContinuousBatchingPipeline>(parsedModelsPath,
properties->schedulerConfig, properties->device,
Expand Down
25 changes: 24 additions & 1 deletion src/modelmanager.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,7 @@
#include "schema.hpp"
#include "servable_definition.hpp"
#include "stringutils.hpp"
#include "systeminfo.hpp"

namespace ovms {

Expand All @@ -79,7 +80,6 @@ const std::string DEFAULT_MODEL_CACHE_DIRECTORY = "c:\\Intel\\openvino_cache";
const std::string DEFAULT_MODEL_CACHE_DIRECTORY = "/opt/cache";
#endif
ModelManager::ModelManager(const std::string& modelCacheDirectory, MetricRegistry* registry, PythonBackend* pythonBackend) :
ieCore(std::make_unique<ov::Core>()),
pipelineFactory(std::make_unique<PipelineFactory>()),
#if (MEDIAPIPE_DISABLE == 0)
mediapipeFactory(std::make_unique<MediapipeFactory>(pythonBackend)),
Expand All @@ -89,6 +89,23 @@ ModelManager::ModelManager(const std::string& modelCacheDirectory, MetricRegistr
modelCacheDirectory(modelCacheDirectory),
metricRegistry(registry),
pythonBackend(pythonBackend) {
try {
this->ieCore = std::make_unique<ov::Core>();
const uint16_t detectedCoreCount = getCoreCount();
SPDLOG_DEBUG("Setting CPU inference_num_threads to: {}", detectedCoreCount);
this->ieCore->set_property("CPU", ov::inference_num_threads(static_cast<int>(detectedCoreCount)));

#ifdef __linux__
if (isRunningInDocker()) {
const bool cpuQuotaDefined = getDockerCpuQuota() > 0;
this->ieCore->set_property("CPU", ov::hint::enable_cpu_pinning(!cpuQuotaDefined));
}
#endif
} catch (const std::exception& ex) {
SPDLOG_CRITICAL("Failed to initialize OpenVINO Core with CPU properties set from detected core count and Docker constraints. Reason: {}", ex.what());
throw;
}

OV_LOGGER("ov::Core(): {}", reinterpret_cast<void*>(this->ieCore.get()));
// Take --cache_dir from CLI
if (this->modelCacheDirectory.empty()) {
Expand Down Expand Up @@ -151,6 +168,12 @@ ModelManager::ModelManager(const std::string& modelCacheDirectory, MetricRegistr
throw;
}
this->logPluginConfiguration();
#ifdef __linux__
if (isRunningInDocker()) {
SPDLOG_INFO("Running inside Docker container");
SPDLOG_INFO("cpu quota: {}, cpu affinity: {}, max_open_files: {}", getDockerCpuQuota(), getCpuAffinityCount(), getMaxOpenFilesLimit());
}
#endif
}

void ModelManager::logPluginConfiguration() {
Expand Down
121 changes: 120 additions & 1 deletion src/systeminfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,16 +15,135 @@
//*****************************************************************************
#include "systeminfo.hpp"

#include <algorithm>
#include <fstream>
#include <limits>
#include <sstream>
#include <string>
#include <thread>

#ifdef __linux__
#include <sched.h>
#include <sys/resource.h>
#endif

#include "logging.hpp"
#include "status.hpp"

namespace ovms {
// Returns the number of CPU cores this process should assume it can use.
// The baseline is std::thread::hardware_concurrency(). On Linux, when a Docker
// container is detected, the baseline is replaced by the CPU affinity count,
// further limited by the cgroup CPU quota when one is set.
uint16_t getCoreCount() {
    uint16_t detectedCoreCount = static_cast<uint16_t>(std::thread::hardware_concurrency());
#ifdef __linux__
    if (isRunningInDocker()) {
        const uint16_t affinityCount = getCpuAffinityCount();
        const uint16_t quotaCount = getDockerCpuQuota();
        // getDockerCpuQuota() returns 0 for "no quota set"; only a positive quota may lower the count.
        detectedCoreCount = (quotaCount > 0) ? std::min(affinityCount, quotaCount) : affinityCount;
    }
#endif
    return detectedCoreCount;
}
Comment on lines 34 to +48

// Returns the current (soft) RLIMIT_NOFILE value on Linux. On other platforms, or when
// getrlimit() fails, returns the maximum uint64_t value.
uint64_t getMaxOpenFilesLimit() {
    uint64_t openFilesLimit = std::numeric_limits<uint64_t>::max();
#ifdef __linux__
    struct rlimit nofileLimit;
    if (getrlimit(RLIMIT_NOFILE, &nofileLimit) == 0) {
        openFilesLimit = nofileLimit.rlim_cur;
    }
#endif
    return openFilesLimit;
}

#ifdef __linux__

// Heuristic container detection: true when /.dockerenv can be opened, or when any line
// of /proc/self/cgroup contains the substring "docker".
bool isRunningInDocker() {
    // Check for /.dockerenv file
    if (std::ifstream("/.dockerenv").good()) {
        return true;
    }

    // Check /proc/self/cgroup for docker references
    std::ifstream cgroupFile("/proc/self/cgroup");
    if (!cgroupFile.is_open()) {
        return false;
    }
    for (std::string entry; std::getline(cgroupFile, entry);) {
        if (entry.find("docker") != std::string::npos) {
            return true;
        }
    }
    return false;
}

// Counts the CPUs present in the calling process's affinity mask. When the mask cannot
// be queried, falls back to std::thread::hardware_concurrency().
uint16_t getCpuAffinityCount() {
    cpu_set_t allowedCpus;
    CPU_ZERO(&allowedCpus);

    // pid 0 addresses the calling thread/process.
    const bool affinityKnown = sched_getaffinity(0, sizeof(allowedCpus), &allowedCpus) != -1;
    if (affinityKnown) {
        return static_cast<uint16_t>(CPU_COUNT(&allowedCpus));
    }
    return static_cast<uint16_t>(std::thread::hardware_concurrency());
}

// Converts a textual quota/period pair, as read from the cgroup CPU files, into a whole
// number of CPUs (rounded up). Returns 0 when the pair does not describe a usable limit:
// unparsable text, a non-positive quota (cgroup v1 reports -1 for "unlimited"), or a
// non-positive period. The result is clamped to the uint16_t range instead of wrapping.
static uint16_t cpuCountFromQuota(const std::string& quotaText, const std::string& periodText) {
    try {
        // Parse as signed so that "-1" is recognised as "no limit" rather than wrapping to a huge value.
        const long long quota = std::stoll(quotaText);
        const long long period = std::stoll(periodText);
        if (quota <= 0 || period <= 0) {
            return 0;
        }
        const unsigned long long q = static_cast<unsigned long long>(quota);
        const unsigned long long p = static_cast<unsigned long long>(period);
        // Ceiling division written so that it cannot overflow.
        const unsigned long long cpus = q / p + ((q % p != 0) ? 1ULL : 0ULL);
        const unsigned long long ceiling = std::numeric_limits<uint16_t>::max();
        return static_cast<uint16_t>(std::min(cpus, ceiling));
    } catch (const std::exception&) {
        // Parsing failed: treat as "no usable quota"
        return 0;
    }
}

// Reads the CPU quota imposed through cgroups and returns it as a whole number of CPUs
// (rounded up), or 0 when no quota is set or none could be determined.
// cgroup v2 (/sys/fs/cgroup/cpu.max) is consulted first; when it is absent or holds
// unusable values, the cgroup v1 cfs quota/period files are tried.
uint16_t getDockerCpuQuota() {
    // Try cgroup v2 cpu.max (format: "quota period")
    std::ifstream cpuMaxV2("/sys/fs/cgroup/cpu.max");
    std::string line;
    if (cpuMaxV2.is_open() && std::getline(cpuMaxV2, line)) {
        std::istringstream fields(line);
        std::string quotaText;
        std::string periodText;
        if (fields >> quotaText >> periodText) {
            if (quotaText == "max") {
                return 0;  // No quota set
            }
            const uint16_t cpuCount = cpuCountFromQuota(quotaText, periodText);
            if (cpuCount > 0) {
                return cpuCount;
            }
            // Unusable values, continue with cgroup v1
        }
    }

    // Try cgroup v1 cpu.cfs_quota_us and cpu.cfs_period_us
    std::ifstream quotaFile("/sys/fs/cgroup/cpu/cpu.cfs_quota_us");
    std::ifstream periodFile("/sys/fs/cgroup/cpu/cpu.cfs_period_us");
    std::string quotaText;
    std::string periodText;
    if (quotaFile.is_open() && periodFile.is_open() &&
        std::getline(quotaFile, quotaText) && std::getline(periodFile, periodText)) {
        // std::stoll skips leading whitespace and stops at trailing whitespace, so no trimming is needed.
        return cpuCountFromQuota(quotaText, periodText);
    }

    return 0;  // No quota set
}

#endif // __linux__

} // namespace ovms
14 changes: 14 additions & 0 deletions src/systeminfo.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,4 +22,18 @@ namespace ovms {
* @return uint16_t Available number of cores in the system
*/
uint16_t getCoreCount();
/**
 * @brief Get the limit on the number of open files for this process
 * @return uint64_t Current (soft) RLIMIT_NOFILE value on Linux; maximum uint64_t value on other platforms or when the limit cannot be read
 */
uint64_t getMaxOpenFilesLimit();
#ifdef __linux__
/**
 * @brief Check whether the process appears to be running inside a Docker container
 * @return bool True if /.dockerenv can be opened or /proc/self/cgroup contains "docker", false otherwise
 */
bool isRunningInDocker();
/**
 * @brief Get number of CPUs available via CPU affinity mask
 * @return uint16_t Number of CPUs in the affinity mask, or total hardware concurrency if the affinity mask cannot be queried
 */
uint16_t getCpuAffinityCount();
/**
 * @brief Get CPU limit from cgroup (docker run --cpus constraint)
 * @return uint16_t Number of CPUs allowed by quota (rounded up to a whole CPU), or 0 if no quota is set
 */
uint16_t getDockerCpuQuota();
#endif
} // namespace ovms
Loading