openvinotoolkit · dtrawins · Apr 1, 2026 · Apr 2, 2026 · Apr 2, 2026 · Apr 2, 2026
diff --git a/src/BUILD b/src/BUILD
@@ -348,6 +348,7 @@ ovms_cc_library(
         "libovms_cliparser",
         "libovms_systeminfo",
         "ovms_exit_codes",
+        "//src/utils:env_guard",
     ],
     visibility = ["//visibility:public",],
     additional_copts = COPTS_DROGON,

diff --git a/src/config.cpp b/src/config.cpp
@@ -14,7 +14,8 @@
 // limitations under the License.
 //*****************************************************************************
 #include "config.hpp"
-
+#include <algorithm>
+#include <atomic>
 #include <filesystem>
 #include <limits>
 #include <regex>
@@ -36,19 +37,37 @@
 #include "modelconfig.hpp"
 #include "stringutils.hpp"
 #include "systeminfo.hpp"
+#include "utils/env_guard.hpp"
 
 namespace ovms {
 
 const uint32_t AVAILABLE_CORES = getCoreCount();
 const uint32_t WIN_MAX_GRPC_WORKERS = 1;
 const uint32_t MAX_PORT_NUMBER = std::numeric_limits<uint16_t>::max();
 
-// For drogon, we need to minimize the number of default workers since this value is set for both: unary and streaming (making it always double)
-const uint64_t DEFAULT_REST_WORKERS = AVAILABLE_CORES;
 const uint32_t DEFAULT_GRPC_MAX_THREADS = AVAILABLE_CORES * 8.0;
 const size_t DEFAULT_GRPC_MEMORY_QUOTA = (size_t)2 * 1024 * 1024 * 1024;  // 2GB
 const uint64_t MAX_REST_WORKERS = 10'000;
 
+// Keep the default number of drogon workers low: this value is applied to both unary and streaming handling, which effectively doubles it.
+// On Linux, the default is additionally capped by the maximum allowed number of open files.
+#ifdef __linux__
+
+const uint64_t RESERVED_OPEN_FILES = 15;        // we need to reserve some file descriptors for other operations, so we don't want to use all of them for drogon workers
+const uint64_t OPEN_FILES_PER_REST_WORKER = 7;  // 5x rest_workers to initialize ovms and 2x rest_workers for new connections
+uint64_t getDefaultRestWorkers() {
+    const uint64_t openFilesCap = getMaxOpenFilesLimit();
+    // Descriptors left for REST workers once the reserve is set aside (none when the limit does not exceed the reserve).
+    const uint64_t usableFiles = (openFilesCap > RESERVED_OPEN_FILES) ? openFilesCap - RESERVED_OPEN_FILES : 0;
+    // One worker per detected core, unless the descriptor budget allows fewer.
+    return std::min<uint64_t>(AVAILABLE_CORES, usableFiles / OPEN_FILES_PER_REST_WORKER);
+}
+#else
+uint64_t getDefaultRestWorkers() {
+    return static_cast<uint64_t>(AVAILABLE_CORES);  // no open-files based cap outside Linux: one worker per detected core
+}
+#endif
+
 Config& Config::parse(int argc, char** argv) {
     ovms::CLIParser parser;
     ovms::ServerSettingsImpl serverSettings;
@@ -73,6 +92,14 @@ Config& Config::parse(int argc, char** argv) {
 bool Config::parse(ServerSettingsImpl* serverSettings, ModelsSettingsImpl* modelsSettings) {
     this->serverSettings = *serverSettings;
     this->modelsSettings = *modelsSettings;
+
+    static EnvGuard envGuard;  // function-local static: a single guard instance is shared by every parse() call; NOTE(review): presumably restores the variable when destroyed - confirm in utils/env_guard.hpp
+#if defined(__linux__) || defined(_WIN32)
+    if (this->serverSettings.logLevel == "DEBUG") {  // only the exact "DEBUG" log level triggers the override
+        envGuard.set("OPENVINO_LOG_LEVEL", "4");  // sets the variable through the guard before validation; "4" is assumed to be OpenVINO's debug verbosity - TODO confirm
+    }
+#endif
+
     return validate();
 }
 
@@ -297,7 +324,8 @@ bool Config::validate() {
     }
 
     // check rest_workers value
-    if (((restWorkers() > MAX_REST_WORKERS) || (restWorkers() < 2))) {
+    const uint32_t restWorkersValue = restWorkers();  // Cache to avoid multiple calls
+    if (((restWorkersValue > MAX_REST_WORKERS) || (restWorkersValue < 2))) {
         std::cerr << "rest_workers count should be from 2 to " << MAX_REST_WORKERS << std::endl;
         return false;
     }
@@ -306,6 +334,12 @@ bool Config::validate() {
         std::cerr << "rest_workers is set but rest_port is not set. rest_port is required to start rest servers" << std::endl;
         return false;
     }
+#ifdef __linux__
+    if (restWorkersValue > (getMaxOpenFilesLimit() - RESERVED_OPEN_FILES) / 6) {
+        std::cerr << "rest_workers count cannot be larger than " << (getMaxOpenFilesLimit() - RESERVED_OPEN_FILES) / 6 << " due to open files limit. Current open files limit: " << getMaxOpenFilesLimit() << std::endl;
+        return false;
+    }
+#endif
 
 #ifdef _WIN32
     if (grpcWorkers() > WIN_MAX_GRPC_WORKERS) {
@@ -368,7 +402,7 @@ const std::string Config::restBindAddress() const { return this->serverSettings.
 uint32_t Config::grpcWorkers() const { return this->serverSettings.grpcWorkers; }
 uint32_t Config::grpcMaxThreads() const { return this->serverSettings.grpcMaxThreads.value_or(DEFAULT_GRPC_MAX_THREADS); }
 size_t Config::grpcMemoryQuota() const { return this->serverSettings.grpcMemoryQuota.value_or(DEFAULT_GRPC_MEMORY_QUOTA); }
-uint32_t Config::restWorkers() const { return this->serverSettings.restWorkers.value_or(DEFAULT_REST_WORKERS); }
+uint32_t Config::restWorkers() const { return static_cast<uint32_t>(this->serverSettings.restWorkers.value_or(std::max<uint64_t>(2, getDefaultRestWorkers()))); }  // the floor of 2 applies to the computed default only; an explicitly configured value is returned as given so validate() can still reject values below 2
 const std::string& Config::modelName() const { return this->modelsSettings.modelName; }
 const std::string& Config::modelPath() const { return this->modelsSettings.modelPath; }
 const std::string& Config::batchSize() const {

diff --git a/src/llm/BUILD b/src/llm/BUILD
@@ -283,6 +283,8 @@ ovms_cc_library(
         "//src:httppayload",
         "//src:libhttpclientconnection",
         "//src:sse_utils",
+        "//src:libovms_systeminfo",
+        "//src:libovms_config",
         "//third_party:genai",] + select({
         "//:disable_python": [],
         "//:not_disable_python" : [":py_jinja_template_processor"],

diff --git a/src/llm/language_model/continuous_batching/servable_initializer.cpp b/src/llm/language_model/continuous_batching/servable_initializer.cpp
@@ -13,6 +13,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 //*****************************************************************************
+#include <algorithm>
 #include <fstream>
 #include <memory>
 #include <stdexcept>
@@ -32,10 +33,12 @@
 #pragma GCC diagnostic pop
 #pragma warning(pop)
 
+#include "../../../config.hpp"
 #include "../../../json_parser.hpp"
 #include "../../../logging.hpp"
 #include "../../../mediapipe_internal/mediapipe_utils.hpp"
 #include "../../../status.hpp"
+#include "../../../systeminfo.hpp"
 #include "llm_executor.hpp"
 #include "servable.hpp"
 #include "servable_initializer.hpp"
@@ -204,7 +207,10 @@ Status ContinuousBatchingServableInitializer::initialize(std::shared_ptr<GenAiSe
         return status;
     }
 
-    properties->tokenizerPluginConfig = {{"PERFORMANCE_HINT", "THROUGHPUT"}};
+    const uint32_t numStreams = std::min(static_cast<uint32_t>(Config::instance().restWorkers()), static_cast<uint32_t>(getCoreCount()));
+    SPDLOG_DEBUG("Setting tokenizer/detokenizer NUM_STREAMS to: {}", numStreams);
+    properties->tokenizerPluginConfig = {{"NUM_STREAMS", static_cast<int>(numStreams)}, {"PERFORMANCE_HINT", "THROUGHPUT"}};
+
     try {
         properties->pipeline = std::make_shared<ov::genai::ContinuousBatchingPipeline>(parsedModelsPath,
             properties->schedulerConfig, properties->device,

diff --git a/src/modelmanager.cpp b/src/modelmanager.cpp
@@ -69,6 +69,7 @@
 #include "schema.hpp"
 #include "servable_definition.hpp"
 #include "stringutils.hpp"
+#include "systeminfo.hpp"
 
 namespace ovms {
 
@@ -79,7 +80,6 @@ const std::string DEFAULT_MODEL_CACHE_DIRECTORY = "c:\\Intel\\openvino_cache";
 const std::string DEFAULT_MODEL_CACHE_DIRECTORY = "/opt/cache";
 #endif
 ModelManager::ModelManager(const std::string& modelCacheDirectory, MetricRegistry* registry, PythonBackend* pythonBackend) :
-    ieCore(std::make_unique<ov::Core>()),
     pipelineFactory(std::make_unique<PipelineFactory>()),
 #if (MEDIAPIPE_DISABLE == 0)
     mediapipeFactory(std::make_unique<MediapipeFactory>(pythonBackend)),
@@ -89,6 +89,23 @@ ModelManager::ModelManager(const std::string& modelCacheDirectory, MetricRegistr
     modelCacheDirectory(modelCacheDirectory),
     metricRegistry(registry),
     pythonBackend(pythonBackend) {
+    try {
+        this->ieCore = std::make_unique<ov::Core>();
+        const uint16_t detectedCoreCount = getCoreCount();
+        SPDLOG_DEBUG("Setting CPU inference_num_threads to: {}", detectedCoreCount);
+        this->ieCore->set_property("CPU", ov::inference_num_threads(static_cast<int>(detectedCoreCount)));
+
+#ifdef __linux__
+        if (isRunningInDocker()) {
+            const bool cpuQuotaDefined = getDockerCpuQuota() > 0;
+            this->ieCore->set_property("CPU", ov::hint::enable_cpu_pinning(!cpuQuotaDefined));
+        }
+#endif
+    } catch (const std::exception& ex) {
+        SPDLOG_CRITICAL("Failed to initialize OpenVINO Core with CPU properties set from detected core count and Docker constraints. Reason: {}", ex.what());
+        throw;
+    }
+
     OV_LOGGER("ov::Core(): {}", reinterpret_cast<void*>(this->ieCore.get()));
     // Take --cache_dir from CLI
     if (this->modelCacheDirectory.empty()) {
@@ -151,6 +168,12 @@ ModelManager::ModelManager(const std::string& modelCacheDirectory, MetricRegistr
         throw;
     }
     this->logPluginConfiguration();
+#ifdef __linux__
+    if (isRunningInDocker()) {
+        SPDLOG_INFO("Running inside Docker container");
+        SPDLOG_INFO("cpu quota: {}, cpu affinity: {}, max_open_files: {}", getDockerCpuQuota(), getCpuAffinityCount(), getMaxOpenFilesLimit());
+    }
+#endif
 }
 
 void ModelManager::logPluginConfiguration() {

diff --git a/src/systeminfo.cpp b/src/systeminfo.cpp
@@ -15,16 +15,135 @@
 //*****************************************************************************
 #include "systeminfo.hpp"
 
+#include <algorithm>
 #include <fstream>
+#include <limits>
 #include <sstream>
 #include <string>
 #include <thread>
 
+#ifdef __linux__
+#include <sched.h>
+#include <sys/resource.h>
+#endif
+
 #include "logging.hpp"
 #include "status.hpp"
 
 namespace ovms {
 uint16_t getCoreCount() {
-    return std::thread::hardware_concurrency();
+#ifdef __linux__
+    // Inside a Docker container the host-wide thread count is not used:
+    // the figure comes from the CPU affinity mask and, when a CPU quota is
+    // configured (non-zero), from whichever of the two is smaller.
+    if (isRunningInDocker()) {
+        const uint16_t pinnedCores = getCpuAffinityCount();
+        const uint16_t quotaCores = getDockerCpuQuota();
+        const bool quotaConfigured = quotaCores > 0;
+        return quotaConfigured ? std::min(pinnedCores, quotaCores) : pinnedCores;
+    }
+#endif
+    // Everywhere else report what the standard library detects.
+    return static_cast<uint16_t>(std::thread::hardware_concurrency());
+}
+
+uint64_t getMaxOpenFilesLimit() {
+#ifdef __linux__
+    struct rlimit nofile;
+    const bool queried = (getrlimit(RLIMIT_NOFILE, &nofile) == 0);
+    return queried ? static_cast<uint64_t>(nofile.rlim_cur) : std::numeric_limits<uint64_t>::max();  // soft limit, or the maximum value when the query fails
+#else
+    return std::numeric_limits<uint64_t>::max();
+#endif
+}
+
+#ifdef __linux__
+
+bool isRunningInDocker() {
+    // First indicator: the /.dockerenv marker file can be opened.
+    if (std::ifstream("/.dockerenv").good()) {
+        return true;
+    }
+
+    // Second indicator: any line of /proc/self/cgroup mentions "docker".
+    std::ifstream cgroupInfo("/proc/self/cgroup");
+    if (!cgroupInfo.is_open()) {
+        return false;
+    }
+    for (std::string entry; std::getline(cgroupInfo, entry);) {
+        if (entry.find("docker") != std::string::npos) {
+            return true;
+        }
+    }
+
+    // Neither indicator present.
+    return false;
 }
+
+uint16_t getCpuAffinityCount() {
+    cpu_set_t allowedCpus;
+    CPU_ZERO(&allowedCpus);
+
+    const bool maskQueried = sched_getaffinity(0, sizeof(allowedCpus), &allowedCpus) != -1;
+    if (maskQueried) {
+        return static_cast<uint16_t>(CPU_COUNT(&allowedCpus));
+    }
+    // The affinity mask could not be read: fall back to the hardware thread count.
+    return std::thread::hardware_concurrency();
+}
+
+uint16_t getDockerCpuQuota() {
+    // Turns a CFS quota/period pair (given as text) into a CPU count, rounded up and
+    // saturated to the uint16_t range. Yields 0 ("no quota") when either value is
+    // unparsable or not positive - this covers the "-1" that cgroup v1 reports for an
+    // unlimited quota, which an unsigned parse would silently wrap to a huge number.
+    const auto toCpuCount = [](const std::string& quotaText, const std::string& periodText) -> uint16_t {
+        try {
+            const long long quota = std::stoll(quotaText);
+            const long long period = std::stoll(periodText);
+            if (quota <= 0 || period <= 0) {
+                return 0;
+            }
+            // Ceiling division written so that it cannot overflow (quota >= 1 here).
+            const uint64_t cpus = (static_cast<uint64_t>(quota) - 1) / static_cast<uint64_t>(period) + 1;
+            constexpr uint64_t maxCpus = std::numeric_limits<uint16_t>::max();
+            return static_cast<uint16_t>(std::min(cpus, maxCpus));
+        } catch (const std::exception&) {
+            return 0;  // Not a number, or out of range
+        }
+    };
+
+    // Try cgroup v2 cpu.max first (format of the first line: "quota period").
+    std::ifstream cpuMaxFile("/sys/fs/cgroup/cpu.max");
+    std::string line;
+    if (cpuMaxFile.is_open() && std::getline(cpuMaxFile, line)) {
+        std::istringstream fields(line);
+        std::string quotaText, periodText;
+        if (fields >> quotaText >> periodText) {
+            if (quotaText == "max") {
+                return 0;  // No quota set
+            }
+            const uint16_t cpus = toCpuCount(quotaText, periodText);
+            if (cpus > 0) {
+                return cpus;
+            }
+            // Unusable values: fall through to the cgroup v1 files.
+        }
+    }
+
+    // Then try cgroup v1, where quota and period are kept in separate files.
+    std::ifstream quotaFile("/sys/fs/cgroup/cpu/cpu.cfs_quota_us");
+    std::ifstream periodFile("/sys/fs/cgroup/cpu/cpu.cfs_period_us");
+    std::string quotaText, periodText;
+    const bool valuesRead = quotaFile.is_open() && periodFile.is_open() &&
+                            std::getline(quotaFile, quotaText) && std::getline(periodFile, periodText);
+    if (valuesRead) {
+        return toCpuCount(quotaText, periodText);  // stoll skips leading whitespace and stops at trailing characters, so no trimming is needed
+    }
+
+    return 0;  // No quota set
+}
+
+#endif  // __linux__
+
 }  // namespace ovms
diff --git a/src/systeminfo.hpp b/src/systeminfo.hpp
@@ -22,4 +22,18 @@ namespace ovms {
  * @return uint16_t Available number of cores in the system
  */
 uint16_t getCoreCount();
+uint64_t getMaxOpenFilesLimit();  // Linux: current RLIMIT_NOFILE value of the process; the maximum uint64_t value when it cannot be queried or on other platforms
+#ifdef __linux__
+bool isRunningInDocker();  // true when /.dockerenv can be opened or /proc/self/cgroup contains "docker"
+/**
+ * @brief Get number of CPUs available via CPU affinity mask
+ * @return uint16_t Number of CPUs in the affinity mask, or total hardware concurrency if the mask cannot be queried
+ */
+uint16_t getCpuAffinityCount();
+/**
+ * @brief Get CPU limit from cgroup (docker run --cpus constraint)
+ * @return uint16_t Number of CPUs allowed by quota (rounded up), or 0 if no quota is set
+ */
+uint16_t getDockerCpuQuota();
+#endif
 }  // namespace ovms