Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions datadog_checks_base/changelog.d/22975.added
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Add support for a timeout parameter to use with process_isolation.
95 changes: 64 additions & 31 deletions datadog_checks_base/datadog_checks/base/utils/replay/execute.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@
import os
import subprocess
import sys
import threading
from contextlib import contextmanager, nullcontext

from datadog_checks.base.utils.common import ensure_bytes, to_native_string
from datadog_checks.base.utils.format import json
Expand All @@ -12,6 +14,16 @@
from .constants import KNOWN_DATADOG_AGENT_SETTER_METHODS, EnvVars


@contextmanager
def _timer(timeout, callback):
timer = threading.Timer(timeout, callback)
timer.start()
try:
yield
finally:
timer.cancel()


def run_with_isolation(check, aggregator, datadog_agent):
message_indicator = os.urandom(8).hex()
instance = dict(check.instance)
Expand All @@ -21,6 +33,10 @@ def run_with_isolation(check, aggregator, datadog_agent):
instance.pop('process_isolation', None)
init_config.pop('process_isolation', None)

timeout = instance.pop('process_isolation_timeout', init_config.pop('process_isolation_timeout', None))
if timeout is not None:
timeout = float(timeout)

env_vars = dict(os.environ)
env_vars[EnvVars.MESSAGE_INDICATOR] = message_indicator
env_vars[EnvVars.CHECK_NAME] = check.name
Expand All @@ -47,37 +63,54 @@ def run_with_isolation(check, aggregator, datadog_agent):
stderr=subprocess.STDOUT,
env=env_vars,
)
timed_out = False

def _kill_on_timeout():
nonlocal timed_out
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What's stopping us from having this as an instance variable?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

run_with_isolation is a standalone method; we could set it on the check itself since we pass it to run_with_isolation but then we'd have the timed_out variable on the check class when the majority of the time we don't even run with process isolation. I don't have a strong opinion either way, though

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can be done separately, but wdyt of having a process_isolation_config or process_isolation_state on the check class? We'd define some dataclass with timed_out as a field, but mb others later.

That way we acknowledge the theoretical need for this state without making claims about it being a specific value?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hm that would be cleaner! I will think about it a little more and follow up in a separate PR

timed_out = True
# currently this only kills the parent process; does not support checks that spawn sub-processes
process.kill()
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P1 Badge Terminate the entire isolated process tree on timeout

process.kill() only stops the wrapper Python process here. Checks can legitimately spawn child commands (for example process/datadog_checks/process/process.py:244-246 and tibco_ems/datadog_checks/tibco_ems/tibco_ems.py:81-88), and with process_isolation_timeout enabled those descendants will keep running after the wrapper is killed because this subprocess was not started in its own session/process group. In those integrations, repeated timeouts would leak orphaned commands and the check is not actually "stopped" as advertised.

Useful? React with 👍 / 👎.

Copy link
Copy Markdown
Contributor Author

@sarah-witt sarah-witt Mar 19, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is a good point however the process_isolation feature is still experimental and the timeout in particular only will be used on vSphere, which doesn't spawn processes. I added a TODO, but reviewers let me know if I should address!

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

imho TODOs really should reflect things we plan to do later. This sounds more like something we don't know we want yet. So I'd rather we have a comment that says something about that.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Mhm ok good point, I'll make a note to that extent


timer = _timer(timeout, _kill_on_timeout) if timeout is not None else nullcontext()

with process:
check.log.info('Running check in a separate process')
with timer:
# To avoid blocking never use a pipe's file descriptor iterator. See https://bugs.python.org/issue3907
for line in iter(process.stdout.readline, b''):
line = line.rstrip().decode('utf-8')
indicator, _, procedure = line.partition(':')
if indicator != message_indicator:
check.log.debug(line)
continue

check.log.trace(line)

message_type, _, message = procedure.partition(':')
message = json.decode(message)
if message_type == 'aggregator':
getattr(aggregator, message['method'])(check, *message['args'], **message['kwargs'])
elif message_type == 'log':
getattr(check.log, message['method'])(*message['args'])
elif message_type == 'datadog_agent':
method = message['method']
value = getattr(datadog_agent, method)(*message['args'], **message['kwargs'])
if method not in KNOWN_DATADOG_AGENT_SETTER_METHODS:
try:
process.stdin.write(b'%s\n' % ensure_bytes(json.encode({'value': value})))
process.stdin.flush()
except BrokenPipeError:
break
elif message_type == 'error':
check.log.error(message[0]['traceback'])
break
else:
check.log.error(
'Unknown message type encountered during communication with the isolated process: %s',
message_type,
)
break

# To avoid blocking never use a pipe's file descriptor iterator. See https://bugs.python.org/issue3907
for line in iter(process.stdout.readline, b''):
line = line.rstrip().decode('utf-8')
indicator, _, procedure = line.partition(':')
if indicator != message_indicator:
check.log.debug(line)
continue

check.log.trace(line)

message_type, _, message = procedure.partition(':')
message = json.decode(message)
if message_type == 'aggregator':
getattr(aggregator, message['method'])(check, *message['args'], **message['kwargs'])
elif message_type == 'log':
getattr(check.log, message['method'])(*message['args'])
elif message_type == 'datadog_agent':
method = message['method']
value = getattr(datadog_agent, method)(*message['args'], **message['kwargs'])
if method not in KNOWN_DATADOG_AGENT_SETTER_METHODS:
process.stdin.write(b'%s\n' % ensure_bytes(json.encode({'value': value})))
process.stdin.flush()
elif message_type == 'error':
check.log.error(message[0]['traceback'])
break
else:
check.log.error(
'Unknown message type encountered during communication with the isolated process: %s',
message_type,
)
break
if timed_out:
check.log.error('Check timed out after %s seconds', timeout)
check.warning('Check timed out and possibly reported incomplete data.')
19 changes: 19 additions & 0 deletions datadog_checks_base/tests/base/utils/replay/test_check.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,3 +62,22 @@ def test_replay_all(caplog, dd_run_check, aggregator, datadog_agent, init_config
break
else:
raise AssertionError('Expected DEBUG log with message: {}'.format(expected_message))


class SlowReplayCheck(AgentCheck):
    """Check whose run sleeps far longer than any sane timeout.

    Used to exercise the `process_isolation_timeout` kill path: the isolated
    process must be terminated long before this sleep completes.
    """

    __NAMESPACE__ = 'slow_replay'

    def check(self, _):
        # Imported locally so the module itself stays import-side-effect free.
        from time import sleep

        sleep(9999)


def test_replay_timeout(caplog, dd_run_check):
    """A check exceeding `process_isolation_timeout` is killed and the timeout is logged as an error."""
    check = SlowReplayCheck(
        'slow_replay',
        {},
        [{'process_isolation': True, 'process_isolation_timeout': 1}],
    )

    with caplog.at_level(logging.ERROR):
        dd_run_check(check)

    assert 'Check timed out' in caplog.text
Loading