Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions codecarbon/core/resource_tracker.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ def set_RAM_tracking(self):
self.ram_tracker = "RAM power estimation model"
ram = RAM(
tracking_mode=self.tracker._tracking_mode,
tracking_pids=self.tracker._tracking_pids,
force_ram_power=self.tracker._force_ram_power,
)
self.tracker._conf["ram_total_size"] = ram.machine_memory_GB
Expand All @@ -46,6 +47,7 @@ def _setup_cpu_load_mode(self, tdp, max_power):
model,
max_power,
tracking_mode=self.tracker._tracking_mode,
tracking_pids=self.tracker._tracking_pids,
)
self.cpu_tracker = MODE_CPU_LOAD
self.tracker._conf["cpu_model"] = hardware_cpu.get_model()
Expand Down Expand Up @@ -141,6 +143,7 @@ def _setup_fallback_tracking(self, tdp, max_power):
model,
max_power,
tracking_mode=self.tracker._tracking_mode,
tracking_pids=self.tracker._tracking_pids,
)
self.cpu_tracker = MODE_CPU_LOAD
else:
Expand All @@ -163,6 +166,7 @@ def _setup_fallback_tracking(self, tdp, max_power):
model,
max_power,
tracking_mode=self.tracker._tracking_mode,
tracking_pids=self.tracker._tracking_pids,
)
self.cpu_tracker = MODE_CPU_LOAD
else:
Expand Down
26 changes: 25 additions & 1 deletion codecarbon/emissions_tracker.py
Original file line number Diff line number Diff line change
Expand Up @@ -176,6 +176,7 @@ def __init__(
str
] = _sentinel, # Deprecated, use electricitymaps_api_token
tracking_mode: Optional[str] = _sentinel,
tracking_pids: Optional[List[int]] = _sentinel,
log_level: Optional[Union[int, str]] = _sentinel,
on_csv_write: Optional[str] = _sentinel,
logger_preamble: Optional[str] = _sentinel,
Expand Down Expand Up @@ -231,6 +232,8 @@ def __init__(
power consumption due to the entire machine or to try and
isolate the tracked processe's in isolation.
Defaults to "machine".
:param tracking_pids: PID of the process to be tracked when using "process" mode.
Defaults to None, which means the current process.
:param log_level: Global codecarbon log level. Accepts one of:
{"debug", "info", "warning", "error", "critical"}.
Defaults to "info".
Expand Down Expand Up @@ -314,6 +317,7 @@ def __init__(
self._set_from_conf(prometheus_url, "prometheus_url", "localhost:9091")
self._set_from_conf(output_handlers, "output_handlers", [])
self._set_from_conf(tracking_mode, "tracking_mode", "machine")
self._set_from_conf(tracking_pids, "tracking_pids", None, int)
self._set_from_conf(on_csv_write, "on_csv_write", "append")
self._set_from_conf(logger_preamble, "logger_preamble", "")
self._set_from_conf(force_cpu_power, "force_cpu_power", None, float)
Expand Down Expand Up @@ -383,6 +387,12 @@ def __init__(
else:
logger.info(f" GPU model: {self._conf.get('gpu_model')}")

if self._tracking_mode == "process":
logger.info(" Tracking mode: process")
logger.info(" Tracked PIDs: " + str(self._tracking_pids))
else:
logger.info(" Tracking mode: machine")

# Run `self._measure_power_and_energy` every `measure_power_secs` seconds in a
# background thread
self._scheduler = PeriodicScheduler(
Expand Down Expand Up @@ -447,7 +457,12 @@ def _init_output_methods(self, *, api_key: str = None):
self.run_id = uuid.uuid4()

if self._save_to_prometheus:
self._output_handlers.append(PrometheusOutput(self._prometheus_url))
self._output_handlers.append(
PrometheusOutput(
self._prometheus_url,
jobname=self._project_name + "_" + self._experiment_name,
)
)

if self._save_to_logfire:
self._output_handlers.append(LogfireOutput())
Expand Down Expand Up @@ -686,6 +701,10 @@ def stop(self) -> Optional[float]:

self.final_emissions_data = emissions_data
self.final_emissions = emissions_data.emissions

for handler in self._output_handlers:
handler.exit()

return emissions_data.emissions

def _persist_data(
Expand Down Expand Up @@ -1163,6 +1182,7 @@ def track_emissions(
str
] = _sentinel, # Deprecated, use electricitymaps_api_token
tracking_mode: Optional[str] = _sentinel,
tracking_pids: Optional[List[int]] = _sentinel,
log_level: Optional[Union[int, str]] = _sentinel,
on_csv_write: Optional[str] = _sentinel,
logger_preamble: Optional[str] = _sentinel,
Expand Down Expand Up @@ -1223,6 +1243,8 @@ def track_emissions(
power consumption due to the entire machine or to try and
isolate the tracked processe's in isolation.
Defaults to "machine".
:param tracking_pids: PID of the process to be tracked when using "process" mode.
Defaults to None, which means the current process.
:param log_level: Global codecarbon log level. Accepts one of:
{"debug", "info", "warning", "error", "critical"}.
Defaults to "info".
Expand Down Expand Up @@ -1297,6 +1319,7 @@ def wrapped_fn(*args, **kwargs):
gpu_ids=gpu_ids,
electricitymaps_api_token=_electricitymaps_token,
tracking_mode=tracking_mode,
tracking_pids=tracking_pids,
log_level=log_level,
on_csv_write=on_csv_write,
logger_preamble=logger_preamble,
Expand Down Expand Up @@ -1336,6 +1359,7 @@ def wrapped_fn(*args, **kwargs):
experiment_name=experiment_name,
electricitymaps_api_token=_electricitymaps_token,
tracking_mode=tracking_mode,
tracking_pids=tracking_pids,
log_level=log_level,
on_csv_write=on_csv_write,
logger_preamble=logger_preamble,
Expand Down
51 changes: 47 additions & 4 deletions codecarbon/external/hardware.py
Original file line number Diff line number Diff line change
Expand Up @@ -168,6 +168,7 @@ def __init__(
tdp: int,
rapl_dir: str = "/sys/class/powercap/intel-rapl/subsystem",
tracking_mode: str = "machine",
tracking_pids: int = None,
rapl_include_dram: bool = False,
rapl_prefer_psys: bool = False,
):
Expand All @@ -179,9 +180,17 @@ def __init__(
self._tdp = tdp
self._is_generic_tdp = False
self._tracking_mode = tracking_mode
self._pid = psutil.Process().pid
self._tracking_pids = tracking_pids
self._cpu_count = count_cpus()
self._process = psutil.Process(self._pid)

if tracking_pids is not None:
# Make list if it is not already a list
if not isinstance(tracking_pids, list):
self._tracking_pids = [tracking_pids]
else:
self._tracking_pids = tracking_pids
else:
self._tracking_pids = [psutil.Process().pid]

if self._mode == "intel_power_gadget":
self._intel_interface = IntelPowerGadget(self._output_dir)
Expand Down Expand Up @@ -246,11 +255,43 @@ def _get_power_from_cpu_load(self):
)
elif self._tracking_mode == "process":

cpu_load = self._process.cpu_percent(interval=0.5) / self._cpu_count
cpu_load = 0

for pid in self._tracking_pids:
if not psutil.pid_exists(pid):
# Log a warning and continue
logger.warning(f"Process with pid {pid} does not exist anymore.")
continue
self._process = psutil.Process(pid)
cpu_load += self._process.cpu_percent(interval=0.5)

try:
children = self._process.children(recursive=True)
for child in children:
try:
# Use interval=0.0 for children to avoid blocking
child_cpu = child.cpu_percent(interval=0.0)
logger.info(f"Child {child.pid} CPU: {child_cpu}")
cpu_load += child_cpu
except (
psutil.NoSuchProcess,
psutil.AccessDenied,
psutil.ZombieProcess,
):
# Child process may have terminated or we don't have access
continue
except (psutil.NoSuchProcess, psutil.AccessDenied):
# Main process terminated or access denied
pass

# Normalize by CPU count
logger.info(f"Total CPU load (all processes): {cpu_load}")
cpu_load = cpu_load / self._cpu_count
power = self._tdp * cpu_load / 100
logger.debug(
f"CPU load {self._tdp} W and {cpu_load * 100:.1f}% => estimation of {power} W for process {self._pid}."
f"CPU load {self._tdp} W and {cpu_load * 100:.1f}% => estimation of {power} W for processes {self._tracking_pids} (including children)."
)

else:
raise Exception(f"Unknown tracking_mode {self._tracking_mode}")
return Power.from_watts(power)
Expand Down Expand Up @@ -337,6 +378,7 @@ def from_utils(
model: Optional[str] = None,
tdp: Optional[int] = None,
tracking_mode: str = "machine",
tracking_pids: int = None,
rapl_include_dram: bool = False,
rapl_prefer_psys: bool = False,
) -> "CPU":
Expand Down Expand Up @@ -364,6 +406,7 @@ def from_utils(
model=model,
tdp=tdp,
tracking_mode=tracking_mode,
tracking_pids=tracking_pids,
rapl_include_dram=rapl_include_dram,
rapl_prefer_psys=rapl_prefer_psys,
)
Expand Down
56 changes: 43 additions & 13 deletions codecarbon/external/ram.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
import math
import re
import subprocess
import traceback
from dataclasses import dataclass
from typing import Optional
from typing import List, Optional

import psutil

Expand Down Expand Up @@ -34,9 +35,9 @@ class RAM(BaseHardware):

def __init__(
self,
pid: int = psutil.Process().pid,
children: bool = True,
tracking_mode: str = "machine",
tracking_pids: Optional[List[int]] = None,
force_ram_power: Optional[int] = None,
):
"""
Expand All @@ -45,19 +46,24 @@ def __init__(
is True.

Args:
pid (int, optional): Process id (with respect to which we'll look for
children). Defaults to psutil.Process().pid.
children (int, optional): Look for children of the process when computing
total RAM used. Defaults to True.
tracking_mode (str, optional): Whether to track "machine" or "process" RAM.
Defaults to "machine".
tracking_pids ([int], optional): Process id to track RAM usage for "process"
tracking_mode. Defaults to None.
force_ram_power (int, optional): User-provided RAM power in watts. If provided,
this value is used instead of estimating RAM power.
Defaults to None.
"""
self._pid = pid
self._children = children
self._tracking_mode = tracking_mode

if tracking_mode == "process" and tracking_pids is None:
self._tracking_pids = [psutil.Process().pid]
else:
self._tracking_pids = tracking_pids

self._force_ram_power = force_ram_power
# Check if using ARM architecture
self.is_arm_cpu = self._detect_arm_cpu()
Expand Down Expand Up @@ -192,16 +198,21 @@ def _calculate_ram_power(self, memory_gb: float) -> float:
# Apply minimum power constraint
return max(min_power, total_power)

def _get_children_memories(self):
def _get_children_memories(self, pid: int):
"""
Compute the used RAM by the process's children

Returns:
list(int): The list of RAM values
"""
current_process = psutil.Process(self._pid)
memorie_consumption = dict()
current_process = psutil.Process(pid)

children = current_process.children(recursive=True)
return [child.memory_info().rss for child in children]
for child in children:
memorie_consumption[child.pid] = child.memory_info().rss

return memorie_consumption

def _read_slurm_scontrol(self):
try:
Expand Down Expand Up @@ -285,17 +296,35 @@ def slurm_memory_GB(self):
return mem

@property
def process_memory_GB(self):
def process_memory_GB(self) -> float:
"""
Property to compute the process's total memory usage in bytes.

Returns:
float: RAM usage (GB)
"""
children_memories = self._get_children_memories() if self._children else []
main_memory = psutil.Process(self._pid).memory_info().rss
memories = children_memories + [main_memory]
return sum([m for m in memories if m] + [0]) / B_TO_GB

# Store memory usage in dict to avoid double counting
total_memory = dict()

for pid in self._tracking_pids:
if not psutil.pid_exists(pid):
logger.warning(f"Process with pid {pid} does not exist anymore.")
continue

# Own memory
total_memory[pid] = psutil.Process(pid).memory_info().rss

# Children's memory
children_memories = self._get_children_memories(pid)
for child_pid, mem in children_memories.items():
total_memory[child_pid] = mem

# Reduce to total memory
total_memory = sum(total_memory.values())
logger.debug(f"Process total memory usage: {total_memory / B_TO_GB:.2f} GB")

return total_memory / B_TO_GB

@property
def machine_memory_GB(self):
Expand Down Expand Up @@ -338,6 +367,7 @@ def total_power(self) -> Power:
)
except Exception as e:
logger.warning(f"Could not measure RAM Power ({str(e)})")
logger.warning(traceback.format_exc())
ram_power = Power.from_watts(0)

return ram_power
3 changes: 3 additions & 0 deletions codecarbon/output_methods/base_output.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,3 +22,6 @@ def live_out(self, total: EmissionsData, delta: EmissionsData):

def task_out(self, data: List[TaskEmissionsData], experiment_name: str):
pass

def exit(self):
pass
13 changes: 9 additions & 4 deletions codecarbon/output_methods/metrics/metric_docs.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,17 +50,22 @@ class MetricDocumentation:
)
cpu_energy_doc = MetricDocumentation(
"codecarbon_cpu_energy",
description="Energy used per CPU (kWh)",
description="Energy used per CPU since last reading (kWh)",
)
gpu_energy_doc = MetricDocumentation(
"codecarbon_gpu_energy",
description="Energy used per GPU (kWh)",
description="Energy used per GPU since last reading (kWh)",
)
ram_energy_doc = MetricDocumentation(
"codecarbon_ram_energy",
description="Energy used per RAM (kWh)",
description="Energy used per RAM since last reading (kWh)",
)
energy_consumed_doc = MetricDocumentation(
"codecarbon_energy_consumed",
description="Sum of cpu_energy, gpu_energy and ram_energy (kW)",
description="Sum of cpu_energy, gpu_energy and ram_energy (kWh)",
)

energy_consumed_total_doc = MetricDocumentation(
"codecarbon_energy_total",
description="Accumulated cpu_energy, gpu_energy and ram_energy (kWh)",
)
Loading
Loading