Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 22 additions & 0 deletions build/rpm/pcp.spec.in
Original file line number Diff line number Diff line change
Expand Up @@ -1641,6 +1641,21 @@ This package contains the PCP Performance Metrics Domain Agent (PMDA) for
collecting metrics from simple network checks.
# end pcp-pmda-netcheck

#
# pcp-pmda-nixl
#
%package pmda-nixl
License: GPL-2.0-or-later
Summary: Performance Co-Pilot (PCP) metrics for NVIDIA NIXL agents
URL: https://pcp.io
Requires: pcp = @package_version@ pcp-libs = @package_version@
Requires: python3-pcp
%description pmda-nixl
This package contains the PCP Performance Metrics Domain Agent (PMDA) for
collecting performance telemetry from NIXL (NVIDIA Inference Xfer
Library) agents via their shared-memory ring buffer interface.
# end pcp-pmda-nixl

#
# pcp-pmda-rocestat
#
Expand Down Expand Up @@ -2365,6 +2380,7 @@ basic_manifest | keep '(etc/pcp|pmdas)/rabbitmq(/|$)' >pcp-pmda-rabbitmq-files
basic_manifest | keep '(etc/pcp|pmdas)/rds(/|$)' >pcp-pmda-rds-files
basic_manifest | keep '(etc/pcp|pmdas)/redis(/|$)' >pcp-pmda-redis-files
basic_manifest | keep '(etc/pcp|pmdas)/resctrl(/|$)|sys-fs-resctrl' >pcp-pmda-resctrl-files
basic_manifest | keep '(etc/pcp|pmdas)/nixl(/|$)' >pcp-pmda-nixl-files
basic_manifest | keep '(etc/pcp|pmdas)/rocestat(/|$)' >pcp-pmda-rocestat-files
basic_manifest | keep '(etc/pcp|pmdas)/roomtemp(/|$)' >pcp-pmda-roomtemp-files
basic_manifest | keep '(etc/pcp|pmdas)/rsyslog(/|$)' >pcp-pmda-rsyslog-files
Expand Down Expand Up @@ -2399,6 +2415,7 @@ for pmda_package in \
libvirt lio lmsensors logger lustre lustrecomm \
mailq memcache mic mounts mongodb mssql mysql \
named netcheck netfilter news nfsclient nginx \
nixl \
nutcracker nvidia \
openmetrics opentelemetry openvswitch oracle \
pdns perfevent podman postfix postgresql \
Expand Down Expand Up @@ -2829,6 +2846,9 @@ done
%preun pmda-nfsclient
%{pmda_remove "$1" "nfsclient"}

%preun pmda-nixl
%{pmda_remove "$1" "nixl"}

%preun pmda-openvswitch
%{pmda_remove "$1" "openvswitch"}

Expand Down Expand Up @@ -3159,6 +3179,8 @@ fi

%files pmda-nfsclient -f pcp-pmda-nfsclient-files.rpm

%files pmda-nixl -f pcp-pmda-nixl-files.rpm

%files pmda-openvswitch -f pcp-pmda-openvswitch-files.rpm

%files pmda-rabbitmq -f pcp-pmda-rabbitmq-files.rpm
Expand Down
113 changes: 113 additions & 0 deletions qa/1999
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
#!/bin/sh
# PCP QA Test No. 1999
# Test pmdanixl - install, metric namespace, per-agent instances and values.
#
# Copyright (c) 2026 Red Hat.
#
# The test name is this script's basename; the banner below is the first
# line of the expected output in 1999.out.
seq=`basename $0`
echo "QA output created by $seq"

# Shared Python QA setup.  $python and _notrun are used below but are not
# defined in this script, so presumably they come from here - confirm.
. ./common.python

# Where the installed NIXL PMDA is expected to live.
# NOTE(review): pmda_script is not referenced anywhere else in this test.
pmda_path="$PCP_PMDAS_DIR/nixl"
pmda_script="$pmda_path/pmdanixl.python"

# Prerequisite checks: each one ends the test through _notrun instead of
# letting it fail later.
[ -d "$pmda_path" ] || _notrun "NIXL PMDA is not installed"

$python -c "from pcp import pmda" >/dev/null 2>&1
[ $? -eq 0 ] || _notrun "python pcp pmda module not installed"

$python -c "import mmap, ctypes" >/dev/null 2>&1
[ $? -eq 0 ] || _notrun "python mmap or ctypes module not available"

status=1 # failure is the default!

# Synthetic telemetry file handed to the generator script further down; its
# basename is the instance name that appears in 1999.out.
# NOTE(review): the path is under /var/tmp, while the original comment here
# said /tmp (the PMDA default).  Confirm that pmdanixl discovers files in
# this directory without a special env var having to reach pmcd.
agent_file="/var/tmp/pcp_qa_nixl_1999"

# Exit-time cleanup: return to the QA directory, hand the nixl PMDA back to
# _cleanup_pmda (defined outside this script), and delete the synthetic
# telemetry file and this test's temporary files.
# Globals read: here, agent_file, tmp, sudo
_cleanup()
{
    cd "$here"
    _cleanup_pmda nixl
    rm -f "$agent_file"
    # Never let an empty $tmp degrade into "rm -rf .*" in the QA directory.
    if [ -n "$tmp" ]
    then
        # $sudo stays unquoted so that an empty value expands to nothing.
        $sudo rm -rf "$tmp" "$tmp".*
    fi
}

# On normal exit (0) and on signals 1, 2, 3 and 15: run _cleanup, then exit
# with $status.  The \$ defers expansion until the trap fires, so the value
# of status at that moment is the one reported.
trap "_cleanup; exit \$status" 0 1 2 3 15

# Make pminfo output deterministic: replace the numeric domain 39 in PMID and
# InDom fields with the symbolic name NIXL, and mask hexadecimal values.
# Reads stdin, writes stdout.
_filter_nixl()
{
    # A single sed program, one substitution per line.
    sed 's/PMID: 39\./PMID: NIXL./g
         s/InDom: 39\./InDom: NIXL./g
         s/0x[0-9a-f]*/0xNNNNNNNN/g'
}

# Remove the NIXL PMDA by running its Remove script from $pmda_path.
# The section header goes to stdout and $seq_full; the script's own output
# goes to $seq_full only.
# Globals read: pmda_path, here, seq_full, sudo
# Returns: 1 without running anything if $pmda_path cannot be entered,
#          otherwise the status of the final cd back to $here.
pmdanixl_remove()
{
    # Bail out rather than run "$sudo ./Remove" from the wrong directory.
    cd "$pmda_path" || return 1
    echo
    echo "=== Removing NIXL agent ===" | tee -a "$seq_full"
    # $sudo stays unquoted so that an empty value expands to nothing.
    $sudo ./Remove >>"$seq_full" 2>&1
    cd "$here"
}

# (Re)install the NIXL PMDA: run Remove first, then Install with stdin taken
# from /dev/null, both from $pmda_path.
# The section header goes to stdout and $seq_full; the scripts' own output
# goes to $seq_full only.
# Globals read: pmda_path, here, seq_full, sudo
# Returns: 1 without running anything if $pmda_path cannot be entered,
#          otherwise the status of the final cd back to $here.
pmdanixl_install()
{
    # Bail out rather than run the scripts from the wrong directory.
    cd "$pmda_path" || return 1
    # $sudo stays unquoted so that an empty value expands to nothing.
    $sudo ./Remove >>"$seq_full" 2>&1
    echo
    echo "=== Installing NIXL agent ===" | tee -a "$seq_full"
    $sudo ./Install </dev/null >>"$seq_full" 2>&1
    cd "$here"
}

# Real QA test starts here
# Everything echoed from here on is compared with 1999.out, so the order and
# the text of the statements below matter.
# _prepare_pmda is defined outside this script; presumably it records the
# current PMDA state for _cleanup_pmda to restore later - confirm.
_prepare_pmda nixl

# Create the synthetic telemetry file with the helper script, then make it
# readable and writable by everyone (presumably so the PMDA can open it no
# matter which user it runs as - confirm).
$python "$here/src/nixl_telemetry.python" "$agent_file"
[ $? -eq 0 ] || _notrun "failed to create synthetic NIXL telemetry file"
chmod 666 "$agent_file"

pmdanixl_install

# Check that the agent shows up in pmcd.agent.status before querying its
# metrics; otherwise point at the full log and bail out.
if ! pminfo -f pmcd.agent.status 2>&1 | grep -q '"nixl"'
then
echo "Arrgh! PMDA install failed ... see $seq_full"
_exit 1
fi

# Metric names only, sorted with the POSIX collation so the order does not
# depend on the locale.
echo "=== Metric namespace ==="
pminfo nixl | LC_COLLATE=POSIX sort

# Full metadata and values for every nixl metric; _filter_nixl masks the
# domain number and hexadecimal values.
echo
echo "=== Metric types and semantics ==="
pminfo -dfmtT nixl 2>&1 | _filter_nixl

# The remaining sections fetch selected metrics by name and print the values
# for the synthetic agent instance.
echo
echo "=== Transfer metric values ==="
pminfo -f nixl.transfer.tx_bytes \
nixl.transfer.rx_bytes \
nixl.transfer.tx_requests \
nixl.transfer.rx_requests 2>&1

echo
echo "=== Memory metric values ==="
pminfo -f nixl.memory.registered \
nixl.memory.deregistered 2>&1

echo
echo "=== Performance metric values ==="
pminfo -f nixl.performance.xfer_time_us \
nixl.performance.xfer_post_time_us 2>&1

echo
echo "=== Error metric values ==="
pminfo -f nixl.errors.backend 2>&1

pmdanixl_remove

# success, all done
# The bare exit fires the exit trap, which runs _cleanup and exits with
# $status.
status=0
exit
183 changes: 183 additions & 0 deletions qa/1999.out
Original file line number Diff line number Diff line change
@@ -0,0 +1,183 @@
QA output created by 1999

=== Installing NIXL agent ===
=== Metric namespace ===
nixl.errors.backend
nixl.errors.canceled
nixl.errors.invalid_param
nixl.errors.mismatch
nixl.errors.no_telemetry
nixl.errors.not_allowed
nixl.errors.not_found
nixl.errors.not_posted
nixl.errors.not_supported
nixl.errors.remote_disconnect
nixl.errors.repost_active
nixl.errors.unknown
nixl.memory.deregistered
nixl.memory.registered
nixl.performance.xfer_post_time_us
nixl.performance.xfer_time_us
nixl.transfer.rx_bytes
nixl.transfer.rx_requests
nixl.transfer.tx_bytes
nixl.transfer.tx_requests

=== Metric types and semantics ===

nixl.errors.no_telemetry PMID: NIXL.3.11 []
Data Type: 64-bit unsigned int InDom: NIXL.0 0xNNNNNNNN
Semantics: counter Units: count
Help: <empty entry>
inst [0 or "pcp_qa_nixl_1999"] value 0

nixl.errors.canceled PMID: NIXL.3.10 []
Data Type: 64-bit unsigned int InDom: NIXL.0 0xNNNNNNNN
Semantics: counter Units: count
Help: <empty entry>
inst [0 or "pcp_qa_nixl_1999"] value 0

nixl.errors.remote_disconnect PMID: NIXL.3.9 []
Data Type: 64-bit unsigned int InDom: NIXL.0 0xNNNNNNNN
Semantics: counter Units: count
Help: <empty entry>
inst [0 or "pcp_qa_nixl_1999"] value 0

nixl.errors.not_supported PMID: NIXL.3.8 []
Data Type: 64-bit unsigned int InDom: NIXL.0 0xNNNNNNNN
Semantics: counter Units: count
Help: <empty entry>
inst [0 or "pcp_qa_nixl_1999"] value 0

nixl.errors.unknown PMID: NIXL.3.7 []
Data Type: 64-bit unsigned int InDom: NIXL.0 0xNNNNNNNN
Semantics: counter Units: count
Help: <empty entry>
inst [0 or "pcp_qa_nixl_1999"] value 0

nixl.errors.repost_active PMID: NIXL.3.6 []
Data Type: 64-bit unsigned int InDom: NIXL.0 0xNNNNNNNN
Semantics: counter Units: count
Help: <empty entry>
inst [0 or "pcp_qa_nixl_1999"] value 0

nixl.errors.not_allowed PMID: NIXL.3.5 []
Data Type: 64-bit unsigned int InDom: NIXL.0 0xNNNNNNNN
Semantics: counter Units: count
Help: <empty entry>
inst [0 or "pcp_qa_nixl_1999"] value 0

nixl.errors.mismatch PMID: NIXL.3.4 []
Data Type: 64-bit unsigned int InDom: NIXL.0 0xNNNNNNNN
Semantics: counter Units: count
Help: <empty entry>
inst [0 or "pcp_qa_nixl_1999"] value 0

nixl.errors.not_found PMID: NIXL.3.3 []
Data Type: 64-bit unsigned int InDom: NIXL.0 0xNNNNNNNN
Semantics: counter Units: count
Help: <empty entry>
inst [0 or "pcp_qa_nixl_1999"] value 0

nixl.errors.backend PMID: NIXL.3.2 []
Data Type: 64-bit unsigned int InDom: NIXL.0 0xNNNNNNNN
Semantics: counter Units: count
Help: <empty entry>
inst [0 or "pcp_qa_nixl_1999"] value 1

nixl.errors.invalid_param PMID: NIXL.3.1 []
Data Type: 64-bit unsigned int InDom: NIXL.0 0xNNNNNNNN
Semantics: counter Units: count
Help: <empty entry>
inst [0 or "pcp_qa_nixl_1999"] value 0

nixl.errors.not_posted PMID: NIXL.3.0 []
Data Type: 64-bit unsigned int InDom: NIXL.0 0xNNNNNNNN
Semantics: counter Units: count
Help: <empty entry>
inst [0 or "pcp_qa_nixl_1999"] value 0

nixl.performance.xfer_post_time_us PMID: NIXL.2.1 []
Data Type: 64-bit unsigned int InDom: NIXL.0 0xNNNNNNNN
Semantics: counter Units: microsec
Help: <empty entry>
inst [0 or "pcp_qa_nixl_1999"] value 1000

nixl.performance.xfer_time_us PMID: NIXL.2.0 []
Data Type: 64-bit unsigned int InDom: NIXL.0 0xNNNNNNNN
Semantics: counter Units: microsec
Help: <empty entry>
inst [0 or "pcp_qa_nixl_1999"] value 5000

nixl.memory.deregistered PMID: NIXL.1.1 []
Data Type: 64-bit unsigned int InDom: NIXL.0 0xNNNNNNNN
Semantics: counter Units: byte
Help: <empty entry>
inst [0 or "pcp_qa_nixl_1999"] value 0

nixl.memory.registered PMID: NIXL.1.0 []
Data Type: 64-bit unsigned int InDom: NIXL.0 0xNNNNNNNN
Semantics: counter Units: byte
Help: <empty entry>
inst [0 or "pcp_qa_nixl_1999"] value 4096

nixl.transfer.rx_requests PMID: NIXL.0.3 []
Data Type: 64-bit unsigned int InDom: NIXL.0 0xNNNNNNNN
Semantics: counter Units: count
Help: <empty entry>
inst [0 or "pcp_qa_nixl_1999"] value 10

nixl.transfer.tx_requests PMID: NIXL.0.2 []
Data Type: 64-bit unsigned int InDom: NIXL.0 0xNNNNNNNN
Semantics: counter Units: count
Help: <empty entry>
inst [0 or "pcp_qa_nixl_1999"] value 10

nixl.transfer.rx_bytes PMID: NIXL.0.1 []
Data Type: 64-bit unsigned int InDom: NIXL.0 0xNNNNNNNN
Semantics: counter Units: byte
Help: <empty entry>
inst [0 or "pcp_qa_nixl_1999"] value 524288

nixl.transfer.tx_bytes PMID: NIXL.0.0 []
Data Type: 64-bit unsigned int InDom: NIXL.0 0xNNNNNNNN
Semantics: counter Units: byte
Help: <empty entry>
inst [0 or "pcp_qa_nixl_1999"] value 1048576

=== Transfer metric values ===

nixl.transfer.tx_bytes
inst [0 or "pcp_qa_nixl_1999"] value 1048576

nixl.transfer.rx_bytes
inst [0 or "pcp_qa_nixl_1999"] value 524288

nixl.transfer.tx_requests
inst [0 or "pcp_qa_nixl_1999"] value 10

nixl.transfer.rx_requests
inst [0 or "pcp_qa_nixl_1999"] value 10

=== Memory metric values ===

nixl.memory.registered
inst [0 or "pcp_qa_nixl_1999"] value 4096

nixl.memory.deregistered
inst [0 or "pcp_qa_nixl_1999"] value 0

=== Performance metric values ===

nixl.performance.xfer_time_us
inst [0 or "pcp_qa_nixl_1999"] value 5000

nixl.performance.xfer_post_time_us
inst [0 or "pcp_qa_nixl_1999"] value 1000

=== Error metric values ===

nixl.errors.backend
inst [0 or "pcp_qa_nixl_1999"] value 1

=== Removing NIXL agent ===
3 changes: 3 additions & 0 deletions qa/common.filter
Original file line number Diff line number Diff line change
Expand Up @@ -321,6 +321,7 @@ _filter_optional_pmdas()
-e '/pmdanews/d' \
-e '/pmdanfsclient/d' \
-e '/pmdanginx/d' \
-e '/pmdanixl/d' \
-e '/pmdanutcracker/d' \
-e '/pmdanvidia/d' \
-e '/pmdaopenmetrics/d' \
Expand Down Expand Up @@ -395,6 +396,7 @@ _filter_optional_pmda_instances()
-e '/inst \[33 or "podman"]/d' \
-e '/inst \[35 or "mongodb"]/d' \
-e '/inst \[37 or "rds"]/d' \
-e '/inst \[39 or "nixl"]/d' \
-e '/inst \[57 or "statsd"]/d' \
-e '/inst \[60 or "linux"]/d' \
-e '/inst \[62 or "nfsclient"]/d' \
Expand Down Expand Up @@ -502,6 +504,7 @@ _filter_top_pmns()
-e '/^ news /d' \
-e '/^ nfsclient /d' \
-e '/^ nginx /d' \
-e '/^ nixl /d' \
-e '/^ nutcracker /d' \
-e '/^ nvidia /d' \
-e '/^ openmetrics /d' \
Expand Down
1 change: 1 addition & 0 deletions qa/group
Original file line number Diff line number Diff line change
Expand Up @@ -2371,5 +2371,6 @@ suse
1996 pmda.infiniband local
1997 pcp nfsiostat python local
1998 pmda.rds local python
1999 pmda.nixl local python
4751 libpcp threads valgrind local pcp helgrind
9000 other local
Loading
Loading