Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 23 additions & 0 deletions internal/provisioning/dpuagent/dpuagent.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ import (
"context"
"fmt"
"os"
"path/filepath"
"strings"
"time"

Expand Down Expand Up @@ -58,10 +59,16 @@ const defaultRetryInterval = 30 * time.Second

const bootIDFile = "/proc/sys/kernel/random/boot_id"

const (
defaultRunDir = "/run/dpu-agent"
doneMarkerFileName = "configuration-complete"
)

type DPUAgent struct {
optCtx *operations.Context
operations []operations.Operation
retryInterval time.Duration
runDir string

// rebootMethodDiscoveryFunc, if non-nil, replaces MFT tool probing (tests only).
rebootMethodDiscoveryFunc func(context.Context) bool
Expand Down Expand Up @@ -99,6 +106,7 @@ func NewDPUAgent(optCtx *operations.Context) *DPUAgent {
return &DPUAgent{
optCtx: optCtx,
operations: operations,
runDir: defaultRunDir,
}
}

Expand Down Expand Up @@ -143,6 +151,9 @@ func (d *DPUAgent) Run(ctx context.Context) error {
return fmt.Errorf("execution of operator %s aborted: %v", op.Name(), err)
}
}
if err := writeDoneMarker(d.runDir); err != nil {
klog.Errorf("Failed to write done marker: %v", err)
}
d.updateStatusUntilSuccess(ctx)
return nil
}
Expand Down Expand Up @@ -177,3 +188,15 @@ func (d *DPUAgent) initCurrentBootID() error {
d.optCtx.CurrentBootID = strings.TrimSpace(string(currentBootID))
return nil
}

func writeDoneMarker(dir string) error {
if err := os.MkdirAll(dir, 0755); err != nil {
return fmt.Errorf("create run directory %s: %w", dir, err)
}
markerPath := filepath.Join(dir, doneMarkerFileName)
if err := os.WriteFile(markerPath, []byte(time.Now().UTC().Format(time.RFC3339)+"\n"), 0644); err != nil {
return fmt.Errorf("write done marker file: %w", err)
}
klog.Infof("Configuration complete, marker written to %s", markerPath)
return nil
}
82 changes: 82 additions & 0 deletions internal/provisioning/dpuagent/dpuagent_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,8 @@ package dpuagent
import (
"context"
"fmt"
"os"
"path/filepath"
"time"

provisioningv1 "github.com/nvidia/doca-platform/api/provisioning/v1alpha1"
Expand All @@ -35,6 +37,86 @@ import (
const testRetryInterval = 1 * time.Millisecond

var _ = Describe("DPUAgent", func() {
Describe("Done marker", func() {
var tmpDir string

BeforeEach(func() {
var err error
tmpDir, err = os.MkdirTemp("", "dpu-agent-run-*")
Expect(err).NotTo(HaveOccurred())
})

AfterEach(func() {
Expect(os.RemoveAll(tmpDir)).To(Succeed())
})

It("should write the done marker file after all operations complete successfully", func() {
agent := &DPUAgent{
retryInterval: testRetryInterval,
runDir: tmpDir,
optCtx: &operations.Context{
Client: &mockClient{},
DiscoverPorts: func() ([]pciutil.NICPort, error) {
return []pciutil.NICPort{
{Netdev: "p0", PCIAddress: "0000:03:00.0", MSTDevice: "/dev/mst/mt41692_pciconf0"},
{Netdev: "p1", PCIAddress: "0000:03:00.1", MSTDevice: "/dev/mst/mt41692_pciconf0.1"},
}, nil
},
},
operations: []operations.Operation{
&mockOperation{
name: "op1",
conditionType: "Op1Condition",
executeFunc: func(execCtx context.Context, optCtx *operations.Context) error {
return nil
},
},
},
}

Expect(agent.Run(ctx)).To(Succeed())

markerPath := filepath.Join(tmpDir, "configuration-complete")
Expect(markerPath).To(BeAnExistingFile())
content, err := os.ReadFile(markerPath)
Expect(err).NotTo(HaveOccurred())
Expect(string(content)).NotTo(BeEmpty())
})

It("should not write the done marker when the run is aborted", func() {
cancelCtx, cancelFunc := context.WithCancel(ctx)
agent := &DPUAgent{
retryInterval: testRetryInterval,
runDir: tmpDir,
optCtx: &operations.Context{
Client: &mockClient{},
DiscoverPorts: func() ([]pciutil.NICPort, error) {
return []pciutil.NICPort{
{Netdev: "p0", PCIAddress: "0000:03:00.0", MSTDevice: "/dev/mst/mt41692_pciconf0"},
{Netdev: "p1", PCIAddress: "0000:03:00.1", MSTDevice: "/dev/mst/mt41692_pciconf0.1"},
}, nil
},
},
operations: []operations.Operation{
&mockOperation{
name: "cancel-op",
conditionType: "CancelOpCondition",
executeFunc: func(execCtx context.Context, optCtx *operations.Context) error {
cancelFunc()
return fmt.Errorf("error that triggers context check")
},
},
},
}

err := agent.Run(cancelCtx)
Expect(err).To(HaveOccurred())

markerPath := filepath.Join(tmpDir, "configuration-complete")
Expect(markerPath).NotTo(BeAnExistingFile())
})
})

Describe("Run", func() {
It("should include package installation after static file verification", func() {
agent := NewDPUAgent(&operations.Context{})
Expand Down