Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 5 additions & 2 deletions cmd/hostagent/subcmds/serve.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ import (
"github.com/nvidia/doca-platform/internal/provisioning/hostagent"
"github.com/nvidia/doca-platform/internal/provisioning/hostagent/networkmanager"
"github.com/nvidia/doca-platform/internal/provisioning/hostagent/nodemanager"
"github.com/nvidia/doca-platform/internal/provisioning/hostagent/phase/reboot"
"github.com/nvidia/doca-platform/internal/provisioning/hostagent/service"

"github.com/spf13/cobra"
Expand Down Expand Up @@ -103,11 +104,13 @@ var serveCmd = &cobra.Command{
os.Exit(1)
}

if err := service.NewInstallationService(unCachedClient, nm).Start(true); err != nil {
rh := reboot.NewHandler(mgr.GetClient(), dpuNodeManager.GetNodeName, nm.GetDevice)

if err := service.NewInstallationService(unCachedClient, nm, rh).Start(true); err != nil {
klog.Fatalf("failed to start installation service: %v", err)
}

reconciler := hostagent.NewHostAgentReconciler(mgr.GetClient(), opts.BFBRegistryAddress, dpuNodeManager, nm)
reconciler := hostagent.NewHostAgentReconciler(mgr.GetClient(), opts.BFBRegistryAddress, dpuNodeManager, nm, rh)
if err = reconciler.SetupWithManager(mgr); err != nil {
setupLog.Error(err, "unable to create controller", "controller", "DPU")
os.Exit(1)
Expand Down
5 changes: 3 additions & 2 deletions internal/provisioning/hostagent/controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,8 @@ type HostAgentReconciler struct {
func NewHostAgentReconciler(client client.Client,
bfbRegistryAddress string,
nodeManager nodemanager.Interface,
networkManager networkmanager.Interface) *HostAgentReconciler {
networkManager networkmanager.Interface,
rebootHandler *reboot.Handler) *HostAgentReconciler {
r := &HostAgentReconciler{
Client: client,
NodeManager: nodeManager,
Expand All @@ -70,7 +71,7 @@ func NewHostAgentReconciler(client client.Client,
provisioningv1.DPUInitializeInterface: interfaceinit.NewHandler(client, r.NetworkManager.GetDevice),
provisioningv1.DPUConfigFWParameters: configfw.NewHandler(client, r.NetworkManager.GetDevice),
provisioningv1.DPUOSInstalling: install.NewHandler(client, bfbRegistry, r.NetworkManager.GetDevice),
provisioningv1.DPURebooting: reboot.NewHandler(client, r.NodeManager.GetNodeName, r.NetworkManager.GetDevice),
provisioningv1.DPURebooting: rebootHandler,
provisioningv1.DPUHostNetworkConfiguration: network.NewHandler(r.NetworkManager.AddNetworkRequest),
}
return r
Expand Down
8 changes: 4 additions & 4 deletions internal/provisioning/hostagent/phase/reboot/sync.go
Original file line number Diff line number Diff line change
Expand Up @@ -128,18 +128,18 @@ func (r *Handler) reboot(ctx context.Context, dpuNode *provisioningv1.DPUNode, d
}
}
if runPowerCycle {
if err := r.runPowerCycle(dpuNode, rebootNow); err != nil {
if err := r.RunPowerCycle(dpuNode, rebootNow); err != nil {
return rebootNow, err
}
return nil, nil
}
if err := r.runSLR(ctx, rebootNow); err != nil {
if err := r.RunSLR(ctx, rebootNow); err != nil {
return rebootNow, err
}
return nil, nil
}

func (r *Handler) runPowerCycle(dpuNode *provisioningv1.DPUNode, dpus []provisioningv1.DPU) error {
func (r *Handler) RunPowerCycle(dpuNode *provisioningv1.DPUNode, dpus []provisioningv1.DPU) error {
powerCycleCommand, err := reboot.PowerCycleCommand(dpuNode)
if err != nil {
return fmt.Errorf("failed to get power cycle command: %w", err)
Expand All @@ -155,7 +155,7 @@ func (r *Handler) runPowerCycle(dpuNode *provisioningv1.DPUNode, dpus []provisio
return nil
}

func (r *Handler) runSLR(ctx context.Context, toBeRebooted []provisioningv1.DPU) error {
func (r *Handler) RunSLR(ctx context.Context, toBeRebooted []provisioningv1.DPU) error {
devs := make([]hostutil.Device, len(toBeRebooted))
for i, dpu := range toBeRebooted {
dev, ok := r.getDeviceBySerialNumberFunc(dpu.Spec.SerialNumber)
Expand Down
73 changes: 72 additions & 1 deletion internal/provisioning/hostagent/service/installation_service.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ import (
"time"

provisioningv1 "github.com/nvidia/doca-platform/api/provisioning/v1alpha1"
"github.com/nvidia/doca-platform/internal/provisioning/hostagent/phase/reboot"
"github.com/nvidia/doca-platform/internal/provisioning/hostagent/service/types"

restful "github.com/emicklei/go-restful/v3"
Expand Down Expand Up @@ -78,16 +79,18 @@ type InstallationService struct {
// listeners maps interface names to their listeners
listeners map[string]net.Listener
networkManager NetworkConfigurator
rebootHandler *reboot.Handler
// stopCh is closed by Stop() to terminate background goroutines
stopCh chan struct{}
stopOnce sync.Once
}

func NewInstallationService(client client.Client, nm NetworkConfigurator) *InstallationService {
func NewInstallationService(client client.Client, nm NetworkConfigurator, rh *reboot.Handler) *InstallationService {
s := &InstallationService{
Client: client,
listeners: make(map[string]net.Listener),
networkManager: nm,
rebootHandler: rh,
stopCh: make(chan struct{}),
}
ws := new(restful.WebService).Path("/")
Expand All @@ -110,6 +113,11 @@ func NewInstallationService(client client.Client, nm NetworkConfigurator) *Insta
Consumes(restful.MIME_JSON).
Produces(restful.MIME_JSON).
To(s.ConfigureHostVFs))
ws.Route(
ws.POST("/trigger-reboot").
Consumes(restful.MIME_JSON).
Produces(restful.MIME_JSON).
To(s.TriggerReboot))
ws.Route(ws.GET("/healthz").To(s.HealthCheck))
// Package repositories: serve .deb and .rpm packages for DPU provisioning.
ws.Route(ws.GET("/deb/{subpath:*}").To(serveRepoFile(debRepoDir)))
Expand Down Expand Up @@ -339,6 +347,69 @@ func (s *InstallationService) ConfigureHostVFs(req *restful.Request, resp *restf
resp.WriteHeader(http.StatusOK)
}

func (s *InstallationService) TriggerReboot(req *restful.Request, resp *restful.Response) {
var request types.TriggerRebootRequest
if err := req.ReadEntity(&request); err != nil {
klog.Errorf("failed to read trigger reboot request: %v", err)
_ = resp.WriteError(http.StatusBadRequest, err)
return
}
klog.Infof("Received trigger reboot request: %#v", request)

ctx := req.Request.Context()

dpu := &provisioningv1.DPU{}
if err := s.Get(ctx, client.ObjectKey{Namespace: request.DPUNamespace, Name: request.DPUName}, dpu); err != nil {
klog.Errorf("failed to get DPU %s/%s: %v", request.DPUNamespace, request.DPUName, err)
if apierrors.IsNotFound(err) {
_ = resp.WriteError(http.StatusNotFound, err)
} else {
_ = resp.WriteError(http.StatusInternalServerError, err)
}
return
}

if string(dpu.UID) != request.DPUUID {
klog.Warningf("Rejecting trigger reboot request for DPU %s/%s: request UID %q does not match current DPU UID %q",
request.DPUNamespace, request.DPUName, request.DPUUID, dpu.UID)
_ = resp.WriteError(http.StatusConflict, fmt.Errorf("stale DPU object: expected UID %q but got %q", request.DPUUID, dpu.UID))
return
}

// Detach from the HTTP request context: the request arrives over tmfifo,
// and shutting down the ARM severs that connection.
rebootCtx := context.WithoutCancel(ctx)

switch request.RebootMethod {
case provisioningv1.RebootMethodSystemLevelReset,
provisioningv1.RebootMethodFirmwareReset,
provisioningv1.RebootMethodSystemReboot:
if err := s.rebootHandler.RunSLR(rebootCtx, []provisioningv1.DPU{*dpu}); err != nil {
klog.Errorf("SLR failed for DPU %s/%s: %v", request.DPUNamespace, request.DPUName, err)
_ = resp.WriteError(http.StatusInternalServerError, err)
return
}
case provisioningv1.RebootMethodPowerCycle:
dpuNode := &provisioningv1.DPUNode{}
if err := s.Get(rebootCtx, client.ObjectKey{Name: dpu.Spec.DPUNodeName}, dpuNode); err != nil {
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
if err := s.Get(rebootCtx, client.ObjectKey{Name: dpu.Spec.DPUNodeName}, dpuNode); err != nil {
if err := s.Get(rebootCtx, client.ObjectKey{Namespace: dpu.Namespace, Name: dpu.Spec.DPUNodeName}, dpuNode); err != nil {

klog.Errorf("failed to get DPUNode %s: %v", dpu.Spec.DPUNodeName, err)
_ = resp.WriteError(http.StatusInternalServerError, err)
return
}
if err := s.rebootHandler.RunPowerCycle(dpuNode, []provisioningv1.DPU{*dpu}); err != nil {
klog.Errorf("PowerCycle failed for DPU %s/%s: %v", request.DPUNamespace, request.DPUName, err)
_ = resp.WriteError(http.StatusInternalServerError, err)
return
}
default:
_ = resp.WriteError(http.StatusBadRequest, fmt.Errorf("unsupported reboot method: %q", request.RebootMethod))
return
}

klog.Infof("Successfully triggered reboot (%s) for DPU %s/%s", request.RebootMethod, request.DPUNamespace, request.DPUName)
resp.WriteHeader(http.StatusOK)
}

func (s *InstallationService) UpdateStatus(req *restful.Request, resp *restful.Response) {
var request types.UpdateStatusRequest
if err := req.ReadEntity(&request); err != nil {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,7 @@ var _ = Describe("InstallationService", func() {
testNS = &corev1.Namespace{ObjectMeta: metav1.ObjectMeta{GenerateName: "installation-service-testns-"}}
Expect(k8sClient.Create(ctx, testNS)).To(Succeed())

installationService = NewInstallationService(k8sClient, nil)
installationService = NewInstallationService(k8sClient, nil, nil)
Expect(installationService.Start(false)).To(Succeed())
// Start() runs the server in a goroutine; wait until it is listening to avoid connection refused.
Eventually(func() error {
Expand Down
7 changes: 7 additions & 0 deletions internal/provisioning/hostagent/service/types/types.go
Original file line number Diff line number Diff line change
Expand Up @@ -31,3 +31,10 @@ type ConfigureHostVFsRequest struct {
DPUName string `json:"dpuName"`
DPUNamespace string `json:"dpuNamespace"`
}

// TriggerRebootRequest is the JSON request body accepted by the
// POST /trigger-reboot endpoint of the installation service.
type TriggerRebootRequest struct {
// DPUName is the name of the DPU object to reboot.
DPUName string `json:"dpuName"`
// DPUNamespace is the namespace of the DPU object to reboot.
DPUNamespace string `json:"dpuNamespace"`
// DPUUID is the UID the caller expects the DPU object to have; the
// handler rejects the request with a conflict when it does not match
// the UID of the DPU currently stored under DPUNamespace/DPUName.
DPUUID string `json:"dpuUID"`
// RebootMethod selects how the reboot is performed. The handler accepts
// the system-level-reset, firmware-reset, system-reboot and power-cycle
// methods and rejects any other value as a bad request.
RebootMethod provisioningv1.RebootMethodType `json:"rebootMethod"`
}