Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 21 additions & 0 deletions frontend/csi/node_server.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ package csi
import (
"context"
"crypto/rand"
"encoding/hex"
"fmt"
"math/big"
"os"
Expand Down Expand Up @@ -1261,6 +1262,26 @@ func (p *Plugin) nodeUnstageISCSIVolume(
}
}

// Derive the device path using the LunSerial and LunID.
// The publishInfo.DevicePath may be incorrect due to Kernel actions.
// Fallback to using the publishInfo.DevicePath if the multipath device cannot be derived.
fields := LogFields{
"LunSerial": publishInfo.IscsiLunSerial,
"LunID": publishInfo.IscsiLunNumber,
}
lunSerialHex := hex.EncodeToString([]byte(publishInfo.IscsiLunSerial))
multipathDevice, err := iscsiUtils.GetMultipathDeviceForLUN(ctx, lunSerialHex, int(publishInfo.IscsiLunNumber))
if err != nil {
Logc(ctx).WithError(err).WithFields(fields).Debug("Error finding multipath device for lun.")
if errors.IsNotFoundError(err) {
Logc(ctx).WithError(err).WithFields(fields).Debug("no multipath device is found for lun, resetting device info")
publishInfo.DevicePath = ""
}
} else {
Logc(ctx).WithFields(fields).Debugf("Found multipath device %s for lun.", multipathDevice)
publishInfo.DevicePath = "/dev/" + multipathDevice
}

// Delete the device from the host.
unmappedMpathDevice, err := utils.PrepareDeviceForRemoval(ctx, publishInfo, nil, p.unsafeDetach, force)
if err != nil {
Expand Down
2 changes: 1 addition & 1 deletion utils/devices.go
Original file line number Diff line number Diff line change
Expand Up @@ -956,7 +956,7 @@ func compareWithPublishedSerialNumber(ctx context.Context, publishInfo *VolumePu
// Get Device based on the serial number and at the same time identify if it is a ghost device.
// Multipath UUID contains LUN serial in hex format
lunSerialHex := hex.EncodeToString([]byte(publishInfo.IscsiLunSerial))
multipathDevice, err := IscsiUtils.GetMultipathDeviceBySerial(ctx, lunSerialHex)
multipathDevice, err := IscsiUtils.GetMultipathDeviceForLUN(ctx, lunSerialHex, int(publishInfo.IscsiLunNumber))
if err != nil {
return false, fmt.Errorf("failed to verify multipath device for serial '%v'; %v ",
publishInfo.IscsiLunSerial, err)
Expand Down
51 changes: 31 additions & 20 deletions utils/devices_linux.go
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ import (
"time"
"unsafe"

execCmd "github.com/netapp/trident/utils/exec"
log "github.com/sirupsen/logrus"

"github.com/cenkalti/backoff/v4"
Expand All @@ -29,6 +30,8 @@ const (
luksType = "luks2"
// Return code for "no permission (bad passphrase)" from cryptsetup command
luksCryptsetupBadPassphraseReturnCode = 2
luksCloseDeviceSafelyClosedExitCode = 0
luksCloseDeviceAlreadyClosedExitCode = 4
)

// flushOneDevice flushes any outstanding I/O to a disk
Expand Down Expand Up @@ -175,17 +178,28 @@ func (d *LUKSDevice) Close(ctx context.Context) error {
}

// closeLUKSDevice performs a luksClose on the specified LUKS device
// It gracefully handles the cases where a LUKS device has already been closed or the device doesn't exist.
func closeLUKSDevice(ctx context.Context, luksDevicePath string) error {
output, err := command.ExecuteWithTimeoutAndInput(
ctx, "cryptsetup", luksCommandTimeout, true, "", "luksClose", luksDevicePath,
)
if nil != err {
Log().WithFields(LogFields{
"MappedDeviceName": luksDevicePath,
"error": err.Error(),
"output": string(output),
}).Debug("Failed to Close LUKS device")
return fmt.Errorf("failed to Close LUKS device %s; %v", luksDevicePath, err)
fields := LogFields{"luksDevicePath": luksDevicePath, "output": string(output), "err": err.Error()}
var exitErr execCmd.ExitError
if !errors.As(err, &exitErr) {
Logc(ctx).WithFields(fields).Error("Failed to close LUKS device with unknown error.")
return fmt.Errorf("failed to close LUKS device %s; %w", luksDevicePath, err)
}

switch exitErr.ExitCode() {
// exit code "0" and "4" are safe to ignore. "0" will likely never be hit but check for it regardless.
case luksCloseDeviceSafelyClosedExitCode, luksCloseDeviceAlreadyClosedExitCode:
Logc(ctx).WithFields(fields).Debug("LUKS device is already closed or did not exist.")
return nil
default:
Logc(ctx).WithFields(fields).Error("Failed to close LUKS device.")
return fmt.Errorf("exit code '%d' when closing LUKS device '%s'; %w", exitErr.ExitCode(), luksDevicePath, err)
}
}
return nil
}
Expand All @@ -209,21 +223,18 @@ func IsLUKSDeviceOpen(ctx context.Context, luksDevicePath string) (bool, error)
// EnsureLUKSDeviceClosed ensures there is not an open LUKS device at the specified path
func EnsureLUKSDeviceClosed(ctx context.Context, luksDevicePath string) error {
GenerateRequestContextForLayer(ctx, LogLayerUtils)
fields := LogFields{"luksDevicePath": luksDevicePath}

_, err := osFs.Stat(luksDevicePath)
if err == nil {
// Need to Close the LUKS device
return closeLUKSDevice(ctx, luksDevicePath)
} else if os.IsNotExist(err) {
Logc(ctx).WithFields(LogFields{
"device": luksDevicePath,
}).Debug("LUKS device not found.")
} else {
Logc(ctx).WithFields(LogFields{
"device": luksDevicePath,
"error": err.Error(),
}).Debug("Failed to stat device")
return fmt.Errorf("could not stat device: %s %v.", luksDevicePath, err)
if err := closeLUKSDevice(ctx, luksDevicePath); err != nil {
Logc(ctx).WithFields(fields).WithError(err).Error("Could not close LUKS device.")
return fmt.Errorf("could not close LUKS device %s; %w", luksDevicePath, err)
}

// If LUKS close succeeded, the block device node should be gone.
// It's the responsibility of the kernel and udev to manage /dev/mapper entries.
// If the /dev/mapper entry lives on, log a warning and return success.
if _, err := osFs.Stat(luksDevicePath); err != nil {
Copy link
Contributor

@jwebster7 jwebster7 Jul 16, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This check is only a proactive check for stale /dev/mapper entries. If Stat fails to find the file (the positive case) this will add incorrect log statements.

s/err != nil/err == nil to have the correct behavior.

Logc(ctx).WithFields(fields).Warn("Stale device mapper file found for LUKS device. Is udev is running?")
}
return nil
}
Expand Down
11 changes: 10 additions & 1 deletion utils/exec/command.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,9 +19,18 @@ import (
var (
xtermControlRegex = regexp.MustCompile(`\x1B\[[0-9;]*[a-zA-Z]`)

_ Command = NewCommand()
// Ensure these structures always implement these interfaces at compilation.
_ Command = NewCommand()
_ ExitError = &exec.ExitError{}
)

// ExitError defines the methods that exec.ExitError implements.
// This enables unit testing and mocking of exit codes.
type ExitError interface {
error
ExitCode() int
}

// Command defines a set of behaviors for executing commands on a host.
type Command interface {
Execute(ctx context.Context, name string, args ...string) ([]byte, error)
Expand Down
115 changes: 107 additions & 8 deletions utils/iscsi.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,10 @@ import (
"encoding/binary"
"encoding/hex"
"fmt"
"io/fs"
"os"
"os/exec"
"path/filepath"
"regexp"
"sort"
"strconv"
Expand Down Expand Up @@ -478,8 +480,12 @@ func (h *IscsiReconcileHelper) GetMultipathDeviceDisks(ctx context.Context, mult
diskPath := chrootPathPrefix + fmt.Sprintf("/sys/block/%s/slaves/", multipathDevice)
diskDirs, err := os.ReadDir(diskPath)
if err != nil {
Logc(ctx).WithError(err).Errorf("Could not read %s", diskDirs)
return nil, fmt.Errorf("failed to identify multipath device disks; unable to read '%s'", diskDirs)
if errors.Is(err, fs.ErrNotExist) {
Logc(ctx).Warningf("multipath device directory %s does not exist, device is already gone?", diskDirs)
return nil, nil
}
Logc(ctx).WithError(err).Errorf("Could not read %s", diskPath)
return nil, fmt.Errorf("failed to identify multipath device disks; unable to read %q :%w", diskPath, err)
}

for _, diskDir := range diskDirs {
Expand All @@ -496,7 +502,10 @@ func (h *IscsiReconcileHelper) GetMultipathDeviceDisks(ctx context.Context, mult

// GetMultipathDeviceBySerial find DM device whose UUID /sys/block/dmX/dm/uuid contains serial in hex format.
func (h *IscsiReconcileHelper) GetMultipathDeviceBySerial(ctx context.Context, hexSerial string) (string, error) {
sysPath := chrootPathPrefix + "/sys/block/"
var (
sysPath = chrootPathPrefix + "/sys/block/"
multipathCMDTimeout = 10 * time.Second
)

blockDirs, err := os.ReadDir(sysPath)
if err != nil {
Expand All @@ -520,18 +529,108 @@ func (h *IscsiReconcileHelper) GetMultipathDeviceBySerial(ctx context.Context, h
continue
}

if strings.Contains(uuid, hexSerial) {
if !strings.Contains(uuid, hexSerial) {
continue
}

// Use 'multipath -l' and check the exit code. A success return
// indicates that it is a not a stale device. Noted that frequently
// running this in a busy environment may time out, and lead to a
// situation that the device is not properly discovered. But it will
// fail the following operations and retries next time.
if _, err := command.ExecuteWithTimeout(ctx, "multipath", multipathCMDTimeout, true, "-l", dmDeviceName); err != nil {
Logc(ctx).WithFields(LogFields{
"UUID": hexSerial,
"multipathDevice": dmDeviceName,
}).Debug("Found multipath device by UUID.")
return dmDeviceName, nil
"device": dmDeviceName,
}).WithError(err).Warn("Candidate is not a valid map known to multipathd, ignoring stale entry.")
continue
}

Logc(ctx).WithFields(LogFields{
"UUID": hexSerial,
"multipathDevice": dmDeviceName,
}).Debug("Found multipath device by UUID.")
return dmDeviceName, nil
}

return "", errors.NotFoundError("no multipath device found")
}

// GetMultipathDeviceForLUN is the most robust method to find a multipath device.
// It uses a three-factor check (serial, path liveness, LUN ID) to ensure the
// correct and active device is returned, filtering out any stale entries.
func (h *IscsiReconcileHelper) GetMultipathDeviceForLUN(ctx context.Context, hexSerial string, lunID int) (string, error) {
var (
sysPath = chrootPathPrefix + "/sys/block/"
lunIDString = strconv.Itoa(lunID)
fields = LogFields{
"lunID": lunID,
"lunSerial": hexSerial,
}
)

Logc(ctx).WithFields(fields).Debug(">>>> iscsi.GetMultipathDeviceForLUN")
defer Logc(ctx).WithFields(fields).Debug("<<<< iscsi.GetMultipathDeviceForLUN")

blockDirs, err := os.ReadDir(sysPath)
if err != nil {
Logc(ctx).WithError(err).WithFields(fields).Errorf("Could not read %s", sysPath)
return "", fmt.Errorf("failed to list block devices in %q", sysPath)
}

for _, blockDir := range blockDirs {
dmDeviceName := blockDir.Name()
if !strings.HasPrefix(dmDeviceName, "dm-") {
continue
}

// Check for a WWID match.
uuid, err := h.GetMultipathDeviceUUID(dmDeviceName)
if err != nil || !strings.Contains(uuid, hexSerial) {
// Not a match, or not a multipath device.
continue
}

// Perform checks whether it's a stale device by checking slaves.
Logc(ctx).WithFields(LogFields{
"serial": hexSerial,
"candidate": dmDeviceName,
"uuid": uuid,
}).Debug("Found candidate multipath device")

// A stale device should contain no slaves.
slavesPath := filepath.Join(sysPath, dmDeviceName, "slaves")
slaves, err := os.ReadDir(slavesPath)
if err != nil || len(slaves) == 0 {
Logc(ctx).WithFields(fields).Warnf("Candidate %s is a stale device with no paths, ignoring.", dmDeviceName)
continue
}

// Verify the LUN ID on an actual path using the 1st slave device info.
scsiDevicePath := filepath.Join(slavesPath, slaves[0].Name(), "device", "scsi_device")
scsiDeviceDirs, err := os.ReadDir(scsiDevicePath)
if err != nil || len(scsiDeviceDirs) == 0 {
Logc(ctx).WithFields(fields).WithError(err).Warnf("Could not read scsi_device %q info for candidate path %s.", scsiDevicePath, dmDeviceName)
continue
}

scsiDeviceAddress := scsiDeviceDirs[0].Name()
parts := strings.Split(scsiDeviceAddress, ":")
if len(parts) != 4 {
Logc(ctx).WithFields(fields).Warnf("Invalid SCSI device address %s.", scsiDeviceAddress)
continue
}

currentLunID := parts[3]
if currentLunID == lunIDString {
// All lun ID and serial match the multipath device.
Logc(ctx).WithFields(fields).Debugf("Successfully identified and verified multipath device %s.", dmDeviceName)
return dmDeviceName, nil
}
}

return "", errors.NotFoundError(fmt.Sprintf("no active multipath device found for serial %s and LUN ID %d", hexSerial, lunID))
}

// waitForDeviceScan scans all paths to a specific LUN and waits until all
// SCSI disk-by-path devices for that LUN are present on the host.
func waitForDeviceScan(ctx context.Context, lunID int, iSCSINodeName string) error {
Expand Down
1 change: 1 addition & 0 deletions utils/iscsi_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ type IscsiReconcileUtils interface {
GetISCSIHostSessionMapForTarget(context.Context, string) map[int]int
GetSysfsBlockDirsForLUN(int, map[int]int) []string
GetMultipathDeviceUUID(string) (string, error)
GetMultipathDeviceForLUN(context.Context, string, int) (string, error)
GetMultipathDeviceBySerial(context.Context, string) (string, error)
GetMultipathDeviceDisks(context.Context, string) ([]string, error)
GetDevicesForLUN(paths []string) ([]string, error)
Expand Down