Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
75 changes: 73 additions & 2 deletions pkg/satellite/attach.go
Original file line number Diff line number Diff line change
Expand Up @@ -364,16 +364,81 @@ func attachLVMThin(ctx context.Context, exec storage.Exec, dev *apiv1.PhysicalDe
}, nil
}

// hostMountNamespaceCommand controls how the satellite reaches the
// host's mount namespace for pool-creating zpool subcommands. The
// production value (`nsenter -t 1 -m`) hops into PID 1's mount
// namespace, which is where the host's devtmpfs lives — necessary
// because the container's /dev is a private devtmpfs instance and
// the kernel's per-partition mknod doesn't propagate into it before
// zpool's internal open() fires (Bug 359, see attachZFS).
//
// Exposed as a package var so unit tests can substitute a no-op
// prefix and assert the resulting command line directly; production
// callers MUST NOT mutate it.
var hostMountNamespaceCommand = []string{"nsenter", "-t", "1", "-m", "--"}
Comment on lines +375 to +378
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

The variable hostMountNamespaceCommand is unexported (starts with a lowercase letter), which makes it inaccessible to tests in the satellite_test package (such as those in pkg/satellite/attach_test.go). The documentation states it is exposed so unit tests can substitute it, but this currently only applies to internal tests within the satellite package. To allow external test packages to utilize this seam, consider exporting the variable as HostMountNamespaceCommand.


// runHostZpool runs `zpool <args...>` in the host's mount namespace.
// Use this for any zpool subcommand that allocates or removes a
// top-level vdev partition (`create`, `add`, `replace`); pure
// probes (`list`, `get`, `import -l` cache reads) are unaffected
// by the partition-node race and stay on the container-local
// exec.Run path.
//
// Returns the captured stdout + nil on success, or stdout + wrapped
// error on failure (mirrors exec.Run's contract so callers do not
// need to special-case nsenter failure modes — a missing nsenter
// binary surfaces the same shape of error as a missing zpool one).
func runHostZpool(ctx context.Context, exec storage.Exec, args ...string) ([]byte, error) {
cmd := append([]string{}, hostMountNamespaceCommand...)
cmd = append(cmd, "zpool")
cmd = append(cmd, args...)

return exec.Run(ctx, cmd[0], cmd[1:]...)
}

// attachZFS: zpool create. The pool name on disk matches the
// LINSTOR pool name to keep cross-host import predictable; the
// PhysicalDevice's StableID-derived path is the single vdev.
//
// Bug 359: `zpool create` is dispatched into the host's mount
// namespace via nsenter — see runHostZpool for the rationale.
// The satellite container's /dev devtmpfs is a private mount
// (kubelet provides every privileged container with its own
// /dev), and there is no udevd inside the container to keep
// it in sync with the kernel's BLKPG events. zpool create
// stamps a GPT on /dev/sda (creating sda1+sda9 in the kernel),
// then tries to open(/dev/sda1) to write the ZFS label — and
// the partition device node has not yet appeared in the
// container's /dev, so the open fails with ENODEV (errno 19):
//
// cannot label 'sda': failed to detect device partitions
// on '/dev/sda1': 19
//
// Even with `mountPropagation: HostToContainer` (Bug 346) the
// container's /dev is a distinct devtmpfs instance, so kernel-
// initiated mknod on the host's /dev does not reach into the
// container fast enough. Running zpool in the host's mount
// namespace sidesteps this entirely: the host's devtmpfs is
// updated synchronously by the kernel, so zpool's internal
// open() succeeds on the first try.
func attachZFS(ctx context.Context, exec storage.Exec, dev *apiv1.PhysicalDevice, devicePath string) (AttachResult, error) {
pool := dev.AttachTo.ZPoolName
if pool == "" {
return AttachResult{}, errors.New("ZFS attach requires ZPoolName")
}

_, err := exec.Run(ctx, "zpool", "create", "-f",
// `-m none`: do not mount the pool's root dataset. Without this
// zpool tries to `mkdir /<pool-name>` on the host's rootfs,
// which is read-only on Talos (siderolabs/zfs system extension
// host) and would make `zpool create` exit non-zero AFTER
// stamping the pool successfully — observed on the e2e3 stand
// after the Bug 359 nsenter dispatch: "cannot mount '/data':
// failed to create mountpoint: Read-only file system". Blockstor
// never operates on the dataset's mounted directory anyway —
// satellite-side resource creation goes through `zfs create -V`
// volume datasets which never need a pool-root mount.
_, err := runHostZpool(ctx, exec,
"create", "-f", "-m", "none",
"-O", "compression=off",
"-O", "atime=off",
pool, devicePath)
Expand Down Expand Up @@ -480,13 +545,19 @@ func extendLVMThin(ctx context.Context, exec storage.Exec, dev *apiv1.PhysicalDe
// `-f` is required for the same reason `zpool create -f` carries
// it: a device with stale ZFS labels from a prior attempt would
// otherwise refuse to be added.
//
// Bug 359: `zpool add` allocates a new top-level vdev partition
// on `devicePath` and therefore triggers the same /dev partition-
// node race as `zpool create`. Dispatched via runHostZpool so the
// kernel's devtmpfs update is visible to the zpool process — see
// the attachZFS docstring for the full rationale.
func extendZFS(ctx context.Context, exec storage.Exec, dev *apiv1.PhysicalDevice, devicePath string) (AttachResult, error) {
pool := dev.AttachTo.ZPoolName
if pool == "" {
return AttachResult{}, errors.New("ZFS extend requires ZPoolName")
}

_, err := exec.Run(ctx, "zpool", "add", "-f", pool, devicePath)
_, err := runHostZpool(ctx, exec, "add", "-f", pool, devicePath)
if err != nil {
return AttachResult{}, errors.Wrap(err, "zpool add")
}
Expand Down
208 changes: 198 additions & 10 deletions pkg/satellite/attach_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -138,19 +138,26 @@ func TestAttachZFSIssuesZpoolCreate(t *testing.T) {

// Match a substring rather than the exact string to keep the
// test robust against future flag tweaks (the load-bearing
// part is "zpool create -f rpool /dev/...").
// parts are the nsenter host-mount-ns hop + "zpool create -f"
// + the pool name).
//
// Bug 359: `zpool create` is dispatched via `nsenter -t 1 -m`
// so it runs in the host's mount namespace — the container's
// private /dev devtmpfs does not see the kernel's partition
// mknod fast enough for zpool's internal open() to find
// /dev/sda1 (see attach.go::runHostZpool).
found := false

for _, line := range fx.CommandLines() {
if strings.HasPrefix(line, "zpool create -f") && strings.Contains(line, "rpool") {
if strings.HasPrefix(line, "nsenter -t 1 -m -- zpool create -f") && strings.Contains(line, "rpool") {
found = true

break
}
}

if !found {
t.Errorf("missing zpool create; calls=%v", fx.CommandLines())
t.Errorf("missing nsenter-wrapped zpool create; calls=%v", fx.CommandLines())
}
}

Expand Down Expand Up @@ -334,7 +341,11 @@ func TestAttachWipeClearsPartitionTable(t *testing.T) {
wipeIdx = i
case strings.HasPrefix(line, "blockdev --rereadpt"):
rereadIdx = i
case strings.HasPrefix(line, "zpool create"):
// Bug 359: zpool create runs via nsenter into the host's
// mount namespace; tolerate both forms so the assertion
// stays anchored on the load-bearing "zpool create" verb.
case strings.HasPrefix(line, "zpool create"),
strings.HasPrefix(line, "nsenter -t 1 -m -- zpool create"):
createIdx = i
}
}
Expand Down Expand Up @@ -389,13 +400,20 @@ func TestAttachExtendsExistingZpool(t *testing.T) {
calls := fx.CommandLines()

for _, line := range calls {
if strings.HasPrefix(line, "zpool create") {
// Bug 359: both the legacy direct form and the nsenter-
// wrapped form must be rejected — pool-already-exists is
// supposed to short-circuit to extend, regardless of the
// host-namespace dispatcher.
if strings.HasPrefix(line, "zpool create") ||
strings.HasPrefix(line, "nsenter -t 1 -m -- zpool create") {
t.Fatalf("Bug 337: zpool create MUST NOT run when pool already exists; calls=%v", calls)
}
}

if !slices.Contains(calls, "zpool add -f data /dev/sdb") {
t.Errorf("Bug 337: expected `zpool add -f data /dev/sdb`; calls=%v", calls)
// Bug 359: zpool add is dispatched via nsenter so the host's
// devtmpfs is visible to libzpool's partition-node open().
if !slices.Contains(calls, "nsenter -t 1 -m -- zpool add -f data /dev/sdb") {
t.Errorf("Bug 337/359: expected `nsenter -t 1 -m -- zpool add -f data /dev/sdb`; calls=%v", calls)
}
}

Expand Down Expand Up @@ -616,22 +634,192 @@ func TestAttachCreatesWhenPoolAbsent(t *testing.T) {
calls := fx.CommandLines()

for _, line := range calls {
if strings.HasPrefix(line, "zpool add") {
// Bug 359: tolerate both the legacy and nsenter-wrapped
// forms — pool-absent must NEVER trigger an extend path
// regardless of how the dispatcher reaches the host.
if strings.HasPrefix(line, "zpool add") ||
strings.HasPrefix(line, "nsenter -t 1 -m -- zpool add") {
t.Fatalf("Bug 337: zpool add MUST NOT run when pool is absent; calls=%v", calls)
}
}

found := false

// Bug 359: zpool create is dispatched into the host's mount
// namespace via nsenter so the kernel-driven /dev mknod for
// freshly-created partitions is visible to libzpool's open()
// (see attach.go::runHostZpool).
for _, line := range calls {
if strings.HasPrefix(line, "zpool create -f") && strings.Contains(line, "fresh") {
if strings.HasPrefix(line, "nsenter -t 1 -m -- zpool create -f") && strings.Contains(line, "fresh") {
found = true

break
}
}

if !found {
t.Errorf("Bug 337: missing zpool create on absent pool; calls=%v", calls)
t.Errorf("Bug 337/359: missing nsenter-wrapped zpool create on absent pool; calls=%v", calls)
}
}

// TestAttachZFSDispatchedViaHostMountNamespace pins Bug 359: any
// pool-creating zpool subcommand emitted by Attach MUST be
// prefixed with the host-mount-namespace hop so the kernel's
// per-partition mknod (done in devtmpfs synchronously when the
// kernel re-reads the partition table) is visible to libzpool's
// open() of /dev/sdX1.
//
// Reproduction on e2e3 stand 2026-05-22:
//
// $ linstor ps cdp --pool-name data --storage-pool data zfs e2e3-worker-1 /dev/sda
// SUCCESS: physical-storage attach accepted on node 'e2e3-worker-1'
// $ linstor sp l # 30s later
// data e2e3-worker-1 ZFS data - - False Error ...
// pool backing storage missing on node e2e3-worker-1:
// storage pool data is not present
//
// Satellite log (pre-fix):
//
// zpool create -f -O compression=off -O atime=off data /dev/sda:
// cannot label 'sda': failed to detect device partitions on
// '/dev/sda1': 19
//
// The container's /dev is a private devtmpfs instance — even with
// `mountPropagation: HostToContainer` from Bug 346 the kernel's
// devtmpfs mknod fires on the host's /dev and is not propagated
// into the container before zpool's libefi open() runs. nsenter
// into PID 1's mount namespace puts zpool on the host's devtmpfs,
// which the kernel keeps in sync synchronously.
//
// This test asserts: (a) Attach with Wipe=true ends with a
// zpool create that is prefixed by `nsenter -t 1 -m --`, and
// (b) extend likewise wraps `zpool add`. Both must hold for
// the operator-facing `ps cdp` contract — SUCCESS implies the
// backing pool exists on the host.
func TestAttachZFSDispatchedViaHostMountNamespace(t *testing.T) {
t.Parallel()

t.Run("create path", func(t *testing.T) {
t.Parallel()

fx := storage.NewFakeExec()
// Probe says pool absent → Attach falls through to attachZFS.
fx.Expect("zpool list -H -o name data",
storage.FakeResponse{Err: errors.New("no such pool")})

dev := &apiv1.PhysicalDevice{
DevicePath: "/dev/sda",
AttachTo: &apiv1.PhysicalDeviceAttachTo{
StoragePoolName: "data",
ProviderKind: "ZFS",
ZPoolName: "data",
Wipe: true,
},
}

_, err := satellite.Attach(t.Context(), fx, dev)
if err != nil {
t.Fatalf("Attach: %v", err)
}

calls := fx.CommandLines()

// Concrete contract: the create command MUST start with the
// nsenter prefix AND carry `-m none` (Talos host root is RO
// — without this zpool tries to mkdir /<pool> and fails AFTER
// the pool is already stamped on disk, leaving a half-committed
// state). A bare `zpool create` would re-trip the "failed to
// detect device partitions on '/dev/sda1': 19" race observed
// on the stand.
want := "nsenter -t 1 -m -- zpool create -f -m none -O compression=off -O atime=off data /dev/sda"
if !slices.Contains(calls, want) {
t.Fatalf("Bug 359: missing host-namespace dispatched zpool create.\n want: %q\n got: %v", want, calls)
}

// Sanity: no bare `zpool create` (i.e. not nsenter-wrapped)
// must have leaked through. A future refactor that bypasses
// runHostZpool will re-introduce the partition-node race.
for _, line := range calls {
if strings.HasPrefix(line, "zpool create") {
t.Fatalf("Bug 359: bare `zpool create` dispatched; must go via nsenter; calls=%v", calls)
}
}
})

t.Run("extend path", func(t *testing.T) {
t.Parallel()

fx := storage.NewFakeExec()
// Probe says pool already present → Attach takes the extend
// branch via extendZFS, which now also dispatches via nsenter.
fx.Expect("zpool list -H -o name data",
storage.FakeResponse{Stdout: []byte("data\n")})

dev := &apiv1.PhysicalDevice{
DevicePath: "/dev/sdb",
AttachTo: &apiv1.PhysicalDeviceAttachTo{
StoragePoolName: "data",
ProviderKind: "ZFS",
ZPoolName: "data",
},
}

_, err := satellite.Attach(t.Context(), fx, dev)
if err != nil {
t.Fatalf("Attach: %v", err)
}

calls := fx.CommandLines()

want := "nsenter -t 1 -m -- zpool add -f data /dev/sdb"
if !slices.Contains(calls, want) {
t.Fatalf("Bug 359: missing host-namespace dispatched zpool add.\n want: %q\n got: %v", want, calls)
}

for _, line := range calls {
if strings.HasPrefix(line, "zpool add") {
t.Fatalf("Bug 359: bare `zpool add` dispatched; must go via nsenter; calls=%v", calls)
}
}
})
}

// TestAttachZFSSurfacesZpoolCreateFailure pins the contract that
// when the host-namespace zpool create errors (pool didn't
// materialise on disk), Attach MUST return the wrapped error
// rather than silently succeeding. The pre-fix bug returned
// SUCCESS to the REST `ps cdp` caller even though the satellite-
// side zpool create failed — leaving the SP CRD in `State=Error`
// indefinitely with no way for the operator to retry.
func TestAttachZFSSurfacesZpoolCreateFailure(t *testing.T) {
t.Parallel()

fx := storage.NewFakeExec()
fx.Expect("zpool list -H -o name data",
storage.FakeResponse{Err: errors.New("no such pool")})
// Simulate the e2e3-stand failure: zpool create exit 1 with
// the canonical "failed to detect device partitions" stderr.
fx.Expect("nsenter -t 1 -m -- zpool create -f -m none -O compression=off -O atime=off data /dev/sda",
storage.FakeResponse{Err: errors.New(
"zpool create -f -m none -O compression=off -O atime=off data /dev/sda: " +
"cannot label 'sda': failed to detect device partitions on '/dev/sda1': 19")})

dev := &apiv1.PhysicalDevice{
DevicePath: "/dev/sda",
AttachTo: &apiv1.PhysicalDeviceAttachTo{
StoragePoolName: "data",
ProviderKind: "ZFS",
ZPoolName: "data",
Wipe: true,
},
}

_, err := satellite.Attach(t.Context(), fx, dev)
if err == nil {
t.Fatalf("Bug 359: zpool create failure MUST propagate to caller; got nil error")
}

if !strings.Contains(err.Error(), "zpool create") {
t.Errorf("Bug 359: error must mention zpool create; got %v", err)
}
}
Loading
Loading