Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions store/posixage/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,16 @@ file, or via an interactive user prompt.
- Support for multiple encryption functions
- Support for multiple decryption functions

### Locking

The store uses a process-level lock file to coordinate access across processes.
Lock acquisition retries until the lock is acquired or the caller context is
done (canceled or past its deadline). Use `context.WithTimeout` or
`context.WithDeadline` on store operations when lock acquisition should be bounded.

The store treats a `.posixage.lock` file as stale once it is older than `30s`
and can recover it.

Callbacks are invoked in the order they are registered. For decryption, the
store tries each callback in sequence, and the first one that successfully
provides a valid key will return the decrypted secret.
Expand Down
62 changes: 32 additions & 30 deletions store/posixage/internal/flock/flock.go
Original file line number Diff line number Diff line change
Expand Up @@ -30,8 +30,7 @@ var (
)

const (
defaultLockTimeout = time.Millisecond * 100
lockFileName = ".posixage.lock"
lockFileName = ".posixage.lock"
)

// UnlockFunc is the callback function returned by [TryLock] and [TryRLock]
Expand Down Expand Up @@ -68,27 +67,32 @@ func tryLock(ctx context.Context, root *os.Root, exclusive bool) (UnlockFunc, er
return nil, err
}

err = retryLock(ctx, fl, exclusive)
// lock was successful if error == nil, so let's just return
if err == nil {
if err = lockFile(fl.Fd(), exclusive); err == nil {
// truncate to update the modtime to signal to other processes that the
// current lock is valid so they don't attempt a recovery on it.
_ = fl.Truncate(0)
return sync.OnceValue(func() error {
return unlockFile(fl)
}), nil
}
err = errors.Join(ErrLockUnsuccessful, err)

if ctx.Err() == nil {
if recoverErr := recoverStaleLock(root, fl); recoverErr != nil && !errors.Is(recoverErr, errRecoverLock) {
return nil, errors.Join(err, recoverErr)
}
fl = nil

// lock was unsuccessful so let's retry
if recoverErr := recoverStaleLock(root, fl); recoverErr != nil {
// return if recovery failed.
// perhaps the file is still locked and not older than 30 seconds?
// maybe a permission error prevented it from being removed?
return nil, errors.Join(err, recoverErr)
fl, err = openFile(root)
if err != nil {
return nil, err
}
}

fl, err = openFile(root)
if err != nil {
return nil, err
if ctx.Err() != nil {
return nil, errors.Join(err, ctx.Err())
}
// recovery was successful. Let's try to get another lock one last time.

err = retryLock(ctx, fl, exclusive)
if err != nil {
return nil, err
Expand All @@ -100,27 +104,23 @@ func tryLock(ctx context.Context, root *os.Root, exclusive bool) (UnlockFunc, er
}

// retryLock attempts to acquire an advisory lock on the given file
// using flock, retrying until [defaultLockTimeout] is reached
// or the context is canceled.
// using flock, retrying until the context is canceled or the lock is acquired.
//
// Retries use exponential backoff with a maximum delay of 100ms
// between attempts.
//
// Set exclusive to true for write or delete operations to prevent
// concurrent reads.
func retryLock(ctx context.Context, f *os.File, exclusive bool) error {
lockCtx, lockCtxCancel := context.WithTimeout(ctx, defaultLockTimeout)
defer lockCtxCancel()

ep := backoff.NewExponentialBackOff()
ep.InitialInterval = time.Millisecond * 10
ep.MaxInterval = time.Millisecond * 100
_, err := backoff.Retry(lockCtx, func() (bool, error) {
_, err := backoff.Retry(ctx, func() (bool, error) {
if err := lockFile(f.Fd(), exclusive); err != nil {
return false, err
}
return true, nil
}, backoff.WithBackOff(ep))
}, backoff.WithBackOff(ep), backoff.WithMaxElapsedTime(0))
if err != nil {
return errors.Join(ErrLockUnsuccessful, err)
}
Expand All @@ -135,12 +135,13 @@ func retryLock(ctx context.Context, f *os.File, exclusive bool) error {
// TryLock acquires an exclusive advisory lock on a lock file.
//
// If the file does not exist, it is created. If the lock cannot be
// acquired immediately, the function retries until the default timeout
// (100ms) is reached.
// acquired immediately, the function retries until ctx is canceled or the
// lock is acquired.
//
// As a safeguard, the function attempts to recover from stale locks,
// defined as lock files older than 30 seconds. If recovery fails,
// manual intervention may be required.
// defined as lock files older than 30s. Stale lock recovery is skipped when
// ctx has been canceled. If recovery fails, manual intervention may be
// required.
//
// It returns an unlock function that must be called to release the lock.
func TryLock(ctx context.Context, root *os.Root) (UnlockFunc, error) {
Expand All @@ -150,12 +151,13 @@ func TryLock(ctx context.Context, root *os.Root) (UnlockFunc, error) {
// TryRLock acquires a non-exclusive advisory lock on a lock file.
//
// If the file does not exist, it is created. If the lock cannot be
// acquired immediately, the function retries until the default timeout
// (100ms) is reached.
// acquired immediately, the function retries until ctx is canceled or the
// lock is acquired.
//
// As a safeguard, the function attempts to recover from stale locks,
// defined as lock files older than 30 seconds. If recovery fails,
// manual intervention may be required.
// defined as lock files older than 30s. Stale lock recovery is skipped when
// ctx has been canceled. If recovery fails, manual intervention may be
// required.
//
// It returns an unlock function that must be called to release the lock.
func TryRLock(ctx context.Context, root *os.Root) (UnlockFunc, error) {
Expand Down
102 changes: 99 additions & 3 deletions store/posixage/internal/flock/flock_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
package flock

import (
"context"
"os"
"runtime"
"testing"
Expand Down Expand Up @@ -55,11 +56,19 @@ func TestFlock(t *testing.T) {
_ = unlock()
})

_, err = tryLock(t.Context(), root, exclusive)
ctx, cancel := context.WithTimeout(t.Context(), 75*time.Millisecond)
defer cancel()

_, err = tryLock(ctx, root, exclusive)
require.ErrorIs(t, err, ErrLockUnsuccessful)
require.ErrorIs(t, err, context.DeadlineExceeded)

_, err = tryLock(t.Context(), root, !exclusive)
ctx, cancel = context.WithTimeout(t.Context(), 75*time.Millisecond)
defer cancel()

_, err = tryLock(ctx, root, !exclusive)
require.ErrorIs(t, err, ErrLockUnsuccessful)
require.ErrorIs(t, err, context.DeadlineExceeded)
})

t.Run("multiple non-exclusive locks can be held", func(t *testing.T) {
Expand All @@ -82,8 +91,12 @@ func TestFlock(t *testing.T) {
_ = unlockTwo()
})

_, err = tryLock(t.Context(), root, exclusive)
ctx, cancel := context.WithTimeout(t.Context(), 75*time.Millisecond)
defer cancel()

_, err = tryLock(ctx, root, exclusive)
require.ErrorIs(t, err, ErrLockUnsuccessful)
require.ErrorIs(t, err, context.DeadlineExceeded)
})

t.Run("can recover from an exclusive lock", func(t *testing.T) {
Expand All @@ -108,6 +121,89 @@ func TestFlock(t *testing.T) {
require.NoError(t, err)
require.NoError(t, unlock())
})

t.Run("caller context can wait past former default timeout", func(t *testing.T) {
root, err := os.OpenRoot(t.TempDir())
require.NoError(t, err)
t.Cleanup(func() {
assert.NoError(t, root.Close())
})

exclusive := true
unlock, err := tryLock(t.Context(), root, exclusive)
require.NoError(t, err)

release := make(chan struct{})
go func() {
defer close(release)
time.Sleep(150 * time.Millisecond)
assert.NoError(t, unlock())
}()
t.Cleanup(func() {
<-release
})

ctx, cancel := context.WithTimeout(t.Context(), 750*time.Millisecond)
defer cancel()

unlockTwo, err := tryLock(ctx, root, exclusive)
require.NoError(t, err)
require.NoError(t, unlockTwo())
})

t.Run("caller context deadline bounds lock acquisition", func(t *testing.T) {
root, err := os.OpenRoot(t.TempDir())
require.NoError(t, err)
t.Cleanup(func() {
assert.NoError(t, root.Close())
})

exclusive := true
unlock, err := tryLock(t.Context(), root, exclusive)
require.NoError(t, err)
t.Cleanup(func() {
_ = unlock()
})

ctx, cancel := context.WithTimeout(t.Context(), 75*time.Millisecond)
defer cancel()

start := time.Now()
_, err = tryLock(ctx, root, exclusive)
require.ErrorIs(t, err, ErrLockUnsuccessful)
require.ErrorIs(t, err, context.DeadlineExceeded)
assert.Less(t, time.Since(start), time.Second)
})

t.Run("stale recovery is skipped after context cancellation", func(t *testing.T) {
root, err := os.OpenRoot(t.TempDir())
require.NoError(t, err)
t.Cleanup(func() {
assert.NoError(t, root.Close())
})

exclusive := true
unlock, err := tryLock(t.Context(), root, exclusive)
require.NoError(t, err)
t.Cleanup(func() {
_ = unlock()
})

if runtime.GOOS != "windows" {
fakeModTime := time.Date(2020, 1, 1, 0, 0, 0, 0, time.UTC)
require.NoError(t, root.Chtimes(lockFileName, fakeModTime, fakeModTime))
}

ctx, cancel := context.WithTimeout(t.Context(), 75*time.Millisecond)
cancel()

_, err = tryLock(ctx, root, exclusive)
require.ErrorIs(t, err, ErrLockUnsuccessful)
require.ErrorIs(t, err, context.Canceled)

_, err = root.Stat(lockFileName)
require.NoError(t, err)
})
}

func TestRecoverLock(t *testing.T) {
Expand Down
Loading