Skip to content

Commit f7c009d

Browse files
committed
Add burst-proof resource reservations
1 parent 3258e81 commit f7c009d

8 files changed

Lines changed: 365 additions & 40 deletions

File tree

lib/instances/admission.go

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
package instances
2+
3+
func volumeOverlayReservationBytes(volumes []VolumeAttachment) int64 {
4+
var total int64
5+
for _, vol := range volumes {
6+
if vol.Overlay {
7+
total += vol.OverlaySize
8+
}
9+
}
10+
return total
11+
}
12+
13+
func requestedDiskReservationBytes(overlaySize int64, volumes []VolumeAttachment) int64 {
14+
return overlaySize + volumeOverlayReservationBytes(volumes)
15+
}
16+
17+
func storedDiskReservationBytes(stored *StoredMetadata) int64 {
18+
if stored == nil {
19+
return 0
20+
}
21+
return requestedDiskReservationBytes(stored.OverlaySize, stored.Volumes)
22+
}

lib/instances/admission_test.go

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
package instances
2+
3+
import (
4+
"testing"
5+
6+
"github.com/stretchr/testify/assert"
7+
)
8+
9+
func TestRequestedDiskReservationBytes(t *testing.T) {
10+
t.Parallel()
11+
12+
diskBytes := requestedDiskReservationBytes(10, []VolumeAttachment{
13+
{VolumeID: "base-only", Overlay: false, OverlaySize: 100},
14+
{VolumeID: "overlay-a", Overlay: true, OverlaySize: 20},
15+
{VolumeID: "overlay-b", Overlay: true, OverlaySize: 30},
16+
})
17+
18+
assert.Equal(t, int64(60), diskBytes)
19+
}
20+
21+
func TestStoredDiskReservationBytes(t *testing.T) {
22+
t.Parallel()
23+
24+
diskBytes := storedDiskReservationBytes(&StoredMetadata{
25+
OverlaySize: 15,
26+
Volumes: []VolumeAttachment{
27+
{VolumeID: "base-only", Overlay: false, OverlaySize: 100},
28+
{VolumeID: "overlay", Overlay: true, OverlaySize: 25},
29+
},
30+
})
31+
32+
assert.Equal(t, int64(40), diskBytes)
33+
}

lib/instances/create.go

Lines changed: 16 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -182,13 +182,22 @@ func (m *manager) createInstance(
182182
return nil, fmt.Errorf("total memory %d (size + hotplug_size) exceeds maximum allowed %d per instance", totalMemory, m.limits.MaxMemoryPerInstance)
183183
}
184184

185-
// Validate aggregate resource limits via ResourceValidator (if configured)
185+
diskBytes := requestedDiskReservationBytes(overlaySize, req.Volumes)
186+
reservedResources := false
187+
188+
// Reserve aggregate resources for this create while it is in flight.
186189
if m.resourceValidator != nil {
187190
needsGPU := req.GPU != nil && req.GPU.Profile != ""
188-
if err := m.resourceValidator.ValidateAllocation(ctx, vcpus, totalMemory, req.NetworkBandwidthDownload, req.NetworkBandwidthUpload, req.DiskIOBps, needsGPU); err != nil {
189-
log.ErrorContext(ctx, "resource validation failed", "error", err)
191+
if err := m.resourceValidator.ReserveAllocation(ctx, id, vcpus, totalMemory, req.NetworkBandwidthDownload, req.NetworkBandwidthUpload, req.DiskIOBps, diskBytes, needsGPU); err != nil {
192+
log.ErrorContext(ctx, "resource reservation failed", "error", err)
190193
return nil, fmt.Errorf("%w: %v", ErrInsufficientResources, err)
191194
}
195+
reservedResources = true
196+
defer func() {
197+
if reservedResources {
198+
m.resourceValidator.FinishAllocation(id)
199+
}
200+
}()
192201
}
193202

194203
if req.Env == nil {
@@ -492,6 +501,10 @@ func (m *manager) createInstance(
492501
return nil, err
493502
}
494503
startVMSpanEnd(nil)
504+
if reservedResources {
505+
m.resourceValidator.FinishAllocation(id)
506+
reservedResources = false
507+
}
495508

496509
// 20. Persist runtime metadata updates after VM boot.
497510
meta = &metadata{StoredMetadata: *stored}

lib/instances/manager.go

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -84,7 +84,12 @@ type ResourceValidator interface {
8484
// ValidateAllocation checks if the requested resources are available.
8585
// Returns nil if allocation is allowed, or a detailed error describing
8686
// which resource is insufficient and the current capacity/usage.
87-
ValidateAllocation(ctx context.Context, vcpus int, memoryBytes int64, networkDownloadBps int64, networkUploadBps int64, diskIOBps int64, needsGPU bool) error
87+
ValidateAllocation(ctx context.Context, vcpus int, memoryBytes int64, networkDownloadBps int64, networkUploadBps int64, diskIOBps int64, diskBytes int64, needsGPU bool) error
88+
// ReserveAllocation tentatively reserves resources for an in-flight operation.
89+
// Call FinishAllocation once the operation fails or once its resources become visible to regular resource accounting.
90+
ReserveAllocation(ctx context.Context, instanceID string, vcpus int, memoryBytes int64, networkDownloadBps int64, networkUploadBps int64, diskIOBps int64, diskBytes int64, needsGPU bool) error
91+
// FinishAllocation removes any pending reservation for the given instance ID.
92+
FinishAllocation(instanceID string)
8893
}
8994

9095
type manager struct {

lib/instances/restore.go

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -59,13 +59,21 @@ func (m *manager) restoreInstance(
5959
}
6060

6161
// 2b. Reserve aggregate resources before allocating them (if configured)
62+
reservedResources := false
6263
if m.resourceValidator != nil {
6364
needsGPU := stored.GPUProfile != ""
6465
totalMemory := stored.Size + stored.HotplugSize
65-
if err := m.resourceValidator.ValidateAllocation(ctx, stored.Vcpus, totalMemory, stored.NetworkBandwidthDownload, stored.NetworkBandwidthUpload, stored.DiskIOBps, needsGPU); err != nil {
66-
log.ErrorContext(ctx, "resource validation failed for restore", "instance_id", id, "error", err)
66+
diskBytes := storedDiskReservationBytes(stored)
67+
if err := m.resourceValidator.ReserveAllocation(ctx, id, stored.Vcpus, totalMemory, stored.NetworkBandwidthDownload, stored.NetworkBandwidthUpload, stored.DiskIOBps, diskBytes, needsGPU); err != nil {
68+
log.ErrorContext(ctx, "resource reservation failed for restore", "instance_id", id, "error", err)
6769
return nil, fmt.Errorf("%w: %v", ErrInsufficientResources, err)
6870
}
71+
reservedResources = true
72+
defer func() {
73+
if reservedResources {
74+
m.resourceValidator.FinishAllocation(id)
75+
}
76+
}()
6977
}
7078

7179
// 3. Get snapshot directory
@@ -253,6 +261,10 @@ func (m *manager) restoreInstance(
253261
return nil, fmt.Errorf("resume vm failed: %w", err)
254262
}
255263
resumeSpanEnd(nil)
264+
if reservedResources {
265+
m.resourceValidator.FinishAllocation(id)
266+
reservedResources = false
267+
}
256268

257269
// Forked standby restores may allocate a fresh identity while the guest memory snapshot
258270
// still has the source VM's old IP configuration. Reconfigure guest networking after

lib/instances/start.go

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -61,13 +61,21 @@ func (m *manager) startInstance(
6161
}
6262

6363
// 2b. Reserve aggregate resources before allocating them (if configured)
64+
reservedResources := false
6465
if m.resourceValidator != nil {
6566
needsGPU := stored.GPUProfile != ""
6667
totalMemory := stored.Size + stored.HotplugSize
67-
if err := m.resourceValidator.ValidateAllocation(ctx, stored.Vcpus, totalMemory, stored.NetworkBandwidthDownload, stored.NetworkBandwidthUpload, stored.DiskIOBps, needsGPU); err != nil {
68-
log.ErrorContext(ctx, "resource validation failed for start", "instance_id", id, "error", err)
68+
diskBytes := storedDiskReservationBytes(stored)
69+
if err := m.resourceValidator.ReserveAllocation(ctx, id, stored.Vcpus, totalMemory, stored.NetworkBandwidthDownload, stored.NetworkBandwidthUpload, stored.DiskIOBps, diskBytes, needsGPU); err != nil {
70+
log.ErrorContext(ctx, "resource reservation failed for start", "instance_id", id, "error", err)
6971
return nil, fmt.Errorf("%w: %v", ErrInsufficientResources, err)
7072
}
73+
reservedResources = true
74+
defer func() {
75+
if reservedResources {
76+
m.resourceValidator.FinishAllocation(id)
77+
}
78+
}()
7179
}
7280

7381
// 3. Get image info (needed for buildHypervisorConfig)
@@ -188,6 +196,10 @@ func (m *manager) startInstance(
188196
return nil, err
189197
}
190198
startVMSpanEnd(nil)
199+
if reservedResources {
200+
m.resourceValidator.FinishAllocation(id)
201+
reservedResources = false
202+
}
191203

192204
// Success - release cleanup stack (prevent cleanup)
193205
cu.Release()

0 commit comments

Comments
 (0)