Skip to content

Commit 7e1691b

Browse files
committed
feat(backup): anchor incremental chain on VM active_checkpoint_id
Per @abh1sar's review at NASBackupProvider.java:251, the "find the latest backup with a bitmap" heuristic in decideChain breaks after a restore — the restored disk image has no QEMU dirty-bitmap, so taking the last-backed-up row as parent for the next incremental produces a chain whose first link references blocks that no longer exist. Track the active checkpoint per-VM in vm_instance_details under the new nas.active_checkpoint_id key: * takeBackup writes it after every successful backup (cleared on the stopped-VM offline path where no bitmap is created). * restoreVMFromBackup / restoreBackupToVM clear it on success, so the next backup is automatically a full and starts a fresh chain. * decideChain reads it first: null => full; non-null => find the matching bitmap_name on a BackedUp backup row; if none, fall back to full. The new VM_ACTIVE_CHECKPOINT_ID key lives next to the existing chain keys in NASBackupChainKeys (it's a vm_instance_details detail, not a backup detail, so no schema migration is needed).
1 parent 72f967a commit 7e1691b

3 files changed

Lines changed: 199 additions & 29 deletions

File tree

plugins/backup/nas/src/main/java/org/apache/cloudstack/backup/NASBackupChainKeys.java

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,15 @@ public final class NASBackupChainKeys {
4545
/** Set to the bitmap name when this incremental had to recreate its parent bitmap on the host (informational; this incremental is larger than usual). */
4646
public static final String BITMAP_RECREATED = "nas.bitmap_recreated";
4747

48+
/**
49+
* VM-scoped detail (stored in {@code vm_instance_details}) holding the QEMU dirty-bitmap
50+
* name that currently exists on the running VM and is therefore the only valid parent
51+
* for the next incremental backup. Written by {@link #BITMAP_NAME} on each successful
52+
* backup; cleared on restore (the restored disk image has no bitmap, so the next backup
53+
* must be a fresh full). When the VM has no detail, {@code decideChain} forces full.
54+
*/
55+
public static final String VM_ACTIVE_CHECKPOINT_ID = "nas.active_checkpoint_id";
56+
4857
private NASBackupChainKeys() {
4958
}
5059
}

plugins/backup/nas/src/main/java/org/apache/cloudstack/backup/NASBackupProvider.java

Lines changed: 110 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -38,8 +38,10 @@
3838
import com.cloud.utils.Pair;
3939
import com.cloud.utils.component.AdapterBase;
4040
import com.cloud.utils.exception.CloudRuntimeException;
41+
import com.cloud.vm.VMInstanceDetailVO;
4142
import com.cloud.vm.VirtualMachine;
4243
import com.cloud.vm.dao.VMInstanceDao;
44+
import com.cloud.vm.dao.VMInstanceDetailsDao;
4345
import com.cloud.vm.snapshot.VMSnapshot;
4446
import com.cloud.vm.snapshot.VMSnapshotDetailsVO;
4547
import com.cloud.vm.snapshot.VMSnapshotVO;
@@ -117,6 +119,9 @@ public class NASBackupProvider extends AdapterBase implements BackupProvider, Co
117119
@Inject
118120
private VMInstanceDao vmInstanceDao;
119121

122+
@Inject
123+
private VMInstanceDetailsDao vmInstanceDetailsDao;
124+
120125
@Inject
121126
private PrimaryDataStoreDao primaryDataStoreDao;
122127

@@ -226,6 +231,12 @@ boolean isIncremental() {
226231
* appended to the existing chain. Stopped VMs are always full (libvirt {@code backup-begin}
227232
* requires a running QEMU process). The {@code nas.backup.full.every} ConfigKey controls
228233
* how many backups (full + incrementals) form one chain before a new full is forced.
234+
*
235+
* <p>The decision is anchored on the VM's {@code nas.active_checkpoint_id} detail, which
236+
* records the bitmap that currently exists on the running QEMU. After a restore that
237+
* detail is cleared, so the next backup is automatically full — even though there may be
238+
* a more recent "last backup taken" row in the database. This matches the prescription in
239+
* the PR review (avoid relying on "last backup" because that breaks after restore).</p>
229240
*/
230241
protected ChainDecision decideChain(VirtualMachine vm) {
231242
final String newBitmap = "backup-" + System.currentTimeMillis() / 1000L;
@@ -242,38 +253,29 @@ protected ChainDecision decideChain(VirtualMachine vm) {
242253
return ChainDecision.fullStart(newBitmap);
243254
}
244255

245-
// Walk this VM's backups newest→oldest, find the most recent BackedUp backup that has a
246-
// bitmap stored. If we don't find one, this is the first backup in a chain — start full.
247-
List<Backup> history = backupDao.listByVmId(vm.getDataCenterId(), vm.getId());
248-
if (history == null || history.isEmpty()) {
256+
// 1. If the VM has no active_checkpoint_id, there is no bitmap on the host to use as
257+
// a parent. This is the case after restore (we clear it), after VM was just assigned
258+
// to the offering, or on the very first backup.
259+
String activeCheckpoint = readVmActiveCheckpoint(vm.getId());
260+
if (activeCheckpoint == null) {
249261
return ChainDecision.fullStart(newBitmap);
250262
}
251-
history.sort(Comparator.comparing(Backup::getDate).reversed());
252263

253-
Backup parent = null;
254-
String parentBitmap = null;
255-
String parentChainId = null;
256-
int parentChainPosition = -1;
257-
for (Backup b : history) {
258-
if (!Backup.Status.BackedUp.equals(b.getStatus())) {
259-
continue;
260-
}
261-
String bm = readDetail(b, NASBackupChainKeys.BITMAP_NAME);
262-
if (bm == null) {
263-
continue;
264-
}
265-
parent = b;
266-
parentBitmap = bm;
267-
parentChainId = readDetail(b, NASBackupChainKeys.CHAIN_ID);
268-
String posStr = readDetail(b, NASBackupChainKeys.CHAIN_POSITION);
269-
try {
270-
parentChainPosition = posStr == null ? 0 : Integer.parseInt(posStr);
271-
} catch (NumberFormatException e) {
272-
parentChainPosition = 0;
273-
}
274-
break;
264+
// 2. Find the latest BackedUp backup of this VM whose BITMAP_NAME matches the VM's
265+
// active_checkpoint_id. Only that backup is a safe parent — any earlier backup
266+
// would have a bitmap that QEMU may have rotated out. Per the review:
267+
// "The latest backup should have the bitmap_name equal to the VM's
268+
// active_checkpoint_id which will become the parent backup. If not, force full."
269+
Backup parent = findLatestBackedUpBackupWithBitmap(vm.getId(), activeCheckpoint);
270+
if (parent == null) {
271+
LOG.debug("VM {} has active_checkpoint_id={} but no matching backup found — forcing full",
272+
vm.getInstanceName(), activeCheckpoint);
273+
return ChainDecision.fullStart(newBitmap);
275274
}
276-
if (parent == null || parentBitmap == null || parentChainId == null) {
275+
276+
String parentChainId = readDetail(parent, NASBackupChainKeys.CHAIN_ID);
277+
int parentChainPosition = chainPosition(parent);
278+
if (parentChainId == null || parentChainPosition == Integer.MAX_VALUE) {
277279
return ChainDecision.fullStart(newBitmap);
278280
}
279281

@@ -286,10 +288,44 @@ protected ChainDecision decideChain(VirtualMachine vm) {
286288
// qcow2 onto it. The path is stored relative to the NAS mount point — the script
287289
// resolves it inside its mount session.
288290
String parentPath = composeParentBackupPath(parent);
289-
return ChainDecision.incremental(newBitmap, parentBitmap, parentPath,
291+
return ChainDecision.incremental(newBitmap, activeCheckpoint, parentPath,
290292
parentChainId, parentChainPosition + 1);
291293
}
292294

295+
/**
296+
* Read the {@code nas.active_checkpoint_id} VM detail. Returns {@code null} when no detail
297+
* exists (post-restore, first backup, or after explicit reset).
298+
*/
299+
private String readVmActiveCheckpoint(long vmId) {
300+
VMInstanceDetailVO d = vmInstanceDetailsDao.findDetail(vmId, NASBackupChainKeys.VM_ACTIVE_CHECKPOINT_ID);
301+
if (d == null) {
302+
return null;
303+
}
304+
String v = d.getValue();
305+
return (v == null || v.isEmpty()) ? null : v;
306+
}
307+
308+
/**
309+
* Locate the most-recent {@code BackedUp} backup for {@code vmId} whose bitmap name equals
310+
* {@code bitmapName}. Used by {@link #decideChain} to anchor the next incremental.
311+
*/
312+
private Backup findLatestBackedUpBackupWithBitmap(long vmId, String bitmapName) {
313+
List<Backup> history = backupDao.listByVmId(null, vmId);
314+
if (history == null || history.isEmpty()) {
315+
return null;
316+
}
317+
history.sort(Comparator.comparing(Backup::getDate).reversed());
318+
for (Backup b : history) {
319+
if (!Backup.Status.BackedUp.equals(b.getStatus())) {
320+
continue;
321+
}
322+
if (bitmapName.equals(readDetail(b, NASBackupChainKeys.BITMAP_NAME))) {
323+
return b;
324+
}
325+
}
326+
return null;
327+
}
328+
293329
private String readDetail(Backup backup, String key) {
294330
BackupDetailVO d = backupDetailsDao.findDetail(backup.getId(), key);
295331
return d == null ? null : d.getValue();
@@ -331,6 +367,34 @@ private void persistChainMetadata(Backup backup, ChainDecision decision, String
331367
}
332368
}
333369

370+
/**
371+
* Upsert the VM's {@code nas.active_checkpoint_id} detail to {@code bitmapName}. Called
372+
* after every successful backup so the next backup's parent-bitmap decision is anchored
373+
* on what actually exists on QEMU, not on "last backup taken".
374+
*/
375+
private void upsertVmActiveCheckpoint(long vmId, String bitmapName) {
376+
VMInstanceDetailVO existing = vmInstanceDetailsDao.findDetail(vmId, NASBackupChainKeys.VM_ACTIVE_CHECKPOINT_ID);
377+
if (existing == null) {
378+
vmInstanceDetailsDao.addDetail(vmId, NASBackupChainKeys.VM_ACTIVE_CHECKPOINT_ID, bitmapName, false);
379+
return;
380+
}
381+
existing.setValue(bitmapName);
382+
vmInstanceDetailsDao.update(existing.getId(), existing);
383+
}
384+
385+
/**
386+
* Remove the VM's {@code nas.active_checkpoint_id} detail. Called from the restore paths:
387+
* after restore the disk image has no QEMU bitmap attached, so any future incremental
388+
* would be based on stale state. Clearing forces the next backup to be a fresh full.
389+
*/
390+
private void clearVmActiveCheckpoint(long vmId) {
391+
VMInstanceDetailVO existing = vmInstanceDetailsDao.findDetail(vmId, NASBackupChainKeys.VM_ACTIVE_CHECKPOINT_ID);
392+
if (existing != null) {
393+
vmInstanceDetailsDao.removeDetail(vmId, NASBackupChainKeys.VM_ACTIVE_CHECKPOINT_ID);
394+
LOG.debug("Cleared nas.active_checkpoint_id for VM id={} (was {})", vmId, existing.getValue());
395+
}
396+
}
397+
334398
private String lookupParentBackupUuid(long vmId, String parentBitmap) {
335399
if (parentBitmap == null) {
336400
return null;
@@ -440,6 +504,18 @@ public Pair<Boolean, Backup> takeBackup(final VirtualMachine vm, Boolean quiesce
440504
logger.info("NAS incremental for VM {} recreated parent bitmap {} (likely VM was restarted since last backup)",
441505
vm.getInstanceName(), answer.getBitmapRecreated());
442506
}
507+
// Pin the VM's active_checkpoint_id to whichever bitmap the agent actually
508+
// created. This is the only valid parent for the next incremental (see
509+
// decideChain). For the stopped-VM offline path BITMAP_CREATED is null —
510+
// no bitmap exists on the host, so we also clear any stale detail from a
511+
// prior online backup. Either way, after this step the detail accurately
512+
// reflects what's on the running QEMU (or absence thereof).
513+
String confirmedBitmap = answer.getBitmapCreated();
514+
if (confirmedBitmap != null) {
515+
upsertVmActiveCheckpoint(vm.getId(), confirmedBitmap);
516+
} else {
517+
clearVmActiveCheckpoint(vm.getId());
518+
}
443519
return new Pair<>(true, backupVO);
444520
} else {
445521
throw new CloudRuntimeException("Failed to update backup");
@@ -531,6 +607,11 @@ private Pair<Boolean, String> restoreVMBackup(VirtualMachine vm, Backup backup)
531607
} catch (OperationTimedoutException e) {
532608
throw new CloudRuntimeException("Operation to restore backup timed out, please try again");
533609
}
610+
// After a restore the QEMU dirty-bitmap chain is gone — clear active_checkpoint_id so
611+
// the next backup is taken as a fresh full and starts a new chain. See decideChain.
612+
if (answer != null && answer.getResult()) {
613+
clearVmActiveCheckpoint(vm.getId());
614+
}
534615
return new Pair<>(answer.getResult(), answer.getDetails());
535616
}
536617

plugins/backup/nas/src/test/java/org/apache/cloudstack/backup/NASBackupProviderTest.java

Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,8 +48,10 @@
4848
import com.cloud.storage.VolumeVO;
4949
import com.cloud.storage.dao.VolumeDao;
5050
import com.cloud.utils.Pair;
51+
import com.cloud.vm.VMInstanceDetailVO;
5152
import com.cloud.vm.VMInstanceVO;
5253
import com.cloud.vm.dao.VMInstanceDao;
54+
import com.cloud.vm.dao.VMInstanceDetailsDao;
5355

5456
import org.apache.cloudstack.backup.dao.BackupDao;
5557
import org.apache.cloudstack.backup.dao.BackupDetailsDao;
@@ -100,6 +102,9 @@ public class NASBackupProviderTest {
100102
@Mock
101103
private BackupDetailsDao backupDetailsDao;
102104

105+
@Mock
106+
private VMInstanceDetailsDao vmInstanceDetailsDao;
107+
103108
@Test
104109
public void testDeleteBackup() throws OperationTimedoutException, AgentUnavailableException {
105110
Long hostId = 1L;
@@ -357,4 +362,79 @@ public void testGetVMHypervisorHostFallbackToZoneWideKVMHost() {
357362
Mockito.verify(hostDao).findHypervisorHostInCluster(clusterId);
358363
Mockito.verify(resourceManager).findOneRandomRunningHostByHypervisor(Hypervisor.HypervisorType.KVM, zoneId);
359364
}
365+
366+
// -- decideChain anchored on VM's active_checkpoint_id -------------------------------
367+
368+
/**
369+
* No active_checkpoint_id on the VM (post-restore, first-ever backup, or detail purged) =>
370+
* decideChain must return a fresh full. This is the regression abh1sar called out at
371+
* NASBackupProvider.java:251 ("This logic breaks after restore. We can't use the last
372+
* backup taken as parent in that case.").
373+
*/
374+
@Test
375+
public void decideChainReturnsFullWhenVmHasNoActiveCheckpoint() {
376+
Long zoneId = 1L;
377+
Long vmId = 42L;
378+
VMInstanceVO vm = mock(VMInstanceVO.class);
379+
Mockito.when(vm.getId()).thenReturn(vmId);
380+
Mockito.when(vm.getDataCenterId()).thenReturn(zoneId);
381+
Mockito.when(vm.getState()).thenReturn(VMInstanceVO.State.Running);
382+
383+
Mockito.when(vmInstanceDetailsDao.findDetail(vmId, NASBackupChainKeys.VM_ACTIVE_CHECKPOINT_ID)).thenReturn(null);
384+
385+
NASBackupProvider.ChainDecision decision = nasBackupProvider.decideChain(vm);
386+
Assert.assertNotNull(decision);
387+
Assert.assertEquals(NASBackupChainKeys.TYPE_FULL, decision.mode);
388+
Assert.assertNull(decision.bitmapParent);
389+
Assert.assertEquals(0, decision.chainPosition);
390+
}
391+
392+
/**
393+
* After a successful restoreVMFromBackup, decideChain on the next backup must produce
394+
* a full. We verify by checking that vmInstanceDetailsDao.removeDetail is called with
395+
* the active_checkpoint_id key.
396+
*/
397+
@Test
398+
public void restoreClearsActiveCheckpointDetail() throws AgentUnavailableException, OperationTimedoutException {
399+
Long vmId = 7L;
400+
Long hostId = 8L;
401+
Long backupOfferingId = 9L;
402+
403+
VMInstanceVO vm = mock(VMInstanceVO.class);
404+
Mockito.when(vm.getId()).thenReturn(vmId);
405+
Mockito.when(vm.getLastHostId()).thenReturn(hostId);
406+
Mockito.when(vm.getRemoved()).thenReturn(null);
407+
Mockito.when(vm.getName()).thenReturn("vm7");
408+
409+
HostVO host = mock(HostVO.class);
410+
Mockito.when(host.getStatus()).thenReturn(Status.Up);
411+
Mockito.when(host.getId()).thenReturn(hostId);
412+
Mockito.when(hostDao.findById(hostId)).thenReturn(host);
413+
414+
BackupVO backup = new BackupVO();
415+
backup.setVmId(vmId);
416+
backup.setBackupOfferingId(backupOfferingId);
417+
backup.setExternalId("i-2-7-VM/2026.05.16.10.00.00");
418+
ReflectionTestUtils.setField(backup, "id", 100L);
419+
// backedUpVolumes defaults to null => BackupVO.getBackedUpVolumes returns emptyList().
420+
421+
BackupRepositoryVO repo = new BackupRepositoryVO(1L, "nas", "test-repo",
422+
"nfs", "address", "sync", 1024L, null);
423+
Mockito.when(backupRepositoryDao.findByBackupOfferingId(backupOfferingId)).thenReturn(repo);
424+
425+
Mockito.when(volumeDao.findByInstance(vmId)).thenReturn(Collections.emptyList());
426+
427+
BackupAnswer answer = mock(BackupAnswer.class);
428+
Mockito.when(answer.getResult()).thenReturn(true);
429+
Mockito.when(agentManager.send(Mockito.anyLong(), Mockito.any(RestoreBackupCommand.class))).thenReturn(answer);
430+
431+
// Pre-existing checkpoint detail so removeDetail has something to "clear".
432+
VMInstanceDetailVO existing = mock(VMInstanceDetailVO.class);
433+
Mockito.when(existing.getValue()).thenReturn("backup-1715000000");
434+
Mockito.when(vmInstanceDetailsDao.findDetail(vmId, NASBackupChainKeys.VM_ACTIVE_CHECKPOINT_ID)).thenReturn(existing);
435+
436+
boolean ok = nasBackupProvider.restoreVMFromBackup(vm, backup);
437+
Assert.assertTrue(ok);
438+
Mockito.verify(vmInstanceDetailsDao).removeDetail(vmId, NASBackupChainKeys.VM_ACTIVE_CHECKPOINT_ID);
439+
}
360440
}

0 commit comments

Comments
 (0)