3838import com .cloud .utils .Pair ;
3939import com .cloud .utils .component .AdapterBase ;
4040import com .cloud .utils .exception .CloudRuntimeException ;
41+ import com .cloud .vm .VMInstanceDetailVO ;
4142import com .cloud .vm .VirtualMachine ;
4243import com .cloud .vm .dao .VMInstanceDao ;
44+ import com .cloud .vm .dao .VMInstanceDetailsDao ;
4345import com .cloud .vm .snapshot .VMSnapshot ;
4446import com .cloud .vm .snapshot .VMSnapshotDetailsVO ;
4547import com .cloud .vm .snapshot .VMSnapshotVO ;
@@ -117,6 +119,9 @@ public class NASBackupProvider extends AdapterBase implements BackupProvider, Co
117119 @ Inject
118120 private VMInstanceDao vmInstanceDao ;
119121
122+ @ Inject
123+ private VMInstanceDetailsDao vmInstanceDetailsDao ;
124+
120125 @ Inject
121126 private PrimaryDataStoreDao primaryDataStoreDao ;
122127
@@ -226,6 +231,12 @@ boolean isIncremental() {
226231 * appended to the existing chain. Stopped VMs are always full (libvirt {@code backup-begin}
227232 * requires a running QEMU process). The {@code nas.backup.full.every} ConfigKey controls
228233 * how many backups (full + incrementals) form one chain before a new full is forced.
234+ *
235+ * <p>The decision is anchored on the VM's {@code nas.active_checkpoint_id} detail, which
236+ * records the bitmap that currently exists on the running QEMU. After a restore that
237+ * detail is cleared, so the next backup is automatically full — even though there may be
238+ * a more recent "last backup taken" row in the database. This matches the prescription in
239+ * the PR review (avoid relying on "last backup" because that breaks after restore).</p>
229240 */
230241 protected ChainDecision decideChain (VirtualMachine vm ) {
231242 final String newBitmap = "backup-" + System .currentTimeMillis () / 1000L ;
@@ -242,38 +253,29 @@ protected ChainDecision decideChain(VirtualMachine vm) {
242253 return ChainDecision .fullStart (newBitmap );
243254 }
244255
245- // Walk this VM's backups newest→oldest, find the most recent BackedUp backup that has a
246- // bitmap stored. If we don't find one, this is the first backup in a chain — start full.
247- List <Backup > history = backupDao .listByVmId (vm .getDataCenterId (), vm .getId ());
248- if (history == null || history .isEmpty ()) {
256+ // 1. If the VM has no active_checkpoint_id, there is no bitmap on the host to use as
257+ // a parent. This is the case after restore (we clear it), after VM was just assigned
258+ // to the offering, or on the very first backup.
259+ String activeCheckpoint = readVmActiveCheckpoint (vm .getId ());
260+ if (activeCheckpoint == null ) {
249261 return ChainDecision .fullStart (newBitmap );
250262 }
251- history .sort (Comparator .comparing (Backup ::getDate ).reversed ());
252263
253- Backup parent = null ;
254- String parentBitmap = null ;
255- String parentChainId = null ;
256- int parentChainPosition = -1 ;
257- for (Backup b : history ) {
258- if (!Backup .Status .BackedUp .equals (b .getStatus ())) {
259- continue ;
260- }
261- String bm = readDetail (b , NASBackupChainKeys .BITMAP_NAME );
262- if (bm == null ) {
263- continue ;
264- }
265- parent = b ;
266- parentBitmap = bm ;
267- parentChainId = readDetail (b , NASBackupChainKeys .CHAIN_ID );
268- String posStr = readDetail (b , NASBackupChainKeys .CHAIN_POSITION );
269- try {
270- parentChainPosition = posStr == null ? 0 : Integer .parseInt (posStr );
271- } catch (NumberFormatException e ) {
272- parentChainPosition = 0 ;
273- }
274- break ;
264+ // 2. Find the latest BackedUp backup of this VM whose BITMAP_NAME matches the VM's
265+ // active_checkpoint_id. Only that backup is a safe parent — any earlier backup
266+ // would have a bitmap that QEMU may have rotated out. Per the review:
267+ // "The latest backup should have the bitmap_name equal to the VM's
268+ // active_checkpoint_id which will become the parent backup. If not, force full."
269+ Backup parent = findLatestBackedUpBackupWithBitmap (vm .getId (), activeCheckpoint );
270+ if (parent == null ) {
271+ LOG .debug ("VM {} has active_checkpoint_id={} but no matching backup found — forcing full" ,
272+ vm .getInstanceName (), activeCheckpoint );
273+ return ChainDecision .fullStart (newBitmap );
275274 }
276- if (parent == null || parentBitmap == null || parentChainId == null ) {
275+
276+ String parentChainId = readDetail (parent , NASBackupChainKeys .CHAIN_ID );
277+ int parentChainPosition = chainPosition (parent );
278+ if (parentChainId == null || parentChainPosition == Integer .MAX_VALUE ) {
277279 return ChainDecision .fullStart (newBitmap );
278280 }
279281
@@ -286,10 +288,44 @@ protected ChainDecision decideChain(VirtualMachine vm) {
286288 // qcow2 onto it. The path is stored relative to the NAS mount point — the script
287289 // resolves it inside its mount session.
288290 String parentPath = composeParentBackupPath (parent );
289- return ChainDecision .incremental (newBitmap , parentBitmap , parentPath ,
291+ return ChainDecision .incremental (newBitmap , activeCheckpoint , parentPath ,
290292 parentChainId , parentChainPosition + 1 );
291293 }
292294
295+ /**
296+ * Read the {@code nas.active_checkpoint_id} VM detail. Returns {@code null} when no detail
297+ * exists (post-restore, first backup, or after explicit reset).
298+ */
299+ private String readVmActiveCheckpoint (long vmId ) {
300+ VMInstanceDetailVO d = vmInstanceDetailsDao .findDetail (vmId , NASBackupChainKeys .VM_ACTIVE_CHECKPOINT_ID );
301+ if (d == null ) {
302+ return null ;
303+ }
304+ String v = d .getValue ();
305+ return (v == null || v .isEmpty ()) ? null : v ;
306+ }
307+
308+ /**
309+ * Locate the most-recent {@code BackedUp} backup for {@code vmId} whose bitmap name equals
310+ * {@code bitmapName}. Used by {@link #decideChain} to anchor the next incremental.
311+ */
312+ private Backup findLatestBackedUpBackupWithBitmap (long vmId , String bitmapName ) {
313+ List <Backup > history = backupDao .listByVmId (null , vmId );
314+ if (history == null || history .isEmpty ()) {
315+ return null ;
316+ }
317+ history .sort (Comparator .comparing (Backup ::getDate ).reversed ());
318+ for (Backup b : history ) {
319+ if (!Backup .Status .BackedUp .equals (b .getStatus ())) {
320+ continue ;
321+ }
322+ if (bitmapName .equals (readDetail (b , NASBackupChainKeys .BITMAP_NAME ))) {
323+ return b ;
324+ }
325+ }
326+ return null ;
327+ }
328+
293329 private String readDetail (Backup backup , String key ) {
294330 BackupDetailVO d = backupDetailsDao .findDetail (backup .getId (), key );
295331 return d == null ? null : d .getValue ();
@@ -331,6 +367,34 @@ private void persistChainMetadata(Backup backup, ChainDecision decision, String
331367 }
332368 }
333369
370+ /**
371+ * Upsert the VM's {@code nas.active_checkpoint_id} detail to {@code bitmapName}. Called
372+ * after every successful backup so the next backup's parent-bitmap decision is anchored
373+ * on what actually exists on QEMU, not on "last backup taken".
374+ */
375+ private void upsertVmActiveCheckpoint (long vmId , String bitmapName ) {
376+ VMInstanceDetailVO existing = vmInstanceDetailsDao .findDetail (vmId , NASBackupChainKeys .VM_ACTIVE_CHECKPOINT_ID );
377+ if (existing == null ) {
378+ vmInstanceDetailsDao .addDetail (vmId , NASBackupChainKeys .VM_ACTIVE_CHECKPOINT_ID , bitmapName , false );
379+ return ;
380+ }
381+ existing .setValue (bitmapName );
382+ vmInstanceDetailsDao .update (existing .getId (), existing );
383+ }
384+
385+ /**
386+ * Remove the VM's {@code nas.active_checkpoint_id} detail. Called from the restore paths:
387+ * after restore the disk image has no QEMU bitmap attached, so any future incremental
388+ * would be based on stale state. Clearing forces the next backup to be a fresh full.
389+ */
390+ private void clearVmActiveCheckpoint (long vmId ) {
391+ VMInstanceDetailVO existing = vmInstanceDetailsDao .findDetail (vmId , NASBackupChainKeys .VM_ACTIVE_CHECKPOINT_ID );
392+ if (existing != null ) {
393+ vmInstanceDetailsDao .removeDetail (vmId , NASBackupChainKeys .VM_ACTIVE_CHECKPOINT_ID );
394+ LOG .debug ("Cleared nas.active_checkpoint_id for VM id={} (was {})" , vmId , existing .getValue ());
395+ }
396+ }
397+
334398 private String lookupParentBackupUuid (long vmId , String parentBitmap ) {
335399 if (parentBitmap == null ) {
336400 return null ;
@@ -440,6 +504,18 @@ public Pair<Boolean, Backup> takeBackup(final VirtualMachine vm, Boolean quiesce
440504 logger .info ("NAS incremental for VM {} recreated parent bitmap {} (likely VM was restarted since last backup)" ,
441505 vm .getInstanceName (), answer .getBitmapRecreated ());
442506 }
507+ // Pin the VM's active_checkpoint_id to whichever bitmap the agent actually
508+ // created. This is the only valid parent for the next incremental (see
509+ // decideChain). For the stopped-VM offline path BITMAP_CREATED is null —
510+ // no bitmap exists on the host, so we also clear any stale detail from a
511+ // prior online backup. Either way, after this step the detail accurately
512+ // reflects what's on the running QEMU (or absence thereof).
513+ String confirmedBitmap = answer .getBitmapCreated ();
514+ if (confirmedBitmap != null ) {
515+ upsertVmActiveCheckpoint (vm .getId (), confirmedBitmap );
516+ } else {
517+ clearVmActiveCheckpoint (vm .getId ());
518+ }
443519 return new Pair <>(true , backupVO );
444520 } else {
445521 throw new CloudRuntimeException ("Failed to update backup" );
@@ -531,6 +607,11 @@ private Pair<Boolean, String> restoreVMBackup(VirtualMachine vm, Backup backup)
531607 } catch (OperationTimedoutException e ) {
532608 throw new CloudRuntimeException ("Operation to restore backup timed out, please try again" );
533609 }
610+ // After a restore the QEMU dirty-bitmap chain is gone — clear active_checkpoint_id so
611+ // the next backup is taken as a fresh full and starts a new chain. See decideChain.
612+ if (answer != null && answer .getResult ()) {
613+ clearVmActiveCheckpoint (vm .getId ());
614+ }
534615 return new Pair <>(answer .getResult (), answer .getDetails ());
535616 }
536617
0 commit comments