1212import org .zstack .core .thread .ChainTask ;
1313import org .zstack .core .thread .SyncTaskChain ;
1414import org .zstack .core .thread .ThreadFacade ;
15+ import org .zstack .core .thread .PeriodicTask ;
1516import org .zstack .header .Component ;
1617import org .zstack .header .apimediator .ApiMediatorConstant ;
1718import org .zstack .header .core .Completion ;
3940
4041import java .util .*;
4142import java .util .concurrent .ConcurrentHashMap ;
43+ import java .util .concurrent .TimeUnit ;
4244
4345import static org .zstack .core .Platform .getReflections ;
4446import static org .zstack .core .Platform .operr ;
@@ -65,6 +67,9 @@ public class KvmVmSyncPingTask extends VmTracer implements KVMPingAgentNoFailure
6567 // A map from apiId to VM instance uuid
6668 private ConcurrentHashMap <String , ConcurrentHashMap <String , String >> vmApis = new ConcurrentHashMap <>();
6769 private ConcurrentHashMap <String , Set <String >> vmsToSkip = new ConcurrentHashMap <>();
70+ // Orphaned skip list: VMs whose management node left but might still be starting
71+ // Map: vmUuid -> timestamp when orphaned
72+ private ConcurrentHashMap <String , Long > orphanedVmsToSkip = new ConcurrentHashMap <>();
6873 private List <Class <? extends Message >> skipVmTracerMessages = new ArrayList <>();
6974 private List <Class > skipVmTracerReplies = new ArrayList <>();
7075 private Map <String , Integer > vmInShutdownMap = new ConcurrentHashMap <>();
@@ -142,12 +147,16 @@ protected void run(Map tokens, Object data) {
142147 String vmUuid = vmApis .get (data1 .getManagementNodeId ()).remove (data1 .getApiId ());
143148 logger .info ("Continuing tracing VM: " + vmUuid );
144149 vmsToSkip .get (data1 .getManagementNodeId ()).remove (vmUuid );
150+ // Also remove from orphaned list if present
151+ orphanedVmsToSkip .remove (vmUuid );
145152 return ;
146153 }
147154
148155 if (data1 .getVmUuid () != null ) {
149156 logger .info ("Continuing tracing VM: " + data1 .getVmUuid ());
150157 vmsToSkip .get (data1 .getManagementNodeId ()).remove (data1 .getVmUuid ());
158+ // Also remove from orphaned list if present
159+ orphanedVmsToSkip .remove (data1 .getVmUuid ());
151160 }
152161 }
153162 });
@@ -307,8 +316,45 @@ public HypervisorType getHypervisorTypeForReestablishExtensionPoint() {
307316 return HypervisorType .valueOf (KVMConstant .KVM_HYPERVISOR_TYPE );
308317 }
309318
319+ private void startOrphanedVmCleanupTask () {
320+ thdf .submitPeriodicTask (new PeriodicTask () {
321+ @ Override
322+ public TimeUnit getTimeUnit () {
323+ return TimeUnit .SECONDS ;
324+ }
325+
326+ @ Override
327+ public long getInterval () {
328+ return 60 ; // Check every 60 seconds
329+ }
330+
331+ @ Override
332+ public String getName () {
333+ return "orphaned-vm-skip-list-cleanup" ;
334+ }
335+
336+ @ Override
337+ public void run () {
338+ long now = System .currentTimeMillis ();
339+ long timeoutMs = KVMGlobalConfig .ORPHANED_VM_SKIP_TIMEOUT .value (Long .class ) * 1000 ;
340+
341+ orphanedVmsToSkip .entrySet ().removeIf (entry -> {
342+ long age = now - entry .getValue ();
343+ if (age > timeoutMs ) {
344+ logger .info (String .format ("Removing VM[uuid:%s] from orphaned skip list after %d seconds" ,
345+ entry .getKey (), age / 1000 ));
346+ return true ;
347+ }
348+ return false ;
349+ });
350+ }
351+ });
352+ }
353+
310354 @ Override
311355 public boolean start () {
356+ startOrphanedVmCleanupTask ();
357+
312358 restf .registerSyncHttpCallHandler (KVMConstant .KVM_REPORT_VM_STATE , ReportVmStateCmd .class , new SyncHttpCallHandler <ReportVmStateCmd >() {
313359 private void reportState (final ReportVmStateCmd cmd ) {
314360 thdf .chainSubmit (new ChainTask (null ) {
@@ -446,7 +492,18 @@ public void nodeJoin(ManagementNodeInventory inv) {
446492 @ Override
447493 public void nodeLeft (ManagementNodeInventory inv ) {
448494 vmApis .remove (inv .getUuid ());
449- vmsToSkip .remove (inv .getUuid ());
495+
496+ // Don't immediately remove skip list - move to orphaned set with timestamp
497+ // This prevents split-brain when MN restarts and VMs are still starting on kvmagent
498+ Set <String > vmsToOrphan = vmsToSkip .remove (inv .getUuid ());
499+ if (vmsToOrphan != null && !vmsToOrphan .isEmpty ()) {
500+ long now = System .currentTimeMillis ();
501+ for (String vmUuid : vmsToOrphan ) {
502+ orphanedVmsToSkip .put (vmUuid , now );
503+ logger .info (String .format ("VM[uuid:%s] moved to orphaned skip list due to management node[uuid:%s] left" ,
504+ vmUuid , inv .getUuid ()));
505+ }
506+ }
450507 }
451508
452509 @ Override
@@ -460,6 +517,30 @@ public void iJoin(ManagementNodeInventory inv) {
460517 }
461518
462519 public boolean isVmDoNotNeedToTrace (String vmUuid ) {
463- return vmsToSkip .values ().stream ().anyMatch (vmsToSkipSet -> vmsToSkipSet .contains (vmUuid ));
520+ return vmsToSkip .values ().stream ().anyMatch (vmsToSkipSet -> vmsToSkipSet .contains (vmUuid ))
521+ || orphanedVmsToSkip .containsKey (vmUuid );
522+ }
523+
524+ protected boolean isVmOrphaned (String vmUuid ) {
525+ return orphanedVmsToSkip .containsKey (vmUuid );
526+ }
527+
528+ @ Override
529+ protected boolean shouldSkipMissingVmHandling (String vmUuid , VmInstanceState expectedState ) {
530+ // Double-check: if VM is orphaned and DB state indicates it's in a transient state,
531+ // skip this trace cycle to avoid split-brain
532+ if (!orphanedVmsToSkip .containsKey (vmUuid )) {
533+ return false ;
534+ }
535+
536+ // If the VM is orphaned and in Starting/Migrating state, it might still be
537+ // starting on kvmagent - don't judge it as Stopped yet
538+ if (expectedState == VmInstanceState .Starting || expectedState == VmInstanceState .Migrating ) {
539+ logger .info (String .format ("VM[uuid:%s] is orphaned and in state[%s], skipping missing VM check to avoid split-brain" ,
540+ vmUuid , expectedState ));
541+ return true ;
542+ }
543+
544+ return false ;
464545 }
465546}
0 commit comments