diff --git a/api/src/main/java/com/cloud/ha/Investigator.java b/api/src/main/java/com/cloud/ha/Investigator.java index 88d802a1ce44..00371d395f5a 100644 --- a/api/src/main/java/com/cloud/ha/Investigator.java +++ b/api/src/main/java/com/cloud/ha/Investigator.java @@ -26,17 +26,19 @@ public interface Investigator extends Adapter { * Returns if the vm is still alive. * * @param vm to work on. + * @return true if vm is alive, otherwise false */ - public boolean isVmAlive(VirtualMachine vm, Host host) throws UnknownVM; + boolean isVmAlive(VirtualMachine vm, Host host) throws UnknownVM; - public Status isAgentAlive(Host agent); + /** + * Returns the agent status of the host. + * + * @param host + * @return status of the host agent + */ + Status getHostAgentStatus(Host host); class UnknownVM extends Exception { - - /** - * - */ private static final long serialVersionUID = 1L; - }; } diff --git a/core/src/main/java/com/cloud/agent/api/CheckOnHostAnswer.java b/core/src/main/java/com/cloud/agent/api/CheckOnHostAnswer.java index 5a26b22ec6a5..41e266784db8 100644 --- a/core/src/main/java/com/cloud/agent/api/CheckOnHostAnswer.java +++ b/core/src/main/java/com/cloud/agent/api/CheckOnHostAnswer.java @@ -38,6 +38,8 @@ public CheckOnHostAnswer(CheckOnHostCommand cmd, Boolean alive, String details) public CheckOnHostAnswer(CheckOnHostCommand cmd, String details) { super(cmd, false, details); + determined = false; + alive = false; } public boolean isDetermined() { @@ -47,5 +49,4 @@ public boolean isDetermined() { public boolean isAlive() { return alive; } - } diff --git a/core/src/main/java/com/cloud/agent/api/CheckOnHostCommand.java b/core/src/main/java/com/cloud/agent/api/CheckOnHostCommand.java index 94239f2900ef..72b5217604dc 100644 --- a/core/src/main/java/com/cloud/agent/api/CheckOnHostCommand.java +++ b/core/src/main/java/com/cloud/agent/api/CheckOnHostCommand.java @@ -24,7 +24,7 @@ public class CheckOnHostCommand extends Command { HostTO host; - boolean reportCheckFailureIfOneStorageIsDown; + boolean reportIfHeartBeatFailedForOneStoragePool; protected CheckOnHostCommand() { } @@ -34,17 +34,17 @@ public CheckOnHostCommand(Host host) { setWait(20); } - public CheckOnHostCommand(Host host, boolean reportCheckFailureIfOneStorageIsDown) { + public CheckOnHostCommand(Host host, boolean reportIfHeartBeatFailedForOneStoragePool) { this(host); - this.reportCheckFailureIfOneStorageIsDown = reportCheckFailureIfOneStorageIsDown; + this.reportIfHeartBeatFailedForOneStoragePool = reportIfHeartBeatFailedForOneStoragePool; } public HostTO getHost() { return host; } - public boolean isCheckFailedOnOneStorage() { - return reportCheckFailureIfOneStorageIsDown; + public boolean shouldReportIfHeartBeatFailedForOneStoragePool() { + return reportIfHeartBeatFailedForOneStoragePool; } @Override diff --git a/engine/components-api/src/main/java/com/cloud/ha/HighAvailabilityManager.java b/engine/components-api/src/main/java/com/cloud/ha/HighAvailabilityManager.java index 3ae94479cea5..53bfcce27038 100644 --- a/engine/components-api/src/main/java/com/cloud/ha/HighAvailabilityManager.java +++ b/engine/components-api/src/main/java/com/cloud/ha/HighAvailabilityManager.java @@ -75,10 +75,10 @@ public interface HighAvailabilityManager extends Manager { + " which are registered for the HA event that were successful and are now ready to be purged.", true, Cluster); - public static final ConfigKey KvmHAFenceHostIfHeartbeatFailsOnStorage = new ConfigKey<>("Advanced", Boolean.class, "kvm.ha.fence.on.storage.heartbeat.failure", "false", + ConfigKey KvmHAFenceHostIfHeartbeatFailsOnStorage = new ConfigKey<>("Advanced", Boolean.class, "kvm.ha.fence.on.storage.heartbeat.failure", "false", "Proceed fencing the host even the heartbeat failed for only one storage pool", false, ConfigKey.Scope.Zone); - public enum WorkType { + enum WorkType { Migration, // Migrating VMs off of a host. Stop, // Stops a VM for storage pool migration purposes. This should be obsolete now. CheckStop, // Checks if a VM has been stopped. diff --git a/plugins/hypervisors/hyperv/src/main/java/com/cloud/ha/HypervInvestigator.java b/plugins/hypervisors/hyperv/src/main/java/com/cloud/ha/HypervInvestigator.java index 3d79b9efdd13..4e44d8cb7359 100644 --- a/plugins/hypervisors/hyperv/src/main/java/com/cloud/ha/HypervInvestigator.java +++ b/plugins/hypervisors/hyperv/src/main/java/com/cloud/ha/HypervInvestigator.java @@ -41,15 +41,15 @@ public class HypervInvestigator extends AdapterBase implements Investigator { @Override public boolean isVmAlive(com.cloud.vm.VirtualMachine vm, Host host) throws UnknownVM { - Status status = isAgentAlive(host); + Status status = getHostAgentStatus(host); if (status == null) { throw new UnknownVM(); } - return status == Status.Up ? true : null; + return status == Status.Up; } @Override - public Status isAgentAlive(Host agent) { + public Status getHostAgentStatus(Host agent) { if (agent.getHypervisorType() != Hypervisor.HypervisorType.Hyperv) { return null; } diff --git a/plugins/hypervisors/kvm/src/main/java/com/cloud/ha/KVMInvestigator.java b/plugins/hypervisors/kvm/src/main/java/com/cloud/ha/KVMInvestigator.java index ce9fbe6c232c..64f0d5637a85 100644 --- a/plugins/hypervisors/kvm/src/main/java/com/cloud/ha/KVMInvestigator.java +++ b/plugins/hypervisors/kvm/src/main/java/com/cloud/ha/KVMInvestigator.java @@ -19,10 +19,7 @@ package com.cloud.ha; import com.cloud.agent.AgentManager; -import com.cloud.agent.api.Answer; -import com.cloud.agent.api.CheckOnHostCommand; import com.cloud.host.Host; -import com.cloud.host.HostVO; import com.cloud.host.Status; import com.cloud.host.dao.HostDao; import com.cloud.hypervisor.Hypervisor; @@ -34,11 +31,12 @@ import org.apache.cloudstack.engine.subsystem.api.storage.DataStoreProviderManager; import org.apache.cloudstack.engine.subsystem.api.storage.PrimaryDataStoreDriver; import org.apache.cloudstack.ha.HAManager; +import org.apache.cloudstack.kvm.ha.KVMHostActivityChecker; import org.apache.cloudstack.storage.datastore.db.PrimaryDataStoreDao; import org.apache.cloudstack.storage.datastore.db.StoragePoolVO; import javax.inject.Inject; -import java.util.Arrays; +import java.util.Collections; import java.util.List; public class KVMInvestigator extends AdapterBase implements Investigator { @@ -54,13 +52,15 @@ public class KVMInvestigator extends AdapterBase implements Investigator { private HAManager haManager; @Inject private DataStoreProviderManager dataStoreProviderMgr; + @Inject + private KVMHostActivityChecker hostActivityChecker; @Override public boolean isVmAlive(com.cloud.vm.VirtualMachine vm, Host host) throws UnknownVM { if (haManager.isHAEligible(host)) { return haManager.isVMAliveOnHost(host); } - Status status = isAgentAlive(host); + Status status = getHostAgentStatus(host); logger.debug("HA: HOST is ineligible legacy state {} for host {}", status, host); if (status == null) { throw new UnknownVM(); @@ -73,86 +73,41 @@ public boolean isVmAlive(com.cloud.vm.VirtualMachine vm, Host host) throws Unkno } @Override - public Status isAgentAlive(Host agent) { - if (agent.getHypervisorType() != Hypervisor.HypervisorType.KVM && agent.getHypervisorType() != Hypervisor.HypervisorType.LXC) { + public Status getHostAgentStatus(Host host) { + if (host.getHypervisorType() != Hypervisor.HypervisorType.KVM && host.getHypervisorType() != Hypervisor.HypervisorType.LXC) { return null; } - if (haManager.isHAEligible(agent)) { - return haManager.getHostStatus(agent); + if (haManager.isHAEligible(host)) { + return haManager.getHostStatus(host); } - List clusterPools = _storagePoolDao.findPoolsInClusters(Arrays.asList(agent.getClusterId()), null); - boolean storageSupportHA = storageSupportHa(clusterPools); - if (!storageSupportHA) { - List zonePools = _storagePoolDao.findZoneWideStoragePoolsByHypervisor(agent.getDataCenterId(), agent.getHypervisorType()); - storageSupportHA = storageSupportHa(zonePools); + List clusterPools = _storagePoolDao.findPoolsInClusters(Collections.singletonList(host.getClusterId()), null); + boolean storageSupportsHA = storageSupportsHA(clusterPools); + if (!storageSupportsHA) { + List zonePools = _storagePoolDao.findZoneWideStoragePoolsByHypervisor(host.getDataCenterId(), host.getHypervisorType()); + storageSupportsHA = storageSupportsHA(zonePools); } - if (!storageSupportHA) { - logger.warn("Agent investigation was requested on host {}, but host does not support investigation because it has no NFS storage. Skipping investigation.", agent); + if (!storageSupportsHA) { + logger.warn("Agent investigation was requested on host {}, but host does not support investigation" + + " because it has no HA supported storage. Skipping investigation.", host); return null; } - Status hostStatus = null; - Status neighbourStatus = null; - boolean reportFailureIfOneStorageIsDown = HighAvailabilityManager.KvmHAFenceHostIfHeartbeatFailsOnStorage.value(); - CheckOnHostCommand cmd = new CheckOnHostCommand(agent, reportFailureIfOneStorageIsDown); - - try { - Answer answer = _agentMgr.easySend(agent.getId(), cmd); - if (answer != null) { - hostStatus = answer.getResult() ? Status.Down : Status.Up; - } - } catch (Exception e) { - logger.debug("Failed to send command to host: {}", agent); - } - if (hostStatus == null) { - hostStatus = Status.Disconnected; - } - - List neighbors = _resourceMgr.listHostsInClusterByStatus(agent.getClusterId(), Status.Up); - for (HostVO neighbor : neighbors) { - if (neighbor.getId() == agent.getId() - || (neighbor.getHypervisorType() != Hypervisor.HypervisorType.KVM && neighbor.getHypervisorType() != Hypervisor.HypervisorType.LXC)) { - continue; - } - logger.debug("Investigating host:{} via neighbouring host:{}", agent, neighbor); - try { - Answer answer = _agentMgr.easySend(neighbor.getId(), cmd); - if (answer != null) { - neighbourStatus = answer.getResult() ? Status.Down : Status.Up; - logger.debug("Neighbouring host:{} returned status:{} for the investigated host:{}", neighbor, neighbourStatus, agent); - if (neighbourStatus == Status.Up) { - break; - } - } - } catch (Exception e) { - logger.debug("Failed to send command to host: {}", neighbor); - } - } - if (neighbourStatus == Status.Up && (hostStatus == Status.Disconnected || hostStatus == Status.Down)) { - hostStatus = Status.Disconnected; - } - if (neighbourStatus == Status.Down && (hostStatus == Status.Disconnected || hostStatus == Status.Down)) { - hostStatus = Status.Down; - } - logger.debug("HA: HOST is ineligible legacy state {} for host {}", hostStatus, agent); - return hostStatus; + return hostActivityChecker.getHostAgentStatus(host); } - private boolean storageSupportHa(List pools) { - boolean storageSupportHA = false; + private boolean storageSupportsHA(List pools) { for (StoragePoolVO pool : pools) { DataStoreProvider storeProvider = dataStoreProviderMgr.getDataStoreProvider(pool.getStorageProviderName()); DataStoreDriver storeDriver = storeProvider.getDataStoreDriver(); if (storeDriver instanceof PrimaryDataStoreDriver) { PrimaryDataStoreDriver primaryStoreDriver = (PrimaryDataStoreDriver)storeDriver; if (primaryStoreDriver.isStorageSupportHA(pool.getPoolType())) { - storageSupportHA = true; - break; + return true; } } } - return storageSupportHA; + return false; } } diff --git a/plugins/hypervisors/kvm/src/main/java/com/cloud/hypervisor/kvm/resource/KVMHABase.java b/plugins/hypervisors/kvm/src/main/java/com/cloud/hypervisor/kvm/resource/KVMHABase.java index e9a7ac8951ce..5de5cadb9c7f 100644 --- a/plugins/hypervisors/kvm/src/main/java/com/cloud/hypervisor/kvm/resource/KVMHABase.java +++ b/plugins/hypervisors/kvm/src/main/java/com/cloud/hypervisor/kvm/resource/KVMHABase.java @@ -35,10 +35,9 @@ public class KVMHABase { protected Logger logger = LogManager.getLogger(getClass()); private long _timeout = 60000; /* 1 minutes */ - protected long _heartBeatUpdateTimeout = AgentPropertiesFileHandler.getPropertyValue(AgentProperties.HEARTBEAT_UPDATE_TIMEOUT); - protected long _heartBeatUpdateFreq = AgentPropertiesFileHandler.getPropertyValue(AgentProperties.KVM_HEARTBEAT_UPDATE_FREQUENCY); + protected long _heartBeatUpdateFreqInMs = AgentPropertiesFileHandler.getPropertyValue(AgentProperties.KVM_HEARTBEAT_UPDATE_FREQUENCY); protected long _heartBeatUpdateMaxTries = AgentPropertiesFileHandler.getPropertyValue(AgentProperties.KVM_HEARTBEAT_UPDATE_MAX_TRIES); - protected long _heartBeatUpdateRetrySleep = AgentPropertiesFileHandler.getPropertyValue(AgentProperties.KVM_HEARTBEAT_UPDATE_RETRY_SLEEP); + protected long _heartBeatUpdateRetrySleepInMs = AgentPropertiesFileHandler.getPropertyValue(AgentProperties.KVM_HEARTBEAT_UPDATE_RETRY_SLEEP); public static enum PoolType { PrimaryStorage, SecondaryStorage @@ -234,7 +233,7 @@ protected String runScriptRetry(String cmdString, OutputInterpreter interpreter) return result; } - public Boolean checkingHeartBeat() { + public Boolean hasHeartBeat() { // TODO Auto-generated method stub return null; } diff --git a/plugins/hypervisors/kvm/src/main/java/com/cloud/hypervisor/kvm/resource/KVMHAChecker.java b/plugins/hypervisors/kvm/src/main/java/com/cloud/hypervisor/kvm/resource/KVMHAChecker.java index db6190fa8f28..05197d8eaab5 100644 --- a/plugins/hypervisors/kvm/src/main/java/com/cloud/hypervisor/kvm/resource/KVMHAChecker.java +++ b/plugins/hypervisors/kvm/src/main/java/com/cloud/hypervisor/kvm/resource/KVMHAChecker.java @@ -26,44 +26,42 @@ public class KVMHAChecker extends KVMHABase implements Callable { private List storagePools; private HostTO host; - private boolean reportFailureIfOneStorageIsDown; + private boolean reportIfHeartBeatFailedForOneStoragePool; - public KVMHAChecker(List pools, HostTO host, boolean reportFailureIfOneStorageIsDown) { + public KVMHAChecker(List pools, HostTO host, boolean reportIfHeartBeatFailedForOneStoragePool) { this.storagePools = pools; this.host = host; - this.reportFailureIfOneStorageIsDown = reportFailureIfOneStorageIsDown; + this.reportIfHeartBeatFailedForOneStoragePool = reportIfHeartBeatFailedForOneStoragePool; } /* - * True means heartbeaing is on going, or we can't get it's status. False - * means heartbeating is stopped definitely + * True means heart beating is on going, or we can't get it's status. + * False means heart beating is stopped definitely. */ @Override - public Boolean checkingHeartBeat() { - boolean validResult = false; - + public Boolean hasHeartBeat() { String hostAndPools = String.format("host IP [%s] in pools [%s]", host.getPrivateNetwork().getIp(), storagePools.stream().map(pool -> pool.getPoolUUID()).collect(Collectors.joining(", "))); + logger.debug("Checking heart beat with KVMHAChecker for {}", hostAndPools); - logger.debug(String.format("Checking heart beat with KVMHAChecker for %s", hostAndPools)); - + boolean heartBeatCheckResult = false; for (HAStoragePool pool : storagePools) { - validResult = pool.getPool().checkingHeartBeat(pool, host); - if (reportFailureIfOneStorageIsDown && !validResult) { + heartBeatCheckResult = pool.getPool().hasHeartBeat(pool, host); + if (reportIfHeartBeatFailedForOneStoragePool && !heartBeatCheckResult) { break; } } - if (!validResult) { - logger.warn(String.format("All checks with KVMHAChecker for %s considered it as dead. It may cause a shutdown of the host.", hostAndPools)); + if (!heartBeatCheckResult) { + logger.warn("All checks with KVMHAChecker for {} considered it as dead. It may cause a shutdown of the host.", hostAndPools); } - return validResult; + return heartBeatCheckResult; } @Override public Boolean call() throws Exception { // logger.addAppender(new org.apache.log4j.ConsoleAppender(new // org.apache.log4j.PatternLayout(), "System.out")); - return checkingHeartBeat(); + return hasHeartBeat(); } } diff --git a/plugins/hypervisors/kvm/src/main/java/com/cloud/hypervisor/kvm/resource/KVMHAMonitor.java b/plugins/hypervisors/kvm/src/main/java/com/cloud/hypervisor/kvm/resource/KVMHAMonitor.java index aa868ff1d3f2..bbf93e651c52 100644 --- a/plugins/hypervisors/kvm/src/main/java/com/cloud/hypervisor/kvm/resource/KVMHAMonitor.java +++ b/plugins/hypervisors/kvm/src/main/java/com/cloud/hypervisor/kvm/resource/KVMHAMonitor.java @@ -91,7 +91,7 @@ protected void runHeartBeat() { result = executePoolHeartBeatCommand(uuid, primaryStoragePool, result); if (result != null && rebootHostAndAlertManagementOnHeartbeatTimeout) { - logger.warn(String.format("Write heartbeat for pool [%s] failed: %s; stopping cloudstack-agent.", uuid, result)); + logger.warn("Write heartbeat for pool [{}] failed: {}; stopping cloudstack-agent.", uuid, result); primaryStoragePool.getPool().createHeartBeatCommand(primaryStoragePool, null, false);; } } @@ -108,9 +108,9 @@ private String executePoolHeartBeatCommand(String uuid, HAStoragePool primarySto result = primaryStoragePool.getPool().createHeartBeatCommand(primaryStoragePool, hostPrivateIp, true); if (result != null) { - logger.warn(String.format("Write heartbeat for pool [%s] failed: %s; try: %s of %s.", uuid, result, i, _heartBeatUpdateMaxTries)); + logger.warn("Write heartbeat for pool [{}] failed: {}; try: {} of {}.", uuid, result, i, _heartBeatUpdateMaxTries); try { - Thread.sleep(_heartBeatUpdateRetrySleep); + Thread.sleep(_heartBeatUpdateRetrySleepInMs); } catch (InterruptedException e) { logger.debug("[IGNORED] Interrupted between heartbeat retries.", e); } @@ -128,21 +128,21 @@ private void checkForNotExistingLibvirtStoragePools(Set removedPools, St StoragePool storage = conn.storagePoolLookupByUUIDString(uuid); if (storage == null || storage.getInfo().state != StoragePoolState.VIR_STORAGE_POOL_RUNNING) { if (storage == null) { - logger.debug(String.format("Libvirt storage pool [%s] not found, removing from HA list.", uuid)); + logger.debug("Libvirt storage pool [{}] not found, removing from HA list.", uuid); } else { - logger.debug(String.format("Libvirt storage pool [%s] found, but not running, removing from HA list.", uuid)); + logger.debug("Libvirt storage pool [{}] found, but not running, removing from HA list.", uuid); } removedPools.add(uuid); } - logger.debug(String.format("Found NFS storage pool [%s] in libvirt, continuing.", uuid)); + logger.debug("Found NFS storage pool [{}] in libvirt, continuing.", uuid); } catch (LibvirtException e) { - logger.debug(String.format("Failed to lookup libvirt storage pool [%s].", uuid), e); + logger.debug("Failed to lookup libvirt storage pool [{}].", uuid, e); if (e.toString().contains("pool not found")) { - logger.debug(String.format("Removing pool [%s] from HA monitor since it was deleted.", uuid)); + logger.debug("Removing pool [{}] from HA monitor since it was deleted.", uuid); removedPools.add(uuid); } } @@ -155,11 +155,10 @@ public void run() { runHeartBeat(); try { - Thread.sleep(_heartBeatUpdateFreq); + Thread.sleep(_heartBeatUpdateFreqInMs); } catch (InterruptedException e) { logger.debug("[IGNORED] Interrupted between heartbeats.", e); } } } - } diff --git a/plugins/hypervisors/kvm/src/main/java/com/cloud/hypervisor/kvm/resource/KVMHAVMActivityChecker.java b/plugins/hypervisors/kvm/src/main/java/com/cloud/hypervisor/kvm/resource/KVMHAVMActivityChecker.java index e6937b515e95..c13be64a3a3e 100644 --- a/plugins/hypervisors/kvm/src/main/java/com/cloud/hypervisor/kvm/resource/KVMHAVMActivityChecker.java +++ b/plugins/hypervisors/kvm/src/main/java/com/cloud/hypervisor/kvm/resource/KVMHAVMActivityChecker.java @@ -39,12 +39,12 @@ public KVMHAVMActivityChecker(final HAStoragePool pool, final HostTO host, final } @Override - public Boolean checkingHeartBeat() { - return this.storagePool.getPool().vmActivityCheck(storagePool, host, activityScriptTimeout, volumeUuidList, vmActivityCheckPath, suspectTimeInSeconds); + public Boolean hasHeartBeat() { + return this.storagePool.getPool().hasVmActivity(storagePool, host, activityScriptTimeout, volumeUuidList, vmActivityCheckPath, suspectTimeInSeconds); } @Override public Boolean call() throws Exception { - return checkingHeartBeat(); + return hasHeartBeat(); } } diff --git a/plugins/hypervisors/kvm/src/main/java/com/cloud/hypervisor/kvm/resource/wrapper/LibvirtCheckOnHostCommandWrapper.java b/plugins/hypervisors/kvm/src/main/java/com/cloud/hypervisor/kvm/resource/wrapper/LibvirtCheckOnHostCommandWrapper.java index 48996a7ba97c..f901fd97ca76 100644 --- a/plugins/hypervisors/kvm/src/main/java/com/cloud/hypervisor/kvm/resource/wrapper/LibvirtCheckOnHostCommandWrapper.java +++ b/plugins/hypervisors/kvm/src/main/java/com/cloud/hypervisor/kvm/resource/wrapper/LibvirtCheckOnHostCommandWrapper.java @@ -26,6 +26,7 @@ import java.util.concurrent.Future; import com.cloud.agent.api.Answer; +import com.cloud.agent.api.CheckOnHostAnswer; import com.cloud.agent.api.CheckOnHostCommand; import com.cloud.agent.api.to.HostTO; import com.cloud.hypervisor.kvm.resource.KVMHABase.HAStoragePool; @@ -45,20 +46,21 @@ public Answer execute(final CheckOnHostCommand command, final LibvirtComputingRe final List pools = monitor.getStoragePools(); final HostTO host = command.getHost(); - final KVMHAChecker ha = new KVMHAChecker(pools, host, command.isCheckFailedOnOneStorage()); + + final KVMHAChecker ha = new KVMHAChecker(pools, host, command.shouldReportIfHeartBeatFailedForOneStoragePool()); final Future future = executors.submit(ha); try { - final Boolean result = future.get(); - if (result) { - return new Answer(command, false, "Heart is beating..."); + final Boolean hasHeartBeat = future.get(); + if (hasHeartBeat) { + return new CheckOnHostAnswer(command, true, "Heart is beating"); } else { - return new Answer(command); + return new CheckOnHostAnswer(command, "Heart is not beating"); } } catch (final InterruptedException e) { - return new Answer(command, false, "CheckOnHostCommand: can't get status of host: InterruptedException"); + return new CheckOnHostAnswer(command, "CheckOnHostCommand: can't get status of host: InterruptedException"); } catch (final ExecutionException e) { - return new Answer(command, false, "CheckOnHostCommand: can't get status of host: ExecutionException"); + return new CheckOnHostAnswer(command, "CheckOnHostCommand: can't get status of host: ExecutionException"); } } } diff --git a/plugins/hypervisors/kvm/src/main/java/com/cloud/hypervisor/kvm/storage/IscsiAdmStoragePool.java b/plugins/hypervisors/kvm/src/main/java/com/cloud/hypervisor/kvm/storage/IscsiAdmStoragePool.java index f5bfd898a4f4..8cf6de68f955 100644 --- a/plugins/hypervisors/kvm/src/main/java/com/cloud/hypervisor/kvm/storage/IscsiAdmStoragePool.java +++ b/plugins/hypervisors/kvm/src/main/java/com/cloud/hypervisor/kvm/storage/IscsiAdmStoragePool.java @@ -208,12 +208,12 @@ public String getStorageNodeId() { } @Override - public Boolean checkingHeartBeat(HAStoragePool pool, HostTO host) { + public Boolean hasHeartBeat(HAStoragePool pool, HostTO host) { return null; } @Override - public Boolean vmActivityCheck(HAStoragePool pool, HostTO host, Duration activityScriptTimeout, String volumeUUIDListString, String vmActivityCheckPath, long duration) { + public Boolean hasVmActivity(HAStoragePool pool, HostTO host, Duration activityScriptTimeout, String volumeUUIDListString, String vmActivityCheckPath, long duration) { return null; } diff --git a/plugins/hypervisors/kvm/src/main/java/com/cloud/hypervisor/kvm/storage/KVMStoragePool.java b/plugins/hypervisors/kvm/src/main/java/com/cloud/hypervisor/kvm/storage/KVMStoragePool.java index 8dd2116e1235..3e35ed9476b1 100644 --- a/plugins/hypervisors/kvm/src/main/java/com/cloud/hypervisor/kvm/storage/KVMStoragePool.java +++ b/plugins/hypervisors/kvm/src/main/java/com/cloud/hypervisor/kvm/storage/KVMStoragePool.java @@ -33,35 +33,33 @@ public interface KVMStoragePool { - public static final long HeartBeatUpdateTimeout = AgentPropertiesFileHandler.getPropertyValue(AgentProperties.HEARTBEAT_UPDATE_TIMEOUT); - public static final long HeartBeatUpdateFreq = AgentPropertiesFileHandler.getPropertyValue(AgentProperties.KVM_HEARTBEAT_UPDATE_FREQUENCY); - public static final long HeartBeatUpdateMaxTries = AgentPropertiesFileHandler.getPropertyValue(AgentProperties.KVM_HEARTBEAT_UPDATE_MAX_TRIES); - public static final long HeartBeatUpdateRetrySleep = AgentPropertiesFileHandler.getPropertyValue(AgentProperties.KVM_HEARTBEAT_UPDATE_RETRY_SLEEP); - public static final long HeartBeatCheckerTimeout = AgentPropertiesFileHandler.getPropertyValue(AgentProperties.KVM_HEARTBEAT_CHECKER_TIMEOUT); + long HeartBeatUpdateTimeoutInMs = AgentPropertiesFileHandler.getPropertyValue(AgentProperties.HEARTBEAT_UPDATE_TIMEOUT); + long HeartBeatUpdateFreqInMs = AgentPropertiesFileHandler.getPropertyValue(AgentProperties.KVM_HEARTBEAT_UPDATE_FREQUENCY); + long HeartBeatCheckerTimeoutInMs = AgentPropertiesFileHandler.getPropertyValue(AgentProperties.KVM_HEARTBEAT_CHECKER_TIMEOUT); - public default KVMPhysicalDisk createPhysicalDisk(String volumeUuid, PhysicalDiskFormat format, Storage.ProvisioningType provisioningType, long size, Long usableSize, byte[] passphrase) { + default KVMPhysicalDisk createPhysicalDisk(String volumeUuid, PhysicalDiskFormat format, Storage.ProvisioningType provisioningType, long size, Long usableSize, byte[] passphrase) { return createPhysicalDisk(volumeUuid, format, provisioningType, size, passphrase); } - public KVMPhysicalDisk createPhysicalDisk(String volumeUuid, PhysicalDiskFormat format, Storage.ProvisioningType provisioningType, long size, byte[] passphrase); + KVMPhysicalDisk createPhysicalDisk(String volumeUuid, PhysicalDiskFormat format, Storage.ProvisioningType provisioningType, long size, byte[] passphrase); - public KVMPhysicalDisk createPhysicalDisk(String volumeUuid, Storage.ProvisioningType provisioningType, long size, byte[] passphrase); + KVMPhysicalDisk createPhysicalDisk(String volumeUuid, Storage.ProvisioningType provisioningType, long size, byte[] passphrase); - public boolean connectPhysicalDisk(String volumeUuid, Map details); + boolean connectPhysicalDisk(String volumeUuid, Map details); - public KVMPhysicalDisk getPhysicalDisk(String volumeUuid); + KVMPhysicalDisk getPhysicalDisk(String volumeUuid); - public boolean disconnectPhysicalDisk(String volumeUuid); + boolean disconnectPhysicalDisk(String volumeUuid); - public boolean deletePhysicalDisk(String volumeUuid, Storage.ImageFormat format); + boolean deletePhysicalDisk(String volumeUuid, Storage.ImageFormat format); - public List listPhysicalDisks(); + List listPhysicalDisks(); - public String getUuid(); + String getUuid(); - public long getCapacity(); + long getCapacity(); - public long getUsed(); + long getUsed(); default Long getCapacityIops() { return null; @@ -71,51 +69,51 @@ default Long getUsedIops() { return null; } - public long getAvailable(); + long getAvailable(); - public boolean refresh(); + boolean refresh(); - public boolean isExternalSnapshot(); + boolean isExternalSnapshot(); - public String getLocalPath(); + String getLocalPath(); - public String getSourceHost(); + String getSourceHost(); - public String getSourceDir(); + String getSourceDir(); - public int getSourcePort(); + int getSourcePort(); - public String getAuthUserName(); + String getAuthUserName(); - public String getAuthSecret(); + String getAuthSecret(); - public StoragePoolType getType(); + StoragePoolType getType(); - public boolean delete(); + boolean delete(); PhysicalDiskFormat getDefaultFormat(); - public boolean createFolder(String path); + boolean createFolder(String path); - public boolean supportsConfigDriveIso(); + boolean supportsConfigDriveIso(); - public Map getDetails(); + Map getDetails(); default String getLocalPathFor(String relativePath) { return String.format("%s%s%s", getLocalPath(), File.separator, relativePath); } - public boolean isPoolSupportHA(); + boolean isPoolSupportHA(); - public String getHearthBeatPath(); + String getHearthBeatPath(); - public String createHeartBeatCommand(HAStoragePool primaryStoragePool, String hostPrivateIp, boolean hostValidation); + String createHeartBeatCommand(HAStoragePool primaryStoragePool, String hostPrivateIp, boolean hostValidation); - public String getStorageNodeId(); + String getStorageNodeId(); - public Boolean checkingHeartBeat(HAStoragePool pool, HostTO host); + Boolean hasHeartBeat(HAStoragePool pool, HostTO host); - public Boolean vmActivityCheck(HAStoragePool pool, HostTO host, Duration activityScriptTimeout, String volumeUUIDListString, String vmActivityCheckPath, long duration); + Boolean hasVmActivity(HAStoragePool pool, HostTO host, Duration activityScriptTimeout, String volumeUUIDListString, String vmActivityCheckPath, long duration); default LibvirtVMDef.DiskDef.BlockIOSize getSupportedLogicalBlockSize() { return null; diff --git a/plugins/hypervisors/kvm/src/main/java/com/cloud/hypervisor/kvm/storage/LibvirtStoragePool.java b/plugins/hypervisors/kvm/src/main/java/com/cloud/hypervisor/kvm/storage/LibvirtStoragePool.java index 45c22d3ac754..8dff53df37f3 100644 --- a/plugins/hypervisors/kvm/src/main/java/com/cloud/hypervisor/kvm/storage/LibvirtStoragePool.java +++ b/plugins/hypervisors/kvm/src/main/java/com/cloud/hypervisor/kvm/storage/LibvirtStoragePool.java @@ -345,7 +345,7 @@ public String getHearthBeatPath() { public String createHeartBeatCommand(HAStoragePool primaryStoragePool, String hostPrivateIp, boolean hostValidation) { - Script cmd = new Script(primaryStoragePool.getPool().getHearthBeatPath(), HeartBeatUpdateTimeout, logger); + Script cmd = new Script(primaryStoragePool.getPool().getHearthBeatPath(), HeartBeatUpdateTimeoutInMs, logger); cmd.add("-i", primaryStoragePool.getPoolIp()); cmd.add("-p", primaryStoragePool.getPoolMountSourcePath()); cmd.add("-m", primaryStoragePool.getMountDestPath()); @@ -372,53 +372,53 @@ public String getStorageNodeId() { } @Override - public Boolean checkingHeartBeat(HAStoragePool pool, HostTO host) { - boolean validResult = false; + public Boolean hasHeartBeat(HAStoragePool pool, HostTO host) { String hostIp = host.getPrivateNetwork().getIp(); - Script cmd = new Script(getHearthBeatPath(), HeartBeatCheckerTimeout, logger); + Script cmd = new Script(getHearthBeatPath(), HeartBeatCheckerTimeoutInMs, logger); cmd.add("-i", pool.getPoolIp()); cmd.add("-p", pool.getPoolMountSourcePath()); cmd.add("-m", pool.getMountDestPath()); cmd.add("-h", hostIp); cmd.add("-r"); - cmd.add("-t", String.valueOf(HeartBeatUpdateFreq / 1000)); + cmd.add("-t", String.valueOf(HeartBeatUpdateFreqInMs / 1000)); OutputInterpreter.OneLineParser parser = new OutputInterpreter.OneLineParser(); String result = cmd.execute(parser); String parsedLine = parser.getLine(); - logger.debug(String.format("Checking heart beat with KVMHAChecker [{command=\"%s\", result: \"%s\", log: \"%s\", pool: \"%s\"}].", cmd.toString(), result, parsedLine, - pool.getPoolIp())); + logger.debug("Checking heart beat for host IP {} with KVMHAChecker [{command=\"{}\", result: \"{}\", log: \"{}\", pool: \"{}\"}].", hostIp, cmd.toString(), result, parsedLine, pool.getPoolIp()); if (result == null && parsedLine.contains("DEAD")) { - logger.warn(String.format("Checking heart beat with KVMHAChecker command [%s] returned [%s]. [%s]. It may cause a shutdown of host IP [%s].", cmd.toString(), - result, parsedLine, hostIp)); + logger.warn("Checking heart beat for host IP {} with KVMHAChecker command [{}] returned [{}]. It may cause a shutdown of the host.", hostIp, cmd.toString(), parsedLine); + return false; } else { - validResult = true; + logger.debug("Checking heart beat for host IP {} with KVMHAChecker command [{}] succeeded.", hostIp, cmd.toString()); + return true; } - return validResult; } @Override - public Boolean vmActivityCheck(HAStoragePool pool, HostTO host, Duration activityScriptTimeout, String volumeUUIDListString, String vmActivityCheckPath, long duration) { + public Boolean hasVmActivity(HAStoragePool pool, HostTO host, Duration activityScriptTimeout, String volumeUUIDListString, String vmActivityCheckPath, long duration) { + String hostIp = host.getPrivateNetwork().getIp(); Script cmd = new Script(vmActivityCheckPath, activityScriptTimeout.getStandardSeconds(), logger); cmd.add("-i", pool.getPoolIp()); cmd.add("-p", pool.getPoolMountSourcePath()); cmd.add("-m", pool.getMountDestPath()); - cmd.add("-h", host.getPrivateNetwork().getIp()); + cmd.add("-h", hostIp); cmd.add("-u", volumeUUIDListString); - cmd.add("-t", String.valueOf(String.valueOf(System.currentTimeMillis() / 1000))); + cmd.add("-t", String.valueOf(System.currentTimeMillis() / 1000)); cmd.add("-d", String.valueOf(duration)); OutputInterpreter.OneLineParser parser = new OutputInterpreter.OneLineParser(); String result = cmd.execute(parser); String parsedLine = parser.getLine(); - logger.debug(String.format("Checking heart beat with KVMHAVMActivityChecker [{command=\"%s\", result: \"%s\", log: \"%s\", pool: \"%s\"}].", cmd.toString(), result, parsedLine, pool.getPoolIp())); + logger.debug("Checking VM activity for host IP {} with KVMHAVMActivityChecker [{command=\"{}\", result: \"{}\", log: \"{}\", pool: \"{}\"}].", hostIp, cmd.toString(), result, parsedLine, pool.getPoolIp()); if (result == null && parsedLine.contains("DEAD")) { - logger.warn(String.format("Checking heart beat with KVMHAVMActivityChecker command [%s] returned [%s]. It is [%s]. It may cause a shutdown of host IP [%s].", cmd.toString(), result, parsedLine, host.getPrivateNetwork().getIp())); + logger.warn("Checking VM activity for host IP {} with KVMHAVMActivityChecker command [{}] returned [{}]. It may cause a shutdown of the host.", hostIp, cmd.toString(), parsedLine); return false; } else { + logger.debug("Checking VM activity for host IP {} with KVMHAVMActivityChecker command [{}] succeeded.", hostIp, cmd.toString()); return true; } } diff --git a/plugins/hypervisors/kvm/src/main/java/com/cloud/hypervisor/kvm/storage/MultipathSCSIPool.java b/plugins/hypervisors/kvm/src/main/java/com/cloud/hypervisor/kvm/storage/MultipathSCSIPool.java index 229481b1f795..df9986c99194 100644 --- a/plugins/hypervisors/kvm/src/main/java/com/cloud/hypervisor/kvm/storage/MultipathSCSIPool.java +++ b/plugins/hypervisors/kvm/src/main/java/com/cloud/hypervisor/kvm/storage/MultipathSCSIPool.java @@ -225,13 +225,13 @@ public String getStorageNodeId() { } @Override - public Boolean checkingHeartBeat(HAStoragePool pool, HostTO host) { + public Boolean hasHeartBeat(HAStoragePool pool, HostTO host) { return null; } @Override - public Boolean vmActivityCheck(HAStoragePool pool, HostTO host, Duration activityScriptTimeout, - String volumeUUIDListString, String vmActivityCheckPath, long duration) { + public Boolean hasVmActivity(HAStoragePool pool, HostTO host, Duration activityScriptTimeout, + String volumeUUIDListString, String vmActivityCheckPath, long duration) { return null; } diff --git a/plugins/hypervisors/kvm/src/main/java/com/cloud/hypervisor/kvm/storage/ScaleIOStoragePool.java b/plugins/hypervisors/kvm/src/main/java/com/cloud/hypervisor/kvm/storage/ScaleIOStoragePool.java index e8243c3f7cfe..fc512ff94a94 100644 --- a/plugins/hypervisors/kvm/src/main/java/com/cloud/hypervisor/kvm/storage/ScaleIOStoragePool.java +++ b/plugins/hypervisors/kvm/src/main/java/com/cloud/hypervisor/kvm/storage/ScaleIOStoragePool.java @@ -236,12 +236,12 @@ public String getStorageNodeId() { } @Override - public Boolean checkingHeartBeat(HAStoragePool pool, HostTO host) { + public Boolean hasHeartBeat(HAStoragePool pool, HostTO host) { return null; } @Override - public Boolean vmActivityCheck(HAStoragePool pool, HostTO host, Duration activityScriptTimeout, String volumeUUIDListString, String vmActivityCheckPath, long duration) { + public Boolean hasVmActivity(HAStoragePool pool, HostTO host, Duration activityScriptTimeout, String volumeUUIDListString, String vmActivityCheckPath, long duration) { return null; } } diff --git a/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KVMHAConfig.java b/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KVMHAConfig.java index 59ea720328f5..3fbb5340fcce 100644 --- a/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KVMHAConfig.java +++ b/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KVMHAConfig.java @@ -19,38 +19,37 @@ import org.apache.cloudstack.framework.config.ConfigKey; -public class KVMHAConfig { +public interface KVMHAConfig { - public static final ConfigKey KvmHAHealthCheckTimeout = new ConfigKey<>("Advanced", Long.class, "kvm.ha.health.check.timeout", "10", + ConfigKey KvmHAHealthCheckTimeout = new ConfigKey<>("Advanced", Long.class, "kvm.ha.health.check.timeout", "10", "The maximum length of time, in seconds, expected for an health check to complete.", true, ConfigKey.Scope.Cluster); - public static final ConfigKey KvmHAActivityCheckTimeout = new ConfigKey<>("Advanced", Long.class, "kvm.ha.activity.check.timeout", "60", + ConfigKey KvmHAActivityCheckTimeout = new ConfigKey<>("Advanced", Long.class, "kvm.ha.activity.check.timeout", "60", "The maximum length of time, in seconds, expected for an activity check to complete.", true, ConfigKey.Scope.Cluster); - public static final ConfigKey KvmHAActivityCheckInterval = new ConfigKey<>("Advanced", Long.class, "kvm.ha.activity.check.interval", "60", + ConfigKey KvmHAActivityCheckInterval = new ConfigKey<>("Advanced", Long.class, "kvm.ha.activity.check.interval", "60", "The interval, in seconds, between activity checks.", true, ConfigKey.Scope.Cluster); - public static final ConfigKey KvmHAActivityCheckMaxAttempts = new ConfigKey<>("Advanced", Long.class, "kvm.ha.activity.check.max.attempts", "10", + ConfigKey KvmHAActivityCheckMaxAttempts = new ConfigKey<>("Advanced", Long.class, "kvm.ha.activity.check.max.attempts", "10", "The maximum number of activity check attempts to perform before deciding to recover or degrade a resource.", true, ConfigKey.Scope.Cluster); - public static final ConfigKey KvmHAActivityCheckFailureThreshold = new ConfigKey<>("Advanced", Double.class, "kvm.ha.activity.check.failure.ratio", "0.7", + ConfigKey KvmHAActivityCheckFailureThreshold = new ConfigKey<>("Advanced", Double.class, "kvm.ha.activity.check.failure.ratio", "0.7", "The activity check failure threshold ratio. This is used with the activity check maximum attempts for deciding to recover or degrade a resource. For most environments, please keep this value above 0.5.", true, ConfigKey.Scope.Cluster); - public static final ConfigKey KvmHADegradedMaxPeriod = new ConfigKey<>("Advanced", Long.class, "kvm.ha.degraded.max.period", "300", + ConfigKey KvmHADegradedMaxPeriod = new ConfigKey<>("Advanced", Long.class, "kvm.ha.degraded.max.period", "300", "The maximum length of time, in seconds, a resource can be in degraded state where only health checks are performed.", true, ConfigKey.Scope.Cluster); - public static final ConfigKey KvmHARecoverTimeout = new ConfigKey<>("Advanced", Long.class, "kvm.ha.recover.timeout", "60", + ConfigKey KvmHARecoverTimeout = new ConfigKey<>("Advanced", Long.class, "kvm.ha.recover.timeout", "60", "The maximum length of time, in seconds, expected for a recovery operation to complete.", true, ConfigKey.Scope.Cluster); - public static final ConfigKey KvmHARecoverWaitPeriod = new ConfigKey<>("Advanced", Long.class, "kvm.ha.recover.wait.period", "600", + ConfigKey KvmHARecoverWaitPeriod = new ConfigKey<>("Advanced", Long.class, "kvm.ha.recover.wait.period", "600", "The maximum length of time, in seconds, to wait for a resource to recover.", true, ConfigKey.Scope.Cluster); - public static final ConfigKey KvmHARecoverAttemptThreshold = new ConfigKey<>("Advanced", Long.class, "kvm.ha.recover.failure.threshold", "1", + ConfigKey KvmHARecoverAttemptThreshold = new ConfigKey<>("Advanced", Long.class, "kvm.ha.recover.failure.threshold", "1", "The maximum recovery attempts to be made for a resource, after which the resource is fenced. The recovery counter resets when a health check passes for a resource.", true, ConfigKey.Scope.Cluster); - public static final ConfigKey KvmHAFenceTimeout = new ConfigKey<>("Advanced", Long.class, "kvm.ha.fence.timeout", "60", + ConfigKey KvmHAFenceTimeout = new ConfigKey<>("Advanced", Long.class, "kvm.ha.fence.timeout", "60", "The maximum length of time, in seconds, expected for a fence operation to complete.", true, ConfigKey.Scope.Cluster); - } diff --git a/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KVMHostActivityChecker.java b/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KVMHostActivityChecker.java index 31f87d7e0442..85e2cb2b512f 100644 --- a/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KVMHostActivityChecker.java +++ b/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KVMHostActivityChecker.java @@ -19,6 +19,7 @@ import com.cloud.agent.AgentManager; import com.cloud.agent.api.Answer; +import com.cloud.agent.api.CheckOnHostAnswer; import com.cloud.agent.api.CheckOnHostCommand; import com.cloud.agent.api.CheckVMActivityOnStoragePoolCommand; import com.cloud.dc.dao.ClusterDao; @@ -70,7 +71,7 @@ public class KVMHostActivityChecker extends AdapterBase implements ActivityCheck @Override public boolean isActive(Host r, DateTime suspectTime) throws HACheckerException { try { - return isVMActivityOnHost(r, suspectTime); + return hasVMActivityOnHost(r, suspectTime); } catch (HACheckerException e) { //Re-throwing the exception to avoid poluting the 'HACheckerException' already thrown throw e; @@ -83,82 +84,115 @@ public boolean isActive(Host r, DateTime suspectTime) throws HACheckerException @Override public boolean isHealthy(Host r) { - return isAgentActive(r); + return isHostAgentUp(r); } - private boolean isAgentActive(Host agent) { - if (agent.getHypervisorType() != Hypervisor.HypervisorType.KVM && agent.getHypervisorType() != Hypervisor.HypervisorType.LXC) { - throw new IllegalStateException(String.format("Calling KVM investigator for non KVM Host of type [%s].", agent.getHypervisorType())); + private boolean isHostAgentUp(Host host) { + if (host.getHypervisorType() != Hypervisor.HypervisorType.KVM && host.getHypervisorType() != Hypervisor.HypervisorType.LXC) { + throw new IllegalStateException(String.format("Calling KVM investigator for non KVM Host of type [%s].", host.getHypervisorType())); } - Status hostStatus = Status.Unknown; - Status neighbourStatus = Status.Unknown; - final CheckOnHostCommand cmd = new CheckOnHostCommand(agent, HighAvailabilityManager.KvmHAFenceHostIfHeartbeatFailsOnStorage.value()); + + Status hostStatus = getHostAgentStatus(host); + + logger.debug("{} has the status [{}].", host.toString(), hostStatus); + return hostStatus == Status.Up; + } + + public Status getHostAgentStatus(Host host) { + if (host.getHypervisorType() != Hypervisor.HypervisorType.KVM && host.getHypervisorType() != Hypervisor.HypervisorType.LXC) { + return null; + } + + Status hostStatusFromItself = checkHostStatusWithSameHost(host); + if (hostStatusFromItself == Status.Up) { + return Status.Up; + } + + Status hostStatusFromNeighbour = checkHostStatusWithNeighbourHosts(host); + Status hostStatus = hostStatusFromItself; + if (hostStatusFromNeighbour == Status.Up && (hostStatusFromItself == Status.Disconnected || hostStatusFromItself == Status.Down)) { + hostStatus = Status.Disconnected; + } + if (hostStatusFromNeighbour == Status.Down && (hostStatusFromItself == Status.Disconnected || hostStatusFromItself == Status.Down)) { + hostStatus = Status.Down; + } + + logger.debug("HA: HOST is ineligible legacy state {} for host {}", hostStatus, host); + return hostStatus; + } + + private Status checkHostStatusWithSameHost(Host host) { + Status hostStatus; + boolean reportFailureIfOneStorageIsDown = HighAvailabilityManager.KvmHAFenceHostIfHeartbeatFailsOnStorage.value(); + final CheckOnHostCommand cmd = new CheckOnHostCommand(host, reportFailureIfOneStorageIsDown); try { - logger.debug(String.format("Checking %s status...", agent.toString())); - Answer answer = agentMgr.easySend(agent.getId(), cmd); + logger.debug("Checking {} status...", host.toString()); + Answer answer = agentMgr.easySend(host.getId(), cmd); if (answer != null) { - hostStatus = answer.getResult() ? Status.Down : Status.Up; - logger.debug(String.format("%s has the status [%s].", agent.toString(), hostStatus)); - - if ( hostStatus == Status.Up ){ - return true; + if (answer.getResult()) { + hostStatus = ((CheckOnHostAnswer)answer).isAlive() ? Status.Up : Status.Down; + } else { + logger.debug("{} is not active according to itself, details: {}.", host.toString(), answer.getDetails()); + hostStatus = Status.Down; } - } - else { - logger.debug(String.format("Setting %s to \"Disconnected\" status.", agent.toString())); + logger.debug("{} has the status [{}].", host.toString(), hostStatus); + } else { + logger.debug("Setting {} to \"Disconnected\" status.", host.toString()); hostStatus = Status.Disconnected; } } catch (Exception e) { - logger.warn(String.format("Failed to send command CheckOnHostCommand to %s.", agent.toString()), e); + logger.warn("Failed to send command CheckOnHostCommand to {}.", host.toString(), e); + hostStatus = Status.Disconnected; } - List neighbors = resourceManager.listHostsInClusterByStatus(agent.getClusterId(), Status.Up); + return hostStatus; + } + + private Status checkHostStatusWithNeighbourHosts(Host host) { + Status hostStatusFromNeighbour = Status.Unknown; + boolean reportFailureIfOneStorageIsDown = HighAvailabilityManager.KvmHAFenceHostIfHeartbeatFailsOnStorage.value(); + final CheckOnHostCommand cmd = new CheckOnHostCommand(host, reportFailureIfOneStorageIsDown); + List neighbors = resourceManager.listHostsInClusterByStatus(host.getClusterId(), Status.Up); for (HostVO neighbor : neighbors) { - if (neighbor.getId() == agent.getId() || (neighbor.getHypervisorType() != Hypervisor.HypervisorType.KVM && neighbor.getHypervisorType() != Hypervisor.HypervisorType.LXC)) { + if (neighbor.getId() == host.getId() + || (neighbor.getHypervisorType() != Hypervisor.HypervisorType.KVM && neighbor.getHypervisorType() != Hypervisor.HypervisorType.LXC)) { continue; } try { - logger.debug(String.format("Investigating %s via neighbouring %s.", agent.toString(), neighbor.toString())); - + logger.debug("Investigating {} via neighboring {}.", host.toString(), neighbor.toString()); Answer answer = agentMgr.easySend(neighbor.getId(), cmd); if (answer != null) { - neighbourStatus = answer.getResult() ? Status.Down : Status.Up; - - logger.debug(String.format("Neighbouring %s returned status [%s] for the investigated %s.", neighbor.toString(), neighbourStatus, agent.toString())); - - if (neighbourStatus == Status.Up) { - break; + if (answer.getResult()) { + hostStatusFromNeighbour = ((CheckOnHostAnswer)answer).isAlive() ? Status.Up : Status.Down; + logger.debug("Neighboring {} returned status [{}] for the investigated {}.", neighbor.toString(), hostStatusFromNeighbour, host.toString()); + if (hostStatusFromNeighbour == Status.Up) { + return hostStatusFromNeighbour; + } + } else { + logger.debug("{} is not active according to neighbor {}, details: {}.", host.toString(), neighbor.toString(), answer.getDetails()); } } else { - logger.debug(String.format("Neighbouring %s is Disconnected.", neighbor.toString())); + logger.debug("Neighboring {} is Disconnected.", neighbor.toString()); } } catch (Exception e) { - logger.warn(String.format("Failed to send command CheckOnHostCommand to %s.", neighbor.toString()), e); + logger.warn("Failed to send command CheckOnHostCommand to neighbor {}.", neighbor.toString(), e); } } - if (neighbourStatus == Status.Up && (hostStatus == Status.Disconnected || hostStatus == Status.Down)) { - hostStatus = Status.Disconnected; - } - if (neighbourStatus == Status.Down && (hostStatus == Status.Disconnected || hostStatus == Status.Down)) { - hostStatus = Status.Down; - } - logger.debug(String.format("%s has the status [%s].", agent.toString(), hostStatus)); - - return hostStatus == Status.Up; + return hostStatusFromNeighbour; } - private boolean isVMActivityOnHost(Host agent, DateTime suspectTime) throws HACheckerException { - if (agent.getHypervisorType() != Hypervisor.HypervisorType.KVM && agent.getHypervisorType() != Hypervisor.HypervisorType.LXC) { - throw new IllegalStateException(String.format("Calling KVM investigator for non KVM Host of type [%s].", agent.getHypervisorType())); + private boolean hasVMActivityOnHost(Host host, DateTime suspectTime) throws HACheckerException { + if (host.getHypervisorType() != Hypervisor.HypervisorType.KVM && host.getHypervisorType() != Hypervisor.HypervisorType.LXC) { + throw new IllegalStateException(String.format("Calling KVM investigator for non KVM Host of type [%s].", host.getHypervisorType())); } boolean activityStatus = true; - HashMap> poolVolMap = getVolumeUuidOnHost(agent); + HashMap> poolVolMap = getVolumeUuidOnHost(host); for (StoragePool pool : poolVolMap.keySet()) { - activityStatus = verifyActivityOfStorageOnHost(poolVolMap, pool, agent, suspectTime, activityStatus); + activityStatus = verifyActivityOfStorageOnHost(poolVolMap, pool, host, suspectTime, activityStatus); if (!activityStatus) { - logger.warn("It seems that the storage pool [{}] does not have activity on {}.", pool, agent); + logger.warn("It seems that the storage pool [{}] does not have activity on {}.", pool, host); break; } } @@ -166,32 +200,32 @@ private boolean isVMActivityOnHost(Host agent, DateTime suspectTime) throws HACh return activityStatus; } - protected boolean verifyActivityOfStorageOnHost(HashMap> poolVolMap, StoragePool pool, Host agent, DateTime suspectTime, boolean activityStatus) throws HACheckerException, IllegalStateException { + protected boolean verifyActivityOfStorageOnHost(HashMap> poolVolMap, StoragePool pool, Host host, DateTime suspectTime, boolean activityStatus) throws HACheckerException, IllegalStateException { List volume_list = poolVolMap.get(pool); - final CheckVMActivityOnStoragePoolCommand cmd = new CheckVMActivityOnStoragePoolCommand(agent, pool, volume_list, suspectTime); + final CheckVMActivityOnStoragePoolCommand cmd = new CheckVMActivityOnStoragePoolCommand(host, pool, volume_list, suspectTime); - logger.debug("Checking VM activity for {} on storage pool [{}].", agent.toString(), pool); + logger.debug("Checking VM activity for {} on storage pool [{}].", host.toString(), pool); try { - Answer answer = storageManager.sendToPool(pool, getNeighbors(agent), cmd); + Answer answer = storageManager.sendToPool(pool, getNeighbors(host), cmd); if (answer != null) { activityStatus = !answer.getResult(); - logger.debug("{} {} activity on storage pool [{}]", agent.toString(), activityStatus ? "has" : "does not have", pool); + logger.debug("{} {} activity on storage pool [{}]", host.toString(), activityStatus ? "has" : "does not have", pool); } else { - String message = String.format("Did not get a valid response for VM activity check for %s on storage pool [%s].", agent.toString(), pool); + String message = String.format("Did not get a valid response for VM activity check for %s on storage pool [%s].", host.toString(), pool); logger.debug(message); throw new IllegalStateException(message); } } catch (StorageUnavailableException e){ - String message = String.format("Storage [%s] is unavailable to do the check, probably the %s is not reachable.", pool, agent); + String message = String.format("Storage [%s] is unavailable to do the check, probably the %s is not reachable.", pool, host); logger.warn(message, e); throw new HACheckerException(message, e); } return activityStatus; } - private HashMap> getVolumeUuidOnHost(Host agent) { - List vm_list = vmInstanceDao.listByHostId(agent.getId()); + private HashMap> getVolumeUuidOnHost(Host host) { + List vm_list = vmInstanceDao.listByHostId(host.getId()); List volume_list = new ArrayList(); for (VirtualMachine vm : vm_list) { logger.debug("Retrieving volumes of VM [{}]...", vm); @@ -215,15 +249,15 @@ private HashMap> getVolumeUuidOnHost(Host agent) { return poolVolMap; } - public long[] getNeighbors(Host agent) { + public long[] getNeighbors(Host host) { List neighbors = new ArrayList(); - List cluster_hosts = resourceManager.listHostsInClusterByStatus(agent.getClusterId(), Status.Up); - logger.debug("Retrieving all \"Up\" hosts from cluster [{}]...", clusterDao.findById(agent.getClusterId())); - for (HostVO host : cluster_hosts) { - if (host.getId() == agent.getId() || (host.getHypervisorType() != Hypervisor.HypervisorType.KVM && host.getHypervisorType() != Hypervisor.HypervisorType.LXC)) { + List clusterHosts = resourceManager.listHostsInClusterByStatus(host.getClusterId(), Status.Up); + logger.debug("Retrieving all \"Up\" hosts from cluster [{}]...", clusterDao.findById(host.getClusterId())); + for (HostVO clusterHost : clusterHosts) { + if (clusterHost.getId() == host.getId() || (clusterHost.getHypervisorType() != Hypervisor.HypervisorType.KVM && clusterHost.getHypervisorType() != Hypervisor.HypervisorType.LXC)) { continue; } - neighbors.add(host.getId()); + neighbors.add(clusterHost.getId()); } return ArrayUtils.toPrimitive(neighbors.toArray(new Long[neighbors.size()])); } diff --git a/plugins/hypervisors/kvm/src/test/java/com/cloud/hypervisor/kvm/resource/LibvirtComputingResourceTest.java b/plugins/hypervisors/kvm/src/test/java/com/cloud/hypervisor/kvm/resource/LibvirtComputingResourceTest.java index 0fae98072431..662b09a30448 100644 --- a/plugins/hypervisors/kvm/src/test/java/com/cloud/hypervisor/kvm/resource/LibvirtComputingResourceTest.java +++ b/plugins/hypervisors/kvm/src/test/java/com/cloud/hypervisor/kvm/resource/LibvirtComputingResourceTest.java @@ -3130,7 +3130,7 @@ public void testCheckOnHostCommand() { assertNotNull(wrapper); final Answer answer = wrapper.execute(command, libvirtComputingResourceMock); - assertTrue(answer.getResult()); + assertFalse(answer.getResult()); verify(libvirtComputingResourceMock, times(1)).getMonitor(); } diff --git a/plugins/hypervisors/simulator/src/main/java/com/cloud/ha/SimulatorInvestigator.java b/plugins/hypervisors/simulator/src/main/java/com/cloud/ha/SimulatorInvestigator.java index 7114a8411578..891520bb5c40 100644 --- a/plugins/hypervisors/simulator/src/main/java/com/cloud/ha/SimulatorInvestigator.java +++ b/plugins/hypervisors/simulator/src/main/java/com/cloud/ha/SimulatorInvestigator.java @@ -54,7 +54,7 @@ protected SimulatorInvestigator() { } @Override - public Status isAgentAlive(Host agent) { + public Status getHostAgentStatus(Host agent) { if (agent.getHypervisorType() != HypervisorType.Simulator) { return null; } diff --git a/plugins/hypervisors/vmware/src/main/java/com/cloud/ha/VmwareInvestigator.java b/plugins/hypervisors/vmware/src/main/java/com/cloud/ha/VmwareInvestigator.java index 5bfc18968430..3d4fab0d229e 100644 --- a/plugins/hypervisors/vmware/src/main/java/com/cloud/ha/VmwareInvestigator.java +++ b/plugins/hypervisors/vmware/src/main/java/com/cloud/ha/VmwareInvestigator.java @@ -28,7 +28,7 @@ protected VmwareInvestigator() { } @Override - public Status isAgentAlive(Host agent) { + public Status getHostAgentStatus(Host agent) { if (agent.getHypervisorType() == HypervisorType.VMware) return Status.Disconnected; diff --git a/plugins/storage/volume/linstor/src/main/java/com/cloud/hypervisor/kvm/storage/LinstorStoragePool.java b/plugins/storage/volume/linstor/src/main/java/com/cloud/hypervisor/kvm/storage/LinstorStoragePool.java index bb354bec9b4b..1bcfaa4ebf7f 100644 --- a/plugins/storage/volume/linstor/src/main/java/com/cloud/hypervisor/kvm/storage/LinstorStoragePool.java +++ b/plugins/storage/volume/linstor/src/main/java/com/cloud/hypervisor/kvm/storage/LinstorStoragePool.java @@ -228,11 +228,11 @@ public String getHearthBeatPath() { public String createHeartBeatCommand(HAStoragePool pool, String hostPrivateIp, boolean hostValidation) { LOGGER.trace(String.format("Linstor.createHeartBeatCommand: %s, %s, %b", pool.getPoolIp(), hostPrivateIp, hostValidation)); - boolean isStorageNodeUp = checkingHeartBeat(pool, null); + boolean isStorageNodeUp = hasHeartBeat(pool, null); if (!isStorageNodeUp && !hostValidation) { //restart the host LOGGER.debug(String.format("The host [%s] will be restarted because the health check failed for the storage pool [%s]", hostPrivateIp, pool.getPool().getType())); - Script cmd = new Script(pool.getPool().getHearthBeatPath(), Duration.millis(HeartBeatUpdateTimeout), LOGGER); + Script cmd = new Script(pool.getPool().getHearthBeatPath(), Duration.millis(HeartBeatUpdateTimeoutInMs), LOGGER); cmd.add("-c"); cmd.execute(); return "Down"; @@ -258,7 +258,7 @@ static String getHostname() { } @Override - public Boolean checkingHeartBeat(HAStoragePool pool, HostTO host) { + public Boolean hasHeartBeat(HAStoragePool pool, HostTO host) { String hostName; if (host == null) { hostName = localNodeName; @@ -274,7 +274,7 @@ public Boolean checkingHeartBeat(HAStoragePool pool, HostTO host) { } private String executeDrbdSetupStatus(OutputInterpreter.AllLinesParser parser) { - Script sc = new Script("drbdsetup", Duration.millis(HeartBeatUpdateTimeout), LOGGER); + Script sc = new Script("drbdsetup", Duration.millis(HeartBeatUpdateTimeoutInMs), LOGGER); sc.add("status"); sc.add("--json"); return sc.execute(parser); @@ -329,7 +329,7 @@ private boolean checkDrbdSetupStatusOutput(String output, String otherNodeName) } private String executeDrbdEventsNow(OutputInterpreter.AllLinesParser parser) { - Script sc = new Script("drbdsetup", Duration.millis(HeartBeatUpdateTimeout), LOGGER); + Script sc = new Script("drbdsetup", Duration.millis(HeartBeatUpdateTimeoutInMs), LOGGER); sc.add("events2"); sc.add("--now"); return sc.execute(parser); @@ -369,8 +369,8 @@ private boolean checkHostUpToDateAndConnected(String hostName) { } @Override - public Boolean vmActivityCheck(HAStoragePool pool, HostTO host, Duration activityScriptTimeout, String volumeUUIDListString, String vmActivityCheckPath, long duration) { + public Boolean hasVmActivity(HAStoragePool pool, HostTO host, Duration activityScriptTimeout, String volumeUUIDListString, String vmActivityCheckPath, long duration) { LOGGER.trace(String.format("Linstor.vmActivityCheck: %s, %s", pool.getPoolIp(), host.getPrivateNetwork().getIp())); - return checkingHeartBeat(pool, host); + return hasHeartBeat(pool, host); } } diff --git a/plugins/storage/volume/storpool/src/main/java/com/cloud/hypervisor/kvm/storage/StorPoolStoragePool.java b/plugins/storage/volume/storpool/src/main/java/com/cloud/hypervisor/kvm/storage/StorPoolStoragePool.java index ab5dc03d3431..04100a3c6d38 100644 --- a/plugins/storage/volume/storpool/src/main/java/com/cloud/hypervisor/kvm/storage/StorPoolStoragePool.java +++ b/plugins/storage/volume/storpool/src/main/java/com/cloud/hypervisor/kvm/storage/StorPoolStoragePool.java @@ -198,11 +198,11 @@ public String getHearthBeatPath() { @Override public String createHeartBeatCommand(HAStoragePool primaryStoragePool, String hostPrivateIp, boolean hostValidation) { - boolean isStorageNodeUp = checkingHeartBeat(primaryStoragePool, null); + boolean isStorageNodeUp = hasHeartBeat(primaryStoragePool, null); if (!isStorageNodeUp && !hostValidation) { //restart the host logger.debug(String.format("The host [%s] will be restarted because the health check failed for the storage pool [%s]", hostPrivateIp, primaryStoragePool.getPool().getType())); - Script cmd = new Script(primaryStoragePool.getPool().getHearthBeatPath(), HeartBeatUpdateTimeout, logger); + Script cmd = new Script(primaryStoragePool.getPool().getHearthBeatPath(), HeartBeatUpdateTimeoutInMs, logger); cmd.add("-c"); cmd.execute(); return "Down"; @@ -240,7 +240,7 @@ public static final String getStorPoolConfigParam(String param) { } @Override - public Boolean checkingHeartBeat(HAStoragePool pool, HostTO host) { + public Boolean hasHeartBeat(HAStoragePool pool, HostTO host) { boolean isNodeWorking = false; OutputInterpreter.AllLinesParser parser = new OutputInterpreter.AllLinesParser(); @@ -300,8 +300,8 @@ private String executeStorPoolServiceListCmd(OutputInterpreter.AllLinesParser pa } @Override - public Boolean vmActivityCheck(HAStoragePool pool, HostTO host, Duration activityScriptTimeout, String volumeUuidListString, String vmActivityCheckPath, long duration) { - return checkingHeartBeat(pool, host); + public Boolean hasVmActivity(HAStoragePool pool, HostTO host, Duration activityScriptTimeout, String volumeUuidListString, String vmActivityCheckPath, long duration) { + return hasHeartBeat(pool, host); } @Override diff --git a/server/src/main/java/com/cloud/ha/CheckOnAgentInvestigator.java b/server/src/main/java/com/cloud/ha/CheckOnAgentInvestigator.java index d7945ef20776..e0dbc8fcbe11 100644 --- a/server/src/main/java/com/cloud/ha/CheckOnAgentInvestigator.java +++ b/server/src/main/java/com/cloud/ha/CheckOnAgentInvestigator.java @@ -38,7 +38,7 @@ protected CheckOnAgentInvestigator() { } @Override - public Status isAgentAlive(Host agent) { + public Status getHostAgentStatus(Host agent) { return null; } diff --git a/server/src/main/java/com/cloud/ha/HighAvailabilityManagerImpl.java b/server/src/main/java/com/cloud/ha/HighAvailabilityManagerImpl.java index c8635397b661..d05082c8d1de 100644 --- a/server/src/main/java/com/cloud/ha/HighAvailabilityManagerImpl.java +++ b/server/src/main/java/com/cloud/ha/HighAvailabilityManagerImpl.java @@ -270,7 +270,7 @@ public Status investigate(final long hostId) { Status hostState = null; for (Investigator investigator : investigators) { - hostState = investigator.isAgentAlive(host); + hostState = investigator.getHostAgentStatus(host); if (hostState != null) { if (logger.isDebugEnabled()) { logger.debug("{} was able to determine host {} is in {}", investigator.getName(), host, hostState.toString()); @@ -278,7 +278,7 @@ public Status investigate(final long hostId) { return hostState; } if (logger.isDebugEnabled()) { - logger.debug(investigator.getName() + " unable to determine the state of the host. Moving on."); + logger.debug("{} unable to determine the state of the host. Moving on.", investigator.getName()); } } @@ -570,9 +570,9 @@ private void startVm(VirtualMachine vm, Map } protected Long restart(final HaWorkVO work) { - logger.debug("RESTART with HAWORK"); + logger.debug("RESTART with HA WORK"); List items = _haDao.listFutureHaWorkForVm(work.getInstanceId(), work.getId()); - if (items.size() > 0) { + if (!items.isEmpty()) { StringBuilder str = new StringBuilder("Cancelling this work item because newer ones have been scheduled. Work Ids = ["); for (HaWorkVO item : items) { str.append(item.getId()).append(", "); @@ -583,7 +583,7 @@ protected Long restart(final HaWorkVO work) { } items = _haDao.listRunningHaWorkForVm(work.getInstanceId()); - if (items.size() > 0) { + if (!items.isEmpty()) { StringBuilder str = new StringBuilder("Waiting because there's HA work being executed on an item currently. Work Ids =["); for (HaWorkVO item : items) { str.append(item.getId()).append(", "); @@ -597,21 +597,21 @@ protected Long restart(final HaWorkVO work) { VirtualMachine vm = _itMgr.findById(work.getInstanceId()); if (vm == null) { - logger.info("Unable to find vm: " + vmId); + logger.info("Unable to find vm: {}", vmId); return null; } if (checkAndCancelWorkIfNeeded(work)) { return null; } - logger.info("HA on " + vm); + logger.info("HA on {}", vm); if (vm.getState() != work.getPreviousState() || vm.getUpdated() != work.getUpdateTime()) { - logger.info("VM " + vm + " has been changed. Current State = " + vm.getState() + " Previous State = " + work.getPreviousState() + " last updated = " + - vm.getUpdated() + " previous updated = " + work.getUpdateTime()); + logger.info("VM {} has been changed. Current State = {} Previous State = {} last updated = {} previous updated = {}", + vm, vm.getState(), work.getPreviousState(), vm.getUpdated(), work.getUpdateTime()); return null; } if (vm.getHostId() != null && !vm.getHostId().equals(work.getHostId())) { - logger.info("VM " + vm + " has been changed. Current host id = " + vm.getHostId() + " Previous host id = " + work.getHostId()); + logger.info("VM {} has been changed. Current host id = {} Previous host id = {}", vm, vm.getHostId(), work.getHostId()); return null; } @@ -652,40 +652,39 @@ protected Long restart(final HaWorkVO work) { try { alive = investigator.isVmAlive(vm, host); - logger.info(investigator.getName() + " found " + vm + " to be alive? " + alive); + logger.info("{} found {} to be alive? {}", investigator.getName(), vm, alive); break; } catch (UnknownVM e) { - logger.info(investigator.getName() + " could not find " + vm); + logger.info("{} could not find {}", investigator.getName(), vm); } } boolean fenced = false; if (alive == null) { - logger.debug("Fencing off VM that we don't know the state of"); + logger.debug("Fencing off VM {} that we don't know the state of", vm); for (FenceBuilder fb : fenceBuilders) { Boolean result = fb.fenceOff(vm, host); - logger.info("Fencer " + fb.getName() + " returned " + result); + logger.info("Fencer {} returned {}", fb.getName(), result); if (result != null && result) { fenced = true; break; } } - } else if (!alive) { fenced = true; } else { - logger.debug("VM {} is found to be alive by {}", vm, investigator.getName()); + logger.debug("VM {} is found to be alive by {} on host {}", vm, investigator.getName(), host); if (host.getStatus() == Status.Up) { - logger.info(vm + " is alive and host is up. No need to restart it."); + logger.info("{} is alive and host {} is up. No need to restart it.", vm, host); return null; } else { - logger.debug("Rescheduling because the host is not up but the vm is alive"); + logger.debug("Rescheduling because the host {} is not up but the vm {} is alive", host, vm); return (System.currentTimeMillis() >> 10) + _investigateRetryInterval; } } if (!fenced) { - logger.debug("We were unable to fence off the VM " + vm); + logger.debug("We were unable to fence off the VM {}", vm); _alertMgr.sendAlert(alertType, vm.getDataCenterId(), vm.getPodIdToDeployIn(), "Unable to restart " + vm.getHostName() + " which was running on host " + hostDesc, "Insufficient capacity to restart VM, name: " + vm.getHostName() + ", id: " + vmId + " which was running on host " + hostDesc); @@ -728,7 +727,7 @@ protected Long restart(final HaWorkVO work) { if (!ForceHA.value() && !vm.isHaEnabled()) { if (logger.isDebugEnabled()) { - logger.debug("VM is not HA enabled so we're done."); + logger.debug("VM {} is not HA enabled so we're done.", vm); } return null; // VM doesn't require HA } @@ -736,7 +735,7 @@ protected Long restart(final HaWorkVO work) { if ((host == null || host.getRemoved() != null || host.getState() != Status.Up) && !volumeMgr.canVmRestartOnAnotherServer(vm.getId())) { if (logger.isDebugEnabled()) { - logger.debug("VM can not restart on another server."); + logger.debug("VM {} can not restart on another server.", vm); } return null; } @@ -777,13 +776,13 @@ protected Long restart(final HaWorkVO work) { if (started != null && started.getState() == VirtualMachine.State.Running) { String message = String.format("HA starting VM: %s (%s)", started.getHostName(), started.getInstanceName()); HostVO hostVmHasStarted = _hostDao.findById(started.getHostId()); - logger.info(String.format("HA is now restarting %s on %s", started, hostVmHasStarted)); + logger.info("HA is now restarting {} on {}", started, hostVmHasStarted); _alertMgr.sendAlert(alertType, vm.getDataCenterId(), vm.getPodIdToDeployIn(), message, message); return null; } if (logger.isDebugEnabled()) { - logger.debug("Rescheduling VM " + vm.toString() + " to try again in " + _restartRetryInterval); + logger.debug("Rescheduling VM {} to try again in {}", vm.toString(), _restartRetryInterval); } } catch (final InsufficientCapacityException e) { logger.warn("Unable to restart " + vm.toString() + " due to " + e.getMessage()); @@ -825,13 +824,14 @@ protected boolean checkAndCancelWorkIfNeeded(final HaWorkVO work) { } public Long migrate(final HaWorkVO work) { + logger.debug("MIGRATE with HA WORK"); long vmId = work.getInstanceId(); long srcHostId = work.getHostId(); HostVO srcHost = _hostDao.findById(srcHostId); VMInstanceVO vm = _instanceDao.findById(vmId); if (vm == null) { - logger.info("Unable to find vm: " + vmId + ", skipping migrate."); + logger.info("Unable to find vm: {}, skipping migrate.", vmId); return null; } if (checkAndCancelWorkIfNeeded(work)) { @@ -840,11 +840,11 @@ public Long migrate(final HaWorkVO work) { logger.info("Migration attempt: for {} from {}. Starting attempt: {}/{} times.", vm, srcHost, 1 + work.getTimesTried(), _maxRetries); if (VirtualMachine.State.Stopped.equals(vm.getState())) { - logger.info(String.format("vm %s is Stopped, skipping migrate.", vm)); + logger.info("vm {} is Stopped, skipping migrate.", vm); return null; } if (VirtualMachine.State.Running.equals(vm.getState()) && srcHostId != vm.getHostId()) { - logger.info(String.format("VM %s is running on a different host %s, skipping migration", vm, vm.getHostId())); + logger.info("VM {} is running on a different host {}, skipping migration", vm, vm.getHostId()); return null; } @@ -879,7 +879,7 @@ public boolean scheduleDestroy(VMInstanceVO vm, long hostId, ReasonType reasonTy final HaWorkVO work = new HaWorkVO(vm.getId(), vm.getType(), WorkType.Destroy, Step.Scheduled, hostId, vm.getState(), 0, vm.getUpdated(), reasonType); _haDao.persist(work); if (logger.isDebugEnabled()) { - logger.debug("Scheduled " + work.toString()); + logger.debug("{}}", work.toString()); } wakeupWorkers(); return true; @@ -897,7 +897,7 @@ private void stopVMWithCleanup(VirtualMachine vm, VirtualMachine.State state) th } private void destroyVM(VirtualMachine vm, boolean expunge) throws OperationTimedoutException, AgentUnavailableException { - logger.info("Destroying " + vm.toString()); + logger.info("Destroying {}", vm.toString()); if (VirtualMachine.Type.ConsoleProxy.equals(vm.getType())) { consoleProxyManager.destroyProxy(vm.getId()); } else if (VirtualMachine.Type.SecondaryStorageVm.equals(vm.getType())) { @@ -908,9 +908,10 @@ private void destroyVM(VirtualMachine vm, boolean expunge) throws OperationTimed } protected Long destroyVM(final HaWorkVO work) { + logger.debug("DESTROY with HA WORK"); final VirtualMachine vm = _itMgr.findById(work.getInstanceId()); if (vm == null) { - logger.info("No longer can find VM " + work.getInstanceId() + ". Throwing away " + work); + logger.info("No longer can find VM {}. Throwing away {}", work.getInstanceId(), work); return null; } if (checkAndCancelWorkIfNeeded(work)) { @@ -944,20 +945,21 @@ protected Long destroyVM(final HaWorkVO work) { } protected Long stopVM(final HaWorkVO work) throws ConcurrentOperationException { + logger.debug("STOP with HA WORK"); VirtualMachine vm = _itMgr.findById(work.getInstanceId()); if (vm == null) { - logger.info("No longer can find VM " + work.getInstanceId() + ". Throwing away " + work); + logger.info("No longer can find VM {}. Throwing away {}", work.getInstanceId(), work); work.setStep(Step.Done); return null; } if (checkAndCancelWorkIfNeeded(work)) { return null; } - logger.info("Stopping " + vm); + logger.info("Stopping {}", vm); try { if (work.getWorkType() == WorkType.Stop) { _itMgr.advanceStop(vm.getUuid(), false); - logger.info("Successfully stopped " + vm); + logger.info("Successfully stopped {}", vm); return null; } else if (work.getWorkType() == WorkType.CheckStop) { if ((vm.getState() != work.getPreviousState()) || vm.getUpdated() != work.getUpdateTime() || vm.getHostId() == null || @@ -969,7 +971,7 @@ protected Long stopVM(final HaWorkVO work) throws ConcurrentOperationException { } _itMgr.advanceStop(vm.getUuid(), false); - logger.info("Stop for " + vm + " was successful"); + logger.info("Stop for {} was successful", vm); return null; } else if (work.getWorkType() == WorkType.ForceStop) { if ((vm.getState() != work.getPreviousState()) || vm.getUpdated() != work.getUpdateTime() || vm.getHostId() == null || @@ -981,13 +983,13 @@ protected Long stopVM(final HaWorkVO work) throws ConcurrentOperationException { } _itMgr.advanceStop(vm.getUuid(), true); - logger.info("Stop for " + vm + " was successful"); + logger.info("Stop for {} was successful", vm); return null; } else { assert false : "Who decided there's other steps but didn't modify the guy who does the work?"; } } catch (final ResourceUnavailableException e) { - logger.debug("Agnet is not available" + e.getMessage()); + logger.debug("Agent is not available" + e.getMessage()); } catch (OperationTimedoutException e) { logger.debug("operation timed out: " + e.getMessage()); } @@ -1043,7 +1045,8 @@ private void processWork(final HaWorkVO work) { try { if (vm != null && !VmHaEnabled.valueIn(vm.getDataCenterId())) { if (logger.isDebugEnabled()) { - logger.debug(String.format("VM high availability manager is disabled, rescheduling the HA work %s, for the VM %s (id) to retry later in case VM high availability manager is enabled on retry attempt", work, vm.getName(), vm.getId())); + logger.debug("VM high availability manager is disabled, rescheduling the HA work {} for the VM {} ({}) " + + "to retry later in case VM high availability manager is enabled on retry attempt", work, vm.getName(), vm.getId()); } long nextTime = getRescheduleTime(wt); rescheduleWork(work, nextTime); @@ -1065,13 +1068,13 @@ private void processWork(final HaWorkVO work) { } if (nextTime == null) { - logger.info("Completed work " + work + ". Took " + (work.getTimesTried() + 1) + "/" + _maxRetries + " attempts."); + logger.info("Completed work {}. Took {}/{} attempts.", work, work.getTimesTried() + 1, _maxRetries); work.setStep(Step.Done); } else { rescheduleWork(work, nextTime.longValue()); } } catch (Exception e) { - logger.warn("Encountered unhandled exception during HA process, reschedule work", e); + logger.warn("Encountered unhandled exception during HA process, reschedule work {}", work, e); long nextTime = getRescheduleTime(wt); rescheduleWork(work, nextTime); @@ -1085,11 +1088,11 @@ private void processWork(final HaWorkVO work) { } finally { if (!Step.Done.equals(work.getStep())) { if (work.getTimesTried() >= _maxRetries) { - logger.warn("Giving up, retried max " + work.getTimesTried() + "/" + _maxRetries + " times for work: " + work); + logger.warn("Giving up, retried max {}/{} times for work: {}", work.getTimesTried(), _maxRetries, work); work.setStep(Step.Done); } else { - logger.warn("Rescheduling work " + work + " to try again at " + new Date(work.getTimeToTry() << 10) + - ". Finished attempt " + work.getTimesTried() + "/" + _maxRetries + " times."); + logger.warn("Rescheduling work {} to try again at {}. Finished attempt {}/{} times.", + work, new Date(work.getTimeToTry() << 10), work.getTimesTried(), _maxRetries); } } _haDao.update(work.getId(), work); diff --git a/server/src/main/java/com/cloud/ha/ManagementIPSystemVMInvestigator.java b/server/src/main/java/com/cloud/ha/ManagementIPSystemVMInvestigator.java index 0972f2451afb..d14c4baafb34 100644 --- a/server/src/main/java/com/cloud/ha/ManagementIPSystemVMInvestigator.java +++ b/server/src/main/java/com/cloud/ha/ManagementIPSystemVMInvestigator.java @@ -104,7 +104,7 @@ public boolean isVmAlive(VirtualMachine vm, Host host) throws UnknownVM { } @Override - public Status isAgentAlive(Host agent) { + public Status getHostAgentStatus(Host agent) { return null; } diff --git a/server/src/main/java/com/cloud/ha/UserVmDomRInvestigator.java b/server/src/main/java/com/cloud/ha/UserVmDomRInvestigator.java index 7d063b3088e4..82074460f2af 100644 --- a/server/src/main/java/com/cloud/ha/UserVmDomRInvestigator.java +++ b/server/src/main/java/com/cloud/ha/UserVmDomRInvestigator.java @@ -103,7 +103,7 @@ public boolean isVmAlive(VirtualMachine vm, Host host) throws UnknownVM { } @Override - public Status isAgentAlive(Host agent) { + public Status getHostAgentStatus(Host agent) { if (logger.isDebugEnabled()) { logger.debug("checking if agent ({}) is alive", agent); } diff --git a/server/src/main/java/com/cloud/ha/XenServerInvestigator.java b/server/src/main/java/com/cloud/ha/XenServerInvestigator.java index 5482a7f148e6..fea44c97ab58 100644 --- a/server/src/main/java/com/cloud/ha/XenServerInvestigator.java +++ b/server/src/main/java/com/cloud/ha/XenServerInvestigator.java @@ -46,7 +46,7 @@ protected XenServerInvestigator() { } @Override - public Status isAgentAlive(Host agent) { + public Status getHostAgentStatus(Host agent) { if (agent.getHypervisorType() != HypervisorType.XenServer) { return null; } @@ -74,7 +74,7 @@ public Status isAgentAlive(Host agent) { @Override public boolean isVmAlive(VirtualMachine vm, Host host) throws UnknownVM { - Status status = isAgentAlive(host); + Status status = getHostAgentStatus(host); if (status == null) { throw new UnknownVM(); } diff --git a/server/src/test/java/com/cloud/ha/HighAvailabilityManagerImplTest.java b/server/src/test/java/com/cloud/ha/HighAvailabilityManagerImplTest.java index 9274bd1ff085..42b9a701ad8e 100644 --- a/server/src/test/java/com/cloud/ha/HighAvailabilityManagerImplTest.java +++ b/server/src/test/java/com/cloud/ha/HighAvailabilityManagerImplTest.java @@ -362,7 +362,7 @@ public void investigateHostStatusSuccess() { investigators.add(investigator); highAvailabilityManager.setInvestigators(investigators); // Mock isAgentAlive to return host status as Down - Mockito.when(investigator.isAgentAlive(hostVO)).thenReturn(Status.Down); + Mockito.when(investigator.getHostAgentStatus(hostVO)).thenReturn(Status.Down); ConfigKey haEnabled = Mockito.mock(ConfigKey.class); highAvailabilityManager.VmHaEnabled = haEnabled;