Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 9 additions & 7 deletions api/src/main/java/com/cloud/ha/Investigator.java
Original file line number Diff line number Diff line change
Expand Up @@ -26,17 +26,19 @@ public interface Investigator extends Adapter {
* Returns whether the VM is still alive.
*
* @param vm to work on.
* @return true if vm is alive, otherwise false
*/
public boolean isVmAlive(VirtualMachine vm, Host host) throws UnknownVM;
boolean isVmAlive(VirtualMachine vm, Host host) throws UnknownVM;

public Status isAgentAlive(Host agent);
/**
* Returns the agent status of the host.
*
* @param host the host whose agent status is requested
* @return status of the host agent
*/
Status getHostAgentStatus(Host host);

/**
 * Checked exception listed in the {@code throws} clause of {@code isVmAlive}.
 * The investigator implementations visible alongside this interface throw it
 * when the host agent status they obtain is {@code null}.
 */
class UnknownVM extends Exception {

/**
* Serialization version identifier for this exception type.
*/
private static final long serialVersionUID = 1L;

};
}
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,8 @@ public CheckOnHostAnswer(CheckOnHostCommand cmd, Boolean alive, String details)

/**
 * Creates an answer that carries only a details message.
 * Passes {@code false} as the second argument of the superclass constructor and
 * explicitly sets both {@code determined} and {@code alive} to {@code false}.
 *
 * @param cmd     the command forwarded to the superclass constructor
 * @param details the details text forwarded to the superclass constructor
 */
public CheckOnHostAnswer(CheckOnHostCommand cmd, String details) {
super(cmd, false, details);
determined = false;
alive = false;
}

public boolean isDetermined() {
Expand All @@ -47,5 +49,4 @@ public boolean isDetermined() {
/**
 * @return the current value of the {@code alive} field; the details-only
 *         constructor sets it to {@code false}
 */
public boolean isAlive() {
return alive;
}

}
10 changes: 5 additions & 5 deletions core/src/main/java/com/cloud/agent/api/CheckOnHostCommand.java
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@

public class CheckOnHostCommand extends Command {
HostTO host;
boolean reportCheckFailureIfOneStorageIsDown;
boolean reportIfHeartBeatFailedForOneStoragePool;

/**
 * No-argument constructor that leaves {@code host} unassigned.
 * NOTE(review): presumably kept for deserialization of commands; confirm before removing.
 */
protected CheckOnHostCommand() {
}
Expand All @@ -34,17 +34,17 @@ public CheckOnHostCommand(Host host) {
setWait(20);
}

public CheckOnHostCommand(Host host, boolean reportCheckFailureIfOneStorageIsDown) {
public CheckOnHostCommand(Host host, boolean reportIfHeartBeatFailedForOneStoragePool) {
this(host);
this.reportCheckFailureIfOneStorageIsDown = reportCheckFailureIfOneStorageIsDown;
this.reportIfHeartBeatFailedForOneStoragePool = reportIfHeartBeatFailedForOneStoragePool;
}

/**
 * @return the {@link HostTO} stored in the {@code host} field
 */
public HostTO getHost() {
return host;
}

public boolean isCheckFailedOnOneStorage() {
return reportCheckFailureIfOneStorageIsDown;
public boolean shouldReportIfHeartBeatFailedForOneStoragePool() {
return reportIfHeartBeatFailedForOneStoragePool;
}

@Override
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -75,10 +75,10 @@ public interface HighAvailabilityManager extends Manager {
+ " which are registered for the HA event that were successful and are now ready to be purged.",
true, Cluster);

public static final ConfigKey<Boolean> KvmHAFenceHostIfHeartbeatFailsOnStorage = new ConfigKey<>("Advanced", Boolean.class, "kvm.ha.fence.on.storage.heartbeat.failure", "false",
ConfigKey<Boolean> KvmHAFenceHostIfHeartbeatFailsOnStorage = new ConfigKey<>("Advanced", Boolean.class, "kvm.ha.fence.on.storage.heartbeat.failure", "false",
"Proceed fencing the host even the heartbeat failed for only one storage pool", false, ConfigKey.Scope.Zone);

public enum WorkType {
enum WorkType {
Migration, // Migrating VMs off of a host.
Stop, // Stops a VM for storage pool migration purposes. This should be obsolete now.
CheckStop, // Checks if a VM has been stopped.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -41,15 +41,15 @@ public class HypervInvestigator extends AdapterBase implements Investigator {

@Override
public boolean isVmAlive(com.cloud.vm.VirtualMachine vm, Host host) throws UnknownVM {
Status status = isAgentAlive(host);
Status status = getHostAgentStatus(host);
if (status == null) {
throw new UnknownVM();
}
return status == Status.Up ? true : null;
return status == Status.Up;
}

@Override
public Status isAgentAlive(Host agent) {
public Status getHostAgentStatus(Host agent) {
if (agent.getHypervisorType() != Hypervisor.HypervisorType.Hyperv) {
return null;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,10 +19,7 @@
package com.cloud.ha;

import com.cloud.agent.AgentManager;
import com.cloud.agent.api.Answer;
import com.cloud.agent.api.CheckOnHostCommand;
import com.cloud.host.Host;
import com.cloud.host.HostVO;
import com.cloud.host.Status;
import com.cloud.host.dao.HostDao;
import com.cloud.hypervisor.Hypervisor;
Expand All @@ -34,11 +31,12 @@
import org.apache.cloudstack.engine.subsystem.api.storage.DataStoreProviderManager;
import org.apache.cloudstack.engine.subsystem.api.storage.PrimaryDataStoreDriver;
import org.apache.cloudstack.ha.HAManager;
import org.apache.cloudstack.kvm.ha.KVMHostActivityChecker;
import org.apache.cloudstack.storage.datastore.db.PrimaryDataStoreDao;
import org.apache.cloudstack.storage.datastore.db.StoragePoolVO;

import javax.inject.Inject;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;

public class KVMInvestigator extends AdapterBase implements Investigator {
Expand All @@ -54,13 +52,15 @@ public class KVMInvestigator extends AdapterBase implements Investigator {
private HAManager haManager;
@Inject
private DataStoreProviderManager dataStoreProviderMgr;
@Inject
private KVMHostActivityChecker hostActivityChecker;

@Override
public boolean isVmAlive(com.cloud.vm.VirtualMachine vm, Host host) throws UnknownVM {
if (haManager.isHAEligible(host)) {
return haManager.isVMAliveOnHost(host);
}
Status status = isAgentAlive(host);
Status status = getHostAgentStatus(host);
logger.debug("HA: HOST is ineligible legacy state {} for host {}", status, host);
if (status == null) {
throw new UnknownVM();
Expand All @@ -73,86 +73,41 @@ public boolean isVmAlive(com.cloud.vm.VirtualMachine vm, Host host) throws Unkno
}

@Override
public Status isAgentAlive(Host agent) {
if (agent.getHypervisorType() != Hypervisor.HypervisorType.KVM && agent.getHypervisorType() != Hypervisor.HypervisorType.LXC) {
public Status getHostAgentStatus(Host host) {
if (host.getHypervisorType() != Hypervisor.HypervisorType.KVM && host.getHypervisorType() != Hypervisor.HypervisorType.LXC) {
return null;
}

if (haManager.isHAEligible(agent)) {
return haManager.getHostStatus(agent);
if (haManager.isHAEligible(host)) {
return haManager.getHostStatus(host);
}

List<StoragePoolVO> clusterPools = _storagePoolDao.findPoolsInClusters(Arrays.asList(agent.getClusterId()), null);
boolean storageSupportHA = storageSupportHa(clusterPools);
if (!storageSupportHA) {
List<StoragePoolVO> zonePools = _storagePoolDao.findZoneWideStoragePoolsByHypervisor(agent.getDataCenterId(), agent.getHypervisorType());
storageSupportHA = storageSupportHa(zonePools);
List<StoragePoolVO> clusterPools = _storagePoolDao.findPoolsInClusters(Collections.singletonList(host.getClusterId()), null);
boolean storageSupportsHA = storageSupportsHA(clusterPools);
if (!storageSupportsHA) {
List<StoragePoolVO> zonePools = _storagePoolDao.findZoneWideStoragePoolsByHypervisor(host.getDataCenterId(), host.getHypervisorType());
storageSupportsHA = storageSupportsHA(zonePools);
}
if (!storageSupportHA) {
logger.warn("Agent investigation was requested on host {}, but host does not support investigation because it has no NFS storage. Skipping investigation.", agent);
if (!storageSupportsHA) {
logger.warn("Agent investigation was requested on host {}, but host does not support investigation" +
" because it has no HA supported storage. Skipping investigation.", host);
return null;
}

Status hostStatus = null;
Status neighbourStatus = null;
boolean reportFailureIfOneStorageIsDown = HighAvailabilityManager.KvmHAFenceHostIfHeartbeatFailsOnStorage.value();
CheckOnHostCommand cmd = new CheckOnHostCommand(agent, reportFailureIfOneStorageIsDown);

try {
Answer answer = _agentMgr.easySend(agent.getId(), cmd);
if (answer != null) {
hostStatus = answer.getResult() ? Status.Down : Status.Up;
}
} catch (Exception e) {
logger.debug("Failed to send command to host: {}", agent);
}
if (hostStatus == null) {
hostStatus = Status.Disconnected;
}

List<HostVO> neighbors = _resourceMgr.listHostsInClusterByStatus(agent.getClusterId(), Status.Up);
for (HostVO neighbor : neighbors) {
if (neighbor.getId() == agent.getId()
|| (neighbor.getHypervisorType() != Hypervisor.HypervisorType.KVM && neighbor.getHypervisorType() != Hypervisor.HypervisorType.LXC)) {
continue;
}
logger.debug("Investigating host:{} via neighbouring host:{}", agent, neighbor);
try {
Answer answer = _agentMgr.easySend(neighbor.getId(), cmd);
if (answer != null) {
neighbourStatus = answer.getResult() ? Status.Down : Status.Up;
logger.debug("Neighbouring host:{} returned status:{} for the investigated host:{}", neighbor, neighbourStatus, agent);
if (neighbourStatus == Status.Up) {
break;
}
}
} catch (Exception e) {
logger.debug("Failed to send command to host: {}", neighbor);
}
}
if (neighbourStatus == Status.Up && (hostStatus == Status.Disconnected || hostStatus == Status.Down)) {
hostStatus = Status.Disconnected;
}
if (neighbourStatus == Status.Down && (hostStatus == Status.Disconnected || hostStatus == Status.Down)) {
hostStatus = Status.Down;
}
logger.debug("HA: HOST is ineligible legacy state {} for host {}", hostStatus, agent);
return hostStatus;
return hostActivityChecker.getHostAgentStatus(host);
}

private boolean storageSupportHa(List<StoragePoolVO> pools) {
boolean storageSupportHA = false;
private boolean storageSupportsHA(List<StoragePoolVO> pools) {
for (StoragePoolVO pool : pools) {
DataStoreProvider storeProvider = dataStoreProviderMgr.getDataStoreProvider(pool.getStorageProviderName());
DataStoreDriver storeDriver = storeProvider.getDataStoreDriver();
if (storeDriver instanceof PrimaryDataStoreDriver) {
PrimaryDataStoreDriver primaryStoreDriver = (PrimaryDataStoreDriver)storeDriver;
if (primaryStoreDriver.isStorageSupportHA(pool.getPoolType())) {
storageSupportHA = true;
break;
return true;
}
}
}
return storageSupportHA;
return false;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -35,10 +35,9 @@
public class KVMHABase {
protected Logger logger = LogManager.getLogger(getClass());
private long _timeout = 60000; /* 1 minutes */
protected long _heartBeatUpdateTimeout = AgentPropertiesFileHandler.getPropertyValue(AgentProperties.HEARTBEAT_UPDATE_TIMEOUT);
protected long _heartBeatUpdateFreq = AgentPropertiesFileHandler.getPropertyValue(AgentProperties.KVM_HEARTBEAT_UPDATE_FREQUENCY);
protected long _heartBeatUpdateFreqInMs = AgentPropertiesFileHandler.getPropertyValue(AgentProperties.KVM_HEARTBEAT_UPDATE_FREQUENCY);
protected long _heartBeatUpdateMaxTries = AgentPropertiesFileHandler.getPropertyValue(AgentProperties.KVM_HEARTBEAT_UPDATE_MAX_TRIES);
protected long _heartBeatUpdateRetrySleep = AgentPropertiesFileHandler.getPropertyValue(AgentProperties.KVM_HEARTBEAT_UPDATE_RETRY_SLEEP);
protected long _heartBeatUpdateRetrySleepInMs = AgentPropertiesFileHandler.getPropertyValue(AgentProperties.KVM_HEARTBEAT_UPDATE_RETRY_SLEEP);

public static enum PoolType {
PrimaryStorage, SecondaryStorage
Expand Down Expand Up @@ -234,7 +233,7 @@ protected String runScriptRetry(String cmdString, OutputInterpreter interpreter)
return result;
}

public Boolean checkingHeartBeat() {
public Boolean hasHeartBeat() {
// TODO Auto-generated method stub
return null;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,44 +26,42 @@
public class KVMHAChecker extends KVMHABase implements Callable<Boolean> {
private List<HAStoragePool> storagePools;
private HostTO host;
private boolean reportFailureIfOneStorageIsDown;
private boolean reportIfHeartBeatFailedForOneStoragePool;

public KVMHAChecker(List<HAStoragePool> pools, HostTO host, boolean reportFailureIfOneStorageIsDown) {
public KVMHAChecker(List<HAStoragePool> pools, HostTO host, boolean reportIfHeartBeatFailedForOneStoragePool) {
this.storagePools = pools;
this.host = host;
this.reportFailureIfOneStorageIsDown = reportFailureIfOneStorageIsDown;
this.reportIfHeartBeatFailedForOneStoragePool = reportIfHeartBeatFailedForOneStoragePool;
}

/*
* True means heartbeating is ongoing, or we can't get its status. False
* means heartbeating has definitely stopped
* True means heart beating is ongoing, or we can't get its status.
* False means heart beating has definitely stopped.
*/
@Override
public Boolean checkingHeartBeat() {
boolean validResult = false;

public Boolean hasHeartBeat() {
String hostAndPools = String.format("host IP [%s] in pools [%s]", host.getPrivateNetwork().getIp(), storagePools.stream().map(pool -> pool.getPoolUUID()).collect(Collectors.joining(", ")));
logger.debug("Checking heart beat with KVMHAChecker for {}", hostAndPools);

logger.debug(String.format("Checking heart beat with KVMHAChecker for %s", hostAndPools));

boolean heartBeatCheckResult = false;
for (HAStoragePool pool : storagePools) {
validResult = pool.getPool().checkingHeartBeat(pool, host);
if (reportFailureIfOneStorageIsDown && !validResult) {
heartBeatCheckResult = pool.getPool().hasHeartBeat(pool, host);
if (reportIfHeartBeatFailedForOneStoragePool && !heartBeatCheckResult) {
break;
}
}

if (!validResult) {
logger.warn(String.format("All checks with KVMHAChecker for %s considered it as dead. It may cause a shutdown of the host.", hostAndPools));
if (!heartBeatCheckResult) {
logger.warn("All checks with KVMHAChecker for {} considered it as dead. It may cause a shutdown of the host.", hostAndPools);
}

return validResult;
return heartBeatCheckResult;
}

@Override
public Boolean call() throws Exception {
// logger.addAppender(new org.apache.log4j.ConsoleAppender(new
// org.apache.log4j.PatternLayout(), "System.out"));
return checkingHeartBeat();
return hasHeartBeat();
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,7 @@ protected void runHeartBeat() {
result = executePoolHeartBeatCommand(uuid, primaryStoragePool, result);

if (result != null && rebootHostAndAlertManagementOnHeartbeatTimeout) {
logger.warn(String.format("Write heartbeat for pool [%s] failed: %s; stopping cloudstack-agent.", uuid, result));
logger.warn("Write heartbeat for pool [{}] failed: {}; stopping cloudstack-agent.", uuid, result);
primaryStoragePool.getPool().createHeartBeatCommand(primaryStoragePool, null, false);;
}
}
Expand All @@ -108,9 +108,9 @@ private String executePoolHeartBeatCommand(String uuid, HAStoragePool primarySto
result = primaryStoragePool.getPool().createHeartBeatCommand(primaryStoragePool, hostPrivateIp, true);

if (result != null) {
logger.warn(String.format("Write heartbeat for pool [%s] failed: %s; try: %s of %s.", uuid, result, i, _heartBeatUpdateMaxTries));
logger.warn("Write heartbeat for pool [{}] failed: {}; try: {} of {}.", uuid, result, i, _heartBeatUpdateMaxTries);
try {
Thread.sleep(_heartBeatUpdateRetrySleep);
Thread.sleep(_heartBeatUpdateRetrySleepInMs);
} catch (InterruptedException e) {
logger.debug("[IGNORED] Interrupted between heartbeat retries.", e);
}
Expand All @@ -128,21 +128,21 @@ private void checkForNotExistingLibvirtStoragePools(Set<String> removedPools, St
StoragePool storage = conn.storagePoolLookupByUUIDString(uuid);
if (storage == null || storage.getInfo().state != StoragePoolState.VIR_STORAGE_POOL_RUNNING) {
if (storage == null) {
logger.debug(String.format("Libvirt storage pool [%s] not found, removing from HA list.", uuid));
logger.debug("Libvirt storage pool [{}] not found, removing from HA list.", uuid);
} else {
logger.debug(String.format("Libvirt storage pool [%s] found, but not running, removing from HA list.", uuid));
logger.debug("Libvirt storage pool [{}] found, but not running, removing from HA list.", uuid);
}

removedPools.add(uuid);
}

logger.debug(String.format("Found NFS storage pool [%s] in libvirt, continuing.", uuid));
logger.debug("Found NFS storage pool [{}] in libvirt, continuing.", uuid);

} catch (LibvirtException e) {
logger.debug(String.format("Failed to lookup libvirt storage pool [%s].", uuid), e);
logger.debug("Failed to lookup libvirt storage pool [{}].", uuid, e);

if (e.toString().contains("pool not found")) {
logger.debug(String.format("Removing pool [%s] from HA monitor since it was deleted.", uuid));
logger.debug("Removing pool [{}] from HA monitor since it was deleted.", uuid);
removedPools.add(uuid);
}
}
Expand All @@ -155,11 +155,10 @@ public void run() {
runHeartBeat();

try {
Thread.sleep(_heartBeatUpdateFreq);
Thread.sleep(_heartBeatUpdateFreqInMs);
} catch (InterruptedException e) {
logger.debug("[IGNORED] Interrupted between heartbeats.", e);
}
}
}

}
Loading
Loading