diff --git a/build.gradle b/build.gradle index 17e76704..15fb443c 100644 --- a/build.gradle +++ b/build.gradle @@ -42,6 +42,9 @@ dependencies { compile 'com.google.guava:guava:11.+' compile 'org.apache.httpcomponents:httpclient:4.2.1' compile 'org.apache.httpcomponents:httpcore:4.2.1' + compile 'org.jclouds:jclouds-all:1.6.0' + compile 'org.jclouds.driver:jclouds-jsch:1.6.0' + compile 'org.jclouds.driver:jclouds-slf4j:1.6.0' testCompile 'org.testng:testng:6.3.1' testCompile 'org.mockito:mockito-core:1.8.5' diff --git a/src/main/java/com/netflix/simianarmy/CloudClient.java b/src/main/java/com/netflix/simianarmy/CloudClient.java index 7243f42b..d4b5a295 100644 --- a/src/main/java/com/netflix/simianarmy/CloudClient.java +++ b/src/main/java/com/netflix/simianarmy/CloudClient.java @@ -20,6 +20,7 @@ import java.util.List; import java.util.Map; +import org.jclouds.compute.ComputeService; /** * The CloudClient interface. This abstractions provides the interface that the monkeys need to interact with @@ -89,4 +90,45 @@ public interface CloudClient { */ void createTagsForResources(Map keyValueMap, String... resourceIds); + /** + * Lists all EBS volumes attached to the specified instance. + * + * @param instanceId + * the instance id + * @param includeRoot + * if the root volume is on EBS, should we include it? + * @return + * the ids of the EBS volumes attached to the instance + * + * @throws NotFoundException + * if the instance no longer exists or was already terminated after the crawler discovered it then you + * should get a NotFoundException + */ + List<String> listAttachedVolumes(String instanceId, boolean includeRoot); + + /** + * Detaches an EBS volume from the specified instance. + * + * @param instanceId + * the instance id + * @param volumeId + * the volume id + * @param force + * if we should force-detach the volume. Probably best not to use on high-value volumes. 
+ * + * @throws NotFoundException + * if the instance no longer exists or was already terminated after the crawler discovered it then you + * should get a NotFoundException + */ + void detachVolume(String instanceId, String volumeId, boolean force); + + /** + * Returns the jClouds compute service. + */ + ComputeService getJcloudsComputeService(); + + /** + * Returns the jClouds node id for an instance id on this CloudClient. + */ + String getJcloudsId(String instanceId); } diff --git a/src/main/java/com/netflix/simianarmy/basic/chaos/BasicChaosMonkey.java b/src/main/java/com/netflix/simianarmy/basic/chaos/BasicChaosMonkey.java index 41985474..563c354b 100644 --- a/src/main/java/com/netflix/simianarmy/basic/chaos/BasicChaosMonkey.java +++ b/src/main/java/com/netflix/simianarmy/basic/chaos/BasicChaosMonkey.java @@ -40,9 +40,11 @@ import com.netflix.simianarmy.NotFoundException; import com.netflix.simianarmy.chaos.BlockAllNetworkTrafficChaosType; import com.netflix.simianarmy.chaos.ChaosCrawler.InstanceGroup; +import com.netflix.simianarmy.chaos.BurnCpuChaosType; import com.netflix.simianarmy.chaos.ChaosEmailNotifier; import com.netflix.simianarmy.chaos.ChaosMonkey; import com.netflix.simianarmy.chaos.ChaosType; +import com.netflix.simianarmy.chaos.DetachVolumesChaosType; import com.netflix.simianarmy.chaos.ShutdownInstanceChaosType; /** @@ -92,6 +94,8 @@ public BasicChaosMonkey(ChaosMonkey.Context ctx) { allChaosTypes = Lists.newArrayList(); allChaosTypes.add(new ShutdownInstanceChaosType(cfg)); allChaosTypes.add(new BlockAllNetworkTrafficChaosType(cfg)); + allChaosTypes.add(new DetachVolumesChaosType(cfg)); + allChaosTypes.add(new BurnCpuChaosType(cfg)); TimeUnit freqUnit = ctx.scheduler().frequencyUnit(); long units = freqUnit.convert(close.getTimeInMillis() - open.getTimeInMillis(), TimeUnit.MILLISECONDS); diff --git a/src/main/java/com/netflix/simianarmy/chaos/BurnCpuChaosType.java b/src/main/java/com/netflix/simianarmy/chaos/BurnCpuChaosType.java new file mode 
100644 index 00000000..244089e9 --- /dev/null +++ b/src/main/java/com/netflix/simianarmy/chaos/BurnCpuChaosType.java @@ -0,0 +1,21 @@ +package com.netflix.simianarmy.chaos; + +import com.netflix.simianarmy.MonkeyConfiguration; + +/** + * Executes a CPU intensive program on the node, using up all available CPU. + * + * This simulates either a noisy CPU neighbor on the box or just a general issue with the CPU. + */ +public class BurnCpuChaosType extends ScriptChaosType { + /** + * Constructor. + * + * @param config + * Configuration to use + * @throws IllegalStateException if the configured SSH key cannot be read + */ + public BurnCpuChaosType(MonkeyConfiguration config) { + super(config, "BurnCpu"); + } +} diff --git a/src/main/java/com/netflix/simianarmy/chaos/ChaosType.java b/src/main/java/com/netflix/simianarmy/chaos/ChaosType.java index 61bb66cc..56acde2c 100644 --- a/src/main/java/com/netflix/simianarmy/chaos/ChaosType.java +++ b/src/main/java/com/netflix/simianarmy/chaos/ChaosType.java @@ -16,7 +16,7 @@ public abstract class ChaosType { private static final Logger LOGGER = LoggerFactory.getLogger(ChaosType.class); /** - * Configuration for this chaos type + * Configuration for this chaos type. */ private final MonkeyConfiguration config; @@ -77,6 +77,13 @@ public String getKey() { * detach. */ public boolean canApply(CloudClient cloudClient, String instanceId) { + return isEnabled(); + } + + /** + * Returns whether we are enabled. 
+ */ + public boolean isEnabled() { return enabled; } @@ -94,6 +101,7 @@ public static ChaosType parse(List all, String chaosTypeName) { return chaosType; } } - throw new IllegalArgumentException("Unknown chaos type: " + chaosTypeName); + throw new IllegalArgumentException("Unknown chaos type value: " + + chaosTypeName); } } diff --git a/src/main/java/com/netflix/simianarmy/chaos/DetachVolumesChaosType.java b/src/main/java/com/netflix/simianarmy/chaos/DetachVolumesChaosType.java new file mode 100644 index 00000000..0e340b30 --- /dev/null +++ b/src/main/java/com/netflix/simianarmy/chaos/DetachVolumesChaosType.java @@ -0,0 +1,63 @@ +package com.netflix.simianarmy.chaos; + +import java.util.List; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import com.netflix.simianarmy.CloudClient; +import com.netflix.simianarmy.MonkeyConfiguration; + +/** + * We force-detach all the non-root EBS volumes. + * + * This is supposed to simulate a catastrophic failure of EBS, however the instance will (possibly) still keep running; + * e.g. it should continue to respond to pings. + */ +public class DetachVolumesChaosType extends ChaosType { + /** The Constant LOGGER. */ + private static final Logger LOGGER = LoggerFactory.getLogger(DetachVolumesChaosType.class); + + /** + * Constructor. + * + * @param config + * Configuration to use + */ + public DetachVolumesChaosType(MonkeyConfiguration config) { + super(config, "DetachVolumes"); + } + + /** + * Strategy can be applied iff it is enabled and non-root EBS volumes are attached. 
+ */ + @Override + public boolean canApply(CloudClient cloudClient, String instanceId) { + // The test mocks don't implement listAttachedVolumes + // so check for enabled first + if (!isEnabled()) { + return false; + } + + List<String> volumes = cloudClient.listAttachedVolumes(instanceId, false); + if (volumes.isEmpty()) { + LOGGER.debug("Can't apply strategy: no non-root EBS volumes"); + return false; + } + + return super.canApply(cloudClient, instanceId); + } + + /** + * Force-detaches all attached non-root EBS volumes from the instance. + */ + @Override + public void apply(CloudClient cloudClient, String instanceId) { + // IDEA: We could have a strategy where we detach some of the volumes... + boolean force = true; + for (String volumeId : cloudClient.listAttachedVolumes(instanceId, false)) { + cloudClient.detachVolume(instanceId, volumeId, force); + } + } + +} diff --git a/src/main/java/com/netflix/simianarmy/chaos/ScriptChaosType.java b/src/main/java/com/netflix/simianarmy/chaos/ScriptChaosType.java new file mode 100644 index 00000000..7921ab57 --- /dev/null +++ b/src/main/java/com/netflix/simianarmy/chaos/ScriptChaosType.java @@ -0,0 +1,146 @@ +package com.netflix.simianarmy.chaos; + +import java.io.File; +import java.io.IOException; +import java.net.URL; +import java.util.Set; + +import org.jclouds.compute.ComputeService; +import org.jclouds.compute.Utils; +import org.jclouds.compute.domain.ComputeMetadata; +import org.jclouds.compute.domain.ExecResponse; +import org.jclouds.compute.domain.NodeMetadata; +import org.jclouds.compute.domain.NodeMetadataBuilder; +import org.jclouds.domain.LoginCredentials; +import org.jclouds.ssh.SshClient; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import com.google.common.base.Charsets; +import com.google.common.base.Strings; +import com.google.common.collect.Iterables; +import com.google.common.collect.Sets; +import com.google.common.io.Files; +import com.google.common.io.Resources; +import 
com.netflix.simianarmy.CloudClient; +import com.netflix.simianarmy.MonkeyConfiguration; + +/** + * Base class for chaos types that run a script over JClouds/SSH on the node. + */ +public abstract class ScriptChaosType extends ChaosType { + /** The Constant LOGGER. */ + private static final Logger LOGGER = LoggerFactory.getLogger(ScriptChaosType.class); + + /** + * The SSH credentials to log on to an instance. + */ + private final LoginCredentials sshCredentials; + + /** + * Constructor. + * + * @param config Configuration to use + * @param key Key of this chaos type, passed on to the ChaosType constructor + * @throws IllegalStateException if the configured SSH key file cannot be read + */ + public ScriptChaosType(MonkeyConfiguration config, String key) { + super(config, key); + + String sshUser = config.getStrOrElse("simianarmy.chaos.ssh.user", "root"); + String privateKey = null; + + String sshKeyPath = config.getStrOrElse("simianarmy.chaos.ssh.key", null); + if (sshKeyPath != null) { + sshKeyPath = sshKeyPath.trim(); + if (sshKeyPath.startsWith("~/")) { + String home = System.getProperty("user.home"); + if (!Strings.isNullOrEmpty(home)) { + if (!home.endsWith("/")) { + home += "/"; + } + sshKeyPath = home + sshKeyPath.substring(2); + } + } + try { + privateKey = Files.toString(new File(sshKeyPath), Charsets.UTF_8); + } catch (IOException e) { + throw new IllegalStateException("Unable to read the specified SSH key: " + sshKeyPath, e); + } + } + + if (privateKey == null) { + this.sshCredentials = null; + } else { + this.sshCredentials = LoginCredentials.builder().user(sshUser).privateKey(privateKey).build(); + } + } + + /** + * We can apply the strategy iff SSH credentials are configured; connectivity is not verified here. + */ + @Override + public boolean canApply(CloudClient cloudClient, String instanceId) { + // TODO: Check that SSH connection works here? + + if (this.sshCredentials == null) { + LOGGER.info("Strategy disabled because SSH credentials not set"); + return false; + } + + return super.canApply(cloudClient, instanceId); + } + + /** + * Runs the script. 
+ */ + @Override + public void apply(CloudClient cloudClient, String instanceId) { + ComputeService computeService = cloudClient.getJcloudsComputeService(); + String jcloudsId = cloudClient.getJcloudsId(instanceId); + + // Work around a jclouds bug / documentation issue... + // Set nodes = computeService.listNodesByIds(Collections.singletonList(jcloudsId)); + Set<NodeMetadata> nodes = Sets.newHashSet(); + for (ComputeMetadata n : computeService.listNodes()) { + if (jcloudsId.equals(n.getId())) { + nodes.add((NodeMetadata) n); + } + } + + if (nodes.isEmpty()) { + LOGGER.warn("Unable to find jclouds node: {}", jcloudsId); + for (ComputeMetadata n : computeService.listNodes()) { + LOGGER.info("Did find node: {}", n); + } + throw new IllegalStateException("Unable to find node using jclouds: " + jcloudsId); + } + NodeMetadata node = Iterables.getOnlyElement(nodes); + node = NodeMetadataBuilder.fromNodeMetadata(node).credentials(sshCredentials).build(); + + String filename = getKey().toLowerCase() + ".sh"; + // Resolve against the class: the leading '/' is not accepted by the ClassLoader-based lookup. + URL url = Resources.getResource(ScriptChaosType.class, "/scripts/" + filename); + String script; + try { + script = Resources.toString(url, Charsets.UTF_8); + } catch (IOException e) { + throw new IllegalStateException("Error reading script resource", e); + } + + LOGGER.info("Running script for {} on instance {}", getKey(), instanceId); + Utils utils = computeService.getContext().getUtils(); + SshClient ssh = utils.sshForNode().apply(node); + ssh.connect(); + try { + ssh.put("/tmp/" + filename, script); + ExecResponse response = ssh.exec("/bin/bash /tmp/" + filename); + if (response.getExitStatus() != 0) { + LOGGER.warn("Got non-zero exit status from running script: {}", response); + } + } finally { + // Always release the SSH session, even if the upload or the script run throws. + ssh.disconnect(); + } + } +} diff --git a/src/main/java/com/netflix/simianarmy/client/aws/AWSClient.java b/src/main/java/com/netflix/simianarmy/client/aws/AWSClient.java index 966e7e3e..0021cd7d 100644 --- a/src/main/java/com/netflix/simianarmy/client/aws/AWSClient.java +++ 
b/src/main/java/com/netflix/simianarmy/client/aws/AWSClient.java @@ -49,8 +49,11 @@ import com.amazonaws.services.ec2.model.DescribeSnapshotsResult; import com.amazonaws.services.ec2.model.DescribeVolumesRequest; import com.amazonaws.services.ec2.model.DescribeVolumesResult; +import com.amazonaws.services.ec2.model.DetachVolumeRequest; +import com.amazonaws.services.ec2.model.EbsInstanceBlockDevice; import com.amazonaws.services.ec2.model.Image; import com.amazonaws.services.ec2.model.Instance; +import com.amazonaws.services.ec2.model.InstanceBlockDeviceMapping; import com.amazonaws.services.ec2.model.ModifyInstanceAttributeRequest; import com.amazonaws.services.ec2.model.Reservation; import com.amazonaws.services.ec2.model.SecurityGroup; @@ -64,10 +67,18 @@ import com.amazonaws.services.elasticloadbalancing.model.LoadBalancerDescription; import com.amazonaws.services.simpledb.AmazonSimpleDB; import com.amazonaws.services.simpledb.AmazonSimpleDBClient; +import com.google.common.base.Strings; +import com.google.common.collect.ImmutableSet; +import com.google.inject.Module; import com.netflix.simianarmy.CloudClient; import com.netflix.simianarmy.NotFoundException; import org.apache.commons.lang.Validate; +import org.jclouds.ContextBuilder; +import org.jclouds.compute.ComputeService; +import org.jclouds.compute.ComputeServiceContext; +import org.jclouds.logging.slf4j.config.SLF4JLoggingModule; +import org.jclouds.ssh.jsch.config.JschSshClientModule; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -92,6 +103,8 @@ public class AWSClient implements CloudClient { private final AWSCredentialsProvider awsCredentialsProvider; + private ComputeService jcloudsComputeService; + /** * This constructor will let the AWS SDK obtain the credentials, which will * choose such in the following order: @@ -567,12 +580,64 @@ public List describeImages(String... 
imageIds) { LOGGER.info(String.format("Got %d AMIs in region %s.", images.size(), region)); return images; } - - + + @Override + public void detachVolume(String instanceId, String volumeId, boolean force) { + Validate.notEmpty(instanceId); + LOGGER.info(String.format("Detaching volume %s from instance %s in region %s.", volumeId, instanceId, region)); + try { + DetachVolumeRequest detachVolumeRequest = new DetachVolumeRequest(); + detachVolumeRequest.setForce(force); + detachVolumeRequest.setInstanceId(instanceId); + detachVolumeRequest.setVolumeId(volumeId); + ec2Client().detachVolume(detachVolumeRequest); + } catch (AmazonServiceException e) { + if ("InvalidInstanceID.NotFound".equals(e.getErrorCode())) { + throw new NotFoundException("AWS instance " + instanceId + " not found", e); + } + throw e; + } + } + + @Override + public List<String> listAttachedVolumes(String instanceId, boolean includeRoot) { + Validate.notEmpty(instanceId); + LOGGER.info(String.format("Listing volumes attached to instance %s in region %s.", instanceId, region)); + try { + List<String> volumeIds = new ArrayList<String>(); + for (Instance instance : describeInstances(instanceId)) { + String rootDeviceName = instance.getRootDeviceName(); + + for (InstanceBlockDeviceMapping ibdm : instance.getBlockDeviceMappings()) { + EbsInstanceBlockDevice ebs = ibdm.getEbs(); + if (ebs == null) { + continue; + } + + String volumeId = ebs.getVolumeId(); + if (Strings.isNullOrEmpty(volumeId)) { + continue; + } + + if (!includeRoot && rootDeviceName != null && rootDeviceName.equals(ibdm.getDeviceName())) { + continue; + } + + volumeIds.add(volumeId); + } + } + return volumeIds; + } catch (AmazonServiceException e) { + if ("InvalidInstanceID.NotFound".equals(e.getErrorCode())) { + throw new NotFoundException("AWS instance " + instanceId + " not found", e); + } + throw e; + } + } /** - * Describe a set of security groups - * + * Describe a set of security groups. 
+ * * @param groupNames the names of the groups to find * @return a list of matching groups */ @@ -606,7 +671,7 @@ public List describeSecurityGroups(String... groupNames) { /** * Create an (empty) EC2 security group. - * + * * @param name * Name of group to create * @param description @@ -642,4 +707,27 @@ public Instance describeInstance(String instanceId) { } return instance; } + + /** {@inheritDoc} */ + @Override + public synchronized ComputeService getJcloudsComputeService() { + if (jcloudsComputeService == null) { + // Read the credentials once so the key id and secret come from the same snapshot. + com.amazonaws.auth.AWSCredentials awsCredentials = awsCredentialsProvider.getCredentials(); + ComputeServiceContext jcloudsContext = ContextBuilder.newBuilder("ec2") + .credentials(awsCredentials.getAWSAccessKeyId(), awsCredentials.getAWSSecretKey()) + .modules(ImmutableSet.<Module> of(new SLF4JLoggingModule(), new JschSshClientModule())) + .buildView(ComputeServiceContext.class); + + this.jcloudsComputeService = jcloudsContext.getComputeService(); + } + + return jcloudsComputeService; + } + + /** {@inheritDoc} */ + @Override + public String getJcloudsId(String instanceId) { + return this.region + "/" + instanceId; + } } diff --git a/src/main/resources/chaos.properties b/src/main/resources/chaos.properties index bf3ffa27..6dff2dae 100644 --- a/src/main/resources/chaos.properties +++ b/src/main/resources/chaos.properties @@ -20,6 +20,7 @@ simianarmy.chaos.ASG.maxTerminationsPerDay = 1.0 # Strategies simianarmy.chaos.shutdowninstance.enabled = true simianarmy.chaos.blockallnetworktraffic.enabled = true +simianarmy.chaos.detachvolumes.enabled = true # enable a specific ASG # simianarmy.chaos.ASG..enabled = true diff --git a/src/main/resources/scripts/burncpu.sh b/src/main/resources/scripts/burncpu.sh new file mode 100644 index 00000000..7253d0d1 --- /dev/null +++ b/src/main/resources/scripts/burncpu.sh @@ -0,0 +1,14 @@ +#!/bin/bash + +cat << EOF > /tmp/infiniteburn.sh +#!/bin/bash +while true; + do openssl speed; +done +EOF + +# 32 parallel 100% 
CPU tasks should hit even the biggest EC2 instances +for i in {1..32} +do + nohup /bin/bash /tmp/infiniteburn.sh & +done \ No newline at end of file diff --git a/src/test/java/com/netflix/simianarmy/TestMonkeyContext.java b/src/test/java/com/netflix/simianarmy/TestMonkeyContext.java index b4bbd59d..d86861bd 100644 --- a/src/test/java/com/netflix/simianarmy/TestMonkeyContext.java +++ b/src/test/java/com/netflix/simianarmy/TestMonkeyContext.java @@ -26,6 +26,7 @@ import java.util.Properties; import java.util.concurrent.TimeUnit; +import org.jclouds.compute.ComputeService; import org.testng.Assert; import com.netflix.simianarmy.MonkeyRecorder.Event; @@ -132,6 +133,27 @@ public void deleteImage(String imageId) { @Override public void deleteLaunchConfiguration(String launchConfigName) { } + + @Override + public List listAttachedVolumes(String instanceId, boolean includeRoot) { + throw new UnsupportedOperationException(); + } + + @Override + public void detachVolume(String instanceId, String volumeId, + boolean force) { + throw new UnsupportedOperationException(); + } + + @Override + public ComputeService getJcloudsComputeService() { + throw new UnsupportedOperationException(); + } + + @Override + public String getJcloudsId(String instanceId) { + throw new UnsupportedOperationException(); + } }; } diff --git a/src/test/java/com/netflix/simianarmy/chaos/TestChaosMonkeyContext.java b/src/test/java/com/netflix/simianarmy/chaos/TestChaosMonkeyContext.java index e38ccf8a..4a4e13e0 100644 --- a/src/test/java/com/netflix/simianarmy/chaos/TestChaosMonkeyContext.java +++ b/src/test/java/com/netflix/simianarmy/chaos/TestChaosMonkeyContext.java @@ -30,6 +30,7 @@ import java.util.Map; import java.util.Properties; +import org.jclouds.compute.ComputeService; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -223,6 +224,27 @@ public void deleteImage(String imageId) { @Override public void deleteLaunchConfiguration(String launchConfigName) { } + + @Override + public List 
listAttachedVolumes(String instanceId, boolean includeRoot) { + throw new UnsupportedOperationException(); + } + + @Override + public void detachVolume(String instanceId, String volumeId, + boolean force) { + throw new UnsupportedOperationException(); + } + + @Override + public ComputeService getJcloudsComputeService() { + throw new UnsupportedOperationException(); + } + + @Override + public String getJcloudsId(String instanceId) { + throw new UnsupportedOperationException(); + } }; }