- gcloud (with GCP account)
- kubectl
Log into GCP (my demo uses GKE; you might prefer another cloud).

```bash
gcloud auth login
```
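If you work across multiple GCP projects, it can help to pin gcloud to the one you'll create the cluster in first (the project ID below is a placeholder, not from the original steps):

```bash
# Hypothetical project ID; replace with your own.
gcloud config set project my-demo-project
```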
### Create the GKE cluster

```bash
# Set cluster configurations.
export CLUSTER_REGION="europe-west2"
export CLUSTER_ZONES="europe-west2-a,europe-west2-b,europe-west2-c"
export CLUSTER_NAME="crdb-resilience-testing"
# Get the second-from-latest version (to allow for cluster-upgrade testing later).
GKE_VERSION=$(gcloud container get-server-config --zone=europe-west2-a --format="value(validMasterVersions)" | tr ';' '\n' | sed -n '2p')
# Install GKE across three AZs, with one node per AZ (--num-nodes is per zone in a regional cluster).
gcloud container clusters create ${CLUSTER_NAME} \
  --cluster-version ${GKE_VERSION} \
  --region ${CLUSTER_REGION} \
  --node-locations ${CLUSTER_ZONES} \
  --num-nodes 1 \
  --machine-type n2-standard-8
# Use the newly created GKE cluster with kubectl.
gcloud container clusters get-credentials ${CLUSTER_NAME} \
  --region ${CLUSTER_REGION}
```
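Before installing anything, you can sanity-check that the nodes really landed across all three zones; `topology.kubernetes.io/zone` is the standard well-known label GKE applies to nodes (this check is a convenience, not part of the original flow):

```bash
# Show each node with its zone label; expect three nodes, one per zone.
kubectl get nodes -L topology.kubernetes.io/zone
```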
### Install CockroachDB (older version to allow for updates)

```bash
kubectl apply -f cockroachdb/manifests/v25.2.0.yaml --wait
kubectl wait --for=jsonpath='{.status.phase}'=Running pods --all -n crdb --timeout=300s
kubectl exec -it -n crdb cockroachdb-0 -- /cockroach/cockroach init --insecure
```
### Confirm that the nodes are spread across the available Kubernetes nodes

```bash
kubectl get pods -n crdb -o custom-columns=NAME:.metadata.name,HOST:.spec.nodeName
```
### Fetch the public IP of the external load balancer (wait until the external IP is available)

```bash
while true; do
  export CRDB_IP=$(kubectl get service cockroachdb-public -n crdb -o jsonpath='{.status.loadBalancer.ingress[0].ip}' 2>/dev/null)
  if [ -n "$CRDB_IP" ]; then
    echo "Got CRDB_IP: $CRDB_IP"
    break
  fi
  echo "Waiting for LoadBalancer IP..."
  sleep 5
done
```
### Test connections

```bash
cockroach sql --url "postgres://root@${CRDB_IP}:26257?sslmode=disable"
open "http://${CRDB_IP}:8080"Apply enterprise license
### Apply enterprise license

```bash
enterprise --url "postgres://root@${CRDB_IP}:26257?sslmode=disable"
```
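The `enterprise` helper isn't reproduced in this post. If you'd rather apply the license by hand, CockroachDB takes it through two cluster settings; the organization name and key below are placeholders:

```bash
# Placeholders: substitute your registered organization and license key.
cockroach sql --url "postgres://root@${CRDB_IP}:26257?sslmode=disable" -e "
  SET CLUSTER SETTING cluster.organization = 'Example Org';
  SET CLUSTER SETTING enterprise.license = 'crl-0-REPLACE_ME';
"
```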
### Create objects and apply resilience settings

```bash
cockroach sql --url "postgres://root@${CRDB_IP}:26257?sslmode=disable" -f cockroachdb/data/create.sql
cockroach sql --url "postgres://root@${CRDB_IP}:26257?sslmode=disable" -f cockroachdb/data/resilience.sql
```
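`resilience.sql` isn't shown here; as a rough idea of the kind of settings such a file tends to contain (values are illustrative, not the file's actual contents):

```bash
# Illustrative only -- not the contents of cockroachdb/data/resilience.sql.
cockroach sql --url "postgres://root@${CRDB_IP}:26257?sslmode=disable" -e "
  -- Keep five copies of every range so two simultaneous failures are survivable.
  ALTER RANGE default CONFIGURE ZONE USING num_replicas = 5;
  -- Declare a dead store sooner than the 5m default, so replicas are rebuilt faster.
  SET CLUSTER SETTING server.time_until_store_dead = '1m15s';
"
```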
### Install Chaos Mesh

```bash
curl -sSL https://mirrors.chaos-mesh.org/v2.7.0/install.sh | bash -s -- -r containerd
```
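It's worth confirming the Chaos Mesh components are up before wiring in security. The installer's default namespace is typically `chaos-mesh`, but check the script's output if yours differs:

```bash
# The controller manager, daemons, and dashboard should all reach Running.
kubectl get pods -n chaos-mesh
```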
### Security

```bash
# * * * UPDATE _rbac.yaml WITH YOUR ACCOUNT VALUES * * *
kubectl apply -f chaos_mesh/manifests/_rbac.yaml
# Run once per Google account.
# * * * UPDATE create_service_account.sh WITH YOUR ACCOUNT VALUES * * *
sh chaos_mesh/create_service_account.sh
# Base64-encode the service account key and inject it into the secret manifest.
SECRET=$(cat k8s-admin-sa-key.json | base64 | tr -d '\n') \
yq '.stringData.service_account = env(SECRET)' chaos_mesh/manifests/_secret.yaml \
  > chaos_mesh/manifests/modified/_secret.yaml
kubectl apply -f chaos_mesh/manifests/modified/_secret.yaml
```
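With RBAC and the secret in place, you can reach the Chaos Mesh dashboard locally via a port-forward (namespace as noted above; adjust if your install differs):

```bash
# Forward the dashboard to http://localhost:2333.
kubectl port-forward -n chaos-mesh svc/chaos-dashboard 2333:2333
```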
### Start workload

```bash
go run workload/main.go --url "postgres://root@${CRDB_IP}:26257?sslmode=disable"
```
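If you want the terminal back for the chaos runs, the same workload can be started in the background with its output captured; a small convenience, not part of the original steps:

```bash
# Run the workload in the background, logging to workload.log.
nohup go run workload/main.go --url "postgres://root@${CRDB_IP}:26257?sslmode=disable" > workload.log 2>&1 &
```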
### Start polling containers (if you want to see pods going up and down)

```bash
watch kubectl get pods -n crdb -o custom-columns="NAME:.metadata.name,READY:.status.containerStatuses[0].ready,STATUS:.status.phase,RESTARTS:.status.containerStatuses[0].restartCount,IMAGE:.spec.containers[0].image"
```
### Run chaos experiments

```bash
sh ./chaos_mesh/run.sh
```
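While `run.sh` is going, the experiments it creates are ordinary Chaos Mesh resources, so you can watch them come and go with kubectl (the exact kinds depend on what the script applies; `PodChaos` is the common one here):

```bash
# List pod-failure experiments across all namespaces as run.sh applies and removes them.
kubectl get podchaos -A
```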
### Perform rolling downgrade

```bash
kubectl apply -f cockroachdb/manifests/v25.2.0.yaml
```
### Teardown

#### Chaos Mesh

```bash
curl -sSL https://mirrors.chaos-mesh.org/v2.7.2/install.sh | bash -s -- --template | kubectl delete -f -
```
#### CockroachDB

```bash
kubectl delete -n crdb -f cockroachdb/manifests/v25.2.0.yaml --wait
# Or the latest version.
kubectl delete -n crdb -f cockroachdb/manifests/v25.2.1.yaml --wait
```
#### GKE cluster

```bash
gcloud container clusters delete ${CLUSTER_NAME} \
  --region ${CLUSTER_REGION} \
  --quiet
```
#### Force delete PVC

```bash
# PV names below are from one particular cluster; look up your own with `kubectl get pv`.
kubectl patch pvc datadir-cockroachdb-0 -n crdb -p '{"metadata":{"finalizers":null}}'
kubectl delete pvc datadir-cockroachdb-0 -n crdb --grace-period=0 --force
kubectl patch pv pvc-30e5f0f8-0ab2-4c4d-ae44-66b0f48d0598 -p '{"metadata":{"finalizers":null}}'
kubectl delete pv pvc-30e5f0f8-0ab2-4c4d-ae44-66b0f48d0598 --grace-period=0 --force
kubectl patch pvc failover-cockroachdb-0 -n crdb -p '{"metadata":{"finalizers":null}}'
kubectl delete pvc failover-cockroachdb-0 -n crdb --grace-period=0 --force
kubectl patch pv pvc-455f0abd-f884-4b4f-8f71-56074a795237 -p '{"metadata":{"finalizers":null}}'
kubectl delete pv pvc-455f0abd-f884-4b4f-8f71-56074a795237 --grace-period=0 --force
```
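Rather than hard-coding PV names, the bound volume name can be read off each PVC first; a small sketch assuming the PVCs live in the `crdb` namespace:

```bash
# Print each PVC in crdb together with the PV it is bound to.
for pvc in $(kubectl get pvc -n crdb -o jsonpath='{.items[*].metadata.name}'); do
  pv=$(kubectl get pvc "$pvc" -n crdb -o jsonpath='{.spec.volumeName}')
  echo "$pvc -> $pv"
done
```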
### Run a one-off experiment

```bash
pods=($(kubectl get pods -n crdb --field-selector=status.phase=Running -o custom-columns=NAME:.metadata.name --no-headers | sort))
for pod in "${pods[@]}"; do
  yq ".metadata.name = \"crdb\" | .spec.selector.namespaces = [\"crdb\"] | .spec.selector.labelSelectors[\"statefulset.kubernetes.io/pod-name\"] = \"${pod}\"" chaos_mesh/manifests/pod_failure.yaml > chaos_mesh/manifests/modified/crdb.yaml
  kubectl apply -f chaos_mesh/manifests/modified/crdb.yaml
  echo "Running pod failure against ${pod} for 20 second(s)"
  sleep 20
  kubectl delete -f chaos_mesh/manifests/modified/crdb.yaml
  echo "Recovering for 30 second(s)"
  sleep 30
done
```

Results:

| Run Number | Total errors | Total downtime |
|---|---|---|
| 1 | 59 | 43.67s |
| 2 | 17 | 23.85s |
| 3 | 17 | 42.92s |
| 4 | 14 | 25.09s |
| 5 | 13 | 20.97s |
| 6 | 10 | 25.85s |
| 7 | 5 | 20.04s |
| 8 | 11 | 15.37s |
| 9 | 7 | 25.12s |
| 10 | 3 | 15.01s |
| 11 | 6 | 15.17s |
| avg | 14.72 | 24.82s |
Total outages = 18, average outage = 1.3788s; 86 outages = 118.58s.