From 05ab05a84a774048274f2163c8298f925601e6d6 Mon Sep 17 00:00:00 2001 From: John Bowdre Date: Wed, 19 Apr 2023 13:39:27 -0500 Subject: [PATCH] improve timing/error handling in k8s bootstrap --- terraform/scripts/initialize-controlplane.sh | 25 +++++++++++++------- terraform/scripts/join-workers.sh | 24 ++++++++++++------- 2 files changed, 33 insertions(+), 16 deletions(-) diff --git a/terraform/scripts/initialize-controlplane.sh b/terraform/scripts/initialize-controlplane.sh index 8dbeee8..1f6a248 100644 --- a/terraform/scripts/initialize-controlplane.sh +++ b/terraform/scripts/initialize-controlplane.sh @@ -53,7 +53,7 @@ EOF sudo chown "$(id -u):$(id -g)" "${HOME}"/.kube/config echo ">> Applying Calico networking..." - kubectl apply -f https://docs.projectcalico.org/manifests/calico.yaml + kubectl apply -f https://raw.githubusercontent.com/projectcalico/calico/master/manifests/calico.yaml echo ">> Creating discovery config..." kubectl -n kube-public get configmap cluster-info -o jsonpath='{.data.kubeconfig}' > discovery.yaml @@ -64,7 +64,7 @@ EOF fi fi echo ">> Waiting up to 10 minutes for all control-plane nodes to be Ready..." - python3 -m http.server & + python3 -m http.server &>/dev/null & PROC_ID=$! attempts_max=60 attempt=0 @@ -383,17 +383,25 @@ else sleep 10 done echo ">> Continuing after $((attempt*10)) seconds." - echo ">> Joining cluster..." + echo ">> Retrieving cluster discovery config..." attempts_max=6 attempt=0 - until [ -f /etc/kubernetes/discovery.yaml ]; do - wget "http://${K8S_CONTROLPLANE_VIP}:8000/discovery.yaml" 2>/dev/null - sudo install -o root -g root -m 600 discovery.yaml /etc/kubernetes/discovery.yaml 2>/dev/null - if [ ! -f /etc/kubernetes/discovery.yaml ]; then + until [ -f ~/discovery.yaml ] || [ ${attempt} -eq ${attempts_max} ]; do + wget "http://${K8S_CONTROLPLANE_VIP}:8000/discovery.yaml" + sleep 2 + if ! [ -f ~/discovery.yaml ]; then + echo ">> Unable to retrieve config..." attempt=$((attempt+1)) - sleep 10 + sleep 8 fi done + if ! [ -f ~/discovery.yaml ]; then + echo ">> Timeout reached while retrieving config!" + echo "Exiting." + exit 1 + fi + sudo install -o root -g root -m 600 discovery.yaml /etc/kubernetes/discovery.yaml + echo ">> Successfully discovered cluster!" cat << EOF > kubeadmjoin.yaml apiVersion: kubeadm.k8s.io/v1beta3 caCertPath: /etc/kubernetes/pki/ca.crt @@ -409,6 +417,7 @@ nodeRegistration: controlPlane: certificateKey: ${KUBEADM_CERTKEY} EOF + echo ">> Joining cluster..." if sudo kubeadm join "${K8S_CONTROLPLANE_VIP}":6443 --config kubeadmjoin.yaml; then echo ">> Node ${HOSTNAME} successfully initialized!" touch .k8s-node-success diff --git a/terraform/scripts/join-workers.sh b/terraform/scripts/join-workers.sh index 8e10b7e..8dd32ec 100644 --- a/terraform/scripts/join-workers.sh +++ b/terraform/scripts/join-workers.sh @@ -17,7 +17,7 @@ echo ">> Continuing after $((attempt*10)) seconds." echo ">> Waiting up to 10 minutes for all control-plane nodes..." attempts_max=60 attempt=0 -until "$(wget http://${K8S_CONTROLPLANE_VIP}:8000/.k8s-controlplane-success)" 2>/dev/null; do +until curl --fail "http://${K8S_CONTROLPLANE_VIP}:8000/.k8s-controlplane-success" 2>/dev/null; do if [ ${attempt} -eq ${attempts_max} ]; then echo ">> [ERROR] Timeout waiting for control-plane nodes! <<" exit 1 @@ -26,18 +26,26 @@ until "$(wget http://${K8S_CONTROLPLANE_VIP}:8000/.k8s-controlplane-success)" 2> sleep 10 done echo ">> Continuing after $((attempt*10)) seconds." -echo ">> Joining cluster..." +echo ">> Retrieving cluster discovery config..." attempts_max=6 attempt=0 -until [ -f /etc/kubernetes/discovery.yaml ]; do - wget "http://${K8S_CONTROLPLANE_VIP}:8000/discovery.yaml" 2>/dev/null - sudo install -o root -g root -m 600 discovery.yaml /etc/kubernetes/discovery.yaml 2>/dev/null - if [ ! -f /etc/kubernetes/discovery.yaml ]; then +until [ -f ~/discovery.yaml ] || [ ${attempt} -eq ${attempts_max} ]; do + wget "http://${K8S_CONTROLPLANE_VIP}:8000/discovery.yaml" + sleep 2 + if ! [ -f ~/discovery.yaml ]; then + echo ">> Unable to retrieve config..." attempt=$((attempt+1)) - sleep 10 + sleep 8 fi done - +if ! [ -f ~/discovery.yaml ]; then + echo ">> Timeout reached while retrieving config!" + echo "Exiting." + exit 1 +fi +sudo install -o root -g root -m 600 discovery.yaml /etc/kubernetes/discovery.yaml +echo ">> Successfully discovered cluster!" +echo ">> Discovered cluster!" cat << EOF > kubeadmjoin.yaml apiVersion: kubeadm.k8s.io/v1beta3 caCertPath: /etc/kubernetes/pki/ca.crt