From cdc7e3377b28f4ba6d367c3a8188cca2ab5f18a6 Mon Sep 17 00:00:00 2001
From: "Joshua M. Boniface"
Date: Fri, 1 Sep 2023 15:42:24 -0400
Subject: [PATCH] Tweak oneshot script

Cleanly stop daemons; check if OSDs are back before continuing; wait less
---
 oneshot/update-pvc-cluster.yml | 70 ++++++++++++++++++++++++++++++----
 1 file changed, 63 insertions(+), 7 deletions(-)

diff --git a/oneshot/update-pvc-cluster.yml b/oneshot/update-pvc-cluster.yml
index a2f9fde..79f6a56 100644
--- a/oneshot/update-pvc-cluster.yml
+++ b/oneshot/update-pvc-cluster.yml
@@ -55,7 +55,6 @@
       until: virshcount.stdout == "0"
       retries: 60
       delay: 10
-      become: yes
 
     - name: wait 15 seconds for system to stabilize
       pause:
@@ -66,30 +65,87 @@
     - name: set OSD noout
       command: pvc storage osd set noout
 
+    # List the ceph-osd@N units shown by systemctl so each can be stopped
+    # below; errors are ignored so an empty grep result does not end the play.
+    - name: get running OSD services
+      shell: "systemctl | awk '{ print $1 }' | grep 'ceph-osd@[0-9]*.service'"
+      ignore_errors: true
+      register: osd_services
+
+    - name: stop Ceph OSD daemons cleanly
+      service:
+        name: "{{ item }}"
+        state: stopped
+      ignore_errors: true
+      with_items: "{{ osd_services.stdout_lines }}"
+
+    - name: stop Ceph Monitor daemon cleanly
+      service:
+        name: "ceph-mon@{{ ansible_hostname }}"
+        state: stopped
+      ignore_errors: true
+
+    - name: stop Ceph Manager daemon cleanly
+      service:
+        name: "ceph-mgr@{{ ansible_hostname }}"
+        state: stopped
+      ignore_errors: true
+
+    - name: wait 15 seconds for system to stabilize
+      pause:
+        seconds: "15"
+      become: no
+      connection: local
+
     - name: stop PVC flush daemon cleanly
       service:
-        name: pvc-flush
+        name: "pvc-flush"
         state: stopped
 
     - name: stop PVC daemon cleanly
       service:
-        name: pvcnoded
+        name: "pvcnoded"
+        state: stopped
+
+    - name: stop Zookeeper daemon cleanly
+      service:
+        name: "zookeeper"
         state: stopped
 
     - name: restart system
       reboot:
         post_reboot_delay: 15
         reboot_timeout: 1800
-    
+
+    # Poll `ceph osd stat` until fields 1 and 3 of its "osds:" line are equal
+    # (presumably total vs. up OSDs - confirm); up to 60 tries, 10s apart.
+    - name: make sure all OSDs are active
+      shell: "ceph osd stat | grep 'osds:' | awk '{ if ( $1 == $3 ) { print \"OK\" } else { print \"NOK\" } }'"
+      register: osdstat
+      failed_when: osdstat.stdout == "NOK"
+      until: osdstat.stdout == "OK"
+      retries: 60
+      delay: 10
+
+    # Poll `ceph health` until it stops reporting "Degraded data redundancy";
+    # grep -o prints only the matched phrase, so empty output means it is gone.
+    - name: make sure all PGs have recovered
+      shell: "ceph health | grep -wo 'Degraded data redundancy'"
+      register: cephhealth
+      failed_when: cephhealth.stdout == "Degraded data redundancy"
+      until: cephhealth.stdout == ""
+      retries: 60
+      delay: 10
+
     - name: unset OSD noout
       command: pvc storage osd unset noout
 
     - name: unflush node
-      command: 'pvc node ready {{ ansible_hostname }} --wait'
+      command: "pvc node ready {{ ansible_hostname }} --wait"
 
-    - name: wait 5 minutes for system to stabilize
+    - name: wait 30 seconds for system to stabilize
       pause:
-        seconds: "300"
+        seconds: "30"
       become: no
       connection: local
 