Tweak oneshot script

Cleanly stop daemons; check that OSDs are back before continuing; shorten
the final stabilization wait from 5 minutes to 30 seconds
This commit is contained in:
Joshua Boniface 2023-09-01 15:42:24 -04:00
parent 458e7b4872
commit cdc7e3377b
1 changed file with 57 additions and 7 deletions

View File

@ -55,7 +55,6 @@
until: virshcount.stdout == "0" until: virshcount.stdout == "0"
retries: 60 retries: 60
delay: 10 delay: 10
become: yes
- name: wait 15 seconds for system to stabilize - name: wait 15 seconds for system to stabilize
pause: pause:
@ -66,14 +65,49 @@
- name: set OSD noout - name: set OSD noout
command: pvc storage osd set noout command: pvc storage osd set noout
# Collect the names of the ceph-osd@<id>.service units that systemctl lists
# and register them as osd_services for use by later tasks.
- name: get running OSD services
  # A folded block scalar keeps the awk/grep quoting readable and lets the dot
  # in ".service" be escaped so it matches only a literal dot.
  shell: >-
    systemctl | awk '{ print $1 }' | grep 'ceph-osd@[0-9]*\.service'
  register: osd_services
  # Read-only query: never report this task as "changed".
  changed_when: false
  # grep exits non-zero when no matching unit is listed; keep the play going
  # in that case (stdout_lines is then simply empty).
  ignore_errors: true
# Stop each unit named in osd_services.stdout_lines, one service call per item.
- name: stop Ceph OSD daemons cleanly
  with_items: "{{ osd_services.stdout_lines }}"
  # Best effort: a unit that fails to stop does not abort the play.
  ignore_errors: true
  service:
    state: stopped
    name: "{{ item }}"
# Stop the ceph-mon instance unit named after this host; errors are ignored.
- name: stop Ceph Monitor daemon cleanly
  ignore_errors: true
  service:
    state: stopped
    name: "ceph-mon@{{ ansible_hostname }}"
# Stop the ceph-mgr instance unit named after this host; errors are ignored.
- name: stop Ceph Manager daemon cleanly
  ignore_errors: true
  service:
    state: stopped
    name: "ceph-mgr@{{ ansible_hostname }}"
# Pause the play for 15 seconds before continuing with the next task.
- name: wait 15 seconds for system to stabilize
  pause:
    seconds: "15"
  # Use the local connection without privilege escalation for the pause.
  # Canonical YAML boolean rather than the 1.1-only "no" spelling.
  become: false
  connection: local
- name: stop PVC flush daemon cleanly - name: stop PVC flush daemon cleanly
service: service:
name: pvc-flush name: "pvc-flush"
state: stopped state: stopped
- name: stop PVC daemon cleanly - name: stop PVC daemon cleanly
service: service:
name: pvcnoded name: "pvcnoded"
state: stopped
- name: stop Zookeeper daemon cleanly
service:
name: "zookeeper"
state: stopped state: stopped
- name: restart system - name: restart system
@ -81,15 +115,31 @@
post_reboot_delay: 15 post_reboot_delay: 15
reboot_timeout: 1800 reboot_timeout: 1800
# Poll `ceph osd stat` until awk prints OK, i.e. the first and third fields of
# its "osds:" line are equal; up to 60 attempts, 10 seconds apart.
- name: make sure all OSDs are active
  # Folded scalar: the two lines join with a single space into one command,
  # so the awk program's double quotes need no escaping.
  shell: >-
    ceph osd stat | grep 'osds:' |
    awk '{ if ( $1 == $3 ) { print "OK" } else { print "NOK" } }'
  register: osdstat
  retries: 60
  delay: 10
  until: osdstat.stdout == "OK"
  failed_when: osdstat.stdout == "NOK"
# Poll `ceph health` until it no longer contains the phrase
# "Degraded data redundancy"; up to 60 attempts, 10 seconds apart.
- name: make sure all PGs have recovered
  shell: "ceph health | grep -wo 'Degraded data redundancy'"
  register: cephhealth
  # grep exits non-zero once the phrase is gone, so failure is decided from the
  # captured output rather than the return code. The comparison string must
  # equal grep's output exactly, with no stray trailing apostrophe.
  failed_when: cephhealth.stdout == "Degraded data redundancy"
  until: cephhealth.stdout == ""
  retries: 60
  delay: 10
- name: unset OSD noout - name: unset OSD noout
command: pvc storage osd unset noout command: pvc storage osd unset noout
- name: unflush node - name: unflush node
command: 'pvc node ready {{ ansible_hostname }} --wait' command: "pvc node ready {{ ansible_hostname }} --wait"
- name: wait 5 minutes for system to stabilize - name: wait 30 seconds for system to stabilize
pause: pause:
seconds: "300" seconds: "30"
become: no become: no
connection: local connection: local