# pvc-ansible/oneshot/update-pvc-cluster.yml
#
# 256 lines
# 7.0 KiB
# YAML
# Raw Permalink Normal View History

---
# 2023-09-18 09:42:01 -04:00
# Pull in the plays from update-pvc-daemons.yml; they are listed ahead of the
# plays defined in this file.
- import_playbook: update-pvc-daemons.yml
# Play: switch PVC cluster maintenance mode on.
- hosts: all
  gather_facts: true
  remote_user: deploy
  become: true
  become_user: root
  tasks:
    # run_once: the command is issued a single time rather than once per host.
    # ignore_errors: a non-zero exit from the command does not stop the run.
    - name: set PVC maintenance mode
      run_once: true
      ignore_errors: true
      command: pvc cluster maintenance on
# Play: upgrade packages on every host, one host at a time (serial: 1).
#
# A temporary /usr/sbin/policy-rc.d that exits 101 is installed for the
# duration of the upgrade (the task names call this the "norestart policy").
# The upgrade steps live in a block whose `always` section removes that file,
# so a failed apt run can no longer leave the policy script behind on the
# host. The host is still reported as failed when a step inside the block
# fails.
- hosts: all
  remote_user: deploy
  become: true
  become_user: root
  gather_facts: true
  serial: 1
  tasks:
    - name: upgrade packages with norestart policy in place
      block:
        - name: temporarily set norestart policy
          copy:
            dest: "/usr/sbin/policy-rc.d"
            mode: "0755"
            content: |
              #!/bin/sh
              exit 101

        # vhostmd is upgraded on its own first, taking the package's new
        # configuration files (dpkg force-confnew).
        - name: upgrade vhostmd (forced due to bugs)
          apt:
            update_cache: true
            name: vhostmd
            state: latest
            dpkg_options: "force-confnew"

        - name: apt full upgrade and cleanup
          apt:
            update_cache: true
            autoremove: true
            autoclean: true
            upgrade: full

        # Removes the whole archive directory, not just its contents.
        - name: clean apt archives
          file:
            dest: /var/cache/apt/archives
            state: absent

      always:
        # Runs whether or not the steps above succeeded.
        - name: remove temporarily set norestart policy
          file:
            dest: "/usr/sbin/policy-rc.d"
            state: "absent"
            force: "yes"
# Play: check each host for stale kernel/libraries/microcode and, if anything
# is stale, take the node out of service, reboot it and bring it back.
# Hosts are processed one at a time (serial: 1).
#
# NOTE(review): this file was recovered from a view that had lost its
# indentation. The `when:` condition is attached to the whole
# "restart system cleanly" block (everything from "secondary node" through
# "reset any systemd failures"); the final pause and the two cleanup scripts
# sit outside the block and run unconditionally. Confirm against the
# repository copy.
- hosts: all
  remote_user: deploy
  become: true
  become_user: root
  gather_facts: true
  serial: 1
  tasks:
    # The three needrestart probes never fail the play (failed_when: false).
    # A return code of 1 or 2 is recorded as "changed", which is what the
    # restart block below keys on.
    - name: check freshness (kernel)
      command: /usr/sbin/needrestart -p -k
      register: freshness_kernel
      changed_when: freshness_kernel.rc >= 1 and freshness_kernel.rc < 3
      failed_when: false

    - name: check freshness (libraries)
      command: /usr/sbin/needrestart -p -l
      register: freshness_libraries
      changed_when: freshness_libraries.rc >= 1 and freshness_libraries.rc < 3
      failed_when: false

    - name: check freshness (microcode)
      command: /usr/sbin/needrestart -p -w
      register: freshness_microcode
      changed_when: freshness_microcode.rc >= 1 and freshness_microcode.rc < 3
      failed_when: false

    - name: restart system cleanly
      when: freshness_kernel.changed or freshness_libraries.changed or freshness_microcode.changed
      block:
        - name: secondary node
          command: 'pvc node secondary --wait {{ ansible_hostname }}'
          ignore_errors: true

        # Pauses run over the local connection without privilege escalation.
        - name: wait 15 seconds for system to stabilize
          pause:
            seconds: 15
          become: false
          connection: local

        - name: flush node
          command: "pvc node flush {{ ansible_hostname }} --wait"
          ignore_errors: true

        # Polls up to 180 times, 10 seconds apart, until the last field of
        # the "Domain State" line reads "flushed".
        - name: make sure node is in flushed state
          shell: "pvc node info {{ ansible_hostname }} | grep '^Domain State' | awk '{ print $NF }'"
          register: pvcflush
          failed_when: pvcflush.stdout != 'flushed'
          until: pvcflush.stdout == 'flushed'
          retries: 180
          delay: 10

        # Polls until `virsh list` shows no line containing "running".
        - name: ensure VMs are migrated away
          shell: "virsh list | grep running | wc -l"
          register: virshcount
          failed_when: virshcount.stdout != "0"
          until: virshcount.stdout == "0"
          retries: 180
          delay: 10

        - name: wait 15 seconds for system to stabilize
          pause:
            seconds: 15
          become: false
          connection: local

        - name: stop PVC daemon cleanly
          service:
            name: pvcnoded
            state: stopped

        - name: stop Zookeeper daemon cleanly
          service:
            name: zookeeper
            state: stopped

        - name: wait 15 seconds for system to stabilize
          pause:
            seconds: 15
          become: false
          connection: local

        - name: set OSD noout
          command: pvc storage osd set noout

        # grep exits non-zero when no OSD unit is listed; that is tolerated
        # and the following loop then iterates over an empty list.
        - name: get running OSD services
          shell: "systemctl | awk '{ print $1 }' | grep 'ceph-osd@[0-9]*.service'"
          ignore_errors: true
          register: osd_services

        - name: stop Ceph OSD daemons cleanly
          service:
            name: "{{ item }}"
            state: stopped
          ignore_errors: true
          with_items: "{{ osd_services.stdout_lines }}"

        - name: wait 30 seconds for system to stabilize
          pause:
            seconds: 30
          become: false
          connection: local

        - name: stop Ceph Monitor daemon cleanly
          service:
            name: "ceph-mon@{{ ansible_hostname }}"
            state: stopped
          ignore_errors: true

        - name: stop Ceph Manager daemon cleanly
          service:
            name: "ceph-mgr@{{ ansible_hostname }}"
            state: stopped
          ignore_errors: true

        - name: wait 30 seconds for system to stabilize
          pause:
            seconds: 30
          become: false
          connection: local

        - name: restart system
          reboot:
            post_reboot_delay: 15
            reboot_timeout: 1800

        # The awk script prints OK when fields 1 and 3 of the "osds:" line
        # are equal, NOK otherwise; polled up to 60 times, 10 seconds apart.
        - name: make sure all OSDs are active
          shell: "ceph osd stat | grep 'osds:' | awk '{ if ( $1 == $3 ) { print \"OK\" } else { print \"NOK\" } }'"
          register: osdstat
          failed_when: osdstat.stdout == "NOK"
          until: osdstat.stdout == "OK"
          retries: 60
          delay: 10

        # grep -wo prints exactly the matched phrase, so stdout is either
        # that phrase or empty. The failure condition compares against the
        # phrase itself (a stray quote previously made it unmatchable).
        - name: make sure all PGs have recovered
          shell: "ceph health | grep -wo 'Degraded data redundancy'"
          register: cephhealth
          failed_when: cephhealth.stdout == "Degraded data redundancy"
          until: cephhealth.stdout == ""
          retries: 60
          delay: 10

        - name: unset OSD noout
          command: pvc storage osd unset noout
          ignore_errors: true

        - name: make sure node daemon is in run state
          shell: "pvc node info {{ ansible_hostname }} | grep '^Daemon State' | awk '{ print $NF }'"
          register: pvcdaemon
          failed_when: pvcdaemon.stdout != 'run'
          until: pvcdaemon.stdout == 'run'
          retries: 180
          delay: 10

        - name: unflush node
          command: "pvc node ready {{ ansible_hostname }} --wait"
          ignore_errors: true

        - name: make sure all VMs have returned
          shell: "pvc node info {{ ansible_hostname }} | grep '^Domain State' | awk '{ print $NF }'"
          register: pvcunflush
          failed_when: pvcunflush.stdout != 'ready'
          until: pvcunflush.stdout == 'ready'
          retries: 180
          delay: 10

        - name: wait 30 seconds for system to stabilize
          pause:
            seconds: 30
          become: false
          connection: local

        - name: reset any systemd failures
          command: systemctl reset-failed

    - name: wait 30 seconds for system to stabilize
      pause:
        seconds: 30
      become: false
      connection: local

    - name: clean up obsolete kernels
      command: /usr/local/sbin/kernel-cleanup.sh

    - name: clean up obsolete packages
      command: /usr/local/sbin/dpkg-cleanup.sh
# Play: after a short pause, switch PVC cluster maintenance mode off.
- hosts: all
  gather_facts: true
  remote_user: deploy
  become: true
  become_user: root
  tasks:
    # The pause runs over the local connection without privilege escalation.
    - name: wait 15 seconds for system to stabilize
      connection: local
      become: false
      pause:
        seconds: 15

    # run_once: the command is issued a single time rather than once per host.
    - name: unset PVC maintenance mode
      run_once: true
      command: pvc cluster maintenance off