diff --git a/oneshot/README.md b/oneshot/README.md new file mode 100644 index 0000000..61a5762 --- /dev/null +++ b/oneshot/README.md @@ -0,0 +1,91 @@ +# PVC Oneshot Playbooks + +This directory contains playbooks to assist in automating day-to-day maintenance of a PVC cluster. These playbooks can be used independent of the main `pvc.yml` and roles setup to automate tasks. + +## `update-pvc-cluster.yml` + +This playbook performs a sequential full upgrade on all nodes in a PVC cluster. + +### Running the Playbook + +``` +$ ansible-playbook -i hosts -l [cluster] update-pvc-cluster.yml +``` + +### Caveats, Warnings and Notes + +* Ensure the cluster is in Optimal health before executing this playbook; all nodes should be up and reachable and operating normally + +* Be prepared to intervene if step 9 times out; OOB access may be required + +* This playbook is safe to run against a given host multiple times (e.g. to rerun after a failure); if a reboot is not required, it will not be performed + +### Process and Steps + +For each host in the cluster sequentially, do: + +1. Enable cluster maintenance mode + +1. Perform a full apt update, upgrade, autoremove, and clean + +1. Clean up obsolete kernels (`kernel-cleanup.sh`), packages/updated configuration files (`dpkg-cleanup.sh`), and the apt archive + +1. Verify library freshness and kernel version; if these produce no warnings, go to step 14 (skip reboot) + +1. Secondary the node, then wait 30 seconds + +1. Flush the node, wait for all VMs to migrate, then wait 15 seconds + +1. Stop and disable the `pvc-flush` daemon, stop the `pvcnoded` and `zookeeper` daemons, then wait 15 seconds + +1. Set Ceph OSD `noout` and stop all Ceph OSD, monitor, and manager processes, then wait 30 seconds + +1. Reboot the system and wait for it to come back up (maximum wait time of 1800 seconds) + +1. Ensure all OSDs become active and all PGs recover, then unset Ceph OSD `noout` + +1. 
Unflush the node, wait for all VMs to migrate, then wait 30 seconds + +1. Start and enable the `pvc-flush` daemon + +1. Reset any systemd failures + +1. Disable cluster maintenance mode, then wait 30 seconds + +## `upgrade-pvc-daemons.yml` + +This playbook performs a sequential upgrade of the PVC software daemons via apt on all nodes in a PVC cluster. This is a less invasive update process than the `update-pvc-cluster.yml` playbook as it does not flush or reboot the nodes, but does restart all PVC daemons (`pvcnoded`, `pvcapid`, and `pvcapid-worker`). + +### Running the Playbook + +``` +$ ansible-playbook -i hosts -l [cluster] upgrade-pvc-daemons.yml +``` + +### Caveats, Warnings, and Notes + +* Ensure the cluster is in Optimal health before executing this playbook; all nodes should be up and reachable and operating normally + +* This playbook is safe to run against a given host multiple times; if service restarts are not required, they will not be performed + +* This playbook should only be used in exceptional circumstances when performing a full `update-pvc-cluster.yml` would be too disruptive; it is always preferable to update all packages and reboot the nodes instead + +### Process and Steps + +For each node in the cluster sequentially, do: + +1. Enable cluster maintenance mode + +1. Perform an apt update, and install the 4 PVC packages (`pvc-client-cli`, `pvc-daemon-common`, `pvc-daemon-api`, `pvc-daemon-node`) + +1. Clean up the apt archive + +1. Verify if packages changed; if not, go to step 8 (skip restarts) + +1. Secondary the node, then wait 30 seconds + +1. Restart both active PVC daemons (`pvcapid-worker`, `pvcnoded`), then wait 60 seconds; since the node is not the primary coordinator, `pvcapid` will not be running + +1. Verify daemons are running + +1. 
Disable cluster maintenance mode, then wait 30 seconds diff --git a/oneshot/update-pvc-cluster.yml b/oneshot/update-pvc-cluster.yml index a6682a6..a2e34b8 100644 --- a/oneshot/update-pvc-cluster.yml +++ b/oneshot/update-pvc-cluster.yml @@ -11,21 +11,21 @@ - name: aptitude full upgrade and cleanup apt: - update_cache: "yes" - autoremove: "yes" - autoclean: "yes" - upgrade: "full" + update_cache: yes + autoremove: yes + autoclean: yes + upgrade: full - name: clean up obsolete kernels - command: "/usr/local/sbin/kernel-cleanup.sh" + command: /usr/local/sbin/kernel-cleanup.sh - name: clean up obsolete packages - command: "/usr/local/sbin/dpkg-cleanup.sh" + command: /usr/local/sbin/dpkg-cleanup.sh - name: clean apt archives file: - dest: "/var/cache/apt/archives" - state: "absent" + dest: /var/cache/apt/archives + state: absent - name: check library freshness command: /usr/lib/check_mk_agent/plugins/freshness @@ -47,12 +47,12 @@ - name: wait 30 seconds for system to stabilize pause: - seconds: "30" + seconds: 30 become: no connection: local - name: flush node - command: 'pvc node flush {{ ansible_hostname }} --wait' + command: "pvc node flush {{ ansible_hostname }} --wait" - name: ensure VMs are migrated away shell: "virsh list | grep running | wc -l" @@ -72,29 +72,29 @@ - name: wait 15 seconds for system to stabilize pause: - seconds: "15" + seconds: 15 become: no connection: local - name: stop and disable PVC flush daemon cleanly service: - name: "pvc-flush" + name: pvc-flush state: stopped enabled: no - name: stop PVC daemon cleanly service: - name: "pvcnoded" + name: pvcnoded state: stopped - name: stop Zookeeper daemon cleanly service: - name: "zookeeper" + name: zookeeper state: stopped - name: wait 15 seconds for system to stabilize pause: - seconds: "15" + seconds: 15 become: no connection: local @@ -127,7 +127,7 @@ - name: wait 30 seconds for system to stabilize pause: - seconds: "30" + seconds: 30 become: no connection: local @@ -168,13 +168,13 @@ - name: wait 
30 seconds for system to stabilize pause: - seconds: "30" + seconds: 30 become: no connection: local - name: start and enable PVC flush daemon cleanly service: - name: "pvc-flush" + name: pvc-flush state: started enabled: yes @@ -182,11 +182,11 @@ command: systemctl reset-failed when: freshness.changed or kernelversion.changed - - name: set PVC maintenance mode + - name: unset PVC maintenance mode command: pvc maintenance off - name: wait 30 seconds for system to stabilize pause: - seconds: "30" + seconds: 30 become: no connection: local
diff --git a/oneshot/upgrade-pvc-daemons.yml b/oneshot/upgrade-pvc-daemons.yml
new file mode 100644
index 0000000..a74ae39
--- /dev/null
+++ b/oneshot/upgrade-pvc-daemons.yml
@@ -0,0 +1,89 @@
+---
+# Upgrade the PVC packages via apt on every targeted host, one host at a
+# time (serial: 1), restarting the PVC daemons only if apt changed something.
+- hosts: all
+  remote_user: deploy
+  become: yes
+  become_user: root
+  gather_facts: yes
+  serial: 1
+  tasks:
+    - name: set PVC maintenance mode
+      command: pvc maintenance on
+
+    # The registered result gates the restart block further down.
+    - name: install latest PVC packages
+      apt:
+        update_cache: yes
+        autoremove: yes
+        autoclean: yes
+        package:
+          - pvc-client-cli
+          - pvc-daemon-common
+          - pvc-daemon-api
+          - pvc-daemon-node
+        state: latest
+      register: packages
+
+    - name: clean apt archives
+      file:
+        dest: /var/cache/apt/archives
+        state: absent
+
+    - name: restart system cleanly
+      block:
+        # NOTE(review): failures are ignored here, presumably because the
+        # node may already be secondary - confirm.
+        - name: secondary node
+          command: "pvc node secondary {{ ansible_hostname }}"
+          ignore_errors: yes
+
+        - name: wait 30 seconds for system to stabilize
+          pause:
+            seconds: 30
+          become: no
+          connection: local
+
+        - name: restart PVC daemons
+          service:
+            name: "{{ item }}"
+            state: restarted
+            enabled: yes
+          with_items:
+            - pvcapid-worker
+            - pvcnoded
+
+        - name: wait 60 seconds for system to stabilize
+          pause:
+            seconds: 60
+          become: no
+          connection: local
+
+        - name: get service facts
+          service_facts:
+
+        # Services are looked up by full unit name (with the .service suffix).
+        - name: fail if PVC daemons are not running
+          fail:
+            msg: "PVC daemons are not running"
+          when: >-
+            ansible_facts.services[item] is not defined
+            or ansible_facts.services[item]["state"] != "running"
+          with_items:
+            - pvcnoded.service
+            - pvcapid-worker.service
+
+        - name: reset any systemd failures
+          command: systemctl reset-failed
+      # Applies to the whole block: when apt changed nothing, the entire
+      # restart sequence above is skipped.
+      when: packages.changed
+
+    - name: unset PVC maintenance mode
+      command: pvc maintenance off
+
+    - name: wait 30 seconds for system to stabilize
+      pause:
+        seconds: 30
+      become: no
+      connection: local