---
# Rolling upgrade of a PVC cluster to Debian 12 (bookworm).
#
# The playbook is split into six plays that run in order against all hosts:
#   1. Sanity checks, role application, Patroni leader discovery, DB backup
#   2. Per-node (serial) OS upgrade and reboot
#   3. Cluster-wide Ceph configuration changes
#   4. Per-node (serial) Ceph OSD restart
#   5. PostgreSQL/Patroni re-initialization from the backup taken in play 1
#   6. Final role application and exit from maintenance mode
#
# NOTE(review): this file was reconstructed from a whitespace-collapsed copy.
# The attachment level of block-level keywords (`when`, `run_once`,
# `delegate_to` directly after the last task of a `block:`) was inferred from
# token order -- confirm against the canonical playbook.

# Play 1: Sanity check, initial configuration upgrade, information gathering, patroni freeze
- hosts: all
  remote_user: deploy
  become: true
  become_user: root
  gather_facts: true
  vars:
    # Quoted so the value is always handled as a version string.
    minimum_pvc_version: "0.9.69"
  tasks:
    # dpkg --compare-versions exits non-zero when the comparison is false,
    # which fails this task and aborts the run.
    - name: check that cluster is on the minimum PVC version or newer
      shell: "dpkg --compare-versions $(pvc --quiet cluster status -f json | jq '.pvc_version' | tr -d '\"') ge {{ minimum_pvc_version }}"
      run_once: true

    - name: set PVC maintenance mode
      command: pvc cluster maintenance on
      ignore_errors: true

    - name: include base role
      import_role:
        name: base

    - name: include pvc role
      import_role:
        name: pvc

    # Picks the numerically highest two-digit directory under
    # /usr/lib/postgresql.
    - name: get current postgresql directory
      shell: "find /usr/lib/postgresql -mindepth 1 -maxdepth 1 -type d -name '[0-9][0-9]' | sort -n | tail -1"
      register: old_postgres_dir_output

    - name: set old_postgres_bin_dir fact
      set_fact:
        old_postgres_bin_dir: "{{ old_postgres_dir_output.stdout.strip() }}/bin"

    - name: get current patroni leader node
      shell: "patronictl -c /etc/patroni/config.yml list --format json | jq '.[] | select(.Role == \"Leader\") | .Member' | tr -d '\"'"
      register: patroni_leader_output

    # Maps the short member name reported by patronictl back to the matching
    # inventory hostname; the result is an empty string when nothing matches.
    - name: set patroni_leader fact
      set_fact:
        patroni_leader: "{% for node in ansible_play_hosts if node.split('.')[0].strip() == patroni_leader_output.stdout.strip() %}{{ node }}{% endfor %}"

    - name: set patroni_followers fact
      set_fact:
        patroni_followers: "{{ ansible_play_hosts | difference([ patroni_leader ]) }}"

    - debug:
        var: patroni_leader

    - debug:
        var: patroni_followers

    # Guards the delegate_to usages of patroni_leader below and in play 5.
    - name: fail out if patroni_leader is empty
      command: "echo {{ patroni_leader }}"
      register: check_output
      failed_when: check_output.stdout == ""

    - name: stop and mask patroni service on followers to perform database upgrade (later)
      systemd_service:
        name: patroni.service
        state: stopped
        masked: true
      run_once: true
      delegate_to: "{{ item }}"
      loop: "{{ patroni_followers }}"

    # The dump is written to /var/lib/postgresql/upgrade_dump.sql on the
    # leader; play 5 imports it and then removes it.
    - name: perform a backup of the primary
      shell:
        cmd: "sudo -u postgres /usr/bin/pg_dumpall > upgrade_dump.sql"
        chdir: "/var/lib/postgresql"
      run_once: true
      delegate_to: "{{ patroni_leader }}"

    - name: stop and mask patroni service on leader to perform database upgrade (later)
      systemd_service:
        name: patroni.service
        state: stopped
        masked: true
      run_once: true
      delegate_to: "{{ patroni_leader }}"

# Play 2: Per-node upgrade to Debian 12
- hosts: all
  remote_user: deploy
  become: true
  become_user: root
  gather_facts: true
  # One node at a time.
  serial: 1
  tasks:
    - name: set Debian details (with ansible_distribution_*)
      set_fact:
        debian_version: "{{ ansible_distribution_major_version }}"
        debian_codename: "{{ ansible_distribution_release }}"
      when: ansible_distribution_release is defined

    - name: set Debian details (with ansible_lsb)
      set_fact:
        debian_version: "{{ ansible_lsb.major_release }}"
        debian_codename: "{{ ansible_lsb.codename }}"
      when: ansible_lsb.codename is defined

    - name: secondary node
      command: "pvc node secondary {{ ansible_hostname }}"
      ignore_errors: true

    - name: wait 30 seconds for system to stabilize
      pause:
        seconds: 30
      become: false
      connection: local

    - name: flush node
      command: "pvc node flush {{ ansible_hostname }} --wait"
      ignore_errors: true

    # Polls for up to 60 attempts, 10 seconds apart, until virsh reports no
    # running domains.
    - name: ensure VMs are migrated away
      shell: "virsh list | grep running | wc -l"
      register: virshcount
      failed_when: virshcount.stdout != "0"
      until: virshcount.stdout == "0"
      retries: 60
      delay: 10

    - name: make sure all VMs have migrated
      shell: "pvc node info {{ ansible_hostname }} | grep '^Domain State' | awk '{ print $NF }'"
      register: pvcflush
      failed_when: pvcflush.stdout != 'flushed'
      until: pvcflush.stdout == 'flushed'
      retries: 60
      delay: 10

    - name: wait 15 seconds for system to stabilize
      pause:
        seconds: 15
      become: false
      connection: local

    - name: stop PVC daemon cleanly
      service:
        name: pvcnoded
        state: stopped

    - name: stop Zookeeper daemon cleanly
      service:
        name: zookeeper
        state: stopped

    - name: wait 15 seconds for system to stabilize
      pause:
        seconds: 15
      become: false
      connection: local

    - name: set OSD noout
      command: pvc storage osd set noout
      ignore_errors: true

    - name: get running OSD services
      shell: "systemctl | awk '{ print $1 }' | grep 'ceph-osd@[0-9]*.service'"
      ignore_errors: true
      register: osd_services

    - name: stop Ceph OSD daemons cleanly
      service:
        name: "{{ item }}"
        state: stopped
      ignore_errors: true
      with_items: "{{ osd_services.stdout_lines }}"

    - name: stop Ceph Monitor daemon cleanly
      service:
        name: "ceph-mon@{{ ansible_hostname }}"
        state: stopped
      ignore_errors: true

    - name: stop Ceph Manager daemon cleanly
      service:
        name: "ceph-mgr@{{ ansible_hostname }}"
        state: stopped
      ignore_errors: true

    - name: wait 30 seconds for system to stabilize
      pause:
        seconds: 30
      become: false
      connection: local

    - name: remove possible obsolete cset configuration
      file:
        dest: "/etc/systemd/system/{{ item }}"
        state: absent
      with_items:
        - ceph-osd@.service.d
        - ceph-osd-cpuset.service

    - name: replace sources.list entries with bookworm
      replace:
        dest: "{{ item }}"
        regexp: "{{ debian_codename }}"
        replace: "bookworm"
      with_items:
        - /etc/apt/sources.list

    - name: replace sources.list non-free with non-free-firmware
      replace:
        dest: "{{ item }}"
        regexp: "non-free$"
        replace: "non-free-firmware"
      with_items:
        - /etc/apt/sources.list

    - name: remove security entry if on Debian 10
      lineinfile:
        dest: /etc/apt/sources.list
        regexp: "security.debian.org"
        state: absent
      when: debian_version|int < 11

    - name: link python to python3 on Debian 10
      file:
        state: link
        src: python3
        dest: /usr/bin/python
        force: true
      when: debian_version|int < 11

    - name: update apt cache
      apt:
        update_cache: true
      register: apt_res
      retries: 5
      until: apt_res is success

    # This seems insane, but works around a fatal error upgrading from d10 to d12
    # NOTE(review): `when` is assumed to apply to the whole block (all three
    # tasks are Debian 10 specific) -- confirm.
    - name: perform initial upgrade on Debian 10
      block:
        - name: install script to perform safe d10->d12 upgrade
          copy:
            dest: /tmp/upgrade-d10.sh
            # Quoted so the mode is passed as an octal string rather than
            # relying on YAML 1.1 integer parsing.
            mode: "0755"
            content: |
              #!/usr/bin/env bash
              exec 1>/tmp/upgrade-d10.log
              exec 2>&1
              set -o xtrace
              export DEBIAN_FRONTEND=noninteractive
              echo "Forcibly update libcrypt1 and libc6 to avoid conflicts"
              apt-get --download-only install --yes libcrypt1
              dpkg --force-all --install /var/cache/apt/archives/{libcrypt1,libpam0g,libc6}*.deb
              echo "Fix broken packages"
              apt --fix-broken --option Dpkg::Options::="--force-confold" install --yes
              echo "Upgrade core apt packages (forcing confold)"
              apt-get --no-install-recommends --option Dpkg::Options::="--force-confold" install --yes apt dpkg dpkg-dev base-files zstd
              echo "Upgrade ca-certificates (forcing confold)"
              apt-get --no-install-recommends --option Dpkg::Options::="--force-confold" install --yes ca-certificates
              echo "Upgrade sudo (forcing confold)"
              apt-get --no-install-recommends --option Dpkg::Options::="--force-confold" install --yes sudo
              echo "Upgrade core PVC packages (forcing confold)"
              apt-get --no-install-recommends --option Dpkg::Options::="--force-confold" install --yes ceph-osd ceph-mds ceph-mon ceph-mgr radosgw zookeeper zookeeper-bin patroni libvirt-daemon-system frr
              echo "Perform main upgrade (forcing confnew)"
              apt-get --no-install-recommends --option Dpkg::Options::="--force-confnew" upgrade --yes
              echo "Perform main dist-upgrade (forcing confnew)"
              apt-get --no-install-recommends --option Dpkg::Options::="--force-confnew" dist-upgrade --yes
              echo "Restart SSH so Ansible can continue"
              systemctl restart ssh

        - name: run script to perform safe d10->d12 upgrade (will take a long time)
          shell: /tmp/upgrade-d10.sh

        - name: install python-is-python3
          apt:
            name: python-is-python3
            state: latest
      when: debian_version|int < 11

    - name: apt upgrade
      apt:
        upgrade: safe
      register: apt_res
      retries: 5
      until: apt_res is success

    - name: apt dist upgrade and cleanup
      apt:
        autoremove: true
        autoclean: true
        upgrade: dist
      register: apt_res
      retries: 5
      until: apt_res is success

    - name: clean up obsolete kernels
      command: /usr/local/sbin/kernel-cleanup.sh

    - name: clean up obsolete packages
      command: /usr/local/sbin/dpkg-cleanup.sh

    - name: clean apt archives
      file:
        dest: /var/cache/apt/archives
        state: absent

    # Refresh facts after the upgrade, before the roles are re-applied.
    - name: regather facts
      setup:

    - name: include base role
      import_role:
        name: base

    - name: include pvc role
      import_role:
        name: pvc

    - name: apt full upgrade and cleanup
      apt:
        update_cache: true
        autoremove: true
        autoclean: true
        upgrade: full
      register: apt_res
      retries: 5
      until: apt_res is success

    - name: remove obsolete database directories
      file:
        dest: "{{ item }}"
        state: absent
      with_items:
        - "/etc/postgresql/13"
        - "/var/lib/postgresql/13"

    - name: restart system
      reboot:
        post_reboot_delay: 15
        reboot_timeout: 1800

    # Prints OK when the first and third fields of the 'osds:' line of
    # `ceph osd stat` are equal.
    - name: make sure all OSDs are active
      shell: "ceph osd stat | grep 'osds:' | awk '{ if ( $1 == $3 ) { print \"OK\" } else { print \"NOK\" } }'"
      register: osdstat
      failed_when: osdstat.stdout == "NOK"
      until: osdstat.stdout == "OK"
      retries: 60
      delay: 10

    # grep prints nothing (and exits non-zero) once the health output no
    # longer contains the phrase; failed_when overrides the rc-based failure
    # so an empty stdout counts as success.
    - name: make sure all PGs have recovered
      shell: "ceph health | grep -wo 'Degraded data redundancy'"
      register: cephhealth
      failed_when: cephhealth.stdout == "Degraded data redundancy"
      until: cephhealth.stdout == ""
      retries: 60
      delay: 10

    - name: unset OSD noout
      command: pvc storage osd unset noout

    - name: wait 30 seconds for system to stabilize
      pause:
        seconds: 30
      become: false
      connection: local

    - name: unflush node
      command: "pvc node ready {{ ansible_hostname }} --wait"

    - name: make sure all VMs have returned
      shell: "pvc node info {{ ansible_hostname }} | grep '^Domain State' | awk '{ print $NF }'"
      register: pvcunflush
      failed_when: pvcunflush.stdout != 'ready'
      until: pvcunflush.stdout == 'ready'
      retries: 60
      delay: 10

    - name: wait 30 seconds for system to stabilize
      pause:
        seconds: 30
      become: false
      connection: local

    - name: reset any systemd failures
      command: systemctl reset-failed

    - name: wait 30 seconds for system to stabilize
      pause:
        seconds: 30
      become: false
      connection: local

# Play 3: Ceph upgrades for Debian 12
- hosts: all
  remote_user: deploy
  become: true
  become_user: root
  gather_facts: true
  tasks:
    - name: backup CRUSH map
      command: ceph osd getcrushmap -o /srv/backups/backup-crushmap-deb12-upgrade

    # NOTE(review): `run_once` is assumed to apply to the whole block, so
    # these cluster-wide settings are applied from a single host -- confirm.
    - block:
        - name: disable insecure global id reclaim in Ceph
          command: ceph config set mon auth_allow_insecure_global_id_reclaim false

        - name: disable default pg autoscaler
          command: ceph config set global osd_pool_default_pg_autoscale_mode off

        # Drops the first column of `ceph osd lspools` and strips spaces,
        # leaving one pool name per line.
        - name: get pools
          shell: "ceph osd lspools | awk '{ $1=\"\"; print $0 }' | tr -d ' '"
          register: pool_output

        # stdout_lines is an empty list when there is no output, so the loop
        # below is skipped instead of running once with an empty pool name.
        - name: set pools fact
          set_fact:
            ceph_pools: "{{ pool_output.stdout_lines }}"

        - name: disable pg autoscaler on each pool
          command: "ceph osd pool set {{ item }} pg_autoscale_mode off"
          loop: "{{ ceph_pools }}"

        - name: set OSDs to require pacific
          command: ceph osd require-osd-release pacific

        - name: update CRUSH map
          command: ceph osd crush set-all-straw-buckets-to-straw2

        - name: set OSDs to quick fsck
          command: ceph config set osd bluestore_fsck_quick_fix_on_mount true
      run_once: true

    - name: restart ceph-mgr
      service:
        name: "ceph-mgr@{{ ansible_hostname }}"
        state: restarted

# Play 4: Ceph OSD restart (serial per-node)
- hosts: all
  remote_user: deploy
  become: true
  become_user: root
  gather_facts: true
  serial: 1
  tasks:
    - name: restart all OSDs on node
      shell: "systemctl restart --all ceph-osd@*"

    - name: make sure all OSDs are active
      shell: "ceph osd stat | grep 'osds:' | awk '{ if ( $1 == $3 ) { print \"OK\" } else { print \"NOK\" } }'"
      register: osdstat
      failed_when: osdstat.stdout == "NOK"
      until: osdstat.stdout == "OK"
      retries: 60
      delay: 10

    # Same check as in play 2: empty grep output means no degraded PGs.
    - name: make sure all PGs have recovered
      shell: "ceph health | grep -wo 'Degraded data redundancy'"
      register: cephhealth
      failed_when: cephhealth.stdout == "Degraded data redundancy"
      until: cephhealth.stdout == ""
      retries: 60
      delay: 10

# Play 5: Patroni upgrades for Debian 12
# Uses the patroni_leader and patroni_followers facts set in play 1.
- hosts: all
  remote_user: deploy
  become: true
  become_user: root
  gather_facts: true
  vars:
    new_postgres_version: 15
  tasks:
    - name: stop patroni service on followers to perform database upgrade
      service:
        name: patroni
        state: stopped
      run_once: true
      delegate_to: "{{ item }}"
      loop: "{{ patroni_followers }}"

    - name: stop patroni service on leader to perform database upgrade
      service:
        name: patroni
        state: stopped
      run_once: true
      delegate_to: "{{ patroni_leader }}"

    # Rebuilds the database on the former leader from the play 1 dump.
    # NOTE(review): `run_once` and `delegate_to` are assumed to apply to the
    # whole block -- confirm.
    - block:
        - name: move old postgres database out of the way
          shell:
            cmd: "sudo -u postgres mv /var/lib/postgresql/patroni/pvc /var/lib/postgresql/patroni/pvc.old"
            chdir: "/var/lib/postgresql"

        - name: initialize new postgres database
          shell:
            cmd: "sudo -u postgres /usr/lib/postgresql/{{ new_postgres_version }}/bin/initdb -D /var/lib/postgresql/patroni/pvc"
            chdir: "/var/lib/postgresql"

        - name: ensure recovery.conf is absent
          file:
            dest: /var/lib/postgresql/patroni/pvc/recovery.conf
            state: absent

        - name: delete patroni cluster znodes
          shell: "/usr/share/zookeeper/bin/zkCli.sh -server {{ ansible_hostname }}:2181 deleteall /patroni/pvc"

        - name: start patroni service on leader
          systemd_service:
            name: patroni.service
            state: started
            masked: false

        - name: wait 15 seconds for system to stabilize
          pause:
            seconds: 15
          become: false
          connection: local

        - name: import backup of the primary
          shell:
            cmd: "sudo -u postgres /usr/bin/psql < upgrade_dump.sql"
            chdir: "/var/lib/postgresql"

        - name: apply schema updates
          shell: /usr/share/pvc/pvc-api-db-upgrade
          ignore_errors: true

        - name: remove temporary backup
          file:
            dest: /var/lib/postgresql/upgrade_dump.sql
            state: absent
      run_once: true
      delegate_to: "{{ patroni_leader }}"

    - name: remove old data directory on patroni followers
      file:
        dest: /var/lib/postgresql/patroni/pvc
        state: absent
      run_once: true
      delegate_to: "{{ item }}"
      loop: "{{ patroni_followers }}"

    - name: start patroni service on followers
      systemd_service:
        name: patroni.service
        state: started
        masked: false
      run_once: true
      delegate_to: "{{ item }}"
      loop: "{{ patroni_followers }}"

    - name: wait 30 seconds for system to stabilize
      pause:
        seconds: 30
      become: false
      connection: local

    # run_once plus a delegated loop restarts the daemons one host after
    # another, with a 30 second sleep after each host.
    - name: restart PVC daemons on all nodes
      shell: "systemctl restart pvchealthd && systemctl restart pvcworkerd && systemctl restart pvcnoded && sleep 30"
      run_once: true
      delegate_to: "{{ item }}"
      loop: "{{ ansible_play_hosts }}"

    - name: wait 30 seconds for system to stabilize
      pause:
        seconds: 30
      become: false
      connection: local

    - name: set first node as primary coordinator
      command: "pvc node primary --wait {{ ansible_play_hosts[0].split('.')[0] }}"
      run_once: true
      delegate_to: "{{ ansible_play_hosts[0] }}"

    - name: wait 15 seconds for system to stabilize
      pause:
        seconds: 15
      become: false
      connection: local

# Play 6: Final role updates to normalize cluster
- hosts: all
  remote_user: deploy
  become: true
  become_user: root
  gather_facts: true
  tasks:
    - name: include base role
      import_role:
        name: base

    - name: include pvc role
      import_role:
        name: pvc

    - name: unset PVC maintenance mode
      command: pvc cluster maintenance off
      run_once: true