# pvc-ansible/oneshot/upgrade-pvc-cluster_deb12.yml
#
# 537 lines
# 16 KiB
# YAML

---
# Play 1: Sanity check, information gathering, patroni freeze
- hosts: all
  remote_user: deploy
  become: true
  become_user: root
  gather_facts: true
  vars:
    # Oldest PVC release this procedure may be started from. Quoted so it is
    # always handled as a version string.
    minimum_pvc_version: "0.9.68"
  tasks:
    # dpkg --compare-versions exits non-zero when the comparison is false,
    # which fails this task.
    - name: check that cluster is on the minimum PVC version or newer
      shell: "dpkg --compare-versions $(pvc --quiet cluster status -f json | jq '.pvc_version' | tr -d '\"') ge {{ minimum_pvc_version }}"
      run_once: true

    # Best effort: a failure to enter maintenance mode does not stop the run.
    - name: set PVC maintenance mode
      command: pvc cluster maintenance on
      ignore_errors: true

    # Pick the highest two-digit version directory under /usr/lib/postgresql;
    # its bin/ directory is stored as old_postgres_bin_dir for pg_upgrade in
    # the Patroni play.
    - name: get current postgresql directory
      shell: "find /usr/lib/postgresql -mindepth 1 -maxdepth 1 -type d -name '[0-9][0-9]' | sort -n | tail -1"
      register: old_postgres_dir_output

    - name: set old_postgres_bin_dir fact
      set_fact:
        old_postgres_bin_dir: "{{ old_postgres_dir_output.stdout.strip() }}/bin"

    # patronictl reports the leader's member name; it is matched against the
    # first dot-separated label of each play host to recover the inventory
    # name of the leader.
    - name: get current patroni leader node
      shell: "patronictl -c /etc/patroni/config.yml list --format json | jq '.[] | select(.Role == \"Leader\") | .Member' | tr -d '\"'"
      register: patroni_leader_output

    - name: set patroni_leader fact
      set_fact:
        patroni_leader: "{% for node in ansible_play_hosts if node.split('.')[0].strip() == patroni_leader_output.stdout.strip() %}{{ node }}{% endfor %}"

    # Every play host that is not the leader.
    - name: set patroni_followers fact
      set_fact:
        patroni_followers: "{{ ansible_play_hosts | difference([ patroni_leader ]) }}"

    - debug:
        var: patroni_leader

    - debug:
        var: patroni_followers

    # Stop before touching Patroni if no play host matched the reported leader
    # (patroni_leader renders empty in that case). An assertion is used rather
    # than running `echo` on the target and inspecting its output.
    - name: fail out if patroni_leader is empty
      assert:
        that:
          - patroni_leader | default('', true) | length > 0
        fail_msg: "Could not determine the Patroni leader among the play hosts"

    # NOTE(review): `masked` is documented for the systemd module rather than
    # `service`; this presumably relies on `service` handing its arguments to
    # the systemd backend on these hosts - confirm.
    #
    # Executed once, delegated to each follower in turn; followers are stopped
    # before the leader.
    - name: stop and mask patroni service on followers to perform database upgrade (later)
      service:
        name: patroni
        state: stopped
        masked: true
      run_once: true
      delegate_to: "{{ item }}"
      loop: "{{ patroni_followers }}"

    - name: stop and mask patroni service on leader to perform database upgrade (later)
      service:
        name: patroni
        state: stopped
        masked: true
      run_once: true
      delegate_to: "{{ patroni_leader }}"
# Play 2: Per-node upgrade to Debian 12
- hosts: all
  remote_user: deploy
  become: true
  become_user: root
  gather_facts: true
  # Process one node at a time.
  serial: 1
  tasks:
    # Record the pre-upgrade release. The ansible_lsb values, when defined,
    # are set second and therefore take precedence.
    - name: set Debian details (with ansible_distribution_*)
      set_fact:
        debian_version: "{{ ansible_distribution_major_version }}"
        debian_codename: "{{ ansible_distribution_release }}"
      when: ansible_distribution_release is defined

    - name: set Debian details (with ansible_lsb)
      set_fact:
        debian_version: "{{ ansible_lsb.major_release }}"
        debian_codename: "{{ ansible_lsb.codename }}"
      when: ansible_lsb.codename is defined

    # --- Take the node out of service --------------------------------------
    - name: secondary node
      command: "pvc node secondary {{ ansible_hostname }}"
      ignore_errors: true

    - name: wait 30 seconds for system to stabilize
      pause:
        seconds: 30
      become: false
      connection: local

    - name: flush node
      command: "pvc node flush {{ ansible_hostname }} --wait"
      ignore_errors: true

    # Poll every 10 seconds, up to 60 times, until virsh lists no running
    # domains on this node.
    - name: ensure VMs are migrated away
      shell: "virsh list | grep running | wc -l"
      register: virshcount
      failed_when: virshcount.stdout != "0"
      until: virshcount.stdout == "0"
      retries: 60
      delay: 10

    # Poll until PVC itself reports the node's domain state as 'flushed'.
    - name: make sure all VMs have migrated
      shell: "pvc node info {{ ansible_hostname }} | grep '^Domain State' | awk '{ print $NF }'"
      register: pvcflush
      failed_when: pvcflush.stdout != 'flushed'
      until: pvcflush.stdout == 'flushed'
      retries: 60
      delay: 10

    - name: wait 15 seconds for system to stabilize
      pause:
        seconds: 15
      become: false
      connection: local

    # --- Stop cluster daemons on this node ---------------------------------
    - name: stop PVC daemon cleanly
      service:
        name: pvcnoded
        state: stopped

    - name: stop Zookeeper daemon cleanly
      service:
        name: zookeeper
        state: stopped

    - name: wait 15 seconds for system to stabilize
      pause:
        seconds: 15
      become: false
      connection: local

    # noout is set before the OSDs are stopped and unset again after the
    # post-reboot health checks near the end of this play.
    - name: set OSD noout
      command: pvc storage osd set noout
      ignore_errors: true

    # Collect the ceph-osd@N.service units systemd currently lists; grep
    # exits non-zero when there are none, hence ignore_errors.
    - name: get running OSD services
      shell: "systemctl | awk '{ print $1 }' | grep 'ceph-osd@[0-9]*.service'"
      ignore_errors: true
      register: osd_services

    - name: stop Ceph OSD daemons cleanly
      service:
        name: "{{ item }}"
        state: stopped
      ignore_errors: true
      with_items: "{{ osd_services.stdout_lines }}"

    - name: stop Ceph Monitor daemon cleanly
      service:
        name: "ceph-mon@{{ ansible_hostname }}"
        state: stopped
      ignore_errors: true

    - name: stop Ceph Manager daemon cleanly
      service:
        name: "ceph-mgr@{{ ansible_hostname }}"
        state: stopped
      ignore_errors: true

    - name: wait 30 seconds for system to stabilize
      pause:
        seconds: 30
      become: false
      connection: local

    # --- Prepare apt for bookworm ------------------------------------------
    - name: remove possible obsolete cset configuration
      file:
        dest: "/etc/systemd/system/{{ item }}"
        state: absent
      with_items:
        - ceph-osd@.service.d
        - ceph-osd-cpuset.service

    # Rewrite every occurrence of the pre-upgrade codename to bookworm.
    - name: replace sources.list entries will bookworm
      replace:
        dest: "{{ item }}"
        regexp: "{{ debian_codename }}"
        replace: "bookworm"
      with_items:
        - /etc/apt/sources.list

    # Only a line-ending "non-free" is rewritten.
    - name: replace sources.list non-free with non-free-firmware
      replace:
        dest: "{{ item }}"
        regexp: "non-free$"
        replace: "non-free-firmware"
      with_items:
        - /etc/apt/sources.list

    - name: remove security entry if on Debian 10
      lineinfile:
        dest: /etc/apt/sources.list
        regexp: "security.debian.org"
        state: absent
      when: debian_version|int < 11

    - name: link python to python3 on Debian 10
      file:
        state: link
        src: python3
        dest: /usr/bin/python
        force: true
      when: debian_version|int < 11

    - name: update apt cache
      apt:
        update_cache: true
      register: apt_res
      retries: 5
      until: apt_res is success

    # This seems insane, but works around a fatal error upgrading from d10 to d12
    # The `when` below applies to the whole block, so all three tasks are
    # skipped unless the pre-upgrade release is older than Debian 11.
    - name: perform initial upgrade on Debian 10
      when: debian_version|int < 11
      block:
        # The script sends its own stdout/stderr to /tmp/upgrade-d10.log and
        # restarts ssh as its final step.
        - name: install script to perform safe d10->d12 upgrade
          copy:
            dest: /tmp/upgrade-d10.sh
            # Quoted so the mode is always read as an octal string.
            mode: "0755"
            content: |
              #!/usr/bin/env bash
              exec 1>/tmp/upgrade-d10.log
              exec 2>&1
              set -o xtrace
              export DEBIAN_FRONTEND=noninteractive
              echo "Forcibly update libcrypt1 and libc6 to avoid conflicts"
              apt-get --download-only install --yes libcrypt1
              dpkg --force-all --install /var/cache/apt/archives/{libcrypt1,libpam0g,libc6}*.deb
              echo "Fix broken packages"
              apt --fix-broken --option Dpkg::Options::="--force-confold" install --yes
              echo "Upgrade core apt packages (forcing confold)"
              apt-get --no-install-recommends --option Dpkg::Options::="--force-confold" install --yes apt dpkg dpkg-dev base-files zstd
              echo "Upgrade ca-certificates (forcing confold)"
              apt-get --no-install-recommends --option Dpkg::Options::="--force-confold" install --yes ca-certificates
              echo "Upgrade sudo (forcing confold)"
              apt-get --no-install-recommends --option Dpkg::Options::="--force-confold" install --yes sudo
              echo "Upgrade core PVC packages (forcing confold)"
              apt-get --no-install-recommends --option Dpkg::Options::="--force-confold" install --yes ceph-osd ceph-mds ceph-mon ceph-mgr radosgw zookeeper zookeeper-bin patroni libvirt-daemon-system frr
              echo "Perform main upgrade (forcing confnew)"
              apt-get --no-install-recommends --option Dpkg::Options::="--force-confnew" upgrade --yes
              echo "Perform main dist-upgrade (forcing confnew)"
              apt-get --no-install-recommends --option Dpkg::Options::="--force-confnew" dist-upgrade --yes
              echo "Restart SSH so Ansible can continue"
              systemctl restart ssh

        - name: run script to perform safe d10->d12 upgrade (will take a long time)
          shell: /tmp/upgrade-d10.sh

        - name: install python-is-python3
          apt:
            name: python-is-python3
            state: latest

    # --- Upgrade packages --------------------------------------------------
    - name: aptitude upgrade
      apt:
        upgrade: safe
      register: apt_res
      retries: 5
      until: apt_res is success

    - name: aptitude dist upgrade and cleanup
      apt:
        autoremove: true
        autoclean: true
        upgrade: dist
      register: apt_res
      retries: 5
      until: apt_res is success

    - name: clean up obsolete kernels
      command: /usr/local/sbin/kernel-cleanup.sh

    - name: clean up obsolete packages
      command: /usr/local/sbin/dpkg-cleanup.sh

    - name: clean apt archives
      file:
        dest: /var/cache/apt/archives
        state: absent

    # Refresh facts after the upgrade, before the roles are applied.
    - name: regather facts
      setup:

    - name: include base role
      import_role:
        name: base

    - name: include pvc role
      import_role:
        name: pvc

    - name: aptitude full upgrade and cleanup
      apt:
        update_cache: true
        autoremove: true
        autoclean: true
        upgrade: full
      register: apt_res
      retries: 5
      until: apt_res is success

    - name: remove obsolete database directories
      file:
        dest: "{{ item }}"
        state: absent
      with_items:
        - "/etc/postgresql/13"
        - "/var/lib/postgresql/13"

    # --- Reboot and bring the node back ------------------------------------
    - name: restart system
      reboot:
        post_reboot_delay: 15
        reboot_timeout: 1800

    # awk prints OK when the first and third fields of the 'osds:' line of
    # `ceph osd stat` are equal, NOK otherwise.
    - name: make sure all OSDs are active
      shell: "ceph osd stat | grep 'osds:' | awk '{ if ( $1 == $3 ) { print \"OK\" } else { print \"NOK\" } }'"
      register: osdstat
      failed_when: osdstat.stdout == "NOK"
      until: osdstat.stdout == "OK"
      retries: 60
      delay: 10

    # grep -o prints the phrase only while it appears in `ceph health`, so an
    # empty stdout means recovery has finished. failed_when replaces the
    # default return-code check (grep exits 1 when nothing matches) and now
    # compares against the exact phrase grep emits.
    - name: make sure all PGs have recovered
      shell: "ceph health | grep -wo 'Degraded data redundancy'"
      register: cephhealth
      failed_when: cephhealth.stdout == "Degraded data redundancy"
      until: cephhealth.stdout == ""
      retries: 60
      delay: 10

    - name: unset OSD noout
      command: pvc storage osd unset noout

    - name: wait 30 seconds for system to stabilize
      pause:
        seconds: 30
      become: false
      connection: local

    - name: unflush node
      command: "pvc node ready {{ ansible_hostname }} --wait"

    # Poll until PVC reports the node's domain state as 'ready'.
    - name: make sure all VMs have returned
      shell: "pvc node info {{ ansible_hostname }} | grep '^Domain State' | awk '{ print $NF }'"
      register: pvcunflush
      failed_when: pvcunflush.stdout != 'ready'
      until: pvcunflush.stdout == 'ready'
      retries: 60
      delay: 10

    - name: wait 30 seconds for system to stabilize
      pause:
        seconds: 30
      become: false
      connection: local

    - name: reset any systemd failures
      command: systemctl reset-failed

    - name: wait 30 seconds for system to stabilize
      pause:
        seconds: 30
      become: false
      connection: local
# Play 3: Ceph upgrades for Debian 12
- hosts: all
  remote_user: deploy
  become: true
  become_user: root
  gather_facts: true
  tasks:
    # Save the current CRUSH map before it is rewritten below.
    - name: backup CRUSH map
      command: ceph osd getcrushmap -o /srv/backups/backup-crushmap-deb12-upgrade

    # Cluster-wide Ceph settings: the whole block is executed once rather
    # than once per host.
    - run_once: true
      block:
        - name: disable insecure global id reclaim in Ceph
          command: ceph config set mon auth_allow_insecure_global_id_reclaim false

        - name: set OSDs to require pacific
          command: ceph osd require-osd-release pacific

        - name: update CRUSH map
          command: ceph osd crush set-all-straw-buckets-to-straw2

        - name: set OSDs to quick fsck
          command: ceph config set osd bluestore_fsck_quick_fix_on_mount true

    # Runs on every host, each restarting its own manager instance.
    - name: restart ceph-mgr
      service:
        state: restarted
        name: "ceph-mgr@{{ ansible_hostname }}"
# Play 4: Ceph OSD restart (serial per-node)
- hosts: all
  remote_user: deploy
  become: true
  become_user: root
  gather_facts: true
  # One node at a time, so each node's OSDs are back and the cluster has
  # recovered before the next node is restarted.
  serial: 1
  tasks:
    - name: restart all OSDs on node
      shell: "systemctl restart --all ceph-osd@*"

    # awk prints OK when the first and third fields of the 'osds:' line of
    # `ceph osd stat` are equal, NOK otherwise. Polled every 10 seconds, up
    # to 60 times.
    - name: make sure all OSDs are active
      shell: "ceph osd stat | grep 'osds:' | awk '{ if ( $1 == $3 ) { print \"OK\" } else { print \"NOK\" } }'"
      register: osdstat
      failed_when: osdstat.stdout == "NOK"
      until: osdstat.stdout == "OK"
      retries: 60
      delay: 10

    # grep -o prints the phrase only while it appears in `ceph health`, so an
    # empty stdout means recovery has finished. failed_when replaces the
    # default return-code check (grep exits 1 when nothing matches) and now
    # compares against the exact phrase grep emits.
    - name: make sure all PGs have recovered
      shell: "ceph health | grep -wo 'Degraded data redundancy'"
      register: cephhealth
      failed_when: cephhealth.stdout == "Degraded data redundancy"
      until: cephhealth.stdout == ""
      retries: 60
      delay: 10
# Play 5: Patroni upgrades for Debian 12
- hosts: all
  remote_user: deploy
  become: true
  become_user: root
  gather_facts: true
  vars:
    # PostgreSQL major version to upgrade the database to; used to build the
    # /usr/lib/postgresql/<version>/bin and /var/lib/postgresql/<version> paths.
    new_postgres_version: 15
  tasks:
    # patroni_leader, patroni_followers and old_postgres_bin_dir are facts set
    # by the first play of this playbook.
    #
    # Executed once, delegated to each follower in turn; followers are stopped
    # before the leader.
    - name: stop patroni service on followers to perform database upgrade
      service:
        name: patroni
        state: stopped
      run_once: true
      delegate_to: "{{ item }}"
      loop: "{{ patroni_followers }}"

    - name: stop patroni service on leader to perform database upgrade
      service:
        name: patroni
        state: stopped
      run_once: true
      delegate_to: "{{ patroni_leader }}"

    # The database upgrade itself: every task in this block is executed once,
    # on the former Patroni leader.
    - run_once: true
      delegate_to: "{{ patroni_leader }}"
      block:
        - name: initialize new postgres database
          shell:
            cmd: "sudo -u postgres /usr/lib/postgresql/{{ new_postgres_version }}/bin/initdb -D /var/lib/postgresql/{{ new_postgres_version }}/pvc"
            chdir: "/var/lib/postgresql"

        - name: enable data checksums in new database
          shell:
            cmd: "sudo -u postgres /usr/lib/postgresql/{{ new_postgres_version }}/bin/pg_checksums --enable /var/lib/postgresql/{{ new_postgres_version }}/pvc"
            chdir: "/var/lib/postgresql"

        # -b: old binaries, -d: old data directory, -D: new data directory.
        - name: run postgres upgrade
          shell:
            cmd: "sudo -u postgres /usr/lib/postgresql/{{ new_postgres_version }}/bin/pg_upgrade -b {{ old_postgres_bin_dir }} -d /var/lib/postgresql/patroni/pvc -D /var/lib/postgresql/{{ new_postgres_version }}/pvc"
            chdir: "/var/lib/postgresql"

        # Swap directories: the old data is kept as pvc.old and the upgraded
        # data takes over the path Patroni uses.
        - name: move old postgres database out of the way
          shell:
            cmd: "sudo -u postgres mv /var/lib/postgresql/patroni/pvc /var/lib/postgresql/patroni/pvc.old"
            chdir: "/var/lib/postgresql"

        - name: move new postgres database into place
          shell:
            cmd: "sudo -u postgres mv /var/lib/postgresql/{{ new_postgres_version }}/pvc /var/lib/postgresql/patroni/pvc"
            chdir: "/var/lib/postgresql"

        - name: ensure recovery.conf is absent
          file:
            dest: /var/lib/postgresql/patroni/pvc/recovery.conf
            state: absent

        - name: delete patroni cluster znodes
          shell: "/usr/share/zookeeper/bin/zkCli.sh -server {{ ansible_hostname }}:2181 deleteall /patroni/pvc"

        # NOTE(review): `masked` is documented for the systemd module rather
        # than `service`; this presumably relies on `service` handing its
        # arguments to the systemd backend on these hosts - confirm.
        - name: start patroni service on leader
          service:
            name: patroni
            state: started
            masked: false

    # The followers' data directories are deleted before Patroni is started
    # on them again.
    - name: remove old data directory on patroni followers
      file:
        dest: /var/lib/postgresql/patroni/pvc
        state: absent
      run_once: true
      delegate_to: "{{ item }}"
      loop: "{{ patroni_followers }}"

    - name: start patroni service on followers
      service:
        name: patroni
        state: started
        masked: false
      run_once: true
      delegate_to: "{{ item }}"
      loop: "{{ patroni_followers }}"

    - name: wait 30 seconds for system to stabilize
      pause:
        seconds: 30
      become: false
      connection: local

    - name: restart pvcnoded on all nodes
      service:
        name: pvcnoded
        state: restarted

    - name: wait 30 seconds for system to stabilize
      pause:
        seconds: 30
      become: false
      connection: local

    # Delegate straight to the first play host. `loop` requires a list and
    # rejects the bare hostname string that ansible_play_hosts[0] yields, so
    # no loop is used for this single target.
    - name: set first node as primary coordinator
      command: "pvc node primary --wait"
      run_once: true
      delegate_to: "{{ ansible_play_hosts[0] }}"

    - name: unset PVC maintenance mode
      command: pvc cluster maintenance off