Compare commits

..

17 Commits

Author SHA1 Message Date
ecf4b764de Add additional primary node switch 2023-08-29 11:25:59 -04:00
141af2b64a Ensure core pg_hba entries are present 2023-08-29 11:00:44 -04:00
51c064efaa Add one more fact regathering 2023-08-29 09:49:16 -04:00
bc1bfd4cc2 Add zstd dependency for D10+ 2023-08-29 09:30:35 -04:00
12061a75da Add PVC upgrade to Debian 12 playbook 2023-08-29 09:28:16 -04:00
e0c32d5799 Fix Patroni upgrade and D12 support 2023-08-29 02:00:29 -04:00
9cf0d18d38 Fix reboot 2023-08-28 23:42:46 -04:00
e9ac08be3f Ensure facts are always regathered 2023-08-28 13:52:23 -04:00
b684eb8c8c Add Debian 12 Patroni config 2023-08-28 11:03:07 -04:00
4923e17cb1 Fix warning in user module 2023-08-28 11:00:16 -04:00
6be00cfb50 Add retries to all apt commands 2023-08-28 10:57:57 -04:00
67f21b8958 Ignore errors enabling vhostmd
Seems to cause issues in bookworm.
2023-08-27 23:14:32 -04:00
28f33613a9 Use non-free-firmware repository 2023-08-27 23:14:20 -04:00
fdedfb35bc Add final pvcnoded restart 2023-08-27 13:20:06 -04:00
a3916d932c Allow specifying interface mode 2023-08-27 01:21:03 -04:00
79859269b7 Fix support for bookworm 2023-08-27 01:09:43 -04:00
e1716a534b Revert "Disallow Deb10 -> Deb12 upgrades"
This reverts commit 6e56dd3e4a.
2023-08-26 14:59:55 -04:00
13 changed files with 426 additions and 49 deletions

View File

@ -1,4 +1,69 @@
---
# Play 1: Sanity check, information gathering, patroni freeze
- hosts: all
  remote_user: deploy
  become: yes
  become_user: root
  gather_facts: yes
  vars:
    minimum_pvc_version: 0.9.68
  tasks:
    # Version gate: the task fails when dpkg --compare-versions reports that
    # the cluster's pvc_version is not "ge" minimum_pvc_version.
    - name: check that cluster is on the minimum PVC version or newer
      run_once: yes
      shell: >-
        dpkg --compare-versions $(pvc --quiet cluster status -f json | jq '.pvc_version' | tr -d '"') ge {{ minimum_pvc_version }}

    # Errors from this command are ignored, so the play continues either way.
    - name: set PVC maintenance mode
      ignore_errors: yes
      command: pvc cluster maintenance on

    # Keeps the last two-digit-named directory under /usr/lib/postgresql
    # after piping the find output through `sort -n`.
    - name: get current postgresql directory
      register: old_postgres_dir_output
      shell: >-
        find /usr/lib/postgresql -mindepth 1 -maxdepth 1 -type d -name '[0-9][0-9]' | sort -n | tail -1

    - name: set old_postgres_bin_dir fact
      set_fact:
        old_postgres_bin_dir: "{{ old_postgres_dir_output.stdout.strip() }}/bin"

    # Extracts the Member field of the entry whose Role is "Leader".
    - name: get current patroni leader node
      register: patroni_leader_output
      shell: >-
        patronictl -c /etc/patroni/config.yml list --format json | jq '.[] | select(.Role == "Leader") | .Member' | tr -d '"'

    # Selects the play host whose first dot-separated label equals the
    # leader name reported above; empty if no host matches.
    - name: set patroni_leader fact
      set_fact:
        patroni_leader: "{% for node in ansible_play_hosts if node.split('.')[0].strip() == patroni_leader_output.stdout.strip() %}{{ node }}{% endfor %}"

    # Every play host except the leader.
    - name: set patroni_followers fact
      set_fact:
        patroni_followers: "{{ ansible_play_hosts | difference([ patroni_leader ]) }}"

    - debug:
        var: patroni_leader

    - debug:
        var: patroni_followers

    # Echoes the leader name and fails when nothing was printed.
    - name: fail out if patroni_leader is empty
      register: check_output
      failed_when: check_output.stdout == ""
      command: "echo {{ patroni_leader }}"

    # Followers first, then the leader; each stop is issued once and
    # delegated to the node concerned.
    - name: stop and mask patroni service on followers to perform database upgrade (later)
      run_once: yes
      delegate_to: "{{ item }}"
      loop: "{{ patroni_followers }}"
      service:
        name: patroni
        state: stopped
        masked: yes

    - name: stop and mask patroni service on leader to perform database upgrade (later)
      run_once: yes
      delegate_to: "{{ patroni_leader }}"
      service:
        name: patroni
        state: stopped
        masked: yes
# Play 2: Per-node upgrade to Debian 12
- hosts: all
remote_user: deploy
become: yes
@ -18,14 +83,6 @@
debian_codename: "{{ ansible_lsb.codename }}"
when: ansible_lsb.codename is defined
- name: bail out if incorrect version
shell: echo "Can't upgrade from Debian < 11; run upgrade-pvc-cluster_deb11.yml against cluster first!" && /bin/false
delegate_to: localhost
when: debian_version|int < 11
- name: set PVC maintenance mode
command: pvc cluster maintenance on
- name: secondary node
command: "pvc node secondary {{ ansible_hostname }}"
ignore_errors: yes
@ -38,6 +95,7 @@
- name: flush node
command: "pvc node flush {{ ansible_hostname }} --wait"
ignore_errors: yes
- name: ensure VMs are migrated away
shell: "virsh list | grep running | wc -l"
@ -79,6 +137,7 @@
- name: set OSD noout
command: pvc storage osd set noout
ignore_errors: yes
- name: get running OSD services
shell: "systemctl | awk '{ print $1 }' | grep 'ceph-osd@[0-9]*.service'"
@ -134,17 +193,73 @@
with_items:
- /etc/apt/sources.list
- name: aptitude dist upgrade and cleanup
- name: remove security entry if on Debian 10
lineinfile:
dest: /etc/apt/sources.list
regexp: "security.debian.org"
state: absent
when: debian_version|int < 11
- name: link python to python3 on Debian 10
file:
state: link
src: python3
dest: /usr/bin/python
force: yes
when: debian_version|int < 11
- name: update apt cache
apt:
update_cache: yes
register: apt_res
retries: 5
until: apt_res is success
# This seems insane, but works around a fatal error upgrading from d10 to d12
- name: perform initial upgrade on Debian 10
block:
- name: install script to perform safe d10->d12 upgrade
copy:
dest: /tmp/upgrade-d10.sh
mode: 0755
content: |
#!/usr/bin/env bash
export DEBIAN_FRONTEND=noninteractive
apt-get --download-only install libcrypt1
dpkg --force-all --install /var/cache/apt/archives/{libcrypt1,libpam0g,libc6}*.deb
apt --fix-broken install
apt-get --no-install-recommends install apt dpkg dpkg-dev base-files zstd
apt-get --no-install-recommends install ca-certificates
apt-get --no-install-recommends --option Dpkg::Options::="--force-confold" install sudo
apt-get --no-install-recommends --option Dpkg::Options::="--force-confnew" upgrade
apt-get --no-install-recommends --option Dpkg::Options::="--force-confnew" dist-upgrade
systemctl restart ssh
- name: run script to perform safe d10->d12 upgrade (will take a long time)
shell: /tmp/upgrade-d10.sh
- name: install python-is-python3
apt:
name: python-is-python3
state: latest
when: debian_version|int < 11
- name: aptitude upgrade
apt:
upgrade: safe
register: apt_res
retries: 5
until: apt_res is success
- name: aptitude dist upgrade and cleanup
apt:
autoremove: yes
autoclean: yes
upgrade: dist
- name: install python-is-python3
apt:
name: python-is-python3
state: latest
register: apt_res
retries: 5
until: apt_res is success
- name: clean up obsolete kernels
command: /usr/local/sbin/kernel-cleanup.sh
@ -174,6 +289,9 @@
autoremove: yes
autoclean: yes
upgrade: full
register: apt_res
retries: 5
until: apt_res is success
- name: remove obsolete database directories
file:
@ -207,6 +325,12 @@
- name: unset OSD noout
command: pvc storage osd unset noout
- name: wait 30 seconds for system to stabilize
pause:
seconds: 30
become: no
connection: local
- name: unflush node
command: "pvc node ready {{ ansible_hostname }} --wait"
@ -233,28 +357,167 @@
become: no
connection: local
- name: unset PVC maintenance mode
command: pvc cluster maintenance off
# Play 3: Ceph upgrades for Debian 12
- hosts: all
remote_user: deploy
become: yes
become_user: root
gather_facts: yes
tasks:
- name: disable insecure global id reclaim in Ceph
command: ceph config set mon auth_allow_insecure_global_id_reclaim false
run_once: yes
- name: set OSDs to require pacific
command: ceph osd require-osd-release pacific
run_once: yes
- name: backup CRUSH map
command: ceph osd getcrushmap -o /srv/backups/backup-crushmap-deb12-upgrade
- block:
- name: disable insecure global id reclaim in Ceph
command: ceph config set mon auth_allow_insecure_global_id_reclaim false
- name: set OSDs to require pacific
command: ceph osd require-osd-release pacific
- name: update CRUSH map
command: ceph osd crush set-all-straw-buckets-to-straw2
- name: set OSDs to quick fsck
command: ceph config set osd bluestore_fsck_quick_fix_on_mount true
run_once: yes
- name: update CRUSH map
command: ceph osd crush set-all-straw-buckets-to-straw2
run_once: yes
- name: restart ceph-mgr
service:
name: "ceph-mgr@{{ ansible_hostname }}"
state: restarted
# Play 4: Ceph OSD restart (serial per-node)
# Restarts the Ceph OSDs one node at a time (serial: 1) and waits for the
# cluster to settle before moving on to the next node.
- hosts: all
  remote_user: deploy
  become: yes
  become_user: root
  gather_facts: yes
  serial: 1
  tasks:
    - name: restart all OSDs on node
      shell: "systemctl restart --all ceph-osd@*"

    # Prints OK when fields 1 and 3 of the "osds:" line of `ceph osd stat`
    # are equal (presumably total vs. up OSD counts - confirm), NOK otherwise.
    # Polled up to 60 times, 10 seconds apart.
    - name: make sure all OSDs are active
      shell: "ceph osd stat | grep 'osds:' | awk '{ if ( $1 == $3 ) { print \"OK\" } else { print \"NOK\" } }'"
      register: osdstat
      failed_when: osdstat.stdout == "NOK"
      until: osdstat.stdout == "OK"
      retries: 60
      delay: 10

    # grep -o prints the phrase only while `ceph health` still reports it, so
    # empty output means recovery is done. failed_when must compare against
    # exactly what grep prints; it also replaces the default rc-based failure,
    # so grep's non-zero exit on "no match" does not fail the task.
    - name: make sure all PGs have recovered
      shell: "ceph health | grep -wo 'Degraded data redundancy'"
      register: cephhealth
      failed_when: cephhealth.stdout == "Degraded data redundancy"
      until: cephhealth.stdout == ""
      retries: 60
      delay: 10
# Play 5: Patroni upgrades for Debian 12
# Stops Patroni on all nodes, upgrades the PostgreSQL data directory to
# new_postgres_version with pg_upgrade, then starts Patroni again and
# restarts pvcnoded.
# Uses patroni_leader, patroni_followers and old_postgres_bin_dir, which are
# not defined in this play (the set_fact tasks of Play 1 define them).
- hosts: all
remote_user: deploy
become: yes
become_user: root
gather_facts: yes
vars:
new_postgres_version: 15
tasks:
# Stop order: followers first, then the leader; each task is issued once and
# delegated to the node concerned.
- name: stop patroni service on followers to perform database upgrade
service:
name: patroni
state: stopped
run_once: yes
delegate_to: "{{ item }}"
loop: "{{ patroni_followers }}"
- name: stop patroni service on leader to perform database upgrade
service:
name: patroni
state: stopped
run_once: yes
delegate_to: "{{ patroni_leader }}"
# Upgrade sequence: initdb a fresh data directory for the new version, enable
# checksums on it, pg_upgrade from /var/lib/postgresql/patroni/pvc into it,
# then swap the directories (old one is kept as pvc.old).
# NOTE(review): nesting is not visible in this view; the run_once/delegate_to
# that follow "start patroni service on leader" may be block-level keys that
# restrict this whole block to the leader - confirm against the real file.
- block:
- name: initialize new postgres database
shell:
cmd: "sudo -u postgres /usr/lib/postgresql/{{ new_postgres_version }}/bin/initdb -D /var/lib/postgresql/{{ new_postgres_version }}/pvc"
chdir: "/var/lib/postgresql"
- name: enable data checksums in new database
shell:
cmd: "sudo -u postgres /usr/lib/postgresql/{{ new_postgres_version }}/bin/pg_checksums --enable /var/lib/postgresql/{{ new_postgres_version }}/pvc"
chdir: "/var/lib/postgresql"
- name: run postgres upgrade
shell:
cmd: "sudo -u postgres /usr/lib/postgresql/{{ new_postgres_version }}/bin/pg_upgrade -b {{ old_postgres_bin_dir }} -d /var/lib/postgresql/patroni/pvc -D /var/lib/postgresql/{{ new_postgres_version }}/pvc"
chdir: "/var/lib/postgresql"
- name: move old postgres database out of the way
shell:
cmd: "sudo -u postgres mv /var/lib/postgresql/patroni/pvc /var/lib/postgresql/patroni/pvc.old"
chdir: "/var/lib/postgresql"
- name: move new postgres database into place
shell:
cmd: "sudo -u postgres mv /var/lib/postgresql/{{ new_postgres_version }}/pvc /var/lib/postgresql/patroni/pvc"
chdir: "/var/lib/postgresql"
- name: ensure recovery.conf is absent
file:
dest: /var/lib/postgresql/patroni/pvc/recovery.conf
state: absent
# Removes the /patroni/pvc tree from ZooKeeper via the local server on port 2181.
- name: delete patroni cluster znodes
shell: "/usr/share/zookeeper/bin/zkCli.sh -server {{ ansible_hostname }}:2181 deleteall /patroni/pvc"
# masked: no undoes the masking applied when Patroni was stopped in Play 1.
- name: start patroni service on leader
service:
name: patroni
state: started
masked: no
run_once: yes
delegate_to: "{{ patroni_leader }}"
# The followers' data directory is deleted before Patroni is started on them;
# presumably they then re-sync from the upgraded leader - confirm.
- name: remove old data directory on patroni followers
file:
dest: /var/lib/postgresql/patroni/pvc
state: absent
run_once: yes
delegate_to: "{{ item }}"
loop: "{{ patroni_followers }}"
- name: start patroni service on followers
service:
name: patroni
state: started
masked: no
run_once: yes
delegate_to: "{{ item }}"
loop: "{{ patroni_followers }}"
# The pause runs on the control machine (connection: local, no privilege escalation).
- name: wait 30 seconds for system to stabilize
pause:
seconds: 30
become: no
connection: local
- name: restart pvcnoded on all nodes
service:
name: pvcnoded
state: restarted
- name: wait 30 seconds for system to stabilize
pause:
seconds: 30
become: no
connection: local
# Run the primary-coordinator switch once, on the first host of the play.
# The task is delegated directly rather than via `loop`: `loop` only accepts
# a list, and ansible_play_hosts[0] is a single hostname string.
- name: set first node as primary coordinator
  command: "pvc node primary --wait"
  run_once: yes
  delegate_to: "{{ ansible_play_hosts[0] }}"

# No run_once here, so this is executed for every host in the play.
- name: unset PVC maintenance mode
  command: pvc cluster maintenance off

View File

@ -2,6 +2,10 @@
#
# First run check
#
- name: regather facts
setup:
tags: always
- name: check if this is a new instance
shell: "echo 'bootstrapped' > /etc/pvc-install.base"
args:
@ -76,8 +80,7 @@
- name: regather facts
setup:
when: installed_facts.changed
tags: base-ansible
tags: always
- debug:
var: ansible_local.host_group
@ -125,6 +128,9 @@
apt:
update-cache: yes
when: (newhost is defined and newhost) or apt_config.changed or apt_key.changed
register: apt_res
retries: 5
until: apt_res is success
tags: base-apt
- name: aptitude safe upgrade with autoremove
@ -133,6 +139,9 @@
autoremove: yes
upgrade: safe
when: newhost is defined and newhost
register: apt_res
retries: 5
until: apt_res is success
tags: base-apt
- name: install dbus
@ -141,6 +150,9 @@
- dbus
state: latest
when: newhost is defined and newhost
register: apt_res
retries: 5
until: apt_res is success
tags: base-apt
- name: clean out apt cache
@ -165,6 +177,9 @@
state: absent
purge: yes
autoremove: yes
register: apt_res
retries: 5
until: apt_res is success
tags: base-packages
- name: set override debconf selections
@ -177,14 +192,20 @@
apt:
name:
- python
when: debian_codename == 'buster'
when: debian_version|int <= 10
register: apt_res
retries: 5
until: apt_res is success
tags: base-packages
- name: install common packages (bullseye)
- name: install common packages (bullseye+)
apt:
name:
- python-is-python3
when: debian_codename == 'bullseye'
when: debian_version|int >= 11
register: apt_res
retries: 5
until: apt_res is success
tags: base-packages
- name: install common packages (all versions)
@ -259,6 +280,7 @@
- bzip2
- lzop
- xz-utils
- zstd
- haveged
- cpufrequtils
- lm-sensors
@ -269,6 +291,9 @@
- plymouth-themes
- linux-image-amd64
- linux-headers-amd64
register: apt_res
retries: 5
until: apt_res is success
tags: base-packages
- name: install cleanup scripts
@ -833,6 +858,7 @@
name: "{{ deploy_username }}"
uid: 200
group: operator
groups: operator
shell: /bin/bash
home: "/var/home/{{ deploy_username }}"
createhome: yes

View File

@ -0,0 +1,14 @@
# apt configuration: main sources list
# {{ ansible_managed }}
# non-free-firmware is enabled on every binary suite (release, -security and
# -updates) so that firmware/microcode fixes published outside the release
# pocket are offered as well.
deb {{ debian_main_repository }} bookworm main contrib non-free-firmware
deb-src {{ debian_main_repository }} bookworm main contrib
deb {{ debian_security_repository }} bookworm-security main contrib non-free-firmware
deb-src {{ debian_security_repository }} bookworm-security main contrib
deb {{ debian_main_repository }} bookworm-updates main contrib non-free-firmware
deb-src {{ debian_main_repository }} bookworm-updates main contrib
deb {{ debian_pvc_repository }} bookworm pvc
deb-src {{ debian_pvc_repository }} bookworm pvc

View File

@ -1,7 +1,7 @@
# apt configuration: main sources list
# {{ ansible_managed }}
deb {{ debian_main_repository }} bullseye main contrib non-free
deb {{ debian_main_repository }} bullseye main contrib non-free-firmware
deb-src {{ debian_main_repository }} bullseye main contrib
deb {{ debian_security_repository }} bullseye-security main contrib

View File

@ -2,7 +2,7 @@
# {{ ansible_managed }}
auto {{ network.value['device'] }}
iface {{ network.value['device'] }} inet manual
iface {{ network.value['device'] }} inet {{ network.value['mode']|default('manual') }}
post-up ip link set $IFACE mtu {{ network.value['mtu'] }}
{% if network.value['type'] == 'bond' %}
bond-mode {{ network.value['bond_mode'] }}

View File

@ -8,6 +8,9 @@
- ceph-mgr
- radosgw
state: latest
register: apt_res
retries: 5
until: apt_res is success
- name: add admin users to ceph groups
user:

View File

@ -4,6 +4,9 @@
name:
- frr
state: latest
register: apt_res
retries: 5
until: apt_res is success
- name: install frr configuration
template:

View File

@ -10,6 +10,9 @@
- ceph-common
- libguestfs-tools
state: latest
register: apt_res
retries: 5
until: apt_res is success
- name: add libvirt user to ceph group
user:
@ -89,6 +92,7 @@
name: "{{ item }}"
state: started
enabled: yes
ignore_errors: yes
with_items:
- vhostmd

View File

@ -61,11 +61,36 @@
tags: pvc-cputuning
when: debian_version|int >= 11 and cpu_tuning is defined
- name: restart server on first install
shell: 'sleep 3 && shutdown -r now "Ansible updates triggered"'
async: 1
poll: 0
ignore_errors: yes
become: yes
- block:
- name: restart server on first install
reboot:
post_reboot_delay: 30
reboot_timeout: 1800
- name: wait 90 seconds for system to stabilize
pause:
seconds: 90
become: no
connection: local
- name: restart pvcnoded on first install
service:
name: pvcnoded
state: restarted
throttle: 1
ignore_errors: yes
- name: wait 30 seconds for system to stabilize
pause:
seconds: 30
become: no
connection: local
# Run the primary-coordinator switch once, on the first host of the play.
# Delegated directly instead of through `loop`, because `loop` only accepts a
# list and ansible_play_hosts[0] is a single hostname string.
- name: set first node as primary coordinator on first install
  command: "pvc node primary --wait"
  run_once: yes
  delegate_to: "{{ ansible_play_hosts[0] }}"
when: newhost is defined and newhost
tags: always

View File

@ -8,7 +8,7 @@
- patroni
- postgresql-11
postgresql_version: 11
when: debian_version|int <= 10
when: debian_version|int == 10
- set_fact:
package_list:
@ -17,13 +17,25 @@
- patroni
- postgresql
postgresql_version: 13
when: debian_version|int >= 11
when: debian_version|int == 11
- set_fact:
package_list:
- python3-psycopg2
- python3-kazoo
- patroni
- postgresql
postgresql_version: 15
when: debian_version|int == 12
- name: install patroni packages via apt
apt:
name: "{{ package_list }}"
state: latest
update-cache: yes
register: apt_res
retries: 5
until: apt_res is success
- name: stop and disable postgresql
service:

View File

@ -13,6 +13,9 @@
- rinse
state: latest
when: newhost is defined and newhost
register: apt_res
retries: 5
until: apt_res is success
- name: install pvc node daemon configuration
template:

View File

@ -5,6 +5,9 @@
- zookeeper
- zookeeper-bin
state: latest
register: apt_res
retries: 5
until: apt_res is success
- name: install zookeeper configuration
template:

View File

@ -19,21 +19,25 @@ bootstrap:
postgresql:
use_pg_rewind: true
parameters:
{% if debian_version|int >= 12 %}
wal_keep_size: 64
{% else %}
wal_keep_segments: 64
{% endif %}
max_wal_senders: 8
max_replication_slots: 8
initdb:
- encoding: UTF8
- data-checksums
- encoding: UTF8
- data-checksums
pg_hba:
- local all all peer
- host replication replicator 127.0.0.1/32 trust
- local all all peer
- host replication replicator 127.0.0.1/32 trust
{% for node in pvc_nodes if node.is_coordinator %}
- host replication replicator {{ node.cluster_ip }}/32 trust
- host replication replicator {{ node.cluster_ip }}/32 trust
{% endfor %}
- host all all 0.0.0.0/0 md5
- host all all 0.0.0.0/0 md5
users:
admin:
@ -61,9 +65,26 @@ postgresql:
password: '{{ pvc_superuser_database_password }}'
parameters:
unix_socket_directories: '/run/postgresql'
{% if debian_version|int >= 12 %}
wal_keep_size: 64
{% else %}
wal_keep_segments: 64
{% endif %}
max_wal_senders: 8
max_replication_slots: 8
pg_hba:
- local all all trust
- host all all 127.0.0.1/32 trust
- host all all ::1/128 trust
- local replication all trust
- host replication all 127.0.0.1/32 trust
- host replication all ::1/128 trust
- local all all peer
- host replication replicator 127.0.0.1/32 trust
{% for node in pvc_nodes if node.is_coordinator %}
- host replication replicator {{ node.cluster_ip }}/32 trust
{% endfor %}
- host all all 0.0.0.0/0 md5
tags:
nofailover: false