diff --git a/group_vars/default/pvc.yml b/group_vars/default/pvc.yml
index 531dd15..8daf9a5 100644
--- a/group_vars/default/pvc.yml
+++ b/group_vars/default/pvc.yml
@@ -13,8 +13,6 @@
 #pvc_log_console_lines: 1000 # The number of VM console log lines to store in Zookeeper for 'vm log' commands.
 #pvc_log_node_lines: 2000 # The number of node log lines to store in Zookeeper for 'node log' commands.

-# Timing and fencing configuration (uncomment to override defaults)
-
 # Timing and fencing configuration (uncomment to override defaults)
 # These default options are generally best for most clusters; override these if you want more granular
 # control over the timings of various areas of the cluster, for instance if your hardware is slow or error-prone.
@@ -36,8 +34,8 @@ pvc_api_secret_key: "" # Use pwgen to generate
 # The first token will always be used for the "local" connection, and thus at least one token MUST be defined.
 pvc_api_enable_authentication: True
 pvc_api_tokens:
-  - description: "myuser"
-    token: "a3945326-d36c-4024-83b3-2a8931d7785a"
+#  - description: "myuser"
+#    token: "a3945326-d36c-4024-83b3-2a8931d7785a"

 # PVC API SSL configuration
 # Use these options to enable SSL for the API listener, providing security over WAN connections.
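As a quick aside on the values referenced above (not part of this diff, and the exact commands are only a suggestion): pwgen and uuidgen are ordinary Debian utilities that can generate suitable values before they are placed in pvc.yml.

pwgen -s 64 1    # a random value for pvc_api_secret_key
uuidgen          # a token value for a pvc_api_tokens entry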
@@ -76,22 +74,7 @@ pvc_asn: "65500"
 pvc_routers:
   - "192.168.100.1"

-# Memory tuning
-# > Uncomment these options in low-memory situations (nodes with <32GB RAM).
-# OSD memory limit - 939524096 (~900MB) is the lowest possible value; default is 4GB.
-# > This option is *only* applied at cluster bootstrap and cannot be changed later
-# here, only by editing the `files/ceph//ceph.conf` file directly.
-#pvc_osd_memory_limit: 939524096
-# Zookeeper heap memory limit, sets Xms and Xmx values to the Java process; default is 512M.
-# > WARNING: Unless you have an extremely limited amount of RAM, changing this setting is NOT RECOMMENDED.
-# Lowering the heap limit may cause poor performance or crashes in Zookeeper during some tasks.
-#pvc_zookeeper_heap_limit: 128M # 1/4 of default
-# Zookeeper stack memory limit, sets Xss value to the Java process; default is 1024M.
-# > WARNING: Unless you have an extremely limited amount of RAM, changing this setting is NOT RECOMMENDED.
-# Lowering the stack limit may cause poor performance or crashes in Zookeeper during some tasks.
-#pvc_zookeeper_stack_limit: 256M # 1/4 of default
-
-# Node list
+# PVC Node list
 # > Every node configured with this playbook must be specified in this list.
 pvc_nodes:
   - hostname: "pvchv1" # This name MUST match the Ansible inventory_hostname
@@ -143,8 +126,76 @@ pvc_sriov_enable: False
 #    mtu: 9000
 #    vfcount: 6

+# Memory tuning
+# > ADVANCED TUNING: For most users, this is unnecessary and PVC will run fine with the default memory
+# allocations. Uncomment these options only in low-memory situations (nodes with <32GB RAM).
+#
+# OSD memory limit - 939524096 (~900MB) is the lowest possible value; default is 4GB.
+# > This option is *only* applied at cluster bootstrap and cannot be changed later
+# here, only by editing the `files/ceph//ceph.conf` file directly.
+#pvc_osd_memory_limit: 939524096
+#
+# Zookeeper heap memory limit, sets Xms and Xmx values to the Java process; default is 512M.
+# > WARNING: Unless you have an extremely limited amount of RAM, changing this setting is NOT RECOMMENDED.
+# Lowering the heap limit may cause poor performance or crashes in Zookeeper during some tasks.
+#pvc_zookeeper_heap_limit: 128M # 1/4 of default
+#
+# Zookeeper stack memory limit, sets Xss value to the Java process; default is 1024M.
+# > WARNING: Unless you have an extremely limited amount of RAM, changing this setting is NOT RECOMMENDED.
+# Lowering the stack limit may cause poor performance or crashes in Zookeeper during some tasks.
+#pvc_zookeeper_stack_limit: 256M # 1/4 of default
+
+# CPU pinning configuration via cset
+# > ADVANCED TUNING: For most users, this is unnecessary and PVC will run fine with the default scheduling.
+# > These options can be set to maximize the CPU performance of the Ceph subsystem. Because Ceph OSD
+# performance is limited more by CPU than by anything else, for users with a lot of relatively slow CPU
+# cores, or for those looking to get maximum storage performance, tuning the pinning options here might
+# provide noticeable benefits.
+# > This configuration makes use of the cset command and will dedicate a specific number of CPU cores to the
+# Ceph OSD processes on each node. This is accomplished by using cset's shield mechanism to create a cgroup
+# which will contain only Ceph OSD processes, while putting everything else onto the remaining CPUs.
+# > Avoid using this tuning if you have fewer than 8 total CPU cores (excluding SMT threads). Otherwise, you
+# might not have enough CPU cores to properly run VMs, unless you are very careful with vCPU allocation.
+# > Like the 'pvc_nodes' dictionary, these options are set per-host, even if all hosts are identical. This
+# is required to handle situations where hosts might have different CPU topologies. Each host can have a
+# specific set of CPUs that are included in the shield.
+# > Ensure that you know which CPU cores are "real" and which are SMT "threads". This can be obtained using
+# the 'virsh capabilities' command and noting the 'siblings' entries for each CPU.
+# > Ensure you consider NUMA nodes when setting up this tuning. Generally speaking it is better to keep the
+# OSD processes on one NUMA node for simplicity; more advanced tuning is outside of the scope of this
+# playbook.
+# > You should set a number of cores in the shield (along with their respective SMT threads) equal to the
+# number of OSDs in the system. This can be adjusted later as needed. For instance, if you have 2 OSDs per
+# node, and each node has a 10-core SMT-capable CPU, you would want to assign cores 0 and 1 (the first two
+# real cores) and 10 and 11 (the SMT siblings of those cores in 'virsh capabilities') in the cset.
+#
+# The shield mode is disabled by default and a commented-out example configuration is shown.
+pvc_shield_osds_enable: False
+#pvc_shield_osds_cset:
+#  # This example host has 2x 6-core SMT-enabled CPUs; we want to use cores 0 (+SMT 12) and 2 (+SMT 14), which are
+#  # both on physical CPU 0, for 2x OSDs.
+#  - hostname: pvchv1
+#    osd_cset:
+#      - 0
+#      - 2
+#      - 12
+#      - 14
+#  # These example hosts each have 1x 8-core SMT-enabled CPU; we want to use cores 0 (+SMT 8) and 1 (+SMT 9) for 2x OSDs.
+#  - hostname: pvchv2
+#    osd_cset:
+#      - 0
+#      - 1
+#      - 8
+#      - 9
+#  - hostname: pvchv3
+#    osd_cset:
+#      - 0
+#      - 1
+#      - 8
+#      - 9
+
 # Configuration file networks
-# > Taken from base.yml's configuration; do not modify this section.
+# > Taken from base.yml's configuration; DO NOT MODIFY THIS SECTION.
 pvc_upstream_device: "{{ networks['upstream']['device'] }}"
 pvc_upstream_mtu: "{{ networks['upstream']['mtu'] }}"
 pvc_upstream_domain: "{{ networks['upstream']['domain'] }}"
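As a sketch of how the core/SMT-sibling layout referenced in the comments above might be gathered before filling in pvc_shield_osds_cset (standard util-linux and libvirt tooling, not something this diff adds):

lscpu --extended=CPU,CORE,SOCKET,NODE                          # logical CPUs sharing a CORE id are SMT siblings; NODE shows the NUMA placement
virsh capabilities | grep -o "siblings='[0-9,]*'" | sort -u    # the same pairing as libvirt reports it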
diff --git a/roles/pvc/tasks/ceph/main.yml b/roles/pvc/tasks/ceph/main.yml
index ea4813e..6d17f62 100644
--- a/roles/pvc/tasks/ceph/main.yml
+++ b/roles/pvc/tasks/ceph/main.yml
@@ -117,4 +117,55 @@
     - ceph-mon@{{ ansible_hostname }}
     - ceph-mgr@{{ ansible_hostname }}

+# System OSD CPU shielding activation
+- block:
+    - name: install packages
+      apt:
+        name:
+          - cpuset
+          - numactl
+        state: latest
+
+    - name: install ceph-osd-cpuset script
+      template:
+        src: ceph/ceph-osd-cpuset.j2
+        dest: /usr/local/sbin/ceph-osd-cpuset
+        mode: 0755
+
+    - name: install ceph-osd-cpuset service unit
+      template:
+        src: ceph/ceph-osd-cpuset.service.j2
+        dest: /etc/systemd/system/ceph-osd-cpuset.service
+      register: systemd_file_cpuset
+
+    - name: create ceph-osd override dropin directory
+      file:
+        dest: /etc/systemd/system/ceph-osd@.service.d
+        state: directory
+
+    - name: install ceph-osd override dropin
+      template:
+        src: ceph/ceph-osd-cpuset.conf.j2
+        dest: /etc/systemd/system/ceph-osd@.service.d/cpuset.conf
+      register: systemd_file_osd
+
+    - name: reload systemd to apply previous changes
+      command: "systemctl daemon-reload"
+      when: systemd_file_cpuset.changed or systemd_file_osd.changed
+
+    - name: enable ceph-osd-cpuset service
+      service:
+        name: ceph-osd-cpuset
+        enabled: yes
+
+    - debug:
+        msg: "NOTICE: cpuset configs have NOT been applied to the running system. This node must be rebooted to apply these changes."
+      when: systemd_file_cpuset.changed or systemd_file_osd.changed
+  tags: pvc-ceph-cpuset
+  when:
+    - pvc_shield_osds_enable is defined
+    - pvc_shield_osds_enable
+    - pvc_shield_osds_cset is defined
+    - pvc_shield_osds_cset | selectattr('hostname', 'equalto', inventory_hostname) | list | count > 0
+
 - meta: flush_handlers
diff --git a/roles/pvc/tasks/main.yml b/roles/pvc/tasks/main.yml
index 641d0d0..1de19dd 100644
--- a/roles/pvc/tasks/main.yml
+++ b/roles/pvc/tasks/main.yml
@@ -23,17 +23,9 @@
   when: newhost is defined and newhost
   tags: always

-# General blacklisting of modules
-- name: add module blacklist
-  template:
-    src: system/blacklist.j2
-    dest: /etc/modprobe.d/blacklist.conf
-
-# Logrotate configuration
-- name: add logrotate configuration
-  template:
-    src: system/pvc.j2
-    dest: /etc/logrotate.d/pvc
+# Install system tweaks
+- include: system/main.yml
+  tags: pvc-system

 # Install base databases (coordinators only)
 - include: ceph/main.yml
diff --git a/roles/pvc/tasks/system/main.yml b/roles/pvc/tasks/system/main.yml
new file mode 100644
index 0000000..770c2c8
--- /dev/null
+++ b/roles/pvc/tasks/system/main.yml
@@ -0,0 +1,14 @@
+---
+# General blacklisting of modules
+- name: add module blacklist
+  template:
+    src: system/blacklist.j2
+    dest: /etc/modprobe.d/blacklist.conf
+
+# Logrotate configuration
+- name: add logrotate configuration
+  template:
+    src: system/pvc.j2
+    dest: /etc/logrotate.d/pvc
+
+- meta: flush_handlers
diff --git a/roles/pvc/templates/ceph/ceph-osd-cpuset.conf.j2 b/roles/pvc/templates/ceph/ceph-osd-cpuset.conf.j2
new file mode 100644
index 0000000..13deae2
--- /dev/null
+++ b/roles/pvc/templates/ceph/ceph-osd-cpuset.conf.j2
@@ -0,0 +1,5 @@
+# ceph-osd@.service overrides for cpuset
+# {{ ansible_managed }}
+[Service]
+ExecStart =
+ExecStart = /usr/bin/cset proc --set=osd --exec /usr/bin/ceph-osd -- -f --cluster ${CLUSTER} --id %i --setuser ceph --setgroup ceph
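The drop-in above relies on the standard systemd override idiom: the first, empty ExecStart = clears the packaged command so the cset-wrapped one replaces it rather than being appended. To confirm the override is picked up on a node, the merged unit for an instance (OSD 0 here, purely as an example) can be inspected with plain systemd tooling:

systemctl cat ceph-osd@0.service    # shows the packaged unit followed by the cpuset.conf drop-in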
diff --git a/roles/pvc/templates/ceph/ceph-osd-cpuset.j2 b/roles/pvc/templates/ceph/ceph-osd-cpuset.j2
new file mode 100755
index 0000000..9c3b4f7
--- /dev/null
+++ b/roles/pvc/templates/ceph/ceph-osd-cpuset.j2
@@ -0,0 +1,63 @@
+#!/bin/bash
+# PVC Ceph OSD cpuset preparation script
+# {{ ansible_managed }}
+
+# This script is designed to prepare the cpusets for use by Ceph OSDs, VMs, and other system resources.
+# Libvirt does not make this easy with any way to globally set its CPUs, so we must do this trickery.
+{% set cset_host = pvc_shield_osds_cset | selectattr('hostname', 'equalto', inventory_hostname) %}
+
+A_OSD_CPUS=( {{ cset_host[0]['osd_cset'] | join(' ') }} )
+A_SYS_CPUS=()
+
+CPU_INFO="$( lscpu )"
+
+# First, we must determine how many NUMA nodes we have
+NUMA_COUNT="$( grep '^NUMA node(s)' <<<"${CPU_INFO}" | awk '{ print $NF }' )"
+
+# If we have 1 NUMA node, our SYS_MEMS is 0; otherwise it's 0-X
+# This is needed to explicitly set our memspec during the set
+if [[ ${NUMA_COUNT} -eq 1 ]]; then
+    SYS_MEMS="0"
+else
+    SYS_MEMS="0-$(( ${NUMA_COUNT} - 1 ))"
+fi
+
+# We must determine which NUMA nodes our OSD CPUs are in for the memspec during the set
+A_OSD_MEMS=()
+for CPU in ${A_OSD_CPUS[@]}; do
+    NODE="$( grep -E '^NUMA node[0-9]+ CPU' <<<"${CPU_INFO}" | grep -w "${CPU}" | awk '{ print $2 }' | sed 's/node//' )"
+    if [[ ! " ${A_OSD_MEMS[*]} " =~ " ${NODE} " ]]; then
+        A_OSD_MEMS+=( $NODE )
+    fi
+done
+
+# Determine our CPU count
+CPU_COUNT="$( grep '^CPU(s)' <<<"${CPU_INFO}" | awk '{ print $NF }' )"
+echo "CPU count: ${CPU_COUNT}"
+
+# Loop through all the CPUs in the count; if they are not in OSD_CPUS, add them to the SYS_CPUS array
+for i in $( seq 0 $(( ${CPU_COUNT} - 1 )) ); do
+    if [[ ! " ${A_OSD_CPUS[*]} " =~ " ${i} " ]]; then
+        A_SYS_CPUS+=( $i )
+    fi
+done
+
+# Convert arrays into CSV
+OSD_MEMS="$( IFS=, ; echo "${A_OSD_MEMS[*]}" )"
+OSD_CPUS="$( IFS=, ; echo "${A_OSD_CPUS[*]}" )"
+SYS_CPUS="$( IFS=, ; echo "${A_SYS_CPUS[*]}" )"
+
+echo "OSD CPUs: ${OSD_CPUS}"
+echo "OSD Mems: ${OSD_MEMS}"
+echo "System/VM CPUs: ${SYS_CPUS}"
+echo "System/VM Mems: ${SYS_MEMS}"
+
+# Create the system cpuset and move everything currently running into it
+/usr/bin/cset set --cpu=${SYS_CPUS} --mem=${SYS_MEMS} system
+/usr/bin/cset proc --move --force --threads root --toset=system
+
+# Create our Libvirt cpuset (identical to system cpuset)
+/usr/bin/cset set --cpu=${SYS_CPUS} --mem=${SYS_MEMS} machine
+
+# Create our OSD cpuset
+/usr/bin/cset set --cpu=${OSD_CPUS} --mem=${OSD_MEMS} osd
diff --git a/roles/pvc/templates/ceph/ceph-osd-cpuset.service.j2 b/roles/pvc/templates/ceph/ceph-osd-cpuset.service.j2
new file mode 100644
index 0000000..0451a3d
--- /dev/null
+++ b/roles/pvc/templates/ceph/ceph-osd-cpuset.service.j2
@@ -0,0 +1,13 @@
+# PVC Ceph OSD cpuset service unit
+# {{ ansible_managed }}
+{% set cset_host = pvc_shield_osds_cset | selectattr('hostname', 'equalto', inventory_hostname) %}
+[Unit]
+Description = Ceph OSD cpuset shield creation
+Before = ceph-osd@.service libvirtd.service
+
+[Service]
+Type = oneshot
+ExecStart = /usr/local/sbin/ceph-osd-cpuset
+
+[Install]
+WantedBy = ceph.target
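Taken together, a minimal rollout of this change on a single node might look like the sketch below. The playbook and inventory names are assumptions; the tag names come from this diff, and the cset commands are standard cpuset tooling rather than anything the diff adds.

# Apply only the new system tweaks and OSD cpuset tasks to one node
ansible-playbook -i hosts pvc.yml -l pvchv1 -t pvc-system,pvc-ceph-cpuset

# After rebooting the node (as the NOTICE in the ceph tasks warns), verify the shield:
cset set --list                 # expect 'system', 'machine' and 'osd' sets under the root set
cset proc --list --set=osd      # the ceph-osd processes should be the only tasks here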