From 6e48d6fe84d03beff5a2c082227d793e43cc3651 Mon Sep 17 00:00:00 2001
From: "Joshua M. Boniface"
Date: Fri, 1 Sep 2023 15:42:27 -0400
Subject: [PATCH] Add Ceph OSD cpuset tuning options

Allows an administrator to set CPU pinning with the cpuset tool for
Ceph OSDs, in situations where CPU contention with VMs or other system
tasks may be negatively affecting OSD performance.

This is optional, advanced tuning and is disabled by default.
---
 group_vars/default/pvc.yml                    | 93 ++++++++++++++-----
 roles/pvc/tasks/ceph/main.yml                 | 51 ++++++++++
 roles/pvc/tasks/main.yml                      | 14 +--
 roles/pvc/tasks/system/main.yml               | 14 +++
 .../templates/ceph/ceph-osd-cpuset.conf.j2    |  5 +
 roles/pvc/templates/ceph/ceph-osd-cpuset.j2   | 63 +++++++++++++
 .../templates/ceph/ceph-osd-cpuset.service.j2 | 13 +++
 7 files changed, 221 insertions(+), 32 deletions(-)
 create mode 100644 roles/pvc/tasks/system/main.yml
 create mode 100644 roles/pvc/templates/ceph/ceph-osd-cpuset.conf.j2
 create mode 100755 roles/pvc/templates/ceph/ceph-osd-cpuset.j2
 create mode 100644 roles/pvc/templates/ceph/ceph-osd-cpuset.service.j2

diff --git a/group_vars/default/pvc.yml b/group_vars/default/pvc.yml
index 531dd15..8daf9a5 100644
--- a/group_vars/default/pvc.yml
+++ b/group_vars/default/pvc.yml
@@ -13,8 +13,6 @@
 #pvc_log_console_lines: 1000 # The number of VM console log lines to store in Zookeeper for 'vm log' commands.
 #pvc_log_node_lines: 2000 # The number of node log lines to store in Zookeeper for 'node log' commands.
 
-# Timing and fencing configuration (uncomment to override defaults)
-
 # Timing and fencing configuration (uncomment to override defaults)
 # These default options are generally best for most clusters; override these if you want more granular
 # control over the timings of various areas of the cluster, for instance if your hardware is slow or error-prone.
@@ -36,8 +34,8 @@ pvc_api_secret_key: "" # Use pwgen to generate
 # The first token will always be used for the "local" connection, and thus at least one token MUST be defined.
 pvc_api_enable_authentication: True
 pvc_api_tokens:
-  - description: "myuser"
-    token: "a3945326-d36c-4024-83b3-2a8931d7785a"
+#  - description: "myuser"
+#    token: "a3945326-d36c-4024-83b3-2a8931d7785a"
 
 # PVC API SSL configuration
 # Use these options to enable SSL for the API listener, providing security over WAN connections.
@@ -76,22 +74,7 @@ pvc_asn: "65500"
 pvc_routers:
   - "192.168.100.1"
 
-# Memory tuning
-# > Uncomment these options in low-memory situations (nodes with <32GB RAM).
-# OSD memory limit - 939524096 (~900MB) is the lowest possible value; default is 4GB.
-# > This option is *only* applied at cluster bootstrap and cannot be changed later
-#   here, only by editing the `files/ceph//ceph.conf` file directly.
-#pvc_osd_memory_limit: 939524096
-# Zookeeper heap memory limit, sets Xms and Xmx values to the Java process; default is 512M.
-# > WARNING: Unless you have an extremely limited amount of RAM, changing this setting is NOT RECOMMENDED.
-#   Lowering the heap limit may cause poor performance or crashes in Zookeeper during some tasks.
-#pvc_zookeeper_heap_limit: 128M # 1/4 of default
-# Zookeeper stack memory limit, sets Xss value to the Java process; default is 1024M.
-# > WARNING: Unless you have an extremely limited amount of RAM, changing this setting is NOT RECOMMENDED.
-#   Lowering the stack limit may cause poor performance or crashes in Zookeeper during some tasks.
-#pvc_zookeeper_stack_limit: 256M # 1/4 of default
-
-# Node list
+# PVC Node list
 # > Every node configured with this playbook must be specified in this list.
 pvc_nodes:
   - hostname: "pvchv1" # This name MUST match the Ansible inventory_hostname
@@ -143,8 +126,76 @@ pvc_sriov_enable: False
 #    mtu: 9000
 #    vfcount: 6
 
+# Memory tuning
+# > ADVANCED TUNING: For most users, this is unnecessary and PVC will run fine with the default memory
+#   allocations. Uncomment these options only in low-memory situations (nodes with <32GB RAM).
+#
+# OSD memory limit - 939524096 (~900MB) is the lowest possible value; default is 4GB.
+# > This option is *only* applied at cluster bootstrap and cannot be changed later
+#   here, only by editing the `files/ceph//ceph.conf` file directly.
+#pvc_osd_memory_limit: 939524096
+#
+# Zookeeper heap memory limit, sets Xms and Xmx values to the Java process; default is 512M.
+# > WARNING: Unless you have an extremely limited amount of RAM, changing this setting is NOT RECOMMENDED.
+#   Lowering the heap limit may cause poor performance or crashes in Zookeeper during some tasks.
+#pvc_zookeeper_heap_limit: 128M # 1/4 of default
+#
+# Zookeeper stack memory limit, sets Xss value to the Java process; default is 1024M.
+# > WARNING: Unless you have an extremely limited amount of RAM, changing this setting is NOT RECOMMENDED.
+#   Lowering the stack limit may cause poor performance or crashes in Zookeeper during some tasks.
+#pvc_zookeeper_stack_limit: 256M # 1/4 of default
+
+# CPU pinning configuration via cset
+# > ADVANCED TUNING: For most users, this is unnecessary and PVC will run fine with the default scheduling.
+# > These options can be set to maximize the CPU performance of the Ceph subsystem. Because Ceph OSD
+#   performance is limited more by CPU than by anything else, for users with a lot of relatively slow CPU
+#   cores, or for those looking to get maximum storage performance, tuning the pinning options here might
+#   provide noticeable benefits.
+# > This configuration makes use of the cset command and will dedicate a specific number of CPU cores to the
+#   Ceph OSD processes on each node. This is accomplished by using cset's shield mechanism to create a cgroup
+#   which will contain only Ceph OSD processes, while putting everything else onto the remaining CPUs.
+# > Avoid using this tuning if you have fewer than 8 total CPU cores (excluding SMT threads). Otherwise, you
+#   might not have enough CPU cores to properly run VMs, unless you are very careful with vCPU allocation.
+# > Like the 'pvc_nodes' dictionary, these options are set per-host, even if all hosts are identical. This
+#   is required to handle situations where hosts might have different CPU topologies. Each host can have a
+#   specific set of CPUs that are included in the shield.
+# > Ensure that you know which CPU cores are "real" and which are SMT "threads". This can be obtained using
+#   the 'virsh capabilities' command and noting the 'siblings' entries for each CPU.
+# > Ensure you consider NUMA nodes when setting up this tuning. Generally speaking it is better to keep the
+#   OSD processes on one NUMA node for simplicity; more advanced tuning is outside of the scope of this
+#   playbook.
+# > You should set a number of cores in the shield (along with their respective SMT threads) equal to the
+#   number of OSDs in the system. This can be adjusted later as needed. For instance, if you have 2 OSDs per
+#   node, and each node has a 10-core SMT-capable CPU, you would want to assign cores 0 and 1 (the first two
+#   real cores) and 10 and 11 (the SMT siblings of those cores in 'virsh capabilities') in the cset.
+#
+# The shield mode is disabled by default and a commented-out example configuration is shown.
+pvc_shield_osds_enable: False
+#pvc_shield_osds_cset:
+#  # This example host has 2x 6-core SMT-enabled CPUs; we want to use cores 0 (+SMT 12) and 2 (+SMT 14), which are
+#  # both on physical CPU 0, for 2x OSDs.
+#  - hostname: pvchv1
+#    osd_cset:
+#      - 0
+#      - 2
+#      - 12
+#      - 14
+#  # These example hosts each have 1x 8-core SMT-enabled CPU; we want to use cores 0 (+SMT 8) and 1 (+SMT 9) for 2x OSDs.
+#  - hostname: pvchv2
+#    osd_cset:
+#      - 0
+#      - 1
+#      - 8
+#      - 9
+#  - hostname: pvchv3
+#    osd_cset:
+#      - 0
+#      - 1
+#      - 8
+#      - 9
+
 # Configuration file networks
-# > Taken from base.yml's configuration; do not modify this section.
+# > Taken from base.yml's configuration; DO NOT MODIFY THIS SECTION.
 pvc_upstream_device: "{{ networks['upstream']['device'] }}"
 pvc_upstream_mtu: "{{ networks['upstream']['mtu'] }}"
 pvc_upstream_domain: "{{ networks['upstream']['domain'] }}"
diff --git a/roles/pvc/tasks/ceph/main.yml b/roles/pvc/tasks/ceph/main.yml
index ea4813e..6d17f62 100644
--- a/roles/pvc/tasks/ceph/main.yml
+++ b/roles/pvc/tasks/ceph/main.yml
@@ -117,4 +117,55 @@
     - ceph-mon@{{ ansible_hostname }}
     - ceph-mgr@{{ ansible_hostname }}
 
+# System OSD CPU shielding activation
+- block:
+    - name: install packages
+      apt:
+        name:
+          - cpuset
+          - numactl
+        state: latest
+
+    - name: install ceph-osd-cpuset script
+      template:
+        src: ceph/ceph-osd-cpuset.j2
+        dest: /usr/local/sbin/ceph-osd-cpuset
+        mode: 0755
+
+    - name: install ceph-osd-cpuset service unit
+      template:
+        src: ceph/ceph-osd-cpuset.service.j2
+        dest: /etc/systemd/system/ceph-osd-cpuset.service
+      register: systemd_file_cpuset
+
+    - name: create ceph-osd override dropin directory
+      file:
+        dest: /etc/systemd/system/ceph-osd@.service.d
+        state: directory
+
+    - name: install ceph-osd override dropin
+      template:
+        src: ceph/ceph-osd-cpuset.conf.j2
+        dest: /etc/systemd/system/ceph-osd@.service.d/cpuset.conf
+      register: systemd_file_osd
+
+    - name: reload systemd to apply previous changes
+      command: "systemctl daemon-reload"
+      when: systemd_file_cpuset.changed or systemd_file_osd.changed
+
+    - name: enable ceph-osd-cpuset service
+      service:
+        name: ceph-osd-cpuset
+        enabled: yes
+
+    - debug:
+        msg: "NOTICE: cpuset configs have NOT been applied to the running system. This node must be rebooted to apply these changes."
+      when: systemd_file_cpuset.changed or systemd_file_osd.changed
+  tags: pvc-ceph-cpuset
+  when:
+    - pvc_shield_osds_enable is defined
+    - pvc_shield_osds_enable
+    - pvc_shield_osds_cset is defined
+    - pvc_shield_osds_cset | selectattr('hostname', 'equalto', inventory_hostname) | list | count > 0
+
 - meta: flush_handlers
diff --git a/roles/pvc/tasks/main.yml b/roles/pvc/tasks/main.yml
index 641d0d0..1de19dd 100644
--- a/roles/pvc/tasks/main.yml
+++ b/roles/pvc/tasks/main.yml
@@ -23,17 +23,9 @@
   when: newhost is defined and newhost
   tags: always
 
-# General blacklisting of modules
-- name: add module blacklist
-  template:
-    src: system/blacklist.j2
-    dest: /etc/modprobe.d/blacklist.conf
-
-# Logrotate configuration
-- name: add logrotate configuration
-  template:
-    src: system/pvc.j2
-    dest: /etc/logrotate.d/pvc
+# Install system tweaks
+- include: system/main.yml
+  tags: pvc-system
 
 # Install base databases (coordinators only)
 - include: ceph/main.yml
diff --git a/roles/pvc/tasks/system/main.yml b/roles/pvc/tasks/system/main.yml
new file mode 100644
index 0000000..770c2c8
--- /dev/null
+++ b/roles/pvc/tasks/system/main.yml
@@ -0,0 +1,14 @@
+---
+# General blacklisting of modules
+- name: add module blacklist
+  template:
+    src: system/blacklist.j2
+    dest: /etc/modprobe.d/blacklist.conf
+
+# Logrotate configuration
+- name: add logrotate configuration
+  template:
+    src: system/pvc.j2
+    dest: /etc/logrotate.d/pvc
+
+- meta: flush_handlers
diff --git a/roles/pvc/templates/ceph/ceph-osd-cpuset.conf.j2 b/roles/pvc/templates/ceph/ceph-osd-cpuset.conf.j2
new file mode 100644
index 0000000..13deae2
--- /dev/null
+++ b/roles/pvc/templates/ceph/ceph-osd-cpuset.conf.j2
@@ -0,0 +1,5 @@
+# ceph-osd@.service overrides for cpuset
+# {{ ansible_managed }}
+[Service]
+ExecStart =
+ExecStart = /usr/bin/cset proc --set=osd --exec /usr/bin/ceph-osd -- -f --cluster ${CLUSTER} --id %i --setuser ceph --setgroup ceph
diff --git a/roles/pvc/templates/ceph/ceph-osd-cpuset.j2 b/roles/pvc/templates/ceph/ceph-osd-cpuset.j2
new file mode 100755
index 0000000..9c3b4f7
--- /dev/null
+++ b/roles/pvc/templates/ceph/ceph-osd-cpuset.j2
@@ -0,0 +1,63 @@
+#!/bin/bash
+# PVC Ceph OSD cpuset preparation script
+# {{ ansible_managed }}
+
+# This script is designed to prepare the cpusets for use by Ceph OSDs, VMs, and other system resources.
+# Libvirt does not make this easy with any way to globally set its CPUs, so we must do this trickery.
+{% set cset_host = pvc_shield_osds_cset | selectattr('hostname', 'equalto', inventory_hostname) %}
+
+A_OSD_CPUS=( {{ cset_host[0]['osd_cset'] | join(' ') }} )
+A_SYS_CPUS=()
+
+CPU_INFO="$( lscpu )"
+
+# First, we must determine how many NUMA nodes we have
+NUMA_COUNT="$( grep '^NUMA node(s)' <<<"${CPU_INFO}" | awk '{ print $NF }' )"
+
+# If we have 1 NUMA node, our SYS_MEMS is 0; otherwise it's 0-X
+# This is needed to explicitly set our memspec during the set
+if [[ ${NUMA_COUNT} -eq 1 ]]; then
+    SYS_MEMS="0"
+else
+    SYS_MEMS="0-$(( ${NUMA_COUNT} - 1 ))"
+fi
+
+# We must determine which NUMA nodes our OSD CPUS are in for the memspec during the set
+A_OSD_MEMS=()
+for CPU in ${A_OSD_CPUS[@]}; do
+    NODE="$( grep -E '^NUMA node[0-9]+ CPU' <<<"${CPU_INFO}" | grep -w "${CPU}" | awk '{ print $2 }' | sed 's/node//' )"
+    if [[ ! " ${A_OSD_MEMS[*]} " =~ " ${NODE} " ]]; then
+        A_OSD_MEMS+=( $NODE )
+    fi
+done
+
+# Determine our CPU count
+CPU_COUNT="$( grep '^CPU(s)' <<<"${CPU_INFO}" | awk '{ print $NF }' )"
+echo "CPU count: ${CPU_COUNT}"
+
+# Loop through all the CPUs in the count; if they are not in OSD_CPUS, add them to the SYS_CPUS array
+for i in $( seq 0 $(( ${CPU_COUNT} - 1)) ); do
+    if [[ ! " ${A_OSD_CPUS[*]} " =~ " ${i} " ]]; then
+        A_SYS_CPUS+=( $i )
+    fi
+done
+
+# Convert arrays into CSV
+OSD_MEMS="$( IFS=, ; echo "${A_OSD_MEMS[*]}" )"
+OSD_CPUS="$( IFS=, ; echo "${A_OSD_CPUS[*]}" )"
+SYS_CPUS="$( IFS=, ; echo "${A_SYS_CPUS[*]}" )"
+
+echo "OSD CPUs: ${OSD_CPUS}"
+echo "OSD Mems: ${OSD_MEMS}"
+echo "System/VM CPUs: ${SYS_CPUS}"
+echo "System/VM Mems: ${SYS_MEMS}"
+
+# Create the system cpuset and move everything currently running into it
+/usr/bin/cset set --cpu=${SYS_CPUS} --mem=${SYS_MEMS} system
+/usr/bin/cset proc --move --force --threads root --toset=system
+
+# Create our Libvirt cpuset (identical to system cpuset)
+/usr/bin/cset set --cpu=${SYS_CPUS} --mem=${SYS_MEMS} machine
+
+# Create our OSD cpuset
+/usr/bin/cset set --cpu=${OSD_CPUS} --mem=${OSD_MEMS} osd
diff --git a/roles/pvc/templates/ceph/ceph-osd-cpuset.service.j2 b/roles/pvc/templates/ceph/ceph-osd-cpuset.service.j2
new file mode 100644
index 0000000..0451a3d
--- /dev/null
+++ b/roles/pvc/templates/ceph/ceph-osd-cpuset.service.j2
@@ -0,0 +1,13 @@
+# PVC Ceph OSD cpuset service unit
+# {{ ansible_managed }}
+{% set cset_host = pvc_shield_osds_cset | selectattr('hostname', 'equalto', inventory_hostname) %}
+[Unit]
+Description = Ceph OSD cpuset shield creation
+Before = ceph-osd@.service libvirtd.service
+
+[Service]
+Type = oneshot
+ExecStart = /usr/local/sbin/ceph-osd-cpuset
+
+[Install]
+WantedBy = ceph.target