Add Ceph OSD cpuset tuning options

Allows an administrator to set CPU pinning with the cpuset tool for Ceph
OSDs, in situations where CPU contention with VMs or other system tasks
may be negatively affecting OSD performance. This is optional, advanced
tuning and is disabled by default.
This commit is contained in:
Joshua Boniface 2023-09-01 15:42:27 -04:00
parent 45424a28ce
commit 6e48d6fe84
7 changed files with 221 additions and 32 deletions

View File

@ -13,8 +13,6 @@
#pvc_log_console_lines: 1000 # The number of VM console log lines to store in Zookeeper for 'vm log' commands. #pvc_log_console_lines: 1000 # The number of VM console log lines to store in Zookeeper for 'vm log' commands.
#pvc_log_node_lines: 2000 # The number of node log lines to store in Zookeeper for 'node log' commands. #pvc_log_node_lines: 2000 # The number of node log lines to store in Zookeeper for 'node log' commands.
# Timing and fencing configuration (uncomment to override defaults)
# Timing and fencing configuration (uncomment to override defaults) # Timing and fencing configuration (uncomment to override defaults)
# These default options are generally best for most clusters; override these if you want more granular # These default options are generally best for most clusters; override these if you want more granular
# control over the timings of various areas of the cluster, for instance if your hardware is slow or error-prone. # control over the timings of various areas of the cluster, for instance if your hardware is slow or error-prone.
@ -36,8 +34,8 @@ pvc_api_secret_key: "" # Use pwgen to generate
# The first token will always be used for the "local" connection, and thus at least one token MUST be defined. # The first token will always be used for the "local" connection, and thus at least one token MUST be defined.
pvc_api_enable_authentication: True pvc_api_enable_authentication: True
pvc_api_tokens: pvc_api_tokens:
- description: "myuser" # - description: "myuser"
token: "a3945326-d36c-4024-83b3-2a8931d7785a" # token: "a3945326-d36c-4024-83b3-2a8931d7785a"
# PVC API SSL configuration # PVC API SSL configuration
# Use these options to enable SSL for the API listener, providing security over WAN connections. # Use these options to enable SSL for the API listener, providing security over WAN connections.
@ -76,22 +74,7 @@ pvc_asn: "65500"
pvc_routers: pvc_routers:
- "192.168.100.1" - "192.168.100.1"
# Memory tuning # PVC Node list
# > Uncomment these options in low-memory situations (nodes with <32GB RAM).
# OSD memory limit - 939524096 (~900MB) is the lowest possible value; default is 4GB.
# > This option is *only* applied at cluster bootstrap and cannot be changed later
# here, only by editing the `files/ceph/<cluster>/ceph.conf` file directly.
#pvc_osd_memory_limit: 939524096
# Zookeeper heap memory limit, sets Xms and Xmx values to the Java process; default is 512M.
# > WARNING: Unless you have an extremely limited amount of RAM, changing this setting is NOT RECOMMENDED.
# Lowering the heap limit may cause poor performance or crashes in Zookeeper during some tasks.
#pvc_zookeeper_heap_limit: 128M # 1/4 of default
# Zookeeper stack memory limit, sets Xss value to the Java process; default is 1024M.
# > WARNING: Unless you have an extremely limited amount of RAM, changing this setting is NOT RECOMMENDED.
# Lowering the stack limit may cause poor performance or crashes in Zookeeper during some tasks.
#pvc_zookeeper_stack_limit: 256M # 1/4 of default
# Node list
# > Every node configured with this playbook must be specified in this list. # > Every node configured with this playbook must be specified in this list.
pvc_nodes: pvc_nodes:
- hostname: "pvchv1" # This name MUST match the Ansible inventory_hostname - hostname: "pvchv1" # This name MUST match the Ansible inventory_hostname
@ -143,8 +126,76 @@ pvc_sriov_enable: False
# mtu: 9000 # mtu: 9000
# vfcount: 6 # vfcount: 6
# Memory tuning
# > ADVANCED TUNING: For most users, this is unnecessary and PVC will run fine with the default memory
# allocations. Uncomment these options only in low-memory situations (nodes with <32GB RAM).
#
# OSD memory limit - 939524096 (~900MB) is the lowest possible value; default is 4GB.
# > This option is *only* applied at cluster bootstrap and cannot be changed later
# here, only by editing the `files/ceph/<cluster>/ceph.conf` file directly.
#pvc_osd_memory_limit: 939524096
#
# Zookeeper heap memory limit, sets Xms and Xmx values to the Java process; default is 512M.
# > WARNING: Unless you have an extremely limited amount of RAM, changing this setting is NOT RECOMMENDED.
# Lowering the heap limit may cause poor performance or crashes in Zookeeper during some tasks.
#pvc_zookeeper_heap_limit: 128M # 1/4 of default
#
# Zookeeper stack memory limit, sets Xss value to the Java process; default is 1024M.
# > WARNING: Unless you have an extremely limited amount of RAM, changing this setting is NOT RECOMMENDED.
# Lowering the stack limit may cause poor performance or crashes in Zookeeper during some tasks.
#pvc_zookeeper_stack_limit: 256M # 1/4 of default
# CPU pinning configuration via cset
# > ADVANCED TUNING: For most users, this is unnecessary and PVC will run fine with the default scheduling.
# > These options can be set to maximize the CPU performance of the Ceph subsystem. Because Ceph OSD
# performance is limited more by CPU than by anything else, users with a lot of relatively slow CPU
# cores, or those looking to get maximum storage performance, might see noticeable benefits from tuning
# the pinning options here.
# > This configuration makes use of the cset command and will dedicate a specific number of CPU cores to the
# Ceph OSD processes on each node. This is accomplished by using cset's shield mechanism to create a cgroup
# which will contain only Ceph OSD processes, while putting everything else onto the remaining CPUs.
# > Avoid using this tuning if you have fewer than 8 total CPU cores (excluding SMT threads). Otherwise, you
# might not have enough CPU cores to properly run VMs, unless you are very careful with vCPU allocation.
# > Like the 'pvc_nodes' dictionary, these options are set per-host, even if all hosts are identical. This
# is required to handle situations where hosts might have different CPU topologies. Each host can have a
# specific set of CPUs that are included in the shield.
# > Ensure that you know which CPU cores are "real" and which are SMT "threads". This can be obtained using
# the 'virsh capabilities' command and noting the 'siblings' entries for each CPU.
# > Ensure you consider NUMA nodes when setting up this tuning. Generally speaking it is better to keep the
# OSD processes on one NUMA node for simplicity; more advanced tuning is outside of the scope of this
# playbook.
# > You should set a number of cores in the shield (along with their respective SMT threads) equal to the
# number of OSDs in the system. This can be adjusted later as needed. For instance, if you have 2 OSDs per
# node, and each node has a 10-core SMT-capable CPU, you would want to assign cores 0 and 1 (the first two
# real cores) and 10 and 11 (the SMT siblings of those cores in 'virsh capabilities') in the cset.
#
# The shield mode is disabled by default and a commented out example configuration is shown.
pvc_shield_osds_enable: False
#pvc_shield_osds_cset:
# # This example host has 2x 6-core SMT-enabled CPUs; we want to use cores 0 (+SMT 12) and 2 (+SMT 14), which are
# # both on physical CPU 0, for 2x OSDs.
# - hostname: pvchv1
# osd_cset:
# - 0
# - 2
# - 12
# - 14
# # These example hosts each have 1x 8-core SMT-enabled CPU; we want to use cores 0 (+SMT 8) and 1 (+SMT 9) for 2x OSDs.
# - hostname: pvchv2
# osd_cset:
# - 0
# - 1
# - 8
# - 9
# - hostname: pvchv3
# osd_cset:
# - 0
# - 1
# - 8
# - 9
# Configuration file networks # Configuration file networks
# > Taken from base.yml's configuration; do not modify this section. # > Taken from base.yml's configuration; DO NOT MODIFY THIS SECTION.
pvc_upstream_device: "{{ networks['upstream']['device'] }}" pvc_upstream_device: "{{ networks['upstream']['device'] }}"
pvc_upstream_mtu: "{{ networks['upstream']['mtu'] }}" pvc_upstream_mtu: "{{ networks['upstream']['mtu'] }}"
pvc_upstream_domain: "{{ networks['upstream']['domain'] }}" pvc_upstream_domain: "{{ networks['upstream']['domain'] }}"

View File

@ -117,4 +117,55 @@
- ceph-mon@{{ ansible_hostname }} - ceph-mon@{{ ansible_hostname }}
- ceph-mgr@{{ ansible_hostname }} - ceph-mgr@{{ ansible_hostname }}
# System OSD CPU shielding activation
# Installs the cset tooling, the shield-creation script and service unit, and the
# ceph-osd@ drop-in. The whole block only runs when the conditions in its trailing
# 'when' list hold, and is tagged 'pvc-ceph-cpuset'.
- block:
# Packages used by the shielding setup
- name: install packages
apt:
name:
- cpuset
- numactl
state: latest
# Render the script that the ceph-osd-cpuset service unit executes
- name: install ceph-osd-cpuset script
template:
src: ceph/ceph-osd-cpuset.j2
dest: /usr/local/sbin/ceph-osd-cpuset
mode: 0755
# Render the service unit; the result is registered so that systemd is only reloaded on change
- name: install ceph-osd-cpuset service unit
template:
src: ceph/ceph-osd-cpuset.service.j2
dest: /etc/systemd/system/ceph-osd-cpuset.service
register: systemd_file_cpuset
# Directory holding drop-in overrides for every ceph-osd@ instance
- name: create ceph-osd override dropin directory
file:
dest: /etc/systemd/system/ceph-osd@.service.d
state: directory
# Render the drop-in that replaces the OSD ExecStart; the result is registered for the reload check
- name: install ceph-osd override dropin
template:
src: ceph/ceph-osd-cpuset.conf.j2
dest: /etc/systemd/system/ceph-osd@.service.d/cpuset.conf
register: systemd_file_osd
# Reload systemd only if one of the two unit files above changed
- name: reload systemd to apply previous changes
command: "systemctl daemon-reload"
when: systemd_file_cpuset.changed or systemd_file_osd.changed
# Enable (but do not start) the shield-creation unit
- name: enable ceph-osd-cpuset service
service:
name: ceph-osd-cpuset
enabled: yes
# Nothing above touches running processes, so tell the operator a reboot is required
- debug:
msg: "NOTICE: cpuset configs have NOT been applied to the running system. This node must be rebooted to apply these changes."
when: systemd_file_cpuset.changed or systemd_file_osd.changed
tags: pvc-ceph-cpuset
# Run only when shielding is enabled and pvc_shield_osds_cset contains at least one
# entry whose 'hostname' equals this host's inventory_hostname
when:
- pvc_shield_osds_enable is defined
- pvc_shield_osds_enable
- pvc_shield_osds_cset is defined
- pvc_shield_osds_cset | selectattr('hostname', 'equalto', inventory_hostname) | list | count > 0
- meta: flush_handlers - meta: flush_handlers

View File

@ -23,17 +23,9 @@
when: newhost is defined and newhost when: newhost is defined and newhost
tags: always tags: always
# General blacklisting of modules # Install system tweaks
- name: add module blacklist - include: system/main.yml
template: tags: pvc-system
src: system/blacklist.j2
dest: /etc/modprobe.d/blacklist.conf
# Logrotate configuration
- name: add logrotate configuration
template:
src: system/pvc.j2
dest: /etc/logrotate.d/pvc
# Install base databases (coordinators only) # Install base databases (coordinators only)
- include: ceph/main.yml - include: ceph/main.yml

View File

@ -0,0 +1,14 @@
---
# General blacklisting of modules
# Renders system/blacklist.j2 to the modprobe configuration directory
- name: add module blacklist
template:
src: system/blacklist.j2
dest: /etc/modprobe.d/blacklist.conf
# Logrotate configuration
# Renders system/pvc.j2 as the 'pvc' logrotate configuration file
- name: add logrotate configuration
template:
src: system/pvc.j2
dest: /etc/logrotate.d/pvc
# Run any handlers notified so far before the play continues
- meta: flush_handlers

View File

@ -0,0 +1,5 @@
# ceph-osd@.service overrides for cpuset
# {{ ansible_managed }}
[Unit]
# Start every OSD instance only after the "osd" cpuset has been created.
# The ordering is declared here, on the instances themselves, because an
# instance-less template name in another unit's Before= does not refer to any
# concrete ceph-osd@<id>.service instance.
After = ceph-osd-cpuset.service
[Service]
# The empty assignment clears the ExecStart inherited from the packaged unit so
# that the line below replaces it instead of adding a second command.
ExecStart =
# Launch the OSD through cset so that the process is placed in the "osd" cpuset.
ExecStart = /usr/bin/cset proc --set=osd --exec /usr/bin/ceph-osd -- -f --cluster ${CLUSTER} --id %i --setuser ceph --setgroup ceph

View File

@ -0,0 +1,63 @@
#!/bin/bash
# PVC Ceph OSD cpuset preparation script
# {{ ansible_managed }}
# This script is designed to prepare the cpusets for use by Ceph OSDs, VMs, and other system resources.
# Libvirt does not make this easy with any way to globally set its CPUs, so we must do this trickery.
# Select this host's entry from pvc_shield_osds_cset. The explicit "list" filter
# materialises the selectattr() result so that indexing it below does not depend
# on the templating engine unrolling generators automatically.
{% set cset_host = pvc_shield_osds_cset | selectattr('hostname', 'equalto', inventory_hostname) | list %}
# CPUs reserved for the Ceph OSD processes (rendered from this host's osd_cset list)
A_OSD_CPUS=( {{ cset_host[0]['osd_cset'] | join(' ') }} )
# Every CPU that is not reserved for OSDs; filled in further down
A_SYS_CPUS=()
# Capture the lscpu output once. LC_ALL=C forces the untranslated field labels
# ("CPU(s)", "NUMA node(s)") that the parsing in this script matches on.
CPU_INFO="$( LC_ALL=C lscpu )"
# First, we must determine how many NUMA nodes we have

# Print the NUMA node count found in the lscpu output passed as $1.
# Falls back to 1 (with a warning on stderr) when the "NUMA node(s)" line is
# missing or is not a positive integer, so the memspec built from it is valid.
detect_numa_count() {
    local count
    count="$( awk '/^NUMA node\(s\)/ { print $NF }' <<<"${1}" )"
    if [[ "${count}" =~ ^[1-9][0-9]*$ ]]; then
        echo "${count}"
    else
        echo "WARNING: could not determine the NUMA node count from lscpu; assuming 1" >&2
        echo 1
    fi
}

# Print the memspec covering all NUMA nodes for the node count passed as $1:
# "0" for a single node, otherwise "0-X" where X is the highest node index.
numa_memspec() {
    local count="${1}"
    if (( count <= 1 )); then
        echo "0"
    else
        echo "0-$(( count - 1 ))"
    fi
}

NUMA_COUNT="$( detect_numa_count "${CPU_INFO}" )"
# If we have 1 NUMA node, our SYS_MEMS is 0; otherwise it's 0-X
# This is needed to explicitly set our memspec during the set
SYS_MEMS="$( numa_memspec "${NUMA_COUNT}" )"
# We must determine which NUMA nodes our OSD CPUS are in for the memspec during the set
# NOTE: this file is a Jinja2 template, so the array-length expansion must not be
# used here; its opening characters would be read as the start of a Jinja2 comment.

# Return 0 if CPU number $1 is contained in the lscpu-style CPU list $2
# (comma-separated single CPUs and inclusive ranges, e.g. "0-5,12-17").
cpulist_has_cpu() {
    local want="${1}" part lo hi
    local -a parts=()
    [[ "${want}" =~ ^[0-9]+$ ]] || return 1
    IFS=',' read -r -a parts <<<"${2}"
    for part in "${parts[@]}"; do
        IFS='-' read -r lo hi <<<"${part}"
        [[ "${lo}" =~ ^[0-9]+$ ]] || continue
        # A single CPU ("7") is treated as the range 7-7
        [[ "${hi}" =~ ^[0-9]+$ ]] || hi="${lo}"
        if (( want >= lo && want <= hi )); then
            return 0
        fi
    done
    return 1
}

# Print the NUMA node number that CPU $1 belongs to, according to the
# "NUMA nodeN CPU(s): <list>" lines of the lscpu output passed as $2.
# Prints nothing and returns 1 if the CPU is not listed under any node.
numa_node_for_cpu() {
    local cpu="${1}" nodename list
    while read -r _ nodename _ list; do
        if cpulist_has_cpu "${cpu}" "${list}"; then
            echo "${nodename#node}"
            return 0
        fi
    done < <( grep -E '^NUMA node[0-9]+ CPU' <<<"${2}" )
    return 1
}

# Rebuild A_OSD_MEMS as the de-duplicated list of NUMA nodes that host the CPUs
# in A_OSD_CPUS, using the lscpu output held in CPU_INFO.
collect_osd_mems() {
    local cpu node
    A_OSD_MEMS=()
    for cpu in "${A_OSD_CPUS[@]}"; do
        node="$( numa_node_for_cpu "${cpu}" "${CPU_INFO}" )"
        if [[ -z "${node}" ]]; then
            echo "WARNING: could not determine the NUMA node of CPU ${cpu}" >&2
            continue
        fi
        # Compare against every node collected so far (the [*] join), not only the first
        if [[ ! " ${A_OSD_MEMS[*]} " =~ " ${node} " ]]; then
            A_OSD_MEMS+=( "${node}" )
        fi
    done
}

collect_osd_mems
# Read the total CPU count from the "CPU(s)" line of the captured lscpu output
CPU_COUNT="$( awk '/^CPU\(s\)/ { print $NF }' <<<"${CPU_INFO}" )"
echo "CPU count: ${CPU_COUNT}"
# Walk CPU IDs 0..CPU_COUNT-1; every ID that is not one of the OSD CPUs is
# appended to the system CPU array
for (( cpu_id = 0; cpu_id < CPU_COUNT; cpu_id++ )); do
    case " ${A_OSD_CPUS[*]} " in
        *" ${cpu_id} "*)
            # Reserved for the OSDs; leave it out of the system set
            ;;
        *)
            A_SYS_CPUS+=( "${cpu_id}" )
            ;;
    esac
done
# Flatten the arrays into comma-separated strings

# Print all arguments joined by commas; IFS is changed only inside the function
join_csv() {
    local IFS=','
    printf '%s' "$*"
}

OSD_MEMS="$( join_csv "${A_OSD_MEMS[@]}" )"
OSD_CPUS="$( join_csv "${A_OSD_CPUS[@]}" )"
SYS_CPUS="$( join_csv "${A_SYS_CPUS[@]}" )"
# Report the resulting layout
printf '%s\n' "OSD CPUs: ${OSD_CPUS}"
printf '%s\n' "OSD Mems: ${OSD_MEMS}"
printf '%s\n' "System/VM CPUs: ${SYS_CPUS}"
printf '%s\n' "System/VM Mems: ${SYS_MEMS}"
# Absolute path of the cset tool used for every call below
CSET="/usr/bin/cset"
# "system" cpuset: created on the non-OSD CPUs, then the tasks of the root set
# (threads included, forced) are moved into it
"${CSET}" set --cpu="${SYS_CPUS}" --mem="${SYS_MEMS}" system
"${CSET}" proc --move --force --threads root --toset=system
# "machine" cpuset for Libvirt: same CPUs and mems as the system cpuset
"${CSET}" set --cpu="${SYS_CPUS}" --mem="${SYS_MEMS}" machine
# "osd" cpuset: the CPUs and NUMA nodes reserved for the Ceph OSDs
"${CSET}" set --cpu="${OSD_CPUS}" --mem="${OSD_MEMS}" osd

View File

@ -0,0 +1,13 @@
# PVC Ceph OSD cpuset service unit
# {{ ansible_managed }}
# Runs the cpuset preparation script, which creates the "system", "machine" and
# "osd" cpusets. Pulled in by ceph.target once enabled.
[Unit]
Description = Ceph OSD cpuset shield creation
# NOTE(review): "ceph-osd@.service" is a template name, not a concrete instance;
# confirm that systemd applies this ordering to the ceph-osd@<id> instances, or
# declare After= on the instances themselves.
Before = ceph-osd@.service libvirtd.service
[Service]
# The script performs its setup and exits, hence a one-shot service
Type = oneshot
ExecStart = /usr/local/sbin/ceph-osd-cpuset
[Install]
WantedBy = ceph.target