Add Ceph OSD cpuset tuning options
Allows an administrator to set CPU pinning with the cpuset tool for Ceph OSDs, in situations where CPU contention with VMs or other system tasks may be negatively affecting OSD performance. This is optional, advanced tuning and is disabled by default.
This commit is contained in:
parent
d7b07925bb
commit
732bfe732c
|
@ -13,8 +13,6 @@
|
||||||
#pvc_log_console_lines: 1000 # The number of VM console log lines to store in Zookeeper for 'vm log' commands.
|
#pvc_log_console_lines: 1000 # The number of VM console log lines to store in Zookeeper for 'vm log' commands.
|
||||||
#pvc_log_node_lines: 2000 # The number of node log lines to store in Zookeeper for 'node log' commands.
|
#pvc_log_node_lines: 2000 # The number of node log lines to store in Zookeeper for 'node log' commands.
|
||||||
|
|
||||||
# Timing and fencing configuration (uncomment to override defaults)
|
|
||||||
|
|
||||||
# Timing and fencing configuration (uncomment to override defaults)
|
# Timing and fencing configuration (uncomment to override defaults)
|
||||||
# These default options are generally best for most clusters; override these if you want more granular
|
# These default options are generally best for most clusters; override these if you want more granular
|
||||||
# control over the timings of various areas of the cluster, for instance if your hardware is slow or error-prone.
|
# control over the timings of various areas of the cluster, for instance if your hardware is slow or error-prone.
|
||||||
|
@ -36,8 +34,8 @@ pvc_api_secret_key: "" # Use pwgen to generate
|
||||||
# The first token will always be used for the "local" connection, and thus at least one token MUST be defined.
|
# The first token will always be used for the "local" connection, and thus at least one token MUST be defined.
|
||||||
pvc_api_enable_authentication: True
|
pvc_api_enable_authentication: True
|
||||||
pvc_api_tokens:
|
pvc_api_tokens:
|
||||||
- description: "myuser"
|
# - description: "myuser"
|
||||||
token: "a3945326-d36c-4024-83b3-2a8931d7785a"
|
# token: "a3945326-d36c-4024-83b3-2a8931d7785a"
|
||||||
|
|
||||||
# PVC API SSL configuration
|
# PVC API SSL configuration
|
||||||
# Use these options to enable SSL for the API listener, providing security over WAN connections.
|
# Use these options to enable SSL for the API listener, providing security over WAN connections.
|
||||||
|
@ -76,22 +74,7 @@ pvc_asn: "65500"
|
||||||
pvc_routers:
|
pvc_routers:
|
||||||
- "192.168.100.1"
|
- "192.168.100.1"
|
||||||
|
|
||||||
# Memory tuning
|
# PVC Node list
|
||||||
# > Uncomment these options in low-memory situations (nodes with <32GB RAM).
|
|
||||||
# OSD memory limit - 939524096 (~900MB) is the lowest possible value; default is 4GB.
|
|
||||||
# > This option is *only* applied at cluster bootstrap and cannot be changed later
|
|
||||||
# here, only by editing the `files/ceph/<cluster>/ceph.conf` file directly.
|
|
||||||
#pvc_osd_memory_limit: 939524096
|
|
||||||
# Zookeeper heap memory limit, sets Xms and Xmx values to the Java process; default is 512M.
|
|
||||||
# > WARNING: Unless you have an extremely limited amount of RAM, changing this setting is NOT RECOMMENDED.
|
|
||||||
# Lowering the heap limit may cause poor performance or crashes in Zookeeper during some tasks.
|
|
||||||
#pvc_zookeeper_heap_limit: 128M # 1/4 of default
|
|
||||||
# Zookeeper stack memory limit, sets Xss value to the Java process; default is 1024M.
|
|
||||||
# > WARNING: Unless you have an extremely limited amount of RAM, changing this setting is NOT RECOMMENDED.
|
|
||||||
# Lowering the stack limit may cause poor performance or crashes in Zookeeper during some tasks.
|
|
||||||
#pvc_zookeeper_stack_limit: 256M # 1/4 of default
|
|
||||||
|
|
||||||
# Node list
|
|
||||||
# > Every node configured with this playbook must be specified in this list.
|
# > Every node configured with this playbook must be specified in this list.
|
||||||
pvc_nodes:
|
pvc_nodes:
|
||||||
- hostname: "pvchv1" # This name MUST match the Ansible inventory_hostname
|
- hostname: "pvchv1" # This name MUST match the Ansible inventory_hostname
|
||||||
|
@ -143,8 +126,76 @@ pvc_sriov_enable: False
|
||||||
# mtu: 9000
|
# mtu: 9000
|
||||||
# vfcount: 6
|
# vfcount: 6
|
||||||
|
|
||||||
|
# Memory tuning
|
||||||
|
# > ADVANCED TUNING: For most users, this is unnecessary and PVC will run fine with the default memory
|
||||||
|
# allocations. Uncomment these options only in low-memory situations (nodes with <32GB RAM).
|
||||||
|
#
|
||||||
|
# OSD memory limit - 939524096 (~900MB) is the lowest possible value; default is 4GB.
|
||||||
|
# > This option is *only* applied at cluster bootstrap and cannot be changed later
|
||||||
|
# here, only by editing the `files/ceph/<cluster>/ceph.conf` file directly.
|
||||||
|
#pvc_osd_memory_limit: 939524096
|
||||||
|
#
|
||||||
|
# Zookeeper heap memory limit, sets Xms and Xmx values to the Java process; default is 512M.
|
||||||
|
# > WARNING: Unless you have an extremely limited amount of RAM, changing this setting is NOT RECOMMENDED.
|
||||||
|
# Lowering the heap limit may cause poor performance or crashes in Zookeeper during some tasks.
|
||||||
|
#pvc_zookeeper_heap_limit: 128M # 1/4 of default
|
||||||
|
#
|
||||||
|
# Zookeeper stack memory limit, sets Xss value to the Java process; default is 1024M.
|
||||||
|
# > WARNING: Unless you have an extremely limited amount of RAM, changing this setting is NOT RECOMMENDED.
|
||||||
|
# Lowering the stack limit may cause poor performance or crashes in Zookeeper during some tasks.
|
||||||
|
#pvc_zookeeper_stack_limit: 256M # 1/4 of default
|
||||||
|
|
||||||
|
# CPU pinning configuration via cset
|
||||||
|
# > ADVANCED TUNING: For most users, this is unnecessary and PVC will run fine with the default scheduling.
|
||||||
|
# > These options can be set to maximize the CPU performance of the Ceph subsystem. Because Ceph OSD
|
||||||
|
# performance is heavily limited more by CPU than anything else, for users with a lot of relatively slow CPU
|
||||||
|
# cores, or for those looking to get maximum storage performance, tuning the pinning options here might
|
||||||
|
# provide noticeable benefits.
|
||||||
|
# > This configuration makes use of the cset command and will dedicate a specific number of CPU cores to the
|
||||||
|
# Ceph OSD processes on each node. This is accomplished by using cset's shield mechanism to create a cgroup
|
||||||
|
# which will contain only Ceph OSD processes, while putting everything else onto the remaining CPUs.
|
||||||
|
# > Avoid using this tuning if you have less than 8 total CPU cores (excluding SMT threads). Otherwise, you
|
||||||
|
# might not have enough CPU cores to properly run VMs, unless you are very careful with vCPU allocation.
|
||||||
|
# > Like the 'pvc_nodes' dictionary, these options are set per-host, even if all hosts are identical. This
|
||||||
|
# is required to handle situations where hosts might have different CPU topologies. Each host can have a
|
||||||
|
# specific set of CPUs that are included in the shield.
|
||||||
|
# > Ensure that you know which CPU cores are "real" and which are SMT "threads". This can be obtained using
|
||||||
|
# the 'virsh capabilities' command and noting the 'siblings' entries for each CPU.
|
||||||
|
# > Ensure you consider NUMA nodes when setting up this tuning. Generally speaking it is better to keep the
|
||||||
|
# OSD processes onto one NUMA node for simplicity; more advanced tuning is outside of the scope of this
|
||||||
|
# playbook.
|
||||||
|
# > You should set a number of cores in the shield (along with their respective SMT threads) equal to the
|
||||||
|
# number of OSDs in the system. This can be adjusted later as needed. For instance, if you have 2 OSDs per
|
||||||
|
# node, and each node has a 10-core SMT-capable CPU, you would want to assign cores 0 and 1 (the first two
|
||||||
|
# real cores) and 10 and 11 (the SMT siblings of those cores in 'virsh capabilities') in the cset.
|
||||||
|
#
|
||||||
|
# The shield mode is disabled by default and a commented out example configuration is shown.
|
||||||
|
pvc_shield_osds_enable: False
|
||||||
|
#pvc_shield_osds_cset:
|
||||||
|
# # This example host has 2x 6-core SMT-enabled CPUs; we want to use cores 0 (+SMT 12) and 2 (+SMT 14), which are
|
||||||
|
# # both on physical CPU 0, for 2x OSDs.
|
||||||
|
# - hostname: pvchv1
|
||||||
|
# osd_cset:
|
||||||
|
# - 0
|
||||||
|
# - 2
|
||||||
|
# - 12
|
||||||
|
# - 14
|
||||||
|
# # These example hosts have 1x 8-core SMT-enabled CPUs; we want to use cores 0 (+SMT 8) and 1 (+SMT 9) for 2x OSDs.
|
||||||
|
# - hostname: pvchv2
|
||||||
|
# osd_cset:
|
||||||
|
# - 0
|
||||||
|
# - 1
|
||||||
|
# - 8
|
||||||
|
# - 9
|
||||||
|
# - hostname: pvchv3
|
||||||
|
# osd_cset:
|
||||||
|
# - 0
|
||||||
|
# - 1
|
||||||
|
# - 8
|
||||||
|
# - 9
|
||||||
|
|
||||||
# Configuration file networks
|
# Configuration file networks
|
||||||
# > Taken from base.yml's configuration; do not modify this section.
|
# > Taken from base.yml's configuration; DO NOT MODIFY THIS SECTION.
|
||||||
pvc_upstream_device: "{{ networks['upstream']['device'] }}"
|
pvc_upstream_device: "{{ networks['upstream']['device'] }}"
|
||||||
pvc_upstream_mtu: "{{ networks['upstream']['mtu'] }}"
|
pvc_upstream_mtu: "{{ networks['upstream']['mtu'] }}"
|
||||||
pvc_upstream_domain: "{{ networks['upstream']['domain'] }}"
|
pvc_upstream_domain: "{{ networks['upstream']['domain'] }}"
|
||||||
|
|
|
@ -117,4 +117,55 @@
|
||||||
- ceph-mon@{{ ansible_hostname }}
|
- ceph-mon@{{ ansible_hostname }}
|
||||||
- ceph-mgr@{{ ansible_hostname }}
|
- ceph-mgr@{{ ansible_hostname }}
|
||||||
|
|
||||||
|
# System OSD CPU shielding activation
# Installs the cpuset tooling, the ceph-osd-cpuset helper script and its oneshot
# service unit, and a ceph-osd@.service dropin so OSD processes launch inside the
# "osd" cset. Runs only when shielding is enabled AND this host has a cset defined.
- block:
    - name: install packages
      apt:
        name:
          - cpuset
          - numactl
        state: latest

    - name: install ceph-osd-cpuset script
      template:
        src: ceph/ceph-osd-cpuset.j2
        dest: /usr/local/sbin/ceph-osd-cpuset
        mode: 0755

    - name: install ceph-osd-cpuset service unit
      template:
        src: ceph/ceph-osd-cpuset.service.j2
        dest: /etc/systemd/system/ceph-osd-cpuset.service
      register: systemd_file_cpuset

    - name: create ceph-osd override dropin directory
      file:
        dest: /etc/systemd/system/ceph-osd@.service.d
        state: directory

    - name: install ceph-osd override dropin
      template:
        src: ceph/ceph-osd-cpuset.conf.j2
        dest: /etc/systemd/system/ceph-osd@.service.d/cpuset.conf
      register: systemd_file_osd

    # Use the systemd module's daemon_reload rather than a raw
    # 'systemctl daemon-reload' command task: it is idempotent and
    # check-mode safe.
    - name: reload systemd to apply previous changes
      systemd:
        daemon_reload: yes
      when: systemd_file_cpuset.changed or systemd_file_osd.changed

    # Enabled only, deliberately not started: the shield can only be built
    # cleanly at boot, before any OSD or Libvirt processes exist.
    - name: enable ceph-osd-cpuset service
      service:
        name: ceph-osd-cpuset
        enabled: yes

    - debug:
        msg: "NOTICE: cpuset configs have NOT been applied to the running system. This node must be rebooted to apply these changes."
      when: systemd_file_cpuset.changed or systemd_file_osd.changed
  tags: pvc-ceph-cpuset
  when:
    - pvc_shield_osds_enable is defined
    - pvc_shield_osds_enable
    - pvc_shield_osds_cset is defined
    - pvc_shield_osds_cset | selectattr('hostname', 'equalto', inventory_hostname) | list | count > 0
|
||||||
|
|
||||||
- meta: flush_handlers
|
- meta: flush_handlers
|
||||||
|
|
|
@ -23,17 +23,9 @@
|
||||||
when: newhost is defined and newhost
|
when: newhost is defined and newhost
|
||||||
tags: always
|
tags: always
|
||||||
|
|
||||||
# General blacklisting of modules
|
# Install system tweaks
|
||||||
- name: add module blacklist
|
- include: system/main.yml
|
||||||
template:
|
tags: pvc-system
|
||||||
src: system/blacklist.j2
|
|
||||||
dest: /etc/modprobe.d/blacklist.conf
|
|
||||||
|
|
||||||
# Logrotate configuration
|
|
||||||
- name: add logrotate configuration
|
|
||||||
template:
|
|
||||||
src: system/pvc.j2
|
|
||||||
dest: /etc/logrotate.d/pvc
|
|
||||||
|
|
||||||
# Install base databases (coordinators only)
|
# Install base databases (coordinators only)
|
||||||
- include: ceph/main.yml
|
- include: ceph/main.yml
|
||||||
|
|
|
@ -0,0 +1,14 @@
|
||||||
|
---
# PVC system tweak tasks
# Applies general OS-level tweaks on every node: module blacklisting and
# logrotate configuration for the PVC daemon logs.

# General blacklisting of modules
- name: add module blacklist
  template:
    src: system/blacklist.j2
    dest: /etc/modprobe.d/blacklist.conf

# Logrotate configuration
- name: add logrotate configuration
  template:
    src: system/pvc.j2
    dest: /etc/logrotate.d/pvc

# Run any pending handlers before moving on to subsequent includes
- meta: flush_handlers
|
|
@ -0,0 +1,5 @@
|
||||||
|
# ceph-osd@.service overrides for cpuset
# {{ ansible_managed }}
[Service]
# The first, empty ExecStart clears the packaged command list (systemd dropin
# reset semantics); the second relaunches the OSD wrapped in 'cset proc' so the
# process runs inside the "osd" cset created by ceph-osd-cpuset.service.
ExecStart=
ExecStart=/usr/bin/cset proc --set=osd --exec /usr/bin/ceph-osd -- -f --cluster ${CLUSTER} --id %i --setuser ceph --setgroup ceph
|
|
@ -0,0 +1,63 @@
|
||||||
|
#!/bin/bash
|
||||||
|
# PVC Ceph OSD cpuset preparation script
|
||||||
|
# {{ ansible_managed }}
|
||||||
|
|
||||||
|
# This script is designed to prepare the cpusets for use by Ceph OSDs, VMs, and other system resources.
|
||||||
|
# Libvirt does not make this easy with any way to globally set its CPUs, so we must do this trickery.
|
||||||
|
{% set cset_host = pvc_shield_osds_cset | selectattr('hostname', 'equalto', inventory_hostname) %}
|
||||||
|
|
||||||
|
A_OSD_CPUS=( {{ cset_host[0]['osd_cset'] | join(' ') }} )
|
||||||
|
A_SYS_CPUS=()
|
||||||
|
|
||||||
|
CPU_INFO="$( lscpu )"
|
||||||
|
|
||||||
|
# First, we must determine how many NUMA nodes we have
|
||||||
|
NUMA_COUNT="$( grep '^NUMA node(s)' <<<"${CPU_INFO}" | awk '{ print $NF }' )"
|
||||||
|
|
||||||
|
# If we have 1 NUMA node, our SYS_MEMS is 0; otherwise it's 0-X
|
||||||
|
# This is needed to explicitly set our memspec during the set
|
||||||
|
if [[ ${NUMA_COUNT} -eq 1 ]]; then
|
||||||
|
SYS_MEMS="0"
|
||||||
|
else
|
||||||
|
SYS_MEMS="0-$(( ${NUMA_COUNT} - 1 ))"
|
||||||
|
fi
|
||||||
|
|
||||||
|
# We must determine which NUMA nodes our OSD CPUS are in for the memspec during the set
|
||||||
|
A_OSD_MEMS=()
|
||||||
|
for CPU in ${A_OSD_CPUS[@]}; do
|
||||||
|
NODE="$( grep -E '^NUMA node[0-9]+ CPU' <<<"${CPU_INFO}" | grep -w "${CPU}" | awk '{ print $2 }' | sed 's/node//' )"
|
||||||
|
if [[ ! " ${A_OSD_MEMS} " =~ " ${NODE} " ]]; then
|
||||||
|
A_OSD_MEMS+=( $NODE )
|
||||||
|
fi
|
||||||
|
done
|
||||||
|
|
||||||
|
# Determine our CPU count
|
||||||
|
CPU_COUNT="$( grep '^CPU(s)' <<<"${CPU_INFO}" | awk '{ print $NF }' )"
|
||||||
|
echo "CPU count: ${CPU_COUNT}"
|
||||||
|
|
||||||
|
# Loop through all the CPUs in the count; if they are not in OSD_CPUS, add them to the SYS_CPUS array
|
||||||
|
for i in $( seq 0 $(( ${CPU_COUNT} - 1)) ); do
|
||||||
|
if [[ ! " ${A_OSD_CPUS[*]} " =~ " ${i} " ]]; then
|
||||||
|
A_SYS_CPUS+=( $i )
|
||||||
|
fi
|
||||||
|
done
|
||||||
|
|
||||||
|
# Convert arrays into CSV
|
||||||
|
OSD_MEMS="$( IFS=, ; echo "${A_OSD_MEMS[*]}" )"
|
||||||
|
OSD_CPUS="$( IFS=, ; echo "${A_OSD_CPUS[*]}" )"
|
||||||
|
SYS_CPUS="$( IFS=, ; echo "${A_SYS_CPUS[*]}" )"
|
||||||
|
|
||||||
|
echo "OSD CPUs: ${OSD_CPUS}"
|
||||||
|
echo "OSD Mems: ${OSD_MEMS}"
|
||||||
|
echo "System/VM CPUs: ${SYS_CPUS}"
|
||||||
|
echo "System/VM Mems: ${SYS_MEMS}"
|
||||||
|
|
||||||
|
# Create the system cpuset and move everything currently running into it
|
||||||
|
/usr/bin/cset set --cpu=${SYS_CPUS} --mem=${SYS_MEMS} system
|
||||||
|
/usr/bin/cset proc --move --force --threads root --toset=system
|
||||||
|
|
||||||
|
# Create our Libvirt cpuset (identical to system cpuset)
|
||||||
|
/usr/bin/cset set --cpu=${SYS_CPUS} --mem=${SYS_MEMS} machine
|
||||||
|
|
||||||
|
# Create our OSD cpuset
|
||||||
|
/usr/bin/cset set --cpu=${OSD_CPUS} --mem=${OSD_MEMS} osd
|
|
@ -0,0 +1,13 @@
|
||||||
|
# PVC Ceph OSD cpuset service unit
# {{ ansible_managed }}
# Oneshot unit that builds the "system", "machine", and "osd" csets at boot,
# before any OSD or Libvirt process starts, so each lands in the correct cset.
# NB: the previously templated-in 'cset_host' Jinja variable was never used in
# this unit and has been removed as dead template code.

[Unit]
Description = Ceph OSD cpuset shield creation
Before = ceph-osd@.service libvirtd.service

[Service]
Type = oneshot
ExecStart = /usr/local/sbin/ceph-osd-cpuset

[Install]
WantedBy = ceph.target
|
Loading…
Reference in New Issue