Add updated tuning configuration

Uses a much nicer CPU tuning configuration, leveraging systemd's
AllowedCPUs and CPUAffinity options within a set of slices (some
default, some custom).

Configuration is also greatly simplified versus the previous
implementation, simply asking for a number of CPUs for both the system
and OSDs, and calculating everything else that is required.

Also switches (back) to the v2 unified cgroup hierarchy by default as
required by the systemd AllowedCPUs directive.
This commit is contained in:
Joshua Boniface 2023-09-01 15:42:29 -04:00
parent 131caba0bd
commit 07d75573d6
12 changed files with 258 additions and 17 deletions

View File

@ -87,6 +87,9 @@ pvc_nodes:
ipmi_host: "{{ ipmi['hosts']['pvchv1']['hostname'] }}" # Note the node hostname key in here ipmi_host: "{{ ipmi['hosts']['pvchv1']['hostname'] }}" # Note the node hostname key in here
ipmi_user: "{{ ipmi['users']['pvc']['username'] }}" ipmi_user: "{{ ipmi['users']['pvc']['username'] }}"
ipmi_password: "{{ ipmi['users']['pvc']['password'] }}" ipmi_password: "{{ ipmi['users']['pvc']['password'] }}"
cpu_tuning: # Example of cpu_tuning overrides per-node, only relevant if enabled; see below
system_cpus: 2
osd_cpus: 2
- hostname: "pvchv2" # This name MUST match the Ansible inventory_hostname's first portion, i.e. "inventory_hostname.split('.')[0]" - hostname: "pvchv2" # This name MUST match the Ansible inventory_hostname's first portion, i.e. "inventory_hostname.split('.')[0]"
is_coordinator: yes is_coordinator: yes
node_id: 2 node_id: 2
@ -146,6 +149,37 @@ pvc_sriov_enable: False
# Lowering the stack limit may cause poor performance or crashes in Zookeeper during some tasks. # Lowering the stack limit may cause poor performance or crashes in Zookeeper during some tasks.
#pvc_zookeeper_stack_limit: 256M # 1/4 of default #pvc_zookeeper_stack_limit: 256M # 1/4 of default
# CPU tuning
# > ADVANCED TUNING: For most users, this is unnecessary and PVC will run fine with the default CPU
# allocations. Adjust these options only for clusters where CPU optimization is needed.
# > Defines CPU tuning/affinity options for various subsystems within PVC. This is useful to
# help limit the impact that noisy elements may have on other elements, e.g. busy VMs on
# OSDs, or system processes on latency-sensitive VMs.
# > To enable tuning, set enabled to yes.
# > Within "nodes", two counts are specified:
# * system_cpus: The number of CPUs to assign to the "system" slice, i.e. all non-VM,
# non-OSD processes on the system. Should usually be at least 2, and be
# higher on the coordinators of larger clusters (i.e. >5 nodes).
# * osd_cpus: The number of CPUs to assign to the "osd" slice, i.e. all OSD processes.
# Should be at least 1 per OSD, and ideally 2 per OSD for best performance.
# A third count, for the VM CPUs, is autogenerated based on the total node CPU count and
# the above two values (using all remaining CPUs).
# > Tuning is done based on cores; for systems with SMT (>1 thread-per-core), all SMTs within
# a given core are also assigned to the same CPU set. So for example, if the system assigns
# 2 system_cpus, there are 16 cores, and there are 2 threads per core, the list will be:
# 0,1,16,17
# leveraging the assumption that Linux puts all cores before all threads.
# > This tuning section under "nodes" is global to the cluster; to override these values on
# a per-node basis, use the corresponding "cpu_tuning" section of a given "pvc_nodes" entry
# as shown above.
# > If disabled after being enabled, the tuning configurations on each node will be removed
# on the next run. A reboot of all nodes is required to fully disable the tuning.
cpu_tuning:
  # Master switch for the CPU tuning tasks.
  enabled: no
  # Cluster-wide default counts; a pvc_nodes entry may carry its own cpu_tuning section.
  nodes:
    # Set based on your actual system configuration (min 2, increase on coordinators if many nodes)
    system_cpus: 2
    # Set based on your actual number of OSDs
    osd_cpus: 2
# Configuration file networks # Configuration file networks
# > Taken from base.yml's configuration; DO NOT MODIFY THIS SECTION. # > Taken from base.yml's configuration; DO NOT MODIFY THIS SECTION.
pvc_upstream_device: "{{ networks['upstream']['device'] }}" pvc_upstream_device: "{{ networks['upstream']['device'] }}"

View File

@ -12,7 +12,7 @@ recursive_dns_servers:
recursive_dns_search_domains: recursive_dns_search_domains:
- "{{ local_domain }}" - "{{ local_domain }}"
grub_cmdline: "systemd.unified_cgroup_hierarchy=0 console=tty0 console=ttyS{{ grub.serial_console[cluster_hardware].console }},115200 plymouth.ignore-serial-consoles splash" grub_cmdline: "systemd.unified_cgroup_hierarchy=1 console=tty0 console=ttyS{{ grub.serial_console[cluster_hardware].console }},115200 plymouth.ignore-serial-consoles splash"
grub_serial: "serial --unit={{ grub.serial_console[cluster_hardware].console }} --speed=115200" grub_serial: "serial --unit={{ grub.serial_console[cluster_hardware].console }} --speed=115200"
deploy_username: "deploy" deploy_username: "deploy"

View File

@ -37,18 +37,39 @@ pvc_api_database_name: "pvcapi"
pvc_api_database_user: "pvcapi" pvc_api_database_user: "pvcapi"
pvc_api_database_password: "PVCprovPassw0rd" pvc_api_database_password: "PVCprovPassw0rd"
# CPU tuning
# This is left commented so the section of the tasks never runs; enable or disable it in your per-cluster configs
# CPU tune options defined per-node are placed in the pvc_nodes section below under cpu_tuning; global versions
# may be placed here instead.
# Whether a node has hyperthreading is determined automatically; if so, for each (real) CPU core assigned to a
# subsection, its corresponding hyperthread is also assigned to that section.
# The VM CPU set (cpuset_vm) is later used in the templates; it is autogenerated from the remaining cores:
# total_cores - system_cpus - osd_cpus
#cpu_tuning:
# enabled: yes # Enable or disable CPU tuning for processes
# nodes: # Nodes configuration; default options, can be overridden by per-node tuning below
# system_cpus: 2 # The number of CPUs to assign to the "system" slice
# # This slice includes all non-VM, non-OSD processes including databases, node daemons, system processes, non-OSD Ceph processes, etc.
# # At least 2 cores should be assigned to this slice.
# osd_cpus: 2 # The number of CPUs to assign to the "osd" slice
# # This slice includes all OSD processes
# # At least 1 core per OSD should be assigned to this slice.
# Coordinators # Coordinators
pvc_nodes: pvc_nodes:
- hostname: "pvc1" - hostname: "pvc1" # The full ansible inventory hostname of the node
is_coordinator: yes is_coordinator: yes # If the node is a coordinator or not
node_id: 1 node_id: 1 # The sequential node ID, usually matches the numerical part of the hostname
router_id: "10.0.0.1" router_id: "10.0.0.1" # The router ID of the node; can be the IP Address of the Cluster network, or the node_id, or some other unique number
cluster_ip: "by-id" cluster_ip: "by-id" # The Cluster network IP of the host; by-id uses the network then adds the node as node_id within that network (e.g. pvc1 becomes x.y.z.1)
storage_ip: "by-id" storage_ip: "by-id" # The Storage network IP of the host; by-id as above
upstream_ip: "" upstream_ip: "" # The Upstream network IP of the host; by-id as above
ipmi_host: "pvc1-lom" ipmi_host: "pvc1-lom" # The IPMI hostname of the node
ipmi_user: "" ipmi_user: "" # The IPMI username to use
ipmi_password: "" ipmi_password: "" # The IPMI password to use
cpu_tuning: # Per-node CPU tuning options; if set, overrides the global options above; useful if a node has different CPU characteristics
system_cpus: 1
osd_cpus: 2
- hostname: "pvc2" - hostname: "pvc2"
is_coordinator: yes is_coordinator: yes
node_id: 2 node_id: 2
@ -56,9 +77,10 @@ pvc_nodes:
cluster_ip: "by-id" cluster_ip: "by-id"
storage_ip: "by-id" storage_ip: "by-id"
upstream_ip: "" upstream_ip: ""
ipmi_host: "pvc2-lom" ipmi:
ipmi_user: "" host: "pvc2-lom"
ipmi_password: "" user: ""
password: ""
- hostname: "pvc3" - hostname: "pvc3"
is_coordinator: yes is_coordinator: yes
node_id: 3 node_id: 3
@ -66,9 +88,10 @@ pvc_nodes:
cluster_ip: "by-id" cluster_ip: "by-id"
storage_ip: "by-id" storage_ip: "by-id"
upstream_ip: "" upstream_ip: ""
ipmi_host: "pvc3-lom" ipmi:
ipmi_user: "" host: "pvc3-lom"
ipmi_password: "" user: ""
password: ""
# Networks # Networks
pvc_asn: "65001" pvc_asn: "65001"

View File

@ -0,0 +1,18 @@
---
# Delete the slice units and the ceph-osd drop-in under /etc/systemd/system.
# Errors are ignored so that a failure on one path does not abort the play.
- name: remove cpu tuning configurations
  file:
    path: "{{ item }}"
    state: absent
  register: systemd
  ignore_errors: yes
  loop:
    - /etc/systemd/system/system.slice
    - /etc/systemd/system/user.slice
    - /etc/systemd/system/osd.slice
    - /etc/systemd/system/machine.slice
    - /etc/systemd/system/ceph-osd@.service.d/cputuning.conf

# Only reload when at least one of the files above was actually removed.
- name: reload systemd to apply changes
  command: systemctl daemon-reload
  when: systemd is changed

View File

@ -0,0 +1,100 @@
---
# Calculate the correct per-node cpu sets

# Start from the cluster-wide defaults in cpu_tuning.nodes.
- name: set global values
  set_fact:
    system_cpus: "{{ cpu_tuning.nodes.system_cpus }}"
    osd_cpus: "{{ cpu_tuning.nodes.osd_cpus }}"

# Renders the cpu_tuning section of the pvc_nodes entry whose hostname equals
# this_node; renders nothing when that entry has no cpu_tuning key.
- name: get per-node cpu tuning values
  set_fact:
    node_cpu_tuning: "{% for node in pvc_nodes if node.hostname == this_node %}{% if node.cpu_tuning is defined %}{{ node.cpu_tuning }}{% endif %}{% endfor %}"

# A per-node section may define only one of the two counts; any count it omits
# keeps the global value instead of failing on an undefined attribute.
- name: override global cpu counts with per-node values if set
  set_fact:
    system_cpus: "{{ node_cpu_tuning.system_cpus | default(system_cpus) }}"
    osd_cpus: "{{ node_cpu_tuning.osd_cpus | default(osd_cpus) }}"
  when: node_cpu_tuning is defined and node_cpu_tuning

# Read-only query: never report a change, and run in check mode too so the
# from_json parsing below always has output to work with.
- name: get node CPU details
  command: lscpu --json
  register: lscpu
  changed_when: false
  check_mode: no

# NOTE(review): the three queries below expect a flat "lscpu" list whose field
# names carry a trailing colon; confirm against the util-linux version on the
# target nodes.
- name: set sockets variable
  set_fact:
    sockets: "{{ (lscpu.stdout|from_json|json_query(query)|list)[0] }}"
  vars:
    query: "lscpu[?field == 'Socket(s):'].data"

- name: set cores_per_socket variable
  set_fact:
    cores_per_socket: "{{ (lscpu.stdout|from_json|json_query(query)|list)[0] }}"
  vars:
    query: "lscpu[?field == 'Core(s) per socket:'].data"

- name: set threads_per_core variable
  set_fact:
    threads_per_core: "{{ (lscpu.stdout|from_json|json_query(query)|list)[0] }}"
  vars:
    query: "lscpu[?field == 'Thread(s) per core:'].data"

- name: set total_cores variable
  set_fact:
    total_cores: "{{ sockets|int * cores_per_socket|int }}"

# Each cpuset below walks a range of core indexes and, for every core, steps by
# total_cores up to total_cores * threads_per_core, so that each sibling thread
# index (core + k * total_cores) lands in the same set as its core.
# "set _ = ..." performs the append without emitting any text into the result.
- name: craft the system cpuset (first <system_cpus> cores + any threads as applicable)
  set_fact:
    cpuset_system: "{%- set cores = [] -%}
                    {%- for rng in range(0, system_cpus|int) -%}
                    {%- for core in range(rng, total_cores|int * threads_per_core|int, total_cores|int) -%}
                    {%- set _ = cores.append(core) -%}
                    {%- endfor -%}
                    {%- endfor -%}
                    {{ cores|sort|join(',') }}"

- name: craft the osd cpuset (next <osd_cpus> cores + any threads as applicable)
  set_fact:
    cpuset_osd: "{%- set cores = [] -%}
                 {%- for rng in range(system_cpus|int, system_cpus|int + osd_cpus|int) -%}
                 {%- for core in range(rng, total_cores|int * threads_per_core|int, total_cores|int) -%}
                 {%- set _ = cores.append(core) -%}
                 {%- endfor -%}
                 {%- endfor -%}
                 {{ cores|sort|join(',') }}"

- name: craft the VM cpuset (remaining cores + any threads as applicable)
  set_fact:
    cpuset_vm: "{%- set cores = [] -%}
                {%- for rng in range(system_cpus|int + osd_cpus|int, total_cores|int) -%}
                {%- for core in range(rng, total_cores|int * threads_per_core|int, total_cores|int) -%}
                {%- set _ = cores.append(core) -%}
                {%- endfor -%}
                {%- endfor -%}
                {{ cores|sort|join(',') }}"

# Actually install the required components
- name: install slice tuning units
  template:
    src: "cputuning/{{ item }}.j2"
    dest: "/etc/systemd/system/{{ item }}"
  loop:
    - system.slice
    - user.slice
    - osd.slice
    - machine.slice
  register: systemd_slices

- name: create osd unit override configuration directory
  file:
    dest: /etc/systemd/system/ceph-osd@.service.d
    state: directory

- name: install osd cputuning configuration
  template:
    src: cputuning/ceph-osd@.service.d-cputuning.conf
    dest: /etc/systemd/system/ceph-osd@.service.d/cputuning.conf
  register: systemd_osdtuning

# Reload only when a unit file or the drop-in was actually rewritten.
- name: reload systemd to apply changes
  command: systemctl daemon-reload
  when: systemd_slices.changed or systemd_osdtuning.changed

View File

@ -0,0 +1,7 @@
---
# Run exactly one of the two task files, selected by cpu_tuning.enabled.
- when: cpu_tuning.enabled
  include: enable.yml

- when: not cpu_tuning.enabled
  include: disable.yml

View File

@ -56,6 +56,11 @@
- include: pvc/main.yml - include: pvc/main.yml
tags: pvc-daemon tags: pvc-daemon
# Install CPU tuning
- include: cputuning/main.yml
tags: pvc-cputuning
when: cpu_tuning is defined
- name: restart server on first install - name: restart server on first install
shell: 'sleep 3 && shutdown -r now "Ansible updates triggered"' shell: 'sleep 3 && shutdown -r now "Ansible updates triggered"'
async: 1 async: 1

View File

@ -0,0 +1,2 @@
# Assign the service to the osd slice.
[Service]
Slice=osd.slice

View File

@ -0,0 +1,13 @@
# PVC VM slice unit
# {{ ansible_managed }}
# NOTE(review): CPUAffinity= is documented in systemd.exec(5) and Delegate= is
# described for units that run processes; confirm systemd honours both in a
# [Slice] section rather than ignoring them with a warning.

[Unit]
Description = Virtual Machine and Container Slice
Documentation = man:systemd.special(7)
Before = slices.target

[Slice]
CPUAccounting = true
Delegate = true
CPUAffinity = {{ cpuset_vm }}
AllowedCPUs = {{ cpuset_vm }}

View File

@ -0,0 +1,13 @@
# PVC ceph-osd slice unit
# {{ ansible_managed }}
# NOTE(review): CPUAffinity= is documented in systemd.exec(5) and Delegate= is
# described for units that run processes; confirm systemd honours both in a
# [Slice] section rather than ignoring them with a warning.

[Unit]
Description = Ceph OSD Slice
Documentation = man:systemd.special(7)
Before = slices.target

[Slice]
CPUAccounting = true
Delegate = true
CPUAffinity = {{ cpuset_osd }}
AllowedCPUs = {{ cpuset_osd }}

View File

@ -0,0 +1,13 @@
# PVC core system slice unit
# {{ ansible_managed }}
# Pins the slice to the CPU list rendered from cpuset_system.
[Unit]
Description = Core System Slice
Documentation = man:systemd.special(7)
Before = slices.target
[Slice]
CPUAccounting = true
Delegate = true
CPUAffinity = {{ cpuset_system }}
AllowedCPUs = {{ cpuset_system }}

View File

@ -0,0 +1,13 @@
# PVC user and session slice unit
# {{ ansible_managed }}
# Uses the same CPU list as the system slice (cpuset_system).
[Unit]
Description = User and Session Slice
Documentation = man:systemd.special(7)
Before = slices.target
[Slice]
CPUAccounting = true
Delegate = true
CPUAffinity = {{ cpuset_system }}
AllowedCPUs = {{ cpuset_system }}