Add updated tuning configuration
Uses a much nicer CPU tuning configuration, leveraging systemd's AllowedCPUs and CPUAffinity options within a set of slices (some default, some custom). Configuration is also greatly simplified versus the previous implementation: it simply asks for a number of CPUs for the system and for the OSDs, and calculates everything else that is required. Also switches (back) to the v2 unified cgroup hierarchy by default, as required by the systemd AllowedCPUs directive.
parent e52c46f68d
commit c5ec0f4f62
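To make the calculation concrete before the diffs: a minimal sketch, not part of this commit, of the cpuset math on a hypothetical node with 1 socket, 16 cores, and 2 threads per core. It uses the same nested-range Jinja pattern as the new enable tasks shown further down; all values here are assumptions for illustration.

- hosts: localhost
  gather_facts: no
  vars:
    total_cores: 16       # assumed: 1 socket x 16 cores
    threads_per_core: 2   # assumed: SMT enabled
    system_cpus: 2        # first two cores reserved for the system slice
  tasks:
    - name: compute the system cpuset the same way the enable tasks do
      debug:
        msg: "{%- set cores = [] -%}
              {%- for rng in range(0, system_cpus|int) -%}
              {%- for core in range(rng, total_cores|int * threads_per_core|int, total_cores|int) -%}
              {%- set _ = cores.append(core) -%}
              {%- endfor -%}
              {%- endfor -%}
              {{ cores|sort|join(',') }}"

Run with ansible-playbook, the debug task prints "0,1,16,17": cores 0 and 1 plus their SMT siblings 16 and 17, matching the worked example in the comments below.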
@@ -87,6 +89,9 @@ pvc_nodes:
       ipmi_host: "{{ ipmi['hosts']['pvchv1']['hostname'] }}" # Note the node hostname key in here
       ipmi_user: "{{ ipmi['users']['pvc']['username'] }}"
       ipmi_password: "{{ ipmi['users']['pvc']['password'] }}"
+      cpu_tuning: # Example of cpu_tuning overrides per-node, only relevant if enabled; see below
+        system_cpus: 2
+        osd_cpus: 2
   - hostname: "pvchv2" # This name MUST match the Ansible inventory_hostname's first portion, i.e. "inventory_hostname.split('.')[0]"
     is_coordinator: yes
     node_id: 2
@@ -146,6 +149,37 @@ pvc_sriov_enable: False
 # Lowering the stack limit may cause poor performance or crashes in Zookeeper during some tasks.
 #pvc_zookeeper_stack_limit: 256M # 1/4 of default
 
+# CPU tuning
+# > ADVANCED TUNING: For most users, this is unnecessary and PVC will run fine with the default CPU
+#   allocations. Adjust these options only for clusters where CPU optimization is needed.
+# > Defines CPU tuning/affinity options for various subsystems within PVC. This is useful to
+#   help limit the impact that noisy elements may have on other elements, e.g. busy VMs on
+#   OSDs, or system processes on latency-sensitive VMs.
+# > To enable tuning, set enabled to yes.
+# > Within "nodes", two counts are specified:
+#   * system_cpus: The number of CPUs to assign to the "system" slice, i.e. all non-VM,
+#                  non-OSD processes on the system. Should usually be at least 2, and be
+#                  higher on the coordinators of larger clusters (i.e. >5 nodes).
+#   * osd_cpus: The number of CPUs to assign to the "osd" slice, i.e. all OSD processes.
+#               Should be at least 1 per OSD, and ideally 2 per OSD for best performance.
+#   A third count, for the VM CPUs, is autogenerated based on the total node CPU count and
+#   the above two values (using all remaining CPUs).
+# > Tuning is done based on cores; for systems with SMT (>1 thread-per-core), all SMTs within
+#   a given core are also assigned to the same CPU set. So for example, if the system assigns
+#   2 system_cpus, there are 16 cores, and there are 2 threads per core, the list will be:
+#     0,1,16,17
+#   leveraging the assumption that Linux puts all cores before all threads.
+# > This tuning section under "nodes" is global to the cluster; to override these values on
+#   a per-node basis, use the corresponding "cpu_tuning" section of a given "pvc_nodes" entry
+#   as shown below.
+# > If disabled after being enabled, the tuning configurations on each node will be removed
+#   on the next run. A reboot of all nodes is required to fully disable the tuning.
+cpu_tuning:
+  enabled: no
+  nodes:
+    system_cpus: 2 # Set based on your actual system configuration (min 2, increase on coordinators if many nodes)
+    osd_cpus: 2 # Set based on your actual number of OSDs
+
 # Configuration file networks
 # > Taken from base.yml's configuration; DO NOT MODIFY THIS SECTION.
 pvc_upstream_device: "{{ networks['upstream']['device'] }}"
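The "cores before threads" enumeration assumed by that example holds on typical Linux x86 systems, and can be confirmed from sysfs; a hypothetical one-task check, not part of this commit:

- name: show the SMT siblings of cpu0
  command: cat /sys/devices/system/cpu/cpu0/topology/thread_siblings_list
  register: cpu0_siblings
  # On the assumed 16-core/2-thread layout this prints "0,16"; a pair like
  # "0,1" would instead indicate sibling threads are numbered adjacently.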
@@ -12,7 +12,7 @@ recursive_dns_servers:
 recursive_dns_search_domains:
   - "{{ local_domain }}"
 
-grub_cmdline: "systemd.unified_cgroup_hierarchy=0 console=tty0 console=ttyS{{ grub.serial_console[cluster_hardware].console }},115200 plymouth.ignore-serial-consoles splash"
+grub_cmdline: "systemd.unified_cgroup_hierarchy=1 console=tty0 console=ttyS{{ grub.serial_console[cluster_hardware].console }},115200 plymouth.ignore-serial-consoles splash"
 grub_serial: "serial --unit={{ grub.serial_console[cluster_hardware].console }} --speed=115200"
 
 deploy_username: "deploy"
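AllowedCPUs is a cpuset control that only exists on the unified (v2) cgroup hierarchy, which is why grub_cmdline flips systemd.unified_cgroup_hierarchy from 0 to 1 here. After the reboot, the active hierarchy can be verified; a small sketch, with the task name assumed:

- name: report which cgroup filesystem is mounted
  command: stat -fc %T /sys/fs/cgroup/
  register: cgroup_fs
  # cgroup_fs.stdout is "cgroup2fs" on the unified hierarchy; "tmpfs" would
  # mean the node is still on the legacy/hybrid layout and needs the reboot.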
@@ -37,18 +37,39 @@ pvc_api_database_name: "pvcapi"
 pvc_api_database_user: "pvcapi"
 pvc_api_database_password: "PVCprovPassw0rd"
 
+# CPU tuning
+# This is left commented so the section of the tasks never runs; enable or disable it in your per-cluster configs.
+# CPU tuning options defined per-node are placed in the pvc_nodes section below under cpu_tuning; global versions
+# may be placed here instead.
+# Whether a node has hyperthreading is determined automatically; if so, for each (real) CPU core assigned to a
+# subsection, its corresponding hyperthread is also assigned to that section.
+# machine_cpus is later used in the templates; the value of this field is autogenerated as:
+#   total_system_cpus - osd_cpus - system_cpus
+#cpu_tuning:
+#  enabled: yes # Enable or disable CPU tuning for processes
+#  nodes: # Nodes configuration; default options, can be overridden by per-node tuning below
+#    system_cpus: 2 # The number of CPUs to assign to the "system" slice
+#                   # This slice includes all non-VM, non-OSD processes including databases, node daemons, system processes, non-OSD Ceph processes, etc.
+#                   # At least 2 cores should be assigned to this slice.
+#    osd_cpus: 2 # The number of CPUs to assign to the "osd" slice
+#                # This slice includes all OSD processes.
+#                # At least 1 core per OSD should be assigned to this slice.
+
 # Coordinators
 pvc_nodes:
-  - hostname: "pvc1"
-    is_coordinator: yes
-    node_id: 1
-    router_id: "10.0.0.1"
-    cluster_ip: "by-id"
-    storage_ip: "by-id"
-    upstream_ip: ""
-    ipmi_host: "pvc1-lom"
-    ipmi_user: ""
-    ipmi_password: ""
+  - hostname: "pvc1" # The full ansible inventory hostname of the node
+    is_coordinator: yes # If the node is a coordinator or not
+    node_id: 1 # The sequential node ID, usually matches the numerical part of the hostname
+    router_id: "10.0.0.1" # The router ID of the node; can be the IP address of the Cluster network, or the node_id, or some other unique number
+    cluster_ip: "by-id" # The Cluster network IP of the host; by-id derives the address from the network plus the node_id (e.g. pvc1 becomes x.y.z.1)
+    storage_ip: "by-id" # The Storage network IP of the host; by-id as above
+    upstream_ip: "" # The Upstream network IP of the host; by-id as above
+    ipmi_host: "pvc1-lom" # The IPMI hostname of the node
+    ipmi_user: "" # The IPMI username to use
+    ipmi_password: "" # The IPMI password to use
+    cpu_tuning: # Per-node CPU tuning options; if set, overrides the global options above; useful if a node has different CPU characteristics
+      system_cpus: 1
+      osd_cpus: 2
   - hostname: "pvc2"
     is_coordinator: yes
     node_id: 2
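The machine_cpus value described in the comments is simply the remainder after the other two counts; a sketch of that derivation, reusing the names from the comments above (total_system_cpus is assumed to already hold the node's total core count):

- name: derive the VM (machine) core count
  set_fact:
    machine_cpus: "{{ total_system_cpus|int - osd_cpus|int - system_cpus|int }}"
  # e.g. 16 total cores - 2 osd_cpus - 2 system_cpus = 12 cores
  # (24 CPU threads with SMT) left over for VMs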
@@ -56,9 +77,10 @@ pvc_nodes:
     cluster_ip: "by-id"
     storage_ip: "by-id"
     upstream_ip: ""
-    ipmi_host: "pvc2-lom"
-    ipmi_user: ""
-    ipmi_password: ""
+    ipmi:
+      host: "pvc2-lom"
+      user: ""
+      password: ""
   - hostname: "pvc3"
     is_coordinator: yes
     node_id: 3
@@ -66,9 +88,10 @@ pvc_nodes:
     cluster_ip: "by-id"
     storage_ip: "by-id"
     upstream_ip: ""
-    ipmi_host: "pvc3-lom"
-    ipmi_user: ""
-    ipmi_password: ""
+    ipmi:
+      host: "pvc3-lom"
+      user: ""
+      password: ""
 
 # Networks
 pvc_asn: "65001"
@@ -0,0 +1,18 @@
+---
+
+- name: remove cpu tuning configurations
+  file:
+    dest: "{{ item }}"
+    state: absent
+  loop:
+    - /etc/systemd/system/system.slice
+    - /etc/systemd/system/user.slice
+    - /etc/systemd/system/osd.slice
+    - /etc/systemd/system/machine.slice
+    - /etc/systemd/system/ceph-osd@.service.d/cputuning.conf
+  register: systemd
+  ignore_errors: yes
+
+- name: reload systemd to apply changes
+  command: systemctl daemon-reload
+  when: systemd.changed
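Deleting the unit files and reloading systemd only stops the restrictions from being re-applied; already-running processes keep their current affinity, hence the reboot requirement noted in the group_vars comments. On a live node, a single slice's restriction could likely also be relaxed by hand; a hedged sketch, assuming the example node's 32 CPU threads:

- name: widen the machine slice back to all CPUs at runtime
  command: systemctl set-property machine.slice AllowedCPUs=0-31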
@@ -0,0 +1,100 @@
+---
+
+# Calculate the correct per-node cpu sets
+- name: set global values
+  set_fact:
+    system_cpus: "{{ cpu_tuning.nodes.system_cpus }}"
+    osd_cpus: "{{ cpu_tuning.nodes.osd_cpus }}"
+
+- name: get per-node cpu tuning values
+  set_fact:
+    node_cpu_tuning: "{% for node in pvc_nodes if node.hostname == this_node %}{% if node.cpu_tuning is defined %}{{ node.cpu_tuning }}{% endif %}{% endfor %}"
+
+- name: override global cpu tuning values if set per-node
+  set_fact:
+    system_cpus: "{{ node_cpu_tuning.system_cpus }}"
+    osd_cpus: "{{ node_cpu_tuning.osd_cpus }}"
+  when: node_cpu_tuning is defined and node_cpu_tuning
+
+- name: get node CPU details
+  command: lscpu --json
+  register: lscpu
+
+- name: set sockets variable
+  set_fact:
+    sockets: "{{ (lscpu.stdout|from_json|json_query(query)|list)[0] }}"
+  vars:
+    query: "lscpu[?field == 'Socket(s):'].data"
+
+- name: set cores_per_socket variable
+  set_fact:
+    cores_per_socket: "{{ (lscpu.stdout|from_json|json_query(query)|list)[0] }}"
+  vars:
+    query: "lscpu[?field == 'Core(s) per socket:'].data"
+
+- name: set threads_per_core variable
+  set_fact:
+    threads_per_core: "{{ (lscpu.stdout|from_json|json_query(query)|list)[0] }}"
+  vars:
+    query: "lscpu[?field == 'Thread(s) per core:'].data"
+
+- name: set total_cores variable
+  set_fact:
+    total_cores: "{{ sockets|int * cores_per_socket|int }}"
+
+- name: craft the system cpuset (first <system_cpus> cores + any threads as applicable)
+  set_fact:
+    cpuset_system: "{%- set cores = [] -%}
+                    {%- for rng in range(0, system_cpus|int) -%}
+                    {%- for core in range(rng, total_cores|int * threads_per_core|int, total_cores|int) -%}
+                    {%- set _ = cores.append(core) -%}
+                    {%- endfor -%}
+                    {%- endfor -%}
+                    {{ cores|sort|join(',') }}"
+
+- name: craft the osd cpuset (next <osd_cpus> cores + any threads as applicable)
+  set_fact:
+    cpuset_osd: "{%- set cores = [] -%}
+                 {%- for rng in range(system_cpus|int, system_cpus|int + osd_cpus|int) -%}
+                 {%- for core in range(rng, total_cores|int * threads_per_core|int, total_cores|int) -%}
+                 {%- set _ = cores.append(core) -%}
+                 {%- endfor -%}
+                 {%- endfor -%}
+                 {{ cores|sort|join(',') }}"
+
+- name: craft the VM cpuset (remaining cores + any threads as applicable)
+  set_fact:
+    cpuset_vm: "{%- set cores = [] -%}
+                {%- for rng in range(system_cpus|int + osd_cpus|int, total_cores|int) -%}
+                {%- for core in range(rng, total_cores|int * threads_per_core|int, total_cores|int) -%}
+                {%- set _ = cores.append(core) -%}
+                {%- endfor -%}
+                {%- endfor -%}
+                {{ cores|sort|join(',') }}"
+
+# Actually install the required components
+- name: install slice tuning units
+  template:
+    src: "cputuning/{{ item }}.j2"
+    dest: "/etc/systemd/system/{{ item }}"
+  loop:
+    - system.slice
+    - user.slice
+    - osd.slice
+    - machine.slice
+  register: systemd_slices
+
+- name: create osd unit override configuration directory
+  file:
+    dest: /etc/systemd/system/ceph-osd@.service.d
+    state: directory
+
+- name: install osd cputuning configuration
+  template:
+    src: cputuning/ceph-osd@.service.d-cputuning.conf
+    dest: /etc/systemd/system/ceph-osd@.service.d/cputuning.conf
+  register: systemd_osdtuning
+
+- name: reload systemd to apply changes
+  command: systemctl daemon-reload
+  when: systemd_slices.changed or systemd_osdtuning.changed
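For reference when reading the json_query expressions above, lscpu --json emits a list of field/data pairs, roughly as in this abridged sample (values are from the assumed example node; the exact layout varies by util-linux version). A hypothetical follow-up task also shows how the crafted sets could be sanity-checked:

# Abridged shape of `lscpu --json` that the queries match against:
#   {"lscpu": [
#     {"field": "Socket(s):", "data": "1"},
#     {"field": "Core(s) per socket:", "data": "16"},
#     {"field": "Thread(s) per core:", "data": "2"}
#   ]}
- name: assert the system and osd cpusets do not overlap
  assert:
    that:
      - cpuset_system.split(',') | intersect(cpuset_osd.split(',')) | length == 0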
@@ -0,0 +1,7 @@
+---
+
+- include: enable.yml
+  when: cpu_tuning.enabled
+
+- include: disable.yml
+  when: not cpu_tuning.enabled
@@ -56,6 +56,11 @@
 - include: pvc/main.yml
   tags: pvc-daemon
 
+# Install CPU tuning
+- include: cputuning/main.yml
+  tags: pvc-cputuning
+  when: cpu_tuning is defined
+
 - name: restart server on first install
   shell: 'sleep 3 && shutdown -r now "Ansible updates triggered"'
   async: 1
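Usage note: because the include is gated on cpu_tuning being defined and tagged pvc-cputuning, the tuning tasks can be re-run on their own with something like ansible-playbook pvc.yml -l <cluster> --tags pvc-cputuning (playbook name assumed).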
@@ -0,0 +1,2 @@
+[Service]
+Slice = osd.slice
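This drop-in re-parents every ceph-osd@ instance into the new osd.slice. After an OSD unit restart, the placement can be confirmed; a sketch with an assumed OSD ID of 0:

- name: confirm the OSD runs under osd.slice
  command: systemctl show -p Slice ceph-osd@0.service
  register: osd_slice
  # osd_slice.stdout should read "Slice=osd.slice" once the drop-in
  # has been picked up and the unit restarted.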
@@ -0,0 +1,13 @@
+# PVC VM slice unit
+# {{ ansible_managed }}
+
+[Unit]
+Description = Virtual Machine and Container Slice
+Documentation = man:systemd.special(7)
+Before = slices.target
+
+[Slice]
+CPUAccounting = true
+Delegate = true
+CPUAffinity = {{ cpuset_vm }}
+AllowedCPUs = {{ cpuset_vm }}
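Rendered against the assumed 16-core example node, where the VM slice gets cores 4-15 plus their SMT siblings, the template above would come out roughly as follows (the cpuset values are illustrative, not from this commit):

# PVC VM slice unit
# Ansible managed
[Unit]
Description = Virtual Machine and Container Slice
Documentation = man:systemd.special(7)
Before = slices.target

[Slice]
CPUAccounting = true
Delegate = true
CPUAffinity = 4,5,6,7,8,9,10,11,12,13,14,15,20,21,22,23,24,25,26,27,28,29,30,31
AllowedCPUs = 4,5,6,7,8,9,10,11,12,13,14,15,20,21,22,23,24,25,26,27,28,29,30,31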
@@ -0,0 +1,13 @@
+# PVC ceph-osd slice unit
+# {{ ansible_managed }}
+
+[Unit]
+Description = Ceph OSD Slice
+Documentation = man:systemd.special(7)
+Before = slices.target
+
+[Slice]
+CPUAccounting = true
+Delegate = true
+CPUAffinity = {{ cpuset_osd }}
+AllowedCPUs = {{ cpuset_osd }}
@@ -0,0 +1,13 @@
+# PVC system slice unit
+# {{ ansible_managed }}
+
+[Unit]
+Description = Core System Slice
+Documentation = man:systemd.special(7)
+Before = slices.target
+
+[Slice]
+CPUAccounting = true
+Delegate = true
+CPUAffinity = {{ cpuset_system }}
+AllowedCPUs = {{ cpuset_system }}
@@ -0,0 +1,13 @@
+# PVC user slice unit
+# {{ ansible_managed }}
+
+[Unit]
+Description = User and Session Slice
+Documentation = man:systemd.special(7)
+Before = slices.target
+
+[Slice]
+CPUAccounting = true
+Delegate = true
+CPUAffinity = {{ cpuset_system }}
+AllowedCPUs = {{ cpuset_system }}