From 07d75573d675048d5dab534f9acdf37a4d27b6bc Mon Sep 17 00:00:00 2001 From: "Joshua M. Boniface" Date: Fri, 1 Sep 2023 15:42:29 -0400 Subject: [PATCH] Add updated tuning configuration Uses a much nicer CPU tuning configuration, leveraging systemd's AllowedCPUs and CPUAffinity options within a set of slices (some default, some custom). Configuration is also greatly simplified versus the previous implementation, simply asking for a number of CPUS for both the system and OSDs, and calculating everything else that is required. Also switches (back) to the v2 unified cgroup hierarchy by default as required by the systemd AllowedCPUs directive. --- group_vars/default/pvc.yml | 34 ++++++ roles/base/defaults/main.yml | 2 +- roles/pvc/defaults/main.yml | 55 +++++++--- roles/pvc/tasks/cputuning/disable.yml | 18 ++++ roles/pvc/tasks/cputuning/enable.yml | 100 ++++++++++++++++++ roles/pvc/tasks/cputuning/main.yml | 7 ++ roles/pvc/tasks/main.yml | 5 + .../ceph-osd@.service.d-cputuning.conf | 2 + .../pvc/templates/cputuning/machine.slice.j2 | 13 +++ roles/pvc/templates/cputuning/osd.slice.j2 | 13 +++ roles/pvc/templates/cputuning/system.slice.j2 | 13 +++ roles/pvc/templates/cputuning/user.slice.j2 | 13 +++ 12 files changed, 258 insertions(+), 17 deletions(-) create mode 100644 roles/pvc/tasks/cputuning/disable.yml create mode 100644 roles/pvc/tasks/cputuning/enable.yml create mode 100644 roles/pvc/tasks/cputuning/main.yml create mode 100644 roles/pvc/templates/cputuning/ceph-osd@.service.d-cputuning.conf create mode 100644 roles/pvc/templates/cputuning/machine.slice.j2 create mode 100644 roles/pvc/templates/cputuning/osd.slice.j2 create mode 100644 roles/pvc/templates/cputuning/system.slice.j2 create mode 100644 roles/pvc/templates/cputuning/user.slice.j2 diff --git a/group_vars/default/pvc.yml b/group_vars/default/pvc.yml index a7049d8..5da3fc0 100644 --- a/group_vars/default/pvc.yml +++ b/group_vars/default/pvc.yml @@ -87,6 +87,9 @@ pvc_nodes: ipmi_host: "{{ 
ipmi['hosts']['pvchv1']['hostname'] }}" # Note the node hostname key in here ipmi_user: "{{ ipmi['users']['pvc']['username'] }}" ipmi_password: "{{ ipmi['users']['pvc']['password'] }}" + cpu_tuning: # Example of cpu_tuning overrides per-node, only relevant if enabled; see below + system_cpus: 2 + osd_cpus: 2 - hostname: "pvchv2" # This name MUST match the Ansible inventory_hostname's first portion, i.e. "inventory_hostname.split('.')[0]" is_coordinator: yes node_id: 2 @@ -146,6 +149,37 @@ pvc_sriov_enable: False # Lowering the stack limit may cause poor performance or crashes in Zookeeper during some tasks. #pvc_zookeeper_stack_limit: 256M # 1/4 of default +# CPU tuning +# > ADVANCED TUNING: For most users, this is unnecessary and PVC will run fine with the default CPU +# allocations. Adjust these options only for clusters where CPU optimization is needed. +# > Defines CPU tuning/affinity options for various subsystems within PVC. This is useful to +# help limit the impact that noisy elements may have on other elements, e.g. busy VMs on +# OSDs, or system processes on latency-sensitive VMs. +# > To enable tuning, set enabled to yes. +# > Within "nodes", two counts are specified: +# * system_cpus: The number of CPUs to assign to the "system" slice, i.e. all non-VM, +# non-OSD processes on the system. Should usually be at least 2, and be +# higher on the coordinators of larger clusters (i.e. >5 nodes). +# * osd_cpus: The number of CPUs to assign to the "osd" slice, i.e. all OSD processes. +# Should be at least 1 per OSD, and ideally 2 per OSD for best performance. +# A third count, for the VM CPUs, is autogenerated based on the total node CPU count and +# the above two values (using all remaining CPUs). +# > Tuning is done based on cores; for systems with SMT (>1 thread-per-core), all SMTs within +# a given core are also assigned to the same CPU set. 
So for example, if the system assigns +# 2 system_cpus, there are 16 cores, and there are 2 threads per core, the list will be: +# 0,1,16,17 +# leveraging the assumption that Linux puts all cores before all threads. +# > This tuning section under "nodes" is global to the cluster; to override these values on +# a per-node basis, use the corresponding "cpu_tuning" section of a given "pvc_nodes" entry +# as shown below. +# > If disabled after being enabled, the tuning configurations on each node will be removed +# on the next run. A reboot of all nodes is required to fully disable the tuning. +cpu_tuning: + enabled: no + nodes: + system_cpus: 2 # Set based on your actual system configuration (min 2, increase on coordinators if many nodes) + osd_cpus: 2 # Set based on your actual number of OSDs + # Configuration file networks # > Taken from base.yml's configuration; DO NOT MODIFY THIS SECTION. pvc_upstream_device: "{{ networks['upstream']['device'] }}" diff --git a/roles/base/defaults/main.yml b/roles/base/defaults/main.yml index 556f952..e56a24f 100644 --- a/roles/base/defaults/main.yml +++ b/roles/base/defaults/main.yml @@ -12,7 +12,7 @@ recursive_dns_servers: recursive_dns_search_domains: - "{{ local_domain }}" -grub_cmdline: "systemd.unified_cgroup_hierarchy=0 console=tty0 console=ttyS{{ grub.serial_console[cluster_hardware].console }},115200 plymouth.ignore-serial-consoles splash" +grub_cmdline: "systemd.unified_cgroup_hierarchy=1 console=tty0 console=ttyS{{ grub.serial_console[cluster_hardware].console }},115200 plymouth.ignore-serial-consoles splash" grub_serial: "serial --unit={{ grub.serial_console[cluster_hardware].console }} --speed=115200" deploy_username: "deploy" diff --git a/roles/pvc/defaults/main.yml b/roles/pvc/defaults/main.yml index 1307ba4..565705a 100644 --- a/roles/pvc/defaults/main.yml +++ b/roles/pvc/defaults/main.yml @@ -37,18 +37,39 @@ pvc_api_database_name: "pvcapi" pvc_api_database_user: "pvcapi" pvc_api_database_password: "PVCprovPassw0rd" 
+# CPU tuning +# This is left commented so the section of the tasks never runs; enable or disable it in your per-cluster configs +# CPU tune options defined per-node are placed in the pvc_nodes section below under cpu_tuning; global versions +# may be placed here instead. +# Whether a node has hyperthreading is determined automatically; if so, for each (real) CPU core assigned to a +# subsection, its corresponding hyperthread is also assigned to that section. +# machine_cpus is later used in the templates; the value of this field is autogenerated as: +# total_system_cpus - osd_cpus - system_cpus +#cpu_tuning: +# enabled: yes # Enable or disable CPU tuning for processes +# nodes: # Nodes configuration; default options, can be overridden by per-node tuning below +# system_cpus: 2 # The number of CPUs to assign to the "system" slice +# # This slice includes all non-VM, non-OSD processes including databases, node daemons, system processes, non-OSD Ceph processes, etc. +# # At least 2 cores should be assigned to this slice. +# osd_cpus: 2 # The number of CPUs to assign to the "osd" slice +# # This slice includes all OSD processes +# # At least 1 core per OSD should be assigned to this slice. + # Coordinators pvc_nodes: - - hostname: "pvc1" - is_coordinator: yes - node_id: 1 - router_id: "10.0.0.1" - cluster_ip: "by-id" - storage_ip: "by-id" - upstream_ip: "" - ipmi_host: "pvc1-lom" - ipmi_user: "" - ipmi_password: "" + - hostname: "pvc1" # The full ansible inventory hostname of the node + is_coordinator: yes # If the node is a coordinator or not + node_id: 1 # The sequential node ID, usually matches the numerical part of the hostname + router_id: "10.0.0.1" # The router ID of the node; can be the IP Address of the Cluster network, or the node_id, or some other unique number + cluster_ip: "by-id" # The Cluster network IP of the host; by-id uses the network then adds the node as node_id within that network (e.g. 
pvc1 becomes x.y.z.1) + storage_ip: "by-id" # The Storage network IP of the host; by-id as above + upstream_ip: "" # The Upstream network IP of the host; by-id as above + ipmi_host: "pvc1-lom" # The IPMI hostname of the node + ipmi_user: "" # The IPMI username to use + ipmi_password: "" # The IPMI password to use + cpu_tuning: # Per-node CPU tuning options; if set, overrides the global options above; useful if a node has different CPU characteristics + system_cpus: 1 + osd_cpus: 2 - hostname: "pvc2" is_coordinator: yes node_id: 2 diff --git a/roles/pvc/tasks/cputuning/disable.yml b/roles/pvc/tasks/cputuning/disable.yml new file mode 100644 index 0000000..b0c9a42 --- /dev/null +++ b/roles/pvc/tasks/cputuning/disable.yml @@ -0,0 +1,18 @@ +--- + +- name: remove cpu tuning configurations + file: + dest: "{{ item }}" + state: absent + loop: + - /etc/systemd/system/system.slice + - /etc/systemd/system/user.slice + - /etc/systemd/system/osd.slice + - /etc/systemd/system/machine.slice + - /etc/systemd/system/ceph-osd@.service.d/cputuning.conf + register: systemd + ignore_errors: yes + +- name: reload systemd to apply changes + command: systemctl daemon-reload + when: systemd.changed diff --git a/roles/pvc/tasks/cputuning/enable.yml b/roles/pvc/tasks/cputuning/enable.yml new file mode 100644 index 0000000..040eb3b --- /dev/null +++ b/roles/pvc/tasks/cputuning/enable.yml @@ -0,0 +1,100 @@ +--- + +# Calculate the correct per-node cpu sets +- name: set global values + set_fact: + system_cpus: "{{ 
cpu_tuning.nodes.system_cpus }}" + osd_cpus: "{{ cpu_tuning.nodes.osd_cpus }}" + +- name: get per-node cpu tuning values + set_fact: + node_cpu_tuning: "{% for node in pvc_nodes if node.hostname == this_node %}{% if node.cpu_tuning is defined %}{{ node.cpu_tuning }}{% endif %}{% endfor %}" + +- name: override global system_cpus value if set + set_fact: + system_cpus: "{{ node_cpu_tuning.system_cpus }}" + osd_cpus: "{{ node_cpu_tuning.osd_cpus }}" + when: node_cpu_tuning is defined and node_cpu_tuning + +- name: get node CPU details + command: lscpu --json + register: lscpu + +- name: set sockets variable + set_fact: + sockets: "{{ (lscpu.stdout|from_json|json_query(query)|list)[0] }}" + vars: + query: "lscpu[?field == 'Socket(s):'].data" + +- name: set cores_per_socket variable + set_fact: + cores_per_socket: "{{ (lscpu.stdout|from_json|json_query(query)|list)[0] }}" + vars: + query: "lscpu[?field == 'Core(s) per socket:'].data" + +- name: set threads_per_core variable + set_fact: + threads_per_core: "{{ (lscpu.stdout|from_json|json_query(query)|list)[0] }}" + vars: + query: "lscpu[?field == 'Thread(s) per core:'].data" + +- name: set total_cores variable + set_fact: + total_cores: "{{ sockets|int * cores_per_socket|int }}" + +- name: craft the system cpuset (first cores + any threads as applicable) + set_fact: + cpuset_system: "{%- set cores = [] -%} + {%- for rng in range(0, system_cpus|int) -%} + {%- for core in range(rng, total_cores|int * threads_per_core|int, total_cores|int) -%} + {{ cores.append(core) }} + {%- endfor -%} + {%- endfor -%} + {{ cores|sort|join(',') }}" + +- name: craft the osd cpuset (next cores + any threads as applicable) + set_fact: + cpuset_osd: "{%- set cores = [] -%} + {%- for rng in range(system_cpus|int, system_cpus|int + osd_cpus|int) -%} + {%- for core in range(rng, total_cores|int * threads_per_core|int, total_cores|int) -%} + {{ cores.append(core) }} + {%- endfor -%} + {%- endfor -%} + {{ cores|sort|join(',') }}" + +- name: craft 
the VM cpuset (remaining cores + any threads as applicable) + set_fact: + cpuset_vm: "{%- set cores = [] -%} + {%- for rng in range(system_cpus|int + osd_cpus|int, total_cores|int) -%} + {%- for core in range(rng, total_cores|int * threads_per_core|int, total_cores|int) -%} + {{ cores.append(core) }} + {%- endfor -%} + {%- endfor -%} + {{ cores|sort|join(',') }}" + +# Actually install the required components +- name: install slice tuning units + template: + src: "cputuning/{{ item }}.j2" + dest: "/etc/systemd/system/{{ item }}" + loop: + - system.slice + - user.slice + - osd.slice + - machine.slice + register: systemd_slices + +- name: create osd unit override configuration directory + file: + dest: /etc/systemd/system/ceph-osd@.service.d + state: directory + +- name: install osd cputuning configuration + template: + src: cputuning/ceph-osd@.service.d-cputuning.conf + dest: /etc/systemd/system/ceph-osd@.service.d/cputuning.conf + register: systemd_osdtuning + +- name: reload systemd to apply changes + command: systemctl daemon-reload + when: systemd_slices.changed or systemd_osdtuning.changed diff --git a/roles/pvc/tasks/cputuning/main.yml b/roles/pvc/tasks/cputuning/main.yml new file mode 100644 index 0000000..38106a1 --- /dev/null +++ b/roles/pvc/tasks/cputuning/main.yml @@ -0,0 +1,7 @@ +--- + +- include: enable.yml + when: cpu_tuning.enabled + +- include: disable.yml + when: not cpu_tuning.enabled diff --git a/roles/pvc/tasks/main.yml b/roles/pvc/tasks/main.yml index b6aeba0..74d002a 100644 --- a/roles/pvc/tasks/main.yml +++ b/roles/pvc/tasks/main.yml @@ -56,6 +56,11 @@ - include: pvc/main.yml tags: pvc-daemon +# Install CPU tuning +- include: cputuning/main.yml + tags: pvc-cputuning + when: cpu_tuning is defined + - name: restart server on first install shell: 'sleep 3 && shutdown -r now "Ansible updates triggered"' async: 1 diff --git a/roles/pvc/templates/cputuning/ceph-osd@.service.d-cputuning.conf 
b/roles/pvc/templates/cputuning/ceph-osd@.service.d-cputuning.conf new file mode 100644 index 0000000..cdc253a --- /dev/null +++ b/roles/pvc/templates/cputuning/ceph-osd@.service.d-cputuning.conf @@ -0,0 +1,2 @@ +[Service] +Slice = osd.slice diff --git a/roles/pvc/templates/cputuning/machine.slice.j2 b/roles/pvc/templates/cputuning/machine.slice.j2 new file mode 100644 index 0000000..ffd2a23 --- /dev/null +++ b/roles/pvc/templates/cputuning/machine.slice.j2 @@ -0,0 +1,13 @@ +# PVC VM slice unit +# {{ ansible_managed }} + +[Unit] +Description=Virtual Machine and Container Slice +Documentation=man:systemd.special(7) +Before=slices.target + +[Slice] +CPUAccounting = true +Delegate = true +CPUAffinity = {{ cpuset_vm }} +AllowedCPUs = {{ cpuset_vm }} diff --git a/roles/pvc/templates/cputuning/osd.slice.j2 b/roles/pvc/templates/cputuning/osd.slice.j2 new file mode 100644 index 0000000..3df541b --- /dev/null +++ b/roles/pvc/templates/cputuning/osd.slice.j2 @@ -0,0 +1,13 @@ +# PVC ceph-osd slice unit +# {{ ansible_managed }} + +[Unit] +Description=Ceph OSD Slice +Documentation=man:systemd.special(7) +Before=slices.target + +[Slice] +CPUAccounting=true +Delegate = true +CPUAffinity = {{ cpuset_osd }} +AllowedCPUs = {{ cpuset_osd }} diff --git a/roles/pvc/templates/cputuning/system.slice.j2 b/roles/pvc/templates/cputuning/system.slice.j2 new file mode 100644 index 0000000..c03b917 --- /dev/null +++ b/roles/pvc/templates/cputuning/system.slice.j2 @@ -0,0 +1,13 @@ +# PVC system slice unit +# {{ ansible_managed }} + +[Unit] +Description = Core System Slice +Documentation = man:systemd.special(7) +Before = slices.target + +[Slice] +CPUAccounting = true +Delegate = true +CPUAffinity = {{ cpuset_system }} +AllowedCPUs = {{ cpuset_system }} diff --git a/roles/pvc/templates/cputuning/user.slice.j2 b/roles/pvc/templates/cputuning/user.slice.j2 new file mode 100644 index 0000000..9c08efe --- /dev/null +++ b/roles/pvc/templates/cputuning/user.slice.j2 @@ -0,0 +1,13 @@ +# PVC user slice unit 
+# {{ ansible_managed }} + +[Unit] +Description = User and Session Slice +Documentation = man:systemd.special(7) +Before = slices.target + +[Slice] +CPUAccounting = true +Delegate = true +CPUAffinity = {{ cpuset_system }} +AllowedCPUs = {{ cpuset_system }}