diff --git a/group_vars/default/pvc.yml b/group_vars/default/pvc.yml
index a7049d8..5da3fc0 100644
--- a/group_vars/default/pvc.yml
+++ b/group_vars/default/pvc.yml
@@ -87,6 +87,9 @@ pvc_nodes:
     ipmi_host: "{{ ipmi['hosts']['pvchv1']['hostname'] }}" # Note the node hostname key in here
     ipmi_user: "{{ ipmi['users']['pvc']['username'] }}"
     ipmi_password: "{{ ipmi['users']['pvc']['password'] }}"
+    cpu_tuning: # Example of per-node cpu_tuning overrides; only relevant if tuning is enabled, see below
+      system_cpus: 2
+      osd_cpus: 2
   - hostname: "pvchv2" # This name MUST match the Ansible inventory_hostname's first portion, i.e. "inventory_hostname.split('.')[0]"
     is_coordinator: yes
     node_id: 2
@@ -146,6 +149,37 @@ pvc_sriov_enable: False
 # Lowering the stack limit may cause poor performance or crashes in Zookeeper during some tasks.
 #pvc_zookeeper_stack_limit: 256M # 1/4 of default
 
+# CPU tuning
+# > ADVANCED TUNING: For most users this is unnecessary, and PVC will run fine with the default CPU
+#   allocations. Adjust these options only for clusters where CPU optimization is needed.
+# > Defines CPU tuning/affinity options for the various subsystems within PVC. This is useful to
+#   limit the impact that noisy elements may have on other elements, e.g. busy VMs on OSDs, or
+#   system processes on latency-sensitive VMs.
+# > To enable tuning, set enabled to yes.
+# > Within "nodes", two counts are specified:
+#   * system_cpus: The number of CPUs to assign to the "system" slice, i.e. all non-VM,
+#                  non-OSD processes on the system. Should usually be at least 2, and be
+#                  higher on the coordinators of larger clusters (i.e. >5 nodes).
+#   * osd_cpus:    The number of CPUs to assign to the "osd" slice, i.e. all OSD processes.
+#                  Should be at least 1 per OSD, and ideally 2 per OSD for best performance.
+#   A third count, for the VM CPUs, is autogenerated from the total node CPU count and the
+#   above two values (all remaining CPUs are given to VMs).
+# > Tuning is done by core; on systems with SMT (>1 thread per core), all SMT threads within
+#   a given core are assigned to the same CPU set. For example, with 2 system_cpus, 16 cores,
+#   and 2 threads per core, the "system" list will be:
+#     0,1,16,17
+#   leveraging the assumption that Linux numbers all cores before all threads.
+# > This tuning section under "nodes" is global to the cluster; to override these values on
+#   a per-node basis, use the corresponding "cpu_tuning" section of a given "pvc_nodes" entry,
+#   as shown above.
+# > If disabled after being enabled, the tuning configurations on each node will be removed
+#   on the next run. A reboot of all nodes is required to fully disable the tuning.
+cpu_tuning:
+  enabled: no
+  nodes:
+    system_cpus: 2 # Set based on your actual system configuration (min 2, higher on coordinators of large clusters)
+    osd_cpus: 2 # Set based on your actual number of OSDs (1-2 per OSD)
+
 # Configuration file networks
 # > Taken from base.yml's configuration; DO NOT MODIFY THIS SECTION.
 pvc_upstream_device: "{{ networks['upstream']['device'] }}"
diff --git a/roles/base/defaults/main.yml b/roles/base/defaults/main.yml
index 556f952..e56a24f 100644
--- a/roles/base/defaults/main.yml
+++ b/roles/base/defaults/main.yml
@@ -12,7 +12,7 @@ recursive_dns_servers:
 recursive_dns_search_domains:
   - "{{ local_domain }}"
 
-grub_cmdline: "systemd.unified_cgroup_hierarchy=0 console=tty0 console=ttyS{{ grub.serial_console[cluster_hardware].console }},115200 plymouth.ignore-serial-consoles splash"
+grub_cmdline: "systemd.unified_cgroup_hierarchy=1 console=tty0 console=ttyS{{ grub.serial_console[cluster_hardware].console }},115200 plymouth.ignore-serial-consoles splash"
 grub_serial: "serial --unit={{ grub.serial_console[cluster_hardware].console }} --speed=115200"
 
 deploy_username: "deploy"
diff --git a/roles/pvc/defaults/main.yml b/roles/pvc/defaults/main.yml
index 1307ba4..565705a 100644
--- a/roles/pvc/defaults/main.yml
+++ b/roles/pvc/defaults/main.yml
@@ -37,18 +37,39 @@ pvc_api_database_name: "pvcapi"
 pvc_api_database_user: "pvcapi"
 pvc_api_database_password: "PVCprovPassw0rd"
 
+# CPU tuning
+# This is left commented out so that this section of the tasks never runs by default; enable or disable it in your per-cluster configs.
+# CPU tuning options defined per-node are placed in the pvc_nodes section below under cpu_tuning; global versions
+# may be placed here instead.
+# Whether a node has hyperthreading is determined automatically; if so, for each (real) CPU core assigned to a
+# subsection, its corresponding hyperthread is also assigned to that section.
+# machine_cpus is later used in the templates; the value of this field is autogenerated as:
+#   total_system_cpus - osd_cpus - system_cpus
+#cpu_tuning:
+#  enabled: yes      # Enable or disable CPU tuning for processes
+#  nodes:            # Default node options; can be overridden by the per-node tuning below
+#    system_cpus: 2  # The number of CPUs to assign to the "system" slice
+#                    # This slice includes all non-VM, non-OSD processes: databases, node daemons, system processes, non-OSD Ceph processes, etc.
+#                    # At least 2 cores should be assigned to this slice.
+#    osd_cpus: 2     # The number of CPUs to assign to the "osd" slice
+#                    # This slice includes all OSD processes.
+#                    # At least 1 core per OSD should be assigned to this slice.
+
 # Coordinators
 pvc_nodes:
-  - hostname: "pvc1"
-    is_coordinator: yes
-    node_id: 1
-    router_id: "10.0.0.1"
-    cluster_ip: "by-id"
-    storage_ip: "by-id"
-    upstream_ip: ""
-    ipmi_host: "pvc1-lom"
-    ipmi_user: ""
-    ipmi_password: ""
+  - hostname: "pvc1"        # The full Ansible inventory hostname of the node
+    is_coordinator: yes     # Whether or not the node is a coordinator
+    node_id: 1              # The sequential node ID, usually matching the numerical part of the hostname
+    router_id: "10.0.0.1"   # The router ID of the node; can be the Cluster network IP address, the node_id, or some other unique number
+    cluster_ip: "by-id"     # The Cluster network IP of the host; "by-id" uses the Cluster network with the node_id as the host part (e.g. pvc1 becomes x.y.z.1)
+    storage_ip: "by-id"     # The Storage network IP of the host; "by-id" as above
+    upstream_ip: ""         # The Upstream network IP of the host; "by-id" as above
+    ipmi_host: "pvc1-lom"   # The IPMI hostname of the node
+    ipmi_user: ""           # The IPMI username to use
+    ipmi_password: ""       # The IPMI password to use
+    cpu_tuning:             # Per-node CPU tuning options; if set, these override the global options above (useful if a node has different CPU characteristics)
+      system_cpus: 1
+      osd_cpus: 2
   - hostname: "pvc2"
     is_coordinator: yes
    node_id: 2
@@ -56,9 +77,9 @@ pvc_nodes:
     cluster_ip: "by-id"
     storage_ip: "by-id"
     upstream_ip: ""
-    ipmi_host: "pvc2-lom"
-    ipmi_user: ""
-    ipmi_password: ""
+    ipmi_host: "pvc2-lom"   # The IPMI hostname of the node
+    ipmi_user: ""           # The IPMI username to use
+    ipmi_password: ""       # The IPMI password to use
   - hostname: "pvc3"
     is_coordinator: yes
     node_id: 3
@@ -66,9 +87,9 @@ pvc_nodes:
     cluster_ip: "by-id"
     storage_ip: "by-id"
     upstream_ip: ""
-    ipmi_host: "pvc3-lom"
-    ipmi_user: ""
-    ipmi_password: ""
+    ipmi_host: "pvc3-lom"   # The IPMI hostname of the node
+    ipmi_user: ""           # The IPMI username to use
+    ipmi_password: ""       # The IPMI password to use
 
 # Networks
 pvc_asn: "65001"
diff --git a/roles/pvc/tasks/cputuning/disable.yml b/roles/pvc/tasks/cputuning/disable.yml
new file mode 100644
index 0000000..b0c9a42
--- /dev/null
+++ b/roles/pvc/tasks/cputuning/disable.yml
@@ -0,0 +1,18 @@
+---
+
+- name: remove cpu tuning configurations
+  file:
+    dest: "{{ item }}"
+    state: absent
+  loop:
+    - /etc/systemd/system/system.slice
+    - /etc/systemd/system/user.slice
+    - /etc/systemd/system/osd.slice
+    - /etc/systemd/system/machine.slice
+    - /etc/systemd/system/ceph-osd@.service.d/cputuning.conf
+  register: systemd
+  ignore_errors: yes
+
+- name: reload systemd to apply changes
+  command: systemctl daemon-reload
+  when: systemd.changed
diff --git a/roles/pvc/tasks/cputuning/enable.yml b/roles/pvc/tasks/cputuning/enable.yml
new file mode 100644
index 0000000..040eb3b
--- /dev/null
+++ b/roles/pvc/tasks/cputuning/enable.yml
@@ -0,0 +1,100 @@
+---
+
+# Calculate the correct per-node cpu sets
+- name: set global values
+  set_fact:
+    system_cpus: "{{ cpu_tuning.nodes.system_cpus }}"
+    osd_cpus: "{{ cpu_tuning.nodes.osd_cpus }}"
+
+- name: get per-node cpu tuning values
+  set_fact:
+    node_cpu_tuning: "{% for node in pvc_nodes if node.hostname == this_node %}{% if node.cpu_tuning is defined %}{{ node.cpu_tuning }}{% endif %}{% endfor %}"
+
+- name: override global cpu values with per-node values if set
+  set_fact:
+    system_cpus: "{{ node_cpu_tuning.system_cpus }}"
+    osd_cpus: "{{ node_cpu_tuning.osd_cpus }}"
+  when: node_cpu_tuning is defined and node_cpu_tuning
+
+- name: get node CPU details
+  command: lscpu --json
+  register: lscpu
+
+- name: set sockets variable
+  set_fact:
+    sockets: "{{ (lscpu.stdout|from_json|json_query(query)|list)[0] }}"
+  vars:
+    query: "lscpu[?field == 'Socket(s):'].data"
+
+- name: set cores_per_socket variable
+  set_fact:
+    cores_per_socket: "{{ (lscpu.stdout|from_json|json_query(query)|list)[0] }}"
+  vars:
+    query: "lscpu[?field == 'Core(s) per socket:'].data"
+
+- name: set threads_per_core variable
+  set_fact:
+    threads_per_core: "{{ (lscpu.stdout|from_json|json_query(query)|list)[0] }}"
+  vars:
+    query: "lscpu[?field == 'Thread(s) per core:'].data"
+
+- name: set total_cores variable
+  set_fact:
+    total_cores: "{{ sockets|int * cores_per_socket|int }}"
+
+- name: craft the system cpuset (first cores + any threads as applicable)
+  set_fact:
+    cpuset_system: "{%- set cores = [] -%}
+                    {%- for rng in range(0, system_cpus|int) -%}
+                    {%- for core in range(rng, total_cores|int * threads_per_core|int, total_cores|int) -%}
+                    {%- set _ = cores.append(core) -%}
+                    {%- endfor -%}
+                    {%- endfor -%}
+                    {{ cores|sort|join(',') }}"
+
+- name: craft the osd cpuset (next cores + any threads as applicable)
+  set_fact:
+    cpuset_osd: "{%- set cores = [] -%}
+                 {%- for rng in range(system_cpus|int, system_cpus|int + osd_cpus|int) -%}
+                 {%- for core in range(rng, total_cores|int * threads_per_core|int, total_cores|int) -%}
+                 {%- set _ = cores.append(core) -%}
+                 {%- endfor -%}
+                 {%- endfor -%}
+                 {{ cores|sort|join(',') }}"
+
+- name: craft the VM cpuset (remaining cores + any threads as applicable)
+  set_fact:
+    cpuset_vm: "{%- set cores = [] -%}
+                {%- for rng in range(system_cpus|int + osd_cpus|int, total_cores|int) -%}
+                {%- for core in range(rng, total_cores|int * threads_per_core|int, total_cores|int) -%}
+                {%- set _ = cores.append(core) -%}
+                {%- endfor -%}
+                {%- endfor -%}
+                {{ cores|sort|join(',') }}"
+
+# Actually install the required components
+- name: install slice tuning units
+  template:
+    src: "cputuning/{{ item }}.j2"
+    dest: "/etc/systemd/system/{{ item }}"
+  loop:
+    - system.slice
+    - user.slice
+    - osd.slice
+    - machine.slice
+  register: systemd_slices
+
+- name: create osd unit override configuration directory
+  file:
+    dest: /etc/systemd/system/ceph-osd@.service.d
+    state: directory
+
+- name: install osd cputuning configuration
+  template:
+    src: cputuning/ceph-osd@.service.d-cputuning.conf
+    dest: /etc/systemd/system/ceph-osd@.service.d/cputuning.conf
+  register: systemd_osdtuning
+
+- name: reload systemd to apply changes
+  command: systemctl daemon-reload
+  when: systemd_slices.changed or systemd_osdtuning.changed
diff --git a/roles/pvc/tasks/cputuning/main.yml b/roles/pvc/tasks/cputuning/main.yml
new file mode 100644
index 0000000..38106a1
--- /dev/null
+++ b/roles/pvc/tasks/cputuning/main.yml
@@ -0,0 +1,7 @@
+---
+
+- include: enable.yml
+  when: cpu_tuning.enabled
+
+- include: disable.yml
+  when: not cpu_tuning.enabled
diff --git a/roles/pvc/tasks/main.yml b/roles/pvc/tasks/main.yml
index b6aeba0..74d002a 100644
--- a/roles/pvc/tasks/main.yml
+++ b/roles/pvc/tasks/main.yml
@@ -56,6 +56,11 @@
 - include: pvc/main.yml
   tags: pvc-daemon
 
+# Install CPU tuning
+- include: cputuning/main.yml
+  tags: pvc-cputuning
+  when: cpu_tuning is defined
+
 - name: restart server on first install
   shell: 'sleep 3 && shutdown -r now "Ansible updates triggered"'
   async: 1
diff --git a/roles/pvc/templates/cputuning/ceph-osd@.service.d-cputuning.conf b/roles/pvc/templates/cputuning/ceph-osd@.service.d-cputuning.conf
new file mode 100644
index 0000000..cdc253a
--- /dev/null
+++ b/roles/pvc/templates/cputuning/ceph-osd@.service.d-cputuning.conf
@@ -0,0 +1,2 @@
+[Service]
+Slice = osd.slice
diff --git a/roles/pvc/templates/cputuning/machine.slice.j2 b/roles/pvc/templates/cputuning/machine.slice.j2
new file mode 100644
index 0000000..ffd2a23
--- /dev/null
+++ b/roles/pvc/templates/cputuning/machine.slice.j2
@@ -0,0 +1,13 @@
+# PVC VM slice unit
+# {{ ansible_managed }}
+
+[Unit]
+Description=Virtual Machine and Container Slice
+Documentation=man:systemd.special(7)
+Before=slices.target
+
+[Slice]
+CPUAccounting = true
+Delegate = true
+CPUAffinity = {{ cpuset_vm }}
+AllowedCPUs = {{ cpuset_vm }}
diff --git a/roles/pvc/templates/cputuning/osd.slice.j2 b/roles/pvc/templates/cputuning/osd.slice.j2
new file mode 100644
index 0000000..3df541b
--- /dev/null
+++ b/roles/pvc/templates/cputuning/osd.slice.j2
@@ -0,0 +1,13 @@
+# PVC ceph-osd slice unit
+# {{ ansible_managed }}
+
+[Unit]
+Description=Ceph OSD Slice
+Documentation=man:systemd.special(7)
+Before=slices.target
+
+[Slice]
+CPUAccounting = true
+Delegate = true
+CPUAffinity = {{ cpuset_osd }}
+AllowedCPUs = {{ cpuset_osd }}
diff --git a/roles/pvc/templates/cputuning/system.slice.j2 b/roles/pvc/templates/cputuning/system.slice.j2
new file mode 100644
index 0000000..c03b917
--- /dev/null
+++ b/roles/pvc/templates/cputuning/system.slice.j2
@@ -0,0 +1,13 @@
+# PVC system slice unit
+# {{ ansible_managed }}
+
+[Unit]
+Description = Core System Slice
+Documentation = man:systemd.special(7)
+Before = slices.target
+
+[Slice]
+CPUAccounting = true
+Delegate = true
+CPUAffinity = {{ cpuset_system }}
+AllowedCPUs = {{ cpuset_system }}
diff --git a/roles/pvc/templates/cputuning/user.slice.j2 b/roles/pvc/templates/cputuning/user.slice.j2
new file mode 100644
index 0000000..9c08efe
--- /dev/null
+++ b/roles/pvc/templates/cputuning/user.slice.j2
@@ -0,0 +1,13 @@
+# PVC user slice unit
+# {{ ansible_managed }}
+
+[Unit]
+Description = User and Session Slice
+Documentation = man:systemd.special(7)
+Before = slices.target
+
+[Slice]
+CPUAccounting = true
+Delegate = true
+CPUAffinity = {{ cpuset_system }}
+AllowedCPUs = {{ cpuset_system }}
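
Worked example: a minimal per-cluster override mirroring the cpu_tuning block added to group_vars/default/pvc.yml above, annotated with the CPU sets the enable.yml loops would derive for a hypothetical node with 16 physical cores and 2 threads per core (32 CPUs total), under the same assumption the comments make, namely that Linux numbers all cores 0-15 before their SMT siblings 16-31; the counts are placeholders, not values from this repository:

    # Hypothetical per-cluster group_vars override; adjust counts to your hardware
    cpu_tuning:
      enabled: yes
      nodes:
        system_cpus: 2   # -> cpuset_system = 0,1,16,17 (cores 0-1 plus their SMT siblings)
        osd_cpus: 2      # -> cpuset_osd    = 2,3,18,19 (cores 2-3 plus their SMT siblings)
                         # -> cpuset_vm     = cores 4-15 plus siblings 20-31, emitted as an
                         #    explicit comma-separated list of all remaining CPUs

These sets are written into the CPUAffinity and AllowedCPUs values of the system.slice, user.slice, osd.slice, and machine.slice units installed by enable.yml. AllowedCPUs only takes effect under the unified cgroup (v2) hierarchy, which is presumably why the grub_cmdline change above switches systemd.unified_cgroup_hierarchy from 0 to 1; that change requires the reboot noted in the group_vars comments to take effect.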