From e7b6a3eac1a1442b6066e831c3b58490a5eb4413 Mon Sep 17 00:00:00 2001 From: "Joshua M. Boniface" Date: Thu, 17 Jun 2021 01:01:23 -0400 Subject: [PATCH] Implement SR-IOV PF and VF instances Adds support for the node daemon managing SR-IOV PF and VF instances. PFs are added to Zookeeper automatically based on the config at startup during network configuration, and are otherwise completely static. PFs are automatically removed from Zookeeper, along with all corresponding VFs, should the PF phy device be removed from the configuration. VFs are configured based on the (autocreated) VFs of each PF device, added to Zookeeper, and then a new class instance, SRIOVVFInstance, is used to watch them for configuration changes. This will enable the runtime management of VF settings by the API. The set of keys ensures that both configuration and details of the NIC can be tracked. Most keys are self-explanatory, especially for PFs and the basic keys for VFs. The configuration tree is also self-explanatory, being based entirely on the options available in the `ip link set {dev} vf` command. Two additional keys are also present: `used` and `used_by`, which will be able to track the (boolean) state of usage, as well as the VM that uses a given VIF. Since the VM side implementation will support both macvtap and direct "hostdev" assignments, this will ensure that this state can be tracked on both the VF and the VM side. 
--- daemon-common/migrations/versions/1.json | 1 + daemon-common/zkhandler.py | 39 ++++- node-daemon/pvcnoded/Daemon.py | 90 ++++++++++ node-daemon/pvcnoded/SRIOVVFInstance.py | 199 +++++++++++++++++++++++ 4 files changed, 327 insertions(+), 2 deletions(-) create mode 100644 daemon-common/migrations/versions/1.json create mode 100644 node-daemon/pvcnoded/SRIOVVFInstance.py diff --git a/daemon-common/migrations/versions/1.json b/daemon-common/migrations/versions/1.json new file mode 100644 index 00000000..70f5e23f --- /dev/null +++ b/daemon-common/migrations/versions/1.json @@ -0,0 +1 @@ +{"version": "1", "root": "", "base": {"root": "", "schema": "/schema", "schema.version": "/schema/version", "config": "/config", "config.maintenance": "/config/maintenance", "config.primary_node": "/config/primary_node", "config.primary_node.sync_lock": "/config/primary_node/sync_lock", "config.upstream_ip": "/config/upstream_ip", "config.migration_target_selector": "/config/migration_target_selector", "cmd": "/cmd", "cmd.node": "/cmd/nodes", "cmd.domain": "/cmd/domains", "cmd.ceph": "/cmd/ceph", "node": "/nodes", "domain": "/domains", "network": "/networks", "storage": "/ceph", "storage.util": "/ceph/util", "osd": "/ceph/osds", "pool": "/ceph/pools", "volume": "/ceph/volumes", "snapshot": "/ceph/snapshots"}, "node": {"name": "", "keepalive": "/keepalive", "mode": "/daemonmode", "data.active_schema": "/activeschema", "data.latest_schema": "/latestschema", "data.static": "/staticdata", "running_domains": "/runningdomains", "count.provisioned_domains": "/domainscount", "count.networks": "/networkscount", "state.daemon": "/daemonstate", "state.router": "/routerstate", "state.domain": "/domainstate", "cpu.load": "/cpuload", "vcpu.allocated": "/vcpualloc", "memory.total": "/memtotal", "memory.used": "/memused", "memory.free": "/memfree", "memory.allocated": "/memalloc", "memory.provisioned": "/memprov", "ipmi.hostname": "/ipmihostname", "ipmi.username": "/ipmiusername", "ipmi.password": 
"/ipmipassword", "sriov": "/sriov", "sriov.pf": "/sriov/pf", "sriov.vf": "/sriov/vf"}, "sriov_pf": {"phy": "", "mtu": "/mtu", "vfcount": "/vfcount"}, "sriov_vf": {"phy": "", "pf": "/pf", "mtu": "/mtu", "mac": "/mac", "config": "/config", "config.vlan_id": "/config/vlan_id", "config.vlan_qos": "/config/vlan_qos", "config.tx_rate_min": "/config/tx_rate_min", "config.tx_rate_max": "/config/tx_rate_max", "config.spoof_check": "/config/spoof_check", "config.link_state": "/config/link_state", "config.trust": "/config/trust", "config.query_rss": "/config/query_rss", "used": "/used", "used_by": "/used_by"}, "domain": {"name": "", "xml": "/xml", "state": "/state", "profile": "/profile", "stats": "/stats", "node": "/node", "last_node": "/lastnode", "failed_reason": "/failedreason", "storage.volumes": "/rbdlist", "console.log": "/consolelog", "console.vnc": "/vnc", "meta.autostart": "/node_autostart", "meta.migrate_method": "/migration_method", "meta.node_selector": "/node_selector", "meta.node_limit": "/node_limit", "migrate.sync_lock": "/migrate_sync_lock"}, "network": {"vni": "", "type": "/nettype", "rule": "/firewall_rules", "rule.in": "/firewall_rules/in", "rule.out": "/firewall_rules/out", "nameservers": "/name_servers", "domain": "/domain", "reservation": "/dhcp4_reservations", "lease": "/dhcp4_leases", "ip4.gateway": "/ip4_gateway", "ip4.network": "/ip4_network", "ip4.dhcp": "/dhcp4_flag", "ip4.dhcp_start": "/dhcp4_start", "ip4.dhcp_end": "/dhcp4_end", "ip6.gateway": "/ip6_gateway", "ip6.network": "/ip6_network", "ip6.dhcp": "/dhcp6_flag"}, "reservation": {"mac": "", "ip": "/ipaddr", "hostname": "/hostname"}, "lease": {"mac": "", "ip": "/ipaddr", "hostname": "/hostname", "expiry": "/expiry", "client_id": "/clientid"}, "rule": {"description": "", "rule": "/rule", "order": "/order"}, "osd": {"id": "", "node": "/node", "device": "/device", "stats": "/stats"}, "pool": {"name": "", "pgs": "/pgs", "stats": "/stats"}, "volume": {"name": "", "stats": "/stats"}, "snapshot": 
{"name": "", "stats": "/stats"}} \ No newline at end of file diff --git a/daemon-common/zkhandler.py b/daemon-common/zkhandler.py index f201429d..5bcffa6f 100644 --- a/daemon-common/zkhandler.py +++ b/daemon-common/zkhandler.py @@ -426,7 +426,7 @@ class ZKHandler(object): # class ZKSchema(object): # Current version - _version = 0 + _version = 1 # Root for doing nested keys _schema_root = '' @@ -483,7 +483,34 @@ class ZKSchema(object): 'memory.provisioned': '/memprov', 'ipmi.hostname': '/ipmihostname', 'ipmi.username': '/ipmiusername', - 'ipmi.password': '/ipmipassword' + 'ipmi.password': '/ipmipassword', + 'sriov': '/sriov', + 'sriov.pf': '/sriov/pf', + 'sriov.vf': '/sriov/vf', + }, + # The schema of an individual SR-IOV PF entry (/nodes/{node_name}/sriov/pf/{pf}) + 'sriov_pf': { + 'phy': '', # The root key + 'mtu': '/mtu', + 'vfcount': '/vfcount' + }, + # The schema of an individual SR-IOV VF entry (/nodes/{node_name}/sriov/vf/{vf}) + 'sriov_vf': { + 'phy': '', # The root key + 'pf': '/pf', + 'mtu': '/mtu', + 'mac': '/mac', + 'config': '/config', + 'config.vlan_id': '/config/vlan_id', + 'config.vlan_qos': '/config/vlan_qos', + 'config.tx_rate_min': '/config/tx_rate_min', + 'config.tx_rate_max': '/config/tx_rate_max', + 'config.spoof_check': '/config/spoof_check', + 'config.link_state': '/config/link_state', + 'config.trust': '/config/trust', + 'config.query_rss': '/config/query_rss', + 'used': '/used', + 'used_by': '/used_by' }, # The schema of an individual domain entry (/domains/{domain_uuid}) 'domain': { @@ -709,6 +736,10 @@ class ZKSchema(object): if not zkhandler.zk_conn.exists(nkipath): result = False + # One might expect child keys under node (specifically, sriov.pf and sriov.vf) to be + # managed here as well, but those are created automatically every time pvcnoded starts + # and thus never need to be validated or applied. 
+ # These two have several children layers that must be parsed through for elem in ['volume']: # First read all the subelements of the key class (pool layer) @@ -782,6 +813,10 @@ class ZKSchema(object): if not zkhandler.zk_conn.exists(nkipath): zkhandler.zk_conn.create(nkipath, ''.encode(zkhandler.encoding)) + # One might expect child keys under node (specifically, sriov.pf and sriov.vf) to be + # managed here as well, but those are created automatically every time pvcnoded starts + # and thus never need to be validated or applied. + # These two have several children layers that must be parsed through for elem in ['volume']: # First read all the subelements of the key class (pool layer) diff --git a/node-daemon/pvcnoded/Daemon.py b/node-daemon/pvcnoded/Daemon.py index 96b061f0..fa8562b8 100644 --- a/node-daemon/pvcnoded/Daemon.py +++ b/node-daemon/pvcnoded/Daemon.py @@ -49,6 +49,7 @@ import daemon_lib.common as common import pvcnoded.VMInstance as VMInstance import pvcnoded.NodeInstance as NodeInstance import pvcnoded.VXNetworkInstance as VXNetworkInstance +import pvcnoded.SRIOVVFInstance as SRIOVVFInstance import pvcnoded.DNSAggregatorInstance as DNSAggregatorInstance import pvcnoded.CephInstance as CephInstance import pvcnoded.MetadataAPIInstance as MetadataAPIInstance @@ -390,6 +391,7 @@ else: # PHASE 2a - Activate SR-IOV support ############################################################################### +# This happens before other networking steps to enable using VFs for cluster functions. 
if enable_networking and enable_sriov: logger.out('Setting up SR-IOV device support', state='i') # Enable unsafe interruptts for the vfio_iommu_type1 kernel module @@ -916,12 +918,15 @@ logger.out('Setting up objects', state='i') d_node = dict() d_network = dict() +d_sriov_vf = dict() d_domain = dict() d_osd = dict() d_pool = dict() d_volume = dict() # Dict of Dicts node_list = [] network_list = [] +sriov_pf_list = [] +sriov_vf_list = [] domain_list = [] osd_list = [] pool_list = [] @@ -1076,6 +1081,91 @@ if enable_networking: for node in d_node: d_node[node].update_network_list(d_network) + # Add the SR-IOV PFs and VFs to Zookeeper + # These do not behave like the objects; they are not dynamic (the API cannot change them), and they + # exist for the lifetime of this Node instance. The objects are set here in Zookeeper on a per-node + # basis, under the Node configuration tree. + # MIGRATION: The schema.schema.get ensures that the current active Schema contains the required keys + if enable_sriov and zkhandler.schema.schema.get('sriov_pf', None) is not None: + vf_list = list() + for device in config['sriov_device']: + pf = device['phy'] + vfcount = device['vfcount'] + if device.get('mtu', None) is None: + mtu = 1500 + else: + mtu = device['mtu'] + + # Create the PF device in Zookeeper + zkhandler.write([ + (('node.sriov.pf', myhostname, 'sriov_pf', pf), ''), + (('node.sriov.pf', myhostname, 'sriov_pf.mtu', pf), mtu), + (('node.sriov.pf', myhostname, 'sriov_pf.vfcount', pf), vfcount), + ]) + # Append the device to the list of PFs + sriov_pf_list.append(pf) + + # Get the list of VFs from `ip link show` + vf_list = json.loads(common.run_os_command('ip --json link show {}'.format(pf))[1])[0].get('vfinfo_list', []) + for vf in vf_list: + # { + # 'vf': 3, + # 'link_type': 'ether', + # 'address': '00:00:00:00:00:00', + # 'broadcast': 'ff:ff:ff:ff:ff:ff', + # 'vlan_list': [{'vlan': 101, 'qos': 2}], + # 'rate': {'max_tx': 0, 'min_tx': 0}, + # 'spoofchk': True, + # 
'link_state': 'auto', + # 'trust': False, + # 'query_rss_en': False + # } + vfphy = '{}v{}'.format(pf, vf['vf']) + zkhandler.write([ + (('node.sriov.vf', myhostname, 'sriov_vf', vfphy), ''), + (('node.sriov.vf', myhostname, 'sriov_vf.pf', vfphy), pf), + (('node.sriov.vf', myhostname, 'sriov_vf.mtu', vfphy), mtu), + (('node.sriov.vf', myhostname, 'sriov_vf.mac', vfphy), vf['address']), + (('node.sriov.vf', myhostname, 'sriov_vf.config', vfphy), ''), + (('node.sriov.vf', myhostname, 'sriov_vf.config.vlan_id', vfphy), vf['vlan_list'][0].get('vlan', '')), + (('node.sriov.vf', myhostname, 'sriov_vf.config.vlan_qos', vfphy), vf['vlan_list'][0].get('qos', '')), + (('node.sriov.vf', myhostname, 'sriov_vf.config.tx_rate_min', vfphy), vf['rate']['min_tx']), + (('node.sriov.vf', myhostname, 'sriov_vf.config.tx_rate_max', vfphy), vf['rate']['max_tx']), + (('node.sriov.vf', myhostname, 'sriov_vf.config.spoof_check', vfphy), vf['spoofchk']), + (('node.sriov.vf', myhostname, 'sriov_vf.config.link_state', vfphy), vf['link_state']), + (('node.sriov.vf', myhostname, 'sriov_vf.config.trust', vfphy), vf['trust']), + (('node.sriov.vf', myhostname, 'sriov_vf.config.query_rss', vfphy), vf['query_rss_en']), + ]) + # Append the device to the list of VFs + sriov_vf_list.append(vfphy) + + # Remove any obsolete PFs from Zookeeper if they go away + for pf in zkhandler.children(('node.sriov.pf', myhostname)): + if pf not in sriov_pf_list: + zkhandler.delete([ + ('node.sriov.pf', myhostname, 'sriov_pf', pf) + ]) + # Remove any obsolete VFs from Zookeeper if their PF goes away + for vf in zkhandler.children(('node.sriov.vf', myhostname)): + vf_pf = zkhandler.read(('node.sriov.vf', myhostname, 'sriov_vf.pf', vf)) + if vf_pf not in sriov_pf_list: + zkhandler.delete([ + ('node.sriov.vf', myhostname, 'sriov_vf', vf) + ]) + + # SR-IOV VF objects + # This is a ChildrenWatch just for consistency; the list never changes at runtime + @zkhandler.zk_conn.ChildrenWatch(zkhandler.schema.path('node.sriov.vf', 
myhostname)) + def update_sriov_pfs(new_sriov_vf_list): + global sriov_vf_list, d_sriov_vf + + # Add VFs to the list + for vf in new_sriov_vf_list: + d_sriov_vf[vf] = SRIOVVFInstance.SRIOVVFInstance(vf, zkhandler, config, logger, this_node) + + sriov_vf_list = new_sriov_vf_list + logger.out('{}SR-IOV VF list:{} {}'.format(fmt_blue, fmt_end, ' '.join(sriov_vf_list)), state='i') + if enable_hypervisor: # VM command pipeline key @zkhandler.zk_conn.DataWatch(zkhandler.schema.path('base.cmd.domain')) diff --git a/node-daemon/pvcnoded/SRIOVVFInstance.py b/node-daemon/pvcnoded/SRIOVVFInstance.py new file mode 100644 index 00000000..ea20dff8 --- /dev/null +++ b/node-daemon/pvcnoded/SRIOVVFInstance.py @@ -0,0 +1,199 @@ +#!/usr/bin/env python3 + +# SRIOVVFInstance.py - Class implementing a PVC SR-IOV VF and run by pvcnoded +# Part of the Parallel Virtual Cluster (PVC) system +# +# Copyright (C) 2018-2021 Joshua M. Boniface +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, version 3. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . 
+#
+###############################################################################
+
+import daemon_lib.common as common
+
+
+def boolToOnOff(state):  # Map True/'True' to 'on' and anything else to 'off' (the keywords `ip link` expects)
+    if state and str(state) == 'True':
+        return 'on'
+    else:
+        return 'off'
+
+
+class SRIOVVFInstance(object):
+    # Initialization: cache this VF's current Zookeeper values, then register one DataWatch per key
+    def __init__(self, vf, zkhandler, config, logger, this_node):
+        self.vf = vf
+        self.zkhandler = zkhandler
+        self.config = config
+        self.logger = logger
+        self.this_node = this_node
+        self.myhostname = self.this_node.name
+
+        self.pf = self.zkhandler.read(('node.sriov.vf', self.myhostname, 'sriov_vf.pf', self.vf))
+        self.mtu = self.zkhandler.read(('node.sriov.vf', self.myhostname, 'sriov_vf.mtu', self.vf))
+
+        self.vfid = self.vf.replace('{}v'.format(self.pf), '')  # Strip the '<pf>v' prefix, leaving the VF index used by `ip link ... vf N`
+        self.mac = self.zkhandler.read(('node.sriov.vf', self.myhostname, 'sriov_vf.mac', self.vf))
+
+        self.vlan_id = self.zkhandler.read(('node.sriov.vf', self.myhostname, 'sriov_vf.config.vlan_id', self.vf))
+        self.vlan_qos = self.zkhandler.read(('node.sriov.vf', self.myhostname, 'sriov_vf.config.vlan_qos', self.vf))
+        self.tx_rate_min = self.zkhandler.read(('node.sriov.vf', self.myhostname, 'sriov_vf.config.tx_rate_min', self.vf))
+        self.tx_rate_max = self.zkhandler.read(('node.sriov.vf', self.myhostname, 'sriov_vf.config.tx_rate_max', self.vf))
+        self.spoof_check = self.zkhandler.read(('node.sriov.vf', self.myhostname, 'sriov_vf.config.spoof_check', self.vf))
+        self.link_state = self.zkhandler.read(('node.sriov.vf', self.myhostname, 'sriov_vf.config.link_state', self.vf))
+        self.trust = self.zkhandler.read(('node.sriov.vf', self.myhostname, 'sriov_vf.config.trust', self.vf))
+        self.query_rss = self.zkhandler.read(('node.sriov.vf', self.myhostname, 'sriov_vf.config.query_rss', self.vf))
+
+        # Zookeeper handlers for changed configs; each updates the cached value and (except the MAC) applies it via `ip link set`
+        @self.zkhandler.zk_conn.DataWatch(self.zkhandler.schema.path('node.sriov.vf', self.myhostname) + self.zkhandler.schema.path('sriov_vf.mac', self.vf))
+        def watch_vf_mac(data, stat, event=''):
+            if event and event.type == 'DELETED':
+                # The key has been deleted after existing before; terminate this watcher
+                # because this class instance is about to be reaped in Daemon.py
+                return False
+
+            try:
+                data = data.decode('ascii')
+            except AttributeError:
+                data = '00:00:00:00:00:00'
+
+            if data != self.mac:
+                self.mac = data
+
+        @self.zkhandler.zk_conn.DataWatch(self.zkhandler.schema.path('node.sriov.vf', self.myhostname) + self.zkhandler.schema.path('sriov_vf.config.vlan_id', self.vf))
+        def watch_vf_vlan_id(data, stat, event=''):
+            if event and event.type == 'DELETED':
+                # The key has been deleted after existing before; terminate this watcher
+                # because this class instance is about to be reaped in Daemon.py
+                return False
+
+            try:
+                data = data.decode('ascii')
+            except AttributeError:
+                data = '0'
+
+            if data != self.vlan_id:
+                self.vlan_id = data
+                common.run_os_command('ip link set {} vf {} vlan {} qos {}'.format(self.pf, self.vfid, self.vlan_id, self.vlan_qos))
+
+        @self.zkhandler.zk_conn.DataWatch(self.zkhandler.schema.path('node.sriov.vf', self.myhostname) + self.zkhandler.schema.path('sriov_vf.config.vlan_qos', self.vf))
+        def watch_vf_vlan_qos(data, stat, event=''):
+            if event and event.type == 'DELETED':
+                # The key has been deleted after existing before; terminate this watcher
+                # because this class instance is about to be reaped in Daemon.py
+                return False
+
+            try:
+                data = data.decode('ascii')
+            except AttributeError:
+                data = '0'
+
+            if data != self.vlan_qos:
+                self.vlan_qos = data
+                common.run_os_command('ip link set {} vf {} vlan {} qos {}'.format(self.pf, self.vfid, self.vlan_id, self.vlan_qos))
+
+        @self.zkhandler.zk_conn.DataWatch(self.zkhandler.schema.path('node.sriov.vf', self.myhostname) + self.zkhandler.schema.path('sriov_vf.config.tx_rate_min', self.vf))
+        def watch_vf_tx_rate_min(data, stat, event=''):
+            if event and event.type == 'DELETED':
+                # The key has been deleted after existing before; terminate this watcher
+                # because this class instance is about to be reaped in Daemon.py
+                return False
+
+            try:
+                data = data.decode('ascii')
+            except AttributeError:
+                data = '0'
+
+            if data != self.tx_rate_min:
+                self.tx_rate_min = data
+                common.run_os_command('ip link set {} vf {} min_tx_rate {}'.format(self.pf, self.vfid, self.tx_rate_min))
+
+        @self.zkhandler.zk_conn.DataWatch(self.zkhandler.schema.path('node.sriov.vf', self.myhostname) + self.zkhandler.schema.path('sriov_vf.config.tx_rate_max', self.vf))
+        def watch_vf_tx_rate_max(data, stat, event=''):
+            if event and event.type == 'DELETED':
+                # The key has been deleted after existing before; terminate this watcher
+                # because this class instance is about to be reaped in Daemon.py
+                return False
+
+            try:
+                data = data.decode('ascii')
+            except AttributeError:
+                data = '0'
+
+            if data != self.tx_rate_max:
+                self.tx_rate_max = data
+                common.run_os_command('ip link set {} vf {} max_tx_rate {}'.format(self.pf, self.vfid, self.tx_rate_max))
+
+        @self.zkhandler.zk_conn.DataWatch(self.zkhandler.schema.path('node.sriov.vf', self.myhostname) + self.zkhandler.schema.path('sriov_vf.config.spoof_check', self.vf))
+        def watch_vf_spoof_check(data, stat, event=''):
+            if event and event.type == 'DELETED':
+                # The key has been deleted after existing before; terminate this watcher
+                # because this class instance is about to be reaped in Daemon.py
+                return False
+
+            try:
+                data = data.decode('ascii')
+            except AttributeError:
+                data = '0'
+
+            if data != self.spoof_check:
+                self.spoof_check = data
+                common.run_os_command('ip link set {} vf {} spoofchk {}'.format(self.pf, self.vfid, boolToOnOff(self.spoof_check)))
+
+        @self.zkhandler.zk_conn.DataWatch(self.zkhandler.schema.path('node.sriov.vf', self.myhostname) + self.zkhandler.schema.path('sriov_vf.config.link_state', self.vf))
+        def watch_vf_link_state(data, stat, event=''):
+            if event and event.type == 'DELETED':
+                # The key has been deleted after existing before; terminate this watcher
+                # because this class instance is about to be reaped in Daemon.py
+                return False
+
+            try:
+                data = data.decode('ascii')
+            except AttributeError:
+                data = 'auto'  # `ip link ... vf N state` only accepts auto|enable|disable, so 'on' would be rejected
+
+            if data != self.link_state:
+                self.link_state = data
+                common.run_os_command('ip link set {} vf {} state {}'.format(self.pf, self.vfid, self.link_state))
+
+        @self.zkhandler.zk_conn.DataWatch(self.zkhandler.schema.path('node.sriov.vf', self.myhostname) + self.zkhandler.schema.path('sriov_vf.config.trust', self.vf))
+        def watch_vf_trust(data, stat, event=''):
+            if event and event.type == 'DELETED':
+                # The key has been deleted after existing before; terminate this watcher
+                # because this class instance is about to be reaped in Daemon.py
+                return False
+
+            try:
+                data = data.decode('ascii')
+            except AttributeError:
+                data = 'off'
+
+            if data != self.trust:
+                self.trust = data
+                common.run_os_command('ip link set {} vf {} trust {}'.format(self.pf, self.vfid, boolToOnOff(self.trust)))
+
+        @self.zkhandler.zk_conn.DataWatch(self.zkhandler.schema.path('node.sriov.vf', self.myhostname) + self.zkhandler.schema.path('sriov_vf.config.query_rss', self.vf))
+        def watch_vf_query_rss(data, stat, event=''):
+            if event and event.type == 'DELETED':
+                # The key has been deleted after existing before; terminate this watcher
+                # because this class instance is about to be reaped in Daemon.py
+                return False
+
+            try:
+                data = data.decode('ascii')
+            except AttributeError:
+                data = 'off'
+
+            if data != self.query_rss:
+                self.query_rss = data
+                common.run_os_command('ip link set {} vf {} query_rss {}'.format(self.pf, self.vfid, boolToOnOff(self.query_rss)))  # must set query_rss here, not trust