diff --git a/daemon-common/migrations/versions/1.json b/daemon-common/migrations/versions/1.json new file mode 100644 index 00000000..70f5e23f --- /dev/null +++ b/daemon-common/migrations/versions/1.json @@ -0,0 +1 @@ +{"version": "1", "root": "", "base": {"root": "", "schema": "/schema", "schema.version": "/schema/version", "config": "/config", "config.maintenance": "/config/maintenance", "config.primary_node": "/config/primary_node", "config.primary_node.sync_lock": "/config/primary_node/sync_lock", "config.upstream_ip": "/config/upstream_ip", "config.migration_target_selector": "/config/migration_target_selector", "cmd": "/cmd", "cmd.node": "/cmd/nodes", "cmd.domain": "/cmd/domains", "cmd.ceph": "/cmd/ceph", "node": "/nodes", "domain": "/domains", "network": "/networks", "storage": "/ceph", "storage.util": "/ceph/util", "osd": "/ceph/osds", "pool": "/ceph/pools", "volume": "/ceph/volumes", "snapshot": "/ceph/snapshots"}, "node": {"name": "", "keepalive": "/keepalive", "mode": "/daemonmode", "data.active_schema": "/activeschema", "data.latest_schema": "/latestschema", "data.static": "/staticdata", "running_domains": "/runningdomains", "count.provisioned_domains": "/domainscount", "count.networks": "/networkscount", "state.daemon": "/daemonstate", "state.router": "/routerstate", "state.domain": "/domainstate", "cpu.load": "/cpuload", "vcpu.allocated": "/vcpualloc", "memory.total": "/memtotal", "memory.used": "/memused", "memory.free": "/memfree", "memory.allocated": "/memalloc", "memory.provisioned": "/memprov", "ipmi.hostname": "/ipmihostname", "ipmi.username": "/ipmiusername", "ipmi.password": "/ipmipassword", "sriov": "/sriov", "sriov.pf": "/sriov/pf", "sriov.vf": "/sriov/vf"}, "sriov_pf": {"phy": "", "mtu": "/mtu", "vfcount": "/vfcount"}, "sriov_vf": {"phy": "", "pf": "/pf", "mtu": "/mtu", "mac": "/mac", "config": "/config", "config.vlan_id": "/config/vlan_id", "config.vlan_qos": "/config/vlan_qos", "config.tx_rate_min": "/config/tx_rate_min", 
"config.tx_rate_max": "/config/tx_rate_max", "config.spoof_check": "/config/spoof_check", "config.link_state": "/config/link_state", "config.trust": "/config/trust", "config.query_rss": "/config/query_rss", "used": "/used", "used_by": "/used_by"}, "domain": {"name": "", "xml": "/xml", "state": "/state", "profile": "/profile", "stats": "/stats", "node": "/node", "last_node": "/lastnode", "failed_reason": "/failedreason", "storage.volumes": "/rbdlist", "console.log": "/consolelog", "console.vnc": "/vnc", "meta.autostart": "/node_autostart", "meta.migrate_method": "/migration_method", "meta.node_selector": "/node_selector", "meta.node_limit": "/node_limit", "migrate.sync_lock": "/migrate_sync_lock"}, "network": {"vni": "", "type": "/nettype", "rule": "/firewall_rules", "rule.in": "/firewall_rules/in", "rule.out": "/firewall_rules/out", "nameservers": "/name_servers", "domain": "/domain", "reservation": "/dhcp4_reservations", "lease": "/dhcp4_leases", "ip4.gateway": "/ip4_gateway", "ip4.network": "/ip4_network", "ip4.dhcp": "/dhcp4_flag", "ip4.dhcp_start": "/dhcp4_start", "ip4.dhcp_end": "/dhcp4_end", "ip6.gateway": "/ip6_gateway", "ip6.network": "/ip6_network", "ip6.dhcp": "/dhcp6_flag"}, "reservation": {"mac": "", "ip": "/ipaddr", "hostname": "/hostname"}, "lease": {"mac": "", "ip": "/ipaddr", "hostname": "/hostname", "expiry": "/expiry", "client_id": "/clientid"}, "rule": {"description": "", "rule": "/rule", "order": "/order"}, "osd": {"id": "", "node": "/node", "device": "/device", "stats": "/stats"}, "pool": {"name": "", "pgs": "/pgs", "stats": "/stats"}, "volume": {"name": "", "stats": "/stats"}, "snapshot": {"name": "", "stats": "/stats"}} \ No newline at end of file diff --git a/daemon-common/zkhandler.py b/daemon-common/zkhandler.py index f201429d..5bcffa6f 100644 --- a/daemon-common/zkhandler.py +++ b/daemon-common/zkhandler.py @@ -426,7 +426,7 @@ class ZKHandler(object): # class ZKSchema(object): # Current version - _version = 0 + _version = 1 # Root for 
doing nested keys _schema_root = '' @@ -483,7 +483,34 @@ class ZKSchema(object): 'memory.provisioned': '/memprov', 'ipmi.hostname': '/ipmihostname', 'ipmi.username': '/ipmiusername', - 'ipmi.password': '/ipmipassword' + 'ipmi.password': '/ipmipassword', + 'sriov': '/sriov', + 'sriov.pf': '/sriov/pf', + 'sriov.vf': '/sriov/vf', + }, + # The schema of an individual SR-IOV PF entry (/nodes/{node_name}/sriov/pf/{pf}) + 'sriov_pf': { + 'phy': '', # The root key + 'mtu': '/mtu', + 'vfcount': '/vfcount' + }, + # The schema of an individual SR-IOV VF entry (/nodes/{node_name}/sriov/vf/{vf}) + 'sriov_vf': { + 'phy': '', # The root key + 'pf': '/pf', + 'mtu': '/mtu', + 'mac': '/mac', + 'config': '/config', + 'config.vlan_id': '/config/vlan_id', + 'config.vlan_qos': '/config/vlan_qos', + 'config.tx_rate_min': '/config/tx_rate_min', + 'config.tx_rate_max': '/config/tx_rate_max', + 'config.spoof_check': '/config/spoof_check', + 'config.link_state': '/config/link_state', + 'config.trust': '/config/trust', + 'config.query_rss': '/config/query_rss', + 'used': '/used', + 'used_by': '/used_by' }, # The schema of an individual domain entry (/domains/{domain_uuid}) 'domain': { @@ -709,6 +736,10 @@ class ZKSchema(object): if not zkhandler.zk_conn.exists(nkipath): result = False + # One might expect child keys under node (specifically, sriov.pf and sriov.vf) to be + # managed here as well, but those are created automatically every time pvcnoded starts + # and thus never need to be validated or applied. 
+ # These two have several children layers that must be parsed through for elem in ['volume']: # First read all the subelements of the key class (pool layer) @@ -782,6 +813,10 @@ class ZKSchema(object): if not zkhandler.zk_conn.exists(nkipath): zkhandler.zk_conn.create(nkipath, ''.encode(zkhandler.encoding)) + # One might expect child keys under node (specifically, sriov.pf and sriov.vf) to be + # managed here as well, but those are created automatically every time pvcnoded starts + # and thus never need to be validated or applied. + # These two have several children layers that must be parsed through for elem in ['volume']: # First read all the subelements of the key class (pool layer) diff --git a/node-daemon/pvcnoded/Daemon.py b/node-daemon/pvcnoded/Daemon.py index 96b061f0..fa8562b8 100644 --- a/node-daemon/pvcnoded/Daemon.py +++ b/node-daemon/pvcnoded/Daemon.py @@ -49,6 +49,7 @@ import daemon_lib.common as common import pvcnoded.VMInstance as VMInstance import pvcnoded.NodeInstance as NodeInstance import pvcnoded.VXNetworkInstance as VXNetworkInstance +import pvcnoded.SRIOVVFInstance as SRIOVVFInstance import pvcnoded.DNSAggregatorInstance as DNSAggregatorInstance import pvcnoded.CephInstance as CephInstance import pvcnoded.MetadataAPIInstance as MetadataAPIInstance @@ -390,6 +391,7 @@ else: # PHASE 2a - Activate SR-IOV support ############################################################################### +# This happens before other networking steps to enable using VFs for cluster functions. 
if enable_networking and enable_sriov: logger.out('Setting up SR-IOV device support', state='i') # Enable unsafe interruptts for the vfio_iommu_type1 kernel module @@ -916,12 +918,15 @@ logger.out('Setting up objects', state='i') d_node = dict() d_network = dict() +d_sriov_vf = dict() d_domain = dict() d_osd = dict() d_pool = dict() d_volume = dict() # Dict of Dicts node_list = [] network_list = [] +sriov_pf_list = [] +sriov_vf_list = [] domain_list = [] osd_list = [] pool_list = [] @@ -1076,6 +1081,91 @@ if enable_networking: for node in d_node: d_node[node].update_network_list(d_network) + # Add the SR-IOV PFs and VFs to Zookeeper + # These do not behave like the objects; they are not dynamic (the API cannot change them), and they + # exist for the lifetime of this Node instance. The objects are set here in Zookeeper on a per-node + # basis, under the Node configuration tree. + # MIGRATION: The schema.schema.get ensures that the current active Schema contains the required keys + if enable_sriov and zkhandler.schema.schema.get('sriov_pf', None) is not None: + vf_list = list() + for device in config['sriov_device']: + pf = device['phy'] + vfcount = device['vfcount'] + if device.get('mtu', None) is None: + mtu = 1500 + else: + mtu = device['mtu'] + + # Create the PF device in Zookeeper + zkhandler.write([ + (('node.sriov.pf', myhostname, 'sriov_pf', pf), ''), + (('node.sriov.pf', myhostname, 'sriov_pf.mtu', pf), mtu), + (('node.sriov.pf', myhostname, 'sriov_pf.vfcount', pf), vfcount), + ]) + # Append the device to the list of PFs + sriov_pf_list.append(pf) + + # Get the list of VFs from `ip link show` + vf_list = json.loads(common.run_os_command('ip --json link show {}'.format(pf))[1])[0].get('vfinfo_list', []) + for vf in vf_list: + # { + # 'vf': 3, + # 'link_type': 'ether', + # 'address': '00:00:00:00:00:00', + # 'broadcast': 'ff:ff:ff:ff:ff:ff', + # 'vlan_list': [{'vlan': 101, 'qos': 2}], + # 'rate': {'max_tx': 0, 'min_tx': 0}, + # 'spoofchk': True, + # 
'link_state': 'auto', + # 'trust': False, + # 'query_rss_en': False + # } + vfphy = '{}v{}'.format(pf, vf['vf']) + zkhandler.write([ + (('node.sriov.vf', myhostname, 'sriov_vf', vfphy), ''), + (('node.sriov.vf', myhostname, 'sriov_vf.pf', vfphy), pf), + (('node.sriov.vf', myhostname, 'sriov_vf.mtu', vfphy), mtu), + (('node.sriov.vf', myhostname, 'sriov_vf.mac', vfphy), vf['address']), + (('node.sriov.vf', myhostname, 'sriov_vf.config', vfphy), ''), + (('node.sriov.vf', myhostname, 'sriov_vf.config.vlan_id', vfphy), vf['vlan_list'][0].get('vlan', '')), + (('node.sriov.vf', myhostname, 'sriov_vf.config.vlan_qos', vfphy), vf['vlan_list'][0].get('qos', '')), + (('node.sriov.vf', myhostname, 'sriov_vf.config.tx_rate_min', vfphy), vf['rate']['min_tx']), + (('node.sriov.vf', myhostname, 'sriov_vf.config.tx_rate_max', vfphy), vf['rate']['max_tx']), + (('node.sriov.vf', myhostname, 'sriov_vf.config.spoof_check', vfphy), vf['spoofchk']), + (('node.sriov.vf', myhostname, 'sriov_vf.config.link_state', vfphy), vf['link_state']), + (('node.sriov.vf', myhostname, 'sriov_vf.config.trust', vfphy), vf['trust']), + (('node.sriov.vf', myhostname, 'sriov_vf.config.query_rss', vfphy), vf['query_rss_en']), + ]) + # Append the device to the list of VFs + sriov_vf_list.append(vfphy) + + # Remove any obsolete PFs from Zookeeper if they go away + for pf in zkhandler.children(('node.sriov.pf', myhostname)): + if pf not in sriov_pf_list: + zkhandler.delete([ + ('node.sriov.pf', myhostname, 'sriov_pf', pf) + ]) + # Remove any obsolete VFs from Zookeeper if their PF goes away + for vf in zkhandler.children(('node.sriov.vf', myhostname)): + vf_pf = zkhandler.read(('node.sriov.vf', myhostname, 'sriov_vf.pf', vf)) + if vf_pf not in sriov_pf_list: + zkhandler.delete([ + ('node.sriov.vf', myhostname, 'sriov_vf', vf) + ]) + + # SR-IOV VF objects + # This is a ChildrenWatch just for consistency; the list never changes at runtime + @zkhandler.zk_conn.ChildrenWatch(zkhandler.schema.path('node.sriov.vf', 
myhostname)) + def update_sriov_pfs(new_sriov_vf_list): + global sriov_vf_list, d_sriov_vf + + # Add VFs to the list + for vf in new_sriov_vf_list: + d_sriov_vf[vf] = SRIOVVFInstance.SRIOVVFInstance(vf, zkhandler, config, logger, this_node) + + sriov_vf_list = new_sriov_vf_list + logger.out('{}SR-IOV VF list:{} {}'.format(fmt_blue, fmt_end, ' '.join(sriov_vf_list)), state='i') + if enable_hypervisor: # VM command pipeline key @zkhandler.zk_conn.DataWatch(zkhandler.schema.path('base.cmd.domain')) diff --git a/node-daemon/pvcnoded/SRIOVVFInstance.py b/node-daemon/pvcnoded/SRIOVVFInstance.py new file mode 100644 index 00000000..ea20dff8 --- /dev/null +++ b/node-daemon/pvcnoded/SRIOVVFInstance.py @@ -0,0 +1,199 @@ +#!/usr/bin/env python3 + +# SRIOVVFInstance.py - Class implementing a PVC SR-IOV VF and run by pvcnoded +# Part of the Parallel Virtual Cluster (PVC) system +# +# Copyright (C) 2018-2021 Joshua M. Boniface +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, version 3. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . 
#
###############################################################################

import daemon_lib.common as common


def boolToOnOff(state):
    """Translate a boolean-like value into the 'on'/'off' keyword used by `ip link`.

    Only a truthy value whose string form is exactly 'True' (the boolean True
    or the string 'True') yields 'on'; every other value yields 'off'.
    """
    if state and str(state) == 'True':
        return 'on'
    return 'off'


def _decode_or_default(data, default):
    """Return `data` decoded as ASCII, or `default` if `data` has no decode().

    Watch callbacks can receive None instead of bytes; in that case (or for
    any other object without a decode() method) the default is used.
    """
    try:
        return data.decode('ascii')
    except AttributeError:
        return default


def _key_deleted(event):
    """Return a truthy value when a watch event reports that the key was deleted."""
    return event and event.type == 'DELETED'


class SRIOVVFInstance(object):
    """A single SR-IOV VF of this node, kept in sync with its Zookeeper keys.

    On creation the current VF settings are read from Zookeeper, then one data
    watch per setting is registered. When a watched value differs from the
    cached one, the cache is updated and (for everything except the MAC) the
    matching `ip link set <pf> vf <vfid> ...` command is run.
    """

    # Initialization function
    def __init__(self, vf, zkhandler, config, logger, this_node):
        """Read the VF's state from Zookeeper and register its watchers.

        vf:         name of the VF; the VF index is derived by stripping the
                    '<pf>v' prefix from it
        zkhandler:  handler providing read(), schema.path() and zk_conn
        config:     daemon configuration (stored, not otherwise used here)
        logger:     daemon logger (stored, not otherwise used here)
        this_node:  node object; its `name` is used as this host's name
        """
        self.vf = vf
        self.zkhandler = zkhandler
        self.config = config
        self.logger = logger
        self.this_node = this_node
        self.myhostname = self.this_node.name

        self.pf = self._read_key('sriov_vf.pf')
        self.mtu = self._read_key('sriov_vf.mtu')

        # The VF index is whatever remains of the VF name without '<pf>v'
        self.vfid = self.vf.replace('{}v'.format(self.pf), '')
        self.mac = self._read_key('sriov_vf.mac')

        self.vlan_id = self._read_key('sriov_vf.config.vlan_id')
        self.vlan_qos = self._read_key('sriov_vf.config.vlan_qos')
        self.tx_rate_min = self._read_key('sriov_vf.config.tx_rate_min')
        self.tx_rate_max = self._read_key('sriov_vf.config.tx_rate_max')
        self.spoof_check = self._read_key('sriov_vf.config.spoof_check')
        self.link_state = self._read_key('sriov_vf.config.link_state')
        self.trust = self._read_key('sriov_vf.config.trust')
        self.query_rss = self._read_key('sriov_vf.config.query_rss')

        self._register_watchers()

    def _read_key(self, key):
        """Read the value of schema key `key` for this VF from Zookeeper."""
        return self.zkhandler.read(('node.sriov.vf', self.myhostname, key, self.vf))

    def _key_path(self, key):
        """Return the full Zookeeper path of schema key `key` for this VF."""
        return self.zkhandler.schema.path('node.sriov.vf', self.myhostname) + self.zkhandler.schema.path(key, self.vf)

    def _set_vf(self, *settings):
        """Run `ip link set <pf> vf <vfid> <settings...>` for this VF."""
        arguments = ' '.join(str(setting) for setting in settings)
        common.run_os_command('ip link set {} vf {} {}'.format(self.pf, self.vfid, arguments))

    def _register_watchers(self):
        """Register the Zookeeper handlers for changed configs.

        Every watcher returns False when its key is deleted: the key has been
        deleted after existing before, so the watcher terminates because this
        class instance is about to be reaped in Daemon.py.
        """
        data_watch = self.zkhandler.zk_conn.DataWatch

        @data_watch(self._key_path('sriov_vf.mac'))
        def watch_vf_mac(data, stat, event=''):
            if _key_deleted(event):
                return False

            data = _decode_or_default(data, '00:00:00:00:00:00')
            # The MAC is only cached; no command is run for it here
            if data != self.mac:
                self.mac = data

        @data_watch(self._key_path('sriov_vf.config.vlan_id'))
        def watch_vf_vlan_id(data, stat, event=''):
            if _key_deleted(event):
                return False

            data = _decode_or_default(data, '0')
            if data != self.vlan_id:
                self.vlan_id = data
                # VLAN ID and QoS are always applied together in one command
                self._set_vf('vlan', self.vlan_id, 'qos', self.vlan_qos)

        @data_watch(self._key_path('sriov_vf.config.vlan_qos'))
        def watch_vf_vlan_qos(data, stat, event=''):
            if _key_deleted(event):
                return False

            data = _decode_or_default(data, '0')
            if data != self.vlan_qos:
                self.vlan_qos = data
                self._set_vf('vlan', self.vlan_id, 'qos', self.vlan_qos)

        @data_watch(self._key_path('sriov_vf.config.tx_rate_min'))
        def watch_vf_tx_rate_min(data, stat, event=''):
            if _key_deleted(event):
                return False

            data = _decode_or_default(data, '0')
            if data != self.tx_rate_min:
                self.tx_rate_min = data
                self._set_vf('min_tx_rate', self.tx_rate_min)

        @data_watch(self._key_path('sriov_vf.config.tx_rate_max'))
        def watch_vf_tx_rate_max(data, stat, event=''):
            if _key_deleted(event):
                return False

            data = _decode_or_default(data, '0')
            if data != self.tx_rate_max:
                self.tx_rate_max = data
                self._set_vf('max_tx_rate', self.tx_rate_max)

        @data_watch(self._key_path('sriov_vf.config.spoof_check'))
        def watch_vf_spoof_check(data, stat, event=''):
            if _key_deleted(event):
                return False

            data = _decode_or_default(data, '0')
            if data != self.spoof_check:
                self.spoof_check = data
                self._set_vf('spoofchk', boolToOnOff(self.spoof_check))

        @data_watch(self._key_path('sriov_vf.config.link_state'))
        def watch_vf_link_state(data, stat, event=''):
            if _key_deleted(event):
                return False

            # NOTE(review): ip-link documents auto|enable|disable for the VF
            # state; confirm that the 'on' fallback is what is intended here.
            data = _decode_or_default(data, 'on')
            if data != self.link_state:
                self.link_state = data
                self._set_vf('state', self.link_state)

        @data_watch(self._key_path('sriov_vf.config.trust'))
        def watch_vf_trust(data, stat, event=''):
            if _key_deleted(event):
                return False

            data = _decode_or_default(data, 'off')
            if data != self.trust:
                self.trust = data
                self._set_vf('trust', boolToOnOff(self.trust))

        @data_watch(self._key_path('sriov_vf.config.query_rss'))
        def watch_vf_query_rss(data, stat, event=''):
            if _key_deleted(event):
                return False

            data = _decode_or_default(data, 'off')
            if data != self.query_rss:
                self.query_rss = data
                # Apply the query_rss setting itself (not the trust flag)
                self._set_vf('query_rss', boolToOnOff(self.query_rss))