Implement SR-IOV PF and VF instances

Adds support for the node daemon managing SR-IOV PF and VF instances.

PFs are added to Zookeeper automatically based on the config at startup
during network configuration, and are otherwise completely static. PFs
are automatically removed from Zookeeper, along with all corresponding
VFs, should the PF phy device be removed from the configuration.

VFs are configured based on the (autocreated) VFs of each PF device,
added to Zookeeper, and then a new class instance, SRIOVVFInstance, is
used to watch them for configuration changes. This will enable the
runtime management of VF settings by the API. The set of keys ensures
that both configuration and details of the NIC can be tracked.

Most keys are self-explanatory, especially for PFs and the basic keys
for VFs. The configuration tree is also self-explanatory, being based
entirely on the options available in the `ip link set {dev} vf` command.

Two additional keys are also present: `used` and `used_by`, which will
be able to track the (boolean) state of usage, as well as the VM that
uses a given VIF. Since the VM side implementation will support both
macvtap and direct "hostdev" assignments, this will ensure that this
state can be tracked on both the VF and the VM side.
This commit is contained in:
Joshua Boniface 2021-06-17 01:01:23 -04:00
parent 0ad6d55dff
commit e7b6a3eac1
4 changed files with 327 additions and 2 deletions

View File

@ -0,0 +1 @@
{"version": "1", "root": "", "base": {"root": "", "schema": "/schema", "schema.version": "/schema/version", "config": "/config", "config.maintenance": "/config/maintenance", "config.primary_node": "/config/primary_node", "config.primary_node.sync_lock": "/config/primary_node/sync_lock", "config.upstream_ip": "/config/upstream_ip", "config.migration_target_selector": "/config/migration_target_selector", "cmd": "/cmd", "cmd.node": "/cmd/nodes", "cmd.domain": "/cmd/domains", "cmd.ceph": "/cmd/ceph", "node": "/nodes", "domain": "/domains", "network": "/networks", "storage": "/ceph", "storage.util": "/ceph/util", "osd": "/ceph/osds", "pool": "/ceph/pools", "volume": "/ceph/volumes", "snapshot": "/ceph/snapshots"}, "node": {"name": "", "keepalive": "/keepalive", "mode": "/daemonmode", "data.active_schema": "/activeschema", "data.latest_schema": "/latestschema", "data.static": "/staticdata", "running_domains": "/runningdomains", "count.provisioned_domains": "/domainscount", "count.networks": "/networkscount", "state.daemon": "/daemonstate", "state.router": "/routerstate", "state.domain": "/domainstate", "cpu.load": "/cpuload", "vcpu.allocated": "/vcpualloc", "memory.total": "/memtotal", "memory.used": "/memused", "memory.free": "/memfree", "memory.allocated": "/memalloc", "memory.provisioned": "/memprov", "ipmi.hostname": "/ipmihostname", "ipmi.username": "/ipmiusername", "ipmi.password": "/ipmipassword", "sriov": "/sriov", "sriov.pf": "/sriov/pf", "sriov.vf": "/sriov/vf"}, "sriov_pf": {"phy": "", "mtu": "/mtu", "vfcount": "/vfcount"}, "sriov_vf": {"phy": "", "pf": "/pf", "mtu": "/mtu", "mac": "/mac", "config": "/config", "config.vlan_id": "/config/vlan_id", "config.vlan_qos": "/config/vlan_qos", "config.tx_rate_min": "/config/tx_rate_min", "config.tx_rate_max": "/config/tx_rate_max", "config.spoof_check": "/config/spoof_check", "config.link_state": "/config/link_state", "config.trust": "/config/trust", "config.query_rss": "/config/query_rss", "used": "/used", "used_by": 
"/used_by"}, "domain": {"name": "", "xml": "/xml", "state": "/state", "profile": "/profile", "stats": "/stats", "node": "/node", "last_node": "/lastnode", "failed_reason": "/failedreason", "storage.volumes": "/rbdlist", "console.log": "/consolelog", "console.vnc": "/vnc", "meta.autostart": "/node_autostart", "meta.migrate_method": "/migration_method", "meta.node_selector": "/node_selector", "meta.node_limit": "/node_limit", "migrate.sync_lock": "/migrate_sync_lock"}, "network": {"vni": "", "type": "/nettype", "rule": "/firewall_rules", "rule.in": "/firewall_rules/in", "rule.out": "/firewall_rules/out", "nameservers": "/name_servers", "domain": "/domain", "reservation": "/dhcp4_reservations", "lease": "/dhcp4_leases", "ip4.gateway": "/ip4_gateway", "ip4.network": "/ip4_network", "ip4.dhcp": "/dhcp4_flag", "ip4.dhcp_start": "/dhcp4_start", "ip4.dhcp_end": "/dhcp4_end", "ip6.gateway": "/ip6_gateway", "ip6.network": "/ip6_network", "ip6.dhcp": "/dhcp6_flag"}, "reservation": {"mac": "", "ip": "/ipaddr", "hostname": "/hostname"}, "lease": {"mac": "", "ip": "/ipaddr", "hostname": "/hostname", "expiry": "/expiry", "client_id": "/clientid"}, "rule": {"description": "", "rule": "/rule", "order": "/order"}, "osd": {"id": "", "node": "/node", "device": "/device", "stats": "/stats"}, "pool": {"name": "", "pgs": "/pgs", "stats": "/stats"}, "volume": {"name": "", "stats": "/stats"}, "snapshot": {"name": "", "stats": "/stats"}}

View File

@ -426,7 +426,7 @@ class ZKHandler(object):
# #
class ZKSchema(object): class ZKSchema(object):
# Current version # Current version
_version = 0 _version = 1
# Root for doing nested keys # Root for doing nested keys
_schema_root = '' _schema_root = ''
@ -483,7 +483,34 @@ class ZKSchema(object):
'memory.provisioned': '/memprov', 'memory.provisioned': '/memprov',
'ipmi.hostname': '/ipmihostname', 'ipmi.hostname': '/ipmihostname',
'ipmi.username': '/ipmiusername', 'ipmi.username': '/ipmiusername',
'ipmi.password': '/ipmipassword' 'ipmi.password': '/ipmipassword',
'sriov': '/sriov',
'sriov.pf': '/sriov/pf',
'sriov.vf': '/sriov/vf',
},
# The schema of an individual SR-IOV PF entry (/nodes/{node_name}/sriov/pf/{pf})
'sriov_pf': {
'phy': '', # The root key
'mtu': '/mtu',
'vfcount': '/vfcount'
},
# The schema of an individual SR-IOV VF entry (/nodes/{node_name}/sriov/vf/{vf})
'sriov_vf': {
'phy': '', # The root key
'pf': '/pf',
'mtu': '/mtu',
'mac': '/mac',
'config': '/config',
'config.vlan_id': '/config/vlan_id',
'config.vlan_qos': '/config/vlan_qos',
'config.tx_rate_min': '/config/tx_rate_min',
'config.tx_rate_max': '/config/tx_rate_max',
'config.spoof_check': '/config/spoof_check',
'config.link_state': '/config/link_state',
'config.trust': '/config/trust',
'config.query_rss': '/config/query_rss',
'used': '/used',
'used_by': '/used_by'
}, },
# The schema of an individual domain entry (/domains/{domain_uuid}) # The schema of an individual domain entry (/domains/{domain_uuid})
'domain': { 'domain': {
@ -709,6 +736,10 @@ class ZKSchema(object):
if not zkhandler.zk_conn.exists(nkipath): if not zkhandler.zk_conn.exists(nkipath):
result = False result = False
# One might expect child keys under node (specifically, sriov.pf and sriov.vf) to be
# managed here as well, but those are created automatically every time pvcnoded starts
# and thus never need to be validated or applied.
# These two have several children layers that must be parsed through # These two have several children layers that must be parsed through
for elem in ['volume']: for elem in ['volume']:
# First read all the subelements of the key class (pool layer) # First read all the subelements of the key class (pool layer)
@ -782,6 +813,10 @@ class ZKSchema(object):
if not zkhandler.zk_conn.exists(nkipath): if not zkhandler.zk_conn.exists(nkipath):
zkhandler.zk_conn.create(nkipath, ''.encode(zkhandler.encoding)) zkhandler.zk_conn.create(nkipath, ''.encode(zkhandler.encoding))
# One might expect child keys under node (specifically, sriov.pf and sriov.vf) to be
# managed here as well, but those are created automatically every time pvcnoded starts
# and thus never need to be validated or applied.
# These two have several children layers that must be parsed through # These two have several children layers that must be parsed through
for elem in ['volume']: for elem in ['volume']:
# First read all the subelements of the key class (pool layer) # First read all the subelements of the key class (pool layer)

View File

@ -49,6 +49,7 @@ import daemon_lib.common as common
import pvcnoded.VMInstance as VMInstance import pvcnoded.VMInstance as VMInstance
import pvcnoded.NodeInstance as NodeInstance import pvcnoded.NodeInstance as NodeInstance
import pvcnoded.VXNetworkInstance as VXNetworkInstance import pvcnoded.VXNetworkInstance as VXNetworkInstance
import pvcnoded.SRIOVVFInstance as SRIOVVFInstance
import pvcnoded.DNSAggregatorInstance as DNSAggregatorInstance import pvcnoded.DNSAggregatorInstance as DNSAggregatorInstance
import pvcnoded.CephInstance as CephInstance import pvcnoded.CephInstance as CephInstance
import pvcnoded.MetadataAPIInstance as MetadataAPIInstance import pvcnoded.MetadataAPIInstance as MetadataAPIInstance
@ -390,6 +391,7 @@ else:
# PHASE 2a - Activate SR-IOV support # PHASE 2a - Activate SR-IOV support
############################################################################### ###############################################################################
# This happens before other networking steps to enable using VFs for cluster functions.
if enable_networking and enable_sriov: if enable_networking and enable_sriov:
logger.out('Setting up SR-IOV device support', state='i') logger.out('Setting up SR-IOV device support', state='i')
# Enable unsafe interruptts for the vfio_iommu_type1 kernel module # Enable unsafe interruptts for the vfio_iommu_type1 kernel module
@ -916,12 +918,15 @@ logger.out('Setting up objects', state='i')
d_node = dict() d_node = dict()
d_network = dict() d_network = dict()
d_sriov_vf = dict()
d_domain = dict() d_domain = dict()
d_osd = dict() d_osd = dict()
d_pool = dict() d_pool = dict()
d_volume = dict() # Dict of Dicts d_volume = dict() # Dict of Dicts
node_list = [] node_list = []
network_list = [] network_list = []
sriov_pf_list = []
sriov_vf_list = []
domain_list = [] domain_list = []
osd_list = [] osd_list = []
pool_list = [] pool_list = []
@ -1076,6 +1081,91 @@ if enable_networking:
for node in d_node: for node in d_node:
d_node[node].update_network_list(d_network) d_node[node].update_network_list(d_network)
# Add the SR-IOV PFs and VFs to Zookeeper
# These do not behave like the objects; they are not dynamic (the API cannot change them), and they
# exist for the lifetime of this Node instance. The objects are set here in Zookeeper on a per-node
# basis, under the Node configuration tree.
# MIGRATION: The schema.schema.get ensures that the current active Schema contains the required keys
if enable_sriov and zkhandler.schema.schema.get('sriov_pf', None) is not None:
    vf_list = list()
    for device in config['sriov_device']:
        pf = device['phy']
        vfcount = device['vfcount']
        # Fall back to a 1500-byte MTU when the device config does not set one
        if device.get('mtu', None) is None:
            mtu = 1500
        else:
            mtu = device['mtu']

        # Create the PF device in Zookeeper
        zkhandler.write([
            (('node.sriov.pf', myhostname, 'sriov_pf', pf), ''),
            (('node.sriov.pf', myhostname, 'sriov_pf.mtu', pf), mtu),
            (('node.sriov.pf', myhostname, 'sriov_pf.vfcount', pf), vfcount),
        ])
        # Append the device to the list of PFs
        sriov_pf_list.append(pf)

        # Get the list of VFs from `ip link show`
        vf_list = json.loads(common.run_os_command('ip --json link show {}'.format(pf))[1])[0].get('vfinfo_list', [])
        for vf in vf_list:
            # Each entry is expected to look like:
            # {
            #   'vf': 3,
            #   'link_type': 'ether',
            #   'address': '00:00:00:00:00:00',
            #   'broadcast': 'ff:ff:ff:ff:ff:ff',
            #   'vlan_list': [{'vlan': 101, 'qos': 2}],
            #   'rate': {'max_tx': 0, 'min_tx': 0},
            #   'spoofchk': True,
            #   'link_state': 'auto',
            #   'trust': False,
            #   'query_rss_en': False
            # }
            vfphy = '{}v{}'.format(pf, vf['vf'])

            # A missing or empty 'vlan_list' must not abort startup; treat it as "no VLAN set"
            # so the vlan_id/vlan_qos keys fall back to '' exactly like a missing sub-key does.
            vlan_entries = vf.get('vlan_list') or [{}]
            vlan = vlan_entries[0]

            zkhandler.write([
                (('node.sriov.vf', myhostname, 'sriov_vf', vfphy), ''),
                (('node.sriov.vf', myhostname, 'sriov_vf.pf', vfphy), pf),
                (('node.sriov.vf', myhostname, 'sriov_vf.mtu', vfphy), mtu),
                (('node.sriov.vf', myhostname, 'sriov_vf.mac', vfphy), vf['address']),
                (('node.sriov.vf', myhostname, 'sriov_vf.config', vfphy), ''),
                (('node.sriov.vf', myhostname, 'sriov_vf.config.vlan_id', vfphy), vlan.get('vlan', '')),
                (('node.sriov.vf', myhostname, 'sriov_vf.config.vlan_qos', vfphy), vlan.get('qos', '')),
                (('node.sriov.vf', myhostname, 'sriov_vf.config.tx_rate_min', vfphy), vf['rate']['min_tx']),
                (('node.sriov.vf', myhostname, 'sriov_vf.config.tx_rate_max', vfphy), vf['rate']['max_tx']),
                (('node.sriov.vf', myhostname, 'sriov_vf.config.spoof_check', vfphy), vf['spoofchk']),
                (('node.sriov.vf', myhostname, 'sriov_vf.config.link_state', vfphy), vf['link_state']),
                (('node.sriov.vf', myhostname, 'sriov_vf.config.trust', vfphy), vf['trust']),
                (('node.sriov.vf', myhostname, 'sriov_vf.config.query_rss', vfphy), vf['query_rss_en']),
            ])
            # Append the device to the list of VFs
            sriov_vf_list.append(vfphy)

    # Remove any obsolete PFs from Zookeeper if they go away
    for pf in zkhandler.children(('node.sriov.pf', myhostname)):
        if pf not in sriov_pf_list:
            zkhandler.delete([
                ('node.sriov.pf', myhostname, 'sriov_pf', pf)
            ])
    # Remove any obsolete VFs from Zookeeper if their PF goes away
    for vf in zkhandler.children(('node.sriov.vf', myhostname)):
        vf_pf = zkhandler.read(('node.sriov.vf', myhostname, 'sriov_vf.pf', vf))
        if vf_pf not in sriov_pf_list:
            zkhandler.delete([
                ('node.sriov.vf', myhostname, 'sriov_vf', vf)
            ])

    # SR-IOV VF objects
    # This is a ChildrenWatch just for consistency; the list never changes at runtime
    # NOTE: despite its name, this callback handles the VF list, not the PF list.
    @zkhandler.zk_conn.ChildrenWatch(zkhandler.schema.path('node.sriov.vf', myhostname))
    def update_sriov_pfs(new_sriov_vf_list):
        global sriov_vf_list, d_sriov_vf

        # Drop instances whose VF key no longer exists
        for vf in list(d_sriov_vf):
            if vf not in new_sriov_vf_list:
                del d_sriov_vf[vf]

        # Add VFs to the list; an existing instance is kept so that a repeated callback
        # does not register a second set of data watches for the same VF
        for vf in new_sriov_vf_list:
            if vf not in d_sriov_vf:
                d_sriov_vf[vf] = SRIOVVFInstance.SRIOVVFInstance(vf, zkhandler, config, logger, this_node)

        sriov_vf_list = new_sriov_vf_list
        logger.out('{}SR-IOV VF list:{} {}'.format(fmt_blue, fmt_end, ' '.join(sriov_vf_list)), state='i')
if enable_hypervisor: if enable_hypervisor:
# VM command pipeline key # VM command pipeline key
@zkhandler.zk_conn.DataWatch(zkhandler.schema.path('base.cmd.domain')) @zkhandler.zk_conn.DataWatch(zkhandler.schema.path('base.cmd.domain'))

View File

@ -0,0 +1,199 @@
#!/usr/bin/env python3
# SRIOVVFInstance.py - Class implementing a PVC SR-IOV VF and run by pvcnoded
# Part of the Parallel Virtual Cluster (PVC) system
#
# Copyright (C) 2018-2021 Joshua M. Boniface <joshua@boniface.me>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, version 3.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
#
###############################################################################
import daemon_lib.common as common
def boolToOnOff(state):
    """Translate a boolean-like value into the 'on'/'off' word used by `ip link`.

    Only a truthy value whose string form is exactly 'True' (the bool True or
    the string 'True') yields 'on'; every other value yields 'off'.
    """
    enabled = bool(state) and str(state) == 'True'
    return ('off', 'on')[enabled]
class SRIOVVFInstance(object):
    """Mirror one SR-IOV VF's Zookeeper keys and apply changes with `ip link set`.

    On construction the instance reads the VF's current values from Zookeeper
    and registers one data watch per key. When a watched key changes, the new
    value is stored on the instance and, for the configuration keys, the
    matching `ip link set {pf} vf {vfid} ...` command is run.
    """

    # Initialization function
    def __init__(self, vf, zkhandler, config, logger, this_node):
        """Read the VF's state from Zookeeper and start watching its keys.

        vf:         name of the VF key; `vfid` is derived by stripping '{pf}v' from it
        zkhandler:  Zookeeper handler used for reads, schema paths and watches
        config:     daemon configuration (stored, not otherwise used here)
        logger:     daemon logger (stored, not otherwise used here)
        this_node:  node object; only its `name` attribute is read here
        """
        self.vf = vf
        self.zkhandler = zkhandler
        self.config = config
        self.logger = logger
        self.this_node = this_node
        self.myhostname = self.this_node.name

        self.pf = self._read_key('sriov_vf.pf')
        self.mtu = self._read_key('sriov_vf.mtu')
        # The VF index is whatever remains of the VF name once '{pf}v' is removed
        self.vfid = self.vf.replace('{}v'.format(self.pf), '')

        self.mac = self._read_key('sriov_vf.mac')
        self.vlan_id = self._read_key('sriov_vf.config.vlan_id')
        self.vlan_qos = self._read_key('sriov_vf.config.vlan_qos')
        self.tx_rate_min = self._read_key('sriov_vf.config.tx_rate_min')
        self.tx_rate_max = self._read_key('sriov_vf.config.tx_rate_max')
        self.spoof_check = self._read_key('sriov_vf.config.spoof_check')
        self.link_state = self._read_key('sriov_vf.config.link_state')
        self.trust = self._read_key('sriov_vf.config.trust')
        self.query_rss = self._read_key('sriov_vf.config.query_rss')

        # Zookeeper handlers for changed configs
        # The MAC is only tracked; no command is run when it changes
        self._watch_key('sriov_vf.mac', 'mac', '00:00:00:00:00:00')
        # VLAN ID and QoS are always applied together in a single command
        self._watch_key('sriov_vf.config.vlan_id', 'vlan_id', '0', self._apply_vlan)
        self._watch_key('sriov_vf.config.vlan_qos', 'vlan_qos', '0', self._apply_vlan)
        self._watch_key(
            'sriov_vf.config.tx_rate_min', 'tx_rate_min', '0',
            lambda: self._set_vf_option('min_tx_rate {}'.format(self.tx_rate_min))
        )
        self._watch_key(
            'sriov_vf.config.tx_rate_max', 'tx_rate_max', '0',
            lambda: self._set_vf_option('max_tx_rate {}'.format(self.tx_rate_max))
        )
        self._watch_key(
            'sriov_vf.config.spoof_check', 'spoof_check', '0',
            lambda: self._set_vf_option('spoofchk {}'.format(boolToOnOff(self.spoof_check)))
        )
        # NOTE(review): the fallback 'on' is kept from the original code, but `ip link`
        # documents auto/enable/disable for `state` - confirm the intended default.
        self._watch_key(
            'sriov_vf.config.link_state', 'link_state', 'on',
            lambda: self._set_vf_option('state {}'.format(self.link_state))
        )
        self._watch_key(
            'sriov_vf.config.trust', 'trust', 'off',
            lambda: self._set_vf_option('trust {}'.format(boolToOnOff(self.trust)))
        )
        # Uses the query_rss option; this previously issued `trust` by mistake
        self._watch_key(
            'sriov_vf.config.query_rss', 'query_rss', 'off',
            lambda: self._set_vf_option('query_rss {}'.format(boolToOnOff(self.query_rss)))
        )

    def _read_key(self, key):
        """Return the current Zookeeper value of schema key `key` for this VF."""
        return self.zkhandler.read(('node.sriov.vf', self.myhostname, key, self.vf))

    def _set_vf_option(self, option):
        """Run `ip link set {pf} vf {vfid} {option}` for this VF."""
        common.run_os_command('ip link set {} vf {} {}'.format(self.pf, self.vfid, option))

    def _apply_vlan(self):
        """Push the stored VLAN ID and QoS to the VF in one command."""
        self._set_vf_option('vlan {} qos {}'.format(self.vlan_id, self.vlan_qos))

    def _watch_key(self, key, attr, default, on_change=None):
        """Register a data watch that keeps `self.<attr>` in sync with schema key `key`.

        key:        schema key of the VF entry (e.g. 'sriov_vf.config.trust')
        attr:       name of the instance attribute holding the last seen value
        default:    value used when the watch delivers no decodable data
        on_change:  optional zero-argument callable run after the attribute is updated
        """
        path = self.zkhandler.schema.path('node.sriov.vf', self.myhostname) + self.zkhandler.schema.path(key, self.vf)

        @self.zkhandler.zk_conn.DataWatch(path)
        def watch_vf_key(data, stat, event=''):
            if event and event.type == 'DELETED':
                # The key has been deleted after existing before; terminate this watcher
                # because this class instance is about to be reaped in Daemon.py
                return False

            try:
                data = data.decode('ascii')
            except AttributeError:
                # No bytes payload was delivered (e.g. data is None); use the fallback
                data = default

            # Only act on real changes so unchanged values do not re-run the command
            if data != getattr(self, attr):
                setattr(self, attr, data)
                if on_change is not None:
                    on_change()