From 09269f182cdcbc611232914751856993b4e3bed9 Mon Sep 17 00:00:00 2001 From: "Joshua M. Boniface" Date: Wed, 10 Jan 2024 16:13:31 -0500 Subject: [PATCH] Add live migrate max downtime selector meta field Adds a new flag to VM metadata to allow setting the VM live migration max downtime. This will enable very busy VMs that hang live migration to have this value changed. --- .../977e7b4d3497_pvc_version_0_9_89.py | 28 ++++++++ api-daemon/pvc-api-db-upgrade | 3 +- api-daemon/pvcapid/flaskapi.py | 64 ++++++++++++++++++- api-daemon/pvcapid/helper.py | 22 ++++++- api-daemon/pvcapid/models.py | 3 + api-daemon/pvcapid/provisioner.py | 10 ++- client-cli/pvc/cli/cli.py | 48 ++++++++++++-- client-cli/pvc/lib/provisioner.py | 32 ++++++++-- client-cli/pvc/lib/vm.py | 33 +++++++--- daemon-common/common.py | 8 +++ daemon-common/migrations/versions/13.json | 1 + daemon-common/vm.py | 16 +++++ daemon-common/vmbuilder.py | 2 + daemon-common/zkhandler.py | 5 +- gen-api-migrations | 11 +++- node-daemon/pvcnoded/objects/VMInstance.py | 23 +++++++ pvc.sample.conf | 4 +- 17 files changed, 283 insertions(+), 30 deletions(-) create mode 100644 api-daemon/migrations/versions/977e7b4d3497_pvc_version_0_9_89.py create mode 100644 daemon-common/migrations/versions/13.json diff --git a/api-daemon/migrations/versions/977e7b4d3497_pvc_version_0_9_89.py b/api-daemon/migrations/versions/977e7b4d3497_pvc_version_0_9_89.py new file mode 100644 index 00000000..86571794 --- /dev/null +++ b/api-daemon/migrations/versions/977e7b4d3497_pvc_version_0_9_89.py @@ -0,0 +1,28 @@ +"""PVC version 0.9.89 + +Revision ID: 977e7b4d3497 +Revises: 88fa0d88a9f8 +Create Date: 2024-01-10 16:09:44.659027 + +""" +from alembic import op +import sqlalchemy as sa + + +# revision identifiers, used by Alembic. +revision = '977e7b4d3497' +down_revision = '88fa0d88a9f8' +branch_labels = None +depends_on = None + + +def upgrade(): + # ### commands auto generated by Alembic - please adjust! 
### + op.add_column('system_template', sa.Column('migration_max_downtime', sa.Integer(), default="300", server_default="300", nullable=True)) + # ### end Alembic commands ### + + +def downgrade(): + # ### commands auto generated by Alembic - please adjust! ### + op.drop_column('system_template', 'migration_max_downtime') + # ### end Alembic commands ### diff --git a/api-daemon/pvc-api-db-upgrade b/api-daemon/pvc-api-db-upgrade index 4a9d1d30..c75e4ae4 100755 --- a/api-daemon/pvc-api-db-upgrade +++ b/api-daemon/pvc-api-db-upgrade @@ -19,7 +19,8 @@ case "$( cat /etc/debian_version )" in ;; *) # Debian 12+ - flask --app ./pvcapid-manage_flask.py db upgrade + export FLASK_APP=./pvcapid-manage_flask.py + flask db upgrade ;; esac diff --git a/api-daemon/pvcapid/flaskapi.py b/api-daemon/pvcapid/flaskapi.py index 691df059..c312638f 100755 --- a/api-daemon/pvcapid/flaskapi.py +++ b/api-daemon/pvcapid/flaskapi.py @@ -46,7 +46,7 @@ from flask_sqlalchemy import SQLAlchemy app = flask.Flask(__name__) # Set up SQLAlchemy backend -app.config["SQLALCHEMY_TRACK_MODIFICATIONS"] = False +app.config["SQLALCHEMY_TRACK_MODIFICATIONS"] = True app.config["SQLALCHEMY_DATABASE_URI"] = "postgresql://{}:{}@{}:{}/{}".format( config["api_postgresql_user"], config["api_postgresql_password"], @@ -1591,6 +1591,9 @@ class API_VM_Root(Resource): migration_method: type: string description: The preferred migration method (live, shutdown, none) + migration_max_downtime: + type: integer + description: The maximum time in milliseconds that a VM can be down for during a live migration; busy VMs may require a larger max_downtime tags: type: array description: The tag(s) of the VM @@ -1843,6 +1846,10 @@ class API_VM_Root(Resource): "choices": ("live", "shutdown", "none"), "helptext": "A valid migration_method must be specified", }, + { + "name": "migration_max_downtime", + "helptext": "A valid migration_max_downtime must be specified", + }, {"name": "user_tags", "action": "append"}, {"name": 
"protected_tags", "action": "append"}, { @@ -1903,6 +1910,12 @@ class API_VM_Root(Resource): - live - shutdown - none + - in: query + name: migration_max_downtime + type: integer + required: false + description: The maximum time in milliseconds that a VM can be down for during a live migration; busy VMs may require a larger max_downtime + default: 300 - in: query name: user_tags type: array @@ -1943,6 +1956,7 @@ class API_VM_Root(Resource): reqargs.get("selector", "none"), bool(strtobool(reqargs.get("autostart", "false"))), reqargs.get("migration_method", "none"), + reqargs.get("migration_max_downtime", 300), user_tags, protected_tags, ) @@ -1990,6 +2004,10 @@ class API_VM_Element(Resource): "choices": ("live", "shutdown", "none"), "helptext": "A valid migration_method must be specified", }, + { + "name": "migration_max_downtime", + "helptext": "A valid migration_max_downtime must be specified", + }, {"name": "user_tags", "action": "append"}, {"name": "protected_tags", "action": "append"}, { @@ -2052,6 +2070,12 @@ class API_VM_Element(Resource): - live - shutdown - none + - in: query + name: migration_max_downtime + type: integer + required: false + description: The maximum time in milliseconds that a VM can be down for during a live migration; busy VMs may require a larger max_downtime + default: 300 - in: query name: user_tags type: array @@ -2092,6 +2116,7 @@ class API_VM_Element(Resource): reqargs.get("selector", "none"), bool(strtobool(reqargs.get("autostart", "false"))), reqargs.get("migration_method", "none"), + reqargs.get("migration_max_downtime", 300), user_tags, protected_tags, ) @@ -2218,6 +2243,9 @@ class API_VM_Metadata(Resource): migration_method: type: string description: The preferred migration method (live, shutdown, none) + migration_max_downtime: + type: integer + description: The maximum time in milliseconds that a VM can be down for during a live migration; busy VMs may require a larger max_downtime 404: description: VM not found schema: @@ 
-2241,6 +2269,10 @@ class API_VM_Metadata(Resource): "choices": ("live", "shutdown", "none"), "helptext": "A valid migration_method must be specified", }, + { + "name": "migration_max_downtime", + "helptext": "A valid migration_max_downtime must be specified", + }, ] ) @Authenticator @@ -2288,6 +2320,12 @@ class API_VM_Metadata(Resource): - live - shutdown - none + - in: query + name: migration_max_downtime + type: integer + required: false + description: The maximum time in milliseconds that a VM can be down for during a live migration; busy VMs may require a larger max_downtime + default: none responses: 200: description: OK @@ -2312,6 +2350,7 @@ class API_VM_Metadata(Resource): reqargs.get("autostart", None), reqargs.get("profile", None), reqargs.get("migration_method", None), + reqargs.get("migration_max_downtime", None), ) @@ -6387,6 +6426,9 @@ class API_Provisioner_Template_System_Root(Resource): migration_method: type: string description: The preferred migration method (live, shutdown, none) + migration_max_downtime: + type: integer + description: The maximum time in milliseconds that a VM can be down for during a live migration; busy VMs may require a larger max_downtime parameters: - in: query name: limit @@ -6431,6 +6473,7 @@ class API_Provisioner_Template_System_Root(Resource): {"name": "node_selector"}, {"name": "node_autostart"}, {"name": "migration_method"}, + {"name": "migration_max_downtime"}, ] ) @Authenticator @@ -6491,6 +6534,11 @@ class API_Provisioner_Template_System_Root(Resource): type: string required: false description: The preferred migration method (live, shutdown, none) + - in: query + name: migration_max_downtime + type: integer + required: false + description: The maximum time in milliseconds that a VM can be down for during a live migration; busy VMs may require a larger max_downtime responses: 200: description: OK @@ -6541,6 +6589,7 @@ class API_Provisioner_Template_System_Root(Resource): reqargs.get("node_selector", None), 
node_autostart, reqargs.get("migration_method", None), + reqargs.get("migration_max_downtime", None), ) @@ -6596,6 +6645,7 @@ class API_Provisioner_Template_System_Element(Resource): {"name": "node_selector"}, {"name": "node_autostart"}, {"name": "migration_method"}, + {"name": "migration_max_downtime"}, ] ) @Authenticator @@ -6651,6 +6701,11 @@ class API_Provisioner_Template_System_Element(Resource): type: string required: false description: The preferred migration method (live, shutdown, none) + - in: query + name: migration_max_downtime + type: integer + required: false + description: The maximum time in milliseconds that a VM can be down for during a live migration; busy VMs may require a larger max_downtime responses: 200: description: OK @@ -6701,6 +6756,7 @@ class API_Provisioner_Template_System_Element(Resource): reqargs.get("node_selector", None), node_autostart, reqargs.get("migration_method", None), + reqargs.get("migration_max_downtime", None), ) @RequestParser( @@ -6714,6 +6770,7 @@ class API_Provisioner_Template_System_Element(Resource): {"name": "node_selector"}, {"name": "node_autostart"}, {"name": "migration_method"}, + {"name": "migration_max_downtime"}, ] ) @Authenticator @@ -6760,6 +6817,10 @@ class API_Provisioner_Template_System_Element(Resource): name: migration_method type: string description: The preferred migration method (live, shutdown, none) + - in: query + name: migration_max_downtime + type: integer + description: The maximum time in milliseconds that a VM can be down for during a live migration; busy VMs may require a larger max_downtime responses: 200: description: OK @@ -6783,6 +6844,7 @@ class API_Provisioner_Template_System_Element(Resource): reqargs.get("node_selector", None), reqargs.get("node_autostart", None), reqargs.get("migration_method", None), + reqargs.get("migration_max_downtime", None), ) @Authenticator diff --git a/api-daemon/pvcapid/helper.py b/api-daemon/pvcapid/helper.py index 8f665b07..ca2ff7c1 100755 --- 
a/api-daemon/pvcapid/helper.py +++ b/api-daemon/pvcapid/helper.py @@ -641,6 +641,7 @@ def vm_define( selector, autostart, migration_method, + migration_max_downtime=300, user_tags=[], protected_tags=[], ): @@ -668,6 +669,7 @@ def vm_define( selector, autostart, migration_method, + migration_max_downtime, profile=None, tags=tags, ) @@ -826,6 +828,7 @@ def get_vm_meta(zkhandler, vm): domain_node_selector, domain_node_autostart, domain_migrate_method, + domain_migrate_max_downtime, ) = pvc_common.getDomainMetadata(zkhandler, dom_uuid) retcode = 200 @@ -835,6 +838,7 @@ def get_vm_meta(zkhandler, vm): "node_selector": domain_node_selector.lower(), "node_autostart": domain_node_autostart, "migration_method": domain_migrate_method.lower(), + "migration_max_downtime": int(domain_migrate_max_downtime), } return retdata, retcode @@ -842,7 +846,14 @@ def get_vm_meta(zkhandler, vm): @ZKConnection(config) def update_vm_meta( - zkhandler, vm, limit, selector, autostart, provisioner_profile, migration_method + zkhandler, + vm, + limit, + selector, + autostart, + provisioner_profile, + migration_method, + migration_max_downtime, ): """ Update metadata of a VM. 
@@ -858,7 +869,14 @@ def update_vm_meta( autostart = False retflag, retdata = pvc_vm.modify_vm_metadata( - zkhandler, vm, limit, selector, autostart, provisioner_profile, migration_method + zkhandler, + vm, + limit, + selector, + autostart, + provisioner_profile, + migration_method, + migration_max_downtime, ) if retflag: diff --git a/api-daemon/pvcapid/models.py b/api-daemon/pvcapid/models.py index e61a3059..f9c559cb 100755 --- a/api-daemon/pvcapid/models.py +++ b/api-daemon/pvcapid/models.py @@ -36,6 +36,7 @@ class DBSystemTemplate(db.Model): node_selector = db.Column(db.Text) node_autostart = db.Column(db.Boolean, nullable=False) migration_method = db.Column(db.Text) + migration_max_downtime = db.Column(db.Integer, default=300, server_default="300") ova = db.Column(db.Integer, db.ForeignKey("ova.id"), nullable=True) def __init__( @@ -50,6 +51,7 @@ class DBSystemTemplate(db.Model): node_selector, node_autostart, migration_method, + migration_max_downtime, ova=None, ): self.name = name @@ -62,6 +64,7 @@ class DBSystemTemplate(db.Model): self.node_selector = node_selector self.node_autostart = node_autostart self.migration_method = migration_method + self.migration_max_downtime = migration_max_downtime self.ova = ova def __repr__(self): diff --git a/api-daemon/pvcapid/provisioner.py b/api-daemon/pvcapid/provisioner.py index b15e53aa..0a1535b2 100755 --- a/api-daemon/pvcapid/provisioner.py +++ b/api-daemon/pvcapid/provisioner.py @@ -221,6 +221,7 @@ def create_template_system( node_selector=None, node_autostart=False, migration_method=None, + migration_max_downtime=None, ova=None, ): if list_template_system(name, is_fuzzy=False)[-1] != 404: @@ -231,7 +232,7 @@ def create_template_system( if node_selector == "none": node_selector = None - query = "INSERT INTO system_template (name, vcpu_count, vram_mb, serial, vnc, vnc_bind, node_limit, node_selector, node_autostart, migration_method, ova) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s);" + query = "INSERT INTO 
system_template (name, vcpu_count, vram_mb, serial, vnc, vnc_bind, node_limit, node_selector, node_autostart, migration_method, migration_max_downtime, ova) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s);" args = ( name, vcpu_count, @@ -243,6 +244,7 @@ def create_template_system( node_selector, node_autostart, migration_method, + migration_max_downtime, ova, ) @@ -438,6 +440,7 @@ def modify_template_system( node_selector=None, node_autostart=None, migration_method=None, + migration_max_downtime=None, ): if list_template_system(name, is_fuzzy=False)[-1] != 200: retmsg = {"message": 'The system template "{}" does not exist.'.format(name)} @@ -505,6 +508,11 @@ def modify_template_system( if migration_method is not None: fields.append({"field": "migration_method", "data": migration_method}) + if migration_max_downtime is not None: + fields.append( + {"field": "migration_max_downtime", "data": int(migration_max_downtime)} + ) + conn, cur = open_database(config) try: for field in fields: diff --git a/client-cli/pvc/cli/cli.py b/client-cli/pvc/cli/cli.py index e18ec188..62ac0f83 100644 --- a/client-cli/pvc/cli/cli.py +++ b/client-cli/pvc/cli/cli.py @@ -1098,6 +1098,14 @@ def cli_vm(): type=click.Choice(["none", "live", "shutdown"]), help="The preferred migration method of the VM between nodes; saved with VM.", ) +@click.option( + "-d", + "--max-downtime", + "migration_max_downtime", + default=300, + show_default=True, + help="The maximum time in milliseconds that a VM can be down for during a live migration; busy VMs may require a larger downtime.", +) @click.option( "-g", "--tag", @@ -1122,6 +1130,7 @@ def cli_vm_define( node_selector, node_autostart, migration_method, + migration_max_downtime, user_tags, protected_tags, ): @@ -1135,10 +1144,12 @@ def cli_vm_define( * "load": choose the node with the lowest current load average * "vms": choose the node with the least number of provisioned VMs - For most clusters, "mem" should be sufficient, but others may be 
used based on the cluster workload and available resources. The following caveats should be considered: + For most clusters, the node selector ("--node-selector"/"-s") "mem" should be sufficient, but others may be used based on the cluster workload and available resources. The following caveats should be considered: * "mem" looks at the free memory of the node in general, ignoring the amount provisioned to VMs; if any VM's internal memory usage changes, this value would be affected. * "memprov" looks at the provisioned memory, not the allocated memory; thus, stopped or disabled VMs are counted towards a node's memory for this selector, even though their memory is not actively in use. * "load" looks at the system load of the node in general, ignoring load in any particular VMs; if any VM's CPU usage changes, this value would be affected. This might be preferable on clusters with some very CPU intensive VMs. + + For most VMs, the 300ms default maximum downtime ("--max-downtime"/"-d") should be sufficient. However, very busy VMs with a lot of memory pressure or CPU load may require a larger downtime to properly migrate. Generally, keep this at the default unless you know the VM will be extremely busy, or you find you have problems migrating it later. Reasonable values range from 100ms to 2000ms (2 seconds). 
""" # Open the XML file @@ -1160,6 +1171,7 @@ def cli_vm_define( node_selector, node_autostart, migration_method, + migration_max_downtime, user_tags, protected_tags, ) @@ -1205,6 +1217,13 @@ def cli_vm_define( type=click.Choice(["none", "live", "shutdown"]), help="The preferred migration method of the VM between nodes.", ) +@click.option( + "-d", + "--max-downtime", + "migration_max_downtime", + default=None, + help="The maximum time in milliseconds that a VM can be down for during a live migration; busy VMs may require a larger downtime.", +) @click.option( "-p", "--profile", @@ -1220,12 +1239,13 @@ def cli_vm_meta( node_selector, node_autostart, migration_method, + migration_max_downtime, provisioner_profile, ): """ Modify the PVC metadata of existing virtual machine DOMAIN. At least one option to update must be specified. DOMAIN may be a UUID or name. - For details on the "--node-selector"/"-s" values, please see help for the command "pvc vm define". + For details on the available option values, please see help for the command "pvc vm define". 
""" if ( @@ -1233,6 +1253,7 @@ def cli_vm_meta( and node_selector is None and node_autostart is None and migration_method is None + and migration_max_downtime is None and provisioner_profile is None ): finish(False, "At least one metadata option must be specified to update.") @@ -1244,6 +1265,7 @@ def cli_vm_meta( node_selector, node_autostart, migration_method, + migration_max_downtime, provisioner_profile, ) finish(retcode, retmsg) @@ -4456,6 +4478,13 @@ def cli_provisioner_template_system(): default=None, # Use cluster default help="The preferred migration method of the VM between nodes", ) +@click.option( + "--max-downtime", + "migration_max_downtime", + default=300, + show_default=True, + help="The maximum time in milliseconds that a VM can be down for during a live migration; busy VMs may require a larger downtime.", +) def cli_provisioner_template_system_add( name, vcpus, @@ -4467,11 +4496,12 @@ def cli_provisioner_template_system_add( node_selector, node_autostart, migration_method, + migration_max_downtime, ): """ Add a new system template NAME to the PVC cluster provisioner. - For details on the possible "--node-selector" values, please see help for the command "pvc vm define". + For details on the possible option values, please see help for the command "pvc vm define". 
""" params = dict() params["name"] = name @@ -4489,6 +4519,8 @@ def cli_provisioner_template_system_add( params["node_autostart"] = node_autostart if migration_method: params["migration_method"] = migration_method + if migration_max_downtime: + params["migration_max_downtime"] = migration_max_downtime retcode, retdata = pvc.lib.provisioner.template_add( CLI_CONFIG, params, template_type="system" @@ -4551,6 +4583,12 @@ def cli_provisioner_template_system_add( default=None, # Use cluster default help="The preferred migration method of the VM between nodes", ) +@click.option( + "--max-downtime", + "migration_max_downtime", + default=None, + help="The maximum time in milliseconds that a VM can be down for during a live migration; busy VMs may require a larger downtime.", +) def cli_provisioner_template_system_modify( name, vcpus, @@ -4562,11 +4600,12 @@ def cli_provisioner_template_system_modify( node_selector, node_autostart, migration_method, + migration_max_downtime, ): """ Add a new system template NAME to the PVC cluster provisioner. - For details on the possible "--node-selector" values, please see help for the command "pvc vm define". + For details on the possible option values, please see help for the command "pvc vm define". 
""" params = dict() params["vcpus"] = vcpus @@ -4578,6 +4617,7 @@ def cli_provisioner_template_system_modify( params["node_selector"] = node_selector params["node_autostart"] = node_autostart params["migration_method"] = migration_method + params["migration_max_downtime"] = migration_max_downtime retcode, retdata = pvc.lib.provisioner.template_modify( CLI_CONFIG, params, name, template_type="system" diff --git a/client-cli/pvc/lib/provisioner.py b/client-cli/pvc/lib/provisioner.py index 7b25a2ce..41dbb2e9 100644 --- a/client-cli/pvc/lib/provisioner.py +++ b/client-cli/pvc/lib/provisioner.py @@ -779,7 +779,8 @@ def format_list_template_system(template_data): template_node_limit_length = 6 template_node_selector_length = 9 template_node_autostart_length = 10 - template_migration_method_length = 10 + template_migration_method_length = 12 + template_migration_max_downtime_length = 13 for template in template_data: # template_name column @@ -826,6 +827,17 @@ def format_list_template_system(template_data): _template_migration_method_length = len(str(template["migration_method"])) + 1 if _template_migration_method_length > template_migration_method_length: template_migration_method_length = _template_migration_method_length + # template_migration_max_downtime column + _template_migration_max_downtime_length = ( + len(str(template["migration_max_downtime"])) + 1 + ) + if ( + _template_migration_max_downtime_length + > template_migration_max_downtime_length + ): + template_migration_max_downtime_length = ( + _template_migration_max_downtime_length + ) # Format the string (header) template_list_output.append( @@ -842,7 +854,8 @@ def format_list_template_system(template_data): + template_node_selector_length + template_node_autostart_length + template_migration_method_length - + 3, + + template_migration_max_downtime_length + + 4, template_header="System Templates " + "".join( ["-" for _ in range(17, template_name_length + template_id_length)] @@ -874,7 +887,8 @@ def 
format_list_template_system(template_data): + template_node_selector_length + template_node_autostart_length + template_migration_method_length - + 2, + + template_migration_max_downtime_length + + 3, ) ] ), @@ -891,7 +905,8 @@ def format_list_template_system(template_data): {template_node_limit: <{template_node_limit_length}} \ {template_node_selector: <{template_node_selector_length}} \ {template_node_autostart: <{template_node_autostart_length}} \ -{template_migration_method: <{template_migration_method_length}}{end_bold}".format( +{template_migration_method: <{template_migration_method_length}} \ +{template_migration_max_downtime: <{template_migration_max_downtime_length}}{end_bold}".format( bold=ansiprint.bold(), end_bold=ansiprint.end(), template_name_length=template_name_length, @@ -905,6 +920,7 @@ def format_list_template_system(template_data): template_node_selector_length=template_node_selector_length, template_node_autostart_length=template_node_autostart_length, template_migration_method_length=template_migration_method_length, + template_migration_max_downtime_length=template_migration_max_downtime_length, template_name="Name", template_id="ID", template_vcpu="vCPUs", @@ -915,7 +931,8 @@ def format_list_template_system(template_data): template_node_limit="Limit", template_node_selector="Selector", template_node_autostart="Autostart", - template_migration_method="Migration", + template_migration_method="Mig. 
Method", + template_migration_max_downtime="Max Downtime", ) ) @@ -931,7 +948,8 @@ def format_list_template_system(template_data): {template_node_limit: <{template_node_limit_length}} \ {template_node_selector: <{template_node_selector_length}} \ {template_node_autostart: <{template_node_autostart_length}} \ -{template_migration_method: <{template_migration_method_length}}{end_bold}".format( +{template_migration_method: <{template_migration_method_length}} \ +{template_migration_max_downtime: <{template_migration_max_downtime_length}}{end_bold}".format( template_name_length=template_name_length, template_id_length=template_id_length, template_vcpu_length=template_vcpu_length, @@ -943,6 +961,7 @@ def format_list_template_system(template_data): template_node_selector_length=template_node_selector_length, template_node_autostart_length=template_node_autostart_length, template_migration_method_length=template_migration_method_length, + template_migration_max_downtime_length=template_migration_max_downtime_length, bold="", end_bold="", template_name=str(template["name"]), @@ -956,6 +975,7 @@ def format_list_template_system(template_data): template_node_selector=str(template["node_selector"]), template_node_autostart=str(template["node_autostart"]), template_migration_method=str(template["migration_method"]), + template_migration_max_downtime=f"{str(template['migration_max_downtime'])} ms", ) ) diff --git a/client-cli/pvc/lib/vm.py b/client-cli/pvc/lib/vm.py index f1d423fd..d9086e89 100644 --- a/client-cli/pvc/lib/vm.py +++ b/client-cli/pvc/lib/vm.py @@ -205,6 +205,7 @@ def vm_metadata( node_selector, node_autostart, migration_method, + migration_max_downtime, provisioner_profile, ): """ @@ -229,6 +230,9 @@ def vm_metadata( if migration_method is not None: params["migration_method"] = migration_method + if migration_max_downtime is not None: + params["migration_max_downtime"] = migration_max_downtime + if provisioner_profile is not None: params["profile"] = 
provisioner_profile @@ -1637,14 +1641,14 @@ def format_info(config, domain_information, long_output): ) ) ainformation.append( - "{}Current Node:{} {}".format( + "{}Current node:{} {}".format( ansiprint.purple(), ansiprint.end(), domain_information["node"] ) ) if not domain_information["last_node"]: domain_information["last_node"] = "N/A" ainformation.append( - "{}Previous Node:{} {}".format( + "{}Previous node:{} {}".format( ansiprint.purple(), ansiprint.end(), domain_information["last_node"] ) ) @@ -1676,15 +1680,12 @@ def format_info(config, domain_information, long_output): formatted_node_autostart = "True" if not domain_information.get("migration_method"): - formatted_migration_method = "Any" + formatted_migration_method = "Live, Shutdown" else: - formatted_migration_method = str(domain_information["migration_method"]).title() - - ainformation.append( - "{}Migration selector:{} {}".format( - ansiprint.purple(), ansiprint.end(), formatted_node_selector + formatted_migration_method = ( + f"{str(domain_information['migration_method']).title()} only" ) - ) + ainformation.append( "{}Node limit:{} {}".format( ansiprint.purple(), ansiprint.end(), formatted_node_limit @@ -1700,10 +1701,22 @@ def format_info(config, domain_information, long_output): ) ) ainformation.append( - "{}Migration Method:{} {}".format( + "{}Migration method:{} {}".format( ansiprint.purple(), ansiprint.end(), formatted_migration_method ) ) + ainformation.append( + "{}Migration selector:{} {}".format( + ansiprint.purple(), ansiprint.end(), formatted_node_selector + ) + ) + ainformation.append( + "{}Max live downtime:{} {}".format( + ansiprint.purple(), + ansiprint.end(), + f"{domain_information['migration_max_downtime']} ms", + ) + ) # Tag list tags_name_length = 5 diff --git a/daemon-common/common.py b/daemon-common/common.py index 6e582215..64eed545 100644 --- a/daemon-common/common.py +++ b/daemon-common/common.py @@ -441,12 +441,14 @@ def getDomainMetadata(zkhandler, dom_uuid): 
domain_node_selector, domain_node_autostart, domain_migration_method, + domain_migration_max_downtime, ) = zkhandler.read_many( [ ("domain.meta.node_limit", dom_uuid), ("domain.meta.node_selector", dom_uuid), ("domain.meta.autostart", dom_uuid), ("domain.meta.migrate_method", dom_uuid), + ("domain.meta.migrate_max_downtime", dom_uuid), ] ) @@ -464,11 +466,15 @@ def getDomainMetadata(zkhandler, dom_uuid): if not domain_migration_method or domain_migration_method == "none": domain_migration_method = None + if not domain_migration_max_downtime or domain_migration_max_downtime == "none": + domain_migration_max_downtime = 300 + return ( domain_node_limit, domain_node_selector, domain_node_autostart, domain_migration_method, + domain_migration_max_downtime, ) @@ -505,6 +511,7 @@ def getInformationFromXML(zkhandler, uuid): domain_node_selector, domain_node_autostart, domain_migration_method, + domain_migration_max_downtime, ) = getDomainMetadata(zkhandler, uuid) domain_tags = getDomainTags(zkhandler, uuid) @@ -565,6 +572,7 @@ def getInformationFromXML(zkhandler, uuid): "node_selector": domain_node_selector, "node_autostart": bool(strtobool(domain_node_autostart)), "migration_method": domain_migration_method, + "migration_max_downtime": int(domain_migration_max_downtime), "tags": domain_tags, "description": domain_description, "profile": domain_profile, diff --git a/daemon-common/migrations/versions/13.json b/daemon-common/migrations/versions/13.json new file mode 100644 index 00000000..151d997b --- /dev/null +++ b/daemon-common/migrations/versions/13.json @@ -0,0 +1 @@ +{"version": "13", "root": "", "base": {"root": "", "schema": "/schema", "schema.version": "/schema/version", "config": "/config", "config.maintenance": "/config/maintenance", "config.primary_node": "/config/primary_node", "config.primary_node.sync_lock": "/config/primary_node/sync_lock", "config.upstream_ip": "/config/upstream_ip", "config.migration_target_selector": "/config/migration_target_selector", 
"logs": "/logs", "faults": "/faults", "node": "/nodes", "domain": "/domains", "network": "/networks", "storage": "/ceph", "storage.health": "/ceph/health", "storage.util": "/ceph/util", "osd": "/ceph/osds", "pool": "/ceph/pools", "volume": "/ceph/volumes", "snapshot": "/ceph/snapshots"}, "logs": {"node": "", "messages": "/messages"}, "faults": {"id": "", "last_time": "/last_time", "first_time": "/first_time", "ack_time": "/ack_time", "status": "/status", "delta": "/delta", "message": "/message"}, "node": {"name": "", "keepalive": "/keepalive", "mode": "/daemonmode", "data.active_schema": "/activeschema", "data.latest_schema": "/latestschema", "data.static": "/staticdata", "data.pvc_version": "/pvcversion", "running_domains": "/runningdomains", "count.provisioned_domains": "/domainscount", "count.networks": "/networkscount", "state.daemon": "/daemonstate", "state.router": "/routerstate", "state.domain": "/domainstate", "cpu.load": "/cpuload", "vcpu.allocated": "/vcpualloc", "memory.total": "/memtotal", "memory.used": "/memused", "memory.free": "/memfree", "memory.allocated": "/memalloc", "memory.provisioned": "/memprov", "ipmi.hostname": "/ipmihostname", "ipmi.username": "/ipmiusername", "ipmi.password": "/ipmipassword", "sriov": "/sriov", "sriov.pf": "/sriov/pf", "sriov.vf": "/sriov/vf", "monitoring.plugins": "/monitoring_plugins", "monitoring.data": "/monitoring_data", "monitoring.health": "/monitoring_health", "network.stats": "/network_stats"}, "monitoring_plugin": {"name": "", "last_run": "/last_run", "health_delta": "/health_delta", "message": "/message", "data": "/data", "runtime": "/runtime"}, "sriov_pf": {"phy": "", "mtu": "/mtu", "vfcount": "/vfcount"}, "sriov_vf": {"phy": "", "pf": "/pf", "mtu": "/mtu", "mac": "/mac", "phy_mac": "/phy_mac", "config": "/config", "config.vlan_id": "/config/vlan_id", "config.vlan_qos": "/config/vlan_qos", "config.tx_rate_min": "/config/tx_rate_min", "config.tx_rate_max": "/config/tx_rate_max", "config.spoof_check": 
"/config/spoof_check", "config.link_state": "/config/link_state", "config.trust": "/config/trust", "config.query_rss": "/config/query_rss", "pci": "/pci", "pci.domain": "/pci/domain", "pci.bus": "/pci/bus", "pci.slot": "/pci/slot", "pci.function": "/pci/function", "used": "/used", "used_by": "/used_by"}, "domain": {"name": "", "xml": "/xml", "state": "/state", "profile": "/profile", "stats": "/stats", "node": "/node", "last_node": "/lastnode", "failed_reason": "/failedreason", "storage.volumes": "/rbdlist", "console.log": "/consolelog", "console.vnc": "/vnc", "meta.autostart": "/node_autostart", "meta.migrate_method": "/migration_method", "meta.migrate_max_downtime": "/migration_max_downtime", "meta.node_selector": "/node_selector", "meta.node_limit": "/node_limit", "meta.tags": "/tags", "migrate.sync_lock": "/migrate_sync_lock"}, "tag": {"name": "", "type": "/type", "protected": "/protected"}, "network": {"vni": "", "type": "/nettype", "mtu": "/mtu", "rule": "/firewall_rules", "rule.in": "/firewall_rules/in", "rule.out": "/firewall_rules/out", "nameservers": "/name_servers", "domain": "/domain", "reservation": "/dhcp4_reservations", "lease": "/dhcp4_leases", "ip4.gateway": "/ip4_gateway", "ip4.network": "/ip4_network", "ip4.dhcp": "/dhcp4_flag", "ip4.dhcp_start": "/dhcp4_start", "ip4.dhcp_end": "/dhcp4_end", "ip6.gateway": "/ip6_gateway", "ip6.network": "/ip6_network", "ip6.dhcp": "/dhcp6_flag"}, "reservation": {"mac": "", "ip": "/ipaddr", "hostname": "/hostname"}, "lease": {"mac": "", "ip": "/ipaddr", "hostname": "/hostname", "expiry": "/expiry", "client_id": "/clientid"}, "rule": {"description": "", "rule": "/rule", "order": "/order"}, "osd": {"id": "", "node": "/node", "device": "/device", "db_device": "/db_device", "fsid": "/fsid", "ofsid": "/fsid/osd", "cfsid": "/fsid/cluster", "lvm": "/lvm", "vg": "/lvm/vg", "lv": "/lvm/lv", "is_split": "/is_split", "stats": "/stats"}, "pool": {"name": "", "pgs": "/pgs", "tier": "/tier", "stats": "/stats"}, "volume": 
{"name": "", "stats": "/stats"}, "snapshot": {"name": "", "stats": "/stats"}} \ No newline at end of file diff --git a/daemon-common/vm.py b/daemon-common/vm.py index 143f7384..f84dd1e4 100644 --- a/daemon-common/vm.py +++ b/daemon-common/vm.py @@ -147,6 +147,7 @@ def define_vm( node_selector, node_autostart, migration_method=None, + migration_max_downtime=300, profile=None, tags=[], initial_state="stop", @@ -272,6 +273,10 @@ def define_vm( (("domain.console.vnc", dom_uuid), ""), (("domain.meta.autostart", dom_uuid), node_autostart), (("domain.meta.migrate_method", dom_uuid), str(migration_method).lower()), + ( + ("domain.meta.migrate_max_downtime", dom_uuid), + int(migration_max_downtime), + ), (("domain.meta.node_limit", dom_uuid), formatted_node_limit), (("domain.meta.node_selector", dom_uuid), str(node_selector).lower()), (("domain.meta.tags", dom_uuid), ""), @@ -305,6 +310,7 @@ def modify_vm_metadata( node_autostart, provisioner_profile, migration_method, + migration_max_downtime, ): dom_uuid = getDomainUUID(zkhandler, domain) if not dom_uuid: @@ -331,6 +337,14 @@ def modify_vm_metadata( (("domain.meta.migrate_method", dom_uuid), str(migration_method).lower()) ) + if migration_max_downtime is not None: + update_list.append( + ( + ("domain.meta.migrate_max_downtime", dom_uuid), + int(migration_max_downtime), + ) + ) + if len(update_list) < 1: return False, "ERROR: No updates to apply." 
@@ -563,6 +577,7 @@ def rename_vm(zkhandler, domain, new_domain): dom_info["node_selector"], dom_info["node_autostart"], migration_method=dom_info["migration_method"], + migration_max_downtime=dom_info["migration_max_downtime"], profile=dom_info["profile"], tags=dom_info["tags"], initial_state="stop", @@ -1624,6 +1639,7 @@ def restore_vm(zkhandler, domain, backup_path, datestring, retain_snapshot=False backup_source_details["vm_detail"]["node_selector"], backup_source_details["vm_detail"]["node_autostart"], backup_source_details["vm_detail"]["migration_method"], + backup_source_details["vm_detail"].get("migration_max_downtime", 300), backup_source_details["vm_detail"]["profile"], backup_source_details["vm_detail"]["tags"], "restore", diff --git a/daemon-common/vmbuilder.py b/daemon-common/vmbuilder.py index 87e48122..db53a76a 100644 --- a/daemon-common/vmbuilder.py +++ b/daemon-common/vmbuilder.py @@ -744,6 +744,7 @@ def worker_create_vm( node_selector = vm_data["system_details"]["node_selector"] node_autostart = vm_data["system_details"]["node_autostart"] migration_method = vm_data["system_details"]["migration_method"] + migration_max_downtime = vm_data["system_details"]["migration_max_downtime"] with open_zk(config) as zkhandler: retcode, retmsg = pvc_vm.define_vm( zkhandler, @@ -753,6 +754,7 @@ def worker_create_vm( node_selector, node_autostart, migration_method, + migration_max_downtime, vm_profile, initial_state="provision", ) diff --git a/daemon-common/zkhandler.py b/daemon-common/zkhandler.py index c280bbde..29d14c57 100644 --- a/daemon-common/zkhandler.py +++ b/daemon-common/zkhandler.py @@ -572,7 +572,7 @@ class ZKHandler(object): # class ZKSchema(object): # Current version - _version = 12 + _version = 13 # Root for doing nested keys _schema_root = "" @@ -707,6 +707,7 @@ class ZKSchema(object): "console.vnc": "/vnc", "meta.autostart": "/node_autostart", "meta.migrate_method": "/migration_method", + "meta.migrate_max_downtime": "/migration_max_downtime", 
"meta.node_selector": "/node_selector", "meta.node_limit": "/node_limit", "meta.tags": "/tags", @@ -1026,6 +1027,8 @@ class ZKSchema(object): default_data = "False" elif elem == "pool" and ikey == "tier": default_data = "default" + elif elem == "domain" and ikey == "meta.migrate_max_downtime": + default_data = "300" else: default_data = "" zkhandler.zk_conn.create( diff --git a/gen-api-migrations b/gen-api-migrations index c5d707df..d4e123b6 100755 --- a/gen-api-migrations +++ b/gen-api-migrations @@ -2,12 +2,19 @@ # Generate the database migration files +set -o xtrace + VERSION="$( head -1 debian/changelog | awk -F'[()-]' '{ print $2 }' )" +sudo ip addr add 10.0.1.250/32 dev lo + pushd $( git rev-parse --show-toplevel ) &>/dev/null pushd api-daemon &>/dev/null export PVC_CONFIG_FILE="../pvc.sample.conf" -./pvcapid-manage_flask.py db migrate -m "PVC version ${VERSION}" -./pvcapid-manage_flask.py db upgrade +export FLASK_APP=./pvcapid-manage_flask.py +flask db migrate -m "PVC version ${VERSION}" +flask db upgrade popd &>/dev/null popd &>/dev/null + +sudo ip addr del 10.0.1.250/32 dev lo diff --git a/node-daemon/pvcnoded/objects/VMInstance.py b/node-daemon/pvcnoded/objects/VMInstance.py index acbb5120..5aec54a5 100644 --- a/node-daemon/pvcnoded/objects/VMInstance.py +++ b/node-daemon/pvcnoded/objects/VMInstance.py @@ -687,6 +687,29 @@ class VMInstance(object): abort_migrate("Target node changed during preparation") return if not force_shutdown: + # Set the maxdowntime value from Zookeeper + try: + max_downtime = self.zkhandler.read( + ("domain.meta.migrate_max_downtime", self.domuuid) + ) + except Exception as e: + self.logger.out( + f"Error fetching migrate max downtime; using default of 300ms: {e}", + state="w", + ) + max_downtime = 300 + self.logger.out( + f"Running migrate-setmaxdowntime with downtime value {max_downtime}", + state="i", + prefix="Domain {}".format(self.domuuid), + ) + retcode, stdout, stderr = common.run_os_command( + f"virsh 
migrate-setmaxdowntime --downtime {max_downtime} {self.domuuid}" ) + if retcode: + abort_migrate("Failed to set maxdowntime value on running VM") + return + # A live migrate is attemped 3 times in succession ticks = 0 while True: diff --git a/pvc.sample.conf b/pvc.sample.conf index 7ac9b84e..ab2d5411 100644 --- a/pvc.sample.conf +++ b/pvc.sample.conf @@ -168,7 +168,7 @@ database: port: 6379 # Hostname; use `cluster` network floating IP address - hostname: 10.0.1.250 + hostname: 127.0.0.1 # Path, usually "/0" path: "/0" @@ -180,7 +180,7 @@ database: port: 5432 # Hostname; use `cluster` network floating IP address - hostname: 10.0.1.250 + hostname: 127.0.0.1 # Credentials credentials: