diff --git a/api-daemon/pvcapid/flaskapi.py b/api-daemon/pvcapid/flaskapi.py index c40eccc5..b3bb9ac8 100755 --- a/api-daemon/pvcapid/flaskapi.py +++ b/api-daemon/pvcapid/flaskapi.py @@ -4284,12 +4284,18 @@ class API_Storage_Ceph_OSD_Root(Resource): { "name": "ext_db", "required": False, - "helptext": "Whether to use an external OSD DB LV device.", }, { "name": "ext_db_ratio", "required": False, - "helptext": "Decimal size ratio of the external OSD DB LV device.", + }, + { + "name": "split", + "required": False, + }, + { + "name": "count", + "required": False, }, ] ) @@ -4327,6 +4333,16 @@ class API_Storage_Ceph_OSD_Root(Resource): type: float required: false description: Decimal ratio of total OSD size for the external OSD DB LV device, default 0.05 (5%) + - in: query + name: split + type: boolean + required: false + description: Whether to split the block device into multiple OSDs (recommended for NVMe devices) + - in: query + name: count + type: integer + required: false + description: If {split}, how many OSDs to create on the block device; usually 2 or 4 depending on size responses: 200: description: OK @@ -4345,6 +4361,8 @@ class API_Storage_Ceph_OSD_Root(Resource): reqargs.get("weight", None), reqargs.get("ext_db", False), float(reqargs.get("ext_db_ratio", 0.05)), + reqargs.get("split", False), + reqargs.get("count", 1), ) diff --git a/api-daemon/pvcapid/helper.py b/api-daemon/pvcapid/helper.py index 677faf89..55ec1274 100755 --- a/api-daemon/pvcapid/helper.py +++ b/api-daemon/pvcapid/helper.py @@ -1366,12 +1366,28 @@ def ceph_osd_db_vg_add(zkhandler, node, device): @ZKConnection(config) -def ceph_osd_add(zkhandler, node, device, weight, ext_db_flag=False, ext_db_ratio=0.05): +def ceph_osd_add( + zkhandler, + node, + device, + weight, + ext_db_flag=False, + ext_db_ratio=0.05, + split_flag=False, + split_count=1, +): """ Add a Ceph OSD to the PVC Ceph storage cluster. """ retflag, retdata = pvc_ceph.add_osd( - zkhandler, node, device, weight, ext_db_flag, ext_db_ratio + zkhandler, + node, + device, + weight, + ext_db_flag, + ext_db_ratio, + split_flag, + split_count, ) if retflag: diff --git a/client-cli/pvc/cli/cli.py b/client-cli/pvc/cli/cli.py index 4e28840b..f996296a 100644 --- a/client-cli/pvc/cli/cli.py +++ b/client-cli/pvc/cli/cli.py @@ -3411,8 +3411,17 @@ def cli_storage_osd_create_db_vg(node, device): type=float, help="Decimal ratio of the external database logical volume to the OSD size.", ) -@confirm_opt("Destroy all data on and create new OSD on node {node} device {device}") -def cli_storage_osd_add(node, device, weight, ext_db_flag, ext_db_ratio): +@click.option( + "-s", + "--split", + "split_count", + default=None, + show_default=False, + type=int, + help="Split an NVMe disk into this many OSDs", +) +@confirm_opt("Destroy all data on and create new OSD(s) on node {node} device {device}") +def cli_storage_osd_add(node, device, weight, ext_db_flag, ext_db_ratio, split_count): """ Add a new Ceph OSD on node NODE with block device DEVICE. DEVICE must be a valid block device path (e.g. '/dev/sda', '/dev/nvme0n1', '/dev/disk/by-path/...', '/dev/disk/by-id/...') or a "detect" string. Using partitions is not supported. @@ -3423,10 +3432,22 @@ def cli_storage_osd_add(node, device, weight, ext_db_flag, ext_db_ratio): If '--ext-db' is specified, the OSD database and WAL will be placed on a new logical volume in NODE's OSD database volume group. An OSD database volume group must exist on the node or the OSD creation will fail. 
See the 'pvc storage osd create-db-vg' command for more details.

     The default '--ext-db-ratio' of 0.05 (5%) is sufficient for most RBD workloads and OSD sizes, though this can be adjusted based on the sizes of the OSD(s) and the underlying database device. Ceph documentation recommends at least 0.02 (2%) for RBD use-cases, and higher values may improve WAL performance under write-heavy workloads with fewer OSDs per node.
+
+    For NVMe devices, it is recommended to split the block device into multiple OSDs to improve processing throughput. To do this, specify "-s"/"--split" and the number of OSDs to create on the block device. For most NVMe devices, the recommended value is 2 or 4, such that each OSD is at least 500GB. Numbers higher than 4 are not recommended. This is NOT RECOMMENDED for SATA SSDs.
     """
+    split_flag = split_count is not None
+    split_count = split_count if split_flag else 1
+
     retcode, retmsg = pvc.lib.storage.ceph_osd_add(
-        CLI_CONFIG, node, device, weight, ext_db_flag, ext_db_ratio
+        CLI_CONFIG,
+        node,
+        device,
+        weight,
+        ext_db_flag,
+        ext_db_ratio,
+        split_flag,
+        split_count,
     )

     finish(retcode, retmsg)
diff --git a/client-cli/pvc/lib/storage.py b/client-cli/pvc/lib/storage.py
index 4b83747c..81415239 100644
--- a/client-cli/pvc/lib/storage.py
+++ b/client-cli/pvc/lib/storage.py
@@ -231,12 +231,14 @@ def ceph_osd_list(config, limit):
     return False, response.json().get("message", "")


-def ceph_osd_add(config, node, device, weight, ext_db_flag, ext_db_ratio):
+def ceph_osd_add(
+    config, node, device, weight, ext_db_flag, ext_db_ratio, split_flag, split_count
+):
     """
     Add new Ceph OSD

     API endpoint: POST /api/v1/storage/ceph/osd
-    API arguments: node={node}, device={device}, weight={weight}, ext_db={ext_db_flag}, ext_db_ratio={ext_db_ratio}
+    API arguments: node={node}, device={device}, weight={weight}, ext_db={ext_db_flag}, ext_db_ratio={ext_db_ratio}, split={split_flag}, count={split_count}
     API schema: {"message":"{data}"}
     """
     params = {
@@ -245,6 +247,8 @@ def ceph_osd_add(config, node, device, weight, ext_db_flag, ext_db_ratio):
         "weight": weight,
         "ext_db": ext_db_flag,
         "ext_db_ratio": ext_db_ratio,
+        "split": split_flag,
+        "count": split_count,
     }

     response = call_api(config, "post", "/storage/ceph/osd", params=params)
diff --git a/daemon-common/ceph.py b/daemon-common/ceph.py
index 2be6e800..efb53d72 100644
--- a/daemon-common/ceph.py
+++ b/daemon-common/ceph.py
@@ -211,6 +211,7 @@ def getOSDInformation(zkhandler, osd_id):
     # Get the devices
     osd_node = zkhandler.read(("osd.node", osd_id))
     osd_device = zkhandler.read(("osd.device", osd_id))
+    osd_is_split = zkhandler.read(("osd.is_split", osd_id))
     osd_db_device = zkhandler.read(("osd.db_device", osd_id))
     # Parse the stats data
     osd_stats_raw = zkhandler.read(("osd.stats", osd_id))
@@ -220,6 +221,7 @@ def getOSDInformation(zkhandler, osd_id):
         "id": osd_id,
         "node": osd_node,
         "device": osd_device,
+        "is_split": osd_is_split,
         "db_device": osd_db_device,
         "stats": osd_stats,
     }
@@ -266,7 +268,16 @@ def add_osd_db_vg(zkhandler, node, device):

 # OSD actions use the /cmd/ceph pipe
 # These actions must occur on the specific node they reference
-def add_osd(zkhandler, node, device, weight, ext_db_flag=False, ext_db_ratio=0.05):
+def add_osd(
+    zkhandler,
+    node,
+    device,
+    weight,
+    ext_db_flag=False,
+    ext_db_ratio=0.05,
+    split_flag=False,
+    split_count=1,
+):
     # Verify the target node exists
     if not common.verifyNode(zkhandler, node):
         return False, 'ERROR: No node named "{}" is present in the cluster.'.format(
             node
         )
@@ -284,8 +295,8 @@ def add_osd(zkhandler, node, device,
weight, ext_db_flag=False, ext_db_ratio=0.0 ) # Tell the cluster to create a new OSD for the host - add_osd_string = "osd_add {},{},{},{},{}".format( - node, device, weight, ext_db_flag, ext_db_ratio + add_osd_string = "osd_add {},{},{},{},{},{},{}".format( + node, device, weight, ext_db_flag, ext_db_ratio, split_flag, split_count ) zkhandler.write([("base.cmd.ceph", add_osd_string)]) # Wait 1/2 second for the cluster to get the message and start working @@ -295,14 +306,10 @@ def add_osd(zkhandler, node, device, weight, ext_db_flag=False, ext_db_ratio=0.0 try: result = zkhandler.read("base.cmd.ceph").split()[0] if result == "success-osd_add": - message = 'Created new OSD with block device "{}" on node "{}".'.format( - device, node - ) + message = f'Created {split_count} new OSD(s) on node "{node}" block device "{device}"' success = True else: - message = ( - "ERROR: Failed to create new OSD; check node logs for details." - ) + message = "ERROR: Failed to create OSD(s); check node logs for details." success = False except Exception: message = "ERROR: Command ignored by node." diff --git a/daemon-common/migrations/versions/10.json b/daemon-common/migrations/versions/10.json new file mode 100644 index 00000000..9f825c62 --- /dev/null +++ b/daemon-common/migrations/versions/10.json @@ -0,0 +1 @@ +{"version": "10", "root": "", "base": {"root": "", "schema": "/schema", "schema.version": "/schema/version", "config": "/config", "config.maintenance": "/config/maintenance", "config.primary_node": "/config/primary_node", "config.primary_node.sync_lock": "/config/primary_node/sync_lock", "config.upstream_ip": "/config/upstream_ip", "config.migration_target_selector": "/config/migration_target_selector", "cmd": "/cmd", "cmd.node": "/cmd/nodes", "cmd.domain": "/cmd/domains", "cmd.ceph": "/cmd/ceph", "logs": "/logs", "node": "/nodes", "domain": "/domains", "network": "/networks", "storage": "/ceph", "storage.health": "/ceph/health", "storage.util": "/ceph/util", "osd": "/ceph/osds", "pool": "/ceph/pools", "volume": "/ceph/volumes", "snapshot": "/ceph/snapshots"}, "logs": {"node": "", "messages": "/messages"}, "node": {"name": "", "keepalive": "/keepalive", "mode": "/daemonmode", "data.active_schema": "/activeschema", "data.latest_schema": "/latestschema", "data.static": "/staticdata", "data.pvc_version": "/pvcversion", "running_domains": "/runningdomains", "count.provisioned_domains": "/domainscount", "count.networks": "/networkscount", "state.daemon": "/daemonstate", "state.router": "/routerstate", "state.domain": "/domainstate", "cpu.load": "/cpuload", "vcpu.allocated": "/vcpualloc", "memory.total": "/memtotal", "memory.used": "/memused", "memory.free": "/memfree", "memory.allocated": "/memalloc", "memory.provisioned": "/memprov", "ipmi.hostname": "/ipmihostname", "ipmi.username": "/ipmiusername", "ipmi.password": "/ipmipassword", "sriov": "/sriov", "sriov.pf": "/sriov/pf", "sriov.vf": "/sriov/vf", "monitoring.plugins": "/monitoring_plugins", "monitoring.data": "/monitoring_data", "monitoring.health": "/monitoring_health"}, "monitoring_plugin": {"name": "", "last_run": "/last_run", "health_delta": "/health_delta", "message": "/message", "data": "/data", "runtime": "/runtime"}, "sriov_pf": {"phy": "", "mtu": "/mtu", "vfcount": "/vfcount"}, "sriov_vf": {"phy": "", "pf": "/pf", "mtu": "/mtu", "mac": "/mac", "phy_mac": "/phy_mac", "config": "/config", "config.vlan_id": "/config/vlan_id", "config.vlan_qos": "/config/vlan_qos", "config.tx_rate_min": "/config/tx_rate_min", "config.tx_rate_max": 
"/config/tx_rate_max", "config.spoof_check": "/config/spoof_check", "config.link_state": "/config/link_state", "config.trust": "/config/trust", "config.query_rss": "/config/query_rss", "pci": "/pci", "pci.domain": "/pci/domain", "pci.bus": "/pci/bus", "pci.slot": "/pci/slot", "pci.function": "/pci/function", "used": "/used", "used_by": "/used_by"}, "domain": {"name": "", "xml": "/xml", "state": "/state", "profile": "/profile", "stats": "/stats", "node": "/node", "last_node": "/lastnode", "failed_reason": "/failedreason", "storage.volumes": "/rbdlist", "console.log": "/consolelog", "console.vnc": "/vnc", "meta.autostart": "/node_autostart", "meta.migrate_method": "/migration_method", "meta.node_selector": "/node_selector", "meta.node_limit": "/node_limit", "meta.tags": "/tags", "migrate.sync_lock": "/migrate_sync_lock"}, "tag": {"name": "", "type": "/type", "protected": "/protected"}, "network": {"vni": "", "type": "/nettype", "mtu": "/mtu", "rule": "/firewall_rules", "rule.in": "/firewall_rules/in", "rule.out": "/firewall_rules/out", "nameservers": "/name_servers", "domain": "/domain", "reservation": "/dhcp4_reservations", "lease": "/dhcp4_leases", "ip4.gateway": "/ip4_gateway", "ip4.network": "/ip4_network", "ip4.dhcp": "/dhcp4_flag", "ip4.dhcp_start": "/dhcp4_start", "ip4.dhcp_end": "/dhcp4_end", "ip6.gateway": "/ip6_gateway", "ip6.network": "/ip6_network", "ip6.dhcp": "/dhcp6_flag"}, "reservation": {"mac": "", "ip": "/ipaddr", "hostname": "/hostname"}, "lease": {"mac": "", "ip": "/ipaddr", "hostname": "/hostname", "expiry": "/expiry", "client_id": "/clientid"}, "rule": {"description": "", "rule": "/rule", "order": "/order"}, "osd": {"id": "", "node": "/node", "device": "/device", "db_device": "/db_device", "fsid": "/fsid", "ofsid": "/fsid/osd", "cfsid": "/fsid/cluster", "lvm": "/lvm", "vg": "/lvm/vg", "lv": "/lvm/lv", "is_split": "/is_split", "stats": "/stats"}, "pool": {"name": "", "pgs": "/pgs", "tier": "/tier", "stats": "/stats"}, "volume": {"name": "", "stats": "/stats"}, "snapshot": {"name": "", "stats": "/stats"}} \ No newline at end of file diff --git a/daemon-common/zkhandler.py b/daemon-common/zkhandler.py index d5494ed4..3d5e288f 100644 --- a/daemon-common/zkhandler.py +++ b/daemon-common/zkhandler.py @@ -540,7 +540,7 @@ class ZKHandler(object): # class ZKSchema(object): # Current version - _version = 9 + _version = 10 # Root for doing nested keys _schema_root = "" @@ -719,6 +719,7 @@ class ZKSchema(object): "lvm": "/lvm", "vg": "/lvm/vg", "lv": "/lvm/lv", + "is_split": "/is_split", "stats": "/stats", }, # The schema of an individual pool entry (/ceph/pools/{pool_name}) @@ -963,7 +964,9 @@ class ZKSchema(object): kpath = f"{elem}.{ikey}" # Validate that the key exists for that child if not zkhandler.zk_conn.exists(self.path(kpath, child)): - if elem == "pool" and ikey == "tier": + if elem == "osd" and ikey == "is_split": + default_data = "False" + elif elem == "pool" and ikey == "tier": default_data = "default" else: default_data = "" diff --git a/node-daemon/pvcnoded/objects/CephInstance.py b/node-daemon/pvcnoded/objects/CephInstance.py index 1d063b07..c4f8bf9e 100644 --- a/node-daemon/pvcnoded/objects/CephInstance.py +++ b/node-daemon/pvcnoded/objects/CephInstance.py @@ -26,6 +26,7 @@ import daemon_lib.common as common from distutils.util import strtobool from re import search, match, sub +from json import loads as jloads def get_detect_device(detect_string): @@ -260,7 +261,15 @@ class CephOSDInstance(object): @staticmethod def add_osd( - zkhandler, logger, node, device, 
weight, ext_db_flag=False, ext_db_ratio=0.05 + zkhandler, + logger, + node, + device, + weight, + ext_db_flag=False, + ext_db_ratio=0.05, + split_device=False, + split_count=1, ): # Handle a detect device if that is passed if match(r"detect:", device): @@ -278,177 +287,185 @@ class CephOSDInstance(object): ) device = ddevice - # We are ready to create a new OSD on this node - logger.out("Creating new OSD disk on block device {}".format(device), state="i") + if split_device and split_count > 1: + split_flag = f"--osds-per-device {split_count}" + logger.out( + f"Creating {split_count} new OSD disks on block device {device}", + state="i", + ) + else: + split_flag = "" + logger.out(f"Creating 1 new OSD disk on block device {device}", state="i") + + if "nvme" in device: + class_flag = "--crush-device-class nvme" + else: + class_flag = "--crush-device-class ssd" + try: - # 1. Create an OSD; we do this so we know what ID will be gen'd - retcode, stdout, stderr = common.run_os_command("ceph osd create") - if retcode: - print("ceph osd create") - print(stdout) - print(stderr) - raise Exception - osd_id = stdout.rstrip() - - # 2. Remove that newly-created OSD + # 1. Zap the block device + logger.out(f"Zapping disk {device}", state="i") retcode, stdout, stderr = common.run_os_command( - "ceph osd rm {}".format(osd_id) + f"ceph-volume lvm zap --destroy {device}" ) if retcode: - print("ceph osd rm") - print(stdout) - print(stderr) + logger.out("Failed: ceph-volume lvm zap", state="e") + logger.out(stdout, state="d") + logger.out(stderr, state="d") raise Exception - # 3a. Zap the disk to ensure it is ready to go - logger.out("Zapping disk {}".format(device), state="i") + # 2. Prepare the OSD(s) + logger.out(f"Preparing OSD(s) on disk {device}", state="i") retcode, stdout, stderr = common.run_os_command( - "ceph-volume lvm zap --destroy {}".format(device) + f"ceph-volume lvm batch --yes --prepare --bluestore {split_flag} {class_flag} {device}" ) if retcode: - print("ceph-volume lvm zap") - print(stdout) - print(stderr) + logger.out("Failed: ceph-volume lvm batch", state="e") + logger.out(stdout, state="d") + logger.out(stderr, state="d") raise Exception - dev_flags = "--data {}".format(device) + # 3. Get the list of created OSDs on the device (initial pass) + logger.out(f"Querying OSD(s) on disk {device}", state="i") + retcode, stdout, stderr = common.run_os_command( + f"ceph-volume lvm list --format json {device}" + ) + if retcode: + logger.out("Failed: ceph-volume lvm list", state="e") + logger.out(stdout, state="d") + logger.out(stderr, state="d") + raise Exception - # 3b. Prepare the logical volume if ext_db_flag + created_osds = jloads(stdout) + + # 4. Prepare the WAL and DB devices if ext_db_flag: - _, osd_size_bytes, _ = common.run_os_command( - "blockdev --getsize64 {}".format(device) + for created_osd in created_osds: + # 4a. Get the OSD FSID and ID from the details + osd_details = created_osds[created_osd][0] + osd_fsid = osd_details["tags"]["ceph.osd_fsid"] + osd_id = osd_details["tags"]["ceph.osd_id"] + osd_lv = osd_details["lv_path"] + + logger.out( + f"Creating Bluestore DB volume for OSD {osd_id}", state="i" + ) + + # 4b. Prepare the logical volume if ext_db_flag + _, osd_size_bytes, _ = common.run_os_command( + f"blockdev --getsize64 {osd_lv}" + ) + osd_size_bytes = int(osd_size_bytes) + result = CephOSDInstance.create_osd_db_lv( + zkhandler, logger, osd_id, ext_db_ratio, osd_size_bytes + ) + if not result: + raise Exception + db_device = "osd-db/osd-{}".format(osd_id) + + # 4c. 
Attach the new DB device to the OSD
+                    retcode, stdout, stderr = common.run_os_command(
+                        f"ceph-volume lvm new-db --osd-id {osd_id} --osd-fsid {osd_fsid} --target {db_device}"
+                    )
+                    if retcode:
+                        logger.out("Failed: ceph-volume lvm new-db", state="e")
+                        logger.out(stdout, state="d")
+                        logger.out(stderr, state="d")
+                        raise Exception
+
+                # 4d. Get the list of created OSDs on the device (final pass)
+                logger.out(f"Requerying OSD(s) on disk {device}", state="i")
+                retcode, stdout, stderr = common.run_os_command(
+                    f"ceph-volume lvm list --format json {device}"
                 )
-                osd_size_bytes = int(osd_size_bytes)
-                result = CephOSDInstance.create_osd_db_lv(
-                    zkhandler, logger, osd_id, ext_db_ratio, osd_size_bytes
-                )
-                if not result:
+                if retcode:
+                    logger.out("Failed: ceph-volume lvm list", state="e")
+                    logger.out(stdout, state="d")
+                    logger.out(stderr, state="d")
                     raise Exception
-                db_device = "osd-db/osd-{}".format(osd_id)
-                dev_flags += " --block.db {}".format(db_device)
-            else:
-                db_device = ""
-            # 3c. Create the OSD for real
-            logger.out(
-                "Preparing LVM for new OSD disk with ID {} on {}".format(
-                    osd_id, device
-                ),
-                state="i",
-            )
-            retcode, stdout, stderr = common.run_os_command(
-                "ceph-volume lvm prepare --bluestore {devices}".format(
-                    devices=dev_flags
+                created_osds = jloads(stdout)
+
+            # 5. Activate the OSDs
+            logger.out(f"Activating OSD(s) on disk {device}", state="i")
+            for created_osd in created_osds:
+                # 5a. Get the OSD FSID and ID from the details
+                osd_details = created_osds[created_osd][0]
+                osd_clusterfsid = osd_details["tags"]["ceph.cluster_fsid"]
+                osd_fsid = osd_details["tags"]["ceph.osd_fsid"]
+                osd_id = osd_details["tags"]["ceph.osd_id"]
+                db_device = osd_details["tags"].get("ceph.db_device", None)
+                osd_vg = osd_details["vg_name"]
+                osd_lv = osd_details["lv_name"]
+
+                # 5b. Activate the OSD
+                logger.out(f"Activating OSD {osd_id}", state="i")
+                retcode, stdout, stderr = common.run_os_command(
+                    f"ceph-volume lvm activate --bluestore {osd_id} {osd_fsid}"
                 )
-            )
-            if retcode:
-                print("ceph-volume lvm prepare")
-                print(stdout)
-                print(stderr)
-                raise Exception
+                if retcode:
+                    logger.out("Failed: ceph-volume lvm activate", state="e")
+                    logger.out(stdout, state="d")
+                    logger.out(stderr, state="d")
+                    raise Exception
-            # 4a. Get OSD information
-            logger.out(
-                "Getting OSD information for ID {} on {}".format(osd_id, device),
-                state="i",
-            )
-            retcode, stdout, stderr = common.run_os_command(
-                "ceph-volume lvm list {device}".format(device=device)
-            )
-            for line in stdout.split("\n"):
-                if "block device" in line:
-                    osd_blockdev = line.split()[-1]
-                if "osd fsid" in line:
-                    osd_fsid = line.split()[-1]
-                if "cluster fsid" in line:
-                    osd_clusterfsid = line.split()[-1]
-                if "devices" in line:
-                    osd_device = line.split()[-1]
-
-            if not osd_fsid:
-                print("ceph-volume lvm list")
-                print("Could not find OSD information in data:")
-                print(stdout)
-                print(stderr)
-                raise Exception
-
-            # Split OSD blockdev into VG and LV components
-            # osd_blockdev = /dev/ceph-/osd-block-
-            _, _, osd_vg, osd_lv = osd_blockdev.split("/")
-
-            # Reset whatever we were given to Ceph's /dev/xdX naming
-            if device != osd_device:
-                device = osd_device
-
-            # 4b. Activate the OSD
-            logger.out("Activating new OSD disk with ID {}".format(osd_id), state="i")
-            retcode, stdout, stderr = common.run_os_command(
-                "ceph-volume lvm activate --bluestore {osdid} {osdfsid}".format(
-                    osdid=osd_id, osdfsid=osd_fsid
                 )
-            )
-            if retcode:
-                print("ceph-volume lvm activate")
-                print(stdout)
-                print(stderr)
-                raise Exception
+                # 5c. 
Add it to the crush map + logger.out(f"Adding OSD {osd_id} to CRUSH map", state="i") + retcode, stdout, stderr = common.run_os_command( + f"ceph osd crush add osd.{osd_id} {weight} root=default host={node}" ) - ) - if retcode: - print("ceph-volume lvm activate") - print(stdout) - print(stderr) - raise Exception + if retcode: + logger.out("Failed: ceph osd crush add", state="e") + logger.out(stdout, state="d") + logger.out(stderr, state="d") + raise Exception - # 5. Add it to the crush map - logger.out( - "Adding new OSD disk with ID {} to CRUSH map".format(osd_id), state="i" - ) - retcode, stdout, stderr = common.run_os_command( - "ceph osd crush add osd.{osdid} {weight} root=default host={node}".format( - osdid=osd_id, weight=weight, node=node + # 5d. Wait half a second for it to activate + time.sleep(0.5) + + # 5e. Verify it started + retcode, stdout, stderr = common.run_os_command( + "systemctl status ceph-osd@{osdid}".format(osdid=osd_id) ) - ) - if retcode: - print("ceph osd crush add") - print(stdout) - print(stderr) - raise Exception + if retcode: + logger.out(f"Failed: OSD {osd_id} unit is not active", state="e") + logger.out(stdout, state="d") + logger.out(stderr, state="d") + raise Exception - time.sleep(0.5) + # 5f. Add the new OSD to PVC + logger.out(f"Adding OSD {osd_id} to PVC", state="i") + zkhandler.write( + [ + (("osd", osd_id), ""), + (("osd.node", osd_id), node), + (("osd.device", osd_id), device), + (("osd.db_device", osd_id), db_device), + (("osd.fsid", osd_id), osd_fsid), + (("osd.ofsid", osd_id), osd_fsid), + (("osd.cfsid", osd_id), osd_clusterfsid), + (("osd.lvm", osd_id), ""), + (("osd.vg", osd_id), osd_vg), + (("osd.lv", osd_id), osd_lv), + (("osd.is_split", osd_id), split_flag), + ( + ("osd.stats", osd_id), + '{"uuid": "|", "up": 0, "in": 0, "primary_affinity": "|", "utilization": "|", "var": "|", "pgs": "|", "kb": "|", "weight": "|", "reweight": "|", "node": "|", "used": "|", "avail": "|", "wr_ops": "|", "wr_data": "|", "rd_ops": "|", "rd_data": "|", "state": "|"}', + ), + ] + ) - # 6. Verify it started - retcode, stdout, stderr = common.run_os_command( - "systemctl status ceph-osd@{osdid}".format(osdid=osd_id) - ) - if retcode: - print("systemctl status") - print(stdout) - print(stderr) - raise Exception - - # 7. Add the new OSD to the list + # 6. 
Log it logger.out( - "Adding new OSD disk with ID {} to Zookeeper".format(osd_id), state="i" + f"Successfully created {split_count} new OSD(s) {','.join(created_osds.keys())} on disk {device}", + state="o", ) - zkhandler.write( - [ - (("osd", osd_id), ""), - (("osd.node", osd_id), node), - (("osd.device", osd_id), device), - (("osd.db_device", osd_id), db_device), - (("osd.fsid", osd_id), ""), - (("osd.ofsid", osd_id), osd_fsid), - (("osd.cfsid", osd_id), osd_clusterfsid), - (("osd.lvm", osd_id), ""), - (("osd.vg", osd_id), osd_vg), - (("osd.lv", osd_id), osd_lv), - ( - ("osd.stats", osd_id), - '{"uuid": "|", "up": 0, "in": 0, "primary_affinity": "|", "utilization": "|", "var": "|", "pgs": "|", "kb": "|", "weight": "|", "reweight": "|", "node": "|", "used": "|", "avail": "|", "wr_ops": "|", "wr_data": "|", "rd_ops": "|", "rd_data": "|", "state": "|"}', - ), - ] - ) - - # Log it - logger.out("Created new OSD disk with ID {}".format(osd_id), state="o") return True except Exception as e: - # Log it - logger.out("Failed to create new OSD disk: {}".format(e), state="e") + logger.out( + f"Failed to create {split_count} new OSD(s) on disk {device}: {e}", + state="e", + ) return False @staticmethod @@ -828,7 +845,7 @@ class CephOSDInstance(object): @staticmethod def remove_osd(zkhandler, logger, osd_id, osd_obj, force_flag): - logger.out("Removing OSD disk {}".format(osd_id), state="i") + logger.out("Removing OSD {}".format(osd_id), state="i") try: # Verify the OSD is present retcode, stdout, stderr = common.run_os_command("ceph osd ls") @@ -843,7 +860,7 @@ class CephOSDInstance(object): return True # 1. Set the OSD down and out so it will flush - logger.out("Setting down OSD disk with ID {}".format(osd_id), state="i") + logger.out("Setting down OSD {}".format(osd_id), state="i") retcode, stdout, stderr = common.run_os_command( "ceph osd down {}".format(osd_id) ) @@ -856,7 +873,7 @@ class CephOSDInstance(object): else: raise Exception - logger.out("Setting out OSD disk with ID {}".format(osd_id), state="i") + logger.out("Setting out OSD {}".format(osd_id), state="i") retcode, stdout, stderr = common.run_os_command( "ceph osd out {}".format(osd_id) ) @@ -881,7 +898,7 @@ class CephOSDInstance(object): time.sleep(5) # 3. Stop the OSD process and wait for it to be terminated - logger.out("Stopping OSD disk with ID {}".format(osd_id), state="i") + logger.out("Stopping OSD {}".format(osd_id), state="i") retcode, stdout, stderr = common.run_os_command( "systemctl stop ceph-osd@{}".format(osd_id) ) @@ -922,25 +939,8 @@ class CephOSDInstance(object): else: raise Exception - # 5. Zap the volumes - logger.out( - "Zapping OSD {} disk on {}".format(osd_id, osd_device), - state="i", - ) - retcode, stdout, stderr = common.run_os_command( - "ceph-volume lvm zap --destroy {}".format(osd_device) - ) - if retcode: - print("ceph-volume lvm zap") - print(stdout) - print(stderr) - if force_flag: - logger.out("Ignoring error due to force flag", state="i") - else: - raise Exception - - # 6. Purge the OSD from Ceph - logger.out("Purging OSD disk with ID {}".format(osd_id), state="i") + # 5. Purge the OSD from Ceph + logger.out("Purging OSD {}".format(osd_id), state="i") retcode, stdout, stderr = common.run_os_command( "ceph osd purge {} --yes-i-really-mean-it".format(osd_id) ) @@ -964,19 +964,15 @@ class CephOSDInstance(object): ) # 8. 
Delete OSD from ZK - logger.out( - "Deleting OSD disk with ID {} from Zookeeper".format(osd_id), state="i" - ) + logger.out("Deleting OSD {} from Zookeeper".format(osd_id), state="i") zkhandler.delete(("osd", osd_id), recursive=True) # Log it - logger.out("Removed OSD disk with ID {}".format(osd_id), state="o") + logger.out("Successfully removed OSD {}".format(osd_id), state="o") return True except Exception as e: # Log it - logger.out( - "Failed to purge OSD disk with ID {}: {}".format(osd_id, e), state="e" - ) + logger.out("Failed to remove OSD {}: {}".format(osd_id, e), state="e") return False @staticmethod @@ -1245,16 +1241,34 @@ def ceph_command(zkhandler, logger, this_node, data, d_osd): # Adding a new OSD if command == "osd_add": - node, device, weight, ext_db_flag, ext_db_ratio = args.split(",") + ( + node, + device, + weight, + ext_db_flag, + ext_db_ratio, + split_flag, + split_count, + ) = args.split(",") ext_db_flag = bool(strtobool(ext_db_flag)) ext_db_ratio = float(ext_db_ratio) + split_flag = bool(strtobool(split_flag)) + split_count = int(split_count) if node == this_node.name: # Lock the command queue zk_lock = zkhandler.writelock("base.cmd.ceph") with zk_lock: # Add the OSD result = CephOSDInstance.add_osd( - zkhandler, logger, node, device, weight, ext_db_flag, ext_db_ratio + zkhandler, + logger, + node, + device, + weight, + ext_db_flag, + ext_db_ratio, + split_flag, + split_count, ) # Command succeeded if result: