diff --git a/.changelogs/1.0.0/10_add_container_support.yml b/.changelogs/1.0.0/10_add_container_support.yml
index 0f822c5..0fd4fb9 100644
--- a/.changelogs/1.0.0/10_add_container_support.yml
+++ b/.changelogs/1.0.0/10_add_container_support.yml
@@ -1,2 +1,2 @@
 added:
-  - Add container (e.g., Docker, Podman) support. [#10 by @daanbosch]
+  - Add Docker/Podman support. [#10 by @daanbosch]
diff --git a/.changelogs/1.0.0/27_add_container_lxc_support.yml b/.changelogs/1.0.0/27_add_container_lxc_support.yml
new file mode 100644
index 0000000..48253b1
--- /dev/null
+++ b/.changelogs/1.0.0/27_add_container_lxc_support.yml
@@ -0,0 +1,2 @@
+added:
+  - Add LXC container (CT) balancing support. [#27]
diff --git a/README.md b/README.md
index 32cf226..208a051 100644
--- a/README.md
+++ b/README.md
@@ -16,8 +16,9 @@
   - [Options](#options)
   - [Parameters](#parameters)
 - [Balancing](#balancing)
-  - [By Used Memory of VMs](#by-used-memmory-of-vms)
-  - [By Assigned Memory of VMs](#by-assigned-memory-of-vms)
+  - [General](#general)
+  - [By Used Memory of VMs/CTs](#by-used-memory-of-vmscts)
+  - [By Assigned Memory of VMs/CTs](#by-assigned-memory-of-vmscts)
 - [Grouping](#grouping)
   - [Include (Stay Together)](#include-stay-together)
   - [Exclude (Stay Separate)](#exclude-stay-separate)
@@ -37,7 +38,7 @@
 - [Author(s)](#authors)
 
 ## Introduction
-`ProxLB` (PLB) is an advanced tool designed to enhance the efficiency and performance of Proxmox clusters by optimizing the distribution of virtual machines (VMs) across the cluster nodes by using the Proxmox API. ProxLB meticulously gathers and analyzes a comprehensive set of resource metrics from both the cluster nodes and the running VMs. These metrics include CPU usage, memory consumption, and disk utilization, specifically focusing on local disk resources.
+`ProxLB` (PLB) is an advanced tool designed to enhance the efficiency and performance of Proxmox clusters by optimizing the distribution of virtual machines (VMs) and containers (CTs) across the cluster nodes using the Proxmox API. ProxLB meticulously gathers and analyzes a comprehensive set of resource metrics from both the cluster nodes and the running VMs and CTs. These metrics include CPU usage, memory consumption, and disk utilization, specifically focusing on local disk resources.
 
 PLB collects resource usage data from each node in the Proxmox cluster, including CPU, (local) disk and memory utilization. Additionally, it gathers resource usage statistics from all running VMs, ensuring a granular understanding of the cluster's workload distribution.
 
@@ -56,6 +57,10 @@ Automated rebalancing reduces the need for manual actions, allowing operators to
 * Performing
   * Periodically
   * One-shot solution
+* Types
+  * Rebalance only VMs
+  * Rebalance only CTs
+  * Rebalance all (VMs and CTs)
 * Filter
   * Exclude nodes
   * Exclude virtual machines
@@ -91,6 +96,7 @@ The following options can be set in the `proxlb.conf` file:
 | verify_ssl | 1 | Validate SSL certificates (1) or ignore (0). (default: 1) |
 | method | memory | Defines the balancing method (default: memory) where you can use `memory`, `disk` or `cpu`. |
 | mode | used | Rebalance by `used` (efficiency) or `assigned` (avoid overprovisioning) resources. (default: used) |
+| type | vm | Rebalance only `vm` (virtual machines), `ct` (containers) or `all` (virtual machines & containers). (default: vm) |
 | balanciness | 10 | Maximum difference (in percent) between the lowest and highest resource consumption of nodes before rebalancing is triggered. (default: 10) |
 | ignore_nodes | dummynode01,dummynode02,test* | Defines a comma separated list of nodes to exclude. |
 | ignore_vms | testvm01,testvm02 | Defines a comma separated list of VMs to exclude. (`*` as suffix wildcard or tags are also supported) |
@@ -108,6 +114,7 @@ verify_ssl: 1
 
 [balancing]
 method: memory
 mode: used
+type: vm
 # Balanciness defines how much difference may be
 # between the lowest & highest resource consumption
 # of nodes before rebalancing will be done.
@@ -131,7 +138,10 @@ The following options and parameters are currently supported:
 | -j | --json | Return a JSON of the VM movement. | Unset |
 
 ### Balancing
-#### By Used Memmory of VMs
+#### General
+In general, virtual machines and containers can be rebalanced and moved between the nodes in the cluster. For virtual machines, this is usually performed as a live migration without any downtime. However, this does **not** apply to containers: LXC based containers will be shut down, copied and started on the new node. Note that live migrations usually work smoothly, but there are still several prerequisites to consider; these are out of scope for ProxLB and apply to Proxmox and your cluster setup in general. You can find more details here: https://pve.proxmox.com/wiki/Migrate_to_Proxmox_VE.
+
+#### By Used Memory of VMs/CTs
 By continuously monitoring the current resource usage of VMs, ProxLB intelligently reallocates workloads to prevent any single node from becoming overloaded. This approach ensures that resources are balanced efficiently, providing consistent and optimal performance across the entire cluster at all times. To activate this balancing mode, simply set the following option in your ProxLB configuration:
 ```
 mode: used
@@ -139,7 +149,7 @@
 ```
 
 Afterwards, restart the service (if running in daemon mode) to activate this rebalancing mode.
 
-#### By Assigned Memory of VMs
+#### By Assigned Memory of VMs/CTs
 By ensuring that resources are always available for each VM, ProxLB prevents over-provisioning and maintains a balanced load across all nodes. This guarantees that users have consistent access to the resources they need. However, if the total assigned resources exceed the combined capacity of the cluster, ProxLB will issue a warning, indicating potential over-provisioning despite its best efforts to balance the load.
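+
+The following minimal sketch illustrates the idea behind this capacity check. It is a simplified illustration rather than ProxLB's actual implementation, and the numbers are made up; ProxLB gathers the real values via the Proxmox API:
+```
+# Simplified illustration of the over-provisioning warning in "assigned" mode.
+node_memory_total = {'node01': 64, 'node02': 64}              # capacity in GiB
+guest_assigned_memory = {'vm01': 48, 'vm02': 32, 'ct01': 64}  # assigned per guest
+
+cluster_capacity = sum(node_memory_total.values())
+total_assigned = sum(guest_assigned_memory.values())
+
+if total_assigned > cluster_capacity:
+    # Balancing still runs, but the cluster may be over-provisioned.
+    print(f'Warning: assigned memory ({total_assigned} GiB) exceeds '
+          f'cluster capacity ({cluster_capacity} GiB).')
+```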
 To activate this balancing mode, simply set the following option in your ProxLB configuration:
 ```
 mode: assigned
 ```
diff --git a/proxlb b/proxlb
index dc11484..ef3db12 100755
--- a/proxlb
+++ b/proxlb
@@ -180,6 +180,7 @@ def initialize_config_options(config_path):
         # Balancing
         balancing_method = config['balancing'].get('method', 'memory')
         balancing_mode = config['balancing'].get('mode', 'used')
+        balancing_type = config['balancing'].get('type', 'vm')
         balanciness = config['balancing'].get('balanciness', 10)
         ignore_nodes = config['balancing'].get('ignore_nodes', None)
         ignore_vms = config['balancing'].get('ignore_vms', None)
@@ -199,7 +200,7 @@
 
     logging.info(f'{info_prefix} Configuration file loaded.')
     return proxmox_api_host, proxmox_api_user, proxmox_api_pass, proxmox_api_ssl_v, balancing_method, \
-        balancing_mode, balanciness, ignore_nodes, ignore_vms, daemon, schedule, log_verbosity
+        balancing_mode, balancing_type, balanciness, ignore_nodes, ignore_vms, daemon, schedule, log_verbosity
 
 
 def api_connect(proxmox_api_host, proxmox_api_user, proxmox_api_pass, proxmox_api_ssl_v):
@@ -268,9 +269,10 @@ def get_node_statistics(api_object, ignore_nodes):
     return node_statistics
 
 
-def get_vm_statistics(api_object, ignore_vms):
+def get_vm_statistics(api_object, ignore_vms, balancing_type):
     """ Get statistics of cpu, memory and disk for each vm in the cluster. """
     info_prefix = 'Info: [vm-statistics]:'
+    warn_prefix = 'Warn: [vm-statistics]:'
     vm_statistics = {}
     ignore_vms_list = ignore_vms.split(',')
     group_include = None
@@ -283,38 +285,80 @@ def get_vm_statistics(api_object, ignore_vms):
     vm_ignore_wildcard = __validate_ignore_vm_wildcard(ignore_vms)
 
     for node in api_object.nodes.get():
-        for vm in api_object.nodes(node['node']).qemu.get():
-
-            # Get the VM tags from API.
-            vm_tags = __get_vm_tags(api_object, node, vm['vmid'])
-            if vm_tags is not None:
-                group_include, group_exclude, vm_ignore = __get_proxlb_groups(vm_tags)
-
-            # Get wildcard match for VMs to ignore if a wildcard pattern was
-            # previously found. Wildcards may slow down the task when using
-            # many patterns in the ignore list. Therefore, run this only if
-            # a wildcard pattern was found. We also do not need to validate
-            # this if the VM is already being ignored by a defined tag.
-            if vm_ignore_wildcard and not vm_ignore:
-                vm_ignore = __check_vm_name_wildcard_pattern(vm['name'], ignore_vms_list)
-
-            if vm['status'] == 'running' and vm['name'] not in ignore_vms_list and not vm_ignore:
-                vm_statistics[vm['name']] = {}
-                vm_statistics[vm['name']]['group_include'] = group_include
-                vm_statistics[vm['name']]['group_exclude'] = group_exclude
-                vm_statistics[vm['name']]['cpu_total'] = vm['cpus']
-                vm_statistics[vm['name']]['cpu_used'] = vm['cpu']
-                vm_statistics[vm['name']]['memory_total'] = vm['maxmem']
-                vm_statistics[vm['name']]['memory_used'] = vm['mem']
-                vm_statistics[vm['name']]['disk_total'] = vm['maxdisk']
-                vm_statistics[vm['name']]['disk_used'] = vm['disk']
-                vm_statistics[vm['name']]['vmid'] = vm['vmid']
-                vm_statistics[vm['name']]['node_parent'] = node['node']
-                # Rebalancing node will be overwritten after calculations.
-                # If the vm stays on the node, it will be removed at a
-                # later time.
-                vm_statistics[vm['name']]['node_rebalance'] = node['node']
-                logging.info(f'{info_prefix} Added vm {vm["name"]}.')
+
+        # Add all virtual machines if type is 'vm' or 'all'.
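+        # The balancing type comes from the new 'type' option in the
+        # [balancing] section of proxlb.conf and is one of: vm, ct, all.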
+        if balancing_type == 'vm' or balancing_type == 'all':
+            for vm in api_object.nodes(node['node']).qemu.get():
+
+                # Get the VM tags from API.
+                vm_tags = __get_vm_tags(api_object, node, vm['vmid'], 'vm')
+                if vm_tags is not None:
+                    group_include, group_exclude, vm_ignore = __get_proxlb_groups(vm_tags)
+
+                # Get wildcard match for VMs to ignore if a wildcard pattern was
+                # previously found. Wildcards may slow down the task when using
+                # many patterns in the ignore list. Therefore, run this only if
+                # a wildcard pattern was found. We also do not need to validate
+                # this if the VM is already being ignored by a defined tag.
+                if vm_ignore_wildcard and not vm_ignore:
+                    vm_ignore = __check_vm_name_wildcard_pattern(vm['name'], ignore_vms_list)
+
+                if vm['status'] == 'running' and vm['name'] not in ignore_vms_list and not vm_ignore:
+                    vm_statistics[vm['name']] = {}
+                    vm_statistics[vm['name']]['group_include'] = group_include
+                    vm_statistics[vm['name']]['group_exclude'] = group_exclude
+                    vm_statistics[vm['name']]['cpu_total'] = vm['cpus']
+                    vm_statistics[vm['name']]['cpu_used'] = vm['cpu']
+                    vm_statistics[vm['name']]['memory_total'] = vm['maxmem']
+                    vm_statistics[vm['name']]['memory_used'] = vm['mem']
+                    vm_statistics[vm['name']]['disk_total'] = vm['maxdisk']
+                    vm_statistics[vm['name']]['disk_used'] = vm['disk']
+                    vm_statistics[vm['name']]['vmid'] = vm['vmid']
+                    vm_statistics[vm['name']]['node_parent'] = node['node']
+                    vm_statistics[vm['name']]['type'] = 'vm'
+                    # Rebalancing node will be overwritten after calculations.
+                    # If the vm stays on the node, it will be removed at a
+                    # later time.
+                    vm_statistics[vm['name']]['node_rebalance'] = node['node']
+                    logging.info(f'{info_prefix} Added vm {vm["name"]}.')
+
+        # Add all containers if type is 'ct' or 'all'.
+        if balancing_type == 'ct' or balancing_type == 'all':
+            for vm in api_object.nodes(node['node']).lxc.get():
+
+                # Get the CT tags from API.
+                vm_tags = __get_vm_tags(api_object, node, vm['vmid'], 'ct')
+                if vm_tags is not None:
+                    group_include, group_exclude, vm_ignore = __get_proxlb_groups(vm_tags)
+
+                # Get wildcard match for CTs to ignore if a wildcard pattern was
+                # previously found. Wildcards may slow down the task when using
+                # many patterns in the ignore list. Therefore, run this only if
+                # a wildcard pattern was found. We also do not need to validate
+                # this if the CT is already being ignored by a defined tag.
+                if vm_ignore_wildcard and not vm_ignore:
+                    vm_ignore = __check_vm_name_wildcard_pattern(vm['name'], ignore_vms_list)
+
+                if vm['status'] == 'running' and vm['name'] not in ignore_vms_list and not vm_ignore:
+                    # Warn only for containers that are actually considered for
+                    # rebalancing instead of logging for every CT in the cluster.
+                    logging.warning(f'{warn_prefix} Rebalancing LXC containers (CT) always requires them to be shut down.')
+                    logging.warning(f'{warn_prefix} {vm["name"]} is of type CT and cannot be live migrated!')
+                    vm_statistics[vm['name']] = {}
+                    vm_statistics[vm['name']]['group_include'] = group_include
+                    vm_statistics[vm['name']]['group_exclude'] = group_exclude
+                    vm_statistics[vm['name']]['cpu_total'] = vm['cpus']
+                    vm_statistics[vm['name']]['cpu_used'] = vm['cpu']
+                    vm_statistics[vm['name']]['memory_total'] = vm['maxmem']
+                    vm_statistics[vm['name']]['memory_used'] = vm['mem']
+                    vm_statistics[vm['name']]['disk_total'] = vm['maxdisk']
+                    vm_statistics[vm['name']]['disk_used'] = vm['disk']
+                    vm_statistics[vm['name']]['vmid'] = vm['vmid']
+                    vm_statistics[vm['name']]['node_parent'] = node['node']
+                    vm_statistics[vm['name']]['type'] = 'ct'
+                    # Rebalancing node will be overwritten after calculations.
+                    # If the vm stays on the node, it will be removed at a
+                    # later time.
+                    vm_statistics[vm['name']]['node_rebalance'] = node['node']
+                    logging.info(f'{info_prefix} Added CT {vm["name"]}.')
 
     logging.info(f'{info_prefix} Created VM statistics.')
     return vm_statistics
@@ -361,12 +405,17 @@ def __check_vm_name_wildcard_pattern(vm_name, ignore_vms_list):
             return True
 
 
-def __get_vm_tags(api_object, node, vmid):
-    """ Get a comment for a VM from a given VMID. """
+def __get_vm_tags(api_object, node, vmid, balancing_type):
+    """ Get tags for a VM/CT from a given VMID. """
     info_prefix = 'Info: [api-get-vm-tags]:'
-    vm_config = api_object.nodes(node['node']).qemu(vmid).config.get()
-    logging.info(f'{info_prefix} Got VM comment from API.')
+    if balancing_type == 'vm':
+        vm_config = api_object.nodes(node['node']).qemu(vmid).config.get()
+    elif balancing_type == 'ct':
+        vm_config = api_object.nodes(node['node']).lxc(vmid).config.get()
+
+    logging.info(f'{info_prefix} Got VM/CT tags from API.')
     return vm_config.get('tags', None)
@@ -672,9 +721,9 @@ def __create_dry_run_output(vm_statistics_rebalanced, app_args):
     vm_to_node_list = []
 
     logging.info(f'{info_prefix} Starting dry-run to rebalance vms to their new nodes.')
-    vm_to_node_list.append(['VM', 'Current Node', 'Rebalanced Node'])
+    vm_to_node_list.append(['VM', 'Current Node', 'Rebalanced Node', 'Type'])
     for vm_name, vm_values in vm_statistics_rebalanced.items():
-        vm_to_node_list.append([vm_name, vm_values['node_parent'], vm_values['node_rebalance']])
+        vm_to_node_list.append([vm_name, vm_values['node_parent'], vm_values['node_rebalance'], vm_values['type']])
 
     if len(vm_statistics_rebalanced) > 0:
         logging.info(f'{info_prefix} Printing cli output of VM rebalancing.')
@@ -711,7 +760,7 @@ def main():
     pre_validations(config_path)
 
     # Parse global config.
-    proxmox_api_host, proxmox_api_user, proxmox_api_pass, proxmox_api_ssl_v, balancing_method, balancing_mode, \
+    proxmox_api_host, proxmox_api_user, proxmox_api_pass, proxmox_api_ssl_v, balancing_method, balancing_mode, balancing_type, \
         balanciness, ignore_nodes, ignore_vms, daemon, schedule, log_verbosity = initialize_config_options(config_path)
 
     # Overwrite logging handler with user defined log verbosity.
@@ -723,7 +772,7 @@ def main():
 
     # Get metric & statistics for vms and nodes.
     node_statistics = get_node_statistics(api_object, ignore_nodes)
-    vm_statistics = get_vm_statistics(api_object, ignore_vms)
+    vm_statistics = get_vm_statistics(api_object, ignore_vms, balancing_type)
     node_statistics = update_node_statistics(node_statistics, vm_statistics)
 
     # Calculate rebalancing of vms.
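
For illustration, the following condensed, standalone sketch shows the guest collection that the new `type` option enables. It is not part of the patch; the hostname and credentials are placeholders and a reachable cluster is assumed. It only mirrors the proxmoxer calls already used by proxlb above:
```
from proxmoxer import ProxmoxAPI

# Placeholder connection details -- replace with a real cluster.
api = ProxmoxAPI('pve.example.com', user='root@pam', password='secret', verify_ssl=False)

balancing_type = 'all'  # mirrors the new 'type' config option: 'vm', 'ct' or 'all'

guests = []
for node in api.nodes.get():
    # QEMU virtual machines can be live migrated.
    if balancing_type in ('vm', 'all'):
        guests += [(vm['name'], 'vm', node['node']) for vm in api.nodes(node['node']).qemu.get()]
    # LXC containers are shut down, copied and started on the target node.
    if balancing_type in ('ct', 'all'):
        guests += [(ct['name'], 'ct', node['node']) for ct in api.nodes(node['node']).lxc.get()]

print(guests)
```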