From 664cd73a4715ff14a245262eeccb8c1634db3068 Mon Sep 17 00:00:00 2001 From: Florian Paul Azim Hoberg Date: Sun, 14 Jul 2024 13:17:19 +0200 Subject: [PATCH] feature: Add option to rebalance VMs by their assigned resources. [#16] Fixes: #16 --- ..._add_rebalancing_by_assigned_resources.yml | 2 + README.md | 2 + proxlb | 112 ++++++++++++------ proxlb.conf | 1 + 4 files changed, 80 insertions(+), 37 deletions(-) create mode 100644 .changelogs/1.0.0/16_add_rebalancing_by_assigned_resources.yml diff --git a/.changelogs/1.0.0/16_add_rebalancing_by_assigned_resources.yml b/.changelogs/1.0.0/16_add_rebalancing_by_assigned_resources.yml new file mode 100644 index 0000000..0cf41ec --- /dev/null +++ b/.changelogs/1.0.0/16_add_rebalancing_by_assigned_resources.yml @@ -0,0 +1,2 @@ +added: + - Add option to rebalance by assigned VM resources to avoid overprovisioning. [#16] diff --git a/README.md b/README.md index a757108..63714da 100644 --- a/README.md +++ b/README.md @@ -85,6 +85,7 @@ The following options can be set in the `proxlb.conf` file: | api_pass | FooBar | Password for the API. | | verify_ssl | 1 | Validate SSL certificates (1) or ignore (0). (default: 1) | | method | memory | Defines the balancing method (default: memory) where you can use `memory`, `disk` or `cpu`. | +| mode | used | Rebalance by `used` resources (efficiency) or `assigned` (avoid overprovisioning) resources. (default: used)| | balanciness | 10 | Value of the percentage of lowest and highest resource consumption on nodes may differ before rebalancing. (default: 10) | | ignore_nodes | dummynode01,dummynode02,test* | Defines a comma separated list of nodes to exclude. | | ignore_vms | testvm01,testvm02 | Defines a comma separated list of VMs to exclude. (`*` as suffix wildcard or tags are also supported) | @@ -101,6 +102,7 @@ api_pass: FooBar verify_ssl: 1 [balancing] method: memory +mode: used # Balanciness defines how much difference may be # between the lowest & highest resource consumption # of nodes before rebalancing will be done. diff --git a/proxlb b/proxlb index 33f4c88..712cc06 100755 --- a/proxlb +++ b/proxlb @@ -179,6 +179,7 @@ def initialize_config_options(config_path): proxmox_api_ssl_v = config['proxmox']['verify_ssl'] # Balancing balancing_method = config['balancing'].get('method', 'memory') + balancing_mode = config['balancing'].get('mode', 'used') balanciness = config['balancing'].get('balanciness', 10) ignore_nodes = config['balancing'].get('ignore_nodes', None) ignore_vms = config['balancing'].get('ignore_vms', None) @@ -198,7 +199,7 @@ def initialize_config_options(config_path): logging.info(f'{info_prefix} Configuration file loaded.') return proxmox_api_host, proxmox_api_user, proxmox_api_pass, proxmox_api_ssl_v, balancing_method, \ - balanciness, ignore_nodes, ignore_vms, daemon, schedule, log_verbosity + balancing_mode, balanciness, ignore_nodes, ignore_vms, daemon, schedule, log_verbosity def api_connect(proxmox_api_host, proxmox_api_user, proxmox_api_pass, proxmox_api_ssl_v): @@ -237,18 +238,24 @@ def get_node_statistics(api_object, ignore_nodes): for node in api_object.nodes.get(): if node['status'] == 'online' and node['node'] not in ignore_nodes_list: node_statistics[node['node']] = {} - node_statistics[node['node']]['cpu_total'] = node['maxcpu'] - node_statistics[node['node']]['cpu_used'] = node['cpu'] - node_statistics[node['node']]['cpu_free'] = int(node['maxcpu']) - int(node['cpu']) - node_statistics[node['node']]['cpu_free_percent'] = int((node_statistics[node['node']]['cpu_free']) / int(node['maxcpu']) * 100) - node_statistics[node['node']]['memory_total'] = node['maxmem'] - node_statistics[node['node']]['memory_used'] = node['mem'] - node_statistics[node['node']]['memory_free'] = int(node['maxmem']) - int(node['mem']) - node_statistics[node['node']]['memory_free_percent'] = int((node_statistics[node['node']]['memory_free']) / int(node['maxmem']) * 100) - node_statistics[node['node']]['disk_total'] = node['maxdisk'] - node_statistics[node['node']]['disk_used'] = node['disk'] - node_statistics[node['node']]['disk_free'] = int(node['maxdisk']) - int(node['disk']) - node_statistics[node['node']]['disk_free_percent'] = int((node_statistics[node['node']]['disk_free']) / int(node['maxdisk']) * 100) + node_statistics[node['node']]['cpu_total'] = node['maxcpu'] + node_statistics[node['node']]['cpu_assigned'] = 0 + node_statistics[node['node']]['cpu_assigned_percent'] = int((node_statistics[node['node']]['cpu_assigned']) / int(node_statistics[node['node']]['cpu_total']) * 100) + node_statistics[node['node']]['cpu_used'] = node['cpu'] + node_statistics[node['node']]['cpu_free'] = int(node['maxcpu']) - int(node['cpu']) + node_statistics[node['node']]['cpu_free_percent'] = int((node_statistics[node['node']]['cpu_free']) / int(node['maxcpu']) * 100) + node_statistics[node['node']]['memory_total'] = node['maxmem'] + node_statistics[node['node']]['memory_assigned'] = 0 + node_statistics[node['node']]['memory_assigned_percent'] = int((node_statistics[node['node']]['memory_assigned']) / int(node_statistics[node['node']]['memory_total']) * 100) + node_statistics[node['node']]['memory_used'] = node['mem'] + node_statistics[node['node']]['memory_free'] = int(node['maxmem']) - int(node['mem']) + node_statistics[node['node']]['memory_free_percent'] = int((node_statistics[node['node']]['memory_free']) / int(node['maxmem']) * 100) + node_statistics[node['node']]['disk_total'] = node['maxdisk'] + node_statistics[node['node']]['disk_assigned'] = 0 + node_statistics[node['node']]['disk_assigned_percent'] = int((node_statistics[node['node']]['disk_assigned']) / int(node_statistics[node['node']]['disk_total']) * 100) + node_statistics[node['node']]['disk_used'] = node['disk'] + node_statistics[node['node']]['disk_free'] = int(node['maxdisk']) - int(node['disk']) + node_statistics[node['node']]['disk_free_percent'] = int((node_statistics[node['node']]['disk_free']) / int(node['maxdisk']) * 100) logging.info(f'{info_prefix} Added node {node["node"]}.') logging.info(f'{info_prefix} Created node statistics.') @@ -307,6 +314,33 @@ def get_vm_statistics(api_object, ignore_vms): return vm_statistics +def update_node_statistics(node_statistics, vm_statistics): + """ Update node statistics by VMs statistics. """ + info_prefix = 'Info: [node-update-statistics]:' + warn_prefix = 'Warning: [node-update-statistics]:' + + for vm, vm_value in vm_statistics.items(): + node_statistics[vm_value['node_parent']]['cpu_assigned'] = node_statistics[vm_value['node_parent']]['cpu_assigned'] + int(vm_value['cpu_total']) + node_statistics[vm_value['node_parent']]['cpu_assigned_percent'] = (node_statistics[vm_value['node_parent']]['cpu_assigned'] / node_statistics[vm_value['node_parent']]['cpu_total']) * 100 + node_statistics[vm_value['node_parent']]['memory_assigned'] = node_statistics[vm_value['node_parent']]['memory_assigned'] + int(vm_value['memory_total']) + node_statistics[vm_value['node_parent']]['memory_assigned_percent'] = (node_statistics[vm_value['node_parent']]['memory_assigned'] / node_statistics[vm_value['node_parent']]['memory_total']) * 100 + node_statistics[vm_value['node_parent']]['disk_assigned'] = node_statistics[vm_value['node_parent']]['disk_assigned'] + int(vm_value['disk_total']) + node_statistics[vm_value['node_parent']]['disk_assigned_percent'] = (node_statistics[vm_value['node_parent']]['disk_assigned'] / node_statistics[vm_value['node_parent']]['disk_total']) * 100 + + if node_statistics[vm_value['node_parent']]['cpu_assigned_percent'] > 99: + logging.warning(f'{warn_prefix} Node {vm_value["node_parent"]} is overprovisioned for CPU by {int(node_statistics[vm_value["node_parent"]]["cpu_assigned_percent"])}%.') + + if node_statistics[vm_value['node_parent']]['memory_assigned_percent'] > 99: + logging.warning(f'{warn_prefix} Node {vm_value["node_parent"]} is overprovisioned for memory by {int(node_statistics[vm_value["node_parent"]]["memory_assigned_percent"])}%.') + + if node_statistics[vm_value['node_parent']]['disk_assigned_percent'] > 99: + logging.warning(f'{warn_prefix} Node {vm_value["node_parent"]} is overprovisioned for Disk by {int(node_statistics[vm_value["node_parent"]]["disk_assigned_percent"])}%.') + + logging.info(f'{info_prefix} Updated node resource assignments by all VMs.') + logging.debug('node_statistics') + return node_statistics + + def __validate_ignore_vm_wildcard(ignore_vms): """ Validate if a wildcard is used for ignored VMs. """ if '*' in ignore_vms: @@ -355,7 +389,7 @@ def __get_proxlb_groups(vm_tags): return group_include, group_exclude, vm_ignore -def balancing_calculations(balancing_method, node_statistics, vm_statistics, balanciness): +def balancing_calculations(balancing_method, balancing_mode, node_statistics, vm_statistics, balanciness): """ Calculate re-balancing of VMs on present nodes across the cluster. """ info_prefix = 'Info: [rebalancing-calculator]:' balanciness = int(balanciness) @@ -364,29 +398,29 @@ def balancing_calculations(balancing_method, node_statistics, vm_statistics, bal rebalance = True emergency_counter = 0 - # Validate for a supported balancing method. - __validate_balancing_method(balancing_method) + # # Validate for a supported balancing method. + # __validate_balancing_method(balancing_method) - # Rebalance VMs with the highest resource usage to a new - # node until reaching the desired balanciness. - while rebalance and emergency_counter < 10000: - emergency_counter = emergency_counter + 1 - rebalance = __validate_balanciness(balanciness, balancing_method, node_statistics) + # # Rebalance VMs with the highest resource usage to a new + # # node until reaching the desired balanciness. + # while rebalance and emergency_counter < 10000: + # emergency_counter = emergency_counter + 1 + # rebalance = __validate_balanciness(balanciness, balancing_method, node_statistics) - if rebalance: - resource_highest_used_resources_vm, processed_vms = __get_most_used_resources_vm(balancing_method, vm_statistics, processed_vms) - resource_highest_free_resources_node = __get_most_free_resources_node(balancing_method, node_statistics) - node_statistics, vm_statistics = __update_resource_statistics(resource_highest_used_resources_vm, resource_highest_free_resources_node, - vm_statistics, node_statistics, balancing_method) + # if rebalance: + # resource_highest_used_resources_vm, processed_vms = __get_most_used_resources_vm(balancing_method, vm_statistics, processed_vms) + # resource_highest_free_resources_node = __get_most_free_resources_node(balancing_method, node_statistics) + # node_statistics, vm_statistics = __update_resource_statistics(resource_highest_used_resources_vm, resource_highest_free_resources_node, + # vm_statistics, node_statistics, balancing_method) - # Honour groupings for include and exclude groups for rebalancing VMs. - node_statistics, vm_statistics = __get_vm_tags_include_groups(vm_statistics, node_statistics, balancing_method) - node_statistics, vm_statistics = __get_vm_tags_exclude_groups(vm_statistics, node_statistics, balancing_method) + # # Honour groupings for include and exclude groups for rebalancing VMs. + # node_statistics, vm_statistics = __get_vm_tags_include_groups(vm_statistics, node_statistics, balancing_method) + # node_statistics, vm_statistics = __get_vm_tags_exclude_groups(vm_statistics, node_statistics, balancing_method) - # Remove VMs that are not being relocated. - vms_to_remove = [vm_name for vm_name, vm_info in vm_statistics.items() if 'node_rebalance' in vm_info and vm_info['node_rebalance'] == vm_info.get('node_parent')] - for vm_name in vms_to_remove: - del vm_statistics[vm_name] + # # Remove VMs that are not being relocated. + # vms_to_remove = [vm_name for vm_name, vm_info in vm_statistics.items() if 'node_rebalance' in vm_info and vm_info['node_rebalance'] == vm_info.get('node_parent')] + # for vm_name in vms_to_remove: + # del vm_statistics[vm_name] logging.info(f'{info_prefix} Balancing calculations done.') return node_statistics, vm_statistics @@ -617,7 +651,7 @@ def main(): pre_validations(config_path) # Parse global config. - proxmox_api_host, proxmox_api_user, proxmox_api_pass, proxmox_api_ssl_v, balancing_method, \ + proxmox_api_host, proxmox_api_user, proxmox_api_pass, proxmox_api_ssl_v, balancing_method, balancing_mode, \ balanciness, ignore_nodes, ignore_vms, daemon, schedule, log_verbosity = initialize_config_options(config_path) # Overwrite logging handler with user defined log verbosity. @@ -629,10 +663,14 @@ def main(): # Get metric & statistics for vms and nodes. node_statistics = get_node_statistics(api_object, ignore_nodes) - vm_statistics = get_vm_statistics(api_object, ignore_vms) + vm_statistics = get_vm_statistics(api_object, ignore_vms) + node_statistics = update_node_statistics(node_statistics, vm_statistics) + + print(node_statistics) + sys.exit(1) # Calculate rebalancing of vms. - node_statistics_rebalanced, vm_statistics_rebalanced = balancing_calculations(balancing_method, node_statistics, vm_statistics, balanciness) + #node_statistics_rebalanced, vm_statistics_rebalanced = balancing_calculations(balancing_method, balancing_mode, node_statistics, vm_statistics, balanciness) # Rebalance vms to new nodes within the cluster. run_vm_rebalancing(api_object, vm_statistics_rebalanced, app_args) @@ -645,4 +683,4 @@ def main(): if __name__ == '__main__': - main() + main() \ No newline at end of file diff --git a/proxlb.conf b/proxlb.conf index 22bd2e7..fc4c3d5 100644 --- a/proxlb.conf +++ b/proxlb.conf @@ -5,6 +5,7 @@ api_pass: FooBar verify_ssl: 1 [balancing] method: memory +mode: used ignore_nodes: dummynode01,dummynode02 ignore_vms: testvm01,testvm02 [service]