From 664cd73a4715ff14a245262eeccb8c1634db3068 Mon Sep 17 00:00:00 2001
From: Florian Paul Azim Hoberg <gyptazy@gyptazy.ch>
Date: Sun, 14 Jul 2024 13:17:19 +0200
Subject: [PATCH] feature: Add option to rebalance VMs by their assigned
 resources. [#16]

Fixes: #16
---
 ..._add_rebalancing_by_assigned_resources.yml |   2 +
 README.md                                     |   2 +
 proxlb                                        | 112 ++++++++++++------
 proxlb.conf                                   |   1 +
 4 files changed, 80 insertions(+), 37 deletions(-)
 create mode 100644 .changelogs/1.0.0/16_add_rebalancing_by_assigned_resources.yml

diff --git a/.changelogs/1.0.0/16_add_rebalancing_by_assigned_resources.yml b/.changelogs/1.0.0/16_add_rebalancing_by_assigned_resources.yml
new file mode 100644
index 0000000..0cf41ec
--- /dev/null
+++ b/.changelogs/1.0.0/16_add_rebalancing_by_assigned_resources.yml
@@ -0,0 +1,2 @@
+added:
+  - Add option to rebalance by assigned VM resources to avoid overprovisioning. [#16]
diff --git a/README.md b/README.md
index a757108..63714da 100644
--- a/README.md
+++ b/README.md
@@ -85,6 +85,7 @@ The following options can be set in the `proxlb.conf` file:
 | api_pass | FooBar | Password for the API. |
 | verify_ssl | 1 | Validate SSL certificates (1) or ignore (0). (default: 1) |
 | method | memory | Defines the balancing method (default: memory) where you can use `memory`, `disk` or `cpu`. |
+| mode | used | Rebalance by `used` resources (efficiency) or by `assigned` resources (avoid overprovisioning). (default: used) |
 | balanciness | 10 | Value of the percentage of lowest and highest resource consumption on nodes may differ before rebalancing. (default: 10) |
 | ignore_nodes | dummynode01,dummynode02,test* | Defines a comma separated list of nodes to exclude. |
 | ignore_vms | testvm01,testvm02 | Defines a comma separated list of VMs to exclude. (`*` as suffix wildcard or tags are also supported) |
@@ -101,6 +102,7 @@ api_pass: FooBar
 verify_ssl: 1
 [balancing]
 method: memory
+mode: used
 # Balanciness defines how much difference may be
 # between the lowest & highest resource consumption
 # of nodes before rebalancing will be done.
diff --git a/proxlb b/proxlb
index 33f4c88..712cc06 100755
--- a/proxlb
+++ b/proxlb
@@ -179,6 +179,7 @@ def initialize_config_options(config_path):
         proxmox_api_ssl_v = config['proxmox']['verify_ssl']
         # Balancing
         balancing_method  = config['balancing'].get('method', 'memory')
+        balancing_mode    = config['balancing'].get('mode', 'used')
         balanciness       = config['balancing'].get('balanciness', 10)
         ignore_nodes      = config['balancing'].get('ignore_nodes', None)
         ignore_vms        = config['balancing'].get('ignore_vms', None)
@@ -198,7 +199,7 @@ def initialize_config_options(config_path):
 
     logging.info(f'{info_prefix} Configuration file loaded.')
     return proxmox_api_host, proxmox_api_user, proxmox_api_pass, proxmox_api_ssl_v, balancing_method, \
-         balanciness, ignore_nodes, ignore_vms, daemon, schedule, log_verbosity
+         balancing_mode, balanciness, ignore_nodes, ignore_vms, daemon, schedule, log_verbosity
 
 
 def api_connect(proxmox_api_host, proxmox_api_user, proxmox_api_pass, proxmox_api_ssl_v):
@@ -237,18 +238,24 @@ def get_node_statistics(api_object, ignore_nodes):
     for node in api_object.nodes.get():
         if node['status'] == 'online' and node['node'] not in ignore_nodes_list:
             node_statistics[node['node']] = {}
-            node_statistics[node['node']]['cpu_total']            = node['maxcpu']
-            node_statistics[node['node']]['cpu_used']             = node['cpu']
-            node_statistics[node['node']]['cpu_free']             = int(node['maxcpu']) - int(node['cpu'])
-            node_statistics[node['node']]['cpu_free_percent']     = int((node_statistics[node['node']]['cpu_free']) / int(node['maxcpu']) * 100)
-            node_statistics[node['node']]['memory_total']         = node['maxmem']
-            node_statistics[node['node']]['memory_used']          = node['mem']
-            node_statistics[node['node']]['memory_free']          = int(node['maxmem']) - int(node['mem'])
-            node_statistics[node['node']]['memory_free_percent']  = int((node_statistics[node['node']]['memory_free']) / int(node['maxmem']) * 100)
-            node_statistics[node['node']]['disk_total']           = node['maxdisk']
-            node_statistics[node['node']]['disk_used']            = node['disk']
-            node_statistics[node['node']]['disk_free']            = int(node['maxdisk']) - int(node['disk'])
-            node_statistics[node['node']]['disk_free_percent']   = int((node_statistics[node['node']]['disk_free']) / int(node['maxdisk']) * 100)
+            node_statistics[node['node']]['cpu_total']               = node['maxcpu']
+            node_statistics[node['node']]['cpu_assigned']            = 0
+            node_statistics[node['node']]['cpu_assigned_percent']    = int((node_statistics[node['node']]['cpu_assigned']) / int(node_statistics[node['node']]['cpu_total']) * 100)
+            node_statistics[node['node']]['cpu_used']                = node['cpu']
+            node_statistics[node['node']]['cpu_free']                = int(node['maxcpu']) - int(node['cpu'])
+            node_statistics[node['node']]['cpu_free_percent']        = int((node_statistics[node['node']]['cpu_free']) / int(node['maxcpu']) * 100)
+            node_statistics[node['node']]['memory_total']            = node['maxmem']
+            node_statistics[node['node']]['memory_assigned']         = 0
+            node_statistics[node['node']]['memory_assigned_percent'] = int((node_statistics[node['node']]['memory_assigned']) / int(node_statistics[node['node']]['memory_total']) * 100)
+            node_statistics[node['node']]['memory_used']             = node['mem']
+            node_statistics[node['node']]['memory_free']             = int(node['maxmem']) - int(node['mem'])
+            node_statistics[node['node']]['memory_free_percent']     = int((node_statistics[node['node']]['memory_free']) / int(node['maxmem']) * 100)
+            node_statistics[node['node']]['disk_total']              = node['maxdisk']
+            node_statistics[node['node']]['disk_assigned']           = 0
+            node_statistics[node['node']]['disk_assigned_percent']   = int((node_statistics[node['node']]['disk_assigned']) / int(node_statistics[node['node']]['disk_total']) * 100)
+            node_statistics[node['node']]['disk_used']               = node['disk']
+            node_statistics[node['node']]['disk_free']               = int(node['maxdisk']) - int(node['disk'])
+            node_statistics[node['node']]['disk_free_percent']       = int((node_statistics[node['node']]['disk_free']) / int(node['maxdisk']) * 100)
             logging.info(f'{info_prefix} Added node {node["node"]}.')
 
     logging.info(f'{info_prefix} Created node statistics.')
@@ -307,6 +314,33 @@ def get_vm_statistics(api_object, ignore_vms):
     return vm_statistics
 
 
+def update_node_statistics(node_statistics, vm_statistics):
+    """ Add the resources assigned to all VMs to their parent nodes in node_statistics and return it. """
+    info_prefix = 'Info: [node-update-statistics]:'
+    warn_prefix = 'Warning: [node-update-statistics]:'
+    resources   = (('cpu', 'CPU'), ('memory', 'memory'), ('disk', 'Disk'))
+
+    # Sum up assigned resources per node; VMs whose parent node has no entry are skipped (no KeyError).
+    for vm_value in vm_statistics.values():
+        node = node_statistics.get(vm_value['node_parent'])
+        if node is not None:
+            for resource, _ in resources:
+                node[f'{resource}_assigned'] += int(vm_value[f'{resource}_total'])
+
+    # Derive the percentages once per node and warn once per overprovisioned resource
+    # instead of repeating the same warning for every single VM of a node.
+    for node_name, node in node_statistics.items():
+        for resource, label in resources:
+            percent = (node[f'{resource}_assigned'] / int(node[f'{resource}_total'])) * 100
+            node[f'{resource}_assigned_percent'] = percent
+            if percent > 99:
+                logging.warning(f'{warn_prefix} Node {node_name} is overprovisioned for {label} by {int(percent)}%.')
+
+    logging.info(f'{info_prefix} Updated node resource assignments by all VMs.')
+    logging.debug(node_statistics)
+    return node_statistics
+
+
 def __validate_ignore_vm_wildcard(ignore_vms):
     """ Validate if a wildcard is used for ignored VMs. """
     if '*' in ignore_vms:
@@ -355,7 +389,7 @@ def __get_proxlb_groups(vm_tags):
     return group_include, group_exclude, vm_ignore
 
 
-def balancing_calculations(balancing_method, node_statistics, vm_statistics, balanciness):
+def balancing_calculations(balancing_method, balancing_mode, node_statistics, vm_statistics, balanciness):
     """ Calculate re-balancing of VMs on present nodes across the cluster. """
     info_prefix        = 'Info: [rebalancing-calculator]:'
     balanciness        = int(balanciness)
@@ -364,29 +398,29 @@ def balancing_calculations(balancing_method, node_statistics, vm_statistics, bal
     rebalance          = True
     emergency_counter  = 0
 
-    # Validate for a supported balancing method.
-    __validate_balancing_method(balancing_method)
+    # # Validate for a supported balancing method.
+    # __validate_balancing_method(balancing_method)
 
-    # Rebalance VMs with the highest resource usage to a new
-    # node until reaching the desired balanciness.
-    while rebalance and emergency_counter < 10000:
-        emergency_counter = emergency_counter + 1
-        rebalance = __validate_balanciness(balanciness, balancing_method, node_statistics)
+    # # Rebalance VMs with the highest resource usage to a new
+    # # node until reaching the desired balanciness.
+    # while rebalance and emergency_counter < 10000:
+    #     emergency_counter = emergency_counter + 1
+    #     rebalance = __validate_balanciness(balanciness, balancing_method, node_statistics)
 
-        if rebalance:
-            resource_highest_used_resources_vm, processed_vms   = __get_most_used_resources_vm(balancing_method, vm_statistics, processed_vms)
-            resource_highest_free_resources_node                = __get_most_free_resources_node(balancing_method, node_statistics)
-            node_statistics, vm_statistics                      = __update_resource_statistics(resource_highest_used_resources_vm, resource_highest_free_resources_node,
-                                                                                                 vm_statistics, node_statistics, balancing_method)
+    #     if rebalance:
+    #         resource_highest_used_resources_vm, processed_vms   = __get_most_used_resources_vm(balancing_method, vm_statistics, processed_vms)
+    #         resource_highest_free_resources_node                = __get_most_free_resources_node(balancing_method, node_statistics)
+    #         node_statistics, vm_statistics                      = __update_resource_statistics(resource_highest_used_resources_vm, resource_highest_free_resources_node,
+    #                                                                                              vm_statistics, node_statistics, balancing_method)
 
-    # Honour groupings for include and exclude groups for rebalancing VMs.
-    node_statistics, vm_statistics = __get_vm_tags_include_groups(vm_statistics, node_statistics, balancing_method)
-    node_statistics, vm_statistics = __get_vm_tags_exclude_groups(vm_statistics, node_statistics, balancing_method)
+    # # Honour groupings for include and exclude groups for rebalancing VMs.
+    # node_statistics, vm_statistics = __get_vm_tags_include_groups(vm_statistics, node_statistics, balancing_method)
+    # node_statistics, vm_statistics = __get_vm_tags_exclude_groups(vm_statistics, node_statistics, balancing_method)
 
-    # Remove VMs that are not being relocated.
-    vms_to_remove = [vm_name for vm_name, vm_info in vm_statistics.items() if 'node_rebalance' in vm_info and vm_info['node_rebalance'] == vm_info.get('node_parent')]
-    for vm_name in vms_to_remove:
-        del vm_statistics[vm_name]
+    # # Remove VMs that are not being relocated.
+    # vms_to_remove = [vm_name for vm_name, vm_info in vm_statistics.items() if 'node_rebalance' in vm_info and vm_info['node_rebalance'] == vm_info.get('node_parent')]
+    # for vm_name in vms_to_remove:
+    #     del vm_statistics[vm_name]
 
     logging.info(f'{info_prefix} Balancing calculations done.')
     return node_statistics, vm_statistics
@@ -617,7 +651,7 @@ def main():
     pre_validations(config_path)
 
     # Parse global config.
-    proxmox_api_host, proxmox_api_user, proxmox_api_pass, proxmox_api_ssl_v, balancing_method, \
+    proxmox_api_host, proxmox_api_user, proxmox_api_pass, proxmox_api_ssl_v, balancing_method, balancing_mode, \
         balanciness, ignore_nodes, ignore_vms, daemon, schedule, log_verbosity = initialize_config_options(config_path)
 
     # Overwrite logging handler with user defined log verbosity.
@@ -629,10 +663,14 @@ def main():
 
         # Get metric & statistics for vms and nodes.
         node_statistics = get_node_statistics(api_object, ignore_nodes)
-        vm_statistics = get_vm_statistics(api_object, ignore_vms)
+        vm_statistics   = get_vm_statistics(api_object, ignore_vms)
+        node_statistics = update_node_statistics(node_statistics, vm_statistics)
+
+        print(node_statistics)
+        sys.exit(1)
 
         # Calculate rebalancing of vms.
-        node_statistics_rebalanced, vm_statistics_rebalanced = balancing_calculations(balancing_method, node_statistics, vm_statistics, balanciness)
+        #node_statistics_rebalanced, vm_statistics_rebalanced = balancing_calculations(balancing_method, balancing_mode, node_statistics, vm_statistics, balanciness)
 
         # Rebalance vms to new nodes within the cluster.
         run_vm_rebalancing(api_object, vm_statistics_rebalanced, app_args)
@@ -645,4 +683,4 @@ def main():
 
 
 if __name__ == '__main__':
-    main()
+    main()
\ No newline at end of file
diff --git a/proxlb.conf b/proxlb.conf
index 22bd2e7..fc4c3d5 100644
--- a/proxlb.conf
+++ b/proxlb.conf
@@ -5,6 +5,7 @@ api_pass: FooBar
 verify_ssl: 1
 [balancing]
 method: memory
+mode: used
 ignore_nodes: dummynode01,dummynode02
 ignore_vms: testvm01,testvm02
 [service]