From 45c3befc88ed88f0425997d795af8ec08776fa9a Mon Sep 17 00:00:00 2001
From: "Florian Paul Azim Hoberg (@gyptazy)"
Date: Fri, 30 Aug 2024 08:09:57 +0200
Subject: [PATCH] fix: Fix anti-affinity rules not evaluating a new and different node correctly.

Fixes: #67
---
 .../1.0.3/67_fix_anti_affinity_rules.yml |  2 +
 proxlb                                   | 87 +++++++++++++------
 2 files changed, 61 insertions(+), 28 deletions(-)
 create mode 100644 .changelogs/1.0.3/67_fix_anti_affinity_rules.yml

diff --git a/.changelogs/1.0.3/67_fix_anti_affinity_rules.yml b/.changelogs/1.0.3/67_fix_anti_affinity_rules.yml
new file mode 100644
index 0000000..b666449
--- /dev/null
+++ b/.changelogs/1.0.3/67_fix_anti_affinity_rules.yml
@@ -0,0 +1,2 @@
+fixed:
+  - Fix anti-affinity rules not evaluating a new and different node. [#67]
diff --git a/proxlb b/proxlb
index 57a80f3..22a4e0d 100755
--- a/proxlb
+++ b/proxlb
@@ -751,7 +751,10 @@ def __get_vm_tags(api_object, node, vmid, balancing_type):
         if balancing_type == 'ct':
             vm_config = api_object.nodes(node['node']).lxc(vmid).config.get()

-    logging.info(f'{info_prefix} Got VM/CT tag from API.')
+    if vm_config.get("tags", None) is None:
+        logging.info(f'{info_prefix} Got no VM/CT tag for VM {vm_config.get("name", None)} from API.')
+    else:
+        logging.info(f'{info_prefix} Got VM/CT tag {vm_config.get("tags", None)} for VM {vm_config.get("name", None)} from API.')
     return vm_config.get('tags', None)


@@ -769,8 +772,16 @@ def __get_proxlb_groups(vm_tags):
             logging.info(f'{info_prefix} Got PLB include group.')
             group_include = group

-        if group.startswith('plb_exclude_'):
+        if group.startswith('plb_affinity_'):
             logging.info(f'{info_prefix} Got PLB include group.')
+            group_include = group
+
+        if group.startswith('plb_exclude_'):
+            logging.info(f'{info_prefix} Got PLB exclude group.')
+            group_exclude = group
+
+        if group.startswith('plb_antiaffinity_'):
+            logging.info(f'{info_prefix} Got PLB exclude group.')
             group_exclude = group

         if group.startswith('plb_ignore_vm'):
@@ -1005,43 +1016,63 @@ def __get_vm_tags_include_groups(vm_statistics, node_statistics, balancing_metho

 def __get_vm_tags_exclude_groups(vm_statistics, node_statistics, balancing_method, balancing_mode):
     """ Get VMs tags for exclude groups. """
-    info_prefix = 'Info: [rebalancing-tags-group-exclude]:'
+    info_prefix = 'Info: [rebalancing-tags-group-exclude]:'
     tags_exclude_vms = {}
-    processed_vm = []

     # Create groups of tags with belongings hosts.
     for vm_name, vm_values in vm_statistics.items():
-        if vm_values.get('group_include', None):
-            if not tags_exclude_vms.get(vm_values['group_include'], None):
-                tags_exclude_vms[vm_values['group_include']] = [vm_name]
+        if vm_values.get('group_exclude', None):
+            if not tags_exclude_vms.get(vm_values['group_exclude'], None):
+                tags_exclude_vms[vm_values['group_exclude']] = {}
+                tags_exclude_vms[vm_values['group_exclude']]['nodes_used'] = []
+                tags_exclude_vms[vm_values['group_exclude']]['nodes_used'].append(vm_statistics[vm_name]['node_rebalance'])
+                tags_exclude_vms[vm_values['group_exclude']]['vms'] = [vm_name]
             else:
-                tags_exclude_vms[vm_values['group_include']] = tags_exclude_vms[vm_values['group_include']] + [vm_name]
+                tags_exclude_vms[vm_values['group_exclude']]['vms'] = tags_exclude_vms[vm_values['group_exclude']]['vms'] + [vm_name]
+                tags_exclude_vms[vm_values['group_exclude']]['nodes_used'].append(vm_statistics[vm_name]['node_rebalance'])

-    # Update the VMs to the corresponding node to their group assignments.
-    for group, vm_names in tags_exclude_vms.items():
-        # Do not take care of tags that have only a single host included.
-        if len(vm_names) < 2:
-            logging.info(f'{info_prefix} Only one host in group assignment.')
-            return node_statistics, vm_statistics
+    # Evaluate all VMs assigned to each exclude group and validate that they will be moved to another random node.
+    # However, if there are more VMs in a group than available nodes, we still need to deal with it.
+    for exclude_group, group_values in tags_exclude_vms.items():
+        for vm in group_values['vms']:
+            if vm_statistics[vm]['node_rebalance'] in group_values['nodes_used']:

-        vm_node_rebalance = False
-        logging.info(f'{info_prefix} Create exclude groups of VM hosts.')
-        for vm_name in vm_names:
-            if vm_name not in processed_vm:
-                if not vm_node_rebalance:
-                    random_node = vm_statistics[vm_name]['node_parent']
-                    # Get a random node and make sure that it is not by accident the
-                    # currently assigned one.
-                    while random_node == vm_statistics[vm_name]['node_parent']:
-                        random_node = random.choice(list(node_statistics.keys()))
-                else:
-                    _mocked_vm_object = (vm_name, vm_statistics[vm_name])
-                    node_statistics, vm_statistics = __update_vm_resource_statistics(_mocked_vm_object, [random_node], vm_statistics, node_statistics, balancing_method, balancing_mode)
-                    processed_vm.append(vm_name)
+                logging.info(f'{info_prefix} Rebalancing of VM {vm} is needed due to anti-affinity group policy.')
+                counter = 0
+                proceed = True
+
+                while proceed:
+                    random_node, counter = __get_random_node(counter, node_statistics)
+                    if random_node not in group_values['nodes_used']:
+                        group_values['nodes_used'].append(random_node)
+                        proceed = False
+
+                if random_node:
+                    logging.info(f'{info_prefix} Rebalancing VM {vm} to node {random_node} due to anti-affinity group policy.')
+                    vm_statistics[vm]['node_rebalance'] = random_node
+
+            else:
+                logging.info(f'{info_prefix} No rebalancing for VM {vm} needed due to anti-affinity group policies.')

     return node_statistics, vm_statistics


+def __get_random_node(counter, node_statistics):
+    """ Get a random node within the Proxmox cluster. """
+    warning_prefix = 'Warning: [random-node-getter]:'
+    info_prefix = 'Info: [random-node-getter]:'
+
+    counter = counter + 1
+    random_node = None
+    if counter < 30:
+        random_node = random.choice(list(node_statistics.keys()))
+        logging.info(f'{info_prefix} New random node {random_node} evaluated in run {counter}.')
+        return random_node, counter
+    else:
+        logging.warning(f'{warning_prefix} Reached limit for random node evaluation. Unable to find a suitable new node.')
+        return random_node, counter
+
+
 def __wait_job_finalized(api_object, node_name, job_id, counter):
     """ Wait for a job to be finalized. """
     error_prefix = 'Error: [job-status-getter]:'
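
Note (not part of the patch): the technique the patch introduces is to group VMs by their plb_antiaffinity_* tag, remember which nodes the group already occupies, and retry random node picks (bounded to 30 attempts) until an unused node is found. The standalone sketch below illustrates that idea in simplified form; the function and variable names (pick_new_node, apply_anti_affinity, the sample data) are illustrative only and do not exist in proxlb.

import random


def pick_new_node(nodes, used_nodes, max_tries=30):
    """Pick a random node not already used by the anti-affinity group.

    Returns None if no suitable node is found within max_tries attempts,
    mirroring the bounded retry loop of __get_random_node() in the patch.
    """
    for _ in range(max_tries):
        candidate = random.choice(nodes)
        if candidate not in used_nodes:
            return candidate
    return None


def apply_anti_affinity(vms, nodes):
    """Spread VMs of the same anti-affinity group across different nodes.

    vms maps a VM name to {'node': ..., 'group': ...}; this is a simplified
    stand-in for proxlb's vm_statistics dictionary.
    """
    used_by_group = {}
    for name, meta in vms.items():
        used = used_by_group.setdefault(meta['group'], set())
        if meta['node'] in used:
            # Conflict: another VM of this group already sits on that node.
            new_node = pick_new_node(nodes, used)
            if new_node:
                meta['node'] = new_node
        used.add(meta['node'])
    return vms


if __name__ == '__main__':
    nodes = ['node01', 'node02', 'node03']
    vms = {
        'web01': {'node': 'node01', 'group': 'plb_antiaffinity_web'},
        'web02': {'node': 'node01', 'group': 'plb_antiaffinity_web'},
        'web03': {'node': 'node01', 'group': 'plb_antiaffinity_web'},
    }
    print(apply_anti_affinity(vms, nodes))

In proxlb itself the group membership comes from VM tags: tagging VMs with the same plb_antiaffinity_<name> tag in Proxmox marks them as an exclude group, which __get_proxlb_groups() parses and __get_vm_tags_exclude_groups() then rebalances as shown in the patch.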