Skip to content

Commit

Permalink
PWX-36167: Duplicate offline nodes should be excluded for pod eviction
Browse files Browse the repository at this point in the history
           if another online node exists with same scheduler ID.

(cherry picked from commit ee5a84e)
  • Loading branch information
diptiranjanpx committed Sep 17, 2024
1 parent 30aea6a commit b1b1d8e
Show file tree
Hide file tree
Showing 2 changed files with 84 additions and 2 deletions.
13 changes: 11 additions & 2 deletions drivers/volume/volume.go
Original file line number Diff line number Diff line change
Expand Up @@ -650,23 +650,26 @@ func IsNodeMatch(k8sNode *v1.Node, driverNode *NodeInfo) bool {
}

// RemoveDuplicateOfflineNodes Removes duplicate offline nodes from the list which have
// the same IP as an online node
// the same IP or same scheduler name as an online node
func RemoveDuplicateOfflineNodes(nodes []*NodeInfo) []*NodeInfo {
updatedNodes := make([]*NodeInfo, 0)
offlineNodes := make([]*NodeInfo, 0)
onlineIPs := make([]string, 0)
onlineSchedulerIDs := make([]string, 0)
// First add the online nodes to the list
for _, node := range nodes {
if node.Status == NodeOnline {
updatedNodes = append(updatedNodes, node)
onlineIPs = append(onlineIPs, node.IPs...)
onlineSchedulerIDs = append(onlineSchedulerIDs, node.SchedulerID)
} else {
offlineNodes = append(offlineNodes, node)
}
}

// Then go through the offline nodes and ignore any which have
// the same IP as an online node
// the same IP as an online node and also ignore any which have the
// same scheduler ID as an online node
for _, offlineNode := range offlineNodes {
found := false
for _, offlineIP := range offlineNode.IPs {
Expand All @@ -677,6 +680,12 @@ func RemoveDuplicateOfflineNodes(nodes []*NodeInfo) []*NodeInfo {
}
}
}
for _, onlineSchedulerID := range onlineSchedulerIDs {
if offlineNode.SchedulerID == onlineSchedulerID {
found = true
break
}
}
if !found {
updatedNodes = append(updatedNodes, offlineNode)
}
Expand Down
73 changes: 73 additions & 0 deletions pkg/monitor/monitor_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,8 @@ func TestMonitor(t *testing.T) {

func TestMonitorOfflineNodes(t *testing.T) {
t.Run("testOfflineStorageNode", testOfflineStorageNode)
t.Run("testDuplicateOfflineWithSameSchedulerID", testDuplicateOfflineWithSameSchedulerID)
t.Run("testDuplicateOfflineWithSameIPAddress", testDuplicateOfflineWithSameIPAddress)
t.Run("testOfflineStorageNodeForCSIExtPod", testOfflineStorageNodeForCSIExtPod)
t.Run("testOfflineStorageNodesBatchTest", testOfflineStorageNodesBatchTest)
}
Expand Down Expand Up @@ -516,6 +518,76 @@ func testOfflineStorageNode(t *testing.T) {
require.NoError(t, err, "expected no error from get pod as pod should not be deleted")
}

func testDuplicateOfflineWithSameSchedulerID(test *testing.T) {
setupWithNewMockDriver(test)

defer teardownWithNewMockDriver(test)

driverNodes := getDriverNodes(len(nodes.Items))
driverNodes[0].Status = volume.NodeOffline
// Mark the last node as same scheduler ID as nodeForPod but the status should be online
duplicateOnlineNodeIndex := len(driverNodes) - 1
driverNodes[duplicateOnlineNodeIndex].Status = volume.NodeOnline
driverNodes[duplicateOnlineNodeIndex].SchedulerID = driverNodes[0].SchedulerID
volumeDriver.EXPECT().GetNodes().Return(driverNodes, nil).AnyTimes()
volumes := []*volume.Info{
{
VolumeName: "volume1",
DataNodes: []string{driverNodes[0].StorageID},
},
}

pod := newPod("driverPod", []string{driverVolumeName}, nodeForPod)
_, err := core.Instance().CreatePod(pod)
require.NoError(test, err, "failed to create pod")

wffcVolumes := make([]*volume.Info, 0)

volumeDriver.EXPECT().InspectNode("node1").Return(driverNodes[0], nil).AnyTimes()
volumeDriver.EXPECT().GetCSIPodPrefix().Return("px-csi-ext-", nil).AnyTimes()
volumeDriver.EXPECT().GetPodVolumes(&pod.Spec, pod.Namespace, false).Return(volumes, wffcVolumes, nil).AnyTimes()

testNodeOfflineTimeout = 95 * time.Second
time.Sleep(testNodeOfflineTimeout)
_, err = core.Instance().GetPodByName(pod.Name, "")
require.NoError(test, err, "pod should not get deleted as there is another node with same scheduler ID which is online")
}

func testDuplicateOfflineWithSameIPAddress(test *testing.T) {
setupWithNewMockDriver(test)

defer teardownWithNewMockDriver(test)

driverNodes := getDriverNodes(len(nodes.Items))
driverNodes[0].Status = volume.NodeOffline
// Mark the last node as same IP as nodeForPod but the status should be online
duplicateOnlineNodeIndex := len(driverNodes) - 1
driverNodes[duplicateOnlineNodeIndex].Status = volume.NodeOnline
driverNodes[duplicateOnlineNodeIndex].IPs = driverNodes[0].IPs
volumeDriver.EXPECT().GetNodes().Return(driverNodes, nil).AnyTimes()
volumes := []*volume.Info{
{
VolumeName: "volume1",
DataNodes: []string{driverNodes[0].StorageID},
},
}

pod := newPod("driverPod", []string{driverVolumeName}, nodeForPod)
_, err := core.Instance().CreatePod(pod)
require.NoError(test, err, "failed to create pod")

wffcVolumes := make([]*volume.Info, 0)

volumeDriver.EXPECT().InspectNode("node1").Return(driverNodes[0], nil).AnyTimes()
volumeDriver.EXPECT().GetCSIPodPrefix().Return("px-csi-ext-", nil).AnyTimes()
volumeDriver.EXPECT().GetPodVolumes(&pod.Spec, pod.Namespace, false).Return(volumes, wffcVolumes, nil).AnyTimes()

testNodeOfflineTimeout = 95 * time.Second
time.Sleep(testNodeOfflineTimeout)
_, err = core.Instance().GetPodByName(pod.Name, "")
require.NoError(test, err, "pod should not get deleted as there is another node with same IP which is online")
}

func testOfflineStorageNodesBatchTest(t *testing.T) {

setupWithNewMockDriverScale(t)
Expand Down Expand Up @@ -903,6 +975,7 @@ func getDriverNodes(numNodes int) []*volume.NodeInfo {
StorageID: "node" + strconv.Itoa(i+1),
Status: volume.NodeOnline,
RawStatus: "Ready",
IPs: []string{"192.168.0." + strconv.Itoa(i+1)},
})
}
return driverNodes
Expand Down

0 comments on commit b1b1d8e

Please sign in to comment.