From 2da72d793c6f1483370b6b2d27a22c19c466be64 Mon Sep 17 00:00:00 2001 From: Flavio Crisciani Date: Mon, 12 Jun 2017 15:11:10 -0700 Subject: [PATCH] Fix sandbox cleanup Driver and Sandbox have 2 different stores where the endpoints are saved. It is possible that the 2 stores go out of sync if the endpoint is added to the driver but there is a crash before the sandbox join. On restart now we take the list of endpoints from the network and we assign them back to the sandbox. Signed-off-by: Flavio Crisciani --- endpoint.go | 9 ++++---- sandbox_store.go | 57 ++++++++++++++++++++++++++++++++---------------- 2 files changed, 43 insertions(+), 23 deletions(-) diff --git a/endpoint.go b/endpoint.go index 111b747352..46330686ca 100644 --- a/endpoint.go +++ b/endpoint.go @@ -822,10 +822,6 @@ func (ep *endpoint) Delete(force bool) error { } } - if err = n.getController().deleteFromStore(ep); err != nil { - return err - } - defer func() { if err != nil && !force { ep.dbExists = false @@ -842,6 +838,11 @@ func (ep *endpoint) Delete(force bool) error { return err } + // This has to come after the sandbox and the driver to guarantee that the store can be the source of truth on restart cases + if err = n.getController().deleteFromStore(ep); err != nil { + return err + } + ep.releaseAddress() if err := n.getEpCnt().DecEndpointCnt(); err != nil { diff --git a/sandbox_store.go b/sandbox_store.go index 38b2bd7e8b..bfc1ccd2dd 100644 --- a/sandbox_store.go +++ b/sandbox_store.go @@ -3,7 +3,6 @@ package libnetwork import ( "container/heap" "encoding/json" - "sync" "github.com/Sirupsen/logrus" "github.com/docker/libnetwork/datastore" @@ -210,6 +209,40 @@ func (c *controller) sandboxCleanup(activeSandboxes map[string]interface{}) { return } + // Get all the endpoints + // Use the network as the source of truth so that if there was an issue before the sandbox registered the endpoint + // this will be taken anyway + endpointsInSandboxID := map[string][]*endpoint{} + nl, err := 
c.getNetworksForScope(datastore.LocalScope) + if err != nil { + logrus.Warnf("Could not get list of networks during sandbox cleanup: %v", err) + return + } + + for _, n := range nl { + var epl []*endpoint + epl, err = n.getEndpointsFromStore() + if err != nil { + logrus.Warnf("Could not get list of endpoints in network %s during sandbox cleanup: %v", n.name, err) + continue + } + for _, ep := range epl { + ep, err = n.getEndpointFromStore(ep.id) + if err != nil { + logrus.Warnf("Could not get endpoint in network %s during sandbox cleanup: %v", n.name, err) + continue + } + if ep.sandboxID == "" { + logrus.Warnf("Endpoint %s not associated to any sandbox, deleting it", ep.id) + ep.Delete(true) + continue + } + + // Append the endpoint to the corresponding sandboxID + endpointsInSandboxID[ep.sandboxID] = append(endpointsInSandboxID[ep.sandboxID], ep) + } + } + for _, kvo := range kvol { sbs := kvo.(*sbState) @@ -256,25 +289,11 @@ func (c *controller) sandboxCleanup(activeSandboxes map[string]interface{}) { c.sandboxes[sb.id] = sb c.Unlock() - for _, eps := range sbs.Eps { - n, err := c.getNetworkFromStore(eps.Nid) - var ep *endpoint - if err != nil { - logrus.Errorf("getNetworkFromStore for nid %s failed while trying to build sandbox for cleanup: %v", eps.Nid, err) - n = &network{id: eps.Nid, ctrlr: c, drvOnce: &sync.Once{}, persist: true} - ep = &endpoint{id: eps.Eid, network: n, sandboxID: sbs.ID} - } else { - ep, err = n.getEndpointFromStore(eps.Eid) - if err != nil { - logrus.Errorf("getEndpointFromStore for eid %s failed while trying to build sandbox for cleanup: %v", eps.Eid, err) - ep = &endpoint{id: eps.Eid, network: n, sandboxID: sbs.ID} - } - } - if _, ok := activeSandboxes[sb.ID()]; ok && err != nil { - logrus.Errorf("failed to restore endpoint %s in %s for container %s due to %v", eps.Eid, eps.Nid, sb.ContainerID(), err) - continue + // Restore all the endpoints that are supposed to be in this sandbox + if eps, ok := endpointsInSandboxID[sb.id]; ok { + 
for _, ep := range eps { + heap.Push(&sb.endpoints, ep) } - heap.Push(&sb.endpoints, ep) } if _, ok := activeSandboxes[sb.ID()]; !ok {