Skip to content

Commit

Permalink
make stream shutdown if self-node has been removed (#2125)
Browse files Browse the repository at this point in the history
* add shutdown helper that asserts that headscale did not panic

Signed-off-by: Kristoffer Dalby <[email protected]>

* add test case producing 2118 panic

Signed-off-by: Kristoffer Dalby <[email protected]>

* make stream shutdown if self-node has been removed

Currently we read the node from the database, and since it has been
deleted, the ID might be set to nil. Keep the node around and
just shut down, so it is cleanly removed from the notifier.

Fixes #2118

Signed-off-by: Kristoffer Dalby <[email protected]>

---------

Signed-off-by: Kristoffer Dalby <[email protected]>
  • Loading branch information
kradalby authored Sep 11, 2024
1 parent 4b02dc9 commit 64319f7
Show file tree
Hide file tree
Showing 8 changed files with 148 additions and 21 deletions.
1 change: 1 addition & 0 deletions .github/workflows/test-integration.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@ jobs:
- TestExpireNode
- TestNodeOnlineStatus
- TestPingAllByIPManyUpDown
- Test2118DeletingOnlineNodePanics
- TestEnablingRoutes
- TestHASubnetRouterFailover
- TestEnableDisableAutoApprovedRoute
Expand Down
7 changes: 7 additions & 0 deletions hscontrol/poll.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ import (
"fmt"
"math/rand/v2"
"net/http"
"slices"
"sort"
"strings"
"time"
Expand Down Expand Up @@ -273,6 +274,12 @@ func (m *mapSession) serveLongPoll() {
return
}

// If the node has been removed from headscale, close the stream
if slices.Contains(update.Removed, m.node.ID) {
m.tracef("node removed, closing stream")
return
}

m.tracef("received stream update: %s %s", update.Type.String(), update.Message)
mapResponseUpdateReceived.WithLabelValues(update.Type.String()).Inc()

Expand Down
4 changes: 2 additions & 2 deletions integration/control.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,8 @@ import (
)

type ControlServer interface {
Shutdown() error
SaveLog(string) error
Shutdown() (string, string, error)
SaveLog(string) (string, string, error)
SaveProfile(string) error
Execute(command []string) (string, error)
WriteFile(path string, content []byte) error
Expand Down
18 changes: 10 additions & 8 deletions integration/dockertestutil/logs.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,10 +17,10 @@ func SaveLog(
pool *dockertest.Pool,
resource *dockertest.Resource,
basePath string,
) error {
) (string, string, error) {
err := os.MkdirAll(basePath, os.ModePerm)
if err != nil {
return err
return "", "", err
}

var stdout bytes.Buffer
Expand All @@ -41,28 +41,30 @@ func SaveLog(
},
)
if err != nil {
return err
return "", "", err
}

log.Printf("Saving logs for %s to %s\n", resource.Container.Name, basePath)

stdoutPath := path.Join(basePath, resource.Container.Name+".stdout.log")
err = os.WriteFile(
path.Join(basePath, resource.Container.Name+".stdout.log"),
stdoutPath,
stdout.Bytes(),
filePerm,
)
if err != nil {
return err
return "", "", err
}

stderrPath := path.Join(basePath, resource.Container.Name+".stderr.log")
err = os.WriteFile(
path.Join(basePath, resource.Container.Name+".stderr.log"),
stderrPath,
stderr.Bytes(),
filePerm,
)
if err != nil {
return err
return "", "", err
}

return nil
return stdoutPath, stderrPath, nil
}
99 changes: 99 additions & 0 deletions integration/general_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -954,3 +954,102 @@ func TestPingAllByIPManyUpDown(t *testing.T) {
t.Logf("%d successful pings out of %d", success, len(allClients)*len(allIps))
}
}

// Test2118DeletingOnlineNodePanics is a regression test for
// https://github.com/juanfont/headscale/issues/2118: deleting a node that
// is online (i.e. still has an open long-poll update stream) must not
// panic headscale. The scenario is torn down with ShutdownAssertNoPanics,
// which fails the test if "panic" appears in the control server logs.
func Test2118DeletingOnlineNodePanics(t *testing.T) {
	IntegrationSkip(t)
	t.Parallel()

	scenario, err := NewScenario(dockertestMaxWait())
	assertNoErr(t, err)
	// Unlike plain Shutdown, this also scans the saved control-server
	// stdout/stderr for panics after teardown.
	defer scenario.ShutdownAssertNoPanics(t)

	// TODO(kradalby): it does not look like the user thing works, only second
	// get created? maybe only when many?
	spec := map[string]int{
		"user1": 1,
		"user2": 1,
	}

	err = scenario.CreateHeadscaleEnv(spec,
		[]tsic.Option{},
		hsic.WithTestName("deletenocrash"),
		hsic.WithEmbeddedDERPServerOnly(),
		hsic.WithTLS(),
		hsic.WithHostnameAsServerURL(),
	)
	assertNoErrHeadscaleEnv(t, err)

	allClients, err := scenario.ListTailscaleClients()
	assertNoErrListClients(t, err)

	allIps, err := scenario.ListTailscaleClientsIPs()
	assertNoErrListClientIPs(t, err)

	err = scenario.WaitForTailscaleSync()
	assertNoErrSync(t, err)

	allAddrs := lo.Map(allIps, func(x netip.Addr, index int) string {
		return x.String()
	})

	// Establish full connectivity so both nodes are online with active
	// streams before one of them is deleted.
	success := pingAllHelper(t, allClients, allAddrs)
	t.Logf("%d successful pings out of %d", success, len(allClients)*len(allIps))

	headscale, err := scenario.Headscale()
	assertNoErr(t, err)

	// Test list all nodes after added otherUser
	var nodeList []v1.Node
	err = executeAndUnmarshal(
		headscale,
		[]string{
			"headscale",
			"nodes",
			"list",
			"--output",
			"json",
		},
		&nodeList,
	)
	assert.Nil(t, err)
	assert.Len(t, nodeList, 2)
	assert.True(t, nodeList[0].Online)
	assert.True(t, nodeList[1].Online)

	// Delete the first node, which is online
	_, err = headscale.Execute(
		[]string{
			"headscale",
			"nodes",
			"delete",
			"--identifier",
			// Delete the first node in the listing (which is online).
			fmt.Sprintf("%d", nodeList[0].Id),
			"--output",
			"json",
			"--force",
		},
	)
	assert.Nil(t, err)

	// Give headscale time to process the deletion and close the stream;
	// prior to the fix this is where the panic would occur.
	time.Sleep(2 * time.Second)

	// Ensure that the node has been deleted, this did not occur due to a panic.
	var nodeListAfter []v1.Node
	err = executeAndUnmarshal(
		headscale,
		[]string{
			"headscale",
			"nodes",
			"list",
			"--output",
			"json",
		},
		&nodeListAfter,
	)
	assert.Nil(t, err)
	assert.Len(t, nodeListAfter, 1)
	assert.True(t, nodeListAfter[0].Online)
	// The surviving node must be the one that was NOT deleted.
	assert.Equal(t, nodeList[1].Id, nodeListAfter[0].Id)

}
8 changes: 4 additions & 4 deletions integration/hsic/hsic.go
Original file line number Diff line number Diff line change
Expand Up @@ -398,8 +398,8 @@ func (t *HeadscaleInContainer) hasTLS() bool {
}

// Shutdown stops and cleans up the Headscale container.
func (t *HeadscaleInContainer) Shutdown() error {
err := t.SaveLog("/tmp/control")
func (t *HeadscaleInContainer) Shutdown() (string, string, error) {
stdoutPath, stderrPath, err := t.SaveLog("/tmp/control")
if err != nil {
log.Printf(
"Failed to save log from control: %s",
Expand Down Expand Up @@ -458,12 +458,12 @@ func (t *HeadscaleInContainer) Shutdown() error {
t.pool.Purge(t.pgContainer)
}

return t.pool.Purge(t.container)
return stdoutPath, stderrPath, t.pool.Purge(t.container)
}

// SaveLog saves the current stdout log of the container to a path
// on the host system.
func (t *HeadscaleInContainer) SaveLog(path string) error {
func (t *HeadscaleInContainer) SaveLog(path string) (string, string, error) {
return dockertestutil.SaveLog(t.pool, t.container, path)
}

Expand Down
28 changes: 22 additions & 6 deletions integration/scenario.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ import (
"os"
"sort"
"sync"
"testing"
"time"

v1 "github.com/juanfont/headscale/gen/go/headscale/v1"
Expand All @@ -18,6 +19,7 @@ import (
"github.com/ory/dockertest/v3"
"github.com/puzpuzpuz/xsync/v3"
"github.com/samber/lo"
"github.com/stretchr/testify/assert"
"golang.org/x/sync/errgroup"
"tailscale.com/envknob"
)
Expand Down Expand Up @@ -187,20 +189,26 @@ func NewScenario(maxWait time.Duration) (*Scenario, error) {
}, nil
}

// Shutdown shuts down and cleans up all the containers (ControlServer, TailscaleClient)
// and networks associated with it.
// In addition, it will save the logs of the ControlServer to `/tmp/control` in the
// environment running the tests.
func (s *Scenario) Shutdown() {
func (s *Scenario) ShutdownAssertNoPanics(t *testing.T) {
s.controlServers.Range(func(_ string, control ControlServer) bool {
err := control.Shutdown()
stdoutPath, stderrPath, err := control.Shutdown()
if err != nil {
log.Printf(
"Failed to shut down control: %s",
fmt.Errorf("failed to tear down control: %w", err),
)
}

if t != nil {
stdout, err := os.ReadFile(stdoutPath)
assert.NoError(t, err)
assert.NotContains(t, string(stdout), "panic")

stderr, err := os.ReadFile(stderrPath)
assert.NoError(t, err)
assert.NotContains(t, string(stderr), "panic")
}

return true
})

Expand All @@ -224,6 +232,14 @@ func (s *Scenario) Shutdown() {
// }
}

// Shutdown shuts down and cleans up all the containers (ControlServer, TailscaleClient)
// and networks associated with it.
// In addition, it will save the logs of the ControlServer to `/tmp/control` in the
// environment running the tests.
// Unlike ShutdownAssertNoPanics, it does not inspect the saved logs for
// panics (passing nil disables that check).
func (s *Scenario) Shutdown() {
	s.ShutdownAssertNoPanics(nil)
}

// Users returns the name of all users associated with the Scenario.
func (s *Scenario) Users() []string {
users := make([]string, 0)
Expand Down
4 changes: 3 additions & 1 deletion integration/tsic/tsic.go
Original file line number Diff line number Diff line change
Expand Up @@ -998,7 +998,9 @@ func (t *TailscaleInContainer) WriteFile(path string, data []byte) error {
// SaveLog saves the current stdout log of the container to a path
// on the host system.
func (t *TailscaleInContainer) SaveLog(path string) error {
return dockertestutil.SaveLog(t.pool, t.container, path)
// TODO(kradalby): Assert if tailscale logs contains panics.
_, _, err := dockertestutil.SaveLog(t.pool, t.container, path)
return err
}

// ReadFile reads a file from the Tailscale container.
Expand Down

0 comments on commit 64319f7

Please sign in to comment.