From 9eb2d6860b6ed403c84925d1427b4f4916559c71 Mon Sep 17 00:00:00 2001 From: Andreas Auernhammer Date: Wed, 29 Jan 2025 22:56:16 +0100 Subject: [PATCH] vault: limit token delay to not exceed token TTL This commit fixes a bug in the Vault token renewal. When KES renews its current token `T1` and receives a new token `T2`, KES waits a certain amount of time before using `T2` to account for replication lag between Vault nodes. Vault tokens are not signed but static secrets. Each Vault node in a distributed cluster needs to know the token before being able to verify it. Without the usage delay described above, KES might send a request to a Vault node that has not received the new token `T2` yet. Now, KES must also not wait longer than the remaining TTL of the current token `T1`. Otherwise, `T1` expires BEFORE KES starts using `T2`. This results in auth errors like the following: ``` Error making API request.\n\nURL: GET http://127.0.0.1:8200/v1/kv/data/my-key3\nCode: 403. Errors:\n\n* 2 errors occurred:\n\t* permission denied\n\t* invalid token\n\n" req.method=POST req.path=/v1/key/create/my-key3 req.ip=127.0.0.1 req.identity=a49fea12c5d1c69f1eba1e4697e62ccdbe389a80f317191892711b47d83c3e85 ``` This commit limits the maximum time KES waits before using the new token `T2` to either half the remaining TTL of `T1` or 30s - whichever is shorter. This ensures that `T1` is still valid once KES switches to `T2`. 
Signed-off-by: Andreas Auernhammer --- internal/keystore/vault/client.go | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/internal/keystore/vault/client.go b/internal/keystore/vault/client.go index 9df392da..6034117a 100644 --- a/internal/keystore/vault/client.go +++ b/internal/keystore/vault/client.go @@ -7,6 +7,7 @@ package vault import ( "context" "errors" + "net/http" "os" "path" "strings" @@ -195,8 +196,10 @@ func (c *client) RenewToken(ctx context.Context, authenticate authFunc, secret * continue } - renewIn := 80 * (ttl / 100) // Renew token after 80% of its TTL has passed - ttl = 0 // Set TTL to zero to trigger an immediate re-authentication in case of auth failure + renewIn := 80 * (ttl / 100) // Renew token after 80% of its TTL has passed + delay := min((ttl-renewIn)/2, Delay) // Delay usage of renewed token but not beyond expiry + ttl = 0 + select { case <-ctx.Done(): return @@ -210,6 +213,9 @@ func (c *client) RenewToken(ctx context.Context, authenticate authFunc, secret * if err == nil { break } + if resp, ok := err.(*vaultapi.ResponseError); ok && http.StatusBadRequest <= resp.StatusCode && resp.StatusCode < http.StatusInternalServerError { + break // Don't retry on 4xx responses + } } if s == nil { s, _ = authenticate(ctx) @@ -225,10 +231,12 @@ func (c *client) RenewToken(ctx context.Context, authenticate authFunc, secret * // Wait before we use the new auth. token. This accounts // for replication lag between the Vault nodes and allows // them to sync the token across the entire cluster. + // However, we must not wait longer than the remaining lifetime + // of the currently used token. select { case <-ctx.Done(): return - case <-time.After(Delay): + case <-time.After(delay): } c.SetToken(token) // SetToken is safe to call from different go routines }