Skip to content

Commit

Permalink
Merge #131842
Browse files Browse the repository at this point in the history
131842: roachprod: implement fetchlogs r=herkolategan,renatolabs,DarrylWong a=nameisbhaskar

This PR implements the fetchlogs feature in roachprod. The logic previously existed only as a roachtest function, where it was used to fetch the logs of a roachtest cluster. Because collecting all the logs from a cluster is useful outside of roachtest as well, this PR moves that code into roachprod. The destination directory and the timeout for fetching the logs are configurable. Roachtest now invokes the roachprod FetchLogs implementation.

Epic: None
Release note: None

Co-authored-by: Bhaskarjyoti Bora <[email protected]>
  • Loading branch information
craig[bot] and nameisbhaskar committed Oct 10, 2024
2 parents ee33a3c + 1b6e6fd commit 8200dd3
Show file tree
Hide file tree
Showing 5 changed files with 111 additions and 42 deletions.
4 changes: 4 additions & 0 deletions pkg/cmd/roachprod/flags.go
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,8 @@ var (
fluentBitConfig fluentbit.Config

opentelemetryConfig opentelemetry.Config

fetchLogsTimeout time.Duration
)

func initFlags() {
Expand Down Expand Up @@ -486,4 +488,6 @@ func initFlags() {
"dashboard-uid", "", "grafana dashboard UID")
grafanaAnnotationCmd.Flags().Int64SliceVar(&grafanaTimeRange,
"time-range", []int64{}, "grafana annotation time range in epoch time")
fetchLogsCmd.Flags().DurationVarP(&fetchLogsTimeout,
"timeout", "t", 5*time.Minute, "Timeout for fetching the logs from the cluster nodes")
}
29 changes: 29 additions & 0 deletions pkg/cmd/roachprod/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -2010,6 +2010,7 @@ func main() {
fluentBitStopCmd,
opentelemetryStartCmd,
opentelemetryStopCmd,
fetchLogsCmd,
)
loadBalancerCmd.AddCommand(createLoadBalancerCmd)
loadBalancerCmd.AddCommand(loadBalancerPGUrl)
Expand Down Expand Up @@ -2080,3 +2081,31 @@ Node specification
os.Exit(1)
}
}

// fetchLogsCmd implements "roachprod fetchlogs" (alias "getlogs"). It accepts a
// cluster and an optional destination directory, makes sure the destination
// exists, and delegates the download to roachprod.FetchLogs, bounded by the
// fetchLogsTimeout flag value.
var fetchLogsCmd = &cobra.Command{
	Use:     "fetchlogs <cluster> <destination (optional)> [flags]",
	Aliases: []string{"getlogs"},
	Short:   "download the logs from the cluster",
	Long: `Download the logs from the cluster using "roachprod get".
The logs will be placed in the directory if specified or in the directory named as <clustername>_logs.
`,
	Args: cobra.RangeArgs(1, 2),
	Run: wrap(func(cmd *cobra.Command, args []string) error {
		cluster := args[0]
		ctx := context.Background()
		var dest string
		if len(args) == 2 {
			dest = args[1]
		} else {
			// No destination given: drop any ":<nodes>" suffix from the cluster
			// argument and use the bare cluster name as the directory prefix.
			dest = fmt.Sprintf("%s_logs", strings.Split(cluster, ":")[0])
			fmt.Printf("Placing logs at %s\n", dest)
		}
		// MkdirAll rather than Mkdir: the destination may already exist (for
		// example when the command is re-run) or may be a nested path whose
		// parents have not been created yet. Neither case should abort the fetch.
		if err := os.MkdirAll(dest, 0755); err != nil {
			return err
		}
		return roachprod.FetchLogs(ctx, config.Logger, cluster, dest,
			fetchLogsTimeout)
	}),
}
55 changes: 13 additions & 42 deletions pkg/cmd/roachtest/cluster.go
Original file line number Diff line number Diff line change
Expand Up @@ -1209,53 +1209,24 @@ func (c *clusterImpl) FetchLogs(ctx context.Context, l *logger.Logger) error {
return nil
}

l.Printf("fetching logs")
c.status("fetching logs")

// Don't hang forever if we can't fetch the logs.
return timeutil.RunWithTimeout(ctx, "fetch logs", 5*time.Minute, func(ctx context.Context) error {
// Find all log directories, which might include logs for
// external-process virtual clusters.
listLogDirsCmd := "find logs* -maxdepth 0 -type d"
results, err := c.RunWithDetails(ctx, l, option.WithNodes(c.All()), listLogDirsCmd)
if err != nil {
return err
}

logDirs := make(map[string]struct{})
for _, r := range results {
if r.Err != nil {
l.Printf("will not fetch logs for n%d due to error: %v", r.Node, r.Err)
}

for _, logDir := range strings.Fields(r.Stdout) {
logDirs[logDir] = struct{}{}
}
}
err := roachprod.FetchLogs(ctx, l, c.name, c.t.ArtifactsDir(), 5*time.Minute)

for logDir := range logDirs {
path := filepath.Join(c.t.ArtifactsDir(), logDir, "unredacted")
if err := os.MkdirAll(filepath.Dir(path), 0755); err != nil {
return err
}

if err := c.Get(ctx, c.l, logDir /* src */, path /* dest */); err != nil {
l.Printf("failed to fetch log directory %s: %v", logDir, err)
if ctx.Err() != nil {
return errors.Wrap(err, "cluster.FetchLogs")
}
}
var logFileFull string
if l.File != nil {
logFileFull = l.File.Name()
}
if err != nil {
if ctxErr := ctx.Err(); ctxErr != nil {
l.Printf("(note: incoming context was canceled: %s)", err)
return ctxErr
}

if err := c.RunE(ctx, option.WithNodes(c.All()), fmt.Sprintf("mkdir -p logs/redacted && %s debug merge-logs --redact logs/*.log > logs/redacted/combined.log", test.DefaultCockroachPath)); err != nil {
l.Printf("failed to redact logs: %v", err)
if ctx.Err() != nil {
return err
}
}
dest := filepath.Join(c.t.ArtifactsDir(), "logs/cockroach.log")
return errors.Wrap(c.Get(ctx, c.l, "logs/redacted/combined.log" /* src */, dest), "cluster.FetchLogs")
})
l.Printf("> result: %s", err)
createFailedFile(logFileFull)
}
return err
}

// saveDiskUsageToLogsDir collects a summary of the disk usage to logs/diskusage.txt on each node.
Expand Down
1 change: 1 addition & 0 deletions pkg/roachprod/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ go_library(
"//pkg/build",
"//pkg/cli/exit",
"//pkg/cmd/roachprod/grafana",
"//pkg/cmd/roachtest/test",
"//pkg/roachprod/cloud",
"//pkg/roachprod/config",
"//pkg/roachprod/fluentbit",
Expand Down
64 changes: 64 additions & 0 deletions pkg/roachprod/roachprod.go
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ import (
"github.com/cockroachdb/cockroach/pkg/build"
"github.com/cockroachdb/cockroach/pkg/cli/exit"
"github.com/cockroachdb/cockroach/pkg/cmd/roachprod/grafana"
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/test"
"github.com/cockroachdb/cockroach/pkg/roachprod/cloud"
"github.com/cockroachdb/cockroach/pkg/roachprod/config"
"github.com/cockroachdb/cockroach/pkg/roachprod/fluentbit"
Expand Down Expand Up @@ -2968,3 +2969,66 @@ func getClusterFromCloud(l *logger.Logger, clusterName string) (*cloud.Cluster,

return c, nil
}

// FetchLogs copies the log directories found on the nodes of the named cluster
// into the "destination" directory. It also asks the nodes to build a redacted,
// merged log and downloads it to <destination>/logs/cockroach.log.
// The whole operation is aborted once fetchLogsTimeout has elapsed.
func FetchLogs(
	ctx context.Context,
	l *logger.Logger,
	clusterName, destination string,
	fetchLogsTimeout time.Duration,
) error {
	c, err := getClusterFromCache(l, clusterName)
	if err != nil {
		return err
	}

	l.Printf("fetching logs")

	fetch := func(ctx context.Context) error {
		// Enumerate the log directories on every node; besides "logs" this may
		// include directories for external-process virtual clusters.
		const listLogDirsCmd = "find logs* -maxdepth 0 -type d"
		nodeResults, err := c.RunWithDetails(ctx, l, install.WithNodes(c.Nodes), "", listLogDirsCmd)
		if err != nil {
			return err
		}

		// Collect the distinct directory names reported across all nodes.
		remoteDirs := make(map[string]struct{})
		for _, res := range nodeResults {
			if res.Err != nil {
				l.Printf("will not fetch logs for n%d due to error: %v", res.Node, res.Err)
			}

			for _, name := range strings.Fields(res.Stdout) {
				remoteDirs[name] = struct{}{}
			}
		}

		// Download each directory into <destination>/<dir>/unredacted. A failed
		// download is only logged, unless the context is already done.
		for remoteDir := range remoteDirs {
			localPath := filepath.Join(destination, remoteDir, "unredacted")
			if mkErr := os.MkdirAll(filepath.Dir(localPath), 0755); mkErr != nil {
				return mkErr
			}

			getErr := c.Get(ctx, l, c.Nodes, remoteDir /* src */, localPath /* dest */)
			if getErr == nil {
				continue
			}
			l.Printf("failed to fetch log directory %s: %v", remoteDir, getErr)
			if ctx.Err() != nil {
				return errors.Wrap(getErr, "cluster.FetchLogs")
			}
		}

		// Build the redacted, merged log on the nodes. Failure here is logged
		// and tolerated unless the context is already done.
		redactCmd := fmt.Sprintf(
			"mkdir -p logs/redacted && %s debug merge-logs --redact logs/*.log > logs/redacted/combined.log",
			test.DefaultCockroachPath,
		)
		if runErr := c.Run(ctx, l, l.Stdout, l.Stderr, install.WithNodes(c.Nodes), "", redactCmd); runErr != nil {
			l.Printf("failed to redact logs: %v", runErr)
			if ctx.Err() != nil {
				return runErr
			}
		}

		redactedDest := filepath.Join(destination, "logs/cockroach.log")
		combinedErr := c.Get(ctx, l, c.Nodes, "logs/redacted/combined.log" /* src */, redactedDest)
		return errors.Wrap(combinedErr, "cluster.FetchLogs")
	}

	// Don't hang forever if we can't fetch the logs.
	return timeutil.RunWithTimeout(ctx, "fetch logs", fetchLogsTimeout, fetch)
}

0 comments on commit 8200dd3

Please sign in to comment.