diff --git a/pkg/cmd/roachprod/flags.go b/pkg/cmd/roachprod/flags.go
index 1e07e73c85e5..4adc2b1aaf4c 100644
--- a/pkg/cmd/roachprod/flags.go
+++ b/pkg/cmd/roachprod/flags.go
@@ -103,6 +103,9 @@ var (
 	fluentBitConfig fluentbit.Config
 
 	opentelemetryConfig opentelemetry.Config
+
+	// fetchLogsTimeout holds the value of the fetchlogs --timeout flag.
+	fetchLogsTimeout time.Duration
 )
 
 func initFlags() {
@@ -486,4 +489,6 @@ func initFlags() {
 		"dashboard-uid", "", "grafana dashboard UID")
 	grafanaAnnotationCmd.Flags().Int64SliceVar(&grafanaTimeRange,
 		"time-range", []int64{}, "grafana annotation time range in epoch time")
+	fetchLogsCmd.Flags().DurationVarP(&fetchLogsTimeout,
+		"timeout", "t", 5*time.Minute, "Timeout for fetching the logs from the cluster nodes")
 }
diff --git a/pkg/cmd/roachprod/main.go b/pkg/cmd/roachprod/main.go
index f36f4d6e08f7..d57758663eeb 100644
--- a/pkg/cmd/roachprod/main.go
+++ b/pkg/cmd/roachprod/main.go
@@ -2010,6 +2010,7 @@ func main() {
 		fluentBitStopCmd,
 		opentelemetryStartCmd,
 		opentelemetryStopCmd,
+		fetchLogsCmd,
 	)
 	loadBalancerCmd.AddCommand(createLoadBalancerCmd)
 	loadBalancerCmd.AddCommand(loadBalancerPGUrl)
@@ -2080,3 +2081,36 @@ Node specification
 		os.Exit(1)
 	}
 }
+
+// fetchLogsCmd downloads the logs of a cluster into a local directory by
+// calling roachprod.FetchLogs.
+var fetchLogsCmd = &cobra.Command{
+	Use:     "fetchlogs <cluster> [destination] [flags]",
+	Aliases: []string{"getlogs"},
+	Short:   "download the logs from the cluster",
+	Long: `Download the logs from the cluster using "roachprod get".
+
+The logs will be placed in the directory if specified or in the directory named as <clustername>_logs.
+`,
+	Args: cobra.RangeArgs(1, 2),
+	Run: wrap(func(cmd *cobra.Command, args []string) error {
+		cluster := args[0]
+		ctx := context.Background()
+		var dest string
+		if len(args) == 2 {
+			dest = args[1]
+		} else {
+			// trim the node number and keep only the cluster name as prefix of the directory
+			clusterName, _, _ := strings.Cut(cluster, ":")
+			dest = fmt.Sprintf("%s_logs", clusterName)
+			fmt.Printf("Placing logs at %s\n", dest)
+		}
+		// MkdirAll tolerates a destination that already exists and creates any
+		// missing parent directories; Mkdir would fail in both cases.
+		if err := os.MkdirAll(dest, 0755); err != nil {
+			return err
+		}
+		return roachprod.FetchLogs(ctx, config.Logger, cluster, dest,
+			fetchLogsTimeout)
+	}),
+}
diff --git a/pkg/cmd/roachtest/cluster.go b/pkg/cmd/roachtest/cluster.go
index 060e19c71f43..2740ccbf7c65 100644
--- a/pkg/cmd/roachtest/cluster.go
+++ b/pkg/cmd/roachtest/cluster.go
@@ -1209,53 +1209,26 @@ func (c *clusterImpl) FetchLogs(ctx context.Context, l *logger.Logger) error {
 		return nil
 	}
 
-	l.Printf("fetching logs")
 	c.status("fetching logs")
 
-	// Don't hang forever if we can't fetch the logs.
-	return timeutil.RunWithTimeout(ctx, "fetch logs", 5*time.Minute, func(ctx context.Context) error {
-		// Find all log directories, which might include logs for
-		// external-process virtual clusters.
-		listLogDirsCmd := "find logs* -maxdepth 0 -type d"
-		results, err := c.RunWithDetails(ctx, l, option.WithNodes(c.All()), listLogDirsCmd)
-		if err != nil {
-			return err
-		}
-
-		logDirs := make(map[string]struct{})
-		for _, r := range results {
-			if r.Err != nil {
-				l.Printf("will not fetch logs for n%d due to error: %v", r.Node, r.Err)
-			}
-
-			for _, logDir := range strings.Fields(r.Stdout) {
-				logDirs[logDir] = struct{}{}
-			}
-		}
+	// The download itself is delegated to roachprod.FetchLogs, which the
+	// `roachprod fetchlogs` command calls as well.
+	err := roachprod.FetchLogs(ctx, l, c.name, c.t.ArtifactsDir(), 5*time.Minute)
 
-		for logDir := range logDirs {
-			path := filepath.Join(c.t.ArtifactsDir(), logDir, "unredacted")
-			if err := os.MkdirAll(filepath.Dir(path), 0755); err != nil {
-				return err
-			}
-
-			if err := c.Get(ctx, c.l, logDir /* src */, path /* dest */); err != nil {
-				l.Printf("failed to fetch log directory %s: %v", logDir, err)
-				if ctx.Err() != nil {
-					return errors.Wrap(err, "cluster.FetchLogs")
-				}
-			}
+	var logFileFull string
+	if l.File != nil {
+		logFileFull = l.File.Name()
+	}
+	if err != nil {
+		if ctxErr := ctx.Err(); ctxErr != nil {
+			l.Printf("(note: incoming context was canceled: %s)", err)
+			return ctxErr
 		}
 
-		if err := c.RunE(ctx, option.WithNodes(c.All()), fmt.Sprintf("mkdir -p logs/redacted && %s debug merge-logs --redact logs/*.log > logs/redacted/combined.log", test.DefaultCockroachPath)); err != nil {
-			l.Printf("failed to redact logs: %v", err)
-			if ctx.Err() != nil {
-				return err
-			}
-		}
-		dest := filepath.Join(c.t.ArtifactsDir(), "logs/cockroach.log")
-		return errors.Wrap(c.Get(ctx, c.l, "logs/redacted/combined.log" /* src */, dest), "cluster.FetchLogs")
-	})
+		l.Printf("> result: %s", err)
+		createFailedFile(logFileFull)
+	}
+	return err
 }
 
 // saveDiskUsageToLogsDir collects a summary of the disk usage to logs/diskusage.txt on each node.
diff --git a/pkg/roachprod/BUILD.bazel b/pkg/roachprod/BUILD.bazel
index e4af1abd5327..d63b76c30aee 100644
--- a/pkg/roachprod/BUILD.bazel
+++ b/pkg/roachprod/BUILD.bazel
@@ -13,6 +13,7 @@ go_library(
         "//pkg/build",
         "//pkg/cli/exit",
         "//pkg/cmd/roachprod/grafana",
+        "//pkg/cmd/roachtest/test",
         "//pkg/roachprod/cloud",
         "//pkg/roachprod/config",
         "//pkg/roachprod/fluentbit",
diff --git a/pkg/roachprod/roachprod.go b/pkg/roachprod/roachprod.go
index f01c347caa55..937dc0499e06 100644
--- a/pkg/roachprod/roachprod.go
+++ b/pkg/roachprod/roachprod.go
@@ -31,6 +31,7 @@ import (
 	"github.com/cockroachdb/cockroach/pkg/build"
 	"github.com/cockroachdb/cockroach/pkg/cli/exit"
 	"github.com/cockroachdb/cockroach/pkg/cmd/roachprod/grafana"
+	"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/test"
 	"github.com/cockroachdb/cockroach/pkg/roachprod/cloud"
 	"github.com/cockroachdb/cockroach/pkg/roachprod/config"
 	"github.com/cockroachdb/cockroach/pkg/roachprod/fluentbit"
@@ -2968,3 +2969,80 @@ func getClusterFromCloud(l *logger.Logger, clusterName string) (*cloud.Cluster,
 
 	return c, nil
 }
+
+// FetchLogs downloads the logs from the cluster using `roachprod get`.
+// The logs will be placed in the "destination" directory.
+// The command times out after the fetchLogsTimeout time.
+//
+// Each remote directory matching "logs*" is copied to
+// <destination>/<dir>/unredacted and a merged, redacted log is copied to
+// <destination>/logs/cockroach.log. A failed directory copy or redaction is
+// only logged unless ctx is done, in which case the error is returned.
+func FetchLogs(
+	ctx context.Context,
+	l *logger.Logger,
+	clusterName, destination string,
+	fetchLogsTimeout time.Duration,
+) error {
+	c, err := getClusterFromCache(l, clusterName)
+	if err != nil {
+		return err
+	}
+
+	l.Printf("fetching logs")
+
+	// Don't hang forever if we can't fetch the logs.
+	return timeutil.RunWithTimeout(ctx, "fetch logs", fetchLogsTimeout,
+		func(ctx context.Context) error {
+			// Find all log directories, which might include logs for
+			// external-process virtual clusters.
+			listLogDirsCmd := "find logs* -maxdepth 0 -type d"
+			results, err := c.RunWithDetails(ctx, l, install.WithNodes(c.Nodes), "", listLogDirsCmd)
+			if err != nil {
+				return err
+			}
+
+			logDirs := make(map[string]struct{})
+			for _, r := range results {
+				if r.Err != nil {
+					l.Printf("will not fetch logs for n%d due to error: %v", r.Node, r.Err)
+					// Ignore this node's output so that the behavior matches the
+					// message above.
+					continue
+				}
+
+				for _, logDir := range strings.Fields(r.Stdout) {
+					logDirs[logDir] = struct{}{}
+				}
+			}
+
+			for logDir := range logDirs {
+				dirPath := filepath.Join(destination, logDir, "unredacted")
+				if err := os.MkdirAll(filepath.Dir(dirPath), 0755); err != nil {
+					return err
+				}
+
+				if err := c.Get(ctx, l, c.Nodes, logDir /* src */, dirPath /* dest */); err != nil {
+					l.Printf("failed to fetch log directory %s: %v", logDir, err)
+					if ctx.Err() != nil {
+						return errors.Wrap(err, "cluster.FetchLogs")
+					}
+				}
+			}
+
+			// Produce a single redacted log on the nodes; the copy below picks it up
+			// from logs/redacted/combined.log.
+			redactCmd := fmt.Sprintf(
+				"mkdir -p logs/redacted && %s debug merge-logs --redact logs/*.log > logs/redacted/combined.log",
+				test.DefaultCockroachPath,
+			)
+			if err := c.Run(ctx, l, l.Stdout, l.Stderr, install.WithNodes(c.Nodes), "", redactCmd); err != nil {
+				l.Printf("failed to redact logs: %v", err)
+				if ctx.Err() != nil {
+					return err
+				}
+			}
+			dest := filepath.Join(destination, "logs/cockroach.log")
+			return errors.Wrap(c.Get(ctx, l, c.Nodes, "logs/redacted/combined.log" /* src */, dest), "cluster.FetchLogs")
+		})
+}