From 22068bf428f49966f69bbc72b0bee07080c8cd7c Mon Sep 17 00:00:00 2001 From: Matthew Sanabria <24284972+sudomateo@users.noreply.github.com> Date: Mon, 29 Apr 2024 13:00:56 -0400 Subject: [PATCH] scripts/drtprod: send logs to datadog Previously, clusters created by `roachprod` logged exclusively to disk, requiring operators to either SSH into the instance or use `roachprod logs` to view logs for a CockroachDB node. This patch adds a new `roachprod fluent-bit-start` command that, when run, installs and starts Fluent Bit on the CockroachDB cluster listening on `127.0.0.1:5170`. The CockroachDB logging configuration has also been updated to log to this Fluent Bit endpoint, choosing not to error if the endpoint is unavailable. Clusters still log to disk so as not to break existing workflows. The `drtprod` script was also updated to install and configure Fluent Bit on the DRT clusters. A complementary `roachprod fluent-bit-stop` command was also added to stop Fluent Bit. Epic: none Release note: none --- pkg/BUILD.bazel | 1 + pkg/cmd/roachprod/BUILD.bazel | 1 + pkg/cmd/roachprod/flags.go | 15 ++ pkg/cmd/roachprod/main.go | 22 +++ pkg/roachprod/BUILD.bazel | 1 + pkg/roachprod/fluentbit/BUILD.bazel | 18 +++ .../fluentbit/files/fluent-bit.service | 15 ++ .../fluentbit/files/fluent-bit.yaml.tmpl | 37 +++++ pkg/roachprod/fluentbit/fluentbit.go | 142 ++++++++++++++++++ pkg/roachprod/install/BUILD.bazel | 1 + pkg/roachprod/install/cockroach.go | 38 ++++- .../install/files/cockroachdb-logging.yaml | 70 +++++++++ pkg/roachprod/install/install.go | 9 ++ pkg/roachprod/roachprod.go | 36 +++++ scripts/drtprod | 8 +- 15 files changed, 411 insertions(+), 3 deletions(-) create mode 100644 pkg/roachprod/fluentbit/BUILD.bazel create mode 100644 pkg/roachprod/fluentbit/files/fluent-bit.service create mode 100644 pkg/roachprod/fluentbit/files/fluent-bit.yaml.tmpl create mode 100644 pkg/roachprod/fluentbit/fluentbit.go create mode 100644 pkg/roachprod/install/files/cockroachdb-logging.yaml diff 
--git a/pkg/BUILD.bazel b/pkg/BUILD.bazel index f166169374dc..d91aacc38a41 100644 --- a/pkg/BUILD.bazel +++ b/pkg/BUILD.bazel @@ -1555,6 +1555,7 @@ GO_TARGETS = [ "//pkg/roachprod/config:config", "//pkg/roachprod/config:config_test", "//pkg/roachprod/errors:errors", + "//pkg/roachprod/fluentbit:fluentbit", "//pkg/roachprod/install:install", "//pkg/roachprod/install:install_test", "//pkg/roachprod/lock:lock", diff --git a/pkg/cmd/roachprod/BUILD.bazel b/pkg/cmd/roachprod/BUILD.bazel index 44bc8da26f02..5118fa963d76 100644 --- a/pkg/cmd/roachprod/BUILD.bazel +++ b/pkg/cmd/roachprod/BUILD.bazel @@ -16,6 +16,7 @@ go_library( "//pkg/roachprod", "//pkg/roachprod/config", "//pkg/roachprod/errors", + "//pkg/roachprod/fluentbit", "//pkg/roachprod/install", "//pkg/roachprod/ssh", "//pkg/roachprod/ui", diff --git a/pkg/cmd/roachprod/flags.go b/pkg/cmd/roachprod/flags.go index a2d4c645de7c..a0f8e9fa406f 100644 --- a/pkg/cmd/roachprod/flags.go +++ b/pkg/cmd/roachprod/flags.go @@ -17,6 +17,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/roachprod" "github.com/cockroachdb/cockroach/pkg/roachprod/config" + "github.com/cockroachdb/cockroach/pkg/roachprod/fluentbit" "github.com/cockroachdb/cockroach/pkg/roachprod/install" "github.com/cockroachdb/cockroach/pkg/roachprod/ssh" "github.com/cockroachdb/cockroach/pkg/roachprod/vm" @@ -96,6 +97,8 @@ var ( } sshKeyUser string + + fluentBitConfig fluentbit.Config ) func initFlags() { @@ -283,6 +286,18 @@ func initFlags() { grafanaDumpCmd.Flags().StringVar(&grafanaDumpDir, "dump-dir", "", "the absolute path to dump prometheus data to (use the contained 'prometheus-docker-run.sh' to visualize") + fluentBitStartCmd.Flags().StringVar(&fluentBitConfig.DatadogSite, "datadog-site", "us5.datadoghq.com", + "Datadog site to send telemetry data to") + + fluentBitStartCmd.Flags().StringVar(&fluentBitConfig.DatadogAPIKey, "datadog-api-key", "", + "Datadog API key") + + fluentBitStartCmd.Flags().StringVar(&fluentBitConfig.DatadogService, 
"datadog-service", "cockroachdb", + "Datadog service name for emitted logs") + + fluentBitStartCmd.Flags().StringVar(&fluentBitConfig.DatadogTeam, "datadog-team", "", + "Datadog team to tag emitted logs") + sshKeysAddCmd.Flags().StringVar(&sshKeyUser, "user", config.OSUser.Username, "the user to be associated with the new key", ) diff --git a/pkg/cmd/roachprod/main.go b/pkg/cmd/roachprod/main.go index 70f1984e99b3..48ebde10d915 100644 --- a/pkg/cmd/roachprod/main.go +++ b/pkg/cmd/roachprod/main.go @@ -1670,6 +1670,26 @@ var _ = func() struct{} { return struct{}{} }() +var fluentBitStartCmd = &cobra.Command{ + Use: "fluent-bit-start <cluster>", + Short: "Install and start Fluent Bit", + Long: "Install and start Fluent Bit", + Args: cobra.ExactArgs(1), + Run: wrap(func(cmd *cobra.Command, args []string) error { + return roachprod.StartFluentBit(context.Background(), config.Logger, args[0], fluentBitConfig) + }), +} + +var fluentBitStopCmd = &cobra.Command{ + Use: "fluent-bit-stop <cluster>", + Short: "Stop Fluent Bit", + Long: "Stop Fluent Bit", + Args: cobra.ExactArgs(1), + Run: wrap(func(cmd *cobra.Command, args []string) error { + return roachprod.StopFluentBit(context.Background(), config.Logger, args[0]) + }), +} + func main() { _ = roachprod.InitProviders() providerOptsContainer = vm.CreateProviderOptionsContainer() @@ -1728,6 +1748,8 @@ func main() { jaegerStartCmd, jaegerStopCmd, jaegerURLCmd, + fluentBitStartCmd, + fluentBitStopCmd, ) setBashCompletionFunction() diff --git a/pkg/roachprod/BUILD.bazel b/pkg/roachprod/BUILD.bazel index 06fd31b4fd1d..b5c9cac3b621 100644 --- a/pkg/roachprod/BUILD.bazel +++ b/pkg/roachprod/BUILD.bazel @@ -15,6 +15,7 @@ go_library( "//pkg/cmd/roachprod/grafana", "//pkg/roachprod/cloud", "//pkg/roachprod/config", + "//pkg/roachprod/fluentbit", "//pkg/roachprod/install", "//pkg/roachprod/lock", "//pkg/roachprod/logger", diff --git a/pkg/roachprod/fluentbit/BUILD.bazel b/pkg/roachprod/fluentbit/BUILD.bazel new file mode 100644 index 
000000000000..4eedc66ec513 --- /dev/null +++ b/pkg/roachprod/fluentbit/BUILD.bazel @@ -0,0 +1,18 @@ +load("@io_bazel_rules_go//go:def.bzl", "go_library") + +go_library( + name = "fluentbit", + srcs = ["fluentbit.go"], + embedsrcs = [ + "files/fluent-bit.service", + "files/fluent-bit.yaml.tmpl", + ], + importpath = "github.com/cockroachdb/cockroach/pkg/roachprod/fluentbit", + visibility = ["//visibility:public"], + deps = [ + "//pkg/roachprod/install", + "//pkg/roachprod/logger", + "//pkg/roachprod/vm", + "@com_github_cockroachdb_errors//:errors", + ], +) diff --git a/pkg/roachprod/fluentbit/files/fluent-bit.service b/pkg/roachprod/fluentbit/files/fluent-bit.service new file mode 100644 index 000000000000..95c8457f3ec1 --- /dev/null +++ b/pkg/roachprod/fluentbit/files/fluent-bit.service @@ -0,0 +1,15 @@ +[Unit] +Description=Fluent Bit +Documentation=https://docs.fluentbit.io/manual/ +Requires=network.target +After=network.target + +[Service] +Type=simple +EnvironmentFile=-/etc/sysconfig/fluent-bit +EnvironmentFile=-/etc/default/fluent-bit +ExecStart=/opt/fluent-bit/bin/fluent-bit -c //etc/fluent-bit/fluent-bit.yaml +Restart=always + +[Install] +WantedBy=multi-user.target diff --git a/pkg/roachprod/fluentbit/files/fluent-bit.yaml.tmpl b/pkg/roachprod/fluentbit/files/fluent-bit.yaml.tmpl new file mode 100644 index 000000000000..b0f148bbe4b1 --- /dev/null +++ b/pkg/roachprod/fluentbit/files/fluent-bit.yaml.tmpl @@ -0,0 +1,37 @@ +--- +service: + flush: 1 + daemon: off + http_server: on + http_listen: 127.0.0.1 + http_port: 2020 + log_level: info + storage.path: /tmp + storage.metrics: on + storage.max_chunks_up: 30 + storage.sync: full + storage.checksum: on + storage.delete_irrecoverable_chunks: on + parsers_file: parsers.conf + plugins_file: plugins.conf +pipeline: + inputs: + - name: tcp + tag: cockroachdb + listen: 127.0.0.1 + port: 5170 + format: json + storage.type: filesystem + alias: cockroachdb + outputs: + - name: datadog + match: cockroachdb + host: 
http-intake.logs.{{ .DatadogSite }} + tls: on + compress: gzip + apikey: {{ .DatadogAPIKey }} + dd_source: cockroachdb + dd_service: {{ .DatadogService }} + dd_tags: {{ join .Tags `,` }} + alias: cockroachdb + storage.total_limit_size: 25MB diff --git a/pkg/roachprod/fluentbit/fluentbit.go b/pkg/roachprod/fluentbit/fluentbit.go new file mode 100644 index 000000000000..4de5e7dc13b4 --- /dev/null +++ b/pkg/roachprod/fluentbit/fluentbit.go @@ -0,0 +1,142 @@ +// Copyright 2024 The Cockroach Authors. +// +// Use of this software is governed by the Business Source License +// included in the file licenses/BSL.txt. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0, included in the file +// licenses/APL.txt. + +package fluentbit + +import ( + "bytes" + "context" + _ "embed" + "fmt" + "strings" + "text/template" + + "github.com/cockroachdb/cockroach/pkg/roachprod/install" + "github.com/cockroachdb/cockroach/pkg/roachprod/logger" + "github.com/cockroachdb/cockroach/pkg/roachprod/vm" + "github.com/cockroachdb/errors" +) + +//go:embed files/fluent-bit.yaml.tmpl +var fluentBitTemplate string + +//go:embed files/fluent-bit.service +var fluentBitSystemdUnit string + +// Config represents the information needed to configure and run Fluent Bit on +// a CockroachDB cluster. +type Config struct { + // DatadogSite is the Datadog site to send telemetry data to (e.g., us5.datadoghq.com). + DatadogSite string + + // DatadogAPIKey is the API key used to authenticate to Datadog. + DatadogAPIKey string + + // DatadogService is the Datadog service name for emitted logs. + DatadogService string + + // DatadogTeam is the team used to tag emitted logs; no team tag is added when empty. + DatadogTeam string +} + +// Install installs, configures, and starts Fluent Bit on the given CockroachDB +// cluster c. 
+func Install(ctx context.Context, l *logger.Logger, c *install.SyncedCluster, config Config) error { + if err := c.Parallel(ctx, l, install.WithNodes(c.Nodes), func(ctx context.Context, node install.Node) (*install.RunResultDetails, error) { + res := &install.RunResultDetails{Node: node} + + if err := install.InstallTool(ctx, l, c, install.Nodes{node}, "fluent-bit", l.Stdout, l.Stderr); err != nil { + res.Err = errors.Wrap(err, "failed installing fluent bit") + return res, res.Err + } + + tags := []string{ + "env:development", + fmt.Sprintf("host:%s", vm.Name(c.Name, int(node))), + fmt.Sprintf("cluster:%s", c.Name), + } + + if config.DatadogTeam != "" { + tags = append(tags, fmt.Sprintf("team:%s", config.DatadogTeam)) + } + + data := templateData{ + DatadogSite: config.DatadogSite, + DatadogAPIKey: config.DatadogAPIKey, + DatadogService: config.DatadogService, + Tags: tags, + } + + fluentBitConfig, err := executeTemplate(data) + if err != nil { + res.Err = errors.Wrapf(err, "failed rendering fluent bit configuration for node %d", node) + return res, res.Err + } + + if err := c.PutString(ctx, l, install.Nodes{node}, fluentBitConfig, "/tmp/fluent-bit.yaml", 0600); err != nil { // 0600: the rendered config embeds the Datadog API key. + res.Err = errors.Wrapf(err, "failed writing fluent bit configuration to node %d", node) + return res, res.Err + } + + if err := c.PutString(ctx, l, install.Nodes{node}, fluentBitSystemdUnit, "/tmp/fluent-bit.service", 0644); err != nil { + res.Err = errors.Wrap(err, "failed writing fluent bit systemd unit file") + return res, res.Err + } + + if err := c.Run(ctx, l, l.Stdout, l.Stderr, install.WithNodes(install.Nodes{node}), "fluent-bit", ` +sudo cp /tmp/fluent-bit.yaml /etc/fluent-bit/fluent-bit.yaml && rm /tmp/fluent-bit.yaml && +sudo cp /tmp/fluent-bit.service /etc/systemd/system/fluent-bit.service && rm /tmp/fluent-bit.service && +sudo systemctl daemon-reload && sudo systemctl enable fluent-bit && sudo systemctl restart fluent-bit +`); err != nil { + res.Err = errors.Wrap(err, "failed enabling 
and starting fluent bit service") + return res, res.Err + } + + return res, nil + }); err != nil { + return errors.Wrap(err, "failed starting fluent bit") + } + + return nil +} + +// Stop stops a running Fluent Bit service on the given CockroachDB cluster c. +func Stop(ctx context.Context, l *logger.Logger, c *install.SyncedCluster) error { + if err := c.Run(ctx, l, l.Stdout, l.Stderr, install.WithNodes(c.Nodes).WithShouldRetryFn(install.AlwaysTrue), "fluent-bit-stop", ` +sudo systemctl disable fluent-bit && sudo systemctl stop fluent-bit +`); err != nil { + return errors.Wrap(err, "failed stopping fluent bit") + } + + return nil +} + +type templateData struct { + DatadogSite string + DatadogAPIKey string + DatadogService string + Tags []string +} + +func executeTemplate(data templateData) (string, error) { + tpl, err := template.New("fluent-bit-config"). + Funcs(template.FuncMap{ + "join": strings.Join, + }). + Parse(fluentBitTemplate) + if err != nil { + return "", err + } + var buf bytes.Buffer + if err := tpl.Execute(&buf, data); err != nil { + return "", err + } + return buf.String(), nil +} diff --git a/pkg/roachprod/install/BUILD.bazel b/pkg/roachprod/install/BUILD.bazel index a77bb325cb8f..669dab41fe9a 100644 --- a/pkg/roachprod/install/BUILD.bazel +++ b/pkg/roachprod/install/BUILD.bazel @@ -17,6 +17,7 @@ go_library( "staging.go", ], embedsrcs = [ + "files/cockroachdb-logging.yaml", "scripts/download.sh", "scripts/start.sh", "scripts/open_ports.sh", diff --git a/pkg/roachprod/install/cockroach.go b/pkg/roachprod/install/cockroach.go index a57a1ae45cc6..aab045c0ca49 100644 --- a/pkg/roachprod/install/cockroach.go +++ b/pkg/roachprod/install/cockroach.go @@ -40,6 +40,9 @@ import ( //go:embed scripts/start.sh var startScript string +//go:embed files/cockroachdb-logging.yaml +var loggingConfig string + // sharedProcessVirtualClusterNode is a constant node that is used // whenever we register a service descriptor for a shared-process // virtual cluster. 
Since these virtual clusters use the system @@ -776,6 +779,10 @@ type startTemplateData struct { EnvVars []string } +type loggingTemplateData struct { + LogDir string +} + // VirtualClusterLabel is the value used to "label" virtual cluster // (cockroach) processes running locally or in a VM. This is used by // roachprod to monitor identify such processes and monitor them. @@ -842,6 +849,20 @@ func execStartTemplate(data startTemplateData) (string, error) { return buf.String(), nil } +func execLoggingTemplate(data loggingTemplateData) (string, error) { + tpl, err := template.New("loggingConfig"). + Delims("#{", "#}"). + Parse(loggingConfig) + if err != nil { + return "", err + } + var buf strings.Builder + if err := tpl.Execute(&buf, data); err != nil { + return "", err + } + return buf.String(), nil +} + // generateStartArgs generates cockroach binary arguments for starting a node. // The first argument is the command (e.g. "start"). func (c *SyncedCluster) generateStartArgs( @@ -882,8 +903,21 @@ func (c *SyncedCluster) generateStartArgs( // if neither --log nor --log-config-file are present if idx1 == -1 && idx2 == -1 { - // Specify exit-on-error=false to work around #62763. 
- args = append(args, "--log", `file-defaults: {dir: '`+logDir+`', exit-on-error: false}`) + loggingConfig, err := execLoggingTemplate(loggingTemplateData{ + LogDir: logDir, + }) + if err != nil { + return nil, errors.Wrap(err, "failed rendering logging template") + } + + loggingConfigFile := fmt.Sprintf("cockroachdb-logging%s.yaml", + virtualClusterDirSuffix(startOpts.VirtualClusterName, startOpts.SQLInstance)) + + if err := c.PutString(ctx, l, c.Nodes, loggingConfig, loggingConfigFile, 0644); err != nil { + return nil, errors.Wrap(err, "failed writing remote logging configuration") + } + + args = append(args, "--log-config-file", loggingConfigFile) } listenHost := "" diff --git a/pkg/roachprod/install/files/cockroachdb-logging.yaml b/pkg/roachprod/install/files/cockroachdb-logging.yaml new file mode 100644 index 000000000000..7fdf4e6889bc --- /dev/null +++ b/pkg/roachprod/install/files/cockroachdb-logging.yaml @@ -0,0 +1,70 @@ +--- +file-defaults: + auditable: false + buffered-writes: true + dir: #{ .LogDir #} + exit-on-error: false + filter: INFO + format: crdb-v2 + max-file-size: 10MiB + max-group-size: 100MiB + redact: false + redactable: true +fluent-defaults: + filter: INFO + format: json-fluent + redact: false + redactable: true + exit-on-error: false + auditable: false + buffering: + max-staleness: 5s + flush-trigger-size: 1.0MiB + max-buffer-size: 50MiB +sinks: + file-groups: + default: + channels: + INFO: [DEV, OPS] + WARNING: all except [DEV, OPS] + health: + channels: [HEALTH] + kv-distribution: + channels: [KV_DISTRIBUTION] + pebble: + channels: [STORAGE] + security: + channels: [PRIVILEGES, USER_ADMIN] + auditable: true + sql-audit: + channels: [SENSITIVE_ACCESS] + auditable: true + sql-auth: + channels: [SESSIONS] + auditable: true + sql-exec: + channels: [SQL_EXEC] + sql-slow: + channels: [SQL_PERF] + sql-slow-internal-only: + channels: [SQL_INTERNAL_PERF] + telemetry: + channels: [TELEMETRY] + max-file-size: 100KiB + max-group-size: 1.0MiB + 
fluent-servers: + fluent-bit: + channels: {INFO: all} + net: tcp + address: 127.0.0.1:5170 + filter: INFO + redact: false + stderr: + channels: all + filter: NONE + redact: false + redactable: true + exit-on-error: false +capture-stray-errors: + enable: true + max-group-size: 100MiB diff --git a/pkg/roachprod/install/install.go b/pkg/roachprod/install/install.go index bd9aa8a6f977..de4f7f204ca0 100644 --- a/pkg/roachprod/install/install.go +++ b/pkg/roachprod/install/install.go @@ -83,6 +83,15 @@ sudo apt-get install -y \ "postgresql": ` sudo apt-get update; sudo apt-get install -y postgresql; +`, + + "fluent-bit": ` +curl -fsSL https://packages.fluentbit.io/fluentbit.key | sudo gpg --no-tty --batch --yes --dearmor -o /etc/apt/keyrings/fluent-bit.gpg; +code_name="$(. /etc/os-release && echo "${VERSION_CODENAME}")"; +echo "deb [signed-by=/etc/apt/keyrings/fluent-bit.gpg] https://packages.fluentbit.io/ubuntu/${code_name} ${code_name} main" | \ + sudo tee /etc/apt/sources.list.d/fluent-bit.list > /dev/null; +sudo apt-get update; +sudo apt-get install -y fluent-bit; `, } diff --git a/pkg/roachprod/roachprod.go b/pkg/roachprod/roachprod.go index a91b142e3c25..b4f97971f793 100644 --- a/pkg/roachprod/roachprod.go +++ b/pkg/roachprod/roachprod.go @@ -36,6 +36,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/cmd/roachprod/grafana" "github.com/cockroachdb/cockroach/pkg/roachprod/cloud" "github.com/cockroachdb/cockroach/pkg/roachprod/config" + "github.com/cockroachdb/cockroach/pkg/roachprod/fluentbit" "github.com/cockroachdb/cockroach/pkg/roachprod/install" "github.com/cockroachdb/cockroach/pkg/roachprod/lock" "github.com/cockroachdb/cockroach/pkg/roachprod/logger" @@ -2104,6 +2105,41 @@ func JaegerURL( return urls[0], nil } +// StartFluentBit installs, configures, and starts Fluent Bit on the cluster +// identified by clusterName. 
+func StartFluentBit( + ctx context.Context, l *logger.Logger, clusterName string, config fluentbit.Config, +) error { + if config.DatadogAPIKey == "" { + return errors.New("Datadog API key cannot be empty") + } + + if err := LoadClusters(); err != nil { + return err + } + + c, err := newCluster(l, clusterName) + if err != nil { + return err + } + + return fluentbit.Install(ctx, l, c, config) +} + +// StopFluentBit stops Fluent Bit on the cluster identified by clusterName. +func StopFluentBit(ctx context.Context, l *logger.Logger, clusterName string) error { + if err := LoadClusters(); err != nil { + return err + } + + c, err := newCluster(l, clusterName) + if err != nil { + return err + } + + return fluentbit.Stop(ctx, l, c) +} + // DestroyDNS destroys the DNS records for the given cluster. func DestroyDNS(ctx context.Context, l *logger.Logger, clusterName string) error { c, err := getClusterFromCache(l, clusterName) diff --git a/scripts/drtprod b/scripts/drtprod index 82df7f5d21a0..ee8324d40d15 100755 --- a/scripts/drtprod +++ b/scripts/drtprod @@ -99,10 +99,10 @@ case $1 in --- api_key: ${dd_api_key} site: ${dd_site} +hostname: \$(hostname) tags: # Datadog reserved tags. - env:development -- service:drt-cockroachdb # Custom tags. - cluster:${cluster%:*} @@ -112,13 +112,19 @@ EOF" roachprod ssh ${cluster} -- "sudo tee /etc/datadog-agent/conf.d/cockroachdb.d/conf.yaml > /dev/null << EOF --- init_config: + instances: - openmetrics_endpoint: http://localhost:26258/_status/vars tls_verify: false + service: drt-cockroachdb EOF" roachprod ssh ${cluster} -- 'sudo systemctl enable datadog-agent && sudo systemctl restart datadog-agent' + roachprod fluent-bit-start ${cluster} \ + --datadog-api-key "${dd_api_key}" \ + --datadog-service drt-cockroachdb \ + --datadog-team drt exit 0 ;; "create")