diff --git a/pkg/BUILD.bazel b/pkg/BUILD.bazel index 43807416c834..ff55126563bf 100644 --- a/pkg/BUILD.bazel +++ b/pkg/BUILD.bazel @@ -1553,6 +1553,7 @@ GO_TARGETS = [ "//pkg/roachprod/config:config", "//pkg/roachprod/config:config_test", "//pkg/roachprod/errors:errors", + "//pkg/roachprod/fluentbit:fluentbit", "//pkg/roachprod/install:install", "//pkg/roachprod/install:install_test", "//pkg/roachprod/lock:lock", diff --git a/pkg/cmd/roachprod/BUILD.bazel b/pkg/cmd/roachprod/BUILD.bazel index 44bc8da26f02..5118fa963d76 100644 --- a/pkg/cmd/roachprod/BUILD.bazel +++ b/pkg/cmd/roachprod/BUILD.bazel @@ -16,6 +16,7 @@ go_library( "//pkg/roachprod", "//pkg/roachprod/config", "//pkg/roachprod/errors", + "//pkg/roachprod/fluentbit", "//pkg/roachprod/install", "//pkg/roachprod/ssh", "//pkg/roachprod/ui", diff --git a/pkg/cmd/roachprod/flags.go b/pkg/cmd/roachprod/flags.go index a2d4c645de7c..a0f8e9fa406f 100644 --- a/pkg/cmd/roachprod/flags.go +++ b/pkg/cmd/roachprod/flags.go @@ -17,6 +17,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/roachprod" "github.com/cockroachdb/cockroach/pkg/roachprod/config" + "github.com/cockroachdb/cockroach/pkg/roachprod/fluentbit" "github.com/cockroachdb/cockroach/pkg/roachprod/install" "github.com/cockroachdb/cockroach/pkg/roachprod/ssh" "github.com/cockroachdb/cockroach/pkg/roachprod/vm" @@ -96,6 +97,8 @@ var ( } sshKeyUser string + + fluentBitConfig fluentbit.Config ) func initFlags() { @@ -283,6 +286,18 @@ func initFlags() { grafanaDumpCmd.Flags().StringVar(&grafanaDumpDir, "dump-dir", "", "the absolute path to dump prometheus data to (use the contained 'prometheus-docker-run.sh' to visualize") + fluentBitStartCmd.Flags().StringVar(&fluentBitConfig.DatadogSite, "datadog-site", "us5.datadoghq.com", + "Datadog site to send telemetry data to") + + fluentBitStartCmd.Flags().StringVar(&fluentBitConfig.DatadogAPIKey, "datadog-api-key", "", + "Datadog API key") + + fluentBitStartCmd.Flags().StringVar(&fluentBitConfig.DatadogService, 
"datadog-service", "cockroachdb", + "Datadog service name for emitted logs") + + fluentBitStartCmd.Flags().StringVar(&fluentBitConfig.DatadogTeam, "datadog-team", "", + "Datadog team to tag emitted logs") + sshKeysAddCmd.Flags().StringVar(&sshKeyUser, "user", config.OSUser.Username, "the user to be associated with the new key", ) diff --git a/pkg/cmd/roachprod/main.go b/pkg/cmd/roachprod/main.go index 70f1984e99b3..48ebde10d915 100644 --- a/pkg/cmd/roachprod/main.go +++ b/pkg/cmd/roachprod/main.go @@ -1670,6 +1670,26 @@ var _ = func() struct{} { return struct{}{} }() +var fluentBitStartCmd = &cobra.Command{ + Use: "fluent-bit-start ", + Short: "Install and start Fluent Bit", + Long: "Install and start Fluent Bit", + Args: cobra.ExactArgs(1), + Run: wrap(func(cmd *cobra.Command, args []string) error { + return roachprod.StartFluentBit(context.Background(), config.Logger, args[0], fluentBitConfig) + }), +} + +var fluentBitStopCmd = &cobra.Command{ + Use: "fluent-bit-stop ", + Short: "Stop Fluent Bit", + Long: "Stop Fluent Bit", + Args: cobra.ExactArgs(1), + Run: wrap(func(cmd *cobra.Command, args []string) error { + return roachprod.StopFluentBit(context.Background(), config.Logger, args[0]) + }), +} + func main() { _ = roachprod.InitProviders() providerOptsContainer = vm.CreateProviderOptionsContainer() @@ -1728,6 +1748,8 @@ func main() { jaegerStartCmd, jaegerStopCmd, jaegerURLCmd, + fluentBitStartCmd, + fluentBitStopCmd, ) setBashCompletionFunction() diff --git a/pkg/roachprod/BUILD.bazel b/pkg/roachprod/BUILD.bazel index 06fd31b4fd1d..b5c9cac3b621 100644 --- a/pkg/roachprod/BUILD.bazel +++ b/pkg/roachprod/BUILD.bazel @@ -15,6 +15,7 @@ go_library( "//pkg/cmd/roachprod/grafana", "//pkg/roachprod/cloud", "//pkg/roachprod/config", + "//pkg/roachprod/fluentbit", "//pkg/roachprod/install", "//pkg/roachprod/lock", "//pkg/roachprod/logger", diff --git a/pkg/roachprod/fluentbit/BUILD.bazel b/pkg/roachprod/fluentbit/BUILD.bazel new file mode 100644 index 
000000000000..4eedc66ec513 --- /dev/null +++ b/pkg/roachprod/fluentbit/BUILD.bazel @@ -0,0 +1,18 @@ +load("@io_bazel_rules_go//go:def.bzl", "go_library") + +go_library( + name = "fluentbit", + srcs = ["fluentbit.go"], + embedsrcs = [ + "files/fluent-bit.service", + "files/fluent-bit.yaml.tmpl", + ], + importpath = "github.com/cockroachdb/cockroach/pkg/roachprod/fluentbit", + visibility = ["//visibility:public"], + deps = [ + "//pkg/roachprod/install", + "//pkg/roachprod/logger", + "//pkg/roachprod/vm", + "@com_github_cockroachdb_errors//:errors", + ], +) diff --git a/pkg/roachprod/fluentbit/files/fluent-bit.service b/pkg/roachprod/fluentbit/files/fluent-bit.service new file mode 100644 index 000000000000..95c8457f3ec1 --- /dev/null +++ b/pkg/roachprod/fluentbit/files/fluent-bit.service @@ -0,0 +1,15 @@ +[Unit] +Description=Fluent Bit +Documentation=https://docs.fluentbit.io/manual/ +Requires=network.target +After=network.target + +[Service] +Type=simple +EnvironmentFile=-/etc/sysconfig/fluent-bit +EnvironmentFile=-/etc/default/fluent-bit +ExecStart=/opt/fluent-bit/bin/fluent-bit -c //etc/fluent-bit/fluent-bit.yaml +Restart=always + +[Install] +WantedBy=multi-user.target diff --git a/pkg/roachprod/fluentbit/files/fluent-bit.yaml.tmpl b/pkg/roachprod/fluentbit/files/fluent-bit.yaml.tmpl new file mode 100644 index 000000000000..b0f148bbe4b1 --- /dev/null +++ b/pkg/roachprod/fluentbit/files/fluent-bit.yaml.tmpl @@ -0,0 +1,37 @@ +--- +service: + flush: 1 + daemon: off + http_server: on + http_listen: 127.0.0.1 + http_port: 2020 + log_level: info + storage.path: /tmp + storage.metrics: on + storage.max_chunks_up: 30 + storage.sync: full + storage.checksum: on + storage.delete_irrecoverable_chunks: on + parsers_file: parsers.conf + plugins_file: plugins.conf +pipeline: + inputs: + - name: tcp + tag: cockroachdb + listen: 127.0.0.1 + port: 5170 + format: json + storage.type: filesystem + alias: cockroachdb + outputs: + - name: datadog + match: cockroachdb + host: 
http-intake.logs.{{ .DatadogSite }} + tls: on + compress: gzip + apikey: {{ .DatadogAPIKey }} + dd_source: cockroachdb + dd_service: {{ .DatadogService }} + dd_tags: {{ join .Tags `,` }} + alias: cockroachdb + storage.total_limit_size: 25MB diff --git a/pkg/roachprod/fluentbit/fluentbit.go b/pkg/roachprod/fluentbit/fluentbit.go new file mode 100644 index 000000000000..4de5e7dc13b4 --- /dev/null +++ b/pkg/roachprod/fluentbit/fluentbit.go @@ -0,0 +1,142 @@ +// Copyright 2024 The Cockroach Authors. +// +// Use of this software is governed by the Business Source License +// included in the file licenses/BSL.txt. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0, included in the file +// licenses/APL.txt. + +package fluentbit + +import ( + "bytes" + "context" + _ "embed" + "fmt" + "strings" + "text/template" + + "github.com/cockroachdb/cockroach/pkg/roachprod/install" + "github.com/cockroachdb/cockroach/pkg/roachprod/logger" + "github.com/cockroachdb/cockroach/pkg/roachprod/vm" + "github.com/cockroachdb/errors" +) + +//go:embed files/fluent-bit.yaml.tmpl +var fluentBitTemplate string + +//go:embed files/fluent-bit.service +var fluentBitSystemdUnit string + +// Config represents the information needed to configure and run Fluent Bit on +// a CockroachDB cluster. +type Config struct { + // Datadog site to send telemetry data to (e.g, us5.datadoghq.com). + DatadogSite string + + // Datadog API key to authenticate to Datadog. + DatadogAPIKey string + + // Datadog service for emitted logs. + DatadogService string + + // Datadog team to tag the emitted logs. + DatadogTeam string +} + +// Install installs, configures, and starts Fluent Bit on the given CockroachDB +// cluster c. 
+func Install(ctx context.Context, l *logger.Logger, c *install.SyncedCluster, config Config) error { + if err := c.Parallel(ctx, l, install.WithNodes(c.Nodes), func(ctx context.Context, node install.Node) (*install.RunResultDetails, error) { + res := &install.RunResultDetails{Node: node} + + if err := install.InstallTool(ctx, l, c, install.Nodes{node}, "fluent-bit", l.Stdout, l.Stderr); err != nil { + res.Err = errors.Wrap(err, "failed installing fluent bit") + return res, res.Err + } + + tags := []string{ + "env:development", + fmt.Sprintf("host:%s", vm.Name(c.Name, int(node))), + fmt.Sprintf("cluster:%s", c.Name), + } + + if config.DatadogTeam != "" { + tags = append(tags, fmt.Sprintf("team:%s", config.DatadogTeam)) + } + + data := templateData{ + DatadogSite: config.DatadogSite, + DatadogAPIKey: config.DatadogAPIKey, + DatadogService: config.DatadogService, + Tags: tags, + } + + fluentBitConfig, err := executeTemplate(data) + if err != nil { + res.Err = errors.Wrapf(err, "failed rendering fluent bit configuration for node %d", node) + return res, res.Err + } + + if err := c.PutString(ctx, l, install.Nodes{node}, fluentBitConfig, "/tmp/fluent-bit.yaml", 0644); err != nil { + res.Err = errors.Wrapf(err, "failed writing fluent bit configuration to node %d", node) + return res, res.Err + } + + if err := c.PutString(ctx, l, install.Nodes{node}, fluentBitSystemdUnit, "/tmp/fluent-bit.service", 0644); err != nil { + res.Err = errors.Wrap(err, "failed writing fluent bit systemd unit file") + return res, res.Err + } + + if err := c.Run(ctx, l, l.Stdout, l.Stderr, install.WithNodes(install.Nodes{node}), "fluent-bit", ` +sudo cp /tmp/fluent-bit.yaml /etc/fluent-bit/fluent-bit.yaml && rm /tmp/fluent-bit.yaml +sudo cp /tmp/fluent-bit.service /etc/systemd/system/fluent-bit.service && rm /tmp/fluent-bit.service +sudo systemctl daemon-reload && sudo systemctl enable fluent-bit && sudo systemctl restart fluent-bit +`); err != nil { + res.Err = errors.Wrap(err, "failed enabling 
and starting fluent bit service") + return res, res.Err + } + + return res, nil + }); err != nil { + return errors.Wrap(err, "failed starting fluent bit") + } + + return nil +} + +// Stop stops a running Fluent Bit service on the given CockroachDB cluster c. +func Stop(ctx context.Context, l *logger.Logger, c *install.SyncedCluster) error { + if err := c.Run(ctx, l, l.Stdout, l.Stderr, install.WithNodes(c.Nodes).WithShouldRetryFn(install.AlwaysTrue), "fluent-bit-stop", ` +sudo systemctl disable fluent-bit && sudo systemctl stop fluent-bit +`); err != nil { + return errors.Wrap(err, "failed stopping fluent bit") + } + + return nil +} + +type templateData struct { + DatadogSite string + DatadogAPIKey string + DatadogService string + Tags []string +} + +func executeTemplate(data templateData) (string, error) { + tpl, err := template.New("fluent-bit-config"). + Funcs(template.FuncMap{ + "join": strings.Join, + }). + Parse(fluentBitTemplate) + if err != nil { + return "", err + } + var buf bytes.Buffer + if err := tpl.Execute(&buf, data); err != nil { + return "", err + } + return buf.String(), nil +} diff --git a/pkg/roachprod/install/BUILD.bazel b/pkg/roachprod/install/BUILD.bazel index a77bb325cb8f..669dab41fe9a 100644 --- a/pkg/roachprod/install/BUILD.bazel +++ b/pkg/roachprod/install/BUILD.bazel @@ -17,6 +17,7 @@ go_library( "staging.go", ], embedsrcs = [ + "files/cockroachdb-logging.yaml", "scripts/download.sh", "scripts/start.sh", "scripts/open_ports.sh", diff --git a/pkg/roachprod/install/cockroach.go b/pkg/roachprod/install/cockroach.go index a57a1ae45cc6..aab045c0ca49 100644 --- a/pkg/roachprod/install/cockroach.go +++ b/pkg/roachprod/install/cockroach.go @@ -40,6 +40,9 @@ import ( //go:embed scripts/start.sh var startScript string +//go:embed files/cockroachdb-logging.yaml +var loggingConfig string + // sharedProcessVirtualClusterNode is a constant node that is used // whenever we register a service descriptor for a shared-process // virtual cluster. 
Since these virtual clusters use the system @@ -776,6 +779,10 @@ type startTemplateData struct { EnvVars []string } +type loggingTemplateData struct { + LogDir string +} + // VirtualClusterLabel is the value used to "label" virtual cluster // (cockroach) processes running locally or in a VM. This is used by // roachprod to monitor identify such processes and monitor them. @@ -842,6 +849,20 @@ func execStartTemplate(data startTemplateData) (string, error) { return buf.String(), nil } +func execLoggingTemplate(data loggingTemplateData) (string, error) { + tpl, err := template.New("loggingConfig"). + Delims("#{", "#}"). + Parse(loggingConfig) + if err != nil { + return "", err + } + var buf strings.Builder + if err := tpl.Execute(&buf, data); err != nil { + return "", err + } + return buf.String(), nil +} + // generateStartArgs generates cockroach binary arguments for starting a node. // The first argument is the command (e.g. "start"). func (c *SyncedCluster) generateStartArgs( @@ -882,8 +903,21 @@ func (c *SyncedCluster) generateStartArgs( // if neither --log nor --log-config-file are present if idx1 == -1 && idx2 == -1 { - // Specify exit-on-error=false to work around #62763. 
- args = append(args, "--log", `file-defaults: {dir: '`+logDir+`', exit-on-error: false}`) + loggingConfig, err := execLoggingTemplate(loggingTemplateData{ + LogDir: logDir, + }) + if err != nil { + return nil, errors.Wrap(err, "failed rendering logging template") + } + + loggingConfigFile := fmt.Sprintf("cockroachdb-logging%s.yaml", + virtualClusterDirSuffix(startOpts.VirtualClusterName, startOpts.SQLInstance)) + + if err := c.PutString(ctx, l, c.Nodes, loggingConfig, loggingConfigFile, 0644); err != nil { + return nil, errors.Wrap(err, "failed writing remote logging configuration") + } + + args = append(args, "--log-config-file", loggingConfigFile) } listenHost := "" diff --git a/pkg/roachprod/install/files/cockroachdb-logging.yaml b/pkg/roachprod/install/files/cockroachdb-logging.yaml new file mode 100644 index 000000000000..7fdf4e6889bc --- /dev/null +++ b/pkg/roachprod/install/files/cockroachdb-logging.yaml @@ -0,0 +1,70 @@ +--- +file-defaults: + auditable: false + buffered-writes: true + dir: #{ .LogDir #} + exit-on-error: false + filter: INFO + format: crdb-v2 + max-file-size: 10MiB + max-group-size: 100MiB + redact: false + redactable: true +fluent-defaults: + filter: INFO + format: json-fluent + redact: false + redactable: true + exit-on-error: false + auditable: false + buffering: + max-staleness: 5s + flush-trigger-size: 1.0MiB + max-buffer-size: 50MiB +sinks: + file-groups: + default: + channels: + INFO: [DEV, OPS] + WARNING: all except [DEV, OPS] + health: + channels: [HEALTH] + kv-distribution: + channels: [KV_DISTRIBUTION] + pebble: + channels: [STORAGE] + security: + channels: [PRIVILEGES, USER_ADMIN] + auditable: true + sql-audit: + channels: [SENSITIVE_ACCESS] + auditable: true + sql-auth: + channels: [SESSIONS] + auditable: true + sql-exec: + channels: [SQL_EXEC] + sql-slow: + channels: [SQL_PERF] + sql-slow-internal-only: + channels: [SQL_INTERNAL_PERF] + telemetry: + channels: [TELEMETRY] + max-file-size: 100KiB + max-group-size: 1.0MiB + 
fluent-servers: + fluent-bit: + channels: {INFO: all} + net: tcp + address: 127.0.0.1:5170 + filter: INFO + redact: false + stderr: + channels: all + filter: NONE + redact: false + redactable: true + exit-on-error: false +capture-stray-errors: + enable: true + max-group-size: 100MiB diff --git a/pkg/roachprod/install/install.go b/pkg/roachprod/install/install.go index bd9aa8a6f977..de4f7f204ca0 100644 --- a/pkg/roachprod/install/install.go +++ b/pkg/roachprod/install/install.go @@ -83,6 +83,15 @@ sudo apt-get install -y \ "postgresql": ` sudo apt-get update; sudo apt-get install -y postgresql; +`, + + "fluent-bit": ` +curl -fsSL https://packages.fluentbit.io/fluentbit.key | sudo gpg --no-tty --batch --yes --dearmor -o /etc/apt/keyrings/fluent-bit.gpg; +code_name="$(. /etc/os-release && echo "${VERSION_CODENAME}")"; +echo "deb [signed-by=/etc/apt/keyrings/fluent-bit.gpg] https://packages.fluentbit.io/ubuntu/${code_name} ${code_name} main" | \ + sudo tee /etc/apt/sources.list.d/fluent-bit.list > /dev/null; +sudo apt-get update; +sudo apt-get install -y fluent-bit; `, } diff --git a/pkg/roachprod/roachprod.go b/pkg/roachprod/roachprod.go index 876f6f80412e..91623c342db4 100644 --- a/pkg/roachprod/roachprod.go +++ b/pkg/roachprod/roachprod.go @@ -36,6 +36,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/cmd/roachprod/grafana" "github.com/cockroachdb/cockroach/pkg/roachprod/cloud" "github.com/cockroachdb/cockroach/pkg/roachprod/config" + "github.com/cockroachdb/cockroach/pkg/roachprod/fluentbit" "github.com/cockroachdb/cockroach/pkg/roachprod/install" "github.com/cockroachdb/cockroach/pkg/roachprod/lock" "github.com/cockroachdb/cockroach/pkg/roachprod/logger" @@ -2140,6 +2141,41 @@ func JaegerURL( return urls[0], nil } +// StartFluentBit installs, configures, and starts Fluent Bit on the cluster +// identified by clusterName. 
+func StartFluentBit( + ctx context.Context, l *logger.Logger, clusterName string, config fluentbit.Config, +) error { + if config.DatadogAPIKey == "" { + return errors.New("Datadog API key cannot be empty") + } + + if err := LoadClusters(); err != nil { + return err + } + + c, err := newCluster(l, clusterName) + if err != nil { + return err + } + + return fluentbit.Install(ctx, l, c, config) +} + +// StopFluentBit stops Fluent Bit on the cluster identified by clusterName. +func StopFluentBit(ctx context.Context, l *logger.Logger, clusterName string) error { + if err := LoadClusters(); err != nil { + return err + } + + c, err := newCluster(l, clusterName) + if err != nil { + return err + } + + return fluentbit.Stop(ctx, l, c) +} + // DestroyDNS destroys the DNS records for the given cluster. func DestroyDNS(ctx context.Context, l *logger.Logger, clusterName string) error { c, err := getClusterFromCache(l, clusterName) diff --git a/scripts/drtprod b/scripts/drtprod index d993e2e9d96a..7a457f66e52a 100755 --- a/scripts/drtprod +++ b/scripts/drtprod @@ -99,10 +99,10 @@ case $1 in --- api_key: ${dd_api_key} site: ${dd_site} +hostname: \$(hostname) tags: # Datadog reserved tags. - env:development -- service:drt-cockroachdb # Custom tags. - cluster:${cluster%:*} @@ -112,13 +112,19 @@ EOF" roachprod ssh ${cluster} -- "sudo tee /etc/datadog-agent/conf.d/cockroachdb.d/conf.yaml > /dev/null << EOF --- init_config: + instances: - openmetrics_endpoint: http://localhost:26258/_status/vars tls_verify: false + service: drt-cockroachdb EOF" roachprod ssh ${cluster} -- 'sudo systemctl enable datadog-agent && sudo systemctl restart datadog-agent' + roachprod fluent-bit-start ${cluster} \ + --datadog-api-key "${dd_api_key}" \ + --datadog-service drt-cockroachdb \ + --datadog-team drt exit 0 ;; "create")