Skip to content

Commit

Permalink
scripts/drtprod: send logs to datadog
Browse files Browse the repository at this point in the history
Previously, clusters created by `roachprod` logged exclusively to disk,
requiring operators to either SSH into the instance or use `roachprod
logs` to view logs for a CockroachDB node.

This patch adds a new `roachprod fluent-bit-start` command that, when
run, installs and starts Fluent Bit on the CockroachDB cluster listening
on `127.0.0.1:5170`. The CockroachDB logging configuration has also been
updated to log to this Fluent Bit endpoint, choosing not to error if the
endpoint is unavailable. Clusters still log to disk so as not to break
existing workflows. The `drtprod` script was also updated to install and
configure Fluent Bit on the DRT clusters. A complementary `roachprod
fluent-bit-stop` command was also added to stop Fluent Bit.

Epic: none

Release note: none
  • Loading branch information
sudomateo committed Apr 30, 2024
1 parent 4e5656a commit 22068bf
Show file tree
Hide file tree
Showing 15 changed files with 411 additions and 3 deletions.
1 change: 1 addition & 0 deletions pkg/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -1555,6 +1555,7 @@ GO_TARGETS = [
"//pkg/roachprod/config:config",
"//pkg/roachprod/config:config_test",
"//pkg/roachprod/errors:errors",
"//pkg/roachprod/fluentbit:fluentbit",
"//pkg/roachprod/install:install",
"//pkg/roachprod/install:install_test",
"//pkg/roachprod/lock:lock",
Expand Down
1 change: 1 addition & 0 deletions pkg/cmd/roachprod/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ go_library(
"//pkg/roachprod",
"//pkg/roachprod/config",
"//pkg/roachprod/errors",
"//pkg/roachprod/fluentbit",
"//pkg/roachprod/install",
"//pkg/roachprod/ssh",
"//pkg/roachprod/ui",
Expand Down
15 changes: 15 additions & 0 deletions pkg/cmd/roachprod/flags.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ import (

"github.com/cockroachdb/cockroach/pkg/roachprod"
"github.com/cockroachdb/cockroach/pkg/roachprod/config"
"github.com/cockroachdb/cockroach/pkg/roachprod/fluentbit"
"github.com/cockroachdb/cockroach/pkg/roachprod/install"
"github.com/cockroachdb/cockroach/pkg/roachprod/ssh"
"github.com/cockroachdb/cockroach/pkg/roachprod/vm"
Expand Down Expand Up @@ -96,6 +97,8 @@ var (
}

sshKeyUser string

fluentBitConfig fluentbit.Config
)

func initFlags() {
Expand Down Expand Up @@ -283,6 +286,18 @@ func initFlags() {
grafanaDumpCmd.Flags().StringVar(&grafanaDumpDir, "dump-dir", "",
"the absolute path to dump prometheus data to (use the contained 'prometheus-docker-run.sh' to visualize")

fluentBitStartCmd.Flags().StringVar(&fluentBitConfig.DatadogSite, "datadog-site", "us5.datadoghq.com",
"Datadog site to send telemetry data to")

fluentBitStartCmd.Flags().StringVar(&fluentBitConfig.DatadogAPIKey, "datadog-api-key", "",
"Datadog API key")

fluentBitStartCmd.Flags().StringVar(&fluentBitConfig.DatadogService, "datadog-service", "cockroachdb",
"Datadog service name for emitted logs")

fluentBitStartCmd.Flags().StringVar(&fluentBitConfig.DatadogTeam, "datadog-team", "",
"Datadog team to tag emitted logs")

sshKeysAddCmd.Flags().StringVar(&sshKeyUser, "user", config.OSUser.Username,
"the user to be associated with the new key",
)
Expand Down
22 changes: 22 additions & 0 deletions pkg/cmd/roachprod/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -1670,6 +1670,26 @@ var _ = func() struct{} {
return struct{}{}
}()

// fluentBitStartCmd is the `roachprod fluent-bit-start <cluster>` command. It
// takes exactly one argument, the cluster name, and hands it to
// roachprod.StartFluentBit together with the flag-populated fluentBitConfig.
var fluentBitStartCmd = &cobra.Command{
	Use:   "fluent-bit-start <cluster>",
	Args:  cobra.ExactArgs(1),
	Short: "Install and start Fluent Bit",
	Long:  "Install and start Fluent Bit",
	Run: wrap(func(_ *cobra.Command, args []string) error {
		clusterName := args[0]
		return roachprod.StartFluentBit(context.Background(), config.Logger, clusterName, fluentBitConfig)
	}),
}

// fluentBitStopCmd is the `roachprod fluent-bit-stop <cluster>` command. It
// takes exactly one argument, the cluster name, and hands it to
// roachprod.StopFluentBit.
var fluentBitStopCmd = &cobra.Command{
	Use:   "fluent-bit-stop <cluster>",
	Args:  cobra.ExactArgs(1),
	Short: "Stop Fluent Bit",
	Long:  "Stop Fluent Bit",
	Run: wrap(func(_ *cobra.Command, args []string) error {
		clusterName := args[0]
		return roachprod.StopFluentBit(context.Background(), config.Logger, clusterName)
	}),
}

func main() {
_ = roachprod.InitProviders()
providerOptsContainer = vm.CreateProviderOptionsContainer()
Expand Down Expand Up @@ -1728,6 +1748,8 @@ func main() {
jaegerStartCmd,
jaegerStopCmd,
jaegerURLCmd,
fluentBitStartCmd,
fluentBitStopCmd,
)
setBashCompletionFunction()

Expand Down
1 change: 1 addition & 0 deletions pkg/roachprod/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ go_library(
"//pkg/cmd/roachprod/grafana",
"//pkg/roachprod/cloud",
"//pkg/roachprod/config",
"//pkg/roachprod/fluentbit",
"//pkg/roachprod/install",
"//pkg/roachprod/lock",
"//pkg/roachprod/logger",
Expand Down
18 changes: 18 additions & 0 deletions pkg/roachprod/fluentbit/BUILD.bazel
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# Go library for the roachprod fluentbit package. The systemd unit and the
# Fluent Bit configuration template under files/ are embedded into the library.
load("@io_bazel_rules_go//go:def.bzl", "go_library")

go_library(
name = "fluentbit",
srcs = ["fluentbit.go"],
embedsrcs = [
"files/fluent-bit.service",
"files/fluent-bit.yaml.tmpl",
],
importpath = "github.com/cockroachdb/cockroach/pkg/roachprod/fluentbit",
visibility = ["//visibility:public"],
deps = [
"//pkg/roachprod/install",
"//pkg/roachprod/logger",
"//pkg/roachprod/vm",
"@com_github_cockroachdb_errors//:errors",
],
)
15 changes: 15 additions & 0 deletions pkg/roachprod/fluentbit/files/fluent-bit.service
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# systemd unit that runs Fluent Bit as a long-lived service.
[Unit]
Description=Fluent Bit
Documentation=https://docs.fluentbit.io/manual/
Requires=network.target
After=network.target

[Service]
Type=simple
# The leading "-" tells systemd to ignore an environment file that is missing.
EnvironmentFile=-/etc/sysconfig/fluent-bit
EnvironmentFile=-/etc/default/fluent-bit
# Start Fluent Bit with the YAML configuration at /etc/fluent-bit/fluent-bit.yaml.
ExecStart=/opt/fluent-bit/bin/fluent-bit -c /etc/fluent-bit/fluent-bit.yaml
Restart=always

[Install]
WantedBy=multi-user.target
37 changes: 37 additions & 0 deletions pkg/roachprod/fluentbit/files/fluent-bit.yaml.tmpl
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
# Fluent Bit configuration template. The DatadogSite, DatadogAPIKey,
# DatadogService and Tags placeholders below are filled in when the template is
# rendered.
---
service:
flush: 1
daemon: off
# Built-in HTTP server, bound to loopback only.
http_server: on
http_listen: 127.0.0.1
http_port: 2020
log_level: info
storage.path: /tmp
storage.metrics: on
storage.max_chunks_up: 30
storage.sync: full
storage.checksum: on
storage.delete_irrecoverable_chunks: on
parsers_file: parsers.conf
plugins_file: plugins.conf
pipeline:
inputs:
# TCP input on 127.0.0.1:5170 that expects JSON records and tags them
# "cockroachdb".
- name: tcp
tag: cockroachdb
listen: 127.0.0.1
port: 5170
format: json
storage.type: filesystem
alias: cockroachdb
outputs:
# Ship records tagged "cockroachdb" to the Datadog log intake host for the
# configured site, over TLS with gzip compression.
- name: datadog
match: cockroachdb
host: http-intake.logs.{{ .DatadogSite }}
tls: on
compress: gzip
apikey: {{ .DatadogAPIKey }}
dd_source: cockroachdb
dd_service: {{ .DatadogService }}
# Tags are emitted as a single comma-separated list.
dd_tags: {{ join .Tags `,` }}
alias: cockroachdb
storage.total_limit_size: 25MB
142 changes: 142 additions & 0 deletions pkg/roachprod/fluentbit/fluentbit.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,142 @@
// Copyright 2024 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.

package fluentbit

import (
"bytes"
"context"
_ "embed"
"fmt"
"strings"
"text/template"

"github.com/cockroachdb/cockroach/pkg/roachprod/install"
"github.com/cockroachdb/cockroach/pkg/roachprod/logger"
"github.com/cockroachdb/cockroach/pkg/roachprod/vm"
"github.com/cockroachdb/errors"
)

//go:embed files/fluent-bit.yaml.tmpl
var fluentBitTemplate string

//go:embed files/fluent-bit.service
var fluentBitSystemdUnit string

// Config carries the user-supplied settings required to configure and run
// Fluent Bit on a CockroachDB cluster. All fields relate to the Datadog
// destination that logs are shipped to.
type Config struct {
	// DatadogSite is the Datadog site that receives the telemetry data
	// (e.g., us5.datadoghq.com).
	DatadogSite string

	// DatadogAPIKey is the API key used to authenticate to Datadog.
	DatadogAPIKey string

	// DatadogService is the service name attached to emitted logs.
	DatadogService string

	// DatadogTeam is the team used to tag emitted logs. It is optional; an
	// empty value means no team tag is added.
	DatadogTeam string
}

// Install installs, configures, and starts Fluent Bit on the given CockroachDB
// cluster c.
//
// The work is dispatched through c.Parallel over c.Nodes. For each node it:
//  1. installs the "fluent-bit" tool via install.InstallTool;
//  2. renders the Fluent Bit configuration from config plus a list of Datadog
//     tags (env, host, cluster and, when config.DatadogTeam is non-empty,
//     team);
//  3. uploads the rendered configuration and the embedded systemd unit to
//     /tmp on the node; and
//  4. copies both files into place, then enables and restarts the fluent-bit
//     systemd service.
//
// A failure in any step is recorded on the node's RunResultDetails and
// returned; the error from c.Parallel is wrapped and returned to the caller.
func Install(ctx context.Context, l *logger.Logger, c *install.SyncedCluster, config Config) error {
	// The script runs under `set -e` with one command per line so that the
	// first failing command aborts the script with a non-zero status. Without
	// it, the script's exit status is that of its last line only, so a failed
	// copy would be masked by a successful restart. Commands are deliberately
	// not chained with `&&`, because `set -e` does not abort on a failure of a
	// non-final command in an `&&` list.
	const activateScript = `
set -e
sudo cp /tmp/fluent-bit.yaml /etc/fluent-bit/fluent-bit.yaml
rm /tmp/fluent-bit.yaml
sudo cp /tmp/fluent-bit.service /etc/systemd/system/fluent-bit.service
rm /tmp/fluent-bit.service
sudo systemctl daemon-reload
sudo systemctl enable fluent-bit
sudo systemctl restart fluent-bit
`

	installOnNode := func(ctx context.Context, node install.Node) (*install.RunResultDetails, error) {
		res := &install.RunResultDetails{Node: node}
		// fail stores err on the per-node result and returns both, matching
		// the (result, error) shape expected by c.Parallel.
		fail := func(err error) (*install.RunResultDetails, error) {
			res.Err = err
			return res, err
		}
		target := install.Nodes{node}

		if err := install.InstallTool(ctx, l, c, target, "fluent-bit", l.Stdout, l.Stderr); err != nil {
			return fail(errors.Wrap(err, "failed installing fluent bit"))
		}

		// The host tag is node-specific, so the configuration is rendered
		// once per node.
		tags := []string{
			"env:development",
			fmt.Sprintf("host:%s", vm.Name(c.Name, int(node))),
			fmt.Sprintf("cluster:%s", c.Name),
		}
		if config.DatadogTeam != "" {
			tags = append(tags, fmt.Sprintf("team:%s", config.DatadogTeam))
		}

		fluentBitConfig, err := executeTemplate(templateData{
			DatadogSite:    config.DatadogSite,
			DatadogAPIKey:  config.DatadogAPIKey,
			DatadogService: config.DatadogService,
			Tags:           tags,
		})
		if err != nil {
			return fail(errors.Wrapf(err, "failed rendering fluent bit configuration for node %d", node))
		}

		// Files are staged in /tmp and copied into their final locations by
		// the activation script using sudo.
		if err := c.PutString(ctx, l, target, fluentBitConfig, "/tmp/fluent-bit.yaml", 0644); err != nil {
			return fail(errors.Wrapf(err, "failed writing fluent bit configuration to node %d", node))
		}
		if err := c.PutString(ctx, l, target, fluentBitSystemdUnit, "/tmp/fluent-bit.service", 0644); err != nil {
			return fail(errors.Wrap(err, "failed writing fluent bit systemd unit file"))
		}

		if err := c.Run(ctx, l, l.Stdout, l.Stderr, install.WithNodes(target), "fluent-bit", activateScript); err != nil {
			return fail(errors.Wrap(err, "failed enabling and starting fluent bit service"))
		}

		return res, nil
	}

	if err := c.Parallel(ctx, l, install.WithNodes(c.Nodes), installOnNode); err != nil {
		return errors.Wrap(err, "failed starting fluent bit")
	}

	return nil
}

// Stop disables and then stops the fluent-bit systemd service on the nodes of
// the given CockroachDB cluster c. Any error from running the command is
// wrapped and returned.
func Stop(ctx context.Context, l *logger.Logger, c *install.SyncedCluster) error {
	const stopScript = `
sudo systemctl disable fluent-bit && sudo systemctl stop fluent-bit
`
	// NOTE(review): install.AlwaysTrue presumably makes every failure eligible
	// for retry — confirm against the install package.
	runOpts := install.WithNodes(c.Nodes).WithShouldRetryFn(install.AlwaysTrue)

	err := c.Run(ctx, l, l.Stdout, l.Stderr, runOpts, "fluent-bit-stop", stopScript)
	if err != nil {
		return errors.Wrap(err, "failed stopping fluent bit")
	}
	return nil
}

// templateData holds the values substituted into the Fluent Bit configuration
// template.
type templateData struct {
	// DatadogSite is the Datadog site used to build the log intake host.
	DatadogSite string
	// DatadogAPIKey is the API key placed in the Datadog output.
	DatadogAPIKey string
	// DatadogService is the service name placed in the Datadog output.
	DatadogService string
	// Tags are the Datadog tags; the template joins them with commas.
	Tags []string
}

// executeTemplate renders the embedded Fluent Bit configuration template with
// data and returns the result as a string. The template is given a "join"
// function backed by strings.Join. A parse or execution error is returned
// unchanged along with an empty string.
func executeTemplate(data templateData) (string, error) {
	helpers := template.FuncMap{"join": strings.Join}

	parsed, err := template.New("fluent-bit-config").Funcs(helpers).Parse(fluentBitTemplate)
	if err != nil {
		return "", err
	}

	rendered := new(bytes.Buffer)
	if err = parsed.Execute(rendered, data); err != nil {
		return "", err
	}
	return rendered.String(), nil
}
1 change: 1 addition & 0 deletions pkg/roachprod/install/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ go_library(
"staging.go",
],
embedsrcs = [
"files/cockroachdb-logging.yaml",
"scripts/download.sh",
"scripts/start.sh",
"scripts/open_ports.sh",
Expand Down
38 changes: 36 additions & 2 deletions pkg/roachprod/install/cockroach.go
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,9 @@ import (
//go:embed scripts/start.sh
var startScript string

//go:embed files/cockroachdb-logging.yaml
var loggingConfig string

// sharedProcessVirtualClusterNode is a constant node that is used
// whenever we register a service descriptor for a shared-process
// virtual cluster. Since these virtual clusters use the system
Expand Down Expand Up @@ -776,6 +779,10 @@ type startTemplateData struct {
EnvVars []string
}

// loggingTemplateData holds the values substituted into the embedded
// CockroachDB logging configuration template.
type loggingTemplateData struct {
	// LogDir is the log directory made available to the template.
	LogDir string
}

// VirtualClusterLabel is the value used to "label" virtual cluster
// (cockroach) processes running locally or in a VM. This is used by
// roachprod to monitor identify such processes and monitor them.
Expand Down Expand Up @@ -842,6 +849,20 @@ func execStartTemplate(data startTemplateData) (string, error) {
return buf.String(), nil
}

// execLoggingTemplate renders the embedded logging configuration template
// with data and returns the result as a string. The template uses "#{" and
// "#}" as its action delimiters instead of the default ones. A parse or
// execution error is returned unchanged along with an empty string.
func execLoggingTemplate(data loggingTemplateData) (string, error) {
	parsed, err := template.New("loggingConfig").Delims("#{", "#}").Parse(loggingConfig)
	if err != nil {
		return "", err
	}

	rendered := new(strings.Builder)
	if err = parsed.Execute(rendered, data); err != nil {
		return "", err
	}
	return rendered.String(), nil
}

// generateStartArgs generates cockroach binary arguments for starting a node.
// The first argument is the command (e.g. "start").
func (c *SyncedCluster) generateStartArgs(
Expand Down Expand Up @@ -882,8 +903,21 @@ func (c *SyncedCluster) generateStartArgs(

// if neither --log nor --log-config-file are present
if idx1 == -1 && idx2 == -1 {
// Specify exit-on-error=false to work around #62763.
args = append(args, "--log", `file-defaults: {dir: '`+logDir+`', exit-on-error: false}`)
loggingConfig, err := execLoggingTemplate(loggingTemplateData{
LogDir: logDir,
})
if err != nil {
return nil, errors.Wrap(err, "failed rendering logging template")
}

loggingConfigFile := fmt.Sprintf("cockroachdb-logging%s.yaml",
virtualClusterDirSuffix(startOpts.VirtualClusterName, startOpts.SQLInstance))

if err := c.PutString(ctx, l, c.Nodes, loggingConfig, loggingConfigFile, 0644); err != nil {
return nil, errors.Wrap(err, "failed writing remote logging configuration: %w")
}

args = append(args, "--log-config-file", loggingConfigFile)
}

listenHost := ""
Expand Down
Loading

0 comments on commit 22068bf

Please sign in to comment.