Skip to content

Commit

Permalink
Remove unnecessary copy while decoding and constructing string
Browse files Browse the repository at this point in the history
Signed-off-by: Bogdan Drutu <[email protected]>
  • Loading branch information
bogdandrutu committed Feb 6, 2025
1 parent 0820ea0 commit 9493b6f
Show file tree
Hide file tree
Showing 29 changed files with 255 additions and 194 deletions.
27 changes: 27 additions & 0 deletions .chloggen/rm-unncessary-copy-enhance.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
# Use this changelog template to create an entry for release notes.

# One of 'breaking', 'deprecation', 'new_component', 'enhancement', 'bug_fix'
change_type: enhancement

# The name of the component, or a single word describing the area of concern, (e.g. filelogreceiver)
component: textutil

# A brief description of the change. Surround your text with quotes ("") if it needs to start with a backtick (`).
note: Remove unnecessary copy while decoding and constructing string

# Mandatory: One or more tracking issues related to the change. You can use the PR number here if no issue exists.
issues: [37734]

# (Optional) One or more lines of additional information to render under the primary note.
# These lines will be padded with 2 spaces and then inserted directly into the document.
# Use pipe (|) for multiline entries.
subtext: This PR affects all log receivers, text extension and kafkareceiver.

# If your change doesn't affect end users or the exported elements of any package,
# you should instead start your pull request title with [chore] or use the "Skip Changelog" label.
# Optional: The change log or logs in which this entry should be included.
# e.g. '[user]' or '[user, api]'
# Include 'user' if the change is relevant to end users.
# Include 'api' if there is a change to a library API.
# Default: '[user]'
change_logs: [api]
27 changes: 27 additions & 0 deletions .chloggen/rm-unncessary-copy.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
# Use this changelog template to create an entry for release notes.

# One of 'breaking', 'deprecation', 'new_component', 'enhancement', 'bug_fix'
change_type: deprecation

# The name of the component, or a single word describing the area of concern, (e.g. filelogreceiver)
component: pkg/stanza

# A brief description of the change. Surround your text with quotes ("") if it needs to start with a backtick (`).
note: Deprecate all functions in stanza/decode

# Mandatory: One or more tracking issues related to the change. You can use the PR number here if no issue exists.
issues: [37734]

# (Optional) One or more lines of additional information to render under the primary note.
# These lines will be padded with 2 spaces and then inserted directly into the document.
# Use pipe (|) for multiline entries.
subtext:

# If your change doesn't affect end users or the exported elements of any package,
# you should instead start your pull request title with [chore] or use the "Skip Changelog" label.
# Optional: The change log or logs in which this entry should be included.
# e.g. '[user]' or '[user, api]'
# Include 'user' if the change is relevant to end users.
# Include 'api' if there is a change to a library API.
# Default: '[user]'
change_logs: [api]
4 changes: 1 addition & 3 deletions extension/encoding/textencodingextension/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,7 @@ func (c *Config) Validate() error {
return err
}
}
encCfg := textutils.NewEncodingConfig()
encCfg.Encoding = c.Encoding
_, err := encCfg.Build()
_, err := textutils.LookupEncoding(c.Encoding)
if err != nil {
return err
}
Expand Down
6 changes: 2 additions & 4 deletions extension/encoding/textencodingextension/extension.go
Original file line number Diff line number Diff line change
Expand Up @@ -32,14 +32,12 @@ func (e *textExtension) MarshalLogs(ld plog.Logs) ([]byte, error) {
}

func (e *textExtension) Start(_ context.Context, _ component.Host) error {
encCfg := textutils.NewEncodingConfig()
encCfg.Encoding = e.config.Encoding
enc, err := encCfg.Build()
enc, err := textutils.LookupEncoding(e.config.Encoding)
if err != nil {
return err
}
e.textEncoder = &textLogCodec{
enc: &enc,
decoder: enc.NewDecoder(),
}

return err
Expand Down
2 changes: 1 addition & 1 deletion extension/encoding/textencodingextension/go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ require (
go.opentelemetry.io/collector/extension/extensiontest v0.119.0
go.opentelemetry.io/collector/pdata v1.25.0
go.uber.org/goleak v1.3.0
golang.org/x/text v0.22.0
)

require (
Expand Down Expand Up @@ -43,7 +44,6 @@ require (
go.uber.org/zap v1.27.0 // indirect
golang.org/x/net v0.33.0 // indirect
golang.org/x/sys v0.29.0 // indirect
golang.org/x/text v0.22.0 // indirect
google.golang.org/genproto/googleapis/rpc v0.0.0-20241202173237-19429a94021a // indirect
google.golang.org/grpc v1.70.0 // indirect
google.golang.org/protobuf v1.36.4 // indirect
Expand Down
7 changes: 4 additions & 3 deletions extension/encoding/textencodingextension/text.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,12 +11,13 @@ import (

"go.opentelemetry.io/collector/pdata/pcommon"
"go.opentelemetry.io/collector/pdata/plog"
"golang.org/x/text/encoding"

"github.com/open-telemetry/opentelemetry-collector-contrib/internal/coreinternal/textutils"
)

type textLogCodec struct {
enc *textutils.Encoding
decoder *encoding.Decoder
marshalingSeparator string
unmarshalingSeparator *regexp.Regexp
}
Expand Down Expand Up @@ -50,11 +51,11 @@ func (r *textLogCodec) UnmarshalLogs(buf []byte) (plog.Logs, error) {
for s.Scan() {
l := p.ResourceLogs().AppendEmpty().ScopeLogs().AppendEmpty().LogRecords().AppendEmpty()
l.SetObservedTimestamp(now)
decoded, err := r.enc.Decode(s.Bytes())
decoded, err := textutils.DecodeAsString(r.decoder, s.Bytes())
if err != nil {
return p, err
}
l.Body().SetStr(string(decoded))
l.Body().SetStr(decoded)
}

return p, nil
Expand Down
18 changes: 6 additions & 12 deletions extension/encoding/textencodingextension/text_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,12 +14,10 @@ import (
)

func TestTextRoundtrip(t *testing.T) {
encCfg := textutils.NewEncodingConfig()
encCfg.Encoding = "utf8"
enc, err := encCfg.Build()
enc, err := textutils.LookupEncoding("utf8")
require.NoError(t, err)
r := regexp.MustCompile(`\r?\n`)
codec := &textLogCodec{enc: &enc, unmarshalingSeparator: r, marshalingSeparator: "\n"}
codec := &textLogCodec{decoder: enc.NewDecoder(), unmarshalingSeparator: r, marshalingSeparator: "\n"}
require.NoError(t, err)
ld, err := codec.UnmarshalLogs([]byte("foo\r\nbar\n"))
require.NoError(t, err)
Expand All @@ -30,12 +28,10 @@ func TestTextRoundtrip(t *testing.T) {
}

func TestTextRoundtripMissingNewline(t *testing.T) {
encCfg := textutils.NewEncodingConfig()
encCfg.Encoding = "utf8"
enc, err := encCfg.Build()
enc, err := textutils.LookupEncoding("utf8")
require.NoError(t, err)
r := regexp.MustCompile(`\r?\n`)
codec := &textLogCodec{enc: &enc, unmarshalingSeparator: r, marshalingSeparator: "\n"}
codec := &textLogCodec{decoder: enc.NewDecoder(), unmarshalingSeparator: r, marshalingSeparator: "\n"}
require.NoError(t, err)
ld, err := codec.UnmarshalLogs([]byte("foo\r\nbar"))
require.NoError(t, err)
Expand All @@ -46,11 +42,9 @@ func TestTextRoundtripMissingNewline(t *testing.T) {
}

func TestNoSeparator(t *testing.T) {
encCfg := textutils.NewEncodingConfig()
encCfg.Encoding = "utf8"
enc, err := encCfg.Build()
enc, err := textutils.LookupEncoding("utf8")
require.NoError(t, err)
codec := &textLogCodec{enc: &enc}
codec := &textLogCodec{decoder: enc.NewDecoder()}
require.NoError(t, err)
ld, err := codec.UnmarshalLogs([]byte("foo\r\nbar\n"))
require.NoError(t, err)
Expand Down
76 changes: 17 additions & 59 deletions internal/coreinternal/textutils/encoding.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,64 +4,15 @@
package textutils // import "github.com/open-telemetry/opentelemetry-collector-contrib/internal/coreinternal/textutils"

import (
"errors"
"fmt"
"strings"
"unsafe"

"golang.org/x/text/encoding"
"golang.org/x/text/encoding/ianaindex"
"golang.org/x/text/encoding/unicode"
"golang.org/x/text/transform"
)

// NewEncodingConfig creates a new Encoding config
func NewEncodingConfig() EncodingConfig {
return EncodingConfig{
Encoding: "utf-8",
}
}

// EncodingConfig is the configuration of a Encoding helper
type EncodingConfig struct {
Encoding string `mapstructure:"encoding,omitempty"`
}

// Build will build an Encoding operator.
func (c EncodingConfig) Build() (Encoding, error) {
enc, err := lookupEncoding(c.Encoding)
if err != nil {
return Encoding{}, err
}

return Encoding{
Encoding: enc,
decodeBuffer: make([]byte, 1<<12),
decoder: enc.NewDecoder(),
}, nil
}

type Encoding struct {
Encoding encoding.Encoding
decoder *encoding.Decoder
decodeBuffer []byte
}

// Decode converts the bytes in msgBuf to utf-8 from the configured encoding
func (e *Encoding) Decode(msgBuf []byte) ([]byte, error) {
for {
e.decoder.Reset()
nDst, _, err := e.decoder.Transform(e.decodeBuffer, msgBuf, true)
if err == nil {
return e.decodeBuffer[:nDst], nil
}
if errors.Is(err, transform.ErrShortDst) {
e.decodeBuffer = make([]byte, len(e.decodeBuffer)*2)
continue
}
return nil, fmt.Errorf("transform encoding: %w", err)
}
}

var encodingOverrides = map[string]encoding.Encoding{
"utf-16": unicode.UTF16(unicode.LittleEndian, unicode.IgnoreBOM),
"utf16": unicode.UTF16(unicode.LittleEndian, unicode.IgnoreBOM),
Expand All @@ -73,8 +24,8 @@ var encodingOverrides = map[string]encoding.Encoding{
"": unicode.UTF8,
}

func lookupEncoding(enc string) (encoding.Encoding, error) {
if e, ok := EncodingOverridesMap.Get(strings.ToLower(enc)); ok {
func LookupEncoding(enc string) (encoding.Encoding, error) {
if e, ok := encodingOverrides[strings.ToLower(enc)]; ok {
return e, nil
}
e, err := ianaindex.IANA.Encoding(enc)
Expand All @@ -88,18 +39,25 @@ func lookupEncoding(enc string) (encoding.Encoding, error) {
}

func IsNop(enc string) bool {
e, err := lookupEncoding(enc)
e, err := LookupEncoding(enc)
if err != nil {
return false
}
return e == encoding.Nop
}

var EncodingOverridesMap = encodingOverridesMap{}

type encodingOverridesMap struct{}
// DecodeAsString converts the given encoded bytes using the given decoder. It returns the converted
// bytes or nil, err if any error occurred.
func DecodeAsString(decoder *encoding.Decoder, buf []byte) (string, error) {
dstBuf, err := decoder.Bytes(buf)
if err != nil {
return "", err
}
return UnsafeBytesAsString(dstBuf), nil
}

func (e *encodingOverridesMap) Get(key string) (encoding.Encoding, bool) {
v, ok := encodingOverrides[key]
return v, ok
// UnsafeBytesAsString converts the byte array to string.
// This function must be called iff the input buffer is not going to be re-used after.
func UnsafeBytesAsString(buf []byte) string {
return unsafe.String(unsafe.SliceData(buf), len(buf))
}
43 changes: 39 additions & 4 deletions internal/coreinternal/textutils/encoding_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ import (
"golang.org/x/text/encoding/korean"
"golang.org/x/text/encoding/simplifiedchinese"
"golang.org/x/text/encoding/unicode"
"golang.org/x/text/transform"
)

func TestUTF8Encoding(t *testing.T) {
Expand Down Expand Up @@ -43,11 +44,45 @@ func TestUTF8Encoding(t *testing.T) {
}
for _, test := range tests {
t.Run(test.name, func(t *testing.T) {
encCfg := NewEncodingConfig()
encCfg.Encoding = test.encodingName
enc, err := encCfg.Build()
enc, err := LookupEncoding(test.encodingName)
assert.NoError(t, err)
assert.Equal(t, test.encoding, enc.Encoding)
assert.Equal(t, test.encoding, enc)
})
}
}

func TestDecodeAsString(t *testing.T) {
tests := []struct {
name string
decoder *encoding.Decoder
input []byte
expected string
}{
{
name: "nil",
decoder: &encoding.Decoder{Transformer: transform.Nop},
input: nil,
expected: "",
},
{
name: "empty",
decoder: &encoding.Decoder{Transformer: transform.Nop},
input: []byte{},
expected: "",
},
{
name: "empty",
decoder: &encoding.Decoder{Transformer: transform.Nop},
input: []byte("test"),
expected: "test",
},
}

for _, test := range tests {
t.Run(test.name, func(t *testing.T) {
enc, err := DecodeAsString(test.decoder, test.input)
assert.NoError(t, err)
assert.Equal(t, test.expected, enc)
})
}
}
2 changes: 1 addition & 1 deletion pkg/ottl/go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,6 @@ require (
go.uber.org/zap v1.27.0
golang.org/x/exp v0.0.0-20240506185415-9bf2ced13842
golang.org/x/net v0.34.0
golang.org/x/text v0.22.0
)

require (
Expand All @@ -50,6 +49,7 @@ require (
go.opentelemetry.io/otel/sdk/metric v1.34.0 // indirect
go.uber.org/multierr v1.11.0 // indirect
golang.org/x/sys v0.29.0 // indirect
golang.org/x/text v0.22.0 // indirect
google.golang.org/genproto/googleapis/rpc v0.0.0-20241202173237-19429a94021a // indirect
google.golang.org/grpc v1.70.0 // indirect
google.golang.org/protobuf v1.36.4 // indirect
Expand Down
21 changes: 1 addition & 20 deletions pkg/ottl/ottlfuncs/func_decode.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,8 @@ import (
"context"
"encoding/base64"
"fmt"
"strings"

"go.opentelemetry.io/collector/pdata/pcommon"
"golang.org/x/text/encoding"
"golang.org/x/text/encoding/ianaindex"

"github.com/open-telemetry/opentelemetry-collector-contrib/internal/coreinternal/textutils"
"github.com/open-telemetry/opentelemetry-collector-contrib/pkg/ottl"
Expand Down Expand Up @@ -71,7 +68,7 @@ func Decode[K any](target ottl.Getter[K], encoding string) (ottl.ExprFunc[K], er
}
return string(decodedBytes), nil
default:
e, err := getEncoding(encoding)
e, err := textutils.LookupEncoding(encoding)
if err != nil {
return nil, err
}
Expand All @@ -85,19 +82,3 @@ func Decode[K any](target ottl.Getter[K], encoding string) (ottl.ExprFunc[K], er
}
}, nil
}

func getEncoding(encoding string) (encoding.Encoding, error) {
if e, ok := textutils.EncodingOverridesMap.Get(strings.ToLower(encoding)); ok {
return e, nil
}
e, err := ianaindex.IANA.Encoding(encoding)
if err != nil {
return nil, fmt.Errorf("could not get encoding for %s: %w", encoding, err)
}
if e == nil {
// for some encodings a nil error and a nil encoding is returned, so we need to double check
// if the encoding is actually set here
return nil, fmt.Errorf("no decoder available for encoding: %s", encoding)
}
return e, nil
}
Loading

0 comments on commit 9493b6f

Please sign in to comment.