From 576c56499311dd3e56452399739556e89c8385f6 Mon Sep 17 00:00:00 2001 From: Roger Peppe Date: Tue, 12 Dec 2023 18:21:52 +0000 Subject: [PATCH] internal/mod/module: add support for escaping MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We want to make sure that module paths can be stored in arbitrary case-insensitive filesystems without risk of clashes. The logic here has been adapted from the Go equivalent. Signed-off-by: Roger Peppe Change-Id: I58a5c082f010a087136cf9535796d6b8226f81f7 Reviewed-on: https://review-eu.gerrithub.io/c/cue-lang/cue/+/1173532 TryBot-Result: CUEcueckoo Reviewed-by: Daniel Martí Unity-Result: CUE porcuepine --- internal/mod/module/escape.go | 68 ++++++++++++++++++++++++++++ internal/mod/module/module.go | 73 ++++++++++++++++++++++++++++++ internal/mod/module/module_test.go | 46 +++++++++++++++++++ 3 files changed, 187 insertions(+) create mode 100644 internal/mod/module/escape.go diff --git a/internal/mod/module/escape.go b/internal/mod/module/escape.go new file mode 100644 index 00000000000..7c0c44201f7 --- /dev/null +++ b/internal/mod/module/escape.go @@ -0,0 +1,68 @@ +package module + +import ( + "fmt" + "strings" + "unicode/utf8" + + "cuelang.org/go/internal/mod/semver" +) + +// EscapePath returns the escaped form of the given module path +// (without the major version suffix). +// It fails if the module path is invalid. +func EscapePath(path string) (escaped string, err error) { + if err := CheckPathWithoutVersion(path); err != nil { + return "", err + } + // Technically there's no need to escape capital letters because CheckPath + // doesn't allow them, but let's be defensive. + return escapeString(path) +} + +// EscapeVersion returns the escaped form of the given module version. +// Versions must be in (possibly non-canonical) semver form and must be valid file names +// and not contain exclamation marks. 
+func EscapeVersion(v string) (escaped string, err error) { + if !semver.IsValid(v) { + return "", &InvalidVersionError{ + Version: v, + Err: fmt.Errorf("version is not in semver syntax"), + } + } + if err := checkElem(v, filePath); err != nil || strings.Contains(v, "!") { + return "", &InvalidVersionError{ + Version: v, + Err: fmt.Errorf("disallowed version string"), + } + } + return escapeString(v) +} + +func escapeString(s string) (escaped string, err error) { + haveUpper := false + for _, r := range s { + if r == '!' || r >= utf8.RuneSelf { + // This should be disallowed by CheckPath, but diagnose anyway. + // The correctness of the escaping loop below depends on it. + return "", fmt.Errorf("internal error: inconsistency in EscapePath") + } + if 'A' <= r && r <= 'Z' { + haveUpper = true + } + } + + if !haveUpper { + return s, nil + } + + var buf []byte + for _, r := range s { + if 'A' <= r && r <= 'Z' { + buf = append(buf, '!', byte(r+'a'-'A')) + } else { + buf = append(buf, byte(r)) + } + } + return string(buf), nil +} diff --git a/internal/mod/module/module.go b/internal/mod/module/module.go index 8be25893ed1..9fb478bf58c 100644 --- a/internal/mod/module/module.go +++ b/internal/mod/module/module.go @@ -14,6 +14,68 @@ // There are no restrictions imposed directly by use of this structure, // but additional checking functions, most notably Check, verify that // a particular path, version pair is valid. +// +// # Escaped Paths +// +// Module versions appear as substrings of file system paths (as stored by +// the modcache package). +// In general we cannot rely on file systems to be case-sensitive. Although +// module paths cannot currently contain upper case characters because +// OCI registries forbid that, versions can. That +// is, we cannot rely on the file system to keep foo.com/v@v1.0.0-PRE and +// foo.com/v@v1.0.0-pre separate. Windows and macOS don't. Instead, we must +// never require two different casings of a file path. 
+// +// One possibility would be to make the escaped form be the lowercase +// hexadecimal encoding of the actual path bytes. This would avoid ever +// needing different casings of a file path, but it would be fairly illegible +// to most programmers when those paths appeared in the file system +// (including in file paths in compiler errors and stack traces) +// in web server logs, and so on. Instead, we want a safe escaped form that +// leaves most paths unaltered. +// +// The safe escaped form is to replace every uppercase letter +// with an exclamation mark followed by the letter's lowercase equivalent. +// +// For example, +// +// foo.com/v@v1.0.0-PRE -> foo.com/v@v1.0.0-!p!r!e +// +// Versions that avoid upper-case letters are left unchanged. +// Note that because import paths are ASCII-only and avoid various +// problematic punctuation (like : < and >), the escaped form is also ASCII-only +// and avoids the same problematic punctuation. +// +// Neither versions nor module paths allow exclamation marks, so there is no +// need to define how to escape a literal !. +// +// # Unicode Restrictions +// +// Today, paths are disallowed from using Unicode. +// +// Although paths are currently disallowed from using Unicode, +// we would like at some point to allow Unicode letters as well, to assume that +// file systems and URLs are Unicode-safe (storing UTF-8), and apply +// the !-for-uppercase convention for escaping them in the file system. +// But there are at least two subtle considerations. +// +// First, note that not all case-fold equivalent distinct runes +// form an upper/lower pair. +// For example, U+004B ('K'), U+006B ('k'), and U+212A ('K' for Kelvin) +// are three distinct runes that case-fold to each other. +// When we do add Unicode letters, we must not assume that upper/lower +// are the only case-equivalent pairs. +// Perhaps the Kelvin symbol would be disallowed entirely, for example. +// Or perhaps it would escape as "!!k", or perhaps as "(212A)". 
+// +// Second, it would be nice to allow Unicode marks as well as letters, +// but marks include combining marks, and then we must deal not +// only with case folding but also normalization: both U+00E9 ('é') +// and U+0065 U+0301 ('e' followed by combining acute accent) +// look the same on the page and are treated by some file systems +// as the same path. If we do allow Unicode marks in paths, there +// must be some kind of normalization to allow only one canonical +// encoding of any character used in an import path. package module // IMPORTANT NOTE @@ -50,10 +112,12 @@ func (m Version) Path() string { return m.path } +// Equal reports whether m is equal to m1. func (m Version) Equal(m1 Version) bool { return m.path == m1.path && m.version == m1.version } +// BasePath returns the path part of m without its major version suffix. func (m Version) BasePath() string { basePath, _, ok := SplitPathVersion(m.path) if !ok { @@ -62,14 +126,23 @@ func (m Version) BasePath() string { return basePath } +// Version returns the version part of m. This is either +// a canonical semver version or "none" or the empty string. func (m Version) Version() string { return m.version } +// IsValid reports whether m is non-zero. func (m Version) IsValid() bool { return m.path != "" } +// IsCanonical reports whether m is valid and has a canonical +// semver version. +func (m Version) IsCanonical() bool { + return m.IsValid() && m.version != "" && m.version != "none" +} + // String returns the string form of the Version: // (Path@Version, or just Path if Version is empty). 
func (m Version) String() string { diff --git a/internal/mod/module/module_test.go b/internal/mod/module/module_test.go index b161cb8d201..7a5c9025e0e 100644 --- a/internal/mod/module/module_test.go +++ b/internal/mod/module/module_test.go @@ -151,3 +151,49 @@ func TestParseVersion(t *testing.T) { }) } } + +var escapeVersionTests = []struct { + v string + esc string // empty means same as v +}{ + {v: "v1.2.3-alpha"}, + {v: "v3"}, + {v: "v2.3.1-ABcD", esc: "v2.3.1-!a!bc!d"}, +} + +func TestEscapeVersion(t *testing.T) { + for _, tt := range escapeVersionTests { + esc, err := EscapeVersion(tt.v) + if err != nil { + t.Errorf("EscapeVersion(%q): unexpected error: %v", tt.v, err) + continue + } + want := tt.esc + if want == "" { + want = tt.v + } + if esc != want { + t.Errorf("EscapeVersion(%q) = %q, want %q", tt.v, esc, want) + } + } +} + +func TestEscapePath(t *testing.T) { + // Check invalid paths. + for _, tt := range checkPathWithoutVersionTests { + if tt.wantErr != "" { + _, err := EscapePath(tt.path) + if err == nil { + t.Errorf("EscapePath(%q): succeeded, want error (invalid path)", tt.path) + } + } + } + path := "foo.com/bar" + esc, err := EscapePath(path) + if err != nil { + t.Fatal(err) + } + if esc != path { + t.Fatalf("EscapePath(%q) = %q, want %q", path, esc, path) + } +}