From 36435f1a27f3a3fddc8bcc761c18f6e253e6d4d6 Mon Sep 17 00:00:00 2001 From: James Craig Burley <430319+jcburley@users.noreply.github.com> Date: Thu, 25 Feb 2021 15:20:19 -0500 Subject: [PATCH] New --valid-letters option --- README.md | 18 ++++++++++++++++++ core/read.go | 24 +++++++++++++++++++++--- main.go | 32 ++++++++++++++++++++++++++++++++ 3 files changed, 71 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index bb0f527e3..697cce36d 100644 --- a/README.md +++ b/README.md @@ -139,6 +139,24 @@ test.clj:1:1: Parse warning: let form with empty body ``` The output format is as follows: `<filename>:<line>:<column>: <issue type>: <message>`, where `<issue type>` can be `Read error`, `Parse error`, `Parse warning` or `Exception`. +### Valid Symbol (and Keyword) Characters + +The `--valid-letters` option specifies valid characters for symbol and keyword "letters", which are the non-special, non-digit characters that make up symbol names. + +While Clojure 1.10 allows, for example, em dashes in symbol names (which can lead to confusion), Joker does not; by default, it allows only Unicode letters (category L). E.g. `(def é "hey")` works as expected. `--valid-letters letters` specifies this explicitly. + +Earlier versions of Joker allowed any Unicode code points beyond MaxLatin1 (255, aka `0xff`) in addition to the ASCII letters (`[A-Za-z]`). This is more consistent with Clojure, in that it allows em dashes; e.g. `(def a–b "wow")`. Specify `--valid-letters unicode` for this behavior. + +Some developers prefer "strict ASCII". Joker supports this for symbol letters (though not for digits, which may be any Unicode digits) via `--valid-letters ascii`. + +On the other hand, `--valid-letters any` allows any character (not otherwise special, such as delimiters), including ASCII control characters. + +Note that `--valid-letters` affects only how symbols are parsed; `joker.core/symbol` is able to form a symbol from any string, regardless of content. 
+ +Also note that, though most closely associated with linting, `--valid-letters` governs how symbols are parsed regardless of mode of operation (linting, running scripts, formatting, etc). + +Finally, the environment variable `JOKER_VALID_LETTERS` may be used to specify the default for `--valid-letters`. + ### Integration with editors - Emacs: [flycheck syntax checker](https://github.com/candid82/flycheck-joker) diff --git a/core/read.go b/core/read.go index bf1244f30..f349518a6 100644 --- a/core/read.go +++ b/core/read.go @@ -387,6 +387,24 @@ func readNumber(reader *Reader) Object { return scanInt(str, str, 0, reader) } +func AnyRuneIsValid(r rune) bool { + return !isDelimiter(r) +} + +func AnyUnicodeIsValid(r rune) bool { + return unicode.IsLetter(r) || r > unicode.MaxLatin1 +} + +func AnyUnicodeLetterIsValid(r rune) bool { + return unicode.IsLetter(r) +} + +func AnyASCIILetterIsValid(r rune) bool { + return r <= unicode.MaxASCII && unicode.IsLetter(r) +} + +var IsValidLetterFn = AnyUnicodeLetterIsValid + func isSymbolInitial(r rune) bool { switch r { case '*', '+', '!', '-', '_', '?', ':', '=', '<', '>', '&', '%', '$', '|': @@ -394,7 +412,7 @@ func isSymbolInitial(r rune) bool { case '.': return DIALECT != CLJS } - return unicode.IsLetter(r) || r > 255 + return IsValidLetterFn(r) } func isSymbolRune(r rune) bool { @@ -1208,8 +1226,6 @@ func Read(reader *Reader) (Object, bool) { return readSymbol(reader, r), false } return readArgSymbol(reader), false - case isSymbolInitial(r): - return readSymbol(reader, r), false case r == '"': return readString(reader), false case r == '(': @@ -1271,6 +1287,8 @@ func Read(reader *Reader) (Object, bool) { return readWithMeta(reader), false case r == '#': return readDispatch(reader) + case isSymbolInitial(r): + return readSymbol(reader, r), false case r == EOF: panic(MakeReadError(reader, "Unexpected end of file")) } diff --git a/main.go b/main.go index eccffe395..44cc0116e 100644 --- a/main.go +++ b/main.go @@ -369,6 +369,22 
@@ func dialectFromArg(arg string) Dialect { return UNKNOWN } +func setValidLetters(arg, what string) { + switch strings.ToLower(arg) { + case "ascii": + IsValidLetterFn = AnyASCIILetterIsValid + case "letters": + IsValidLetterFn = AnyUnicodeLetterIsValid + case "unicode": + IsValidLetterFn = AnyUnicodeIsValid + case "any": + IsValidLetterFn = AnyRuneIsValid + default: + fmt.Fprintf(Stderr, "Error: Unrecognized %s '%s'.\n", what, arg) + ExitJoker(18) + } +} + func usage(out io.Writer) { fmt.Fprintf(out, "Joker - %s\n\n", VERSION) fmt.Fprintln(out, "Usage: joker [args] [-- ] starts a repl") @@ -416,6 +432,12 @@ func usage(out io.Writer) { fmt.Fprintln(out, " --dialect ") fmt.Fprintln(out, " Set input dialect (\"clj\", \"cljs\", \"joker\", \"edn\") for linting;") fmt.Fprintln(out, " default is inferred from suffix, if any.") + fmt.Fprintln(out, " --valid-letters ") + fmt.Fprintln(out, " Set valid runes for non-special/non-digit characters in symbol names; is:") + fmt.Fprintln(out, " \"ascii\", denoting only [A-Za-z]") + fmt.Fprintln(out, " \"letters\" (default), any Unicode Letter (category L)") + fmt.Fprintln(out, " \"unicode\", [A-Za-z] or any Unicode code point beyond ASCII/Latin1 (255)") + fmt.Fprintln(out, " \"any\", any character (that has no other specific meaning in Joker)") fmt.Fprintln(out, " --hashmap-threshold ") fmt.Fprintln(out, " Set HASHMAP_THRESHOLD accordingly (internal magic of some sort).") fmt.Fprintln(out, " --profiler ") @@ -488,6 +510,9 @@ func parseArgs(args []string) { } else { classPath = "" } + if v, ok := os.LookupEnv("JOKER_VALID_LETTERS"); ok { + setValidLetters(v, "JOKER_VALID_LETTERS environment-variable value") + } var i int for i = 1; i < length; i++ { // shift if debugOut != nil { @@ -567,6 +592,13 @@ func parseArgs(args []string) { } else { missing = true } + case "--valid-letters": + if i < length-1 && notOption(args[i+1]) { + i += 1 // shift + setValidLetters(args[i], "--valid-letters argument") + } else { + missing = true + } 
case "--hashmap-threshold": if i < length-1 && notOption(args[i+1]) { i += 1 // shift