Skip to content

Commit

Permalink
Add utf8 support for string literal
Browse files Browse the repository at this point in the history
  • Loading branch information
Hugo Heuzard authored and hhugo committed Nov 1, 2024
1 parent c8156d1 commit 3740720
Show file tree
Hide file tree
Showing 5 changed files with 167 additions and 32 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,7 @@ with a length different from one.


Note:
- The OCaml source is assumed to be encoded in Latin1 (for string
- The OCaml source is assumed to be encoded in utf8 (for string
and character literals).


Expand Down
100 changes: 69 additions & 31 deletions src/syntax/ppx_sedlex.ml
Original file line number Diff line number Diff line change
Expand Up @@ -285,29 +285,52 @@ let codepoint i =
failwith (Printf.sprintf "Invalid Unicode code point: %i" i);
i

let regexp_for_char c = Sedlex.chars (Cset.singleton (Char.code c))

let regexp_for_string s =
let rec aux n =
if n = String.length s then Sedlex.eps
else Sedlex.seq (regexp_for_char s.[n]) (aux (succ n))
let fold_bytes ~f acc s =
let rec loop acc n =
if n = String.length s then acc
else (
let acc = f acc s.[n] in
loop acc (succ n))
in
aux 0
loop acc 0

let regexp_for_char c = Sedlex.chars (Cset.singleton (Char.code c))
let regexp_for_uchar c = Sedlex.chars (Cset.singleton (Uchar.to_int c))

let err loc s =
raise (Location.Error (Location.Error.createf ~loc "Sedlex: %s" s))

let fold_well_formed_utf8 ~loc ~f acc s =
Utf8.fold
~f:(fun acc _ uchar ->
match uchar with
| `Malformed _ -> err loc "Malformed utf-8 string"
| `Uchar uchar -> f acc uchar)
acc s

let regexp_for_string ~loc ~utf8 s =
let l_rev =
if utf8 then
fold_well_formed_utf8 ~loc ~f:(fun acc uchar -> uchar :: acc) [] s
else fold_bytes ~f:(fun acc c -> Uchar.of_char c :: acc) [] s
in
let rec aux = function
| [] -> Sedlex.eps
| x :: xs -> Sedlex.seq (regexp_for_uchar x) (aux xs)
in
aux (List.rev l_rev)

let rec repeat r = function
| 0, 0 -> Sedlex.eps
| 0, m -> Sedlex.alt Sedlex.eps (Sedlex.seq r (repeat r (0, m - 1)))
| n, m -> Sedlex.seq r (repeat r (n - 1, m - 1))

let regexp_of_pattern env =
let rec char_pair_op func name p tuple =
let rec char_pair_op func name ~utf8 p tuple =
(* Construct something like Sub(a,b) *)
match tuple with
| Some { ppat_desc = Ppat_tuple [p0; p1] } -> begin
match func (aux p0) (aux p1) with
match func (aux ~utf8 p0) (aux ~utf8 p1) with
| Some r -> r
| None ->
err p.ppat_loc @@ "the " ^ name
Expand All @@ -317,16 +340,20 @@ let regexp_of_pattern env =
| _ ->
err p.ppat_loc @@ "the " ^ name
^ " operator requires two arguments, like " ^ name ^ "(a,b)"
and aux p =
and aux ~utf8 p =
(* interpret one pattern node *)
match p.ppat_desc with
| Ppat_or (p1, p2) -> Sedlex.alt (aux p1) (aux p2)
| Ppat_or (p1, p2) -> Sedlex.alt (aux ~utf8 p1) (aux ~utf8 p2)
| Ppat_tuple (p :: pl) ->
List.fold_left (fun r p -> Sedlex.seq r (aux p)) (aux p) pl
List.fold_left
(fun r p -> Sedlex.seq r (aux ~utf8 p))
(aux ~utf8 p) pl
| Ppat_construct ({ txt = Lident "Star" }, Some (_, p)) ->
Sedlex.rep (aux p)
Sedlex.rep (aux ~utf8 p)
| Ppat_construct ({ txt = Lident "Plus" }, Some (_, p)) ->
Sedlex.plus (aux p)
Sedlex.plus (aux ~utf8 p)
| Ppat_construct ({ txt = Lident "Utf8" }, Some (_, p)) ->
aux ~utf8:true p
| Ppat_construct
( { txt = Lident "Rep" },
Some
Expand All @@ -346,19 +373,19 @@ let regexp_of_pattern env =
| Pconst_integer (i1, _), Pconst_integer (i2, _) ->
let i1 = int_of_string i1 in
let i2 = int_of_string i2 in
if 0 <= i1 && i1 <= i2 then repeat (aux p0) (i1, i2)
if 0 <= i1 && i1 <= i2 then repeat (aux ~utf8 p0) (i1, i2)
else err p.ppat_loc "Invalid range for Rep operator"
| _ ->
err p.ppat_loc "Rep must take an integer constant or interval"
end
| Ppat_construct ({ txt = Lident "Rep" }, _) ->
err p.ppat_loc "the Rep operator takes 2 arguments"
| Ppat_construct ({ txt = Lident "Opt" }, Some (_, p)) ->
Sedlex.alt Sedlex.eps (aux p)
Sedlex.alt Sedlex.eps (aux ~utf8 p)
| Ppat_construct ({ txt = Lident "Compl" }, arg) -> begin
match arg with
| Some (_, p0) -> begin
match Sedlex.compl (aux p0) with
match Sedlex.compl (aux ~utf8 p0) with
| Some r -> r
| None ->
err p.ppat_loc
Expand All @@ -368,26 +395,36 @@ let regexp_of_pattern env =
| _ -> err p.ppat_loc "the Compl operator requires an argument"
end
| Ppat_construct ({ txt = Lident "Sub" }, arg) ->
char_pair_op Sedlex.subtract "Sub" p
char_pair_op ~utf8 Sedlex.subtract "Sub" p
(Option.map (fun (_, arg) -> arg) arg)
| Ppat_construct ({ txt = Lident "Intersect" }, arg) ->
char_pair_op Sedlex.intersection "Intersect" p
char_pair_op ~utf8 Sedlex.intersection "Intersect" p
(Option.map (fun (_, arg) -> arg) arg)
| Ppat_construct ({ txt = Lident "Chars" }, arg) -> (
| Ppat_construct ({ txt = Lident "Chars" }, arg) ->
let const =
match arg with
| Some (_, { ppat_desc = Ppat_constant const }) -> Some const
| _ -> None
in
match const with
| Some (Pconst_string (s, _, _)) ->
let c = ref Cset.empty in
for i = 0 to String.length s - 1 do
c := Cset.union !c (Cset.singleton (Char.code s.[i]))
done;
Sedlex.chars !c
| _ ->
err p.ppat_loc "the Chars operator requires a string argument")
begin
match const with
| Some (Pconst_string (s, _, _)) ->
let chars =
if utf8 then
fold_well_formed_utf8 ~loc:p.ppat_loc
~f:(fun acc uchar ->
Cset.union acc (Cset.singleton (Uchar.to_int uchar)))
Cset.empty s
else
fold_bytes
~f:(fun acc c ->
Cset.union acc (Cset.singleton (Char.code c)))
Cset.empty s
in
Sedlex.chars chars
| _ ->
err p.ppat_loc "the Chars operator requires a string argument"
end
| Ppat_interval (i_start, i_end) -> begin
match (i_start, i_end) with
| Pconst_char c1, Pconst_char c2 ->
Expand All @@ -401,7 +438,8 @@ let regexp_of_pattern env =
end
| Ppat_constant const -> begin
match const with
| Pconst_string (s, _, _) -> regexp_for_string s
| Pconst_string (s, _, _) ->
regexp_for_string ~loc:p.ppat_loc ~utf8 s
| Pconst_char c -> regexp_for_char c
| Pconst_integer (i, _) ->
Sedlex.chars (Cset.singleton (codepoint (int_of_string i)))
Expand All @@ -414,7 +452,7 @@ let regexp_of_pattern env =
end
| _ -> err p.ppat_loc "this pattern is not a valid regexp"
in
aux
aux ~utf8:false

let previous = ref []
let regexps = ref []
Expand Down
73 changes: 73 additions & 0 deletions src/syntax/utf8.ml
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
let unsafe_byte s j = Char.code (String.unsafe_get s j)
let malformed s j l = `Malformed (String.sub s j l)

let width = function
| '\000' .. '\127' -> 1
| '\192' .. '\223' -> 2
| '\224' .. '\239' -> 3
| '\240' .. '\247' -> 4
| _ -> 0

let r_utf_8 s j l =
(* assert (0 <= j && 0 <= l && j + l <= String.length s); *)
let uchar c = `Uchar (Uchar.unsafe_of_int c) in
match l with
| 1 -> uchar (unsafe_byte s j)
| 2 ->
let b0 = unsafe_byte s j in
let b1 = unsafe_byte s (j + 1) in
if b1 lsr 6 != 0b10 then malformed s j l
else uchar (((b0 land 0x1F) lsl 6) lor (b1 land 0x3F))
| 3 ->
let b0 = unsafe_byte s j in
let b1 = unsafe_byte s (j + 1) in
let b2 = unsafe_byte s (j + 2) in
let c =
((b0 land 0x0F) lsl 12) lor ((b1 land 0x3F) lsl 6) lor (b2 land 0x3F)
in
if b2 lsr 6 != 0b10 then malformed s j l
else begin
match b0 with
| 0xE0 ->
if b1 < 0xA0 || 0xBF < b1 then malformed s j l else uchar c
| 0xED ->
if b1 < 0x80 || 0x9F < b1 then malformed s j l else uchar c
| _ -> if b1 lsr 6 != 0b10 then malformed s j l else uchar c
end
| 4 ->
let b0 = unsafe_byte s j in
let b1 = unsafe_byte s (j + 1) in
let b2 = unsafe_byte s (j + 2) in
let b3 = unsafe_byte s (j + 3) in
let c =
((b0 land 0x07) lsl 18)
lor ((b1 land 0x3F) lsl 12)
lor ((b2 land 0x3F) lsl 6)
lor (b3 land 0x3F)
in
if b3 lsr 6 != 0b10 || b2 lsr 6 != 0b10 then malformed s j l
else begin
match b0 with
| 0xF0 ->
if b1 < 0x90 || 0xBF < b1 then malformed s j l else uchar c
| 0xF4 ->
if b1 < 0x80 || 0x8F < b1 then malformed s j l else uchar c
| _ -> if b1 lsr 6 != 0b10 then malformed s j l else uchar c
end
| _ -> assert false

let fold ~f acc s =
let rec loop acc f s i last =
if i > last then acc
else (
let need = width (String.unsafe_get s i) in
if need = 0 then loop (f acc i (malformed s i 1)) f s (i + 1) last
else (
let rem = last - i + 1 in
if rem < need then f acc i (malformed s i rem)
else loop (f acc i (r_utf_8 s i need)) f s (i + need) last))
in
let pos = 0 in
let len = String.length s in
let last = pos + len - 1 in
loop acc f s pos last
5 changes: 5 additions & 0 deletions src/syntax/utf8.mli
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
val fold :
f:('a -> int -> [> `Malformed of string | `Uchar of Uchar.t ] -> 'a) ->
'a ->
string ->
'a
19 changes: 19 additions & 0 deletions test/utf8.ml
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
open Printf

let next_tok buf =
let open Sedlexing.Utf8 in
match%sedlex buf with
| "a", Utf8 (Chars "+-×÷") -> sprintf "with Chars: %s" (lexeme buf)
| "b", Utf8 ("+" | "-" | "×" | "÷") ->
sprintf "with or_pattern: %s" (lexeme buf)
| _ -> failwith (sprintf "Unexpected character: %s" (lexeme buf))

let%expect_test _ =
Sedlexing.Utf8.from_string "a+" |> next_tok |> print_string;
[%expect {| with Chars: a+ |}];
Sedlexing.Utf8.from_string "" |> next_tok |> print_string;
[%expect {| with Chars: a÷ |}];
Sedlexing.Utf8.from_string "b+" |> next_tok |> print_string;
[%expect {| with or_pattern: b+ |}];
Sedlexing.Utf8.from_string "" |> next_tok |> print_string;
[%expect {| with or_pattern: b÷ |}]

0 comments on commit 3740720

Please sign in to comment.