Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

POC: Add utf8 support for string literal #127

Draft
wants to merge 3 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,7 @@ with a length different from one.


Note:
- The OCaml source is assumed to be encoded in Latin1 (for string
- The OCaml source is assumed to be encoded in utf8 (for string
and character literals).


Expand Down
99 changes: 69 additions & 30 deletions src/syntax/ppx_sedlex.ml
Original file line number Diff line number Diff line change
Expand Up @@ -285,14 +285,35 @@ let codepoint i =
failwith (Printf.sprintf "Invalid Unicode code point: %i" i);
i

let regexp_for_char c = Sedlex.chars (Cset.singleton (Char.code c))
let fold_bytes ~f acc s =
let rec loop acc n =
if n = String.length s then acc
else (
let acc = f acc s.[n] in
loop acc (succ n))
in
loop acc 0

let regexp_for_string s =
let rec aux n =
if n = String.length s then Sedlex.eps
else Sedlex.seq (regexp_for_char s.[n]) (aux (succ n))
let regexp_for_char c = Sedlex.chars (Cset.singleton (Char.code c))
let regexp_for_uchar c = Sedlex.chars (Cset.singleton (Uchar.to_int c))

let regexp_for_string ~utf8 s =
let l =
if utf8 then
List.rev
(Utf8.fold
~f:(fun acc _ uchar ->
match uchar with
| `Malformed _ -> assert false
| `Uchar uchar -> uchar :: acc)
[] s)
else List.rev (fold_bytes ~f:(fun acc c -> Uchar.of_char c :: acc) [] s)
in
let rec aux = function
| [] -> Sedlex.eps
| x :: xs -> Sedlex.seq (regexp_for_uchar x) (aux xs)
in
aux 0
aux l

let err loc s =
raise (Location.Error (Location.Error.createf ~loc "Sedlex: %s" s))
Expand All @@ -303,11 +324,11 @@ let rec repeat r = function
| n, m -> Sedlex.seq r (repeat r (n - 1, m - 1))

let regexp_of_pattern env =
let rec char_pair_op func name p tuple =
let rec char_pair_op func name ~utf8 p tuple =
(* Construct something like Sub(a,b) *)
match tuple with
| Some { ppat_desc = Ppat_tuple [p0; p1] } -> begin
match func (aux p0) (aux p1) with
match func (aux ~utf8 p0) (aux ~utf8 p1) with
| Some r -> r
| None ->
err p.ppat_loc @@ "the " ^ name
Expand All @@ -317,16 +338,20 @@ let regexp_of_pattern env =
| _ ->
err p.ppat_loc @@ "the " ^ name
^ " operator requires two arguments, like " ^ name ^ "(a,b)"
and aux p =
and aux ~utf8 p =
(* interpret one pattern node *)
match p.ppat_desc with
| Ppat_or (p1, p2) -> Sedlex.alt (aux p1) (aux p2)
| Ppat_or (p1, p2) -> Sedlex.alt (aux ~utf8 p1) (aux ~utf8 p2)
| Ppat_tuple (p :: pl) ->
List.fold_left (fun r p -> Sedlex.seq r (aux p)) (aux p) pl
List.fold_left
(fun r p -> Sedlex.seq r (aux ~utf8 p))
(aux ~utf8 p) pl
| Ppat_construct ({ txt = Lident "Star" }, Some (_, p)) ->
Sedlex.rep (aux p)
Sedlex.rep (aux ~utf8 p)
| Ppat_construct ({ txt = Lident "Plus" }, Some (_, p)) ->
Sedlex.plus (aux p)
Sedlex.plus (aux ~utf8 p)
| Ppat_construct ({ txt = Lident "Utf8" }, Some (_, p)) ->
aux ~utf8:true p
| Ppat_construct
( { txt = Lident "Rep" },
Some
Expand All @@ -346,19 +371,19 @@ let regexp_of_pattern env =
| Pconst_integer (i1, _), Pconst_integer (i2, _) ->
let i1 = int_of_string i1 in
let i2 = int_of_string i2 in
if 0 <= i1 && i1 <= i2 then repeat (aux p0) (i1, i2)
if 0 <= i1 && i1 <= i2 then repeat (aux ~utf8 p0) (i1, i2)
else err p.ppat_loc "Invalid range for Rep operator"
| _ ->
err p.ppat_loc "Rep must take an integer constant or interval"
end
| Ppat_construct ({ txt = Lident "Rep" }, _) ->
err p.ppat_loc "the Rep operator takes 2 arguments"
| Ppat_construct ({ txt = Lident "Opt" }, Some (_, p)) ->
Sedlex.alt Sedlex.eps (aux p)
Sedlex.alt Sedlex.eps (aux ~utf8 p)
| Ppat_construct ({ txt = Lident "Compl" }, arg) -> begin
match arg with
| Some (_, p0) -> begin
match Sedlex.compl (aux p0) with
match Sedlex.compl (aux ~utf8 p0) with
| Some r -> r
| None ->
err p.ppat_loc
Expand All @@ -368,26 +393,40 @@ let regexp_of_pattern env =
| _ -> err p.ppat_loc "the Compl operator requires an argument"
end
| Ppat_construct ({ txt = Lident "Sub" }, arg) ->
char_pair_op Sedlex.subtract "Sub" p
char_pair_op ~utf8 Sedlex.subtract "Sub" p
(Option.map (fun (_, arg) -> arg) arg)
| Ppat_construct ({ txt = Lident "Intersect" }, arg) ->
char_pair_op Sedlex.intersection "Intersect" p
char_pair_op ~utf8 Sedlex.intersection "Intersect" p
(Option.map (fun (_, arg) -> arg) arg)
| Ppat_construct ({ txt = Lident "Chars" }, arg) -> (
| Ppat_construct ({ txt = Lident "Chars" }, arg) ->
let const =
match arg with
| Some (_, { ppat_desc = Ppat_constant const }) -> Some const
| _ -> None
in
match const with
| Some (Pconst_string (s, _, _)) ->
let c = ref Cset.empty in
for i = 0 to String.length s - 1 do
c := Cset.union !c (Cset.singleton (Char.code s.[i]))
done;
Sedlex.chars !c
| _ ->
err p.ppat_loc "the Chars operator requires a string argument")
begin
match const with
| Some (Pconst_string (s, _, _)) ->
let chars =
if utf8 then
Utf8.fold
~f:(fun acc _ uchar ->
match uchar with
| `Malformed _ -> assert false
| `Uchar uchar ->
Cset.union acc
(Cset.singleton (Uchar.to_int uchar)))
Cset.empty s
else
fold_bytes
~f:(fun acc c ->
Cset.union acc (Cset.singleton (Char.code c)))
Cset.empty s
in
Sedlex.chars chars
| _ ->
err p.ppat_loc "the Chars operator requires a string argument"
end
| Ppat_interval (i_start, i_end) -> begin
match (i_start, i_end) with
| Pconst_char c1, Pconst_char c2 ->
Expand All @@ -401,7 +440,7 @@ let regexp_of_pattern env =
end
| Ppat_constant const -> begin
match const with
| Pconst_string (s, _, _) -> regexp_for_string s
| Pconst_string (s, _, _) -> regexp_for_string ~utf8 s
| Pconst_char c -> regexp_for_char c
| Pconst_integer (i, _) ->
Sedlex.chars (Cset.singleton (codepoint (int_of_string i)))
Expand All @@ -414,7 +453,7 @@ let regexp_of_pattern env =
end
| _ -> err p.ppat_loc "this pattern is not a valid regexp"
in
aux
aux ~utf8:false

let previous = ref []
let regexps = ref []
Expand Down
73 changes: 73 additions & 0 deletions src/syntax/utf8.ml
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
let unsafe_byte s j = Char.code (String.unsafe_get s j)
let malformed s j l = `Malformed (String.sub s j l)

let width = function
| '\000' .. '\127' -> 1
| '\192' .. '\223' -> 2
| '\224' .. '\239' -> 3
| '\240' .. '\247' -> 4
| _ -> 0

let r_utf_8 s j l =
(* assert (0 <= j && 0 <= l && j + l <= String.length s); *)
let uchar c = `Uchar (Uchar.unsafe_of_int c) in
match l with
| 1 -> uchar (unsafe_byte s j)
| 2 ->
let b0 = unsafe_byte s j in
let b1 = unsafe_byte s (j + 1) in
if b1 lsr 6 != 0b10 then malformed s j l
else uchar (((b0 land 0x1F) lsl 6) lor (b1 land 0x3F))
| 3 ->
let b0 = unsafe_byte s j in
let b1 = unsafe_byte s (j + 1) in
let b2 = unsafe_byte s (j + 2) in
let c =
((b0 land 0x0F) lsl 12) lor ((b1 land 0x3F) lsl 6) lor (b2 land 0x3F)
in
if b2 lsr 6 != 0b10 then malformed s j l
else begin
match b0 with
| 0xE0 ->
if b1 < 0xA0 || 0xBF < b1 then malformed s j l else uchar c
| 0xED ->
if b1 < 0x80 || 0x9F < b1 then malformed s j l else uchar c
| _ -> if b1 lsr 6 != 0b10 then malformed s j l else uchar c
end
| 4 ->
let b0 = unsafe_byte s j in
let b1 = unsafe_byte s (j + 1) in
let b2 = unsafe_byte s (j + 2) in
let b3 = unsafe_byte s (j + 3) in
let c =
((b0 land 0x07) lsl 18)
lor ((b1 land 0x3F) lsl 12)
lor ((b2 land 0x3F) lsl 6)
lor (b3 land 0x3F)
in
if b3 lsr 6 != 0b10 || b2 lsr 6 != 0b10 then malformed s j l
else begin
match b0 with
| 0xF0 ->
if b1 < 0x90 || 0xBF < b1 then malformed s j l else uchar c
| 0xF4 ->
if b1 < 0x80 || 0x8F < b1 then malformed s j l else uchar c
| _ -> if b1 lsr 6 != 0b10 then malformed s j l else uchar c
end
| _ -> assert false

let fold ~f acc s =
let rec loop acc f s i last =
if i > last then acc
else (
let need = width (String.unsafe_get s i) in
if need = 0 then loop (f acc i (malformed s i 1)) f s (i + 1) last
else (
let rem = last - i + 1 in
if rem < need then f acc i (malformed s i rem)
else loop (f acc i (r_utf_8 s i need)) f s (i + need) last))
in
let pos = 0 in
let len = String.length s in
let last = pos + len - 1 in
loop acc f s pos last
5 changes: 5 additions & 0 deletions src/syntax/utf8.mli
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
val fold :
f:('a -> int -> [> `Malformed of string | `Uchar of Uchar.t ] -> 'a) ->
'a ->
string ->
'a
19 changes: 19 additions & 0 deletions test/utf8.ml
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
open Printf

let next_tok buf =
let open Sedlexing.Utf8 in
match%sedlex buf with
| "a", Utf8 (Chars "+-×÷") -> sprintf "with Chars: %s" (lexeme buf)
| "b", Utf8 ("+" | "-" | "×" | "÷") ->
sprintf "with or_pattern: %s" (lexeme buf)
| _ -> failwith (sprintf "Unexpected character: %s" (lexeme buf))

let%expect_test _ =
Sedlexing.Utf8.from_string "a+" |> next_tok |> print_string;
[%expect {| with Chars: a+ |}];
Sedlexing.Utf8.from_string "a÷" |> next_tok |> print_string;
[%expect {| with Chars: a÷ |}];
Sedlexing.Utf8.from_string "b+" |> next_tok |> print_string;
[%expect {| with or_pattern: b+ |}];
Sedlexing.Utf8.from_string "b÷" |> next_tok |> print_string;
[%expect {| with or_pattern: b÷ |}]
Loading