diff --git a/src/lib/sedlexing.ml b/src/lib/sedlexing.ml index 6f44e81..a646908 100644 --- a/src/lib/sedlexing.ml +++ b/src/lib/sedlexing.ml @@ -7,8 +7,21 @@ exception MalFormed module Uchar = struct (* This for compatibility with ocaml < 4.14.0 *) - let utf_8_byte_length _ = 1 - let utf_16_byte_length _ = 1 + let utf_8_byte_length u = + match Uchar.to_int u with + | u when u < 0 -> assert false + | u when u <= 0x007F -> 1 + | u when u <= 0x07FF -> 2 + | u when u <= 0xFFFF -> 3 + | u when u <= 0x10FFFF -> 4 + | _ -> assert false + + let utf_16_byte_length u = + match Uchar.to_int u with + | u when u < 0 -> assert false + | u when u <= 0xFFFF -> 2 + | u when u <= 0x10FFFF -> 4 + | _ -> assert false let () = ignore utf_8_byte_length; diff --git a/src/lib/sedlexing.mli b/src/lib/sedlexing.mli index 4fe8c2c..5d6d801 100644 --- a/src/lib/sedlexing.mli +++ b/src/lib/sedlexing.mli @@ -91,9 +91,7 @@ val lexeme_start : lexbuf -> int (** [Sedlexing.lexeme_start lexbuf] returns the offset in the input stream of the first byte of the matched string. - The first code point of the stream has offset 0. Returned - value is the same as its code point equivalent when compiled with - OCaml < 4.14.0 *) + The first code point of the stream has offset 0. *) val lexeme_bytes_start : lexbuf -> int (** [Sedlexing.lexeme_end lexbuf] returns the offset in the input @@ -105,8 +103,7 @@ val lexeme_end : lexbuf -> int (** [Sedlexing.lexeme_end lexbuf] returns the offset in the input stream of the byte following the last code point of the matched string. The first character of the stream has offset - 0. Returned value is the same as its code point equivalent when - compiled with OCaml < 4.14.0 *) + 0. *) val lexeme_bytes_end : lexbuf -> int (** [Sedlexing.loc lexbuf] returns the pair @@ -140,8 +137,7 @@ val lexing_positions : lexbuf -> Lexing.position * Lexing.position (** [Sedlexing.lexing_bytes_positions lexbuf] returns the start and end positions, in bytes, of the current token, using a record of type [Lexing.position]. This is intended for consumption - by parsers like those generated by [Menhir]. Returned value is the - same as its code point equivalent when compiled with OCaml < 4.14.0 *) + by parsers like those generated by [Menhir]. *) val lexing_bytes_positions : lexbuf -> Lexing.position * Lexing.position (** [Sedlexing.new_line lexbuf] increments the line count and