Skip to content

Commit

Permalink
Better unicode.ml generator (#141)
Browse files Browse the repository at this point in the history
Co-authored-by: Hugo Heuzard <[email protected]>
Co-authored-by: Romain Beauxis <[email protected]>
  • Loading branch information
3 people authored Jun 28, 2023
1 parent 5f65255 commit fad335a
Show file tree
Hide file tree
Showing 7 changed files with 183 additions and 119 deletions.
1 change: 1 addition & 0 deletions CHANGES.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
- Check and fix invariants from Cset. The codebase was not respecting
invariants documented in the Cset module which could break code
relying on it. The code generated by sedlex.ppx could be affected.
- Do not rely on comments from unicode UCD files
- Add API to track position in bytes. Should be opt-in and backward compatible. (#146)

# 3.1:
Expand Down
3 changes: 2 additions & 1 deletion dune-project
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
(lang dune 2.8)
(lang dune 2.9)
(version 3.2)
(name sedlex)
(source (github ocaml-community/sedlex))
Expand All @@ -9,6 +9,7 @@
(homepage "https://github.com/ocaml-community/sedlex")

(generate_opam_files true)
(executables_implicit_empty_intf true)

(package
(name sedlex)
Expand Down
4 changes: 3 additions & 1 deletion sedlex.opam
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ homepage: "https://github.com/ocaml-community/sedlex"
bug-reports: "https://github.com/ocaml-community/sedlex/issues"
depends: [
"ocaml" {>= "4.08"}
"dune" {>= "2.8"}
"dune" {>= "2.9"}
"ppxlib" {>= "0.26.0"}
"gen"
"ppx_expect" {with-test}
Expand All @@ -32,10 +32,12 @@ build: [
name
"-j"
jobs
"--promote-install-files=false"
"@install"
"@runtest" {with-test}
"@doc" {with-doc}
]
["dune" "install" "-p" name "--create-install-files" name]
]
dev-repo: "git+https://github.com/ocaml-community/sedlex.git"
doc: "https://ocaml-community.github.io/sedlex/index.html"
18 changes: 12 additions & 6 deletions src/generator/data/dune
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
(rule
(targets DerivedCoreProperties.txt)
(target DerivedCoreProperties.txt)
(deps base_url)
(action
(run
Expand All @@ -8,10 +8,10 @@
-s
%{read:base_url}/ucd/DerivedCoreProperties.txt
-o
DerivedCoreProperties.txt)))
%{target})))

(rule
(targets DerivedGeneralCategory.txt)
(target DerivedGeneralCategory.txt)
(deps base_url)
(action
(run
Expand All @@ -20,10 +20,16 @@
-s
%{read:base_url}/ucd/extracted/DerivedGeneralCategory.txt
-o
DerivedGeneralCategory.txt)))
%{target})))

(rule
(targets PropList.txt)
(target PropList.txt)
(deps base_url)
(action
(run curl -L -s %{read:base_url}/ucd/PropList.txt -o PropList.txt)))
(run curl -L -s %{read:base_url}/ucd/PropList.txt -o %{target})))

(rule
(target UnicodeData.txt)
(deps base_url)
(action
(run curl -L -s %{read:base_url}/ucd/UnicodeData.txt -o %{target})))
232 changes: 124 additions & 108 deletions src/generator/gen_unicode.ml
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,127 @@
* the files exported at https://www.unicode.org/Public/<unicode version>
* and stored at src/generator/data. *)
open Sedlex_utils
module SSet = Set.Make (String)

let target = Sys.argv.(1)
let categories = Hashtbl.create 1024
let labels = Hashtbl.create 1024

(* Categories and Properties that we're keeping. *)
let keepers =
(* Drop comments and split semi-column separated fields *)
let parse_line l =
let l =
match String.index_opt l '#' with None -> l | Some i -> String.sub l 0 i
in
String.split_on_char ';' l

let parse_code s =
try int_of_string (Printf.sprintf "0x%s" s)
with _ -> failwith (Printf.sprintf "invalid code %s" s)

let parse_category x = String.lowercase_ascii (String.trim x)
let parse_prop x = String.lowercase_ascii (String.trim x)

let parse_interval s =
match String.split_on_char '.' (String.trim s) with
| [] -> assert false
| [x] ->
let x = parse_code x in
Cset.singleton x
| [x; ""; y] ->
let x = parse_code x and y = parse_code y in
Cset.interval x y
| _ -> failwith (Printf.sprintf "invalid interval %s" s)

let print_elements ch hashtbl cats =
let cats_set = SSet.of_list cats in
let all_keys = SSet.of_seq (Hashtbl.to_seq_keys hashtbl) in
let missing = SSet.diff cats_set all_keys in
let ignoring = SSet.diff all_keys cats_set in
let len = List.length cats in
List.iter
(fun c ->
let entries =
List.map
(fun (b, e) -> Printf.sprintf "0x%x, 0x%x" b e)
(Cset.union_list (Hashtbl.find_all hashtbl c) :> (int * int) list)
in
Printf.fprintf ch " let %s = Sedlex_cset.of_list\n [" c;
List.iteri
(fun i x ->
if i > 0 then
if i mod 5 = 0 then Printf.fprintf ch ";\n "
else Printf.fprintf ch "; ";
Printf.fprintf ch "%s" x)
entries;
Printf.fprintf ch "]\n\n")
cats;
Printf.fprintf ch " let list = [\n";
List.iteri
(fun pos c ->
Printf.fprintf ch " (%S, %s)%s\n" c c
(if pos == len - 1 then "" else ";"))
cats;
Printf.fprintf ch " ]\n\n";
if not (SSet.is_empty ignoring) then (
Printf.fprintf ch "(* ignoring:\n";
SSet.iter (fun s -> Printf.fprintf ch " - %s\n" s) ignoring;
Printf.fprintf ch "*)\n");
if not (SSet.is_empty missing) then (
Printf.fprintf ch "(* missing:\n";
SSet.iter (fun s -> Printf.fprintf ch " - %s\n" s) missing;
Printf.fprintf ch "*)\n")

let files =
[
( "PropList.txt",
fun s ->
match parse_line s with
| [""] -> ()
| [interval; prop] ->
let interval = parse_interval interval in
let prop = parse_prop prop in
Hashtbl.add labels prop interval
| _ -> assert false );
( "DerivedCoreProperties.txt",
fun s ->
match parse_line s with
| [""] -> ()
| [interval; prop] ->
let interval = parse_interval interval in
let prop = parse_prop prop in
Hashtbl.add labels prop interval
| _ -> assert false );
( "DerivedGeneralCategory.txt",
fun s ->
match parse_line s with
| [""] -> ()
| [interval; cat] ->
let interval = parse_interval interval in
let cat = parse_category cat in
Hashtbl.add categories cat interval
| _ -> assert false );
( "UnicodeData.txt",
fun s ->
match parse_line s with
| [""] -> ()
| interval :: _ :: cat :: _ ->
let interval = parse_interval interval in
let cat = parse_category cat in
Hashtbl.add categories cat interval
| _ -> assert false );
]

let read_version fname =
let version_rex =
Str.regexp "^# PropList-\\([0-9]+\\.[0-9]+\\.[0-9]+\\)\\.txt"
in
let ch = open_in_bin fname in
let s = input_line ch in
close_in ch;
ignore (Str.string_match version_rex s 0);
Str.matched_group 1 s

let exported_categories =
[
"cc";
"cf";
Expand Down Expand Up @@ -40,6 +154,10 @@ let keepers =
"zl";
"zp";
"zs";
]

let exported_properties =
[
"alphabetic";
"ascii_hex_digit";
"hex_digit";
Expand All @@ -57,120 +175,18 @@ let keepers =
"xid_start";
]

let prop_interval_rex =
Str.regexp
"^\\([0-9a-fA-F]+\\)\\.\\.\\([0-9a-fA-F]+\\)[ ]*;[ ]+\\([a-zA-Z_]+\\)[ \
]+#[ ]+\\([a-zA-Z][a-zA-Z&]\\)"

let prop_single_rex =
Str.regexp
"^\\([0-9a-fA-F]+\\)[ ]*;[ ]+\\([a-zA-Z_]+\\)[ ]+#[ \
]+\\([a-zA-Z][a-zA-Z&]\\)"

let derived_interval_rex =
Str.regexp
"^\\([0-9a-fA-F]+\\)\\.\\.\\([0-9a-fA-F]+\\)[ ]*;[ ]+\\([a-zA-Z_]+\\)"

let derived_single_rex =
Str.regexp "^\\([0-9a-fA-F]+\\)[ ]*;[ ]+\\([a-zA-Z_]+\\)"

let add_entry hashtbl (b, e) name =
let mk s = int_of_string (Printf.sprintf "0x%s" s) in
let interval = (mk b, mk e) in
let label = String.lowercase_ascii name in
if List.mem label keepers then Hashtbl.add hashtbl label interval

let match_interval s =
if Str.string_match prop_interval_rex s 0 then (
let interval = (Str.matched_group 1 s, Str.matched_group 2 s) in
add_entry labels interval (Str.matched_group 3 s);
add_entry categories interval (Str.matched_group 4 s))

let match_single s =
if Str.string_match prop_single_rex s 0 then (
let interval = (Str.matched_group 1 s, Str.matched_group 1 s) in
add_entry labels interval (Str.matched_group 2 s);
add_entry categories interval (Str.matched_group 3 s))

let match_derived_interval s =
if Str.string_match derived_interval_rex s 0 then (
let interval = (Str.matched_group 1 s, Str.matched_group 2 s) in
add_entry categories interval (Str.matched_group 3 s))

let match_derived_single s =
if Str.string_match derived_single_rex s 0 then (
let interval = (Str.matched_group 1 s, Str.matched_group 1 s) in
add_entry categories interval (Str.matched_group 2 s))

let split list n =
let rec aux acc rem =
match (acc, rem) with
| [], el :: rem -> aux [[el]] rem
| l :: acc, el :: rem when List.length l = n ->
aux ([el] :: List.rev l :: acc) rem
| l :: acc, el :: rem -> aux ((el :: l) :: acc) rem
| l :: acc, [] -> List.rev (List.rev l :: acc)
| [], [] -> []
in
aux [] list

let build_interval l =
List.map (fun (a, b) -> Cset.interval a b) l |> Cset.union_list

let print_elements ch hashtbl =
let cats =
List.sort_uniq compare (Hashtbl.fold (fun cat _ l -> cat :: l) hashtbl [])
in
let len = List.length cats in
List.iter
(fun c ->
let entries =
List.map
(fun (b, e) -> Printf.sprintf "0x%x, 0x%x" b e)
(build_interval (Hashtbl.find_all hashtbl c) :> (int * int) list)
in
let entries = List.map (String.concat "; ") (split entries 5) in
let entries = String.concat ";\n " entries in
Printf.fprintf ch " let %s = Sedlex_cset.of_list\n [%s]\n\n" c entries)
cats;
Printf.fprintf ch " let list = [\n";
List.iteri
(fun pos c ->
Printf.fprintf ch " (%S, %s)%s\n" c c
(if pos == len - 1 then "" else ";"))
cats;
Printf.fprintf ch " ]\n\n"

let files =
[
("PropList.txt", [match_interval; match_single]);
("DerivedCoreProperties.txt", [match_interval; match_single]);
( "DerivedGeneralCategory.txt",
[match_derived_interval; match_derived_single] );
]

let read_version fname =
let version_rex =
Str.regexp "^# PropList-\\([0-9]+\\.[0-9]+\\.[0-9]+\\)\\.txt"
in
let ch = open_in_bin fname in
let s = input_line ch in
close_in ch;
ignore (Str.string_match version_rex s 0);
Str.matched_group 1 s

let () =
let base_dir =
Filename.concat (Filename.dirname Sys.executable_name) "data"
in
let version = read_version (Filename.concat base_dir "PropList.txt") in
List.iter
(fun (fname, fns) ->
(fun (fname, fn) ->
let ch = open_in_bin (Filename.concat base_dir fname) in
try
while true do
let ret = input_line ch in
List.iter (fun fn -> fn ret) fns
fn ret
done
with End_of_file -> close_in ch)
files;
Expand All @@ -182,9 +198,9 @@ let () =
Printf.fprintf ch "(* Edit gen_unicode.ml.inc instead. *)\n\n";
Printf.fprintf ch "\n\nlet version = %S\n\n" version;
Printf.fprintf ch "module Categories = struct\n\n";
print_elements ch categories;
print_elements ch categories exported_categories;
Printf.fprintf ch "end\n\n";
Printf.fprintf ch "module Properties = struct\n\n";
print_elements ch labels;
print_elements ch labels exported_properties;
Printf.fprintf ch "end\n";
close_out ch
4 changes: 1 addition & 3 deletions src/syntax/dune
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,6 @@
(mode promote-until-clean)
(deps
(:gen ../generator/gen_unicode.exe)
../generator/data/DerivedCoreProperties.txt
../generator/data/DerivedGeneralCategory.txt
../generator/data/PropList.txt)
(glob_files ../generator/data/*.txt))
(action
(run %{gen} %{targets})))
Loading

0 comments on commit fad335a

Please sign in to comment.