From fad335a87cfada01c0adbeade4967dbe55ef0fe3 Mon Sep 17 00:00:00 2001 From: hhugo Date: Wed, 28 Jun 2023 02:15:47 +0200 Subject: [PATCH] Better unicode.ml generator (#141) Co-authored-by: Hugo Heuzard Co-authored-by: Romain Beauxis --- CHANGES.md | 1 + dune-project | 3 +- sedlex.opam | 4 +- src/generator/data/dune | 18 ++- src/generator/gen_unicode.ml | 232 +++++++++++++++++++---------------- src/syntax/dune | 4 +- src/syntax/unicode.ml | 40 ++++++ 7 files changed, 183 insertions(+), 119 deletions(-) diff --git a/CHANGES.md b/CHANGES.md index acf0a1ea..d5cf82e1 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -8,6 +8,7 @@ - Check and fix invariants from Cset. The codebase was not respecting invariants documented in the Cset module which could break code relying on it. The code generated by sedlex.ppx could be affected. +- Do not rely on comments from unicode UCD files - Add API to track position in bytes. Should be opt-in and backward compatible. (#146) # 3.1: diff --git a/dune-project b/dune-project index e3a668bf..31d66394 100644 --- a/dune-project +++ b/dune-project @@ -1,4 +1,4 @@ -(lang dune 2.8) +(lang dune 2.9) (version 3.2) (name sedlex) (source (github ocaml-community/sedlex)) @@ -9,6 +9,7 @@ (homepage "https://github.com/ocaml-community/sedlex") (generate_opam_files true) +(executables_implicit_empty_intf true) (package (name sedlex) diff --git a/sedlex.opam b/sedlex.opam index 7da40b72..8ea4142b 100644 --- a/sedlex.opam +++ b/sedlex.opam @@ -17,7 +17,7 @@ homepage: "https://github.com/ocaml-community/sedlex" bug-reports: "https://github.com/ocaml-community/sedlex/issues" depends: [ "ocaml" {>= "4.08"} - "dune" {>= "2.8"} + "dune" {>= "2.9"} "ppxlib" {>= "0.26.0"} "gen" "ppx_expect" {with-test} @@ -32,10 +32,12 @@ build: [ name "-j" jobs + "--promote-install-files=false" "@install" "@runtest" {with-test} "@doc" {with-doc} ] + ["dune" "install" "-p" name "--create-install-files" name] ] dev-repo: "git+https://github.com/ocaml-community/sedlex.git" doc: "https://ocaml-community.github.io/sedlex/index.html" diff --git a/src/generator/data/dune b/src/generator/data/dune index c2199f9b..0f8a827a 100644 --- a/src/generator/data/dune +++ b/src/generator/data/dune @@ -1,5 +1,5 @@ (rule - (targets DerivedCoreProperties.txt) + (target DerivedCoreProperties.txt) (deps base_url) (action (run @@ -8,10 +8,10 @@ -s %{read:base_url}/ucd/DerivedCoreProperties.txt -o - DerivedCoreProperties.txt))) + %{target}))) (rule - (targets DerivedGeneralCategory.txt) + (target DerivedGeneralCategory.txt) (deps base_url) (action (run @@ -20,10 +20,16 @@ -s %{read:base_url}/ucd/extracted/DerivedGeneralCategory.txt -o - DerivedGeneralCategory.txt))) + %{target}))) (rule - (targets PropList.txt) + (target PropList.txt) (deps base_url) (action - (run curl -L -s %{read:base_url}/ucd/PropList.txt -o PropList.txt))) + (run curl -L -s %{read:base_url}/ucd/PropList.txt -o %{target}))) + +(rule + (target UnicodeData.txt) + (deps base_url) + (action + (run curl -L -s %{read:base_url}/ucd/UnicodeData.txt -o %{target}))) diff --git a/src/generator/gen_unicode.ml b/src/generator/gen_unicode.ml index 98e5cede..6f8981b2 100644 --- a/src/generator/gen_unicode.ml +++ b/src/generator/gen_unicode.ml @@ -2,13 +2,127 @@ * the files exported at https://www.unicode.org/Public/ * and stored at src/generator/data. *) open Sedlex_utils +module SSet = Set.Make (String) let target = Sys.argv.(1) let categories = Hashtbl.create 1024 let labels = Hashtbl.create 1024 -(* Categories and Properties that we're keeping. *) -let keepers = +(* Drop comments and split semi-column separated fields *) +let parse_line l = + let l = + match String.index_opt l '#' with None -> l | Some i -> String.sub l 0 i + in + String.split_on_char ';' l + +let parse_code s = + try int_of_string (Printf.sprintf "0x%s" s) + with _ -> failwith (Printf.sprintf "invalid code %s" s) + +let parse_category x = String.lowercase_ascii (String.trim x) +let parse_prop x = String.lowercase_ascii (String.trim x) + +let parse_interval s = + match String.split_on_char '.' (String.trim s) with + | [] -> assert false + | [x] -> + let x = parse_code x in + Cset.singleton x + | [x; ""; y] -> + let x = parse_code x and y = parse_code y in + Cset.interval x y + | _ -> failwith (Printf.sprintf "invalid interval %s" s) + +let print_elements ch hashtbl cats = + let cats_set = SSet.of_list cats in + let all_keys = SSet.of_seq (Hashtbl.to_seq_keys hashtbl) in + let missing = SSet.diff cats_set all_keys in + let ignoring = SSet.diff all_keys cats_set in + let len = List.length cats in + List.iter + (fun c -> + let entries = + List.map + (fun (b, e) -> Printf.sprintf "0x%x, 0x%x" b e) + (Cset.union_list (Hashtbl.find_all hashtbl c) :> (int * int) list) + in + Printf.fprintf ch " let %s = Sedlex_cset.of_list\n [" c; + List.iteri + (fun i x -> + if i > 0 then + if i mod 5 = 0 then Printf.fprintf ch ";\n " + else Printf.fprintf ch "; "; + Printf.fprintf ch "%s" x) + entries; + Printf.fprintf ch "]\n\n") + cats; + Printf.fprintf ch " let list = [\n"; + List.iteri + (fun pos c -> + Printf.fprintf ch " (%S, %s)%s\n" c c + (if pos == len - 1 then "" else ";")) + cats; + Printf.fprintf ch " ]\n\n"; + if not (SSet.is_empty ignoring) then ( + Printf.fprintf ch "(* ignoring:\n"; + SSet.iter (fun s -> Printf.fprintf ch " - %s\n" s) ignoring; + Printf.fprintf ch "*)\n"); + if not (SSet.is_empty missing) then ( + Printf.fprintf ch "(* missing:\n"; + SSet.iter (fun s -> Printf.fprintf ch " - %s\n" s) missing; + Printf.fprintf ch "*)\n") + +let files = + [ + ( "PropList.txt", + fun s -> + match parse_line s with + | [""] -> () + | [interval; prop] -> + let interval = parse_interval interval in + let prop = parse_prop prop in + Hashtbl.add labels prop interval + | _ -> assert false ); + ( "DerivedCoreProperties.txt", + fun s -> + match parse_line s with + | [""] -> () + | [interval; prop] -> + let interval = parse_interval interval in + let prop = parse_prop prop in + Hashtbl.add labels prop interval + | _ -> assert false ); + ( "DerivedGeneralCategory.txt", + fun s -> + match parse_line s with + | [""] -> () + | [interval; cat] -> + let interval = parse_interval interval in + let cat = parse_category cat in + Hashtbl.add categories cat interval + | _ -> assert false ); + ( "UnicodeData.txt", + fun s -> + match parse_line s with + | [""] -> () + | interval :: _ :: cat :: _ -> + let interval = parse_interval interval in + let cat = parse_category cat in + Hashtbl.add categories cat interval + | _ -> assert false ); + ] + +let read_version fname = + let version_rex = + Str.regexp "^# PropList-\\([0-9]+\\.[0-9]+\\.[0-9]+\\)\\.txt" + in + let ch = open_in_bin fname in + let s = input_line ch in + close_in ch; + ignore (Str.string_match version_rex s 0); + Str.matched_group 1 s + +let exported_categories = [ "cc"; "cf"; @@ -40,6 +154,10 @@ let keepers = "zl"; "zp"; "zs"; + ] + +let exported_properties = + [ "alphabetic"; "ascii_hex_digit"; "hex_digit"; @@ -57,120 +175,18 @@ let keepers = "xid_start"; ] -let prop_interval_rex = - Str.regexp - "^\\([0-9a-fA-F]+\\)\\.\\.\\([0-9a-fA-F]+\\)[ ]*;[ ]+\\([a-zA-Z_]+\\)[ \ - ]+#[ ]+\\([a-zA-Z][a-zA-Z&]\\)" - -let prop_single_rex = - Str.regexp - "^\\([0-9a-fA-F]+\\)[ ]*;[ ]+\\([a-zA-Z_]+\\)[ ]+#[ \ - ]+\\([a-zA-Z][a-zA-Z&]\\)" - -let derived_interval_rex = - Str.regexp - "^\\([0-9a-fA-F]+\\)\\.\\.\\([0-9a-fA-F]+\\)[ ]*;[ ]+\\([a-zA-Z_]+\\)" - -let derived_single_rex = - Str.regexp "^\\([0-9a-fA-F]+\\)[ ]*;[ ]+\\([a-zA-Z_]+\\)" - -let add_entry hashtbl (b, e) name = - let mk s = int_of_string (Printf.sprintf "0x%s" s) in - let interval = (mk b, mk e) in - let label = String.lowercase_ascii name in - if List.mem label keepers then Hashtbl.add hashtbl label interval - -let match_interval s = - if Str.string_match prop_interval_rex s 0 then ( - let interval = (Str.matched_group 1 s, Str.matched_group 2 s) in - add_entry labels interval (Str.matched_group 3 s); - add_entry categories interval (Str.matched_group 4 s)) - -let match_single s = - if Str.string_match prop_single_rex s 0 then ( - let interval = (Str.matched_group 1 s, Str.matched_group 1 s) in - add_entry labels interval (Str.matched_group 2 s); - add_entry categories interval (Str.matched_group 3 s)) - -let match_derived_interval s = - if Str.string_match derived_interval_rex s 0 then ( - let interval = (Str.matched_group 1 s, Str.matched_group 2 s) in - add_entry categories interval (Str.matched_group 3 s)) - -let match_derived_single s = - if Str.string_match derived_single_rex s 0 then ( - let interval = (Str.matched_group 1 s, Str.matched_group 1 s) in - add_entry categories interval (Str.matched_group 2 s)) - -let split list n = - let rec aux acc rem = - match (acc, rem) with - | [], el :: rem -> aux [[el]] rem - | l :: acc, el :: rem when List.length l = n -> - aux ([el] :: List.rev l :: acc) rem - | l :: acc, el :: rem -> aux ((el :: l) :: acc) rem - | l :: acc, [] -> List.rev (List.rev l :: acc) - | [], [] -> [] - in - aux [] list - -let build_interval l = - List.map (fun (a, b) -> Cset.interval a b) l |> Cset.union_list - -let print_elements ch hashtbl = - let cats = - List.sort_uniq compare (Hashtbl.fold (fun cat _ l -> cat :: l) hashtbl []) - in - let len = List.length cats in - List.iter - (fun c -> - let entries = - List.map - (fun (b, e) -> Printf.sprintf "0x%x, 0x%x" b e) - (build_interval (Hashtbl.find_all hashtbl c) :> (int * int) list) - in - let entries = List.map (String.concat "; ") (split entries 5) in - let entries = String.concat ";\n " entries in - Printf.fprintf ch " let %s = Sedlex_cset.of_list\n [%s]\n\n" c entries) - cats; - Printf.fprintf ch " let list = [\n"; - List.iteri - (fun pos c -> - Printf.fprintf ch " (%S, %s)%s\n" c c - (if pos == len - 1 then "" else ";")) - cats; - Printf.fprintf ch " ]\n\n" - -let files = - [ - ("PropList.txt", [match_interval; match_single]); - ("DerivedCoreProperties.txt", [match_interval; match_single]); - ( "DerivedGeneralCategory.txt", - [match_derived_interval; match_derived_single] ); - ] - -let read_version fname = - let version_rex = - Str.regexp "^# PropList-\\([0-9]+\\.[0-9]+\\.[0-9]+\\)\\.txt" - in - let ch = open_in_bin fname in - let s = input_line ch in - close_in ch; - ignore (Str.string_match version_rex s 0); - Str.matched_group 1 s - let () = let base_dir = Filename.concat (Filename.dirname Sys.executable_name) "data" in let version = read_version (Filename.concat base_dir "PropList.txt") in List.iter - (fun (fname, fns) -> + (fun (fname, fn) -> let ch = open_in_bin (Filename.concat base_dir fname) in try while true do let ret = input_line ch in - List.iter (fun fn -> fn ret) fns + fn ret done with End_of_file -> close_in ch) files; @@ -182,9 +198,9 @@ let () = Printf.fprintf ch "(* Edit gen_unicode.ml.inc instead. *)\n\n"; Printf.fprintf ch "\n\nlet version = %S\n\n" version; Printf.fprintf ch "module Categories = struct\n\n"; - print_elements ch categories; + print_elements ch categories exported_categories; Printf.fprintf ch "end\n\n"; Printf.fprintf ch "module Properties = struct\n\n"; - print_elements ch labels; + print_elements ch labels exported_properties; Printf.fprintf ch "end\n"; close_out ch diff --git a/src/syntax/dune b/src/syntax/dune index 18365ce3..8268aa74 100644 --- a/src/syntax/dune +++ b/src/syntax/dune @@ -14,8 +14,6 @@ (mode promote-until-clean) (deps (:gen ../generator/gen_unicode.exe) - ../generator/data/DerivedCoreProperties.txt - ../generator/data/DerivedGeneralCategory.txt - ../generator/data/PropList.txt) + (glob_files ../generator/data/*.txt)) (action (run %{gen} %{targets}))) diff --git a/src/syntax/unicode.ml b/src/syntax/unicode.ml index ebcfaec7..1635cc7f 100644 --- a/src/syntax/unicode.ml +++ b/src/syntax/unicode.ml @@ -2074,4 +2074,44 @@ module Properties = struct ("xid_start", xid_start) ] +(* ignoring: + - bidi_control + - case_ignorable + - cased + - changes_when_casefolded + - changes_when_casemapped + - changes_when_lowercased + - changes_when_titlecased + - changes_when_uppercased + - dash + - default_ignorable_code_point + - deprecated + - diacritic + - extender + - grapheme_base + - grapheme_extend + - grapheme_link + - hyphen + - ideographic + - ids_binary_operator + - ids_trinary_operator + - join_control + - logical_order_exception + - noncharacter_code_point + - other_default_ignorable_code_point + - other_grapheme_extend + - other_id_continue + - other_id_start + - pattern_syntax + - pattern_white_space + - prepended_concatenation_mark + - quotation_mark + - radical + - regional_indicator + - sentence_terminal + - soft_dotted + - terminal_punctuation + - unified_ideograph + - variation_selector +*) end