From ebe61d378152378b6d827a26e004e3b4ed67d66a Mon Sep 17 00:00:00 2001 From: msagca <51697294+msagca@users.noreply.github.com> Date: Sun, 7 May 2023 06:15:42 +0300 Subject: [PATCH 1/5] add support for ANTLR --- lib/rouge/demos/antlr | 19 ++++++++++++ lib/rouge/lexers/antlr.rb | 63 +++++++++++++++++++++++++++++++++++++++ spec/lexers/antlr_spec.rb | 15 ++++++++++ spec/visual/samples/antlr | 45 ++++++++++++++++++++++++++++ 4 files changed, 142 insertions(+) create mode 100644 lib/rouge/demos/antlr create mode 100644 lib/rouge/lexers/antlr.rb create mode 100644 spec/lexers/antlr_spec.rb create mode 100644 spec/visual/samples/antlr diff --git a/lib/rouge/demos/antlr b/lib/rouge/demos/antlr new file mode 100644 index 0000000000..adc9fd42bb --- /dev/null +++ b/lib/rouge/demos/antlr @@ -0,0 +1,19 @@ +grammar awk; +options { caseInsensitive = true; } +/* parser rules */ +program: item_list item? EOF; +param_list: name (',' name)* @{ action code }; +terminated_statement: + IF '(' expr ')' newline_opt terminated_statement ( + ELSE newline_opt terminated_statement + )? # if_stmt + | WHILE '(' expr ')' newline_opt terminated_statement #while_stmt + | FOR '(' simple_statement_opt ';' expr_opt ';' simple_statement_opt ')' newline_opt + terminated_statement # for_stmt; +COMMENT: '#' .*? NEWLINE -> channel(HIDDEN); +ESC_NEWLINE: '\\' NEWLINE -> skip; +STRING: '"' (~["\\\r\n] | ESCAPE_SEQUENCE)* '"'; +WORD: [A-Za-z_] [A-Za-z_0-9]*; +// fragments +fragment EXPONENT_PART: [eE] [+\-]? DIGIT_SEQUENCE; +fragment HEX_CONSTANT: '0' [xX] [0-9A-Fa-f]+; diff --git a/lib/rouge/lexers/antlr.rb b/lib/rouge/lexers/antlr.rb new file mode 100644 index 0000000000..8f0c8e97f7 --- /dev/null +++ b/lib/rouge/lexers/antlr.rb @@ -0,0 +1,63 @@ +# -*- coding: utf-8 -*- # +# frozen_string_literal: true + +module Rouge + module Lexers + class ANTLR < RegexLexer + title "ANTLR" + desc "ANother Tool for Language Recognition" + tag 'antlr' + filenames '*.g4' + word = /[a-z][a-zA-Z0-9_]*/ + WORD = /[A-Z][a-zA-Z0-9_]*/ + def self.keywords + @keywords ||= Set.new %w( + import fragment lexer parser grammar protected public private returns + locals throws catch finally mode options tokens channels channel type + popMode pushMode skip more + ) + end + state :string do + rule %r/'/, Str, :pop! + rule %r/\\./, Str::Escape + rule %r/[^\\'\n]+/, Str + end + state :charset do + rule %r/\]/, Name::Variable, :pop! + rule %r/\\./, Str::Escape + rule %r/[^\\\[\]\n]+/, Name::Variable + end + state :label do + rule %r/[a-zA-Z0-9_]+/, Name::Label, :pop! + rule %r/\s+/, Text + end + state :block do + rule %r/}/, Punctuation, :pop! + rule %r/\\./, Str::Escape + rule %r/[^\\{}\s]+/, Name::Builtin + rule %r/\s+/, Text + end + state :root do + rule %r/\s+/, Text + rule %r(//.*?$), Comment::Single + rule %r(/\*.*?\*/)m, Comment::Multiline + rule %r/'/, Str, :string + rule %r/\[/, Name::Variable, :charset + rule %r/#/, Name::Label, :label + rule %r/{/, Punctuation, :block + rule %r/0|[1-9][0-9]*/, Num::Integer + rule %r/[-?@*+<=>~$.]/, Operator + rule %r/[;:()|]/, Punctuation + rule %r/\\./, Str::Escape + rule WORD, Name::Class + rule word do |m| + if self.class.keywords.include? m[0] + token Keyword + else + token Name::Function + end + end + end + end + end +end diff --git a/spec/lexers/antlr_spec.rb b/spec/lexers/antlr_spec.rb new file mode 100644 index 0000000000..4e1f9b7ceb --- /dev/null +++ b/spec/lexers/antlr_spec.rb @@ -0,0 +1,15 @@ +# -*- coding: utf-8 -*- # +# frozen_string_literal: true + +describe Rouge::Lexers::ANTLR do + let(:subject) { Rouge::Lexers::ANTLR.new } + + describe 'guessing' do + include Support::Guessing + + it 'guesses by filename' do + assert_guess :filename => 'foo.g4' + end + + end +end diff --git a/spec/visual/samples/antlr b/spec/visual/samples/antlr new file mode 100644 index 0000000000..fdb42ecd33 --- /dev/null +++ b/spec/visual/samples/antlr @@ -0,0 +1,45 @@ +grammar MyGrammar; +// options +options { + language = Ruby; + output = AST; + backtrack = true; +} +// lexer commands +@header { + require 'strscan' +} +@lexer::members { + def scan_number + # implementation here + end +} +// lexer modes +mode COMMAND_MODE; +// lexer rules +WORD: [a-zA-Z]+; +NUMBER: [0-9]+; +// parser rules +parse: expr+; +expr: WORD (PLUS | MINUS) expr | NUMBER; +// rule actions +expr + returns[int value] + @init { + $value = 0 +}: + WORD { $value = lookup_word($WORD.text) } + | NUMBER { $value = $NUMBER.text.to_i }; +// options and lexer commands in lexer rule +WORD: + [a-zA-Z]+ { + # option example: sets the token type + $type = $options.myToken + + # lexer command example: calls the method defined in @lexer::members + scan_number + }; +// lexer modes in lexer rule +COMMAND: '/' -> pushMode(COMMAND_MODE); +mode COMMAND_MODE; +COMMAND_MODE_COMMAND: ~[\r\n]+ -> type(COMMAND); From 7d2e389579ddd222e61149e4c4b2eda3348522df Mon Sep 17 00:00:00 2001 From: msagca <51697294+msagca@users.noreply.github.com> Date: Mon, 8 May 2023 15:24:39 +0300 Subject: [PATCH 2/5] handle nested braces --- lib/rouge/demos/antlr | 4 ++-- lib/rouge/lexers/antlr.rb | 5 +++-- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/lib/rouge/demos/antlr b/lib/rouge/demos/antlr index adc9fd42bb..8946dbab42 100644 --- a/lib/rouge/demos/antlr +++ b/lib/rouge/demos/antlr @@ -1,8 +1,8 @@ grammar awk; -options { caseInsensitive = true; } +options { an option; } /* parser rules */ program: item_list item? EOF; -param_list: name (',' name)* @{ action code }; +param_list: name (',' name)* @{ {action code} }; terminated_statement: IF '(' expr ')' newline_opt terminated_statement ( ELSE newline_opt terminated_statement diff --git a/lib/rouge/lexers/antlr.rb b/lib/rouge/lexers/antlr.rb index 8f0c8e97f7..bd7842078d 100644 --- a/lib/rouge/lexers/antlr.rb +++ b/lib/rouge/lexers/antlr.rb @@ -32,7 +32,8 @@ def self.keywords rule %r/\s+/, Text end state :block do - rule %r/}/, Punctuation, :pop! + rule %r/{/, Name::Builtin, :block + rule %r/}/, Name::Builtin, :pop! rule %r/\\./, Str::Escape rule %r/[^\\{}\s]+/, Name::Builtin rule %r/\s+/, Text @@ -44,7 +45,7 @@ def self.keywords rule %r/'/, Str, :string rule %r/\[/, Name::Variable, :charset rule %r/#/, Name::Label, :label - rule %r/{/, Punctuation, :block + rule %r/{/, Name::Builtin, :block rule %r/0|[1-9][0-9]*/, Num::Integer rule %r/[-?@*+<=>~$.]/, Operator rule %r/[;:()|]/, Punctuation From a104d0db28ff1fcd9d30c5cf416b1f34f005decc Mon Sep 17 00:00:00 2001 From: msagca <51697294+msagca@users.noreply.github.com> Date: Mon, 8 May 2023 22:39:38 +0300 Subject: [PATCH 3/5] improve antlr lexer --- lib/rouge/demos/antlr | 3 +- lib/rouge/lexers/antlr.rb | 112 ++++++++++++++++++++++++++++---------- 2 files changed, 83 insertions(+), 32 deletions(-) diff --git a/lib/rouge/demos/antlr b/lib/rouge/demos/antlr index 8946dbab42..842cc77819 100644 --- a/lib/rouge/demos/antlr +++ b/lib/rouge/demos/antlr @@ -1,8 +1,7 @@ grammar awk; -options { an option; } /* parser rules */ program: item_list item? EOF; -param_list: name (',' name)* @{ {action code} }; +param_list /*comment*/ : name (',' name)* ; terminated_statement: IF '(' expr ')' newline_opt terminated_statement ( ELSE newline_opt terminated_statement diff --git a/lib/rouge/lexers/antlr.rb b/lib/rouge/lexers/antlr.rb index bd7842078d..c7dee609d9 100644 --- a/lib/rouge/lexers/antlr.rb +++ b/lib/rouge/lexers/antlr.rb @@ -8,54 +8,106 @@ class ANTLR < RegexLexer desc "ANother Tool for Language Recognition" tag 'antlr' filenames '*.g4' - word = /[a-z][a-zA-Z0-9_]*/ - WORD = /[A-Z][a-zA-Z0-9_]*/ def self.keywords @keywords ||= Set.new %w( - import fragment lexer parser grammar protected public private returns - locals throws catch finally mode options tokens channels channel type - popMode pushMode skip more + catch channel channels finally fragment grammar import lexer locals + mode more options parser popMode private protected public pushMode + returns skip throws tokens type ) end + identifier = %r/[A-Za-z][a-zA-Z0-9_]*/ + integer = %r/0|[1-9][0-9]*/ + parse_rule_name = false + rule_name = %r/[a-z][a-zA-Z0-9_]*/ + token_name = %r/[A-Z][a-zA-Z0-9_]*/ + state :whitespace do + rule %r/\s+/, Text + end + state :comment_and_whitespace do + mixin :whitespace + rule %r(//.*?$), Comment::Single + rule %r(/\*.*?\*/)m, Comment::Multiline + end + state :escape_sequence do + rule %r/\\./, Str::Escape + end state :string do + mixin :escape_sequence rule %r/'/, Str, :pop! - rule %r/\\./, Str::Escape rule %r/[^\\'\n]+/, Str end - state :charset do - rule %r/\]/, Name::Variable, :pop! - rule %r/\\./, Str::Escape - rule %r/[^\\\[\]\n]+/, Name::Variable + state :options_spec do + mixin :comment_and_whitespace + rule %r/{/, Punctuation + rule %r/}/, Punctuation, :pop! + rule %r/=/, Operator, :option_value + rule identifier, Name::Variable end - state :label do - rule %r/[a-zA-Z0-9_]+/, Name::Label, :pop! - rule %r/\s+/, Text + state :option_value do + mixin :comment_and_whitespace + rule %r/;/, Punctuation, :pop! + rule %r/./, Punctuation + rule %r/'/, Str, :string + rule %r/{/, Punctuation, :action_block + rule identifier, Name::Constant + rule integer, Num::Integer end - state :block do - rule %r/{/, Name::Builtin, :block - rule %r/}/, Name::Builtin, :pop! - rule %r/\\./, Str::Escape - rule %r/[^\\{}\s]+/, Name::Builtin - rule %r/\s+/, Text + state :action_block do + mixin :escape_sequence + mixin :whitespace + rule %r/[^\\{}\s]+/, Name::Function + rule %r/{/, Punctuation, :action_block + rule %r/}/, Punctuation, :pop! + end + state :arg_action_block do + mixin :escape_sequence + mixin :whitespace + rule %r/[^\\\[\]]+/, Str + rule %r/\]/, Str, :pop! + end + state :label do + mixin :comment_and_whitespace + rule rule_name, Name::Label, :pop! end state :root do - rule %r/\s+/, Text - rule %r(//.*?$), Comment::Single - rule %r(/\*.*?\*/)m, Comment::Multiline + mixin :comment_and_whitespace rule %r/'/, Str, :string - rule %r/\[/, Name::Variable, :charset + rule %r/[|,.()]/, Punctuation + rule %r/[@<>=~\-+?*]/, Operator + rule %r/{/, Punctuation, :action_block + rule %r/\[/, Str, :arg_action_block rule %r/#/, Name::Label, :label - rule %r/{/, Name::Builtin, :block rule %r/0|[1-9][0-9]*/, Num::Integer - rule %r/[-?@*+<=>~$.]/, Operator - rule %r/[;:()|]/, Punctuation - rule %r/\\./, Str::Escape - rule WORD, Name::Class - rule word do |m| + rule %r/:/ do + token Punctuation + parse_rule_name = false + end + rule %r/;/ do + token Punctuation + parse_rule_name = true + end + rule token_name do + if parse_rule_name + token Name::Label + else + token Name::Class + end + end + rule rule_name do |m| if self.class.keywords.include? m[0] token Keyword + case m[0] + when 'grammar' + parse_rule_name = true + when 'options' + push :options_spec + when 'throws' + parse_rule_name = false + end + elsif parse_rule_name + token Name::Label else - token Name::Function + token Name::Variable end end end From 8b742f6633821b2c7ec10f959e1f7889547b1dbd Mon Sep 17 00:00:00 2001 From: msagca <51697294+msagca@users.noreply.github.com> Date: Mon, 8 May 2023 23:27:29 +0300 Subject: [PATCH 4/5] tiny styling update --- lib/rouge/lexers/antlr.rb | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/rouge/lexers/antlr.rb b/lib/rouge/lexers/antlr.rb index c7dee609d9..66fea899ee 100644 --- a/lib/rouge/lexers/antlr.rb +++ b/lib/rouge/lexers/antlr.rb @@ -72,12 +72,12 @@ def self.keywords state :root do mixin :comment_and_whitespace rule %r/'/, Str, :string - rule %r/[|,.()]/, Punctuation rule %r/[@<>=~\-+?*]/, Operator + rule %r/[|,.()]/, Punctuation rule %r/{/, Punctuation, :action_block rule %r/\[/, Str, :arg_action_block rule %r/#/, Name::Label, :label - rule %r/0|[1-9][0-9]*/, Num::Integer + rule integer, Num::Integer rule %r/:/ do token Punctuation parse_rule_name = false From c67ff20fec3f0fcb142f80069282659d52cffbe0 Mon Sep 17 00:00:00 2001 From: msagca <51697294+msagca@users.noreply.github.com> Date: Sun, 4 Jun 2023 23:43:14 +0300 Subject: [PATCH 5/5] fix a bug --- lib/rouge/demos/antlr | 11 ++++++----- lib/rouge/lexers/antlr.rb | 26 ++++++++++++-------------- spec/visual/samples/antlr | 14 +++----------- 3 files changed, 21 insertions(+), 30 deletions(-) diff --git a/lib/rouge/demos/antlr b/lib/rouge/demos/antlr index 842cc77819..b1aca03fbb 100644 --- a/lib/rouge/demos/antlr +++ b/lib/rouge/demos/antlr @@ -1,7 +1,8 @@ -grammar awk; -/* parser rules */ +grammar GrammarName; +options { anOption = optionValue; } program: item_list item? EOF; -param_list /*comment*/ : name (',' name)* ; +param_list /* multi-line +comment */ : name (',' name)* ; terminated_statement: IF '(' expr ')' newline_opt terminated_statement ( ELSE newline_opt terminated_statement @@ -12,7 +13,7 @@ terminated_statement: COMMENT: '#' .*? NEWLINE -> channel(HIDDEN); ESC_NEWLINE: '\\' NEWLINE -> skip; STRING: '"' (~["\\\r\n] | ESCAPE_SEQUENCE)* '"'; -WORD: [A-Za-z_] [A-Za-z_0-9]*; -// fragments +mode WORD_MODE; +WORD: [A-Za-z_] [A-Za-z_0-9]*; // single line comment fragment EXPONENT_PART: [eE] [+\-]? DIGIT_SEQUENCE; fragment HEX_CONSTANT: '0' [xX] [0-9A-Fa-f]+; diff --git a/lib/rouge/lexers/antlr.rb b/lib/rouge/lexers/antlr.rb index 66fea899ee..fb4a70bfd8 100644 --- a/lib/rouge/lexers/antlr.rb +++ b/lib/rouge/lexers/antlr.rb @@ -15,11 +15,11 @@ def self.keywords returns skip throws tokens type ) end + get_label = true identifier = %r/[A-Za-z][a-zA-Z0-9_]*/ integer = %r/0|[1-9][0-9]*/ - parse_rule_name = false - rule_name = %r/[a-z][a-zA-Z0-9_]*/ - token_name = %r/[A-Z][a-zA-Z0-9_]*/ + lowercase_name = %r/[a-z][a-zA-Z0-9_]*/ + uppercase_name = %r/[A-Z][a-zA-Z0-9_]*/ state :whitespace do rule %r/\s+/, Text end @@ -41,7 +41,7 @@ def self.keywords rule %r/{/, Punctuation rule %r/}/, Punctuation, :pop! rule %r/=/, Operator, :option_value - rule identifier, Name::Variable + rule identifier, Name::Attribute end state :option_value do mixin :comment_and_whitespace @@ -67,7 +67,7 @@ def self.keywords end state :label do mixin :comment_and_whitespace - rule rule_name, Name::Label, :pop! + rule lowercase_name, Name::Label, :pop! end state :root do mixin :comment_and_whitespace @@ -80,31 +80,29 @@ def self.keywords rule integer, Num::Integer rule %r/:/ do token Punctuation - parse_rule_name = false + get_label = false end rule %r/;/ do token Punctuation - parse_rule_name = true + get_label = true end - rule token_name do - if parse_rule_name + rule uppercase_name do + if get_label token Name::Label else token Name::Class end end - rule rule_name do |m| + rule lowercase_name do |m| if self.class.keywords.include? m[0] token Keyword case m[0] - when 'grammar' - parse_rule_name = true when 'options' push :options_spec when 'throws' - parse_rule_name = false + get_label = false end - elsif parse_rule_name + elsif get_label token Name::Label else token Name::Variable diff --git a/spec/visual/samples/antlr b/spec/visual/samples/antlr index fdb42ecd33..540e68544c 100644 --- a/spec/visual/samples/antlr +++ b/spec/visual/samples/antlr @@ -1,11 +1,9 @@ grammar MyGrammar; -// options options { language = Ruby; output = AST; backtrack = true; } -// lexer commands @header { require 'strscan' } @@ -14,32 +12,26 @@ options { # implementation here end } -// lexer modes mode COMMAND_MODE; -// lexer rules WORD: [a-zA-Z]+; NUMBER: [0-9]+; -// parser rules -parse: expr+; +parse: expr+; // single line comment expr: WORD (PLUS | MINUS) expr | NUMBER; -// rule actions expr returns[int value] @init { $value = 0 }: - WORD { $value = lookup_word($WORD.text) } + WORD { $value = lookup_word($WORD.text) } /* multi-line +comment */ | NUMBER { $value = $NUMBER.text.to_i }; -// options and lexer commands in lexer rule WORD: [a-zA-Z]+ { # option example: sets the token type $type = $options.myToken - # lexer command example: calls the method defined in @lexer::members scan_number }; -// lexer modes in lexer rule COMMAND: '/' -> pushMode(COMMAND_MODE); mode COMMAND_MODE; COMMAND_MODE_COMMAND: ~[\r\n]+ -> type(COMMAND);