Class: Prism::LexCompat
Do not use. This class is for internal use only.
| Relationships & Source Files | |
| Namespace Children | |
|
Modules:
| |
|
Classes:
| |
| Inherits: | Object |
| Defined in: | lib/prism/lex_compat.rb |
Overview
This class is responsible for lexing the source using prism and then converting those tokens to be compatible with Ripper. In the vast majority of cases, this is a one-to-one mapping of the token type. Everything else generally lines up. However, there are a few cases that require special handling.
Constant Summary
-
BOM_FLUSHED =
private
# File 'lib/prism/lex_compat.rb', line 581
In previous versions of Ruby, Ripper would not flush the BOM (byte-order mark) before the first token, so a workaround had to be kept in place to account for that.
RUBY_VERSION >= "3.3.0"
-
# File 'lib/prism/lex_compat.rb', line 31
#
# Lookup table from prism token types (keys) to the Ripper event names
# (values) they are reported as. Lookups elsewhere use RIPPER.fetch, so a
# token type missing from this table raises KeyError.
RIPPER = {
  AMPERSAND: :on_op,
  AMPERSAND_AMPERSAND: :on_op,
  AMPERSAND_AMPERSAND_EQUAL: :on_op,
  AMPERSAND_DOT: :on_op,
  AMPERSAND_EQUAL: :on_op,
  BACK_REFERENCE: :on_backref,
  BACKTICK: :on_backtick,
  BANG: :on_op,
  BANG_EQUAL: :on_op,
  BANG_TILDE: :on_op,
  BRACE_LEFT: :on_lbrace,
  BRACE_RIGHT: :on_rbrace,
  BRACKET_LEFT: :on_lbracket,
  BRACKET_LEFT_ARRAY: :on_lbracket,
  BRACKET_LEFT_RIGHT: :on_op,
  BRACKET_LEFT_RIGHT_EQUAL: :on_op,
  BRACKET_RIGHT: :on_rbracket,
  CARET: :on_op,
  CARET_EQUAL: :on_op,
  CHARACTER_LITERAL: :on_CHAR,
  CLASS_VARIABLE: :on_cvar,
  COLON: :on_op,
  COLON_COLON: :on_op,
  COMMA: :on_comma,
  COMMENT: :on_comment,
  CONSTANT: :on_const,
  DOT: :on_period,
  DOT_DOT: :on_op,
  DOT_DOT_DOT: :on_op,
  EMBDOC_BEGIN: :on_embdoc_beg,
  EMBDOC_END: :on_embdoc_end,
  EMBDOC_LINE: :on_embdoc,
  EMBEXPR_BEGIN: :on_embexpr_beg,
  EMBEXPR_END: :on_embexpr_end,
  EMBVAR: :on_embvar,
  EOF: :on_eof,
  EQUAL: :on_op,
  EQUAL_EQUAL: :on_op,
  EQUAL_EQUAL_EQUAL: :on_op,
  EQUAL_GREATER: :on_op,
  EQUAL_TILDE: :on_op,
  FLOAT: :on_float,
  FLOAT_IMAGINARY: :on_imaginary,
  FLOAT_RATIONAL: :on_rational,
  FLOAT_RATIONAL_IMAGINARY: :on_imaginary,
  GREATER: :on_op,
  GREATER_EQUAL: :on_op,
  GREATER_GREATER: :on_op,
  GREATER_GREATER_EQUAL: :on_op,
  GLOBAL_VARIABLE: :on_gvar,
  HEREDOC_END: :on_heredoc_end,
  HEREDOC_START: :on_heredoc_beg,
  IDENTIFIER: :on_ident,
  IGNORED_NEWLINE: :on_ignored_nl,
  INTEGER: :on_int,
  INTEGER_IMAGINARY: :on_imaginary,
  INTEGER_RATIONAL: :on_rational,
  INTEGER_RATIONAL_IMAGINARY: :on_imaginary,
  INSTANCE_VARIABLE: :on_ivar,
  INVALID: :INVALID,
  KEYWORD___ENCODING__: :on_kw,
  KEYWORD___LINE__: :on_kw,
  KEYWORD___FILE__: :on_kw,
  KEYWORD_ALIAS: :on_kw,
  KEYWORD_AND: :on_kw,
  KEYWORD_BEGIN: :on_kw,
  KEYWORD_BEGIN_UPCASE: :on_kw,
  KEYWORD_BREAK: :on_kw,
  KEYWORD_CASE: :on_kw,
  KEYWORD_CLASS: :on_kw,
  KEYWORD_DEF: :on_kw,
  KEYWORD_DEFINED: :on_kw,
  KEYWORD_DO: :on_kw,
  KEYWORD_DO_LOOP: :on_kw,
  KEYWORD_ELSE: :on_kw,
  KEYWORD_ELSIF: :on_kw,
  KEYWORD_END: :on_kw,
  KEYWORD_END_UPCASE: :on_kw,
  KEYWORD_ENSURE: :on_kw,
  KEYWORD_FALSE: :on_kw,
  KEYWORD_FOR: :on_kw,
  KEYWORD_IF: :on_kw,
  KEYWORD_IF_MODIFIER: :on_kw,
  KEYWORD_IN: :on_kw,
  KEYWORD_MODULE: :on_kw,
  KEYWORD_NEXT: :on_kw,
  KEYWORD_NIL: :on_kw,
  KEYWORD_NOT: :on_kw,
  KEYWORD_OR: :on_kw,
  KEYWORD_REDO: :on_kw,
  KEYWORD_RESCUE: :on_kw,
  KEYWORD_RESCUE_MODIFIER: :on_kw,
  KEYWORD_RETRY: :on_kw,
  KEYWORD_RETURN: :on_kw,
  KEYWORD_SELF: :on_kw,
  KEYWORD_SUPER: :on_kw,
  KEYWORD_THEN: :on_kw,
  KEYWORD_TRUE: :on_kw,
  KEYWORD_UNDEF: :on_kw,
  KEYWORD_UNLESS: :on_kw,
  KEYWORD_UNLESS_MODIFIER: :on_kw,
  KEYWORD_UNTIL: :on_kw,
  KEYWORD_UNTIL_MODIFIER: :on_kw,
  KEYWORD_WHEN: :on_kw,
  KEYWORD_WHILE: :on_kw,
  KEYWORD_WHILE_MODIFIER: :on_kw,
  KEYWORD_YIELD: :on_kw,
  LABEL: :on_label,
  LABEL_END: :on_label_end,
  LAMBDA_BEGIN: :on_tlambeg,
  LESS: :on_op,
  LESS_EQUAL: :on_op,
  LESS_EQUAL_GREATER: :on_op,
  LESS_LESS: :on_op,
  LESS_LESS_EQUAL: :on_op,
  METHOD_NAME: :on_ident,
  MINUS: :on_op,
  MINUS_EQUAL: :on_op,
  MINUS_GREATER: :on_tlambda,
  NEWLINE: :on_nl,
  NUMBERED_REFERENCE: :on_backref,
  PARENTHESIS_LEFT: :on_lparen,
  PARENTHESIS_LEFT_PARENTHESES: :on_lparen,
  PARENTHESIS_RIGHT: :on_rparen,
  PERCENT: :on_op,
  PERCENT_EQUAL: :on_op,
  PERCENT_LOWER_I: :on_qsymbols_beg,
  PERCENT_LOWER_W: :on_qwords_beg,
  PERCENT_LOWER_X: :on_backtick,
  PERCENT_UPPER_I: :on_symbols_beg,
  PERCENT_UPPER_W: :on_words_beg,
  PIPE: :on_op,
  PIPE_EQUAL: :on_op,
  PIPE_PIPE: :on_op,
  PIPE_PIPE_EQUAL: :on_op,
  PLUS: :on_op,
  PLUS_EQUAL: :on_op,
  QUESTION_MARK: :on_op,
  RATIONAL_FLOAT: :on_rational,
  RATIONAL_INTEGER: :on_rational,
  REGEXP_BEGIN: :on_regexp_beg,
  REGEXP_END: :on_regexp_end,
  SEMICOLON: :on_semicolon,
  SLASH: :on_op,
  SLASH_EQUAL: :on_op,
  STAR: :on_op,
  STAR_EQUAL: :on_op,
  STAR_STAR: :on_op,
  STAR_STAR_EQUAL: :on_op,
  STRING_BEGIN: :on_tstring_beg,
  STRING_CONTENT: :on_tstring_content,
  STRING_END: :on_tstring_end,
  SYMBOL_BEGIN: :on_symbeg,
  TILDE: :on_op,
  UAMPERSAND: :on_op,
  UCOLON_COLON: :on_op,
  UDOT_DOT: :on_op,
  UDOT_DOT_DOT: :on_op,
  UMINUS: :on_op,
  UMINUS_NUM: :on_op,
  UPLUS: :on_op,
  USTAR: :on_op,
  USTAR_STAR: :on_op,
  WORDS_SEP: :on_words_sep,
  "__END__": :on___end__
}.freeze
Class Method Summary
- .new(code, **options) ⇒ LexCompat constructor
Instance Attribute Summary
- #options readonly
Instance Method Summary
Constructor Details
.new(code, **options) ⇒ LexCompat
# File 'lib/prism/lex_compat.rb', line 586
# Captures the source to lex and the keyword options supplied by the caller.
#
# code    - the source code to lex, stored in @code.
# options - arbitrary keyword arguments, stored as a Hash in @options.
def initialize(code, **options)
  @code = code
  @options = options
end
Instance Attribute Details
#options (readonly)
[ GitHub ]# File 'lib/prism/lex_compat.rb', line 584
# Read-only accessor for the keyword options given to the constructor.
attr_reader :options
Instance Method Details
#add_on_sp_tokens(tokens, source, data_loc, bom, eof_token)
[ GitHub ]# File 'lib/prism/lex_compat.rb', line 814
# Returns a new token list in which every gap between consecutive tokens is
# filled with :on_sp tokens, so the stream accounts for each byte of source
# the way Ripper's does.
#
# tokens    - tokens to walk in order; each must respond to #location
#             (destructured as [line, column]), #value and #state.
# source    - object that answers #line_to_byte_offset, #slice, #line and
#             #column for byte offsets.
# data_loc  - when truthy, no trailing :on_sp token is appended.
# bom       - truthy when the source starts with a 3-byte byte-order mark.
# eof_token - token whose location.end_offset marks the end of the input.
def add_on_sp_tokens(tokens, source, data_loc, bom, eof_token)
  new_tokens = []

  prev_token_state = Translation::Ripper::Lexer::State.cached(Translation::Ripper::EXPR_BEG)
  prev_token_end = bom ? 3 : 0

  tokens.each do |token|
    line, column = token.location
    start_offset = source.line_to_byte_offset(line) + column
    # Ripper reports columns on line 1 without counting the BOM, so we adjust to get the real offset
    start_offset += 3 if line == 1 && bom

    if start_offset > prev_token_end
      sp_value = source.slice(prev_token_end, start_offset - prev_token_end)
      sp_line = source.line(prev_token_end)
      sp_column = source.column(prev_token_end)
      # Ripper reports columns on line 1 without counting the BOM
      sp_column -= 3 if sp_line == 1 && bom
      continuation_index = sp_value.byteindex("\\")

      # ripper emits up to three :on_sp tokens when line continuations are used
      if continuation_index
        # The continuation token spans the backslash plus its line ending,
        # which is either "\n" or "\r\n".
        next_whitespace_index = continuation_index + 1
        next_whitespace_index += 1 if sp_value.byteslice(next_whitespace_index) == "\r"
        next_whitespace_index += 1
        first_whitespace = sp_value[0...continuation_index]
        continuation = sp_value[continuation_index...next_whitespace_index]
        second_whitespace = sp_value[next_whitespace_index..]

        new_tokens << IgnoreStateToken.new([
          [sp_line, sp_column],
          :on_sp,
          first_whitespace,
          prev_token_state
        ]) unless first_whitespace.empty?

        new_tokens << IgnoreStateToken.new([
          [sp_line, sp_column + continuation_index],
          :on_sp,
          continuation,
          prev_token_state
        ])

        # Whatever follows the continuation starts the next line at column 0.
        new_tokens << IgnoreStateToken.new([
          [sp_line + 1, 0],
          :on_sp,
          second_whitespace,
          prev_token_state
        ]) unless second_whitespace.empty?
      else
        new_tokens << IgnoreStateToken.new([
          [sp_line, sp_column],
          :on_sp,
          sp_value,
          prev_token_state
        ])
      end
    end

    new_tokens << token
    prev_token_state = token.state
    prev_token_end = start_offset + token.value.bytesize
  end

  unless data_loc # no trailing :on_sp with __END__ as it is always preceded by :on_nl
    end_offset = eof_token.location.end_offset
    if prev_token_end < end_offset
      new_tokens << IgnoreStateToken.new([
        [source.line(prev_token_end), source.column(prev_token_end)],
        :on_sp,
        source.slice(prev_token_end, end_offset - prev_token_end),
        prev_token_state
      ])
    end
  end

  new_tokens
end
#result
[ GitHub ]# File 'lib/prism/lex_compat.rb', line 591
# Lexes @code with Prism.lex, forwarding the stored keyword options, and
# converts the resulting tokens into a Ripper-compatible stream.
#
# Returns a Result built from the converted tokens together with the
# comments, magic comments, data_loc, errors, warnings and source of the
# underlying lex result.
def result
  tokens = [] #: Array[LexCompat::Token]

  state = :default
  heredoc_stack = [[]] #: Array[Array[Heredoc::PlainHeredoc | Heredoc::DashHeredoc | Heredoc::DedentingHeredoc]]

  result = Prism.lex(@code, **options)
  source = result.source
  result_value = result.value
  previous_state = nil #: State?
  last_heredoc_end = nil #: Integer?
  eof_token = nil

  bom = source.slice(0, 3) == "\xEF\xBB\xBF"

  result_value.each_with_index do |(token, lex_state), index|
    lineno = token.location.start_line
    column = token.location.start_column

    # If there's a UTF-8 byte-order mark as the start of the file, then for
    # certain tokens ripper sets the first token back by 3 bytes. It also
    # keeps the byte order mark in the first token's value. This is weird,
    # and I don't want to mirror that in our parser. So instead, we'll match
    # up the columns and values here.
    if bom && lineno == 1
      column -= 3

      if index == 0 && column == 0 && !BOM_FLUSHED
        flushed =
          case token.type
          when :BACK_REFERENCE, :INSTANCE_VARIABLE, :CLASS_VARIABLE,
               :GLOBAL_VARIABLE, :NUMBERED_REFERENCE, :PERCENT_LOWER_I,
               :PERCENT_LOWER_X, :PERCENT_LOWER_W, :PERCENT_UPPER_I,
               :PERCENT_UPPER_W, :STRING_BEGIN
            true
          when :REGEXP_BEGIN, :SYMBOL_BEGIN
            token.value.start_with?("%")
          else
            false
          end

        unless flushed
          column -= 3
          value = token.value
          value.prepend(String.new("\xEF\xBB\xBF", encoding: value.encoding))
        end
      end
    end

    event = RIPPER.fetch(token.type)
    value = token.value
    lex_state = Translation::Ripper::Lexer::State.cached(lex_state)

    token =
      case event
      when :on___end__
        # Ripper doesn't include the rest of the token in the event, so we need to
        # trim it down to just the content on the first line.
        value = value[0..value.index("\n")]
        Token.new([[lineno, column], event, value, lex_state])
      when :on_comment
        IgnoreStateToken.new([[lineno, column], event, value, lex_state])
      when :on_heredoc_end
        # Heredoc end tokens can be emitted in an odd order, so we don't
        # want to bother comparing the state on them.
        last_heredoc_end = token.location.end_offset
        IgnoreStateToken.new([[lineno, column], event, value, lex_state])
      when :on_embexpr_end
        IgnoreStateToken.new([[lineno, column], event, value, lex_state])
      when :on_regexp_end
        # On regex end, Ripper scans and then sets end state, so the ripper
        # lexed output is begin, when it should be end. prism sets lex state
        # correctly to end state, but we want to be able to compare against
        # Ripper's lexed state. So here, if it's a regexp end token, we
        # output the state as the previous state, solely for the sake of
        # comparison.
        previous_token = result_value[index - 1][0]
        lex_state =
          if RIPPER.fetch(previous_token.type) == :on_embexpr_end
            # If the previous token is embexpr_end, then we have to do even
            # more processing. The end of an embedded expression sets the
            # state to the state that it had at the beginning of the
            # embedded expression. So we have to go and find that state and
            # set it here.
            counter = 1
            current_index = index - 1

            until counter == 0
              current_index -= 1
              current_event = RIPPER.fetch(result_value[current_index][0].type)
              counter += { on_embexpr_beg: -1, on_embexpr_end: 1 }[current_event] || 0
            end

            Translation::Ripper::Lexer::State.cached(result_value[current_index][1])
          else
            previous_state
          end

        Token.new([[lineno, column], event, value, lex_state])
      when :on_eof
        eof_token = token
        previous_token = result_value[index - 1][0]

        # If we're at the end of the file and the previous token was a
        # comment and there is still whitespace after the comment, then
        # Ripper will append a on_nl token (even though there isn't
        # necessarily a newline). We mirror that here.
        if previous_token.type == :COMMENT
          # If the comment is at the start of a heredoc: <<HEREDOC # comment
          # then the comment's end_offset is up near the heredoc_beg.
          # This is not the correct offset to use for figuring out if
          # there is trailing whitespace after the last token.
          # Use the greater offset of the two to determine the start of
          # the trailing whitespace.
          start_offset = [previous_token.location.end_offset, last_heredoc_end].compact.max
          end_offset = token.location.start_offset

          if start_offset < end_offset
            if bom
              start_offset += 3
              end_offset += 3
            end

            tokens << Token.new([[lineno, 0], :on_nl, source.slice(start_offset, end_offset - start_offset), lex_state])
          end
        end

        Token.new([[lineno, column], event, value, lex_state])
      else
        Token.new([[lineno, column], event, value, lex_state])
      end

    previous_state = lex_state

    # The order in which tokens appear in our lexer is different from the
    # order that they appear in Ripper. When we hit the declaration of a
    # heredoc in prism, we skip forward and lex the rest of the content of
    # the heredoc before going back and lexing at the end of the heredoc
    # identifier.
    #
    # To match up to ripper, we keep a small state variable around here to
    # track whether we're in the middle of a heredoc or not. In this way we
    # can shuffle around the token to match Ripper's output.
    case state
    when :default
      # The default state is when there are no heredocs at all. In this
      # state we can append the token to the list of tokens and move on.
      tokens << token

      # If we get the declaration of a heredoc, then we open a new heredoc
      # and move into the heredoc_opened state.
      if event == :on_heredoc_beg
        state = :heredoc_opened
        heredoc_stack.last << Heredoc.build(token)
      end
    when :heredoc_opened
      # The heredoc_opened state is when we've seen the declaration of a
      # heredoc and are now lexing the body of the heredoc. In this state we
      # push tokens onto the most recently created heredoc.
      heredoc_stack.last.last << token

      case event
      when :on_heredoc_beg
        # If we receive a heredoc declaration while lexing the body of a
        # heredoc, this means we have nested heredocs. In this case we'll
        # push a new heredoc onto the stack and stay in the heredoc_opened
        # state since we're now lexing the body of the new heredoc.
        heredoc_stack << [Heredoc.build(token)]
      when :on_heredoc_end
        # If we receive the end of a heredoc, then we're done lexing the
        # body of the heredoc. In this case we now have a completed heredoc
        # but need to wait for the next newline to push it into the token
        # stream.
        state = :heredoc_closed
      end
    when :heredoc_closed
      if %i[on_nl on_ignored_nl on_comment].include?(event) || (event == :on_tstring_content && value.end_with?("\n"))
        if heredoc_stack.size > 1
          flushing = heredoc_stack.pop
          heredoc_stack.last.last << token

          flushing.each do |heredoc|
            heredoc.to_a.each do |flushed_token|
              heredoc_stack.last.last << flushed_token
            end
          end

          state = :heredoc_opened
          next
        end
      elsif event == :on_heredoc_beg
        tokens << token
        state = :heredoc_opened
        heredoc_stack.last << Heredoc.build(token)
        next
      elsif heredoc_stack.size > 1
        heredoc_stack[-2].last << token
        next
      end

      heredoc_stack.last.each do |heredoc|
        tokens.concat(heredoc.to_a)
      end

      heredoc_stack.last.clear
      state = :default

      tokens << token
    end
  end

  # Drop the EOF token from the list
  tokens = tokens[0...-1]

  # We sort by location to compare against Ripper's output
  tokens.sort_by!(&:location)

  # Add :on_sp tokens
  tokens = add_on_sp_tokens(tokens, source, result.data_loc, bom, eof_token)

  Result.new(tokens, result.comments, result.magic_comments, result.data_loc, result.errors, result.warnings, source)
end