Improve rubocop handling of multiline ruby (#415)

* Remove RubyExtractor It's not used since the new RubyExtraction system * More uniform handling of comments with a space Ex: - #hi and = #world They both get turned to - comments * Rubocop: Improved handling for multiline scripts and tag attributes * Normalize handling of indentation for tag attributes and tag script Before, if a tag had children, the attributes and script of the tag would be nested in the begin, adding to their indentation. If the tag did not have childrne, the attributes and script of the tag would not be nested, leading to occasional inconsistencies in what would happen to them based on the presence of a child... Now they are always nested. --------- Co-authored-by: Shane da Silva <[email protected]>
sds · Jul 6, 2023 · a9761b2 · a9761b2
1 parent 92bd0d4
commit a9761b2
Show file tree

Hide file tree

Showing 20 changed files with 1,471 additions and 1,256 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,10 +1,13 @@
 # HAML-Lint Changelog
 
+# master (unreleased)
+
 * Fix `Marshal.dump` error when using `--parallel` option and `RepeatedId` Linter
+* Improved support for multiline code with RuboCop linting/auto-correction
 
 # 0.47.0
 
-* Bugfixes related to experimental auto-correct
+* Bugfixes related to experimental auto-correct with RuboCop
 * Fix `Marshal.dump` errors when using `--parallel` option
 
 ## 0.46.0

diff --git a/lib/haml_lint/linter/rubocop.rb b/lib/haml_lint/linter/rubocop.rb
@@ -1,6 +1,5 @@
 # frozen_string_literal: true
 
-require 'haml_lint/ruby_extractor'
 require 'rubocop'
 require 'tempfile'
 

diff --git a/lib/haml_lint/ruby_extraction/chunk_extractor.rb b/lib/haml_lint/ruby_extraction/chunk_extractor.rb
@@ -11,6 +11,12 @@ class ChunkExtractor
 
     attr_reader :script_output_prefix
 
+    HAML_PARSER_INSTANCE = if Haml::VERSION >= '5.0.0'
+                             ::Haml::Parser.new({})
+                           else
+                             ::Haml::Parser.new('', {})
+                           end
+
     def initialize(document, script_output_prefix:)
       @document = document
       @script_output_prefix = script_output_prefix
@@ -19,13 +25,18 @@ def initialize(document, script_output_prefix:)
     def extract
       raise 'Already extracted' if @ruby_chunks
 
-      @ruby_chunks = []
-      @original_haml_lines = @document.source_lines
+      prepare_extract
 
       visit(@document.tree)
       @ruby_chunks
     end
 
+    # Useful for tests
+    def prepare_extract
+      @ruby_chunks = []
+      @original_haml_lines = @document.source_lines
+    end
+
     def visit_root(_node)
       yield # Collect lines of code from children
     end
@@ -71,33 +82,37 @@ def visit_haml_comment(node)
     # Visiting comments which are output to HTML. Lines looking like
     #   `  / This will be in the HTML source!`
     def visit_comment(node)
-      lines = raw_lines_of_interest(node.line - 1)
-      indent = lines.first.index(/\S/)
+      line = @original_haml_lines[node.line - 1]
+      indent = line.index(/\S/)
       @ruby_chunks << PlaceholderMarkerChunk.new(node, 'comment', indent: indent)
     end
 
     # Visit a script which outputs. Lines looking like `  = foo`
     def visit_script(node, &block)
-      lines = raw_lines_of_interest(node.line - 1)
+      raw_first_line = @original_haml_lines[node.line - 1]
 
-      if lines.first !~ /\A\s*[-=]/
+      if raw_first_line !~ /\A\s*[-=]/
         # The line doesn't start with a - or a =, this is actually a "plain"
         # that contains interpolation.
-        indent = lines.first.index(/\S/)
+        indent = raw_first_line.index(/\S/)
         @ruby_chunks << PlaceholderMarkerChunk.new(node, 'interpolation', indent: indent)
-        add_interpolation_chunks(node, lines.first, node.line - 1, indent: indent)
+        add_interpolation_chunks(node, raw_first_line, node.line - 1, indent: indent)
         return
       end
 
+      _first_line_offset, lines = extract_raw_ruby_lines(node.script, node.line - 1)
+      # We want the actual indentation and prefix for the first line
+      first_line = lines[0] = @original_haml_lines[node.line - 1].rstrip
+      process_multiline!(first_line)
+
       lines[0] = lines[0].sub(/(=[ \t]?)/, '')
       line_indentation = Regexp.last_match(1).size
 
       raw_code = lines.join("\n")
 
       if lines[0][/\S/] == '#'
-        # a script that only constains a comment... needs special handling
-        comment_index = lines[0].index(/\S/)
-        lines[0].insert(comment_index + 1, " #{script_output_prefix.rstrip}")
+        # a "=" script that only contains a comment... No need for the "HL.out = " prefix,
+        # just treat it as comment which will turn into a "-" comment
       else
         lines[0] = HamlLint::Utils.insert_after_indentation(lines[0], script_output_prefix)
       end
@@ -131,7 +146,11 @@ def visit_script(node, &block)
 
     # Visit a script which doesn't output. Lines looking like `  - foo`
     def visit_silent_script(node, &block)
-      lines = raw_lines_of_interest(node.line - 1)
+      _first_line_offset, lines = extract_raw_ruby_lines(node.script, node.line - 1)
+      # We want the actual indentation and prefix for the first line
+      first_line = lines[0] = @original_haml_lines[node.line - 1].rstrip
+      process_multiline!(first_line)
+
       lines[0] = lines[0].sub(/(-[ \t]?)/, '')
       nb_to_deindent = Regexp.last_match(1).size
 
@@ -184,26 +203,32 @@ def finish_visit_any_script(node, lines, raw_code: nil, must_start_chunk: false)
     def visit_tag(node)
       indent = @original_haml_lines[node.line - 1].index(/\S/)
 
-      has_children = !node.children.empty?
-      if has_children
-        # We don't want to use a block because assignments in a block are local to that block,
-        # so the semantics of the extracted ruby would be different from the one generated by
-        # Haml. Those differences can make some cops, such as UselessAssignment, have false
-        # positives
-        code = 'begin'
-        @ruby_chunks << AdHocChunk.new(node,
-                                       [' ' * indent + code])
-        indent += 2
-      end
+      # We don't want to use a block because assignments in a block are local to that block,
+      # so the semantics of the extracted ruby would be different from the one generated by
+      # Haml. Those differences can make some cops, such as UselessAssignment, have false
+      # positives
+      code = 'begin'
+      @ruby_chunks << AdHocChunk.new(node,
+                                     [' ' * indent + code])
+      indent += 2
 
-      @ruby_chunks << PlaceholderMarkerChunk.new(node, 'tag', indent: indent)
+      tag_chunk = PlaceholderMarkerChunk.new(node, 'tag', indent: indent)
+      @ruby_chunks << tag_chunk
 
       current_line_index = visit_tag_attributes(node, indent: indent)
       visit_tag_script(node, line_index: current_line_index, indent: indent)
 
-      if has_children
-        yield
-        indent -= 2
+      yield
+
+      indent -= 2
+
+      if @ruby_chunks.last.equal?(tag_chunk)
+        # So there is nothing going "in" the tag, remove the wrapping "begin" and replace the PlaceholderMarkerChunk
+        # by one less indented
+        @ruby_chunks.pop
+        @ruby_chunks.pop
+        @ruby_chunks << PlaceholderMarkerChunk.new(node, 'tag', indent: indent)
+      else
         @ruby_chunks << AdHocChunk.new(node,
                                        [' ' * indent + 'ensure', ' ' * indent + '  HL.noop', ' ' * indent + 'end'],
                                        haml_line_index: @ruby_chunks.last.haml_end_line_index)
@@ -222,11 +247,16 @@ def visit_tag_attributes(node, indent:)
       attributes_code = additional_attributes.first
       if !attributes_code && node.hash_attributes? && node.dynamic_attributes_sources.empty?
         # No idea why .foo{:bar => 123} doesn't get here, but .foo{:bar => '123'} does...
-        # The code we get for the later is {:bar => '123'}.
+        # The code we get for the latter is {:bar => '123'}.
         # We normalize it by removing the { } so that it matches wha we normally get
         attributes_code = node.dynamic_attributes_source[:hash][1...-1]
       end
 
+      if attributes_code&.start_with?('{')
+        # Looks like the .foo(bar = 123) case. Ignoring.
+        attributes_code = nil
+      end
+
       return final_line_index unless attributes_code
       # Attributes have different ways to be given to us:
       #   .foo{bar: 123} => "bar: 123"
@@ -235,14 +265,13 @@ def visit_tag_attributes(node, indent:)
       #   .foo(bar = 123) => '{"bar" => 123,}'
       #   .foo{html_attrs('fr-fr')} => html_attrs('fr-fr')
       #
-      # The (bar = 123) case is extra painful to autocorrect (so is ignored).
+      # The (bar = 123) case is extra painful to autocorrect (so is ignored up there).
       # #raw_ruby_from_haml  will "detect" this case by not finding the code.
       #
       # We wrap the result in a method to have a valid syntax for all 3 ways
       # without having to differentiate them.
-      first_line_offset, raw_attributes_lines = raw_ruby_lines_from_haml(attributes_code,
-                                                                         node.line - 1)
-
+      first_line_offset, raw_attributes_lines = extract_raw_tag_attributes_ruby_lines(attributes_code,
+                                                                                      node.line - 1)
       return final_line_index unless raw_attributes_lines
 
       final_line_index += raw_attributes_lines.size - 1
@@ -276,7 +305,7 @@ def visit_tag_script(node, line_index:, indent:)
       # We ignore scripts which are just a comment
       return if node.script[/\S/] == '#'
 
-      first_line_offset, script_lines = raw_ruby_lines_from_haml(node.script, line_index)
+      first_line_offset, script_lines = extract_raw_ruby_lines(node.script, line_index)
 
       if script_lines.nil?
         # This is a string with interpolation after a tag
@@ -380,52 +409,87 @@ def add_interpolation_chunks(node, code, haml_line_index, indent:, line_start_in
       end
     end
 
+    def process_multiline!(line)
+      if HAML_PARSER_INSTANCE.send(:is_multiline?, line)
+        line.chop!.rstrip!
+        true
+      else
+        false
+      end
+    end
+
     # Returns the raw lines from the haml for the given index.
     # Multiple lines are returned when a line ends with a comma as that is the only
     # time HAMLs allows Ruby lines to be split.
-    def raw_lines_of_interest(first_line_index)
-      line_index = first_line_index
-      lines_of_interest = [@original_haml_lines[line_index]]
-
-      while @original_haml_lines[line_index].rstrip.end_with?(',')
-        line_index += 1
-        lines_of_interest << @original_haml_lines[line_index]
-      end
-
-      lines_of_interest
-    end
 
     # Haml's line-splitting rules (allowed after comma in scripts and attributes) are handled
     # at the parser level, so Haml doesn't provide the code as it is actually formatted in the Haml
     # file. #raw_ruby_from_haml extracts the ruby code as it is exactly in the Haml file.
     # The first and last lines may not be the complete lines from the Haml, only the Ruby parts
     # and the indentation between the first and last list.
-    def raw_ruby_lines_from_haml(code, first_line_index)
-      stripped_code = code.strip
-      return if stripped_code.empty?
 
-      lines_of_interest = raw_lines_of_interest(first_line_index)
+    # HAML transforms the ruby code in many ways as it parses a document. Often removing lines and/or
+    # indentation. This is quite annoying for us since we want the exact layout of the code to analyze it.
+    #
+    # This function receives the code as haml provides it and the line where it starts. It returns
+    # the actual code as it is in the haml file, keeping breaks and indentation for the following lines.
+    # In addition, the start position of the code in the first line.
+    #
+    # The rules for handling multiline code in HAML are as follow:
+    # * if the line being processed ends with a space and a pipe, then append to the line (without
+    #   newlines) every following lines that also end with a space and a pipe. This means the last line of
+    #   the "block" also needs a pipe at the end.
+    # * after processing the pipes, when dealing with ruby code (and not in tag attributes' hash), if the line
+    #   (which maybe span across multiple lines) ends with a comma, add the next line to the current piece of code.
+    #
+    # @return [first_line_offset, ruby_lines]
+    def extract_raw_ruby_lines(haml_processed_ruby_code, first_line_index)
+      haml_processed_ruby_code = haml_processed_ruby_code.strip
+      first_line = @original_haml_lines[first_line_index]
 
-      if lines_of_interest.size == 1
-        index = lines_of_interest.first.index(stripped_code)
-        if lines_of_interest.first.include?(stripped_code)
-          return [index, [stripped_code]]
-        else
-          # Sometimes, the code just isn't in the Haml when Haml does transformations to it
-          return
-        end
+      char_index = first_line.index(haml_processed_ruby_code)
+
+      if char_index
+        return [char_index, [haml_processed_ruby_code]]
+      end
+
+      cur_line_index = first_line_index
+      cur_line = first_line.rstrip
+      lines = []
+
+      # The pipes must also be on the last line of the multi-line section
+      while cur_line && process_multiline!(cur_line)
+        lines << cur_line
+        cur_line_index += 1
+        cur_line = @original_haml_lines[cur_line_index].rstrip
+      end
+
+      if lines.empty?
+        lines << cur_line
+      else
+        # The pipes must also be on the last line of the multi-line section. So cur_line is not the next line.
+        # We want to go back to check for commas
+        cur_line_index -= 1
+        cur_line = lines.last
+      end
+
+      while HAML_PARSER_INSTANCE.send(:is_ruby_multiline?, cur_line)
+        cur_line_index += 1
+        cur_line = @original_haml_lines[cur_line_index].rstrip
+        lines << cur_line
       end
 
-      raw_haml = lines_of_interest.join("\n")
+      joined_lines = lines.join("\n")
+
+      if haml_processed_ruby_code.include?("\n")
+        haml_processed_ruby_code = haml_processed_ruby_code.gsub("\n", ' ')
+      end
 
-      # Need the gsub because while multiline scripts are turned into a single line,
-      # by haml, multiline tag attributes are not.
-      code_parts = stripped_code.gsub("\n", ' ').split(/,\s*/)
+      haml_processed_ruby_code.split(/[, ]/)
 
-      regexp_code = code_parts.map { |c| Regexp.quote(c) }.join(',\\s*')
-      regexp = Regexp.new(regexp_code)
+      regexp = HamlLint::Utils.regexp_for_parts(haml_processed_ruby_code.split(/,\s*|\s+/), '(?:,\\s*|\\s+)')
 
-      match = raw_haml.match(regexp)
+      match = joined_lines.match(regexp)
       # This can happen when pipes are used as marker for multiline parts, and when tag attributes change lines
       # without ending by a comma. This is quite a can of worm and is probably not too frequent, so for now,
       # these cases are not supported.
@@ -438,6 +502,48 @@ def raw_ruby_lines_from_haml(code, first_line_index)
       [first_line_offset, ruby_lines]
     end
 
+    # Tag attributes actually handle multiline differently than scripts.
+    # The basic system basically keeps considering more lines until it meets the closing braces, but still
+    # processes pipes too (same as extract_raw_ruby_lines).
+    def extract_raw_tag_attributes_ruby_lines(haml_processed_ruby_code, first_line_index)
+      haml_processed_ruby_code = haml_processed_ruby_code.strip
+      first_line = @original_haml_lines[first_line_index]
+
+      char_index = first_line.index(haml_processed_ruby_code)
+
+      if char_index
+        return [char_index, [haml_processed_ruby_code]]
+      end
+
+      min_non_white_chars_to_add = haml_processed_ruby_code.scan(/\S/).size
+
+      regexp = HamlLint::Utils.regexp_for_parts(haml_processed_ruby_code.split(/\s+/), '\\s+')
+
+      joined_lines = first_line.rstrip
+      process_multiline!(joined_lines)
+
+      cur_line_index = first_line_index + 1
+      while @original_haml_lines[cur_line_index] && min_non_white_chars_to_add > 0
+        new_line = @original_haml_lines[cur_line_index].rstrip
+        process_multiline!(new_line)
+
+        min_non_white_chars_to_add -= new_line.scan(/\S/).size
+        joined_lines << "\n"
+        joined_lines << new_line
+        cur_line_index += 1
+      end
+
+      match = joined_lines.match(regexp)
+
+      return if match.nil?
+
+      first_line_offset = match.begin(0)
+      raw_ruby = match[0]
+      ruby_lines = raw_ruby.split("\n")
+
+      [first_line_offset, ruby_lines]
+    end
+
     def wrap_lines(lines, wrap_depth)
       lines = lines.dup
       wrapping_prefix = 'W' * (wrap_depth - 1) + '('