r-lib/tree-sitter-r

Remove `unmatched_delimiter` after all

DavisVaughan opened this issue · 1 comments

In some cases unmatched_delimiter can actually interfere with tree sitter's error recovery and make things worse!

What seems to be happening is that if () is invalid R code because there isn't a condition between the two (), and in our grammar we expect the condition to be any _expression. However, an unmatched ) is considered an _expression so it gets matched here. It gets considered as part of a weird binary operator node of ) 1 + 1 where ) is the LHS, the first 1 is an unexpected error, and the second 1 is the RHS.

text <- "foo <- function() {
  if ()

  1 + 1
}"

library(treesitter)
parser <- parser(treesitterr::language())
parser_parse(parser, text) |>
  tree_root_node() |>
  node_show_s_expression()
#> (program [(0, 0), (4, 1)]
#>   (binary_operator [(0, 0), (1, 0)]
#>     lhs: (identifier [(0, 0), (0, 3)])
#>     operator: "<-" [(0, 4), (0, 6)]
#>     rhs: (function_definition [(0, 7), (1, 0)]
#>       name: "function" [(0, 7), (0, 15)]
#>       parameters: (parameters [(0, 15), (0, 17)]
#>         "(" [(0, 15), (0, 16)]
#>         ")" [(0, 16), (0, 17)]
#>       )
#>       body: (braced_expression [(0, 18), (1, 0)]
#>         "{" [(0, 18), (0, 19)]
#>       )
#>     )
#>   )
#>   (ERROR [(1, 2), (4, 1)]
#>     "if" [(1, 2), (1, 4)]
#>     "(" [(1, 5), (1, 6)]
#>     (binary_operator [(1, 6), (3, 7)]
#>       lhs: (unmatched_delimiter [(1, 6), (1, 7)])
#>       lhs: (ERROR [(3, 2), (3, 3)])
#>       operator: "+" [(3, 4), (3, 5)]
#>       rhs: (float [(3, 6), (3, 7)])
#>     )
#>   )
#> )

It gets parsed more sensibly if you remove unmatched_delimiter

#> (program [(0, 0), (4, 1)]
#>   (binary_operator [(0, 0), (4, 1)]
#>     lhs: (identifier [(0, 0), (0, 3)])
#>     operator: "<-" [(0, 4), (0, 6)]
#>     rhs: (function_definition [(0, 7), (4, 1)]
#>       name: "function" [(0, 7), (0, 15)]
#>       parameters: (parameters [(0, 15), (0, 17)]
#>         "(" [(0, 15), (0, 16)]
#>         ")" [(0, 16), (0, 17)]
#>       )
#>       body: (braced_expression [(0, 18), (4, 1)]
#>         "{" [(0, 18), (0, 19)]
#>         body: (if_statement [(1, 2), (3, 7)]
#>           "if" [(1, 2), (1, 4)]
#>           "(" [(1, 5), (1, 6)]
#>           condition: (identifier [(1, 6), (1, 6)])
#>           ")" [(1, 6), (1, 7)]
#>           consequence: (binary_operator [(3, 2), (3, 7)]
#>             lhs: (float [(3, 2), (3, 3)])
#>             operator: "+" [(3, 4), (3, 5)]
#>             rhs: (float [(3, 6), (3, 7)])
#>           )
#>         )
#>         "}" [(4, 0), (4, 1)]
#>       )
#>     )
#>   )
#> )

The condition: (identifier [(1, 6), (1, 6)]) is really a MISSING node but the print method isn't showing that right now due to a tree-sitter bug.

Here's another example that is more complex, but it makes the entire program show as an ERROR. I'm going to hide it in details because it is long

text <- "blah('foo',
list(
  foo = function() {
    if ()
  },

  foo2 = function() {

  }
))"

library(treesitter)
parser <- parser(treesitterr::language())
parser_parse(parser, text) |>
  tree_root_node() |>
  node_show_s_expression()
#> (ERROR [(0, 0), (9, 2)]
#>   (identifier [(0, 0), (0, 4)])
#>   "(" [(0, 4), (0, 5)]
#>   argument: (argument [(0, 5), (0, 10)]
#>     value: (string [(0, 5), (0, 10)])
#>   )
#>   (comma [(0, 10), (0, 11)])
#>   (identifier [(1, 0), (1, 4)])
#>   "(" [(1, 4), (1, 5)]
#>   argument: (argument [(2, 2), (9, 2)]
#>     name: (identifier [(2, 2), (2, 5)])
#>     "=" [(2, 6), (2, 7)]
#>     value: (function_definition [(2, 8), (9, 2)]
#>       name: "function" [(2, 8), (2, 16)]
#>       parameters: (parameters [(2, 16), (2, 18)]
#>         "(" [(2, 16), (2, 17)]
#>         ")" [(2, 17), (2, 18)]
#>       )
#>       body: (braced_expression [(2, 19), (9, 2)]
#>         "{" [(2, 19), (2, 20)]
#>         body: (if_statement [(3, 4), (9, 2)]
#>           "if" [(3, 4), (3, 6)]
#>           "(" [(3, 7), (3, 8)]
#>           condition: (binary_operator [(3, 8), (8, 3)]
#>             lhs: (unmatched_delimiter [(3, 8), (3, 9)])
#>             lhs: (ERROR [(4, 2), (6, 6)]
#>               (comma [(4, 3), (4, 4)])
#>             )
#>             operator: "=" [(6, 7), (6, 8)]
#>             rhs: (function_definition [(6, 9), (8, 3)]
#>               name: "function" [(6, 9), (6, 17)]
#>               parameters: (parameters [(6, 17), (6, 19)]
#>                 "(" [(6, 17), (6, 18)]
#>                 ")" [(6, 18), (6, 19)]
#>               )
#>               body: (braced_expression [(6, 20), (8, 3)]
#>                 "{" [(6, 20), (6, 21)]
#>                 "}" [(8, 2), (8, 3)]
#>               )
#>             )
#>           )
#>           ")" [(9, 0), (9, 1)]
#>           consequence: (unmatched_delimiter [(9, 1), (9, 2)])
#>         )
#>       )
#>     )
#>   )
#> )

Without unmatched_delimiter, it is better and we get zero width identifiers inserted as MISSING nodes.

#> (program [(0, 0), (9, 2)]
#>   (call [(0, 0), (9, 2)]
#>     function: (identifier [(0, 0), (0, 4)])
#>     arguments: (arguments [(0, 4), (9, 2)]
#>       "(" [(0, 4), (0, 5)]
#>       argument: (argument [(0, 5), (0, 10)]
#>         value: (string [(0, 5), (0, 10)])
#>       )
#>       (comma [(0, 10), (0, 11)])
#>       argument: (argument [(1, 0), (9, 1)]
#>         value: (call [(1, 0), (9, 1)]
#>           function: (identifier [(1, 0), (1, 4)])
#>           arguments: (arguments [(1, 4), (9, 1)]
#>             "(" [(1, 4), (1, 5)]
#>             argument: (argument [(2, 2), (4, 3)]
#>               name: (identifier [(2, 2), (2, 5)])
#>               "=" [(2, 6), (2, 7)]
#>               value: (function_definition [(2, 8), (4, 3)]
#>                 name: "function" [(2, 8), (2, 16)]
#>                 parameters: (parameters [(2, 16), (2, 18)]
#>                   "(" [(2, 16), (2, 17)]
#>                   ")" [(2, 17), (2, 18)]
#>                 )
#>                 body: (braced_expression [(2, 19), (4, 3)]
#>                   "{" [(2, 19), (2, 20)]
#>                   body: (if_statement [(3, 4), (4, 0)]
#>                     "if" [(3, 4), (3, 6)]
#>                     "(" [(3, 7), (3, 8)]
#>                     condition: (identifier [(3, 8), (3, 8)])
#>                     ")" [(3, 8), (3, 9)]
#>                     consequence: (identifier [(4, 0), (4, 0)])
#>                   )
#>                   "}" [(4, 2), (4, 3)]
#>                 )
#>               )
#>             )
#>             (comma [(4, 3), (4, 4)])
#>             argument: (argument [(6, 2), (8, 3)]
#>               name: (identifier [(6, 2), (6, 6)])
#>               "=" [(6, 7), (6, 8)]
#>               value: (function_definition [(6, 9), (8, 3)]
#>                 name: "function" [(6, 9), (6, 17)]
#>                 parameters: (parameters [(6, 17), (6, 19)]
#>                   "(" [(6, 17), (6, 18)]
#>                   ")" [(6, 18), (6, 19)]
#>                 )
#>                 body: (braced_expression [(6, 20), (8, 3)]
#>                   "{" [(6, 20), (6, 21)]
#>                   "}" [(8, 2), (8, 3)]
#>                 )
#>               )
#>             )
#>             ")" [(9, 0), (9, 1)]
#>           )
#>         )
#>       )
#>       ")" [(9, 1), (9, 2)]
#>     )
#>   )
#> )

You can see it in the raw print method that uses their internal api, i.e. look for (identifier (MISSING _identifier) below

[1] "(program (call function: (identifier) arguments: (arguments argument: (argument value: (string)) (comma) argument: (argument value: (call function: (identifier) arguments: (arguments argument: (argument name: (identifier) value: (function_definition parameters: (parameters) body: (braced_expression body: (if_statement condition: (identifier (MISSING _identifier)) consequence: (identifier (MISSING _identifier)))))) (comma) argument: (argument name: (identifier) value: (function_definition parameters: (parameters) body: (braced_expression)))))))))"

Another case from @lionel-

text <- "{
  ggplot() +
}"
treesitter:::text_parse(text, treesitterr::language())
(program [(0, 0), (2, 1)]
  (braced_expression [(0, 0), (2, 1)]
    "{" [(0, 0), (0, 1)]
    body: (binary_operator [(1, 2), (2, 1)]
      lhs: (call [(1, 2), (1, 10)]
        function: (identifier [(1, 2), (1, 8)])
        arguments: (arguments [(1, 8), (1, 10)]
          "(" [(1, 8), (1, 9)]
          ")" [(1, 9), (1, 10)]
        )
      )
      operator: "+" [(1, 11), (1, 12)]
      rhs: (unmatched_delimiter [(2, 0), (2, 1)])
    )
  )
)

so the { looks unmatched and the } looks like an unmatched_delimiter expression on the RHS of the +

if you remove unmatched_delimiter entirely you get

(program [(0, 0), (2, 1)]
  (braced_expression [(0, 0), (2, 1)]
    "{" [(0, 0), (0, 1)]
    body: (binary_operator [(1, 2), (2, 0)]
      lhs: (call [(1, 2), (1, 10)]
        function: (identifier [(1, 2), (1, 8)])
        arguments: (arguments [(1, 8), (1, 10)]
          "(" [(1, 8), (1, 9)]
          ")" [(1, 9), (1, 10)]
        )
      )
      operator: "+" [(1, 11), (1, 12)]
      rhs: (identifier [(2, 0), (2, 0)])
    )
    "}" [(2, 0), (2, 1)]
  )
)

where now the { } is matched and tree sitter error recovery is smart enough to inject rhs: (identifier [(2, 0), (2, 0)]) which is actually a MISSING node but due to the TS bug we can't display that right rn