Remove `unmatched_delimiter` after all
DavisVaughan opened this issue · 1 comments
In some cases unmatched_delimiter
can actually interfere with tree sitter's error recovery and make things worse!
What seems to be happening is that if ()
is invalid R code because there isn't a condition
between the two ()
, and in our grammar we expect the condition
to be any _expression
. However, an unmatched )
is considered an _expression
so it gets matched here. It gets considered as part of a weird binary operator node of ) 1 + 1
where )
is the LHS, the first 1
is an unexpected error, and the second 1
is the RHS.
text <- "foo <- function() {
if ()
1 + 1
}"
library(treesitter)
parser <- parser(treesitterr::language())
parser_parse(parser, text) |>
tree_root_node() |>
node_show_s_expression()
#> (program [(0, 0), (4, 1)]
#> (binary_operator [(0, 0), (1, 0)]
#> lhs: (identifier [(0, 0), (0, 3)])
#> operator: "<-" [(0, 4), (0, 6)]
#> rhs: (function_definition [(0, 7), (1, 0)]
#> name: "function" [(0, 7), (0, 15)]
#> parameters: (parameters [(0, 15), (0, 17)]
#> "(" [(0, 15), (0, 16)]
#> ")" [(0, 16), (0, 17)]
#> )
#> body: (braced_expression [(0, 18), (1, 0)]
#> "{" [(0, 18), (0, 19)]
#> )
#> )
#> )
#> (ERROR [(1, 2), (4, 1)]
#> "if" [(1, 2), (1, 4)]
#> "(" [(1, 5), (1, 6)]
#> (binary_operator [(1, 6), (3, 7)]
#> lhs: (unmatched_delimiter [(1, 6), (1, 7)])
#> lhs: (ERROR [(3, 2), (3, 3)])
#> operator: "+" [(3, 4), (3, 5)]
#> rhs: (float [(3, 6), (3, 7)])
#> )
#> )
#> )
It gets parsed more sensibly if you remove unmatched_delimiter
#> (program [(0, 0), (4, 1)]
#> (binary_operator [(0, 0), (4, 1)]
#> lhs: (identifier [(0, 0), (0, 3)])
#> operator: "<-" [(0, 4), (0, 6)]
#> rhs: (function_definition [(0, 7), (4, 1)]
#> name: "function" [(0, 7), (0, 15)]
#> parameters: (parameters [(0, 15), (0, 17)]
#> "(" [(0, 15), (0, 16)]
#> ")" [(0, 16), (0, 17)]
#> )
#> body: (braced_expression [(0, 18), (4, 1)]
#> "{" [(0, 18), (0, 19)]
#> body: (if_statement [(1, 2), (3, 7)]
#> "if" [(1, 2), (1, 4)]
#> "(" [(1, 5), (1, 6)]
#> condition: (identifier [(1, 6), (1, 6)])
#> ")" [(1, 6), (1, 7)]
#> consequence: (binary_operator [(3, 2), (3, 7)]
#> lhs: (float [(3, 2), (3, 3)])
#> operator: "+" [(3, 4), (3, 5)]
#> rhs: (float [(3, 6), (3, 7)])
#> )
#> )
#> "}" [(4, 0), (4, 1)]
#> )
#> )
#> )
#> )
The condition: (identifier [(1, 6), (1, 6)])
is really a MISSING
node but the print method isn't showing that right now due to a tree-sitter bug.
Here's another example that is more complex, but it makes the entire program show as an ERROR. I'm going to hide it in details because it is long
text <- "blah('foo',
list(
foo = function() {
if ()
},
foo2 = function() {
}
))"
library(treesitter)
parser <- parser(treesitterr::language())
parser_parse(parser, text) |>
tree_root_node() |>
node_show_s_expression()
#> (ERROR [(0, 0), (9, 2)]
#> (identifier [(0, 0), (0, 4)])
#> "(" [(0, 4), (0, 5)]
#> argument: (argument [(0, 5), (0, 10)]
#> value: (string [(0, 5), (0, 10)])
#> )
#> (comma [(0, 10), (0, 11)])
#> (identifier [(1, 0), (1, 4)])
#> "(" [(1, 4), (1, 5)]
#> argument: (argument [(2, 2), (9, 2)]
#> name: (identifier [(2, 2), (2, 5)])
#> "=" [(2, 6), (2, 7)]
#> value: (function_definition [(2, 8), (9, 2)]
#> name: "function" [(2, 8), (2, 16)]
#> parameters: (parameters [(2, 16), (2, 18)]
#> "(" [(2, 16), (2, 17)]
#> ")" [(2, 17), (2, 18)]
#> )
#> body: (braced_expression [(2, 19), (9, 2)]
#> "{" [(2, 19), (2, 20)]
#> body: (if_statement [(3, 4), (9, 2)]
#> "if" [(3, 4), (3, 6)]
#> "(" [(3, 7), (3, 8)]
#> condition: (binary_operator [(3, 8), (8, 3)]
#> lhs: (unmatched_delimiter [(3, 8), (3, 9)])
#> lhs: (ERROR [(4, 2), (6, 6)]
#> (comma [(4, 3), (4, 4)])
#> )
#> operator: "=" [(6, 7), (6, 8)]
#> rhs: (function_definition [(6, 9), (8, 3)]
#> name: "function" [(6, 9), (6, 17)]
#> parameters: (parameters [(6, 17), (6, 19)]
#> "(" [(6, 17), (6, 18)]
#> ")" [(6, 18), (6, 19)]
#> )
#> body: (braced_expression [(6, 20), (8, 3)]
#> "{" [(6, 20), (6, 21)]
#> "}" [(8, 2), (8, 3)]
#> )
#> )
#> )
#> ")" [(9, 0), (9, 1)]
#> consequence: (unmatched_delimiter [(9, 1), (9, 2)])
#> )
#> )
#> )
#> )
#> )
Without unmatched_delimiter
, it is better and we get zero width identifier
s inserted as MISSING nodes.
#> (program [(0, 0), (9, 2)]
#> (call [(0, 0), (9, 2)]
#> function: (identifier [(0, 0), (0, 4)])
#> arguments: (arguments [(0, 4), (9, 2)]
#> "(" [(0, 4), (0, 5)]
#> argument: (argument [(0, 5), (0, 10)]
#> value: (string [(0, 5), (0, 10)])
#> )
#> (comma [(0, 10), (0, 11)])
#> argument: (argument [(1, 0), (9, 1)]
#> value: (call [(1, 0), (9, 1)]
#> function: (identifier [(1, 0), (1, 4)])
#> arguments: (arguments [(1, 4), (9, 1)]
#> "(" [(1, 4), (1, 5)]
#> argument: (argument [(2, 2), (4, 3)]
#> name: (identifier [(2, 2), (2, 5)])
#> "=" [(2, 6), (2, 7)]
#> value: (function_definition [(2, 8), (4, 3)]
#> name: "function" [(2, 8), (2, 16)]
#> parameters: (parameters [(2, 16), (2, 18)]
#> "(" [(2, 16), (2, 17)]
#> ")" [(2, 17), (2, 18)]
#> )
#> body: (braced_expression [(2, 19), (4, 3)]
#> "{" [(2, 19), (2, 20)]
#> body: (if_statement [(3, 4), (4, 0)]
#> "if" [(3, 4), (3, 6)]
#> "(" [(3, 7), (3, 8)]
#> condition: (identifier [(3, 8), (3, 8)])
#> ")" [(3, 8), (3, 9)]
#> consequence: (identifier [(4, 0), (4, 0)])
#> )
#> "}" [(4, 2), (4, 3)]
#> )
#> )
#> )
#> (comma [(4, 3), (4, 4)])
#> argument: (argument [(6, 2), (8, 3)]
#> name: (identifier [(6, 2), (6, 6)])
#> "=" [(6, 7), (6, 8)]
#> value: (function_definition [(6, 9), (8, 3)]
#> name: "function" [(6, 9), (6, 17)]
#> parameters: (parameters [(6, 17), (6, 19)]
#> "(" [(6, 17), (6, 18)]
#> ")" [(6, 18), (6, 19)]
#> )
#> body: (braced_expression [(6, 20), (8, 3)]
#> "{" [(6, 20), (6, 21)]
#> "}" [(8, 2), (8, 3)]
#> )
#> )
#> )
#> ")" [(9, 0), (9, 1)]
#> )
#> )
#> )
#> ")" [(9, 1), (9, 2)]
#> )
#> )
#> )
You can see it in the raw print method that uses their internal api, i.e. look for (identifier (MISSING _identifier)
below
[1] "(program (call function: (identifier) arguments: (arguments argument: (argument value: (string)) (comma) argument: (argument value: (call function: (identifier) arguments: (arguments argument: (argument name: (identifier) value: (function_definition parameters: (parameters) body: (braced_expression body: (if_statement condition: (identifier (MISSING _identifier)) consequence: (identifier (MISSING _identifier)))))) (comma) argument: (argument name: (identifier) value: (function_definition parameters: (parameters) body: (braced_expression)))))))))"
Another case from @lionel-
text <- "{
ggplot() +
}"
treesitter:::text_parse(text, treesitterr::language())
(program [(0, 0), (2, 1)]
(braced_expression [(0, 0), (2, 1)]
"{" [(0, 0), (0, 1)]
body: (binary_operator [(1, 2), (2, 1)]
lhs: (call [(1, 2), (1, 10)]
function: (identifier [(1, 2), (1, 8)])
arguments: (arguments [(1, 8), (1, 10)]
"(" [(1, 8), (1, 9)]
")" [(1, 9), (1, 10)]
)
)
operator: "+" [(1, 11), (1, 12)]
rhs: (unmatched_delimiter [(2, 0), (2, 1)])
)
)
)
so the {
looks unmatched and the }
looks like an unmatched_delimiter
expression on the RHS of the +
if you remove unmatched_delimiter
entirely you get
(program [(0, 0), (2, 1)]
(braced_expression [(0, 0), (2, 1)]
"{" [(0, 0), (0, 1)]
body: (binary_operator [(1, 2), (2, 0)]
lhs: (call [(1, 2), (1, 10)]
function: (identifier [(1, 2), (1, 8)])
arguments: (arguments [(1, 8), (1, 10)]
"(" [(1, 8), (1, 9)]
")" [(1, 9), (1, 10)]
)
)
operator: "+" [(1, 11), (1, 12)]
rhs: (identifier [(2, 0), (2, 0)])
)
"}" [(2, 0), (2, 1)]
)
)
where now the { }
is matched and tree sitter error recovery is smart enough to inject rhs: (identifier [(2, 0), (2, 0)])
which is actually a MISSING node but due to the TS bug we can't display that right rn