Tokenisation fails to produce an error
Closed this issue · 1 comments
andreypopp commented
Given this code (a stripped down tokenizer example):
import Automa
import Automa.RegExp: @re_str
re = Automa.RegExp
string = re.cat('"', re.rep(re"[ !#-~]" | re.cat("\\\"")), '"')
minijulia = Automa.compile(
string => :(emit(:string)),
re"[0-9]+" => :(emit(:integer)),
re"[\t ]+" => :(emit(:spaces)),
)
context = Automa.CodeGenContext()
@eval function tokenize(data)
$(Automa.generate_init_code(context, minijulia))
p_end = p_eof = sizeof(data)
tokens = Tuple{Symbol,String}[]
emit(kind) = push!(tokens, (kind, data[ts:te]))
while p ≤ p_eof && cs > 0
$(Automa.generate_exec_code(context, minijulia))
end
if cs < 0
error("failed to tokenize")
end
return tokens
end
the case:
julia> tokenize(""" "s """)
1-element Vector{Tuple{Symbol, String}}:
(:spaces, " ")
produces just a single token for the space and raises no error. I'd expect it to fail.
jakobnissen commented
Reduced example:
import Automa
import Automa.RegExp: @re_str
re = Automa.RegExp
tokenizer = Automa.compile(re"ab" => :(emit(:string)),)
context = Automa.CodeGenContext()
@eval function tokenize(data)
$(Automa.generate_init_code(context, tokenizer))
p_end = p_eof = sizeof(data)
$(Automa.generate_exec_code(context, tokenizer))
return cs
end
@assert tokenize("a") < 0
It looks like the original author forgot to add a clause for when the tokenizer just stops in the middle of a token. I think it should be fairly easy to fix; I'll see if I can't push a fix later today. In `tokenize.jl`, around lines 85:
if p > p_eof ≥ 0 && cs ∈ $(tokenizer.machine.final_states)
$(eof_action_code)
cs = 0
elseif cs < 0
p -= 1
end
The tokenizer needs to set `cs = -cs` when `p > p_eof && cs ∉ $(tokenizer.machine.final_states)`, i.e. when input is exhausted while the machine is still in a non-final state.
More ambitiously, the tokenizer could reuse most of the `Machine`'s codegen machinery, perhaps by simply calling it internally, since the tokenizer already contains a machine.