Tokenisation fails to produce an error

Question

Tokenisation fails to produce an error

Closed this issue 2 years ago · 1 comment

Given this code (a stripped down tokenizer example):

import Automa
import Automa.RegExp: @re_str
re = Automa.RegExp

# Double-quoted string literal: an opening quote, then any number of either a
# character in [ !#-~] (printable ASCII except the double quote) or the
# two-character sequence \" , then a closing quote.
string = re.cat('"', re.rep(re"[ !#-~]" | re.cat("\\\"")), '"')
# Token table: each pattern is paired with the expression that emits its token kind.
minijulia = Automa.compile(
    string         => :(emit(:string)),
    re"[0-9]+"     => :(emit(:integer)),
    re"[\t ]+"     => :(emit(:spaces)),
)

context = Automa.CodeGenContext()
# Tokenize `data` into a vector of `(kind, text)` tuples. The body is assembled
# from Automa-generated expressions, which is why the definition goes through `@eval`.
@eval function tokenize(data)
    # Generated initialisation; presumably defines the `p`, `cs`, `ts` and `te`
    # variables used below -- TODO confirm against Automa.
    $(Automa.generate_init_code(context, minijulia))
    p_end = p_eof = sizeof(data)
    tokens = Tuple{Symbol,String}[]
    # Invoked by the token actions: records the kind together with data[ts:te].
    emit(kind) = push!(tokens, (kind, data[ts:te]))
    # Keep running the generated matching code while `p` has not passed `p_eof`
    # and `cs` is still positive.
    while p ≤ p_eof && cs > 0
        $(Automa.generate_exec_code(context, minijulia))
    end
    # A negative `cs` is the only condition reported as a tokenization failure.
    if cs < 0
        error("failed to tokenize")
    end
    return tokens
end

the following case:

julia> tokenize(""" "s """)
1-element Vector{Tuple{Symbol, String}}:
 (:spaces, " ")

produces just a single token (for the leading space) and reports no error, even though the string literal is never closed. I'd expect it to fail.

Answer 1 · 2022-04-29T16:14:47.000Z

Reduced example:

import Automa
import Automa.RegExp: @re_str
re = Automa.RegExp

# Minimal tokenizer with a single token pattern that matches exactly "ab".
tokenizer = Automa.compile(re"ab" => :(emit(:string)),)
context = Automa.CodeGenContext()
# Run the generated code once over `data` and return the resulting value of `cs`.
@eval function tokenize(data)
    $(Automa.generate_init_code(context, tokenizer))
    p_end = p_eof = sizeof(data)
    $(Automa.generate_exec_code(context, tokenizer))
    return cs
end

# "a" is only a prefix of "ab", so the input ends in the middle of a token;
# the expected outcome is a negative `cs`.
@assert tokenize("a") < 0

It looks like the original author forgot to add a clause for when the tokenizer just stops in the middle of a token. I think it should be fairly easy to fix; I'll see if I can push a fix later today. The relevant code is in tokenize.jl, around line 85:

        # Input exhausted while in a final state: run the EOF action and set cs to 0.
        # NOTE(review): there is no branch for p > p_eof with cs outside the final
        # states, so input that ends in the middle of a token is not flagged here.
        if p > p_eof ≥ 0 && cs ∈ $(tokenizer.machine.final_states)
            $(eof_action_code)
            cs = 0
        elseif cs < 0
            p -= 1
        end

The tokenizer needs to set cs = -cs if p > p_eof && cs ∉ $(tokenizer.machine.final_states)...
More ambitiously, the tokenizer could use most of the Machine's codegen machinery, perhaps by simply calling it internally since it contains a machine.