BioJulia/Automa.jl

Non-deterministic FSM generation

Closed this issue · 3 comments

This is a note for myself. I need to remove nondeterminism of FSM generation before #16 because it may affect the benchmarks.

Here is the smallest example of nondeterminism I could make:

import Automa
import Automa.RegExp: @re_str

re = re"0?12|0?1*"

dot = Automa.machine2dot(Automa.compile(re))
write("test.0.dot", dot)
run(`dot -Tpng -o test.0.png test.0.dot`)
n = 0
for i in 1:100
    dot′ = Automa.machine2dot(Automa.compile(deepcopy(re)))
    if hash(dot′) != hash(dot)
        n += 1
        write("test.$(i).dot", dot′)
        run(`dot -Tpng -o test.$(i).png test.$(i).dot`)
    end
end
@show n

There are two possibilities:

test 0
test 20

Curiously, re"1*|2" seems to be deterministic.

re"0?12|0?11" is also nondeterministic:
test 0
test 6

Combining re2nfa and nfa2dfa causes the problem.

import Automa
import Automa.RegExp: @re_str

re = re"0?11|0?12"

println("nfa2dot")
n = 0
for _ in 1:1000
    nfa = Automa.re2nfa(re)
    n += Automa.nfa2dot(nfa) != Automa.nfa2dot(nfa)
end
@show n

println("nfa2dot . re2nfa")
n = 0
for _ in 1:1000
    n += Automa.nfa2dot(Automa.re2nfa(re)) != Automa.nfa2dot(Automa.re2nfa(re))
end
@show n

println("dfa2dot . nfa2dfa")
n = 0
for _ in 1:1000
    nfa = Automa.re2nfa(re)
    n += Automa.dfa2dot(Automa.nfa2dfa(nfa)) != Automa.dfa2dot(Automa.nfa2dfa(nfa))
end
@show n

println("dfa2dot . nfa2dfa . re2nfa")
n = 0
for _ in 1:1000
    n += Automa.dfa2dot(Automa.nfa2dfa(Automa.re2nfa(re))) != Automa.dfa2dot(Automa.nfa2dfa(Automa.re2nfa(re)))
end
@show n
nfa2dot
n = 0
nfa2dot . re2nfa
n = 0
dfa2dot . nfa2dfa
n = 0
dfa2dot . nfa2dfa . re2nfa
n = 135

This kind of poor man's Dict and Set seem to solve the problem. So I think OrderedDict and/or OrderedSet still have nondeterministic operations.

type Dict{K,V} <: Associative{K,V}
    data::Vector{Tuple{K,V}}
end

function Dict{K,V}() where {K,V}
    return Dict{K,V}(Tuple{K,V}[])
end

function Dict(kvs::Pair{K,V}...) where {K,V}
    return Dict{K,V}([(k, v) for (k, v) in kvs])
end

function Dict(vals)
    return Dict([(k, v) for (k, v) in vals])
end

function Base.copy(dict::Dict{K,V}) where {K,V}
    return Dict{K,V}(copy(dict.data))
end

function Base.length(dict::Dict)
    return length(dict.data)
end

function Base.eltype(::Type{Dict{K,V}}) where {K,V}
    return Tuple{K,V}
end

function Base.haskey(dict::Dict, key)
    for (k, v) in dict.data
        if k == key
            return true
        end
    end
    return false
end

function Base.getindex(dict::Dict, key)
    for (k, v) in dict.data
        if k == key
            return v
        end
    end
    throw(KeyError(key))
end

function Base.get!(dict::Dict, key, default)
    if haskey(dict, key)
        return dict[key]
    end
    dict[key] = default
    return default
end

function Base.get!(f::Function, dict::Dict, key)
    if haskey(dict, key)
        return dict[key]
    end
    default = f()
    dict[key] = default
    return default
end

function Base.setindex!(dict::Dict, val, key)
    for (i, (k, v)) in enumerate(dict.data)
        if k == key
            dict.data[i] = (key, val)
            return dict
        end
    end
    resize!(dict.data, length(dict.data) + 1)
    dict.data[end] = (key, val)
    return dict
end

function Base.start(dict::Dict)
    return 1
end

function Base.done(dict::Dict, i)
    return i > endof(dict.data)
end

function Base.next(dict::Dict, i)
    return dict.data[i], i + 1
end

type Set{T} <: AbstractSet{T}
    dict::Dict{T,Void}
end

function Set{T}() where T
    return Set{T}(Dict{T,Void}())
end

function Set(vals)
    return Set(Dict([(v, nothing) for v in vals]))
end

function Base.length(set::Set)
    return length(set.dict)
end

function Base.eltype(::Type{Set{T}}) where T
    return T
end

function Base.haskey(set::Set, val)
    return haskey(set.dict, val)
end

function Base.push!(set::Set, val)
    if !haskey(set, val)
        set.dict[val] = nothing
    end
    return set
end

function Base.pop!(set::Set)
    return pop!(set.dict.data)[1]
end

function Base.delete!(set::Set, val)
    for (i, (v, _)) in enumerate(set.dict.data)
        if v == val
            deleteat!(set.dict.data, i)
            break
        end
    end
    return set
end

function Base.union!(set::Set, xs)
    for x in xs
        push!(set, x)
    end
    return set
end

function Base.union(set::Set, xs)
    return union!(copy(set), xs)
end

function Base.start(set::Set)
    return start(set.dict)
end

function Base.done(set::Set, s)
    return done(set.dict, s)
end

function Base.next(set::Set, s)
    item, s = next(set.dict, s)
    return item[1], s
end

function Base.copy(set::Set{T}) where T
    return Set{T}(copy(set.dict))
end

function Base.filter(p::Function, set::Set{T}) where T
    ret = Set{T}()
    for x in set
        if p(x)
            push!(ret, x)
        end
    end
    return ret
end