- mklex: CLI tool for compiling regular expressions into state transition tables
- libklex: C++ library for lexing
mklex - klex lexer generator
(c) 2018 Christian Parpart <christian@parpart.family>
-v, --verbose Prints some more verbose output
-h, --help Prints this help and exits
-f, --file=PATTERN_FILE Input file with lexer rules
-t, --output-table=FILE Output file that will contain the compiled tables (use - to represent stderr)
-T, --output-token=FILE Output file that will contain the generated token enum (use - to represent stderr)
-n, --table-name=IDENTIFIER Symbol name for generated table (may include namespace). [lexerDef]
-N, --token-name=IDENTIFIER Symbol name for generated token enum type (may include namespace). [Token]
-M, --machine-name=IDENTIFIER
Symbol name for generated machine enum type (must not include namespace). [Machine]
-x, --debug-dfa=DOT_FILE Writes dot graph of final finite automaton. Use - to represent stdout. []
-d, --debug-nfa Writes dot graph of non-deterministic finite automaton to stdout and exits.
--no-dfa-minimize Do not minimize the DFA
-p, --perf Print performance counters to stderr.
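For example, mklex -f rules.klex -t myrules.h -T mytokens.h writes the compiled state-transition tables to myrules.h and the matching token enum to mytokens.h; -n and -N let you change the generated symbol names (including namespaces).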
# specials
Spacing(ignore) ::= "[\t\s]+"
Eof ::= <<EOF>>
# symbols
Plus ::= \+
RndOpen ::= \(
RndClose ::= \)
# keywords
If ::= if
Then ::= then
Else ::= else
# literals & identifiers
NumberLiteral ::= 0|[1-9][0-9]*
Identifier ::= [a-zA-Z_][a-zA-Z0-9_]*
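Each rule name becomes an enumerator of the token type emitted via -T. For the rules above, the generated token header would look roughly like the hand-written enum in the mathexpr example further below (a sketch only; the exact numbering, the treatment of ignored rules, and any extra helpers such as to_string() may differ):

enum class Token {
  Eof = 1,
  Plus,
  RndOpen,
  RndClose,
  If,
  Then,
  Else,
  NumberLiteral,
  Identifier,
};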
A great thing about the Lexer API is that it is header-only: the most complex work has already been done when the tables were compiled.
You can compile the above grammar with mklex -f rules.klex -t myrules.h -T mytokens.h
and then compile the code below:
#include <klex/Lexer.h>
#include <fmt/format.h>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <memory>
#include "myrules.h"
#include "mytokens.h"
int main(int argc, const char* argv[]) {
  klex::Lexer<Token> lexer {lexerDef, std::make_unique<std::ifstream>(argv[1])};

  for (Token t = lexer.recognize(); t != Token::Eof; t = lexer.recognize()) {
    std::cerr << fmt::format("[{}-{}]: token {} (\"{}\")\n",
                             lexer.offset().first, lexer.offset().second,
                             to_string(t), lexer.word());
  }

  return EXIT_SUCCESS;
}
See examples/mathexpr.cc for a complete example. Here's a snippet:
enum class Token { Eof = 1, Plus, Minus, Mul, Div, RndOpen, RndClose, Number, INVALID };
std::string RULES = R"(
Space(ignore) ::= [\s\t]+
Eof ::= <<EOF>>
Plus ::= "+"
Minus ::= "-"
Mul ::= "*"
Div ::= "/"
RndOpen ::= "("
RndClose ::= ")"
Number ::= -?([0-9]+|[0-9]{1,3}(_[0-9]{3})*)
INVALID ::= .
)";
using Number = long long int;
Number expr(Lexer<Token>& lexer) {
  // [... consume lexer tokens here ...]
  return 42;
}
int main(int argc, const char* argv[]) {
  klex::Compiler cc;
  cc.declareAll(std::make_unique<std::stringstream>(RULES));

  std::string input = "2 + 3 * (5 - 1)";
  Lexer<Token> lexer { cc.compile(), std::make_unique<std::stringstream>(input) };
  lexer.recognize(); // recognize first token

  Number y = expr(lexer);
  std::cerr << fmt::format("{} = {}\n", input, y);

  return EXIT_SUCCESS;
}
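For a rough idea of how expr() could walk the token stream through this API, here is a hypothetical recursive-descent sketch. It is not the code from examples/mathexpr.cc: it tracks the current token in a small helper struct and relies only on recognize() and word() as used above.

// Hypothetical sketch, not taken from examples/mathexpr.cc.
// Assumes recognize() advances to and returns the next token, word() returns
// the lexeme of the last recognized token, and <string> is included for std::stoll.
struct Cursor {
  Lexer<Token>& lexer;
  Token current;
  void advance() { current = lexer.recognize(); }
};

Number sum(Cursor& c);

Number primary(Cursor& c) {               // primary ::= Number | '(' sum ')'
  if (c.current == Token::RndOpen) {
    c.advance();                          // consume '('
    Number v = sum(c);
    c.advance();                          // consume ')' (no error handling in this sketch)
    return v;
  }
  Number v = std::stoll(std::string(c.lexer.word()));  // underscore-grouped literals would need extra handling
  c.advance();
  return v;
}

Number product(Cursor& c) {               // product ::= primary (('*' | '/') primary)*
  Number lhs = primary(c);
  while (c.current == Token::Mul || c.current == Token::Div) {
    Token op = c.current;
    c.advance();
    Number rhs = primary(c);
    lhs = (op == Token::Mul) ? lhs * rhs : lhs / rhs;
  }
  return lhs;
}

Number sum(Cursor& c) {                   // sum ::= product (('+' | '-') product)*
  Number lhs = product(c);
  while (c.current == Token::Plus || c.current == Token::Minus) {
    Token op = c.current;
    c.advance();
    Number rhs = product(c);
    lhs = (op == Token::Plus) ? lhs + rhs : lhs - rhs;
  }
  return lhs;
}

An expr() matching the snippet's signature would construct the Cursor from the token that main() already recognized and then call sum().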