/tokenizer

⚙️ A simple tokenizer for Deno

Primary language: TypeScript · License: MIT

Tokenizer Badge License

A simple tokenizer library for Deno


Badge Status



Examples

import { Tokenizer } from 'https://deno.land/x/tokenizer/mod.ts';

const input = 'abc 123 HELLO [a cool](link)';

const rules = [{ 
    type : 'HELLO' ,
    pattern : 'HELLO' 
},{ 
    type : 'WORD' ,
    pattern : /[a-zA-Z]+/ 
},{ 
    type : 'DIGITS' ,
    pattern : /\d+/ ,
    value : m => Number.parseInt(m.match)
},{ 
    type : 'LINK' ,
    pattern : /\[([^\[]+)\]\(([^\)]+)\)/
},{ 
    type : 'SPACE' , 
    pattern : / / ,
    ignore: true // Or leave type blank and remove "ignore: true"
}];

const tokenizer = new Tokenizer(input,rules);

Option A

console.log(...tokenizer);
{ type: "WORD", match: "abc", value: "abc", groups: [], position: { start: 0, end: 3 } },
{ type: "DIGITS", match: "123", value: 123, groups: [], position: { start: 4, end: 7 } },
{ type: "HELLO", match: "HELLO", value: "HELLO", groups: [], position: { start: 8, end: 13 } },
{ type: "LINK", match: "[a cool](link)", value: "[a cool](link)", groups: [ "a cool", "link" ], position: { start: 14, end: 28 } }

Option B

while(!tokenizer.done)
    console.log(tokenizer.next().value);
{ type: "WORD", match: "abc", value: "abc", groups: [], position: { start: 0, end: 3 } }
{ type: "DIGITS", match: "123", value: 123, groups: [], position: { start: 4, end: 7 } }
{ type: "HELLO", match: "HELLO", value: "HELLO", groups: [], position: { start: 8, end: 13 } }
{ type: "LINK", match: "[a cool](link)", value: "[a cool](link)", groups: [ "a cool", "link" ], position: { start: 14, end: 28 } }

Option C

// Optionally pass a string argument to tokenize() to override the source string
console.log(tokenizer.tokenize());
[{ type: "WORD", match: "abc", value: "abc", groups: [], position: { start: 0, end: 3 } },
 { type: "DIGITS", match: "123", value: 123, groups: [], position: { start: 4, end: 7 } },
 { type: "HELLO", match: "HELLO", value: "HELLO", groups: [], position: { start: 8, end: 13 } },
 { type: "LINK", match: "[a cool](link)", value: "[a cool](link)", groups: [ "a cool", "link" ], position: { start: 14, end: 28 } } ]

TODO

  • Custom patterns using functions
  • Add position information to Token
  • Array patterns (Multiple patterns for the same rule)
  • Documentation
  • Better error handling
  • Group matching
  • Value transform
  • More and better tests for everything
  • Examples
  • Line and column information? Or just a helper function to get line and column from index
  • BNF / EBNF ?
  • Generate a tokenizer