![Codacy Badge](https://app.codacy.com/project/badge/Grade/112e50abd97444a4aca06f94fb7e8873)
![Codacy Badge](https://app.codacy.com/project/badge/Coverage/112e50abd97444a4aca06f94fb7e8873)
```bash
pip install text-embeddings --upgrade
```
For example, the visual tokenizer `VTRTokenizer` renders each text into a sequence of small image patches, one patch of size `font_size` × `window_size` per token:

```python
from text_embeddings.visual import VTRTokenizer

data = [
    "Hello world!",
    "¡Hola Mundo!",
    "你好,世界!",
]

tokenizer = VTRTokenizer(
    font_size=14,
    window_size=10,
    font="resources/NotoSans-Regular.ttf",
    max_length=36,
)
results = tokenizer(
    text=data,
    text_pair=data,
    add_special_tokens=True,
    padding="longest",
    return_tensors="pt",
    truncation="longest_first",
    return_attention_mask=True,
    return_special_tokens_mask=True,
    return_length=True,
    prepend_batch_axis=True,
    return_overflowing_tokens=False,
)

# each "token" is a rendered image patch of shape (font_size, window_size)
assert results["input_ids"].shape == (3, results["input_ids"].shape[1], 14, 10)
assert results["attention_mask"].shape == (3, results["input_ids"].shape[1])
assert results["token_type_ids"].shape == (3, results["input_ids"].shape[1])
assert results["length"].shape == (3,)
```
## Write Your Own Embedding Tokenizer
Subclass `EmbeddingTokenizer` and implement two methods: `text2embeddings`, which maps a text to an array of per-token embeddings, and `create_padding_token_embedding`, which returns the embedding used for padding:

```python
import numpy as np
from typing import Optional, List, Dict

from text_embeddings.base import EmbeddingTokenizer


class MyOwnTokenizer(EmbeddingTokenizer):

    def __init__(
        self,
        model_input_names: Optional[List[str]] = None,
        special_tokens: Optional[Dict[str, np.ndarray]] = None,
        max_length: Optional[int] = 2048,
    ):
        super().__init__(model_input_names, special_tokens, max_length)

    def text2embeddings(self, text: str) -> np.ndarray:
        # map a text to a (sequence_length, *dimensions) array of token embeddings
        sequence_length = 10
        dimensions = (10, 10, 10)  # each token is mapped to a 3-d array
        return np.zeros((sequence_length, *dimensions))

    def create_padding_token_embedding(self, input_embeddings=None) -> np.ndarray:
        # the padding token must have the same shape as a regular token,
        # so let's create a consistent 3-d array
        return np.zeros((10, 10, 10))
```
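Assuming the base class provides the same Hugging Face-style `__call__` shown in the examples above (a sketch, not a verified part of the library's API), the custom tokenizer can then be used directly:

```python
tokenizer = MyOwnTokenizer()
results = tokenizer(
    ["Hello world!", "¡Hola Mundo!"],
    add_special_tokens=True,
    padding="longest",
    truncation="longest_first",
)

# roughly (2, sequence_length, 10, 10, 10), depending on special tokens and padding
print(np.asarray(results["input_ids"]).shape)
```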
Models built on these tokenizers, such as the byte-level `GBST` module from Charformer, can be exported to ONNX:

```python
import torch
import torch.onnx  # nightly torch only

from text_embeddings.byte.charformer import GBST, ByteTokenizer

model = GBST(
    embed_size=128,
    max_block_size=4,
    downsampling_factor=2,
    score_calibration=True,
    vocab_size=259,
)

tokenizer = ByteTokenizer()
results = tokenizer(
    ["Life is like a box of chocolates.", "Coding is fun."],
    add_special_tokens=True,
    padding="longest",
    truncation="longest_first",
)

# Export the model; integer inputs cannot require grad, so pass a plain LongTensor
torch.onnx.export(
    model,
    torch.tensor(results["input_ids"]).long(),
    "gbst.onnx",
    export_params=True,
    opset_version=11,
    do_constant_folding=True,
    input_names=["input"],
    output_names=["output"],
    dynamic_axes={
        "input": {0: "batch_size", 1: "sequence_length"},
        "output": {0: "batch_size"},
    },
)
```
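Once exported, the model can be loaded and run outside PyTorch, for example with `onnxruntime`. This is an illustrative sketch rather than part of the library; the input/output names and the dummy byte ids simply mirror what was passed to `torch.onnx.export` above.

```python
import numpy as np
import onnxruntime as ort

session = ort.InferenceSession("gbst.onnx")

# dummy byte ids with the same layout as results["input_ids"]: (batch_size, sequence_length)
dummy_input = np.random.randint(0, 259, size=(2, 34), dtype=np.int64)

(output,) = session.run(["output"], {"input": dummy_input})
print(output.shape)
```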