Neurocache: a library for augmenting language models with external caching mechanisms.

Requirements:
- Python 3.6+
- PyTorch 1.13.0+
- Transformers 4.25.0+
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

from neurocache import (
    NeurocacheModelForCausalLM,
    OnDeviceCacheConfig,
)

# Load the base causal LM and its tokenizer.
model_name = "facebook/opt-350m"
model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Cache is attached 5 layers from the top (and at one layer 3 above that);
# cache-augmented attention runs from that layer up to the top of the stack.
cache_layer_idx = model.config.num_hidden_layers - 5
config = OnDeviceCacheConfig(
    cache_layers=[cache_layer_idx, cache_layer_idx + 3],
    attention_layers=list(range(cache_layer_idx, model.config.num_hidden_layers)),
    compression_factor=8,
    topk=8,
)

# Wrap the base model so forward passes read/write the external cache.
model = NeurocacheModelForCausalLM(model, config)

# Tokenize a batch of two sequences. padding=True is required: the two
# sequences tokenize to different lengths, and return_tensors="pt" cannot
# stack ragged sequences into a single tensor without padding.
input_text = ["Hello, my dog is cute", " is cute"]
tokenized_input = tokenizer(input_text, padding=True, return_tensors="pt")

# Per-batch-element flag; presumably 1 marks the start of a new document
# (cache reset) and 0 a continuation — TODO confirm against neurocache docs.
tokenized_input["start_of_sequence"] = torch.tensor([1, 0]).bool()

outputs = model(**tokenized_input)
# List the model families neurocache currently supports.
from neurocache.utils import NEUROCACHE_SUPPORTED_MODELS
print(NEUROCACHE_SUPPORTED_MODELS)
# Expected output of the print above (as a bare list literal, the lines
# below are a no-op expression statement if executed):
[
"opt",
"llama",
"mistral",
"gptj",
]