Aw, Snap! Crash in Chrome Using Whisper
System Info
Transformers.js version: "@huggingface/transformers": "3.0.0-alpha.9"
Browser (if applicable): Chrome
Operating system: macOS
Environment/Platform
- Website/web-app
- Browser extension
- Server-side (e.g., Node.js, Deno, Bun)
- Desktop app (e.g., Electron)
- Other (e.g., VSCode extension)
Description
I'm building a dictation tool in React based on the WebGPU-whisper demo. Once the models are loaded, the page periodically crashes Chrome with the "Aw, Snap!" error. I've tried models from both my own CDN and the Hugging Face CDN, and both have the issue. I've also tried smaller models (currently whisper-tiny.en, but I saw the issue with the base model as well).
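(For context, switching between the two sources only changes the env overrides at the top of the worker shown below; roughly, with a placeholder CDN URL:)

import { env } from "@huggingface/transformers";

// Option 1: self-hosted CDN (placeholder URL), as in worker.ts below.
env.allowLocalModels = false;
env.remoteHost = "https://MYSUPERCOOLCDN.com/shared/dictation/models/";
env.remotePathTemplate = "{model}/";

// Option 2: drop the remoteHost/remotePathTemplate overrides to load the
// model files from the Hugging Face CDN instead. The crash happens either way.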
My Code:
worker.ts
/* eslint no-restricted-globals: 0 */
import {
  AutoTokenizer,
  AutoProcessor,
  env,
  full,
  PreTrainedTokenizer,
  Processor,
  TextStreamer,
  WhisperForConditionalGeneration,
} from "@huggingface/transformers";

env.allowLocalModels = false;
env.remoteHost = "https://MYSUPERCOOLCDN.com/shared/dictation/models/";
env.remotePathTemplate = "{model}/";
env.backends.onnx.wasm.wasmPaths = "https://MYSUPERCOOLCDN.com/shared/dictation/";

const MAX_NEW_TOKENS = 64;

class AutomaticSpeechRecognitionPipeline {
  static model_id: string | null = null;
  static tokenizer: Promise<PreTrainedTokenizer> | null = null;
  static processor: Promise<Processor> | null = null;
  static model: Promise<any> | null = null;

  static async getInstance(progress_callback: Function | undefined = undefined) {
    this.model_id = "onnx-community/whisper-tiny.en";

    this.tokenizer ??= AutoTokenizer.from_pretrained(this.model_id, {
      progress_callback: progress_callback || (() => {}),
    });

    this.processor ??= AutoProcessor.from_pretrained(this.model_id, {
      progress_callback: progress_callback || (() => {}),
    });

    this.model ??= WhisperForConditionalGeneration.from_pretrained(this.model_id, {
      dtype: {
        encoder_model: "fp32", // 'fp16' works too
        decoder_model_merged: "q4", // or 'fp32' ('fp16' is broken)
      },
      device: "webgpu",
      progress_callback: progress_callback || (() => {}),
    });

    return Promise.all([this.tokenizer, this.processor, this.model]);
  }
}

let processing = false;
async function generate({ audio }: { audio: Float32Array }) {
  if (processing) return;
  processing = true;

  // Tell the main thread we are starting
  self.postMessage({ status: "start" });

  // Retrieve the text-generation pipeline.
  const [tokenizer, processor, model] =
    await AutomaticSpeechRecognitionPipeline.getInstance();

  let numTokens = 0;
  const callback_function = (output: string[]) => {
    self.postMessage({
      status: "update",
      output,
      numTokens,
    });
  };

  const streamer = new TextStreamer(tokenizer, {
    skip_prompt: true,
    callback_function,
  });

  const inputs = await processor(audio);

  const outputs = await model.generate({
    ...inputs,
    max_new_tokens: MAX_NEW_TOKENS,
    streamer,
  });

  const outputText = tokenizer.batch_decode(outputs, {
    skip_special_tokens: true,
  });

  // Send the output back to the main thread
  self.postMessage({
    status: "complete",
    output: outputText,
  });
  processing = false;
}

async function load() {
  self.postMessage({
    status: "loading",
    data: "Loading model...",
  });

  // Load the pipeline and save it for future use.
  const [tokenizer, processor, model] =
    await AutomaticSpeechRecognitionPipeline.getInstance((x: any) => {
      self.postMessage(x);
    });

  self.postMessage({
    status: "loading",
    data: "Compiling shaders and warming up model...",
  });

  await model.generate({
    input_features: full([1, 80, 3000], 0.0),
    max_new_tokens: 1,
  });

  self.postMessage({ status: "ready" });
}

self.addEventListener("message", async (e) => {
  const { type, data } = e.data;
  switch (type) {
    case "load":
      load();
      break;
    case "generate":
      generate(data);
      break;
  }
});
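(The worker above is created on the main thread by a small getWorker() helper that isn't shown; roughly, it is something like the following, assuming a bundler that understands new URL(...) module workers — the real helper may differ:)

// Hypothetical sketch of the getWorker() helper used in the component below.
// Exact form depends on the bundler; this assumes Vite/webpack 5 style URL workers.
export const getWorker = (): Worker =>
  new Worker(new URL("./worker.ts", import.meta.url), { type: "module" });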
dictationOverlay.ts
const DictationOverlay = () => {
  const dispatch = useDispatch() as ThunkDispatch<any, any, AnyAction>;
  const features: FeatureFlags = useSelector(
    ({ sys }: ApplicationState) => sys.features
  );
  const dictationKey = useSelector(({ sys }) => sys.dictation?.key);

  const prevDictationStatus = useRef<string>("loading");
  const prevText = useRef<string>("");
  const worker = useRef<Worker | null>(null);
  const recorderRef = useRef<MediaRecorder | null>(null);
  const audioContextRef = useRef<AudioContext | null>(null);

  const [dictationStatus, setDictationStatus] = useState<string>("loading");
  const [text, setText] = useState("");
  const [displayText, setDisplayText] = useState<string>("");
  const [isProcessing, setIsProcessing] = useState(false);
  const [chunks, setChunks] = useState<any>([]);
  const [open, setOpen] = useState(false);

  // Set up the worker as soon as the component is mounted.
  useEffect(() => {
    if (browserSupported && features?.dictation) {
      if (!worker.current) {
        // Create the worker if it does not yet exist.
        worker.current = getWorker();
      }

      // Create a callback function for messages from the worker thread.
      const onMessageReceived = (e: any) => {
        switch (e.data.status) {
          case "loading":
            // Model files start loading: update the status.
            setDictationStatus("loading");
            break;
          case "ready":
            // Pipeline ready: the worker is ready to accept messages.
            setDictationStatus("ready");
            break;
          case "start":
            // Start generation
            setIsProcessing(true);
            // Request new data from the recorder if present.
            if (recorderRef.current?.state !== "inactive") {
              recorderRef.current?.requestData();
            }
            break;
          case "complete":
            setIsProcessing(false);
            if (recorderRef.current?.state === "inactive") {
              setText("");
              setDisplayText("");
            } else {
              setText(e.data.output);
            }
            break;
        }
      };

      // Attach the callback function as an event listener.
      worker.current.addEventListener("message", onMessageReceived);

      // Load the data model when the component is mounted.
      setTimeout(() => {
        worker.current?.postMessage({ type: "load" });
      }, 1000);

      // Define a cleanup function for when the component is unmounted.
      return () => {
        worker.current?.removeEventListener("message", onMessageReceived);
        worker.current?.terminate();
        worker.current = null;
        dispatch(setDictationKey(null));
        dispatch(setDictationText(""));
      };
    }
  }, []);

  const sliceAfterString = (
    stringToSlice: string,
    searchString: string
  ): string => {
    const index = stringToSlice.toLowerCase().indexOf(searchString);
    // If we can't find the search string, return the original string.
    if (index === -1) {
      return stringToSlice;
    }
    // Slice the string after the search string.
    return stringToSlice
      .slice(index + searchString.length)
      .replace(/^[\s\p{P}]+/u, "");
  };

  // Update the display text when the text changes.
  // This will slice the display text based on the dictation key.
  useEffect(() => {
    if (text.length > 0) {
      const firstText = text[0];
      if (firstText !== prevText.current) {
        prevText.current = firstText;
        if (!isSuppressedString(firstText)) {
          const foundDictationKey = findDictationKey(firstText);
          if (foundDictationKey && !dictationKey) {
            dispatch(setDictationKey(foundDictationKey));
          } else if (dictationKey) {
            const trimmedText = sliceAfterString(firstText, dictationKey);
            dispatch(setDictationText(trimmedText));
            setDisplayText(trimmedText);
          }
        }
      }
    } else {
      setDisplayText("");
    }
  }, [text]);

  useEffect(() => {
    // If the dictation overlay is open and loading isn't complete,
    // start recording once the loading completes.
    if (
      open &&
      prevDictationStatus.current === "loading" &&
      dictationStatus === "ready"
    ) {
      record();
    }
    prevDictationStatus.current = dictationStatus;
  }, [dictationStatus]);

  const record = () => {
    setDictationStatus("recording");
    navigator.mediaDevices
      .getUserMedia({ audio: true })
      .then((stream) => {
        recorderRef.current = new MediaRecorder(stream);
        audioContextRef.current = new AudioContext({
          sampleRate: WHISPER_SAMPLING_RATE,
        });

        recorderRef.current.onstart = () => {
          setChunks([]);
        };
        recorderRef.current.ondataavailable = (e) => {
          if (e.data.size > 0) {
            setChunks((prev: any[]) => [...prev, e.data]);
          } else {
            // Empty chunk received, so we request new data after a short timeout
            setTimeout(() => {
              if (recorderRef.current?.state !== "inactive") {
                recorderRef.current?.requestData();
              }
            }, 25);
          }
        };
        recorderRef.current.onstop = () => {
          const tracks = stream.getTracks();
          // Once all tracks have been stopped, the stream is no longer
          // active and releases any permissioned input.
          tracks.forEach((track) => track.stop());
        };

        recorderRef.current.start();
      })
      .catch((err) => console.error("The following error occurred: ", err));
  };

  // Clean up the recorder on unmount.
  useEffect(() => {
    return () => {
      recorderRef.current?.stop();
      recorderRef.current = null;
    };
  }, []);

  // Generate audio and post it to the worker.
  useEffect(() => {
    if (!recorderRef.current) return;
    if (recorderRef.current?.state === "inactive") return;
    if (isProcessing) return;
    if (dictationStatus !== "recording") return;

    if (chunks.length > 0) {
      // Generate from data
      const blob = new Blob(chunks, { type: recorderRef.current.mimeType });

      const fileReader = new FileReader();
      fileReader.onloadend = async () => {
        const arrayBuffer = fileReader.result as ArrayBuffer;
        const decoded = await audioContextRef.current?.decodeAudioData(
          arrayBuffer
        );
        let audio = decoded && decoded.getChannelData(0);
        if (audio && audio.length > MAX_SAMPLES) {
          // Keep only the last MAX_SAMPLES samples.
          audio = audio.slice(-MAX_SAMPLES);
        }

        worker.current?.postMessage({
          type: "generate",
          data: { audio, language: "en" },
        });
      };
      fileReader.readAsArrayBuffer(blob);
    } else {
      recorderRef.current?.requestData();
    }
  }, [dictationStatus, isProcessing, chunks]);

  return (
    ...JSX
  );
};
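(A few identifiers used above (WHISPER_SAMPLING_RATE, MAX_SAMPLES, browserSupported) are defined elsewhere in the app and aren't shown; roughly, they look like the following, with values assumed from the upstream WebGPU-whisper demo:)

// Approximate definitions of constants/helpers referenced above (not shown
// in the snippet; values assumed from the WebGPU-whisper demo).
const WHISPER_SAMPLING_RATE = 16000; // Whisper expects 16 kHz mono audio
const MAX_AUDIO_LENGTH = 30; // seconds of audio kept per request
const MAX_SAMPLES = WHISPER_SAMPLING_RATE * MAX_AUDIO_LENGTH;

// Rough WebGPU availability check behind `browserSupported`.
const browserSupported = typeof navigator !== "undefined" && "gpu" in navigator;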
Reproduction
This happens immediately once the model files and the wasm binary have loaded; the console shows only a single warning before the crash.
Hi there! This should have been fixed in a recent update - can you upgrade to 3.0.0 and see if that fixes it?
@xenova Great to hear the new version is out! I tried it, and I'm still getting the Module not found error I called out in this issue.
BTW, the @huggingface/transformers mention pinged the entire company 🤣
Been trying to figure out ever since why I'm getting notifications ...
This is fixed in 3.0.2! Closing.