ggerganov/whisper.cpp

Adding `--dtw` preset for `large-v3-turbo`

rotemdan opened this issue · 2 comments

Using the large.v3 preset for large-v3-turbo doesn't seem to work:

main test2.wav --model models/ggml-large-v3-turbo.bin --dtw large.v3
aheads_masks_init: tried to set alignment head on text layer 8, but model only has 4 text layers
whisper_init_state: aheads_masks_init() failed for alignment heads masks

I believe the alignment heads are defined here (in whisper.cpp):

// [EXPERIMENTAL] Token-level timestamps with DTW
static const whisper_ahead g_aheads_tiny_en[]   = { {1, 0}, {2, 0}, {2, 5}, {3, 0}, {3, 1}, {3, 2}, {3, 3}, {3, 4} };
static const whisper_ahead g_aheads_tiny[]      = { {2, 2}, {3, 0}, {3, 2}, {3, 3}, {3, 4}, {3, 5} };
static const whisper_ahead g_aheads_base_en[]   = { {3, 3}, {4, 7}, {5, 1}, {5, 5}, {5, 7} };
static const whisper_ahead g_aheads_base[]      = { {3, 1}, {4, 2}, {4, 3}, {4, 7}, {5, 1}, {5, 2}, {5, 4}, {5, 6} };
static const whisper_ahead g_aheads_small_en[]  = { {6, 6}, {7, 0}, {7, 3}, {7, 8}, {8, 2}, {8, 5}, {8, 7}, {9, 0}, {9, 4}, {9, 8}, {9, 10}, {10, 0}, {10, 1}, {10, 2}, {10, 3}, {10, 6}, {10, 11}, {11, 2}, {11, 4} };
static const whisper_ahead g_aheads_small[]     = { {5, 3}, {5, 9}, {8, 0}, {8, 4}, {8, 7}, {8, 8}, {9, 0}, {9, 7}, {9, 9}, {10, 5} };
static const whisper_ahead g_aheads_medium_en[] = { {11, 4}, {14, 1}, {14, 12}, {14, 14}, {15, 4}, {16, 0}, {16, 4}, {16, 9}, {17, 12}, {17, 14}, {18, 7}, {18, 10}, {18, 15}, {20, 0}, {20, 3}, {20, 9}, {20, 14}, {21, 12} };
static const whisper_ahead g_aheads_medium[]    = { {13, 15}, {15, 4}, {15, 15}, {16, 1}, {20, 0}, {23, 4} };
static const whisper_ahead g_aheads_large_v1[]  = { {9, 19}, {11, 2}, {11, 4}, {11, 17}, {22, 7}, {22, 11}, {22, 17}, {23, 2}, {23, 15} };
static const whisper_ahead g_aheads_large_v2[]  = { {10, 12}, {13, 17}, {16, 11}, {16, 12}, {16, 13}, {17, 15}, {17, 16}, {18, 4}, {18, 11}, {18, 19}, {19, 11}, {21, 2}, {21, 3}, {22, 3}, {22, 9}, {22, 12}, {23, 5}, {23, 7}, {23, 13}, {25, 5}, {26, 1}, {26, 12}, {27, 15} };
static const whisper_ahead g_aheads_large_v3[]  = { {7, 0}, {10, 17}, {12, 18}, {13, 12}, {16, 1}, {17, 14}, {19, 11}, {21, 4}, {24, 1}, {25, 6} };

static const std::map<whisper_alignment_heads_preset, whisper_aheads> g_aheads {
    { WHISPER_AHEADS_TINY_EN,   {  8, g_aheads_tiny_en   } },
    { WHISPER_AHEADS_TINY,      {  6, g_aheads_tiny      } },
    { WHISPER_AHEADS_BASE_EN,   {  5, g_aheads_base_en   } },
    { WHISPER_AHEADS_BASE,      {  8, g_aheads_base      } },
    { WHISPER_AHEADS_SMALL_EN,  { 19, g_aheads_small_en  } },
    { WHISPER_AHEADS_SMALL,     { 10, g_aheads_small     } },
    { WHISPER_AHEADS_MEDIUM_EN, { 18, g_aheads_medium_en } },
    { WHISPER_AHEADS_MEDIUM,    {  6, g_aheads_medium    } },
    { WHISPER_AHEADS_LARGE_V1,  {  9, g_aheads_large_v1  } },
    { WHISPER_AHEADS_LARGE_V2,  { 23, g_aheads_large_v2  } },
    { WHISPER_AHEADS_LARGE_V3,  { 10, g_aheads_large_v3  } },
};

The alignment head indices I extracted from the official Python implementation are:

const alignmentHeadsIndexes: { [name in WhisperModelName]: number[] } = {
	'tiny.en': [6, 12, 17, 18, 19, 20, 21, 22,],
	'tiny': [14, 18, 20, 21, 22, 23,],
	'base.en': [27, 39, 41, 45, 47,],
	'base': [25, 34, 35, 39, 41, 42, 44, 46,],
	'small.en': [78, 84, 87, 92, 98, 101, 103, 108, 112, 116, 118, 120, 121, 122, 123, 126, 131, 134, 136,],
	'small': [63, 69, 96, 100, 103, 104, 108, 115, 117, 125,],
	'medium.en': [180, 225, 236, 238, 244, 256, 260, 265, 284, 286, 295, 298, 303, 320, 323, 329, 334, 348,],
	'medium': [223, 244, 255, 257, 320, 372,],
	'large-v1': [199, 222, 224, 237, 447, 451, 457, 462, 475,],
	'large-v2': [212, 277, 331, 332, 333, 355, 356, 364, 371, 379, 391, 422, 423, 443, 449, 452, 465, 467, 473, 505, 521, 532, 555,],
	'large-v3': [140, 217, 258, 272, 321, 354, 391, 424, 481, 506,],
	'large-v3-turbo': [44, 51, 63, 66, 71, 74,],
}

(The reference Python code stores them as base85-encoded, gzipped binary blobs, which provides no real benefit other than obfuscation. I had to write a small script to extract them; it's included at the end of this post.)

So for 'large-v3-turbo' the reference implementation uses [44, 51, 63, 66, 71, 74,].

I don't know what kind of indexing system whisper.cpp uses, so I can't really try to add this myself.

Based on some guesswork, it seems like the relationship between a pair {x, y} (text-decoder layer x, head y within that layer) and the single flattened index i could be something like:

i = (n_text_head * x) + y

So for base.en, which has n_text_head = 8:

{3, 3}, {4, 7}, {5, 1}, {5, 5}, {5, 7}

Is mapped to:

3 * 8 + 3 = 27
4 * 8 + 7 = 39
5 * 8 + 1 = 41
5 * 8 + 5 = 45
5 * 8 + 7 = 47

Which matches the reference.

So with this logic, for large-v3-turbo, which has n_text_head = 20:

44, 51, 63, 66, 71, 74

Should probably correspond to:

{2, 4}, {2, 11}, {3, 3}, {3, 6}, {3, 11}, {3, 14}
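
As a quick sanity check of the guessed mapping, here's a small Python sketch that converts the flat indices back to {layer, head} pairs. The n_text_head values (8 for base.en, 20 for large-v3-turbo) are assumptions taken from the model configurations; if I'm reading the reference implementation right, it reshapes the flat boolean mask to (n_text_layer, n_text_head), which is exactly this layout.

def to_pairs(flat_indices, n_text_head):
    # guessed mapping: i = layer * n_text_head + head, so divmod() recovers the pair
    return [divmod(i, n_text_head) for i in flat_indices]

# base.en, n_text_head = 8: should reproduce whisper.cpp's g_aheads_base_en
print(to_pairs([27, 39, 41, 45, 47], 8))
# [(3, 3), (4, 7), (5, 1), (5, 5), (5, 7)]

# large-v3-turbo, n_text_head = 20 (assumed): the extrapolated pairs above
print(to_pairs([44, 51, 63, 66, 71, 74], 20))
# [(2, 4), (2, 11), (3, 3), (3, 6), (3, 11), (3, 14)]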

So based on this extrapolation I could try to add:

static const whisper_ahead g_aheads_large_v3_turbo[]  = { {2, 4}, {2, 11}, {3, 3}, {3, 6}, {3, 11}, {3, 14} };

and:

{ WHISPER_AHEADS_LARGE_V3_TURBO,  { 6, g_aheads_large_v3_turbo  } },

Presumably a matching WHISPER_AHEADS_LARGE_V3_TURBO value would also need to be added to the whisper_alignment_heads_preset enum, plus whatever mapping turns the --dtw argument into a preset, but I'm not familiar enough with the codebase to be sure. I'll try adding this to the code and see what happens.

If you see any error, please let me know.

In case someone finds it useful in the future, here's the script I used to extract the arrays from the encoded data in the reference Python code:

import base64
import gzip
import numpy as np

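# base85-encoded, gzipped boolean masks, copied from the alignment heads table in the reference implementation (openai/whisper)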
heads = {
    "tiny.en": b"ABzY8J1N>@0{>%R00Bk>$p{7v037`oCl~+#00",
    "tiny": b"ABzY8bu8Lr0{>%RKn9Fp%m@SkK7Kt=7ytkO",
    "base.en": b"ABzY8;40c<0{>%RzzG;p*o+Vo09|#PsxSZm00",
    "base": b"ABzY8KQ!870{>%RzyTQH3`Q^yNP!>##QT-<FaQ7m",
    "small.en": b"ABzY8>?_)10{>%RpeA61k&I|OI3I$65C{;;pbCHh0B{qLQ;+}v00",
    "small": b"ABzY8DmU6=0{>%Rpa?J`kvJ6qF(V^F86#Xh7JUGMK}P<N0000",
    "medium.en": b"ABzY8usPae0{>%R7<zz_OvQ{)4kMa0BMw6u5rT}kRKX;$NfYBv00*Hl@qhsU00",
    "medium": b"ABzY8B0Jh+0{>%R7}kK1fFL7w6%<-Pf*t^=N)Qr&0RR9",
    "large-v1": b"ABzY8r9j$a0{>%R7#4sLmoOs{s)o3~84-RPdcFk!JR<kSfC2yj",
    "large-v2": b"ABzY8zd+h!0{>%R7=D0pU<_bnWW*tkYAhobTNnu$jnkEkXqp)j;w1Tzk)UH3X%SZd&fFZ2fC2yj",
    "large-v3": b"ABzY8gWO1E0{>%R7(9S+Kn!D~%ngiGaR?*L!iJG9p-nab0JQ=-{D1-g00",
    "large": b"ABzY8gWO1E0{>%R7(9S+Kn!D~%ngiGaR?*L!iJG9p-nab0JQ=-{D1-g00",
    "large-v3-turbo": b"ABzY8j^C+e0{>%RARaKHP%t(lGR*)0g!tONPyhe`",
    "turbo": b"ABzY8j^C+e0{>%RARaKHP%t(lGR*)0g!tONPyhe`",
}

text = ""

for key in heads:
	text += "'{}': [".format(key)
	
	array = np.frombuffer(
		gzip.decompress(base64.b85decode(heads[key])), dtype=bool
	)
	
	for i in range(0, len(array)):
		if array[i] != False:
			text += "{}, ".format(i)
			
	text += "],\n"

print(text)
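
Running it prints each model's indices in the same format as the TypeScript object above; for example, the first line of output is

'tiny.en': [6, 12, 17, 18, 19, 20, 21, 22, ],

(the trailing commas are just an artifact of how the string is built).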