ggerganov/whisper.cpp

Adding `--dtw` preset for `large-v3-turbo`

rotemdan opened this issue · 2 comments

Using the large.v3 preset for large-v3-turbo doesn't seem to work:

main test2.wav --model models/ggml-large-v3-turbo.bin --dtw large.v3
aheads_masks_init: tried to set alignment head on text layer 8, but model only has 4 text layers
whisper_init_state: aheads_masks_init() failed for alignment heads masks

I believe the alignment heads are defined here (in whisper.cpp):

// [EXPERIMENTAL] Token-level timestamps with DTW
static const whisper_ahead g_aheads_tiny_en[]   = { {1, 0}, {2, 0}, {2, 5}, {3, 0}, {3, 1}, {3, 2}, {3, 3}, {3, 4} };
static const whisper_ahead g_aheads_tiny[]      = { {2, 2}, {3, 0}, {3, 2}, {3, 3}, {3, 4}, {3, 5} };
static const whisper_ahead g_aheads_base_en[]   = { {3, 3}, {4, 7}, {5, 1}, {5, 5}, {5, 7} };
static const whisper_ahead g_aheads_base[]      = { {3, 1}, {4, 2}, {4, 3}, {4, 7}, {5, 1}, {5, 2}, {5, 4}, {5, 6} };
static const whisper_ahead g_aheads_small_en[]  = { {6, 6}, {7, 0}, {7, 3}, {7, 8}, {8, 2}, {8, 5}, {8, 7}, {9, 0}, {9, 4}, {9, 8}, {9, 10}, {10, 0}, {10, 1}, {10, 2}, {10, 3}, {10, 6}, {10, 11}, {11, 2}, {11, 4} };
static const whisper_ahead g_aheads_small[]     = { {5, 3}, {5, 9}, {8, 0}, {8, 4}, {8, 7}, {8, 8}, {9, 0}, {9, 7}, {9, 9}, {10, 5} };
static const whisper_ahead g_aheads_medium_en[] = { {11, 4}, {14, 1}, {14, 12}, {14, 14}, {15, 4}, {16, 0}, {16, 4}, {16, 9}, {17, 12}, {17, 14}, {18, 7}, {18, 10}, {18, 15}, {20, 0}, {20, 3}, {20, 9}, {20, 14}, {21, 12} };
static const whisper_ahead g_aheads_medium[]    = { {13, 15}, {15, 4}, {15, 15}, {16, 1}, {20, 0}, {23, 4} };
static const whisper_ahead g_aheads_large_v1[]  = { {9, 19}, {11, 2}, {11, 4}, {11, 17}, {22, 7}, {22, 11}, {22, 17}, {23, 2}, {23, 15} };
static const whisper_ahead g_aheads_large_v2[]  = { {10, 12}, {13, 17}, {16, 11}, {16, 12}, {16, 13}, {17, 15}, {17, 16}, {18, 4}, {18, 11}, {18, 19}, {19, 11}, {21, 2}, {21, 3}, {22, 3}, {22, 9}, {22, 12}, {23, 5}, {23, 7}, {23, 13}, {25, 5}, {26, 1}, {26, 12}, {27, 15} };
static const whisper_ahead g_aheads_large_v3[]  = { {7, 0}, {10, 17}, {12, 18}, {13, 12}, {16, 1}, {17, 14}, {19, 11}, {21, 4}, {24, 1}, {25, 6} };

static const std::map<whisper_alignment_heads_preset, whisper_aheads> g_aheads {
    { WHISPER_AHEADS_TINY_EN,   {  8, g_aheads_tiny_en   } },
    { WHISPER_AHEADS_TINY,      {  6, g_aheads_tiny      } },
    { WHISPER_AHEADS_BASE_EN,   {  5, g_aheads_base_en   } },
    { WHISPER_AHEADS_BASE,      {  8, g_aheads_base      } },
    { WHISPER_AHEADS_SMALL_EN,  { 19, g_aheads_small_en  } },
    { WHISPER_AHEADS_SMALL,     { 10, g_aheads_small     } },
    { WHISPER_AHEADS_MEDIUM_EN, { 18, g_aheads_medium_en } },
    { WHISPER_AHEADS_MEDIUM,    {  6, g_aheads_medium    } },
    { WHISPER_AHEADS_LARGE_V1,  {  9, g_aheads_large_v1  } },
    { WHISPER_AHEADS_LARGE_V2,  { 23, g_aheads_large_v2  } },
    { WHISPER_AHEADS_LARGE_V3,  { 10, g_aheads_large_v3  } },
};

The alignment head indices I extracted from the official Python implementation are:

const alignmentHeadsIndexes: { [name in WhisperModelName]: number[] } = {
	'tiny.en': [6, 12, 17, 18, 19, 20, 21, 22,],
	'tiny': [14, 18, 20, 21, 22, 23,],
	'base.en': [27, 39, 41, 45, 47,],
	'base': [25, 34, 35, 39, 41, 42, 44, 46,],
	'small.en': [78, 84, 87, 92, 98, 101, 103, 108, 112, 116, 118, 120, 121, 122, 123, 126, 131, 134, 136,],
	'small': [63, 69, 96, 100, 103, 104, 108, 115, 117, 125,],
	'medium.en': [180, 225, 236, 238, 244, 256, 260, 265, 284, 286, 295, 298, 303, 320, 323, 329, 334, 348,],
	'medium': [223, 244, 255, 257, 320, 372,],
	'large-v1': [199, 222, 224, 237, 447, 451, 457, 462, 475,],
	'large-v2': [212, 277, 331, 332, 333, 355, 356, 364, 371, 379, 391, 422, 423, 443, 449, 452, 465, 467, 473, 505, 521, 532, 555,],
	'large-v3': [140, 217, 258, 272, 321, 354, 391, 424, 481, 506,],
	'large-v3-turbo': [44, 51, 63, 66, 71, 74,],
}

(The reference Python code stores them as base85-encoded, gzipped binary blobs, which provides no real benefit other than obfuscation. I had to write a small script to extract them; it's included at the end of this post.)

So for 'large-v3-turbo' the reference implementation uses [44, 51, 63, 66, 71, 74,].

I don't know what kind of indexing system whisper.cpp uses, so I can't really try to add this myself.

Based on some guesswork, it seems like the relationship between a pair {x, y} (text-decoder layer x, head y within that layer) and the single flattened index i could be something like:

i = (n_text_head * x) + y

So for base.en, which has n_text_head = 8:

{3, 3}, {4, 7}, {5, 1}, {5, 5}, {5, 7}

Is mapped to:

3 * 8 + 3 = 27
4 * 8 + 7 = 39
5 * 8 + 1 = 41
5 * 8 + 5 = 45
5 * 8 + 7 = 47

Which matches the reference.

So with this logic, for large-v3-turbo, which has n_text_head = 20:

44, 51, 63, 66, 71, 74

Should probably correspond to:

{2, 4}, {2, 11}, {3, 3}, {3, 6}, {3, 11}, {3, 14}
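
As a quick sanity check of the guessed mapping, here's a small Python sketch that converts the flat indices back to {layer, head} pairs. The n_text_head values (8 for base.en, 20 for large-v3-turbo) are assumptions taken from the model configurations; if I'm reading the reference implementation right, it reshapes the flat boolean mask to (n_text_layer, n_text_head), which is exactly this layout.

def to_pairs(flat_indices, n_text_head):
    # guessed mapping: i = layer * n_text_head + head, so divmod() recovers the pair
    return [divmod(i, n_text_head) for i in flat_indices]

# base.en, n_text_head = 8: should reproduce whisper.cpp's g_aheads_base_en
print(to_pairs([27, 39, 41, 45, 47], 8))
# [(3, 3), (4, 7), (5, 1), (5, 5), (5, 7)]

# large-v3-turbo, n_text_head = 20 (assumed): the extrapolated pairs above
print(to_pairs([44, 51, 63, 66, 71, 74], 20))
# [(2, 4), (2, 11), (3, 3), (3, 6), (3, 11), (3, 14)]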

So based on this extrapolation I could try to add:

static const whisper_ahead g_aheads_large_v3_turbo[]  = { {2, 4}, {2, 11}, {3, 3}, {3, 6}, {3, 11}, {3, 14} };

and:

{ WHISPER_AHEADS_LARGE_V3_TURBO,  { 6, g_aheads_large_v3_turbo  } },

Presumably a matching WHISPER_AHEADS_LARGE_V3_TURBO value would also need to be added to the whisper_alignment_heads_preset enum, plus whatever mapping turns the --dtw argument into a preset, but I'm not familiar enough with the codebase to be sure. I'll try adding this to the code and see what happens.

If you see any error, please let me know.

In case someone finds it useful in the future, here's the script I used to extract the arrays from the encoded data in the reference Python code:

import base64
import gzip
import numpy as np

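# base85-encoded, gzipped boolean masks, copied from the alignment heads table in the reference implementation (openai/whisper)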
heads = {
    "tiny.en": b"ABzY8J1N>@0{>%R00Bk>$p{7v037`oCl~+#00",
    "tiny": b"ABzY8bu8Lr0{>%RKn9Fp%m@SkK7Kt=7ytkO",
    "base.en": b"ABzY8;40c<0{>%RzzG;p*o+Vo09|#PsxSZm00",
    "base": b"ABzY8KQ!870{>%RzyTQH3`Q^yNP!>##QT-<FaQ7m",
    "small.en": b"ABzY8>?_)10{>%RpeA61k&I|OI3I$65C{;;pbCHh0B{qLQ;+}v00",
    "small": b"ABzY8DmU6=0{>%Rpa?J`kvJ6qF(V^F86#Xh7JUGMK}P<N0000",
    "medium.en": b"ABzY8usPae0{>%R7<zz_OvQ{)4kMa0BMw6u5rT}kRKX;$NfYBv00*Hl@qhsU00",
    "medium": b"ABzY8B0Jh+0{>%R7}kK1fFL7w6%<-Pf*t^=N)Qr&0RR9",
    "large-v1": b"ABzY8r9j$a0{>%R7#4sLmoOs{s)o3~84-RPdcFk!JR<kSfC2yj",
    "large-v2": b"ABzY8zd+h!0{>%R7=D0pU<_bnWW*tkYAhobTNnu$jnkEkXqp)j;w1Tzk)UH3X%SZd&fFZ2fC2yj",
    "large-v3": b"ABzY8gWO1E0{>%R7(9S+Kn!D~%ngiGaR?*L!iJG9p-nab0JQ=-{D1-g00",
    "large": b"ABzY8gWO1E0{>%R7(9S+Kn!D~%ngiGaR?*L!iJG9p-nab0JQ=-{D1-g00",
    "large-v3-turbo": b"ABzY8j^C+e0{>%RARaKHP%t(lGR*)0g!tONPyhe`",
    "turbo": b"ABzY8j^C+e0{>%RARaKHP%t(lGR*)0g!tONPyhe`",
}

text = ""

for key in heads:
	text += "'{}': [".format(key)
	
	array = np.frombuffer(
		gzip.decompress(base64.b85decode(heads[key])), dtype=bool
	)
	
	for i in range(0, len(array)):
		if array[i] != False:
			text += "{}, ".format(i)
			
	text += "],\n"

print(text)
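
Running it prints each model's indices in the same format as the TypeScript object above; for example, the first line of output is

'tiny.en': [6, 12, 17, 18, 19, 20, 21, 22, ],

(the trailing commas are just an artifact of how the string is built).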