Speech recognition from labelled waveforms, implemented in PyTorch, TensorFlow, and Keras for comparison.
Model 1:
Based on the PyTorch model outlined in this tutorial:
SpeechRecognitionModel(
(cnn): Conv2d(1, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
(rescnn_layers): Sequential(
(0): ResidualCNN(
(cnn1): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(cnn2): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(dropout1): Dropout(p=0.1, inplace=False)
(dropout2): Dropout(p=0.1, inplace=False)
(layer_norm1): CNNLayerNorm(
(layer_norm): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
)
(layer_norm2): CNNLayerNorm(
(layer_norm): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
)
)
(1): ResidualCNN(
(cnn1): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(cnn2): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(dropout1): Dropout(p=0.1, inplace=False)
(dropout2): Dropout(p=0.1, inplace=False)
(layer_norm1): CNNLayerNorm(
(layer_norm): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
)
(layer_norm2): CNNLayerNorm(
(layer_norm): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
)
)
(2): ResidualCNN(
(cnn1): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(cnn2): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(dropout1): Dropout(p=0.1, inplace=False)
(dropout2): Dropout(p=0.1, inplace=False)
(layer_norm1): CNNLayerNorm(
(layer_norm): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
)
(layer_norm2): CNNLayerNorm(
(layer_norm): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
)
)
)
(fully_connected): Linear(in_features=2048, out_features=512, bias=True)
(birnn_layers): Sequential(
(0): BidirectionalGRU(
(BiGRU): GRU(512, 512, batch_first=True, bidirectional=True)
(layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
(1): BidirectionalGRU(
(BiGRU): GRU(1024, 512, bidirectional=True)
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
(2): BidirectionalGRU(
(BiGRU): GRU(1024, 512, bidirectional=True)
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
(3): BidirectionalGRU(
(BiGRU): GRU(1024, 512, bidirectional=True)
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
(4): BidirectionalGRU(
(BiGRU): GRU(1024, 512, bidirectional=True)
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
)
(classifier): Sequential(
(0): Linear(in_features=1024, out_features=512, bias=True)
(1): GELU()
(2): Dropout(p=0.1, inplace=False)
(3): Linear(in_features=512, out_features=29, bias=True)
)
)