
Predict emodb emotions with a Multi-Layer Perceptron ANN

This post shows you how to classify emotions with a Multi-Layer Perceptron (MLP) artificial neural net based on the torch (PyTorch) framework; another very popular ANN framework would be Keras.

Here's a complete Jupyter notebook for your convenience.

We start with some imports; you need to install these packages, e.g. with pip, before you run this code:

import audformat
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
import os
import opensmile
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import recall_score

Then we need to download and prepare our sample dataset, the Berlin emodb:

# get and unpack the Berlin Emodb emotional database if not already there
if not os.path.isdir('./emodb/'):
    !wget -c https://tubcloud.tu-berlin.de/s/LzPWz83Fjneb6SP/download
    !mv download emodb_audformat.zip
    !unzip emodb_audformat.zip
    !rm emodb_audformat.zip
# prepare the dataframe
db = audformat.Database.load('./emodb')
root = './emodb/'
db.map_files(lambda x: os.path.join(root, x))    
df_emotion = db.tables['emotion'].df
df = db.tables['files'].df
# copy the emotion label from the emotion dataframe to the files dataframe
df['emotion'] = df_emotion['emotion']

As neural nets can only deal with numbers, we need to encode the target emotion labels as numbers:

# Encode the emotion words as numbers and use this as target 
target = 'enc_emo'
encoder = LabelEncoder()
encoder.fit(df['emotion'])
df[target] = encoder.transform(df['emotion'])
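
If you are curious which number got assigned to which emotion, you can inspect the mapping (just a sketch; the class names come from the data):

# LabelEncoder sorts the class names alphabetically
print(dict(zip(encoder.classes_, encoder.transform(encoder.classes_))))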

Now the dataframe should look like this:

df.head()

To ensure that we learn about emotions and not speaker idiosyncrasies, we need speaker-disjoint training and development sets:

# define fixed speaker-disjoint train and test sets
train_spkrs = df.speaker.unique()[5:]
test_spkrs = df.speaker.unique()[:5]
df_train = df[df.speaker.isin(train_spkrs)]
df_test = df[df.speaker.isin(test_spkrs)]

print(f'#train samples: {df_train.shape[0]}, #test samples: {df_test.shape[0]}')
#train samples: 292, #test samples: 243

Next, we need to extract some acoustic features:

# extract (or get) GeMAPS features
if os.path.isfile('feats_train.pkl'):
    feats_train = pd.read_pickle('feats_train.pkl')
    feats_test = pd.read_pickle('feats_test.pkl')
else:
    smile = opensmile.Smile(
        feature_set=opensmile.FeatureSet.GeMAPSv01b,
        feature_level=opensmile.FeatureLevel.Functionals,
    )
    feats_train = smile.process_files(df_train.index)
    feats_test = smile.process_files(df_test.index)
    feats_train.to_pickle('feats_train.pkl')
    feats_test.to_pickle('feats_test.pkl')
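
As a quick sanity check (the exact feature count depends on your openSMILE version; the GeMAPSv01b functionals should give 62 features per file):

# one row per file, one column per functional feature
print(feats_train.shape, feats_test.shape)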

Because neural nets are sensitive to large numbers, we scale all features to a mean of 0 and a standard deviation of 1:

# Perform a standard scaling / z-transformation on the features (mean=0, std=1)
scaler = StandardScaler()
scaler.fit(feats_train)
# keep the original index and column names when wrapping the scaled arrays
feats_train_norm = pd.DataFrame(scaler.transform(feats_train), index=feats_train.index, columns=feats_train.columns)
feats_test_norm = pd.DataFrame(scaler.transform(feats_test), index=feats_test.index, columns=feats_test.columns)

Next we define two torch DataLoaders, one for the training and one for the dev set:

def get_loader(df_x, df_y):
    data = []
    for i in range(len(df_x)):
        # cast the features to float32, as torch layers expect float32 by default
        data.append([df_x.values[i].astype(np.float32), df_y[target].iloc[i]])
    return torch.utils.data.DataLoader(data, shuffle=True, batch_size=8)
trainloader = get_loader(feats_train_norm, df_train)
testloader = get_loader(feats_test_norm, df_test)
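
To verify that the loaders work, you can draw a single batch and inspect its shapes (a sketch; the feature dimension depends on the feature set):

# one batch: features (batch_size, num_features), labels (batch_size,)
feats, labels = next(iter(trainloader))
print(feats.shape, labels.shape)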

We can then define the model, in this example with one hidden layer of 16 neurons:

class MLP(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.linear = torch.nn.Sequential(
            torch.nn.Linear(feats_train_norm.shape[1], 16),
            torch.nn.ReLU(),
            torch.nn.Linear(16, len(encoder.classes_))
        )
    def forward(self, x):
        # x arrives as (batch_size, num_features); squeeze removes a channel dimension of size 1 if present
        x = x.squeeze(dim=1)
        return self.linear(x)
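
As a minimal sanity check of the untrained net, you could forward a random dummy batch (this is just a sketch, not part of the training):

# the output should have shape (8, number of emotion classes)
dummy_model = MLP()
dummy_batch = torch.rand(8, feats_train_norm.shape[1])
print(dummy_model(dummy_batch).shape)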

We define two functions to train and evaluate the model:

def train_epoch(model, loader, device, optimizer, criterion):
    model.train()
    losses = []
    for features, labels in loader:
        logits = model(features.to(device))
        loss = criterion(logits, labels.to(device))
        losses.append(loss.item())
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    return (np.asarray(losses)).mean()

def evaluate_model(model, loader, device, encoder):
    logits = torch.zeros(len(loader.dataset), len(encoder.classes_))
    targets = torch.zeros(len(loader.dataset))
    model.eval()
    with torch.no_grad():
        for index, (features, labels) in enumerate(loader):
            start_index = index * loader.batch_size
            end_index = (index + 1) * loader.batch_size
            if end_index > len(loader.dataset):
                end_index = len(loader.dataset)
            logits[start_index:end_index, :] = model(features.to(device))
            targets[start_index:end_index] = labels

    predictions = logits.argmax(dim=1)
    uar = recall_score(targets.numpy(), predictions.numpy(), average='macro')
    return uar, targets, predictions

Next we initialize the model and set the loss function (criterion) and optimizer:

device = 'cpu'
model = MLP().to(device)
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)
epoch_num = 250
uars_train = []
uars_dev = []
losses = []

We can then do the training loop over the epochs:

for epoch in range(0, epoch_num):
    loss = train_epoch(model, trainloader, device, optimizer, criterion)
    losses.append(loss)
    acc_train = evaluate_model(model, trainloader, device, encoder)[0]
    uars_train.append(acc_train)
    acc_dev, truths, preds = evaluate_model(model, testloader, device, encoder)
    uars_dev.append(acc_dev)
# scale the losses so they fit on the picture
losses = np.asarray(losses)/2
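
You might also want to report the best development epoch, for example like this:

# locate the epoch with the highest dev set UAR
best_epoch = int(np.argmax(uars_dev))
print(f'best dev UAR: {uars_dev[best_epoch]:.3f} at epoch {best_epoch}')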

Next we might want to take a look at how the net performed with respect to unweighted average recall (UAR):

plt.figure(dpi=200)
plt.plot(uars_train, 'green', label='train set') 
plt.plot(uars_dev, 'red', label='dev set')
plt.plot(losses, 'grey', label='losses/2')
plt.xlabel('epochs')
plt.ylabel('UAR')
plt.legend()
plt.show()

And perhaps see the resulting confusion matrix:

from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(truths, preds, normalize='true')
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=encoder.classes_).plot(cmap='gray')
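
If you prefer emotion names over class indices, you can decode the predictions with the encoder (a sketch using the results of the last evaluation):

# map numeric class indices back to emotion labels
pred_labels = encoder.inverse_transform(preds.numpy())
print(pred_labels[:10])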

Make a t-SNE plot

This post shows you how to generate a t-distributed stochastic neighbor embedding (t-SNE) plot with the opensmile features extracted from emodb data (which is explained in more detail in a previous blog post).

A t-SNE plot is a very useful visualization, as it condenses your feature space into two dimensions (so it can be plotted) and then uses colors to represent class membership. This means that if you can identify clusters of same-colored dots in your data cloud, the features are able to separate the classes.

We need the following imports:

import audformat
from sklearn.manifold import TSNE
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import os
import opensmile

First, you download and prepare emodb:

# get and unpack the Berlin Emodb emotional database
!wget -c https://tubcloud.tu-berlin.de/s/LzPWz83Fjneb6SP/download
!mv download emodb_audformat.zip
!unzip emodb_audformat.zip
!rm emodb_audformat.zip
# prepare the dataframe
db = audformat.Database.load('./emodb')
root = './emodb/'
db.map_files(lambda x: os.path.join(root, x))
df = db.tables['emotion'].df

Then, you extract the GeMAPS features:

smile = opensmile.Smile(
    feature_set=opensmile.FeatureSet.GeMAPSv01b,
    feature_level=opensmile.FeatureLevel.Functionals,
)
feats_df = smile.process_files(df.index)

And finally, you generate the t-SNE plot with the sklearn library like this:

# Plot a t-SNE
def plotTsne(feats, labels, perplexity=30, learning_rate=200):
    model = TSNE(n_components=2, random_state=0, perplexity=perplexity, learning_rate=learning_rate)
    tsne_data = model.fit_transform(feats)
    # build the dataframe directly so the two dimensions stay numeric
    tsne_df = pd.DataFrame(tsne_data, columns=['Dim_1', 'Dim_2'])
    tsne_df['label'] = labels.values
    # newer seaborn versions use "height" instead of the deprecated "size"
    sns.FacetGrid(tsne_df, hue='label', height=6).map(plt.scatter, 'Dim_1', 'Dim_2').add_legend()
    plt.show()
plotTsne(feats_df, df['emotion'], 30, 200)

It seems that these features are useful to distinguish at least the category anger from the rest.

You might want to fiddle around with the two main parameters of the algorithm: perplexity and learning rate.
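
A minimal sweep could look like this (assuming the plotTsne function from above):

# compare a few perplexity values; small values emphasize local structure
for p in [5, 30, 50]:
    plotTsne(feats_df, df['emotion'], perplexity=p)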

A Python class to predict your emotions

This is a post to introduce you to the idea of encapsulating functionality with object-oriented programming.

We simply put the emotional classification of speech that was demonstrated in this post into a Python class like this:

import opensmile
import os
import audformat
from sklearn import svm
import sounddevice as sd
import soundfile as sf
from scipy.io.wavfile import write

class EmoRec():
    root = './emodb/'
    clf = None
    filename = 'emorec.wav'
    sr = 16000
    def __init__(self):
        self.smile = opensmile.Smile(
            feature_set=opensmile.FeatureSet.GeMAPSv01b,
            feature_level=opensmile.FeatureLevel.Functionals,
        )
        if not os.path.isdir(self.root):
            self.download_emodb()
        db = audformat.Database.load(self.root)
        db.map_files(lambda x: os.path.join(self.root, x))
        self.df_emo = db.tables['emotion'].df
        self.df_files = db.tables['files'].df
        if not self.clf:
            self.train_model()

    def download_emodb(self):
        os.system('wget -c https://tubcloud.tu-berlin.de/s/LzPWz83Fjneb6SP/download')
        os.system('mv download emodb_audformat.zip')
        os.system('unzip emodb_audformat.zip')
        os.system('rm emodb_audformat.zip')

    def train_model(self):
        print('training a model...')
        df_feats = self.smile.process_files(self.df_emo.index)
        train_labels = self.df_emo.emotion
        train_feats =  df_feats
        self.clf = svm.SVC(kernel='linear', C=.001)
        self.clf.fit(train_feats, train_labels)
        print('done')

    def classify(self, wavefile):
        test_feats = self.smile.process_file(wavefile)
        return self.clf.predict(test_feats)

    def classify_from_micro(self, seconds):
        self.record(seconds)
        return self.classify(self.filename)[0]

    def record(self, seconds):
        data = sd.rec(int(seconds * self.sr), samplerate=self.sr, channels=1)
        sd.wait()  
        write(self.filename, self.sr, data)

def main():
    test = EmoRec()
    print(test.classify_from_micro(3))

if __name__ == "__main__":
    main()

To try this, you could store the above in a file called, for example, 'emorec.py' and then, in a Jupyter notebook, call the constructor

import emorec
emoRec = emorec.EmoRec()

and use the functionality:

result = emoRec.classify_from_micro(3)
print(f'emodb thinks your emotion is {result}')
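
You can also classify an existing wav file directly; the file name here is just an example from emodb:

result = emoRec.classify('./emodb/wav/03a01Fa.wav')[0]
print(f'emodb thinks this file sounds like {result}')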

Plot two parameters for categories

This is an example of how to plot values for two parameters in one plot; it builds upon the data generated in this example.
So, from the features you extracted you would isolate two parameters from the dataframe:

x1 = df_feats.loc[:, 'F0semitoneFrom27.5Hz_sma3nz_amean']
x2 = df_feats.loc[:, 'F0semitoneFrom27.5Hz_sma3nz_stddevNorm']

You'd need matplotlib:

import matplotlib.pyplot as plt

You would color the dots according to the emotion they have been labeled with. Because the plot function does not accept string values as color designators but only numbers, you'd first have to convert them, e.g. with the LabelEncoder:

from sklearn import preprocessing
le = preprocessing.LabelEncoder()
c_vals = le.fit_transform(df_emo.emotion.values)

and then you can simply do the plot:

plt.scatter(x1, x2, c=c_vals)
plt.show()
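
If you'd rather have a legend with the emotion names instead of anonymous colors, you could plot each class separately (a sketch, assuming the rows of df_feats and df_emo are aligned):

# one scatter call per emotion so matplotlib can build a legend
for emotion in le.classes_:
    mask = df_emo.emotion.values == emotion
    plt.scatter(x1[mask], x2[mask], label=emotion)
plt.legend()
plt.show()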

Feature scaling

Usually machine learning algorithms are not trained with raw data (aka end-to-end) but with features that model the entities of interest.
With respect to speech samples these features might be for example average pitch value over the whole utterance or length of utterance.

Now if the pitch value is given in Hz and the length in seconds, the pitch value will be in the range of [80, 300] and the length, say, in the range of [1.5, 6].
Machine learning approaches would now put more weight on the average pitch, because its values are larger and differ by a larger amount; this is in most cases not a good idea, because it's a totally different feature.

A solution to this problem is to scale all values so that the features have a mean of 0 and a standard deviation of 1.
This can be easily done with the preprocessing API from sklearn:

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaled_features = scaler.fit_transform(features)

Be aware that the use of the standard scaler only makes sense if the data follows a normal distribution.
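
For intuition, here is the same transformation done by hand on toy data (z = (x - mean) / std, computed per feature column):

import numpy as np
# two features: pitch in Hz, length in seconds
features = np.array([[200.0, 2.0], [100.0, 5.0], [150.0, 3.5]])
z = (features - features.mean(axis=0)) / features.std(axis=0)
print(z)  # each column now has mean 0 and standard deviation 1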

Recording and transcribing a speech sample on Google Colab

Set up the recording method using JavaScript:

# all imports
from IPython.display import Javascript
from google.colab import output
from base64 import b64decode

RECORD = """
const sleep  = time => new Promise(resolve => setTimeout(resolve, time))
const b2text = blob => new Promise(resolve => {
  const reader = new FileReader()
  reader.onloadend = e => resolve(e.srcElement.result)
  reader.readAsDataURL(blob)
})
var record = time => new Promise(async resolve => {
  stream = await navigator.mediaDevices.getUserMedia({ audio: true })
  recorder = new MediaRecorder(stream)
  chunks = []
  recorder.ondataavailable = e => chunks.push(e.data)
  recorder.start()
  await sleep(time)
  recorder.onstop = async ()=>{
    blob = new Blob(chunks)
    text = await b2text(blob)
    resolve(text)
  }
  recorder.stop()
})
"""

def record(fn, sec):
  display(Javascript(RECORD))
  s = output.eval_js('record(%d)' % (sec*1000))
  b = b64decode(s.split(',')[1])
  with open(fn,'wb') as f:
    f.write(b)
  return fn

Record something:

filename = 'felixtest.wav'
record(filename, 5)

Play it back:

import IPython
IPython.display.Audio(filename)

Install SpeechBrain:

%%capture
!pip install speechbrain
import speechbrain as sb

Load the ASR model trained on LibriSpeech:

from speechbrain.pretrained import EncoderDecoderASR
asr_model = EncoderDecoderASR.from_hparams(source="speechbrain/asr-crdnn-rnnlm-librispeech", savedir="pretrained_model")

And get a transcript of your audio:

asr_model.transcribe_file(filename)

Record sound from microphone

This works if you have PortAudio installed on your system.

import audiofile as af
import sounddevice as sd

def record_audio(filename, seconds):
    fs = 16000
    print("recording {} ({}s) ...".format(filename, seconds))
    y = sd.rec(int(seconds * fs), samplerate=fs, channels=1)
    sd.wait()
    y = y.T
    af.write(filename, y, fs)
    print("  ... saved to {}".format(filename))