Since version 0.96, nkululeko provides linguistic feature extractors, i.e. extractors that use the text of the spoken words as input.
Of course, you can combine them with acoustic features and use any suitable model architecture with them.
[EXP]
# optional: language of the spoken text, used by the linguistic feature extractors
language = de
[DATA]
# location of the database used in this example
data = ../mydata
# the linguistic feature extractors require a column named "text"
# example column mapping: here a column called "transcription" is mapped to "text";
# this is probably not needed if the data already has a "text" column
data.col_names = {"transcription":"text"}
[FEAT]
# combine linguistic bert features with acoustic open smile features
# (the list selects both extractors: 'bert' for the text, 'os' for the audio)
type = ['bert', 'os']
[MODEL]
# model architecture used with the combined features (xgb in this example)
type = xgb