Skip to content

Commit 0908aed

Browse files
committed
Audio: MFCC: Add Voice Activity Detection based on Mel spectrum
Add mfcc_vad module with A-weighted energy-based voice activity detection that operates on the Mel log spectrum produced by the MFCC component. The algorithm tracks a per-bin noise floor with instant-down and slow-rise behavior, then computes a weighted energy delta above the floor. Speech is declared when the delta exceeds a threshold (0.35 in Q9.23) with a 20-frame hangover to prevent rapid toggling. The VAD is gated on the new enable_vad flag in sof_mfcc_config.

Add struct mfcc_data_header with six int32 fields (magic, frame_number, reserved, energy, noise_energy, vad_flag) prepended to every output frame in all format paths (S16, S24, S32). This replaces the previous magic-word-only header. The header carries the VAD decision and energy values from the DSP for downstream consumers.

Extend sof_mfcc_config in user/mfcc.h with reserved16[3] padding for 32-bit alignment, and new boolean fields enable_vad, enable_dtx, update_controls, and reserved_bool[5]. The config blob size increases from 104 to 116 bytes.

Update Matlab/Octave decode scripts (decode_mel.m, decode_ceps.m, decode_all.m) and setup_mfcc.m for the expanded header and config struct. Regenerate topology2 configuration blobs (default.conf, mel80.conf) with the new blob size.

Signed-off-by: Seppo Ingalsuo <seppo.ingalsuo@linux.intel.com>
1 parent e35a7ef commit 0908aed

13 files changed

Lines changed: 574 additions & 72 deletions

File tree

src/audio/mfcc/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,5 +4,5 @@ if(CONFIG_COMP_MFCC STREQUAL "m" AND DEFINED CONFIG_LLEXT)
44
add_subdirectory(llext ${PROJECT_BINARY_DIR}/mfcc_llext)
55
add_dependencies(app mfcc)
66
else()
7-
add_local_sources(sof mfcc.c mfcc_setup.c mfcc_common.c mfcc_generic.c mfcc_hifi4.c mfcc_hifi3.c)
7+
add_local_sources(sof mfcc.c mfcc_setup.c mfcc_common.c mfcc_generic.c mfcc_hifi4.c mfcc_hifi3.c mfcc_vad.c)
88
endif()

src/audio/mfcc/mfcc_common.c

Lines changed: 45 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,8 @@
2121
#include <stddef.h>
2222
#include <stdint.h>
2323

24+
#include <sof/audio/mfcc/mfcc_vad.h>
25+
2426
LOG_MODULE_REGISTER(mfcc_common, CONFIG_SOF_LOG_LEVEL);
2527

2628
/*
@@ -169,6 +171,21 @@ static int mfcc_stft_process(const struct comp_dev *dev, struct mfcc_comp_data *
169171

170172
cc_count += state->dct.num_out;
171173
}
174+
175+
/* Use hop counter for frame numbering (independent of VAD enable) */
176+
state->header.frame_number = state->hop_count;
177+
178+
/* Run VAD on the mel log spectrum (available in both mel-only and cepstral modes) */
179+
if (config->enable_vad)
180+
mfcc_vad_update(&cd->vad, state->mel_log_32);
181+
182+
/* Populate data header for this output frame */
183+
state->header.energy = cd->vad.energy;
184+
state->header.noise_energy = cd->vad.noise_energy;
185+
state->header.vad_flag = cd->vad.is_speech ? 1 : 0;
186+
187+
/* Increment hop counter at end of hop processing */
188+
state->hop_count++;
172189
}
173190

174191
return cc_count;
@@ -267,9 +284,8 @@ void mfcc_s16_default(struct processing_module *mod, struct input_stream_buffer
267284
struct mfcc_comp_data *cd = module_get_private_data(mod);
268285
struct mfcc_state *state = &cd->state;
269286
struct mfcc_buffer *buf = &cd->state.buf;
270-
uint32_t magic = MFCC_MAGIC;
271287
int16_t *w_ptr = audio_stream_get_wptr(sink);
272-
const int num_magic = 2;
288+
const int num_header_s16 = sizeof(state->header) / sizeof(int16_t);
273289
int num_ceps;
274290
int sink_samples;
275291
int to_copy;
@@ -280,25 +296,27 @@ void mfcc_s16_default(struct processing_module *mod, struct input_stream_buffer
280296
/* Run STFT and processing after FFT: Mel auditory filter and DCT. */
281297
num_ceps = mfcc_stft_process(mod->dev, cd);
282298

283-
/* If new output produced, set up pointer into scratch data and mark magic pending */
299+
/* If new output produced, set up pointer into scratch data and mark header pending */
284300
if (num_ceps > 0) {
285-
if (state->mel_only)
301+
if (state->mel_only) {
286302
state->out_data_ptr = state->mel_spectra->data;
287-
else
303+
} else {
288304
state->out_data_ptr = state->cepstral_coef->data;
305+
}
289306

290307
state->out_remain = num_ceps;
291-
state->magic_pending = true;
308+
state->header_pending = true;
292309
}
293310

294311
/* Write to sink, limited by period size */
295312
sink_samples = frames * audio_stream_get_channels(sink);
296313

297-
/* Write magic word first if pending */
298-
if (state->magic_pending && sink_samples >= num_magic) {
299-
w_ptr = mfcc_sink_copy_data_s16(sink, w_ptr, num_magic, (int16_t *)&magic);
300-
sink_samples -= num_magic;
301-
state->magic_pending = false;
314+
/* Write data header first if pending */
315+
if (state->header_pending && sink_samples >= num_header_s16) {
316+
w_ptr = mfcc_sink_copy_data_s16(sink, w_ptr, num_header_s16,
317+
(int16_t *)&state->header);
318+
sink_samples -= num_header_s16;
319+
state->header_pending = false;
302320
}
303321

304322
/* Write cepstral/mel data from scratch buffer */
@@ -363,9 +381,8 @@ void mfcc_s24_default(struct processing_module *mod, struct input_stream_buffer
363381
struct mfcc_comp_data *cd = module_get_private_data(mod);
364382
struct mfcc_state *state = &cd->state;
365383
struct mfcc_buffer *buf = &cd->state.buf;
366-
uint32_t magic = MFCC_MAGIC;
367384
int32_t *w_ptr = audio_stream_get_wptr(sink);
368-
const int num_magic = 1; /* one int32_t word for magic */
385+
const int num_header_s32 = sizeof(state->header) / sizeof(int32_t);
369386
int num_ceps;
370387
int sink_samples;
371388
int remain_s32;
@@ -391,17 +408,18 @@ void mfcc_s24_default(struct processing_module *mod, struct input_stream_buffer
391408
}
392409

393410
state->out_remain = num_ceps;
394-
state->magic_pending = true;
411+
state->header_pending = true;
395412
}
396413

397414
/* Write to sink, limited by period size */
398415
sink_samples = frames * audio_stream_get_channels(sink);
399416

400-
/* Write magic word first if pending */
401-
if (state->magic_pending && sink_samples >= num_magic) {
402-
w_ptr = mfcc_sink_copy_data_s32(sink, w_ptr, num_magic, (int32_t *)&magic);
403-
sink_samples -= num_magic;
404-
state->magic_pending = false;
417+
/* Write data header first if pending */
418+
if (state->header_pending && sink_samples >= num_header_s32) {
419+
w_ptr = mfcc_sink_copy_data_s32(sink, w_ptr, num_header_s32,
420+
(int32_t *)&state->header);
421+
sink_samples -= num_header_s32;
422+
state->header_pending = false;
405423
}
406424

407425
if (state->mel_only) {
@@ -443,9 +461,8 @@ void mfcc_s32_default(struct processing_module *mod, struct input_stream_buffer
443461
struct mfcc_comp_data *cd = module_get_private_data(mod);
444462
struct mfcc_state *state = &cd->state;
445463
struct mfcc_buffer *buf = &cd->state.buf;
446-
uint32_t magic = MFCC_MAGIC;
447464
int32_t *w_ptr = audio_stream_get_wptr(sink);
448-
const int num_magic = 1; /* one int32_t word for magic */
465+
const int num_header_s32 = sizeof(state->header) / sizeof(int32_t);
449466
int num_ceps;
450467
int sink_samples;
451468
int remain_s32;
@@ -466,17 +483,18 @@ void mfcc_s32_default(struct processing_module *mod, struct input_stream_buffer
466483
}
467484

468485
state->out_remain = num_ceps;
469-
state->magic_pending = true;
486+
state->header_pending = true;
470487
}
471488

472489
/* Write to sink, limited by period size */
473490
sink_samples = frames * audio_stream_get_channels(sink);
474491

475-
/* Write magic word first if pending */
476-
if (state->magic_pending && sink_samples >= num_magic) {
477-
w_ptr = mfcc_sink_copy_data_s32(sink, w_ptr, num_magic, (int32_t *)&magic);
478-
sink_samples -= num_magic;
479-
state->magic_pending = false;
492+
/* Write data header first if pending */
493+
if (state->header_pending && sink_samples >= num_header_s32) {
494+
w_ptr = mfcc_sink_copy_data_s32(sink, w_ptr, num_header_s32,
495+
(int32_t *)&state->header);
496+
sink_samples -= num_header_s32;
497+
state->header_pending = false;
480498
}
481499

482500
if (state->mel_only) {

src/audio/mfcc/mfcc_setup.c

Lines changed: 23 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,8 @@
1818
#include <stddef.h>
1919
#include <stdint.h>
2020

21+
#include <sof/audio/mfcc/mfcc_vad.h>
22+
2123
/* Definitions for cepstral lifter */
2224
#define PI_Q23 Q_CONVERT_FLOAT(3.1415926536, 23)
2325
#define TWO_PI_Q23 Q_CONVERT_FLOAT(6.2831853072, 23)
@@ -127,6 +129,11 @@ int mfcc_setup(struct processing_module *mod, int max_frames, int sample_rate, i
127129
return -EINVAL;
128130
}
129131

132+
if (sample_rate > MFCC_MAX_SAMPLE_RATE) {
133+
comp_err(dev, "Sample rate %d exceeds max %d Hz", sample_rate, MFCC_MAX_SAMPLE_RATE);
134+
return -EINVAL;
135+
}
136+
130137
if (config->sample_frequency != sample_rate) {
131138
comp_err(dev, "Config sample_frequency does not match stream");
132139
return -EINVAL;
@@ -328,11 +335,11 @@ int mfcc_setup(struct processing_module *mod, int max_frames, int sample_rate, i
328335

329336
/* Check that output data can be drained within the periods spanned by one
330337
* FFT hop. Each hop consumes fft_hop_size input samples and produces
331-
* max_out_per_hop + 2 (magic) int16_t output values. The sink provides at
332-
* least fft_hop_size * channels int16_t samples per hop (worst case s16).
338+
* max_out_per_hop + 12 (data header) int16_t output values. The sink provides
339+
* at least fft_hop_size * channels int16_t samples per hop (worst case s16).
333340
* If output exceeds this, data accumulates and will eventually overflow.
334341
*/
335-
int out_per_hop = max_out_per_hop + 2;
342+
int out_per_hop = max_out_per_hop + sizeof(state->header) / sizeof(int16_t);
336343
int sink_per_hop = fft->fft_hop_size * channels;
337344

338345
if (out_per_hop > sink_per_hop) {
@@ -345,11 +352,21 @@ int mfcc_setup(struct processing_module *mod, int max_frames, int sample_rate, i
345352
/* Set initial state for STFT */
346353
state->waiting_fill = true;
347354
state->prev_samples_valid = false;
348-
state->magic_pending = false;
355+
state->header_pending = false;
356+
memset(&state->header, 0, sizeof(state->header));
357+
state->header.magic = MFCC_MAGIC;
349358
state->out_data_ptr = NULL;
350359
state->out_data_ptr_32 = NULL;
351360
state->out_remain = 0;
352361

362+
if (config->enable_vad) {
363+
ret = mfcc_vad_init(&cd->vad, config->num_mel_bins, sample_rate, mod);
364+
if (ret < 0) {
365+
comp_err(dev, "Failed VAD init");
366+
goto free_lifter;
367+
}
368+
}
369+
353370
comp_dbg(dev, "done");
354371
return 0;
355372

@@ -389,4 +406,6 @@ void mfcc_free_buffers(struct processing_module *mod)
389406
mod_free(mod, cd->state.melfb.data);
390407
mod_free(mod, cd->state.dct.matrix);
391408
mod_free(mod, cd->state.lifter.matrix);
409+
mod_free(mod, cd->vad.noise_floor);
410+
mod_free(mod, cd->vad.weights);
392411
}

0 commit comments

Comments
 (0)