freeswitch/src/switch_vad.c

268 lines
7.2 KiB
C

/*
* FreeSWITCH Modular Media Switching Software Library / Soft-Switch Application
* Copyright (C) 2018-2020, Anthony Minessale II <anthm@freeswitch.org>
*
* Version: MPL 1.1
*
* The contents of this file are subject to the Mozilla Public License Version
* 1.1 (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
* http://www.mozilla.org/MPL/
*
* Software distributed under the License is distributed on an "AS IS" basis,
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
* for the specific language governing rights and limitations under the
* License.
*
* The Original Code is FreeSWITCH Modular Media Switching Software Library / Soft-Switch Application
*
* The Initial Developer of the Original Code is
* Anthony Minessale II <anthm@freeswitch.org>
* Portions created by the Initial Developer are Copyright (C)
* the Initial Developer. All Rights Reserved.
*
* Contributor(s):
*
* Seven Du <dujinfang@gmail.com>
* Chris Rienzo <chris@signalwire.com>
*
*
* switch_vad.c VAD code with optional libfvad
*
*/
#include <switch.h>
#ifdef SWITCH_HAVE_FVAD
#include <fvad.h>
#endif
struct switch_vad_s {
// configs
int channels;
int sample_rate;
int debug;
int divisor;
int thresh;
int voice_samples_thresh;
int silence_samples_thresh;
// VAD state
int voice_samples;
int silence_samples;
switch_vad_state_t vad_state;
#ifdef SWITCH_HAVE_FVAD
Fvad *fvad;
#endif
};
SWITCH_DECLARE(const char *) switch_vad_state2str(switch_vad_state_t state)
{
switch(state) {
case SWITCH_VAD_STATE_NONE:
return "none";
case SWITCH_VAD_STATE_START_TALKING:
return "start_talking";
case SWITCH_VAD_STATE_TALKING:
return "talking";
case SWITCH_VAD_STATE_STOP_TALKING:
return "stop_talking";
default:
return "error";
}
}
SWITCH_DECLARE(switch_vad_t *) switch_vad_init(int sample_rate, int channels)
{
switch_vad_t *vad = malloc(sizeof(switch_vad_t));
if (!vad) return NULL;
memset(vad, 0, sizeof(*vad));
vad->sample_rate = sample_rate ? sample_rate : 8000;
vad->channels = channels;
vad->silence_samples_thresh = 500 * (vad->sample_rate / 1000);
vad->voice_samples_thresh = 200 * (vad->sample_rate / 1000);
vad->thresh = 100;
vad->divisor = vad->sample_rate / 8000;
if (vad->divisor <= 0) {
vad->divisor = 1;
}
switch_vad_reset(vad);
return vad;
}
SWITCH_DECLARE(int) switch_vad_set_mode(switch_vad_t *vad, int mode)
{
#ifdef SWITCH_HAVE_FVAD
int ret = 0;
if (mode < 0) {
if (vad->fvad) fvad_free(vad->fvad);
vad->fvad = NULL;
return ret;
} else if (mode > 3) {
mode = 3;
}
if (!vad->fvad) {
vad->fvad = fvad_new();
if (!vad->fvad) {
switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_WARNING, "libfvad init error\n");
}
}
if (vad->fvad) {
ret = fvad_set_mode(vad->fvad, mode);
fvad_set_sample_rate(vad->fvad, vad->sample_rate);
}
switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_INFO, "libfvad started, mode = %d\n", mode);
return ret;
#else
if (vad->debug) switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_INFO, "set vad mode = %d\n", mode);
return 0;
#endif
}
SWITCH_DECLARE(void) switch_vad_set_param(switch_vad_t *vad, const char *key, int val)
{
if (!key) return;
if (!strcmp(key, "hangover_len")) {
/* convert old-style hits to samples assuming 20ms ptime */
switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_WARNING, "hangover_len is deprecated, setting silence_ms to %d\n", 20 * val);
switch_vad_set_param(vad, "silence_ms", val * 20);
} else if (!strcmp(key, "silence_ms")) {
if (val > 0) {
vad->silence_samples_thresh = val * (vad->sample_rate / 1000);
} else {
switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_WARNING, "Ignoring invalid silence_ms of %d\n", val);
}
} else if (!strcmp(key, "thresh")) {
vad->thresh = val;
} else if (!strcmp(key, "debug")) {
vad->debug = val;
} else if (!strcmp(key, "voice_ms")) {
if (val > 0) {
vad->voice_samples_thresh = val * (vad->sample_rate / 1000);
} else {
switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_WARNING, "Ignoring invalid voice_ms of %d\n", val);
}
} else if (!strcmp(key, "listen_hits")) {
/* convert old-style hits to samples assuming 20ms ptime */
switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_WARNING, "listen_hits is deprecated, setting voice_ms to %d\n", 20 * val);
switch_vad_set_param(vad, "voice_ms", 20 * val);
}
if (vad->debug) {
switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_INFO, "set %s to %d\n", key, val);
}
}
SWITCH_DECLARE(void) switch_vad_reset(switch_vad_t *vad)
{
#ifdef SWITCH_HAVE_FVAD
if (vad->fvad) {
fvad_reset(vad->fvad);
}
#endif
vad->vad_state = SWITCH_VAD_STATE_NONE;
vad->voice_samples = 0;
vad->silence_samples = 0;
if (vad->debug) switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_INFO, "reset vad state\n");
}
SWITCH_DECLARE(switch_vad_state_t) switch_vad_process(switch_vad_t *vad, int16_t *data, unsigned int samples)
{
int score = 0;
// Each frame has 2 possible outcomes- voice or not voice.
// The VAD has 2 real states- talking / not talking with
// begin talking and stop talking as events to mark transitions
// determine if this is a voice or non-voice frame
#ifdef SWITCH_HAVE_FVAD
if (vad->fvad) {
// fvad returns -1, 0, or 1
// -1: error
// 0: non-voice frame
// 1: voice frame
int ret = fvad_process(vad->fvad, data, samples);
// if voice frame set score > threshold
score = ret > 0 ? vad->thresh + 100 : 0;
} else {
#endif
int energy = 0, j = 0, count = 0;
for (energy = 0, j = 0, count = 0; count < samples; count++) {
energy += abs(data[j]);
j += vad->channels;
}
if (samples && vad->divisor && samples >= vad->divisor) {
score = (uint32_t)(energy / (samples / vad->divisor));
}
#ifdef SWITCH_HAVE_FVAD
}
#endif
if (vad->debug > 9) {
switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "score: %d\n", score);
}
// clear the STOP/START TALKING events
if (vad->vad_state == SWITCH_VAD_STATE_STOP_TALKING) {
vad->vad_state = SWITCH_VAD_STATE_NONE;
} else if (vad->vad_state == SWITCH_VAD_STATE_START_TALKING) {
vad->vad_state = SWITCH_VAD_STATE_TALKING;
}
// adjust voice/silence run length counters
if (score > vad->thresh) {
vad->silence_samples = 0;
vad->voice_samples += samples;
} else {
vad->silence_samples += samples;
vad->voice_samples = 0;
}
// check for state transitions
if (vad->vad_state == SWITCH_VAD_STATE_TALKING && vad->silence_samples > vad->silence_samples_thresh) {
vad->vad_state = SWITCH_VAD_STATE_STOP_TALKING;
if (vad->debug) switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_NOTICE, "vad state STOP_TALKING\n");
} else if (vad->vad_state == SWITCH_VAD_STATE_NONE && vad->voice_samples > vad->voice_samples_thresh) {
vad->vad_state = SWITCH_VAD_STATE_START_TALKING;
if (vad->debug) switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_INFO, "vad state START_TALKING\n");
}
if (vad->debug > 9) switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_INFO, "vad state %s\n", switch_vad_state2str(vad->vad_state));
return vad->vad_state;
}
SWITCH_DECLARE(switch_vad_state_t) switch_vad_get_state(switch_vad_t *vad)
{
return vad->vad_state;
}
SWITCH_DECLARE(void) switch_vad_destroy(switch_vad_t **vad)
{
if (*vad) {
#ifdef SWITCH_HAVE_FVAD
if ((*vad)->fvad) fvad_free ((*vad)->fvad);
#endif
free(*vad);
*vad = NULL;
}
}