Diffstat (limited to 'src/common_audio/vad/main')
-rw-r--r-- | src/common_audio/vad/main/interface/webrtc_vad.h | 159
-rw-r--r-- | src/common_audio/vad/main/source/vad.gypi | 48
-rw-r--r-- | src/common_audio/vad/main/source/vad_const.c | 80
-rw-r--r-- | src/common_audio/vad/main/source/vad_const.h | 59
-rw-r--r-- | src/common_audio/vad/main/source/vad_core.c | 685
-rw-r--r-- | src/common_audio/vad/main/source/vad_core.h | 132
-rw-r--r-- | src/common_audio/vad/main/source/vad_defines.h | 95
-rw-r--r-- | src/common_audio/vad/main/source/vad_filterbank.c | 267
-rw-r--r-- | src/common_audio/vad/main/source/vad_filterbank.h | 143
-rw-r--r-- | src/common_audio/vad/main/source/vad_gmm.c | 70
-rw-r--r-- | src/common_audio/vad/main/source/vad_gmm.h | 47
-rw-r--r-- | src/common_audio/vad/main/source/vad_sp.c | 231
-rw-r--r-- | src/common_audio/vad/main/source/vad_sp.h | 60
-rw-r--r-- | src/common_audio/vad/main/source/webrtc_vad.c | 197
-rw-r--r-- | src/common_audio/vad/main/test/unit_test/unit_test.cc | 123
-rw-r--r-- | src/common_audio/vad/main/test/unit_test/unit_test.h | 28 |
16 files changed, 2424 insertions, 0 deletions
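Taken together, these files add a small public API, declared in webrtc_vad.h below. A minimal usage sketch (not part of the change; it assumes an 8 kHz input with a 10 ms frame, i.e. 80 samples, and that typedefs.h is on the include path):

#include <stdio.h>
#include "webrtc_vad.h"

int main(void) {
    VadInst* vad = NULL;
    WebRtc_Word16 frame[80] = {0};  /* 10 ms of silence at 8 kHz */

    if (WebRtcVad_Create(&vad) != 0) return 1;
    if (WebRtcVad_Init(vad) != 0) return 1;
    if (WebRtcVad_set_mode(vad, 3) != 0) return 1;  /* 3 = most aggressive */

    /* Per the header: 1 = active voice, 0 = non-active voice, -1 = error.
     * Valid frame lengths correspond to 10, 20 or 30 ms at fs = 8000,
     * 16000 or 32000 Hz. */
    WebRtc_Word16 decision = WebRtcVad_Process(vad, 8000, frame, 80);
    printf("VAD decision: %d\n", decision);

    WebRtcVad_Free(vad);
    return 0;
}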
diff --git a/src/common_audio/vad/main/interface/webrtc_vad.h b/src/common_audio/vad/main/interface/webrtc_vad.h new file mode 100644 index 0000000..6e3eb74 --- /dev/null +++ b/src/common_audio/vad/main/interface/webrtc_vad.h @@ -0,0 +1,159 @@ +/* + * Copyright (c) 2011 The WebRTC project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + + +/* + * This header file includes the VAD API calls. Specific function calls are given below. + */ + +#ifndef WEBRTC_VAD_WEBRTC_VAD_H_ +#define WEBRTC_VAD_WEBRTC_VAD_H_ + +#include "typedefs.h" + +typedef struct WebRtcVadInst VadInst; + +#ifdef __cplusplus +extern "C" +{ +#endif + +/**************************************************************************** + * WebRtcVad_get_version(...) + * + * This function returns the version number of the code. + * + * Output: + * - version : Pointer to a buffer where the version info will + * be stored. + * Input: + * - size_bytes : Size of the buffer. + * + */ +WebRtc_Word16 WebRtcVad_get_version(char *version, size_t size_bytes); + +/**************************************************************************** + * WebRtcVad_AssignSize(...) + * + * This functions get the size needed for storing the instance for encoder + * and decoder, respectively + * + * Input/Output: + * - size_in_bytes : Pointer to integer where the size is returned + * + * Return value : 0 + */ +WebRtc_Word16 WebRtcVad_AssignSize(int *size_in_bytes); + +/**************************************************************************** + * WebRtcVad_Assign(...) + * + * This functions Assigns memory for the instances. + * + * Input: + * - vad_inst_addr : Address to where to assign memory + * Output: + * - vad_inst : Pointer to the instance that should be created + * + * Return value : 0 - Ok + * -1 - Error + */ +WebRtc_Word16 WebRtcVad_Assign(VadInst **vad_inst, void *vad_inst_addr); + +/**************************************************************************** + * WebRtcVad_Create(...) + * + * This function creates an instance to the VAD structure + * + * Input: + * - vad_inst : Pointer to VAD instance that should be created + * + * Output: + * - vad_inst : Pointer to created VAD instance + * + * Return value : 0 - Ok + * -1 - Error + */ +WebRtc_Word16 WebRtcVad_Create(VadInst **vad_inst); + +/**************************************************************************** + * WebRtcVad_Free(...) + * + * This function frees the dynamic memory of a specified VAD instance + * + * Input: + * - vad_inst : Pointer to VAD instance that should be freed + * + * Return value : 0 - Ok + * -1 - Error + */ +WebRtc_Word16 WebRtcVad_Free(VadInst *vad_inst); + +/**************************************************************************** + * WebRtcVad_Init(...) + * + * This function initializes a VAD instance + * + * Input: + * - vad_inst : Instance that should be initialized + * + * Output: + * - vad_inst : Initialized instance + * + * Return value : 0 - Ok + * -1 - Error + */ +WebRtc_Word16 WebRtcVad_Init(VadInst *vad_inst); + +/**************************************************************************** + * WebRtcVad_set_mode(...) 
+ * + * This function initializes a VAD instance + * + * Input: + * - vad_inst : VAD instance + * - mode : Aggressiveness setting (0, 1, 2, or 3) + * + * Output: + * - vad_inst : Initialized instance + * + * Return value : 0 - Ok + * -1 - Error + */ +WebRtc_Word16 WebRtcVad_set_mode(VadInst *vad_inst, WebRtc_Word16 mode); + +/**************************************************************************** + * WebRtcVad_Process(...) + * + * This functions does a VAD for the inserted speech frame + * + * Input + * - vad_inst : VAD Instance. Needs to be initiated before call. + * - fs : sampling frequency (Hz): 8000, 16000, or 32000 + * - speech_frame : Pointer to speech frame buffer + * - frame_length : Length of speech frame buffer in number of samples + * + * Output: + * - vad_inst : Updated VAD instance + * + * Return value : 1 - Active Voice + * 0 - Non-active Voice + * -1 - Error + */ +WebRtc_Word16 WebRtcVad_Process(VadInst *vad_inst, + WebRtc_Word16 fs, + WebRtc_Word16 *speech_frame, + WebRtc_Word16 frame_length); + +#ifdef __cplusplus +} +#endif + +#endif // WEBRTC_VAD_WEBRTC_VAD_H_ diff --git a/src/common_audio/vad/main/source/vad.gypi b/src/common_audio/vad/main/source/vad.gypi new file mode 100644 index 0000000..7b23ae8 --- /dev/null +++ b/src/common_audio/vad/main/source/vad.gypi @@ -0,0 +1,48 @@ +# Copyright (c) 2011 The WebRTC project authors. All Rights Reserved. +# +# Use of this source code is governed by a BSD-style license +# that can be found in the LICENSE file in the root of the source +# tree. An additional intellectual property rights grant can be found +# in the file PATENTS. All contributing project authors may +# be found in the AUTHORS file in the root of the source tree. + +{ + 'targets': [ + { + 'target_name': 'vad', + 'type': '<(library)', + 'dependencies': [ + 'spl', + ], + 'include_dirs': [ + '../interface', + ], + 'direct_dependent_settings': { + 'include_dirs': [ + '../interface', + ], + }, + 'sources': [ + '../interface/webrtc_vad.h', + 'webrtc_vad.c', + 'vad_const.c', + 'vad_const.h', + 'vad_defines.h', + 'vad_core.c', + 'vad_core.h', + 'vad_filterbank.c', + 'vad_filterbank.h', + 'vad_gmm.c', + 'vad_gmm.h', + 'vad_sp.c', + 'vad_sp.h', + ], + }, + ], +} + +# Local Variables: +# tab-width:2 +# indent-tabs-mode:nil +# End: +# vim: set expandtab tabstop=2 shiftwidth=2: diff --git a/src/common_audio/vad/main/source/vad_const.c b/src/common_audio/vad/main/source/vad_const.c new file mode 100644 index 0000000..47b6a4b --- /dev/null +++ b/src/common_audio/vad/main/source/vad_const.c @@ -0,0 +1,80 @@ +/* + * Copyright (c) 2011 The WebRTC project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +/* + * This file includes the constant values used internally in VAD. 
+ */ + +#include "vad_const.h" + +// Spectrum Weighting +const WebRtc_Word16 kSpectrumWeight[6] = {6, 8, 10, 12, 14, 16}; + +const WebRtc_Word16 kCompVar = 22005; + +// Constant 160*log10(2) in Q9 +const WebRtc_Word16 kLogConst = 24660; + +// Constant log2(exp(1)) in Q12 +const WebRtc_Word16 kLog10Const = 5909; + +// Q15 +const WebRtc_Word16 kNoiseUpdateConst = 655; +const WebRtc_Word16 kSpeechUpdateConst = 6554; + +// Q8 +const WebRtc_Word16 kBackEta = 154; + +// Coefficients used by WebRtcVad_HpOutput, Q14 +const WebRtc_Word16 kHpZeroCoefs[3] = {6631, -13262, 6631}; +const WebRtc_Word16 kHpPoleCoefs[3] = {16384, -7756, 5620}; + +// Allpass filter coefficients, upper and lower, in Q15 +// Upper: 0.64, Lower: 0.17 +const WebRtc_Word16 kAllPassCoefsQ15[2] = {20972, 5571}; +const WebRtc_Word16 kAllPassCoefsQ13[2] = {5243, 1392}; // Q13 + +// Minimum difference between the two models, Q5 +const WebRtc_Word16 kMinimumDifference[6] = {544, 544, 576, 576, 576, 576}; + +// Upper limit of mean value for speech model, Q7 +const WebRtc_Word16 kMaximumSpeech[6] = {11392, 11392, 11520, 11520, 11520, 11520}; + +// Minimum value for mean value +const WebRtc_Word16 kMinimumMean[2] = {640, 768}; + +// Upper limit of mean value for noise model, Q7 +const WebRtc_Word16 kMaximumNoise[6] = {9216, 9088, 8960, 8832, 8704, 8576}; + +// Adjustment for division with two in WebRtcVad_SplitFilter +const WebRtc_Word16 kOffsetVector[6] = {368, 368, 272, 176, 176, 176}; + +// Start values for the Gaussian models, Q7 +// Weights for the two Gaussians for the six channels (noise) +const WebRtc_Word16 kNoiseDataWeights[12] = {34, 62, 72, 66, 53, 25, 94, 66, 56, 62, 75, 103}; + +// Weights for the two Gaussians for the six channels (speech) +const WebRtc_Word16 kSpeechDataWeights[12] = {48, 82, 45, 87, 50, 47, 80, 46, 83, 41, 78, 81}; + +// Means for the two Gaussians for the six channels (noise) +const WebRtc_Word16 kNoiseDataMeans[12] = {6738, 4892, 7065, 6715, 6771, 3369, 7646, 3863, + 7820, 7266, 5020, 4362}; + +// Means for the two Gaussians for the six channels (speech) +const WebRtc_Word16 kSpeechDataMeans[12] = {8306, 10085, 10078, 11823, 11843, 6309, 9473, + 9571, 10879, 7581, 8180, 7483}; + +// Stds for the two Gaussians for the six channels (noise) +const WebRtc_Word16 kNoiseDataStds[12] = {378, 1064, 493, 582, 688, 593, 474, 697, 475, 688, + 421, 455}; + +// Stds for the two Gaussians for the six channels (speech) +const WebRtc_Word16 kSpeechDataStds[12] = {555, 505, 567, 524, 585, 1231, 509, 828, 492, 1540, + 1079, 850}; diff --git a/src/common_audio/vad/main/source/vad_const.h b/src/common_audio/vad/main/source/vad_const.h new file mode 100644 index 0000000..8980437 --- /dev/null +++ b/src/common_audio/vad/main/source/vad_const.h @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2011 The WebRTC project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + + +/* + * This header file includes the declarations of the internally used constants. + */ + +#ifndef WEBRTC_VAD_CONST_H_ +#define WEBRTC_VAD_CONST_H_ + +#include "typedefs.h" + +// TODO(ajm): give these internal-linkage by moving to the appropriate file +// where possible, and otherwise tag with WebRtcVad_. 
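The Q-format annotations in vad_const.c above are easy to sanity-check: a value annotated QN encodes x / 2^N. A standalone float check of two of the constants (a reference for the reader, not part of the library; compile with -lm):

#include <math.h>
#include <stdio.h>

int main(void) {
    /* kLogConst is documented as 160*log10(2) in Q9. */
    printf("24660 / 2^9  = %.4f, 160*log10(2) = %.4f\n",
           24660.0 / 512.0, 160.0 * log10(2.0));
    /* kLog10Const is documented as log2(exp(1)) in Q12. */
    printf("5909 / 2^12  = %.4f, log2(e)      = %.4f\n",
           5909.0 / 4096.0, log2(exp(1.0)));
    return 0;
}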
+ +// Spectrum Weighting +extern const WebRtc_Word16 kSpectrumWeight[]; +extern const WebRtc_Word16 kCompVar; +// Logarithm constant +extern const WebRtc_Word16 kLogConst; +extern const WebRtc_Word16 kLog10Const; +// Q15 +extern const WebRtc_Word16 kNoiseUpdateConst; +extern const WebRtc_Word16 kSpeechUpdateConst; +// Q8 +extern const WebRtc_Word16 kBackEta; +// Coefficients used by WebRtcVad_HpOutput, Q14 +extern const WebRtc_Word16 kHpZeroCoefs[]; +extern const WebRtc_Word16 kHpPoleCoefs[]; +// Allpass filter coefficients, upper and lower, in Q15 resp. Q13 +extern const WebRtc_Word16 kAllPassCoefsQ15[]; +extern const WebRtc_Word16 kAllPassCoefsQ13[]; +// Minimum difference between the two models, Q5 +extern const WebRtc_Word16 kMinimumDifference[]; +// Maximum value when updating the speech model, Q7 +extern const WebRtc_Word16 kMaximumSpeech[]; +// Minimum value for mean value +extern const WebRtc_Word16 kMinimumMean[]; +// Upper limit of mean value for noise model, Q7 +extern const WebRtc_Word16 kMaximumNoise[]; +// Adjustment for division with two in WebRtcVad_SplitFilter +extern const WebRtc_Word16 kOffsetVector[]; +// Start values for the Gaussian models, Q7 +extern const WebRtc_Word16 kNoiseDataWeights[]; +extern const WebRtc_Word16 kSpeechDataWeights[]; +extern const WebRtc_Word16 kNoiseDataMeans[]; +extern const WebRtc_Word16 kSpeechDataMeans[]; +extern const WebRtc_Word16 kNoiseDataStds[]; +extern const WebRtc_Word16 kSpeechDataStds[]; + +#endif // WEBRTC_VAD_CONST_H_ diff --git a/src/common_audio/vad/main/source/vad_core.c b/src/common_audio/vad/main/source/vad_core.c new file mode 100644 index 0000000..85864e1 --- /dev/null +++ b/src/common_audio/vad/main/source/vad_core.c @@ -0,0 +1,685 @@ +/* + * Copyright (c) 2011 The WebRTC project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + + +/* + * This file includes the implementation of the core functionality in VAD. + * For function description, see vad_core.h. 
+ */ + +#include "vad_core.h" +#include "vad_const.h" +#include "vad_defines.h" +#include "vad_filterbank.h" +#include "vad_gmm.h" +#include "vad_sp.h" +#include "signal_processing_library.h" + +static const int kInitCheck = 42; + +// Initialize VAD +int WebRtcVad_InitCore(VadInstT *inst, short mode) +{ + int i; + + // Initialization of struct + inst->vad = 1; + inst->frame_counter = 0; + inst->over_hang = 0; + inst->num_of_speech = 0; + + // Initialization of downsampling filter state + inst->downsampling_filter_states[0] = 0; + inst->downsampling_filter_states[1] = 0; + inst->downsampling_filter_states[2] = 0; + inst->downsampling_filter_states[3] = 0; + + // Read initial PDF parameters + for (i = 0; i < NUM_TABLE_VALUES; i++) + { + inst->noise_means[i] = kNoiseDataMeans[i]; + inst->speech_means[i] = kSpeechDataMeans[i]; + inst->noise_stds[i] = kNoiseDataStds[i]; + inst->speech_stds[i] = kSpeechDataStds[i]; + } + + // Index and Minimum value vectors are initialized + for (i = 0; i < 16 * NUM_CHANNELS; i++) + { + inst->low_value_vector[i] = 10000; + inst->index_vector[i] = 0; + } + + for (i = 0; i < 5; i++) + { + inst->upper_state[i] = 0; + inst->lower_state[i] = 0; + } + + for (i = 0; i < 4; i++) + { + inst->hp_filter_state[i] = 0; + } + + // Init mean value memory, for FindMin function + inst->mean_value[0] = 1600; + inst->mean_value[1] = 1600; + inst->mean_value[2] = 1600; + inst->mean_value[3] = 1600; + inst->mean_value[4] = 1600; + inst->mean_value[5] = 1600; + + if (mode == 0) + { + // Quality mode + inst->over_hang_max_1[0] = OHMAX1_10MS_Q; // Overhang short speech burst + inst->over_hang_max_1[1] = OHMAX1_20MS_Q; // Overhang short speech burst + inst->over_hang_max_1[2] = OHMAX1_30MS_Q; // Overhang short speech burst + inst->over_hang_max_2[0] = OHMAX2_10MS_Q; // Overhang long speech burst + inst->over_hang_max_2[1] = OHMAX2_20MS_Q; // Overhang long speech burst + inst->over_hang_max_2[2] = OHMAX2_30MS_Q; // Overhang long speech burst + + inst->individual[0] = INDIVIDUAL_10MS_Q; + inst->individual[1] = INDIVIDUAL_20MS_Q; + inst->individual[2] = INDIVIDUAL_30MS_Q; + + inst->total[0] = TOTAL_10MS_Q; + inst->total[1] = TOTAL_20MS_Q; + inst->total[2] = TOTAL_30MS_Q; + } else if (mode == 1) + { + // Low bitrate mode + inst->over_hang_max_1[0] = OHMAX1_10MS_LBR; // Overhang short speech burst + inst->over_hang_max_1[1] = OHMAX1_20MS_LBR; // Overhang short speech burst + inst->over_hang_max_1[2] = OHMAX1_30MS_LBR; // Overhang short speech burst + inst->over_hang_max_2[0] = OHMAX2_10MS_LBR; // Overhang long speech burst + inst->over_hang_max_2[1] = OHMAX2_20MS_LBR; // Overhang long speech burst + inst->over_hang_max_2[2] = OHMAX2_30MS_LBR; // Overhang long speech burst + + inst->individual[0] = INDIVIDUAL_10MS_LBR; + inst->individual[1] = INDIVIDUAL_20MS_LBR; + inst->individual[2] = INDIVIDUAL_30MS_LBR; + + inst->total[0] = TOTAL_10MS_LBR; + inst->total[1] = TOTAL_20MS_LBR; + inst->total[2] = TOTAL_30MS_LBR; + } else if (mode == 2) + { + // Aggressive mode + inst->over_hang_max_1[0] = OHMAX1_10MS_AGG; // Overhang short speech burst + inst->over_hang_max_1[1] = OHMAX1_20MS_AGG; // Overhang short speech burst + inst->over_hang_max_1[2] = OHMAX1_30MS_AGG; // Overhang short speech burst + inst->over_hang_max_2[0] = OHMAX2_10MS_AGG; // Overhang long speech burst + inst->over_hang_max_2[1] = OHMAX2_20MS_AGG; // Overhang long speech burst + inst->over_hang_max_2[2] = OHMAX2_30MS_AGG; // Overhang long speech burst + + inst->individual[0] = INDIVIDUAL_10MS_AGG; + inst->individual[1] = 
INDIVIDUAL_20MS_AGG; + inst->individual[2] = INDIVIDUAL_30MS_AGG; + + inst->total[0] = TOTAL_10MS_AGG; + inst->total[1] = TOTAL_20MS_AGG; + inst->total[2] = TOTAL_30MS_AGG; + } else + { + // Very aggressive mode + inst->over_hang_max_1[0] = OHMAX1_10MS_VAG; // Overhang short speech burst + inst->over_hang_max_1[1] = OHMAX1_20MS_VAG; // Overhang short speech burst + inst->over_hang_max_1[2] = OHMAX1_30MS_VAG; // Overhang short speech burst + inst->over_hang_max_2[0] = OHMAX2_10MS_VAG; // Overhang long speech burst + inst->over_hang_max_2[1] = OHMAX2_20MS_VAG; // Overhang long speech burst + inst->over_hang_max_2[2] = OHMAX2_30MS_VAG; // Overhang long speech burst + + inst->individual[0] = INDIVIDUAL_10MS_VAG; + inst->individual[1] = INDIVIDUAL_20MS_VAG; + inst->individual[2] = INDIVIDUAL_30MS_VAG; + + inst->total[0] = TOTAL_10MS_VAG; + inst->total[1] = TOTAL_20MS_VAG; + inst->total[2] = TOTAL_30MS_VAG; + } + + inst->init_flag = kInitCheck; + + return 0; +} + +// Set aggressiveness mode +int WebRtcVad_set_mode_core(VadInstT *inst, short mode) +{ + + if (mode == 0) + { + // Quality mode + inst->over_hang_max_1[0] = OHMAX1_10MS_Q; // Overhang short speech burst + inst->over_hang_max_1[1] = OHMAX1_20MS_Q; // Overhang short speech burst + inst->over_hang_max_1[2] = OHMAX1_30MS_Q; // Overhang short speech burst + inst->over_hang_max_2[0] = OHMAX2_10MS_Q; // Overhang long speech burst + inst->over_hang_max_2[1] = OHMAX2_20MS_Q; // Overhang long speech burst + inst->over_hang_max_2[2] = OHMAX2_30MS_Q; // Overhang long speech burst + + inst->individual[0] = INDIVIDUAL_10MS_Q; + inst->individual[1] = INDIVIDUAL_20MS_Q; + inst->individual[2] = INDIVIDUAL_30MS_Q; + + inst->total[0] = TOTAL_10MS_Q; + inst->total[1] = TOTAL_20MS_Q; + inst->total[2] = TOTAL_30MS_Q; + } else if (mode == 1) + { + // Low bitrate mode + inst->over_hang_max_1[0] = OHMAX1_10MS_LBR; // Overhang short speech burst + inst->over_hang_max_1[1] = OHMAX1_20MS_LBR; // Overhang short speech burst + inst->over_hang_max_1[2] = OHMAX1_30MS_LBR; // Overhang short speech burst + inst->over_hang_max_2[0] = OHMAX2_10MS_LBR; // Overhang long speech burst + inst->over_hang_max_2[1] = OHMAX2_20MS_LBR; // Overhang long speech burst + inst->over_hang_max_2[2] = OHMAX2_30MS_LBR; // Overhang long speech burst + + inst->individual[0] = INDIVIDUAL_10MS_LBR; + inst->individual[1] = INDIVIDUAL_20MS_LBR; + inst->individual[2] = INDIVIDUAL_30MS_LBR; + + inst->total[0] = TOTAL_10MS_LBR; + inst->total[1] = TOTAL_20MS_LBR; + inst->total[2] = TOTAL_30MS_LBR; + } else if (mode == 2) + { + // Aggressive mode + inst->over_hang_max_1[0] = OHMAX1_10MS_AGG; // Overhang short speech burst + inst->over_hang_max_1[1] = OHMAX1_20MS_AGG; // Overhang short speech burst + inst->over_hang_max_1[2] = OHMAX1_30MS_AGG; // Overhang short speech burst + inst->over_hang_max_2[0] = OHMAX2_10MS_AGG; // Overhang long speech burst + inst->over_hang_max_2[1] = OHMAX2_20MS_AGG; // Overhang long speech burst + inst->over_hang_max_2[2] = OHMAX2_30MS_AGG; // Overhang long speech burst + + inst->individual[0] = INDIVIDUAL_10MS_AGG; + inst->individual[1] = INDIVIDUAL_20MS_AGG; + inst->individual[2] = INDIVIDUAL_30MS_AGG; + + inst->total[0] = TOTAL_10MS_AGG; + inst->total[1] = TOTAL_20MS_AGG; + inst->total[2] = TOTAL_30MS_AGG; + } else if (mode == 3) + { + // Very aggressive mode + inst->over_hang_max_1[0] = OHMAX1_10MS_VAG; // Overhang short speech burst + inst->over_hang_max_1[1] = OHMAX1_20MS_VAG; // Overhang short speech burst + inst->over_hang_max_1[2] = OHMAX1_30MS_VAG; // Overhang 
short speech burst + inst->over_hang_max_2[0] = OHMAX2_10MS_VAG; // Overhang long speech burst + inst->over_hang_max_2[1] = OHMAX2_20MS_VAG; // Overhang long speech burst + inst->over_hang_max_2[2] = OHMAX2_30MS_VAG; // Overhang long speech burst + + inst->individual[0] = INDIVIDUAL_10MS_VAG; + inst->individual[1] = INDIVIDUAL_20MS_VAG; + inst->individual[2] = INDIVIDUAL_30MS_VAG; + + inst->total[0] = TOTAL_10MS_VAG; + inst->total[1] = TOTAL_20MS_VAG; + inst->total[2] = TOTAL_30MS_VAG; + } else + { + return -1; + } + + return 0; +} + +// Calculate VAD decision by first extracting feature values and then calculate +// probability for both speech and background noise. + +WebRtc_Word16 WebRtcVad_CalcVad32khz(VadInstT *inst, WebRtc_Word16 *speech_frame, + int frame_length) +{ + WebRtc_Word16 len, vad; + WebRtc_Word16 speechWB[480]; // Downsampled speech frame: 960 samples (30ms in SWB) + WebRtc_Word16 speechNB[240]; // Downsampled speech frame: 480 samples (30ms in WB) + + + // Downsample signal 32->16->8 before doing VAD + WebRtcVad_Downsampling(speech_frame, speechWB, &(inst->downsampling_filter_states[2]), + frame_length); + len = WEBRTC_SPL_RSHIFT_W16(frame_length, 1); + + WebRtcVad_Downsampling(speechWB, speechNB, inst->downsampling_filter_states, len); + len = WEBRTC_SPL_RSHIFT_W16(len, 1); + + // Do VAD on an 8 kHz signal + vad = WebRtcVad_CalcVad8khz(inst, speechNB, len); + + return vad; +} + +WebRtc_Word16 WebRtcVad_CalcVad16khz(VadInstT *inst, WebRtc_Word16 *speech_frame, + int frame_length) +{ + WebRtc_Word16 len, vad; + WebRtc_Word16 speechNB[240]; // Downsampled speech frame: 480 samples (30ms in WB) + + // Wideband: Downsample signal before doing VAD + WebRtcVad_Downsampling(speech_frame, speechNB, inst->downsampling_filter_states, + frame_length); + + len = WEBRTC_SPL_RSHIFT_W16(frame_length, 1); + vad = WebRtcVad_CalcVad8khz(inst, speechNB, len); + + return vad; +} + +WebRtc_Word16 WebRtcVad_CalcVad8khz(VadInstT *inst, WebRtc_Word16 *speech_frame, + int frame_length) +{ + WebRtc_Word16 feature_vector[NUM_CHANNELS], total_power; + + // Get power in the bands + total_power = WebRtcVad_get_features(inst, speech_frame, frame_length, feature_vector); + + // Make a VAD + inst->vad = WebRtcVad_GmmProbability(inst, feature_vector, total_power, frame_length); + + return inst->vad; +} + +// Calculate probability for both speech and background noise, and perform a +// hypothesis-test. 
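Before the fixed-point details below, the shape of the computation: for each of the six channels the model evaluates two Gaussians for noise and two for speech (via WebRtcVad_GaussianProbability, which returns (1/std)*exp(-(x-m)^2/(2*std^2)) with the 1/sqrt(2*pi) factor omitted), forms the weighted sums H0 and H1, and thresholds a log2 likelihood ratio that the fixed-point code approximates by a difference of normalization shift counts. A floating-point reference of that per-channel step, with made-up parameters standing in for the kNoiseData and kSpeechData tables:

#include <math.h>
#include <stdio.h>

/* Unnormalized Gaussian, mirroring WebRtcVad_GaussianProbability in floats. */
static double gauss(double x, double mean, double std) {
    double d = (x - mean) / std;
    return exp(-0.5 * d * d) / std;
}

int main(void) {
    double x = 10.0;  /* hypothetical feature value (log energy of one band) */
    double h0 = 0.5 * gauss(x, 8.0, 2.0) + 0.5 * gauss(x, 12.0, 3.0);   /* noise */
    double h1 = 0.6 * gauss(x, 14.0, 3.0) + 0.4 * gauss(x, 18.0, 4.0);  /* speech */
    printf("log2(H1/H0) = %.3f\n", log2(h1 / h0));
    return 0;
}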
+WebRtc_Word16 WebRtcVad_GmmProbability(VadInstT *inst, WebRtc_Word16 *feature_vector, + WebRtc_Word16 total_power, int frame_length) +{ + int n, k; + WebRtc_Word16 backval; + WebRtc_Word16 h0, h1; + WebRtc_Word16 ratvec, xval; + WebRtc_Word16 vadflag; + WebRtc_Word16 shifts0, shifts1; + WebRtc_Word16 tmp16, tmp16_1, tmp16_2; + WebRtc_Word16 diff, nr, pos; + WebRtc_Word16 nmk, nmk2, nmk3, smk, smk2, nsk, ssk; + WebRtc_Word16 delt, ndelt; + WebRtc_Word16 maxspe, maxmu; + WebRtc_Word16 deltaN[NUM_TABLE_VALUES], deltaS[NUM_TABLE_VALUES]; + WebRtc_Word16 ngprvec[NUM_TABLE_VALUES], sgprvec[NUM_TABLE_VALUES]; + WebRtc_Word32 h0test, h1test; + WebRtc_Word32 tmp32_1, tmp32_2; + WebRtc_Word32 dotVal; + WebRtc_Word32 nmid, smid; + WebRtc_Word32 probn[NUM_MODELS], probs[NUM_MODELS]; + WebRtc_Word16 *nmean1ptr, *nmean2ptr, *smean1ptr, *smean2ptr, *nstd1ptr, *nstd2ptr, + *sstd1ptr, *sstd2ptr; + WebRtc_Word16 overhead1, overhead2, individualTest, totalTest; + + // Set the thresholds to different values based on frame length + if (frame_length == 80) + { + // 80 input samples + overhead1 = inst->over_hang_max_1[0]; + overhead2 = inst->over_hang_max_2[0]; + individualTest = inst->individual[0]; + totalTest = inst->total[0]; + } else if (frame_length == 160) + { + // 160 input samples + overhead1 = inst->over_hang_max_1[1]; + overhead2 = inst->over_hang_max_2[1]; + individualTest = inst->individual[1]; + totalTest = inst->total[1]; + } else + { + // 240 input samples + overhead1 = inst->over_hang_max_1[2]; + overhead2 = inst->over_hang_max_2[2]; + individualTest = inst->individual[2]; + totalTest = inst->total[2]; + } + + if (total_power > MIN_ENERGY) + { // If signal present at all + + // Set pointers to the gaussian parameters + nmean1ptr = &inst->noise_means[0]; + nmean2ptr = &inst->noise_means[NUM_CHANNELS]; + smean1ptr = &inst->speech_means[0]; + smean2ptr = &inst->speech_means[NUM_CHANNELS]; + nstd1ptr = &inst->noise_stds[0]; + nstd2ptr = &inst->noise_stds[NUM_CHANNELS]; + sstd1ptr = &inst->speech_stds[0]; + sstd2ptr = &inst->speech_stds[NUM_CHANNELS]; + + vadflag = 0; + dotVal = 0; + for (n = 0; n < NUM_CHANNELS; n++) + { // For all channels + + pos = WEBRTC_SPL_LSHIFT_W16(n, 1); + xval = feature_vector[n]; + + // Probability for Noise, Q7 * Q20 = Q27 + tmp32_1 = WebRtcVad_GaussianProbability(xval, *nmean1ptr++, *nstd1ptr++, + &deltaN[pos]); + probn[0] = (WebRtc_Word32)(kNoiseDataWeights[n] * tmp32_1); + tmp32_1 = WebRtcVad_GaussianProbability(xval, *nmean2ptr++, *nstd2ptr++, + &deltaN[pos + 1]); + probn[1] = (WebRtc_Word32)(kNoiseDataWeights[n + NUM_CHANNELS] * tmp32_1); + h0test = probn[0] + probn[1]; // Q27 + h0 = (WebRtc_Word16)WEBRTC_SPL_RSHIFT_W32(h0test, 12); // Q15 + + // Probability for Speech + tmp32_1 = WebRtcVad_GaussianProbability(xval, *smean1ptr++, *sstd1ptr++, + &deltaS[pos]); + probs[0] = (WebRtc_Word32)(kSpeechDataWeights[n] * tmp32_1); + tmp32_1 = WebRtcVad_GaussianProbability(xval, *smean2ptr++, *sstd2ptr++, + &deltaS[pos + 1]); + probs[1] = (WebRtc_Word32)(kSpeechDataWeights[n + NUM_CHANNELS] * tmp32_1); + h1test = probs[0] + probs[1]; // Q27 + h1 = (WebRtc_Word16)WEBRTC_SPL_RSHIFT_W32(h1test, 12); // Q15 + + // Get likelihood ratio. 
Approximate log2(H1/H0) with shifts0 - shifts1 + shifts0 = WebRtcSpl_NormW32(h0test); + shifts1 = WebRtcSpl_NormW32(h1test); + + if ((h0test > 0) && (h1test > 0)) + { + ratvec = shifts0 - shifts1; + } else if (h1test > 0) + { + ratvec = 31 - shifts1; + } else if (h0test > 0) + { + ratvec = shifts0 - 31; + } else + { + ratvec = 0; + } + + // VAD decision with spectrum weighting + dotVal += WEBRTC_SPL_MUL_16_16(ratvec, kSpectrumWeight[n]); + + // Individual channel test + if ((ratvec << 2) > individualTest) + { + vadflag = 1; + } + + // Probabilities used when updating model + if (h0 > 0) + { + tmp32_1 = probn[0] & 0xFFFFF000; // Q27 + tmp32_2 = WEBRTC_SPL_LSHIFT_W32(tmp32_1, 2); // Q29 + ngprvec[pos] = (WebRtc_Word16)WebRtcSpl_DivW32W16(tmp32_2, h0); + ngprvec[pos + 1] = 16384 - ngprvec[pos]; + } else + { + ngprvec[pos] = 16384; + ngprvec[pos + 1] = 0; + } + + // Probabilities used when updating model + if (h1 > 0) + { + tmp32_1 = probs[0] & 0xFFFFF000; + tmp32_2 = WEBRTC_SPL_LSHIFT_W32(tmp32_1, 2); + sgprvec[pos] = (WebRtc_Word16)WebRtcSpl_DivW32W16(tmp32_2, h1); + sgprvec[pos + 1] = 16384 - sgprvec[pos]; + } else + { + sgprvec[pos] = 0; + sgprvec[pos + 1] = 0; + } + } + + // Overall test + if (dotVal >= totalTest) + { + vadflag |= 1; + } + + // Set pointers to the means and standard deviations. + nmean1ptr = &inst->noise_means[0]; + smean1ptr = &inst->speech_means[0]; + nstd1ptr = &inst->noise_stds[0]; + sstd1ptr = &inst->speech_stds[0]; + + maxspe = 12800; + + // Update the model's parameters + for (n = 0; n < NUM_CHANNELS; n++) + { + + pos = WEBRTC_SPL_LSHIFT_W16(n, 1); + + // Get min value in past which is used for long term correction + backval = WebRtcVad_FindMinimum(inst, feature_vector[n], n); // Q4 + + // Compute the "global" mean, that is the sum of the two means weighted + nmid = WEBRTC_SPL_MUL_16_16(kNoiseDataWeights[n], *nmean1ptr); // Q7 * Q7 + nmid += WEBRTC_SPL_MUL_16_16(kNoiseDataWeights[n+NUM_CHANNELS], + *(nmean1ptr+NUM_CHANNELS)); + tmp16_1 = (WebRtc_Word16)WEBRTC_SPL_RSHIFT_W32(nmid, 6); // Q8 + + for (k = 0; k < NUM_MODELS; k++) + { + + nr = pos + k; + + nmean2ptr = nmean1ptr + k * NUM_CHANNELS; + smean2ptr = smean1ptr + k * NUM_CHANNELS; + nstd2ptr = nstd1ptr + k * NUM_CHANNELS; + sstd2ptr = sstd1ptr + k * NUM_CHANNELS; + nmk = *nmean2ptr; + smk = *smean2ptr; + nsk = *nstd2ptr; + ssk = *sstd2ptr; + + // Update noise mean vector if the frame consists of noise only + nmk2 = nmk; + if (!vadflag) + { + // deltaN = (x-mu)/sigma^2 + // ngprvec[k] = probn[k]/(probn[0] + probn[1]) + + delt = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT(ngprvec[nr], + deltaN[nr], 11); // Q14*Q11 + nmk2 = nmk + (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT(delt, + kNoiseUpdateConst, + 22); // Q7+(Q14*Q15>>22) + } + + // Long term correction of the noise mean + ndelt = WEBRTC_SPL_LSHIFT_W16(backval, 4); + ndelt -= tmp16_1; // Q8 - Q8 + nmk3 = nmk2 + (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT(ndelt, + kBackEta, + 9); // Q7+(Q8*Q8)>>9 + + // Control that the noise mean does not drift to much + tmp16 = WEBRTC_SPL_LSHIFT_W16(k+5, 7); + if (nmk3 < tmp16) + nmk3 = tmp16; + tmp16 = WEBRTC_SPL_LSHIFT_W16(72+k-n, 7); + if (nmk3 > tmp16) + nmk3 = tmp16; + *nmean2ptr = nmk3; + + if (vadflag) + { + // Update speech mean vector: + // deltaS = (x-mu)/sigma^2 + // sgprvec[k] = probn[k]/(probn[0] + probn[1]) + + delt = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT(sgprvec[nr], + deltaS[nr], + 11); // (Q14*Q11)>>11=Q14 + tmp16 = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT(delt, + kSpeechUpdateConst, + 21) + 1; + smk2 = smk + 
(tmp16 >> 1); // Q7 + (Q14 * Q15 >> 22) + + // Control that the speech mean does not drift to much + maxmu = maxspe + 640; + if (smk2 < kMinimumMean[k]) + smk2 = kMinimumMean[k]; + if (smk2 > maxmu) + smk2 = maxmu; + + *smean2ptr = smk2; + + // (Q7>>3) = Q4 + tmp16 = WEBRTC_SPL_RSHIFT_W16((smk + 4), 3); + + tmp16 = feature_vector[n] - tmp16; // Q4 + tmp32_1 = WEBRTC_SPL_MUL_16_16_RSFT(deltaS[nr], tmp16, 3); + tmp32_2 = tmp32_1 - (WebRtc_Word32)4096; // Q12 + tmp16 = WEBRTC_SPL_RSHIFT_W16((sgprvec[nr]), 2); + tmp32_1 = (WebRtc_Word32)(tmp16 * tmp32_2);// (Q15>>3)*(Q14>>2)=Q12*Q12=Q24 + + tmp32_2 = WEBRTC_SPL_RSHIFT_W32(tmp32_1, 4); // Q20 + + // 0.1 * Q20 / Q7 = Q13 + if (tmp32_2 > 0) + tmp16 = (WebRtc_Word16)WebRtcSpl_DivW32W16(tmp32_2, ssk * 10); + else + { + tmp16 = (WebRtc_Word16)WebRtcSpl_DivW32W16(-tmp32_2, ssk * 10); + tmp16 = -tmp16; + } + // divide by 4 giving an update factor of 0.025 + tmp16 += 128; // Rounding + ssk += WEBRTC_SPL_RSHIFT_W16(tmp16, 8); + // Division with 8 plus Q7 + if (ssk < MIN_STD) + ssk = MIN_STD; + *sstd2ptr = ssk; + } else + { + // Update GMM variance vectors + // deltaN * (feature_vector[n] - nmk) - 1, Q11 * Q4 + tmp16 = feature_vector[n] - WEBRTC_SPL_RSHIFT_W16(nmk, 3); + + // (Q15>>3) * (Q14>>2) = Q12 * Q12 = Q24 + tmp32_1 = WEBRTC_SPL_MUL_16_16_RSFT(deltaN[nr], tmp16, 3) - 4096; + tmp16 = WEBRTC_SPL_RSHIFT_W16((ngprvec[nr]+2), 2); + tmp32_2 = (WebRtc_Word32)(tmp16 * tmp32_1); + tmp32_1 = WEBRTC_SPL_RSHIFT_W32(tmp32_2, 14); + // Q20 * approx 0.001 (2^-10=0.0009766) + + // Q20 / Q7 = Q13 + tmp16 = (WebRtc_Word16)WebRtcSpl_DivW32W16(tmp32_1, nsk); + if (tmp32_1 > 0) + tmp16 = (WebRtc_Word16)WebRtcSpl_DivW32W16(tmp32_1, nsk); + else + { + tmp16 = (WebRtc_Word16)WebRtcSpl_DivW32W16(-tmp32_1, nsk); + tmp16 = -tmp16; + } + tmp16 += 32; // Rounding + nsk += WEBRTC_SPL_RSHIFT_W16(tmp16, 6); + + if (nsk < MIN_STD) + nsk = MIN_STD; + + *nstd2ptr = nsk; + } + } + + // Separate models if they are too close - nmid in Q14 + nmid = WEBRTC_SPL_MUL_16_16(kNoiseDataWeights[n], *nmean1ptr); + nmid += WEBRTC_SPL_MUL_16_16(kNoiseDataWeights[n+NUM_CHANNELS], *nmean2ptr); + + // smid in Q14 + smid = WEBRTC_SPL_MUL_16_16(kSpeechDataWeights[n], *smean1ptr); + smid += WEBRTC_SPL_MUL_16_16(kSpeechDataWeights[n+NUM_CHANNELS], *smean2ptr); + + // diff = "global" speech mean - "global" noise mean + diff = (WebRtc_Word16)WEBRTC_SPL_RSHIFT_W32(smid, 9); + tmp16 = (WebRtc_Word16)WEBRTC_SPL_RSHIFT_W32(nmid, 9); + diff -= tmp16; + + if (diff < kMinimumDifference[n]) + { + + tmp16 = kMinimumDifference[n] - diff; // Q5 + + // tmp16_1 = ~0.8 * (kMinimumDifference - diff) in Q7 + // tmp16_2 = ~0.2 * (kMinimumDifference - diff) in Q7 + tmp16_1 = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT(13, tmp16, 2); + tmp16_2 = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT(3, tmp16, 2); + + // First Gauss, speech model + tmp16 = tmp16_1 + *smean1ptr; + *smean1ptr = tmp16; + smid = WEBRTC_SPL_MUL_16_16(tmp16, kSpeechDataWeights[n]); + + // Second Gauss, speech model + tmp16 = tmp16_1 + *smean2ptr; + *smean2ptr = tmp16; + smid += WEBRTC_SPL_MUL_16_16(tmp16, kSpeechDataWeights[n+NUM_CHANNELS]); + + // First Gauss, noise model + tmp16 = *nmean1ptr - tmp16_2; + *nmean1ptr = tmp16; + + nmid = WEBRTC_SPL_MUL_16_16(tmp16, kNoiseDataWeights[n]); + + // Second Gauss, noise model + tmp16 = *nmean2ptr - tmp16_2; + *nmean2ptr = tmp16; + nmid += WEBRTC_SPL_MUL_16_16(tmp16, kNoiseDataWeights[n+NUM_CHANNELS]); + } + + // Control that the speech & noise means do not drift to much + maxspe = kMaximumSpeech[n]; + tmp16_2 = 
(WebRtc_Word16)WEBRTC_SPL_RSHIFT_W32(smid, 7); + if (tmp16_2 > maxspe) + { // Upper limit of speech model + tmp16_2 -= maxspe; + + *smean1ptr -= tmp16_2; + *smean2ptr -= tmp16_2; + } + + tmp16_2 = (WebRtc_Word16)WEBRTC_SPL_RSHIFT_W32(nmid, 7); + if (tmp16_2 > kMaximumNoise[n]) + { + tmp16_2 -= kMaximumNoise[n]; + + *nmean1ptr -= tmp16_2; + *nmean2ptr -= tmp16_2; + } + + nmean1ptr++; + smean1ptr++; + nstd1ptr++; + sstd1ptr++; + } + inst->frame_counter++; + } else + { + vadflag = 0; + } + + // Hangover smoothing + if (!vadflag) + { + if (inst->over_hang > 0) + { + vadflag = 2 + inst->over_hang; + inst->over_hang = inst->over_hang - 1; + } + inst->num_of_speech = 0; + } else + { + inst->num_of_speech = inst->num_of_speech + 1; + if (inst->num_of_speech > NSP_MAX) + { + inst->num_of_speech = NSP_MAX; + inst->over_hang = overhead2; + } else + inst->over_hang = overhead1; + } + return vadflag; +} diff --git a/src/common_audio/vad/main/source/vad_core.h b/src/common_audio/vad/main/source/vad_core.h new file mode 100644 index 0000000..544caf5 --- /dev/null +++ b/src/common_audio/vad/main/source/vad_core.h @@ -0,0 +1,132 @@ +/* + * Copyright (c) 2011 The WebRTC project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + + +/* + * This header file includes the descriptions of the core VAD calls. + */ + +#ifndef WEBRTC_VAD_CORE_H_ +#define WEBRTC_VAD_CORE_H_ + +#include "typedefs.h" +#include "vad_defines.h" + +typedef struct VadInstT_ +{ + + WebRtc_Word16 vad; + WebRtc_Word32 downsampling_filter_states[4]; + WebRtc_Word16 noise_means[NUM_TABLE_VALUES]; + WebRtc_Word16 speech_means[NUM_TABLE_VALUES]; + WebRtc_Word16 noise_stds[NUM_TABLE_VALUES]; + WebRtc_Word16 speech_stds[NUM_TABLE_VALUES]; + WebRtc_Word32 frame_counter; + WebRtc_Word16 over_hang; // Over Hang + WebRtc_Word16 num_of_speech; + WebRtc_Word16 index_vector[16 * NUM_CHANNELS]; + WebRtc_Word16 low_value_vector[16 * NUM_CHANNELS]; + WebRtc_Word16 mean_value[NUM_CHANNELS]; + WebRtc_Word16 upper_state[5]; + WebRtc_Word16 lower_state[5]; + WebRtc_Word16 hp_filter_state[4]; + WebRtc_Word16 over_hang_max_1[3]; + WebRtc_Word16 over_hang_max_2[3]; + WebRtc_Word16 individual[3]; + WebRtc_Word16 total[3]; + + short init_flag; + +} VadInstT; + +/**************************************************************************** + * WebRtcVad_InitCore(...) + * + * This function initializes a VAD instance + * + * Input: + * - inst : Instance that should be initialized + * - mode : Aggressiveness degree + * 0 (High quality) - 3 (Highly aggressive) + * + * Output: + * - inst : Initialized instance + * + * Return value : 0 - Ok + * -1 - Error + */ +int WebRtcVad_InitCore(VadInstT* inst, short mode); + +/**************************************************************************** + * WebRtcVad_set_mode_core(...) + * + * This function changes the VAD settings + * + * Input: + * - inst : VAD instance + * - mode : Aggressiveness degree + * 0 (High quality) - 3 (Highly aggressive) + * + * Output: + * - inst : Changed instance + * + * Return value : 0 - Ok + * -1 - Error + */ + +int WebRtcVad_set_mode_core(VadInstT* inst, short mode); + +/**************************************************************************** + * WebRtcVad_CalcVad32khz(...) 
+ * WebRtcVad_CalcVad16khz(...) + * WebRtcVad_CalcVad8khz(...) + * + * Calculate probability for active speech and make VAD decision. + * + * Input: + * - inst : Instance that should be initialized + * - speech_frame : Input speech frame + * - frame_length : Number of input samples + * + * Output: + * - inst : Updated filter states etc. + * + * Return value : VAD decision + * 0 - No active speech + * 1-6 - Active speech + */ +WebRtc_Word16 WebRtcVad_CalcVad32khz(VadInstT* inst, WebRtc_Word16* speech_frame, + int frame_length); +WebRtc_Word16 WebRtcVad_CalcVad16khz(VadInstT* inst, WebRtc_Word16* speech_frame, + int frame_length); +WebRtc_Word16 WebRtcVad_CalcVad8khz(VadInstT* inst, WebRtc_Word16* speech_frame, + int frame_length); + +/**************************************************************************** + * WebRtcVad_GmmProbability(...) + * + * This function calculates the probabilities for background noise and + * speech using Gaussian Mixture Models. A hypothesis-test is performed to decide + * which type of signal is most probable. + * + * Input: + * - inst : Pointer to VAD instance + * - feature_vector : Feature vector = log10(energy in frequency band) + * - total_power : Total power in frame. + * - frame_length : Number of input samples + * + * Output: + * VAD decision : 0 - noise, 1 - speech + * + */ +WebRtc_Word16 WebRtcVad_GmmProbability(VadInstT* inst, WebRtc_Word16* feature_vector, + WebRtc_Word16 total_power, int frame_length); + +#endif // WEBRTC_VAD_CORE_H_ diff --git a/src/common_audio/vad/main/source/vad_defines.h b/src/common_audio/vad/main/source/vad_defines.h new file mode 100644 index 0000000..b33af2e --- /dev/null +++ b/src/common_audio/vad/main/source/vad_defines.h @@ -0,0 +1,95 @@ +/* + * Copyright (c) 2011 The WebRTC project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + + +/* + * This header file includes the macros used in VAD. 
+ */ + +#ifndef WEBRTC_VAD_DEFINES_H_ +#define WEBRTC_VAD_DEFINES_H_ + +#define NUM_CHANNELS 6 // Eight frequency bands +#define NUM_MODELS 2 // Number of Gaussian models +#define NUM_TABLE_VALUES NUM_CHANNELS * NUM_MODELS + +#define MIN_ENERGY 10 +#define ALPHA1 6553 // 0.2 in Q15 +#define ALPHA2 32439 // 0.99 in Q15 +#define NSP_MAX 6 // Maximum number of VAD=1 frames in a row counted +#define MIN_STD 384 // Minimum standard deviation +// Mode 0, Quality thresholds - Different thresholds for the different frame lengths +#define INDIVIDUAL_10MS_Q 24 +#define INDIVIDUAL_20MS_Q 21 // (log10(2)*66)<<2 ~=16 +#define INDIVIDUAL_30MS_Q 24 + +#define TOTAL_10MS_Q 57 +#define TOTAL_20MS_Q 48 +#define TOTAL_30MS_Q 57 + +#define OHMAX1_10MS_Q 8 // Max Overhang 1 +#define OHMAX2_10MS_Q 14 // Max Overhang 2 +#define OHMAX1_20MS_Q 4 // Max Overhang 1 +#define OHMAX2_20MS_Q 7 // Max Overhang 2 +#define OHMAX1_30MS_Q 3 +#define OHMAX2_30MS_Q 5 + +// Mode 1, Low bitrate thresholds - Different thresholds for the different frame lengths +#define INDIVIDUAL_10MS_LBR 37 +#define INDIVIDUAL_20MS_LBR 32 +#define INDIVIDUAL_30MS_LBR 37 + +#define TOTAL_10MS_LBR 100 +#define TOTAL_20MS_LBR 80 +#define TOTAL_30MS_LBR 100 + +#define OHMAX1_10MS_LBR 8 // Max Overhang 1 +#define OHMAX2_10MS_LBR 14 // Max Overhang 2 +#define OHMAX1_20MS_LBR 4 +#define OHMAX2_20MS_LBR 7 + +#define OHMAX1_30MS_LBR 3 +#define OHMAX2_30MS_LBR 5 + +// Mode 2, Very aggressive thresholds - Different thresholds for the different frame lengths +#define INDIVIDUAL_10MS_AGG 82 +#define INDIVIDUAL_20MS_AGG 78 +#define INDIVIDUAL_30MS_AGG 82 + +#define TOTAL_10MS_AGG 285 //580 +#define TOTAL_20MS_AGG 260 +#define TOTAL_30MS_AGG 285 + +#define OHMAX1_10MS_AGG 6 // Max Overhang 1 +#define OHMAX2_10MS_AGG 9 // Max Overhang 2 +#define OHMAX1_20MS_AGG 3 +#define OHMAX2_20MS_AGG 5 + +#define OHMAX1_30MS_AGG 2 +#define OHMAX2_30MS_AGG 3 + +// Mode 3, Super aggressive thresholds - Different thresholds for the different frame lengths +#define INDIVIDUAL_10MS_VAG 94 +#define INDIVIDUAL_20MS_VAG 94 +#define INDIVIDUAL_30MS_VAG 94 + +#define TOTAL_10MS_VAG 1100 //1700 +#define TOTAL_20MS_VAG 1050 +#define TOTAL_30MS_VAG 1100 + +#define OHMAX1_10MS_VAG 6 // Max Overhang 1 +#define OHMAX2_10MS_VAG 9 // Max Overhang 2 +#define OHMAX1_20MS_VAG 3 +#define OHMAX2_20MS_VAG 5 + +#define OHMAX1_30MS_VAG 2 +#define OHMAX2_30MS_VAG 3 + +#endif // WEBRTC_VAD_DEFINES_H_ diff --git a/src/common_audio/vad/main/source/vad_filterbank.c b/src/common_audio/vad/main/source/vad_filterbank.c new file mode 100644 index 0000000..11392c9 --- /dev/null +++ b/src/common_audio/vad/main/source/vad_filterbank.c @@ -0,0 +1,267 @@ +/* + * Copyright (c) 2011 The WebRTC project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + + +/* + * This file includes the implementation of the internal filterbank associated functions. + * For function description, see vad_filterbank.h. 
+ */ + +#include "vad_filterbank.h" +#include "vad_defines.h" +#include "vad_const.h" +#include "signal_processing_library.h" + +void WebRtcVad_HpOutput(WebRtc_Word16 *in_vector, + WebRtc_Word16 in_vector_length, + WebRtc_Word16 *out_vector, + WebRtc_Word16 *filter_state) +{ + WebRtc_Word16 i, *pi, *outPtr; + WebRtc_Word32 tmpW32; + + pi = &in_vector[0]; + outPtr = &out_vector[0]; + + // The sum of the absolute values of the impulse response: + // The zero/pole-filter has a max amplification of a single sample of: 1.4546 + // Impulse response: 0.4047 -0.6179 -0.0266 0.1993 0.1035 -0.0194 + // The all-zero section has a max amplification of a single sample of: 1.6189 + // Impulse response: 0.4047 -0.8094 0.4047 0 0 0 + // The all-pole section has a max amplification of a single sample of: 1.9931 + // Impulse response: 1.0000 0.4734 -0.1189 -0.2187 -0.0627 0.04532 + + for (i = 0; i < in_vector_length; i++) + { + // all-zero section (filter coefficients in Q14) + tmpW32 = (WebRtc_Word32)WEBRTC_SPL_MUL_16_16(kHpZeroCoefs[0], (*pi)); + tmpW32 += (WebRtc_Word32)WEBRTC_SPL_MUL_16_16(kHpZeroCoefs[1], filter_state[0]); + tmpW32 += (WebRtc_Word32)WEBRTC_SPL_MUL_16_16(kHpZeroCoefs[2], filter_state[1]); // Q14 + filter_state[1] = filter_state[0]; + filter_state[0] = *pi++; + + // all-pole section + tmpW32 -= (WebRtc_Word32)WEBRTC_SPL_MUL_16_16(kHpPoleCoefs[1], filter_state[2]); // Q14 + tmpW32 -= (WebRtc_Word32)WEBRTC_SPL_MUL_16_16(kHpPoleCoefs[2], filter_state[3]); + filter_state[3] = filter_state[2]; + filter_state[2] = (WebRtc_Word16)WEBRTC_SPL_RSHIFT_W32 (tmpW32, 14); + *outPtr++ = filter_state[2]; + } +} + +void WebRtcVad_Allpass(WebRtc_Word16 *in_vector, + WebRtc_Word16 *out_vector, + WebRtc_Word16 filter_coefficients, + int vector_length, + WebRtc_Word16 *filter_state) +{ + // The filter can only cause overflow (in the w16 output variable) + // if more than 4 consecutive input numbers are of maximum value and + // has the the same sign as the impulse responses first taps. 
+ // First 6 taps of the impulse response: 0.6399 0.5905 -0.3779 + // 0.2418 -0.1547 0.0990 + + int n; + WebRtc_Word16 tmp16; + WebRtc_Word32 tmp32, in32, state32; + + state32 = WEBRTC_SPL_LSHIFT_W32(((WebRtc_Word32)(*filter_state)), 16); // Q31 + + for (n = 0; n < vector_length; n++) + { + + tmp32 = state32 + WEBRTC_SPL_MUL_16_16(filter_coefficients, (*in_vector)); + tmp16 = (WebRtc_Word16)WEBRTC_SPL_RSHIFT_W32(tmp32, 16); + *out_vector++ = tmp16; + in32 = WEBRTC_SPL_LSHIFT_W32(((WebRtc_Word32)(*in_vector)), 14); + state32 = in32 - WEBRTC_SPL_MUL_16_16(filter_coefficients, tmp16); + state32 = WEBRTC_SPL_LSHIFT_W32(state32, 1); + in_vector += 2; + } + + *filter_state = (WebRtc_Word16)WEBRTC_SPL_RSHIFT_W32(state32, 16); +} + +void WebRtcVad_SplitFilter(WebRtc_Word16 *in_vector, + WebRtc_Word16 *out_vector_hp, + WebRtc_Word16 *out_vector_lp, + WebRtc_Word16 *upper_state, + WebRtc_Word16 *lower_state, + int in_vector_length) +{ + WebRtc_Word16 tmpOut; + int k, halflen; + + // Downsampling by 2 and get two branches + halflen = WEBRTC_SPL_RSHIFT_W16(in_vector_length, 1); + + // All-pass filtering upper branch + WebRtcVad_Allpass(&in_vector[0], out_vector_hp, kAllPassCoefsQ15[0], halflen, upper_state); + + // All-pass filtering lower branch + WebRtcVad_Allpass(&in_vector[1], out_vector_lp, kAllPassCoefsQ15[1], halflen, lower_state); + + // Make LP and HP signals + for (k = 0; k < halflen; k++) + { + tmpOut = *out_vector_hp; + *out_vector_hp++ -= *out_vector_lp; + *out_vector_lp++ += tmpOut; + } +} + +WebRtc_Word16 WebRtcVad_get_features(VadInstT *inst, + WebRtc_Word16 *in_vector, + int frame_size, + WebRtc_Word16 *out_vector) +{ + int curlen, filtno; + WebRtc_Word16 vecHP1[120], vecLP1[120]; + WebRtc_Word16 vecHP2[60], vecLP2[60]; + WebRtc_Word16 *ptin; + WebRtc_Word16 *hptout, *lptout; + WebRtc_Word16 power = 0; + + // Split at 2000 Hz and downsample + filtno = 0; + ptin = in_vector; + hptout = vecHP1; + lptout = vecLP1; + curlen = frame_size; + WebRtcVad_SplitFilter(ptin, hptout, lptout, &inst->upper_state[filtno], + &inst->lower_state[filtno], curlen); + + // Split at 3000 Hz and downsample + filtno = 1; + ptin = vecHP1; + hptout = vecHP2; + lptout = vecLP2; + curlen = WEBRTC_SPL_RSHIFT_W16(frame_size, 1); + + WebRtcVad_SplitFilter(ptin, hptout, lptout, &inst->upper_state[filtno], + &inst->lower_state[filtno], curlen); + + // Energy in 3000 Hz - 4000 Hz + curlen = WEBRTC_SPL_RSHIFT_W16(curlen, 1); + WebRtcVad_LogOfEnergy(vecHP2, &out_vector[5], &power, kOffsetVector[5], curlen); + + // Energy in 2000 Hz - 3000 Hz + WebRtcVad_LogOfEnergy(vecLP2, &out_vector[4], &power, kOffsetVector[4], curlen); + + // Split at 1000 Hz and downsample + filtno = 2; + ptin = vecLP1; + hptout = vecHP2; + lptout = vecLP2; + curlen = WEBRTC_SPL_RSHIFT_W16(frame_size, 1); + WebRtcVad_SplitFilter(ptin, hptout, lptout, &inst->upper_state[filtno], + &inst->lower_state[filtno], curlen); + + // Energy in 1000 Hz - 2000 Hz + curlen = WEBRTC_SPL_RSHIFT_W16(curlen, 1); + WebRtcVad_LogOfEnergy(vecHP2, &out_vector[3], &power, kOffsetVector[3], curlen); + + // Split at 500 Hz + filtno = 3; + ptin = vecLP2; + hptout = vecHP1; + lptout = vecLP1; + + WebRtcVad_SplitFilter(ptin, hptout, lptout, &inst->upper_state[filtno], + &inst->lower_state[filtno], curlen); + + // Energy in 500 Hz - 1000 Hz + curlen = WEBRTC_SPL_RSHIFT_W16(curlen, 1); + WebRtcVad_LogOfEnergy(vecHP1, &out_vector[2], &power, kOffsetVector[2], curlen); + // Split at 250 Hz + filtno = 4; + ptin = vecLP1; + hptout = vecHP2; + lptout = vecLP2; + + 
WebRtcVad_SplitFilter(ptin, hptout, lptout, &inst->upper_state[filtno], + &inst->lower_state[filtno], curlen); + + // Energy in 250 Hz - 500 Hz + curlen = WEBRTC_SPL_RSHIFT_W16(curlen, 1); + WebRtcVad_LogOfEnergy(vecHP2, &out_vector[1], &power, kOffsetVector[1], curlen); + + // Remove DC and LFs + WebRtcVad_HpOutput(vecLP2, curlen, vecHP1, inst->hp_filter_state); + + // Power in 80 Hz - 250 Hz + WebRtcVad_LogOfEnergy(vecHP1, &out_vector[0], &power, kOffsetVector[0], curlen); + + return power; +} + +void WebRtcVad_LogOfEnergy(WebRtc_Word16 *vector, + WebRtc_Word16 *enerlogval, + WebRtc_Word16 *power, + WebRtc_Word16 offset, + int vector_length) +{ + WebRtc_Word16 enerSum = 0; + WebRtc_Word16 zeros, frac, log2; + WebRtc_Word32 energy; + + int shfts = 0, shfts2; + + energy = WebRtcSpl_Energy(vector, vector_length, &shfts); + + if (energy > 0) + { + + shfts2 = 16 - WebRtcSpl_NormW32(energy); + shfts += shfts2; + // "shfts" is the total number of right shifts that has been done to enerSum. + enerSum = (WebRtc_Word16)WEBRTC_SPL_SHIFT_W32(energy, -shfts2); + + // Find: + // 160*log10(enerSum*2^shfts) = 160*log10(2)*log2(enerSum*2^shfts) = + // 160*log10(2)*(log2(enerSum) + log2(2^shfts)) = + // 160*log10(2)*(log2(enerSum) + shfts) + + zeros = WebRtcSpl_NormU32(enerSum); + frac = (WebRtc_Word16)(((WebRtc_UWord32)((WebRtc_Word32)(enerSum) << zeros) + & 0x7FFFFFFF) >> 21); + log2 = (WebRtc_Word16)(((31 - zeros) << 10) + frac); + + *enerlogval = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT(kLogConst, log2, 19) + + (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT(shfts, kLogConst, 9); + + if (*enerlogval < 0) + { + *enerlogval = 0; + } + } else + { + *enerlogval = 0; + shfts = -15; + enerSum = 0; + } + + *enerlogval += offset; + + // Total power in frame + if (*power <= MIN_ENERGY) + { + if (shfts > 0) + { + *power += MIN_ENERGY + 1; + } else if (WEBRTC_SPL_SHIFT_W16(enerSum, shfts) > MIN_ENERGY) + { + *power += MIN_ENERGY + 1; + } else + { + *power += WEBRTC_SPL_SHIFT_W16(enerSum, shfts); + } + } +} diff --git a/src/common_audio/vad/main/source/vad_filterbank.h b/src/common_audio/vad/main/source/vad_filterbank.h new file mode 100644 index 0000000..a5507ea --- /dev/null +++ b/src/common_audio/vad/main/source/vad_filterbank.h @@ -0,0 +1,143 @@ +/* + * Copyright (c) 2011 The WebRTC project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + + +/* + * This header file includes the description of the internal VAD call + * WebRtcVad_GaussianProbability. + */ + +#ifndef WEBRTC_VAD_FILTERBANK_H_ +#define WEBRTC_VAD_FILTERBANK_H_ + +#include "vad_core.h" + +/**************************************************************************** + * WebRtcVad_HpOutput(...) 
+ * + * This function removes DC from the lowest frequency band + * + * Input: + * - in_vector : Samples in the frequency interval 0 - 250 Hz + * - in_vector_length : Length of input and output vector + * - filter_state : Current state of the filter + * + * Output: + * - out_vector : Samples in the frequency interval 80 - 250 Hz + * - filter_state : Updated state of the filter + * + */ +void WebRtcVad_HpOutput(WebRtc_Word16* in_vector, + WebRtc_Word16 in_vector_length, + WebRtc_Word16* out_vector, + WebRtc_Word16* filter_state); + +/**************************************************************************** + * WebRtcVad_Allpass(...) + * + * This function is used when before splitting a speech file into + * different frequency bands + * + * Note! Do NOT let the arrays in_vector and out_vector correspond to the same address. + * + * Input: + * - in_vector : (Q0) + * - filter_coefficients : (Q15) + * - vector_length : Length of input and output vector + * - filter_state : Current state of the filter (Q(-1)) + * + * Output: + * - out_vector : Output speech signal (Q(-1)) + * - filter_state : Updated state of the filter (Q(-1)) + * + */ +void WebRtcVad_Allpass(WebRtc_Word16* in_vector, + WebRtc_Word16* outw16, + WebRtc_Word16 filter_coefficients, + int vector_length, + WebRtc_Word16* filter_state); + +/**************************************************************************** + * WebRtcVad_SplitFilter(...) + * + * This function is used when before splitting a speech file into + * different frequency bands + * + * Input: + * - in_vector : Input signal to be split into two frequency bands. + * - upper_state : Current state of the upper filter + * - lower_state : Current state of the lower filter + * - in_vector_length : Length of input vector + * + * Output: + * - out_vector_hp : Upper half of the spectrum + * - out_vector_lp : Lower half of the spectrum + * - upper_state : Updated state of the upper filter + * - lower_state : Updated state of the lower filter + * + */ +void WebRtcVad_SplitFilter(WebRtc_Word16* in_vector, + WebRtc_Word16* out_vector_hp, + WebRtc_Word16* out_vector_lp, + WebRtc_Word16* upper_state, + WebRtc_Word16* lower_state, + int in_vector_length); + +/**************************************************************************** + * WebRtcVad_get_features(...) + * + * This function is used to get the logarithm of the power of each of the + * 6 frequency bands used by the VAD: + * 80 Hz - 250 Hz + * 250 Hz - 500 Hz + * 500 Hz - 1000 Hz + * 1000 Hz - 2000 Hz + * 2000 Hz - 3000 Hz + * 3000 Hz - 4000 Hz + * + * Input: + * - inst : Pointer to VAD instance + * - in_vector : Input speech signal + * - frame_size : Frame size, in number of samples + * + * Output: + * - out_vector : 10*log10(power in each freq. band), Q4 + * + * Return: total power in the signal (NOTE! This value is not exact since it + * is only used in a comparison. + */ +WebRtc_Word16 WebRtcVad_get_features(VadInstT* inst, + WebRtc_Word16* in_vector, + int frame_size, + WebRtc_Word16* out_vector); + +/**************************************************************************** + * WebRtcVad_LogOfEnergy(...) + * + * This function is used to get the logarithm of the power of one frequency band. + * + * Input: + * - vector : Input speech samples for one frequency band + * - offset : Offset value for the current frequency band + * - vector_length : Length of input vector + * + * Output: + * - enerlogval : 10*log10(energy); + * - power : Update total power in speech frame. NOTE! 
This value + * is not exact since it is only used in a comparison. + * + */ +void WebRtcVad_LogOfEnergy(WebRtc_Word16* vector, + WebRtc_Word16* enerlogval, + WebRtc_Word16* power, + WebRtc_Word16 offset, + int vector_length); + +#endif // WEBRTC_VAD_FILTERBANK_H_ diff --git a/src/common_audio/vad/main/source/vad_gmm.c b/src/common_audio/vad/main/source/vad_gmm.c new file mode 100644 index 0000000..23d12fb --- /dev/null +++ b/src/common_audio/vad/main/source/vad_gmm.c @@ -0,0 +1,70 @@ +/* + * Copyright (c) 2011 The WebRTC project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + + +/* + * This file includes the implementation of the internal VAD call + * WebRtcVad_GaussianProbability. For function description, see vad_gmm.h. + */ + +#include "vad_gmm.h" +#include "signal_processing_library.h" +#include "vad_const.h" + +WebRtc_Word32 WebRtcVad_GaussianProbability(WebRtc_Word16 in_sample, + WebRtc_Word16 mean, + WebRtc_Word16 std, + WebRtc_Word16 *delta) +{ + WebRtc_Word16 tmp16, tmpDiv, tmpDiv2, expVal, tmp16_1, tmp16_2; + WebRtc_Word32 tmp32, y32; + + // Calculate tmpDiv=1/std, in Q10 + tmp32 = (WebRtc_Word32)WEBRTC_SPL_RSHIFT_W16(std,1) + (WebRtc_Word32)131072; // 1 in Q17 + tmpDiv = (WebRtc_Word16)WebRtcSpl_DivW32W16(tmp32, std); // Q17/Q7 = Q10 + + // Calculate tmpDiv2=1/std^2, in Q14 + tmp16 = WEBRTC_SPL_RSHIFT_W16(tmpDiv, 2); // From Q10 to Q8 + tmpDiv2 = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT(tmp16, tmp16, 2); // (Q8 * Q8)>>2 = Q14 + + tmp16 = WEBRTC_SPL_LSHIFT_W16(in_sample, 3); // Q7 + tmp16 = tmp16 - mean; // Q7 - Q7 = Q7 + + // To be used later, when updating noise/speech model + // delta = (x-m)/std^2, in Q11 + *delta = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT(tmpDiv2, tmp16, 10); //(Q14*Q7)>>10 = Q11 + + // Calculate tmp32=(x-m)^2/(2*std^2), in Q10 + tmp32 = (WebRtc_Word32)WEBRTC_SPL_MUL_16_16_RSFT(*delta, tmp16, 9); // One shift for /2 + + // Calculate expVal ~= exp(-(x-m)^2/(2*std^2)) ~= exp2(-log2(exp(1))*tmp32) + if (tmp32 < kCompVar) + { + // Calculate tmp16 = log2(exp(1))*tmp32 , in Q10 + tmp16 = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT((WebRtc_Word16)tmp32, + kLog10Const, 12); + tmp16 = -tmp16; + tmp16_2 = (WebRtc_Word16)(0x0400 | (tmp16 & 0x03FF)); + tmp16_1 = (WebRtc_Word16)(tmp16 ^ 0xFFFF); + tmp16 = (WebRtc_Word16)WEBRTC_SPL_RSHIFT_W16(tmp16_1, 10); + tmp16 += 1; + // Calculate expVal=log2(-tmp32), in Q10 + expVal = (WebRtc_Word16)WEBRTC_SPL_RSHIFT_W32((WebRtc_Word32)tmp16_2, tmp16); + + } else + { + expVal = 0; + } + + // Calculate y32=(1/std)*exp(-(x-m)^2/(2*std^2)), in Q20 + y32 = WEBRTC_SPL_MUL_16_16(tmpDiv, expVal); // Q10 * Q10 = Q20 + + return y32; // Q20 +} diff --git a/src/common_audio/vad/main/source/vad_gmm.h b/src/common_audio/vad/main/source/vad_gmm.h new file mode 100644 index 0000000..e0747fb --- /dev/null +++ b/src/common_audio/vad/main/source/vad_gmm.h @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2011 The WebRTC project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. 
diff --git a/src/common_audio/vad/main/source/vad_gmm.h b/src/common_audio/vad/main/source/vad_gmm.h
new file mode 100644
index 0000000..e0747fb
--- /dev/null
+++ b/src/common_audio/vad/main/source/vad_gmm.h
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2011 The WebRTC project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+/*
+ * This header file includes the description of the internal VAD call
+ * WebRtcVad_GaussianProbability.
+ */
+
+#ifndef WEBRTC_VAD_GMM_H_
+#define WEBRTC_VAD_GMM_H_
+
+#include "typedefs.h"
+
+/****************************************************************************
+ * WebRtcVad_GaussianProbability(...)
+ *
+ * This function calculates the probability that the value 'in_sample' is
+ * drawn from a normal distribution with mean 'mean' and standard
+ * deviation 'std'.
+ *
+ * Input:
+ *      - in_sample : Input sample, Q4
+ *      - mean      : Mean value in the statistical model, Q7
+ *      - std       : Standard deviation, Q7
+ *
+ * Output:
+ *      - delta     : Value used when updating the model, Q11
+ *
+ * Return:
+ *      - out       : out = 1/std * exp(-(x-m)^2/(2*std^2));
+ *                    Probability density for x, Q20.
+ *
+ */
+WebRtc_Word32 WebRtcVad_GaussianProbability(WebRtc_Word16 in_sample,
+                                            WebRtc_Word16 mean,
+                                            WebRtc_Word16 std,
+                                            WebRtc_Word16 *delta);
+
+#endif // WEBRTC_VAD_GMM_H_
diff --git a/src/common_audio/vad/main/source/vad_sp.c b/src/common_audio/vad/main/source/vad_sp.c
new file mode 100644
index 0000000..f347ab5
--- /dev/null
+++ b/src/common_audio/vad/main/source/vad_sp.c
@@ -0,0 +1,231 @@
+/*
+ * Copyright (c) 2011 The WebRTC project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+/*
+ * This file includes the implementation of the internal VAD calls
+ * WebRtcVad_Downsampling and WebRtcVad_FindMinimum.
+ * For function descriptions, see vad_sp.h.
+ */
+
+#include "vad_sp.h"
+#include "vad_defines.h"
+#include "vad_const.h"
+#include "signal_processing_library.h"
+
+// Downsampling filter based on the splitting filter and the allpass functions
+// in vad_filterbank.c
+void WebRtcVad_Downsampling(WebRtc_Word16* signal_in,
+                            WebRtc_Word16* signal_out,
+                            WebRtc_Word32* filter_state,
+                            int inlen)
+{
+    WebRtc_Word16 tmp16_1, tmp16_2;
+    WebRtc_Word32 tmp32_1, tmp32_2;
+    int n, halflen;
+
+    // Downsample by 2 and split the signal into two all-pass branches
+    halflen = WEBRTC_SPL_RSHIFT_W16(inlen, 1);
+
+    tmp32_1 = filter_state[0];
+    tmp32_2 = filter_state[1];
+
+    // Filter coefficients in Q13, filter state in Q0
+    for (n = 0; n < halflen; n++)
+    {
+        // All-pass filtering, upper branch
+        tmp16_1 = (WebRtc_Word16)WEBRTC_SPL_RSHIFT_W32(tmp32_1, 1)
+                + (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT((kAllPassCoefsQ13[0]),
+                                                           *signal_in, 14);
+        *signal_out = tmp16_1;
+        tmp32_1 = (WebRtc_Word32)(*signal_in++)
+                - (WebRtc_Word32)WEBRTC_SPL_MUL_16_16_RSFT((kAllPassCoefsQ13[0]), tmp16_1, 12);
+
+        // All-pass filtering, lower branch
+        tmp16_2 = (WebRtc_Word16)WEBRTC_SPL_RSHIFT_W32(tmp32_2, 1)
+                + (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT((kAllPassCoefsQ13[1]),
+                                                           *signal_in, 14);
+        *signal_out++ += tmp16_2;
+        tmp32_2 = (WebRtc_Word32)(*signal_in++)
+                - (WebRtc_Word32)WEBRTC_SPL_MUL_16_16_RSFT((kAllPassCoefsQ13[1]), tmp16_2, 12);
+    }
+    filter_state[0] = tmp32_1;
+    filter_state[1] = tmp32_2;
+}
+
+WebRtc_Word16 WebRtcVad_FindMinimum(VadInstT* inst,
+                                    WebRtc_Word16 x,
+                                    int n)
+{
+    int i, j, k, II = -1, offset;
+    WebRtc_Word16 meanV, alpha;
+    WebRtc_Word32 tmp32, tmp32_1;
+    WebRtc_Word16 *valptr, *idxptr, *p1, *p2, *p3;
+
+    // Offset to the beginning of the 16 minimum values in memory
+    offset = WEBRTC_SPL_LSHIFT_W16(n, 4);
+
+    // Pointers to memory for the 16 minimum values and the age of each value
+    idxptr = &inst->index_vector[offset];
+    valptr = &inst->low_value_vector[offset];
+
+    // Each value in low_value_vector becomes one frame older. Update the age
+    // of each value in index_vector, and remove values older than 100 frames.
+    for (i = 0; i < 16; i++)
+    {
+        p3 = idxptr + i;
+        if (*p3 != 100)
+        {
+            *p3 += 1;
+        } else
+        {
+            p1 = valptr + i + 1;
+            p2 = p3 + 1;
+            for (j = i; j < 16; j++)
+            {
+                *(valptr + j) = *p1++;
+                *(idxptr + j) = *p2++;
+            }
+            *(idxptr + 15) = 101;
+            *(valptr + 15) = 10000;
+        }
+    }
+
+    // Check whether x is smaller than any of the values in low_value_vector.
+    // If so, find its position.
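+    // low_value_vector is kept sorted in ascending order, so the insertion
+    // position of x is found with a 4-level binary search over indices 0..15,
+    // spelled out by the nested comparisons below.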
+    if (x < *(valptr + 7))
+    {
+        if (x < *(valptr + 3))
+        {
+            if (x < *(valptr + 1))
+            {
+                if (x < *valptr)
+                {
+                    II = 0;
+                } else
+                {
+                    II = 1;
+                }
+            } else if (x < *(valptr + 2))
+            {
+                II = 2;
+            } else
+            {
+                II = 3;
+            }
+        } else if (x < *(valptr + 5))
+        {
+            if (x < *(valptr + 4))
+            {
+                II = 4;
+            } else
+            {
+                II = 5;
+            }
+        } else if (x < *(valptr + 6))
+        {
+            II = 6;
+        } else
+        {
+            II = 7;
+        }
+    } else if (x < *(valptr + 15))
+    {
+        if (x < *(valptr + 11))
+        {
+            if (x < *(valptr + 9))
+            {
+                if (x < *(valptr + 8))
+                {
+                    II = 8;
+                } else
+                {
+                    II = 9;
+                }
+            } else if (x < *(valptr + 10))
+            {
+                II = 10;
+            } else
+            {
+                II = 11;
+            }
+        } else if (x < *(valptr + 13))
+        {
+            if (x < *(valptr + 12))
+            {
+                II = 12;
+            } else
+            {
+                II = 13;
+            }
+        } else if (x < *(valptr + 14))
+        {
+            II = 14;
+        } else
+        {
+            II = 15;
+        }
+    }
+
+    // Insert the new minimum value at the correct position and shift larger
+    // values up
+    if (II > -1)
+    {
+        for (i = 15; i > II; i--)
+        {
+            k = i - 1;
+            *(valptr + i) = *(valptr + k);
+            *(idxptr + i) = *(idxptr + k);
+        }
+        *(valptr + II) = x;
+        *(idxptr + II) = 1;
+    }
+
+    // Pick a representative minimum: the median of the five lowest values
+    // once at least three frames have been processed, otherwise the lowest
+    // value, or a default of 1600 for the very first frame.
+    meanV = 0;
+    if ((inst->frame_counter) > 4)
+    {
+        j = 5;
+    } else
+    {
+        j = inst->frame_counter;
+    }
+
+    if (j > 2)
+    {
+        meanV = *(valptr + 2);
+    } else if (j > 0)
+    {
+        meanV = *valptr;
+    } else
+    {
+        meanV = 1600;
+    }
+
+    if (inst->frame_counter > 0)
+    {
+        if (meanV < inst->mean_value[n])
+        {
+            alpha = (WebRtc_Word16)ALPHA1; // 0.2 in Q15
+        } else
+        {
+            alpha = (WebRtc_Word16)ALPHA2; // 0.99 in Q15
+        }
+    } else
+    {
+        alpha = 0;
+    }
+
+    // Smooth the tracked minimum:
+    // mean_value[n] ~= alpha*mean_value[n] + (1-alpha)*meanV, rounded, Q15
+    tmp32 = WEBRTC_SPL_MUL_16_16((alpha+1), inst->mean_value[n]);
+    tmp32_1 = WEBRTC_SPL_MUL_16_16(WEBRTC_SPL_WORD16_MAX - alpha, meanV);
+    tmp32 += tmp32_1;
+    tmp32 += 16384;
+    inst->mean_value[n] = (WebRtc_Word16)WEBRTC_SPL_RSHIFT_W32(tmp32, 15);
+
+    return inst->mean_value[n];
+}
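A minimal usage sketch for the downsampler above; the buffer names are illustrative:

    // Downsample one 10 ms, 32 kHz frame to 16 kHz. The two filter states
    // must persist across calls and start zeroed.
    WebRtc_Word16 in_32k[320];           // filled by the caller
    WebRtc_Word16 out_16k[160];
    WebRtc_Word32 ds_state[2] = {0, 0};  // upper/lower all-pass branch states

    WebRtcVad_Downsampling(in_32k, out_16k, ds_state, 320);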
diff --git a/src/common_audio/vad/main/source/vad_sp.h b/src/common_audio/vad/main/source/vad_sp.h
new file mode 100644
index 0000000..ae15c11
--- /dev/null
+++ b/src/common_audio/vad/main/source/vad_sp.h
@@ -0,0 +1,60 @@
+/*
+ * Copyright (c) 2011 The WebRTC project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+/*
+ * This header file includes the VAD internal calls for downsampling and
+ * minimum tracking. Specific function calls are given below.
+ */
+
+#ifndef WEBRTC_VAD_SP_H_
+#define WEBRTC_VAD_SP_H_
+
+#include "vad_core.h"
+
+/****************************************************************************
+ * WebRtcVad_Downsampling(...)
+ *
+ * Downsamples the signal by a factor of 2, e.g. 32 kHz -> 16 kHz or
+ * 16 kHz -> 8 kHz.
+ *
+ * Input:
+ *      - signal_in     : Input signal
+ *      - in_length     : Length of input signal in samples
+ *
+ * Input & Output:
+ *      - filter_state  : Filter states for the two all-pass branches
+ *
+ * Output:
+ *      - signal_out    : Downsampled signal (of length in_length/2)
+ */
+void WebRtcVad_Downsampling(WebRtc_Word16* signal_in,
+                            WebRtc_Word16* signal_out,
+                            WebRtc_Word32* filter_state,
+                            int in_length);
+
+/****************************************************************************
+ * WebRtcVad_FindMinimum(...)
+ *
+ * Finds the five lowest values of the feature in a 100-frame-long window and
+ * returns the median of these five values, exponentially smoothed over time.
+ *
+ * Input:
+ *      - feature_value : Feature value
+ *      - channel       : Channel number
+ *
+ * Input & Output:
+ *      - inst          : State information
+ *
+ * Output:
+ *      return value    : Weighted minimum value for a moving window.
+ */
+WebRtc_Word16 WebRtcVad_FindMinimum(VadInstT* inst, WebRtc_Word16 feature_value, int channel);
+
+#endif // WEBRTC_VAD_SP_H_
diff --git a/src/common_audio/vad/main/source/webrtc_vad.c b/src/common_audio/vad/main/source/webrtc_vad.c
new file mode 100644
index 0000000..dcfbda1
--- /dev/null
+++ b/src/common_audio/vad/main/source/webrtc_vad.c
@@ -0,0 +1,197 @@
+/*
+ * Copyright (c) 2011 The WebRTC project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+/*
+ * This file includes the VAD API calls. For a specific function description,
+ * see webrtc_vad.h.
+ */
+
+#include <stdlib.h>
+#include <string.h>
+
+#include "webrtc_vad.h"
+#include "vad_core.h"
+
+static const int kInitCheck = 42;
+
+WebRtc_Word16 WebRtcVad_get_version(char *version, size_t size_bytes)
+{
+    const char my_version[] = "VAD 1.2.0";
+
+    if (version == NULL)
+    {
+        return -1;
+    }
+
+    if (size_bytes < sizeof(my_version))
+    {
+        return -1;
+    }
+
+    memcpy(version, my_version, sizeof(my_version));
+    return 0;
+}
+
+WebRtc_Word16 WebRtcVad_AssignSize(int *size_in_bytes)
+{
+    *size_in_bytes = sizeof(VadInstT) * 2 / sizeof(WebRtc_Word16);
+    return 0;
+}
+
+WebRtc_Word16 WebRtcVad_Assign(VadInst **vad_inst, void *vad_inst_addr)
+{
+    if (vad_inst == NULL)
+    {
+        return -1;
+    }
+
+    if (vad_inst_addr != NULL)
+    {
+        *vad_inst = (VadInst*)vad_inst_addr;
+        return 0;
+    } else
+    {
+        return -1;
+    }
+}
+
+WebRtc_Word16 WebRtcVad_Create(VadInst **vad_inst)
+{
+    VadInstT *vad_ptr = NULL;
+
+    if (vad_inst == NULL)
+    {
+        return -1;
+    }
+
+    *vad_inst = NULL;
+
+    vad_ptr = (VadInstT *)malloc(sizeof(VadInstT));
+    *vad_inst = (VadInst *)vad_ptr;
+
+    if (vad_ptr == NULL)
+    {
+        return -1;
+    }
+
+    vad_ptr->init_flag = 0;
+
+    return 0;
+}
+
+WebRtc_Word16 WebRtcVad_Free(VadInst *vad_inst)
+{
+    if (vad_inst == NULL)
+    {
+        return -1;
+    }
+
+    free(vad_inst);
+    return 0;
+}
+
+WebRtc_Word16 WebRtcVad_Init(VadInst *vad_inst)
+{
+    short mode = 0; // Default high quality
+
+    if (vad_inst == NULL)
+    {
+        return -1;
+    }
+
+    return WebRtcVad_InitCore((VadInstT*)vad_inst, mode);
+}
+
+WebRtc_Word16 WebRtcVad_set_mode(VadInst *vad_inst, WebRtc_Word16 mode)
+{
+    VadInstT* vad_ptr;
+
+    if (vad_inst == NULL)
+    {
+        return -1;
+    }
+
+    vad_ptr = (VadInstT*)vad_inst;
+    if (vad_ptr->init_flag != kInitCheck)
+    {
+        return -1;
+    }
+
+    return WebRtcVad_set_mode_core((VadInstT*)vad_inst, mode);
+}
+
+WebRtc_Word16 WebRtcVad_Process(VadInst *vad_inst,
+                                WebRtc_Word16 fs,
+                                WebRtc_Word16 *speech_frame,
+                                WebRtc_Word16 frame_length)
+{
+    WebRtc_Word16 vad;
+    VadInstT* vad_ptr;
+
+    if (vad_inst == NULL)
+    {
+        return -1;
+    }
+
+    vad_ptr = (VadInstT*)vad_inst;
+    if (vad_ptr->init_flag != kInitCheck)
+    {
+        return -1;
+    }
+
+    if (speech_frame == NULL)
+    {
+        return -1;
+    }
+
+    if (fs == 32000)
+    {
+        if ((frame_length != 320) && (frame_length != 640) && (frame_length != 960))
+        {
+            return -1;
+        }
+        vad = WebRtcVad_CalcVad32khz((VadInstT*)vad_inst, speech_frame, frame_length);
+    } else if (fs == 16000)
+    {
+        if ((frame_length != 160) && (frame_length != 320) && (frame_length != 480))
+        {
+            return -1;
+        }
+        vad = WebRtcVad_CalcVad16khz((VadInstT*)vad_inst, speech_frame, frame_length);
+    } else if (fs == 8000)
+    {
+        if ((frame_length != 80) && (frame_length != 160) && (frame_length != 240))
+        {
+            return -1;
+        }
+        vad = WebRtcVad_CalcVad8khz((VadInstT*)vad_inst, speech_frame, frame_length);
+    } else
+    {
+        return -1; // Not a supported sampling frequency
+    }
+
+    if (vad > 0)
+    {
+        return 1;
+    } else if (vad == 0)
+    {
+        return 0;
+    } else
+    {
+        return -1;
+    }
+}
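A typical calling sequence for the public API above, as a self-contained sketch; the helper name classify_frame is illustrative and error handling is abbreviated:

    #include "webrtc_vad.h"

    // Classify one 10 ms frame of 8 kHz audio:
    // returns 1 = voice, 0 = no voice, -1 = error.
    int classify_frame(WebRtc_Word16 frame[80])
    {
        VadInst* vad = NULL;
        WebRtc_Word16 result = -1;

        if (WebRtcVad_Create(&vad) == 0 && WebRtcVad_Init(vad) == 0)
        {
            WebRtcVad_set_mode(vad, 0);  // 0..3 are valid; 0 is the Init default
            result = WebRtcVad_Process(vad, 8000, frame, 80);
        }
        WebRtcVad_Free(vad);
        return result;
    }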
diff --git a/src/common_audio/vad/main/test/unit_test/unit_test.cc b/src/common_audio/vad/main/test/unit_test/unit_test.cc
new file mode 100644
index 0000000..8ac793e
--- /dev/null
+++ b/src/common_audio/vad/main/test/unit_test/unit_test.cc
@@ -0,0 +1,123 @@
+/*
+ * Copyright (c) 2011 The WebRTC project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+/*
+ * This file includes the implementation of the VAD unit tests.
+ */
+
+#include <cstring>
+#include "unit_test.h"
+#include "webrtc_vad.h"
+
+
+class VadEnvironment : public ::testing::Environment {
+ public:
+  virtual void SetUp() {
+  }
+
+  virtual void TearDown() {
+  }
+};
+
+VadTest::VadTest()
+{
+}
+
+void VadTest::SetUp() {
+}
+
+void VadTest::TearDown() {
+}
+
+TEST_F(VadTest, ApiTest) {
+    VadInst *vad_inst;
+    int i, j, k;
+    short zeros[960];
+    short speech[960];
+    char version[32];
+
+    // Valid test cases
+    int fs[3] = {8000, 16000, 32000};
+    int nMode[4] = {0, 1, 2, 3};
+    int framelen[3][3] = {{80, 160, 240},
+        {160, 320, 480}, {320, 640, 960}};
+    int vad_counter = 0;
+
+    memset(zeros, 0, sizeof(short) * 960);
+    // Each byte set to 1, so each sample equals 0x0101 (= 257), plus two spikes.
+    memset(speech, 1, sizeof(short) * 960);
+    speech[13] = 1374;
+    speech[73] = -3747;
+
+    // WebRtcVad_get_version()
+    WebRtcVad_get_version(version, sizeof(version));
+    //printf("API Test for %s\n", version);
+
+    // Null instance tests
+    EXPECT_EQ(-1, WebRtcVad_Create(NULL));
+    EXPECT_EQ(-1, WebRtcVad_Init(NULL));
+    EXPECT_EQ(-1, WebRtcVad_Assign(NULL, NULL));
+    EXPECT_EQ(-1, WebRtcVad_Free(NULL));
+    EXPECT_EQ(-1, WebRtcVad_set_mode(NULL, nMode[0]));
+    EXPECT_EQ(-1, WebRtcVad_Process(NULL, fs[0], speech, framelen[0][0]));
+
+    EXPECT_EQ(0, WebRtcVad_Create(&vad_inst));
+
+    // Not initialized tests
+    EXPECT_EQ(-1, WebRtcVad_Process(vad_inst, fs[0], speech, framelen[0][0]));
+    EXPECT_EQ(-1, WebRtcVad_set_mode(vad_inst, nMode[0]));
+
+    // WebRtcVad_Init() tests
+    EXPECT_EQ(0, WebRtcVad_Init(vad_inst));
+
+    // WebRtcVad_set_mode() tests
+    EXPECT_EQ(-1, WebRtcVad_set_mode(vad_inst, -1));
+    EXPECT_EQ(-1, WebRtcVad_set_mode(vad_inst, 4));
+
+    for (i = 0; i < (int)(sizeof(nMode)/sizeof(nMode[0])); i++) {
+        EXPECT_EQ(0, WebRtcVad_set_mode(vad_inst, nMode[i]));
+    }
+
+    // WebRtcVad_Process() tests
+    EXPECT_EQ(-1, WebRtcVad_Process(vad_inst, fs[0], NULL, framelen[0][0]));
+    EXPECT_EQ(-1, WebRtcVad_Process(vad_inst, 12000, speech, framelen[0][0]));
+    EXPECT_EQ(-1, WebRtcVad_Process(vad_inst, fs[0], speech, framelen[1][1]));
+    EXPECT_EQ(0, WebRtcVad_Process(vad_inst, fs[0], zeros, framelen[0][0]));
+    for (i = 0; i < (int)(sizeof(fs)/sizeof(fs[0])); i++) {
+        for (j = 0; j < (int)(sizeof(framelen[0])/sizeof(framelen[0][0])); j++) {
+            for (k = 0; k < (int)(sizeof(nMode)/sizeof(nMode[0])); k++) {
+                EXPECT_EQ(0, WebRtcVad_set_mode(vad_inst, nMode[k]));
+//                printf("%d\n", WebRtcVad_Process(vad_inst, fs[i], speech, framelen[i][j]));
+                if (vad_counter < 9)
+                {
+                    EXPECT_EQ(1, WebRtcVad_Process(vad_inst, fs[i], speech, framelen[i][j]));
+                } else
+                {
+                    EXPECT_EQ(0, WebRtcVad_Process(vad_inst, fs[i], speech, framelen[i][j]));
+                }
+                vad_counter++;
+            }
+        }
+    }
+
+    EXPECT_EQ(0, WebRtcVad_Free(vad_inst));
+}
+
+int main(int argc, char** argv) {
+    ::testing::InitGoogleTest(&argc, argv);
+    VadEnvironment* env = new VadEnvironment;
+    ::testing::AddGlobalTestEnvironment(env);
+
+    return RUN_ALL_TESTS();
+}
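For quick reference, the valid (fs, frame_length) combinations accepted by WebRtcVad_Process() and exercised by the loops above correspond to 10, 20, and 30 ms frames:

    fs (Hz)    10 ms    20 ms    30 ms
    8000          80      160      240
    16000        160      320      480
    32000        320      640      960

Any other combination makes WebRtcVad_Process() return -1.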
diff --git a/src/common_audio/vad/main/test/unit_test/unit_test.h b/src/common_audio/vad/main/test/unit_test/unit_test.h
new file mode 100644
index 0000000..62dac11
--- /dev/null
+++ b/src/common_audio/vad/main/test/unit_test/unit_test.h
@@ -0,0 +1,28 @@
+/*
+ * Copyright (c) 2011 The WebRTC project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+/*
+ * This header file includes the declaration of the VAD unit test.
+ */
+
+#ifndef WEBRTC_VAD_UNIT_TEST_H_
+#define WEBRTC_VAD_UNIT_TEST_H_
+
+#include <gtest/gtest.h>
+
+class VadTest : public ::testing::Test {
+ protected:
+  VadTest();
+  virtual void SetUp();
+  virtual void TearDown();
+};
+
+#endif // WEBRTC_VAD_UNIT_TEST_H_