Line data Source code
1 : /******************************************************************************************************
2 :
3 : (C) 2022-2025 IVAS codec Public Collaboration with portions copyright Dolby International AB, Ericsson AB,
4 : Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD.,
5 : Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange,
6 : Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other
7 : contributors to this repository. All Rights Reserved.
8 :
9 : This software is protected by copyright law and by international treaties.
10 : The IVAS codec Public Collaboration consisting of Dolby International AB, Ericsson AB,
11 : Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD.,
12 : Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange,
13 : Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other
14 : contributors to this repository retain full ownership rights in their respective contributions in
15 : the software. This notice grants no license of any kind, including but not limited to patent
16 : license, nor is any license granted by implication, estoppel or otherwise.
17 :
18 : Contributors are required to enter into the IVAS codec Public Collaboration agreement before making
19 : contributions.
20 :
21 : This software is provided "AS IS", without any express or implied warranties. The software is in the
22 : development stage. It is intended exclusively for experts who have experience with such software and
23 : solely for the purpose of inspection. All implied warranties of non-infringement, merchantability
24 : and fitness for a particular purpose are hereby disclaimed and excluded.
25 :
26 : Any dispute, controversy or claim arising under or in relation to providing this software shall be
27 : submitted to and settled by the final, binding jurisdiction of the courts of Munich, Germany in
28 : accordance with the laws of the Federal Republic of Germany excluding its conflict of law rules and
29 : the United Nations Convention on Contracts on the International Sales of Goods.
30 :
31 : *******************************************************************************************************/
32 :
33 : /*====================================================================================
34 : EVS Codec 3GPP TS26.443 Nov 04, 2021. Version 12.14.0 / 13.10.0 / 14.6.0 / 15.4.0 / 16.3.0
35 : ====================================================================================*/
36 :
37 : #include <stdint.h>
38 : #include "options.h"
39 : #ifdef DEBUGGING
40 : #include "debug.h"
41 : #endif
42 : #include <math.h>
43 : #include "cnst.h"
44 : #include "prot.h"
45 : #include "rom_enc.h"
46 : #include "wmc_auto.h"
47 : #include "ivas_prot.h"
48 :
49 : /*-----------------------------------------------------------------*
50 : * Local constants
51 : *-----------------------------------------------------------------*/
52 :
53 : #define HANGOVER_LONG 10 /* Hangover for CNG */
54 : #define HANGOVER_LONG_HE 20 /* Hangover of CNG */
55 : #define HANGOVER_LONG_MUSIC 20 /* Hangover of CNG */
56 : #define HANGOVER_LONG_NB 8 /* Hangover for CNG */
57 : #define ACTIVE_FRAMES 3 /* Number of consecutive active SPEECH frames necessary to trigger HO */
58 :
59 : #define TH16_2 35.0f /* long-term SNR that separates the curves for clean speech and noisy speech */
60 : #define TH8_1 20.0f /* long-term SNR that separates the curves for clean speech and noisy speech */
61 : #define TH16_2_NFLAG 35.0f
62 : #define TH8_1_NFLAG 35.0f
63 :
64 :
65 : #define SNR_OUTLIER_WGHT_1 1.00f
66 : #define SNR_OUTLIER_WGHT_2 1.01f
67 : #define SNR_OUTLIER_WGHT_3 1.02f
68 : #define OUTLIER_THR_1 10.0f
69 : #define OUTLIER_THR_2 6.0f
70 : #define MAX_SNR_OUTLIER_IND 17
71 : #define MAX_SNR_OUTLIER_1 10.0f
72 : #define MAX_SNR_OUTLIER_2 25.0f
73 : #define MAX_SNR_OUTLIER_3 50.0f
74 :
75 : /*---------------------------------------------------------------------*
76 : * wb_vad_init()
77 : *
78 : * VAD initializations
79 : *---------------------------------------------------------------------*/
80 :
81 149413 : void wb_vad_init(
82 : VAD_HANDLE hVAD /* i/o: VAD data handle; every field written below is reset to its start-up value */
83 : )
84 : {
85 149413 : hVAD->hangover_cnt = 0; /* Hangover counter initialized to 0 */
86 149413 : hVAD->nb_active_frames = ACTIVE_FRAMES; /* The counter of SPEECH frames necessary to trigger HO */
87 : /* is set to max (-> start with hangover) */
88 :
89 149413 : hVAD->vad_flag_reg_H = 0L;
90 149413 : hVAD->vad_flag_reg_L = 0L;
91 149413 : hVAD->vad_prim_reg = 0L;
92 149413 : hVAD->vad_flag_cnt_50 = 0; /* presumably VAD-flag activity count over the last 50 frames (cf. thresholds in dtx_hangover_addition()) - TODO confirm */
93 149413 : hVAD->vad_prim_cnt_16 = 0; /* presumably primary-decision activity count over the last 16 frames (cf. thresholds in dtx_hangover_addition()) - TODO confirm */
94 :
95 : /* By default one should not start with a hangover */
96 149413 : hVAD->hangover_cnt_dtx = HANGOVER_LONG; /* DTX hangover counter starts at its limit, i.e. outside the DTX hangover period */
97 149413 : hVAD->hangover_cnt_music = HANGOVER_LONG_MUSIC; /* music hangover counter starts at its limit, i.e. outside the music hangover period */
98 :
99 149413 : hVAD->hangover_cnt_he = 0; /* Hangover counter initialized to 0 */
100 149413 : hVAD->nb_active_frames_he = ACTIVE_FRAMES; /* The counter of SPEECH frames necessary to trigger HO */
101 149413 : hVAD->bcg_flux = 70; /* start value of the background flux measure adapted in wb_vad() */
102 149413 : hVAD->soft_hangover = 0; /* no soft hangover frames pending */
103 149413 : hVAD->voiced_burst = 0;
104 149413 : hVAD->bcg_flux_init = 50; /* countdown of initial bcg_flux updates that use the faster smoothing factors in wb_vad() */
105 149413 : hVAD->nb_active_frames_he1 = ACTIVE_FRAMES; /* he1 active-frame counter set to max, same as nb_active_frames */
106 149413 : hVAD->hangover_cnt_he1 = 0; /* he1 hangover counter initialized to 0 */
107 :
108 149413 : hVAD->prim_act_quick = 0.0f; /* fast-smoothed localVAD activity (weight 0.2 per frame in wb_vad()) */
109 149413 : hVAD->prim_act_slow = 0.0f; /* slow-smoothed localVAD activity (weight 0.01 per frame in wb_vad()) */
110 149413 : hVAD->prim_act = 0.0f;
111 149413 : hVAD->prim_act_quick_he = 0.0f;
112 149413 : hVAD->prim_act_slow_he = 0.0f;
113 149413 : hVAD->prim_act_he = 0.0f;
114 :
115 149413 : hVAD->consec_inactive = 0;
116 149413 : hVAD->spectral_tilt_reset = 1;
117 149413 : hVAD->running_avg = 0;
118 149413 : hVAD->ra_deltasum = 0;
119 149413 : hVAD->trigger_SID = 0;
120 149413 : hVAD->snr_sum_vad = 0; /* smoothed SNR sum (0.5/0.5 recursion in wb_vad()) */
121 :
122 149413 : hVAD->hangover_terminate_flag = 0; /* no fast DTX hangover termination requested */
123 :
124 149413 : return;
125 : }
126 :
127 : /*-----------------------------------------------------------------*
128 : * sign_thr_snr_acc()
129 : *
130 : * accumulate snr_sum with significance thresholds
131 : *-----------------------------------------------------------------*/
132 :
133 673898616 : static void sign_thr_snr_acc(
134 : float *snr_sum, /* i/o: running SNR sum, updated in place */
135 : float snr, /* i : SNR of the current band */
136 : float sign_thr, /* i : significance threshold for the band SNR */
137 : float min_snr ) /* i : value accumulated instead when snr is below sign_thr */
138 : {
139 673898616 : if ( snr >= sign_thr ) /* significant band: add its SNR as is */
140 : {
141 532091144 : *snr_sum = *snr_sum + snr;
142 : }
143 : else /* insignificant band: add the fixed floor value instead */
144 : {
145 141807472 : *snr_sum = *snr_sum + min_snr;
146 : }
147 :
148 673898616 : return;
149 : }
150 :
151 : /*-----------------------------------------------------------------*
152 : * dtx_hangover_addition()
153 : *
154 : * DTX hangover control: returns the DTX activity flag (speech present or inside a hangover period)
155 : *-----------------------------------------------------------------*/
156 :
157 16551924 : int16_t dtx_hangover_addition( /* returns flag_dtx: 1 = speech present or inside DTX/music hangover, 0 otherwise */
158 : Encoder_State *st, /* i/o: encoder state structure */
159 : const int16_t vad_flag, /* i : VAD flag */
160 : const float lp_snr, /* i : input single SNR estimate */
161 : const int16_t cldfb_subtraction, /* i : number of frames subtracted from the short DTX hangover (not applied for AMR_WB_CORE) */
162 : int16_t *vad_hover_flag, /* o : VAD hangover flag; only written (set to 1) when flag_dtx != 0 and st->localVAD == 0 */
163 : VAD_HANDLE hVAD, /* i/o: VAD handle for L or R channel; NULL selects st->hVAD */
164 : NOISE_EST_HANDLE hNoiseEst, /* i : Noise estimation handle; NULL selects st->hNoiseEst */
165 : int16_t *rem_dtx_ho /* o : Expected remaining hangover frames; may be NULL, only written together with vad_hover_flag */
166 : )
167 : {
168 : int16_t hangover_short_dtx, flag_dtx;
169 : int16_t ho_limit_clean;
170 :
171 16551924 : if ( hVAD == NULL ) /* fall back to the handle stored in the encoder state */
172 : {
173 15307649 : hVAD = st->hVAD;
174 : }
175 16551924 : if ( hNoiseEst == NULL ) /* fall back to the handle stored in the encoder state */
176 : {
177 15307649 : hNoiseEst = st->hNoiseEst;
178 : }
179 :
180 16551924 : flag_dtx = 0;
181 :
182 : /* Determine initial hangover length */
183 16551924 : hangover_short_dtx = 2; /* was 1 */
184 16551924 : if ( ( lp_snr < 16.0f && st->input_bwidth != NB ) ||
185 15481865 : hVAD->prim_act_he > 0.95f )
186 : {
187 8484320 : hangover_short_dtx = 3; /* was 2 */
188 : }
189 :
190 : /* Adjust hangover according to activity history */
191 16551924 : if ( hVAD->vad_prim_cnt_16 > 12 ) /* 12 requires roughly > 80% primary activity */
192 : {
193 12003853 : hangover_short_dtx = hangover_short_dtx + 2;
194 : }
195 :
196 16551924 : if ( hVAD->vad_flag_cnt_50 > 40 ) /* 40 requires roughly > 80% flag activity */
197 : {
198 11245914 : hangover_short_dtx = hangover_short_dtx + 5;
199 : }
200 :
201 : /* Keep hangover_short lower than maximum hangover count */
202 16551924 : if ( hangover_short_dtx > HANGOVER_LONG - 1 )
203 : {
204 7741495 : hangover_short_dtx = HANGOVER_LONG - 1;
205 : }
206 :
207 : /* Only allow short HO if not sufficient active frames */
208 16551924 : ho_limit_clean = 3;
209 16551924 : if ( st->core == AMR_WB_CORE )
210 : {
211 5296 : ho_limit_clean = 2;
212 : }
213 :
214 16551924 : if ( st->input_bwidth != NB && st->core != AMR_WB_CORE && lp_snr > 25.0f )
215 : {
216 14515568 : ho_limit_clean = 2;
217 : }
218 :
219 16551924 : if ( ho_limit_clean != 0 ) /* always true here: ho_limit_clean is either 2 or 3 at this point */
220 : {
221 16551924 : if ( ( hangover_short_dtx > ho_limit_clean ) && ( ( hVAD->vad_prim_cnt_16 < 7 ) || ( lp_snr > 16 && hVAD->prim_act_he < 0.85 ) ) )
222 : {
223 3078990 : hangover_short_dtx = ho_limit_clean; /* clamp the short hangover down to the clean-speech limit */
224 : }
225 : }
226 :
227 : /* hangover adjustment from combined FFT + CLDFBVAD */
228 16551924 : if ( st->core != AMR_WB_CORE )
229 : {
230 16546628 : hangover_short_dtx = hangover_short_dtx - cldfb_subtraction;
231 16546628 : if ( hangover_short_dtx < 0 ) /* never let the hangover length go negative */
232 : {
233 7817 : hangover_short_dtx = 0;
234 : }
235 : }
236 :
237 16551924 : if ( vad_flag == 1 ) /* Speech present */
238 : {
239 13612493 : flag_dtx = 1;
240 :
241 : /* Add hangover after sufficient # of active frames or sufficient activity during last second */
242 13612493 : if ( hVAD->nb_active_frames >= ACTIVE_FRAMES || hVAD->vad_flag_cnt_50 > 45 ) /* 45 requires roughly > 90% flag activity */
243 : {
244 13287128 : hVAD->hangover_cnt_dtx = 0;
245 : }
246 :
247 : /* inside HO period */
248 13612493 : if ( hVAD->hangover_cnt_dtx < HANGOVER_LONG && hVAD->hangover_cnt_dtx != 0 )
249 : {
250 36456 : hVAD->hangover_cnt_dtx++;
251 : }
252 :
253 13612493 : hVAD->hangover_terminate_flag = 0; /* an active frame cancels any pending fast termination request */
254 :
255 : /* Music hangover when music detected */
256 13612493 : if ( hVAD->prim_act_he > 0.98f && hNoiseEst->Etot_lp > 40 && hVAD->vad_prim_cnt_16 > 14 && hVAD->vad_flag_cnt_50 > 48 )
257 : {
258 4287526 : hVAD->hangover_cnt_music = 0;
259 : }
260 :
261 : /* inside music HO period */
262 13612493 : if ( hVAD->hangover_cnt_music < HANGOVER_LONG_MUSIC && hVAD->hangover_cnt_music != 0 )
263 : {
264 61138 : hVAD->hangover_cnt_music++;
265 : }
266 : }
267 : else
268 : {
269 : /* No speech flagged: advance the DTX and music hangover counters while they are inside their hangover periods */
270 2939431 : if ( hVAD->hangover_cnt_dtx < HANGOVER_LONG ) /* inside HO period */
271 : {
272 317159 : hVAD->hangover_cnt_dtx++;
273 : }
274 :
275 2939431 : if ( hVAD->hangover_cnt_music < HANGOVER_LONG_MUSIC ) /* inside music HO period */
276 : {
277 77156 : hVAD->hangover_cnt_music++;
278 : }
279 :
280 : /* fast terminate DTX hangover if hVAD->hangover_terminate_flag is set */
281 2939431 : if ( hVAD->hangover_terminate_flag == 1 )
282 : {
283 69 : hVAD->hangover_cnt = HANGOVER_LONG;
284 69 : hVAD->hangover_cnt_dtx = HANGOVER_LONG;
285 69 : hVAD->hangover_terminate_flag = 0; /* one-shot request: clear it once handled */
286 : /* only shorten music hangover when low energy frames */
287 69 : if ( hNoiseEst->Etot_lp < 20.0f )
288 : {
289 0 : hVAD->hangover_cnt_music = HANGOVER_LONG_MUSIC;
290 : }
291 : }
292 :
293 2939431 : if ( hVAD->hangover_cnt_dtx <= hangover_short_dtx ) /* "hard" hangover */
294 : {
295 108610 : flag_dtx = 1;
296 : }
297 :
298 2939431 : if ( hVAD->hangover_cnt_music <= 15 ) /* "hard" music hangover (15 frames, below HANGOVER_LONG_MUSIC) */
299 : {
300 64187 : flag_dtx = 1;
301 : }
302 : }
303 :
304 16551924 : if ( flag_dtx != 0 && st->localVAD == 0 ) /* frame kept active although the local VAD decision is inactive */
305 : {
306 445494 : *vad_hover_flag = 1;
307 445494 : if ( rem_dtx_ho != NULL )
308 : {
309 30888 : *rem_dtx_ho = max( hangover_short_dtx - hVAD->hangover_cnt_dtx, 0 ); /* clamped at 0 */
310 : }
311 : }
312 :
313 16551924 : return flag_dtx;
314 : }
315 :
316 :
317 : /*-----------------------------------------------------------------*
318 : * wb_vad()
319 : *
320 : * Voice Activity Detector
321 : *-----------------------------------------------------------------*/
322 :
323 16884952 : int16_t wb_vad(
324 : Encoder_State *st, /* i/o: encoder state structure */
325 : const float fr_bands[], /* i : per band input energy (contains 2 vectors) */
326 : int16_t *noisy_speech_HO, /* o : SC-VBR noisy speech HO flag */
327 : int16_t *clean_speech_HO, /* o : SC-VBR clean speech HO flag */
328 : int16_t *NB_speech_HO, /* o : SC-VBR NB speech HO flag */
329 : float *snr_sum_he, /* o : Output snr_sum as weighted spectral measure */
330 : int16_t *localVAD_HE_SAD, /* o : HE_SAD decision without hangovers */
331 : int16_t *flag_noisy_speech_snr, /* o : */
332 : VAD_HANDLE hVAD, /* i/o: VAD handle */
333 : NOISE_EST_HANDLE hNoiseEst, /* i/o: Noise estimation handle */
334 : float lp_speech, /* i : long term active speech energy average */
335 : float lp_noise /* i : long term noise energy */
336 : )
337 : {
338 : int16_t i, j, flag, hangover_short;
339 : float snr[NB_BANDS], snr_sum, thr1, thr2, lp_snr, nk, nc, th_clean;
340 : const float *pt1, *pt2, *pt3;
341 : float min_snr, sign_thr;
342 : float fr_enr;
343 : float ftmp, ftmp1;
344 16884952 : float mssnr = 0;
345 : float snr_sumt;
346 : float vad_thr;
347 : int16_t hangover_hd;
348 : int16_t snr_idx;
349 : float delta1, delta2, delta3;
350 : int16_t flag_he1;
351 : float mssnr_hov;
352 : int16_t stmp;
353 : float msnr;
354 : float snr_outlier;
355 : int16_t snr_outlier_index;
356 : float accum_ener_L, accum_ener_H;
357 : float delta4;
358 16884952 : float snr18 = 1.0f, snr19 = 1.0f;
359 : int16_t nb_sig_snr;
360 : float nv;
361 : float snr_sum_HE_SAD;
362 : float sign_thr_HE_SAD, min_snr_HE_SAD;
363 : float nv_ofs;
364 : float thr1_ol;
365 : float snr_sum_ol;
366 : int16_t last_7k2_coder_type;
367 :
368 16884952 : if ( hNoiseEst == NULL )
369 : {
370 15640677 : hNoiseEst = st->hNoiseEst;
371 : }
372 :
373 16884952 : if ( hVAD == NULL )
374 : {
375 15640677 : hVAD = st->hVAD;
376 : }
377 :
378 16884952 : if ( lp_speech < -100.0f )
379 : {
380 15640677 : lp_speech = st->lp_speech;
381 : }
382 :
383 16884952 : if ( lp_noise < -100.0f )
384 : {
385 15640677 : lp_noise = st->lp_noise;
386 : }
387 :
388 : /*---------------------------------------------------------------------*
389 : * Initialization
390 : *---------------------------------------------------------------------*/
391 :
392 16884952 : snr_outlier = 0;
393 16884952 : snr_outlier_index = 0;
394 16884952 : accum_ener_L = 0;
395 16884952 : accum_ener_H = 0;
396 :
397 16884952 : if ( st->input_bwidth == NB )
398 : {
399 187433 : st->min_band = 1;
400 187433 : st->max_band = 16;
401 : }
402 : else
403 : {
404 16697519 : st->min_band = 0;
405 16697519 : st->max_band = 19;
406 : }
407 :
408 16884952 : if ( st->Opt_SC_VBR )
409 : {
410 2390 : last_7k2_coder_type = st->hSC_VBR->last_7k2_coder_type;
411 : }
412 : else
413 : {
414 16882562 : last_7k2_coder_type = -1;
415 : }
416 :
417 : /*---------------------------------------------------------------------*
418 : * set SNR thresholds depending on the input rate
419 : *---------------------------------------------------------------------*/
420 :
421 16884952 : if ( st->max_band == 19 ) /* WB input */
422 : {
423 16697519 : nk = 0.1f;
424 16697519 : nc = 16.1f;
425 16697519 : nv = 2.05f;
426 16697519 : nv_ofs = 1.65f;
427 16697519 : th_clean = TH16_2;
428 16697519 : if ( st->input_bwidth == WB )
429 : {
430 2288739 : sign_thr = 1.3f;
431 2288739 : min_snr = 0.8f;
432 : }
433 : else
434 : {
435 14408780 : sign_thr = 1.75f;
436 14408780 : min_snr = 0.25f;
437 : }
438 16697519 : sign_thr_HE_SAD = 2.5f;
439 16697519 : min_snr_HE_SAD = 0.2f;
440 : }
441 : else /* NB input */
442 : {
443 187433 : nk = 0.10f;
444 187433 : nc = 16.0f;
445 187433 : nv = 4.00f; /* Was 4.5f but trunkated to 4.00 used when converted to short */
446 187433 : nv_ofs = 1.15f;
447 187433 : th_clean = TH8_1;
448 187433 : sign_thr = 1.75f;
449 187433 : min_snr = 0.25f;
450 :
451 187433 : sign_thr_HE_SAD = 2.65f;
452 187433 : min_snr_HE_SAD = 0.05f;
453 : }
454 :
455 16884952 : hangover_short = 0;
456 :
457 :
458 16884952 : if ( st->Opt_SC_VBR )
459 : {
460 2390 : *noisy_speech_HO = 0;
461 2390 : *clean_speech_HO = 0;
462 2390 : *NB_speech_HO = 0;
463 : }
464 :
465 : /*---------------------------------------------------------------------*
466 : * compute SNR for each band & total
467 : *---------------------------------------------------------------------*/
468 :
469 16884952 : pt1 = fr_bands;
470 16884952 : pt2 = fr_bands + NB_BANDS;
471 16884952 : snr_sum = 0.0f;
472 16884952 : *snr_sum_he = 0.0f;
473 16884952 : snr_sumt = 0;
474 16884952 : mssnr_hov = 0;
475 16884952 : snr_sum_HE_SAD = 0.0f;
476 16884952 : lp_snr = lp_speech - lp_noise;
477 :
478 16884952 : if ( lp_snr > 24.0f )
479 : {
480 15026201 : snr_idx = 0;
481 : }
482 1858751 : else if ( lp_snr > 18 )
483 : {
484 554138 : snr_idx = 1;
485 : }
486 : else
487 : {
488 1304613 : snr_idx = 2;
489 : }
490 :
491 16884952 : if ( snr_idx == 0 )
492 : {
493 15026201 : stmp = 6;
494 15026201 : delta1 = 0.0f;
495 15026201 : delta2 = 0.0f;
496 15026201 : delta3 = 0.0f;
497 15026201 : delta4 = 0.0f;
498 15026201 : vad_thr = 2.4f * lp_snr - 42.2f;
499 15026201 : vad_thr = min( vad_thr, 80 );
500 : }
501 1858751 : else if ( snr_idx == 1 )
502 : {
503 554138 : stmp = 6;
504 554138 : delta1 = 0.1f;
505 554138 : delta2 = 0.2f;
506 554138 : delta3 = 0.2f;
507 554138 : delta4 = 0.2f;
508 554138 : vad_thr = 2.4f * lp_snr - 40.2f;
509 554138 : vad_thr = min( vad_thr, 80 );
510 : }
511 : else
512 : {
513 1304613 : stmp = 9;
514 1304613 : delta1 = 0.2f;
515 1304613 : delta2 = 0.4f;
516 1304613 : delta3 = 0.3f;
517 1304613 : delta4 = 0.4f;
518 1304613 : vad_thr = 2.5f * lp_snr - 10.0f;
519 1304613 : vad_thr = max( vad_thr, 1 );
520 : }
521 16884952 : pt3 = hNoiseEst->bckr;
522 16884952 : nb_sig_snr = 20;
523 :
524 353834260 : for ( i = st->min_band; i <= st->max_band; i++ )
525 : {
526 336949308 : ftmp = *pt1++;
527 336949308 : ftmp1 = *pt2++;
528 336949308 : fr_enr = ( 0.2f * hNoiseEst->enrO[i] + 0.4f * ftmp + 0.4f * ftmp1 );
529 :
530 336949308 : if ( ftmp > ftmp1 )
531 : {
532 140466833 : snr[i] = ( 0.2f * hNoiseEst->enrO[i] + 0.4f * ftmp + 0.4f * ftmp1 ) / *pt3++;
533 : }
534 : else
535 : {
536 196482475 : snr[i] = ( 0.2f * hNoiseEst->enrO[i] + 0.3f * ftmp + 0.5f * ftmp1 ) / *pt3++;
537 : }
538 :
539 336949308 : if ( snr[i] < 2.0f )
540 : {
541 70409329 : nb_sig_snr--;
542 : }
543 :
544 336949308 : if ( snr[i] < 1 )
545 : {
546 18558103 : snr[i] = 1;
547 : }
548 :
549 336949308 : snr[i] = (float) log10( snr[i] );
550 336949308 : snr_sumt += snr[i];
551 336949308 : if ( i < 2 )
552 : {
553 33582471 : ftmp = snr[i] + delta1;
554 : }
555 303366837 : else if ( i < 7 )
556 : {
557 84424760 : ftmp = snr[i] + delta2;
558 : }
559 218942077 : else if ( i < 18 )
560 : {
561 185547039 : ftmp = snr[i] + delta3;
562 : }
563 : else
564 : {
565 33395038 : ftmp = snr[i] + delta4;
566 : }
567 336949308 : ftmp1 = ftmp;
568 336949308 : if ( i < 7 )
569 : {
570 118007231 : ftmp1 = ftmp + 0.4f;
571 : }
572 336949308 : ftmp = min( ftmp, 2.0f );
573 336949308 : ftmp1 = min( ftmp1, 2.0f );
574 336949308 : msnr = 1;
575 2436898476 : for ( j = 0; j < stmp; j++ )
576 : {
577 2099949168 : msnr *= ftmp;
578 : }
579 336949308 : mssnr += msnr;
580 336949308 : if ( i == 18 )
581 : {
582 16697519 : snr18 = msnr;
583 : }
584 320251789 : else if ( i == 19 )
585 : {
586 16697519 : snr19 = msnr;
587 : }
588 336949308 : msnr = 1;
589 2436898476 : for ( j = 0; j < stmp; j++ )
590 : {
591 2099949168 : msnr *= ftmp1;
592 : }
593 336949308 : mssnr_hov += msnr;
594 336949308 : snr[i] = fr_enr / hNoiseEst->bckr[i];
595 :
596 336949308 : sign_thr_snr_acc( &snr_sum_HE_SAD, snr[i], sign_thr_HE_SAD, min_snr_HE_SAD );
597 336949308 : sign_thr_snr_acc( &snr_sum, snr[i], sign_thr, min_snr );
598 :
599 : /* To make snr[] compatible with older versions where snr[i] >= 1
600 : also this could be removed if this no longer is a requriement */
601 336949308 : if ( snr[i] < 1.0f )
602 : {
603 18816037 : snr[i] = 1.0f;
604 : }
605 : /* accumulate background noise energy in bands [0-2] and in bands [3-19]*/
606 336949308 : if ( i < 3 )
607 : {
608 50467423 : accum_ener_L = accum_ener_L + hNoiseEst->bckr[i];
609 : }
610 : else
611 : {
612 286481885 : accum_ener_H = accum_ener_H + hNoiseEst->bckr[i];
613 : }
614 :
615 : /* identify the outlier band */
616 336949308 : if ( snr[i] > snr_outlier )
617 : {
618 53074525 : snr_outlier = snr[i];
619 53074525 : snr_outlier_index = i;
620 : }
621 : }
622 :
623 16884952 : if ( ( st->max_band == 19 ) && ( snr[18] > 5.0f ) && ( snr[19] > 5.0f ) )
624 : {
625 12053641 : ftmp = ( mssnr + 3 * ( snr18 + snr19 ) ) * 0.77f;
626 12053641 : if ( ftmp > mssnr )
627 : {
628 10521671 : mssnr = ftmp;
629 : }
630 : }
631 4831311 : else if ( snr_idx != 0 && nb_sig_snr > 13 )
632 : {
633 342757 : if ( 2.5f * lp_snr - 15.5f > 0 )
634 : {
635 327867 : mssnr += 2.5f * lp_snr - 15.5f;
636 : }
637 : }
638 :
639 :
640 : /* Separate SNR_SUM modification to */
641 16884952 : snr_sum_ol = snr_sum;
642 16884952 : if ( st->max_band == 19 && snr_outlier < MAX_SNR_OUTLIER_3 && snr_outlier_index > 3 && snr_outlier_index < MAX_SNR_OUTLIER_IND ) /* Update the total SNR only for WB signals */
643 : {
644 690637 : if ( ( accum_ener_L > OUTLIER_THR_1 * accum_ener_H ) || ( snr_outlier < MAX_SNR_OUTLIER_1 ) )
645 : {
646 449125 : snr_sum_ol = SNR_OUTLIER_WGHT_1 * ( snr_sum_ol - snr_outlier );
647 : }
648 241512 : else if ( ( accum_ener_L > OUTLIER_THR_2 * accum_ener_H ) || ( snr_outlier < MAX_SNR_OUTLIER_2 ) )
649 : {
650 143851 : snr_sum_ol = SNR_OUTLIER_WGHT_2 * ( snr_sum_ol - snr_outlier );
651 : }
652 : else
653 : {
654 97661 : snr_sum_ol = SNR_OUTLIER_WGHT_3 * ( snr_sum_ol - snr_outlier );
655 : }
656 : }
657 :
658 16884952 : hVAD->snr_sum_vad = 0.5f * hVAD->snr_sum_vad + 0.5f * snr_sum_ol;
659 :
660 16884952 : snr_sum_ol = 10.0f * (float) log10( snr_sum_ol );
661 16884952 : snr_sum = snr_sum_ol; /* for NB no outlier modification */
662 :
663 16884952 : snr_sum_HE_SAD = 10.0f * (float) log10( snr_sum_HE_SAD );
664 16884952 : *snr_sum_he = snr_sum_HE_SAD;
665 :
666 : /*---------------------------------------------------------------------*
667 : * compute threshold for VAD decision
668 : *---------------------------------------------------------------------*/
669 :
670 16884952 : lp_snr = lp_speech - lp_noise; /* long-term SNR */
671 :
672 16884952 : if ( lp_snr < hNoiseEst->sign_dyn_lp )
673 : {
674 4188543 : lp_snr += 1;
675 :
676 4188543 : if ( lp_snr > hNoiseEst->sign_dyn_lp )
677 : {
678 417509 : lp_snr = hNoiseEst->sign_dyn_lp;
679 : }
680 : }
681 :
682 16884952 : thr1 = nk * lp_snr + nc + nv * ( hNoiseEst->Etot_v_h2 - nv_ofs ); /* threshold as a linear function of long-term SNR */
683 :
684 16884952 : if ( st->element_mode > EVS_MONO && hNoiseEst->first_noise_updt_cnt < 100 )
685 : {
686 : /* lower threshold during warmup time */
687 4338156 : thr1 -= 10.0f;
688 4338156 : vad_thr = 0.f;
689 : }
690 :
691 16884952 : if ( lp_snr > 20.0f )
692 : {
693 15445455 : if ( st->element_mode == EVS_MONO || hNoiseEst->first_noise_updt_cnt >= 100 )
694 : {
695 : /* increase the threshold when SNR is high */
696 11135153 : thr1 = thr1 + 0.3f * ( lp_snr - 20.0f );
697 11135153 : if ( st->max_band == 16 && lp_snr > 40 && thr1 > 24.1f && lp_speech < 45.0f )
698 : {
699 73724 : thr1 = 24.1f;
700 : }
701 : }
702 : }
703 :
704 : /*---------------------------------------------------------------------*
705 : * WB input
706 : * SNR threshold computing
707 : * Hangover control & final VAD decision
708 : *---------------------------------------------------------------------*/
709 :
710 16884952 : if ( st->input_bwidth != NB )
711 : {
712 : /* Outlier Detection first calculates thr1_ol and snr_sum_ol instead of
713 : thr1 and snr_sum */
714 :
715 16697519 : thr1_ol = thr1;
716 16697519 : if ( lp_snr < th_clean )
717 : {
718 3038498 : hangover_short = 4;
719 3038498 : if ( ( snr_outlier_index <= 4 && ( st->last_coder_type > UNVOICED ) && !st->Opt_SC_VBR ) ||
720 42035 : ( snr_outlier_index <= 4 && ( last_7k2_coder_type > UNVOICED ) && st->Opt_SC_VBR ) )
721 : {
722 910819 : thr1_ol = thr1 - 1.0f;
723 910819 : snr_sum_ol = 10.0f * (float) log10( hVAD->snr_sum_vad );
724 : }
725 2127679 : else if ( ( ( st->last_coder_type <= UNVOICED ) && ( snr_outlier < MAX_SNR_OUTLIER_2 ) && !st->Opt_SC_VBR ) || ( ( last_7k2_coder_type <= UNVOICED ) && ( snr_outlier < MAX_SNR_OUTLIER_2 ) && st->Opt_SC_VBR ) )
726 : {
727 59771 : thr1_ol = thr1 + (float) ( 1.0f - 0.04f * snr_outlier );
728 : }
729 : else
730 : {
731 2067908 : thr1_ol = thr1 + max( 0, (float) ( 0.6f - 0.01f * snr_outlier ) );
732 : }
733 : }
734 : else
735 : {
736 13659021 : if ( st->Opt_SC_VBR )
737 : {
738 1230 : hangover_short = 3;
739 : }
740 : else
741 : {
742 13657791 : hangover_short = 3;
743 : }
744 : }
745 :
746 : /* The use of outlier detection had been removed by accident at some point */
747 16697519 : snr_sum = snr_sum_ol;
748 16697519 : thr1 = thr1_ol;
749 :
750 : /* DTX HANGOVER ADDITION MOVED TO pre_proc() */
751 :
752 16697519 : flag_he1 = 0;
753 16697519 : st->localVAD = 0;
754 16697519 : if ( mssnr > vad_thr )
755 : {
756 13669093 : st->localVAD = 1; /* he1 primary decision */
757 13669093 : flag_he1 = 1;
758 13669093 : hVAD->nb_active_frames_he1++; /* Counter of consecutive active speech frames */
759 13669093 : if ( hVAD->nb_active_frames_he1 >= ACTIVE_FRAMES )
760 : {
761 13475338 : hVAD->nb_active_frames_he1 = ACTIVE_FRAMES;
762 13475338 : hVAD->hangover_cnt_he1 = 0; /* Reset the counter of hangover frames after at least "active_frames" speech frames */
763 : }
764 : /* inside HO period */
765 13669093 : if ( hVAD->hangover_cnt_he1 < HANGOVER_LONG_HE && hVAD->hangover_cnt_he1 != 0 )
766 : {
767 188614 : hVAD->hangover_cnt_he1++;
768 : }
769 :
770 13669093 : if ( hVAD->soft_hangover > 0 )
771 : {
772 3774866 : hVAD->soft_hangover--;
773 : }
774 : }
775 : else
776 : {
777 : /* Reset the counter of speech frames necessary to start hangover algorithm */
778 3028426 : hVAD->nb_active_frames_he1 = 0;
779 : }
780 :
781 16697519 : if ( hVAD->voiced_burst > 3 )
782 : {
783 3825442 : if ( hVAD->bcg_flux < 40 )
784 : {
785 2792820 : hVAD->soft_hangover = hangover_sf_tbl[snr_idx + 3];
786 : }
787 : else
788 : {
789 1032622 : hVAD->soft_hangover = hangover_sf_tbl[snr_idx];
790 : }
791 : }
792 :
793 :
794 16697519 : hangover_hd = hangover_hd_tbl[snr_idx];
795 :
796 16697519 : if ( hVAD->bcg_flux < 40 )
797 : {
798 10362737 : hangover_hd = ( hangover_hd >> 1 ) + 1;
799 : }
800 :
801 16697519 : if ( flag_he1 == 0 && hVAD->soft_hangover > 0 )
802 : {
803 42780 : if ( mssnr_hov > vad_thr )
804 : {
805 16348 : flag_he1 = 1;
806 16348 : hVAD->soft_hangover--;
807 : }
808 : else
809 : {
810 26432 : hVAD->soft_hangover = 0;
811 : }
812 :
813 42780 : if ( hVAD->soft_hangover < 0 )
814 : {
815 0 : hVAD->soft_hangover = 0;
816 : }
817 : }
818 :
819 16697519 : if ( flag_he1 == 0 && hVAD->hangover_cnt_he1 < hangover_hd && hVAD->soft_hangover == 0 )
820 : {
821 131413 : flag_he1 = 1;
822 131413 : hVAD->hangover_cnt_he1++;
823 : }
824 :
825 : /* Calculate background stationarity */
826 16697519 : if ( flag_he1 == 0 && hNoiseEst->first_noise_updt > 0 )
827 : {
828 2776117 : if ( snr_sumt > hVAD->bcg_flux )
829 : {
830 21965 : if ( hVAD->bcg_flux_init-- > 0 )
831 : {
832 3 : if ( snr_sumt > hVAD->bcg_flux + 50 )
833 : {
834 0 : hVAD->bcg_flux = 0.9f * hVAD->bcg_flux + ( 1 - 0.9f ) * ( hVAD->bcg_flux + 50 );
835 : }
836 : else
837 : {
838 3 : hVAD->bcg_flux = 0.9f * hVAD->bcg_flux + ( 1 - 0.9f ) * snr_sumt;
839 : }
840 : }
841 : else
842 : {
843 21962 : if ( snr_sumt > hVAD->bcg_flux + 10 )
844 : {
845 1545 : hVAD->bcg_flux = 0.99f * hVAD->bcg_flux + ( 1 - 0.99f ) * ( hVAD->bcg_flux + 10 );
846 : }
847 : else
848 : {
849 20417 : hVAD->bcg_flux = 0.99f * hVAD->bcg_flux + ( 1 - 0.99f ) * snr_sumt;
850 : }
851 : }
852 : }
853 : else
854 : {
855 2754152 : if ( hVAD->bcg_flux_init-- > 0 )
856 : {
857 271131 : if ( snr_sumt < hVAD->bcg_flux - 30 )
858 : {
859 187825 : hVAD->bcg_flux = 0.95f * hVAD->bcg_flux + ( 1 - 0.95f ) * ( hVAD->bcg_flux - 30 );
860 : }
861 : else
862 : {
863 83306 : hVAD->bcg_flux = 0.95f * hVAD->bcg_flux + ( 1 - 0.95f ) * snr_sumt;
864 : }
865 : }
866 : else
867 : {
868 2483021 : if ( snr_sumt < hVAD->bcg_flux - 10 )
869 : {
870 82258 : hVAD->bcg_flux = 0.9992f * hVAD->bcg_flux + ( 1 - 0.9992f ) * ( hVAD->bcg_flux - 10 );
871 : }
872 : else
873 : {
874 2400763 : hVAD->bcg_flux = 0.9992f * hVAD->bcg_flux + ( 1 - 0.9992f ) * snr_sumt;
875 : }
876 : }
877 : }
878 :
879 2776117 : if ( hVAD->bcg_flux_init < 0 )
880 : {
881 2504983 : hVAD->bcg_flux_init = 0;
882 : }
883 : }
884 :
885 16697519 : flag = 0;
886 16697519 : st->localVAD = 0;
887 :
888 16697519 : if ( ( snr_sum > thr1 && flag_he1 == 1 ) ) /* Speech present */
889 : {
890 13421884 : flag = 1;
891 13421884 : st->localVAD = 1;
892 13421884 : hVAD->nb_active_frames++; /* Counter of consecutive active speech frames */
893 13421884 : if ( hVAD->nb_active_frames >= ACTIVE_FRAMES )
894 : {
895 13203442 : hVAD->nb_active_frames = ACTIVE_FRAMES;
896 13203442 : hVAD->hangover_cnt = 0; /* Reset the counter of hangover frames after at least "active_frames" speech frames */
897 : }
898 :
899 : /* inside HO period */
900 13421884 : if ( hVAD->hangover_cnt < HANGOVER_LONG && hVAD->hangover_cnt != 0 )
901 : {
902 141376 : hVAD->hangover_cnt++;
903 : }
904 : }
905 : else
906 : {
907 : /* Reset the counter of speech frames necessary to start hangover algorithm */
908 3275635 : hVAD->nb_active_frames = 0;
909 3275635 : if ( hVAD->hangover_cnt < HANGOVER_LONG ) /* inside HO period */
910 : {
911 612054 : hVAD->hangover_cnt++;
912 : }
913 :
914 3275635 : if ( hVAD->hangover_cnt <= hangover_short ) /* "hard" hangover */
915 : {
916 : /* send the extra 3 HO frames to NELP */
917 295334 : if ( st->element_mode == EVS_MONO && ( lp_snr < th_clean ) && ( st->Opt_SC_VBR ) && ( hVAD->hangover_cnt >= 2 ) )
918 : {
919 0 : *noisy_speech_HO = 1;
920 : }
921 :
922 295334 : if ( st->element_mode == EVS_MONO && ( lp_snr >= th_clean ) && ( st->Opt_SC_VBR ) && ( hVAD->hangover_cnt >= 2 ) )
923 : {
924 1 : *clean_speech_HO = 1;
925 : }
926 :
927 295334 : flag = 1;
928 : }
929 : }
930 :
931 : /* localVAD and vad_flag for HE-SAD - in parallel with normal localVAD and vad_flag */
932 16697519 : *localVAD_HE_SAD = 0;
933 16697519 : if ( snr_sum_HE_SAD > thr1 && ( flag_he1 == 1 ) ) /* Speech present */
934 : {
935 13383276 : *localVAD_HE_SAD = 1;
936 : }
937 : }
938 :
939 : /*---------------------------------------------------------------------*
940 : * NB input
941 : * SNR threshold computing
942 : * Hangover control & final VAD decision
943 : *---------------------------------------------------------------------*/
944 :
945 : else /* NB input */
946 : {
947 : /* Add localVAD_HE_SAD also for NB operation for use with speech music classifier */
948 187433 : *localVAD_HE_SAD = 0;
949 187433 : if ( snr_sum_HE_SAD > thr1 )
950 : {
951 151955 : *localVAD_HE_SAD = 1;
952 : }
953 :
954 187433 : st->localVAD = 0; /* init needed in NB, otherwise it can be undefined */
955 187433 : if ( snr_sum > thr1 ) /* Speech present */
956 : {
957 152224 : hVAD->nb_active_frames++; /* Counter of consecutive active speech frames */
958 152224 : if ( hVAD->nb_active_frames >= ACTIVE_FRAMES )
959 : {
960 146682 : hVAD->nb_active_frames = ACTIVE_FRAMES;
961 146682 : hVAD->hangover_cnt = 0; /* Reset the counter of hangover frames after at least "active_frames" speech frames */
962 : }
963 :
964 152224 : st->localVAD = 1;
965 : }
966 : else
967 : {
968 35209 : hVAD->nb_active_frames = 0; /* Reset the counter of speech frames necessary to start hangover algorithm */
969 : }
970 :
971 187433 : if ( hVAD->hangover_cnt < HANGOVER_LONG_NB )
972 : {
973 158551 : hVAD->hangover_cnt++;
974 158551 : if ( lp_snr < 19.0f ) /* very low SNR */
975 : {
976 1258 : thr1 -= 5.2f;
977 : }
978 157293 : else if ( lp_snr < 35.0f ) /* low SNR */
979 : {
980 51435 : thr1 -= 2.0f;
981 : }
982 : }
983 :
984 187433 : if ( st->Opt_DTX_ON )
985 : {
986 44622 : if ( lp_snr < th_clean )
987 : {
988 1563 : thr2 = thr1 - 1.10f;
989 : }
990 : else
991 : {
992 43059 : thr2 = thr1 - 1.5f;
993 : }
994 : }
995 : else
996 : {
997 142811 : if ( lp_snr < th_clean )
998 : {
999 2136 : thr2 = thr1 - 1.3f;
1000 : }
1001 : else
1002 : {
1003 140675 : thr2 = thr1 - 1.5f;
1004 : }
1005 : }
1006 :
1007 187433 : flag = 0;
1008 187433 : if ( snr_sum > thr1 ) /* Speech present */
1009 : {
1010 154111 : flag = 1;
1011 : }
1012 :
1013 187433 : if ( ( snr_sum < thr1 ) && ( snr_sum > thr2 ) ) /* Speech present */
1014 : {
1015 3637 : flag = 1;
1016 3637 : st->localVAD = 0;
1017 :
1018 3637 : if ( st->element_mode == EVS_MONO )
1019 : {
1020 52 : *NB_speech_HO = 1;
1021 : }
1022 : }
1023 :
1024 : /* Need to handle the case when switching from WB -> NB */
1025 : }
1026 :
1027 16884952 : if ( st->input_bwidth != NB )
1028 : {
1029 16697519 : *flag_noisy_speech_snr = ( lp_snr < TH16_2_NFLAG ); /*original threshold: 35dB*/
1030 : }
1031 : else
1032 : {
1033 187433 : *flag_noisy_speech_snr = ( lp_snr < TH8_1_NFLAG ); /*original threshold: 20dB, not yet tested!*/
1034 : }
1035 :
1036 : /* SC-VBR */
1037 16884952 : if ( st->hSC_VBR != NULL )
1038 : {
1039 102214 : st->hSC_VBR->vadsnr = snr_sum;
1040 102214 : st->hSC_VBR->vadnoise = thr1;
1041 : }
1042 :
1043 : /* Updates */
1044 16884952 : hVAD->prim_act_quick = 0.2f * ( st->localVAD ) + ( 1.0f - 0.2f ) * hVAD->prim_act_quick;
1045 16884952 : hVAD->prim_act_slow = 0.01f * ( st->localVAD ) + ( 1.0f - 0.01f ) * hVAD->prim_act_slow;
1046 16884952 : if ( hVAD->prim_act_quick <= hVAD->prim_act_slow )
1047 : {
1048 3541039 : hVAD->prim_act = 0.1f * hVAD->prim_act_quick + ( 1.0f - 0.1f ) * hVAD->prim_act;
1049 : }
1050 : else
1051 : {
1052 13343913 : hVAD->prim_act = 0.1f * hVAD->prim_act_slow + ( 1.0f - 0.1f ) * hVAD->prim_act;
1053 : }
1054 :
1055 16884952 : hVAD->prim_act_quick_he = 0.2f * *localVAD_HE_SAD + ( 1.0f - 0.2f ) * hVAD->prim_act_quick_he;
1056 16884952 : hVAD->prim_act_slow_he = 0.01f * *localVAD_HE_SAD + ( 1.0f - 0.01f ) * hVAD->prim_act_slow_he;
1057 :
1058 16884952 : if ( hVAD->prim_act_quick_he <= hVAD->prim_act_slow_he )
1059 : {
1060 3577573 : hVAD->prim_act_he = 0.1f * hVAD->prim_act_quick_he + ( 1.0f - 0.1f ) * hVAD->prim_act_he;
1061 : }
1062 : else
1063 : {
1064 13307379 : hVAD->prim_act_he = 0.1f * hVAD->prim_act_slow_he + ( 1.0f - 0.1f ) * hVAD->prim_act_he;
1065 : }
1066 :
1067 :
1068 16884952 : if ( ( hVAD->vad_flag_reg_H & (int32_t) 0x40000L ) != 0 ) /* 0x40000L = 0x01L << 18 */
1069 : {
1070 12005657 : hVAD->vad_flag_cnt_50 = hVAD->vad_flag_cnt_50 - 1;
1071 : }
1072 :
1073 16884952 : hVAD->vad_flag_reg_H = ( hVAD->vad_flag_reg_H & (int32_t) 0x3fffffffL ) << 1;
1074 :
1075 16884952 : if ( ( hVAD->vad_flag_reg_L & (int32_t) 0x40000000L ) != 0 )
1076 : {
1077 12348635 : hVAD->vad_flag_reg_H = hVAD->vad_flag_reg_H | 0x01L;
1078 : }
1079 :
1080 16884952 : hVAD->vad_flag_reg_L = ( hVAD->vad_flag_reg_L & (int32_t) 0x3fffffffL ) << 1;
1081 :
1082 16884952 : if ( flag ) /* should not include the extra DTX hangover */
1083 : {
1084 13874966 : hVAD->vad_flag_reg_L = hVAD->vad_flag_reg_L | 0x01L;
1085 13874966 : hVAD->vad_flag_cnt_50 = hVAD->vad_flag_cnt_50 + 1;
1086 : }
1087 :
1088 16884952 : if ( ( hVAD->vad_prim_reg & (int32_t) 0x8000L ) != 0 ) /* 0x8000L = 1L << 15 */
1089 : {
1090 12534822 : hVAD->vad_prim_cnt_16 = hVAD->vad_prim_cnt_16 - 1;
1091 : }
1092 :
1093 16884952 : hVAD->vad_prim_reg = ( hVAD->vad_prim_reg & (int32_t) 0x3fffffffL ) << 1;
1094 :
1095 16884952 : if ( st->localVAD )
1096 : {
1097 13574108 : hVAD->vad_prim_reg = hVAD->vad_prim_reg | 0x01L;
1098 13574108 : hVAD->vad_prim_cnt_16 = hVAD->vad_prim_cnt_16 + 1;
1099 : }
1100 :
1101 16884952 : return flag;
1102 : }
|