Line data Source code
1 : /******************************************************************************************************
2 :
3 : (C) 2022-2025 IVAS codec Public Collaboration with portions copyright Dolby International AB, Ericsson AB,
4 : Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD.,
5 : Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange,
6 : Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other
7 : contributors to this repository. All Rights Reserved.
8 :
9 : This software is protected by copyright law and by international treaties.
10 : The IVAS codec Public Collaboration consisting of Dolby International AB, Ericsson AB,
11 : Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD.,
12 : Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange,
13 : Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other
14 : contributors to this repository retain full ownership rights in their respective contributions in
15 : the software. This notice grants no license of any kind, including but not limited to patent
16 : license, nor is any license granted by implication, estoppel or otherwise.
17 :
18 : Contributors are required to enter into the IVAS codec Public Collaboration agreement before making
19 : contributions.
20 :
21 : This software is provided "AS IS", without any express or implied warranties. The software is in the
22 : development stage. It is intended exclusively for experts who have experience with such software and
23 : solely for the purpose of inspection. All implied warranties of non-infringement, merchantability
24 : and fitness for a particular purpose are hereby disclaimed and excluded.
25 :
26 : Any dispute, controversy or claim arising under or in relation to providing this software shall be
27 : submitted to and settled by the final, binding jurisdiction of the courts of Munich, Germany in
28 : accordance with the laws of the Federal Republic of Germany excluding its conflict of law rules and
29 : the United Nations Convention on Contracts on the International Sales of Goods.
30 :
31 : *******************************************************************************************************/
32 :
33 : /*====================================================================================
34 : EVS Codec 3GPP TS26.443 Nov 04, 2021. Version 12.14.0 / 13.10.0 / 14.6.0 / 15.4.0 / 16.3.0
35 : ====================================================================================*/
36 :
37 : #include <stdint.h>
38 : #include "options.h"
39 : #ifdef DEBUGGING
40 : #include "debug.h"
41 : #endif
42 : #include <math.h>
43 : #include "cnst.h"
44 : #include "prot.h"
45 : #include "rom_enc.h"
46 : #include "wmc_auto.h"
47 : #include "ivas_prot.h"
48 :
49 : /*-----------------------------------------------------------------*
50 : * Local constants
51 : *-----------------------------------------------------------------*/
52 :
53 : #define HANGOVER_LONG 10 /* Hangover for CNG */
54 : #define HANGOVER_LONG_HE 20 /* Hangover of CNG */
55 : #define HANGOVER_LONG_MUSIC 20 /* Hangover of CNG */
56 : #define HANGOVER_LONG_NB 8 /* Hangover for CNG */
57 : #define ACTIVE_FRAMES 3 /* Number of consecutive active SPEECH frames necessary to trigger HO */
58 :
59 : #define TH16_2 35.0f /* long-term SNR that separates the curves for clean speech and noisy speech */
60 : #define TH8_1 20.0f /* long-term SNR that separates the curves for clean speech and noisy speech */
61 : #define TH16_2_NFLAG 35.0f
62 : #define TH8_1_NFLAG 35.0f
63 :
64 :
65 : #define SNR_OUTLIER_WGHT_1 1.00f
66 : #define SNR_OUTLIER_WGHT_2 1.01f
67 : #define SNR_OUTLIER_WGHT_3 1.02f
68 : #define OUTLIER_THR_1 10.0f
69 : #define OUTLIER_THR_2 6.0f
70 : #define MAX_SNR_OUTLIER_IND 17
71 : #define MAX_SNR_OUTLIER_1 10.0f
72 : #define MAX_SNR_OUTLIER_2 25.0f
73 : #define MAX_SNR_OUTLIER_3 50.0f
74 :
75 : /*---------------------------------------------------------------------*
76 : * wb_vad_init()
77 : *
78 : * VAD initializations
79 : *---------------------------------------------------------------------*/
80 :
81 149409 : void wb_vad_init(
82 : VAD_HANDLE hVAD /* i/o: VAD data handle */
83 : )
84 : {
85 149409 : hVAD->hangover_cnt = 0; /* Hangover counter initialized to 0 */
86 149409 : hVAD->nb_active_frames = ACTIVE_FRAMES; /* The counter of SPEECH frames necessary to trigger HO */
87 : /* is set to max (-> start with hangover) */
88 :
89 149409 : hVAD->vad_flag_reg_H = 0L;
90 149409 : hVAD->vad_flag_reg_L = 0L;
91 149409 : hVAD->vad_prim_reg = 0L;
92 149409 : hVAD->vad_flag_cnt_50 = 0;
93 149409 : hVAD->vad_prim_cnt_16 = 0;
94 :
95 : /* By default one should not start with a hangover */
96 149409 : hVAD->hangover_cnt_dtx = HANGOVER_LONG; /* hangover for DTX */
97 149409 : hVAD->hangover_cnt_music = HANGOVER_LONG_MUSIC; /* hangover for DTX */
98 :
99 149409 : hVAD->hangover_cnt_he = 0; /* Hangover counter initialized to 0 */
100 149409 : hVAD->nb_active_frames_he = ACTIVE_FRAMES; /* The counter of SPEECH frames necessary to trigger HO */
101 149409 : hVAD->bcg_flux = 70;
102 149409 : hVAD->soft_hangover = 0;
103 149409 : hVAD->voiced_burst = 0;
104 149409 : hVAD->bcg_flux_init = 50;
105 149409 : hVAD->nb_active_frames_he1 = ACTIVE_FRAMES;
106 149409 : hVAD->hangover_cnt_he1 = 0;
107 :
108 149409 : hVAD->prim_act_quick = 0.0f;
109 149409 : hVAD->prim_act_slow = 0.0f;
110 149409 : hVAD->prim_act = 0.0f;
111 149409 : hVAD->prim_act_quick_he = 0.0f;
112 149409 : hVAD->prim_act_slow_he = 0.0f;
113 149409 : hVAD->prim_act_he = 0.0f;
114 :
115 149409 : hVAD->consec_inactive = 0;
116 149409 : hVAD->spectral_tilt_reset = 1;
117 149409 : hVAD->running_avg = 0;
118 149409 : hVAD->ra_deltasum = 0;
119 149409 : hVAD->trigger_SID = 0;
120 149409 : hVAD->snr_sum_vad = 0;
121 :
122 149409 : hVAD->hangover_terminate_flag = 0;
123 :
124 149409 : return;
125 : }
126 :
127 : /*-----------------------------------------------------------------*
128 : * sign_thr_snr_acc()
129 : *
130 : * accumulate snr_sum with significance thresholds
131 : *-----------------------------------------------------------------*/
132 :
/* Add one band's contribution to the running SNR sum: the band SNR itself
 * when it reaches the significance threshold, otherwise the floor value. */
static void sign_thr_snr_acc(
    float *snr_sum, /* i/o: running SNR sum                          */
    float snr,      /* i  : SNR of the current band                  */
    float sign_thr, /* i  : significance threshold                   */
    float min_snr ) /* i  : value added when snr is below sign_thr   */
{
    const float contribution = ( snr >= sign_thr ) ? snr : min_snr;

    *snr_sum = *snr_sum + contribution;

    return;
}
150 :
151 : /*-----------------------------------------------------------------*
152 : * dtx_hangover_addition()
153 : *
154 : * derive the DTX VAD flag by adding speech and music hangover to the VAD decision
155 : *-----------------------------------------------------------------*/
156 :
157 16547924 : int16_t dtx_hangover_addition(
158 : Encoder_State *st, /* i/o: encoder state structure */
159 : const int16_t vad_flag, /* i : VAD flag */
160 : const float lp_snr, /* i : input single SNR estimate */
161 : const int16_t cldfb_subtraction, /* i : */
162 : int16_t *vad_hover_flag, /* o : VAD hangover flag */
163 : VAD_HANDLE hVAD, /* i/o: VAD handle for L or R channel */
164 : NOISE_EST_HANDLE hNoiseEst, /* i : Noise estimation handle */
165 : int16_t *rem_dtx_ho /* o : Expected remaining hangover frames */
166 : )
167 : {
168 : int16_t hangover_short_dtx, flag_dtx;
169 : int16_t ho_limit_clean;
170 :
171 16547924 : if ( hVAD == NULL )
172 : {
173 15303649 : hVAD = st->hVAD;
174 : }
175 16547924 : if ( hNoiseEst == NULL )
176 : {
177 15303649 : hNoiseEst = st->hNoiseEst;
178 : }
179 :
180 16547924 : flag_dtx = 0;
181 :
182 : /* Determine initial hangover length */
183 16547924 : hangover_short_dtx = 2; /* was 1 */
184 16547924 : if ( ( lp_snr < 16.0f && st->input_bwidth != NB ) ||
185 15477865 : hVAD->prim_act_he > 0.95f )
186 : {
187 8483466 : hangover_short_dtx = 3; /* was 2 */
188 : }
189 :
190 : /* Adjust hangover according to activity history */
191 16547924 : if ( hVAD->vad_prim_cnt_16 > 12 ) /* 12 requires roughly > 80% primary activity */
192 : {
193 12001392 : hangover_short_dtx = hangover_short_dtx + 2;
194 : }
195 :
196 16547924 : if ( hVAD->vad_flag_cnt_50 > 40 ) /* 40 requires roughtly > 80% flag activity */
197 : {
198 11243564 : hangover_short_dtx = hangover_short_dtx + 5;
199 : }
200 :
201 : /* Keep hangover_short lower than maximum hangover count */
202 16547924 : if ( hangover_short_dtx > HANGOVER_LONG - 1 )
203 : {
204 7740641 : hangover_short_dtx = HANGOVER_LONG - 1;
205 : }
206 :
207 : /* Only allow short HO if not sufficient active frames */
208 16547924 : ho_limit_clean = 3;
209 16547924 : if ( st->core == AMR_WB_CORE )
210 : {
211 5296 : ho_limit_clean = 2;
212 : }
213 :
214 16547924 : if ( st->input_bwidth != NB && st->core != AMR_WB_CORE && lp_snr > 25.0f )
215 : {
216 14511562 : ho_limit_clean = 2;
217 : }
218 :
219 16547924 : if ( ho_limit_clean != 0 )
220 : {
221 16547924 : if ( ( hangover_short_dtx > ho_limit_clean ) && ( ( hVAD->vad_prim_cnt_16 < 7 ) || ( lp_snr > 16 && hVAD->prim_act_he < 0.85 ) ) )
222 : {
223 3077730 : hangover_short_dtx = ho_limit_clean;
224 : }
225 : }
226 :
227 : /* hangover adjustment from combined FFT + CLDFBVAD */
228 16547924 : if ( st->core != AMR_WB_CORE )
229 : {
230 16542628 : hangover_short_dtx = hangover_short_dtx - cldfb_subtraction;
231 16542628 : if ( hangover_short_dtx < 0 )
232 : {
233 7817 : hangover_short_dtx = 0;
234 : }
235 : }
236 :
237 16547924 : if ( vad_flag == 1 ) /* Speech present */
238 : {
239 13609781 : flag_dtx = 1;
240 :
241 : /* Add hangover after sufficient # of active frames or sufficient activity during last second */
242 13609781 : if ( hVAD->nb_active_frames >= ACTIVE_FRAMES || hVAD->vad_flag_cnt_50 > 45 ) /* 45 requires roughtly > 90% flag activity */
243 : {
244 13284492 : hVAD->hangover_cnt_dtx = 0;
245 : }
246 :
247 : /* inside HO period */
248 13609781 : if ( hVAD->hangover_cnt_dtx < HANGOVER_LONG && hVAD->hangover_cnt_dtx != 0 )
249 : {
250 36443 : hVAD->hangover_cnt_dtx++;
251 : }
252 :
253 13609781 : hVAD->hangover_terminate_flag = 0;
254 :
255 : /* Music hangover when music detected */
256 13609781 : if ( hVAD->prim_act_he > 0.98f && hNoiseEst->Etot_lp > 40 && hVAD->vad_prim_cnt_16 > 14 && hVAD->vad_flag_cnt_50 > 48 )
257 : {
258 4287249 : hVAD->hangover_cnt_music = 0;
259 : }
260 :
261 : /* inside music HO period */
262 13609781 : if ( hVAD->hangover_cnt_music < HANGOVER_LONG_MUSIC && hVAD->hangover_cnt_music != 0 )
263 : {
264 61122 : hVAD->hangover_cnt_music++;
265 : }
266 : }
267 : else
268 : {
269 : /* Reset the counter of speech frames necessary to start hangover algorithm */
270 2938143 : if ( hVAD->hangover_cnt_dtx < HANGOVER_LONG ) /* inside HO period */
271 : {
272 317012 : hVAD->hangover_cnt_dtx++;
273 : }
274 :
275 2938143 : if ( hVAD->hangover_cnt_music < HANGOVER_LONG_MUSIC ) /* inside music HO period */
276 : {
277 77132 : hVAD->hangover_cnt_music++;
278 : }
279 :
280 : /* fast terminate DTX hangover if st->hangover_terminate_flag is set */
281 2938143 : if ( hVAD->hangover_terminate_flag == 1 )
282 : {
283 69 : hVAD->hangover_cnt = HANGOVER_LONG;
284 69 : hVAD->hangover_cnt_dtx = HANGOVER_LONG;
285 69 : hVAD->hangover_terminate_flag = 0;
286 : /* only shorten music hangover when low energy frames */
287 69 : if ( hNoiseEst->Etot_lp < 20.0f )
288 : {
289 0 : hVAD->hangover_cnt_music = HANGOVER_LONG_MUSIC;
290 : }
291 : }
292 :
293 2938143 : if ( hVAD->hangover_cnt_dtx <= hangover_short_dtx ) /* "hard" hangover */
294 : {
295 108572 : flag_dtx = 1;
296 : }
297 :
298 2938143 : if ( hVAD->hangover_cnt_music <= 15 ) /* "hard" hangover */
299 : {
300 64164 : flag_dtx = 1;
301 : }
302 : }
303 :
304 16547924 : if ( flag_dtx != 0 && st->localVAD == 0 )
305 : {
306 445373 : *vad_hover_flag = 1;
307 445373 : if ( rem_dtx_ho != NULL )
308 : {
309 30888 : *rem_dtx_ho = max( hangover_short_dtx - hVAD->hangover_cnt_dtx, 0 );
310 : }
311 : }
312 :
313 16547924 : return flag_dtx;
314 : }
315 :
316 :
317 : /*-----------------------------------------------------------------*
318 : * wb_vad()
319 : *
320 : * Voice Activity Detector
321 : *-----------------------------------------------------------------*/
322 :
323 16880952 : int16_t wb_vad(
324 : Encoder_State *st, /* i/o: encoder state structure */
325 : const float fr_bands[], /* i : per band input energy (contains 2 vectors) */
326 : int16_t *noisy_speech_HO, /* o : SC-VBR noisy speech HO flag */
327 : int16_t *clean_speech_HO, /* o : SC-VBR clean speech HO flag */
328 : int16_t *NB_speech_HO, /* o : SC-VBR NB speech HO flag */
329 : float *snr_sum_he, /* o : Output snr_sum as weighted spectral measure */
330 : int16_t *localVAD_HE_SAD, /* o : HE_SAD decision without hangovers */
331 : int16_t *flag_noisy_speech_snr, /* o : */
332 : VAD_HANDLE hVAD, /* i/o: VAD handle */
333 : NOISE_EST_HANDLE hNoiseEst, /* i/o: Noise estimation handle */
334 : float lp_speech, /* i : long term active speech energy average */
335 : float lp_noise /* i : long term noise energy */
336 : )
337 : {
338 : int16_t i, j, flag, hangover_short;
339 : float snr[NB_BANDS], snr_sum, thr1, thr2, lp_snr, nk, nc, th_clean;
340 : const float *pt1, *pt2, *pt3;
341 : float min_snr, sign_thr;
342 : float fr_enr;
343 : float ftmp, ftmp1;
344 16880952 : float mssnr = 0;
345 : float snr_sumt;
346 : float vad_thr;
347 : int16_t hangover_hd;
348 : int16_t snr_idx;
349 : float delta1, delta2, delta3;
350 : int16_t flag_he1;
351 : float mssnr_hov;
352 : int16_t stmp;
353 : float msnr;
354 : float snr_outlier;
355 : int16_t snr_outlier_index;
356 : float accum_ener_L, accum_ener_H;
357 : float delta4;
358 16880952 : float snr18 = 1.0f, snr19 = 1.0f;
359 : int16_t nb_sig_snr;
360 : float nv;
361 : float snr_sum_HE_SAD;
362 : float sign_thr_HE_SAD, min_snr_HE_SAD;
363 : float nv_ofs;
364 : float thr1_ol;
365 : float snr_sum_ol;
366 : int16_t last_7k2_coder_type;
367 :
368 16880952 : if ( hNoiseEst == NULL )
369 : {
370 15636677 : hNoiseEst = st->hNoiseEst;
371 : }
372 :
373 16880952 : if ( hVAD == NULL )
374 : {
375 15636677 : hVAD = st->hVAD;
376 : }
377 :
378 16880952 : if ( lp_speech < -100.0f )
379 : {
380 15636677 : lp_speech = st->lp_speech;
381 : }
382 :
383 16880952 : if ( lp_noise < -100.0f )
384 : {
385 15636677 : lp_noise = st->lp_noise;
386 : }
387 :
388 : /*---------------------------------------------------------------------*
389 : * Initialization
390 : *---------------------------------------------------------------------*/
391 :
392 16880952 : snr_outlier = 0;
393 16880952 : snr_outlier_index = 0;
394 16880952 : accum_ener_L = 0;
395 16880952 : accum_ener_H = 0;
396 :
397 16880952 : if ( st->input_bwidth == NB )
398 : {
399 187433 : st->min_band = 1;
400 187433 : st->max_band = 16;
401 : }
402 : else
403 : {
404 16693519 : st->min_band = 0;
405 16693519 : st->max_band = 19;
406 : }
407 :
408 16880952 : if ( st->Opt_SC_VBR )
409 : {
410 2390 : last_7k2_coder_type = st->hSC_VBR->last_7k2_coder_type;
411 : }
412 : else
413 : {
414 16878562 : last_7k2_coder_type = -1;
415 : }
416 :
417 : /*---------------------------------------------------------------------*
418 : * set SNR thresholds depending on the input rate
419 : *---------------------------------------------------------------------*/
420 :
421 16880952 : if ( st->max_band == 19 ) /* WB input */
422 : {
423 16693519 : nk = 0.1f;
424 16693519 : nc = 16.1f;
425 16693519 : nv = 2.05f;
426 16693519 : nv_ofs = 1.65f;
427 16693519 : th_clean = TH16_2;
428 16693519 : if ( st->input_bwidth == WB )
429 : {
430 2288739 : sign_thr = 1.3f;
431 2288739 : min_snr = 0.8f;
432 : }
433 : else
434 : {
435 14404780 : sign_thr = 1.75f;
436 14404780 : min_snr = 0.25f;
437 : }
438 16693519 : sign_thr_HE_SAD = 2.5f;
439 16693519 : min_snr_HE_SAD = 0.2f;
440 : }
441 : else /* NB input */
442 : {
443 187433 : nk = 0.10f;
444 187433 : nc = 16.0f;
445 187433 : nv = 4.00f; /* Was 4.5f but trunkated to 4.00 used when converted to short */
446 187433 : nv_ofs = 1.15f;
447 187433 : th_clean = TH8_1;
448 187433 : sign_thr = 1.75f;
449 187433 : min_snr = 0.25f;
450 :
451 187433 : sign_thr_HE_SAD = 2.65f;
452 187433 : min_snr_HE_SAD = 0.05f;
453 : }
454 :
455 16880952 : hangover_short = 0;
456 :
457 :
458 16880952 : if ( st->Opt_SC_VBR )
459 : {
460 2390 : *noisy_speech_HO = 0;
461 2390 : *clean_speech_HO = 0;
462 2390 : *NB_speech_HO = 0;
463 : }
464 :
465 : /*---------------------------------------------------------------------*
466 : * compute SNR for each band & total
467 : *---------------------------------------------------------------------*/
468 :
469 16880952 : pt1 = fr_bands;
470 16880952 : pt2 = fr_bands + NB_BANDS;
471 16880952 : snr_sum = 0.0f;
472 16880952 : *snr_sum_he = 0.0f;
473 16880952 : snr_sumt = 0;
474 16880952 : mssnr_hov = 0;
475 16880952 : snr_sum_HE_SAD = 0.0f;
476 16880952 : lp_snr = lp_speech - lp_noise;
477 :
478 16880952 : if ( lp_snr > 24.0f )
479 : {
480 15022200 : snr_idx = 0;
481 : }
482 1858752 : else if ( lp_snr > 18 )
483 : {
484 554139 : snr_idx = 1;
485 : }
486 : else
487 : {
488 1304613 : snr_idx = 2;
489 : }
490 :
491 16880952 : if ( snr_idx == 0 )
492 : {
493 15022200 : stmp = 6;
494 15022200 : delta1 = 0.0f;
495 15022200 : delta2 = 0.0f;
496 15022200 : delta3 = 0.0f;
497 15022200 : delta4 = 0.0f;
498 15022200 : vad_thr = 2.4f * lp_snr - 42.2f;
499 15022200 : vad_thr = min( vad_thr, 80 );
500 : }
501 1858752 : else if ( snr_idx == 1 )
502 : {
503 554139 : stmp = 6;
504 554139 : delta1 = 0.1f;
505 554139 : delta2 = 0.2f;
506 554139 : delta3 = 0.2f;
507 554139 : delta4 = 0.2f;
508 554139 : vad_thr = 2.4f * lp_snr - 40.2f;
509 554139 : vad_thr = min( vad_thr, 80 );
510 : }
511 : else
512 : {
513 1304613 : stmp = 9;
514 1304613 : delta1 = 0.2f;
515 1304613 : delta2 = 0.4f;
516 1304613 : delta3 = 0.3f;
517 1304613 : delta4 = 0.4f;
518 1304613 : vad_thr = 2.5f * lp_snr - 10.0f;
519 1304613 : vad_thr = max( vad_thr, 1 );
520 : }
521 16880952 : pt3 = hNoiseEst->bckr;
522 16880952 : nb_sig_snr = 20;
523 :
524 353750260 : for ( i = st->min_band; i <= st->max_band; i++ )
525 : {
526 336869308 : ftmp = *pt1++;
527 336869308 : ftmp1 = *pt2++;
528 336869308 : fr_enr = ( 0.2f * hNoiseEst->enrO[i] + 0.4f * ftmp + 0.4f * ftmp1 );
529 :
530 336869308 : if ( ftmp > ftmp1 )
531 : {
532 140442236 : snr[i] = ( 0.2f * hNoiseEst->enrO[i] + 0.4f * ftmp + 0.4f * ftmp1 ) / *pt3++;
533 : }
534 : else
535 : {
536 196427072 : snr[i] = ( 0.2f * hNoiseEst->enrO[i] + 0.3f * ftmp + 0.5f * ftmp1 ) / *pt3++;
537 : }
538 :
539 336869308 : if ( snr[i] < 2.0f )
540 : {
541 70380749 : nb_sig_snr--;
542 : }
543 :
544 336869308 : if ( snr[i] < 1 )
545 : {
546 18557114 : snr[i] = 1;
547 : }
548 :
549 336869308 : snr[i] = (float) log10( snr[i] );
550 336869308 : snr_sumt += snr[i];
551 336869308 : if ( i < 2 )
552 : {
553 33574471 : ftmp = snr[i] + delta1;
554 : }
555 303294837 : else if ( i < 7 )
556 : {
557 84404760 : ftmp = snr[i] + delta2;
558 : }
559 218890077 : else if ( i < 18 )
560 : {
561 185503039 : ftmp = snr[i] + delta3;
562 : }
563 : else
564 : {
565 33387038 : ftmp = snr[i] + delta4;
566 : }
567 336869308 : ftmp1 = ftmp;
568 336869308 : if ( i < 7 )
569 : {
570 117979231 : ftmp1 = ftmp + 0.4f;
571 : }
572 336869308 : ftmp = min( ftmp, 2.0f );
573 336869308 : ftmp1 = min( ftmp1, 2.0f );
574 336869308 : msnr = 1;
575 2436338476 : for ( j = 0; j < stmp; j++ )
576 : {
577 2099469168 : msnr *= ftmp;
578 : }
579 336869308 : mssnr += msnr;
580 336869308 : if ( i == 18 )
581 : {
582 16693519 : snr18 = msnr;
583 : }
584 320175789 : else if ( i == 19 )
585 : {
586 16693519 : snr19 = msnr;
587 : }
588 336869308 : msnr = 1;
589 2436338476 : for ( j = 0; j < stmp; j++ )
590 : {
591 2099469168 : msnr *= ftmp1;
592 : }
593 336869308 : mssnr_hov += msnr;
594 336869308 : snr[i] = fr_enr / hNoiseEst->bckr[i];
595 :
596 336869308 : sign_thr_snr_acc( &snr_sum_HE_SAD, snr[i], sign_thr_HE_SAD, min_snr_HE_SAD );
597 336869308 : sign_thr_snr_acc( &snr_sum, snr[i], sign_thr, min_snr );
598 :
599 : /* To make snr[] compatible with older versions where snr[i] >= 1;
600 : this could be removed if it is no longer a requirement */
601 336869308 : if ( snr[i] < 1.0f )
602 : {
603 18815047 : snr[i] = 1.0f;
604 : }
605 : /* accumulate background noise energy in bands [0-2] and in bands [3-19]*/
606 336869308 : if ( i < 3 )
607 : {
608 50455423 : accum_ener_L = accum_ener_L + hNoiseEst->bckr[i];
609 : }
610 : else
611 : {
612 286413885 : accum_ener_H = accum_ener_H + hNoiseEst->bckr[i];
613 : }
614 :
615 : /* identify the outlier band */
616 336869308 : if ( snr[i] > snr_outlier )
617 : {
618 53065475 : snr_outlier = snr[i];
619 53065475 : snr_outlier_index = i;
620 : }
621 : }
622 :
623 16880952 : if ( ( st->max_band == 19 ) && ( snr[18] > 5.0f ) && ( snr[19] > 5.0f ) )
624 : {
625 12051207 : ftmp = ( mssnr + 3 * ( snr18 + snr19 ) ) * 0.77f;
626 12051207 : if ( ftmp > mssnr )
627 : {
628 10519703 : mssnr = ftmp;
629 : }
630 : }
631 4829745 : else if ( snr_idx != 0 && nb_sig_snr > 13 )
632 : {
633 342758 : if ( 2.5f * lp_snr - 15.5f > 0 )
634 : {
635 327868 : mssnr += 2.5f * lp_snr - 15.5f;
636 : }
637 : }
638 :
639 :
640 : /* Separate SNR_SUM modification to */
641 16880952 : snr_sum_ol = snr_sum;
642 16880952 : if ( st->max_band == 19 && snr_outlier < MAX_SNR_OUTLIER_3 && snr_outlier_index > 3 && snr_outlier_index < MAX_SNR_OUTLIER_IND ) /* Update the total SNR only for WB signals */
643 : {
644 690633 : if ( ( accum_ener_L > OUTLIER_THR_1 * accum_ener_H ) || ( snr_outlier < MAX_SNR_OUTLIER_1 ) )
645 : {
646 449126 : snr_sum_ol = SNR_OUTLIER_WGHT_1 * ( snr_sum_ol - snr_outlier );
647 : }
648 241507 : else if ( ( accum_ener_L > OUTLIER_THR_2 * accum_ener_H ) || ( snr_outlier < MAX_SNR_OUTLIER_2 ) )
649 : {
650 143845 : snr_sum_ol = SNR_OUTLIER_WGHT_2 * ( snr_sum_ol - snr_outlier );
651 : }
652 : else
653 : {
654 97662 : snr_sum_ol = SNR_OUTLIER_WGHT_3 * ( snr_sum_ol - snr_outlier );
655 : }
656 : }
657 :
658 16880952 : hVAD->snr_sum_vad = 0.5f * hVAD->snr_sum_vad + 0.5f * snr_sum_ol;
659 :
660 16880952 : snr_sum_ol = 10.0f * (float) log10( snr_sum_ol );
661 16880952 : snr_sum = snr_sum_ol; /* for NB no outlier modification */
662 :
663 16880952 : snr_sum_HE_SAD = 10.0f * (float) log10( snr_sum_HE_SAD );
664 16880952 : *snr_sum_he = snr_sum_HE_SAD;
665 :
666 : /*---------------------------------------------------------------------*
667 : * compute threshold for VAD decision
668 : *---------------------------------------------------------------------*/
669 :
670 16880952 : lp_snr = lp_speech - lp_noise; /* long-term SNR */
671 :
672 16880952 : if ( lp_snr < hNoiseEst->sign_dyn_lp )
673 : {
674 4187194 : lp_snr += 1;
675 :
676 4187194 : if ( lp_snr > hNoiseEst->sign_dyn_lp )
677 : {
678 417414 : lp_snr = hNoiseEst->sign_dyn_lp;
679 : }
680 : }
681 :
682 16880952 : thr1 = nk * lp_snr + nc + nv * ( hNoiseEst->Etot_v_h2 - nv_ofs ); /* threshold as a linear function of long-term SNR */
683 :
684 16880952 : if ( st->element_mode > EVS_MONO && hNoiseEst->first_noise_updt_cnt < 100 )
685 : {
686 : /* lower threshold during warmup time */
687 4335921 : thr1 -= 10.0f;
688 4335921 : vad_thr = 0.f;
689 : }
690 :
691 16880952 : if ( lp_snr > 20.0f )
692 : {
693 15441455 : if ( st->element_mode == EVS_MONO || hNoiseEst->first_noise_updt_cnt >= 100 )
694 : {
695 : /* increase the threshold when SNR is high */
696 11133388 : thr1 = thr1 + 0.3f * ( lp_snr - 20.0f );
697 11133388 : if ( st->max_band == 16 && lp_snr > 40 && thr1 > 24.1f && lp_speech < 45.0f )
698 : {
699 73724 : thr1 = 24.1f;
700 : }
701 : }
702 : }
703 :
704 : /*---------------------------------------------------------------------*
705 : * WB input
706 : * SNR threshold computing
707 : * Hangover control & final VAD decision
708 : *---------------------------------------------------------------------*/
709 :
710 16880952 : if ( st->input_bwidth != NB )
711 : {
712 : /* Outlier Detection first calculates thr1_ol and snr_sum_ol instead of
713 : thr1 and snr_sum */
714 :
715 16693519 : thr1_ol = thr1;
716 16693519 : if ( lp_snr < th_clean )
717 : {
718 3038499 : hangover_short = 4;
719 3038499 : if ( ( snr_outlier_index <= 4 && ( st->last_coder_type > UNVOICED ) && !st->Opt_SC_VBR ) ||
720 42035 : ( snr_outlier_index <= 4 && ( last_7k2_coder_type > UNVOICED ) && st->Opt_SC_VBR ) )
721 : {
722 910822 : thr1_ol = thr1 - 1.0f;
723 910822 : snr_sum_ol = 10.0f * (float) log10( hVAD->snr_sum_vad );
724 : }
725 2127677 : else if ( ( ( st->last_coder_type <= UNVOICED ) && ( snr_outlier < MAX_SNR_OUTLIER_2 ) && !st->Opt_SC_VBR ) || ( ( last_7k2_coder_type <= UNVOICED ) && ( snr_outlier < MAX_SNR_OUTLIER_2 ) && st->Opt_SC_VBR ) )
726 : {
727 59771 : thr1_ol = thr1 + (float) ( 1.0f - 0.04f * snr_outlier );
728 : }
729 : else
730 : {
731 2067906 : thr1_ol = thr1 + max( 0, (float) ( 0.6f - 0.01f * snr_outlier ) );
732 : }
733 : }
734 : else
735 : {
736 13655020 : if ( st->Opt_SC_VBR )
737 : {
738 1230 : hangover_short = 3;
739 : }
740 : else
741 : {
742 13653790 : hangover_short = 3;
743 : }
744 : }
745 :
746 : /* The use of outlier detection had been removed by accident at some point */
747 16693519 : snr_sum = snr_sum_ol;
748 16693519 : thr1 = thr1_ol;
749 :
750 : /* DTX HANGOVER ADDITION MOVED TO pre_proc() */
751 :
752 16693519 : flag_he1 = 0;
753 16693519 : st->localVAD = 0;
754 16693519 : if ( mssnr > vad_thr )
755 : {
756 13666337 : st->localVAD = 1; /* he1 primary decision */
757 13666337 : flag_he1 = 1;
758 13666337 : hVAD->nb_active_frames_he1++; /* Counter of consecutive active speech frames */
759 13666337 : if ( hVAD->nb_active_frames_he1 >= ACTIVE_FRAMES )
760 : {
761 13472619 : hVAD->nb_active_frames_he1 = ACTIVE_FRAMES;
762 13472619 : hVAD->hangover_cnt_he1 = 0; /* Reset the counter of hangover frames after at least "active_frames" speech frames */
763 : }
764 : /* inside HO period */
765 13666337 : if ( hVAD->hangover_cnt_he1 < HANGOVER_LONG_HE && hVAD->hangover_cnt_he1 != 0 )
766 : {
767 188577 : hVAD->hangover_cnt_he1++;
768 : }
769 :
770 13666337 : if ( hVAD->soft_hangover > 0 )
771 : {
772 3773972 : hVAD->soft_hangover--;
773 : }
774 : }
775 : else
776 : {
777 : /* Reset the counter of speech frames necessary to start hangover algorithm */
778 3027182 : hVAD->nb_active_frames_he1 = 0;
779 : }
780 :
781 16693519 : if ( hVAD->voiced_burst > 3 )
782 : {
783 3824538 : if ( hVAD->bcg_flux < 40 )
784 : {
785 2792504 : hVAD->soft_hangover = hangover_sf_tbl[snr_idx + 3];
786 : }
787 : else
788 : {
789 1032034 : hVAD->soft_hangover = hangover_sf_tbl[snr_idx];
790 : }
791 : }
792 :
793 :
794 16693519 : hangover_hd = hangover_hd_tbl[snr_idx];
795 :
796 16693519 : if ( hVAD->bcg_flux < 40 )
797 : {
798 10360789 : hangover_hd = ( hangover_hd >> 1 ) + 1;
799 : }
800 :
801 16693519 : if ( flag_he1 == 0 && hVAD->soft_hangover > 0 )
802 : {
803 42771 : if ( mssnr_hov > vad_thr )
804 : {
805 16340 : flag_he1 = 1;
806 16340 : hVAD->soft_hangover--;
807 : }
808 : else
809 : {
810 26431 : hVAD->soft_hangover = 0;
811 : }
812 :
813 42771 : if ( hVAD->soft_hangover < 0 )
814 : {
815 0 : hVAD->soft_hangover = 0;
816 : }
817 : }
818 :
819 16693519 : if ( flag_he1 == 0 && hVAD->hangover_cnt_he1 < hangover_hd && hVAD->soft_hangover == 0 )
820 : {
821 131395 : flag_he1 = 1;
822 131395 : hVAD->hangover_cnt_he1++;
823 : }
824 :
825 : /* Calculate background stationarity */
826 16693519 : if ( flag_he1 == 0 && hNoiseEst->first_noise_updt > 0 )
827 : {
828 2774923 : if ( snr_sumt > hVAD->bcg_flux )
829 : {
830 21963 : if ( hVAD->bcg_flux_init-- > 0 )
831 : {
832 3 : if ( snr_sumt > hVAD->bcg_flux + 50 )
833 : {
834 0 : hVAD->bcg_flux = 0.9f * hVAD->bcg_flux + ( 1 - 0.9f ) * ( hVAD->bcg_flux + 50 );
835 : }
836 : else
837 : {
838 3 : hVAD->bcg_flux = 0.9f * hVAD->bcg_flux + ( 1 - 0.9f ) * snr_sumt;
839 : }
840 : }
841 : else
842 : {
843 21960 : if ( snr_sumt > hVAD->bcg_flux + 10 )
844 : {
845 1545 : hVAD->bcg_flux = 0.99f * hVAD->bcg_flux + ( 1 - 0.99f ) * ( hVAD->bcg_flux + 10 );
846 : }
847 : else
848 : {
849 20415 : hVAD->bcg_flux = 0.99f * hVAD->bcg_flux + ( 1 - 0.99f ) * snr_sumt;
850 : }
851 : }
852 : }
853 : else
854 : {
855 2752960 : if ( hVAD->bcg_flux_init-- > 0 )
856 : {
857 270981 : if ( snr_sumt < hVAD->bcg_flux - 30 )
858 : {
859 187744 : hVAD->bcg_flux = 0.95f * hVAD->bcg_flux + ( 1 - 0.95f ) * ( hVAD->bcg_flux - 30 );
860 : }
861 : else
862 : {
863 83237 : hVAD->bcg_flux = 0.95f * hVAD->bcg_flux + ( 1 - 0.95f ) * snr_sumt;
864 : }
865 : }
866 : else
867 : {
868 2481979 : if ( snr_sumt < hVAD->bcg_flux - 10 )
869 : {
870 82258 : hVAD->bcg_flux = 0.9992f * hVAD->bcg_flux + ( 1 - 0.9992f ) * ( hVAD->bcg_flux - 10 );
871 : }
872 : else
873 : {
874 2399721 : hVAD->bcg_flux = 0.9992f * hVAD->bcg_flux + ( 1 - 0.9992f ) * snr_sumt;
875 : }
876 : }
877 : }
878 :
879 2774923 : if ( hVAD->bcg_flux_init < 0 )
880 : {
881 2503939 : hVAD->bcg_flux_init = 0;
882 : }
883 : }
884 :
885 16693519 : flag = 0;
886 16693519 : st->localVAD = 0;
887 :
888 16693519 : if ( ( snr_sum > thr1 && flag_he1 == 1 ) ) /* Speech present */
889 : {
890 13419236 : flag = 1;
891 13419236 : st->localVAD = 1;
892 13419236 : hVAD->nb_active_frames++; /* Counter of consecutive active speech frames */
893 13419236 : if ( hVAD->nb_active_frames >= ACTIVE_FRAMES )
894 : {
895 13200842 : hVAD->nb_active_frames = ACTIVE_FRAMES;
896 13200842 : hVAD->hangover_cnt = 0; /* Reset the counter of hangover frames after at least "active_frames" speech frames */
897 : }
898 :
899 : /* inside HO period */
900 13419236 : if ( hVAD->hangover_cnt < HANGOVER_LONG && hVAD->hangover_cnt != 0 )
901 : {
902 141354 : hVAD->hangover_cnt++;
903 : }
904 : }
905 : else
906 : {
907 : /* Reset the counter of speech frames necessary to start hangover algorithm */
908 3274283 : hVAD->nb_active_frames = 0;
909 3274283 : if ( hVAD->hangover_cnt < HANGOVER_LONG ) /* inside HO period */
910 : {
911 611875 : hVAD->hangover_cnt++;
912 : }
913 :
914 3274283 : if ( hVAD->hangover_cnt <= hangover_short ) /* "hard" hangover */
915 : {
916 : /* send the extra 3 HO frames to NELP */
917 295270 : if ( st->element_mode == EVS_MONO && ( lp_snr < th_clean ) && ( st->Opt_SC_VBR ) && ( hVAD->hangover_cnt >= 2 ) )
918 : {
919 0 : *noisy_speech_HO = 1;
920 : }
921 :
922 295270 : if ( st->element_mode == EVS_MONO && ( lp_snr >= th_clean ) && ( st->Opt_SC_VBR ) && ( hVAD->hangover_cnt >= 2 ) )
923 : {
924 1 : *clean_speech_HO = 1;
925 : }
926 :
927 295270 : flag = 1;
928 : }
929 : }
930 :
931 : /* localVAD and vad_flag for HE-SAD - in parallel with normal localVAD and vad_flag */
932 16693519 : *localVAD_HE_SAD = 0;
933 16693519 : if ( snr_sum_HE_SAD > thr1 && ( flag_he1 == 1 ) ) /* Speech present */
934 : {
935 13380631 : *localVAD_HE_SAD = 1;
936 : }
937 : }
938 :
939 : /*---------------------------------------------------------------------*
940 : * NB input
941 : * SNR threshold computing
942 : * Hangover control & final VAD decision
943 : *---------------------------------------------------------------------*/
944 :
945 : else /* NB input */
946 : {
947 : /* Add localVAD_HE_SAD also for NB operation for use with speech music classifier */
948 187433 : *localVAD_HE_SAD = 0;
949 187433 : if ( snr_sum_HE_SAD > thr1 )
950 : {
951 151955 : *localVAD_HE_SAD = 1;
952 : }
953 :
954 187433 : st->localVAD = 0; /* init needed in NB, otherwise it can be undefined */
955 187433 : if ( snr_sum > thr1 ) /* Speech present */
956 : {
957 152224 : hVAD->nb_active_frames++; /* Counter of consecutive active speech frames */
958 152224 : if ( hVAD->nb_active_frames >= ACTIVE_FRAMES )
959 : {
960 146682 : hVAD->nb_active_frames = ACTIVE_FRAMES;
961 146682 : hVAD->hangover_cnt = 0; /* Reset the counter of hangover frames after at least "active_frames" speech frames */
962 : }
963 :
964 152224 : st->localVAD = 1;
965 : }
966 : else
967 : {
968 35209 : hVAD->nb_active_frames = 0; /* Reset the counter of speech frames necessary to start hangover algorithm */
969 : }
970 :
971 187433 : if ( hVAD->hangover_cnt < HANGOVER_LONG_NB )
972 : {
973 158551 : hVAD->hangover_cnt++;
974 158551 : if ( lp_snr < 19.0f ) /* very low SNR */
975 : {
976 1258 : thr1 -= 5.2f;
977 : }
978 157293 : else if ( lp_snr < 35.0f ) /* low SNR */
979 : {
980 51435 : thr1 -= 2.0f;
981 : }
982 : }
983 :
984 187433 : if ( st->Opt_DTX_ON )
985 : {
986 44622 : if ( lp_snr < th_clean )
987 : {
988 1563 : thr2 = thr1 - 1.10f;
989 : }
990 : else
991 : {
992 43059 : thr2 = thr1 - 1.5f;
993 : }
994 : }
995 : else
996 : {
997 142811 : if ( lp_snr < th_clean )
998 : {
999 2136 : thr2 = thr1 - 1.3f;
1000 : }
1001 : else
1002 : {
1003 140675 : thr2 = thr1 - 1.5f;
1004 : }
1005 : }
1006 :
1007 187433 : flag = 0;
1008 187433 : if ( snr_sum > thr1 ) /* Speech present */
1009 : {
1010 154111 : flag = 1;
1011 : }
1012 :
1013 187433 : if ( ( snr_sum < thr1 ) && ( snr_sum > thr2 ) ) /* Speech present */
1014 : {
1015 3637 : flag = 1;
1016 3637 : st->localVAD = 0;
1017 :
1018 3637 : if ( st->element_mode == EVS_MONO )
1019 : {
1020 52 : *NB_speech_HO = 1;
1021 : }
1022 : }
1023 :
1024 : /* Need to handle the case when switching from WB -> NB */
1025 : }
1026 :
1027 16880952 : if ( st->input_bwidth != NB )
1028 : {
1029 16693519 : *flag_noisy_speech_snr = ( lp_snr < TH16_2_NFLAG ); /*original threshold: 35dB*/
1030 : }
1031 : else
1032 : {
1033 187433 : *flag_noisy_speech_snr = ( lp_snr < TH8_1_NFLAG ); /*original threshold: 20dB, not yet tested!*/
1034 : }
1035 :
1036 : /* SC-VBR */
1037 16880952 : if ( st->hSC_VBR != NULL )
1038 : {
1039 102214 : st->hSC_VBR->vadsnr = snr_sum;
1040 102214 : st->hSC_VBR->vadnoise = thr1;
1041 : }
1042 :
1043 : /* Updates */
1044 16880952 : hVAD->prim_act_quick = 0.2f * ( st->localVAD ) + ( 1.0f - 0.2f ) * hVAD->prim_act_quick;
1045 16880952 : hVAD->prim_act_slow = 0.01f * ( st->localVAD ) + ( 1.0f - 0.01f ) * hVAD->prim_act_slow;
1046 16880952 : if ( hVAD->prim_act_quick <= hVAD->prim_act_slow )
1047 : {
1048 3539602 : hVAD->prim_act = 0.1f * hVAD->prim_act_quick + ( 1.0f - 0.1f ) * hVAD->prim_act;
1049 : }
1050 : else
1051 : {
1052 13341350 : hVAD->prim_act = 0.1f * hVAD->prim_act_slow + ( 1.0f - 0.1f ) * hVAD->prim_act;
1053 : }
1054 :
1055 16880952 : hVAD->prim_act_quick_he = 0.2f * *localVAD_HE_SAD + ( 1.0f - 0.2f ) * hVAD->prim_act_quick_he;
1056 16880952 : hVAD->prim_act_slow_he = 0.01f * *localVAD_HE_SAD + ( 1.0f - 0.01f ) * hVAD->prim_act_slow_he;
1057 :
1058 16880952 : if ( hVAD->prim_act_quick_he <= hVAD->prim_act_slow_he )
1059 : {
1060 3576135 : hVAD->prim_act_he = 0.1f * hVAD->prim_act_quick_he + ( 1.0f - 0.1f ) * hVAD->prim_act_he;
1061 : }
1062 : else
1063 : {
1064 13304817 : hVAD->prim_act_he = 0.1f * hVAD->prim_act_slow_he + ( 1.0f - 0.1f ) * hVAD->prim_act_he;
1065 : }
1066 :
1067 :
1068 16880952 : if ( ( hVAD->vad_flag_reg_H & (int32_t) 0x40000L ) != 0 ) /* 0x40000L = 0x01L << 18 */
1069 : {
1070 12003094 : hVAD->vad_flag_cnt_50 = hVAD->vad_flag_cnt_50 - 1;
1071 : }
1072 :
1073 16880952 : hVAD->vad_flag_reg_H = ( hVAD->vad_flag_reg_H & (int32_t) 0x3fffffffL ) << 1;
1074 :
1075 16880952 : if ( ( hVAD->vad_flag_reg_L & (int32_t) 0x40000000L ) != 0 )
1076 : {
1077 12346015 : hVAD->vad_flag_reg_H = hVAD->vad_flag_reg_H | 0x01L;
1078 : }
1079 :
1080 16880952 : hVAD->vad_flag_reg_L = ( hVAD->vad_flag_reg_L & (int32_t) 0x3fffffffL ) << 1;
1081 :
1082 16880952 : if ( flag ) /* should not include the extra DTX hangover */
1083 : {
1084 13872254 : hVAD->vad_flag_reg_L = hVAD->vad_flag_reg_L | 0x01L;
1085 13872254 : hVAD->vad_flag_cnt_50 = hVAD->vad_flag_cnt_50 + 1;
1086 : }
1087 :
1088 16880952 : if ( ( hVAD->vad_prim_reg & (int32_t) 0x8000L ) != 0 ) /* 0x8000L = 1L << 15 */
1089 : {
1090 12532220 : hVAD->vad_prim_cnt_16 = hVAD->vad_prim_cnt_16 - 1;
1091 : }
1092 :
1093 16880952 : hVAD->vad_prim_reg = ( hVAD->vad_prim_reg & (int32_t) 0x3fffffffL ) << 1;
1094 :
1095 16880952 : if ( st->localVAD )
1096 : {
1097 13571460 : hVAD->vad_prim_reg = hVAD->vad_prim_reg | 0x01L;
1098 13571460 : hVAD->vad_prim_cnt_16 = hVAD->vad_prim_cnt_16 + 1;
1099 : }
1100 :
1101 16880952 : return flag;
1102 : }
|