Line data Source code
1 : /******************************************************************************************************
2 :
3 : (C) 2022-2025 IVAS codec Public Collaboration with portions copyright Dolby International AB, Ericsson AB,
4 : Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD.,
5 : Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange,
6 : Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other
7 : contributors to this repository. All Rights Reserved.
8 :
9 : This software is protected by copyright law and by international treaties.
10 : The IVAS codec Public Collaboration consisting of Dolby International AB, Ericsson AB,
11 : Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD.,
12 : Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange,
13 : Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other
14 : contributors to this repository retain full ownership rights in their respective contributions in
15 : the software. This notice grants no license of any kind, including but not limited to patent
16 : license, nor is any license granted by implication, estoppel or otherwise.
17 :
18 : Contributors are required to enter into the IVAS codec Public Collaboration agreement before making
19 : contributions.
20 :
21 : This software is provided "AS IS", without any express or implied warranties. The software is in the
22 : development stage. It is intended exclusively for experts who have experience with such software and
23 : solely for the purpose of inspection. All implied warranties of non-infringement, merchantability
24 : and fitness for a particular purpose are hereby disclaimed and excluded.
25 :
26 : Any dispute, controversy or claim arising under or in relation to providing this software shall be
27 : submitted to and settled by the final, binding jurisdiction of the courts of Munich, Germany in
28 : accordance with the laws of the Federal Republic of Germany excluding its conflict of law rules and
29 : the United Nations Convention on Contracts on the International Sales of Goods.
30 :
31 : *******************************************************************************************************/
32 :
33 : /*====================================================================================
34 : EVS Codec 3GPP TS26.443 Nov 04, 2021. Version 12.14.0 / 13.10.0 / 14.6.0 / 15.4.0 / 16.3.0
35 : ====================================================================================*/
36 :
37 : #include <stdint.h>
38 : #include "options.h"
39 : #ifdef DEBUGGING
40 : #include "debug.h"
41 : #endif
42 : #include <math.h>
43 : #include "cnst.h"
44 : #include "prot.h"
45 : #include "rom_enc.h"
46 : #include "wmc_auto.h"
47 : #include "ivas_prot.h"
48 :
49 : /*-----------------------------------------------------------------*
50 : * Local constants
51 : *-----------------------------------------------------------------*/
52 :
/* Upper limits of the hangover counters kept in the VAD state */
53 : #define HANGOVER_LONG 10 /* Hangover for CNG: limit of hangover_cnt (non-NB path) and of hangover_cnt_dtx */
54 : #define HANGOVER_LONG_HE 20 /* Hangover of CNG: limit of hangover_cnt_he1 */
55 : #define HANGOVER_LONG_MUSIC 20 /* Hangover of CNG: limit of hangover_cnt_music */
56 : #define HANGOVER_LONG_NB 8 /* Hangover for CNG: limit of hangover_cnt in the NB path */
57 : #define ACTIVE_FRAMES 3 /* Number of consecutive active SPEECH frames necessary to trigger HO */
58 :
59 : #define TH16_2 35.0f /* long-term SNR that separates the curves for clean speech and noisy speech (used as th_clean when max_band == 19) */
60 : #define TH8_1 20.0f /* long-term SNR that separates the curves for clean speech and noisy speech (used as th_clean for NB input) */
61 : #define TH16_2_NFLAG 35.0f /* lp_snr limit below which flag_noisy_speech_snr is raised, non-NB input */
62 : #define TH8_1_NFLAG 35.0f /* lp_snr limit below which flag_noisy_speech_snr is raised, NB input */
63 :
64 :
/* Outlier-band handling in wb_vad(): the strongest band is removed from the summed SNR
   and the remainder is scaled by one of the weights below */
65 : #define SNR_OUTLIER_WGHT_1 1.00f /* weight of ( snr_sum - snr_outlier ), first case */
66 : #define SNR_OUTLIER_WGHT_2 1.01f /* weight of ( snr_sum - snr_outlier ), second case */
67 : #define SNR_OUTLIER_WGHT_3 1.02f /* weight of ( snr_sum - snr_outlier ), remaining case */
68 : #define OUTLIER_THR_1 10.0f /* first case if noise energy of bands 0-2 exceeds this multiple of the energy of bands 3-19 */
69 : #define OUTLIER_THR_2 6.0f /* second case if noise energy of bands 0-2 exceeds this multiple of the energy of bands 3-19 */
70 : #define MAX_SNR_OUTLIER_IND 17 /* the outlier band index must be below this value (and above 3) */
71 : #define MAX_SNR_OUTLIER_1 10.0f /* outlier SNR below this value also selects the first case */
72 : #define MAX_SNR_OUTLIER_2 25.0f /* outlier SNR below this value also selects the second case */
73 : #define MAX_SNR_OUTLIER_3 50.0f /* outlier SNR must be below this value for any modification to take place */
74 :
75 : /*---------------------------------------------------------------------*
76 : * wb_vad_init()
77 : *
78 : * VAD initializations
79 : *---------------------------------------------------------------------*/
80 :
81 9716 : void wb_vad_init(
82 : VAD_HANDLE hVAD /* i/o: VAD data handle */
83 : )
84 : {
85 9716 : hVAD->hangover_cnt = 0; /* Hangover counter initialized to 0 */
86 9716 : hVAD->nb_active_frames = ACTIVE_FRAMES; /* The counter of SPEECH frames necessary to trigger HO */
87 : /* is set to max (-> start with hangover) */
88 :
89 9716 : hVAD->vad_flag_reg_H = 0L;
90 9716 : hVAD->vad_flag_reg_L = 0L;
91 9716 : hVAD->vad_prim_reg = 0L;
92 9716 : hVAD->vad_flag_cnt_50 = 0;
93 9716 : hVAD->vad_prim_cnt_16 = 0;
94 :
95 : /* By default one should not start with a hangover */
96 9716 : hVAD->hangover_cnt_dtx = HANGOVER_LONG; /* hangover for DTX */
97 9716 : hVAD->hangover_cnt_music = HANGOVER_LONG_MUSIC; /* hangover for DTX */
98 :
99 9716 : hVAD->hangover_cnt_he = 0; /* Hangover counter initialized to 0 */
100 9716 : hVAD->nb_active_frames_he = ACTIVE_FRAMES; /* The counter of SPEECH frames necessary to trigger HO */
101 9716 : hVAD->bcg_flux = 70;
102 9716 : hVAD->soft_hangover = 0;
103 9716 : hVAD->voiced_burst = 0;
104 9716 : hVAD->bcg_flux_init = 50;
105 9716 : hVAD->nb_active_frames_he1 = ACTIVE_FRAMES;
106 9716 : hVAD->hangover_cnt_he1 = 0;
107 :
108 9716 : hVAD->prim_act_quick = 0.0f;
109 9716 : hVAD->prim_act_slow = 0.0f;
110 9716 : hVAD->prim_act = 0.0f;
111 9716 : hVAD->prim_act_quick_he = 0.0f;
112 9716 : hVAD->prim_act_slow_he = 0.0f;
113 9716 : hVAD->prim_act_he = 0.0f;
114 :
115 9716 : hVAD->consec_inactive = 0;
116 9716 : hVAD->spectral_tilt_reset = 1;
117 9716 : hVAD->running_avg = 0;
118 9716 : hVAD->ra_deltasum = 0;
119 9716 : hVAD->trigger_SID = 0;
120 9716 : hVAD->snr_sum_vad = 0;
121 :
122 9716 : hVAD->hangover_terminate_flag = 0;
123 :
124 9716 : return;
125 : }
126 :
/*-----------------------------------------------------------------*
 * sign_thr_snr_acc()
 *
 * Add one band SNR to a running sum. A band whose SNR reaches the
 * significance threshold contributes its own SNR; any other band
 * contributes the fixed floor value min_snr instead.
 *-----------------------------------------------------------------*/

static void sign_thr_snr_acc(
    float *snr_sum, /* i/o: running SNR sum                     */
    float snr,      /* i  : SNR of the current band             */
    float sign_thr, /* i  : significance threshold              */
    float min_snr ) /* i  : contribution of insignificant bands */
{
    /* keep the ">=" direction: an unordered comparison must fall back to min_snr */
    *snr_sum += ( snr >= sign_thr ) ? snr : min_snr;

    return;
}
150 :
151 : /*-----------------------------------------------------------------*
152 : * dtx_hangover_addition()
153 : *
154 : * accumulate snr_sum with significance thresholds
155 : *-----------------------------------------------------------------*/
156 :
157 1185802 : int16_t dtx_hangover_addition(
158 : Encoder_State *st, /* i/o: encoder state structure */
159 : const int16_t vad_flag, /* i : VAD flag */
160 : const float lp_snr, /* i : input single SNR estimate */
161 : const int16_t cldfb_subtraction, /* i : */
162 : int16_t *vad_hover_flag, /* o : VAD hangover flag */
163 : VAD_HANDLE hVAD, /* i/o: VAD handle for L or R channel */
164 : NOISE_EST_HANDLE hNoiseEst, /* i : Noise estimation handle */
165 : int16_t *rem_dtx_ho /* o : Expected remaining hangover frames */
166 : )
167 : {
168 : int16_t hangover_short_dtx, flag_dtx;
169 : int16_t ho_limit_clean;
170 :
171 1185802 : if ( hVAD == NULL )
172 : {
173 1104076 : hVAD = st->hVAD;
174 : }
175 1185802 : if ( hNoiseEst == NULL )
176 : {
177 1104076 : hNoiseEst = st->hNoiseEst;
178 : }
179 :
180 1185802 : flag_dtx = 0;
181 :
182 : /* Determine initial hangover length */
183 1185802 : hangover_short_dtx = 2; /* was 1 */
184 1185802 : if ( ( lp_snr < 16.0f && st->input_bwidth != NB ) ||
185 1100308 : hVAD->prim_act_he > 0.95f )
186 : {
187 472264 : hangover_short_dtx = 3; /* was 2 */
188 : }
189 :
190 : /* Adjust hangover according to activity history */
191 1185802 : if ( hVAD->vad_prim_cnt_16 > 12 ) /* 12 requires roughly > 80% primary activity */
192 : {
193 901657 : hangover_short_dtx = hangover_short_dtx + 2;
194 : }
195 :
196 1185802 : if ( hVAD->vad_flag_cnt_50 > 40 ) /* 40 requires roughtly > 80% flag activity */
197 : {
198 840204 : hangover_short_dtx = hangover_short_dtx + 5;
199 : }
200 :
201 : /* Keep hangover_short lower than maximum hangover count */
202 1185802 : if ( hangover_short_dtx > HANGOVER_LONG - 1 )
203 : {
204 419492 : hangover_short_dtx = HANGOVER_LONG - 1;
205 : }
206 :
207 : /* Only allow short HO if not sufficient active frames */
208 1185802 : ho_limit_clean = 3;
209 1185802 : if ( st->core == AMR_WB_CORE )
210 : {
211 0 : ho_limit_clean = 2;
212 : }
213 :
214 1185802 : if ( st->input_bwidth != NB && st->core != AMR_WB_CORE && lp_snr > 25.0f )
215 : {
216 1049625 : ho_limit_clean = 2;
217 : }
218 :
219 1185802 : if ( ho_limit_clean != 0 )
220 : {
221 1185802 : if ( ( hangover_short_dtx > ho_limit_clean ) && ( ( hVAD->vad_prim_cnt_16 < 7 ) || ( lp_snr > 16 && hVAD->prim_act_he < 0.85 ) ) )
222 : {
223 363985 : hangover_short_dtx = ho_limit_clean;
224 : }
225 : }
226 :
227 : /* hangover adjustment from combined FFT + CLDFBVAD */
228 1185802 : if ( st->core != AMR_WB_CORE )
229 : {
230 1185802 : hangover_short_dtx = hangover_short_dtx - cldfb_subtraction;
231 1185802 : if ( hangover_short_dtx < 0 )
232 : {
233 0 : hangover_short_dtx = 0;
234 : }
235 : }
236 :
237 1185802 : if ( vad_flag == 1 ) /* Speech present */
238 : {
239 1020812 : flag_dtx = 1;
240 :
241 : /* Add hangover after sufficient # of active frames or sufficient activity during last second */
242 1020812 : if ( hVAD->nb_active_frames >= ACTIVE_FRAMES || hVAD->vad_flag_cnt_50 > 45 ) /* 45 requires roughtly > 90% flag activity */
243 : {
244 1001460 : hVAD->hangover_cnt_dtx = 0;
245 : }
246 :
247 : /* inside HO period */
248 1020812 : if ( hVAD->hangover_cnt_dtx < HANGOVER_LONG && hVAD->hangover_cnt_dtx != 0 )
249 : {
250 3408 : hVAD->hangover_cnt_dtx++;
251 : }
252 :
253 1020812 : hVAD->hangover_terminate_flag = 0;
254 :
255 : /* Music hangover when music detected */
256 1020812 : if ( hVAD->prim_act_he > 0.98f && hNoiseEst->Etot_lp > 40 && hVAD->vad_prim_cnt_16 > 14 && hVAD->vad_flag_cnt_50 > 48 )
257 : {
258 175348 : hVAD->hangover_cnt_music = 0;
259 : }
260 :
261 : /* inside music HO period */
262 1020812 : if ( hVAD->hangover_cnt_music < HANGOVER_LONG_MUSIC && hVAD->hangover_cnt_music != 0 )
263 : {
264 6582 : hVAD->hangover_cnt_music++;
265 : }
266 : }
267 : else
268 : {
269 : /* Reset the counter of speech frames necessary to start hangover algorithm */
270 164990 : if ( hVAD->hangover_cnt_dtx < HANGOVER_LONG ) /* inside HO period */
271 : {
272 19542 : hVAD->hangover_cnt_dtx++;
273 : }
274 :
275 164990 : if ( hVAD->hangover_cnt_music < HANGOVER_LONG_MUSIC ) /* inside music HO period */
276 : {
277 5058 : hVAD->hangover_cnt_music++;
278 : }
279 :
280 : /* fast terminate DTX hangover if st->hangover_terminate_flag is set */
281 164990 : if ( hVAD->hangover_terminate_flag == 1 )
282 : {
283 18 : hVAD->hangover_cnt = HANGOVER_LONG;
284 18 : hVAD->hangover_cnt_dtx = HANGOVER_LONG;
285 18 : hVAD->hangover_terminate_flag = 0;
286 : /* only shorten music hangover when low energy frames */
287 18 : if ( hNoiseEst->Etot_lp < 20.0f )
288 : {
289 0 : hVAD->hangover_cnt_music = HANGOVER_LONG_MUSIC;
290 : }
291 : }
292 :
293 164990 : if ( hVAD->hangover_cnt_dtx <= hangover_short_dtx ) /* "hard" hangover */
294 : {
295 7457 : flag_dtx = 1;
296 : }
297 :
298 164990 : if ( hVAD->hangover_cnt_music <= 15 ) /* "hard" hangover */
299 : {
300 4782 : flag_dtx = 1;
301 : }
302 : }
303 :
304 1185802 : if ( flag_dtx != 0 && st->localVAD == 0 )
305 : {
306 27215 : *vad_hover_flag = 1;
307 27215 : if ( rem_dtx_ho != NULL )
308 : {
309 4782 : *rem_dtx_ho = max( hangover_short_dtx - hVAD->hangover_cnt_dtx, 0 );
310 : }
311 : }
312 :
313 1185802 : return flag_dtx;
314 : }
315 :
316 :
317 : /*-----------------------------------------------------------------*
 * wb_vad()
 *
 * Voice Activity Detector
 *
 * Derives per-band SNR values from the two band-energy vectors held in
 * fr_bands[] and from the background noise estimate hNoiseEst->bckr[],
 * folds them into several summed SNR measures, compares those against
 * thresholds that depend on the long-term SNR ( lp_speech - lp_noise )
 * and applies the hangover logic of the WB or NB branch to obtain the
 * returned VAD flag (0 or 1).
 *
 * Fallbacks: a NULL hVAD / hNoiseEst selects st->hVAD / st->hNoiseEst;
 * lp_speech / lp_noise below -100 select st->lp_speech / st->lp_noise.
 *
 * Side effects: writes st->min_band, st->max_band and st->localVAD,
 * updates hangover counters, activity averages and the history shift
 * registers in hVAD, and stores snr_sum / thr1 in st->hSC_VBR when that
 * handle is not NULL.
 *
 * The three *_speech_HO outputs are cleared only when st->Opt_SC_VBR is
 * set; *NB_speech_HO is additionally written in the NB branch whenever
 * st->element_mode == EVS_MONO, independent of Opt_SC_VBR.
 *-----------------------------------------------------------------*/
322 :
323 1217770 : int16_t wb_vad(
324 : Encoder_State *st, /* i/o: encoder state structure */
325 : const float fr_bands[], /* i : per band input energy (contains 2 vectors) */
326 : int16_t *noisy_speech_HO, /* o : SC-VBR noisy speech HO flag */
327 : int16_t *clean_speech_HO, /* o : SC-VBR clean speech HO flag */
328 : int16_t *NB_speech_HO, /* o : SC-VBR NB speech HO flag */
329 : float *snr_sum_he, /* o : Output snr_sum as weighted spectral measure */
330 : int16_t *localVAD_HE_SAD, /* o : HE_SAD decision without hangovers */
331 : int16_t *flag_noisy_speech_snr, /* o : 1 when the long-term SNR is below the noisy-speech limit, 0 otherwise */
332 : VAD_HANDLE hVAD, /* i/o: VAD handle */
333 : NOISE_EST_HANDLE hNoiseEst, /* i/o: Noise estimation handle */
334 : float lp_speech, /* i : long term active speech energy average */
335 : float lp_noise /* i : long term noise energy */
336 : )
337 : {
338 : int16_t i, j, flag, hangover_short;
339 : float snr[NB_BANDS], snr_sum, thr1, thr2, lp_snr, nk, nc, th_clean;
340 : const float *pt1, *pt2, *pt3;
341 : float min_snr, sign_thr;
342 : float fr_enr;
343 : float ftmp, ftmp1;
344 1217770 : float mssnr = 0;
345 : float snr_sumt;
346 : float vad_thr;
347 : int16_t hangover_hd;
348 : int16_t snr_idx;
349 : float delta1, delta2, delta3;
350 : int16_t flag_he1;
351 : float mssnr_hov;
352 : int16_t stmp;
353 : float msnr;
354 : float snr_outlier;
355 : int16_t snr_outlier_index;
356 : float accum_ener_L, accum_ener_H;
357 : float delta4;
358 1217770 : float snr18 = 1.0f, snr19 = 1.0f;
359 : int16_t nb_sig_snr;
360 : float nv;
361 : float snr_sum_HE_SAD;
362 : float sign_thr_HE_SAD, min_snr_HE_SAD;
363 : float nv_ofs;
364 : float thr1_ol;
365 : float snr_sum_ol;
366 : int16_t last_7k2_coder_type;
367 :
/* Optional arguments: NULL handles and values below -100 fall back to the encoder state */
368 1217770 : if ( hNoiseEst == NULL )
369 : {
370 1136044 : hNoiseEst = st->hNoiseEst;
371 : }
372 :
373 1217770 : if ( hVAD == NULL )
374 : {
375 1136044 : hVAD = st->hVAD;
376 : }
377 :
378 1217770 : if ( lp_speech < -100.0f )
379 : {
380 1136044 : lp_speech = st->lp_speech;
381 : }
382 :
383 1217770 : if ( lp_noise < -100.0f )
384 : {
385 1136044 : lp_noise = st->lp_noise;
386 : }
387 :
388 : /*---------------------------------------------------------------------*
389 : * Initialization
390 : *---------------------------------------------------------------------*/
391 :
392 1217770 : snr_outlier = 0;
393 1217770 : snr_outlier_index = 0;
394 1217770 : accum_ener_L = 0;
395 1217770 : accum_ener_H = 0;
396 :
/* band range analysed below: 1..16 for NB input, 0..19 otherwise */
397 1217770 : if ( st->input_bwidth == NB )
398 : {
399 3985 : st->min_band = 1;
400 3985 : st->max_band = 16;
401 : }
402 : else
403 : {
404 1213785 : st->min_band = 0;
405 1213785 : st->max_band = 19;
406 : }
407 :
/* hSC_VBR is only dereferenced when SC-VBR is enabled; -1 is used otherwise */
408 1217770 : if ( st->Opt_SC_VBR )
409 : {
410 0 : last_7k2_coder_type = st->hSC_VBR->last_7k2_coder_type;
411 : }
412 : else
413 : {
414 1217770 : last_7k2_coder_type = -1;
415 : }
416 :
417 : /*---------------------------------------------------------------------*
418 : * set SNR thresholds depending on the input rate
419 : *---------------------------------------------------------------------*/
420 :
421 1217770 : if ( st->max_band == 19 ) /* WB input */
422 : {
423 1213785 : nk = 0.1f;
424 1213785 : nc = 16.1f;
425 1213785 : nv = 2.05f;
426 1213785 : nv_ofs = 1.65f;
427 1213785 : th_clean = TH16_2;
428 1213785 : if ( st->input_bwidth == WB )
429 : {
430 115646 : sign_thr = 1.3f;
431 115646 : min_snr = 0.8f;
432 : }
433 : else
434 : {
435 1098139 : sign_thr = 1.75f;
436 1098139 : min_snr = 0.25f;
437 : }
438 1213785 : sign_thr_HE_SAD = 2.5f;
439 1213785 : min_snr_HE_SAD = 0.2f;
440 : }
441 : else /* NB input */
442 : {
443 3985 : nk = 0.10f;
444 3985 : nc = 16.0f;
445 3985 : nv = 4.00f; /* Was 4.5f but truncated to 4.00 used when converted to short */
446 3985 : nv_ofs = 1.15f;
447 3985 : th_clean = TH8_1;
448 3985 : sign_thr = 1.75f;
449 3985 : min_snr = 0.25f;
450 :
451 3985 : sign_thr_HE_SAD = 2.65f;
452 3985 : min_snr_HE_SAD = 0.05f;
453 : }
454 :
455 1217770 : hangover_short = 0;
456 :
457 :
/* the SC-VBR hangover outputs are only cleared (and hence must be valid pointers) when SC-VBR is enabled */
458 1217770 : if ( st->Opt_SC_VBR )
459 : {
460 0 : *noisy_speech_HO = 0;
461 0 : *clean_speech_HO = 0;
462 0 : *NB_speech_HO = 0;
463 : }
464 :
465 : /*---------------------------------------------------------------------*
466 : * compute SNR for each band & total
467 : *---------------------------------------------------------------------*/
468 :
/* pt1 and pt2 walk the first and the second energy vector of fr_bands[] */
469 1217770 : pt1 = fr_bands;
470 1217770 : pt2 = fr_bands + NB_BANDS;
471 1217770 : snr_sum = 0.0f;
472 1217770 : *snr_sum_he = 0.0f;
473 1217770 : snr_sumt = 0;
474 1217770 : mssnr_hov = 0;
475 1217770 : snr_sum_HE_SAD = 0.0f;
476 1217770 : lp_snr = lp_speech - lp_noise;
477 :
/* classify the long-term SNR: 0 = above 24, 1 = above 18, 2 = otherwise */
478 1217770 : if ( lp_snr > 24.0f )
479 : {
480 1064039 : snr_idx = 0;
481 : }
482 153731 : else if ( lp_snr > 18 )
483 : {
484 21284 : snr_idx = 1;
485 : }
486 : else
487 : {
488 132447 : snr_idx = 2;
489 : }
490 :
/* per-class settings: exponent stmp, per-band-group offsets delta1..4 and the threshold vad_thr
   that the modified segmental SNR (mssnr) is compared against */
491 1217770 : if ( snr_idx == 0 )
492 : {
493 1064039 : stmp = 6;
494 1064039 : delta1 = 0.0f;
495 1064039 : delta2 = 0.0f;
496 1064039 : delta3 = 0.0f;
497 1064039 : delta4 = 0.0f;
498 1064039 : vad_thr = 2.4f * lp_snr - 42.2f;
499 1064039 : vad_thr = min( vad_thr, 80 );
500 : }
501 153731 : else if ( snr_idx == 1 )
502 : {
503 21284 : stmp = 6;
504 21284 : delta1 = 0.1f;
505 21284 : delta2 = 0.2f;
506 21284 : delta3 = 0.2f;
507 21284 : delta4 = 0.2f;
508 21284 : vad_thr = 2.4f * lp_snr - 40.2f;
509 21284 : vad_thr = min( vad_thr, 80 );
510 : }
511 : else
512 : {
513 132447 : stmp = 9;
514 132447 : delta1 = 0.2f;
515 132447 : delta2 = 0.4f;
516 132447 : delta3 = 0.3f;
517 132447 : delta4 = 0.4f;
518 132447 : vad_thr = 2.5f * lp_snr - 10.0f;
519 132447 : vad_thr = max( vad_thr, 1 );
520 : }
521 1217770 : pt3 = hNoiseEst->bckr;
522 1217770 : nb_sig_snr = 20;
523 :
/* NOTE(review): pt1, pt2 and pt3 start at element 0 and advance once per iteration, whereas
   enrO[i] and bckr[i] are indexed by i. With NB input (min_band == 1) the values read through
   the pointers therefore come from index i-1 - confirm that this offset is intended. */
524 25557230 : for ( i = st->min_band; i <= st->max_band; i++ )
525 : {
526 24339460 : ftmp = *pt1++;
527 24339460 : ftmp1 = *pt2++;
/* smoothed band energy: 20% previous value (enrO) and 40% of each of the two input vectors */
528 24339460 : fr_enr = ( 0.2f * hNoiseEst->enrO[i] + 0.4f * ftmp + 0.4f * ftmp1 );
529 :
/* SNR used for the log-domain measures: the weights shift towards the second vector
   when it is not smaller than the first one */
530 24339460 : if ( ftmp > ftmp1 )
531 : {
532 10783066 : snr[i] = ( 0.2f * hNoiseEst->enrO[i] + 0.4f * ftmp + 0.4f * ftmp1 ) / *pt3++;
533 : }
534 : else
535 : {
536 13556394 : snr[i] = ( 0.2f * hNoiseEst->enrO[i] + 0.3f * ftmp + 0.5f * ftmp1 ) / *pt3++;
537 : }
538 :
/* nb_sig_snr starts at 20 and loses one for every analysed band with SNR below 2 */
539 24339460 : if ( snr[i] < 2.0f )
540 : {
541 4739611 : nb_sig_snr--;
542 : }
543 :
/* floor at 1 so that the logarithm below is never negative */
544 24339460 : if ( snr[i] < 1 )
545 : {
546 1709562 : snr[i] = 1;
547 : }
548 :
549 24339460 : snr[i] = (float) log10( snr[i] );
550 24339460 : snr_sumt += snr[i];
/* add the band-group offset selected by the long-term SNR class */
551 24339460 : if ( i < 2 )
552 : {
553 2431555 : ftmp = snr[i] + delta1;
554 : }
555 21907905 : else if ( i < 7 )
556 : {
557 6088850 : ftmp = snr[i] + delta2;
558 : }
559 15819055 : else if ( i < 18 )
560 : {
561 13391485 : ftmp = snr[i] + delta3;
562 : }
563 : else
564 : {
565 2427570 : ftmp = snr[i] + delta4;
566 : }
/* ftmp1 is the variant used for the soft-hangover measure: bands below 7 get an extra 0.4 */
567 24339460 : ftmp1 = ftmp;
568 24339460 : if ( i < 7 )
569 : {
570 8520405 : ftmp1 = ftmp + 0.4f;
571 : }
572 24339460 : ftmp = min( ftmp, 2.0f );
573 24339460 : ftmp1 = min( ftmp1, 2.0f );
/* msnr = ftmp raised to the power stmp, accumulated into mssnr */
574 24339460 : msnr = 1;
575 178323040 : for ( j = 0; j < stmp; j++ )
576 : {
577 153983580 : msnr *= ftmp;
578 : }
579 24339460 : mssnr += msnr;
/* remember the contributions of bands 18 and 19 for the high-band boost after the loop */
580 24339460 : if ( i == 18 )
581 : {
582 1213785 : snr18 = msnr;
583 : }
584 23125675 : else if ( i == 19 )
585 : {
586 1213785 : snr19 = msnr;
587 : }
/* msnr = ftmp1 raised to the power stmp, accumulated into mssnr_hov */
588 24339460 : msnr = 1;
589 178323040 : for ( j = 0; j < stmp; j++ )
590 : {
591 153983580 : msnr *= ftmp1;
592 : }
593 24339460 : mssnr_hov += msnr;
/* from here on snr[i] holds the linear SNR of the smoothed band energy */
594 24339460 : snr[i] = fr_enr / hNoiseEst->bckr[i];
595 :
596 24339460 : sign_thr_snr_acc( &snr_sum_HE_SAD, snr[i], sign_thr_HE_SAD, min_snr_HE_SAD );
597 24339460 : sign_thr_snr_acc( &snr_sum, snr[i], sign_thr, min_snr );
598 :
599 : /* To make snr[] compatible with older versions where snr[i] >= 1
600 : also this could be removed if this no longer is a requirement */
601 24339460 : if ( snr[i] < 1.0f )
602 : {
603 1749490 : snr[i] = 1.0f;
604 : }
605 : /* accumulate background noise energy in bands [0-2] and in bands [3-19]*/
606 24339460 : if ( i < 3 )
607 : {
608 3649325 : accum_ener_L = accum_ener_L + hNoiseEst->bckr[i];
609 : }
610 : else
611 : {
612 20690135 : accum_ener_H = accum_ener_H + hNoiseEst->bckr[i];
613 : }
614 :
615 : /* identify the outlier band */
616 24339460 : if ( snr[i] > snr_outlier )
617 : {
618 3431283 : snr_outlier = snr[i];
619 3431283 : snr_outlier_index = i;
620 : }
621 : }
622 :
/* snr[18] and snr[19] are only read when max_band == 19, i.e. when the loop above has written them */
623 1217770 : if ( ( st->max_band == 19 ) && ( snr[18] > 5.0f ) && ( snr[19] > 5.0f ) )
624 : {
625 902504 : ftmp = ( mssnr + 3 * ( snr18 + snr19 ) ) * 0.77f;
626 902504 : if ( ftmp > mssnr )
627 : {
628 773898 : mssnr = ftmp;
629 : }
630 : }
631 315266 : else if ( snr_idx != 0 && nb_sig_snr > 13 )
632 : {
633 21596 : if ( 2.5f * lp_snr - 15.5f > 0 )
634 : {
635 20670 : mssnr += 2.5f * lp_snr - 15.5f;
636 : }
637 : }
638 :
639 :
640 : /* Outlier handling works on a separate copy of snr_sum (snr_sum_ol) */
641 1217770 : snr_sum_ol = snr_sum;
642 1217770 : if ( st->max_band == 19 && snr_outlier < MAX_SNR_OUTLIER_3 && snr_outlier_index > 3 && snr_outlier_index < MAX_SNR_OUTLIER_IND ) /* Update the total SNR only for WB signals */
643 : {
644 55585 : if ( ( accum_ener_L > OUTLIER_THR_1 * accum_ener_H ) || ( snr_outlier < MAX_SNR_OUTLIER_1 ) )
645 : {
646 47751 : snr_sum_ol = SNR_OUTLIER_WGHT_1 * ( snr_sum_ol - snr_outlier );
647 : }
648 7834 : else if ( ( accum_ener_L > OUTLIER_THR_2 * accum_ener_H ) || ( snr_outlier < MAX_SNR_OUTLIER_2 ) )
649 : {
650 4502 : snr_sum_ol = SNR_OUTLIER_WGHT_2 * ( snr_sum_ol - snr_outlier );
651 : }
652 : else
653 : {
654 3332 : snr_sum_ol = SNR_OUTLIER_WGHT_3 * ( snr_sum_ol - snr_outlier );
655 : }
656 : }
657 :
/* running average (linear domain) of the outlier-compensated sum, kept across calls */
658 1217770 : hVAD->snr_sum_vad = 0.5f * hVAD->snr_sum_vad + 0.5f * snr_sum_ol;
659 :
/* convert the summed measures to dB */
660 1217770 : snr_sum_ol = 10.0f * (float) log10( snr_sum_ol );
661 1217770 : snr_sum = snr_sum_ol; /* for NB no outlier modification */
662 :
663 1217770 : snr_sum_HE_SAD = 10.0f * (float) log10( snr_sum_HE_SAD );
664 1217770 : *snr_sum_he = snr_sum_HE_SAD;
665 :
666 : /*---------------------------------------------------------------------*
667 : * compute threshold for VAD decision
668 : *---------------------------------------------------------------------*/
669 :
670 1217770 : lp_snr = lp_speech - lp_noise; /* long-term SNR */
671 :
/* below sign_dyn_lp the long-term SNR is raised by 1, but not beyond sign_dyn_lp */
672 1217770 : if ( lp_snr < hNoiseEst->sign_dyn_lp )
673 : {
674 245961 : lp_snr += 1;
675 :
676 245961 : if ( lp_snr > hNoiseEst->sign_dyn_lp )
677 : {
678 10713 : lp_snr = hNoiseEst->sign_dyn_lp;
679 : }
680 : }
681 :
682 1217770 : thr1 = nk * lp_snr + nc + nv * ( hNoiseEst->Etot_v_h2 - nv_ofs ); /* threshold as a linear function of long-term SNR */
683 :
684 1217770 : if ( st->element_mode > EVS_MONO && hNoiseEst->first_noise_updt_cnt < 100 )
685 : {
686 : /* lower threshold during warmup time */
687 868810 : thr1 -= 10.0f;
688 868810 : vad_thr = 0.f;
689 : }
690 :
691 1217770 : if ( lp_snr > 20.0f )
692 : {
/* skipped during the warmup handled above */
693 1075077 : if ( st->element_mode == EVS_MONO || hNoiseEst->first_noise_updt_cnt >= 100 )
694 : {
695 : /* increase the threshold when SNR is high */
696 218622 : thr1 = thr1 + 0.3f * ( lp_snr - 20.0f );
697 218622 : if ( st->max_band == 16 && lp_snr > 40 && thr1 > 24.1f && lp_speech < 45.0f )
698 : {
699 1690 : thr1 = 24.1f;
700 : }
701 : }
702 : }
703 :
704 : /*---------------------------------------------------------------------*
705 : * WB input
706 : * SNR threshold computing
707 : * Hangover control & final VAD decision
708 : *---------------------------------------------------------------------*/
709 :
710 1217770 : if ( st->input_bwidth != NB )
711 : {
712 : /* Outlier Detection first calculates thr1_ol and snr_sum_ol instead of
713 : thr1 and snr_sum */
714 :
715 1213785 : thr1_ol = thr1;
716 1213785 : if ( lp_snr < th_clean )
717 : {
718 212569 : hangover_short = 4;
/* the coder type consulted is last_7k2_coder_type with SC-VBR and st->last_coder_type without it */
719 212569 : if ( ( snr_outlier_index <= 4 && ( st->last_coder_type > UNVOICED ) && !st->Opt_SC_VBR ) ||
720 9933 : ( snr_outlier_index <= 4 && ( last_7k2_coder_type > UNVOICED ) && st->Opt_SC_VBR ) )
721 : {
722 88755 : thr1_ol = thr1 - 1.0f;
723 88755 : snr_sum_ol = 10.0f * (float) log10( hVAD->snr_sum_vad );
724 : }
725 123814 : else if ( ( ( st->last_coder_type <= UNVOICED ) && ( snr_outlier < MAX_SNR_OUTLIER_2 ) && !st->Opt_SC_VBR ) || ( ( last_7k2_coder_type <= UNVOICED ) && ( snr_outlier < MAX_SNR_OUTLIER_2 ) && st->Opt_SC_VBR ) )
726 : {
727 15647 : thr1_ol = thr1 + (float) ( 1.0f - 0.04f * snr_outlier );
728 : }
729 : else
730 : {
731 108167 : thr1_ol = thr1 + max( 0, (float) ( 0.6f - 0.01f * snr_outlier ) );
732 : }
733 : }
734 : else
735 : {
/* both branches currently assign the same value */
736 1001216 : if ( st->Opt_SC_VBR )
737 : {
738 0 : hangover_short = 3;
739 : }
740 : else
741 : {
742 1001216 : hangover_short = 3;
743 : }
744 : }
745 :
746 : /* The use of outlier detection had been removed by accident at some point */
747 1213785 : snr_sum = snr_sum_ol;
748 1213785 : thr1 = thr1_ol;
749 :
750 : /* DTX HANGOVER ADDITION MOVED TO pre_proc() */
751 :
/* he1 decision on mssnr; the st->localVAD value written in this block is provisional,
   it is cleared and decided again after the background stationarity update below */
752 1213785 : flag_he1 = 0;
753 1213785 : st->localVAD = 0;
754 1213785 : if ( mssnr > vad_thr )
755 : {
756 1038027 : st->localVAD = 1; /* he1 primary decision */
757 1038027 : flag_he1 = 1;
758 1038027 : hVAD->nb_active_frames_he1++; /* Counter of consecutive active speech frames */
759 1038027 : if ( hVAD->nb_active_frames_he1 >= ACTIVE_FRAMES )
760 : {
761 1026686 : hVAD->nb_active_frames_he1 = ACTIVE_FRAMES;
762 1026686 : hVAD->hangover_cnt_he1 = 0; /* Reset the counter of hangover frames after at least "active_frames" speech frames */
763 : }
764 : /* inside HO period */
765 1038027 : if ( hVAD->hangover_cnt_he1 < HANGOVER_LONG_HE && hVAD->hangover_cnt_he1 != 0 )
766 : {
767 10657 : hVAD->hangover_cnt_he1++;
768 : }
769 :
770 1038027 : if ( hVAD->soft_hangover > 0 )
771 : {
772 316725 : hVAD->soft_hangover--;
773 : }
774 : }
775 : else
776 : {
777 : /* Reset the counter of speech frames necessary to start hangover algorithm */
778 175758 : hVAD->nb_active_frames_he1 = 0;
779 : }
780 :
/* reload the soft hangover from hangover_sf_tbl while voiced_burst exceeds 3;
   entries snr_idx + 3 are used when bcg_flux is below 40 */
781 1213785 : if ( hVAD->voiced_burst > 3 )
782 : {
783 318566 : if ( hVAD->bcg_flux < 40 )
784 : {
785 29754 : hVAD->soft_hangover = hangover_sf_tbl[snr_idx + 3];
786 : }
787 : else
788 : {
789 288812 : hVAD->soft_hangover = hangover_sf_tbl[snr_idx];
790 : }
791 : }
792 :
793 :
/* hard hangover length for he1; reduced to half plus one when bcg_flux is below 40 */
794 1213785 : hangover_hd = hangover_hd_tbl[snr_idx];
795 :
796 1213785 : if ( hVAD->bcg_flux < 40 )
797 : {
798 265840 : hangover_hd = ( hangover_hd >> 1 ) + 1;
799 : }
800 :
/* soft hangover: an inactive he1 decision is kept active while the relaxed measure
   mssnr_hov still exceeds vad_thr, otherwise the soft hangover is cancelled */
801 1213785 : if ( flag_he1 == 0 && hVAD->soft_hangover > 0 )
802 : {
803 2810 : if ( mssnr_hov > vad_thr )
804 : {
805 1292 : flag_he1 = 1;
806 1292 : hVAD->soft_hangover--;
807 : }
808 : else
809 : {
810 1518 : hVAD->soft_hangover = 0;
811 : }
812 :
813 2810 : if ( hVAD->soft_hangover < 0 )
814 : {
815 0 : hVAD->soft_hangover = 0;
816 : }
817 : }
818 :
/* hard hangover for he1, only once no soft hangover is left */
819 1213785 : if ( flag_he1 == 0 && hVAD->hangover_cnt_he1 < hangover_hd && hVAD->soft_hangover == 0 )
820 : {
821 7559 : flag_he1 = 1;
822 7559 : hVAD->hangover_cnt_he1++;
823 : }
824 :
825 : /* Calculate background stationarity */
/* bcg_flux follows snr_sumt with a first-order recursion; the step towards snr_sumt is limited
   (+50/-30 while bcg_flux_init is positive, +10/-10 afterwards) and faster coefficients are used
   while bcg_flux_init is positive. bcg_flux_init is decremented on every update. */
826 1213785 : if ( flag_he1 == 0 && hNoiseEst->first_noise_updt > 0 )
827 : {
828 159332 : if ( snr_sumt > hVAD->bcg_flux )
829 : {
830 138 : if ( hVAD->bcg_flux_init-- > 0 )
831 : {
832 0 : if ( snr_sumt > hVAD->bcg_flux + 50 )
833 : {
834 0 : hVAD->bcg_flux = 0.9f * hVAD->bcg_flux + ( 1 - 0.9f ) * ( hVAD->bcg_flux + 50 );
835 : }
836 : else
837 : {
838 0 : hVAD->bcg_flux = 0.9f * hVAD->bcg_flux + ( 1 - 0.9f ) * snr_sumt;
839 : }
840 : }
841 : else
842 : {
843 138 : if ( snr_sumt > hVAD->bcg_flux + 10 )
844 : {
845 0 : hVAD->bcg_flux = 0.99f * hVAD->bcg_flux + ( 1 - 0.99f ) * ( hVAD->bcg_flux + 10 );
846 : }
847 : else
848 : {
849 138 : hVAD->bcg_flux = 0.99f * hVAD->bcg_flux + ( 1 - 0.99f ) * snr_sumt;
850 : }
851 : }
852 : }
853 : else
854 : {
855 159194 : if ( hVAD->bcg_flux_init-- > 0 )
856 : {
857 27445 : if ( snr_sumt < hVAD->bcg_flux - 30 )
858 : {
859 17583 : hVAD->bcg_flux = 0.95f * hVAD->bcg_flux + ( 1 - 0.95f ) * ( hVAD->bcg_flux - 30 );
860 : }
861 : else
862 : {
863 9862 : hVAD->bcg_flux = 0.95f * hVAD->bcg_flux + ( 1 - 0.95f ) * snr_sumt;
864 : }
865 : }
866 : else
867 : {
868 131749 : if ( snr_sumt < hVAD->bcg_flux - 10 )
869 : {
870 1341 : hVAD->bcg_flux = 0.9992f * hVAD->bcg_flux + ( 1 - 0.9992f ) * ( hVAD->bcg_flux - 10 );
871 : }
872 : else
873 : {
874 130408 : hVAD->bcg_flux = 0.9992f * hVAD->bcg_flux + ( 1 - 0.9992f ) * snr_sumt;
875 : }
876 : }
877 : }
878 :
/* the post-decrement above may leave the init counter negative */
879 159332 : if ( hVAD->bcg_flux_init < 0 )
880 : {
881 131887 : hVAD->bcg_flux_init = 0;
882 : }
883 : }
884 :
/* final decision of the non-NB branch: summed SNR above thr1 AND an active he1 decision */
885 1213785 : flag = 0;
886 1213785 : st->localVAD = 0;
887 :
888 1213785 : if ( ( snr_sum > thr1 && flag_he1 == 1 ) ) /* Speech present */
889 : {
890 1020032 : flag = 1;
891 1020032 : st->localVAD = 1;
892 1020032 : hVAD->nb_active_frames++; /* Counter of consecutive active speech frames */
893 1020032 : if ( hVAD->nb_active_frames >= ACTIVE_FRAMES )
894 : {
895 1005450 : hVAD->nb_active_frames = ACTIVE_FRAMES;
896 1005450 : hVAD->hangover_cnt = 0; /* Reset the counter of hangover frames after at least "active_frames" speech frames */
897 : }
898 :
899 : /* inside HO period */
900 1020032 : if ( hVAD->hangover_cnt < HANGOVER_LONG && hVAD->hangover_cnt != 0 )
901 : {
902 9225 : hVAD->hangover_cnt++;
903 : }
904 : }
905 : else
906 : {
907 : /* Reset the counter of speech frames necessary to start hangover algorithm */
908 193753 : hVAD->nb_active_frames = 0;
909 193753 : if ( hVAD->hangover_cnt < HANGOVER_LONG ) /* inside HO period */
910 : {
911 39025 : hVAD->hangover_cnt++;
912 : }
913 :
/* flag is kept at 1 during the hard hangover while st->localVAD stays 0 */
914 193753 : if ( hVAD->hangover_cnt <= hangover_short ) /* "hard" hangover */
915 : {
916 : /* send the extra 3 HO frames to NELP */
917 18058 : if ( st->element_mode == EVS_MONO && ( lp_snr < th_clean ) && ( st->Opt_SC_VBR ) && ( hVAD->hangover_cnt >= 2 ) )
918 : {
919 0 : *noisy_speech_HO = 1;
920 : }
921 :
922 18058 : if ( st->element_mode == EVS_MONO && ( lp_snr >= th_clean ) && ( st->Opt_SC_VBR ) && ( hVAD->hangover_cnt >= 2 ) )
923 : {
924 0 : *clean_speech_HO = 1;
925 : }
926 :
927 18058 : flag = 1;
928 : }
929 : }
930 :
931 : /* localVAD and vad_flag for HE-SAD - in parallel with normal localVAD and vad_flag */
932 1213785 : *localVAD_HE_SAD = 0;
933 1213785 : if ( snr_sum_HE_SAD > thr1 && ( flag_he1 == 1 ) ) /* Speech present */
934 : {
935 1011275 : *localVAD_HE_SAD = 1;
936 : }
937 : }
938 :
939 : /*---------------------------------------------------------------------*
940 : * NB input
941 : * SNR threshold computing
942 : * Hangover control & final VAD decision
943 : *---------------------------------------------------------------------*/
944 :
945 : else /* NB input */
946 : {
947 : /* Add localVAD_HE_SAD also for NB operation for use with speech music classifier */
948 3985 : *localVAD_HE_SAD = 0;
949 3985 : if ( snr_sum_HE_SAD > thr1 )
950 : {
951 1807 : *localVAD_HE_SAD = 1;
952 : }
953 :
954 3985 : st->localVAD = 0; /* init needed in NB, otherwise it can be undefined */
955 3985 : if ( snr_sum > thr1 ) /* Speech present */
956 : {
957 1831 : hVAD->nb_active_frames++; /* Counter of consecutive active speech frames */
958 1831 : if ( hVAD->nb_active_frames >= ACTIVE_FRAMES )
959 : {
960 1457 : hVAD->nb_active_frames = ACTIVE_FRAMES;
961 1457 : hVAD->hangover_cnt = 0; /* Reset the counter of hangover frames after at least "active_frames" speech frames */
962 : }
963 :
964 1831 : st->localVAD = 1;
965 : }
966 : else
967 : {
968 2154 : hVAD->nb_active_frames = 0; /* Reset the counter of speech frames necessary to start hangover algorithm */
969 : }
970 :
/* inside the NB hangover period thr1 is lowered for low long-term SNR; the lowered thr1 is
   what the flag decision below (and thr2) is based on */
971 3985 : if ( hVAD->hangover_cnt < HANGOVER_LONG_NB )
972 : {
973 2503 : hVAD->hangover_cnt++;
974 2503 : if ( lp_snr < 19.0f ) /* very low SNR */
975 : {
976 0 : thr1 -= 5.2f;
977 : }
978 2503 : else if ( lp_snr < 35.0f ) /* low SNR */
979 : {
980 0 : thr1 -= 2.0f;
981 : }
982 : }
983 :
/* thr2 is a second, lower threshold; its distance to thr1 depends on DTX and on the long-term SNR */
984 3985 : if ( st->Opt_DTX_ON )
985 : {
986 3985 : if ( lp_snr < th_clean )
987 : {
988 0 : thr2 = thr1 - 1.10f;
989 : }
990 : else
991 : {
992 3985 : thr2 = thr1 - 1.5f;
993 : }
994 : }
995 : else
996 : {
997 0 : if ( lp_snr < th_clean )
998 : {
999 0 : thr2 = thr1 - 1.3f;
1000 : }
1001 : else
1002 : {
1003 0 : thr2 = thr1 - 1.5f;
1004 : }
1005 : }
1006 :
1007 3985 : flag = 0;
1008 3985 : if ( snr_sum > thr1 ) /* Speech present */
1009 : {
1010 1831 : flag = 1;
1011 : }
1012 :
/* between thr2 and thr1 the flag is raised but st->localVAD is forced to 0 */
1013 3985 : if ( ( snr_sum < thr1 ) && ( snr_sum > thr2 ) ) /* Speech present */
1014 : {
1015 142 : flag = 1;
1016 142 : st->localVAD = 0;
1017 :
/* written regardless of Opt_SC_VBR, so NB_speech_HO has to be a valid pointer here */
1018 142 : if ( st->element_mode == EVS_MONO )
1019 : {
1020 0 : *NB_speech_HO = 1;
1021 : }
1022 : }
1023 :
1024 : /* Need to handle the case when switching from WB -> NB */
1025 : }
1026 :
1027 1217770 : if ( st->input_bwidth != NB )
1028 : {
1029 1213785 : *flag_noisy_speech_snr = ( lp_snr < TH16_2_NFLAG ); /*original threshold: 35dB*/
1030 : }
1031 : else
1032 : {
1033 3985 : *flag_noisy_speech_snr = ( lp_snr < TH8_1_NFLAG ); /*original threshold: 20dB, not yet tested!*/
1034 : }
1035 :
1036 : /* SC-VBR */
1037 1217770 : if ( st->hSC_VBR != NULL )
1038 : {
1039 3100 : st->hSC_VBR->vadsnr = snr_sum;
1040 3100 : st->hSC_VBR->vadnoise = thr1;
1041 : }
1042 :
1043 : /* Updates */
/* quick (0.2) and slow (0.01) averages of st->localVAD; prim_act follows the smaller of the two */
1044 1217770 : hVAD->prim_act_quick = 0.2f * ( st->localVAD ) + ( 1.0f - 0.2f ) * hVAD->prim_act_quick;
1045 1217770 : hVAD->prim_act_slow = 0.01f * ( st->localVAD ) + ( 1.0f - 0.01f ) * hVAD->prim_act_slow;
1046 1217770 : if ( hVAD->prim_act_quick <= hVAD->prim_act_slow )
1047 : {
1048 200173 : hVAD->prim_act = 0.1f * hVAD->prim_act_quick + ( 1.0f - 0.1f ) * hVAD->prim_act;
1049 : }
1050 : else
1051 : {
1052 1017597 : hVAD->prim_act = 0.1f * hVAD->prim_act_slow + ( 1.0f - 0.1f ) * hVAD->prim_act;
1053 : }
1054 :
/* same scheme for the HE-SAD decision */
1055 1217770 : hVAD->prim_act_quick_he = 0.2f * *localVAD_HE_SAD + ( 1.0f - 0.2f ) * hVAD->prim_act_quick_he;
1056 1217770 : hVAD->prim_act_slow_he = 0.01f * *localVAD_HE_SAD + ( 1.0f - 0.01f ) * hVAD->prim_act_slow_he;
1057 :
1058 1217770 : if ( hVAD->prim_act_quick_he <= hVAD->prim_act_slow_he )
1059 : {
1060 207062 : hVAD->prim_act_he = 0.1f * hVAD->prim_act_quick_he + ( 1.0f - 0.1f ) * hVAD->prim_act_he;
1061 : }
1062 : else
1063 : {
1064 1010708 : hVAD->prim_act_he = 0.1f * hVAD->prim_act_slow_he + ( 1.0f - 0.1f ) * hVAD->prim_act_he;
1065 : }
1066 :
1067 :
/* History of flag: vad_flag_reg_H / vad_flag_reg_L act as one shift register. The entry about to
   leave it (bit 18 of the H part) is removed from vad_flag_cnt_50, bit 30 of L is carried into
   bit 0 of H, and the new flag is shifted into bit 0 of L and added to the count. */
1068 1217770 : if ( ( hVAD->vad_flag_reg_H & (int32_t) 0x40000L ) != 0 ) /* 0x40000L = 0x01L << 18 */
1069 : {
1070 871746 : hVAD->vad_flag_cnt_50 = hVAD->vad_flag_cnt_50 - 1;
1071 : }
1072 :
1073 1217770 : hVAD->vad_flag_reg_H = ( hVAD->vad_flag_reg_H & (int32_t) 0x3fffffffL ) << 1;
1074 :
1075 1217770 : if ( ( hVAD->vad_flag_reg_L & (int32_t) 0x40000000L ) != 0 )
1076 : {
1077 909434 : hVAD->vad_flag_reg_H = hVAD->vad_flag_reg_H | 0x01L;
1078 : }
1079 :
1080 1217770 : hVAD->vad_flag_reg_L = ( hVAD->vad_flag_reg_L & (int32_t) 0x3fffffffL ) << 1;
1081 :
1082 1217770 : if ( flag ) /* should not include the extra DTX hangover */
1083 : {
1084 1040063 : hVAD->vad_flag_reg_L = hVAD->vad_flag_reg_L | 0x01L;
1085 1040063 : hVAD->vad_flag_cnt_50 = hVAD->vad_flag_cnt_50 + 1;
1086 : }
1087 :
/* History of st->localVAD: the entry at bit 15 of vad_prim_reg is removed from vad_prim_cnt_16
   before the register is shifted and the new decision is inserted at bit 0 */
1088 1217770 : if ( ( hVAD->vad_prim_reg & (int32_t) 0x8000L ) != 0 ) /* 0x8000L = 1L << 15 */
1089 : {
1090 934962 : hVAD->vad_prim_cnt_16 = hVAD->vad_prim_cnt_16 - 1;
1091 : }
1092 :
1093 1217770 : hVAD->vad_prim_reg = ( hVAD->vad_prim_reg & (int32_t) 0x3fffffffL ) << 1;
1094 :
1095 1217770 : if ( st->localVAD )
1096 : {
1097 1021863 : hVAD->vad_prim_reg = hVAD->vad_prim_reg | 0x01L;
1098 1021863 : hVAD->vad_prim_cnt_16 = hVAD->vad_prim_cnt_16 + 1;
1099 : }
1100 :
1101 1217770 : return flag;
1102 : }
|