Line data Source code
1 : /******************************************************************************************************
2 :
3 : (C) 2022-2025 IVAS codec Public Collaboration with portions copyright Dolby International AB, Ericsson AB,
4 : Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD.,
5 : Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange,
6 : Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other
7 : contributors to this repository. All Rights Reserved.
8 :
9 : This software is protected by copyright law and by international treaties.
10 : The IVAS codec Public Collaboration consisting of Dolby International AB, Ericsson AB,
11 : Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD.,
12 : Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange,
13 : Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other
14 : contributors to this repository retain full ownership rights in their respective contributions in
15 : the software. This notice grants no license of any kind, including but not limited to patent
16 : license, nor is any license granted by implication, estoppel or otherwise.
17 :
18 : Contributors are required to enter into the IVAS codec Public Collaboration agreement before making
19 : contributions.
20 :
21 : This software is provided "AS IS", without any express or implied warranties. The software is in the
22 : development stage. It is intended exclusively for experts who have experience with such software and
23 : solely for the purpose of inspection. All implied warranties of non-infringement, merchantability
24 : and fitness for a particular purpose are hereby disclaimed and excluded.
25 :
26 : Any dispute, controversy or claim arising under or in relation to providing this software shall be
27 : submitted to and settled by the final, binding jurisdiction of the courts of Munich, Germany in
28 : accordance with the laws of the Federal Republic of Germany excluding its conflict of law rules and
29 : the United Nations Convention on Contracts on the International Sales of Goods.
30 :
31 : *******************************************************************************************************/
32 :
33 : /*====================================================================================
34 : EVS Codec 3GPP TS26.443 Nov 04, 2021. Version 12.14.0 / 13.10.0 / 14.6.0 / 15.4.0 / 16.3.0
35 : ====================================================================================*/
36 :
37 : #include <stdint.h>
38 : #include "options.h"
39 : #ifdef DEBUGGING
40 : #include "debug.h"
41 : #endif
42 : #include <math.h>
43 : #include "cnst.h"
44 : #include "prot.h"
45 : #include "wmc_auto.h"
46 :
47 : /*-------------------------------------------------------------------*
48 : * Local constants
49 : *-------------------------------------------------------------------*/
50 :
51 : #define L_ENR ( NB_SSF + 2 ) /* number of sub-subframe energy entries scanned by find_ener_decrease(); also the minimum room required after the spike index in find_uv() */
52 : #define VOI_THRLD 0.2f /* each voicing_fr[] value must be below this threshold in the RF-mode check of find_uv() */
53 :
54 :
55 : /*-------------------------------------------------------------------*
56 : * find_ener_decrease()
57 : *
58 : * Find maximum energy ratio between short sub-subframes in case
59 : * energy is trailing off after a spike: within a window of L_ENR entries starting at ind_deltaMax + 2, the peak of the initial rising run is divided by the smallest entry that follows it
60 : *-------------------------------------------------------------------*/
61 :
62 : /*! r: maximum energy ratio, i.e. maxEnr / ( minEnr + 1.0e5f ) */
63 303181 : static float find_ener_decrease(
64 : const int16_t ind_deltaMax, /* i : index of the beginning of maximum energy search; the scan itself starts at ind_deltaMax + 2 */
65 : const float *pt_enr_ssf /* i : Pointer to the energy buffer; entries ind_deltaMax + 2 .. ind_deltaMax + 1 + L_ENR are read */
66 : )
67 : {
68 : int16_t i, j, end, flag;
69 : float maxEnr, minEnr, dE2;
70 :
71 303181 : dE2 = 0.0f; /* placeholder only: overwritten unconditionally before the return */
72 303181 : j = ind_deltaMax + 2;
73 303181 : end = j + L_ENR; /* one past the last entry examined */
74 303181 : maxEnr = pt_enr_ssf[j];
75 303181 : j++;
76 303181 : flag = 0; /* set to 1 at the first entry that does not exceed the running maximum */
77 3031810 : for ( i = j; i < end; i++ )
78 : {
79 2728629 : if ( pt_enr_ssf[i] > maxEnr && flag == 0 )
80 : {
81 126188 : maxEnr = pt_enr_ssf[i];
82 126188 : j++; /* move the start of the minimum search past the rising run */
83 : }
84 : else
85 : {
86 2602441 : flag = 1; /* rising run is over: maxEnr and j stay frozen for the remaining iterations */
87 : }
88 : }
89 : /* j now indexes the first entry after the rising run, or equals end if every entry rose */
90 303181 : minEnr = maxEnr;
91 2905622 : for ( i = j; i < end; i++ )
92 : {
93 2602441 : if ( pt_enr_ssf[i] < minEnr )
94 : {
95 766141 : minEnr = pt_enr_ssf[i];
96 : }
97 : }
98 : /* NOTE(review): the 1.0e5f offset presumably keeps the ratio bounded when the minimum energy is very small - confirm against the codec specification */
99 303181 : dE2 = maxEnr / ( minEnr + 1.0e5f );
100 :
101 303181 : return dE2;
102 : }
103 :
104 :
105 : /*-------------------------------------------------------------------*
106 : * find_uv()
107 : *
108 : * Decision about coder type
109 : *-------------------------------------------------------------------*/
110 :
111 : /*! r: coding type */
112 15631381 : int16_t find_uv(
113 : Encoder_State *st, /* i/o: encoder state structure */
114 : const float *pitch_fr, /* i : pointer to adjusted fractional pitch (4 val.) */
115 : const float *voicing_fr, /* i : refined correlation for each subframes */
116 : const float *speech, /* i : pointer to speech signal for E computation */
117 : const float *ee, /* i : lf/hf Energy ratio for present frame */
118 : float *dE1X, /* o : sudden energy increase for S/M classifier */
119 : const float corr_shift, /* i : normalized correlation correction in noise */
120 : const float relE, /* i : relative frame energy */
121 : const float Etot, /* i : total energy */
122 : const float hp_E[], /* i : energy in HF */
123 : int16_t *flag_spitch, /* i/o: flag to indicate very short stable pitch and high correlation */
124 : const int16_t last_core_orig, /* i : original last core */
125 : STEREO_CLASSIF_HANDLE hStereoClassif /* i/o: stereo classifier structure */
126 : )
127 : {
128 : const float *pt_speech;
129 : int16_t i, coder_type, ind_deltaMax, tmp_offset_flag, nb_cond, flag_low_relE;
130 : float fac, mean_voi3, mean_ee, relE_thres;
131 : float enr_ssf[4 * NB_SSF + 2];
132 : float dE1, *pt_enr_ssf, *pt_enr_ssf1, dE2, dE3, dE2_th, ee0_th, ee1_th, voi_th;
133 : float mean_voi3_offset;
134 : float voicing_m, dpit1, dpit2, dpit3;
135 : int16_t Last_Resort;
136 : float vadnoise;
137 :
138 15631381 : if ( st->hSC_VBR != NULL )
139 : {
140 96918 : Last_Resort = st->hSC_VBR->Last_Resort;
141 96918 : vadnoise = st->hSC_VBR->vadnoise;
142 : }
143 : else
144 : {
145 15534463 : Last_Resort = 0;
146 15534463 : vadnoise = 0;
147 : }
148 :
149 : /*-----------------------------------------------------------------*
150 : * Detect sudden energy increases to catch voice and music attacks (dE1)
151 : *
152 : * - Find maximum energy per short sub-subframe
153 : * two sub-subframe sets are used, shifted by half the sub-subframe length
154 : * - Find maximum energy increase (ratio) between adjacent sub-subframes
155 : *-----------------------------------------------------------------*/
156 :
157 : /* find maximum energy per sub-subframe */
158 15631381 : pt_speech = speech - SSF;
159 15631381 : pt_enr_ssf = enr_ssf + 2 * NB_SSF;
160 296996239 : for ( i = 0; i < 2 * ( NB_SSF + 1 ); i++ )
161 : {
162 281364858 : emaximum( pt_speech, SSF, pt_enr_ssf );
163 281364858 : pt_speech += ( SSF / 2 );
164 281364858 : pt_enr_ssf++;
165 : }
166 :
167 15631381 : dE1 = 0.0f;
168 15631381 : ind_deltaMax = 0;
169 15631381 : pt_enr_ssf = enr_ssf + 2 * NB_SSF;
170 15631381 : pt_enr_ssf1 = pt_enr_ssf + 2;
171 :
172 : /* test on energy increase between adjacent sub-subframes */
173 265733477 : for ( i = 0; i < 2 * NB_SSF; i++ )
174 : {
175 250102096 : fac = *pt_enr_ssf1 / ( *pt_enr_ssf + 1.0f );
176 250102096 : if ( fac > dE1 )
177 : {
178 49815971 : dE1 = fac;
179 49815971 : ind_deltaMax = i;
180 : }
181 :
182 250102096 : pt_enr_ssf++;
183 250102096 : pt_enr_ssf1++;
184 : }
185 :
186 15631381 : if ( hStereoClassif != NULL )
187 : {
188 11910154 : if ( st->idchan == 0 )
189 : {
190 6355819 : hStereoClassif->dE1_ch1 = dE1;
191 : }
192 : else
193 : {
194 5554335 : hStereoClassif->dE1_ch2 = dE1;
195 : }
196 : }
197 :
198 15631381 : if ( dE1X != NULL )
199 : {
200 15534463 : *dE1X = dE1;
201 : }
202 :
203 : /*-----------------------------------------------------------------*
204 : * Average spectral tilt
205 : * Average voicing (normalized correlation)
206 : *-----------------------------------------------------------------*/
207 :
208 15631381 : mean_ee = 1.0f / 3.0f * ( st->ee_old + ee[0] + ee[1] );
209 15631381 : mean_voi3 = 1.0f / 3.0f * ( st->voicing[0] + st->voicing[1] + st->voicing[2] );
210 :
211 : /*-----------------------------------------------------------------*
212 : * Total frame energy difference (dE3)
213 : *-----------------------------------------------------------------*/
214 :
215 15631381 : dE3 = Etot - st->hNoiseEst->Etot_last;
216 :
217 : /*-----------------------------------------------------------------*
218 : * Energy decrease after spike (dE2)
219 : *-----------------------------------------------------------------*/
220 :
221 : /* set different thresholds and conditions for NB and WB input */
222 15631381 : if ( st->input_bwidth == NB )
223 : {
224 178549 : dE2_th = 21.0f;
225 178549 : nb_cond = ( mean_voi3 + corr_shift ) < 0.68f;
226 : }
227 : else
228 : {
229 15452832 : dE2_th = 30.0f;
230 15452832 : nb_cond = 1; /* no additional condition for WB input */
231 : }
232 :
233 : /* calcualte maximum energy decrease */
234 15631381 : dE2 = 0.0f;
235 15631381 : pt_enr_ssf = enr_ssf + 2 * NB_SSF;
236 :
237 15631381 : if ( dE1 > 30.0f && nb_cond )
238 : {
239 285069 : if ( 2 * NB_SSF - ind_deltaMax < L_ENR )
240 : {
241 117784 : st->old_ind_deltaMax = ind_deltaMax;
242 117784 : mvr2r( pt_enr_ssf, st->old_enr_ssf, 2 * NB_SSF );
243 : }
244 : else
245 : {
246 167285 : st->old_ind_deltaMax = -1;
247 167285 : dE2 = find_ener_decrease( ind_deltaMax, pt_enr_ssf );
248 167285 : if ( dE2 > dE2_th )
249 : {
250 8205 : st->spike_hyst = 0;
251 : }
252 : }
253 : }
254 : else
255 : {
256 15346312 : if ( st->old_ind_deltaMax >= 0 )
257 : {
258 135896 : mvr2r( st->old_enr_ssf, enr_ssf, 2 * NB_SSF );
259 135896 : dE2 = find_ener_decrease( st->old_ind_deltaMax, enr_ssf );
260 135896 : if ( dE2 > dE2_th )
261 : {
262 3970 : st->spike_hyst = 1;
263 : }
264 : }
265 15346312 : st->old_ind_deltaMax = -1;
266 : }
267 :
268 : /*-----------------------------------------------------------------*
269 : * Detection of voiced offsets (tmp_offset_flag)
270 : *-----------------------------------------------------------------*/
271 :
272 15631381 : tmp_offset_flag = 1;
273 :
274 15631381 : if ( st->input_bwidth != NB )
275 : {
276 15452832 : ee0_th = 2.4f;
277 15452832 : voi_th = 0.74f;
278 : }
279 : else
280 : {
281 178549 : ee0_th = 9.8f;
282 178549 : voi_th = 0.76f;
283 : }
284 :
285 15631381 : if ( ( st->last_coder_type_raw == UNVOICED ) || /* previous frame was unvoiced */
286 13067463 : ( ( ee[0] < ee0_th ) && ( hp_E[0] > (float) E_MIN ) && /* energy is concentrated in high frequencies provided that some energy is present in HF. The cast to (float) is needed for Linux i686 (gcc version 4.7.2), otherwise the criterion hp_E[0] > E_MIN holds true if E_MIN was assigned to hp_E[0] before */
287 1218600 : ( st->voicing[0] + corr_shift < voi_th ) ) ) /* normalized correlation is low */
288 : {
289 3334949 : tmp_offset_flag = 0;
290 : }
291 :
292 : /*-----------------------------------------------------------------*
293 : * Decision about UC
294 : *-----------------------------------------------------------------*/
295 :
296 : /* SC-VBR - set additional parameters and thresholds for SC-VBR */
297 15631381 : mean_voi3_offset = 0.0f;
298 15631381 : flag_low_relE = 0;
299 15631381 : ee1_th = 9.5f;
300 15631381 : if ( st->Opt_SC_VBR || ( st->idchan == 1 && st->element_mode == IVAS_CPE_TD ) ) /* Allow the low energy flag for the secondary channel */
301 : {
302 31022 : ee1_th = 8.5f;
303 :
304 : /* SC-VBR - determine the threshold on relative energy as a function of lp_noise */
305 31022 : if ( st->input_bwidth != NB )
306 : {
307 29962 : if ( Last_Resort == 0 )
308 : {
309 29563 : relE_thres = 0.650f * st->lp_noise - 33.5f;
310 : }
311 : else
312 : {
313 399 : relE_thres = 0.700f * st->lp_noise - 33.5f;
314 : }
315 : }
316 : else
317 : {
318 1060 : relE_thres = 0.60f * st->lp_noise - 28.2f;
319 : }
320 :
321 31022 : if ( relE_thres < -25.0f )
322 : {
323 30340 : relE_thres = -25.0f;
324 : }
325 :
326 : /* SC-VBR = set flag on low relative energy */
327 31022 : if ( relE < relE_thres )
328 : {
329 11005 : flag_low_relE = 1;
330 : }
331 :
332 : /* SC-VBR - correction of voicing threshold for NB inputs (important only in noisy conditions) */
333 31022 : if ( st->input_bwidth == NB && vadnoise < 20.0f )
334 : {
335 0 : mean_voi3_offset = 0.05f;
336 : }
337 : }
338 :
339 : /* make decision whether frame is unvoiced */
340 15631381 : coder_type = GENERIC;
341 15631381 : if ( st->input_bwidth == NB )
342 : {
343 178549 : if ( ( ( mean_voi3 + corr_shift < 0.68f + mean_voi3_offset ) && /* normalized correlation low */
344 92606 : ( ( st->voicing[2] + corr_shift ) < 0.79f ) && /* normalized correlation low on look-ahead - onset detection */
345 91612 : ( ee[0] < 10.0f ) && ( hp_E[0] > (float) E_MIN ) && /* energy concentrated in high frequencies provided that some energy is present in HF... */
346 2720 : ( ee[1] < ee1_th ) && ( hp_E[1] > (float) E_MIN ) && /* ... biased towards look-ahead to detect onsets. The cast to (float) is needed for Linux i686 (gcc version 4.7.2), otherwise the criterion hp_E[0] > E_MIN holds true if E_MIN was assigned to hp_E[] before */
347 1548 : ( tmp_offset_flag == 0 ) && /* take care of voiced offsets */
348 1507 : ( dE1 <= 29.0f ) && /* avoid on sharp energy spikes */
349 1507 : ( st->old_dE1 <= 29.0f ) && /* + one frame hysteresis */
350 178549 : ( st->spike_hyst < 0 ) ) || /* avoid after sharp energy spikes followed by decay (e.g. castanets) */
351 : flag_low_relE ) /* low relative frame energy (only for SC-VBR) */
352 : {
353 1619 : coder_type = UNVOICED;
354 : }
355 : }
356 : else
357 : {
358 15452832 : if ( ( ( mean_voi3 + corr_shift < 0.695f + mean_voi3_offset ) && /* normalized correlation low */
359 9911760 : ( ee[0] < 6.2f ) && ( hp_E[0] > (float) E_MIN ) && /* energy concentrated in high frequencies provided that some energy is present in HF */
360 3403787 : ( ee[1] < 6.2f ) && ( hp_E[1] > (float) E_MIN ) && /* ... biased towards look-ahead to detect onsets. The cast to (float) is needed for Linux i686 (gcc version 4.7.2), otherwise the criterion hp_E[0] > E_MIN holds true if E_MIN was assigned to hp_E[] before */
361 2735554 : ( tmp_offset_flag == 0 ) && /* take care of voiced offsets */
362 2655034 : ( dE1 <= 30.0f ) && /* avoid on sharp energy spikes */
363 2655034 : ( st->old_dE1 <= 30.0f ) && /* + one frame hysteresis */
364 15452832 : ( st->spike_hyst < 0 ) ) || /* avoid after sharp energy spikes followed by decay (e.g. castanets) */
365 9316 : ( flag_low_relE && st->old_dE1 <= 30.0f ) ) /* low relative frame energy (only for SC-VBR) */
366 : {
367 2581024 : coder_type = UNVOICED;
368 : }
369 : }
370 :
371 : /*-----------------------------------------------------------------*
372 : * Decision about VC
373 : *-----------------------------------------------------------------*/
374 :
375 15631381 : if ( st->Opt_SC_VBR )
376 : {
377 2390 : st->hSC_VBR->set_ppp_generic = 0;
378 : }
379 :
380 15631381 : if ( st->localVAD == 1 && coder_type == GENERIC && last_core_orig != AMR_WB_CORE )
381 : {
382 10089154 : if ( ( voicing_fr[0] > 0.605f ) && /* normalized correlation high in 1st sf. */
383 6746955 : ( voicing_fr[1] > 0.605f ) && /* normalized correlation high in 2st sf. */
384 6451735 : ( voicing_fr[2] > 0.605f ) && /* normalized correlation high in 3st sf. */
385 5393323 : ( voicing_fr[3] > 0.605f ) && /* normalized correlation high in 4st sf. */
386 4794714 : ( mean_ee > 4.0f ) && /* energy concentrated in low frequencies */
387 4794714 : ( fabs( pitch_fr[1] - pitch_fr[0] ) < 3.0f ) && /* small OL pitch difference in 1st sf. */
388 4700607 : ( fabs( pitch_fr[2] - pitch_fr[1] ) < 3.0f ) && /* small OL pitch difference in 2nd sf. */
389 4035743 : ( fabs( pitch_fr[3] - pitch_fr[2] ) < 3.0f ) ) /* small OL pitch difference in 3rd sf. */
390 : {
391 4007733 : coder_type = VOICED;
392 : }
393 6081421 : else if ( st->Opt_SC_VBR && st->input_bwidth == NB && vadnoise < 20 )
394 : {
395 0 : if ( ( voicing_fr[0] > 0.25f ) && /* normalized correlation high in 1st sf. */
396 0 : ( voicing_fr[1] > 0.25f ) && /* normalized correlation high in 2st sf. */
397 0 : ( voicing_fr[2] > 0.25f ) && /* normalized correlation high in 3st sf. */
398 0 : ( voicing_fr[3] > 0.25f ) && /* normalized correlation high in 4st sf. */
399 0 : ( mean_ee > 1.0f ) && /* energy concentrated in low frequencies (used 1.0 for WB) */
400 0 : ( fabs( pitch_fr[1] - pitch_fr[0] ) < 5.0f ) && /* small OL pitch difference in 1st sf. */
401 0 : ( fabs( pitch_fr[2] - pitch_fr[1] ) < 5.0f ) && /* small OL pitch difference in 2nd sf. */
402 0 : ( fabs( pitch_fr[3] - pitch_fr[2] ) < 5.0f ) ) /* small OL pitch difference in 3rd sf. */
403 : {
404 0 : st->hSC_VBR->set_ppp_generic = 1;
405 0 : coder_type = VOICED;
406 : }
407 : }
408 :
409 : /* set VOICED mode for frames with very stable pitch and high correlation
410 : and avoid to switch to AUDIO/MUSIC later */
411 10089154 : voicing_m = mean( voicing_fr, NB_SUBFR );
412 :
413 10089154 : dpit1 = (float) fabs( pitch_fr[0] - pitch_fr[1] );
414 10089154 : dpit2 = (float) fabs( pitch_fr[1] - pitch_fr[2] );
415 10089154 : dpit3 = (float) fabs( pitch_fr[2] - pitch_fr[3] );
416 :
417 10089154 : if ( *flag_spitch || ( dpit1 <= 3.0f && dpit2 <= 3.0f && dpit3 <= 3.0f &&
418 572244 : voicing_m > 0.95f && st->voicing_sm > 0.97f ) )
419 : {
420 900749 : coder_type = VOICED;
421 900749 : *flag_spitch = 1; /*to avoid switch to AUDIO/MUSIC later*/
422 : }
423 : }
424 :
425 : /*-----------------------------------------------------------------*
426 : * Channel-aware mode - set RF mode and total bitrate
427 : *-----------------------------------------------------------------*/
428 :
429 15631381 : st->rf_mode = st->Opt_RF_ON;
430 :
431 15631381 : if ( coder_type == GENERIC )
432 : {
433 8960067 : if ( ( voicing_fr[0] < VOI_THRLD ) && /* normalized correlation high in 1st sf. */
434 1412411 : ( voicing_fr[1] < VOI_THRLD ) && /* normalized correlation high in 2st sf. */
435 1290347 : ( voicing_fr[2] < VOI_THRLD ) && /* normalized correlation high in 3st sf. */
436 1144279 : ( voicing_fr[3] < VOI_THRLD ) && /* normalized correlation high in 4st sf. */
437 : ( vadnoise > 25.0f ) ) /* when speech is clean */
438 : {
439 1 : st->rf_mode = 0;
440 :
441 : /* Current frame cannot be compressed to pack the partial redundancy */
442 1 : if ( st->rf_mode != st->Opt_RF_ON )
443 : {
444 0 : core_coder_mode_switch( st, st->last_total_brate, 0 );
445 : }
446 : }
447 : }
448 :
449 : /*-----------------------------------------------------------------*
450 : * UNCLR classifier
451 : *-----------------------------------------------------------------*/
452 :
453 15631381 : if ( hStereoClassif != NULL )
454 : {
455 11910154 : if ( st->element_mode > EVS_MONO && ( coder_type == GENERIC || coder_type == UNVOICED || coder_type == INACTIVE || st->localVAD == 0 ) && hStereoClassif->unclr_sw_enable_cnt[st->idchan] < MAX_UV_CNT )
456 : {
457 8658695 : hStereoClassif->unclr_sw_enable_cnt[st->idchan]++;
458 : }
459 : else
460 : {
461 3251459 : hStereoClassif->unclr_sw_enable_cnt[st->idchan] = 0;
462 : }
463 : }
464 :
465 : /*-----------------------------------------------------------------*
466 : * Updates
467 : *-----------------------------------------------------------------*/
468 :
469 : /* update spike hysteresis parameters */
470 15631381 : if ( st->spike_hyst >= 0 && st->spike_hyst < 2 )
471 : {
472 20296 : st->spike_hyst++;
473 : }
474 :
475 : /* reset spike hysteresis */
476 15631381 : if ( ( st->spike_hyst > 1 ) &&
477 89742 : ( dE3 > 5.0f || /* energy increases */
478 69899 : ( relE > -13.0f && ( mean_voi3 + corr_shift > 0.695f ) ) ) ) /* normalized correlation is high */
479 : {
480 10257 : st->spike_hyst = -1;
481 : }
482 :
483 : /* update tilt parameters */
484 15631381 : st->ee_old = ee[1];
485 15631381 : st->old_dE1 = dE1;
486 :
487 : /* save the raw coder_type for various modules later in the codec (the reason is that e.g. UNVOICED is not used (rewritten) at higher rates) */
488 15631381 : st->coder_type_raw = coder_type;
489 :
490 15631381 : return coder_type;
491 : }
|