Line data Source code
1 : /******************************************************************************************************
2 :
3 : (C) 2022-2025 IVAS codec Public Collaboration with portions copyright Dolby International AB, Ericsson AB,
4 : Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD.,
5 : Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange,
6 : Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other
7 : contributors to this repository. All Rights Reserved.
8 :
9 : This software is protected by copyright law and by international treaties.
10 : The IVAS codec Public Collaboration consisting of Dolby International AB, Ericsson AB,
11 : Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD.,
12 : Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange,
13 : Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other
14 : contributors to this repository retain full ownership rights in their respective contributions in
15 : the software. This notice grants no license of any kind, including but not limited to patent
16 : license, nor is any license granted by implication, estoppel or otherwise.
17 :
18 : Contributors are required to enter into the IVAS codec Public Collaboration agreement before making
19 : contributions.
20 :
21 : This software is provided "AS IS", without any express or implied warranties. The software is in the
22 : development stage. It is intended exclusively for experts who have experience with such software and
23 : solely for the purpose of inspection. All implied warranties of non-infringement, merchantability
24 : and fitness for a particular purpose are hereby disclaimed and excluded.
25 :
26 : Any dispute, controversy or claim arising under or in relation to providing this software shall be
27 : submitted to and settled by the final, binding jurisdiction of the courts of Munich, Germany in
28 : accordance with the laws of the Federal Republic of Germany excluding its conflict of law rules and
29 : the United Nations Convention on Contracts on the International Sales of Goods.
30 :
31 : *******************************************************************************************************/
32 :
33 : /*====================================================================================
34 : EVS Codec 3GPP TS26.443 Nov 04, 2021. Version 12.14.0 / 13.10.0 / 14.6.0 / 15.4.0 / 16.3.0
35 : ====================================================================================*/
36 :
37 : #include <stdint.h>
38 : #include "options.h"
39 : #ifdef DEBUGGING
40 : #include "debug.h"
41 : #endif
42 : #include <math.h>
43 : #include "cnst.h"
44 : #include "prot.h"
45 : #include "wmc_auto.h"
46 :
47 : /*-------------------------------------------------------------------*
48 : * Local constants
49 : *-------------------------------------------------------------------*/
50 :
51 : #define L_ENR ( NB_SSF + 2 )
52 : #define VOI_THRLD 0.2f
53 :
54 :
55 : /*-------------------------------------------------------------------*
56 : * find_ener_decrease()
57 : *
58 : * Find maximum energy ratio between short sub-subframes in case
59 : * energy is trailing off after a spike
60 : *-------------------------------------------------------------------*/
61 :
62 : /*! r: maximum energy ratio */
63 20625 : static float find_ener_decrease(
64 : const int16_t ind_deltaMax, /* i : index of the beginning of maximum energy search */
65 : const float *pt_enr_ssf /* i : Pointer to the energy buffer */
66 : )
67 : {
68 : int16_t i, j, end, flag;
69 : float maxEnr, minEnr, dE2;
70 :
71 20625 : dE2 = 0.0f;
72 20625 : j = ind_deltaMax + 2;
73 20625 : end = j + L_ENR;
74 20625 : maxEnr = pt_enr_ssf[j];
75 20625 : j++;
76 20625 : flag = 0;
77 206250 : for ( i = j; i < end; i++ )
78 : {
79 185625 : if ( pt_enr_ssf[i] > maxEnr && flag == 0 )
80 : {
81 7779 : maxEnr = pt_enr_ssf[i];
82 7779 : j++;
83 : }
84 : else
85 : {
86 177846 : flag = 1;
87 : }
88 : }
89 :
90 20625 : minEnr = maxEnr;
91 198471 : for ( i = j; i < end; i++ )
92 : {
93 177846 : if ( pt_enr_ssf[i] < minEnr )
94 : {
95 47391 : minEnr = pt_enr_ssf[i];
96 : }
97 : }
98 :
99 20625 : dE2 = maxEnr / ( minEnr + 1.0e5f );
100 :
101 20625 : return dE2;
102 : }
103 :
104 :
105 : /*-------------------------------------------------------------------*
106 : * find_uv()
107 : *
108 : * Decision about coder type
: *
: * Selects GENERIC, UNVOICED or VOICED for the current frame and returns it;
: * the same value is stored in st->coder_type_raw.
: *
: * Steps, in order:
: * - sub-subframe energies are gathered with emaximum() and the largest
: *   increase ratio (dE1) is exported to *dE1X and to hStereoClassif when
: *   those pointers are non-NULL
: * - the energy decay after a spike (dE2) is evaluated by find_ener_decrease(),
: *   either in the same call or deferred by one call through st->old_enr_ssf
: *   and st->old_ind_deltaMax, and drives st->spike_hyst
: * - the UNVOICED and VOICED decisions are taken from voicing, spectral tilt,
: *   pitch differences and the spike indicators
: * - st->rf_mode, hStereoClassif->unclr_sw_enable_cnt[], st->spike_hyst,
: *   st->ee_old and st->old_dE1 are updated; *flag_spitch may be set to 1
: *
: * Input requirements visible in the code:
: * - reading starts at speech[-SSF], so SSF samples of history must precede speech
: * - ee[] and hp_E[] are read at indices 0 and 1, pitch_fr[] and voicing_fr[]
: *   at indices 0..3 (voicing_fr is also passed to mean() with NB_SUBFR)
: * - st->hNoiseEst is dereferenced without a NULL check
: * - NOTE(review): st->hSC_VBR is NULL-checked when Last_Resort / vadnoise are
: *   fetched but dereferenced without a check when st->Opt_SC_VBR is non-zero;
: *   confirm that Opt_SC_VBR always implies an allocated hSC_VBR
109 : *-------------------------------------------------------------------*/
110 :
111 : /*! r: coding type (GENERIC, UNVOICED or VOICED) */
112 1136044 : int16_t find_uv(
113 : Encoder_State *st, /* i/o: encoder state structure */
114 : const float *pitch_fr, /* i : pointer to adjusted fractional pitch (4 val.) */
115 : const float *voicing_fr, /* i : refined correlation for each subframe */
116 : const float *speech, /* i : pointer to speech signal for E computation */
117 : const float *ee, /* i : lf/hf Energy ratio for present frame */
118 : float *dE1X, /* o : sudden energy increase for S/M classifier */
119 : const float corr_shift, /* i : normalized correlation correction in noise */
120 : const float relE, /* i : relative frame energy */
121 : const float Etot, /* i : total energy */
122 : const float hp_E[], /* i : energy in HF */
123 : int16_t *flag_spitch, /* i/o: flag to indicate very short stable pitch and high correlation */
124 : const int16_t last_core_orig, /* i : original last core */
125 : STEREO_CLASSIF_HANDLE hStereoClassif /* i/o: stereo classifier structure */
126 : )
127 : {
128 : const float *pt_speech;
129 : int16_t i, coder_type, ind_deltaMax, tmp_offset_flag, nb_cond, flag_low_relE;
130 : float fac, mean_voi3, mean_ee, relE_thres;
131 : float enr_ssf[4 * NB_SSF + 2];
132 : float dE1, *pt_enr_ssf, *pt_enr_ssf1, dE2, dE3, dE2_th, ee0_th, ee1_th, voi_th;
133 : float mean_voi3_offset;
134 : float voicing_m, dpit1, dpit2, dpit3;
135 : int16_t Last_Resort;
136 : float vadnoise;
137 : /* SC-VBR parameters fall back to 0 when no SC-VBR handle is present */
138 1136044 : if ( st->hSC_VBR != NULL )
139 : {
140 3100 : Last_Resort = st->hSC_VBR->Last_Resort;
141 3100 : vadnoise = st->hSC_VBR->vadnoise;
142 : }
143 : else
144 : {
145 1132944 : Last_Resort = 0;
146 1132944 : vadnoise = 0;
147 : }
148 :
149 : /*-----------------------------------------------------------------*
150 : * Detect sudden energy increases to catch voice and music attacks (dE1)
151 : *
152 : * - Find maximum energy per short sub-subframe
153 : * two sub-subframe sets are used, shifted by half the sub-subframe length
154 : * - Find maximum energy increase (ratio) between adjacent sub-subframes
155 : *-----------------------------------------------------------------*/
156 :
157 : /* find maximum energy per sub-subframe: 2 * ( NB_SSF + 1 ) windows of SSF samples, advancing by SSF / 2, starting SSF samples before speech[0]; results go to the second half of enr_ssf[] */
158 1136044 : pt_speech = speech - SSF;
159 1136044 : pt_enr_ssf = enr_ssf + 2 * NB_SSF;
160 21584836 : for ( i = 0; i < 2 * ( NB_SSF + 1 ); i++ )
161 : {
162 20448792 : emaximum( pt_speech, SSF, pt_enr_ssf );
163 20448792 : pt_speech += ( SSF / 2 );
164 20448792 : pt_enr_ssf++;
165 : }
166 :
167 1136044 : dE1 = 0.0f;
168 1136044 : ind_deltaMax = 0;
169 1136044 : pt_enr_ssf = enr_ssf + 2 * NB_SSF;
170 1136044 : pt_enr_ssf1 = pt_enr_ssf + 2;
171 :
172 : /* test on energy increase between adjacent sub-subframes: each entry is compared with the one two positions later (window start 2 * ( SSF / 2 ) samples later); ind_deltaMax keeps the position of the largest ratio */
173 19312748 : for ( i = 0; i < 2 * NB_SSF; i++ )
174 : {
175 18176704 : fac = *pt_enr_ssf1 / ( *pt_enr_ssf + 1.0f );
176 18176704 : if ( fac > dE1 )
177 : {
178 3668462 : dE1 = fac;
179 3668462 : ind_deltaMax = i;
180 : }
181 :
182 18176704 : pt_enr_ssf++;
183 18176704 : pt_enr_ssf1++;
184 : }
185 :
186 1136044 : if ( hStereoClassif != NULL )
187 : {
188 782031 : if ( st->idchan == 0 )
189 : {
190 420855 : hStereoClassif->dE1_ch1 = dE1;
191 : }
192 : else
193 : {
194 361176 : hStereoClassif->dE1_ch2 = dE1;
195 : }
196 : }
197 :
198 1136044 : if ( dE1X != NULL )
199 : {
200 1132944 : *dE1X = dE1;
201 : }
202 :
203 : /*-----------------------------------------------------------------*
204 : * Average spectral tilt
205 : * Average voicing (normalized correlation)
206 : *-----------------------------------------------------------------*/
207 :
208 1136044 : mean_ee = 1.0f / 3.0f * ( st->ee_old + ee[0] + ee[1] );
209 1136044 : mean_voi3 = 1.0f / 3.0f * ( st->voicing[0] + st->voicing[1] + st->voicing[2] );
210 :
211 : /*-----------------------------------------------------------------*
212 : * Total frame energy difference (dE3)
213 : *-----------------------------------------------------------------*/
214 :
215 1136044 : dE3 = Etot - st->hNoiseEst->Etot_last;
216 :
217 : /*-----------------------------------------------------------------*
218 : * Energy decrease after spike (dE2)
219 : *-----------------------------------------------------------------*/
220 :
221 : /* set different thresholds and conditions for NB and WB input */
222 1136044 : if ( st->input_bwidth == NB )
223 : {
224 4011 : dE2_th = 21.0f;
225 4011 : nb_cond = ( mean_voi3 + corr_shift ) < 0.68f;
226 : }
227 : else
228 : {
229 1132033 : dE2_th = 30.0f;
230 1132033 : nb_cond = 1; /* no additional condition for WB input */
231 : }
232 :
233 : /* calculate maximum energy decrease */
234 1136044 : dE2 = 0.0f;
235 1136044 : pt_enr_ssf = enr_ssf + 2 * NB_SSF;
236 :
237 1136044 : if ( dE1 > 30.0f && nb_cond )
238 : {
239 19641 : if ( 2 * NB_SSF - ind_deltaMax < L_ENR ) /* fewer than L_ENR positions follow the spike: keep its index and the energies, the decay is evaluated by the else-branch below on the next call */
240 : {
241 8383 : st->old_ind_deltaMax = ind_deltaMax;
242 8383 : mvr2r( pt_enr_ssf, st->old_enr_ssf, 2 * NB_SSF );
243 : }
244 : else
245 : {
246 11258 : st->old_ind_deltaMax = -1;
247 11258 : dE2 = find_ener_decrease( ind_deltaMax, pt_enr_ssf );
248 11258 : if ( dE2 > dE2_th )
249 : {
250 164 : st->spike_hyst = 0;
251 : }
252 : }
253 : }
254 : else
255 : {
256 1116403 : if ( st->old_ind_deltaMax >= 0 ) /* a spike index was saved by the previous call */
257 : {
258 9367 : mvr2r( st->old_enr_ssf, enr_ssf, 2 * NB_SSF ); /* saved energies go to the first half of enr_ssf[], directly in front of the current ones (assumes mvr2r copies source to destination - TODO confirm) */
259 9367 : dE2 = find_ener_decrease( st->old_ind_deltaMax, enr_ssf );
260 9367 : if ( dE2 > dE2_th )
261 : {
262 135 : st->spike_hyst = 1;
263 : }
264 : }
265 1116403 : st->old_ind_deltaMax = -1;
266 : }
267 :
268 : /*-----------------------------------------------------------------*
269 : * Detection of voiced offsets (tmp_offset_flag)
270 : *-----------------------------------------------------------------*/
271 :
272 1136044 : tmp_offset_flag = 1;
273 :
274 1136044 : if ( st->input_bwidth != NB )
275 : {
276 1132033 : ee0_th = 2.4f;
277 1132033 : voi_th = 0.74f;
278 : }
279 : else
280 : {
281 4011 : ee0_th = 9.8f;
282 4011 : voi_th = 0.76f;
283 : }
284 :
285 1136044 : if ( ( st->last_coder_type_raw == UNVOICED ) || /* previous frame was unvoiced */
286 1056850 : ( ( ee[0] < ee0_th ) && ( hp_E[0] > (float) E_MIN ) && /* energy is concentrated in high frequencies provided that some energy is present in HF. The cast to (float) is needed for Linux i686 (gcc version 4.7.2), otherwise the criterion hp_E[0] > E_MIN holds true if E_MIN was assigned to hp_E[0] before */
287 94540 : ( st->voicing[0] + corr_shift < voi_th ) ) ) /* normalized correlation is low */
288 : {
289 131879 : tmp_offset_flag = 0;
290 : }
291 :
292 : /*-----------------------------------------------------------------*
293 : * Decision about UC
294 : *-----------------------------------------------------------------*/
295 :
296 : /* SC-VBR - set additional parameters and thresholds for SC-VBR */
297 1136044 : mean_voi3_offset = 0.0f;
298 1136044 : flag_low_relE = 0;
299 1136044 : ee1_th = 9.5f;
300 1136044 : if ( st->Opt_SC_VBR || ( st->idchan == 1 && st->element_mode == IVAS_CPE_TD ) ) /* Allow the low energy flag for the secondary channel */
301 : {
302 3791 : ee1_th = 8.5f;
303 :
304 : /* SC-VBR - determine the threshold on relative energy as a function of lp_noise */
305 3791 : if ( st->input_bwidth != NB )
306 : {
307 3791 : if ( Last_Resort == 0 )
308 : {
309 3791 : relE_thres = 0.650f * st->lp_noise - 33.5f;
310 : }
311 : else
312 : {
313 0 : relE_thres = 0.700f * st->lp_noise - 33.5f;
314 : }
315 : }
316 : else
317 : {
318 0 : relE_thres = 0.60f * st->lp_noise - 28.2f;
319 : }
320 : /* the threshold is not allowed to drop below -25 */
321 3791 : if ( relE_thres < -25.0f )
322 : {
323 3791 : relE_thres = -25.0f;
324 : }
325 :
326 : /* SC-VBR - set flag on low relative energy */
327 3791 : if ( relE < relE_thres )
328 : {
329 329 : flag_low_relE = 1;
330 : }
331 :
332 : /* SC-VBR - correction of voicing threshold for NB inputs (important only in noisy conditions) */
333 3791 : if ( st->input_bwidth == NB && vadnoise < 20.0f )
334 : {
335 0 : mean_voi3_offset = 0.05f;
336 : }
337 : }
338 :
339 : /* make decision whether frame is unvoiced */
340 1136044 : coder_type = GENERIC;
341 1136044 : if ( st->input_bwidth == NB )
342 : {
343 4011 : if ( ( ( mean_voi3 + corr_shift < 0.68f + mean_voi3_offset ) && /* normalized correlation low */
344 1221 : ( ( st->voicing[2] + corr_shift ) < 0.79f ) && /* normalized correlation low on look-ahead - onset detection */
345 1187 : ( ee[0] < 10.0f ) && ( hp_E[0] > (float) E_MIN ) && /* energy concentrated in high frequencies provided that some energy is present in HF... */
346 190 : ( ee[1] < ee1_th ) && ( hp_E[1] > (float) E_MIN ) && /* ... biased towards look-ahead to detect onsets. The cast to (float) is needed for Linux i686 (gcc version 4.7.2), otherwise the criterion hp_E[0] > E_MIN holds true if E_MIN was assigned to hp_E[] before */
347 124 : ( tmp_offset_flag == 0 ) && /* take care of voiced offsets */
348 124 : ( dE1 <= 29.0f ) && /* avoid on sharp energy spikes */
349 124 : ( st->old_dE1 <= 29.0f ) && /* + one frame hysteresis */
350 4011 : ( st->spike_hyst < 0 ) ) || /* avoid after sharp energy spikes followed by decay (e.g. castanets) */
351 : flag_low_relE ) /* low relative frame energy (only for SC-VBR) */
352 : {
353 124 : coder_type = UNVOICED;
354 : }
355 : }
356 : else
357 : {
358 1132033 : if ( ( ( mean_voi3 + corr_shift < 0.695f + mean_voi3_offset ) && /* normalized correlation low */
359 590238 : ( ee[0] < 6.2f ) && ( hp_E[0] > (float) E_MIN ) && /* energy concentrated in high frequencies provided that some energy is present in HF */
360 128452 : ( ee[1] < 6.2f ) && ( hp_E[1] > (float) E_MIN ) && /* ... biased towards look-ahead to detect onsets. The cast to (float) is needed for Linux i686 (gcc version 4.7.2), otherwise the criterion hp_E[0] > E_MIN holds true if E_MIN was assigned to hp_E[] before */
361 85565 : ( tmp_offset_flag == 0 ) && /* take care of voiced offsets */
362 82154 : ( dE1 <= 30.0f ) && /* avoid on sharp energy spikes */
363 82154 : ( st->old_dE1 <= 30.0f ) && /* + one frame hysteresis */
364 1132033 : ( st->spike_hyst < 0 ) ) || /* avoid after sharp energy spikes followed by decay (e.g. castanets) */
365 272 : ( flag_low_relE && st->old_dE1 <= 30.0f ) ) /* low relative frame energy (only for SC-VBR) */
366 : {
367 79756 : coder_type = UNVOICED;
368 : }
369 : }
370 :
371 : /*-----------------------------------------------------------------*
372 : * Decision about VC
373 : *-----------------------------------------------------------------*/
374 :
375 1136044 : if ( st->Opt_SC_VBR )
376 : {
377 0 : st->hSC_VBR->set_ppp_generic = 0;
378 : }
379 :
380 1136044 : if ( st->localVAD == 1 && coder_type == GENERIC && last_core_orig != AMR_WB_CORE )
381 : {
382 899524 : if ( ( voicing_fr[0] > 0.605f ) && /* normalized correlation high in 1st sf. */
383 642129 : ( voicing_fr[1] > 0.605f ) && /* normalized correlation high in 2nd sf. */
384 609447 : ( voicing_fr[2] > 0.605f ) && /* normalized correlation high in 3rd sf. */
385 513109 : ( voicing_fr[3] > 0.605f ) && /* normalized correlation high in 4th sf. */
386 456043 : ( mean_ee > 4.0f ) && /* energy concentrated in low frequencies */
387 456043 : ( fabs( pitch_fr[1] - pitch_fr[0] ) < 3.0f ) && /* small OL pitch difference in 1st sf. */
388 447351 : ( fabs( pitch_fr[2] - pitch_fr[1] ) < 3.0f ) && /* small OL pitch difference in 2nd sf. */
389 379809 : ( fabs( pitch_fr[3] - pitch_fr[2] ) < 3.0f ) ) /* small OL pitch difference in 3rd sf. */
390 : {
391 377240 : coder_type = VOICED;
392 : }
393 522284 : else if ( st->Opt_SC_VBR && st->input_bwidth == NB && vadnoise < 20 )
394 : {
395 0 : if ( ( voicing_fr[0] > 0.25f ) && /* normalized correlation high in 1st sf. */
396 0 : ( voicing_fr[1] > 0.25f ) && /* normalized correlation high in 2nd sf. */
397 0 : ( voicing_fr[2] > 0.25f ) && /* normalized correlation high in 3rd sf. */
398 0 : ( voicing_fr[3] > 0.25f ) && /* normalized correlation high in 4th sf. */
399 0 : ( mean_ee > 1.0f ) && /* energy concentrated in low frequencies (used 1.0 for WB) */
400 0 : ( fabs( pitch_fr[1] - pitch_fr[0] ) < 5.0f ) && /* small OL pitch difference in 1st sf. */
401 0 : ( fabs( pitch_fr[2] - pitch_fr[1] ) < 5.0f ) && /* small OL pitch difference in 2nd sf. */
402 0 : ( fabs( pitch_fr[3] - pitch_fr[2] ) < 5.0f ) ) /* small OL pitch difference in 3rd sf. */
403 : {
404 0 : st->hSC_VBR->set_ppp_generic = 1;
405 0 : coder_type = VOICED;
406 : }
407 : }
408 :
409 : /* set VOICED mode for frames with very stable pitch and high correlation
410 : and avoid to switch to AUDIO/MUSIC later; once *flag_spitch is non-zero on entry this branch forces VOICED on every call that reaches it */
411 899524 : voicing_m = mean( voicing_fr, NB_SUBFR );
412 :
413 899524 : dpit1 = (float) fabs( pitch_fr[0] - pitch_fr[1] );
414 899524 : dpit2 = (float) fabs( pitch_fr[1] - pitch_fr[2] );
415 899524 : dpit3 = (float) fabs( pitch_fr[2] - pitch_fr[3] );
416 :
417 899524 : if ( *flag_spitch || ( dpit1 <= 3.0f && dpit2 <= 3.0f && dpit3 <= 3.0f &&
418 39286 : voicing_m > 0.95f && st->voicing_sm > 0.97f ) )
419 : {
420 39319 : coder_type = VOICED;
421 39319 : *flag_spitch = 1; /*to avoid switch to AUDIO/MUSIC later*/
422 : }
423 : }
424 :
425 : /*-----------------------------------------------------------------*
426 : * Channel-aware mode - set RF mode and total bitrate
427 : *-----------------------------------------------------------------*/
428 :
429 1136044 : st->rf_mode = st->Opt_RF_ON;
430 :
431 1136044 : if ( coder_type == GENERIC )
432 : {
433 673699 : if ( ( voicing_fr[0] < VOI_THRLD ) && /* normalized correlation low in 1st sf. */
434 84363 : ( voicing_fr[1] < VOI_THRLD ) && /* normalized correlation low in 2nd sf. */
435 74549 : ( voicing_fr[2] < VOI_THRLD ) && /* normalized correlation low in 3rd sf. */
436 65835 : ( voicing_fr[3] < VOI_THRLD ) && /* normalized correlation low in 4th sf. */
437 : ( vadnoise > 25.0f ) ) /* when speech is clean */
438 : {
439 1 : st->rf_mode = 0;
440 :
441 : /* Current frame cannot be compressed to pack the partial redundancy; rf_mode was cleared just above, so the mode switch below runs only when Opt_RF_ON is non-zero */
442 1 : if ( st->rf_mode != st->Opt_RF_ON )
443 : {
444 0 : core_coder_mode_switch( st, st->last_total_brate, 0 );
445 : }
446 : }
447 : }
448 :
449 : /*-----------------------------------------------------------------*
450 : * UNCLR classifier
: *
: * The per-channel counter is incremented while the element mode is above
: * EVS_MONO, the frame is GENERIC / UNVOICED / INACTIVE or has localVAD == 0,
: * and the counter is still below MAX_UV_CNT; in every other case it is
: * set back to 0.
: * NOTE(review): a counter that has reached MAX_UV_CNT therefore returns to 0
: * on the next call instead of saturating - confirm this is intended.
451 : *-----------------------------------------------------------------*/
452 :
453 1136044 : if ( hStereoClassif != NULL )
454 : {
455 782031 : if ( st->element_mode > EVS_MONO && ( coder_type == GENERIC || coder_type == UNVOICED || coder_type == INACTIVE || st->localVAD == 0 ) && hStereoClassif->unclr_sw_enable_cnt[st->idchan] < MAX_UV_CNT )
456 : {
457 492194 : hStereoClassif->unclr_sw_enable_cnt[st->idchan]++;
458 : }
459 : else
460 : {
461 289837 : hStereoClassif->unclr_sw_enable_cnt[st->idchan] = 0;
462 : }
463 : }
464 :
465 : /*-----------------------------------------------------------------*
466 : * Updates
467 : *-----------------------------------------------------------------*/
468 :
469 : /* update spike hysteresis parameters (a negative value means no spike is being tracked; values 0 and 1 advance by one per call) */
470 1136044 : if ( st->spike_hyst >= 0 && st->spike_hyst < 2 )
471 : {
472 463 : st->spike_hyst++;
473 : }
474 :
475 : /* reset spike hysteresis */
476 1136044 : if ( ( st->spike_hyst > 1 ) &&
477 2415 : ( dE3 > 5.0f || /* energy increases */
478 1523 : ( relE > -13.0f && ( mean_voi3 + corr_shift > 0.695f ) ) ) ) /* normalized correlation is high */
479 : {
480 289 : st->spike_hyst = -1;
481 : }
482 :
483 : /* update tilt parameters */
484 1136044 : st->ee_old = ee[1];
485 1136044 : st->old_dE1 = dE1;
486 :
487 : /* save the raw coder_type for various modules later in the codec (the reason is that e.g. UNVOICED is not used (rewritten) at higher rates) */
488 1136044 : st->coder_type_raw = coder_type;
489 :
490 1136044 : return coder_type;
491 : }
|