Line data Source code
1 : /******************************************************************************************************
2 :
3 : (C) 2022-2025 IVAS codec Public Collaboration with portions copyright Dolby International AB, Ericsson AB,
4 : Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD.,
5 : Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange,
6 : Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other
7 : contributors to this repository. All Rights Reserved.
8 :
9 : This software is protected by copyright law and by international treaties.
10 : The IVAS codec Public Collaboration consisting of Dolby International AB, Ericsson AB,
11 : Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD.,
12 : Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange,
13 : Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other
14 : contributors to this repository retain full ownership rights in their respective contributions in
15 : the software. This notice grants no license of any kind, including but not limited to patent
16 : license, nor is any license granted by implication, estoppel or otherwise.
17 :
18 : Contributors are required to enter into the IVAS codec Public Collaboration agreement before making
19 : contributions.
20 :
21 : This software is provided "AS IS", without any express or implied warranties. The software is in the
22 : development stage. It is intended exclusively for experts who have experience with such software and
23 : solely for the purpose of inspection. All implied warranties of non-infringement, merchantability
24 : and fitness for a particular purpose are hereby disclaimed and excluded.
25 :
26 : Any dispute, controversy or claim arising under or in relation to providing this software shall be
27 : submitted to and settled by the final, binding jurisdiction of the courts of Munich, Germany in
28 : accordance with the laws of the Federal Republic of Germany excluding its conflict of law rules and
29 : the United Nations Convention on Contracts on the International Sales of Goods.
30 :
31 : *******************************************************************************************************/
32 :
33 : /*====================================================================================
34 : EVS Codec 3GPP TS26.443 Nov 04, 2021. Version 12.14.0 / 13.10.0 / 14.6.0 / 15.4.0 / 16.3.0
35 : ====================================================================================*/
36 :
37 : #include <stdint.h>
38 : #include "options.h"
39 : #ifdef DEBUGGING
40 : #include "debug.h"
41 : #endif
42 : #include <math.h>
43 : #include "cnst.h"
44 : #include "prot.h"
45 : #include "wmc_auto.h"
46 :
47 : /*-------------------------------------------------------------------*
48 : * Local constants
49 : *-------------------------------------------------------------------*/
50 :
/* number of consecutive sub-subframe energies inspected by find_ener_decrease();
   find_uv() also uses it to check that enough sub-subframes follow a detected spike */
#define L_ENR ( NB_SSF + 2 )
/* upper bound applied to all four voicing_fr[] values in find_uv() when deciding
   whether the RF (channel-aware) mode is switched off for a GENERIC frame */
#define VOI_THRLD 0.2f
53 :
54 :
55 : /*-------------------------------------------------------------------*
56 : * find_ener_decrease()
57 : *
58 : * Find maximum energy ratio between short sub-subframes in case
59 : * energy is trailing off after a spike
60 : *-------------------------------------------------------------------*/
61 :
62 : /*! r: maximum energy ratio */
63 98674 : static float find_ener_decrease(
64 : const int16_t ind_deltaMax, /* i : index of the beginning of maximum energy search */
65 : const float *pt_enr_ssf /* i : Pointer to the energy buffer */
66 : )
67 : {
68 : int16_t i, j, end, flag;
69 : float maxEnr, minEnr, dE2;
70 :
71 98674 : dE2 = 0.0f;
72 98674 : j = ind_deltaMax + 2;
73 98674 : end = j + L_ENR;
74 98674 : maxEnr = pt_enr_ssf[j];
75 98674 : j++;
76 98674 : flag = 0;
77 986740 : for ( i = j; i < end; i++ )
78 : {
79 888066 : if ( pt_enr_ssf[i] > maxEnr && flag == 0 )
80 : {
81 56469 : maxEnr = pt_enr_ssf[i];
82 56469 : j++;
83 : }
84 : else
85 : {
86 831597 : flag = 1;
87 : }
88 : }
89 :
90 98674 : minEnr = maxEnr;
91 930271 : for ( i = j; i < end; i++ )
92 : {
93 831597 : if ( pt_enr_ssf[i] < minEnr )
94 : {
95 279739 : minEnr = pt_enr_ssf[i];
96 : }
97 : }
98 :
99 98674 : dE2 = maxEnr / ( minEnr + 1.0e5f );
100 :
101 98674 : return dE2;
102 : }
103 :
104 :
105 : /*-------------------------------------------------------------------*
106 : * find_uv()
107 : *
108 : * Decision about coder type
109 : *-------------------------------------------------------------------*/
110 :
111 : /*! r: coding type */
112 2965078 : int16_t find_uv(
113 : Encoder_State *st, /* i/o: encoder state structure */
114 : const float *pitch_fr, /* i : pointer to adjusted fractional pitch (4 val.) */
115 : const float *voicing_fr, /* i : refined correlation for each subframes */
116 : const float *speech, /* i : pointer to speech signal for E computation */
117 : const float *ee, /* i : lf/hf Energy ratio for present frame */
118 : float *dE1X, /* o : sudden energy increase for S/M classifier */
119 : const float corr_shift, /* i : normalized correlation correction in noise */
120 : const float relE, /* i : relative frame energy */
121 : const float Etot, /* i : total energy */
122 : const float hp_E[], /* i : energy in HF */
123 : int16_t *flag_spitch, /* i/o: flag to indicate very short stable pitch and high correlation */
124 : const int16_t last_core_orig, /* i : original last core */
125 : STEREO_CLASSIF_HANDLE hStereoClassif /* i/o: stereo classifier structure */
126 : )
127 : {
128 : const float *pt_speech;
129 : int16_t i, coder_type, ind_deltaMax, tmp_offset_flag, nb_cond, flag_low_relE;
130 : float fac, mean_voi3, mean_ee, relE_thres;
131 : float enr_ssf[4 * NB_SSF + 2];
132 : float dE1, *pt_enr_ssf, *pt_enr_ssf1, dE2, dE3, dE2_th, ee0_th, ee1_th, voi_th;
133 : float mean_voi3_offset;
134 : float voicing_m, dpit1, dpit2, dpit3;
135 : int16_t Last_Resort;
136 : float vadnoise;
137 :
138 2965078 : if ( st->hSC_VBR != NULL )
139 : {
140 15110 : Last_Resort = st->hSC_VBR->Last_Resort;
141 15110 : vadnoise = st->hSC_VBR->vadnoise;
142 : }
143 : else
144 : {
145 2949968 : Last_Resort = 0;
146 2949968 : vadnoise = 0;
147 : }
148 :
149 : /*-----------------------------------------------------------------*
150 : * Detect sudden energy increases to catch voice and music attacks (dE1)
151 : *
152 : * - Find maximum energy per short sub-subframe
153 : * two sub-subframe sets are used, shifted by half the sub-subframe length
154 : * - Find maximum energy increase (ratio) between adjacent sub-subframes
155 : *-----------------------------------------------------------------*/
156 :
157 : /* find maximum energy per sub-subframe */
158 2965078 : pt_speech = speech - SSF;
159 2965078 : pt_enr_ssf = enr_ssf + 2 * NB_SSF;
160 56336482 : for ( i = 0; i < 2 * ( NB_SSF + 1 ); i++ )
161 : {
162 53371404 : emaximum( pt_speech, SSF, pt_enr_ssf );
163 53371404 : pt_speech += ( SSF / 2 );
164 53371404 : pt_enr_ssf++;
165 : }
166 :
167 2965078 : dE1 = 0.0f;
168 2965078 : ind_deltaMax = 0;
169 2965078 : pt_enr_ssf = enr_ssf + 2 * NB_SSF;
170 2965078 : pt_enr_ssf1 = pt_enr_ssf + 2;
171 :
172 : /* test on energy increase between adjacent sub-subframes */
173 50406326 : for ( i = 0; i < 2 * NB_SSF; i++ )
174 : {
175 47441248 : fac = *pt_enr_ssf1 / ( *pt_enr_ssf + 1.0f );
176 47441248 : if ( fac > dE1 )
177 : {
178 9148035 : dE1 = fac;
179 9148035 : ind_deltaMax = i;
180 : }
181 :
182 47441248 : pt_enr_ssf++;
183 47441248 : pt_enr_ssf1++;
184 : }
185 :
186 2965078 : if ( hStereoClassif != NULL )
187 : {
188 2278188 : if ( st->idchan == 0 )
189 : {
190 1194185 : hStereoClassif->dE1_ch1 = dE1;
191 : }
192 : else
193 : {
194 1084003 : hStereoClassif->dE1_ch2 = dE1;
195 : }
196 : }
197 :
198 2965078 : if ( dE1X != NULL )
199 : {
200 2949968 : *dE1X = dE1;
201 : }
202 :
203 : /*-----------------------------------------------------------------*
204 : * Average spectral tilt
205 : * Average voicing (normalized correlation)
206 : *-----------------------------------------------------------------*/
207 :
208 2965078 : mean_ee = 1.0f / 3.0f * ( st->ee_old + ee[0] + ee[1] );
209 2965078 : mean_voi3 = 1.0f / 3.0f * ( st->voicing[0] + st->voicing[1] + st->voicing[2] );
210 :
211 : /*-----------------------------------------------------------------*
212 : * Total frame energy difference (dE3)
213 : *-----------------------------------------------------------------*/
214 :
215 2965078 : dE3 = Etot - st->hNoiseEst->Etot_last;
216 :
217 : /*-----------------------------------------------------------------*
218 : * Energy decrease after spike (dE2)
219 : *-----------------------------------------------------------------*/
220 :
221 : /* set different thresholds and conditions for NB and WB input */
222 2965078 : if ( st->input_bwidth == NB )
223 : {
224 6382 : dE2_th = 21.0f;
225 6382 : nb_cond = ( mean_voi3 + corr_shift ) < 0.68f;
226 : }
227 : else
228 : {
229 2958696 : dE2_th = 30.0f;
230 2958696 : nb_cond = 1; /* no additional condition for WB input */
231 : }
232 :
233 : /* calcualte maximum energy decrease */
234 2965078 : dE2 = 0.0f;
235 2965078 : pt_enr_ssf = enr_ssf + 2 * NB_SSF;
236 :
237 2965078 : if ( dE1 > 30.0f && nb_cond )
238 : {
239 129016 : if ( 2 * NB_SSF - ind_deltaMax < L_ENR )
240 : {
241 64941 : st->old_ind_deltaMax = ind_deltaMax;
242 64941 : mvr2r( pt_enr_ssf, st->old_enr_ssf, 2 * NB_SSF );
243 : }
244 : else
245 : {
246 64075 : st->old_ind_deltaMax = -1;
247 64075 : dE2 = find_ener_decrease( ind_deltaMax, pt_enr_ssf );
248 64075 : if ( dE2 > dE2_th )
249 : {
250 19398 : st->spike_hyst = 0;
251 : }
252 : }
253 : }
254 : else
255 : {
256 2836062 : if ( st->old_ind_deltaMax >= 0 )
257 : {
258 34599 : mvr2r( st->old_enr_ssf, enr_ssf, 2 * NB_SSF );
259 34599 : dE2 = find_ener_decrease( st->old_ind_deltaMax, enr_ssf );
260 34599 : if ( dE2 > dE2_th )
261 : {
262 9683 : st->spike_hyst = 1;
263 : }
264 : }
265 2836062 : st->old_ind_deltaMax = -1;
266 : }
267 :
268 : /*-----------------------------------------------------------------*
269 : * Detection of voiced offsets (tmp_offset_flag)
270 : *-----------------------------------------------------------------*/
271 :
272 2965078 : tmp_offset_flag = 1;
273 :
274 2965078 : if ( st->input_bwidth != NB )
275 : {
276 2958696 : ee0_th = 2.4f;
277 2958696 : voi_th = 0.74f;
278 : }
279 : else
280 : {
281 6382 : ee0_th = 9.8f;
282 6382 : voi_th = 0.76f;
283 : }
284 :
285 2965078 : if ( ( st->last_coder_type_raw == UNVOICED ) || /* previous frame was unvoiced */
286 2827762 : ( ( ee[0] < ee0_th ) && ( hp_E[0] > (float) E_MIN ) && /* energy is concentrated in high frequencies provided that some energy is present in HF. The cast to (float) is needed for Linux i686 (gcc version 4.7.2), otherwise the criterion hp_E[0] > E_MIN holds true if E_MIN was assigned to hp_E[0] before */
287 250512 : ( st->voicing[0] + corr_shift < voi_th ) ) ) /* normalized correlation is low */
288 : {
289 298430 : tmp_offset_flag = 0;
290 : }
291 :
292 : /*-----------------------------------------------------------------*
293 : * Decision about UC
294 : *-----------------------------------------------------------------*/
295 :
296 : /* SC-VBR - set additional parameters and thresholds for SC-VBR */
297 2965078 : mean_voi3_offset = 0.0f;
298 2965078 : flag_low_relE = 0;
299 2965078 : ee1_th = 9.5f;
300 2965078 : if ( st->Opt_SC_VBR || ( st->idchan == 1 && st->element_mode == IVAS_CPE_TD ) ) /* Allow the low energy flag for the secondary channel */
301 : {
302 4338 : ee1_th = 8.5f;
303 :
304 : /* SC-VBR - determine the threshold on relative energy as a function of lp_noise */
305 4338 : if ( st->input_bwidth != NB )
306 : {
307 4278 : if ( Last_Resort == 0 )
308 : {
309 4278 : relE_thres = 0.650f * st->lp_noise - 33.5f;
310 : }
311 : else
312 : {
313 0 : relE_thres = 0.700f * st->lp_noise - 33.5f;
314 : }
315 : }
316 : else
317 : {
318 60 : relE_thres = 0.60f * st->lp_noise - 28.2f;
319 : }
320 :
321 4338 : if ( relE_thres < -25.0f )
322 : {
323 4338 : relE_thres = -25.0f;
324 : }
325 :
326 : /* SC-VBR = set flag on low relative energy */
327 4338 : if ( relE < relE_thres )
328 : {
329 385 : flag_low_relE = 1;
330 : }
331 :
332 : /* SC-VBR - correction of voicing threshold for NB inputs (important only in noisy conditions) */
333 4338 : if ( st->input_bwidth == NB && vadnoise < 20.0f )
334 : {
335 0 : mean_voi3_offset = 0.05f;
336 : }
337 : }
338 :
339 : /* make decision whether frame is unvoiced */
340 2965078 : coder_type = GENERIC;
341 2965078 : if ( st->input_bwidth == NB )
342 : {
343 6382 : if ( ( ( mean_voi3 + corr_shift < 0.68f + mean_voi3_offset ) && /* normalized correlation low */
344 2223 : ( ( st->voicing[2] + corr_shift ) < 0.79f ) && /* normalized correlation low on look-ahead - onset detection */
345 2112 : ( ee[0] < 10.0f ) && ( hp_E[0] > (float) E_MIN ) && /* energy concentrated in high frequencies provided that some energy is present in HF... */
346 237 : ( ee[1] < ee1_th ) && ( hp_E[1] > (float) E_MIN ) && /* ... biased towards look-ahead to detect onsets. The cast to (float) is needed for Linux i686 (gcc version 4.7.2), otherwise the criterion hp_E[0] > E_MIN holds true if E_MIN was assigned to hp_E[] before */
347 146 : ( tmp_offset_flag == 0 ) && /* take care of voiced offsets */
348 146 : ( dE1 <= 29.0f ) && /* avoid on sharp energy spikes */
349 146 : ( st->old_dE1 <= 29.0f ) && /* + one frame hysteresis */
350 6382 : ( st->spike_hyst < 0 ) ) || /* avoid after sharp energy spikes followed by decay (e.g. castanets) */
351 : flag_low_relE ) /* low relative frame energy (only for SC-VBR) */
352 : {
353 145 : coder_type = UNVOICED;
354 : }
355 : }
356 : else
357 : {
358 2958696 : if ( ( ( mean_voi3 + corr_shift < 0.695f + mean_voi3_offset ) && /* normalized correlation low */
359 1570459 : ( ee[0] < 6.2f ) && ( hp_E[0] > (float) E_MIN ) && /* energy concentrated in high frequencies provided that some energy is present in HF */
360 270376 : ( ee[1] < 6.2f ) && ( hp_E[1] > (float) E_MIN ) && /* ... biased towards look-ahead to detect onsets. The cast to (float) is needed for Linux i686 (gcc version 4.7.2), otherwise the criterion hp_E[0] > E_MIN holds true if E_MIN was assigned to hp_E[] before */
361 177134 : ( tmp_offset_flag == 0 ) && /* take care of voiced offsets */
362 142486 : ( dE1 <= 30.0f ) && /* avoid on sharp energy spikes */
363 142486 : ( st->old_dE1 <= 30.0f ) && /* + one frame hysteresis */
364 2958696 : ( st->spike_hyst < 0 ) ) || /* avoid after sharp energy spikes followed by decay (e.g. castanets) */
365 328 : ( flag_low_relE && st->old_dE1 <= 30.0f ) ) /* low relative frame energy (only for SC-VBR) */
366 : {
367 138312 : coder_type = UNVOICED;
368 : }
369 : }
370 :
371 : /*-----------------------------------------------------------------*
372 : * Decision about VC
373 : *-----------------------------------------------------------------*/
374 :
375 2965078 : if ( st->Opt_SC_VBR )
376 : {
377 200 : st->hSC_VBR->set_ppp_generic = 0;
378 : }
379 :
380 2965078 : if ( st->localVAD == 1 && coder_type == GENERIC && last_core_orig != AMR_WB_CORE )
381 : {
382 2367701 : if ( ( voicing_fr[0] > 0.605f ) && /* normalized correlation high in 1st sf. */
383 1661434 : ( voicing_fr[1] > 0.605f ) && /* normalized correlation high in 2st sf. */
384 1590739 : ( voicing_fr[2] > 0.605f ) && /* normalized correlation high in 3st sf. */
385 1323907 : ( voicing_fr[3] > 0.605f ) && /* normalized correlation high in 4st sf. */
386 1185366 : ( mean_ee > 4.0f ) && /* energy concentrated in low frequencies */
387 1185366 : ( fabs( pitch_fr[1] - pitch_fr[0] ) < 3.0f ) && /* small OL pitch difference in 1st sf. */
388 1168255 : ( fabs( pitch_fr[2] - pitch_fr[1] ) < 3.0f ) && /* small OL pitch difference in 2nd sf. */
389 972343 : ( fabs( pitch_fr[3] - pitch_fr[2] ) < 3.0f ) ) /* small OL pitch difference in 3rd sf. */
390 : {
391 966651 : coder_type = VOICED;
392 : }
393 1401050 : else if ( st->Opt_SC_VBR && st->input_bwidth == NB && vadnoise < 20 )
394 : {
395 0 : if ( ( voicing_fr[0] > 0.25f ) && /* normalized correlation high in 1st sf. */
396 0 : ( voicing_fr[1] > 0.25f ) && /* normalized correlation high in 2st sf. */
397 0 : ( voicing_fr[2] > 0.25f ) && /* normalized correlation high in 3st sf. */
398 0 : ( voicing_fr[3] > 0.25f ) && /* normalized correlation high in 4st sf. */
399 0 : ( mean_ee > 1.0f ) && /* energy concentrated in low frequencies (used 1.0 for WB) */
400 0 : ( fabs( pitch_fr[1] - pitch_fr[0] ) < 5.0f ) && /* small OL pitch difference in 1st sf. */
401 0 : ( fabs( pitch_fr[2] - pitch_fr[1] ) < 5.0f ) && /* small OL pitch difference in 2nd sf. */
402 0 : ( fabs( pitch_fr[3] - pitch_fr[2] ) < 5.0f ) ) /* small OL pitch difference in 3rd sf. */
403 : {
404 0 : st->hSC_VBR->set_ppp_generic = 1;
405 0 : coder_type = VOICED;
406 : }
407 : }
408 :
409 : /* set VOICED mode for frames with very stable pitch and high correlation
410 : and avoid to switch to AUDIO/MUSIC later */
411 2367701 : voicing_m = mean( voicing_fr, NB_SUBFR );
412 :
413 2367701 : dpit1 = (float) fabs( pitch_fr[0] - pitch_fr[1] );
414 2367701 : dpit2 = (float) fabs( pitch_fr[1] - pitch_fr[2] );
415 2367701 : dpit3 = (float) fabs( pitch_fr[2] - pitch_fr[3] );
416 :
417 2367701 : if ( *flag_spitch || ( dpit1 <= 3.0f && dpit2 <= 3.0f && dpit3 <= 3.0f &&
418 82009 : voicing_m > 0.95f && st->voicing_sm > 0.97f ) )
419 : {
420 43871 : coder_type = VOICED;
421 43871 : *flag_spitch = 1; /*to avoid switch to AUDIO/MUSIC later*/
422 : }
423 : }
424 :
425 : /*-----------------------------------------------------------------*
426 : * Channel-aware mode - set RF mode and total bitrate
427 : *-----------------------------------------------------------------*/
428 :
429 2965078 : st->rf_mode = st->Opt_RF_ON;
430 :
431 2965078 : if ( coder_type == GENERIC )
432 : {
433 1853979 : if ( ( voicing_fr[0] < VOI_THRLD ) && /* normalized correlation high in 1st sf. */
434 347382 : ( voicing_fr[1] < VOI_THRLD ) && /* normalized correlation high in 2st sf. */
435 324164 : ( voicing_fr[2] < VOI_THRLD ) && /* normalized correlation high in 3st sf. */
436 292889 : ( voicing_fr[3] < VOI_THRLD ) && /* normalized correlation high in 4st sf. */
437 : ( vadnoise > 25.0f ) ) /* when speech is clean */
438 : {
439 1 : st->rf_mode = 0;
440 :
441 : /* Current frame cannot be compressed to pack the partial redundancy */
442 1 : if ( st->rf_mode != st->Opt_RF_ON )
443 : {
444 0 : core_coder_mode_switch( st, st->last_total_brate, 0 );
445 : }
446 : }
447 : }
448 :
449 : /*-----------------------------------------------------------------*
450 : * UNCLR classifier
451 : *-----------------------------------------------------------------*/
452 :
453 2965078 : if ( hStereoClassif != NULL )
454 : {
455 2278188 : if ( st->element_mode > EVS_MONO && ( coder_type == GENERIC || coder_type == UNVOICED || coder_type == INACTIVE || st->localVAD == 0 ) && hStereoClassif->unclr_sw_enable_cnt[st->idchan] < MAX_UV_CNT )
456 : {
457 1470594 : hStereoClassif->unclr_sw_enable_cnt[st->idchan]++;
458 : }
459 : else
460 : {
461 807594 : hStereoClassif->unclr_sw_enable_cnt[st->idchan] = 0;
462 : }
463 : }
464 :
465 : /*-----------------------------------------------------------------*
466 : * Updates
467 : *-----------------------------------------------------------------*/
468 :
469 : /* update spike hysteresis parameters */
470 2965078 : if ( st->spike_hyst >= 0 && st->spike_hyst < 2 )
471 : {
472 47949 : st->spike_hyst++;
473 : }
474 :
475 : /* reset spike hysteresis */
476 2965078 : if ( ( st->spike_hyst > 1 ) &&
477 113464 : ( dE3 > 5.0f || /* energy increases */
478 17065 : ( relE > -13.0f && ( mean_voi3 + corr_shift > 0.695f ) ) ) ) /* normalized correlation is high */
479 : {
480 25604 : st->spike_hyst = -1;
481 : }
482 :
483 : /* update tilt parameters */
484 2965078 : st->ee_old = ee[1];
485 2965078 : st->old_dE1 = dE1;
486 :
487 : /* save the raw coder_type for various modules later in the codec (the reason is that e.g. UNVOICED is not used (rewritten) at higher rates) */
488 2965078 : st->coder_type_raw = coder_type;
489 :
490 2965078 : return coder_type;
491 : }
|