Line data Source code
1 : /******************************************************************************************************
2 :
3 : (C) 2022-2025 IVAS codec Public Collaboration with portions copyright Dolby International AB, Ericsson AB,
4 : Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD.,
5 : Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange,
6 : Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other
7 : contributors to this repository. All Rights Reserved.
8 :
9 : This software is protected by copyright law and by international treaties.
10 : The IVAS codec Public Collaboration consisting of Dolby International AB, Ericsson AB,
11 : Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD.,
12 : Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange,
13 : Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other
14 : contributors to this repository retain full ownership rights in their respective contributions in
15 : the software. This notice grants no license of any kind, including but not limited to patent
16 : license, nor is any license granted by implication, estoppel or otherwise.
17 :
18 : Contributors are required to enter into the IVAS codec Public Collaboration agreement before making
19 : contributions.
20 :
21 : This software is provided "AS IS", without any express or implied warranties. The software is in the
22 : development stage. It is intended exclusively for experts who have experience with such software and
23 : solely for the purpose of inspection. All implied warranties of non-infringement, merchantability
24 : and fitness for a particular purpose are hereby disclaimed and excluded.
25 :
26 : Any dispute, controversy or claim arising under or in relation to providing this software shall be
27 : submitted to and settled by the final, binding jurisdiction of the courts of Munich, Germany in
28 : accordance with the laws of the Federal Republic of Germany excluding its conflict of law rules and
29 : the United Nations Convention on Contracts on the International Sales of Goods.
30 :
31 : *******************************************************************************************************/
32 :
33 : /*====================================================================================
34 : EVS Codec 3GPP TS26.443 Nov 04, 2021. Version 12.14.0 / 13.10.0 / 14.6.0 / 15.4.0 / 16.3.0
35 : ====================================================================================*/
36 :
37 : #include "cnst.h"
38 : #include <stdint.h>
39 : #include "options.h"
40 : #ifdef DEBUGGING
41 : #include "debug.h"
42 : #endif
43 : #include <math.h>
44 : #include "prot.h"
45 : #include "rom_com.h"
46 : #include "stat_dec.h"
47 : #include "wmc_auto.h"
48 :
49 : /*-------------------------------------------------------------------*
50 : * Local constants
51 : *-------------------------------------------------------------------*/
52 :
53 : #define K_COR_DEC 0.8547f /* <-0.29, 0.88> */
54 : #define C_COR_DEC 0.2479f
55 :
56 : #define K_TILT_DEC 0.8333f /* <-0.35, 0.85> */
57 : #define C_TILT_DEC 0.2917f
58 :
59 : #define K_ZC_DEC -0.04f /* <63, 38> */
60 : #define C_ZC_DEC 2.52f
61 :
62 : #define K_ENR_DEC 0.04f /* <-14, 11> */
63 : #define C_ENR_DEC 0.56f
64 :
65 : #define K_PC_DEC -0.0357f /* <45, 17> */
66 : #define C_PC_DEC 1.6071f
67 :
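/* The (K, C) pairs above are linear mappings y = K*x + C that take the feature range quoted
   in the trailing comments onto roughly [0, 1]; e.g. 0.8547 * (-0.29) + 0.2479 = 0.0 and
   0.8547 * 0.88 + 0.2479 = 1.0 for the correlation feature. The sketch below is illustrative
   only (map_lin is a hypothetical helper, not part of the codec) and is kept out of the build. */
#if 0
#include <stdio.h>

static float map_lin( const float x, const float k, const float c )
{
    return k * x + c; /* same form as the feature normalizations in FEC_ClassifierCore() */
}

int main( void )
{
    /* correlation endpoints <-0.29, 0.88> map to ~0 and ~1 */
    printf( "%.4f %.4f\n", map_lin( -0.29f, K_COR_DEC, C_COR_DEC ), map_lin( 0.88f, K_COR_DEC, C_COR_DEC ) );

    /* zero-crossing endpoints <63, 38> also map to ~0 and ~1 (negative slope) */
    printf( "%.4f %.4f\n", map_lin( 63.0f, K_ZC_DEC, C_ZC_DEC ), map_lin( 38.0f, K_ZC_DEC, C_ZC_DEC ) );

    return 0;
}
#endif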
68 : /*-------------------------------------------------------------------*
69 : * Local function prototypes
70 : *-------------------------------------------------------------------*/
71 :
72 : static float calculate_zero_crossings( const float *synth, const int16_t L_frame );
73 :
74 : static float calculate_pitch_synchr_norm_correlation( const float *pitch, const float *synth, const int16_t L_frame, const int16_t L_subfr );
75 :
76 : static float calculate_spectral_tilt( const float *synth, const int16_t nbSubfr, const int16_t L_subfr );
77 :
78 : static int16_t calculate_classification_result( const int16_t last_good, float fmerit1, const float ener, const int16_t codec_mode );
79 :
80 : static void FEC_ClassifierCore( const float *synth, const float *pitch, const int16_t L_frame, const int16_t codec_mode, int16_t *clas, float *lp_speech, float *frame_ener, float *ener, float *enern, float *class_para, const float LTP_Gain, const int16_t narrowBand, const SIGNAL_CLASSIFIER_MODE mode );
81 :
82 : static int16_t FEC_dec_class( Decoder_State *st, float *enr_q );
83 :
84 : static void FEC_classificationMusic( const int16_t coder_type, int16_t *decision_hyst, int16_t *clas );
85 :
86 :
87 : /*------------------------------------------------------------------------*
88 : * calculate_zero_crossings()
89 : *
90 :  * Count the negative-going zero crossings of the synthesis signal and rescale
91 :  * the count to the 12.8 kHz core frame length (256 samples)
92 :
93 301080 : static float calculate_zero_crossings(
94 : const float *synth,
95 : const int16_t L_frame )
96 : {
97 : int16_t i;
98 : float zc_frame;
99 :
100 301080 : zc_frame = 0.f;
101 :
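    /* count positive-to-non-positive transitions; synth[-1] is the last sample of the
       previous frame kept in the classifier synthesis memory */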
102 85761816 : for ( i = 0; i < L_frame; ++i )
103 : {
104 85460736 : if ( ( synth[i] <= 0.0f ) && ( synth[i - 1] > 0.0f ) )
105 : {
106 6962205 : zc_frame += 1.0f;
107 : }
108 : }
109 :
110 :     /* Renormalization for the 12.8 kHz core (256 samples per frame) */
111 301080 : zc_frame *= 256.0f / (float) L_frame;
112 :
113 301080 : return zc_frame;
114 : }
115 :
116 : /*------------------------------------------------------------------------*
117 : * calculate_pitch_synchr_norm_correlation()
118 :  * Compute the pitch-synchronous normalized correlation at the end of the frame,
119 :  * averaged over the analysed pitch-period segments (voicing measure)
120 : *------------------------------------------------------------------------*/
121 :
122 301080 : static float calculate_pitch_synchr_norm_correlation(
123 : const float *pitch,
124 : const float *synth,
125 : const int16_t L_frame,
126 : const int16_t L_subfr )
127 : {
128 : int16_t T0, pos;
129 : int16_t j, i;
130 : float cor_max[16];
131 : float enr1t, enr2t, voicing;
132 :
133 301080 : T0 = (int16_t) pitch[3];
134 301080 : set_zero( cor_max, 16 );
135 :
136 301080 : if ( T0 > ( 3 * ( L_subfr / 2 ) ) )
137 : {
138 217605 : T0 = (int16_t) ( 0.5f * ( pitch[2] + pitch[3] ) + 0.5f );
139 : }
140 :
141 301080 : pos = L_frame;
142 301080 : j = 0;
143 :
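    /* step backwards from the frame end in multiples of the pitch period and correlate
       each segment with the preceding pitch cycle */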
144 657273 : while ( pos > ( L_frame / L_subfr == 4 ? 3 * L_subfr : 4 * L_subfr ) )
145 : {
146 356193 : pos -= T0;
147 :
148 356193 : cor_max[j] = dotp( &synth[pos], &synth[pos - T0], T0 );
149 356193 : enr1t = dotp( &synth[pos - T0], &synth[pos - T0], T0 );
150 356193 : enr2t = dotp( &synth[pos], &synth[pos], T0 );
151 :
152 356193 : cor_max[j] *= inv_sqrt( enr1t * enr2t + 0.1f );
153 :
154 :
155 356193 : if ( ( pos - T0 ) < ( L_frame - L_subfr ) )
156 : {
157 347637 : T0 = (int16_t) ( 0.5f * ( pitch[2] + pitch[3] ) + 0.5f );
158 : }
159 356193 : j++;
160 : }
161 :
162 301080 : voicing = cor_max[0];
163 :
164 356193 : for ( i = 1; i < j; ++i )
165 : {
166 55113 : voicing += cor_max[i];
167 : }
168 :
169 301080 : if ( j > 1 )
170 : {
171 47568 : voicing /= j;
172 : }
173 :
174 301080 : return voicing;
175 : }
176 :
177 :
178 : /*------------------------------------------------------------------------*
179 : * calculate_spectral_tilt()
180 :  * Estimate the spectral tilt as the lag-1 correlation of the synthesis signal
181 :  * normalized by its energy, accumulated over the inner subframes
182 : *------------------------------------------------------------------------*/
183 :
184 301080 : static float calculate_spectral_tilt(
185 : const float *synth,
186 : const int16_t nbSubfr,
187 : const int16_t L_subfr )
188 : {
189 : int16_t i;
190 : float tilt;
191 :
192 301080 : float enr1t = 0.0f;
193 301080 : float enr2t = 0.0f;
194 301080 : const float *pt1 = synth + L_subfr;
195 301080 : const float *pt2 = synth + L_subfr - 1;
196 :
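    /* accumulate the energy (enr1t) and the lag-1 correlation (enr2t) over the subframes
       following the first one; their ratio approximates the first normalized autocorrelation
       coefficient, i.e. the spectral tilt */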
197 1335324 : for ( i = 0; i < nbSubfr - 1; ++i )
198 : {
199 1034244 : enr1t += dotp( pt1, pt1, L_subfr );
200 1034244 : enr2t += dotp( pt1, pt2, L_subfr );
201 :
202 1034244 : pt1 += L_subfr;
203 1034244 : pt2 += L_subfr;
204 : }
205 :
206 301080 : tilt = enr2t / ( enr1t + 0.1f );
207 :
208 301080 : return tilt;
209 : }
210 :
211 : /*------------------------------------------------------------------------*
212 : * calculate_classification_result()
213 :  * Map the merit function fmerit1 to a frame class, with hysteresis based on the
214 :  * class of the last good frame
215 : *------------------------------------------------------------------------*/
216 :
217 301080 : static int16_t calculate_classification_result(
218 : const int16_t last_good,
219 : float fmerit1,
220 : const float ener,
221 : const int16_t codec_mode )
222 : {
223 301080 : int16_t result = UNVOICED_CLAS;
224 :
225 301080 : switch ( last_good )
226 : {
227 243186 : case VOICED_CLAS:
228 : case ONSET:
229 : case SIN_ONSET:
230 : case VOICED_TRANSITION:
231 :
232 243186 : if ( fmerit1 < 0.39f )
233 : {
234 12567 : result = UNVOICED_CLAS;
235 : }
236 230619 : else if ( fmerit1 < 0.63f && ( codec_mode == MODE2 || ener < -15.0f ) )
237 : {
238 60012 : result = VOICED_TRANSITION;
239 : }
240 : else
241 : {
242 170607 : result = VOICED_CLAS;
243 : }
244 243186 : break;
245 :
246 57894 : case UNVOICED_CLAS:
247 : case UNVOICED_TRANSITION:
248 : case INACTIVE_CLAS: /* relevant for MODE2 only */
249 :
250 57894 : if ( fmerit1 > 0.56f )
251 : {
252 32616 : result = ONSET;
253 : }
254 25278 : else if ( fmerit1 > 0.45f )
255 : {
256 8241 : result = UNVOICED_TRANSITION;
257 : }
258 : else
259 : {
260 17037 : result = UNVOICED_CLAS;
261 : }
262 57894 : break;
263 : }
264 :
265 301080 : return result;
266 : }
267 :
268 :
269 : /*------------------------------------------------------------------------*
270 : * FEC_ClassifierCore()
271 :  * Core FEC classifier: compute the signal features, normalize them to [0,1],
272 :  * combine them into the merit function fmerit1 and classify the frame
273 : *------------------------------------------------------------------------*/
274 :
275 301080 : static void FEC_ClassifierCore(
276 : const float *synth,
277 : const float *pitch, /* i : pitch values for each subframe */
278 : const int16_t L_frame, /* i : length of the frame */
279 : const int16_t codec_mode, /* i : codec mode */
280 : int16_t *clas, /* i/o: frame classification */
281 : float *lp_speech, /* i/o: long term active speech energy average */
282 :     float *frame_ener,                    /* o  : frame energy                                */
283 :     float *ener,                          /* o  : pitch-synchronous energy at frame end       */
284 :     float *enern,                         /* o  : normalized energy                           */
285 :     float *class_para,                    /* o  : classification para. fmerit1                */
286 :     const float LTP_Gain,                 /* i  : LTP gain, -1 if not available               */
287 :     const int16_t narrowBand,             /* i  : narrowband flag                             */
288 :     const SIGNAL_CLASSIFIER_MODE mode     /* i  : signal classifier mode (ACELP/TCX)          */
289 : )
290 : {
291 : float zc_frame;
292 : float tiltn, corn, zcn, pcn, fmerit1, tilt;
293 : float voicing;
294 301080 : float pc = 0.0f;
295 :
296 : /*------------------------------------------------------------------------*
297 : * Compute the zero crossing rate for all subframes
298 : *------------------------------------------------------------------------*/
299 :
300 301080 : zc_frame = calculate_zero_crossings( synth, L_frame );
301 :
302 : /*------------------------------------------------------------------------*
303 : * Compute the normalized correlation pitch-synch at the end of the frame
304 : *------------------------------------------------------------------------*/
305 :
306 301080 : voicing = calculate_pitch_synchr_norm_correlation( pitch, synth, L_frame, L_SUBFR );
307 :
308 : /*------------------------------------------------------------------------*
309 : * Compute pitch coherence
310 : *------------------------------------------------------------------------*/
311 :
312 301080 : pc = 0; /* just to remove MSVC warnings */
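    /* pitch coherence: change of the pitch lag between the two frame halves, rescaled to the
       12.8 kHz core; for TCX with a valid LTP gain, the LTP gain is used instead (see below) */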
313 301080 : if ( codec_mode == MODE1 || !( LTP_Gain != -1.0f && mode == CLASSIFIER_TCX ) )
314 : {
315 109053 : pc = (float) fabs( pitch[3] + pitch[2] - pitch[1] - pitch[0] ) * 256.0f / (float) L_frame;
316 : }
317 :
318 : /*------------------------------------------------------------------------*
319 : * Compute spectral tilt
320 : *------------------------------------------------------------------------*/
321 :
322 301080 : tilt = calculate_spectral_tilt( synth, L_frame / L_SUBFR, L_SUBFR );
323 :
324 : /*------------------------------------------------------------------------*
325 : * Compute pitch-synchronous energy at the frame end
326 : *------------------------------------------------------------------------*/
327 :
328 301080 : *ener = frame_energy( L_frame, pitch, synth, *lp_speech, frame_ener );
329 :
330 : /*------------------------------------------------------------------------*
331 : * transform parameters between 0 & 1
332 : * find unique merit function
333 : *------------------------------------------------------------------------*/
334 :
335 301080 : *enern = K_ENR_DEC * *ener + C_ENR_DEC;
336 301080 : tiltn = K_TILT_DEC * tilt + C_TILT_DEC;
337 301080 : corn = K_COR_DEC * voicing + C_COR_DEC;
338 301080 : zcn = K_ZC_DEC * zc_frame + C_ZC_DEC;
339 :
340 301080 : if ( codec_mode == MODE2 && LTP_Gain != -1.0f && mode == CLASSIFIER_TCX )
341 : {
342 192027 : pcn = LTP_Gain * C_PC_DEC;
343 : }
344 : else
345 : {
346 109053 : pcn = K_PC_DEC * pc + C_PC_DEC;
347 : }
348 :
349 301080 : if ( pcn > 1.0f )
350 : {
351 114666 : pcn = 1.0f;
352 : }
353 186414 : else if ( pcn < 0.0f )
354 : {
355 8811 : pcn = 0.0f;
356 : }
357 :
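    /* fmerit1: average of the five normalized features with the correlation term weighted
       twice; since each term lies (approximately) in [0, 1], larger values indicate more
       voiced/periodic content */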
358 301080 : fmerit1 = ( 1.0f / 6.0f ) * ( tiltn + 2.0f * corn + zcn + pcn + *enern );
359 :
360 301080 : if ( codec_mode == MODE2 && narrowBand )
361 : {
362 0 : fmerit1 *= 0.9f;
363 : }
364 :
365 : /*------------------------------------------------------------------------*
366 : * frame classification
367 : *------------------------------------------------------------------------*/
368 :
369 301080 : if ( codec_mode == MODE1 )
370 : {
371 63813 : *class_para = fmerit1;
372 : }
373 301080 : *clas = calculate_classification_result( *clas, fmerit1, *ener, codec_mode );
374 :
375 301080 : return;
376 : }
377 :
378 :
379 : /*------------------------------------------------------------------------*
380 : * FEC_clas_estim()
381 : *
382 : * Estimation of frame class, if not available in the bitstream
383 : *------------------------------------------------------------------------*/
384 :
385 845340 : void FEC_clas_estim(
386 : const float *syn,
387 : const float *pitch, /* i : pitch values for each subframe */
388 : const int16_t L_frame, /* i : length of the frame */
389 : const int16_t coder_type, /* i : coder type */
390 : const int16_t codec_mode, /* i : codec mode */
391 : float *mem_syn_clas_estim, /* i/o: memory of the synthesis signal for frame class estimation */
392 : int16_t *clas, /* i/o: frame classification */
393 : float *lp_speech, /* i/o: long term active speech energy average */
394 : const int16_t Opt_AMR_WB, /* i : flag indicating AMR-WB IO mode */
395 : int16_t *decision_hyst, /* i/o: hysteresis of the music/speech decision */
396 :     int16_t *locattack,                     /* i/o: attack detection (mainly to localize speech bursts)      */
397 :     int16_t *UV_cnt,                        /* i/o: number of consecutive frames classified as UV            */
398 :     float *LT_UV_cnt,                       /* i/o: long-term number of consecutive frames classified as UV  */
399 :     float *Last_ener,                       /* i/o: energy of the last frame                                  */
400 : int16_t *amr_io_class, /* i/o: classification for AMR-WB IO mode */
401 : float *lt_diff_etot, /* i/o: long-term total energy variation */
402 : float *class_para, /* o : classification para. fmerit1 */
403 :     const float LTP_Gain,                   /* i  : LTP gain, -1 if not available             */
404 :     const int16_t narrowBand,               /* i  : narrowband flag                           */
405 :     const SIGNAL_CLASSIFIER_MODE mode,      /* i  : signal classifier mode (ACELP/TCX)        */
406 :     const int16_t bfi,                      /* i  : bad frame indicator                       */
407 :     const float preemph_fac,                /* i  : pre-emphasis factor                       */
408 :     const int16_t tcxonly,                  /* i  : TCX-only mode flag                        */
409 : const int32_t last_core_brate, /* i : last core bitrate */
410 : const int16_t FEC_mode /* i : ACELP FEC mode */
411 : )
412 : {
413 : float diff_ener;
414 : float mean_diff;
415 : int16_t i;
416 : float ftmp_c, fcorr, dev;
417 845340 : float frame_ener = 0.0f;
418 845340 : float ener = 0.0f;
419 845340 : float enern = 0.0f;
420 : float tmp;
421 : float old_synth[L_SYN_CLAS_ESTIM], *synth;
422 :
423 :     /* After rate switching st->last_core is reset to 0, so the check must be done on last_core_brate */
424 845340 : if ( ( last_core_brate == SID_1k75 || last_core_brate == ACELP_6k60 || last_core_brate == ACELP_8k85 || last_core_brate == ACELP_12k65 || last_core_brate == ACELP_14k25 || last_core_brate == ACELP_15k85 || last_core_brate == ACELP_18k25 || last_core_brate == ACELP_19k85 || last_core_brate == ACELP_23k05 || last_core_brate == ACELP_23k85 ) && !Opt_AMR_WB && codec_mode == MODE2 && L_frame > L_FRAME )
425 : {
426 : int16_t oldLenClasBuff, newLenClasBuff;
427 2052 : oldLenClasBuff = (int16_t) ( L_SYN_MEM_CLAS_ESTIM * L_FRAME / L_frame );
428 2052 : newLenClasBuff = L_SYN_MEM_CLAS_ESTIM;
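        /* stretch the classifier memory stored at the previous (12.8 kHz core) length to the
           length required by the current core */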
429 2052 : lerp( &mem_syn_clas_estim[L_SYN_MEM_CLAS_ESTIM - oldLenClasBuff], &mem_syn_clas_estim[L_SYN_MEM_CLAS_ESTIM - newLenClasBuff], newLenClasBuff, oldLenClasBuff );
430 : }
431 :
432 845340 : synth = old_synth + L_SYN_MEM_CLAS_ESTIM; /* Set pointer to current frame */
433 845340 : mvr2r( mem_syn_clas_estim, old_synth, L_SYN_MEM_CLAS_ESTIM ); /* Copy old synthesis into local buffer */
434 845340 : mvr2r( syn, synth, L_frame ); /* Copy current synthesis into local buffer */
435 :
436 :     /* TCX outputs non-pre-emphasized speech */
437 845340 : if ( codec_mode == MODE2 && mode == CLASSIFIER_TCX )
438 : {
439 383574 : tmp = syn[-1];
440 383574 : preemph( synth, preemph_fac, L_frame, &tmp );
441 : }
442 :
443 : /* Do the classification only
444 : - MODE1: when the class is not transmitted in the bitstream
445 : - MODE2: on good frames (classifier is also called for bfi=1) */
446 845340 : if ( ( codec_mode == MODE1 && ( FEC_mode == 0 || coder_type <= UNVOICED || Opt_AMR_WB ) ) || ( codec_mode == MODE2 && bfi != 1 && !tcxonly ) )
447 : {
448 :
449 : /*------------------------------------------------------------------------*
450 : * classification decision depending on coder_type information
451 : *------------------------------------------------------------------------*/
452 :
453 501483 : if ( coder_type == UNVOICED )
454 : {
455 68667 : *clas = UNVOICED_CLAS;
456 : }
457 432816 : else if ( coder_type == VOICED )
458 : {
459 89232 : *clas = VOICED_CLAS;
460 : }
461 343584 : else if ( coder_type == INACTIVE && !Opt_AMR_WB )
462 : {
463 42504 : *clas = INACTIVE_CLAS;
464 : }
465 : else
466 : {
467 : /*------------------------------------------------------------------------*
468 : * GC, TC and AC frames
469 : *------------------------------------------------------------------------*/
470 :
471 301080 : FEC_ClassifierCore( synth, pitch, L_frame, codec_mode, clas, lp_speech, &frame_ener, &ener, &enern, class_para, LTP_Gain, narrowBand, mode );
472 : }
473 :
474 : /*------------------------------------------------------------------------*
475 : * Overwrite classification decision in case of music
476 : *------------------------------------------------------------------------*/
477 :
478 501483 : if ( codec_mode == MODE1 )
479 : {
480 123315 : FEC_classificationMusic( coder_type, decision_hyst, clas );
481 : }
482 :
483 : /*---------------------------------------------------------------------------------*
484 : * Measure energy on active voice frames (to improve FEC performance)
485 : *---------------------------------------------------------------------------------*/
486 :
487 501483 : if ( *clas == VOICED_CLAS )
488 : {
489 263343 : if ( ( codec_mode == MODE2 && coder_type == VOICED ) ||
490 66093 : ( codec_mode == MODE1 && ( Opt_AMR_WB || ( coder_type != GENERIC && coder_type != TRANSITION ) ) ) )
491 : {
492 110643 : enern = frame_energy( L_frame, pitch, synth, *lp_speech, &frame_ener );
493 : }
494 :
495 263343 : *lp_speech = 0.99f * *lp_speech + 0.01f * frame_ener;
496 : }
497 :
498 501483 : if ( codec_mode == MODE1 )
499 : {
500 : /*---------------------------------------------------------------------------------*
501 : * Overwrite classification decision to UNVOICED_CLAS in case of INACTIVE frame
502 : *---------------------------------------------------------------------------------*/
503 :
504 123315 : if ( coder_type == INACTIVE && *clas != INACTIVE_CLAS )
505 : {
506 0 : *clas = UNVOICED_CLAS;
507 : }
508 :
509 : /*---------------------------------------------------------------------------------*
510 : * Classification refinement to improve noise coding (only in AMR-WB IO mode)
511 : *---------------------------------------------------------------------------------*/
512 :
513 123315 : if ( Opt_AMR_WB )
514 : {
515 0 : *locattack = 0;
516 :
517 0 : if ( *clas == UNVOICED_CLAS && coder_type != INACTIVE )
518 : {
519 : /* unvoiced signal but not silence */
520 0 : if ( *lp_speech <= 40 )
521 : {
522 0 : *UV_cnt = 16;
523 : }
524 : else
525 : {
526 0 : *UV_cnt -= 8;
527 : }
528 : }
529 0 : else if ( coder_type != INACTIVE )
530 : {
531 : /* neither unvoiced nor clean silence */
532 0 : ( *UV_cnt )++;
533 : }
534 :
535 0 : if ( *UV_cnt > 300 )
536 : {
537 0 : *UV_cnt = 300;
538 : }
539 0 : else if ( *UV_cnt < 0 )
540 : {
541 0 : *UV_cnt = 0;
542 : }
543 :
544 : /* Update long-term average number of frames between UNVOICED */
545 0 : if ( coder_type == INACTIVE )
546 : {
547 : /* tends to speech if no activity */
548 0 : *LT_UV_cnt = 0.95f * *LT_UV_cnt;
549 :
550 0 : if ( *UV_cnt > 125 )
551 : {
552 0 : *UV_cnt = 125;
553 : }
554 : }
555 : else
556 : {
557 0 : *LT_UV_cnt = 0.9f * *LT_UV_cnt + 0.1f * *UV_cnt;
558 : }
559 :
560 : /*-----------------------------------------------------------------------------*
561 : * Compute frame energy difference
562 :              * IF the long-term UV average is high and the energy difference is below 12 dB
563 :              *   the classification is overwritten to AUDIO
564 :              * IF the energy difference exceeds 6 dB (AUDIO class) or 9 dB (otherwise)
565 :              *   consider it an attack
566 : *-----------------------------------------------------------------------------*/
567 :
568 0 : diff_ener = ener - *Last_ener;
569 0 : *Last_ener = ener;
570 0 : *amr_io_class = *clas;
571 :
572 0 : if ( *LT_UV_cnt > LT_UV_THR && diff_ener < 12.0f )
573 : {
574 0 : *amr_io_class = AUDIO_CLAS;
575 : }
576 :
577 0 : if ( ( diff_ener > 6.0f && *clas == AUDIO_CLAS ) || ( diff_ener > 9.0f ) )
578 : {
579 0 : *locattack = 1;
580 : }
581 :
582 : /*------------------------------------------------------------------------*
583 : * Compute statistical deviation of long-term energy variation and
584 : * overwrite classification, if needed
585 : *------------------------------------------------------------------------*/
586 :
587 0 : if ( coder_type != INACTIVE )
588 : {
589 : /* calculate mean energy variation of past MAX_LT frames */
590 0 : mean_diff = 0.0f;
591 0 : for ( i = 1; i < MAX_LT; i++ )
592 : {
593 0 : mean_diff += lt_diff_etot[i - 1] * INV_MAX_LT;
594 0 : lt_diff_etot[i - 1] = lt_diff_etot[i];
595 : }
596 0 : mean_diff += lt_diff_etot[i - 1] * INV_MAX_LT;
597 :
598 : /* find statistical deviation of the energy variation history against the last 15 frames */
599 0 : fcorr = 0.0f;
600 0 : for ( i = MAX_LT - 15; i < MAX_LT; i++ )
601 : {
602 0 : ftmp_c = lt_diff_etot[i] - mean_diff;
603 0 : fcorr += ftmp_c * ftmp_c;
604 : }
605 0 : lt_diff_etot[i - 1] = diff_ener;
606 :
607 : /* compute statistical deviation */
608 0 : dev = (float) sqrt( fcorr / ( MAX_LT - 15 ) );
609 :
610 : /* overwrite classification, if needed */
611 0 : if ( *amr_io_class == AUDIO_CLAS && dev > 5.0f )
612 : {
613 0 : *amr_io_class = *clas;
614 0 : *UV_cnt = (int16_t) ( 80 + *UV_cnt * 0.2f );
615 : }
616 : }
617 : }
618 : }
619 : }
620 :
621 : /* update the memory of synthesis for frame class estimation */
622 845340 : mvr2r( old_synth + L_frame, mem_syn_clas_estim, L_SYN_MEM_CLAS_ESTIM );
623 :
624 845340 : return;
625 : }
626 :
627 :
628 : /*------------------------------------------------------------------------*
629 : * FEC_dec_class()
630 : *
631 : * Decode class and energy information
632 : *------------------------------------------------------------------------*/
633 :
634 326259 : static int16_t FEC_dec_class(
635 : Decoder_State *st, /* i/o: decoder state structure */
636 : float *enr_q /* i : decoded energy */
637 : )
638 : {
639 : int16_t clas, tmpS;
640 :
641 326259 : clas = ONSET;
642 :
643 326259 : if ( st->coder_type != VOICED )
644 : {
645 : /* decode the class */
646 311544 : tmpS = get_next_indice( st, FEC_BITS_CLS );
647 :
648 311544 : if ( tmpS == 0 )
649 : {
650 134250 : clas = UNVOICED_CLAS;
651 : }
652 177294 : else if ( tmpS == 1 )
653 : {
654 21180 : if ( st->last_good >= VOICED_TRANSITION )
655 : {
656 17829 : clas = VOICED_TRANSITION;
657 : }
658 : else
659 : {
660 3351 : clas = UNVOICED_TRANSITION;
661 : }
662 : }
663 156114 : else if ( tmpS == 2 )
664 : {
665 134049 : clas = VOICED_CLAS;
666 : }
667 : }
668 : else
669 : {
670 14715 : clas = VOICED_CLAS;
671 : }
672 :
673 : /* decode the energy */
674 326259 : if ( st->acelp_cfg.FEC_mode > 1 )
675 : {
676 183759 : tmpS = get_next_indice( st, FEC_BITS_ENR );
677 :
678 : /* convert from logarithmic to linear domain (the range is 0 : 1.55 : 96 dB) */
679 183759 : *enr_q = (float) pow( 10.0f, ( (float) tmpS * FEC_ENR_STEP ) / 10.0f );
680 : }
681 :
682 326259 : return clas;
683 : }
684 :
685 :
686 : /*------------------------------------------------------------------------*
687 : * FEC_classificationMusic()
688 :  * Overwrite the classification decision for music-like content, using a hysteresis
689 :  * counter driven by the AUDIO/INACTIVE coder types
690 : *------------------------------------------------------------------------*/
691 :
692 123315 : static void FEC_classificationMusic(
693 : const int16_t coder_type, /* i : coder type */
694 : int16_t *decision_hyst, /* i/o: hysteresis of the music/speech decision */
695 : int16_t *clas /* i/o: frame classification */
696 : )
697 : {
698 123315 : if ( coder_type == AUDIO )
699 : {
700 22476 : ( *decision_hyst ) += 4;
701 : }
702 : else
703 : {
704 100839 : ( *decision_hyst )--;
705 : }
706 :
707 123315 : if ( coder_type == INACTIVE )
708 : {
709 42504 : *decision_hyst -= 10;
710 : }
711 :
712 123315 : if ( *decision_hyst > 200 )
713 : {
714 12099 : *decision_hyst = 200;
715 : }
716 111216 : else if ( *decision_hyst < 0 )
717 : {
718 79140 : *decision_hyst = 0;
719 : }
720 :
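    /* for AUDIO frames with the hysteresis counter above 16, classes below VOICED_CLAS are
       promoted to VOICED_CLAS */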
721 123315 : if ( *decision_hyst > 16 && *clas < VOICED_CLAS && coder_type == AUDIO )
722 : {
723 3504 : *clas = VOICED_CLAS;
724 : }
725 :
726 123315 : return;
727 : }
728 :
729 :
730 : /*------------------------------------------------------------------------*
731 : * FEC_pos_dec()
732 : *
733 : * Decode class, energy and last glottal pulse position at higher bitrates
734 : * ( last glottal pulse position is encoded only in GENERIC frames )
735 : *------------------------------------------------------------------------*/
736 :
737 357147 : int16_t FEC_pos_dec(
738 : Decoder_State *st, /* i/o: decoder state structure */
739 : int16_t *last_pulse_pos, /* o : last glottal pulse position in the lost ACB */
740 : float *enr_q, /* o : decoded energy */
741 : const int16_t nBits_es_Pred /* i : number of bits for Es_pred Q */
742 :
743 : )
744 : {
745 : int16_t pitch_index, T0, T0_frac, T0_min, T0_max;
746 : int16_t bit_pos_pitch_index, nBits;
747 :
748 357147 : T0 = 0;
749 357147 : if ( st->coder_type > UNVOICED )
750 : {
751 : /* decode the clas and energy information */
752 336639 : if ( st->coder_type < AUDIO )
753 : {
754 326259 : st->clas_dec = FEC_dec_class( st, enr_q );
755 :
756 326259 : if ( st->coder_type == GENERIC && st->clas_dec == VOICED_CLAS && ( st->last_good <= UNVOICED_CLAS || st->last_good == INACTIVE_CLAS ) )
757 : {
758 135 : st->clas_dec = SIN_ONSET;
759 : }
760 : }
761 :
762 336639 : if ( st->coder_type == GENERIC && st->acelp_cfg.FEC_mode > 2 )
763 : {
764 31713 : nBits = st->acelp_cfg.pitch_bits[0];
765 :
766 :             /* The first pitch index is located right after the current bit position, offset by the last pulse position index and the predicted innovation energy index */
767 31713 : bit_pos_pitch_index = st->next_bit_pos + FEC_BITS_POS + nBits_es_Pred;
768 31713 : if ( st->core_brate >= MIN_BRATE_AVQ_EXC && st->core_brate <= MAX_BRATE_AVQ_EXC_TD && st->coder_type == GENERIC )
769 : {
770 : /* Harmonic flag is present */
771 23295 : bit_pos_pitch_index++;
772 : }
773 :
774 : /* retrieve the pitch index */
775 31713 : pitch_index = get_indice( st, bit_pos_pitch_index, nBits );
776 :
777 : /* decode pitch period */
778 31713 : T0_min = PIT_MIN;
779 31713 : T0_max = PIT_MAX;
780 31713 : pit16k_Q_dec( pitch_index, 10, 1, &T0, &T0_frac, &T0_min, &T0_max, &st->BER_detect );
781 :
782 : /* decode last pulse position */
783 31713 : *last_pulse_pos = get_next_indice( st, FEC_BITS_POS );
784 :
785 : /* respect the sign */
786 31713 : if ( *last_pulse_pos >= 128 )
787 : {
788 5301 : *last_pulse_pos = -( *last_pulse_pos & 0x7F );
789 : }
790 :
791 31713 : if ( T0 >= 128 )
792 : {
793 10020 : *last_pulse_pos *= 2;
794 : }
795 :
796 31713 : if ( st->BER_detect )
797 : {
798 0 : *last_pulse_pos = 0;
799 : }
800 : }
801 : }
802 :
803 357147 : return T0;
804 : }