Line data Source code
1 : /******************************************************************************************************
2 :
3 : (C) 2022-2025 IVAS codec Public Collaboration with portions copyright Dolby International AB, Ericsson AB,
4 : Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD.,
5 : Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange,
6 : Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other
7 : contributors to this repository. All Rights Reserved.
8 :
9 : This software is protected by copyright law and by international treaties.
10 : The IVAS codec Public Collaboration consisting of Dolby International AB, Ericsson AB,
11 : Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD.,
12 : Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange,
13 : Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other
14 : contributors to this repository retain full ownership rights in their respective contributions in
15 : the software. This notice grants no license of any kind, including but not limited to patent
16 : license, nor is any license granted by implication, estoppel or otherwise.
17 :
18 : Contributors are required to enter into the IVAS codec Public Collaboration agreement before making
19 : contributions.
20 :
21 : This software is provided "AS IS", without any express or implied warranties. The software is in the
22 : development stage. It is intended exclusively for experts who have experience with such software and
23 : solely for the purpose of inspection. All implied warranties of non-infringement, merchantability
24 : and fitness for a particular purpose are hereby disclaimed and excluded.
25 :
26 : Any dispute, controversy or claim arising under or in relation to providing this software shall be
27 : submitted to and settled by the final, binding jurisdiction of the courts of Munich, Germany in
28 : accordance with the laws of the Federal Republic of Germany excluding its conflict of law rules and
29 : the United Nations Convention on Contracts on the International Sales of Goods.
30 :
31 : *******************************************************************************************************/
32 :
33 : /*====================================================================================
34 : EVS Codec 3GPP TS26.443 Nov 04, 2021. Version 12.14.0 / 13.10.0 / 14.6.0 / 15.4.0 / 16.3.0
35 : ====================================================================================*/
36 :
37 : #include <stdint.h>
38 : #include "options.h"
39 : #ifdef DEBUGGING
40 : #include "debug.h"
41 : #endif
42 : #include "cnst.h"
43 : #include "rom_enc.h"
44 : #include "rom_com.h"
45 : #include "prot.h"
46 : #include "wmc_auto.h"
47 :
48 :
49 : /*-------------------------------------------------------------------*
50 : * pre_proc()
51 : *
52 : * Pre-processing (spectral analysis, LP analysis, VAD, OL pitch calculation, coder mode selection, ...)
53 : *--------------------------------------------------------------------*/
54 :
55 3100 : void pre_proc(
56 : Encoder_State *st, /* i/o: encoder state structure */
57 : const int16_t input_frame, /* i : frame length */
58 : float old_inp_12k8[], /* i/o: buffer of old input signal */
59 : float old_inp_16k[], /* i/o: buffer of old input signal @ 16kHz */
60 : float **inp, /* o : ptr. to inp. signal in the current frame*/
61 : float fr_bands[2 * NB_BANDS], /* i : energy in frequency bands */
62 : float *ener, /* o : residual energy from Levinson-Durbin */
63 : #ifndef FIX_I4_OL_PITCH
64 : int16_t pitch_orig[3], /* o : open-loop pitch values for quantization */
65 : #endif
66 : float A[NB_SUBFR16k * ( M + 1 )], /* i/o: A(z) unquantized for the 4 subframes */
67 : float Aw[NB_SUBFR16k * ( M + 1 )], /* i/o: weighted A(z) unquantized for subframes */
68 : float epsP[M + 1], /* i/o: LP prediction errors */
69 : float lsp_new[M], /* i/o: LSPs at the end of the frame */
70 : float lsp_mid[M], /* i/o: LSPs in the middle of the frame */
71 : int16_t *vad_hover_flag, /* i : VAD hangover flag */
72 : int16_t *attack_flag, /* o : attack flag */
73 : float *new_inp_resamp16k, /* o : new input signal @16kHz, non pre-emphasised, used by the WB TBE/BWE */
74 : int16_t *Voicing_flag, /* o : voicing flag for HQ FEC */
75 : float realBuffer[CLDFB_NO_COL_MAX][CLDFB_NO_CHANNELS_MAX], /* i/o: real buffer */
76 : float imagBuffer[CLDFB_NO_COL_MAX][CLDFB_NO_CHANNELS_MAX], /* i/o: imag buffer */
77 : int16_t *hq_core_type /* o : HQ core type */
78 : )
79 : {
80 : int16_t delay;
81 : const float *signal_in;
82 : float *inp_12k8, *new_inp_12k8, *inp_16k, *new_inp_16k; /* pointers to current frame and new data */
83 : float old_wsp[L_WSP], *wsp; /* weighted input signal buffer */
84 : float pitch_fr[NB_SUBFR]; /* fractional pitch values */
85 : float voicing_fr[NB_SUBFR]; /* fractional pitch gains */
86 : float Etot; /* total energy */
87 : float lf_E[2 * VOIC_BINS]; /* per bin spectrum energy in lf */
88 : float tmpN[NB_BANDS]; /* Temporary noise update */
89 : float tmpE[NB_BANDS]; /* Temporary averaged energy of 2 sf. */
90 : float ee[2]; /* Spectral tilt */
91 : float corr_shift; /* correlation shift */
92 : float relE; /* frame relative energy */
93 : int16_t loc_harm; /* harmonicity flag */
94 : float cor_map_sum, sp_div, PS[128]; /* speech/music clasif. parameters */
95 : int16_t L_look; /* length of look-ahead */
96 : float snr_sum_he; /* HE SAD parameters */
97 : int16_t localVAD_HE_SAD; /* HE SAD parameters */
98 : int16_t vad_flag_dtx; /* HE-SAD flag with additional DTX HO */
99 : int16_t vad_flag_cldfb;
100 : float old_cor;
101 : float hp_E[2]; /* Energy in HF */
102 : int16_t noisy_speech_HO, clean_speech_HO, NB_speech_HO; /* SC-VBR HO flags */
103 : float non_staX; /* unbound non-stationarity for sp/mus clas. */
104 : int32_t sr_core_tmp;
105 : int16_t L_frame_tmp;
106 : int16_t flag_spitch;
107 : float lsf_new[M], stab_fac;
108 : float band_energies[2 * NB_BANDS]; /* energy in critical bands without minimum noise floor E_MIN */
109 : float enerBuffer[CLDFB_NO_CHANNELS_MAX];
110 : float currFlatness;
111 : int16_t high_lpn_flag;
112 : int16_t cldfb_addition;
113 : int16_t alw_pitch_lag_12k8[2];
114 : float alw_voicing[2];
115 : float fft_buff[2 * L_FFT];
116 : float sp_floor;
117 : int16_t last_core_orig;
118 : int16_t clas_mod;
119 : int16_t old_pitch1;
120 :
121 3100 : push_wmops( "pre_proc" );
122 :
123 : /*------------------------------------------------------------------*
124 : * Initialization
125 : *------------------------------------------------------------------*/
126 :
127 3100 : signal_in = st->input;
128 :
129 3100 : localVAD_HE_SAD = 0;
130 3100 : NB_speech_HO = 0;
131 3100 : clean_speech_HO = 0;
132 3100 : noisy_speech_HO = 0;
133 3100 : snr_sum_he = 0;
134 3100 : currFlatness = 0;
135 :
136 3100 : *vad_hover_flag = 0;
137 3100 : st->sp_aud_decision1 = 0;
138 3100 : st->sp_aud_decision2 = 0;
139 3100 : st->coder_type = GENERIC;
140 3100 : st->hGSCEnc->noise_lev = NOISE_LEVEL_SP0;
141 3100 : *attack_flag = 0;
142 :
143 3100 : if ( st->hSC_VBR != NULL )
144 : {
145 3100 : st->hSC_VBR->bump_up = 0;
146 3100 : st->hSC_VBR->ppp_mode = 0;
147 3100 : st->hSC_VBR->nelp_mode = 0;
148 3100 : st->hSC_VBR->avoid_HQ_VBR_NB = 0;
149 : }
150 :
151 3100 : L_look = L_LOOK_12k8; /* lookahead at 12.8kHz */
152 :
153 3100 : new_inp_12k8 = old_inp_12k8 + L_INP_MEM; /* pointer to new samples of the input signal in 12.8kHz core */
154 3100 : inp_12k8 = new_inp_12k8 - L_look; /* pointer to the current frame of input signal in 12.8kHz core */
155 3100 : mvr2r( st->old_inp_12k8, old_inp_12k8, L_INP_MEM );
156 :
157 3100 : mvr2r( st->old_wsp, old_wsp, L_WSP_MEM );
158 3100 : wsp = old_wsp + L_WSP_MEM; /* pointer to the current frame of weighted signal in 12.8kHz core */
159 :
160 3100 : old_cor = st->old_corr; /* save old_cor for speech/music classifier */
161 :
162 3100 : st->rf_mode = st->Opt_RF_ON;
163 :
164 3100 : last_core_orig = st->last_core;
165 :
166 : /*--------------------------------------------------------------*
167 : * Cldfb analysis
168 : *---------------------------------------------------------------*/
169 :
170 3100 : st->prevEnergyHF = st->currEnergyHF;
171 :
172 3100 : analysisCldfbEncoder( st, signal_in, input_frame, realBuffer, imagBuffer, enerBuffer );
173 :
174 : /*----------------------------------------------------------------*
175 : * Change the sampling frequency to 12.8 kHz
176 : *----------------------------------------------------------------*/
177 :
178 3100 : modify_Fs( signal_in, input_frame, st->input_Fs, new_inp_12k8, INT_FS_12k8, st->mem_decim, ( st->max_bwidth == NB ) );
179 :
180 : /* save input resampled at 12.8kHz, non pre-emphasised */
181 3100 : mvr2r( new_inp_12k8, st->buf_speech_enc + L_FRAME32k, L_FRAME );
182 :
183 : /*------------------------------------------------------------------*
184 : * Perform fixed preemphasis (12.8 kHz signal) through 1 - g*z^-1
185 : *-----------------------------------------------------------------*/
186 :
187 3100 : preemph( new_inp_12k8, PREEMPH_FAC, L_FRAME, &st->mem_preemph );
188 :
189 : /*-------------------------------------------------------------------------*
190 : * Spectral analysis
191 : *--------------------------------------------------------------------------*/
192 :
193 3100 : analy_sp( -1, NULL, st->input_Fs, inp_12k8, st->Bin_E, st->Bin_E_old, fr_bands, lf_E, &Etot, st->min_band, st->max_band, band_energies, PS, fft_buff );
194 :
195 : /*----------------------------------------------------------------*
196 : * SAD (1-signal, 0-noise)
197 : *----------------------------------------------------------------*/
198 :
199 3100 : noise_est_pre( Etot, st->ini_frame, st->hNoiseEst, 0, EVS_MONO, EVS_MONO );
200 :
201 3100 : st->vad_flag = wb_vad( st, fr_bands, &noisy_speech_HO, &clean_speech_HO, &NB_speech_HO, &snr_sum_he, &localVAD_HE_SAD, &( st->flag_noisy_speech_snr ), NULL, NULL, -1000.0f, -1000.0f );
202 :
203 3100 : vad_flag_cldfb = vad_proc( realBuffer, imagBuffer, enerBuffer, st->cldfbAnaEnc->no_channels, st->hVAD_CLDFB, &cldfb_addition, st->vad_flag );
204 :
205 3100 : if ( st->Pos_relE_cnt < 20 ) /* Ensure the level is high enough and cldfb decision is reliable */
206 : {
207 : /* Combine decisions from both SADs */
208 2365 : if ( st->vad_flag == 1 && vad_flag_cldfb == 0 )
209 : {
210 0 : st->localVAD = 0;
211 : }
212 :
213 2365 : st->vad_flag = vad_flag_cldfb;
214 : }
215 :
216 3100 : vad_flag_dtx = dtx_hangover_addition( st, st->vad_flag, st->lp_speech - st->lp_noise, cldfb_addition, vad_hover_flag, NULL, NULL, NULL );
217 :
218 : /*----------------------------------------------------------------*
219 : * NB/WB/SWB/FB bandwidth detector
220 : *----------------------------------------------------------------*/
221 :
222 3100 : bw_detect( st, st->input, NULL, enerBuffer, MONO_FORMAT, 0 );
223 :
224 : /*----------------------------------------------------------------*
225 : * Noise energy downward update and total noise energy estimation
226 : * Long-term energies and relative frame energy updates
227 : * Correlation correction as a function of total noise level
228 : *----------------------------------------------------------------*/
229 :
230 3100 : noise_est_down( fr_bands, st->hNoiseEst->bckr, tmpN, tmpE, st->min_band, st->max_band, &st->hNoiseEst->totalNoise, Etot, &st->hNoiseEst->Etot_last, &st->hNoiseEst->Etot_v_h2 );
231 :
232 3100 : relE = Etot - st->lp_speech;
233 :
234 3100 : if ( relE > 1.5f )
235 : {
236 964 : st->Pos_relE_cnt = 0;
237 : }
238 2136 : else if ( relE < 0.0f && st->vad_flag == 1 )
239 : {
240 1920 : st->Pos_relE_cnt++;
241 : }
242 3100 : corr_shift = correlation_shift( st->hNoiseEst->totalNoise );
243 :
244 : /*----------------------------------------------------------------*
245 : * FD-CNG Noise Estimator
246 : *----------------------------------------------------------------*/
247 :
248 3100 : resetFdCngEnc( st );
249 :
250 3100 : perform_noise_estimation_enc( band_energies, enerBuffer, st->hFdCngEnc, 1, NULL );
251 :
252 : /*-----------------------------------------------------------------*
253 : * Select SID or FRAME_NO_DATA frame if DTX enabled
254 : *-----------------------------------------------------------------*/
255 :
256 3100 : dtx( st, -1, -1, vad_flag_dtx, inp_12k8 );
257 :
258 : /*----------------------------------------------------------------*
259 : * Adjust FD-CNG Noise Estimator
260 : *----------------------------------------------------------------*/
261 :
262 3100 : if ( ( st->last_total_brate != st->total_brate ) || ( st->last_bwidth != st->bwidth ) )
263 : {
264 0 : configureFdCngEnc( st->hFdCngEnc, st->bwidth, st->rf_mode && st->total_brate == ACELP_13k20 ? ACELP_9k60 : st->total_brate );
265 : }
266 3100 : if ( st->hFdCngEnc != NULL && st->Opt_DTX_ON )
267 : {
268 0 : AdjustFirstSID( st );
269 : }
270 :
271 : /*----------------------------------------------------------------*
272 : * Reconfigure Mode 2
273 : *----------------------------------------------------------------*/
274 :
275 3100 : if ( st->codec_mode == MODE2 )
276 : {
277 1050 : SetModeIndex( st, st->last_total_brate, EVS_MONO, 0 );
278 : }
279 :
280 3100 : calcLoEnvCheckCorrHiLo( st->cldfbAnaEnc->no_col, freqTable, st->hTECEnc->loBuffer, st->hTECEnc->loTempEnv, st->hTECEnc->loTempEnv_ns, st->hTECEnc->hiTempEnv, &( st->hTECEnc->corrFlag ) );
281 :
282 : /*---------------------------------------------------------------*
283 : * Time Domain Transient Detector
284 : *---------------------------------------------------------------*/
285 :
286 3100 : if ( st->tcx10Enabled || st->tcx20Enabled )
287 : {
288 3100 : RunTransientDetection( signal_in, input_frame, st->hTranDet );
289 :
290 3100 : currFlatness = GetTCXAvgTemporalFlatnessMeasure( st->hTranDet, NSUBBLOCKS, 0 );
291 : }
292 :
293 : /*----------------------------------------------------------------*
294 : * LP analysis
295 : *----------------------------------------------------------------*/
296 :
297 3100 : alw_pitch_lag_12k8[0] = st->old_pitch_la;
298 3100 : alw_pitch_lag_12k8[1] = st->old_pitch_la;
299 3100 : alw_voicing[0] = st->voicing[2];
300 3100 : alw_voicing[1] = st->voicing[2];
301 :
302 3100 : analy_lp( inp_12k8, L_FRAME, L_look, ener, A, epsP, lsp_new, lsp_mid, st->lsp_old1, alw_pitch_lag_12k8, alw_voicing, INT_FS_12k8, -1 );
303 :
304 3100 : lsp2lsf( lsp_new, lsf_new, M, INT_FS_12k8 );
305 3100 : stab_fac = lsf_stab( lsf_new, st->lsf_old1, 0, L_FRAME );
306 3100 : mvr2r( lsf_new, st->lsf_old1, M );
307 :
308 : /*----------------------------------------------------------------*
309 : * Compute weighted input (for OL pitch analysis)
310 : * OL pitch analysis
311 : * stable high pitch detection
312 : * 1/4 pitch precision improvement
313 : *----------------------------------------------------------------*/
314 :
315 3100 : find_wsp( L_FRAME, L_SUBFR, NB_SUBFR, A, Aw, inp_12k8, TILT_FAC, wsp, &st->mem_wsp, GAMMA1, L_look );
316 :
317 3100 : if ( st->vad_flag == 0 )
318 : {
319 : /* reset the OL pitch tracker memories during inactive frames */
320 16 : pitch_ol_init( &st->old_thres, &st->old_pitch, &st->delta_pit, &st->old_corr );
321 : }
322 :
323 3100 : old_pitch1 = st->pitch[1];
324 :
325 3100 : pitch_ol( st->pitch, st->voicing, &st->old_pitch, &st->old_corr, corr_shift, &st->old_thres, &st->delta_pit, st->old_wsp2, wsp, st->mem_decim2, relE, L_look, st->clas, st->input_bwidth, st->Opt_SC_VBR );
326 :
327 : /* Updates for adaptive lag window memory */
328 3100 : st->old_pitch_la = st->pitch[2];
329 :
330 : /* Detection of very short stable pitch period (MODE1 bitrates) */
331 3100 : StableHighPitchDetect( &flag_spitch, st->pitch, st->voicing, st->Bin_E, wsp, st->localVAD, &st->voicing_sm, &st->voicing0_sm, &st->LF_EnergyRatio_sm, &st->predecision_flag, &st->diff_sm, &st->energy_sm );
332 :
333 : /* 1/4 pitch precision improvement */
334 3100 : if ( st->total_brate <= ACELP_24k40 )
335 : {
336 2100 : pitch_ol2( PIT_MIN_EXTEND, st->pitch[0], &pitch_fr[0], &voicing_fr[0], 0, wsp, 7 );
337 2100 : pitch_ol2( PIT_MIN_EXTEND, st->pitch[0], &pitch_fr[1], &voicing_fr[1], L_SUBFR, wsp, 7 );
338 2100 : pitch_ol2( PIT_MIN_EXTEND, st->pitch[1], &pitch_fr[2], &voicing_fr[2], 2 * L_SUBFR, wsp, 7 );
339 2100 : pitch_ol2( PIT_MIN_EXTEND, st->pitch[1], &pitch_fr[3], &voicing_fr[3], 3 * L_SUBFR, wsp, 7 );
340 : }
341 : else
342 : {
343 1000 : pitch_fr[0] = st->pitch[0];
344 1000 : pitch_fr[1] = st->pitch[0];
345 1000 : pitch_fr[2] = st->pitch[1];
346 1000 : pitch_fr[3] = st->pitch[1];
347 :
348 1000 : voicing_fr[0] = st->voicing[0];
349 1000 : voicing_fr[1] = st->voicing[0];
350 1000 : voicing_fr[2] = st->voicing[1];
351 1000 : voicing_fr[3] = st->voicing[1];
352 : }
353 :
354 : /*------------------------------------------------------------------*
355 : * Update estimated noise energy and voicing cut-off frequency
356 : *-----------------------------------------------------------------*/
357 :
358 3100 : noise_est( st, old_pitch1, tmpN, epsP, Etot, relE, corr_shift, tmpE, fr_bands, &cor_map_sum, NULL, &sp_div, &non_staX, &loc_harm, lf_E, &st->hNoiseEst->harm_cor_cnt, st->hNoiseEst->Etot_l_lp, &sp_floor, 0, NULL, NULL, st->ini_frame );
359 :
360 : /*------------------------------------------------------------------*
361 : * Update parameters used in the VAD and DTX
362 : *-----------------------------------------------------------------*/
363 :
364 3100 : vad_param_updt( st, corr_shift, corr_shift, A, old_pitch1, NULL, 1 );
365 :
366 : /*-----------------------------------------------------------------*
367 : * Find spectral tilt
368 : * UC and VC frame selection
369 : *-----------------------------------------------------------------*/
370 :
371 3100 : find_tilt( fr_bands, st->hNoiseEst->bckr, ee, st->pitch, st->voicing, lf_E, corr_shift, st->input_bwidth, st->max_band, hp_E, st->codec_mode, &( st->bckr_tilt_lt ), st->Opt_SC_VBR );
372 :
373 3100 : st->coder_type = find_uv( st, pitch_fr, voicing_fr, inp_12k8, ee, NULL, corr_shift, relE, Etot, hp_E, &flag_spitch, last_core_orig, NULL );
374 :
375 : /*-----------------------------------------------------------------*
376 : * channel aware mode configuration *
377 : *-----------------------------------------------------------------*/
378 :
379 3100 : if ( !st->Opt_RF_ON )
380 : {
381 3100 : st->rf_mode = 0;
382 3100 : st->rf_target_bits_write = 0;
383 : }
384 0 : else if ( st->rf_mode && st->core_brate != FRAME_NO_DATA && st->core_brate != SID_2k40 )
385 : {
386 : /* the RF config is for the (n - fec_offset)-th frame that will be packed along with the n-th frame bitstream */
387 0 : st->rf_mode = 1;
388 0 : st->codec_mode = MODE2;
389 :
390 0 : st->rf_target_bits_write = st->hRF->rf_targetbits_buff[st->rf_fec_offset];
391 : }
392 : else
393 : {
394 0 : st->rf_mode = 0;
395 0 : st->codec_mode = MODE1;
396 0 : if ( st->Opt_RF_ON )
397 : {
398 0 : st->hRF->rf_indx_frametype[0] = RF_NO_DATA;
399 0 : st->hRF->rf_targetbits_buff[0] = 6; /* rf_mode: 1, rf_frame_type: 3, and fec_offset: 2 */
400 : }
401 : }
402 :
403 : /*-----------------------------------------------------------------*
404 : * Signal classification for FEC
405 : * TC frame selection
406 : *-----------------------------------------------------------------*/
407 :
408 3100 : st->clas = signal_clas( st, inp_12k8, ee, relE, L_look, &clas_mod );
409 :
410 3100 : select_TC( st->codec_mode, st->tc_cnt, &st->coder_type, st->localVAD );
411 :
412 : /* limit coder_type depending on the bitrate */
413 3100 : coder_type_modif( st, relE );
414 :
415 3100 : if ( st->Opt_SC_VBR )
416 : {
417 0 : st->hSC_VBR->Local_VAD = st->localVAD;
418 : }
419 :
420 : /*----------------------------------------------------------------*
421 : * Speech/music classification
422 : * AC frame selection
423 : *----------------------------------------------------------------*/
424 :
425 3100 : st->GSC_IVAS_mode = 0;
426 :
427 3100 : speech_music_classif( st, new_inp_12k8, inp_12k8, localVAD_HE_SAD, lsp_new, cor_map_sum, epsP, PS, Etot, old_cor, attack_flag, non_staX, relE, &high_lpn_flag, flag_spitch );
428 :
429 3100 : long_enr( st, Etot, localVAD_HE_SAD, high_lpn_flag, NULL, 1, NULL, NULL );
430 :
431 : /*----------------------------------------------------------------*
432 : * Final VAD correction ( when HE-SAD is used instead of the normal VAD,
433 : * rewrite the VAD flag by VAD flag with DTX hangover for further processing)
434 : *----------------------------------------------------------------*/
435 :
436 3100 : if ( st->Opt_DTX_ON )
437 : {
438 0 : st->vad_flag = vad_flag_dtx;
439 : }
440 :
441 : /*----------------------------------------------------------------*
442 : * Selection of internal ACELP Fs (12.8 kHz or 16 kHz)
443 : *----------------------------------------------------------------*/
444 :
445 3100 : if ( st->codec_mode == MODE1 )
446 : {
447 2050 : if ( st->core_brate == FRAME_NO_DATA )
448 : {
449 : /* prevent "L_frame" changes in CNG segments */
450 0 : st->L_frame = st->last_L_frame;
451 : }
452 2050 : else if ( st->core_brate == SID_2k40 && st->bwidth >= WB && st->hDtxEnc->first_CNG && ( st->hTdCngEnc != NULL && st->hTdCngEnc->act_cnt2 < MIN_ACT_CNG_UPD ) )
453 : {
454 : /* prevent "L_frame" changes in SID frame after short segment of active frames */
455 0 : st->L_frame = st->hDtxEnc->last_CNG_L_frame;
456 : }
457 2050 : else if ( ( st->core_brate == SID_2k40 && st->total_brate >= ACELP_9k60 && ( ( st->bwidth == WB && !( st->total_brate == ACELP_13k20 && st->cng_type == FD_CNG ) ) || ( st->cng_type == LP_CNG && st->bwidth > WB && st->total_brate >= ACELP_16k40 ) ) ) || ( st->total_brate > ACELP_24k40 && st->total_brate < HQ_96k ) || ( st->total_brate == ACELP_24k40 && st->bwidth >= WB ) )
458 : {
459 1000 : st->L_frame = L_FRAME16k;
460 : }
461 : else
462 : {
463 1050 : st->L_frame = L_FRAME;
464 : }
465 :
466 2050 : if ( st->ini_frame == 0 )
467 : {
468 : /* avoid switching of internal ACELP Fs in the very first frame */
469 2 : st->last_L_frame = st->L_frame;
470 : }
471 :
472 2050 : if ( st->L_frame == L_FRAME )
473 : {
474 1050 : st->gamma = GAMMA1;
475 1050 : st->preemph_fac = PREEMPH_FAC;
476 : }
477 : else
478 : {
479 1000 : st->gamma = GAMMA16k;
480 1000 : st->preemph_fac = PREEMPH_FAC_16k;
481 : }
482 :
483 2050 : st->sr_core = st->L_frame * FRAMES_PER_SEC;
484 2050 : st->encoderLookahead_enc = NS2SA( st->sr_core, ACELP_LOOK_NS );
485 2050 : st->encoderPastSamples_enc = ( st->L_frame * 9 ) >> 4;
486 : }
487 :
488 : /*-----------------------------------------------------------------*
489 : * coder_type rewriting in case of switching
490 : * IC frames selection
491 : * enforce TC frames in case of switching
492 : *-----------------------------------------------------------------*/
493 :
494 3100 : if ( st->codec_mode == MODE1 )
495 : {
496 : /* enforce TRANSITION frames */
497 2050 : if ( st->last_L_frame != st->L_frame && st->core_brate != FRAME_NO_DATA && st->core_brate != SID_2k40 && ( st->coder_type_raw != VOICED ) )
498 : {
499 : /* enforce TC frame in case of ACELP@12k8 <-> ACELP@16k core switching */
500 0 : st->coder_type = TRANSITION;
501 : }
502 2050 : else if ( st->last_core == HQ_CORE || st->last_core == TCX_10_CORE || st->last_core == TCX_20_CORE )
503 : {
504 : /* enforce TC frame in case of HQ -> ACELP core switching */
505 639 : st->coder_type = TRANSITION;
506 : }
507 1411 : else if ( st->last_core_brate <= SID_2k40 && st->cng_type == FD_CNG )
508 : {
509 : /* enforce TC frame in case of FD_CNG -> ACELP switching (past excitation not available) */
510 0 : st->coder_type = TRANSITION;
511 : }
512 : /* select INACTIVE frames */
513 1411 : else if ( st->total_brate <= ACELP_24k40 && st->vad_flag == 0 )
514 : {
515 : /* inactive frames will be coded by GSC technology */
516 : /* except for the VBR mode. VBR mode uses NELP for that */
517 6 : if ( !( st->Opt_SC_VBR && vad_flag_dtx ) )
518 : {
519 6 : st->coder_type = INACTIVE;
520 6 : st->hGSCEnc->noise_lev = NOISE_LEVEL_SP3;
521 : }
522 : }
523 1405 : else if ( st->total_brate > ACELP_24k40 && ( ( st->vad_flag == 0 && st->bwidth >= SWB && st->max_bwidth >= SWB ) || ( st->localVAD == 0 && ( st->bwidth <= WB || st->max_bwidth <= WB ) ) ) )
524 : {
525 : /* inactive frames will be coded by AVQ technology */
526 0 : st->coder_type = INACTIVE;
527 : }
528 : }
529 : else /* st->codec_mode == MODE2 */
530 : {
531 1050 : if ( !st->vad_flag )
532 : {
533 8 : st->coder_type = INACTIVE;
534 : }
535 1042 : else if ( st->coder_type > GENERIC )
536 : {
537 0 : st->coder_type = GENERIC;
538 : }
539 : }
540 :
541 : /*---------------------------------------------------------------*
542 : * SC-VBR - decision about PPP/NELP mode
543 : *---------------------------------------------------------------*/
544 :
545 3100 : if ( st->Opt_SC_VBR )
546 : {
547 0 : set_ppp_mode( st, noisy_speech_HO, clean_speech_HO, NB_speech_HO, localVAD_HE_SAD );
548 : }
549 :
550 3100 : if ( !st->Opt_AMR_WB && !st->rf_mode )
551 : {
552 3100 : if ( st->total_brate == ACELP_13k20 || st->total_brate == ACELP_32k )
553 : {
554 1050 : st->mdct_sw_enable = MODE1;
555 : }
556 2050 : else if ( ACELP_16k40 <= st->total_brate && st->total_brate <= ACELP_24k40 )
557 : {
558 1050 : st->mdct_sw_enable = MODE2;
559 : }
560 : }
561 :
562 : /*---------------------------------------------------------------------*
563 : * Decision matrix (selection of technologies)
564 : *---------------------------------------------------------------------*/
565 :
566 3100 : if ( st->codec_mode == MODE1 ) /* mono case */
567 : {
568 2050 : decision_matrix_enc( st, hq_core_type );
569 :
570 :
571 : /* HQ_CORE/TCX_20_CORE decision */
572 2050 : if ( st->core == HQ_CORE ) /* Decision matrix decided for MDCT coding */
573 : {
574 640 : if ( ( st->bwidth == SWB || st->bwidth == FB ) && st->total_brate == ACELP_32k )
575 : {
576 : /* Select MDCT Core */
577 0 : st->core = mdct_classifier( st, fft_buff, enerBuffer, st->total_brate );
578 : }
579 640 : if ( st->total_brate == ACELP_13k20 && st->bwidth != FB )
580 : {
581 309 : MDCT_selector( st, sp_floor, Etot, cor_map_sum, enerBuffer );
582 : }
583 : }
584 : else
585 : {
586 1410 : MDCT_selector_reset( st->hTcxEnc );
587 : }
588 :
589 : /* Switch to MODE2 if TCX_20_CORE */
590 2050 : if ( st->core == TCX_20_CORE )
591 : {
592 275 : st->codec_mode = MODE2;
593 :
594 275 : if ( st->last_codec_mode == MODE1 )
595 : {
596 24 : int32_t last_total_brate = st->last_total_brate;
597 24 : st->last_total_brate = -1;
598 24 : SetModeIndex( st, st->last_total_brate, EVS_MONO, 0 );
599 24 : st->last_total_brate = last_total_brate;
600 : }
601 : else
602 : {
603 251 : SetModeIndex( st, st->last_total_brate, EVS_MONO, 0 );
604 251 : st->sr_core = getCoreSamplerateMode2( EVS_MONO, st->total_brate, st->bwidth, st->flag_ACELP16k, st->rf_mode, 0 );
605 251 : st->L_frame = (int16_t) ( st->sr_core / FRAMES_PER_SEC );
606 251 : st->encoderLookahead_enc = NS2SA( st->sr_core, ACELP_LOOK_NS );
607 251 : st->encoderPastSamples_enc = ( st->L_frame * 9 ) >> 4;
608 :
609 251 : if ( st->sr_core == INT_FS_12k8 )
610 : {
611 251 : st->preemph_fac = PREEMPH_FAC;
612 251 : st->gamma = GAMMA1;
613 : }
614 : else
615 : {
616 0 : st->preemph_fac = PREEMPH_FAC_16k;
617 0 : st->gamma = GAMMA16k;
618 : }
619 :
620 251 : st->igf = getIgfPresent( EVS_MONO, st->total_brate, st->bwidth, st->rf_mode );
621 : }
622 :
623 275 : st->coder_type = st->coder_type_raw;
624 :
625 275 : if ( st->vad_flag == 0 )
626 : {
627 1 : st->coder_type = INACTIVE;
628 : }
629 274 : else if ( st->coder_type > GENERIC )
630 : {
631 0 : st->coder_type = GENERIC;
632 : }
633 :
634 275 : st->mdct_sw = MODE1;
635 : }
636 : }
637 :
638 : /*-----------------------------------------------------------------*
639 : * Update of ACELP harmonicity counter (used in ACELP transform codebook @32kbps)
640 : *-----------------------------------------------------------------*/
641 :
642 3100 : if ( st->total_brate == ACELP_32k && loc_harm == 1 && cor_map_sum > 50 && st->clas == VOICED_CLAS && st->coder_type == GENERIC )
643 : {
644 0 : st->last_harm_flag_acelp++;
645 :
646 0 : if ( st->last_harm_flag_acelp > 10 )
647 : {
648 0 : st->last_harm_flag_acelp = 10;
649 : }
650 : }
651 : else
652 : {
653 3100 : st->last_harm_flag_acelp = 0;
654 : }
655 :
656 : /*-----------------------------------------------------------------*
657 : * Update audio frames counter (used for UV decision)
658 : *-----------------------------------------------------------------*/
659 :
660 3100 : if ( st->coder_type == AUDIO )
661 : {
662 17 : st->audio_frame_cnt += AUDIO_COUNTER_STEP;
663 : }
664 3083 : else if ( st->coder_type != INACTIVE )
665 : {
666 3068 : st->audio_frame_cnt--;
667 : }
668 :
669 3100 : if ( st->audio_frame_cnt > AUDIO_COUNTER_MAX )
670 : {
671 0 : st->audio_frame_cnt = AUDIO_COUNTER_MAX;
672 : }
673 :
674 3100 : if ( st->audio_frame_cnt < 0 )
675 : {
676 2298 : st->audio_frame_cnt = 0;
677 : }
678 :
679 : /*-----------------------------------------------------------------*
680 : * Set formant sharpening flag
681 : *-----------------------------------------------------------------*/
682 :
683 3100 : st->sharpFlag = 0;
684 :
685 3100 : if ( st->coder_type == TRANSITION )
686 : {
687 472 : if ( ( st->total_brate > ACELP_48k && st->bwidth < SWB ) || /* Deactivate for core bitrates higher than 48.0 kb/s */
688 472 : ( st->total_brate >= ACELP_13k20 && st->total_brate <= ACELP_16k40 ) || /* Deactivate for bitrates <13.2, 16.4> kb/s (this is basically due to lack of signaling configurations */
689 366 : ( st->total_brate > ACELP_16k40 && st->lp_noise > FORMANT_SHARPENING_NOISE_THRESHOLD ) ) /* Deactivate for bitrates >= 24.4 kb/s if the long-term noise level exceeds 34 dB */
690 : {
691 106 : st->sharpFlag = 0;
692 : }
693 : else
694 : {
695 366 : st->sharpFlag = 1;
696 : }
697 : }
698 :
699 3100 : if ( st->coder_type == GENERIC || st->coder_type == VOICED )
700 : {
701 2543 : if ( *vad_hover_flag ||
702 2482 : ( st->total_brate > ACELP_48k && st->bwidth < SWB ) || /* Deactivate for core bitrates higher than 48.0 kb/s */
703 2482 : ( st->total_brate >= ACELP_13k20 && st->lp_noise > FORMANT_SHARPENING_NOISE_THRESHOLD && st->total_brate > CNA_MAX_BRATE ) ) /* Deactivate for bitrates >= 13.2 kb/s if the long-term noise level exceeds 34 dB */
704 : {
705 61 : st->sharpFlag = 0;
706 : }
707 : else
708 : {
709 2482 : st->sharpFlag = 1;
710 : }
711 : }
712 :
713 : /* channel-aware mode - due to lack of signaling bit, sharpFlag is 1 always in RF mode */
714 3100 : if ( st->rf_mode && ( st->coder_type == VOICED || st->coder_type == GENERIC ) )
715 : {
716 0 : st->sharpFlag = 1;
717 : }
718 :
719 : /*-----------------------------------------------------------------*
720 : * Set voicing flag for HQ FEC
721 : *-----------------------------------------------------------------*/
722 :
723 3100 : if ( st->sp_aud_decision1 == 0 && ( st->coder_type == VOICED || st->coder_type == GENERIC ) )
724 : {
725 2006 : *Voicing_flag = 1;
726 : }
727 : else
728 : {
729 1094 : *Voicing_flag = 0;
730 : }
731 :
732 : /*---------------------------------------------------------------*
733 : * Preprocessing at other sampling frequency rate (16/25.6/32kHz)
734 : *----------------------------------------------------------------*/
735 :
736 3100 : sr_core_tmp = ( st->codec_mode == MODE1 ) ? INT_FS_16k : max( INT_FS_16k, st->sr_core ); /* indicates the ACELP sampling rate */
737 3100 : L_frame_tmp = ( st->codec_mode == MODE1 ) ? L_FRAME16k : max( L_FRAME16k, st->L_frame );
738 :
739 3100 : L_look = NS2SA( sr_core_tmp, ACELP_LOOK_NS ); /* lookahead at other sampling rate (16kHz, 25.6kHz, 32kHz) */
740 :
741 3100 : new_inp_16k = old_inp_16k + L_INP_MEM; /* pointer to new samples of the input signal in 16kHz core */
742 3100 : inp_16k = new_inp_16k - L_look; /* pointer to the current frame of input signal in 16kHz core */
743 :
744 3100 : mvr2r( st->old_inp_16k, old_inp_16k, L_INP_MEM );
745 :
746 : /*---------------------------------------------------------------*
747 : * Change the sampling frequency to 16/25.6/32 kHz
748 : *----------------------------------------------------------------*/
749 :
750 3100 : if ( st->input_Fs == sr_core_tmp )
751 : {
752 : /* no resampling needed, only delay adjustment to account for the FIR resampling delay */
753 0 : delay = NS2SA( st->input_Fs, DELAY_FIR_RESAMPL_NS );
754 0 : mvr2r( st->mem_decim16k + delay, new_inp_16k, delay );
755 0 : mvr2r( signal_in, new_inp_16k + delay, input_frame - delay );
756 0 : mvr2r( signal_in + input_frame - 2 * delay, st->mem_decim16k, 2 * delay );
757 : }
758 3100 : else if ( st->input_Fs == 32000 || st->input_Fs == 48000 )
759 : {
760 3100 : modify_Fs( signal_in, input_frame, st->input_Fs, new_inp_16k, sr_core_tmp, st->mem_decim16k, 0 );
761 : }
762 : else /* keep memories up-to-date in case of bitrate switching */
763 : {
764 : /* no resampling needed, only delay adjustment to account for the FIR resampling delay */
765 0 : delay = NS2SA( st->input_Fs, DELAY_FIR_RESAMPL_NS );
766 0 : mvr2r( st->mem_decim16k + delay, new_inp_16k, delay );
767 0 : mvr2r( signal_in, new_inp_16k + delay, input_frame - delay );
768 0 : mvr2r( signal_in + input_frame - 2 * delay, st->mem_decim16k, 2 * delay );
769 : }
770 :
771 3100 : if ( sr_core_tmp == INT_FS_16k )
772 : {
773 : /* save input resampled at 16kHz, non-preemphasised */
774 3100 : mvr2r( new_inp_16k, new_inp_resamp16k, L_FRAME16k );
775 : }
776 0 : else if ( sr_core_tmp > INT_FS_16k )
777 : {
778 : /* reset the buffer, the signal is needed for WB BWEs */
779 0 : set_f( new_inp_resamp16k, 0.0f, L_FRAME16k );
780 : }
781 :
782 : /*------------------------------------------------------------------*
783 : * Perform fixed preemphasis (16kHz signal) through 1 - g*z^-1
784 : *-----------------------------------------------------------------*/
785 :
786 3100 : if ( ( st->tcxonly == 0 || st->codec_mode == MODE1 ) && st->input_Fs > 8000 )
787 : {
788 3100 : st->mem_preemph_enc = new_inp_16k[L_frame_tmp - 1];
789 : }
790 :
791 3100 : if ( st->input_Fs > 8000 && sr_core_tmp == INT_FS_16k )
792 : {
793 3100 : preemph( new_inp_16k, PREEMPH_FAC_16k, L_FRAME16k, &( st->mem_preemph16k ) );
794 : }
795 0 : else if ( st->input_Fs > 8000 ) /* keep memory up-to-date in case of bitrate switching */
796 : {
797 0 : st->mem_preemph16k = new_inp_16k[L_frame_tmp - 1];
798 : }
799 :
800 : /*-----------------------------------------------------------------*
801 : * Redo LP analysis at 16kHz if ACELP@16k core was selected
802 : * update buffers
803 : *-----------------------------------------------------------------*/
804 :
805 3100 : if ( ( ( ( st->tcxonly == 0 ) || !( st->core_brate != FRAME_NO_DATA && st->core_brate != SID_2k40 ) ) &&
806 2100 : st->L_frame == L_FRAME16k && st->codec_mode == MODE2 ) ||
807 2050 : ( st->L_frame == L_FRAME16k && st->codec_mode == MODE1 ) )
808 : {
809 : /* update signal buffers */
810 2050 : mvr2r( new_inp_resamp16k, st->buf_speech_enc + L_FRAME16k, L_FRAME16k );
811 2050 : mvr2r( new_inp_16k, st->buf_speech_enc_pe + L_FRAME16k, L_FRAME16k );
812 :
813 : /*--------------------------------------------------------------*
814 : * LPC analysis
815 : *---------------------------------------------------------------*/
816 :
817 2050 : if ( st->last_L_frame == L_FRAME && st->codec_mode == MODE1 )
818 : {
819 : /* this is just an approximation, but it is sufficient */
820 0 : mvr2r( st->lsp_old1, st->lspold_enc, M );
821 : }
822 :
823 2050 : analy_lp( inp_16k, L_FRAME16k, L_look, ener, A, epsP, lsp_new, lsp_mid, st->lspold_enc, st->pitch, st->voicing, 16000, -1 );
824 :
825 : /*--------------------------------------------------------------*
826 : * Compute Weighted Input
827 : *---------------------------------------------------------------*/
828 :
829 2050 : if ( st->codec_mode == MODE2 )
830 : {
831 1050 : find_wsp( L_FRAME16k, L_SUBFR, st->nb_subfr, A, Aw, st->speech_enc_pe, PREEMPH_FAC_16k, st->wspeech_enc, &st->mem_wsp_enc, st->gamma, L_LOOK_16k );
832 : }
833 : else
834 : {
835 1000 : weight_a_subfr( NB_SUBFR16k, A, Aw, GAMMA16k, M );
836 : }
837 : }
838 : else
839 : {
840 : /* update signal buffers */
841 1050 : mvr2r( new_inp_12k8, st->buf_speech_enc_pe + st->L_frame, L_FRAME );
842 1050 : mvr2r( st->buf_speech_enc + L_FRAME32k, st->buf_speech_enc + st->L_frame, L_FRAME );
843 :
844 1050 : if ( st->tcxonly == 0 )
845 : {
846 1050 : mvr2r( wsp, st->wspeech_enc, L_FRAME + L_LOOK_12k8 );
847 : }
848 : }
849 :
850 : /*-----------------------------------------------------------------*
851 : * ACELP/TCX20 Switching Decision
852 : *-----------------------------------------------------------------*/
853 :
854 : #ifndef FIX_I4_OL_PITCH
855 3100 : mvs2s( st->pitch, pitch_orig, 3 );
856 : #endif
857 :
858 3100 : if ( st->codec_mode == MODE2 )
859 : {
860 1325 : if ( st->core_brate != FRAME_NO_DATA && st->core_brate != SID_2k40 && st->tcxonly == 0 )
861 : {
862 1325 : core_acelp_tcx20_switching( st, non_staX, pitch_fr, voicing_fr, currFlatness, lsp_mid, stab_fac );
863 : }
864 :
865 1325 : if ( st->mdct_sw_enable == MODE2 && !st->rf_mode )
866 : {
867 1050 : if ( st->core == TCX_20_CORE ) /* Switching only possible from TCX_20_CORE frames, not from TCX_10_CORE frames */
868 : {
869 : /* Select MDCT Core */
870 446 : if ( ( st->bwidth == SWB || st->bwidth == FB ) && st->total_brate == ACELP_24k40 )
871 : {
872 446 : st->core = mdct_classifier( st, fft_buff, enerBuffer, st->total_brate );
873 : }
874 :
875 446 : if ( st->total_brate == ACELP_16k40 && st->bwidth != FB )
876 : {
877 0 : MDCT_selector( st, sp_floor, Etot, cor_map_sum, enerBuffer );
878 : }
879 : }
880 : else
881 : {
882 604 : MDCT_selector_reset( st->hTcxEnc );
883 : }
884 :
885 : /* Do the switching that was decided in the MDCT selector */
886 1050 : if ( st->core == HQ_CORE )
887 : {
888 75 : st->codec_mode = MODE1;
889 75 : st->mdct_sw = MODE2;
890 : }
891 975 : else if ( st->last_codec_mode == MODE1 && st->last_core == HQ_CORE )
892 : {
893 0 : int16_t L_frame_old = st->last_L_frame;
894 0 : st->last_L_frame = st->L_frame;
895 0 : SetModeIndex( st, st->last_total_brate, EVS_MONO, 0 );
896 0 : st->last_L_frame = L_frame_old;
897 : }
898 : }
899 :
900 : /*--------------------------------------------------------------*
901 : * TCX mode decision
902 : *---------------------------------------------------------------*/
903 :
904 1325 : SetTCXModeInfo( st, st->hTranDet, &st->hTcxCfg->tcx_curr_overlap_mode );
905 : }
906 :
907 : /*-----------------------------------------------------------------*
908 : * Updates
909 : *-----------------------------------------------------------------*/
910 :
911 : /* update old weighted speech buffer - for OL pitch analysis */
912 3100 : mvr2r( &old_wsp[L_FRAME], st->old_wsp, L_WSP_MEM );
913 :
914 : /* update old input signal buffer */
915 3100 : mvr2r( &old_inp_12k8[L_FRAME], st->old_inp_12k8, L_INP_MEM );
916 :
917 : /* update old input signal @16kHz buffer */
918 3100 : if ( st->input_Fs > 8000 && sr_core_tmp == INT_FS_16k )
919 : {
920 3100 : mvr2r( &old_inp_16k[L_frame_tmp], st->old_inp_16k, L_INP_MEM );
921 : }
922 0 : else if ( st->input_Fs > 8000 )
923 : {
924 0 : lerp( st->old_inp_12k8 + L_INP_MEM - L_INP_MEM * 4 / 5, st->old_inp_16k, L_INP_MEM, L_INP_MEM * 4 / 5 );
925 : }
926 :
927 3100 : if ( sr_core_tmp == INT_FS_16k && st->tcxonly && st->codec_mode == MODE2 )
928 : {
929 : /* copy input resampled at 16kHz, non-preemphasised */
930 0 : mvr2r( new_inp_resamp16k, new_inp_16k, L_FRAME16k );
931 : }
932 :
933 : /* update of old per-band energy spectrum */
934 3100 : mvr2r( fr_bands + NB_BANDS, st->hNoiseEst->enrO, NB_BANDS );
935 :
936 : /* set the pointer of the current frame for the ACELP core */
937 3100 : if ( st->L_frame == L_FRAME )
938 : {
939 1050 : *inp = inp_12k8;
940 : }
941 : else
942 : {
943 2050 : *inp = inp_16k;
944 : }
945 3100 : if ( ( st->core != HQ_CORE && st->tcxonly == 0 ) || st->core == ACELP_CORE )
946 : {
947 : /* Update VAD hangover frame counter in active frames */
948 2660 : if ( !( st->core_brate == SID_2k40 || st->core_brate == FRAME_NO_DATA ) )
949 : {
950 2660 : if ( st->Opt_DTX_ON && *vad_hover_flag )
951 : {
952 0 : st->hTdCngEnc->burst_ho_cnt++;
953 0 : if ( st->hTdCngEnc->burst_ho_cnt > HO_HIST_SIZE )
954 : {
955 0 : st->hTdCngEnc->burst_ho_cnt = HO_HIST_SIZE;
956 : }
957 : }
958 2660 : else if ( st->hTdCngEnc != NULL && vad_flag_dtx )
959 : {
960 2660 : st->hTdCngEnc->burst_ho_cnt = 0;
961 : }
962 : }
963 : }
964 :
965 3100 : pop_wmops();
966 3100 : return;
967 : }
|