Line data Source code
1 : /******************************************************************************************************
2 :
3 : (C) 2022-2025 IVAS codec Public Collaboration with portions copyright Dolby International AB, Ericsson AB,
4 : Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD.,
5 : Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange,
6 : Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other
7 : contributors to this repository. All Rights Reserved.
8 :
9 : This software is protected by copyright law and by international treaties.
10 : The IVAS codec Public Collaboration consisting of Dolby International AB, Ericsson AB,
11 : Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD.,
12 : Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange,
13 : Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other
14 : contributors to this repository retain full ownership rights in their respective contributions in
15 : the software. This notice grants no license of any kind, including but not limited to patent
16 : license, nor is any license granted by implication, estoppel or otherwise.
17 :
18 : Contributors are required to enter into the IVAS codec Public Collaboration agreement before making
19 : contributions.
20 :
21 : This software is provided "AS IS", without any express or implied warranties. The software is in the
22 : development stage. It is intended exclusively for experts who have experience with such software and
23 : solely for the purpose of inspection. All implied warranties of non-infringement, merchantability
24 : and fitness for a particular purpose are hereby disclaimed and excluded.
25 :
26 : Any dispute, controversy or claim arising under or in relation to providing this software shall be
27 : submitted to and settled by the final, binding jurisdiction of the courts of Munich, Germany in
28 : accordance with the laws of the Federal Republic of Germany excluding its conflict of law rules and
29 : the United Nations Convention on Contracts on the International Sales of Goods.
30 :
31 : *******************************************************************************************************/
32 :
33 : /*====================================================================================
34 : EVS Codec 3GPP TS26.443 Nov 04, 2021. Version 12.14.0 / 13.10.0 / 14.6.0 / 15.4.0 / 16.3.0
35 : ====================================================================================*/
36 :
37 : #include <assert.h>
38 : #include <stdint.h>
39 : #include "options.h"
40 : #ifdef DEBUGGING
41 : #include "debug.h"
42 : #endif
43 : #include <math.h>
44 : #include "cnst.h"
45 : #include "prot.h"
46 : #include "ivas_prot.h"
47 : #include "rom_enc.h"
48 : #include "rom_com.h" /* Common static table prototypes */
49 : #include "wmc_auto.h"
50 :
51 :
52 : /*---------------------------------------------------------------------*
53 : * Local constants
54 : *---------------------------------------------------------------------*/
55 :
56 : #define ATT_SEG_LEN ( L_FRAME / ATT_NSEG ) /* length of one of the ATT_NSEG segments of an L_FRAME-long frame */
57 : #define ATT_3LSUB_POS ( 3 * ATT_NSEG / NB_SUBFR ) /* attack positions >= this value are treated as "located in the last subframe" by sp_mus_classif_2nd() */
58 : #define ATT_3LSUB_POS_16k ( int16_t )( ( 4.0f * ATT_NSEG / (float) NB_SUBFR16k ) + 0.5f ) /* rounded to the nearest integer; presumably the 16 kHz core counterpart of ATT_3LSUB_POS - TODO confirm */
59 :
60 : #define THR_CORR_PEAK 0.95f
61 : #define TON_FACT 0.95f
62 : #define TON_ALPHA 0.95f
63 :
64 : #define DLP_BIAS 0.138121f
65 :
66 : #define THR_MASS_MAX 0.85f
67 : #define THR_MASS_MIN 0.75f
68 : #define THR_MASS_STEP_UP 0.01f
69 : #define THR_MASS_STEP_DN 0.02f
70 :
71 :
72 : /*---------------------------------------------------------------------*
73 : * Local function prototypes
74 : *---------------------------------------------------------------------*/
75 :
76 : static void spec_analysis( float *Bin_E, float *p2v_map );
77 :
78 : static void flux( float *Bin_E, float *p2v_map, float *old_Bin_E, float *buf_flux, int16_t attack_hangover, float dec_mov );
79 :
80 : static void tonal_dist( float *p2v_map, float *buf_pkh, float *buf_Ntonal, float *buf_Ntonal2, float *buf_Ntonal_lf );
81 :
82 : static int16_t mode_decision( Encoder_State *st, int16_t len, float *dec_mov, float *buf_flux, float *buf_epsP_tilt, float *buf_pkh, float *buf_cor_map_sum, float *buf_Ntonal, float *buf_Ntonal2, float *buf_Ntonal_lf, float *buf_dlp );
83 :
84 : static void var_cor_calc( const float old_corr, float *mold_corr, float var_cor_t[], int16_t *high_stable_cor ); /* called from sp_mus_classif_2nd() to update the high_stable_cor flag */
85 :
86 : static int16_t attack_det( const float *inp, const int16_t last_clas, const int16_t localVAD, const int16_t coder_type, const int32_t total_brate, const int16_t element_mode, const int16_t clas, float finc_prev[], float *lt_finc, int16_t *last_strong_attack ); /* r: attack position, compared against ATT_3LSUB_POS and ATT_SEG_LEN / 2 in sp_mus_classif_2nd() */
87 :
88 : static float tonal_det( const float S[], int16_t vad_flag, float tod_S_map_lt[], float *tod_thr_lt, float *tod_weight, float *tod_S_mass_prev, float *tod_S_mass_lt );
89 :
90 : static void tonal_context_improv( Encoder_State *st, const float PS[], const float voi_fv, const float cor_map_sum_fv, const float LPCErr ); /* called from speech_music_classif() only when !Opt_SC_VBR and total_brate != ACELP_24k40 */
91 :
92 : static void order_spectrum( float *vec, const int16_t len );
93 :
94 : static void detect_sparseness( Encoder_State *st, const int16_t localVAD_HE_SAD, const float voi_fv ); /* called from speech_music_classif() at 13.2 kbps (WB/SWB) with active VAD */
95 :
96 : static int16_t sp_mus_classif_1st( Encoder_State *st, const int16_t localVAD_HE_SAD, const float lsp_new[M], const float cor_map_sum, const float epsP[M + 1], const float PS[], float non_sta, float relE, float *voi_fv, float *cor_map_sum_fv, float *LPCErr, int16_t *high_lpn_flag ); /* r: decision flag (1-music, 0-speech or noise) */
97 :
98 : static void sp_mus_classif_2nd( Encoder_State *st, const float Etot, int16_t *attack_flag, const float *inp ); /* resets *attack_flag to 0 on entry; may rewrite st->sp_aud_decision2 and st->coder_type */
99 :
100 : static void music_mixed_classif_improv( Encoder_State *st, const float *new_inp, const float *epsP, const float etot, const float old_cor, const float cor_map_sum ); /* called from speech_music_classif() only when !Opt_SC_VBR and total_brate != ACELP_24k40 */
101 :
102 :
103 : /*---------------------------------------------------------------------*
104 : * speech_music_clas_init()
105 : *
106 : * Initialization of speech/music classifier: writes the start-up values of the state variables, counters, thresholds and history buffers of the handle listed below
107 : *---------------------------------------------------------------------*/
108 :
109 129205 : void speech_music_clas_init(
110 : SP_MUS_CLAS_HANDLE hSpMusClas /* i/o: speech/music classifier handle */
111 : )
112 : {
113 : int16_t i;
114 :
115 129205 : set_f( hSpMusClas->FV_st, 0.0f, N_SMC_FEATURES );
116 :
117 129205 : hSpMusClas->inact_cnt = 0;
118 129205 : set_s( hSpMusClas->past_dec, 0, HANG_LEN - 1 ); /* history of binary decisions; sp_mus_classif_1st() shifts HANG_LEN - 2 entries and rewrites entry 0 every frame */
119 129205 : set_f( hSpMusClas->past_dlp, 0, HANG_LEN - 1 ); /* history of non-binary decisions (lpm - lps), updated the same way */
120 129205 : set_f( hSpMusClas->past_dlp_mean_ST, 0, HANG_LEN - 1 );
121 129205 : hSpMusClas->dlp_mean_ST = 0.0f;
122 129205 : hSpMusClas->dlp_mean_LT = 0.0f;
123 129205 : hSpMusClas->dlp_var_LT = 0.0f;
124 :
125 2067280 : for ( i = 0; i < N_SMC_FEATURES; i++ )
126 : {
127 1938075 : hSpMusClas->prev_FV[i] = 0.5f * hout_intervals[2 * i] + 0.5f * hout_intervals[2 * i + 1]; /* midpoint of the i-th pair of hout_intervals[] */
128 : }
129 :
130 2067280 : for ( i = 0; i < NB_BANDS_SPMUS; i++ )
131 : {
132 1938075 : hSpMusClas->past_log_enr[i] = logf( E_MIN );
133 : }
134 :
135 129205 : hSpMusClas->sp_mus_state = -8; /* NOTE(review): literal -8; the state machine in sp_mus_classif_1st() tests against -HANG_LEN - presumably HANG_LEN == 8, confirm */
136 129205 : hSpMusClas->wdrop = 0.0f;
137 129205 : hSpMusClas->wrise = 0.0f;
138 129205 : hSpMusClas->wdlp_0_95_sp = 0.0f;
139 129205 : hSpMusClas->wdlp_xtalk = 0.0f;
140 129205 : set_f( hSpMusClas->last_lsp, 0.0f, M_LSP_SPMUS );
141 129205 : hSpMusClas->last_cor_map_sum = 0.0f;
142 129205 : hSpMusClas->last_non_sta = 0.0f;
143 129205 : set_f( hSpMusClas->past_PS, 0.0f, HIGHEST_FBIN - LOWEST_FBIN );
144 129205 : hSpMusClas->past_ps_diff = 0;
145 129205 : hSpMusClas->past_epsP2 = 01; /* NOTE(review): "01" is an octal literal equal to 1, while the neighbouring past_ps_diff / past_epsP start at 0 - confirm the intended value before touching it (it feeds the epsP feature of the first frame) */
146 129205 : hSpMusClas->past_epsP = 0;
147 129205 : hSpMusClas->flag_spitch_cnt = 0;
148 :
149 129205 : hSpMusClas->gsc_thres[0] = TH_0_MIN;
150 129205 : hSpMusClas->gsc_thres[1] = TH_1_MIN;
151 129205 : hSpMusClas->gsc_thres[2] = TH_2_MIN;
152 129205 : hSpMusClas->gsc_thres[3] = TH_3_MIN;
153 129205 : set_f( hSpMusClas->gsc_lt_diff_etot, 0.0f, MAX_LT );
154 129205 : hSpMusClas->gsc_mem_etot = 0.0f;
155 129205 : hSpMusClas->gsc_last_music_flag = 0;
156 129205 : hSpMusClas->gsc_nb_thr_1 = 0;
157 129205 : hSpMusClas->gsc_nb_thr_3 = 0;
158 129205 : hSpMusClas->mold_corr = 0.9f;
159 129205 : hSpMusClas->mean_avr_dyn = 0.5f;
160 129205 : hSpMusClas->last_sw_dyn = 10.0f;
161 :
162 129205 : hSpMusClas->relE_attack_cnt = 0;
163 129205 : hSpMusClas->prev_relE = 0.0f;
164 129205 : hSpMusClas->prev_Etot = 0.0f;
165 129205 : hSpMusClas->prev_vad = 0;
166 129205 : hSpMusClas->vad_0_1_cnt = 0;
167 129205 : hSpMusClas->relE_attack_sum = 0;
168 :
169 : /* speech/music classifier improvement: all BUF_LEN-long feature buffers start at 0, except buf_flux which starts at -100 */
170 7881505 : for ( i = 0; i < BUF_LEN; i++ )
171 : {
172 7752300 : hSpMusClas->buf_flux[i] = -100;
173 7752300 : hSpMusClas->buf_pkh[i] = 0;
174 7752300 : hSpMusClas->buf_epsP_tilt[i] = 0;
175 7752300 : hSpMusClas->buf_cor_map_sum[i] = 0;
176 7752300 : hSpMusClas->buf_Ntonal[i] = 0;
177 7752300 : hSpMusClas->buf_Ntonal2[i] = 0;
178 7752300 : hSpMusClas->buf_Ntonal_lf[i] = 0;
179 : }
180 :
181 129205 : set_f( hSpMusClas->lpe_buf, 0, HANG_LEN_INIT );
182 129205 : set_f( hSpMusClas->voicing_buf, 0, HANG_LEN_INIT );
183 129205 : hSpMusClas->gsc_hangover = 0;
184 129205 : set_f( hSpMusClas->sparse_buf, 0, HANG_LEN_INIT );
185 129205 : set_f( hSpMusClas->hf_spar_buf, 0, HANG_LEN_INIT );
186 129205 : hSpMusClas->LT_sparse = 0.0f;
187 129205 : hSpMusClas->gsc_cnt = 0;
188 129205 : hSpMusClas->last_vad_spa = 0;
189 :
190 129205 : set_f( hSpMusClas->old_Bin_E, 0.0f, 3 * N_OLD_BIN_E );
191 129205 : set_f( hSpMusClas->buf_etot, 0, 4 ); /* NOTE(review): literal length 4 - assumes buf_etot[] holds at least 4 floats, confirm against the struct declaration */
192 129205 : set_f( hSpMusClas->buf_dlp, 0, 10 ); /* NOTE(review): literal length 10 - assumes buf_dlp[] holds at least 10 floats, confirm against the struct declaration */
193 :
194 129205 : hSpMusClas->UV_cnt1 = 300;
195 129205 : hSpMusClas->LT_UV_cnt1 = 250.0f;
196 129205 : hSpMusClas->onset_cnt = 0;
197 129205 : hSpMusClas->attack_hangover = 0;
198 129205 : hSpMusClas->dec_mov = 0.0f;
199 129205 : hSpMusClas->dec_mov1 = 0.0f;
200 129205 : hSpMusClas->mov_log_max_spl = 200.0f;
201 129205 : hSpMusClas->old_lt_diff[0] = 0.0f;
202 129205 : hSpMusClas->old_lt_diff[1] = 0.0f;
203 :
204 129205 : set_f( hSpMusClas->finc_prev, 0.0f, ATT_NSEG ); /* state passed to attack_det() by sp_mus_classif_2nd(), together with lt_finc and last_strong_attack */
205 129205 : hSpMusClas->lt_finc = 0.0f;
206 129205 : hSpMusClas->last_strong_attack = 0;
207 129205 : hSpMusClas->tdm_lt_Etot = 0.01f;
208 129205 : set_f( hSpMusClas->tod_lt_Bin_E, 0.0f, TOD_NSPEC );
209 129205 : set_f( hSpMusClas->tod_S_map_lt, 0.0f, TOD_NSPEC );
210 129205 : hSpMusClas->tod_thr_lt = TOD_THR_MASS;
211 129205 : hSpMusClas->tod_weight = 0.0f;
212 129205 : hSpMusClas->tod_S_mass_prev = 0.0f;
213 129205 : hSpMusClas->tod_S_mass_lt = 0.0f;
214 :
215 : /* speech/music classification */
216 129205 : set_s( hSpMusClas->lt_old_mode, 1, 3 ); /* the three past-mode entries start at 1 (not 0) */
217 129205 : hSpMusClas->lt_voicing = 0.5f;
218 129205 : hSpMusClas->lt_corr = 0.5f;
219 129205 : hSpMusClas->lt_tonality = 0;
220 129205 : set_s( hSpMusClas->lt_corr_pitch, 0, 3 );
221 129205 : hSpMusClas->lt_hangover = 0;
222 129205 : hSpMusClas->lowrate_pitchGain = 0;
223 :
224 129205 : hSpMusClas->lt_music_hangover = 0;
225 129205 : set_f( hSpMusClas->tonality2_buf, 0, HANG_LEN_INIT );
226 129205 : set_f( hSpMusClas->tonality3_buf, 0, HANG_LEN_INIT );
227 129205 : set_f( hSpMusClas->LPCErr_buf, 0, HANG_LEN_INIT );
228 129205 : hSpMusClas->lt_music_state = 0;
229 129205 : hSpMusClas->lt_speech_state = 0;
230 129205 : hSpMusClas->lt_speech_hangover = 0;
231 :
232 129205 : hSpMusClas->lt_dec_thres = 10.0f;
233 129205 : hSpMusClas->ener_RAT = 0.0f;
234 :
235 129205 : hSpMusClas->high_stable_cor = 0;
236 129205 : set_f( hSpMusClas->var_cor_t, 0.0f, VAR_COR_LEN );
237 :
238 129205 : hSpMusClas->lps = 0.0f; /* log-probabilities; lps and lpm are refreshed every frame by sp_mus_classif_1st() */
239 129205 : hSpMusClas->lpm = 0.0f;
240 129205 : hSpMusClas->lpn = 0.0f;
241 :
242 129205 : return;
243 : }
244 :
245 :
246 : /*---------------------------------------------------------------------*
247 : * speech_music_classif()
248 : *
249 : * Speech/music classification: runs the 1st stage (GMM) classifier and, for MODE1 or a 12.8 kHz core, the 2nd stage and the correction steps; results are written to st->sp_aud_decision0/1/2, st->GSC_noisy_speech and st->coder_type
250 : *
251 : * The following technologies are used based on the outcome of the sp/mus classifier
252 : * sp_aud_decision1 sp_aud_decision2
253 : * 0 0 use ACELP (+TD BWE)
254 : * 1 0 use ACELP (+FD BWE) or HQ/LR-MDCT depending on bitrate
255 : * 1 1 use GSC (+FD BWE) or HQ/LR-MDCT depending on bitrate
256 : *
257 : * 0 1 exceptionally use GSC (+FD BWE) instead of LR-MDCT at 13.2 kbps (WB/SWB) for sparse spectra
258 : *---------------------------------------------------------------------*/
259 :
260 : /*! r: none (function is void); the 1st stage decision (1-music, 0-speech or noise) is stored in st->sp_aud_decision1 */
261 83858 : void speech_music_classif(
262 : Encoder_State *st, /* i/o: state structure */
263 : const float *new_inp, /* i : new input signal */
264 : const float *inp, /* i : input signal to locate attack position */
265 : const int16_t localVAD_HE_SAD, /* i : HE-SAD flag without hangover */
266 : const float lsp_new[M], /* i : LSPs in current frame */
267 : const float cor_map_sum, /* i : correlation map sum (from multi-harmonic anal.) */
268 : const float epsP[M + 1], /* i : LP prediction error */
269 : const float PS[], /* i : energy spectrum */
270 : const float Etot, /* i : total frame energy */
271 : const float old_cor, /* i : max correlation from previous frame */
272 : int16_t *attack_flag, /* o : attack flag (GSC or TC) */
273 : const float non_sta, /* i : unbound non-stationarity for sp/mus classifier */
274 : const float relE, /* i : relative frame energy */
275 : int16_t *high_lpn_flag, /* o : sp/mus LPN flag */
276 : const int16_t flag_spitch /* i : flag to indicate very short stable pitch */
277 : )
278 : {
279 : float voi_fv, cor_map_sum_fv, LPCErr;
280 :
281 : /* 1st stage speech/music classification based on the GMM model */
282 83858 : st->sp_aud_decision1 = sp_mus_classif_1st( st, localVAD_HE_SAD, lsp_new, cor_map_sum, epsP, PS, non_sta, relE, &voi_fv, &cor_map_sum_fv, &LPCErr, high_lpn_flag );
283 :
284 83858 : if ( st->codec_mode == MODE1 || st->sr_core == INT_FS_12k8 )
285 : {
286 :
287 : /* Improvement of the 1st stage decision for mixed/music content */
288 55265 : if ( !st->Opt_SC_VBR && ( st->total_brate != ACELP_24k40 ) )
289 : {
290 52755 : music_mixed_classif_improv( st, new_inp, epsP, Etot, old_cor, cor_map_sum );
291 : }
292 :
293 55265 : st->sp_aud_decision0 = st->sp_aud_decision1; /* snapshot of sp_aud_decision1 taken before the 2nd stage and the correction steps below */
294 :
295 : /* 2nd stage speech/music classification (rewrite music to speech in onsets) */
296 55265 : st->sp_aud_decision2 = st->sp_aud_decision1;
297 :
298 55265 : if ( st->bwidth > NB )
299 : {
300 47715 : sp_mus_classif_2nd( st, Etot, attack_flag, inp );
301 :
302 47715 : if ( flag_spitch && st->bwidth == WB && st->total_brate < ACELP_13k20 )
303 : {
304 : /* avoid switch to AUDIO/MUSIC class for very short stable high pitch
305 : and/or stable pitch with high correlation at low bitrates*/
306 56 : st->sp_aud_decision2 = 0;
307 : }
308 : }
309 :
310 : /* Context-based improvement of 1st and 2nd stage decision on stable tonal signals */
311 55265 : if ( !st->Opt_SC_VBR && st->total_brate != ACELP_24k40 )
312 : {
313 52755 : tonal_context_improv( st, PS, voi_fv, cor_map_sum_fv, LPCErr );
314 : }
315 :
316 : /* Avoid using LR-MDCT on sparse spectra, use GSC instead at 13.2 kbps (WB/SWB) */
317 55265 : if ( !st->Opt_SC_VBR && st->total_brate == ACELP_13k20 && st->vad_flag == 1 && ( st->bwidth == WB || st->bwidth == SWB ) )
318 : {
319 16594 : detect_sparseness( st, localVAD_HE_SAD, voi_fv );
320 : }
321 :
322 : /* override speech/music classification to ACELP when background noise level reaches certain level */
323 : /* this is a patch against mis-classifications during active noisy speech segments */
324 55265 : if ( st->lp_noise > 12.0f )
325 : {
326 16504 : st->sp_aud_decision1 = 0;
327 16504 : st->sp_aud_decision2 = 0;
328 : }
329 :
330 : /* set GSC noisy speech flag on unvoiced SWB segments */
331 55265 : st->GSC_noisy_speech = 0;
332 55265 : if ( st->vad_flag == 1 && st->total_brate >= ACELP_13k20 && st->total_brate < ACELP_24k40 &&
333 18887 : st->lp_noise > 12.0f && st->sp_aud_decision1 == 0 && st->bwidth >= SWB && /* NOTE(review): sp_aud_decision1 == 0 is already guaranteed whenever lp_noise > 12.0f because of the override just above */
334 1896 : st->coder_type_raw == UNVOICED )
335 : {
336 236 : st->GSC_noisy_speech = 1;
337 : }
338 :
339 : /* Select AUDIO frames (with DEBUGGING, st->force == 1 forces AUDIO and st->force == -1 keeps the regular decision) */
340 : #ifdef DEBUGGING
341 : if ( st->codec_mode == MODE1 && ( st->force == 1 || ( st->force == -1 && ( st->sp_aud_decision2 || st->GSC_noisy_speech ) ) ) )
342 : #else
343 55265 : if ( st->codec_mode == MODE1 && ( st->sp_aud_decision2 || st->GSC_noisy_speech ) )
344 : #endif
345 : {
346 15582 : st->coder_type = AUDIO;
347 15582 : st->hGSCEnc->noise_lev = NOISE_LEVEL_SP0; /* NOTE(review): st->hGSCEnc is dereferenced without a NULL check (sp_mus_classif_1st() does check st->hHQ_core) - presumably always allocated in MODE1, confirm */
348 : }
349 : }
350 : else
351 : {
352 28593 : st->sp_aud_decision0 = st->sp_aud_decision1; /* no 2nd stage or correction steps on this path; sp_aud_decision2, GSC_noisy_speech and *attack_flag are not written here */
353 : }
354 :
355 :
356 83858 : return;
357 : }
358 :
359 :
360 : /*---------------------------------------------------------------------*
361 : * sp_mus_classif_1st()
362 : *
363 : * 1st stage speech/music classification (based on the GMM model): builds and scales the feature vector, evaluates the speech, noise and music mixtures, runs the sp_mus_state machine and applies the hangover logic
364 : *---------------------------------------------------------------------*/
365 :
366 : /*! r: decision flag (1-music, 0-speech or noise) */
367 83858 : static int16_t sp_mus_classif_1st(
368 : Encoder_State *st, /* i/o: state structure */
369 : const int16_t localVAD_HE_SAD, /* i : local VAD HE flag */
370 : const float lsp_new[M], /* i : LSPs in current frame */
371 : const float cor_map_sum, /* i : correlation map sum (from multi-harmonic anal.) */
372 : const float epsP[M + 1], /* i : LP prediction error */
373 : const float PS[], /* i : energy spectrum */
374 : float non_sta, /* i : unbound non-stationarity */
375 : float relE, /* i : relative frame energy */
376 : float *voi_fv, /* o : scaled voicing feature */
377 : float *cor_map_sum_fv, /* o : scaled correlation map feature */
378 : float *LPCErr, /* o : scaled LP prediction error feature */
379 : int16_t *high_lpn_flag /* o : sp/mus LPN flag */
380 : )
381 : {
382 : int16_t i, k, p, dec, vad;
383 : float dlp, ftmp, lepsP1, sum_PS, ps_diff, ps_sta, wrelE, wdrop, wght, mx;
384 83858 : float FV[N_FEATURES], *pFV = FV, PS_norm[128], dPS[128], lsp[M]; /* NOTE(review): PS_norm[] and dPS[] are indexed up to HIGHEST_FBIN - 1, which assumes HIGHEST_FBIN <= 128 - confirm */
385 83858 : float pys, pym, xm[N_FEATURES], py, lps = 0, lpm = 0;
386 : const float *pSF;
387 83858 : float pyn, lpn = 0;
388 :
389 83858 : SP_MUS_CLAS_HANDLE hSpMusClas = st->hSpMusClas;
390 :
391 : /*------------------------------------------------------------------*
392 : * Initialization
393 : *------------------------------------------------------------------*/
394 :
395 83858 : vad = localVAD_HE_SAD;
396 :
397 : /*------------------------------------------------------------------*
398 : * Preparation of the feature vector (12 values are written through pFV below; presumably N_FEATURES == 12 - confirm)
399 : *------------------------------------------------------------------*/
400 :
401 : /* [0] OL pitch: last pitch value when tc_cnt is 1 or 2, otherwise the mean of the three pitch values */
402 83858 : if ( st->tc_cnt == 1 || st->tc_cnt == 2 )
403 : {
404 6816 : *pFV++ = (float) st->pitch[2];
405 : }
406 : else
407 : {
408 77042 : *pFV++ = (float) ( st->pitch[0] + st->pitch[1] + st->pitch[2] ) / 3.0f;
409 : }
410 :
411 : /* [1] voicing: same selection rule as for the pitch feature */
412 83858 : if ( st->tc_cnt == 1 || st->tc_cnt == 2 )
413 : {
414 6816 : *pFV++ = st->voicing[2];
415 : }
416 : else
417 : {
418 77042 : *pFV++ = (float) ( st->voicing[0] + st->voicing[1] + st->voicing[2] ) / 3.0f;
419 : }
420 :
421 : /* [2,3,4,5,6] LSFs: each feature is acos( lsp[k] ) of the current frame plus the value stored from the previous frame, k = 1..5 */
422 83858 : mvr2r( lsp_new, lsp, M );
423 :
424 83858 : ftmp = (float) acos( lsp[1] );
425 83858 : *pFV++ = ftmp + hSpMusClas->last_lsp[1];
426 83858 : hSpMusClas->last_lsp[1] = ftmp;
427 :
428 83858 : ftmp = (float) acos( lsp[2] );
429 83858 : *pFV++ = ftmp + hSpMusClas->last_lsp[2];
430 83858 : hSpMusClas->last_lsp[2] = ftmp;
431 :
432 83858 : ftmp = (float) acos( lsp[3] );
433 83858 : *pFV++ = ftmp + hSpMusClas->last_lsp[3];
434 83858 : hSpMusClas->last_lsp[3] = ftmp;
435 :
436 83858 : ftmp = (float) acos( lsp[4] );
437 83858 : *pFV++ = ftmp + hSpMusClas->last_lsp[4];
438 83858 : hSpMusClas->last_lsp[4] = ftmp;
439 :
440 83858 : ftmp = (float) acos( lsp[5] );
441 83858 : *pFV++ = ftmp + hSpMusClas->last_lsp[5];
442 83858 : hSpMusClas->last_lsp[5] = ftmp;
443 :
444 : /* [7] cor_map_sum (current + previous frame) */
445 83858 : *pFV++ = cor_map_sum + hSpMusClas->last_cor_map_sum;
446 83858 : hSpMusClas->last_cor_map_sum = cor_map_sum;
447 :
448 : /* [8] non_sta (current + previous frame) */
449 83858 : *pFV++ = non_sta + hSpMusClas->last_non_sta;
450 83858 : hSpMusClas->last_non_sta = non_sta;
451 :
452 : /* [9] epsP: log( epsP[13] ) - log( epsP[1] ), current + previous frame */
453 83858 : if ( st->bwidth == NB )
454 : {
455 : /* do not take into account (statistics are too different); past_epsP2 is left unchanged on this path */
456 7550 : *pFV++ = -1.647f;
457 : }
458 : else
459 : {
460 76308 : lepsP1 = logf( epsP[1] + 1e-5f );
461 76308 : ftmp = logf( epsP[13] ) - lepsP1; /* NOTE(review): no 1e-5f offset on epsP[13], unlike epsP[1] - assumes epsP[13] > 0 and M >= 13, confirm */
462 76308 : *pFV++ = ftmp + hSpMusClas->past_epsP2;
463 76308 : hSpMusClas->past_epsP2 = ftmp;
464 : }
465 :
466 : /* calculation of differential normalized power spectrum (bins LOWEST_FBIN .. HIGHEST_FBIN - 1; the 1e-5f start value keeps the divisor non-zero) */
467 83858 : sum_PS = 1e-5f;
468 5702344 : for ( i = LOWEST_FBIN; i < HIGHEST_FBIN; i++ )
469 : {
470 5618486 : sum_PS += PS[i];
471 : }
472 :
473 5702344 : for ( i = LOWEST_FBIN; i < HIGHEST_FBIN; i++ )
474 : {
475 5618486 : PS_norm[i] = PS[i] / sum_PS;
476 5618486 : dPS[i] = fabsf( PS_norm[i] - hSpMusClas->past_PS[i - LOWEST_FBIN] ); /* past_PS[] is stored with an offset of LOWEST_FBIN */
477 : }
478 :
479 : /* [10] ps_diff (spectral difference): log of the summed absolute differences, current + previous frame */
480 83858 : ps_diff = 0;
481 5702344 : for ( i = LOWEST_FBIN; i < HIGHEST_FBIN; i++ )
482 : {
483 5618486 : ps_diff += dPS[i];
484 : }
485 :
486 83858 : ps_diff = logf( ps_diff + 1e-5f );
487 83858 : *pFV++ = ps_diff + hSpMusClas->past_ps_diff;
488 83858 : hSpMusClas->past_ps_diff = ps_diff;
489 :
490 : /* [11] ps_sta (spectral stationarity): log of the sum over bins of max( current, past ) / ( |difference| + 1e-5f ) */
491 83858 : ps_sta = 0;
492 5702344 : for ( i = LOWEST_FBIN; i < HIGHEST_FBIN; i++ )
493 : {
494 5618486 : mx = PS_norm[i] > hSpMusClas->past_PS[i - LOWEST_FBIN] ? PS_norm[i] : hSpMusClas->past_PS[i - LOWEST_FBIN];
495 5618486 : ps_sta += mx / ( dPS[i] + 1e-5f );
496 : }
497 :
498 83858 : *pFV++ = logf( ps_sta + 1e-5f );
499 83858 : mvr2r( &PS_norm[LOWEST_FBIN], hSpMusClas->past_PS, HIGHEST_FBIN - LOWEST_FBIN ); /* store the normalized spectrum for the next frame */
500 :
501 : /*------------------------------------------------------------------*
502 : * Scaling of the feature vector (FV[i] = scale * FV[i] + offset, with one scale/offset pair per feature taken from SF_8k for NB and from SF otherwise)
503 : *------------------------------------------------------------------*/
504 :
505 83858 : pFV = FV;
506 83858 : if ( st->bwidth == NB )
507 : {
508 7550 : pSF = SF_8k;
509 : }
510 : else
511 : {
512 76308 : pSF = SF;
513 : }
514 :
515 1090154 : for ( i = 0; i < N_FEATURES; i++, pFV++, pSF += 2 )
516 : {
517 1006296 : *pFV = pSF[0] * *pFV + pSF[1];
518 : }
519 :
520 : /* store some scaled parameters for later correction of the 1st stage speech/music classification */
521 83858 : *voi_fv = FV[1];
522 83858 : *cor_map_sum_fv = FV[7];
523 83858 : *LPCErr = FV[9];
524 :
525 : /*------------------------------------------------------------------*
526 : * Calculation of posterior probability
527 : * Log-probability
528 : *------------------------------------------------------------------*/
529 :
530 83858 : pys = pym = pyn = 1e-5f; /* non-zero start value keeps the logf() calls below finite */
531 :
532 : /* run loop for all mixtures (for each mixture, calculate the probability of speech or noise and the probability of music) */
533 587006 : for ( k = 0; k < N_MIXTURES; k++ )
534 : {
535 : /* active frames - calculate the probability of speech */
536 6540924 : for ( p = 0; p < N_FEATURES; p++ )
537 : {
538 6037776 : xm[p] = FV[p] - m_speech[k * N_FEATURES + p];
539 : }
540 :
541 503148 : py = lvm_speech[k] + dot_product_mat( xm, &invV_speech[k * N_FEATURES * N_FEATURES], N_FEATURES );
542 503148 : pys += expf( py );
543 : /* inactive frames - calculate the probability of noise */
544 6540924 : for ( p = 0; p < N_FEATURES; p++ )
545 : {
546 6037776 : xm[p] = FV[p] - m_noise[k * N_FEATURES + p];
547 : }
548 :
549 503148 : py = lvm_noise[k] + dot_product_mat( xm, &invV_noise[k * N_FEATURES * N_FEATURES], N_FEATURES );
550 503148 : pyn += expf( py );
551 :
552 : /* either active or inactive frames - calculate the probability of music */
553 6540924 : for ( p = 0; p < N_FEATURES; p++ )
554 : {
555 6037776 : xm[p] = FV[p] - m_music[k * N_FEATURES + p];
556 : }
557 :
558 503148 : py = lvm_music[k] + dot_product_mat( xm, &invV_music[k * N_FEATURES * N_FEATURES], N_FEATURES );
559 503148 : pym += expf( py );
560 : }
561 :
562 : /* calculate log-probability (the same constant 0.5 * N_FEATURES * log( PI2 ) is subtracted from all three; presumably the Gaussian normalisation term with PI2 == 2 * pi - confirm) */
563 83858 : lps = logf( pys ) - 0.5f * N_FEATURES * logf( PI2 );
564 83858 : lpm = logf( pym ) - 0.5f * N_FEATURES * logf( PI2 );
565 83858 : lpn = logf( pyn ) - 0.5f * N_FEATURES * logf( PI2 );
566 :
567 83858 : *high_lpn_flag = 0;
568 83858 : if ( lpn > lps && lpn > lpm )
569 : {
570 14367 : *high_lpn_flag = 1; /* noise is the most likely of the three classes */
571 : }
572 :
573 83858 : if ( !vad )
574 : {
575 : /* inactive frame: the speech log-probability is replaced by the noise log-probability scaled by 1.2 */
576 10764 : lps = lpn * 1.2f;
577 : }
578 :
579 83858 : hSpMusClas->lpm = lpm;
580 83858 : hSpMusClas->lps = lps; /* note: hSpMusClas->lpn is not updated in this function */
581 :
582 : /* determine HQ Generic speech class (1 when lps exceeds lpm by more than 0.5, otherwise 0) */
583 83858 : if ( st->hHQ_core != NULL )
584 : {
585 83858 : if ( lps > lpm + 0.5f )
586 : {
587 39034 : st->hHQ_core->hq_generic_speech_class = 1;
588 : }
589 : else
590 : {
591 44824 : st->hHQ_core->hq_generic_speech_class = 0;
592 : }
593 : }
594 :
595 : /*------------------------------------------------------------------*
596 : * State machine (sp_mus_state < 0 .. inactive, = 0 .. instable, 0 < state < HANG_LEN .. entry, = HANG_LEN .. stable active)
597 : *------------------------------------------------------------------*/
598 :
599 83858 : if ( vad )
600 : {
601 73094 : if ( relE < -20 || ( lps <= -5 && lpm <= -5 ) )
602 : {
603 7329 : if ( hSpMusClas->sp_mus_state > 0 )
604 : {
605 1280 : if ( hSpMusClas->sp_mus_state < HANG_LEN )
606 : {
607 : /* energy is too low but we are in entry period -> reset the inactive counter to allow new entry later */
608 103 : hSpMusClas->inact_cnt = 0;
609 : }
610 :
611 : /* energy is too low -> we are going to instable state */
612 1280 : hSpMusClas->sp_mus_state = 0;
613 : }
614 6049 : else if ( hSpMusClas->sp_mus_state > -HANG_LEN )
615 : {
616 : /* energy is still too low -> we are still in instable state */
617 2916 : hSpMusClas->sp_mus_state--;
618 : }
619 : }
620 65765 : else if ( hSpMusClas->sp_mus_state <= 0 )
621 : {
622 1597 : if ( hSpMusClas->inact_cnt == 0 )
623 : {
624 : /* inactive counter has run out (or was reset) -> start a new entry period */
625 497 : hSpMusClas->sp_mus_state = 1;
626 : }
627 : else
628 : {
629 : /* inactive counter is still running -> skip the entry period and go directly to the stable active state */
630 1100 : hSpMusClas->sp_mus_state = HANG_LEN;
631 : }
632 :
633 1597 : hSpMusClas->inact_cnt = 12; /* re-arm the inactive counter; it is decremented below while sp_mus_state < 0 or while VAD is off */
634 : }
635 64168 : else if ( hSpMusClas->sp_mus_state > 0 && hSpMusClas->sp_mus_state < HANG_LEN )
636 : {
637 : /* we are inside an entry period -> increment the counter of entry frames */
638 2557 : hSpMusClas->sp_mus_state++;
639 : }
640 :
641 73094 : if ( hSpMusClas->sp_mus_state < 0 && hSpMusClas->inact_cnt > 0 )
642 : {
643 3472 : hSpMusClas->inact_cnt--;
644 : }
645 : }
646 : else
647 : {
648 10764 : if ( hSpMusClas->sp_mus_state > 0 && hSpMusClas->sp_mus_state < HANG_LEN )
649 : {
650 50 : hSpMusClas->inact_cnt = 0; /* VAD dropped during the entry period -> reset the inactive counter */
651 : }
652 10714 : else if ( hSpMusClas->inact_cnt > 0 )
653 : {
654 1941 : hSpMusClas->inact_cnt--;
655 : }
656 :
657 10764 : if ( hSpMusClas->sp_mus_state > 0 && hSpMusClas->sp_mus_state < HANG_LEN )
658 : {
659 50 : hSpMusClas->sp_mus_state = -HANG_LEN; /* entry period aborted -> jump straight to the fully inactive state */
660 : }
661 10714 : else if ( hSpMusClas->sp_mus_state > 0 )
662 : {
663 264 : hSpMusClas->sp_mus_state = -1; /* leaving the stable active state -> start counting down towards -HANG_LEN */
664 : }
665 10450 : else if ( hSpMusClas->sp_mus_state > -HANG_LEN )
666 : {
667 : /* we are in inactive state */
668 1244 : hSpMusClas->sp_mus_state--;
669 : }
670 : }
671 :
672 : /*------------------------------------------------------------------*
673 : * Decision without hangover
674 : * Weighted decision
675 : *------------------------------------------------------------------*/
676 :
677 : /* decision without hangover (0 - speech/noise, 1 - music) */
678 83858 : dec = lpm > lps;
679 83858 : dlp = lpm - lps;
680 :
681 83858 : if ( !vad )
682 : {
683 10764 : dec = 0;
684 10764 : dlp = 0;
685 : }
686 :
687 : /* calculate weight based on relE (close to 0.01 in low-E regions, close to 1 in high-E regions) */
688 83858 : wrelE = 1.0f + relE / 15;
689 :
690 83858 : if ( wrelE > 1.0f )
691 : {
692 29024 : wrelE = 1.0f;
693 : }
694 54834 : else if ( wrelE < 0.01f )
695 : {
696 16772 : wrelE = 0.01f;
697 : }
698 :
699 : /* calculate weight based on drops of dlp (close to 1 during sudden drops of dlp, close to 0 otherwise) */
700 83858 : if ( dlp < 0 && dlp < hSpMusClas->past_dlp[0] )
701 : {
702 19499 : if ( hSpMusClas->past_dlp[0] > 0 )
703 : {
704 6833 : hSpMusClas->wdrop = -dlp; /* sign change from positive to negative -> restart the accumulated drop */
705 : }
706 : else
707 : {
708 12666 : hSpMusClas->wdrop += hSpMusClas->past_dlp[0] - dlp; /* dlp keeps falling -> accumulate the size of the drop */
709 : }
710 : }
711 : else
712 : {
713 64359 : hSpMusClas->wdrop = 0;
714 : }
715 :
716 83858 : wdrop = hSpMusClas->wdrop / 20; /* local weight, limited to the range [0.1, 1.0] below */
717 :
718 83858 : if ( wdrop > 1.0f )
719 : {
720 0 : wdrop = 1.0f;
721 : }
722 83858 : else if ( wdrop < 0.1f )
723 : {
724 73413 : wdrop = 0.1f;
725 : }
726 :
727 : /* combine weights into one (not allowed to fall below 0.01) */
728 83858 : wght = wrelE * wdrop;
729 83858 : if ( wght < 0.01f )
730 : {
731 18515 : wght = 0.01f;
732 : }
733 :
734 : /* calculate weighted decision (recursive average of dlp with the weight computed above) */
735 83858 : hSpMusClas->wdlp_0_95_sp = wght * dlp + ( 1 - wght ) * hSpMusClas->wdlp_0_95_sp;
736 :
737 83858 : if ( hSpMusClas->sp_mus_state == -HANG_LEN )
738 : {
739 12770 : hSpMusClas->wdlp_0_95_sp = 0; /* fully inactive -> forget the weighted decision */
740 : }
741 :
742 : /*------------------------------------------------------------------*
743 : * Final speech/music decision
744 : *------------------------------------------------------------------*/
745 :
746 83858 : if ( !vad && hSpMusClas->sp_mus_state == -HANG_LEN )
747 : {
748 : /* inactive state */
749 9402 : dec = 0;
750 : }
751 74456 : else if ( hSpMusClas->sp_mus_state <= 0 )
752 : {
753 : /* transition from active to inactive state or instable state -> keep the previous decision */
754 8691 : dec = hSpMusClas->past_dec[0];
755 : }
756 65765 : else if ( hSpMusClas->sp_mus_state > 0 && hSpMusClas->sp_mus_state < HANG_LEN )
757 : {
758 : /* entry state -> final decision is calculated based on weighted average of past non-binary decisions (weights taken from row sp_mus_state - 1 of w_spmus) */
759 2710 : ftmp = w_spmus[hSpMusClas->sp_mus_state - 1][0] * dlp;
760 2710 : ftmp += dotp( &w_spmus[hSpMusClas->sp_mus_state - 1][1], hSpMusClas->past_dlp, HANG_LEN - 1 );
761 2710 : dec = ftmp > 2.0f;
762 : }
763 : else
764 : {
765 : /* stable active state */
766 63055 : if ( hSpMusClas->wdlp_0_95_sp > 0 && hSpMusClas->past_dec[0] == 0 && hSpMusClas->past_dec[1] == 0 && hSpMusClas->past_dec[2] == 0 )
767 : {
768 : /* switching from speech to music (requires three consecutive past speech decisions) */
769 492 : dec = 1;
770 : }
771 62563 : else if ( hSpMusClas->past_dec[0] == 1 && hSpMusClas->wdlp_0_95_sp < 0 )
772 : {
773 : /* switching from music to speech */
774 456 : dec = 0;
775 : }
776 : else
777 : {
778 62107 : dec = hSpMusClas->past_dec[0];
779 : }
780 : }
781 :
782 : /*------------------------------------------------------------------*
783 : * Updates
784 : *------------------------------------------------------------------*/
785 :
786 : /* update buffer of past non-binary decisions (shift by one entry; source and destination overlap - assumes mvr2r() supports overlapping regions, confirm) */
787 83858 : mvr2r( &hSpMusClas->past_dlp[0], &hSpMusClas->past_dlp[1], HANG_LEN - 2 );
788 83858 : hSpMusClas->past_dlp[0] = dlp;
789 :
790 : /* update buffer of past binary decisions (same overlapping shift, via mvs2s()) */
791 83858 : mvs2s( &hSpMusClas->past_dec[0], &hSpMusClas->past_dec[1], HANG_LEN - 2 );
792 83858 : hSpMusClas->past_dec[0] = dec;
793 :
794 83858 : return dec;
795 : }
796 :
797 :
798 : /*---------------------------------------------------------------------*
799 : * sp_mus_classif_2nd()
800 : *
801 : * 2nd stage speech/music classifier (convert music to speech for onsets)
802 : *---------------------------------------------------------------------*/
803 :
804 47715 : static void sp_mus_classif_2nd(
805 : Encoder_State *st, /* i/o: encoder state structure */
806 : const float Etot, /* i : total frame energy */
807 : int16_t *attack_flag, /* i/o: attack flag (GSC or TC) */
808 : const float *inp /* i : input signal */
809 : )
810 : {
811 : int16_t attack;
812 47715 : SP_MUS_CLAS_HANDLE hSpMusClas = st->hSpMusClas;
813 :
814 : /* initialization */
815 47715 : *attack_flag = 0;
816 :
817 : /* signal stability estimation */
818 47715 : stab_est( Etot, hSpMusClas->gsc_lt_diff_etot, &hSpMusClas->gsc_mem_etot, &hSpMusClas->gsc_nb_thr_3, &hSpMusClas->gsc_nb_thr_1, hSpMusClas->gsc_thres, &hSpMusClas->gsc_last_music_flag, st->vad_flag );
819 :
820 : /* calculate variance of correlation */
821 47715 : var_cor_calc( st->old_corr, &hSpMusClas->mold_corr, hSpMusClas->var_cor_t, &hSpMusClas->high_stable_cor );
822 :
823 : /* attack detection */
824 47715 : attack = attack_det( inp, st->clas, st->localVAD, st->coder_type, st->total_brate, EVS_MONO, st->clas, hSpMusClas->finc_prev, &hSpMusClas->lt_finc, &hSpMusClas->last_strong_attack );
825 :
826 : /* change decision from music to speech in certain special cases */
827 47715 : if ( st->sp_aud_decision1 == 1 )
828 : {
829 17624 : if ( hSpMusClas->ener_RAT < 0.18f && hSpMusClas->lt_dec_thres > 15.0f )
830 : {
831 : /* strong music decision but almost no content below 1kHz */
832 0 : st->sp_aud_decision2 = 0;
833 : }
834 17624 : else if ( hSpMusClas->high_stable_cor && st->pitch[0] >= 130 )
835 : {
836 : /* prevent GSC in highly correlated signal with low energy variation */
837 : /* this is basically a patch against bassoon-type of music */
838 53 : st->sp_aud_decision2 = 0;
839 :
840 53 : if ( st->codec_mode == MODE1 && st->coder_type == TRANSITION )
841 : {
842 0 : st->coder_type = GENERIC;
843 : }
844 : }
845 17571 : else if ( hSpMusClas->gsc_lt_diff_etot[MAX_LT - 1] > 4.5f && ( hSpMusClas->gsc_lt_diff_etot[MAX_LT - 1] - hSpMusClas->gsc_lt_diff_etot[MAX_LT - 2] > 10.0f ) )
846 : {
847 307 : if ( st->tc_cnt == 1 )
848 : {
849 : /* do TC coding instead of GC/VC if onset has been already declared before */
850 14 : st->sp_aud_decision2 = 0;
851 :
852 14 : if ( st->codec_mode == MODE1 )
853 : {
854 14 : st->coder_type = TRANSITION;
855 : }
856 : }
857 : else
858 : {
859 293 : if ( attack >= ATT_3LSUB_POS )
860 : {
861 : /* do TC coding if attack is located in the last subframe */
862 85 : st->sp_aud_decision2 = 0;
863 85 : *attack_flag = attack + 1;
864 :
865 85 : if ( st->codec_mode == MODE1 )
866 : {
867 84 : st->coder_type = TRANSITION;
868 : }
869 : }
870 208 : else if ( attack >= ATT_SEG_LEN / 2 )
871 : {
872 : /* do GSC coding if attack is located after the first quarter of the first subframe */
873 : /* (pre-echo will be treated at the decoder side) */
874 1 : st->sp_aud_decision2 = 1;
875 1 : *attack_flag = 31;
876 : }
877 : }
878 : }
879 : }
880 30091 : else if ( st->localVAD == 1 && st->coder_type == GENERIC && ( ( attack >= ATT_3LSUB_POS && st->total_brate < ACELP_24k40 ) || ( attack >= ATT_3LSUB_POS_16k && st->total_brate >= ACELP_24k40 && st->total_brate < ACELP_48k ) ) )
881 : {
882 : /* do TC coding if attack is located in the last subframe */
883 424 : *attack_flag = attack + 1;
884 :
885 424 : if ( st->codec_mode == MODE1 )
886 : {
887 414 : st->coder_type = TRANSITION;
888 : }
889 : }
890 :
891 47715 : return;
892 : }
893 :
894 :
895 : /*---------------------------------------------------------------------*
896 : * tonal_det()
897 : *
898 : * Tonal detector based on spectral stability and harmonicity
899 : *---------------------------------------------------------------------*/
900 :
901 3856787 : static float tonal_det(
902 : const float S[],
903 : int16_t vad_flag,
904 : float tod_S_map_lt[],
905 : float *tod_thr_lt,
906 : float *tod_weight,
907 : float *tod_S_mass_prev,
908 : float *tod_S_mass_lt )
909 : {
910 : int16_t i;
911 : float S_mass, alpha;
912 :
913 : /* update the adaptive weight */
914 3856787 : *tod_weight = TON_ALPHA * *tod_weight + ( 1 - TON_ALPHA ) * vad_flag;
915 3856787 : if ( *tod_weight > TON_ALPHA )
916 : {
917 2231668 : *tod_weight = TON_ALPHA;
918 : }
919 1625119 : else if ( *tod_weight < ( 1 - TON_ALPHA ) )
920 : {
921 633983 : *tod_weight = 1 - TON_ALPHA;
922 : }
923 :
924 : /* calculate LT spectral correlation in each band up to 4KHz */
925 3856787 : S_mass = 0.0f;
926 312399747 : for ( i = 0; i < TOD_NSPEC; i++ )
927 : {
928 308542960 : tod_S_map_lt[i] = *tod_weight * tod_S_map_lt[i] + ( 1 - *tod_weight ) * S[i];
929 :
930 308542960 : S_mass += tod_S_map_lt[i];
931 : }
932 3856787 : S_mass /= TOD_NSPEC;
933 :
934 3856787 : if ( S_mass > *tod_S_mass_prev )
935 : {
936 1820035 : alpha = 0.7f;
937 : }
938 : else
939 : {
940 2036752 : alpha = 0.3f;
941 : }
942 3856787 : *tod_S_mass_prev = S_mass;
943 3856787 : *tod_S_mass_lt = alpha * *tod_S_mass_lt + ( 1 - alpha ) * S_mass;
944 3856787 : S_mass = *tod_S_mass_lt;
945 :
946 : /* updating adaptive decision threshold */
947 3856787 : if ( S_mass > *tod_thr_lt )
948 : {
949 69580 : *tod_thr_lt -= THR_MASS_STEP_DN;
950 : }
951 : else
952 : {
953 3787207 : *tod_thr_lt += THR_MASS_STEP_UP;
954 : }
955 :
956 3856787 : if ( *tod_thr_lt > THR_MASS_MAX )
957 : {
958 3776539 : *tod_thr_lt = THR_MASS_MAX;
959 : }
960 :
961 3856787 : if ( *tod_thr_lt < THR_MASS_MIN )
962 : {
963 63303 : *tod_thr_lt = THR_MASS_MIN;
964 : }
965 :
966 3856787 : return S_mass;
967 : }
968 :
969 : /*---------------------------------------------------------------------*
970 : * var_cor_calc()
971 : *
972 : * Calculate variance of correlation
973 : *---------------------------------------------------------------------*/
974 :
975 3904502 : static void var_cor_calc(
976 : const float old_corr,
977 : float *mold_corr,
978 : float var_cor_t[],
979 : int16_t *high_stable_cor )
980 : {
981 : int16_t i;
982 : float var_cor;
983 :
984 : /* update buffer of old correlation values */
985 39045020 : for ( i = VAR_COR_LEN - 1; i > 0; i-- )
986 : {
987 35140518 : var_cor_t[i] = var_cor_t[i - 1];
988 : }
989 3904502 : var_cor_t[i] = old_corr;
990 :
991 : /* calculate variance of correlation */
992 3904502 : var_cor = var( var_cor_t, VAR_COR_LEN );
993 :
994 : /* set flag in case of highly-correlated stable signal */
995 3904502 : if ( *mold_corr > 0.8f && var_cor < 5e-4f )
996 : {
997 309208 : *high_stable_cor = 1;
998 : }
999 : else
1000 : {
1001 3595294 : *high_stable_cor = 0;
1002 : }
1003 :
1004 : /* update average correlation */
1005 3904502 : *mold_corr = 0.1f * old_corr + 0.9f * *mold_corr;
1006 :
1007 3904502 : return;
1008 : }
1009 :
1010 : /*---------------------------------------------------------------------*
1011 : * attack_det()
1012 : *
1013 : * Attack detection
1014 : *---------------------------------------------------------------------*/
1015 :
1016 3904502 : static int16_t attack_det(
1017 : const float *inp, /* i : input signal */
1018 : const int16_t last_clas, /* i : last signal clas */
1019 : const int16_t localVAD, /* i : local VAD flag */
1020 : const int16_t coder_type, /* i : coder type */
1021 : const int32_t total_brate, /* i : total bitrate */
1022 : const int16_t element_mode, /* i : IVAS element mode */
1023 : const int16_t clas, /* i : signal class */
1024 : float finc_prev[], /* i/o: previous finc */
1025 : float *lt_finc, /* i/o: long-term mean finc */
1026 : int16_t *last_strong_attack /* i/o: last strong attack flag */
1027 : )
1028 : {
1029 : int16_t i, attack;
1030 : float etmp, etmp2, finc[ATT_NSEG];
1031 : int16_t att_3lsub_pos;
1032 : int16_t attack1;
1033 :
1034 3904502 : att_3lsub_pos = ATT_3LSUB_POS;
1035 3904502 : if ( total_brate >= ACELP_24k40 )
1036 : {
1037 19544 : att_3lsub_pos = ATT_3LSUB_POS_16k;
1038 : }
1039 :
1040 : /* compute energy per section */
1041 128848566 : for ( i = 0; i < ATT_NSEG; i++ )
1042 : {
1043 124944064 : finc[i] = sum2_f( inp + i * ATT_SEG_LEN, ATT_SEG_LEN );
1044 : }
1045 :
1046 3904502 : attack = maximum( finc, ATT_NSEG, &etmp );
1047 3904502 : attack1 = attack;
1048 :
1049 3904502 : if ( localVAD == 1 && coder_type == GENERIC )
1050 : {
1051 : /* compute mean energy in the first three subframes */
1052 1525620 : etmp = mean( finc, att_3lsub_pos );
1053 :
1054 : /* compute mean energy after the attack */
1055 1525620 : etmp2 = mean( finc + attack, ATT_NSEG - attack );
1056 :
1057 : /* and compare them */
1058 1525620 : if ( etmp * 8 > etmp2 )
1059 : {
1060 : /* stop, if the attack is not sufficiently strong */
1061 1472062 : attack = 0;
1062 : }
1063 :
1064 1525620 : if ( last_clas == VOICED_CLAS && etmp * 20 > etmp2 )
1065 : {
1066 : /* stop, if the signal was voiced and the attack is not sufficiently strong */
1067 344469 : attack = 0;
1068 : }
1069 :
1070 : /* compare wrt. other sections (reduces miss-classification) */
1071 1525620 : if ( attack > 0 )
1072 : {
1073 47353 : etmp2 = finc[attack];
1074 :
1075 981519 : for ( i = 2; i < att_3lsub_pos - 2; i++ )
1076 : {
1077 936250 : if ( finc[i] * 2.0f > etmp2 )
1078 : {
1079 : /* stop, if the attack is not sufficiently strong */
1080 2084 : attack = 0;
1081 2084 : break;
1082 : }
1083 : }
1084 : }
1085 :
1086 1525620 : if ( attack == 0 && element_mode > EVS_MONO && ( clas < VOICED_TRANSITION || clas == ONSET ) )
1087 : {
1088 1018467 : mvr2r( finc, finc_prev, attack1 );
1089 :
1090 : /* compute mean energy before the attack */
1091 1018467 : etmp = mean( finc_prev, ATT_NSEG );
1092 :
1093 1018467 : etmp2 = finc[attack1];
1094 :
1095 1018467 : if ( ( etmp * 16 < etmp2 ) || ( etmp * 12 < etmp2 && last_clas == UNVOICED_CLAS ) )
1096 : {
1097 50957 : attack = attack1;
1098 : }
1099 :
1100 1018467 : if ( 20 * *lt_finc > etmp2 || *last_strong_attack )
1101 : {
1102 948245 : attack = 0;
1103 : }
1104 : }
1105 :
1106 1525620 : *last_strong_attack = attack;
1107 : }
1108 :
1109 : /* compare wrt. other sections (reduces miss-classification) */
1110 2378882 : else if ( attack > 0 )
1111 : {
1112 26951973 : for ( i = 2; i < att_3lsub_pos - 2; i++ )
1113 : {
1114 26128567 : if ( i != attack && finc[i] * 1.3f > finc[attack] )
1115 : {
1116 : /* stop, if the attack is not sufficiently strong */
1117 1319275 : attack = 0;
1118 1319275 : break;
1119 : }
1120 : }
1121 2142681 : *last_strong_attack = 0;
1122 : }
1123 :
1124 : /* updates */
1125 3904502 : mvr2r( finc, finc_prev, ATT_NSEG );
1126 3904502 : *lt_finc = 0.95f * *lt_finc + 0.05f * mean( finc, ATT_NSEG );
1127 :
1128 3904502 : return attack;
1129 : }
1130 :
1131 : /*---------------------------------------------------------------------*
1132 : * ivas_smc_gmm()
1133 : *
1134 : * 1st stage of the speech/music classification (based on the GMM model)
     : *
     : * Processing order inside the function:
     : *  1. update of the classifier state machine (sp_mus_state, inact_cnt)
     : *  2. relE-based attack detection and VAD 0->1 counter update
     : *  3. preparation of the feature vector FV (features [0]..[14])
     : *  4. outlier detection against hout_intervals
     : *  5. adaptive short-term mean filtering of FV (FV_st)
     : *  6. Box-Cox transformation, scaling and PCA
     : *  7. log-likelihoods of the speech, music and noise GMMs
     : *  8. raw score dlp, adaptive weight wght and smoothed score wdlp_0_95_sp
     : *  9. final three-level decision with hysteresis
     : * 10. update of the history buffers past_dlp, past_dlp_mean_ST, past_dec
     : *
     : * Besides the return value, the function writes st->sp_aud_decision0,
     : * st->sp_aud_decision1, *high_lpn_flag, several hSpMusClas fields and,
     : * when the handles are non-NULL, st->hHQ_core->hq_generic_speech_class
     : * and the ps_diff/ps_sta fields of hStereoClassif.
1135 : *---------------------------------------------------------------------*/
1136 :
1137 : /*! r: S/M decision (0=speech or noise,1=unclear,2=music) */
1138 14443821 : int16_t ivas_smc_gmm(
1139 : Encoder_State *st, /* i/o: state structure */
1140 : STEREO_CLASSIF_HANDLE hStereoClassif, /* i/o: stereo classifier structure (may be NULL) */
1141 : const int16_t localVAD_HE_SAD, /* i : HE-SAD flag without hangover */
1142 : const float Etot, /* i : total frame energy */
1143 : const float lsp_new[M], /* i : LSPs in current frame */
1144 : const float cor_map_sum, /* i : correlation map sum (from multi-harmonic anal.) */
1145 : const float epsP[M + 1], /* i : LP prediction error */
1146 : const float PS[], /* i : energy spectrum */
1147 : const float non_sta, /* i : unbound non-stationarity */
1148 : const float relE, /* i : relative frame energy */
1149 : int16_t *high_lpn_flag, /* i/o: sp/mus LPN flag */
1150 : const int16_t flag_spitch /* i : flag to indicate very short stable pitch */
1151 : )
1152 : {
1153 : int16_t i, m, dec;
1154 : int16_t flag_odv;
1155 : float lps, lpm, lpn;
1156 : float ps[N_SMC_MIXTURES], pm[N_SMC_MIXTURES], pn[N_SMC_MIXTURES];
1157 : float fvm[N_PCA_COEF], lprob;
1158 : float dlp, ftmp, sum_PS, ps_diff, ps_sta, wrelE, wdrop, wght;
1159 : float wrise;
1160 : float dlp_mean2var;
1161 : float FV[N_SMC_FEATURES], *pFV, PS_norm[128], dPS[128];
1162 : const float *pODV;
1163 : float *pFV_st, smc_st_mean_fact;
1164 : int16_t relE_attack_flag;
1165 : int16_t j, len;
1166 : const float *pt_mel_fb;
1167 : float melS[NB_MEL_BANDS], mfcc[NB_MEL_BANDS];
1168 : int16_t odv_cnt;
1169 : int16_t i_out[N_SMC_FEATURES], *p_out;
1170 :
1171 : /*------------------------------------------------------------------*
1172 : * Initialization
1173 : *------------------------------------------------------------------*/
1174 :
1175 14443821 : SP_MUS_CLAS_HANDLE hSpMusClas = st->hSpMusClas;
1176 :
1177 : /*------------------------------------------------------------------*
1178 : * State machine (sp_mus_state: -8 = INACTIVE, -7:-1 = UNSTABLE, 0:7 = ENTRY, 8 = STABLE )
1179 : *------------------------------------------------------------------*/
1180 :
1181 14443821 : if ( localVAD_HE_SAD )
1182 : {
1183 11464286 : if ( relE < -20 )
1184 : {
1185 635064 : if ( hSpMusClas->sp_mus_state > 0 )
1186 : {
1187 119896 : if ( hSpMusClas->sp_mus_state < HANG_LEN )
1188 : {
1189 : /* energy is too low but we are in entry period -> reset the inactive counter to allow new entry later */
1190 28713 : hSpMusClas->inact_cnt = 0;
1191 : }
1192 :
1193 : /* energy is too low -> we are going to unstable state */
1194 119896 : hSpMusClas->sp_mus_state = 0;
1195 : }
1196 515168 : else if ( hSpMusClas->sp_mus_state > -HANG_LEN )
1197 : {
1198 : /* energy is still too low -> we are still in unstable state */
1199 238348 : hSpMusClas->sp_mus_state--;
1200 : }
1201 : }
1202 10829222 : else if ( hSpMusClas->sp_mus_state <= 0 )
1203 : {
1204 271401 : if ( hSpMusClas->inact_cnt == 0 )
1205 : {
1206 : /* inactive counter has run down to 0 -> begin at sp_mus_state = 1 (ENTRY range per the table above) */
1207 159961 : hSpMusClas->sp_mus_state = 1;
1208 : }
1209 : else
1210 : {
1211 : /* inactive counter still non-zero -> go straight to sp_mus_state = HANG_LEN */
1212 111440 : hSpMusClas->sp_mus_state = HANG_LEN;
1213 : }
1214 : /* in both cases the inactive counter is re-armed */
1215 271401 : hSpMusClas->inact_cnt = 12;
1216 : }
1217 10557821 : else if ( hSpMusClas->sp_mus_state > 0 && hSpMusClas->sp_mus_state < HANG_LEN )
1218 : {
1219 : /* we are inside an entry period -> increment the counter of entry frames */
1220 651673 : hSpMusClas->sp_mus_state++;
1221 : }
1222 : /* active frame with negative state -> count down the inactive counter */
1223 11464286 : if ( hSpMusClas->sp_mus_state < 0 && hSpMusClas->inact_cnt > 0 )
1224 : {
1225 219233 : hSpMusClas->inact_cnt--;
1226 : }
1227 : }
1228 : else
1229 : {
     : /* VAD inactive: the counter is updated first, the state second - both tests below read the state from before this frame's update */
1230 2979535 : if ( hSpMusClas->sp_mus_state > 0 && hSpMusClas->sp_mus_state < HANG_LEN )
1231 : {
1232 15370 : hSpMusClas->inact_cnt = 0;
1233 : }
1234 2964165 : else if ( hSpMusClas->inact_cnt > 0 )
1235 : {
1236 357843 : hSpMusClas->inact_cnt--;
1237 : }
1238 :
1239 2979535 : if ( hSpMusClas->sp_mus_state > 0 && hSpMusClas->sp_mus_state < HANG_LEN )
1240 : {
     : /* VAD dropped during an entry period -> jump to the lowest state value */
1241 15370 : hSpMusClas->sp_mus_state = -HANG_LEN;
1242 : }
1243 2964165 : else if ( hSpMusClas->sp_mus_state > 0 )
1244 : {
     : /* VAD dropped with sp_mus_state >= HANG_LEN -> start counting down from -1 */
1245 48076 : hSpMusClas->sp_mus_state = -1;
1246 : }
1247 2916089 : else if ( hSpMusClas->sp_mus_state > -HANG_LEN )
1248 : {
1249 : /* we are in inactive state */
1250 245813 : hSpMusClas->sp_mus_state--;
1251 : }
1252 : }
1253 :
1254 : /* detect attacks based on relE: accumulate consecutive rises, reset on any non-rise */
1255 14443821 : if ( relE > hSpMusClas->prev_relE )
1256 : {
1257 5635157 : hSpMusClas->relE_attack_sum += relE - hSpMusClas->prev_relE;
1258 : }
1259 : else
1260 : {
1261 8808664 : hSpMusClas->relE_attack_sum = 0;
1262 : }
1263 14443821 : hSpMusClas->prev_relE = relE;
1264 :
1265 : /* update counter from last VAD 0->1 change (stops counting and returns to 0 once it reaches 50) */
1266 14443821 : if ( hSpMusClas->prev_vad == 0 && localVAD_HE_SAD == 1 )
1267 : {
1268 202011 : hSpMusClas->vad_0_1_cnt = 1;
1269 : }
1270 14241810 : else if ( localVAD_HE_SAD == 1 && hSpMusClas->vad_0_1_cnt > 0 && hSpMusClas->vad_0_1_cnt < 50 )
1271 : {
1272 2778195 : hSpMusClas->vad_0_1_cnt++;
1273 : }
1274 : else
1275 : {
1276 11463615 : hSpMusClas->vad_0_1_cnt = 0;
1277 : }
1278 14443821 : hSpMusClas->prev_vad = localVAD_HE_SAD;
1279 : /* relE_attack_flag: energy rise of more than 5.0 accumulated while in the entry period */
1280 14443821 : if ( hSpMusClas->sp_mus_state > 0 && hSpMusClas->sp_mus_state < HANG_LEN && hSpMusClas->relE_attack_sum > 5.0f )
1281 : {
1282 228439 : hSpMusClas->relE_attack_cnt++;
1283 :
1284 : /* set flag only in the first X frames in a series (counter values 1 and 2) */
1285 228439 : if ( hSpMusClas->relE_attack_cnt > 0 && hSpMusClas->relE_attack_cnt < 3 )
1286 : {
1287 177059 : relE_attack_flag = 1;
1288 : }
1289 : else
1290 : {
1291 51380 : relE_attack_flag = 0;
1292 : }
1293 : }
1294 : else
1295 : {
1296 14215382 : hSpMusClas->relE_attack_cnt = 0;
1297 14215382 : relE_attack_flag = 0;
1298 : }
1299 : /* store the total energy of this frame (not read again in this function) */
1300 14443821 : hSpMusClas->prev_Etot = Etot;
1301 :
1302 : /*------------------------------------------------------------------*
1303 : * Preparation of the feature vector
1304 : *------------------------------------------------------------------*/
1305 :
1306 14443821 : pFV = FV;
1307 :
1308 : /* [0] OL pitch: last value only during attacks/transitions, mean of the three values otherwise */
1309 14443821 : if ( relE_attack_flag || st->tc_cnt == 1 || st->tc_cnt == 2 )
1310 : {
1311 998785 : *pFV++ = (float) st->pitch[2];
1312 : }
1313 : else
1314 : {
1315 13445036 : *pFV++ = (float) ( st->pitch[0] + st->pitch[1] + st->pitch[2] ) / 3.0f;
1316 : }
1317 :
1318 : /* [1] voicing: same selection rule as for the pitch feature */
1319 14443821 : if ( relE_attack_flag || st->tc_cnt == 1 || st->tc_cnt == 2 )
1320 : {
1321 998785 : *pFV++ = st->voicing[2];
1322 : }
1323 : else
1324 : {
1325 13445036 : *pFV++ = ( st->voicing[0] + st->voicing[1] + st->voicing[2] ) / 3.0f;
1326 : }
1327 :
1328 : /* [2,3,4,5,6] LSFs (arc cosine of lsp_new[2..6]) */
1329 14443821 : *pFV++ = acosf( lsp_new[2] );
1330 14443821 : *pFV++ = acosf( lsp_new[3] );
1331 14443821 : *pFV++ = acosf( lsp_new[4] );
1332 14443821 : *pFV++ = acosf( lsp_new[5] );
1333 14443821 : *pFV++ = acosf( lsp_new[6] );
1334 :
1335 : /* [7] cor_map_sum */
1336 14443821 : *pFV++ = cor_map_sum;
1337 :
1338 : /* [8] non_sta */
1339 14443821 : *pFV++ = non_sta;
1340 :
1341 : /* [9] epsP (log ratio of epsP[14] to epsP[0], both offset by 1e-5) */
1342 14443821 : *pFV++ = logf( epsP[14] + 1e-5f ) - logf( epsP[0] + 1e-5f );
1343 :
1344 : /* [10,11,12] MFCCs */
1345 14443821 : set_zero( melS, NB_MEL_BANDS );
1346 14443821 : pt_mel_fb = mel_fb;
1347 592196661 : for ( i = 0; i < NB_MEL_BANDS; i++ )
1348 : {
     : /* log of the dot product of PS with band i's weights; the weights of all bands lie back-to-back in mel_fb */
1349 577752840 : j = mel_fb_start[i];
1350 577752840 : len = mel_fb_len[i];
1351 577752840 : melS[i] = logf( dotp( &PS[j], pt_mel_fb, len ) + 1e-5f );
1352 577752840 : pt_mel_fb += len;
1353 : }
1354 : /* mfcc = melS transformed through dct_mtx (presumably a DCT; v_mult_mat semantics not visible here) */
1355 14443821 : v_mult_mat( mfcc, melS, dct_mtx, NB_MEL_BANDS, NB_MEL_COEF );
1356 : /* only coefficients 2, 6 and 12 enter the feature vector */
1357 14443821 : *pFV++ = mfcc[2];
1358 14443821 : *pFV++ = mfcc[6];
1359 14443821 : *pFV++ = mfcc[12];
1360 :
1361 : /* calculation of differential normalized power spectrum */
1362 14443821 : sum_PS = 1e-5f;
1363 982179828 : for ( i = LOWEST_FBIN; i < HIGHEST_FBIN; i++ )
1364 : {
1365 967736007 : sum_PS += PS[i];
1366 : }
1367 : /* PS_norm[] and dPS[] are indexed by absolute bin, past_PS[] is indexed relative to LOWEST_FBIN */
1368 982179828 : for ( i = LOWEST_FBIN; i < HIGHEST_FBIN; i++ )
1369 : {
1370 967736007 : PS_norm[i] = PS[i] / sum_PS;
1371 967736007 : dPS[i] = fabsf( PS_norm[i] - hSpMusClas->past_PS[i - LOWEST_FBIN] );
1372 : }
1373 :
1374 : /* [13] ps_diff (spectral difference) */
1375 14443821 : ps_diff = 0;
1376 982179828 : for ( i = LOWEST_FBIN; i < HIGHEST_FBIN; i++ )
1377 : {
1378 967736007 : ps_diff += dPS[i];
1379 : }
1380 :
1381 14443821 : *pFV++ = ps_diff;
1382 :
1383 : /* [14] ps_sta (spectral stationarity): per bin, the larger of current/past value divided by their absolute difference */
1384 14443821 : ps_sta = 0;
1385 982179828 : for ( i = LOWEST_FBIN; i < HIGHEST_FBIN; i++ )
1386 : {
1387 967736007 : if ( PS_norm[i] > hSpMusClas->past_PS[i - LOWEST_FBIN] )
1388 : {
1389 452891683 : ps_sta += PS_norm[i] / ( dPS[i] + 1e-5f );
1390 : }
1391 : else
1392 : {
1393 514844324 : ps_sta += hSpMusClas->past_PS[i - LOWEST_FBIN] / ( dPS[i] + 1e-5f );
1394 : }
1395 : }
1396 : /* the feature is the logarithm of the sum; afterwards the current normalized spectrum replaces past_PS */
1397 14443821 : *pFV++ = logf( ps_sta + 1e-5f );
1398 14443821 : mvr2r( &PS_norm[LOWEST_FBIN], hSpMusClas->past_PS, HIGHEST_FBIN - LOWEST_FBIN );
1399 :
1400 : /* save ps_diff and ps_sta features for XTALK and UNCLR classifier (slot selected by st->idchan) */
1401 14443821 : if ( hStereoClassif != NULL )
1402 : {
1403 10785106 : if ( st->idchan == 0 )
1404 : {
1405 5756464 : hStereoClassif->ps_diff_ch1 = ps_diff;
1406 5756464 : hStereoClassif->ps_sta_ch1 = logf( ps_sta + 1e-5f );
1407 : }
1408 : else
1409 : {
1410 5028642 : hStereoClassif->ps_diff_ch2 = ps_diff;
1411 5028642 : hStereoClassif->ps_sta_ch2 = logf( ps_sta + 1e-5f );
1412 : }
1413 : }
1414 :
1415 : /*------------------------------------------------------------------*
1416 : * Outlier detection based on feature histograms
1417 : *------------------------------------------------------------------*/
1418 :
1419 14443821 : flag_odv = 0;
1420 14443821 : if ( localVAD_HE_SAD )
1421 : {
1422 11464286 : pFV = FV;
1423 11464286 : pODV = hout_intervals;
1424 11464286 : p_out = i_out;
1425 11464286 : odv_cnt = 0;
1426 183428576 : for ( i = 0; i < N_SMC_FEATURES; i++ )
1427 : {
     : /* hout_intervals holds one (lower, upper) pair per feature; the indices of out-of-range features are collected in i_out[] */
1428 171964290 : if ( *pFV < pODV[0] || *pFV > pODV[1] )
1429 : {
1430 157599 : *p_out++ = i;
1431 157599 : odv_cnt++;
1432 : }
1433 :
1434 171964290 : pFV++;
1435 171964290 : pODV += 2;
1436 : }
1437 :
1438 : /* set outlier flag (a single outlying feature is tolerated and left unchanged) */
1439 11464286 : if ( odv_cnt >= 2 )
1440 : {
1441 43278 : flag_odv = 1;
1442 :
1443 : /* replace outlying features with values from the previous frame */
1444 139824 : for ( i = 0; i < odv_cnt; i++ )
1445 : {
1446 96546 : FV[i_out[i]] = hSpMusClas->prev_FV[i_out[i]];
1447 : }
1448 : }
1449 : }
1450 :
1451 : /*------------------------------------------------------------------*
1452 : * Adaptive short-term mean filter on feature vector
1453 : *------------------------------------------------------------------*/
1454 :
1455 14443821 : pFV = FV;
1456 14443821 : pFV_st = hSpMusClas->FV_st;
1457 14443821 : smc_st_mean_fact = SMC_ST_MEAN_FACT;
1458 231101136 : for ( i = 0; i < N_SMC_FEATURES; i++ )
1459 : {
     : /* the short-term mean FV_st is always updated; FV itself is replaced by it only in the last branch below */
1460 216657315 : *pFV_st = smc_st_mean_fact * ( *pFV_st ) + ( 1 - smc_st_mean_fact ) * ( *pFV );
1461 :
1462 216657315 : if ( hSpMusClas->sp_mus_state > 0 && hSpMusClas->sp_mus_state < HANG_LEN && ( relE_attack_flag || flag_odv ) )
1463 : {
1464 : /* strong attack or outlier frame during entry state -> features cannot be trusted but there is also no useful past info -> */
1465 : /* -> do whatever you want because dlp will be reset to 0 anyway */
1466 2676075 : pFV++;
1467 2676075 : pFV_st++;
1468 : }
1469 213981240 : else if ( hSpMusClas->sp_mus_state == HANG_LEN && ( st->tc_cnt == 1 || st->tc_cnt == 2 ) )
1470 : {
1471 : /* energy attack in stable state -> use current features instead of the long-term average */
1472 11521245 : pFV++;
1473 11521245 : pFV_st++;
1474 : }
1475 : else
1476 : {
     : /* regular case -> classify on the smoothed feature */
1477 202459995 : *pFV++ = *pFV_st++;
1478 : }
1479 : }
1480 :
1481 : /* update: prev_FV receives the feature vector as it stands here, i.e. before the transformations below */
1482 14443821 : mvr2r( FV, hSpMusClas->prev_FV, N_SMC_FEATURES );
1483 :
1484 : /*------------------------------------------------------------------*
1485 : * Non-linear power transformation (boxcox) on certain features
1486 : *------------------------------------------------------------------*/
1487 :
1488 14443821 : pFV = FV;
1489 231101136 : for ( i = 0; i < N_SMC_FEATURES; i++ )
1490 : {
     : /* features with bcox_lmbd[i] == 0 are left untouched; otherwise x = max( x - bcox_add_cnst[i], 1 ) and x <- ( x^lambda - 1 ) / lambda */
1491 216657315 : if ( bcox_lmbd[i] != 0 )
1492 : {
1493 43331463 : *pFV -= bcox_add_cnst[i];
1494 43331463 : if ( *pFV < 1 )
1495 : {
1496 2713281 : *pFV = 1;
1497 : }
1498 43331463 : *pFV = ( powf( *pFV, bcox_lmbd[i] ) - 1 ) / bcox_lmbd[i];
1499 : }
1500 :
1501 216657315 : pFV++;
1502 : }
1503 :
1504 : /*------------------------------------------------------------------*
1505 : * Scaling of the feature vector
1506 : * PCA
1507 : *------------------------------------------------------------------*/
1508 :
1509 14443821 : pFV = FV;
1510 231101136 : for ( i = 0; i < N_SMC_FEATURES; i++ )
1511 : {
1512 : /* Standard scaler - mean and variance normalization */
1513 216657315 : *pFV = ( *pFV - sm_means[i] ) / sm_scale[i];
1514 216657315 : pFV++;
1515 :
1516 : /* MinMax scaler - mean and variance normalization (alternative kept for reference, not compiled) */
1517 : /**pFV = *pFV * sm_scale[i] + sm_min[i];*/
1518 : /*pFV++;*/
1519 : }
1520 :
1521 : /* PCA: remove pca_mean_, then project through pca_components_; both calls work in place on FV */
1522 14443821 : v_sub( FV, pca_mean_, FV, N_SMC_FEATURES );
1523 14443821 : v_mult_mat( FV, FV, pca_components_, N_SMC_FEATURES, N_PCA_COEF );
1524 :
1525 : /*------------------------------------------------------------------*
1526 : * Calculation of posterior probability
1527 : * Log-probability
1528 : *------------------------------------------------------------------*/
1529 :
1530 : /* run loop for all mixtures (for each mixture, calculate the probability of speech, music and noise) */
     : /* the zero initialization below is overwritten by the logsumexp() results after the loop */
1531 14443821 : lps = lpm = lpn = 0;
1532 101106747 : for ( m = 0; m < N_SMC_MIXTURES; m++ )
1533 : {
     : /* per mixture: log( weight ) + log_det_chol - 0.5 * N_PCA_COEF * log( PI2 ) - 0.5 * lprob, where lprob comes from dot_product_cholesky() on the mean-removed vector */
     : /* each mixture's prec_chol data takes ( N_PCA_COEF * N_PCA_COEF + N_PCA_COEF ) / 2 values - presumably a packed triangular factor, TODO confirm */
1534 86662926 : v_sub( FV, &means_speech[m * N_PCA_COEF], fvm, N_PCA_COEF );
1535 86662926 : lprob = dot_product_cholesky( fvm, &prec_chol_speech[m * ( N_PCA_COEF * N_PCA_COEF + N_PCA_COEF ) / 2], N_PCA_COEF );
1536 86662926 : ps[m] = logf( weights_speech[m] ) + log_det_chol_speech[m] - 0.5f * N_PCA_COEF * logf( PI2 ) - 0.5f * lprob;
1537 :
1538 86662926 : v_sub( FV, &means_music[m * N_PCA_COEF], fvm, N_PCA_COEF );
1539 86662926 : lprob = dot_product_cholesky( fvm, &prec_chol_music[m * ( N_PCA_COEF * N_PCA_COEF + N_PCA_COEF ) / 2], N_PCA_COEF );
1540 86662926 : pm[m] = logf( weights_music[m] ) + log_det_chol_music[m] - 0.5f * N_PCA_COEF * logf( PI2 ) - 0.5f * lprob;
1541 :
1542 86662926 : v_sub( FV, &means_noise[m * N_PCA_COEF], fvm, N_PCA_COEF );
1543 86662926 : lprob = dot_product_cholesky( fvm, &prec_chol_noise[m * ( N_PCA_COEF * N_PCA_COEF + N_PCA_COEF ) / 2], N_PCA_COEF );
1544 86662926 : pn[m] = logf( weights_noise[m] ) + log_det_chol_noise[m] - 0.5f * N_PCA_COEF * logf( PI2 ) - 0.5f * lprob;
1545 : }
1546 : /* combine the per-mixture values into one score per class */
1547 14443821 : lps = logsumexp( ps, N_SMC_MIXTURES );
1548 14443821 : lpm = logsumexp( pm, N_SMC_MIXTURES );
1549 14443821 : lpn = logsumexp( pn, N_SMC_MIXTURES );
1550 : /* flag frames where the noise score exceeds both the speech and the music score */
1551 14443821 : *high_lpn_flag = 0;
1552 14443821 : if ( lpn > lps && lpn > lpm )
1553 : {
1554 1731725 : *high_lpn_flag = 1;
1555 : }
1556 :
1557 14443821 : hSpMusClas->lpm = lpm;
1558 14443821 : hSpMusClas->lps = lps;
1559 14443821 : hSpMusClas->lpn = lpn;
1560 :
1561 : /* determine HQ Generic speech class (1 when the speech score exceeds the music score by more than 0.5) */
1562 14443821 : if ( st->hHQ_core != NULL )
1563 : {
1564 3964156 : if ( lps > lpm + 0.5f )
1565 : {
1566 1824000 : st->hHQ_core->hq_generic_speech_class = 1;
1567 : }
1568 : else
1569 : {
1570 2140156 : st->hHQ_core->hq_generic_speech_class = 0;
1571 : }
1572 : }
1573 :
1574 : /*------------------------------------------------------------------*
1575 : * Decision without hangover
1576 : * Weighted decision
1577 : *------------------------------------------------------------------*/
1578 :
1579 : /* decision without hangover (0 - speech/noise, 1 - music) */
     : /* dlp is forced to 0 for inactive frames, for Etot < 10 and for attack/outlier frames inside the entry period */
1580 14443821 : if ( !localVAD_HE_SAD || Etot < 10 || ( hSpMusClas->sp_mus_state > 0 && hSpMusClas->sp_mus_state < HANG_LEN && ( relE_attack_flag || flag_odv ) ) )
1581 : {
1582 3270081 : dlp = 0;
1583 : }
1584 : else
1585 : {
     : /* biased music-minus-speech score, limited to [-30, 30] */
1586 11173740 : dlp = lpm - lps + DLP_BIAS;
1587 :
1588 11173740 : if ( dlp > 30.0f )
1589 : {
1590 853567 : dlp = 30.0f;
1591 : }
1592 10320173 : else if ( dlp < -30.0f )
1593 : {
1594 0 : dlp = -30.0f;
1595 : }
1596 : }
1597 : /* preliminary binary decision; every branch of the final decision further below assigns dec again */
1598 14443821 : dec = dlp > 0;
1599 :
1600 : /* calculate weight based on relE (higher relE -> lower weight, lower relE -> higher weight) */
1601 14443821 : wrelE = lin_interp( relE, 15.0f, 0.9f, -15.0f, 0.99f, 1 );
1602 :
1603 : /* calculate weight based on drops of dlp (close to 1 during sudden drops of dlp, close to 0 otherwise) */
1604 14443821 : hSpMusClas->dlp_mean_ST = 0.8f * hSpMusClas->dlp_mean_ST + 0.2f * dlp;
1605 14443821 : hSpMusClas->lt_dec_thres = hSpMusClas->dlp_mean_ST;
1606 :
1607 14443821 : if ( dlp < 0 && dlp < hSpMusClas->dlp_mean_ST )
1608 : {
     : /* start the drop accumulator while the short-term mean is still positive; keep accumulating only if it was already started */
1609 3141745 : if ( hSpMusClas->dlp_mean_ST > 0 )
1610 : {
1611 1047194 : hSpMusClas->wdrop = -dlp;
1612 : }
1613 2094551 : else if ( hSpMusClas->wdrop > 0 )
1614 : {
1615 566249 : hSpMusClas->wdrop += hSpMusClas->dlp_mean_ST - dlp;
1616 : }
1617 : }
1618 : else
1619 : {
1620 11302076 : hSpMusClas->wdrop = 0;
1621 : }
1622 :
1623 14443821 : wdrop = lin_interp( hSpMusClas->wdrop, 15.0f, 0.7f, 0.0f, 1.0f, 1 );
1624 :
1625 : /* calculate weight based on rises of dlp (close to 1 during sudden rise of dlp, close to 0 otherwise) */
1626 14443821 : if ( hSpMusClas->sp_mus_state == HANG_LEN && hSpMusClas->dlp_mean_ST > 0 && hSpMusClas->dlp_mean_ST > hSpMusClas->past_dlp_mean_ST[0] )
1627 : {
     : /* start the rise accumulator when the previous short-term mean was negative; keep accumulating only if it was already started */
1628 3380092 : if ( hSpMusClas->past_dlp_mean_ST[0] < 0 )
1629 : {
1630 201216 : hSpMusClas->wrise = hSpMusClas->dlp_mean_ST;
1631 : }
1632 3178876 : else if ( hSpMusClas->wrise > 0 )
1633 : {
1634 472264 : hSpMusClas->wrise += hSpMusClas->dlp_mean_ST - hSpMusClas->past_dlp_mean_ST[0];
1635 : }
1636 : }
1637 : else
1638 : {
1639 11063729 : hSpMusClas->wrise = 0;
1640 : }
1641 :
1642 14443821 : wrise = lin_interp( hSpMusClas->wrise, 5.0f, 0.95f, 0.0f, 1.0f, 1 );
1643 :
1644 : /* combine weights into one */
1645 14443821 : wght = wrelE * wdrop * wrise;
1646 :
1647 : /* ratio of delta means vs. delta variances (long-term statistics are restarted while in the entry period) */
1648 14443821 : if ( hSpMusClas->sp_mus_state > 0 && hSpMusClas->sp_mus_state < HANG_LEN )
1649 : {
1650 743278 : hSpMusClas->dlp_mean_LT = dlp;
1651 743278 : hSpMusClas->dlp_var_LT = 0;
1652 : }
1653 :
1654 14443821 : hSpMusClas->dlp_mean_LT = 0.9f * hSpMusClas->dlp_mean_LT + 0.1f * dlp;
1655 14443821 : ftmp = dlp - hSpMusClas->dlp_mean_LT;
1656 14443821 : hSpMusClas->dlp_var_LT = 0.9f * hSpMusClas->dlp_var_LT + 0.1f * ( ftmp * ftmp );
1657 :
1658 14443821 : if ( hSpMusClas->sp_mus_state > 0 && hSpMusClas->sp_mus_state < HANG_LEN )
1659 : {
1660 743278 : dlp_mean2var = 0;
1661 : }
1662 : else
1663 : {
1664 13700543 : dlp_mean2var = fabsf( hSpMusClas->dlp_mean_LT ) / ( sqrtf( fabsf( hSpMusClas->dlp_var_LT ) ) + 1.0f );
1665 : }
1666 :
1667 14443821 : if ( dlp_mean2var > 15.0f )
1668 : {
1669 : /* decrease the weight little bit when the classifier indicates "strong speech" or "strong music" */
1670 205877 : wght *= 0.9f;
1671 : }
1672 : /* keep the weight within [0.01, 1.0] */
1673 14443821 : if ( wght > 1.0f )
1674 : {
1675 0 : wght = 1.0f;
1676 : }
1677 14443821 : else if ( wght < 0.01f )
1678 : {
1679 0 : wght = 0.01f;
1680 : }
1681 :
1682 14443821 : if ( Etot < 10 )
1683 : {
1684 : /* silence -> fixed weight, overriding the adaptive one */
1685 2475571 : wght = 0.92f;
1686 : }
1687 :
1688 : /* calculate weighted decision (wght is the share of the previous smoothed score that is kept) */
1689 14443821 : hSpMusClas->wdlp_0_95_sp = wght * hSpMusClas->wdlp_0_95_sp + ( 1 - wght ) * dlp;
1690 :
1691 : /* xtalk classifier: apply long hysteresis to prevent LRTD on music */
1692 14443821 : hSpMusClas->wdlp_xtalk = 0.995f * hSpMusClas->wdlp_xtalk + 0.005f * dlp;
1693 :
1694 : /*------------------------------------------------------------------*
1695 : * Final speech/music decision
1696 : *------------------------------------------------------------------*/
1697 : /* flag_spitch_cnt: set to 5 whenever flag_spitch is raised, counted down to 0 otherwise */
1698 14443821 : if ( flag_spitch )
1699 : {
1700 903172 : hSpMusClas->flag_spitch_cnt = 5;
1701 : }
1702 13540649 : else if ( hSpMusClas->flag_spitch_cnt > 0 )
1703 : {
1704 111767 : hSpMusClas->flag_spitch_cnt--;
1705 : }
1706 :
1707 14443821 : if ( Etot < 10 )
1708 : {
1709 : /* silence */
1710 2475571 : dec = 0;
1711 : }
1712 11968250 : else if ( hSpMusClas->sp_mus_state > 0 && hSpMusClas->sp_mus_state < HANG_LEN )
1713 : {
1714 : /* entry state -> final decision is calculated based on weighted average of past non-binary decisions */
     : /* row ( sp_mus_state - 1 ) of w_spmus: element 0 weights the current dlp, the following HANG_LEN - 1 elements weight past_dlp[] */
1715 742127 : ftmp = w_spmus[hSpMusClas->sp_mus_state - 1][0] * dlp;
1716 742127 : ftmp += dotp( &w_spmus[hSpMusClas->sp_mus_state - 1][1], hSpMusClas->past_dlp, HANG_LEN - 1 );
1717 742127 : if ( ftmp > 2.0f )
1718 : {
     : /* music (2) requires the current frame's dlp to exceed 2.0 as well, otherwise unclear (1) */
1719 352258 : if ( dlp > 2.0f )
1720 : {
1721 224130 : dec = 2;
1722 : }
1723 : else
1724 : {
1725 128128 : dec = 1;
1726 : }
1727 : }
1728 : else
1729 : {
1730 389869 : dec = 0;
1731 : }
1732 : }
1733 : else
1734 : {
1735 : /* stable active state (the threshold on wdlp_0_95_sp is 3.4 while flag_spitch_cnt is running and 2.1 otherwise) */
1736 11226123 : if ( hSpMusClas->past_dec[0] == 0 && hSpMusClas->past_dec[1] == 0 && hSpMusClas->past_dec[2] == 0 &&
1737 5023739 : ( ( hSpMusClas->flag_spitch_cnt > 0 && hSpMusClas->wdlp_0_95_sp > 3.4f ) || ( hSpMusClas->flag_spitch_cnt == 0 && hSpMusClas->wdlp_0_95_sp > 2.1f ) ) )
1738 : {
1739 : /* switching from speech to unclear */
1740 20499 : dec = 1;
1741 : }
1742 11205624 : else if ( hSpMusClas->past_dec[0] == 0 && hSpMusClas->vad_0_1_cnt < 50 && hSpMusClas->relE_attack_sum == 0.0f && hSpMusClas->wdlp_0_95_sp > 1.0f )
1743 : {
1744 : /* switch from speech to unclear also during slowly rising weak music onsets */
1745 31435 : dec = 1;
1746 : }
1747 11174189 : else if ( hSpMusClas->past_dec[0] == 1 && hSpMusClas->wdlp_0_95_sp > 2.5f )
1748 : {
1749 : /* switching from unclear to music */
1750 43642 : dec = 2;
1751 : }
1752 11130547 : else if ( hSpMusClas->past_dec[0] == 2 && hSpMusClas->past_dec[1] == 2 && hSpMusClas->past_dec[2] == 2 && hSpMusClas->wdlp_0_95_sp < -1.0f )
1753 : {
1754 : /* switching from music to unclear */
1755 31224 : dec = 1;
1756 : }
1757 11099323 : else if ( hSpMusClas->past_dec[0] == 1 && hSpMusClas->wdlp_0_95_sp < -2.5f )
1758 : {
1759 : /* switching from unclear to speech */
1760 31447 : dec = 0;
1761 : }
1762 : else
1763 : {
     : /* no transition rule fired -> keep the previous decision */
1764 11067876 : dec = hSpMusClas->past_dec[0];
1765 : }
1766 : }
1767 :
1768 : /*------------------------------------------------------------------*
1769 : * raw S/M decision based on smoothed GMM score
     : * (st->hSpMusClas is the same handle as the local hSpMusClas)
1770 : *------------------------------------------------------------------*/
1771 :
1772 14443821 : if ( dec == 0 || st->hSpMusClas->wdlp_0_95_sp <= 0 )
1773 : {
1774 8699603 : st->sp_aud_decision0 = 0;
1775 8699603 : st->sp_aud_decision1 = 0;
1776 : }
1777 : else
1778 : {
1779 5744218 : st->sp_aud_decision0 = 1;
1780 5744218 : st->sp_aud_decision1 = 1;
1781 : }
1782 :
1783 : /*------------------------------------------------------------------*
1784 : * Updates
1785 : *------------------------------------------------------------------*/
1786 :
1787 : /* update buffer of past non-binary decisions (index 0 holds the most recent value) */
1788 14443821 : mvr2r( &hSpMusClas->past_dlp[0], &hSpMusClas->past_dlp[1], HANG_LEN - 2 );
1789 14443821 : hSpMusClas->past_dlp[0] = dlp;
1790 : /* update buffer of past short-term dlp means */
1791 14443821 : mvr2r( &hSpMusClas->past_dlp_mean_ST[0], &hSpMusClas->past_dlp_mean_ST[1], HANG_LEN - 2 );
1792 14443821 : hSpMusClas->past_dlp_mean_ST[0] = hSpMusClas->dlp_mean_ST;
1793 :
1794 : /* update buffer of past decisions (values 0, 1 or 2) */
1795 14443821 : mvs2s( &hSpMusClas->past_dec[0], &hSpMusClas->past_dec[1], HANG_LEN - 2 );
1796 14443821 : hSpMusClas->past_dec[0] = dec;
1797 :
1798 : #ifdef DEBUG_MODE_INFO
1799 : dbgwrite( &st->hSpMusClas->wdlp_0_95_sp, sizeof( float ), 1, 1, "res/wdlp_0_95_sp.x" );
1800 : #endif
1801 :
1802 14443821 : return dec;
1803 : }
1804 :
1805 : /*---------------------------------------------------------------------*
1806 : * ivas_smc_mode_selection()
1807 : *
1808 : * 2nd stage speech/music classifier (select coding mode (ACELP, GSC and TCX) based on S/M classification)
1809 : * output (sp_aud_decision1 - sp_aud_decision2 -> coding mode):
1810 : * 0 - 0 -> ACELP
1811 : * 1 - 0 -> GSC
1812 : * 1 - 1 -> TCX
     : *
     : * Processing order, as implemented below:
     : * 1. reset *attack_flag and st->sp_aud_decision2 to 0, then call stab_est(), var_cor_calc(),
     : *    attack_det() and tonal_det(), handing them classifier state from st->hSpMusClas
     : * 2. recursively smooth st->Bin_E into tod_lt_Bin_E (weight P2A_FACT on the old value) and
     : *    compute S_p2a as the maximum minus the mean of the smoothed values
     : * 3. initial 3-way selection from relE, S_p2a, the tonal_det() result and smc_dec
     : * 4. corrections that apply only while the decision is GSC (1 - 0)
     : * 5. attack-driven override that forces 0 - 0 with coder_type = TRANSITION
     : * 6. GSC_noisy_speech / GSC_IVAS_mode derivation and coder_type = AUDIO when GSC remains selected
     : *
     : * Besides the two decision flags, the function may write st->coder_type, st->GSC_noisy_speech,
     : * st->GSC_IVAS_mode and st->hGSCEnc->noise_lev (only when hGSCEnc is non-NULL).
1813 : *---------------------------------------------------------------------*/
1814 :
1815 3856787 : void ivas_smc_mode_selection(
1816 : Encoder_State *st, /* i/o: encoder state structure */
1817 : const int32_t element_brate, /* i : element bitrate */
1818 : int16_t smc_dec, /* i : raw decision of the 1st stage classifier*/
1819 : const float relE, /* i : relative frame energy */
1820 : const float Etot, /* i : total frame energy */
1821 : int16_t *attack_flag, /* i/o: attack flag (GSC or TC) */
1822 : const float *inp, /* i : input signal */
1823 : const float S_map[], /* i : short-term correlation map */
1824 : const int16_t flag_spitch /* i : flag to indicate very short stable pitch*/
1825 : )
1826 : {
1827 : int16_t attack;
1828 : float ton;
1829 : int16_t i;
1830 : float S_p2a, S_max, S_ave;
1831 : float thr_sp2a;
1832 :
1833 3856787 : SP_MUS_CLAS_HANDLE hSpMusClas = st->hSpMusClas;
1834 :
1835 : /* initialization */
1836 3856787 : *attack_flag = 0;
1837 3856787 : st->sp_aud_decision2 = 0;
1838 :
1839 : /* signal stability estimation */
1840 3856787 : stab_est( Etot, hSpMusClas->gsc_lt_diff_etot, &hSpMusClas->gsc_mem_etot, &hSpMusClas->gsc_nb_thr_3, &hSpMusClas->gsc_nb_thr_1, hSpMusClas->gsc_thres, &hSpMusClas->gsc_last_music_flag, st->vad_flag );
1841 :
1842 : /* calculate variance of correlation */
1843 3856787 : var_cor_calc( st->old_corr, &hSpMusClas->mold_corr, hSpMusClas->var_cor_t, &hSpMusClas->high_stable_cor );
1844 :
1845 : /* attack detection */
1846 3856787 : attack = attack_det( inp, st->clas, st->localVAD, st->coder_type, 0, st->element_mode, st->clas, hSpMusClas->finc_prev, &hSpMusClas->lt_finc, &hSpMusClas->last_strong_attack ); /* NOTE(review): st->clas is passed as both the 2nd and the 7th argument - confirm against the attack_det() prototype */
1847 :
1848 : /* tonal detector */
1849 3856787 : ton = tonal_det( S_map, st->vad_flag, hSpMusClas->tod_S_map_lt, &hSpMusClas->tod_thr_lt, &hSpMusClas->tod_weight, &hSpMusClas->tod_S_mass_prev, &hSpMusClas->tod_S_mass_lt );
1850 :
1851 :
1852 : /* calculate spectral peak-to-average ratio */
     : /* (first-order recursive smoothing of Bin_E; S_p2a is formed as a difference, max - mean, of the smoothed values) */
1853 312399747 : for ( i = 0; i < TOD_NSPEC; i++ )
1854 : {
1855 308542960 : st->hSpMusClas->tod_lt_Bin_E[i] = P2A_FACT * st->hSpMusClas->tod_lt_Bin_E[i] + ( 1 - P2A_FACT ) * st->Bin_E[i];
1856 : }
1857 :
1858 3856787 : maximum( st->hSpMusClas->tod_lt_Bin_E, TOD_NSPEC, &S_max );
1859 3856787 : S_ave = sum_f( st->hSpMusClas->tod_lt_Bin_E, TOD_NSPEC ) / TOD_NSPEC;
1860 3856787 : S_p2a = S_max - S_ave;
1861 :
     : /* the peakiness threshold depends on the element bitrate: THR_P2A_HIGH up to IVAS_16k4, THR_P2A above */
1862 3856787 : if ( element_brate <= IVAS_16k4 )
1863 : {
1864 1149554 : thr_sp2a = THR_P2A_HIGH;
1865 : }
1866 : else
1867 : {
1868 2707233 : thr_sp2a = THR_P2A;
1869 : }
1870 :
1871 : /* initial 3-way selection of coding modes (ACELP/GSC/TCX) */
     : /* (the peaky/tonal test is evaluated first, so it takes precedence over smc_dec) */
1872 3856787 : if ( relE > -10.0f && ( S_p2a > thr_sp2a || ton > hSpMusClas->tod_thr_lt ) )
1873 : {
1874 : /* select TCX to encode extremely peaky signals or strongly tonal signals */
1875 206213 : st->sp_aud_decision1 = 1;
1876 206213 : st->sp_aud_decision2 = 1;
1877 : }
1878 3650574 : else if ( smc_dec == SPEECH )
1879 : {
1880 : /* select ACELP to encode speech */
1881 1173295 : st->sp_aud_decision1 = 0;
1882 1173295 : st->sp_aud_decision2 = 0;
1883 : }
1884 2477279 : else if ( smc_dec == SPEECH_OR_MUSIC )
1885 : {
1886 : /* select GSC to encode "unclear" segments (classifier's score on the borderline) */
1887 61195 : st->sp_aud_decision1 = 1;
1888 61195 : st->sp_aud_decision2 = 0;
1889 : }
1890 : else
1891 : {
1892 : /* select TCX to encode music */
1893 2416084 : st->sp_aud_decision1 = 1;
1894 2416084 : st->sp_aud_decision2 = 1;
1895 : }
1896 :
1897 : /* change decision from GSC to ACELP/TCX in some special cases */
1898 3856787 : if ( st->sp_aud_decision1 == 1 && st->sp_aud_decision2 == 0 )
1899 : {
1900 61195 : if ( hSpMusClas->ener_RAT < 0.18f && hSpMusClas->lt_dec_thres > 15.0f )
1901 : {
1902 : /* prevent GSC on strong music with almost no content below 1kHz */
1903 107 : st->sp_aud_decision2 = 1;
1904 : }
1905 61088 : else if ( flag_spitch )
1906 : {
1907 : /* prevent GSC on signals with very short and stable high pitch period */
1908 2458 : if ( hSpMusClas->wdlp_0_95_sp < 2.5f )
1909 : {
1910 : /* select ACELP instead */
1911 2333 : st->sp_aud_decision1 = 0;
1912 : }
1913 : else
1914 : {
1915 : /* select TCX instead */
1916 125 : st->sp_aud_decision2 = 1;
1917 : }
1918 : }
1919 58630 : else if ( hSpMusClas->high_stable_cor && st->pitch[0] >= 130 )
1920 : {
1921 : /* prevent GSC in highly correlated signal with low energy variation */
1922 : /* this is basically a patch against bassoon-type of music */
1923 2 : st->sp_aud_decision2 = 1;
1924 : }
1925 : }
1926 :
1927 : /* change decision from GSC to ACELP TC during attacks/onsets */
     : /* (re-tested because the corrections above may already have moved the decision away from GSC) */
1928 3856787 : if ( st->sp_aud_decision1 == 1 && st->sp_aud_decision2 == 0 )
1929 : {
1930 58628 : if ( ( hSpMusClas->gsc_lt_diff_etot[MAX_LT - 1] > 4.5f ) &&
1931 3859 : ( hSpMusClas->gsc_lt_diff_etot[MAX_LT - 1] - hSpMusClas->gsc_lt_diff_etot[MAX_LT - 2] > 10.0f ) )
1932 : {
1933 1051 : if ( st->tc_cnt == 1 )
1934 : {
1935 : /* do ACELP TC coding instead of GC/VC if onset has been already declared before */
1936 282 : st->sp_aud_decision1 = 0;
1937 282 : st->coder_type = TRANSITION;
1938 : }
1939 : else
1940 : {
1941 769 : if ( attack >= ATT_3LSUB_POS )
1942 : {
1943 : /* do ACELP TC coding also if attack is located in the last subframe */
1944 222 : st->sp_aud_decision1 = 0;
1945 222 : *attack_flag = attack + 1;
1946 222 : st->coder_type = TRANSITION;
1947 : }
1948 547 : else if ( attack >= ATT_SEG_LEN / 2 )
1949 : {
1950 : /* do GSC coding if attack is located after the first quarter of the first subframe */
1951 : /* (pre-echo will be treated at the decoder side) */
1952 67 : *attack_flag = 31; /* NOTE(review): this value is overwritten by the next statement before it is read, so the store of 31 has no effect - confirm which value is intended */
1953 67 : *attack_flag = attack + 1;
1954 : }
1955 : }
1956 : }
1957 : }
1958 :
     : /* this override is not restricted to the GSC case: it resets both decision flags to 0 unless the frame is TCX (sp_aud_decision2 == 1) with ton > 0.65 */
1959 3856787 : if ( st->localVAD == 1 && st->coder_type == GENERIC && attack > 0 /*&& *attack_flag < 32*/ /*&& st->tc_cnt != 2*/ && !( st->sp_aud_decision2 == 1 && ton > 0.65f ) )
1960 : {
1961 : /* change ACELP coder_type to TC if attack has been detected */
1962 66044 : st->sp_aud_decision1 = 0;
1963 66044 : st->sp_aud_decision2 = 0;
1964 :
1965 66044 : st->coder_type = TRANSITION;
1966 66044 : *attack_flag = attack + 1;
1967 : }
1968 :
1969 : #ifdef DEBUGGING
     : /* debugging-only overrides driven by st->force; compiled only when DEBUGGING is defined */
1970 : if ( st->idchan == 0 && st->coder_type != INACTIVE )
1971 : {
1972 : if ( st->force == FORCE_GSC && element_brate < IVAS_24k4 )
1973 : {
1974 : /* enforce GSC */
1975 : st->sp_aud_decision1 = 1;
1976 : st->sp_aud_decision2 = 0;
1977 : }
1978 : else if ( st->force == FORCE_SPEECH && ( st->sp_aud_decision1 == 1 || st->sp_aud_decision2 == 1 ) )
1979 : {
1980 : if ( element_brate < IVAS_24k4 )
1981 : {
1982 : /* convert TCX to GSC */
1983 : st->sp_aud_decision1 = 1;
1984 : st->sp_aud_decision2 = 0;
1985 : }
1986 : else
1987 : {
1988 : /* convert TCX to ACELP */
1989 : st->sp_aud_decision1 = 0;
1990 : st->sp_aud_decision2 = 0;
1991 : }
1992 : }
1993 : else if ( st->force == FORCE_MUSIC )
1994 : {
1995 : /* enforce TCX */
1996 : st->sp_aud_decision1 = 1;
1997 : st->sp_aud_decision2 = 1;
1998 : }
1999 : }
2000 : #endif
2001 :
2002 : /* set GSC noisy speech flag on unvoiced SWB segments */
2003 3856787 : st->GSC_noisy_speech = 0;
2004 3856787 : if ( st->vad_flag == 1 && element_brate <= IVAS_16k4 && st->lp_noise > 30.0f && st->sp_aud_decision1 == 0 && st->bwidth >= SWB && st->coder_type_raw == UNVOICED )
2005 : {
2006 1561 : st->GSC_noisy_speech = 1;
2007 : }
2008 :
2009 : /* set GSC submode */
     : /* (submode 1 is the default inside this branch; 3 and 2 replace it below; GSC_noisy_speech is re-decided here and replaces the value set above) */
2010 3856787 : if ( st->element_mode > EVS_MONO && ( st->sp_aud_decision1 == 1 && st->sp_aud_decision2 == 0 ) && st->total_brate > STEREO_GSC_BIT_RATE_ALLOC ) /* below STEREO_GSC_BIT_RATE_ALLOC, fall back on normal GSC */
2011 : {
2012 54531 : st->GSC_IVAS_mode = 1;
2013 54531 : if ( st->hSpMusClas->wdlp_0_95_sp > 0.0f )
2014 : {
2015 : /* music-like content */
2016 35857 : st->GSC_IVAS_mode = 3;
2017 : }
2018 18674 : else if ( st->tc_cnt > 0 )
2019 : {
2020 : /* likely presence of an onset, GSC bit allocation will be more focused on LF */
2021 1693 : st->GSC_IVAS_mode = 2;
2022 : }
2023 :
2024 54531 : if ( st->coder_type_raw == UNVOICED && st->sp_aud_decision0 == 0 /*&& st->GSC_IVAS_mode < 3*/ )
2025 : {
2026 4053 : st->GSC_noisy_speech = 1;
2027 : }
2028 : else
2029 : {
2030 50478 : st->GSC_noisy_speech = 0;
2031 : }
2032 : }
2033 :
2034 : /* set coder_type to AUDIO when GSC is selected (st->core will be set later in the decision matrix) */
2035 3856787 : if ( ( st->sp_aud_decision1 == 1 && st->sp_aud_decision2 == 0 ) || st->GSC_noisy_speech )
2036 : {
2037 59050 : st->coder_type = AUDIO;
2038 59050 : if ( st->hGSCEnc != NULL && st->GSC_noisy_speech == 0 ) /* In case of GSC_noisy_speech, NOISE_LEVEL should remain at NOISE_LEVEL_SP3 */
2039 : {
2040 53436 : st->hGSCEnc->noise_lev = NOISE_LEVEL_SP0;
2041 : }
2042 : }
2043 :
2044 3856787 : return;
2045 : }
2046 :
2047 :
2048 : /*------------------------------------------------------------------------*
2049 : * music_mixed_classif_improv()
2050 : *
2051 : * Improve 1st stage speech/music decision for mixed&music signals
     : *
     : * The function returns early, after resetting dec_mov and dec_mov1 to 0.5, when
     : * st->vad_flag is 0 or when st->lp_speech - st->lp_noise is below 25; onset_cnt is
     : * additionally cleared when vad_flag is 0.
     : * Otherwise it updates the feature buffers held in st->hSpMusClas (buf_etot, buf_flux,
     : * buf_epsP_tilt, buf_pkh, buf_Ntonal*, buf_cor_map_sum, buf_dlp), calls mode_decision()
     : * and, if the resulting decision is 1, sets st->sp_aud_decision1 to 1.
     : * The function never clears st->sp_aud_decision1.
2052 : *------------------------------------------------------------------------*/
2053 :
2054 52755 : static void music_mixed_classif_improv(
2055 : Encoder_State *st, /* i/o: Encoder state structure */
2056 : const float *new_inp, /* i : new input signal */
2057 : const float *epsP, /* i : LP prediction error */
2058 : const float etot, /* i : total frame energy */
2059 : const float old_cor, /* i : normalized correlation */
2060 : const float cor_map_sum /* i : correlation map sum */
2061 : )
2062 : {
2063 : int16_t i, dec, len, percus_flag;
2064 : float p2v_map[128], ftmp, ftmp1, lt_diff, log_max_spl, epsP_tilt, max_spl;
2065 :
2066 52755 : SP_MUS_CLAS_HANDLE hSpMusClas = st->hSpMusClas;
2067 :
2068 : /* find sample with maximum absolute amplitude */
2069 52755 : max_spl = 0;
2070 13558035 : for ( i = 0; i < L_FRAME; i++ )
2071 : {
2072 13505280 : if ( fabs( new_inp[i] ) > max_spl )
2073 : {
2074 525845 : max_spl = fabsf( new_inp[i] );
2075 : }
2076 : }
2077 :
2078 : /* music is considered only appearing in high SNR condition and active signal */
2079 52755 : if ( st->vad_flag == 0 || st->lp_speech - st->lp_noise < 25 )
2080 : {
2081 15316 : hSpMusClas->dec_mov = 0.5f;
2082 15316 : hSpMusClas->dec_mov1 = 0.5f;
2083 :
2084 15316 : if ( st->vad_flag == 0 )
2085 : {
2086 7353 : hSpMusClas->onset_cnt = 0;
2087 : }
2088 :
2089 15316 : return;
2090 : }
2091 :
     : /* onset_cnt counts consecutive frames that reach this point, saturating at 9 */
2092 37439 : hSpMusClas->onset_cnt++;
2093 :
2094 37439 : if ( hSpMusClas->onset_cnt > 9 )
2095 : {
2096 36499 : hSpMusClas->onset_cnt = 9;
2097 : }
2098 :
     : /* on the first such frame the flux buffer is filled with -100; negative entries are treated as "not yet filled" by the len computation further below */
2099 37439 : if ( hSpMusClas->onset_cnt == 1 )
2100 : {
2101 109 : set_f( hSpMusClas->buf_flux, -100, BUF_LEN );
2102 : }
2103 :
2104 : /* spectral analysis */
2105 37439 : spec_analysis( st->Bin_E, p2v_map );
2106 :
2107 : /* percussive music detection */
2108 37439 : log_max_spl = 20 * logf( max_spl + 0.0001f ); /* NOTE(review): logf() is the natural logarithm, so this is 20*ln(x), not 20*log10(x); the 0.0001f offset keeps the argument positive */
2109 37439 : lt_diff = log_max_spl - hSpMusClas->mov_log_max_spl;
2110 :
     : /* shift the 4-entry energy history; buf_etot[3] is the newest entry (i equals 3 after the loop) */
2111 149756 : for ( i = 0; i < 3; i++ )
2112 : {
2113 112317 : hSpMusClas->buf_etot[i] = hSpMusClas->buf_etot[i + 1];
2114 : }
2115 37439 : hSpMusClas->buf_etot[i] = etot;
2116 :
2117 37439 : percus_flag = 0;
2118 37439 : if ( hSpMusClas->buf_etot[1] - hSpMusClas->buf_etot[0] > 6 && hSpMusClas->buf_etot[2] < hSpMusClas->buf_etot[1] && hSpMusClas->buf_etot[1] - st->lp_speech > 3 )
2119 : {
2120 223 : if ( hSpMusClas->buf_etot[1] - hSpMusClas->buf_etot[3] > 3 && hSpMusClas->buf_etot[3] < hSpMusClas->buf_etot[2] && 0.5f * ( 0.5f * ( st->voicing[0] + st->voicing[1] ) + old_cor ) < 0.75f )
2121 : {
2122 33 : if ( hSpMusClas->dec_mov > 0.8f )
2123 : {
2124 2 : percus_flag = 1;
2125 : }
2126 31 : else if ( old_cor < 0.75f && st->voicing[0] < 0.75f && st->voicing[1] < 0.75f && hSpMusClas->old_lt_diff[0] > 10 )
2127 : {
2128 0 : percus_flag = 1;
2129 : }
2130 : }
2131 : }
2132 :
2133 : /* sound attack detection */
2134 37439 : if ( hSpMusClas->buf_etot[3] - hSpMusClas->buf_etot[2] > 6 && hSpMusClas->dec_mov > 0.9f && etot - st->lp_speech > 5 && hSpMusClas->old_lt_diff[0] > 5 )
2135 : {
2136 0 : hSpMusClas->attack_hangover = 3;
2137 : }
2138 :
     : /* mov_log_max_spl is updated only when both voicing values exceed 0.9: weight 0.25 on the new value when it is above the average, 0.005 otherwise */
2139 37439 : if ( st->voicing[0] > 0.9f && st->voicing[1] > 0.9f )
2140 : {
2141 9187 : if ( log_max_spl > hSpMusClas->mov_log_max_spl )
2142 : {
2143 411 : hSpMusClas->mov_log_max_spl = 0.75f * hSpMusClas->mov_log_max_spl + ( 1 - 0.75f ) * log_max_spl;
2144 : }
2145 : else
2146 : {
2147 8776 : hSpMusClas->mov_log_max_spl = 0.995f * hSpMusClas->mov_log_max_spl + ( 1 - 0.995f ) * log_max_spl;
2148 : }
2149 : }
2150 :
     : /* lt_diff was computed before the moving-average update above; old_lt_diff[1] is the newest entry */
2151 37439 : hSpMusClas->old_lt_diff[0] = hSpMusClas->old_lt_diff[1];
2152 37439 : hSpMusClas->old_lt_diff[1] = lt_diff;
2153 :
2154 : /* calculate and buffer spectral energy fluctuation */
2155 37439 : flux( st->Bin_E, p2v_map, hSpMusClas->old_Bin_E, hSpMusClas->buf_flux, hSpMusClas->attack_hangover, hSpMusClas->dec_mov );
2156 :
     : /* the hangover is decremented here, after flux() has read it, and is floored at 0 */
2157 37439 : hSpMusClas->attack_hangover--;
2158 37439 : if ( hSpMusClas->attack_hangover < 0 )
2159 : {
2160 37439 : hSpMusClas->attack_hangover = 0;
2161 : }
2162 :
2163 : /* identify flux buffer status */
     : /* (len = number of consecutive non-negative entries counted backwards from the end of buf_flux) */
2164 37439 : len = 0;
2165 2125319 : for ( i = BUF_LEN - 1; i >= 0 && hSpMusClas->buf_flux[i] >= 0; i-- )
2166 : {
2167 2087880 : len++;
2168 : }
2169 :
2170 : /* reset flux buffer if percussive music is detected */
     : /* (only the len valid entries at the end of the buffer are overwritten, each with the value 5) */
2171 37439 : if ( percus_flag == 1 )
2172 : {
2173 2 : set_f( &hSpMusClas->buf_flux[BUF_LEN - len], 5, len );
2174 : }
2175 :
2176 : /* calculate and buffer the tilt of residual LP analysis energies */
     : /* (ratio of the lag-1 product sum to the energy sum of epsP; reads epsP[1] .. epsP[16]; ftmp starts at 0.00001f so the division below is never by zero) */
2177 37439 : ftmp = 0.00001f;
2178 37439 : ftmp1 = 0;
2179 599024 : for ( i = 1; i < 16; i++ )
2180 : {
2181 561585 : ftmp += epsP[i] * epsP[i];
2182 561585 : ftmp1 += epsP[i] * epsP[i + 1];
2183 : }
2184 :
2185 37439 : epsP_tilt = ftmp1 / ftmp;
2186 :
2187 2246340 : for ( i = 0; i < BUF_LEN - 1; i++ )
2188 : {
2189 2208901 : hSpMusClas->buf_epsP_tilt[i] = hSpMusClas->buf_epsP_tilt[i + 1];
2190 : }
2191 37439 : hSpMusClas->buf_epsP_tilt[i] = epsP_tilt;
2192 :
2193 : /* calculate and buffer highband spectral peakness */
2194 37439 : tonal_dist( p2v_map, hSpMusClas->buf_pkh, hSpMusClas->buf_Ntonal, hSpMusClas->buf_Ntonal2, hSpMusClas->buf_Ntonal_lf );
2195 :
2196 : /* buffer sum of correlation map */
2197 2246340 : for ( i = 0; i < BUF_LEN - 1; i++ )
2198 : {
2199 2208901 : hSpMusClas->buf_cor_map_sum[i] = hSpMusClas->buf_cor_map_sum[i + 1];
2200 : }
2201 37439 : hSpMusClas->buf_cor_map_sum[i] = cor_map_sum;
2202 :
2203 : /* buffer voicing metric */
     : /* (10-entry history of lps - lpm; index 9 is the newest entry) */
2204 374390 : for ( i = 0; i < 9; i++ )
2205 : {
2206 336951 : hSpMusClas->buf_dlp[i] = hSpMusClas->buf_dlp[i + 1];
2207 : }
2208 37439 : hSpMusClas->buf_dlp[i] = hSpMusClas->lps - hSpMusClas->lpm;
2209 :
2210 : /* classification */
     : /* (mode_decision() receives a pointer to dec_mov and may overwrite it with 0 or 1) */
2211 37439 : dec = mode_decision( st, len, &hSpMusClas->dec_mov, hSpMusClas->buf_flux, hSpMusClas->buf_epsP_tilt, hSpMusClas->buf_pkh, hSpMusClas->buf_cor_map_sum, hSpMusClas->buf_Ntonal, hSpMusClas->buf_Ntonal2, hSpMusClas->buf_Ntonal_lf, hSpMusClas->buf_dlp );
2212 :
2213 : /* update long term moving average of the classification decisions */
2214 37439 : if ( len > 30 )
2215 : {
2216 34680 : hSpMusClas->dec_mov = 0.97f * hSpMusClas->dec_mov + ( 1 - 0.97f ) * dec;
2217 34680 : hSpMusClas->dec_mov1 = 0.97f * hSpMusClas->dec_mov1 + ( 1 - 0.97f ) * dec;
2218 : }
2219 :
2220 : /* update long-term unvoiced counter */
     : /* (decrement by 8 on qualifying unvoiced/inactive frames, increment by 1 otherwise, then clamp to [0, 300]) */
     : /* NOTE(review): index 59 is hard-coded; presumably it is meant to be BUF_LEN - 1, the newest entry - confirm */
2221 37439 : if ( ( st->coder_type_raw == UNVOICED || st->coder_type_raw == INACTIVE ) && etot > 1.5f && hSpMusClas->buf_Ntonal2[59] < 2 )
2222 : {
2223 1607 : hSpMusClas->UV_cnt1 -= 8;
2224 : }
2225 : else
2226 : {
2227 35832 : hSpMusClas->UV_cnt1++;
2228 : }
2229 :
2230 37439 : if ( hSpMusClas->UV_cnt1 > 300 )
2231 : {
2232 23019 : hSpMusClas->UV_cnt1 = 300;
2233 : }
2234 14420 : else if ( hSpMusClas->UV_cnt1 < 0 )
2235 : {
2236 4 : hSpMusClas->UV_cnt1 = 0;
2237 : }
2238 :
2239 37439 : hSpMusClas->LT_UV_cnt1 = 0.9f * hSpMusClas->LT_UV_cnt1 + 0.1f * hSpMusClas->UV_cnt1;
2240 :
2241 : /* revert classification decision due to long-term unvoiced counter */
2242 37439 : if ( dec == 1 && hSpMusClas->dec_mov1 < 0.2f && hSpMusClas->LT_UV_cnt1 < 200 )
2243 : {
2244 40 : dec = 0;
2245 : }
2246 :
2247 : /* overwrite 1st stage speech/music decision to music */
2248 37439 : if ( dec == 1 )
2249 : {
2250 10852 : st->sp_aud_decision1 = 1;
2251 : }
2252 :
2253 37439 : return;
2254 : }
2255 :
2256 :
2257 : /*---------------------------------------------------------------------*
2258 : * spec_analysis()
2259 : *
2260 : * Spectral analysis for mixed/music classification improvement
     : *
     : * Locates strict local maxima (peaks) of Bin_E in bins 1 .. L_FFT/2 - 3 and the local
     : * minima (valleys) around them. For every peak that lies strictly between two consecutive
     : * valleys the value 2 * peak - left_valley - right_valley is stored in p2v_map at the
     : * bin index of the peak. All other entries of p2v_map[0 .. L_FFT/2 - 2] are set to 0.
     : * If no peak is found, p2v_map[0 .. L_FFT/2 - 2] is zeroed and the function returns.
2261 : *---------------------------------------------------------------------*/
2262 :
2263 37439 : static void spec_analysis(
2264 : float *Bin_E, /* i : log energy spectrum of the current frame */
2265 : float *p2v_map /* o : spectral peakiness map */
2266 : )
2267 : {
2268 : int16_t i, k, m;
2269 : float peak[L_FFT / 4 + 1];
2270 : float valley[L_FFT / 4 + 1];
2271 : int16_t peak_idx[L_FFT / 4 + 1];
2272 : int16_t valey_idx[L_FFT / 4 + 1];
2273 : float p2v[L_FFT / 4 + 1];
2274 :
2275 : /* find spectral peaks */
2276 37439 : k = 0;
2277 4717314 : for ( i = 1; i < L_FFT / 2 - 2; i++ )
2278 : {
2279 4679875 : if ( Bin_E[i] > Bin_E[i - 1] && Bin_E[i] > Bin_E[i + 1] )
2280 : {
2281 1256698 : peak[k] = Bin_E[i];
2282 1256698 : peak_idx[k] = i;
2283 1256698 : k++;
2284 : }
2285 : }
     : /* the two slots after the last peak are filled with -1; the assert checks that both lie inside peak_idx[] */
2286 37439 : assert( k + 1 < L_FFT / 4 + 1 );
2287 37439 : peak_idx[k] = -1;
2288 37439 : peak_idx[k + 1] = -1;
2289 :
2290 37439 : if ( k == 0 )
2291 : {
2292 768 : for ( i = 0; i < L_FFT / 2 - 1; i++ )
2293 : {
2294 762 : p2v_map[i] = 0;
2295 : }
2296 :
2297 6 : return;
2298 : }
2299 :
2300 : /* find spectral valleys */
     : /* (bin 0 counts as a valley when the spectrum rises from it) */
2301 37433 : m = 0;
2302 37433 : if ( Bin_E[0] < Bin_E[1] )
2303 : {
2304 19380 : valley[0] = Bin_E[0];
2305 19380 : valey_idx[0] = 0;
2306 19380 : m++;
2307 : }
2308 :
     : /* walk down from the top of the analysed range while the spectrum is strictly rising towards it; k ends at the lowest bin of that trailing rising run (or stays at L_FFT/2 - 2) and is used as the position of the last valley */
2309 37433 : k = L_FFT / 2 - 2;
2310 64363 : for ( i = L_FFT / 2 - 3; i >= 0 && Bin_E[i + 1] > Bin_E[i]; i-- )
2311 : {
2312 26930 : k = i;
2313 : }
2314 :
2315 4689628 : for ( i = 1; i < k; i++ )
2316 : {
2317 4652195 : if ( Bin_E[i] < Bin_E[i - 1] && Bin_E[i] < Bin_E[i + 1] )
2318 : {
2319 1237318 : valley[m] = Bin_E[i];
2320 1237318 : valey_idx[m] = i;
2321 1237318 : m++;
2322 : }
2323 : }
2324 :
     : /* closing valley at bin k; m is not incremented, so valley[m] is the last valid entry */
2325 37433 : valley[m] = Bin_E[k];
2326 37433 : valey_idx[m] = k;
2327 :
2328 : /* find spectral peak to valley distances */
     : /* (k is reused as the peak counter; once the peaks are exhausted peak_idx[k] is -1 and the test below fails) */
2329 37433 : k = 0;
2330 1294131 : for ( i = 0; i < m; i++ )
2331 : {
2332 1256698 : if ( peak_idx[k] > valey_idx[i] && peak_idx[k] < valey_idx[i + 1] )
2333 : {
2334 1256698 : p2v[k] = 2 * peak[k] - valley[i] - valley[i + 1];
2335 1256698 : k++;
2336 : }
2337 : }
2338 :
2339 4791424 : for ( i = 0; i < L_FFT / 2 - 1; i++ )
2340 : {
2341 4753991 : p2v_map[i] = 0;
2342 : }
2343 :
     : /* scatter the peak-to-valley values to the bins of their peaks */
2344 1294131 : for ( i = 0; i < k; i++ )
2345 : {
2346 1256698 : p2v_map[peak_idx[i]] = p2v[i];
2347 : }
2348 :
2349 37433 : return;
2350 : }
2351 :
2352 : /*---------------------------------------------------------------------*
2353 : * flux()
2354 : *
2355 : * Calculation of spectral flux
     : *
     : * The flux is the mean of |Bin_E[i] - old_Bin_E[i]| over those of the first N_OLD_BIN_E
     : * bins whose p2v_map entry is non-zero. It is set to 5 when no such bin exists and is
     : * limited to 20 when dec_mov exceeds 0.8.
     : * old_Bin_E is accessed as three consecutive segments of N_OLD_BIN_E values: each call
     : * moves segment 1 to segment 0 and segment 2 to segment 1, then copies the current Bin_E
     : * into segment 2.
     : * buf_flux is shifted by one and the new flux appended only when attack_hangover <= 0.
     : * Note: attack_hangover and dec_mov are tagged "i/o" below but are passed by value, so
     : * this function cannot change the caller's copies.
2356 : *---------------------------------------------------------------------*/
2357 :
2358 37439 : static void flux(
2359 : float *Bin_E, /* i : log energy spectrum of the current frame */
2360 : float *p2v_map, /* i : spectral peakiness map */
2361 : float *old_Bin_E, /* i/o: log energy spectrum of the frame 60ms ago */
2362 : float *buf_flux, /* i/o: buffer storing spectral energy fluctuation */
2363 : int16_t attack_hangover, /* i/o: hangover preventing flux buffering */
2364 : float dec_mov /* i/o: moving average of classifier decision */
2365 : )
2366 : {
2367 : int16_t i;
2368 : float *pt1, *pt2, *pt3, *pt4, *pt5, *pt6;
2369 : float flux;
2370 : int16_t cnt;
2371 :
2372 : /* calculate flux */
2373 37439 : flux = 0;
2374 37439 : cnt = 0;
2375 1609877 : for ( i = 0; i < N_OLD_BIN_E; i++ )
2376 : {
2377 1572438 : if ( p2v_map[i] != 0 )
2378 : {
2379 389399 : flux += fabsf( Bin_E[i] - old_Bin_E[i] );
2380 389399 : cnt++;
2381 : }
2382 : }
2383 :
     : /* default value when no bin qualified; this also avoids a division by zero */
2384 37439 : if ( cnt == 0 )
2385 : {
2386 6 : flux = 5;
2387 : }
2388 : else
2389 : {
2390 37433 : flux = flux / (float) cnt;
2391 : }
2392 :
2393 37439 : if ( flux > 20 && dec_mov > 0.8f )
2394 : {
2395 940 : flux = 20;
2396 : }
2397 :
2398 : /* update old Bin_E buffer */
     : /* pt1 <- pt2 : segment 0 receives segment 1 */
     : /* pt4 <- pt5 : segment 1 receives segment 2 */
     : /* pt6 <- pt3 : segment 2 receives the current Bin_E */
2399 37439 : pt1 = old_Bin_E;
2400 37439 : pt2 = old_Bin_E + N_OLD_BIN_E;
2401 37439 : pt3 = Bin_E;
2402 37439 : pt4 = old_Bin_E + N_OLD_BIN_E;
2403 37439 : pt5 = old_Bin_E + 2 * N_OLD_BIN_E;
2404 37439 : pt6 = old_Bin_E + 2 * N_OLD_BIN_E;
2405 :
2406 1609877 : for ( i = 0; i < N_OLD_BIN_E; i++ )
2407 : {
2408 1572438 : *pt1++ = *pt2++;
2409 1572438 : *pt4++ = *pt5++;
2410 1572438 : *pt6++ = *pt3++;
2411 : }
2412 :
2413 : /* update flux buffer */
     : /* (skipped while attack_hangover is positive, so those frames add nothing to buf_flux) */
2414 37439 : if ( attack_hangover <= 0 )
2415 : {
2416 2246340 : for ( i = 0; i < BUF_LEN - 1; i++ )
2417 : {
2418 2208901 : buf_flux[i] = buf_flux[i + 1];
2419 : }
2420 :
2421 37439 : buf_flux[i] = flux;
2422 : }
2423 :
2424 37439 : return;
2425 : }
2426 :
2427 :
2428 : /*---------------------------------------------------------------------*
2429 : * tonal_dist()
2430 : *
2431 : * Calculation of spectral distance
     : *
     : * Derives four per-frame values from p2v_map[0 .. 126]:
     : *   Ntonal    - number of bins with p2v_map > 55 (bins 0 .. 126)
     : *   Ntonal2   - number of bins with p2v_map > 80 (bins 0 .. 126)
     : *   Ntonal_lf - number of bins with p2v_map > 80 (bins 0 .. 63 only)
     : *   pk        - sum of p2v_map over bins 64 .. 126
     : * Each of the four buffers is shifted by one entry and the new value is stored in the
     : * last position (index BUF_LEN - 1).
     : * Note: buf_Ntonal_lf receives the plain count Ntonal_lf; no ratio is computed here.
2432 : *---------------------------------------------------------------------*/
2433 :
2434 37439 : static void tonal_dist(
2435 : float *p2v_map, /* i : spectral peakiness map */
2436 : float *buf_pkh, /* i/o: buffer storing highband spectral peakiness */
2437 : float *buf_Ntonal, /* i/o: buffer storing No.of 1st spectral tone */
2438 : float *buf_Ntonal2, /* i/o: buffer storing No.of 2nd spectral tone */
2439 : float *buf_Ntonal_lf /* i/o: buffer storing low band spectral tone ratio */
2440 : )
2441 : {
2442 : int16_t i;
2443 : float pk;
2444 : int16_t Ntonal;
2445 : int16_t Ntonal2;
2446 : int16_t Ntonal_lf;
2447 :
2448 : /* find number of tonals, number of tonals at low-band,
2449 : spectral peakiness at high-band */
2450 37439 : pk = 0;
2451 37439 : Ntonal = 0;
2452 37439 : Ntonal2 = 0;
2453 37439 : Ntonal_lf = 0;
     : /* bins 0 .. 63: these also feed the low-band counter Ntonal_lf */
2454 2433535 : for ( i = 0; i < 64; i++ )
2455 : {
2456 2396096 : if ( p2v_map[i] > 55 )
2457 : {
2458 175285 : Ntonal++;
2459 : }
2460 :
2461 2396096 : if ( p2v_map[i] > 80 )
2462 : {
2463 96794 : Ntonal2++;
2464 96794 : Ntonal_lf++;
2465 : }
2466 : }
2467 :
     : /* bins 64 .. 126: these also accumulate the peakiness sum pk */
2468 2396096 : for ( i = 64; i < 127; i++ )
2469 : {
2470 2358657 : if ( p2v_map[i] != 0 )
2471 : {
2472 651045 : pk += p2v_map[i];
2473 : }
2474 :
2475 2358657 : if ( p2v_map[i] > 55 )
2476 : {
2477 83834 : Ntonal++;
2478 : }
2479 :
2480 2358657 : if ( p2v_map[i] > 80 )
2481 : {
2482 33248 : Ntonal2++;
2483 : }
2484 : }
2485 :
2486 : /* update buffers */
     : /* (i equals BUF_LEN - 1 after the shift loop, so the stores below write the newest entry) */
2487 2246340 : for ( i = 0; i < BUF_LEN - 1; i++ )
2488 : {
2489 2208901 : buf_pkh[i] = buf_pkh[i + 1];
2490 2208901 : buf_Ntonal[i] = buf_Ntonal[i + 1];
2491 2208901 : buf_Ntonal2[i] = buf_Ntonal2[i + 1];
2492 2208901 : buf_Ntonal_lf[i] = buf_Ntonal_lf[i + 1];
2493 : }
2494 :
2495 37439 : buf_pkh[i] = pk;
2496 37439 : buf_Ntonal[i] = (float) Ntonal;
2497 37439 : buf_Ntonal2[i] = (float) Ntonal2;
2498 37439 : buf_Ntonal_lf[i] = (float) Ntonal_lf;
2499 :
2500 37439 : return;
2501 : }
2502 :
2503 :
2504 : /*---------------------------------------------------------------------*
2505 : * mode_decision()
2506 : *
2507 : * Decision about internal mode of the mixed/music classifier improvement
     : *
     : * Returns 0 or 1. The starting value is ( *dec_mov > 0.5 ); the rest depends on len,
     : * the number of valid entries at the end of the feature buffers:
     : *   len <= 5 : the starting value is returned unchanged
     : *   len <  10: statistics over the last len entries can only raise the result to 1
     : *   len >= 10: statistics over the last 10 entries can decide immediately, in which case
     : *              *dec_mov is also overwritten (1 or 0); otherwise windows of growing length
     : *              i = 10 .. len - 1 are tested, and finally, if len == BUF_LEN, the tonal
     : *              counters over the whole buffer are used
2508 : *---------------------------------------------------------------------*/
2509 :
2510 37439 : static int16_t mode_decision(
2511 : Encoder_State *st,
2512 : int16_t len, /* i : buffering status */
2513 : float *dec_mov, /* i/o: moving average of classifier decision */
2514 : float *buf_flux, /* i : buffer storing spectral energy fluctuation */
2515 : float *buf_epsP_tilt, /* i : buffer storing LP prediction error tilt */
2516 : float *buf_pkh, /* i : buffer storing highband spectral peakiness */
2517 : float *buf_cor_map_sum, /* i : buffer storing correlation map sum */
2518 : float *buf_Ntonal, /* i : buffer storing No.of 1st spectral tone */
2519 : float *buf_Ntonal2, /* i : buffer storing No.of 2nd spectral tone */
2520 : float *buf_Ntonal_lf, /* i : buffer storing low band spectral tone ratio */
2521 : float *buf_dlp /* i : buffer storing voicing estimate */
2522 : )
2523 : {
2524 : int16_t mode;
2525 : int16_t i;
2526 : int16_t voiced_cnt;
2527 : float M_pkh;
2528 : float M_cor_map_sum;
2529 : float M_Ntonal;
2530 : float M_flux;
2531 : float V_epsP_tilt;
2532 : float lf_Ntonal_ratio;
2533 :
     : /* default: follow the long-term moving average of past decisions */
2534 37439 : mode = *dec_mov > 0.5f;
2535 :
2536 37439 : if ( len <= 5 )
2537 : {
2538 529 : return ( mode );
2539 : }
2540 36910 : else if ( len < 10 )
2541 : {
2542 411 : M_pkh = mean( buf_pkh + BUF_LEN - len, len );
2543 411 : M_cor_map_sum = mean( buf_cor_map_sum + BUF_LEN - len, len );
2544 411 : M_Ntonal = mean( buf_Ntonal + BUF_LEN - len, len );
2545 411 : V_epsP_tilt = var( buf_epsP_tilt + BUF_LEN - len, len );
2546 :
     : /* count positive entries among buf_dlp[4] .. buf_dlp[9] */
2547 411 : voiced_cnt = 0;
2548 2877 : for ( i = 9; i > 3; i-- )
2549 : {
2550 2466 : if ( buf_dlp[i] > 0.0f )
2551 : {
2552 866 : voiced_cnt++;
2553 : }
2554 : }
2555 :
2556 411 : if ( ( M_pkh > 1100 || V_epsP_tilt < 0.00008f || M_cor_map_sum > 100 ) && voiced_cnt < 4 )
2557 : {
2558 68 : mode = 1;
2559 : }
2560 343 : else if ( M_Ntonal > 27 && voiced_cnt < 4 )
2561 : {
2562 0 : mode = 1;
2563 : }
2564 : }
2565 : else
2566 : {
     : /* count positive entries among all ten values buf_dlp[0] .. buf_dlp[9] */
2567 36499 : voiced_cnt = 0;
2568 401489 : for ( i = 0; i < 10; i++ )
2569 : {
2570 364990 : if ( buf_dlp[i] > 0.0f )
2571 : {
2572 163863 : voiced_cnt++;
2573 : }
2574 : }
2575 :
2576 36499 : M_flux = mean( &buf_flux[BUF_LEN - 10], 10 );
2577 36499 : M_pkh = mean( buf_pkh + BUF_LEN - 10, 10 );
2578 36499 : M_cor_map_sum = mean( buf_cor_map_sum + BUF_LEN - 10, 10 );
2579 36499 : V_epsP_tilt = var( buf_epsP_tilt + BUF_LEN - 10, 10 );
2580 :
     : /* immediate decision 1; also forces *dec_mov to 1 */
     : /* NOTE(review): &buf_flux[55] with length 5 is hard-coded; presumably it addresses the last 5 entries (BUF_LEN - 5) - confirm */
2581 36499 : if ( ( M_flux < 8.5f || ( V_epsP_tilt < 0.001f && M_flux < 12.0f ) || M_pkh > 1050 || M_cor_map_sum > 100 ) && voiced_cnt < 3 && mean( &buf_flux[55], 5 ) < 15 )
2582 : {
2583 6923 : mode = 1;
2584 6923 : *dec_mov = 1;
2585 6923 : return ( mode );
2586 : }
2587 :
     : /* immediate decision 0; also forces *dec_mov to 0 */
     : /* NOTE(review): buf_flux[59] is hard-coded; presumably the newest entry (BUF_LEN - 1) - confirm */
2588 29576 : if ( M_flux > 16.0f || ( M_flux > 15 && voiced_cnt > 2 ) || mean( &buf_flux[55], 5 ) > 19.0f || ( buf_flux[59] >= 20 && st->hSpMusClas->lps - st->hSpMusClas->lpm > 0 ) )
2589 : {
2590 23779 : *dec_mov = 0;
2591 23779 : mode = 0;
2592 23779 : return ( mode );
2593 : }
2594 :
     : /* growing analysis window: the statistics cover the last i entries, i = 10 .. len - 1 */
     : /* NOTE(review): the thresholds below are functions of len, not of the window length i, so they are the same for every iteration - confirm this is intended */
2595 129567 : for ( i = 10; i < len; i++ )
2596 : {
2597 126891 : M_flux = mean( &buf_flux[BUF_LEN - i], i );
2598 126891 : M_pkh = mean( buf_pkh + BUF_LEN - i, i );
2599 126891 : M_cor_map_sum = mean( buf_cor_map_sum + BUF_LEN - i, i );
2600 126891 : V_epsP_tilt = var( buf_epsP_tilt + BUF_LEN - i, i );
2601 :
2602 126891 : if ( ( ( M_flux < 12 + 0.05f * ( len - 10 ) && mean( &buf_flux[BUF_LEN - 10], 10 ) < 15 ) || V_epsP_tilt < 0.0001f + 0.000018f * ( len - 10 ) || M_pkh > 1050 - 5.0f * ( len - 10 ) || M_cor_map_sum > 95 - 0.3f * ( len - 10 ) ) && voiced_cnt < 3 )
2603 : {
2604 3121 : mode = 1;
2605 3121 : return ( mode );
2606 : }
2607 : }
2608 :
     : /* buffers completely filled: use the tonal counters over the whole history; the 0.0001f term keeps the divisor non-zero */
2609 2676 : if ( len == BUF_LEN )
2610 : {
2611 2331 : M_Ntonal = mean( buf_Ntonal, BUF_LEN );
2612 2331 : lf_Ntonal_ratio = sum_f( buf_Ntonal_lf, BUF_LEN ) / ( sum_f( buf_Ntonal2, BUF_LEN ) + 0.0001f );
2613 :
2614 2331 : if ( M_Ntonal > 18 || lf_Ntonal_ratio < 0.2f )
2615 : {
2616 20 : mode = 1;
2617 : }
2618 2311 : else if ( M_Ntonal < 1 )
2619 : {
2620 0 : mode = 0;
2621 : }
2622 : }
2623 : }
2624 :
2625 3087 : return ( mode );
2626 : }
2627 :
2628 :
2629 : /*----------------------------------------------------------------------------------*
2630 : * tonal_context_improv()
2631 : *
2632 : * Context-based improvement of 1st/2nd stage speech/music decision on stable tonal signals
2633 : *----------------------------------------------------------------------------------*/
2634 :
2635 52755 : static void tonal_context_improv(
2636 : Encoder_State *st, /* i/o: encoder state structure */
2637 : const float PS[], /* i : energy spectrum */
2638 : const float voi_fv, /* i : scaled voicing feature */
2639 : const float cor_map_sum_fv, /* i : scaled correlation map feature */
2640 : const float LPCErr /* i : scaled LP prediction error feature */
2641 : )
2642 : {
2643 : int16_t lt_pitch_diff;
2644 : float sort_max, sort_avg, sort_val[80];
2645 : float tonality, tonality1, tonality2, tonality3, t2, t3, tL, err, cor, dft;
2646 :
2647 52755 : SP_MUS_CLAS_HANDLE hSpMusClas = st->hSpMusClas;
2648 :
2649 : /* reset in case of codec mode switching */
2650 52755 : if ( st->last_codec_mode == MODE2 )
2651 : {
2652 10114 : set_f( hSpMusClas->tonality2_buf, 0, HANG_LEN_INIT );
2653 10114 : set_f( hSpMusClas->tonality3_buf, 0, HANG_LEN_INIT );
2654 10114 : set_f( hSpMusClas->LPCErr_buf, 0, HANG_LEN_INIT );
2655 10114 : hSpMusClas->lt_music_hangover = 0;
2656 10114 : hSpMusClas->lt_music_state = 0;
2657 10114 : hSpMusClas->lt_speech_state = 0;
2658 10114 : hSpMusClas->lt_speech_hangover = 0;
2659 : }
2660 :
2661 : /* estimate maximum tonality in bands [0-1 kHz], [1-2kHz] and [2-4kHz] */
2662 52755 : mvr2r( PS, sort_val, 80 );
2663 :
2664 : /* tonality in band 0-1 kHz */
2665 52755 : v_sort( sort_val, 0, 19 );
2666 52755 : sort_max = sort_val[19];
2667 52755 : sort_avg = sum_f( &sort_val[0], 10 );
2668 52755 : tonality1 = sort_max / sort_avg;
2669 :
2670 : /* tonality in band 1-2 kHz */
2671 52755 : v_sort( sort_val, 20, 39 );
2672 52755 : sort_max = sort_val[39];
2673 52755 : sort_avg = sum_f( &sort_val[20], 10 );
2674 52755 : tonality2 = sort_max / sort_avg;
2675 :
2676 : /* tonality in band 2-4 kHz */
2677 52755 : v_sort( sort_val, 40, 79 );
2678 52755 : sort_max = sort_val[79];
2679 52755 : sort_avg = sum_f( &sort_val[40], 20 );
2680 52755 : tonality3 = sort_max / sort_avg;
2681 :
2682 52755 : tonality = max( max( tonality1, tonality2 ), tonality3 );
2683 :
2684 52755 : if ( st->hVAD->hangover_cnt == 10 && st->vad_flag == 1 )
2685 : {
2686 : /* long-term voicing parameter */
2687 657 : hSpMusClas->lt_voicing = 0.1f * hSpMusClas->lt_voicing + 0.9f * *st->voicing;
2688 :
2689 : /* long-term correlation value */
2690 657 : hSpMusClas->lt_corr = 0.1f * hSpMusClas->lt_corr + 0.9f * st->old_corr;
2691 :
2692 : /* long-term tonality measure */
2693 657 : hSpMusClas->lt_tonality = 0.1f * hSpMusClas->lt_tonality + 0.9f * tonality;
2694 : }
2695 : else
2696 : {
2697 : /* long-term voicing parameter */
2698 52098 : hSpMusClas->lt_voicing = 0.7f * hSpMusClas->lt_voicing + 0.3f * *st->voicing;
2699 :
2700 : /* long-term correlation value */
2701 52098 : hSpMusClas->lt_corr = 0.7f * hSpMusClas->lt_corr + 0.3f * st->old_corr;
2702 :
2703 : /* long-term tonality measure */
2704 52098 : hSpMusClas->lt_tonality = 0.5f * hSpMusClas->lt_tonality + 0.5f * tonality;
2705 : }
2706 :
2707 : /* pitch difference w.r.t. the past 3 frames */
2708 52755 : lt_pitch_diff = (int16_t) abs( hSpMusClas->lt_corr_pitch[0] - st->pitch[0] );
2709 52755 : lt_pitch_diff += (int16_t) abs( hSpMusClas->lt_corr_pitch[1] - st->pitch[0] );
2710 52755 : lt_pitch_diff += (int16_t) abs( hSpMusClas->lt_corr_pitch[2] - st->pitch[0] );
2711 :
2712 52755 : hSpMusClas->lt_corr_pitch[0] = hSpMusClas->lt_corr_pitch[1];
2713 52755 : hSpMusClas->lt_corr_pitch[1] = hSpMusClas->lt_corr_pitch[2];
2714 52755 : hSpMusClas->lt_corr_pitch[2] = st->pitch[0];
2715 :
2716 52755 : hSpMusClas->lt_old_mode[0] = hSpMusClas->lt_old_mode[1];
2717 52755 : hSpMusClas->lt_old_mode[1] = hSpMusClas->lt_old_mode[2];
2718 :
2719 71952 : if ( st->sp_aud_decision1 == 1 &&
2720 35333 : ( min( min( tonality1, tonality2 ), tonality3 ) > 50.0f ) &&
2721 1263 : ( tonality1 + tonality2 > 200.0f && tonality2 + tonality3 > 200.0f && tonality1 + tonality3 > 200.0f ) &&
2722 952 : ( hSpMusClas->lt_tonality < 20000.0f ) &&
2723 952 : ( ( hSpMusClas->lt_tonality > 1000 && max( hSpMusClas->lt_voicing, *st->voicing ) > 0.99f ) ||
2724 907 : ( hSpMusClas->lt_tonality > 1500 && hSpMusClas->lt_corr > 0.99f ) ||
2725 905 : ( hSpMusClas->lt_tonality > 3000 && hSpMusClas->lowrate_pitchGain > 0.96f ) ||
2726 487 : ( lt_pitch_diff == 0 && hSpMusClas->lowrate_pitchGain > 0.89f ) ) )
2727 : {
2728 98 : if ( sum_s( hSpMusClas->lt_old_mode, 2 ) < 2 )
2729 : {
2730 : /* probably speech - change the decision to speech */
2731 26 : st->sp_aud_decision1 = 0;
2732 26 : st->sp_aud_decision2 = 0;
2733 :
2734 26 : if ( hSpMusClas->lt_hangover == 0 )
2735 : {
2736 6 : hSpMusClas->lt_hangover = 6;
2737 : }
2738 : }
2739 : }
2740 : else
2741 : {
2742 : /* not speech, but still in the hangover period - change the decision to speech */
2743 52657 : if ( hSpMusClas->lt_hangover > 0 )
2744 : {
2745 36 : st->sp_aud_decision1 = 0;
2746 36 : st->sp_aud_decision2 = 0;
2747 36 : hSpMusClas->lt_hangover--;
2748 : }
2749 : }
2750 :
2751 : /* calculate standard deviation of log-tonality */
2752 52755 : mvr2r( hSpMusClas->tonality2_buf + 1, hSpMusClas->tonality2_buf, HANG_LEN_INIT - 1 );
2753 52755 : hSpMusClas->tonality2_buf[HANG_LEN_INIT - 1] = 0.2f * log10f( tonality2 );
2754 52755 : t2 = std_dev( hSpMusClas->tonality2_buf, HANG_LEN_INIT );
2755 :
2756 52755 : mvr2r( hSpMusClas->tonality3_buf + 1, hSpMusClas->tonality3_buf, HANG_LEN_INIT - 1 );
2757 52755 : hSpMusClas->tonality3_buf[HANG_LEN_INIT - 1] = 0.2f * log10f( tonality3 );
2758 52755 : t3 = std_dev( hSpMusClas->tonality3_buf, HANG_LEN_INIT );
2759 :
2760 52755 : tL = 0.2f * log10f( hSpMusClas->lt_tonality );
2761 :
2762 : /* calculate standard deviation of residual LP energy */
2763 52755 : mvr2r( hSpMusClas->LPCErr_buf + 1, hSpMusClas->LPCErr_buf, HANG_LEN_INIT - 1 );
2764 52755 : hSpMusClas->LPCErr_buf[HANG_LEN_INIT - 1] = LPCErr;
2765 52755 : err = std_dev( hSpMusClas->LPCErr_buf, HANG_LEN_INIT );
2766 :
2767 52755 : cor = max( voi_fv - cor_map_sum_fv, 0.0f );
2768 52755 : dft = 0.2f * fabsf( log10f( tonality2 ) - log10f( tonality3 ) );
2769 :
2770 : /* state machine for strong music */
2771 52755 : if ( st->sp_aud_decision1 == 1 && hSpMusClas->lt_music_state == 0 && hSpMusClas->lt_music_hangover == 0 &&
2772 16492 : t2 < 0.54f && t2 > 0.26f && t3 > 0.22f && tL < 0.54f && tL > 0.26f && err > 0.5f )
2773 : {
2774 104 : hSpMusClas->lt_music_state = 1;
2775 104 : hSpMusClas->lt_music_hangover = 6;
2776 : }
2777 52651 : else if ( hSpMusClas->lt_music_state == 1 && hSpMusClas->lt_music_hangover == 0 && t2 < 0.34 && t3 < 0.26f && tL < 0.45f )
2778 : {
2779 95 : hSpMusClas->lt_music_state = 0;
2780 95 : hSpMusClas->lt_music_hangover = 6;
2781 : }
2782 :
2783 52755 : if ( hSpMusClas->lt_music_hangover > 0 )
2784 : {
2785 1162 : hSpMusClas->lt_music_hangover--;
2786 : }
2787 :
2788 : /* state machine for strong speech */
2789 52755 : if ( st->sp_aud_decision1 == 1 && hSpMusClas->lt_speech_state == 0 && hSpMusClas->lt_speech_hangover == 0 &&
2790 1741 : cor > 0.40f && dft < 0.1f && voi_fv > 2 * cor_map_sum_fv + 0.12f &&
2791 287 : t2 < cor && t3 < cor && tL < cor && cor_map_sum_fv < cor && voi_fv > cor && voi_fv > 0.76f )
2792 : {
2793 77 : hSpMusClas->lt_speech_state = 1;
2794 77 : hSpMusClas->lt_speech_hangover = 6;
2795 : }
2796 52678 : else if ( hSpMusClas->lt_speech_state == 1 && hSpMusClas->lt_speech_hangover == 0 && cor < 0.40f )
2797 : {
2798 69 : hSpMusClas->lt_speech_state = 0;
2799 69 : hSpMusClas->lt_speech_hangover = 6;
2800 : }
2801 :
2802 52755 : if ( hSpMusClas->lt_speech_hangover > 0 )
2803 : {
2804 779 : hSpMusClas->lt_speech_hangover--;
2805 : }
2806 :
2807 : /* final decision */
2808 52755 : if ( st->sp_aud_decision1 == 1 && hSpMusClas->lt_speech_state == 1 )
2809 : {
2810 : /* strong speech - probably error in speech/music classification */
2811 403 : st->sp_aud_decision1 = 0;
2812 403 : st->sp_aud_decision2 = 0;
2813 : }
2814 52352 : else if ( st->sp_aud_decision1 == 0 && hSpMusClas->lt_music_state == 1 )
2815 : {
2816 : /* strong music - probably error in speech/music classification */
2817 111 : st->sp_aud_decision1 = 1;
2818 111 : st->sp_aud_decision2 = 1;
2819 : }
2820 :
2821 : /* update the buffer of past decisions */
2822 52755 : hSpMusClas->lt_old_mode[2] = st->sp_aud_decision1;
2823 :
2824 52755 : return;
2825 : }
2826 :
2827 : /*---------------------------------------------------------------------*
2828 : * detect_sparseness()
2829 : *
2830 : *
2831 : *---------------------------------------------------------------------*/
2832 :
2833 16594 : static void detect_sparseness(
2834 : Encoder_State *st, /* i/o: encoder state structure */
2835 : const int16_t localVAD_HE_SAD, /* i : HE-SAD flag without hangover */
2836 : const float voi_fv /* i : scaled voicing feature */
2837 : )
2838 : {
2839 : float sum;
2840 : float ftmp;
2841 : float ftmp1;
2842 : float S1[128];
2843 : int16_t i, j;
2844 16594 : int16_t hb_sp_high_flag = 0;
2845 16594 : int16_t lb_sp_high_flag = 0;
2846 : float sumh;
2847 : float sparse;
2848 : float tmp_buf[4];
2849 16594 : float Mlpe = 0.0f;
2850 16594 : float Mv = 0.0f;
2851 : float Msp;
2852 :
2853 16594 : SP_MUS_CLAS_HANDLE hSpMusClas = st->hSpMusClas;
2854 :
2855 16594 : mvr2r( st->Bin_E, S1, 128 );
2856 :
2857 16594 : sum = 0;
2858 1344114 : for ( i = 0; i < 80; i++ )
2859 : {
2860 1327520 : if ( S1[i] < 0 )
2861 : {
2862 222182 : S1[i] = 0;
2863 : }
2864 1327520 : sum += S1[i];
2865 : }
2866 :
2867 16594 : sumh = 0;
2868 813106 : for ( i = 80; i < 128; i++ )
2869 : {
2870 796512 : if ( S1[i] < 0 )
2871 : {
2872 224127 : S1[i] = 0;
2873 : }
2874 796512 : sumh += S1[i];
2875 : }
2876 :
2877 16594 : sum += sumh;
2878 :
2879 : /* order spectral from max to min */
2880 16594 : order_spectrum( S1, 128 );
2881 :
2882 : /* calculate spectral sparseness in the range 0 - 6.4 kHz */
2883 16594 : j = 0;
2884 16594 : ftmp = 0.0f;
2885 16594 : ftmp1 = 0.75f * sum;
2886 916137 : for ( i = 0; i < 128; i++ )
2887 : {
2888 916094 : ftmp += S1[i];
2889 916094 : if ( ftmp > ftmp1 )
2890 : {
2891 16551 : j = i;
2892 16551 : break;
2893 : }
2894 : }
2895 :
2896 132752 : for ( i = 0; i < HANG_LEN_INIT - 1; i++ )
2897 : {
2898 116158 : hSpMusClas->sparse_buf[i] = hSpMusClas->sparse_buf[i + 1];
2899 : }
2900 :
2901 16594 : sparse = (float) j;
2902 16594 : hSpMusClas->sparse_buf[i] = sparse;
2903 :
2904 16594 : if ( st->bwidth == WB )
2905 : {
2906 1596 : Msp = mean( hSpMusClas->sparse_buf, 8 );
2907 :
2908 : /* find long-term smoothed sparseness */
2909 1596 : if ( hSpMusClas->last_vad_spa == 0 )
2910 : {
2911 169 : set_f( &hSpMusClas->sparse_buf[0], sparse, HANG_LEN_INIT - 1 );
2912 169 : hSpMusClas->LT_sparse = sparse;
2913 : }
2914 : else
2915 : {
2916 1427 : set_f( tmp_buf, 0.0f, 4 );
2917 :
2918 12843 : for ( i = 0; i < HANG_LEN_INIT; i++ )
2919 : {
2920 32527 : for ( j = 0; j < 4; j++ )
2921 : {
2922 29620 : if ( hSpMusClas->sparse_buf[i] > tmp_buf[j] )
2923 : {
2924 8509 : mvr2r( &tmp_buf[j], &tmp_buf[j + 1], 3 - j );
2925 8509 : tmp_buf[j] = hSpMusClas->sparse_buf[i];
2926 8509 : break;
2927 : }
2928 : }
2929 : }
2930 :
2931 1427 : ftmp = 0.25f * ( HANG_LEN_INIT * Msp - sum_f( tmp_buf, 4 ) ) - hSpMusClas->LT_sparse;
2932 :
2933 1427 : hSpMusClas->LT_sparse = hSpMusClas->LT_sparse + 0.25f * ftmp;
2934 : }
2935 :
2936 : /* find high-band sparseness */
2937 1596 : mvr2r( st->Bin_E + 80, S1, 48 );
2938 1596 : order_spectrum( S1, 48 );
2939 :
2940 12768 : for ( i = 0; i < HANG_LEN_INIT - 1; i++ )
2941 : {
2942 11172 : hSpMusClas->hf_spar_buf[i] = hSpMusClas->hf_spar_buf[i + 1];
2943 : }
2944 1596 : hSpMusClas->hf_spar_buf[i] = sum_f( S1, 5 ) / ( sumh + 0.1f );
2945 1596 : if ( mean( hSpMusClas->hf_spar_buf, 8 ) > 0.2f )
2946 : {
2947 490 : hb_sp_high_flag = 1;
2948 : }
2949 :
2950 : /* find low-band sparseness */
2951 1596 : mvr2r( st->Bin_E, S1, 60 );
2952 1596 : order_spectrum( S1, 60 );
2953 :
2954 1596 : if ( sum_f( S1, 5 ) / sum_f( S1, 60 ) > 0.18f )
2955 : {
2956 1015 : lb_sp_high_flag = 1;
2957 : }
2958 :
2959 : /* find smoothed linear prediction efficiency */
2960 12768 : for ( i = 0; i < 7; i++ )
2961 : {
2962 11172 : hSpMusClas->lpe_buf[i] = hSpMusClas->lpe_buf[i + 1];
2963 : }
2964 :
2965 1596 : hSpMusClas->lpe_buf[i] = hSpMusClas->past_epsP2;
2966 1596 : Mlpe = mean( hSpMusClas->lpe_buf, 8 );
2967 :
2968 : /* find smoothed voicing */
2969 12768 : for ( i = 0; i < HANG_LEN_INIT - 1; i++ )
2970 : {
2971 11172 : hSpMusClas->voicing_buf[i] = hSpMusClas->voicing_buf[i + 1];
2972 : }
2973 :
2974 1596 : hSpMusClas->voicing_buf[i] = voi_fv;
2975 1596 : Mv = mean( hSpMusClas->voicing_buf, 8 );
2976 : }
2977 :
2978 : /* avoid using LR-MDCT on sparse spectra */
2979 16594 : if ( st->sp_aud_decision1 == 1 )
2980 : {
2981 6896 : if ( st->bwidth == WB )
2982 : {
2983 717 : ftmp = 90;
2984 : }
2985 : else
2986 : {
2987 6179 : ftmp = 91;
2988 : }
2989 6896 : if ( sparse > ftmp )
2990 : {
2991 0 : st->sp_aud_decision1 = 0;
2992 0 : st->sp_aud_decision2 = 1;
2993 0 : hSpMusClas->gsc_hangover = 1;
2994 : }
2995 6896 : else if ( hSpMusClas->gsc_hangover == 1 )
2996 : {
2997 53 : if ( sparse > 85 )
2998 : {
2999 0 : st->sp_aud_decision1 = 0;
3000 0 : st->sp_aud_decision2 = 1;
3001 : }
3002 53 : else if ( fabs( sparse - mean( &hSpMusClas->sparse_buf[HANG_LEN_INIT - 1 - hSpMusClas->gsc_cnt], hSpMusClas->gsc_cnt ) ) < 7.0f )
3003 : {
3004 52 : st->sp_aud_decision1 = 0;
3005 52 : st->sp_aud_decision2 = 1;
3006 : }
3007 : }
3008 :
3009 6896 : if ( st->bwidth == WB )
3010 : {
3011 717 : if ( hSpMusClas->LT_sparse > 60 && sparse > 50 && Mlpe < -1.3f && Mv > 0.85f &&
3012 58 : lb_sp_high_flag == 0 && ( ( hb_sp_high_flag == 0 && sumh > 0.15f * sum ) || sumh <= 0.15f * sum ) )
3013 : {
3014 9 : st->sp_aud_decision1 = 0;
3015 9 : st->sp_aud_decision2 = 1;
3016 9 : hSpMusClas->gsc_hangover = 1;
3017 : }
3018 708 : else if ( hSpMusClas->gsc_hangover == 1 && !( st->sp_aud_decision1 == 0 && st->sp_aud_decision2 == 1 ) )
3019 : {
3020 1 : if ( fabs( sparse - mean( &hSpMusClas->sparse_buf[HANG_LEN_INIT - 1 - hSpMusClas->gsc_cnt], hSpMusClas->gsc_cnt ) ) < 7.0f )
3021 : {
3022 0 : st->sp_aud_decision1 = 0;
3023 0 : st->sp_aud_decision2 = 1;
3024 : }
3025 : }
3026 : }
3027 : }
3028 :
3029 : /* update the counter of consecutive GSC frames with sparse spectrum */
3030 16594 : if ( st->sp_aud_decision1 == 0 && st->sp_aud_decision2 == 1 )
3031 : {
3032 54 : ( hSpMusClas->gsc_cnt )++;
3033 54 : if ( hSpMusClas->gsc_cnt > 7 )
3034 : {
3035 46 : hSpMusClas->gsc_cnt = 7;
3036 : }
3037 : }
3038 : else
3039 : {
3040 16540 : hSpMusClas->gsc_cnt = 0;
3041 16540 : hSpMusClas->gsc_hangover = 0;
3042 : }
3043 :
3044 16594 : hSpMusClas->last_vad_spa = localVAD_HE_SAD;
3045 :
3046 16594 : return;
3047 : }
3048 :
3049 :
/*---------------------------------------------------------------------*
 * order_spectrum()
 *
 * In-place ordering of a vector from max to min. Each pass places the
 * largest remaining value at the head and the smallest remaining value
 * at the tail of the not yet ordered part.
 *---------------------------------------------------------------------*/

static void order_spectrum(
    float *vec,       /* i/o: vector to be ordered      */
    const int16_t len /* i  : number of vector elements */
)
{
    int16_t head, tail, k;
    int16_t pos_max, pos_min;
    float swap;

    head = 0;
    while ( head < len / 2 )
    {
        tail = (int16_t) ( len - 1 - head );

        /* locate the extremes of the not yet ordered part [head, tail] */
        pos_max = head;
        pos_min = head;
        for ( k = head; k <= tail; k++ )
        {
            if ( vec[k] > vec[pos_max] )
            {
                pos_max = k;
            }
            else if ( vec[k] < vec[pos_min] )
            {
                pos_min = k;
            }
        }

        /* largest value goes to the head */
        swap = vec[head];
        vec[head] = vec[pos_max];
        vec[pos_max] = swap;

        /* if the minimum was sitting at the head, the swap above has just moved it */
        if ( pos_min == head )
        {
            pos_min = pos_max;
        }

        /* smallest value goes to the tail */
        swap = vec[tail];
        vec[tail] = vec[pos_min];
        vec[pos_min] = swap;

        head++;
    }

    return;
}
|