Line data Source code
1 : /******************************************************************************************************
2 :
3 : (C) 2022-2025 IVAS codec Public Collaboration with portions copyright Dolby International AB, Ericsson AB,
4 : Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD.,
5 : Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange,
6 : Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other
7 : contributors to this repository. All Rights Reserved.
8 :
9 : This software is protected by copyright law and by international treaties.
10 : The IVAS codec Public Collaboration consisting of Dolby International AB, Ericsson AB,
11 : Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD.,
12 : Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange,
13 : Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other
14 : contributors to this repository retain full ownership rights in their respective contributions in
15 : the software. This notice grants no license of any kind, including but not limited to patent
16 : license, nor is any license granted by implication, estoppel or otherwise.
17 :
18 : Contributors are required to enter into the IVAS codec Public Collaboration agreement before making
19 : contributions.
20 :
21 : This software is provided "AS IS", without any express or implied warranties. The software is in the
22 : development stage. It is intended exclusively for experts who have experience with such software and
23 : solely for the purpose of inspection. All implied warranties of non-infringement, merchantability
24 : and fitness for a particular purpose are hereby disclaimed and excluded.
25 :
26 : Any dispute, controversy or claim arising under or in relation to providing this software shall be
27 : submitted to and settled by the final, binding jurisdiction of the courts of Munich, Germany in
28 : accordance with the laws of the Federal Republic of Germany excluding its conflict of law rules and
29 : the United Nations Convention on Contracts on the International Sales of Goods.
30 :
31 : *******************************************************************************************************/
32 :
33 : /*====================================================================================
34 : EVS Codec 3GPP TS26.443 Nov 04, 2021. Version 12.14.0 / 13.10.0 / 14.6.0 / 15.4.0 / 16.3.0
35 : ====================================================================================*/
36 :
37 : #include <assert.h>
38 : #include <stdint.h>
39 : #include "options.h"
40 : #ifdef DEBUGGING
41 : #include "debug.h"
42 : #endif
43 : #include <math.h>
44 : #include "cnst.h"
45 : #include "prot.h"
46 : #include "ivas_prot.h"
47 : #include "rom_enc.h"
48 : #include "rom_com.h" /* Common static table prototypes */
49 : #include "wmc_auto.h"
50 :
51 :
52 : /*---------------------------------------------------------------------*
53 : * Local constants
54 : *---------------------------------------------------------------------*/
55 :
/* Attack-detection geometry: the frame of L_FRAME samples is split into ATT_NSEG equal segments */
#define ATT_SEG_LEN   ( L_FRAME / ATT_NSEG )
/* Segment index compared against the detected attack position to decide whether the attack lies
   in the last subframe (NB_SUBFR subframes per frame) */
#define ATT_3LSUB_POS ( 3 * ATT_NSEG / NB_SUBFR )
/* Same threshold for NB_SUBFR16k subframes per frame, rounded to the nearest segment.
   The whole replacement list is parenthesized so that the cast expression stays one operand
   whatever operator surrounds the macro (e.g. sizeof, postfix operators). */
#define ATT_3LSUB_POS_16k ( (int16_t) ( ( 4.0f * ATT_NSEG / (float) NB_SUBFR16k ) + 0.5f ) )

#define THR_CORR_PEAK 0.95f
#define TON_FACT      0.95f
#define TON_ALPHA     0.95f

#define DLP_BIAS 0.138121f

#define THR_MASS_MAX     0.85f
#define THR_MASS_MIN     0.75f
#define THR_MASS_STEP_UP 0.01f
#define THR_MASS_STEP_DN 0.02f
70 :
71 :
72 : /*---------------------------------------------------------------------*
73 : * Local function prototypes
74 : *---------------------------------------------------------------------*/
75 :
76 : static void spec_analysis( float *Bin_E, float *p2v_map );
77 :
78 : static void flux( float *Bin_E, float *p2v_map, float *old_Bin_E, float *buf_flux, int16_t attack_hangover, float dec_mov );
79 :
80 : static void tonal_dist( float *p2v_map, float *buf_pkh, float *buf_Ntonal, float *buf_Ntonal2, float *buf_Ntonal_lf );
81 :
82 : static int16_t mode_decision( Encoder_State *st, int16_t len, float *dec_mov, float *buf_flux, float *buf_epsP_tilt, float *buf_pkh, float *buf_cor_map_sum, float *buf_Ntonal, float *buf_Ntonal2, float *buf_Ntonal_lf, float *buf_dlp );
83 :
84 : static void var_cor_calc( const float old_corr, float *mold_corr, float var_cor_t[], int16_t *high_stable_cor );
85 :
86 : static int16_t attack_det( const float *inp, const int16_t last_clas, const int16_t localVAD, const int16_t coder_type, const int32_t total_brate, const int16_t element_mode, const int16_t clas, float finc_prev[], float *lt_finc, int16_t *last_strong_attack );
87 :
88 : static float tonal_det( const float S[], int16_t vad_flag, float tod_S_map_lt[], float *tod_thr_lt, float *tod_weight, float *tod_S_mass_prev, float *tod_S_mass_lt );
89 :
90 : static void tonal_context_improv( Encoder_State *st, const float PS[], const float voi_fv, const float cor_map_sum_fv, const float LPCErr );
91 :
92 : static void order_spectrum( float *vec, const int16_t len );
93 :
94 : static void detect_sparseness( Encoder_State *st, const int16_t localVAD_HE_SAD, const float voi_fv );
95 :
96 : static int16_t sp_mus_classif_1st( Encoder_State *st, const int16_t localVAD_HE_SAD, const float lsp_new[M], const float cor_map_sum, const float epsP[M + 1], const float PS[], float non_sta, float relE, float *voi_fv, float *cor_map_sum_fv, float *LPCErr, int16_t *high_lpn_flag );
97 :
98 : static void sp_mus_classif_2nd( Encoder_State *st, const float Etot, int16_t *attack_flag, const float *inp );
99 :
100 : static void music_mixed_classif_improv( Encoder_State *st, const float *new_inp, const float *epsP, const float etot, const float old_cor, const float cor_map_sum );
101 :
102 :
103 : /*---------------------------------------------------------------------*
104 : * speech_music_clas_init()
105 : *
106 : * Initialization of speech/music classifier
107 : *---------------------------------------------------------------------*/
108 :
109 11665 : void speech_music_clas_init(
110 : SP_MUS_CLAS_HANDLE hSpMusClas /* i/o: speech/music classifier handle */
111 : )
112 : {
113 : int16_t i;
114 :
115 11665 : set_f( hSpMusClas->FV_st, 0.0f, N_SMC_FEATURES );
116 :
117 11665 : hSpMusClas->inact_cnt = 0;
118 11665 : set_s( hSpMusClas->past_dec, 0, HANG_LEN - 1 );
119 11665 : set_f( hSpMusClas->past_dlp, 0, HANG_LEN - 1 );
120 11665 : set_f( hSpMusClas->past_dlp_mean_ST, 0, HANG_LEN - 1 );
121 11665 : hSpMusClas->dlp_mean_ST = 0.0f;
122 11665 : hSpMusClas->dlp_mean_LT = 0.0f;
123 11665 : hSpMusClas->dlp_var_LT = 0.0f;
124 :
125 186640 : for ( i = 0; i < N_SMC_FEATURES; i++ )
126 : {
127 174975 : hSpMusClas->prev_FV[i] = 0.5f * hout_intervals[2 * i] + 0.5f * hout_intervals[2 * i + 1];
128 : }
129 :
130 186640 : for ( i = 0; i < NB_BANDS_SPMUS; i++ )
131 : {
132 174975 : hSpMusClas->past_log_enr[i] = logf( E_MIN );
133 : }
134 :
135 11665 : hSpMusClas->sp_mus_state = -8;
136 11665 : hSpMusClas->wdrop = 0.0f;
137 11665 : hSpMusClas->wrise = 0.0f;
138 11665 : hSpMusClas->wdlp_0_95_sp = 0.0f;
139 11665 : hSpMusClas->wdlp_xtalk = 0.0f;
140 11665 : set_f( hSpMusClas->last_lsp, 0.0f, M_LSP_SPMUS );
141 11665 : hSpMusClas->last_cor_map_sum = 0.0f;
142 11665 : hSpMusClas->last_non_sta = 0.0f;
143 11665 : set_f( hSpMusClas->past_PS, 0.0f, HIGHEST_FBIN - LOWEST_FBIN );
144 11665 : hSpMusClas->past_ps_diff = 0;
145 11665 : hSpMusClas->past_epsP2 = 01;
146 11665 : hSpMusClas->past_epsP = 0;
147 11665 : hSpMusClas->flag_spitch_cnt = 0;
148 :
149 11665 : hSpMusClas->gsc_thres[0] = TH_0_MIN;
150 11665 : hSpMusClas->gsc_thres[1] = TH_1_MIN;
151 11665 : hSpMusClas->gsc_thres[2] = TH_2_MIN;
152 11665 : hSpMusClas->gsc_thres[3] = TH_3_MIN;
153 11665 : set_f( hSpMusClas->gsc_lt_diff_etot, 0.0f, MAX_LT );
154 11665 : hSpMusClas->gsc_mem_etot = 0.0f;
155 11665 : hSpMusClas->gsc_last_music_flag = 0;
156 11665 : hSpMusClas->gsc_nb_thr_1 = 0;
157 11665 : hSpMusClas->gsc_nb_thr_3 = 0;
158 11665 : hSpMusClas->mold_corr = 0.9f;
159 11665 : hSpMusClas->mean_avr_dyn = 0.5f;
160 11665 : hSpMusClas->last_sw_dyn = 10.0f;
161 :
162 11665 : hSpMusClas->relE_attack_cnt = 0;
163 11665 : hSpMusClas->prev_relE = 0.0f;
164 11665 : hSpMusClas->prev_Etot = 0.0f;
165 11665 : hSpMusClas->prev_vad = 0;
166 11665 : hSpMusClas->vad_0_1_cnt = 0;
167 11665 : hSpMusClas->relE_attack_sum = 0;
168 :
169 : /* speech/music classifier improvement */
170 711565 : for ( i = 0; i < BUF_LEN; i++ )
171 : {
172 699900 : hSpMusClas->buf_flux[i] = -100;
173 699900 : hSpMusClas->buf_pkh[i] = 0;
174 699900 : hSpMusClas->buf_epsP_tilt[i] = 0;
175 699900 : hSpMusClas->buf_cor_map_sum[i] = 0;
176 699900 : hSpMusClas->buf_Ntonal[i] = 0;
177 699900 : hSpMusClas->buf_Ntonal2[i] = 0;
178 699900 : hSpMusClas->buf_Ntonal_lf[i] = 0;
179 : }
180 :
181 11665 : set_f( hSpMusClas->lpe_buf, 0, HANG_LEN_INIT );
182 11665 : set_f( hSpMusClas->voicing_buf, 0, HANG_LEN_INIT );
183 11665 : hSpMusClas->gsc_hangover = 0;
184 11665 : set_f( hSpMusClas->sparse_buf, 0, HANG_LEN_INIT );
185 11665 : set_f( hSpMusClas->hf_spar_buf, 0, HANG_LEN_INIT );
186 11665 : hSpMusClas->LT_sparse = 0.0f;
187 11665 : hSpMusClas->gsc_cnt = 0;
188 11665 : hSpMusClas->last_vad_spa = 0;
189 :
190 11665 : set_f( hSpMusClas->old_Bin_E, 0.0f, 3 * N_OLD_BIN_E );
191 11665 : set_f( hSpMusClas->buf_etot, 0, 4 );
192 11665 : set_f( hSpMusClas->buf_dlp, 0, 10 );
193 :
194 11665 : hSpMusClas->UV_cnt1 = 300;
195 11665 : hSpMusClas->LT_UV_cnt1 = 250.0f;
196 11665 : hSpMusClas->onset_cnt = 0;
197 11665 : hSpMusClas->attack_hangover = 0;
198 11665 : hSpMusClas->dec_mov = 0.0f;
199 11665 : hSpMusClas->dec_mov1 = 0.0f;
200 11665 : hSpMusClas->mov_log_max_spl = 200.0f;
201 11665 : hSpMusClas->old_lt_diff[0] = 0.0f;
202 11665 : hSpMusClas->old_lt_diff[1] = 0.0f;
203 :
204 11665 : set_f( hSpMusClas->finc_prev, 0.0f, ATT_NSEG );
205 11665 : hSpMusClas->lt_finc = 0.0f;
206 11665 : hSpMusClas->last_strong_attack = 0;
207 11665 : hSpMusClas->tdm_lt_Etot = 0.01f;
208 11665 : set_f( hSpMusClas->tod_lt_Bin_E, 0.0f, TOD_NSPEC );
209 11665 : set_f( hSpMusClas->tod_S_map_lt, 0.0f, TOD_NSPEC );
210 11665 : hSpMusClas->tod_thr_lt = TOD_THR_MASS;
211 11665 : hSpMusClas->tod_weight = 0.0f;
212 11665 : hSpMusClas->tod_S_mass_prev = 0.0f;
213 11665 : hSpMusClas->tod_S_mass_lt = 0.0f;
214 :
215 : /* speech/music classification */
216 11665 : set_s( hSpMusClas->lt_old_mode, 1, 3 );
217 11665 : hSpMusClas->lt_voicing = 0.5f;
218 11665 : hSpMusClas->lt_corr = 0.5f;
219 11665 : hSpMusClas->lt_tonality = 0;
220 11665 : set_s( hSpMusClas->lt_corr_pitch, 0, 3 );
221 11665 : hSpMusClas->lt_hangover = 0;
222 11665 : hSpMusClas->lowrate_pitchGain = 0;
223 :
224 11665 : hSpMusClas->lt_music_hangover = 0;
225 11665 : set_f( hSpMusClas->tonality2_buf, 0, HANG_LEN_INIT );
226 11665 : set_f( hSpMusClas->tonality3_buf, 0, HANG_LEN_INIT );
227 11665 : set_f( hSpMusClas->LPCErr_buf, 0, HANG_LEN_INIT );
228 11665 : hSpMusClas->lt_music_state = 0;
229 11665 : hSpMusClas->lt_speech_state = 0;
230 11665 : hSpMusClas->lt_speech_hangover = 0;
231 :
232 11665 : hSpMusClas->lt_dec_thres = 10.0f;
233 11665 : hSpMusClas->ener_RAT = 0.0f;
234 :
235 11665 : hSpMusClas->high_stable_cor = 0;
236 11665 : set_f( hSpMusClas->var_cor_t, 0.0f, VAR_COR_LEN );
237 :
238 11665 : hSpMusClas->lps = 0.0f;
239 11665 : hSpMusClas->lpm = 0.0f;
240 11665 : hSpMusClas->lpn = 0.0f;
241 :
242 11665 : return;
243 : }
244 :
245 :
246 : /*---------------------------------------------------------------------*
247 : * speech_music_classif()
248 : *
249 : * Speech/music classification
250 : *
251 : * The following technologies are used based on the outcome of the sp/mus classifier
252 : * sp_aud_decision1 sp_aud_decision2
253 : * 0 0 use ACELP (+TD BWE)
254 : * 1 0 use ACELP (+FD BWE) or HQ/LR-MDCT depending on bitrate
255 : * 1 1 use GSC (+FD BWE) or HQ/LR-MDCT depending on bitrate
256 : *
257 : * 0 1 exceptionally use GSC (+FD BWE) instead of LR-MDCT at 13.2 kbps (WB/SWB) for sparse spectra
258 : *---------------------------------------------------------------------*/
259 :
260 : /*! r: 1st stage decision (1-music, 0-speech or noise) */
261 5150 : void speech_music_classif(
262 : Encoder_State *st, /* i/o: state structure */
263 : const float *new_inp, /* i : new input signal */
264 : const float *inp, /* i : input signal to locate attach position */
265 : const int16_t localVAD_HE_SAD, /* i : HE-SAD flag without hangover */
266 : const float lsp_new[M], /* i : LSPs in current frame */
267 : const float cor_map_sum, /* i : correlation map sum (from multi-harmonic anal.) */
268 : const float epsP[M + 1], /* i : LP prediciton error */
269 : const float PS[], /* i : energy spectrum */
270 : const float Etot, /* i : total frame energy */
271 : const float old_cor, /* i : max correlation from previous frame */
272 : int16_t *attack_flag, /* o : attack flag (GSC or TC) */
273 : const float non_sta, /* i : unbound non-stationarity for sp/mus classifier */
274 : const float relE, /* i : relative frame energy */
275 : int16_t *high_lpn_flag, /* o : sp/mus LPN flag */
276 : const int16_t flag_spitch /* i : flag to indicate very short stable pitch */
277 : )
278 : {
279 : float voi_fv, cor_map_sum_fv, LPCErr;
280 :
281 : /* 1st stage speech/music classification based on the GMM model */
282 5150 : st->sp_aud_decision1 = sp_mus_classif_1st( st, localVAD_HE_SAD, lsp_new, cor_map_sum, epsP, PS, non_sta, relE, &voi_fv, &cor_map_sum_fv, &LPCErr, high_lpn_flag );
283 :
284 5150 : if ( st->codec_mode == MODE1 || st->sr_core == INT_FS_12k8 )
285 : {
286 :
287 : /* Improvement of the 1st stage decision for mixed/music content */
288 3050 : if ( !st->Opt_SC_VBR && ( st->total_brate != ACELP_24k40 ) )
289 : {
290 3050 : music_mixed_classif_improv( st, new_inp, epsP, Etot, old_cor, cor_map_sum );
291 : }
292 :
293 3050 : st->sp_aud_decision0 = st->sp_aud_decision1;
294 :
295 : /* 2nd stage speech/music classification (rewrite music to speech in onsets) */
296 3050 : st->sp_aud_decision2 = st->sp_aud_decision1;
297 :
298 3050 : if ( st->bwidth > NB )
299 : {
300 3050 : sp_mus_classif_2nd( st, Etot, attack_flag, inp );
301 :
302 3050 : if ( flag_spitch && st->bwidth == WB && st->total_brate < ACELP_13k20 )
303 : {
304 : /* avoid switch to AUDIO/MUSIC class for very short stable high pitch
305 : and/or stable pitch with high correlation at low bitrates*/
306 0 : st->sp_aud_decision2 = 0;
307 : }
308 : }
309 :
310 : /* Context-based improvement of 1st and 2nd stage decision on stable tonal signals */
311 3050 : if ( !st->Opt_SC_VBR && st->total_brate != ACELP_24k40 )
312 : {
313 3050 : tonal_context_improv( st, PS, voi_fv, cor_map_sum_fv, LPCErr );
314 : }
315 :
316 : /* Avoid using LR-MDCT on sparse spectra, use GSC instead at 13.2 kbps (WB/SWB) */
317 3050 : if ( !st->Opt_SC_VBR && st->total_brate == ACELP_13k20 && st->vad_flag == 1 && ( st->bwidth == WB || st->bwidth == SWB ) )
318 : {
319 2042 : detect_sparseness( st, localVAD_HE_SAD, voi_fv );
320 : }
321 :
322 : /* override speech/music classification to ACELP when background noise level reaches certain level */
323 : /* this is a patch against mis-classifications during active noisy speech segments */
324 3050 : if ( st->lp_noise > 12.0f )
325 : {
326 0 : st->sp_aud_decision1 = 0;
327 0 : st->sp_aud_decision2 = 0;
328 : }
329 :
330 : /* set GSC noisy speech flag on unvoiced SWB segments */
331 3050 : st->GSC_noisy_speech = 0;
332 3050 : if ( st->vad_flag == 1 && st->total_brate >= ACELP_13k20 && st->total_brate < ACELP_24k40 &&
333 2042 : st->lp_noise > 12.0f && st->sp_aud_decision1 == 0 && st->bwidth >= SWB &&
334 0 : st->coder_type_raw == UNVOICED )
335 : {
336 0 : st->GSC_noisy_speech = 1;
337 : }
338 :
339 : /* Select AUDIO frames */
340 : #ifdef DEBUGGING
341 : if ( st->codec_mode == MODE1 && ( st->force == 1 || ( st->force == -1 && ( st->sp_aud_decision2 || st->GSC_noisy_speech ) ) ) )
342 : #else
343 3050 : if ( st->codec_mode == MODE1 && ( st->sp_aud_decision2 || st->GSC_noisy_speech ) )
344 : #endif
345 : {
346 959 : st->coder_type = AUDIO;
347 959 : st->hGSCEnc->noise_lev = NOISE_LEVEL_SP0;
348 : }
349 : }
350 : else
351 : {
352 2100 : st->sp_aud_decision0 = st->sp_aud_decision1;
353 : }
354 :
355 :
356 5150 : return;
357 : }
358 :
359 :
360 : /*---------------------------------------------------------------------*
361 : * sp_mus_classif_1st()
362 : *
363 : * 1st stage speech/music classification (based on the GMM model)
364 : *---------------------------------------------------------------------*/
365 :
366 : /*! r: decision flag (1-music, 0-speech or noise) */
367 5150 : static int16_t sp_mus_classif_1st(
368 : Encoder_State *st, /* i/o: state structure */
369 : const int16_t localVAD_HE_SAD, /* i : local VAD HE flag */
370 : const float lsp_new[M], /* i : LSPs in current frame */
371 : const float cor_map_sum, /* i : correlation map sum (from multi-harmonic anal.) */
372 : const float epsP[M + 1], /* i : LP prediciton error */
373 : const float PS[], /* i : energy spectrum */
374 : float non_sta, /* i : unbound non-stationarity */
375 : float relE, /* i : relative frame energy */
376 : float *voi_fv, /* o : scaled voicing feature */
377 : float *cor_map_sum_fv, /* o : scaled correlation map feature */
378 : float *LPCErr, /* o : scaled LP prediction error feature */
379 : int16_t *high_lpn_flag /* o : sp/mus LPN flag */
380 : )
381 : {
382 : int16_t i, k, p, dec, vad;
383 : float dlp, ftmp, lepsP1, sum_PS, ps_diff, ps_sta, wrelE, wdrop, wght, mx;
384 5150 : float FV[N_FEATURES], *pFV = FV, PS_norm[128], dPS[128], lsp[M];
385 5150 : float pys, pym, xm[N_FEATURES], py, lps = 0, lpm = 0;
386 : const float *pSF;
387 5150 : float pyn, lpn = 0;
388 :
389 5150 : SP_MUS_CLAS_HANDLE hSpMusClas = st->hSpMusClas;
390 :
391 : /*------------------------------------------------------------------*
392 : * Initialization
393 : *------------------------------------------------------------------*/
394 :
395 5150 : vad = localVAD_HE_SAD;
396 :
397 : /*------------------------------------------------------------------*
398 : * Preparation of the feature vector
399 : *------------------------------------------------------------------*/
400 :
401 : /* [0] OL pitch */
402 5150 : if ( st->tc_cnt == 1 || st->tc_cnt == 2 )
403 : {
404 436 : *pFV++ = (float) st->pitch[2];
405 : }
406 : else
407 : {
408 4714 : *pFV++ = (float) ( st->pitch[0] + st->pitch[1] + st->pitch[2] ) / 3.0f;
409 : }
410 :
411 : /* [1] voicing */
412 5150 : if ( st->tc_cnt == 1 || st->tc_cnt == 2 )
413 : {
414 436 : *pFV++ = st->voicing[2];
415 : }
416 : else
417 : {
418 4714 : *pFV++ = (float) ( st->voicing[0] + st->voicing[1] + st->voicing[2] ) / 3.0f;
419 : }
420 :
421 : /* [2,3,4,5,6] LSFs */
422 5150 : mvr2r( lsp_new, lsp, M );
423 :
424 5150 : ftmp = (float) acos( lsp[1] );
425 5150 : *pFV++ = ftmp + hSpMusClas->last_lsp[1];
426 5150 : hSpMusClas->last_lsp[1] = ftmp;
427 :
428 5150 : ftmp = (float) acos( lsp[2] );
429 5150 : *pFV++ = ftmp + hSpMusClas->last_lsp[2];
430 5150 : hSpMusClas->last_lsp[2] = ftmp;
431 :
432 5150 : ftmp = (float) acos( lsp[3] );
433 5150 : *pFV++ = ftmp + hSpMusClas->last_lsp[3];
434 5150 : hSpMusClas->last_lsp[3] = ftmp;
435 :
436 5150 : ftmp = (float) acos( lsp[4] );
437 5150 : *pFV++ = ftmp + hSpMusClas->last_lsp[4];
438 5150 : hSpMusClas->last_lsp[4] = ftmp;
439 :
440 5150 : ftmp = (float) acos( lsp[5] );
441 5150 : *pFV++ = ftmp + hSpMusClas->last_lsp[5];
442 5150 : hSpMusClas->last_lsp[5] = ftmp;
443 :
444 : /* [7] cor_map_sum */
445 5150 : *pFV++ = cor_map_sum + hSpMusClas->last_cor_map_sum;
446 5150 : hSpMusClas->last_cor_map_sum = cor_map_sum;
447 :
448 : /* [8] non_sta */
449 5150 : *pFV++ = non_sta + hSpMusClas->last_non_sta;
450 5150 : hSpMusClas->last_non_sta = non_sta;
451 :
452 : /* [9] epsP */
453 5150 : if ( st->bwidth == NB )
454 : {
455 : /* do not take into account (statistics are too different) */
456 0 : *pFV++ = -1.647f;
457 : }
458 : else
459 : {
460 5150 : lepsP1 = logf( epsP[1] + 1e-5f );
461 5150 : ftmp = logf( epsP[13] ) - lepsP1;
462 5150 : *pFV++ = ftmp + hSpMusClas->past_epsP2;
463 5150 : hSpMusClas->past_epsP2 = ftmp;
464 : }
465 :
466 : /* calculation of differential normalized power spectrum */
467 5150 : sum_PS = 1e-5f;
468 350200 : for ( i = LOWEST_FBIN; i < HIGHEST_FBIN; i++ )
469 : {
470 345050 : sum_PS += PS[i];
471 : }
472 :
473 350200 : for ( i = LOWEST_FBIN; i < HIGHEST_FBIN; i++ )
474 : {
475 345050 : PS_norm[i] = PS[i] / sum_PS;
476 345050 : dPS[i] = fabsf( PS_norm[i] - hSpMusClas->past_PS[i - LOWEST_FBIN] );
477 : }
478 :
479 : /* [10] ps_diff (spectral difference) */
480 5150 : ps_diff = 0;
481 350200 : for ( i = LOWEST_FBIN; i < HIGHEST_FBIN; i++ )
482 : {
483 345050 : ps_diff += dPS[i];
484 : }
485 :
486 5150 : ps_diff = logf( ps_diff + 1e-5f );
487 5150 : *pFV++ = ps_diff + hSpMusClas->past_ps_diff;
488 5150 : hSpMusClas->past_ps_diff = ps_diff;
489 :
490 : /* [11] ps_sta (spectral stationarity) */
491 5150 : ps_sta = 0;
492 350200 : for ( i = LOWEST_FBIN; i < HIGHEST_FBIN; i++ )
493 : {
494 345050 : mx = PS_norm[i] > hSpMusClas->past_PS[i - LOWEST_FBIN] ? PS_norm[i] : hSpMusClas->past_PS[i - LOWEST_FBIN];
495 345050 : ps_sta += mx / ( dPS[i] + 1e-5f );
496 : }
497 :
498 5150 : *pFV++ = logf( ps_sta + 1e-5f );
499 5150 : mvr2r( &PS_norm[LOWEST_FBIN], hSpMusClas->past_PS, HIGHEST_FBIN - LOWEST_FBIN );
500 :
501 : /*------------------------------------------------------------------*
502 : * Scaling of the feature vector
503 : *------------------------------------------------------------------*/
504 :
505 5150 : pFV = FV;
506 5150 : if ( st->bwidth == NB )
507 : {
508 0 : pSF = SF_8k;
509 : }
510 : else
511 : {
512 5150 : pSF = SF;
513 : }
514 :
515 66950 : for ( i = 0; i < N_FEATURES; i++, pFV++, pSF += 2 )
516 : {
517 61800 : *pFV = pSF[0] * *pFV + pSF[1];
518 : }
519 :
520 : /* store some scaled parameters for later correction of the 1st stage speech/music classification */
521 5150 : *voi_fv = FV[1];
522 5150 : *cor_map_sum_fv = FV[7];
523 5150 : *LPCErr = FV[9];
524 :
525 : /*------------------------------------------------------------------*
526 : * Calculation of posterior probability
527 : * Log-probability
528 : *------------------------------------------------------------------*/
529 :
530 5150 : pys = pym = pyn = 1e-5f;
531 :
532 : /* run loop for all mixtures (for each mixture, calculate the probability of speech or noise and the probability of music) */
533 36050 : for ( k = 0; k < N_MIXTURES; k++ )
534 : {
535 : /* active frames - calculate the probability of speech */
536 401700 : for ( p = 0; p < N_FEATURES; p++ )
537 : {
538 370800 : xm[p] = FV[p] - m_speech[k * N_FEATURES + p];
539 : }
540 :
541 30900 : py = lvm_speech[k] + dot_product_mat( xm, &invV_speech[k * N_FEATURES * N_FEATURES], N_FEATURES );
542 30900 : pys += expf( py );
543 : /* inactive frames - calculate the probability of noise */
544 401700 : for ( p = 0; p < N_FEATURES; p++ )
545 : {
546 370800 : xm[p] = FV[p] - m_noise[k * N_FEATURES + p];
547 : }
548 :
549 30900 : py = lvm_noise[k] + dot_product_mat( xm, &invV_noise[k * N_FEATURES * N_FEATURES], N_FEATURES );
550 30900 : pyn += expf( py );
551 :
552 : /* either active or inactive frames - calculate the probability of music */
553 401700 : for ( p = 0; p < N_FEATURES; p++ )
554 : {
555 370800 : xm[p] = FV[p] - m_music[k * N_FEATURES + p];
556 : }
557 :
558 30900 : py = lvm_music[k] + dot_product_mat( xm, &invV_music[k * N_FEATURES * N_FEATURES], N_FEATURES );
559 30900 : pym += expf( py );
560 : }
561 :
562 : /* calculate log-probability */
563 5150 : lps = logf( pys ) - 0.5f * N_FEATURES * logf( PI2 );
564 5150 : lpm = logf( pym ) - 0.5f * N_FEATURES * logf( PI2 );
565 5150 : lpn = logf( pyn ) - 0.5f * N_FEATURES * logf( PI2 );
566 :
567 5150 : *high_lpn_flag = 0;
568 5150 : if ( lpn > lps && lpn > lpm )
569 : {
570 72 : *high_lpn_flag = 1;
571 : }
572 :
573 5150 : if ( !vad )
574 : {
575 : /* artificially increase log-probability of noise */
576 128 : lps = lpn * 1.2f;
577 : }
578 :
579 5150 : hSpMusClas->lpm = lpm;
580 5150 : hSpMusClas->lps = lps;
581 :
582 : /* determine HQ Generic speech class */
583 5150 : if ( st->hHQ_core != NULL )
584 : {
585 5150 : if ( lps > lpm + 0.5f )
586 : {
587 2342 : st->hHQ_core->hq_generic_speech_class = 1;
588 : }
589 : else
590 : {
591 2808 : st->hHQ_core->hq_generic_speech_class = 0;
592 : }
593 : }
594 :
595 : /*------------------------------------------------------------------*
596 : * State machine (sp_mus_state < 0 .. inactive, > 0 .. entry, = 0 .. active )
597 : *------------------------------------------------------------------*/
598 :
599 5150 : if ( vad )
600 : {
601 5022 : if ( relE < -20 || ( lps <= -5 && lpm <= -5 ) )
602 : {
603 657 : if ( hSpMusClas->sp_mus_state > 0 )
604 : {
605 116 : if ( hSpMusClas->sp_mus_state < HANG_LEN )
606 : {
607 : /* energy is too low but we are in entry period -> reset the inactive counter to allow new entry later */
608 14 : hSpMusClas->inact_cnt = 0;
609 : }
610 :
611 : /* energy is too low -> we are going to instable state */
612 116 : hSpMusClas->sp_mus_state = 0;
613 : }
614 541 : else if ( hSpMusClas->sp_mus_state > -HANG_LEN )
615 : {
616 : /* energy is still too low -> we are still in instable state */
617 256 : hSpMusClas->sp_mus_state--;
618 : }
619 : }
620 4365 : else if ( hSpMusClas->sp_mus_state <= 0 )
621 : {
622 116 : if ( hSpMusClas->inact_cnt == 0 )
623 : {
624 :
625 39 : hSpMusClas->sp_mus_state = 1;
626 : }
627 : else
628 : {
629 :
630 77 : hSpMusClas->sp_mus_state = HANG_LEN;
631 : }
632 :
633 116 : hSpMusClas->inact_cnt = 12;
634 : }
635 4249 : else if ( hSpMusClas->sp_mus_state > 0 && hSpMusClas->sp_mus_state < HANG_LEN )
636 : {
637 : /* we are inside an entry period -> increment the counter of entry frames */
638 211 : hSpMusClas->sp_mus_state++;
639 : }
640 :
641 5022 : if ( hSpMusClas->sp_mus_state < 0 && hSpMusClas->inact_cnt > 0 )
642 : {
643 326 : hSpMusClas->inact_cnt--;
644 : }
645 : }
646 : else
647 : {
648 128 : if ( hSpMusClas->sp_mus_state > 0 && hSpMusClas->sp_mus_state < HANG_LEN )
649 : {
650 0 : hSpMusClas->inact_cnt = 0;
651 : }
652 128 : else if ( hSpMusClas->inact_cnt > 0 )
653 : {
654 40 : hSpMusClas->inact_cnt--;
655 : }
656 :
657 128 : if ( hSpMusClas->sp_mus_state > 0 && hSpMusClas->sp_mus_state < HANG_LEN )
658 : {
659 0 : hSpMusClas->sp_mus_state = -HANG_LEN;
660 : }
661 128 : else if ( hSpMusClas->sp_mus_state > 0 )
662 : {
663 0 : hSpMusClas->sp_mus_state = -1;
664 : }
665 128 : else if ( hSpMusClas->sp_mus_state > -HANG_LEN )
666 : {
667 : /* we are in inactive state */
668 68 : hSpMusClas->sp_mus_state--;
669 : }
670 : }
671 :
672 : /*------------------------------------------------------------------*
673 : * Decision without hangover
674 : * Weighted decision
675 : *------------------------------------------------------------------*/
676 :
677 : /* decision without hangover (0 - speech/noise, 1 - music) */
678 5150 : dec = lpm > lps;
679 5150 : dlp = lpm - lps;
680 :
681 5150 : if ( !vad )
682 : {
683 128 : dec = 0;
684 128 : dlp = 0;
685 : }
686 :
687 : /* calculate weight based on relE (close to 0.01 in low-E regions, close to 1 in high-E regions) */
688 5150 : wrelE = 1.0f + relE / 15;
689 :
690 5150 : if ( wrelE > 1.0f )
691 : {
692 1937 : wrelE = 1.0f;
693 : }
694 3213 : else if ( wrelE < 0.01f )
695 : {
696 1088 : wrelE = 0.01f;
697 : }
698 :
699 : /* calculate weight based on drops of dlp (close to 1 during sudden drops of dlp, close to 0 otherwise) */
700 5150 : if ( dlp < 0 && dlp < hSpMusClas->past_dlp[0] )
701 : {
702 1458 : if ( hSpMusClas->past_dlp[0] > 0 )
703 : {
704 468 : hSpMusClas->wdrop = -dlp;
705 : }
706 : else
707 : {
708 990 : hSpMusClas->wdrop += hSpMusClas->past_dlp[0] - dlp;
709 : }
710 : }
711 : else
712 : {
713 3692 : hSpMusClas->wdrop = 0;
714 : }
715 :
716 5150 : wdrop = hSpMusClas->wdrop / 20;
717 :
718 5150 : if ( wdrop > 1.0f )
719 : {
720 0 : wdrop = 1.0f;
721 : }
722 5150 : else if ( wdrop < 0.1f )
723 : {
724 4370 : wdrop = 0.1f;
725 : }
726 :
727 : /* combine weights into one */
728 5150 : wght = wrelE * wdrop;
729 5150 : if ( wght < 0.01f )
730 : {
731 1187 : wght = 0.01f;
732 : }
733 :
734 : /* calculate weighted decision */
735 5150 : hSpMusClas->wdlp_0_95_sp = wght * dlp + ( 1 - wght ) * hSpMusClas->wdlp_0_95_sp;
736 :
737 5150 : if ( hSpMusClas->sp_mus_state == -HANG_LEN )
738 : {
739 376 : hSpMusClas->wdlp_0_95_sp = 0;
740 : }
741 :
742 : /*------------------------------------------------------------------*
743 : * Final speech/music decision
744 : *------------------------------------------------------------------*/
745 :
746 5150 : if ( !vad && hSpMusClas->sp_mus_state == -HANG_LEN )
747 : {
748 : /* inactive state */
749 66 : dec = 0;
750 : }
751 5084 : else if ( hSpMusClas->sp_mus_state <= 0 )
752 : {
753 : /* transition from active to inactive state or instable state */
754 719 : dec = hSpMusClas->past_dec[0];
755 : }
756 4365 : else if ( hSpMusClas->sp_mus_state > 0 && hSpMusClas->sp_mus_state < HANG_LEN )
757 : {
758 : /* entry state -> final decision is calculated based on weighted average of past non-binary decisions */
759 225 : ftmp = w_spmus[hSpMusClas->sp_mus_state - 1][0] * dlp;
760 225 : ftmp += dotp( &w_spmus[hSpMusClas->sp_mus_state - 1][1], hSpMusClas->past_dlp, HANG_LEN - 1 );
761 225 : dec = ftmp > 2.0f;
762 : }
763 : else
764 : {
765 : /* stable active state */
766 4140 : if ( hSpMusClas->wdlp_0_95_sp > 0 && hSpMusClas->past_dec[0] == 0 && hSpMusClas->past_dec[1] == 0 && hSpMusClas->past_dec[2] == 0 )
767 : {
768 : /* switching from speech to music */
769 29 : dec = 1;
770 : }
771 4111 : else if ( hSpMusClas->past_dec[0] == 1 && hSpMusClas->wdlp_0_95_sp < 0 )
772 : {
773 : /* switching from music to speech */
774 28 : dec = 0;
775 : }
776 : else
777 : {
778 4083 : dec = hSpMusClas->past_dec[0];
779 : }
780 : }
781 :
782 : /*------------------------------------------------------------------*
783 : * Updates
784 : *------------------------------------------------------------------*/
785 :
786 : /* update buffer of past non-binary decisions */
787 5150 : mvr2r( &hSpMusClas->past_dlp[0], &hSpMusClas->past_dlp[1], HANG_LEN - 2 );
788 5150 : hSpMusClas->past_dlp[0] = dlp;
789 :
790 : /* update buffer of past binary decisions */
791 5150 : mvs2s( &hSpMusClas->past_dec[0], &hSpMusClas->past_dec[1], HANG_LEN - 2 );
792 5150 : hSpMusClas->past_dec[0] = dec;
793 :
794 5150 : return dec;
795 : }
796 :
797 :
798 : /*---------------------------------------------------------------------*
799 : * sp_mus_classif_2nd()
800 : * 2nd stage speech/music classifier (convert music to speech for onsets)
801 : * Runs stab_est(), var_cor_calc() and attack_det() on every call; may then change st->sp_aud_decision2, st->coder_type and *attack_flag
802 : *---------------------------------------------------------------------*/
803 :
804 3050 : static void sp_mus_classif_2nd(
805 : Encoder_State *st, /* i/o: encoder state structure */
806 : const float Etot, /* i : total frame energy */
807 : int16_t *attack_flag, /* i/o: attack flag (GSC or TC); always reset to 0 first, so the incoming value is not used */
808 : const float *inp /* i : input signal */
809 : )
810 : {
811 : int16_t attack;
812 3050 : SP_MUS_CLAS_HANDLE hSpMusClas = st->hSpMusClas;
813 :
814 : /* initialization */
815 3050 : *attack_flag = 0;
816 :
817 : /* signal stability estimation */
818 3050 : stab_est( Etot, hSpMusClas->gsc_lt_diff_etot, &hSpMusClas->gsc_mem_etot, &hSpMusClas->gsc_nb_thr_3, &hSpMusClas->gsc_nb_thr_1, hSpMusClas->gsc_thres, &hSpMusClas->gsc_last_music_flag, st->vad_flag );
819 :
820 : /* calculate variance of correlation */
821 3050 : var_cor_calc( st->old_corr, &hSpMusClas->mold_corr, hSpMusClas->var_cor_t, &hSpMusClas->high_stable_cor );
822 :
823 : /* attack detection (st->clas is passed as both last_clas and clas; element_mode is fixed to EVS_MONO, so the element_mode > EVS_MONO branch of attack_det() is never taken from here) */
824 3050 : attack = attack_det( inp, st->clas, st->localVAD, st->coder_type, st->total_brate, EVS_MONO, st->clas, hSpMusClas->finc_prev, &hSpMusClas->lt_finc, &hSpMusClas->last_strong_attack );
825 :
826 : /* change decision from music to speech in certain special cases (only when the 1st stage decided music, i.e. sp_aud_decision1 == 1) */
827 3050 : if ( st->sp_aud_decision1 == 1 )
828 : {
829 1022 : if ( hSpMusClas->ener_RAT < 0.18f && hSpMusClas->lt_dec_thres > 15.0f )
830 : {
831 : /* strong music decision but almost no content below 1kHz */
832 0 : st->sp_aud_decision2 = 0;
833 : }
834 1022 : else if ( hSpMusClas->high_stable_cor && st->pitch[0] >= 130 )
835 : {
836 : /* prevent GSC in highly correlated signal with low energy variation */
837 : /* this is basically a patch against bassoon-type of music */
838 0 : st->sp_aud_decision2 = 0;
839 :
840 0 : if ( st->codec_mode == MODE1 && st->coder_type == TRANSITION )
841 : {
842 0 : st->coder_type = GENERIC;
843 : }
844 : }
845 1022 : else if ( hSpMusClas->gsc_lt_diff_etot[MAX_LT - 1] > 4.5f && ( hSpMusClas->gsc_lt_diff_etot[MAX_LT - 1] - hSpMusClas->gsc_lt_diff_etot[MAX_LT - 2] > 10.0f ) )
846 : {
847 31 : if ( st->tc_cnt == 1 )
848 : {
849 : /* do TC coding instead of GC/VC if onset has been already declared before */
850 0 : st->sp_aud_decision2 = 0;
851 :
852 0 : if ( st->codec_mode == MODE1 )
853 : {
854 0 : st->coder_type = TRANSITION;
855 : }
856 : }
857 : else
858 : {
859 31 : if ( attack >= ATT_3LSUB_POS )
860 : {
861 : /* do TC coding if attack is located in the last subframe */
862 9 : st->sp_aud_decision2 = 0;
863 9 : *attack_flag = attack + 1;
864 :
865 9 : if ( st->codec_mode == MODE1 )
866 : {
867 9 : st->coder_type = TRANSITION;
868 : }
869 : }
870 22 : else if ( attack >= ATT_SEG_LEN / 2 )
871 : {
872 : /* do GSC coding if attack is located after the first quarter of the first subframe */
873 : /* (pre-echo will be treated at the decoder side) */
874 : /* NOTE(review): attack is a segment index returned by attack_det() while ATT_SEG_LEN is used there as a segment length in samples - confirm the intended threshold */
875 0 : st->sp_aud_decision2 = 1;
876 0 : *attack_flag = 31; /* NOTE(review): meaning of the value 31 is not visible here - confirm against the consumer of attack_flag */
877 : }
878 : }
879 : }
880 : }
881 2028 : else if ( st->localVAD == 1 && st->coder_type == GENERIC && ( ( attack >= ATT_3LSUB_POS && st->total_brate < ACELP_24k40 ) || ( attack >= ATT_3LSUB_POS_16k && st->total_brate >= ACELP_24k40 && st->total_brate < ACELP_48k ) ) )
882 : {
883 : /* do TC coding if attack is located in the last subframe (the position threshold depends on the bitrate range) */
884 39 : *attack_flag = attack + 1;
885 :
886 39 : if ( st->codec_mode == MODE1 )
887 : {
888 39 : st->coder_type = TRANSITION;
889 : }
890 : }
891 :
892 3050 : return;
893 : }
893 :
894 :
895 : /*---------------------------------------------------------------------*
896 : * tonal_det()
897 : *
898 : * Tonal detector based on spectral stability and harmonicity
899 : *---------------------------------------------------------------------*/
900 : /*! r: long-term smoothed mean of the smoothed map (same value as written to *tod_S_mass_lt) */
901 322167 : static float tonal_det(
902 : const float S[], /* i : input map, TOD_NSPEC values are read (presumably the short-term correlation map of the caller - confirm) */
903 : int16_t vad_flag, /* i : VAD flag, drives the adaptive weight */
904 : float tod_S_map_lt[], /* i/o: long-term smoothed map, TOD_NSPEC values */
905 : float *tod_thr_lt, /* i/o: adaptive threshold, limited by THR_MASS_MAX and THR_MASS_MIN */
906 : float *tod_weight, /* i/o: adaptive smoothing weight, limited above by TON_ALPHA and below by 1 - TON_ALPHA */
907 : float *tod_S_mass_prev, /* i/o: mean of the smoothed map from the previous call (value before long-term smoothing) */
908 : float *tod_S_mass_lt ) /* i/o: long-term smoothed mean of the smoothed map */
909 : {
910 : int16_t i;
911 : float S_mass, alpha;
912 :
913 : /* update the adaptive weight (first-order smoothing of vad_flag, then limited) */
914 322167 : *tod_weight = TON_ALPHA * *tod_weight + ( 1 - TON_ALPHA ) * vad_flag;
915 322167 : if ( *tod_weight > TON_ALPHA )
916 : {
917 198079 : *tod_weight = TON_ALPHA;
918 : }
919 124088 : else if ( *tod_weight < ( 1 - TON_ALPHA ) )
920 : {
921 21793 : *tod_weight = 1 - TON_ALPHA;
922 : }
923 :
924 : /* calculate LT spectral correlation in each band up to 4 kHz */
925 322167 : S_mass = 0.0f;
926 26095527 : for ( i = 0; i < TOD_NSPEC; i++ )
927 : {
928 25773360 : tod_S_map_lt[i] = *tod_weight * tod_S_map_lt[i] + ( 1 - *tod_weight ) * S[i];
929 :
930 25773360 : S_mass += tod_S_map_lt[i];
931 : }
932 322167 : S_mass /= TOD_NSPEC; /* mean over all TOD_NSPEC bands */
933 :
934 322167 : if ( S_mass > *tod_S_mass_prev )
935 : {
936 154107 : alpha = 0.7f; /* mean is rising -> keep more of the old long-term value */
937 : }
938 : else
939 : {
940 168060 : alpha = 0.3f; /* mean is not rising -> follow the new value faster */
941 : }
942 322167 : *tod_S_mass_prev = S_mass; /* stored before S_mass is replaced by the long-term value */
943 322167 : *tod_S_mass_lt = alpha * *tod_S_mass_lt + ( 1 - alpha ) * S_mass;
944 322167 : S_mass = *tod_S_mass_lt;
945 :
946 : /* updating adaptive decision threshold (lowered while the long-term mean is above it, raised otherwise) */
947 322167 : if ( S_mass > *tod_thr_lt )
948 : {
949 3491 : *tod_thr_lt -= THR_MASS_STEP_DN;
950 : }
951 : else
952 : {
953 318676 : *tod_thr_lt += THR_MASS_STEP_UP;
954 : }
955 :
956 322167 : if ( *tod_thr_lt > THR_MASS_MAX )
957 : {
958 317788 : *tod_thr_lt = THR_MASS_MAX;
959 : }
960 :
961 322167 : if ( *tod_thr_lt < THR_MASS_MIN )
962 : {
963 3004 : *tod_thr_lt = THR_MASS_MIN;
964 : }
965 :
966 322167 : return S_mass;
967 : }
968 :
969 : /*---------------------------------------------------------------------*
970 : * var_cor_calc()
971 : *
972 : * Calculate variance of correlation and flag highly-correlated stable signals
973 : *---------------------------------------------------------------------*/
974 :
975 325217 : static void var_cor_calc(
976 : const float old_corr, /* i : newest correlation value, stored at var_cor_t[0] */
977 : float *mold_corr, /* i/o: running average of the correlation */
978 : float var_cor_t[], /* i/o: buffer of past correlation values, VAR_COR_LEN entries, newest first */
979 : int16_t *high_stable_cor ) /* o : 1 if the average correlation is > 0.8 and the variance is < 5e-4, 0 otherwise */
980 : {
981 : int16_t i;
982 : float var_cor;
983 :
984 : /* update buffer of old correlation values (shift by one, oldest value is dropped) */
985 3252170 : for ( i = VAR_COR_LEN - 1; i > 0; i-- )
986 : {
987 2926953 : var_cor_t[i] = var_cor_t[i - 1];
988 : }
989 325217 : var_cor_t[i] = old_corr; /* i == 0 after the loop */
990 :
991 : /* calculate variance of correlation */
992 325217 : var_cor = var( var_cor_t, VAR_COR_LEN );
993 :
994 : /* set flag in case of highly-correlated stable signal (tested against the average from before this call's update) */
995 325217 : if ( *mold_corr > 0.8f && var_cor < 5e-4f )
996 : {
997 10692 : *high_stable_cor = 1;
998 : }
999 : else
1000 : {
1001 314525 : *high_stable_cor = 0;
1002 : }
1003 :
1004 : /* update average correlation */
1005 325217 : *mold_corr = 0.1f * old_corr + 0.9f * *mold_corr;
1006 :
1007 325217 : return;
1008 : }
1009 :
1010 : /*---------------------------------------------------------------------*
1011 : * attack_det()
1012 : *
1013 : * Attack detection based on the energies of ATT_NSEG consecutive segments of ATT_SEG_LEN samples
1014 : *---------------------------------------------------------------------*/
1015 : /*! r: index of the strongest energy segment, or 0 when no attack is retained */
1016 325217 : static int16_t attack_det(
1017 : const float *inp, /* i : input signal */
1018 : const int16_t last_clas, /* i : last signal class */
1019 : const int16_t localVAD, /* i : local VAD flag */
1020 : const int16_t coder_type, /* i : coder type */
1021 : const int32_t total_brate, /* i : total bitrate */
1022 : const int16_t element_mode, /* i : IVAS element mode */
1023 : const int16_t clas, /* i : signal class */
1024 : float finc_prev[], /* i/o: previous finc */
1025 : float *lt_finc, /* i/o: long-term mean finc */
1026 : int16_t *last_strong_attack /* i/o: last strong attack flag */
1027 : )
1028 : {
1029 : int16_t i, attack;
1030 : float etmp, etmp2, finc[ATT_NSEG];
1031 : int16_t att_3lsub_pos;
1032 : int16_t attack1;
1033 :
1034 325217 : att_3lsub_pos = ATT_3LSUB_POS;
1035 325217 : if ( total_brate >= ACELP_24k40 )
1036 : {
1037 1000 : att_3lsub_pos = ATT_3LSUB_POS_16k;
1038 : }
1039 :
1040 : /* compute energy per section */
1041 10732161 : for ( i = 0; i < ATT_NSEG; i++ )
1042 : {
1043 10406944 : finc[i] = sum2_f( inp + i * ATT_SEG_LEN, ATT_SEG_LEN );
1044 : }
1045 :
1046 325217 : attack = maximum( finc, ATT_NSEG, &etmp ); /* presumably the index of the largest finc[] value - maximum() is defined elsewhere */
1047 325217 : attack1 = attack; /* keep the raw position; attack itself may be cleared below */
1048 :
1049 325217 : if ( localVAD == 1 && coder_type == GENERIC )
1050 : {
1051 : /* compute mean energy in the first three subframes (first att_3lsub_pos segments) */
1052 158276 : etmp = mean( finc, att_3lsub_pos );
1053 :
1054 : /* compute mean energy after the attack (from the attack segment to the end of the frame) */
1055 158276 : etmp2 = mean( finc + attack, ATT_NSEG - attack );
1056 :
1057 : /* and compare them */
1058 158276 : if ( etmp * 8 > etmp2 )
1059 : {
1060 : /* stop, if the attack is not sufficiently strong */
1061 152695 : attack = 0;
1062 : }
1063 :
1064 158276 : if ( last_clas == VOICED_CLAS && etmp * 20 > etmp2 )
1065 : {
1066 : /* stop, if the signal was voiced and the attack is not sufficiently strong */
1067 34854 : attack = 0;
1068 : }
1069 :
1070 : /* compare wrt. other sections (reduces misclassification) */
1071 158276 : if ( attack > 0 )
1072 : {
1073 5140 : etmp2 = finc[attack];
1074 :
1075 105836 : for ( i = 2; i < att_3lsub_pos - 2; i++ )
1076 : {
1077 101025 : if ( finc[i] * 2.0f > etmp2 )
1078 : {
1079 : /* stop, if the attack is not sufficiently strong */
1080 329 : attack = 0;
1081 329 : break;
1082 : }
1083 : }
1084 : }
1085 :
1086 158276 : if ( attack == 0 && element_mode > EVS_MONO && ( clas < VOICED_TRANSITION || clas == ONSET ) )
1087 : {
1088 104592 : mvr2r( finc, finc_prev, attack1 ); /* presumably copies the first attack1 current energies over finc_prev (mvr2r taken as src, dst, count) - confirm */
1089 :
1090 : /* compute mean energy before the attack */
1091 104592 : etmp = mean( finc_prev, ATT_NSEG );
1092 :
1093 104592 : etmp2 = finc[attack1];
1094 :
1095 104592 : if ( ( etmp * 16 < etmp2 ) || ( etmp * 12 < etmp2 && last_clas == UNVOICED_CLAS ) )
1096 : {
1097 5288 : attack = attack1; /* second chance: restore the raw attack position */
1098 : }
1099 :
1100 104592 : if ( 20 * *lt_finc > etmp2 || *last_strong_attack )
1101 : {
1102 96249 : attack = 0; /* too weak against the long-term mean, or a strong attack was flagged on the previous call */
1103 : }
1104 : }
1105 :
1106 158276 : *last_strong_attack = attack;
1107 : }
1108 :
1109 : /* compare wrt. other sections (reduces misclassification) */
1110 166941 : else if ( attack > 0 )
1111 : {
1112 1909292 : for ( i = 2; i < att_3lsub_pos - 2; i++ )
1113 : {
1114 1854782 : if ( i != attack && finc[i] * 1.3f > finc[attack] )
1115 : {
1116 : /* stop, if the attack is not sufficiently strong */
1117 97315 : attack = 0;
1118 97315 : break;
1119 : }
1120 : }
1121 151825 : *last_strong_attack = 0; /* note: left untouched when this branch is skipped because attack == 0 */
1122 : }
1123 :
1124 : /* updates */
1125 325217 : mvr2r( finc, finc_prev, ATT_NSEG );
1126 325217 : *lt_finc = 0.95f * *lt_finc + 0.05f * mean( finc, ATT_NSEG );
1127 :
1128 325217 : return attack;
1129 : }
1130 :
1131 : /*---------------------------------------------------------------------*
1132 : * ivas_smc_gmm()
1133 : *
1134 : * 1st stage of the speech/music classification (based on the GMM model)
1135 : *---------------------------------------------------------------------*/
1136 :
1137 : /*! r: S/M decision (0=speech or noise,1=unclear,2=music) */
1138 846471 : int16_t ivas_smc_gmm(
1139 : Encoder_State *st, /* i/o: state structure */
1140 : STEREO_CLASSIF_HANDLE hStereoClassif, /* i/o: stereo classifier structure */
1141 : const int16_t localVAD_HE_SAD, /* i : HE-SAD flag without hangover */
1142 : const float Etot, /* i : total frame energy */
1143 : const float lsp_new[M], /* i : LSPs in current frame */
1144 : const float cor_map_sum, /* i : correlation map sum (from multi-harmonic anal.) */
1145 : const float epsP[M + 1], /* i : LP prediction error */
1146 : const float PS[], /* i : energy spectrum */
1147 : const float non_sta, /* i : unbound non-stationarity */
1148 : const float relE, /* i : relative frame energy */
1149 : int16_t *high_lpn_flag, /* i/o: sp/mus LPN flag */
1150 : const int16_t flag_spitch /* i : flag to indicate very short stable pitch */
1151 : )
1152 : {
1153 : int16_t i, m, dec;
1154 : int16_t flag_odv;
1155 : float lps, lpm, lpn;
1156 : float ps[N_SMC_MIXTURES], pm[N_SMC_MIXTURES], pn[N_SMC_MIXTURES];
1157 : float fvm[N_PCA_COEF], lprob;
1158 : float dlp, ftmp, sum_PS, ps_diff, ps_sta, wrelE, wdrop, wght;
1159 : float wrise;
1160 : float dlp_mean2var;
1161 : float FV[N_SMC_FEATURES], *pFV, PS_norm[128], dPS[128];
1162 : const float *pODV;
1163 : float *pFV_st, smc_st_mean_fact;
1164 : int16_t relE_attack_flag;
1165 : int16_t j, len;
1166 : const float *pt_mel_fb;
1167 : float melS[NB_MEL_BANDS], mfcc[NB_MEL_BANDS];
1168 : int16_t odv_cnt;
1169 : int16_t i_out[N_SMC_FEATURES], *p_out;
1170 :
1171 : /*------------------------------------------------------------------*
1172 : * Initialization
1173 : *------------------------------------------------------------------*/
1174 :
1175 846471 : SP_MUS_CLAS_HANDLE hSpMusClas = st->hSpMusClas;
1176 :
1177 : /*------------------------------------------------------------------*
1178 : * State machine (sp_mus_state: -8 = INACTIVE, -7:-1 = UNSTABLE, 0:7 = ENTRY, 8 = STABLE )
1179 : *------------------------------------------------------------------*/
1180 :
1181 846471 : if ( localVAD_HE_SAD )
1182 : {
1183 723440 : if ( relE < -20 )
1184 : {
1185 68617 : if ( hSpMusClas->sp_mus_state > 0 )
1186 : {
1187 9346 : if ( hSpMusClas->sp_mus_state < HANG_LEN )
1188 : {
1189 : /* energy is too low but we are in entry period -> reset the inactive counter to allow new entry later */
1190 2227 : hSpMusClas->inact_cnt = 0;
1191 : }
1192 :
1193 : /* energy is too low -> we are going to unstable state */
1194 9346 : hSpMusClas->sp_mus_state = 0;
1195 : }
1196 59271 : else if ( hSpMusClas->sp_mus_state > -HANG_LEN )
1197 : {
1198 : /* energy is still too low -> we are still in unstable state */
1199 22505 : hSpMusClas->sp_mus_state--;
1200 : }
1201 : }
1202 654823 : else if ( hSpMusClas->sp_mus_state <= 0 )
1203 : {
1204 20164 : if ( hSpMusClas->inact_cnt == 0 )
1205 : {
1206 : /* inactive counter has expired -> start a new entry period */
1207 13412 : hSpMusClas->sp_mus_state = 1;
1208 : }
1209 : else
1210 : {
1211 : /* inactive counter is still running -> go directly to the stable state */
1212 6752 : hSpMusClas->sp_mus_state = HANG_LEN;
1213 : }
1214 :
1215 20164 : hSpMusClas->inact_cnt = 12;
1216 : }
1217 634659 : else if ( hSpMusClas->sp_mus_state > 0 && hSpMusClas->sp_mus_state < HANG_LEN )
1218 : {
1219 : /* we are inside an entry period -> increment the counter of entry frames */
1220 54484 : hSpMusClas->sp_mus_state++;
1221 : }
1222 :
1223 723440 : if ( hSpMusClas->sp_mus_state < 0 && hSpMusClas->inact_cnt > 0 )
1224 : {
1225 23279 : hSpMusClas->inact_cnt--;
1226 : }
1227 : }
1228 : else
1229 : {
1230 123031 : if ( hSpMusClas->sp_mus_state > 0 && hSpMusClas->sp_mus_state < HANG_LEN )
1231 : {
1232 632 : hSpMusClas->inact_cnt = 0;
1233 : }
1234 122399 : else if ( hSpMusClas->inact_cnt > 0 )
1235 : {
1236 17101 : hSpMusClas->inact_cnt--;
1237 : }
1238 :
1239 123031 : if ( hSpMusClas->sp_mus_state > 0 && hSpMusClas->sp_mus_state < HANG_LEN )
1240 : {
1241 632 : hSpMusClas->sp_mus_state = -HANG_LEN;
1242 : }
1243 122399 : else if ( hSpMusClas->sp_mus_state > 0 )
1244 : {
1245 2172 : hSpMusClas->sp_mus_state = -1;
1246 : }
1247 120227 : else if ( hSpMusClas->sp_mus_state > -HANG_LEN )
1248 : {
1249 : /* we are in inactive state */
1250 11718 : hSpMusClas->sp_mus_state--;
1251 : }
1252 : }
1253 :
1254 : /* detect attacks based on relE (accumulate consecutive rises of relE, reset on any non-rise) */
1255 846471 : if ( relE > hSpMusClas->prev_relE )
1256 : {
1257 361404 : hSpMusClas->relE_attack_sum += relE - hSpMusClas->prev_relE;
1258 : }
1259 : else
1260 : {
1261 485067 : hSpMusClas->relE_attack_sum = 0;
1262 : }
1263 846471 : hSpMusClas->prev_relE = relE;
1264 :
1265 : /* update counter from last VAD 0->1 change (counts up to 50 while VAD stays 1, otherwise reset to 0) */
1266 846471 : if ( hSpMusClas->prev_vad == 0 && localVAD_HE_SAD == 1 )
1267 : {
1268 15129 : hSpMusClas->vad_0_1_cnt = 1;
1269 : }
1270 831342 : else if ( localVAD_HE_SAD == 1 && hSpMusClas->vad_0_1_cnt > 0 && hSpMusClas->vad_0_1_cnt < 50 )
1271 : {
1272 194777 : hSpMusClas->vad_0_1_cnt++;
1273 : }
1274 : else
1275 : {
1276 636565 : hSpMusClas->vad_0_1_cnt = 0;
1277 : }
1278 846471 : hSpMusClas->prev_vad = localVAD_HE_SAD;
1279 :
1280 846471 : if ( hSpMusClas->sp_mus_state > 0 && hSpMusClas->sp_mus_state < HANG_LEN && hSpMusClas->relE_attack_sum > 5.0f )
1281 : {
1282 22344 : hSpMusClas->relE_attack_cnt++;
1283 :
1284 : /* set flag only in the first X frames in a series */
1285 22344 : if ( hSpMusClas->relE_attack_cnt > 0 && hSpMusClas->relE_attack_cnt < 3 )
1286 : {
1287 16650 : relE_attack_flag = 1;
1288 : }
1289 : else
1290 : {
1291 5694 : relE_attack_flag = 0;
1292 : }
1293 : }
1294 : else
1295 : {
1296 824127 : hSpMusClas->relE_attack_cnt = 0;
1297 824127 : relE_attack_flag = 0;
1298 : }
1299 :
1300 846471 : hSpMusClas->prev_Etot = Etot;
1301 :
1302 : /*------------------------------------------------------------------*
1303 : * Preparation of the feature vector
1304 : *------------------------------------------------------------------*/
1305 :
1306 846471 : pFV = FV;
1307 :
1308 : /* [0] OL pitch (last value only during attacks/onsets, otherwise the mean of the three values) */
1309 846471 : if ( relE_attack_flag || st->tc_cnt == 1 || st->tc_cnt == 2 )
1310 : {
1311 92105 : *pFV++ = (float) st->pitch[2];
1312 : }
1313 : else
1314 : {
1315 754366 : *pFV++ = (float) ( st->pitch[0] + st->pitch[1] + st->pitch[2] ) / 3.0f;
1316 : }
1317 :
1318 : /* [1] voicing (same selection rule as for the OL pitch) */
1319 846471 : if ( relE_attack_flag || st->tc_cnt == 1 || st->tc_cnt == 2 )
1320 : {
1321 92105 : *pFV++ = st->voicing[2];
1322 : }
1323 : else
1324 : {
1325 754366 : *pFV++ = ( st->voicing[0] + st->voicing[1] + st->voicing[2] ) / 3.0f;
1326 : }
1327 :
1328 : /* [2,3,4,5,6] LSFs (arccos of lsp_new[2..6]) */
1329 846471 : *pFV++ = acosf( lsp_new[2] );
1330 846471 : *pFV++ = acosf( lsp_new[3] );
1331 846471 : *pFV++ = acosf( lsp_new[4] );
1332 846471 : *pFV++ = acosf( lsp_new[5] );
1333 846471 : *pFV++ = acosf( lsp_new[6] );
1334 :
1335 : /* [7] cor_map_sum */
1336 846471 : *pFV++ = cor_map_sum;
1337 :
1338 : /* [8] non_sta */
1339 846471 : *pFV++ = non_sta;
1340 :
1341 : /* [9] epsP (log ratio of epsP[14] to epsP[0]) */
1342 846471 : *pFV++ = logf( epsP[14] + 1e-5f ) - logf( epsP[0] + 1e-5f );
1343 :
1344 : /* [10,11,12] MFCCs (coefficients 2, 6 and 12) */
1345 846471 : set_zero( melS, NB_MEL_BANDS );
1346 846471 : pt_mel_fb = mel_fb;
1347 34705311 : for ( i = 0; i < NB_MEL_BANDS; i++ )
1348 : {
1349 33858840 : j = mel_fb_start[i];
1350 33858840 : len = mel_fb_len[i];
1351 33858840 : melS[i] = logf( dotp( &PS[j], pt_mel_fb, len ) + 1e-5f );
1352 33858840 : pt_mel_fb += len;
1353 : }
1354 :
1355 846471 : v_mult_mat( mfcc, melS, dct_mtx, NB_MEL_BANDS, NB_MEL_COEF );
1356 :
1357 846471 : *pFV++ = mfcc[2];
1358 846471 : *pFV++ = mfcc[6];
1359 846471 : *pFV++ = mfcc[12];
1360 :
1361 : /* calculation of differential normalized power spectrum */
1362 846471 : sum_PS = 1e-5f;
1363 57560028 : for ( i = LOWEST_FBIN; i < HIGHEST_FBIN; i++ )
1364 : {
1365 56713557 : sum_PS += PS[i];
1366 : }
1367 :
1368 57560028 : for ( i = LOWEST_FBIN; i < HIGHEST_FBIN; i++ )
1369 : {
1370 56713557 : PS_norm[i] = PS[i] / sum_PS;
1371 56713557 : dPS[i] = fabsf( PS_norm[i] - hSpMusClas->past_PS[i - LOWEST_FBIN] );
1372 : }
1373 :
1374 : /* [13] ps_diff (spectral difference) */
1375 846471 : ps_diff = 0;
1376 57560028 : for ( i = LOWEST_FBIN; i < HIGHEST_FBIN; i++ )
1377 : {
1378 56713557 : ps_diff += dPS[i];
1379 : }
1380 :
1381 846471 : *pFV++ = ps_diff;
1382 :
1383 : /* [14] ps_sta (spectral stationarity) */
1384 846471 : ps_sta = 0;
1385 57560028 : for ( i = LOWEST_FBIN; i < HIGHEST_FBIN; i++ )
1386 : {
1387 56713557 : if ( PS_norm[i] > hSpMusClas->past_PS[i - LOWEST_FBIN] )
1388 : {
1389 26949557 : ps_sta += PS_norm[i] / ( dPS[i] + 1e-5f );
1390 : }
1391 : else
1392 : {
1393 29764000 : ps_sta += hSpMusClas->past_PS[i - LOWEST_FBIN] / ( dPS[i] + 1e-5f );
1394 : }
1395 : }
1396 :
1397 846471 : *pFV++ = logf( ps_sta + 1e-5f );
1398 846471 : mvr2r( &PS_norm[LOWEST_FBIN], hSpMusClas->past_PS, HIGHEST_FBIN - LOWEST_FBIN );
1399 :
1400 : /* save ps_diff and ps_sta features for XTALK and UNCLR classifier */
1401 846471 : if ( hStereoClassif != NULL )
1402 : {
1403 567678 : if ( st->idchan == 0 )
1404 : {
1405 309015 : hStereoClassif->ps_diff_ch1 = ps_diff;
1406 309015 : hStereoClassif->ps_sta_ch1 = logf( ps_sta + 1e-5f );
1407 : }
1408 : else
1409 : {
1410 258663 : hStereoClassif->ps_diff_ch2 = ps_diff;
1411 258663 : hStereoClassif->ps_sta_ch2 = logf( ps_sta + 1e-5f );
1412 : }
1413 : }
1414 :
1415 : /*------------------------------------------------------------------*
1416 : * Outlier detection based on feature histograms
1417 : *------------------------------------------------------------------*/
1418 :
1419 846471 : flag_odv = 0;
1420 846471 : if ( localVAD_HE_SAD )
1421 : {
1422 723440 : pFV = FV;
1423 723440 : pODV = hout_intervals;
1424 723440 : p_out = i_out;
1425 723440 : odv_cnt = 0;
1426 11575040 : for ( i = 0; i < N_SMC_FEATURES; i++ )
1427 : {
1428 10851600 : if ( *pFV < pODV[0] || *pFV > pODV[1] )
1429 : {
1430 2822 : *p_out++ = i;
1431 2822 : odv_cnt++;
1432 : }
1433 :
1434 10851600 : pFV++;
1435 10851600 : pODV += 2;
1436 : }
1437 :
1438 : /* set outlier flag (at least two features are outside their intervals) */
1439 723440 : if ( odv_cnt >= 2 )
1440 : {
1441 545 : flag_odv = 1;
1442 :
1443 : /* replace outlying features with values from the previous frame */
1444 1925 : for ( i = 0; i < odv_cnt; i++ )
1445 : {
1446 1380 : FV[i_out[i]] = hSpMusClas->prev_FV[i_out[i]];
1447 : }
1448 : }
1449 : }
1450 :
1451 : /*------------------------------------------------------------------*
1452 : * Adaptive short-term mean filter on feature vector
1453 : *------------------------------------------------------------------*/
1454 :
1455 846471 : pFV = FV;
1456 846471 : pFV_st = hSpMusClas->FV_st;
1457 846471 : smc_st_mean_fact = SMC_ST_MEAN_FACT;
1458 13543536 : for ( i = 0; i < N_SMC_FEATURES; i++ )
1459 : {
1460 12697065 : *pFV_st = smc_st_mean_fact * ( *pFV_st ) + ( 1 - smc_st_mean_fact ) * ( *pFV );
1461 :
1462 12697065 : if ( hSpMusClas->sp_mus_state > 0 && hSpMusClas->sp_mus_state < HANG_LEN && ( relE_attack_flag || flag_odv ) )
1463 : {
1464 : /* strong attack or outlier frame during entry state -> features cannot be trusted but there is also no useful past info -> */
1465 : /* -> do whatever you want because dlp will be reset to 0 anyway */
1466 249780 : pFV++;
1467 249780 : pFV_st++;
1468 : }
1469 12447285 : else if ( hSpMusClas->sp_mus_state == HANG_LEN && ( st->tc_cnt == 1 || st->tc_cnt == 2 ) )
1470 : {
1471 : /* energy attack in stable state -> use current features instead of the long-term average */
1472 1027785 : pFV++;
1473 1027785 : pFV_st++;
1474 : }
1475 : else
1476 : {
1477 11419500 : *pFV++ = *pFV_st++;
1478 : }
1479 : }
1480 :
1481 : /* update */
1482 846471 : mvr2r( FV, hSpMusClas->prev_FV, N_SMC_FEATURES );
1483 :
1484 : /*------------------------------------------------------------------*
1485 : * Non-linear power transformation (boxcox) on certain features
1486 : *------------------------------------------------------------------*/
1487 :
1488 846471 : pFV = FV;
1489 13543536 : for ( i = 0; i < N_SMC_FEATURES; i++ )
1490 : {
1491 12697065 : if ( bcox_lmbd[i] != 0 )
1492 : {
1493 2539413 : *pFV -= bcox_add_cnst[i];
1494 2539413 : if ( *pFV < 1 )
1495 : {
1496 77738 : *pFV = 1;
1497 : }
1498 2539413 : *pFV = ( powf( *pFV, bcox_lmbd[i] ) - 1 ) / bcox_lmbd[i];
1499 : }
1500 :
1501 12697065 : pFV++;
1502 : }
1503 :
1504 : /*------------------------------------------------------------------*
1505 : * Scaling of the feature vector
1506 : * PCA
1507 : *------------------------------------------------------------------*/
1508 :
1509 846471 : pFV = FV;
1510 13543536 : for ( i = 0; i < N_SMC_FEATURES; i++ )
1511 : {
1512 : /* Standard scaler - mean and variance normalization */
1513 12697065 : *pFV = ( *pFV - sm_means[i] ) / sm_scale[i];
1514 12697065 : pFV++;
1515 :
1516 : /* MinMax scaler - mean and variance normalization */
1517 : /**pFV = *pFV * sm_scale[i] + sm_min[i];*/
1518 : /*pFV++;*/
1519 : }
1520 :
1521 : /* PCA (NOTE(review): FV is both input and output of v_mult_mat() - relies on that helper being safe for in-place use; confirm) */
1522 846471 : v_sub( FV, pca_mean_, FV, N_SMC_FEATURES );
1523 846471 : v_mult_mat( FV, FV, pca_components_, N_SMC_FEATURES, N_PCA_COEF );
1524 :
1525 : /*------------------------------------------------------------------*
1526 : * Calculation of posterior probability
1527 : * Log-probability
1528 : *------------------------------------------------------------------*/
1529 :
1530 : /* run loop for all mixtures (for each mixture, calculate the probability of speech, music and noise) */
1531 846471 : lps = lpm = lpn = 0;
1532 5925297 : for ( m = 0; m < N_SMC_MIXTURES; m++ )
1533 : {
1534 5078826 : v_sub( FV, &means_speech[m * N_PCA_COEF], fvm, N_PCA_COEF );
1535 5078826 : lprob = dot_product_cholesky( fvm, &prec_chol_speech[m * ( N_PCA_COEF * N_PCA_COEF + N_PCA_COEF ) / 2], N_PCA_COEF );
1536 5078826 : ps[m] = logf( weights_speech[m] ) + log_det_chol_speech[m] - 0.5f * N_PCA_COEF * logf( PI2 ) - 0.5f * lprob;
1537 :
1538 5078826 : v_sub( FV, &means_music[m * N_PCA_COEF], fvm, N_PCA_COEF );
1539 5078826 : lprob = dot_product_cholesky( fvm, &prec_chol_music[m * ( N_PCA_COEF * N_PCA_COEF + N_PCA_COEF ) / 2], N_PCA_COEF );
1540 5078826 : pm[m] = logf( weights_music[m] ) + log_det_chol_music[m] - 0.5f * N_PCA_COEF * logf( PI2 ) - 0.5f * lprob;
1541 :
1542 5078826 : v_sub( FV, &means_noise[m * N_PCA_COEF], fvm, N_PCA_COEF );
1543 5078826 : lprob = dot_product_cholesky( fvm, &prec_chol_noise[m * ( N_PCA_COEF * N_PCA_COEF + N_PCA_COEF ) / 2], N_PCA_COEF );
1544 5078826 : pn[m] = logf( weights_noise[m] ) + log_det_chol_noise[m] - 0.5f * N_PCA_COEF * logf( PI2 ) - 0.5f * lprob;
1545 : }
1546 :
1547 846471 : lps = logsumexp( ps, N_SMC_MIXTURES );
1548 846471 : lpm = logsumexp( pm, N_SMC_MIXTURES );
1549 846471 : lpn = logsumexp( pn, N_SMC_MIXTURES );
1550 :
1551 846471 : *high_lpn_flag = 0; /* set to 1 only when the noise score exceeds both the speech and the music score */
1552 846471 : if ( lpn > lps && lpn > lpm )
1553 : {
1554 105527 : *high_lpn_flag = 1;
1555 : }
1556 :
1557 846471 : hSpMusClas->lpm = lpm;
1558 846471 : hSpMusClas->lps = lps;
1559 846471 : hSpMusClas->lpn = lpn;
1560 :
1561 : /* determine HQ Generic speech class */
1562 846471 : if ( st->hHQ_core != NULL )
1563 : {
1564 330047 : if ( lps > lpm + 0.5f )
1565 : {
1566 125290 : st->hHQ_core->hq_generic_speech_class = 1;
1567 : }
1568 : else
1569 : {
1570 204757 : st->hHQ_core->hq_generic_speech_class = 0;
1571 : }
1572 : }
1573 :
1574 : /*------------------------------------------------------------------*
1575 : * Decision without hangover
1576 : * Weighted decision
1577 : *------------------------------------------------------------------*/
1578 :
1579 : /* decision without hangover (0 - speech/noise, 1 - music) */
1580 846471 : if ( !localVAD_HE_SAD || Etot < 10 || ( hSpMusClas->sp_mus_state > 0 && hSpMusClas->sp_mus_state < HANG_LEN && ( relE_attack_flag || flag_odv ) ) )
1581 : {
1582 153126 : dlp = 0;
1583 : }
1584 : else
1585 : {
1586 693345 : dlp = lpm - lps + DLP_BIAS;
1587 :
1588 693345 : if ( dlp > 30.0f )
1589 : {
1590 26939 : dlp = 30.0f;
1591 : }
1592 666406 : else if ( dlp < -30.0f )
1593 : {
1594 0 : dlp = -30.0f;
1595 : }
1596 : }
1597 :
1598 846471 : dec = dlp > 0; /* preliminary binary decision; every branch of the final decision below overwrites it */
1599 :
1600 : /* calculate weight based on relE (higher relE -> lower weight, lower relE -> higher weight) */
1601 846471 : wrelE = lin_interp( relE, 15.0f, 0.9f, -15.0f, 0.99f, 1 );
1602 :
1603 : /* calculate weight based on drops of dlp (close to 1 during sudden drops of dlp, close to 0 otherwise) */
1604 846471 : hSpMusClas->dlp_mean_ST = 0.8f * hSpMusClas->dlp_mean_ST + 0.2f * dlp;
1605 846471 : hSpMusClas->lt_dec_thres = hSpMusClas->dlp_mean_ST;
1606 :
1607 846471 : if ( dlp < 0 && dlp < hSpMusClas->dlp_mean_ST )
1608 : {
1609 190956 : if ( hSpMusClas->dlp_mean_ST > 0 )
1610 : {
1611 59109 : hSpMusClas->wdrop = -dlp;
1612 : }
1613 131847 : else if ( hSpMusClas->wdrop > 0 )
1614 : {
1615 30612 : hSpMusClas->wdrop += hSpMusClas->dlp_mean_ST - dlp;
1616 : }
1617 : }
1618 : else
1619 : {
1620 655515 : hSpMusClas->wdrop = 0;
1621 : }
1622 :
1623 846471 : wdrop = lin_interp( hSpMusClas->wdrop, 15.0f, 0.7f, 0.0f, 1.0f, 1 );
1624 :
1625 : /* calculate weight based on rises of dlp (close to 1 during sudden rise of dlp, close to 0 otherwise) */
1626 846471 : if ( hSpMusClas->sp_mus_state == HANG_LEN && hSpMusClas->dlp_mean_ST > 0 && hSpMusClas->dlp_mean_ST > hSpMusClas->past_dlp_mean_ST[0] )
1627 : {
1628 189022 : if ( hSpMusClas->past_dlp_mean_ST[0] < 0 )
1629 : {
1630 10183 : hSpMusClas->wrise = hSpMusClas->dlp_mean_ST;
1631 : }
1632 178839 : else if ( hSpMusClas->wrise > 0 )
1633 : {
1634 25388 : hSpMusClas->wrise += hSpMusClas->dlp_mean_ST - hSpMusClas->past_dlp_mean_ST[0];
1635 : }
1636 : }
1637 : else
1638 : {
1639 657449 : hSpMusClas->wrise = 0;
1640 : }
1641 :
1642 846471 : wrise = lin_interp( hSpMusClas->wrise, 5.0f, 0.95f, 0.0f, 1.0f, 1 );
1643 :
1644 : /* combine weights into one */
1645 846471 : wght = wrelE * wdrop * wrise;
1646 :
1647 : /* ratio of delta means vs. delta variances (long-term statistics are re-initialized during the entry period) */
1648 846471 : if ( hSpMusClas->sp_mus_state > 0 && hSpMusClas->sp_mus_state < HANG_LEN )
1649 : {
1650 61812 : hSpMusClas->dlp_mean_LT = dlp;
1651 61812 : hSpMusClas->dlp_var_LT = 0;
1652 : }
1653 :
1654 846471 : hSpMusClas->dlp_mean_LT = 0.9f * hSpMusClas->dlp_mean_LT + 0.1f * dlp;
1655 846471 : ftmp = dlp - hSpMusClas->dlp_mean_LT;
1656 846471 : hSpMusClas->dlp_var_LT = 0.9f * hSpMusClas->dlp_var_LT + 0.1f * ( ftmp * ftmp );
1657 :
1658 846471 : if ( hSpMusClas->sp_mus_state > 0 && hSpMusClas->sp_mus_state < HANG_LEN )
1659 : {
1660 61812 : dlp_mean2var = 0;
1661 : }
1662 : else
1663 : {
1664 784659 : dlp_mean2var = fabsf( hSpMusClas->dlp_mean_LT ) / ( sqrtf( fabsf( hSpMusClas->dlp_var_LT ) ) + 1.0f );
1665 : }
1666 :
1667 846471 : if ( dlp_mean2var > 15.0f )
1668 : {
1669 : /* decrease the weight little bit when the classifier indicates "strong speech" or "strong music" */
1670 2843 : wght *= 0.9f;
1671 : }
1672 :
1673 846471 : if ( wght > 1.0f ) /* limit the weight to [0.01, 1.0] */
1674 : {
1675 0 : wght = 1.0f;
1676 : }
1677 846471 : else if ( wght < 0.01f )
1678 : {
1679 0 : wght = 0.01f;
1680 : }
1681 :
1682 846471 : if ( Etot < 10 )
1683 : {
1684 : /* silence (fixed weight, overrides the combined weight above) */
1685 97285 : wght = 0.92f;
1686 : }
1687 :
1688 : /* calculate weighted decision */
1689 846471 : hSpMusClas->wdlp_0_95_sp = wght * hSpMusClas->wdlp_0_95_sp + ( 1 - wght ) * dlp;
1690 :
1691 : /* xtalk classifier: apply long hysteresis to prevent LRTD on music */
1692 846471 : hSpMusClas->wdlp_xtalk = 0.995f * hSpMusClas->wdlp_xtalk + 0.005f * dlp;
1693 :
1694 : /*------------------------------------------------------------------*
1695 : * Final speech/music decision
1696 : *------------------------------------------------------------------*/
1697 :
1698 846471 : if ( flag_spitch )
1699 : {
1700 28523 : hSpMusClas->flag_spitch_cnt = 5;
1701 : }
1702 817948 : else if ( hSpMusClas->flag_spitch_cnt > 0 )
1703 : {
1704 5233 : hSpMusClas->flag_spitch_cnt--;
1705 : }
1706 :
1707 846471 : if ( Etot < 10 )
1708 : {
1709 : /* silence */
1710 97285 : dec = 0;
1711 : }
1712 749186 : else if ( hSpMusClas->sp_mus_state > 0 && hSpMusClas->sp_mus_state < HANG_LEN )
1713 : {
1714 : /* entry state -> final decision is calculated based on weighted average of past non-binary decisions */
1715 61812 : ftmp = w_spmus[hSpMusClas->sp_mus_state - 1][0] * dlp;
1716 61812 : ftmp += dotp( &w_spmus[hSpMusClas->sp_mus_state - 1][1], hSpMusClas->past_dlp, HANG_LEN - 1 );
1717 61812 : if ( ftmp > 2.0f )
1718 : {
1719 29406 : if ( dlp > 2.0f )
1720 : {
1721 20178 : dec = 2;
1722 : }
1723 : else
1724 : {
1725 9228 : dec = 1;
1726 : }
1727 : }
1728 : else
1729 : {
1730 32406 : dec = 0;
1731 : }
1732 : }
1733 : else
1734 : {
1735 : /* stable active state (decision moves at most one step per frame: speech <-> unclear <-> music) */
1736 687374 : if ( hSpMusClas->past_dec[0] == 0 && hSpMusClas->past_dec[1] == 0 && hSpMusClas->past_dec[2] == 0 &&
1737 332607 : ( ( hSpMusClas->flag_spitch_cnt > 0 && hSpMusClas->wdlp_0_95_sp > 3.4f ) || ( hSpMusClas->flag_spitch_cnt == 0 && hSpMusClas->wdlp_0_95_sp > 2.1f ) ) )
1738 : {
1739 : /* switching from speech to unclear */
1740 1171 : dec = 1;
1741 : }
1742 686203 : else if ( hSpMusClas->past_dec[0] == 0 && hSpMusClas->vad_0_1_cnt < 50 && hSpMusClas->relE_attack_sum == 0.0f && hSpMusClas->wdlp_0_95_sp > 1.0f )
1743 : {
1744 : /* switch from speech to unclear also during slowly rising weak music onsets */
1745 2314 : dec = 1;
1746 : }
1747 683889 : else if ( hSpMusClas->past_dec[0] == 1 && hSpMusClas->wdlp_0_95_sp > 2.5f )
1748 : {
1749 : /* switching from unclear to music */
1750 2815 : dec = 2;
1751 : }
1752 681074 : else if ( hSpMusClas->past_dec[0] == 2 && hSpMusClas->past_dec[1] == 2 && hSpMusClas->past_dec[2] == 2 && hSpMusClas->wdlp_0_95_sp < -1.0f )
1753 : {
1754 : /* switching from music to unclear */
1755 1637 : dec = 1;
1756 : }
1757 679437 : else if ( hSpMusClas->past_dec[0] == 1 && hSpMusClas->wdlp_0_95_sp < -2.5f )
1758 : {
1759 : /* switching from unclear to speech */
1760 1675 : dec = 0;
1761 : }
1762 : else
1763 : {
1764 677762 : dec = hSpMusClas->past_dec[0]; /* no switching condition met -> keep the previous decision */
1765 : }
1766 : }
1767 :
1768 : /*------------------------------------------------------------------*
1769 : * raw S/M decision based on smoothed GMM score
1770 : *------------------------------------------------------------------*/
1771 :
1772 846471 : if ( dec == 0 || st->hSpMusClas->wdlp_0_95_sp <= 0 )
1773 : {
1774 499122 : st->sp_aud_decision0 = 0;
1775 499122 : st->sp_aud_decision1 = 0;
1776 : }
1777 : else
1778 : {
1779 347349 : st->sp_aud_decision0 = 1;
1780 347349 : st->sp_aud_decision1 = 1;
1781 : }
1782 :
1783 : /*------------------------------------------------------------------*
1784 : * Updates
1785 : *------------------------------------------------------------------*/
1786 :
1787 : /* update buffer of past non-binary decisions */
1788 846471 : mvr2r( &hSpMusClas->past_dlp[0], &hSpMusClas->past_dlp[1], HANG_LEN - 2 );
1789 846471 : hSpMusClas->past_dlp[0] = dlp;
1790 :
1791 846471 : mvr2r( &hSpMusClas->past_dlp_mean_ST[0], &hSpMusClas->past_dlp_mean_ST[1], HANG_LEN - 2 );
1792 846471 : hSpMusClas->past_dlp_mean_ST[0] = hSpMusClas->dlp_mean_ST;
1793 :
1794 : /* update buffer of past binary decisions */
1795 846471 : mvs2s( &hSpMusClas->past_dec[0], &hSpMusClas->past_dec[1], HANG_LEN - 2 );
1796 846471 : hSpMusClas->past_dec[0] = dec;
1797 :
1798 : #ifdef DEBUG_MODE_INFO
1799 : dbgwrite( &st->hSpMusClas->wdlp_0_95_sp, sizeof( float ), 1, 1, "res/wdlp_0_95_sp.x" );
1800 : #endif
1801 :
1802 846471 : return dec;
1803 : }
1804 :
1805 : /*---------------------------------------------------------------------*
1806 : * ivas_smc_mode_selection()
1807 : *
1808 : * 2nd stage speech/music classifier (select coding mode (ACELP, GSC and TCX) based on S/M classification)
1809 : * output (sp_aud_decision1 - sp_aud_decision2 -> coding mode):
1810 : * 0 - 0 -> ACELP
1811 : * 1 - 0 -> GSC
1812 : * 1 - 1 -> TCX
1813 : *---------------------------------------------------------------------*/
1814 :
1815 322167 : void ivas_smc_mode_selection(
1816 : Encoder_State *st, /* i/o: encoder state structure */
1817 : const int32_t element_brate, /* i : element bitrate */
1818 : int16_t smc_dec, /* i : raw decision of the 1st stage classifier*/
1819 : const float relE, /* i : relative frame energy */
1820 : const float Etot, /* i : total frame energy */
1821 : int16_t *attack_flag, /* i/o: attack flag (GSC or TC) */
1822 : const float *inp, /* i : input signal */
1823 : const float S_map[], /* i : short-term correlation map */
1824 : const int16_t flag_spitch /* i : flag to indicate very short stable pitch*/
1825 : )
1826 : {
1827 : int16_t attack;
1828 : float ton;
1829 : int16_t i;
1830 : float S_p2a, S_max, S_ave;
1831 : float thr_sp2a;
1832 :
1833 322167 : SP_MUS_CLAS_HANDLE hSpMusClas = st->hSpMusClas;
1834 :
1835 : /* initialization */
1836 322167 : *attack_flag = 0;
1837 322167 : st->sp_aud_decision2 = 0;
1838 :
1839 : /* signal stability estimation */
1840 322167 : stab_est( Etot, hSpMusClas->gsc_lt_diff_etot, &hSpMusClas->gsc_mem_etot, &hSpMusClas->gsc_nb_thr_3, &hSpMusClas->gsc_nb_thr_1, hSpMusClas->gsc_thres, &hSpMusClas->gsc_last_music_flag, st->vad_flag );
1841 :
1842 : /* calculate variance of correlation */
1843 322167 : var_cor_calc( st->old_corr, &hSpMusClas->mold_corr, hSpMusClas->var_cor_t, &hSpMusClas->high_stable_cor );
1844 :
1845 : /* attack detection */
1846 322167 : attack = attack_det( inp, st->clas, st->localVAD, st->coder_type, 0, st->element_mode, st->clas, hSpMusClas->finc_prev, &hSpMusClas->lt_finc, &hSpMusClas->last_strong_attack );
1847 :
1848 : /* tonal detector */
1849 322167 : ton = tonal_det( S_map, st->vad_flag, hSpMusClas->tod_S_map_lt, &hSpMusClas->tod_thr_lt, &hSpMusClas->tod_weight, &hSpMusClas->tod_S_mass_prev, &hSpMusClas->tod_S_mass_lt );
1850 :
1851 :
1852 : /* calculate spectral peak-to-average ratio */
1853 26095527 : for ( i = 0; i < TOD_NSPEC; i++ )
1854 : {
1855 25773360 : st->hSpMusClas->tod_lt_Bin_E[i] = P2A_FACT * st->hSpMusClas->tod_lt_Bin_E[i] + ( 1 - P2A_FACT ) * st->Bin_E[i];
1856 : }
1857 :
1858 322167 : maximum( st->hSpMusClas->tod_lt_Bin_E, TOD_NSPEC, &S_max );
1859 322167 : S_ave = sum_f( st->hSpMusClas->tod_lt_Bin_E, TOD_NSPEC ) / TOD_NSPEC;
1860 322167 : S_p2a = S_max - S_ave;
1861 :
1862 322167 : if ( element_brate <= IVAS_16k4 )
1863 : {
1864 105676 : thr_sp2a = THR_P2A_HIGH;
1865 : }
1866 : else
1867 : {
1868 216491 : thr_sp2a = THR_P2A;
1869 : }
1870 :
1871 : /* initial 3-way selection of coding modes (ACELP/GSC/TCX) */
1872 322167 : if ( relE > -10.0f && ( S_p2a > thr_sp2a || ton > hSpMusClas->tod_thr_lt ) )
1873 : {
1874 : /* select TCX to encode extremely peaky signals or strongly tonal signals */
1875 14024 : st->sp_aud_decision1 = 1;
1876 14024 : st->sp_aud_decision2 = 1;
1877 : }
1878 308143 : else if ( smc_dec == SPEECH )
1879 : {
1880 : /* select ACELP to encode speech */
1881 120374 : st->sp_aud_decision1 = 0;
1882 120374 : st->sp_aud_decision2 = 0;
1883 : }
1884 187769 : else if ( smc_dec == SPEECH_OR_MUSIC )
1885 : {
1886 : /* select GSC to encode "unclear" segments (classifier's score on the borderline) */
1887 6620 : st->sp_aud_decision1 = 1;
1888 6620 : st->sp_aud_decision2 = 0;
1889 : }
1890 : else
1891 : {
1892 : /* select TCX to encode music */
1893 181149 : st->sp_aud_decision1 = 1;
1894 181149 : st->sp_aud_decision2 = 1;
1895 : }
1896 :
1897 : /* change decision from GSC to ACELP/TCX in some special cases */
1898 322167 : if ( st->sp_aud_decision1 == 1 && st->sp_aud_decision2 == 0 )
1899 : {
1900 6620 : if ( hSpMusClas->ener_RAT < 0.18f && hSpMusClas->lt_dec_thres > 15.0f )
1901 : {
1902 : /* prevent GSC on strong music with almost no content below 1kHz */
1903 4 : st->sp_aud_decision2 = 1;
1904 : }
1905 6616 : else if ( flag_spitch )
1906 : {
1907 : /* prevent GSC on signals with very short and stable high pitch period */
1908 103 : if ( hSpMusClas->wdlp_0_95_sp < 2.5f )
1909 : {
1910 : /* select ACELP instead */
1911 101 : st->sp_aud_decision1 = 0;
1912 : }
1913 : else
1914 : {
1915 : /* select TCX instead */
1916 2 : st->sp_aud_decision2 = 1;
1917 : }
1918 : }
1919 6513 : else if ( hSpMusClas->high_stable_cor && st->pitch[0] >= 130 )
1920 : {
1921 : /* prevent GSC in highly correlated signal with low energy variation */
1922 : /* this is basically a patch against bassoon-type of music */
1923 0 : st->sp_aud_decision2 = 1;
1924 : }
1925 : }
1926 :
1927 : /* change decision from GSC to ACELP TC during attacks/onsets */
1928 322167 : if ( st->sp_aud_decision1 == 1 && st->sp_aud_decision2 == 0 )
1929 : {
1930 6513 : if ( ( hSpMusClas->gsc_lt_diff_etot[MAX_LT - 1] > 4.5f ) &&
1931 508 : ( hSpMusClas->gsc_lt_diff_etot[MAX_LT - 1] - hSpMusClas->gsc_lt_diff_etot[MAX_LT - 2] > 10.0f ) )
1932 : {
1933 134 : if ( st->tc_cnt == 1 )
1934 : {
1935 : /* do ACELP TC coding instead of GC/VC if onset has been already declared before */
1936 47 : st->sp_aud_decision1 = 0;
1937 47 : st->coder_type = TRANSITION;
1938 : }
1939 : else
1940 : {
1941 87 : if ( attack >= ATT_3LSUB_POS )
1942 : {
1943 : /* do ACELP TC coding also if attack is located in the last subframe */
1944 11 : st->sp_aud_decision1 = 0;
1945 11 : *attack_flag = attack + 1;
1946 11 : st->coder_type = TRANSITION;
1947 : }
1948 76 : else if ( attack >= ATT_SEG_LEN / 2 )
1949 : {
1950 : /* do GSC coding if attack is located after the first quarter of the first subframe */
1951 : /* (pre-echo will be treated at the decoder side) */
1952 4 : *attack_flag = 31;
1953 4 : *attack_flag = attack + 1;
1954 : }
1955 : }
1956 : }
1957 : }
1958 :
1959 322167 : if ( st->localVAD == 1 && st->coder_type == GENERIC && attack > 0 /*&& *attack_flag < 32*/ /*&& st->tc_cnt != 2*/ && !( st->sp_aud_decision2 == 1 && ton > 0.65f ) )
1960 : {
1961 : /* change ACELP coder_type to TC if attack has been detected */
1962 6900 : st->sp_aud_decision1 = 0;
1963 6900 : st->sp_aud_decision2 = 0;
1964 :
1965 6900 : st->coder_type = TRANSITION;
1966 6900 : *attack_flag = attack + 1;
1967 : }
1968 :
1969 : #ifdef DEBUGGING
1970 : if ( st->idchan == 0 && st->coder_type != INACTIVE )
1971 : {
1972 : if ( st->force == FORCE_GSC && element_brate < IVAS_24k4 )
1973 : {
1974 : /* enforce GSC */
1975 : st->sp_aud_decision1 = 1;
1976 : st->sp_aud_decision2 = 0;
1977 : }
1978 : else if ( st->force == FORCE_SPEECH && ( st->sp_aud_decision1 == 1 || st->sp_aud_decision2 == 1 ) )
1979 : {
1980 : if ( element_brate < IVAS_24k4 )
1981 : {
1982 : /* convert TCX to GSC */
1983 : st->sp_aud_decision1 = 1;
1984 : st->sp_aud_decision2 = 0;
1985 : }
1986 : else
1987 : {
1988 : /* convert TCX to ACELP */
1989 : st->sp_aud_decision1 = 0;
1990 : st->sp_aud_decision2 = 0;
1991 : }
1992 : }
1993 : else if ( st->force == FORCE_MUSIC )
1994 : {
1995 : /* enforce TCX */
1996 : st->sp_aud_decision1 = 1;
1997 : st->sp_aud_decision2 = 1;
1998 : }
1999 : }
2000 : #endif
2001 :
2002 : /* set GSC noisy speech flag on unvoiced SWB segments */
2003 322167 : st->GSC_noisy_speech = 0;
2004 322167 : if ( st->vad_flag == 1 && element_brate <= IVAS_16k4 && st->lp_noise > 30.0f && st->sp_aud_decision1 == 0 && st->bwidth >= SWB && st->coder_type_raw == UNVOICED )
2005 : {
2006 768 : st->GSC_noisy_speech = 1;
2007 : }
2008 :
2009 : /* set GSC submode */
2010 322167 : if ( st->element_mode > EVS_MONO && ( st->sp_aud_decision1 == 1 && st->sp_aud_decision2 == 0 ) && st->total_brate > STEREO_GSC_BIT_RATE_ALLOC ) /* below STEREO_GSC_BIT_RATE_ALLOC, fall back on normal GSC */
2011 : {
2012 5903 : st->GSC_IVAS_mode = 1;
2013 5903 : if ( st->hSpMusClas->wdlp_0_95_sp > 0.0f )
2014 : {
2015 : /* music-like content */
2016 3811 : st->GSC_IVAS_mode = 3;
2017 : }
2018 2092 : else if ( st->tc_cnt > 0 )
2019 : {
2020 : /* likely presence of an onset, GSC bit allocation will be more focused on LF */
2021 305 : st->GSC_IVAS_mode = 2;
2022 : }
2023 :
2024 5903 : if ( st->coder_type_raw == UNVOICED && st->sp_aud_decision0 == 0 /*&& st->GSC_IVAS_mode < 3*/ )
2025 : {
2026 153 : st->GSC_noisy_speech = 1;
2027 : }
2028 : else
2029 : {
2030 5750 : st->GSC_noisy_speech = 0;
2031 : }
2032 : }
2033 :
2034 : /* set coder_type to AUDIO when GSC is selected (st->core will be set later in the decision matrix) */
2035 322167 : if ( ( st->sp_aud_decision1 == 1 && st->sp_aud_decision2 == 0 ) || st->GSC_noisy_speech )
2036 : {
2037 7138 : st->coder_type = AUDIO;
2038 7138 : if ( st->hGSCEnc != NULL && st->GSC_noisy_speech == 0 ) /* In case of GSC_noisy_speech, NOISE_LEVEL should remain at NOISE_LEVEL_SP3 */
2039 : {
2040 6217 : st->hGSCEnc->noise_lev = NOISE_LEVEL_SP0;
2041 : }
2042 : }
2043 :
2044 322167 : return;
2045 : }
2046 :
2047 :
2048 : /*------------------------------------------------------------------------*
2049 : * music_mixed_classif_improv()
2050 : *
2051 : * Improve 1st stage speech/music decision for mixed&music signals
2052 : *------------------------------------------------------------------------*/
2053 :
2054 3050 : static void music_mixed_classif_improv(
2055 : Encoder_State *st, /* i/o: Encoder state structure */
2056 : const float *new_inp, /* i : new input signal */
2057 : const float *epsP, /* i : LP prediction error */
2058 : const float etot, /* i : total frame energy */
2059 : const float old_cor, /* i : normalized correlation */
2060 : const float cor_map_sum /* i : correlation map sum */
2061 : )
2062 : {
2063 : int16_t i, dec, len, percus_flag;
2064 : float p2v_map[128], ftmp, ftmp1, lt_diff, log_max_spl, epsP_tilt, max_spl;
2065 :
2066 3050 : SP_MUS_CLAS_HANDLE hSpMusClas = st->hSpMusClas;
2067 :
2068 : /* find sample with maximum absolute amplitude */
2069 3050 : max_spl = 0;
2070 783850 : for ( i = 0; i < L_FRAME; i++ )
2071 : {
2072 780800 : if ( fabs( new_inp[i] ) > max_spl )
2073 : {
2074 27337 : max_spl = fabsf( new_inp[i] );
2075 : }
2076 : }
2077 :
2078 : /* music is considered only appearing in high SNR condition and active signal */
2079 3050 : if ( st->vad_flag == 0 || st->lp_speech - st->lp_noise < 25 )
2080 : {
2081 8 : hSpMusClas->dec_mov = 0.5f;
2082 8 : hSpMusClas->dec_mov1 = 0.5f;
2083 :
2084 8 : if ( st->vad_flag == 0 )
2085 : {
2086 8 : hSpMusClas->onset_cnt = 0;
2087 : }
2088 :
2089 8 : return;
2090 : }
2091 :
2092 3042 : hSpMusClas->onset_cnt++;
2093 :
2094 3042 : if ( hSpMusClas->onset_cnt > 9 )
2095 : {
2096 2997 : hSpMusClas->onset_cnt = 9;
2097 : }
2098 :
2099 3042 : if ( hSpMusClas->onset_cnt == 1 )
2100 : {
2101 5 : set_f( hSpMusClas->buf_flux, -100, BUF_LEN );
2102 : }
2103 :
2104 : /* spectral analysis */
2105 3042 : spec_analysis( st->Bin_E, p2v_map );
2106 :
2107 : /* percussive music detection */
2108 3042 : log_max_spl = 20 * logf( max_spl + 0.0001f );
2109 3042 : lt_diff = log_max_spl - hSpMusClas->mov_log_max_spl;
2110 :
2111 12168 : for ( i = 0; i < 3; i++ )
2112 : {
2113 9126 : hSpMusClas->buf_etot[i] = hSpMusClas->buf_etot[i + 1];
2114 : }
2115 3042 : hSpMusClas->buf_etot[i] = etot;
2116 :
2117 3042 : percus_flag = 0;
2118 3042 : if ( hSpMusClas->buf_etot[1] - hSpMusClas->buf_etot[0] > 6 && hSpMusClas->buf_etot[2] < hSpMusClas->buf_etot[1] && hSpMusClas->buf_etot[1] - st->lp_speech > 3 )
2119 : {
2120 22 : if ( hSpMusClas->buf_etot[1] - hSpMusClas->buf_etot[3] > 3 && hSpMusClas->buf_etot[3] < hSpMusClas->buf_etot[2] && 0.5f * ( 0.5f * ( st->voicing[0] + st->voicing[1] ) + old_cor ) < 0.75f )
2121 : {
2122 5 : if ( hSpMusClas->dec_mov > 0.8f )
2123 : {
2124 0 : percus_flag = 1;
2125 : }
2126 5 : else if ( old_cor < 0.75f && st->voicing[0] < 0.75f && st->voicing[1] < 0.75f && hSpMusClas->old_lt_diff[0] > 10 )
2127 : {
2128 0 : percus_flag = 1;
2129 : }
2130 : }
2131 : }
2132 :
2133 : /* sound attack detection */
2134 3042 : if ( hSpMusClas->buf_etot[3] - hSpMusClas->buf_etot[2] > 6 && hSpMusClas->dec_mov > 0.9f && etot - st->lp_speech > 5 && hSpMusClas->old_lt_diff[0] > 5 )
2135 : {
2136 0 : hSpMusClas->attack_hangover = 3;
2137 : }
2138 :
2139 3042 : if ( st->voicing[0] > 0.9f && st->voicing[1] > 0.9f )
2140 : {
2141 831 : if ( log_max_spl > hSpMusClas->mov_log_max_spl )
2142 : {
2143 10 : hSpMusClas->mov_log_max_spl = 0.75f * hSpMusClas->mov_log_max_spl + ( 1 - 0.75f ) * log_max_spl;
2144 : }
2145 : else
2146 : {
2147 821 : hSpMusClas->mov_log_max_spl = 0.995f * hSpMusClas->mov_log_max_spl + ( 1 - 0.995f ) * log_max_spl;
2148 : }
2149 : }
2150 :
2151 3042 : hSpMusClas->old_lt_diff[0] = hSpMusClas->old_lt_diff[1];
2152 3042 : hSpMusClas->old_lt_diff[1] = lt_diff;
2153 :
2154 : /* calculate and buffer spectral energy fluctuation */
2155 3042 : flux( st->Bin_E, p2v_map, hSpMusClas->old_Bin_E, hSpMusClas->buf_flux, hSpMusClas->attack_hangover, hSpMusClas->dec_mov );
2156 :
2157 3042 : hSpMusClas->attack_hangover--;
2158 3042 : if ( hSpMusClas->attack_hangover < 0 )
2159 : {
2160 3042 : hSpMusClas->attack_hangover = 0;
2161 : }
2162 :
2163 : /* identify flux buffer status */
2164 3042 : len = 0;
2165 176883 : for ( i = BUF_LEN - 1; i >= 0 && hSpMusClas->buf_flux[i] >= 0; i-- )
2166 : {
2167 173841 : len++;
2168 : }
2169 :
2170 : /* reset flux buffer if percussive music is detected */
2171 3042 : if ( percus_flag == 1 )
2172 : {
2173 0 : set_f( &hSpMusClas->buf_flux[BUF_LEN - len], 5, len );
2174 : }
2175 :
2176 : /* calculate and buffer the tilt of residual LP analysis energies */
2177 3042 : ftmp = 0.00001f;
2178 3042 : ftmp1 = 0;
2179 48672 : for ( i = 1; i < 16; i++ )
2180 : {
2181 45630 : ftmp += epsP[i] * epsP[i];
2182 45630 : ftmp1 += epsP[i] * epsP[i + 1];
2183 : }
2184 :
2185 3042 : epsP_tilt = ftmp1 / ftmp;
2186 :
2187 182520 : for ( i = 0; i < BUF_LEN - 1; i++ )
2188 : {
2189 179478 : hSpMusClas->buf_epsP_tilt[i] = hSpMusClas->buf_epsP_tilt[i + 1];
2190 : }
2191 3042 : hSpMusClas->buf_epsP_tilt[i] = epsP_tilt;
2192 :
2193 : /* calculate and buffer highband spectral peakness */
2194 3042 : tonal_dist( p2v_map, hSpMusClas->buf_pkh, hSpMusClas->buf_Ntonal, hSpMusClas->buf_Ntonal2, hSpMusClas->buf_Ntonal_lf );
2195 :
2196 : /* buffer sum of correlation map */
2197 182520 : for ( i = 0; i < BUF_LEN - 1; i++ )
2198 : {
2199 179478 : hSpMusClas->buf_cor_map_sum[i] = hSpMusClas->buf_cor_map_sum[i + 1];
2200 : }
2201 3042 : hSpMusClas->buf_cor_map_sum[i] = cor_map_sum;
2202 :
2203 : /* buffer voicing metric */
2204 30420 : for ( i = 0; i < 9; i++ )
2205 : {
2206 27378 : hSpMusClas->buf_dlp[i] = hSpMusClas->buf_dlp[i + 1];
2207 : }
2208 3042 : hSpMusClas->buf_dlp[i] = hSpMusClas->lps - hSpMusClas->lpm;
2209 :
2210 : /* classification */
2211 3042 : dec = mode_decision( st, len, &hSpMusClas->dec_mov, hSpMusClas->buf_flux, hSpMusClas->buf_epsP_tilt, hSpMusClas->buf_pkh, hSpMusClas->buf_cor_map_sum, hSpMusClas->buf_Ntonal, hSpMusClas->buf_Ntonal2, hSpMusClas->buf_Ntonal_lf, hSpMusClas->buf_dlp );
2212 :
2213 : /* update long term moving average of the classification decisions */
2214 3042 : if ( len > 30 )
2215 : {
2216 2892 : hSpMusClas->dec_mov = 0.97f * hSpMusClas->dec_mov + ( 1 - 0.97f ) * dec;
2217 2892 : hSpMusClas->dec_mov1 = 0.97f * hSpMusClas->dec_mov1 + ( 1 - 0.97f ) * dec;
2218 : }
2219 :
2220 : /* update long-term unvoiced counter */
2221 3042 : if ( ( st->coder_type_raw == UNVOICED || st->coder_type_raw == INACTIVE ) && etot > 1.5f && hSpMusClas->buf_Ntonal2[59] < 2 )
2222 : {
2223 112 : hSpMusClas->UV_cnt1 -= 8;
2224 : }
2225 : else
2226 : {
2227 2930 : hSpMusClas->UV_cnt1++;
2228 : }
2229 :
2230 3042 : if ( hSpMusClas->UV_cnt1 > 300 )
2231 : {
2232 2034 : hSpMusClas->UV_cnt1 = 300;
2233 : }
2234 1008 : else if ( hSpMusClas->UV_cnt1 < 0 )
2235 : {
2236 0 : hSpMusClas->UV_cnt1 = 0;
2237 : }
2238 :
2239 3042 : hSpMusClas->LT_UV_cnt1 = 0.9f * hSpMusClas->LT_UV_cnt1 + 0.1f * hSpMusClas->UV_cnt1;
2240 :
2241 : /* revert classification decision due to long-term unvoiced counter */
2242 3042 : if ( dec == 1 && hSpMusClas->dec_mov1 < 0.2f && hSpMusClas->LT_UV_cnt1 < 200 )
2243 : {
2244 0 : dec = 0;
2245 : }
2246 :
2247 : /* overwrite 1st stage speech/music decision to music */
2248 3042 : if ( dec == 1 )
2249 : {
2250 596 : st->sp_aud_decision1 = 1;
2251 : }
2252 :
2253 3042 : return;
2254 : }
2255 :
2256 :
2257 : /*---------------------------------------------------------------------*
2258 : * spec_analysis()
2259 : *
2260 : * Spectral analysis for mixed/music classification improvement
2261 : *---------------------------------------------------------------------*/
2262 :
2263 3042 : static void spec_analysis(
2264 : float *Bin_E, /* i : log energy spectrum of the current frame */
2265 : float *p2v_map /* o : spectral peakiness map */
2266 : )
2267 : {
2268 : int16_t i, k, m;
2269 : float peak[L_FFT / 4 + 1];
2270 : float valley[L_FFT / 4 + 1];
2271 : int16_t peak_idx[L_FFT / 4 + 1];
2272 : int16_t valey_idx[L_FFT / 4 + 1];
2273 : float p2v[L_FFT / 4 + 1];
2274 :
2275 : /* find spectral peaks */
2276 3042 : k = 0;
2277 383292 : for ( i = 1; i < L_FFT / 2 - 2; i++ )
2278 : {
2279 380250 : if ( Bin_E[i] > Bin_E[i - 1] && Bin_E[i] > Bin_E[i + 1] )
2280 : {
2281 101670 : peak[k] = Bin_E[i];
2282 101670 : peak_idx[k] = i;
2283 101670 : k++;
2284 : }
2285 : }
2286 3042 : assert( k + 1 < L_FFT / 4 + 1 );
2287 3042 : peak_idx[k] = -1;
2288 3042 : peak_idx[k + 1] = -1;
2289 :
2290 3042 : if ( k == 0 )
2291 : {
2292 0 : for ( i = 0; i < L_FFT / 2 - 1; i++ )
2293 : {
2294 0 : p2v_map[i] = 0;
2295 : }
2296 :
2297 0 : return;
2298 : }
2299 :
2300 : /* find spectral valleys */
2301 3042 : m = 0;
2302 3042 : if ( Bin_E[0] < Bin_E[1] )
2303 : {
2304 1648 : valley[0] = Bin_E[0];
2305 1648 : valey_idx[0] = 0;
2306 1648 : m++;
2307 : }
2308 :
2309 3042 : k = L_FFT / 2 - 2;
2310 5338 : for ( i = L_FFT / 2 - 3; i >= 0 && Bin_E[i + 1] > Bin_E[i]; i-- )
2311 : {
2312 2296 : k = i;
2313 : }
2314 :
2315 380996 : for ( i = 1; i < k; i++ )
2316 : {
2317 377954 : if ( Bin_E[i] < Bin_E[i - 1] && Bin_E[i] < Bin_E[i + 1] )
2318 : {
2319 100022 : valley[m] = Bin_E[i];
2320 100022 : valey_idx[m] = i;
2321 100022 : m++;
2322 : }
2323 : }
2324 :
2325 3042 : valley[m] = Bin_E[k];
2326 3042 : valey_idx[m] = k;
2327 :
2328 : /* find spectral peak to valley distances */
2329 3042 : k = 0;
2330 104712 : for ( i = 0; i < m; i++ )
2331 : {
2332 101670 : if ( peak_idx[k] > valey_idx[i] && peak_idx[k] < valey_idx[i + 1] )
2333 : {
2334 101670 : p2v[k] = 2 * peak[k] - valley[i] - valley[i + 1];
2335 101670 : k++;
2336 : }
2337 : }
2338 :
2339 389376 : for ( i = 0; i < L_FFT / 2 - 1; i++ )
2340 : {
2341 386334 : p2v_map[i] = 0;
2342 : }
2343 :
2344 104712 : for ( i = 0; i < k; i++ )
2345 : {
2346 101670 : p2v_map[peak_idx[i]] = p2v[i];
2347 : }
2348 :
2349 3042 : return;
2350 : }
2351 :
2352 : /*---------------------------------------------------------------------*
2353 : * flux()
2354 : *
2355 : * Calculation of spectral flux
2356 : *---------------------------------------------------------------------*/
2357 :
2358 3042 : static void flux(
2359 : float *Bin_E, /* i : log energy spectrum of the current frame */
2360 : float *p2v_map, /* i : spectral peakiness map */
2361 : float *old_Bin_E, /* i/o: log energy spectrum of the frame 60ms ago */
2362 : float *buf_flux, /* i/o: buffer storing spectral energy fluctuation */
2363 : int16_t attack_hangover, /* i/o: hangover preventing flux buffering */
2364 : float dec_mov /* i/o: moving average of classifier decision */
2365 : )
2366 : {
2367 : int16_t i;
2368 : float *pt1, *pt2, *pt3, *pt4, *pt5, *pt6;
2369 : float flux;
2370 : int16_t cnt;
2371 :
2372 : /* calculate flux */
2373 3042 : flux = 0;
2374 3042 : cnt = 0;
2375 130806 : for ( i = 0; i < N_OLD_BIN_E; i++ )
2376 : {
2377 127764 : if ( p2v_map[i] != 0 )
2378 : {
2379 31591 : flux += fabsf( Bin_E[i] - old_Bin_E[i] );
2380 31591 : cnt++;
2381 : }
2382 : }
2383 :
2384 3042 : if ( cnt == 0 )
2385 : {
2386 0 : flux = 5;
2387 : }
2388 : else
2389 : {
2390 3042 : flux = flux / (float) cnt;
2391 : }
2392 :
2393 3042 : if ( flux > 20 && dec_mov > 0.8f )
2394 : {
2395 75 : flux = 20;
2396 : }
2397 :
2398 : /* update old Bin_E buffer */
2399 3042 : pt1 = old_Bin_E;
2400 3042 : pt2 = old_Bin_E + N_OLD_BIN_E;
2401 3042 : pt3 = Bin_E;
2402 3042 : pt4 = old_Bin_E + N_OLD_BIN_E;
2403 3042 : pt5 = old_Bin_E + 2 * N_OLD_BIN_E;
2404 3042 : pt6 = old_Bin_E + 2 * N_OLD_BIN_E;
2405 :
2406 130806 : for ( i = 0; i < N_OLD_BIN_E; i++ )
2407 : {
2408 127764 : *pt1++ = *pt2++;
2409 127764 : *pt4++ = *pt5++;
2410 127764 : *pt6++ = *pt3++;
2411 : }
2412 :
2413 : /* update flux buffer */
2414 3042 : if ( attack_hangover <= 0 )
2415 : {
2416 182520 : for ( i = 0; i < BUF_LEN - 1; i++ )
2417 : {
2418 179478 : buf_flux[i] = buf_flux[i + 1];
2419 : }
2420 :
2421 3042 : buf_flux[i] = flux;
2422 : }
2423 :
2424 3042 : return;
2425 : }
2426 :
2427 :
2428 : /*---------------------------------------------------------------------*
2429 : * tonal_dist()
2430 : *
2431 : * Calculation of spectral distance
2432 : *---------------------------------------------------------------------*/
2433 :
2434 3042 : static void tonal_dist(
2435 : float *p2v_map, /* i : spectral peakiness map */
2436 : float *buf_pkh, /* i/o: buffer storing highband spectral peakiness */
2437 : float *buf_Ntonal, /* i/o: buffer storing No.of 1st spectral tone */
2438 : float *buf_Ntonal2, /* i/o: buffer storing No.of 2nd spectral tone */
2439 : float *buf_Ntonal_lf /* i/o: buffer storing low band spectral tone ratio */
2440 : )
2441 : {
2442 : int16_t i;
2443 : float pk;
2444 : int16_t Ntonal;
2445 : int16_t Ntonal2;
2446 : int16_t Ntonal_lf;
2447 :
2448 : /* find number of tonals, number of tonals at low-band,
2449 : spectral peakiness at high-band */
2450 3042 : pk = 0;
2451 3042 : Ntonal = 0;
2452 3042 : Ntonal2 = 0;
2453 3042 : Ntonal_lf = 0;
2454 197730 : for ( i = 0; i < 64; i++ )
2455 : {
2456 194688 : if ( p2v_map[i] > 55 )
2457 : {
2458 15017 : Ntonal++;
2459 : }
2460 :
2461 194688 : if ( p2v_map[i] > 80 )
2462 : {
2463 8809 : Ntonal2++;
2464 8809 : Ntonal_lf++;
2465 : }
2466 : }
2467 :
2468 194688 : for ( i = 64; i < 127; i++ )
2469 : {
2470 191646 : if ( p2v_map[i] != 0 )
2471 : {
2472 52339 : pk += p2v_map[i];
2473 : }
2474 :
2475 191646 : if ( p2v_map[i] > 55 )
2476 : {
2477 6583 : Ntonal++;
2478 : }
2479 :
2480 191646 : if ( p2v_map[i] > 80 )
2481 : {
2482 2424 : Ntonal2++;
2483 : }
2484 : }
2485 :
2486 : /* update buffers */
2487 182520 : for ( i = 0; i < BUF_LEN - 1; i++ )
2488 : {
2489 179478 : buf_pkh[i] = buf_pkh[i + 1];
2490 179478 : buf_Ntonal[i] = buf_Ntonal[i + 1];
2491 179478 : buf_Ntonal2[i] = buf_Ntonal2[i + 1];
2492 179478 : buf_Ntonal_lf[i] = buf_Ntonal_lf[i + 1];
2493 : }
2494 :
2495 3042 : buf_pkh[i] = pk;
2496 3042 : buf_Ntonal[i] = (float) Ntonal;
2497 3042 : buf_Ntonal2[i] = (float) Ntonal2;
2498 3042 : buf_Ntonal_lf[i] = (float) Ntonal_lf;
2499 :
2500 3042 : return;
2501 : }
2502 :
2503 :
2504 : /*---------------------------------------------------------------------*
2505 : * mode_decision()
2506 : *
2507 : * Decision about internal mode of the mixed/music classifier improvement
2508 : *---------------------------------------------------------------------*/
2509 :
2510 3042 : static int16_t mode_decision(
2511 : Encoder_State *st,
2512 : int16_t len, /* i : buffering status */
2513 : float *dec_mov, /* i/o: moving average of classifier decision */
2514 : float *buf_flux, /* i : buffer storing spectral energy fluctuation */
2515 : float *buf_epsP_tilt, /* i : buffer storing LP prediciton error tilt */
2516 : float *buf_pkh, /* i : buffer storing highband spectral peakiness */
2517 : float *buf_cor_map_sum, /* i : buffer storing correlation map sum */
2518 : float *buf_Ntonal, /* i : buffer storing No.of 1st spectral tone */
2519 : float *buf_Ntonal2, /* i : buffer storing No.of 2nd spectral tone */
2520 : float *buf_Ntonal_lf, /* i : buffer storing low band spectral tone ratio */
2521 : float *buf_dlp /* i : buffer storing voicing estimate */
2522 : )
2523 : {
2524 : int16_t mode;
2525 : int16_t i;
2526 : int16_t voiced_cnt;
2527 : float M_pkh;
2528 : float M_cor_map_sum;
2529 : float M_Ntonal;
2530 : float M_flux;
2531 : float V_epsP_tilt;
2532 : float lf_Ntonal_ratio;
2533 :
2534 3042 : mode = *dec_mov > 0.5f;
2535 :
2536 3042 : if ( len <= 5 )
2537 : {
2538 25 : return ( mode );
2539 : }
2540 3017 : else if ( len < 10 )
2541 : {
2542 20 : M_pkh = mean( buf_pkh + BUF_LEN - len, len );
2543 20 : M_cor_map_sum = mean( buf_cor_map_sum + BUF_LEN - len, len );
2544 20 : M_Ntonal = mean( buf_Ntonal + BUF_LEN - len, len );
2545 20 : V_epsP_tilt = var( buf_epsP_tilt + BUF_LEN - len, len );
2546 :
2547 20 : voiced_cnt = 0;
2548 140 : for ( i = 9; i > 3; i-- )
2549 : {
2550 120 : if ( buf_dlp[i] > 0.0f )
2551 : {
2552 4 : voiced_cnt++;
2553 : }
2554 : }
2555 :
2556 20 : if ( ( M_pkh > 1100 || V_epsP_tilt < 0.00008f || M_cor_map_sum > 100 ) && voiced_cnt < 4 )
2557 : {
2558 1 : mode = 1;
2559 : }
2560 19 : else if ( M_Ntonal > 27 && voiced_cnt < 4 )
2561 : {
2562 0 : mode = 1;
2563 : }
2564 : }
2565 : else
2566 : {
2567 2997 : voiced_cnt = 0;
2568 32967 : for ( i = 0; i < 10; i++ )
2569 : {
2570 29970 : if ( buf_dlp[i] > 0.0f )
2571 : {
2572 14806 : voiced_cnt++;
2573 : }
2574 : }
2575 :
2576 2997 : M_flux = mean( &buf_flux[BUF_LEN - 10], 10 );
2577 2997 : M_pkh = mean( buf_pkh + BUF_LEN - 10, 10 );
2578 2997 : M_cor_map_sum = mean( buf_cor_map_sum + BUF_LEN - 10, 10 );
2579 2997 : V_epsP_tilt = var( buf_epsP_tilt + BUF_LEN - 10, 10 );
2580 :
2581 2997 : if ( ( M_flux < 8.5f || ( V_epsP_tilt < 0.001f && M_flux < 12.0f ) || M_pkh > 1050 || M_cor_map_sum > 100 ) && voiced_cnt < 3 && mean( &buf_flux[55], 5 ) < 15 )
2582 : {
2583 364 : mode = 1;
2584 364 : *dec_mov = 1;
2585 364 : return ( mode );
2586 : }
2587 :
2588 2633 : if ( M_flux > 16.0f || ( M_flux > 15 && voiced_cnt > 2 ) || mean( &buf_flux[55], 5 ) > 19.0f || ( buf_flux[59] >= 20 && st->hSpMusClas->lps - st->hSpMusClas->lpm > 0 ) )
2589 : {
2590 2301 : *dec_mov = 0;
2591 2301 : mode = 0;
2592 2301 : return ( mode );
2593 : }
2594 :
2595 6644 : for ( i = 10; i < len; i++ )
2596 : {
2597 6517 : M_flux = mean( &buf_flux[BUF_LEN - i], i );
2598 6517 : M_pkh = mean( buf_pkh + BUF_LEN - i, i );
2599 6517 : M_cor_map_sum = mean( buf_cor_map_sum + BUF_LEN - i, i );
2600 6517 : V_epsP_tilt = var( buf_epsP_tilt + BUF_LEN - i, i );
2601 :
2602 6517 : if ( ( ( M_flux < 12 + 0.05f * ( len - 10 ) && mean( &buf_flux[BUF_LEN - 10], 10 ) < 15 ) || V_epsP_tilt < 0.0001f + 0.000018f * ( len - 10 ) || M_pkh > 1050 - 5.0f * ( len - 10 ) || M_cor_map_sum > 95 - 0.3f * ( len - 10 ) ) && voiced_cnt < 3 )
2603 : {
2604 205 : mode = 1;
2605 205 : return ( mode );
2606 : }
2607 : }
2608 :
2609 127 : if ( len == BUF_LEN )
2610 : {
2611 126 : M_Ntonal = mean( buf_Ntonal, BUF_LEN );
2612 126 : lf_Ntonal_ratio = sum_f( buf_Ntonal_lf, BUF_LEN ) / ( sum_f( buf_Ntonal2, BUF_LEN ) + 0.0001f );
2613 :
2614 126 : if ( M_Ntonal > 18 || lf_Ntonal_ratio < 0.2f )
2615 : {
2616 0 : mode = 1;
2617 : }
2618 126 : else if ( M_Ntonal < 1 )
2619 : {
2620 0 : mode = 0;
2621 : }
2622 : }
2623 : }
2624 :
2625 147 : return ( mode );
2626 : }
2627 :
2628 :
2629 : /*----------------------------------------------------------------------------------*
2630 : * tonal_context_improv()
2631 : *
2632 : * Context-based improvement of 1st/2nd stage speech/music decision on stable tonal signals
2633 : *----------------------------------------------------------------------------------*/
2634 :
2635 3050 : static void tonal_context_improv(
2636 : Encoder_State *st, /* i/o: encoder state structure */
2637 : const float PS[], /* i : energy spectrum */
2638 : const float voi_fv, /* i : scaled voicing feature */
2639 : const float cor_map_sum_fv, /* i : scaled correlation map feature */
2640 : const float LPCErr /* i : scaled LP prediction error feature */
2641 : )
2642 : {
2643 : int16_t lt_pitch_diff;
2644 : float sort_max, sort_avg, sort_val[80];
2645 : float tonality, tonality1, tonality2, tonality3, t2, t3, tL, err, cor, dft;
2646 :
2647 3050 : SP_MUS_CLAS_HANDLE hSpMusClas = st->hSpMusClas;
2648 :
2649 : /* reset in case of codec mode switching */
2650 3050 : if ( st->last_codec_mode == MODE2 )
2651 : {
2652 582 : set_f( hSpMusClas->tonality2_buf, 0, HANG_LEN_INIT );
2653 582 : set_f( hSpMusClas->tonality3_buf, 0, HANG_LEN_INIT );
2654 582 : set_f( hSpMusClas->LPCErr_buf, 0, HANG_LEN_INIT );
2655 582 : hSpMusClas->lt_music_hangover = 0;
2656 582 : hSpMusClas->lt_music_state = 0;
2657 582 : hSpMusClas->lt_speech_state = 0;
2658 582 : hSpMusClas->lt_speech_hangover = 0;
2659 : }
2660 :
2661 : /* estimate maximum tonality in bands [0-1 kHz], [1-2kHz] and [2-4kHz] */
2662 3050 : mvr2r( PS, sort_val, 80 );
2663 :
2664 : /* tonality in band 0-1 kHz */
2665 3050 : v_sort( sort_val, 0, 19 );
2666 3050 : sort_max = sort_val[19];
2667 3050 : sort_avg = sum_f( &sort_val[0], 10 );
2668 3050 : tonality1 = sort_max / sort_avg;
2669 :
2670 : /* tonality in band 1-2 kHz */
2671 3050 : v_sort( sort_val, 20, 39 );
2672 3050 : sort_max = sort_val[39];
2673 3050 : sort_avg = sum_f( &sort_val[20], 10 );
2674 3050 : tonality2 = sort_max / sort_avg;
2675 :
2676 : /* tonality in band 2-4 kHz */
2677 3050 : v_sort( sort_val, 40, 79 );
2678 3050 : sort_max = sort_val[79];
2679 3050 : sort_avg = sum_f( &sort_val[40], 20 );
2680 3050 : tonality3 = sort_max / sort_avg;
2681 :
2682 3050 : tonality = max( max( tonality1, tonality2 ), tonality3 );
2683 :
2684 3050 : if ( st->hVAD->hangover_cnt == 10 && st->vad_flag == 1 )
2685 : {
2686 : /* long-term voicing parameter */
2687 10 : hSpMusClas->lt_voicing = 0.1f * hSpMusClas->lt_voicing + 0.9f * *st->voicing;
2688 :
2689 : /* long-term correlation value */
2690 10 : hSpMusClas->lt_corr = 0.1f * hSpMusClas->lt_corr + 0.9f * st->old_corr;
2691 :
2692 : /* long-term tonality measure */
2693 10 : hSpMusClas->lt_tonality = 0.1f * hSpMusClas->lt_tonality + 0.9f * tonality;
2694 : }
2695 : else
2696 : {
2697 : /* long-term voicing parameter */
2698 3040 : hSpMusClas->lt_voicing = 0.7f * hSpMusClas->lt_voicing + 0.3f * *st->voicing;
2699 :
2700 : /* long-term correlation value */
2701 3040 : hSpMusClas->lt_corr = 0.7f * hSpMusClas->lt_corr + 0.3f * st->old_corr;
2702 :
2703 : /* long-term tonality measure */
2704 3040 : hSpMusClas->lt_tonality = 0.5f * hSpMusClas->lt_tonality + 0.5f * tonality;
2705 : }
2706 :
2707 : /* pitch difference w.r.t to past 3 frames */
2708 3050 : lt_pitch_diff = (int16_t) abs( hSpMusClas->lt_corr_pitch[0] - st->pitch[0] );
2709 3050 : lt_pitch_diff += (int16_t) abs( hSpMusClas->lt_corr_pitch[1] - st->pitch[0] );
2710 3050 : lt_pitch_diff += (int16_t) abs( hSpMusClas->lt_corr_pitch[2] - st->pitch[0] );
2711 :
2712 3050 : hSpMusClas->lt_corr_pitch[0] = hSpMusClas->lt_corr_pitch[1];
2713 3050 : hSpMusClas->lt_corr_pitch[1] = hSpMusClas->lt_corr_pitch[2];
2714 3050 : hSpMusClas->lt_corr_pitch[2] = st->pitch[0];
2715 :
2716 3050 : hSpMusClas->lt_old_mode[0] = hSpMusClas->lt_old_mode[1];
2717 3050 : hSpMusClas->lt_old_mode[1] = hSpMusClas->lt_old_mode[2];
2718 :
2719 4072 : if ( st->sp_aud_decision1 == 1 &&
2720 1847 : ( min( min( tonality1, tonality2 ), tonality3 ) > 50.0f ) &&
2721 78 : ( tonality1 + tonality2 > 200.0f && tonality2 + tonality3 > 200.0f && tonality1 + tonality3 > 200.0f ) &&
2722 51 : ( hSpMusClas->lt_tonality < 20000.0f ) &&
2723 51 : ( ( hSpMusClas->lt_tonality > 1000 && max( hSpMusClas->lt_voicing, *st->voicing ) > 0.99f ) ||
2724 51 : ( hSpMusClas->lt_tonality > 1500 && hSpMusClas->lt_corr > 0.99f ) ||
2725 51 : ( hSpMusClas->lt_tonality > 3000 && hSpMusClas->lowrate_pitchGain > 0.96f ) ||
2726 29 : ( lt_pitch_diff == 0 && hSpMusClas->lowrate_pitchGain > 0.89f ) ) )
2727 : {
2728 0 : if ( sum_s( hSpMusClas->lt_old_mode, 2 ) < 2 )
2729 : {
2730 : /* probably speech - change the decision to speech */
2731 0 : st->sp_aud_decision1 = 0;
2732 0 : st->sp_aud_decision2 = 0;
2733 :
2734 0 : if ( hSpMusClas->lt_hangover == 0 )
2735 : {
2736 0 : hSpMusClas->lt_hangover = 6;
2737 : }
2738 : }
2739 : }
2740 : else
2741 : {
2742 : /* not speech, but still in the hangover period - change the decision to speech */
2743 3050 : if ( hSpMusClas->lt_hangover > 0 )
2744 : {
2745 0 : st->sp_aud_decision1 = 0;
2746 0 : st->sp_aud_decision2 = 0;
2747 0 : hSpMusClas->lt_hangover--;
2748 : }
2749 : }
2750 :
2751 : /* calculate standard deviation of log-tonality */
2752 3050 : mvr2r( hSpMusClas->tonality2_buf + 1, hSpMusClas->tonality2_buf, HANG_LEN_INIT - 1 );
2753 3050 : hSpMusClas->tonality2_buf[HANG_LEN_INIT - 1] = 0.2f * log10f( tonality2 );
2754 3050 : t2 = std_dev( hSpMusClas->tonality2_buf, HANG_LEN_INIT );
2755 :
2756 3050 : mvr2r( hSpMusClas->tonality3_buf + 1, hSpMusClas->tonality3_buf, HANG_LEN_INIT - 1 );
2757 3050 : hSpMusClas->tonality3_buf[HANG_LEN_INIT - 1] = 0.2f * log10f( tonality3 );
2758 3050 : t3 = std_dev( hSpMusClas->tonality3_buf, HANG_LEN_INIT );
2759 :
2760 3050 : tL = 0.2f * log10f( hSpMusClas->lt_tonality );
2761 :
2762 : /* calculate standard deviation of residual LP energy */
2763 3050 : mvr2r( hSpMusClas->LPCErr_buf + 1, hSpMusClas->LPCErr_buf, HANG_LEN_INIT - 1 );
2764 3050 : hSpMusClas->LPCErr_buf[HANG_LEN_INIT - 1] = LPCErr;
2765 3050 : err = std_dev( hSpMusClas->LPCErr_buf, HANG_LEN_INIT );
2766 :
2767 3050 : cor = max( voi_fv - cor_map_sum_fv, 0.0f );
2768 3050 : dft = 0.2f * fabsf( log10f( tonality2 ) - log10f( tonality3 ) );
2769 :
2770 : /* state machine for strong music */
2771 3050 : if ( st->sp_aud_decision1 == 1 && hSpMusClas->lt_music_state == 0 && hSpMusClas->lt_music_hangover == 0 &&
2772 885 : t2 < 0.54f && t2 > 0.26f && t3 > 0.22f && tL < 0.54f && tL > 0.26f && err > 0.5f )
2773 : {
2774 7 : hSpMusClas->lt_music_state = 1;
2775 7 : hSpMusClas->lt_music_hangover = 6;
2776 : }
2777 3043 : else if ( hSpMusClas->lt_music_state == 1 && hSpMusClas->lt_music_hangover == 0 && t2 < 0.34 && t3 < 0.26f && tL < 0.45f )
2778 : {
2779 6 : hSpMusClas->lt_music_state = 0;
2780 6 : hSpMusClas->lt_music_hangover = 6;
2781 : }
2782 :
2783 3050 : if ( hSpMusClas->lt_music_hangover > 0 )
2784 : {
2785 74 : hSpMusClas->lt_music_hangover--;
2786 : }
2787 :
2788 : /* state machine for strong speech */
2789 3050 : if ( st->sp_aud_decision1 == 1 && hSpMusClas->lt_speech_state == 0 && hSpMusClas->lt_speech_hangover == 0 &&
2790 188 : cor > 0.40f && dft < 0.1f && voi_fv > 2 * cor_map_sum_fv + 0.12f &&
2791 27 : t2 < cor && t3 < cor && tL < cor && cor_map_sum_fv < cor && voi_fv > cor && voi_fv > 0.76f )
2792 : {
2793 10 : hSpMusClas->lt_speech_state = 1;
2794 10 : hSpMusClas->lt_speech_hangover = 6;
2795 : }
2796 3040 : else if ( hSpMusClas->lt_speech_state == 1 && hSpMusClas->lt_speech_hangover == 0 && cor < 0.40f )
2797 : {
2798 9 : hSpMusClas->lt_speech_state = 0;
2799 9 : hSpMusClas->lt_speech_hangover = 6;
2800 : }
2801 :
2802 3050 : if ( hSpMusClas->lt_speech_hangover > 0 )
2803 : {
2804 97 : hSpMusClas->lt_speech_hangover--;
2805 : }
2806 :
2807 : /* final decision */
2808 3050 : if ( st->sp_aud_decision1 == 1 && hSpMusClas->lt_speech_state == 1 )
2809 : {
2810 : /* strong speech - probably error in speech/music classification */
2811 54 : st->sp_aud_decision1 = 0;
2812 54 : st->sp_aud_decision2 = 0;
2813 : }
2814 2996 : else if ( st->sp_aud_decision1 == 0 && hSpMusClas->lt_music_state == 1 )
2815 : {
2816 : /* strong music - probably error in speech/music classification */
2817 0 : st->sp_aud_decision1 = 1;
2818 0 : st->sp_aud_decision2 = 1;
2819 : }
2820 :
2821 : /* update the buffer of past decisions */
2822 3050 : hSpMusClas->lt_old_mode[2] = st->sp_aud_decision1;
2823 :
2824 3050 : return;
2825 : }
2826 :
2827 : /*---------------------------------------------------------------------*
2828 : * detect_sparseness()
2829 : *
2830 : *
2831 : *---------------------------------------------------------------------*/
2832 :
2833 2042 : static void detect_sparseness(
2834 : Encoder_State *st, /* i/o: encoder state structure */
2835 : const int16_t localVAD_HE_SAD, /* i : HE-SAD flag without hangover */
2836 : const float voi_fv /* i : scaled voicing feature */
2837 : )
2838 : {
2839 : float sum;
2840 : float ftmp;
2841 : float ftmp1;
2842 : float S1[128];
2843 : int16_t i, j;
2844 2042 : int16_t hb_sp_high_flag = 0;
2845 2042 : int16_t lb_sp_high_flag = 0;
2846 : float sumh;
2847 : float sparse;
2848 : float tmp_buf[4];
2849 2042 : float Mlpe = 0.0f;
2850 2042 : float Mv = 0.0f;
2851 : float Msp;
2852 :
2853 2042 : SP_MUS_CLAS_HANDLE hSpMusClas = st->hSpMusClas;
2854 :
2855 2042 : mvr2r( st->Bin_E, S1, 128 );
2856 :
2857 2042 : sum = 0;
2858 165402 : for ( i = 0; i < 80; i++ )
2859 : {
2860 163360 : if ( S1[i] < 0 )
2861 : {
2862 31735 : S1[i] = 0;
2863 : }
2864 163360 : sum += S1[i];
2865 : }
2866 :
2867 2042 : sumh = 0;
2868 100058 : for ( i = 80; i < 128; i++ )
2869 : {
2870 98016 : if ( S1[i] < 0 )
2871 : {
2872 25369 : S1[i] = 0;
2873 : }
2874 98016 : sumh += S1[i];
2875 : }
2876 :
2877 2042 : sum += sumh;
2878 :
2879 : /* order spectral from max to min */
2880 2042 : order_spectrum( S1, 128 );
2881 :
2882 : /* calculate spectral sparseness in the range 0 - 6.4 kHz */
2883 2042 : j = 0;
2884 2042 : ftmp = 0.0f;
2885 2042 : ftmp1 = 0.75f * sum;
2886 109814 : for ( i = 0; i < 128; i++ )
2887 : {
2888 109807 : ftmp += S1[i];
2889 109807 : if ( ftmp > ftmp1 )
2890 : {
2891 2035 : j = i;
2892 2035 : break;
2893 : }
2894 : }
2895 :
2896 16336 : for ( i = 0; i < HANG_LEN_INIT - 1; i++ )
2897 : {
2898 14294 : hSpMusClas->sparse_buf[i] = hSpMusClas->sparse_buf[i + 1];
2899 : }
2900 :
2901 2042 : sparse = (float) j;
2902 2042 : hSpMusClas->sparse_buf[i] = sparse;
2903 :
2904 2042 : if ( st->bwidth == WB )
2905 : {
2906 0 : Msp = mean( hSpMusClas->sparse_buf, 8 );
2907 :
2908 : /* find long-term smoothed sparseness */
2909 0 : if ( hSpMusClas->last_vad_spa == 0 )
2910 : {
2911 0 : set_f( &hSpMusClas->sparse_buf[0], sparse, HANG_LEN_INIT - 1 );
2912 0 : hSpMusClas->LT_sparse = sparse;
2913 : }
2914 : else
2915 : {
2916 0 : set_f( tmp_buf, 0.0f, 4 );
2917 :
2918 0 : for ( i = 0; i < HANG_LEN_INIT; i++ )
2919 : {
2920 0 : for ( j = 0; j < 4; j++ )
2921 : {
2922 0 : if ( hSpMusClas->sparse_buf[i] > tmp_buf[j] )
2923 : {
2924 0 : mvr2r( &tmp_buf[j], &tmp_buf[j + 1], 3 - j );
2925 0 : tmp_buf[j] = hSpMusClas->sparse_buf[i];
2926 0 : break;
2927 : }
2928 : }
2929 : }
2930 :
2931 0 : ftmp = 0.25f * ( HANG_LEN_INIT * Msp - sum_f( tmp_buf, 4 ) ) - hSpMusClas->LT_sparse;
2932 :
2933 0 : hSpMusClas->LT_sparse = hSpMusClas->LT_sparse + 0.25f * ftmp;
2934 : }
2935 :
2936 : /* find high-band sparseness */
2937 0 : mvr2r( st->Bin_E + 80, S1, 48 );
2938 0 : order_spectrum( S1, 48 );
2939 :
2940 0 : for ( i = 0; i < HANG_LEN_INIT - 1; i++ )
2941 : {
2942 0 : hSpMusClas->hf_spar_buf[i] = hSpMusClas->hf_spar_buf[i + 1];
2943 : }
2944 0 : hSpMusClas->hf_spar_buf[i] = sum_f( S1, 5 ) / ( sumh + 0.1f );
2945 0 : if ( mean( hSpMusClas->hf_spar_buf, 8 ) > 0.2f )
2946 : {
2947 0 : hb_sp_high_flag = 1;
2948 : }
2949 :
2950 : /* find low-band sparseness */
2951 0 : mvr2r( st->Bin_E, S1, 60 );
2952 0 : order_spectrum( S1, 60 );
2953 :
2954 0 : if ( sum_f( S1, 5 ) / sum_f( S1, 60 ) > 0.18f )
2955 : {
2956 0 : lb_sp_high_flag = 1;
2957 : }
2958 :
2959 : /* find smoothed linear prediction efficiency */
2960 0 : for ( i = 0; i < 7; i++ )
2961 : {
2962 0 : hSpMusClas->lpe_buf[i] = hSpMusClas->lpe_buf[i + 1];
2963 : }
2964 :
2965 0 : hSpMusClas->lpe_buf[i] = hSpMusClas->past_epsP2;
2966 0 : Mlpe = mean( hSpMusClas->lpe_buf, 8 );
2967 :
2968 : /* find smoothed voicing */
2969 0 : for ( i = 0; i < HANG_LEN_INIT - 1; i++ )
2970 : {
2971 0 : hSpMusClas->voicing_buf[i] = hSpMusClas->voicing_buf[i + 1];
2972 : }
2973 :
2974 0 : hSpMusClas->voicing_buf[i] = voi_fv;
2975 0 : Mv = mean( hSpMusClas->voicing_buf, 8 );
2976 : }
2977 :
2978 : /* avoid using LR-MDCT on sparse spectra */
2979 2042 : if ( st->sp_aud_decision1 == 1 )
2980 : {
2981 636 : if ( st->bwidth == WB )
2982 : {
2983 0 : ftmp = 90;
2984 : }
2985 : else
2986 : {
2987 636 : ftmp = 91;
2988 : }
2989 636 : if ( sparse > ftmp )
2990 : {
2991 0 : st->sp_aud_decision1 = 0;
2992 0 : st->sp_aud_decision2 = 1;
2993 0 : hSpMusClas->gsc_hangover = 1;
2994 : }
2995 636 : else if ( hSpMusClas->gsc_hangover == 1 )
2996 : {
2997 0 : if ( sparse > 85 )
2998 : {
2999 0 : st->sp_aud_decision1 = 0;
3000 0 : st->sp_aud_decision2 = 1;
3001 : }
3002 0 : else if ( fabs( sparse - mean( &hSpMusClas->sparse_buf[HANG_LEN_INIT - 1 - hSpMusClas->gsc_cnt], hSpMusClas->gsc_cnt ) ) < 7.0f )
3003 : {
3004 0 : st->sp_aud_decision1 = 0;
3005 0 : st->sp_aud_decision2 = 1;
3006 : }
3007 : }
3008 :
3009 636 : if ( st->bwidth == WB )
3010 : {
3011 0 : if ( hSpMusClas->LT_sparse > 60 && sparse > 50 && Mlpe < -1.3f && Mv > 0.85f &&
3012 0 : lb_sp_high_flag == 0 && ( ( hb_sp_high_flag == 0 && sumh > 0.15f * sum ) || sumh <= 0.15f * sum ) )
3013 : {
3014 0 : st->sp_aud_decision1 = 0;
3015 0 : st->sp_aud_decision2 = 1;
3016 0 : hSpMusClas->gsc_hangover = 1;
3017 : }
3018 0 : else if ( hSpMusClas->gsc_hangover == 1 && !( st->sp_aud_decision1 == 0 && st->sp_aud_decision2 == 1 ) )
3019 : {
3020 0 : if ( fabs( sparse - mean( &hSpMusClas->sparse_buf[HANG_LEN_INIT - 1 - hSpMusClas->gsc_cnt], hSpMusClas->gsc_cnt ) ) < 7.0f )
3021 : {
3022 0 : st->sp_aud_decision1 = 0;
3023 0 : st->sp_aud_decision2 = 1;
3024 : }
3025 : }
3026 : }
3027 : }
3028 :
3029 : /* update the counter of consecutive GSC frames with sparse spectrum */
3030 2042 : if ( st->sp_aud_decision1 == 0 && st->sp_aud_decision2 == 1 )
3031 : {
3032 0 : ( hSpMusClas->gsc_cnt )++;
3033 0 : if ( hSpMusClas->gsc_cnt > 7 )
3034 : {
3035 0 : hSpMusClas->gsc_cnt = 7;
3036 : }
3037 : }
3038 : else
3039 : {
3040 2042 : hSpMusClas->gsc_cnt = 0;
3041 2042 : hSpMusClas->gsc_hangover = 0;
3042 : }
3043 :
3044 2042 : hSpMusClas->last_vad_spa = localVAD_HE_SAD;
3045 :
3046 2042 : return;
3047 : }
3048 :
3049 :
/*---------------------------------------------------------------------*
 * order_spectrum()
 *
 * Sort vec[0..len-1] in place into descending order (largest first).
 * Every pass scans the not-yet-ordered middle section once, then moves
 * its largest value to the front end and its smallest value to the
 * back end, so len / 2 passes are sufficient.
 *---------------------------------------------------------------------*/

static void order_spectrum(
    float *vec,       /* i/o: values to be ordered, result is written in place */
    const int16_t len /* i  : number of values in vec[]                        */
)
{
    int16_t lo, hi, k, idx_max, idx_min;
    float swap;

    hi = len - 1;
    for ( lo = 0; lo < len / 2; lo++ )
    {
        /* locate the largest and the smallest value within vec[lo..hi] */
        idx_max = lo;
        idx_min = lo;
        for ( k = lo + 1; k <= hi; k++ )
        {
            if ( vec[k] > vec[idx_max] )
            {
                idx_max = k;
            }
            else if ( vec[k] < vec[idx_min] )
            {
                idx_min = k;
            }
        }

        /* largest value goes to the front of the section */
        swap = vec[lo];
        vec[lo] = vec[idx_max];
        vec[idx_max] = swap;

        /* the swap above relocated the minimum when it was sitting at 'lo' */
        if ( idx_min == lo )
        {
            idx_min = idx_max;
        }

        /* smallest value goes to the back of the section */
        swap = vec[hi];
        vec[hi] = vec[idx_min];
        vec[idx_min] = swap;

        hi--;
    }

    return;
}
|