Line data Source code
1 : /******************************************************************************************************
2 :
3 : (C) 2022-2025 IVAS codec Public Collaboration with portions copyright Dolby International AB, Ericsson AB,
4 : Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD.,
5 : Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange,
6 : Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other
7 : contributors to this repository. All Rights Reserved.
8 :
9 : This software is protected by copyright law and by international treaties.
10 : The IVAS codec Public Collaboration consisting of Dolby International AB, Ericsson AB,
11 : Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD.,
12 : Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange,
13 : Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other
14 : contributors to this repository retain full ownership rights in their respective contributions in
15 : the software. This notice grants no license of any kind, including but not limited to patent
16 : license, nor is any license granted by implication, estoppel or otherwise.
17 :
18 : Contributors are required to enter into the IVAS codec Public Collaboration agreement before making
19 : contributions.
20 :
21 : This software is provided "AS IS", without any express or implied warranties. The software is in the
22 : development stage. It is intended exclusively for experts who have experience with such software and
23 : solely for the purpose of inspection. All implied warranties of non-infringement, merchantability
24 : and fitness for a particular purpose are hereby disclaimed and excluded.
25 :
26 : Any dispute, controversy or claim arising under or in relation to providing this software shall be
27 : submitted to and settled by the final, binding jurisdiction of the courts of Munich, Germany in
28 : accordance with the laws of the Federal Republic of Germany excluding its conflict of law rules and
29 : the United Nations Convention on Contracts on the International Sales of Goods.
30 :
31 : *******************************************************************************************************/
32 :
33 : /*====================================================================================
34 : EVS Codec 3GPP TS26.443 Nov 04, 2021. Version 12.14.0 / 13.10.0 / 14.6.0 / 15.4.0 / 16.3.0
35 : ====================================================================================*/
36 :
37 : #include <assert.h>
38 : #include <stdint.h>
39 : #include "options.h"
40 : #ifdef DEBUGGING
41 : #include "debug.h"
42 : #endif
43 : #include <math.h>
44 : #include "cnst.h"
45 : #include "prot.h"
46 : #include "ivas_prot.h"
47 : #include "rom_enc.h"
48 : #include "rom_com.h" /* Common static table prototypes */
49 : #include "wmc_auto.h"
50 :
51 :
52 : /*---------------------------------------------------------------------*
53 : * Local constants
54 : *---------------------------------------------------------------------*/
55 :
    /* Attack-detection geometry.                                                        */
    /* ATT_SEG_LEN: L_FRAME split into ATT_NSEG equal parts (presumably the length of    */
    /* one attack-detection segment in samples - TODO confirm against attack_det()).     */
56 : #define ATT_SEG_LEN ( L_FRAME / ATT_NSEG )
    /* ATT_3LSUB_POS: threshold on the attack position returned by attack_det(); the     */
    /* 2nd-stage classifier treats positions >= this value as "attack in the last        */
    /* subframe". Evaluated with integer arithmetic if all operands are integer          */
    /* constants (assumed - their definitions are not in this file).                     */
57 : #define ATT_3LSUB_POS ( 3 * ATT_NSEG / NB_SUBFR )
    /* ATT_3LSUB_POS_16k: same kind of threshold, computed in float, rounded to nearest  */
    /* (+0.5f) and cast to int16_t; used by the 2nd-stage classifier for bitrates        */
    /* >= ACELP_24k40 and < ACELP_48k.                                                   */
58 : #define ATT_3LSUB_POS_16k ( int16_t )( ( 4.0f * ATT_NSEG / (float) NB_SUBFR16k ) + 0.5f )
59 :
    /* The constants below are not referenced in the part of the file reviewed here;     */
    /* the descriptions are inferred from the names only - TODO confirm at point of use. */
    /* THR_CORR_PEAK: presumably a correlation-peak threshold.                           */
60 : #define THR_CORR_PEAK 0.95f
    /* TON_FACT / TON_ALPHA: presumably tonality-related factor and smoothing constant.  */
61 : #define TON_FACT 0.95f
62 : #define TON_ALPHA 0.95f
63 :
    /* DLP_BIAS: presumably a bias applied to the differential log-probability (dlp).    */
64 : #define DLP_BIAS 0.138121f
65 :
    /* THR_MASS_*: presumably upper/lower bounds and up/down adaptation steps of a       */
    /* spectral-mass threshold (cf. tonal_det() prototype) - TODO confirm.               */
66 : #define THR_MASS_MAX 0.85f
67 : #define THR_MASS_MIN 0.75f
68 : #define THR_MASS_STEP_UP 0.01f
69 : #define THR_MASS_STEP_DN 0.02f
70 :
71 :
72 : /*---------------------------------------------------------------------*
73 : * Local function prototypes
74 : *---------------------------------------------------------------------*/
75 :
    /* Forward declarations of the file-local (static) helpers.                          */
    /* sp_mus_classif_1st() and sp_mus_classif_2nd() implement the two classifier stages */
    /* invoked from speech_music_classif(); music_mixed_classif_improv(),                */
    /* tonal_context_improv() and detect_sparseness() are the post-corrections that      */
    /* speech_music_classif() applies afterwards.                                        */
76 : static void spec_analysis( float *Bin_E, float *p2v_map );
77 :
78 : static void flux( float *Bin_E, float *p2v_map, float *old_Bin_E, float *buf_flux, int16_t attack_hangover, float dec_mov );
79 :
80 : static void tonal_dist( float *p2v_map, float *buf_pkh, float *buf_Ntonal, float *buf_Ntonal2, float *buf_Ntonal_lf );
81 :
82 : static int16_t mode_decision( Encoder_State *st, int16_t len, float *dec_mov, float *buf_flux, float *buf_epsP_tilt, float *buf_pkh, float *buf_cor_map_sum, float *buf_Ntonal, float *buf_Ntonal2, float *buf_Ntonal_lf, float *buf_dlp );
83 :
84 : static void var_cor_calc( const float old_corr, float *mold_corr, float var_cor_t[], int16_t *high_stable_cor );
85 :
86 : static int16_t attack_det( const float *inp, const int16_t last_clas, const int16_t localVAD, const int16_t coder_type, const int32_t total_brate, const int16_t element_mode, const int16_t clas, float finc_prev[], float *lt_finc, int16_t *last_strong_attack );
87 :
88 : static float tonal_det( const float S[], int16_t vad_flag, float tod_S_map_lt[], float *tod_thr_lt, float *tod_weight, float *tod_S_mass_prev, float *tod_S_mass_lt );
89 :
90 : static void tonal_context_improv( Encoder_State *st, const float PS[], const float voi_fv, const float cor_map_sum_fv, const float LPCErr );
91 :
92 : static void order_spectrum( float *vec, const int16_t len );
93 :
94 : static void detect_sparseness( Encoder_State *st, const int16_t localVAD_HE_SAD, const float voi_fv );
95 :
96 : static int16_t sp_mus_classif_1st( Encoder_State *st, const int16_t localVAD_HE_SAD, const float lsp_new[M], const float cor_map_sum, const float epsP[M + 1], const float PS[], float non_sta, float relE, float *voi_fv, float *cor_map_sum_fv, float *LPCErr, int16_t *high_lpn_flag );
97 :
98 : static void sp_mus_classif_2nd( Encoder_State *st, const float Etot, int16_t *attack_flag, const float *inp );
99 :
100 : static void music_mixed_classif_improv( Encoder_State *st, const float *new_inp, const float *epsP, const float etot, const float old_cor, const float cor_map_sum );
101 :
102 :
103 : /*---------------------------------------------------------------------*
104 : * speech_music_clas_init()
105 : *
106 : * Initialization of speech/music classifier
107 : *---------------------------------------------------------------------*/
108 :
109 9534 : void speech_music_clas_init(
110 : SP_MUS_CLAS_HANDLE hSpMusClas /* i/o: speech/music classifier handle */
111 : )
112 : {
113 : int16_t i;
114 :
115 9534 : set_f( hSpMusClas->FV_st, 0.0f, N_SMC_FEATURES );
116 :
117 9534 : hSpMusClas->inact_cnt = 0;
118 9534 : set_s( hSpMusClas->past_dec, 0, HANG_LEN - 1 );
119 9534 : set_f( hSpMusClas->past_dlp, 0, HANG_LEN - 1 );
120 9534 : set_f( hSpMusClas->past_dlp_mean_ST, 0, HANG_LEN - 1 );
121 9534 : hSpMusClas->dlp_mean_ST = 0.0f;
122 9534 : hSpMusClas->dlp_mean_LT = 0.0f;
123 9534 : hSpMusClas->dlp_var_LT = 0.0f;
124 :
125 152544 : for ( i = 0; i < N_SMC_FEATURES; i++ )
126 : {
127 143010 : hSpMusClas->prev_FV[i] = 0.5f * hout_intervals[2 * i] + 0.5f * hout_intervals[2 * i + 1];
128 : }
129 :
130 152544 : for ( i = 0; i < NB_BANDS_SPMUS; i++ )
131 : {
132 143010 : hSpMusClas->past_log_enr[i] = logf( E_MIN );
133 : }
134 :
135 9534 : hSpMusClas->sp_mus_state = -8;
136 9534 : hSpMusClas->wdrop = 0.0f;
137 9534 : hSpMusClas->wrise = 0.0f;
138 9534 : hSpMusClas->wdlp_0_95_sp = 0.0f;
139 9534 : hSpMusClas->wdlp_xtalk = 0.0f;
140 9534 : set_f( hSpMusClas->last_lsp, 0.0f, M_LSP_SPMUS );
141 9534 : hSpMusClas->last_cor_map_sum = 0.0f;
142 9534 : hSpMusClas->last_non_sta = 0.0f;
143 9534 : set_f( hSpMusClas->past_PS, 0.0f, HIGHEST_FBIN - LOWEST_FBIN );
144 9534 : hSpMusClas->past_ps_diff = 0;
145 9534 : hSpMusClas->past_epsP2 = 01;
146 9534 : hSpMusClas->past_epsP = 0;
147 9534 : hSpMusClas->flag_spitch_cnt = 0;
148 :
149 9534 : hSpMusClas->gsc_thres[0] = TH_0_MIN;
150 9534 : hSpMusClas->gsc_thres[1] = TH_1_MIN;
151 9534 : hSpMusClas->gsc_thres[2] = TH_2_MIN;
152 9534 : hSpMusClas->gsc_thres[3] = TH_3_MIN;
153 9534 : set_f( hSpMusClas->gsc_lt_diff_etot, 0.0f, MAX_LT );
154 9534 : hSpMusClas->gsc_mem_etot = 0.0f;
155 9534 : hSpMusClas->gsc_last_music_flag = 0;
156 9534 : hSpMusClas->gsc_nb_thr_1 = 0;
157 9534 : hSpMusClas->gsc_nb_thr_3 = 0;
158 9534 : hSpMusClas->mold_corr = 0.9f;
159 9534 : hSpMusClas->mean_avr_dyn = 0.5f;
160 9534 : hSpMusClas->last_sw_dyn = 10.0f;
161 :
162 9534 : hSpMusClas->relE_attack_cnt = 0;
163 9534 : hSpMusClas->prev_relE = 0.0f;
164 9534 : hSpMusClas->prev_Etot = 0.0f;
165 9534 : hSpMusClas->prev_vad = 0;
166 9534 : hSpMusClas->vad_0_1_cnt = 0;
167 9534 : hSpMusClas->relE_attack_sum = 0;
168 :
169 : /* speech/music classifier improvement */
170 581574 : for ( i = 0; i < BUF_LEN; i++ )
171 : {
172 572040 : hSpMusClas->buf_flux[i] = -100;
173 572040 : hSpMusClas->buf_pkh[i] = 0;
174 572040 : hSpMusClas->buf_epsP_tilt[i] = 0;
175 572040 : hSpMusClas->buf_cor_map_sum[i] = 0;
176 572040 : hSpMusClas->buf_Ntonal[i] = 0;
177 572040 : hSpMusClas->buf_Ntonal2[i] = 0;
178 572040 : hSpMusClas->buf_Ntonal_lf[i] = 0;
179 : }
180 :
181 9534 : set_f( hSpMusClas->lpe_buf, 0, HANG_LEN_INIT );
182 9534 : set_f( hSpMusClas->voicing_buf, 0, HANG_LEN_INIT );
183 9534 : hSpMusClas->gsc_hangover = 0;
184 9534 : set_f( hSpMusClas->sparse_buf, 0, HANG_LEN_INIT );
185 9534 : set_f( hSpMusClas->hf_spar_buf, 0, HANG_LEN_INIT );
186 9534 : hSpMusClas->LT_sparse = 0.0f;
187 9534 : hSpMusClas->gsc_cnt = 0;
188 9534 : hSpMusClas->last_vad_spa = 0;
189 :
190 9534 : set_f( hSpMusClas->old_Bin_E, 0.0f, 3 * N_OLD_BIN_E );
191 9534 : set_f( hSpMusClas->buf_etot, 0, 4 );
192 9534 : set_f( hSpMusClas->buf_dlp, 0, 10 );
193 :
194 9534 : hSpMusClas->UV_cnt1 = 300;
195 9534 : hSpMusClas->LT_UV_cnt1 = 250.0f;
196 9534 : hSpMusClas->onset_cnt = 0;
197 9534 : hSpMusClas->attack_hangover = 0;
198 9534 : hSpMusClas->dec_mov = 0.0f;
199 9534 : hSpMusClas->dec_mov1 = 0.0f;
200 9534 : hSpMusClas->mov_log_max_spl = 200.0f;
201 9534 : hSpMusClas->old_lt_diff[0] = 0.0f;
202 9534 : hSpMusClas->old_lt_diff[1] = 0.0f;
203 :
204 9534 : set_f( hSpMusClas->finc_prev, 0.0f, ATT_NSEG );
205 9534 : hSpMusClas->lt_finc = 0.0f;
206 9534 : hSpMusClas->last_strong_attack = 0;
207 9534 : hSpMusClas->tdm_lt_Etot = 0.01f;
208 9534 : set_f( hSpMusClas->tod_lt_Bin_E, 0.0f, TOD_NSPEC );
209 9534 : set_f( hSpMusClas->tod_S_map_lt, 0.0f, TOD_NSPEC );
210 9534 : hSpMusClas->tod_thr_lt = TOD_THR_MASS;
211 9534 : hSpMusClas->tod_weight = 0.0f;
212 9534 : hSpMusClas->tod_S_mass_prev = 0.0f;
213 9534 : hSpMusClas->tod_S_mass_lt = 0.0f;
214 :
215 : /* speech/music classification */
216 9534 : set_s( hSpMusClas->lt_old_mode, 1, 3 );
217 9534 : hSpMusClas->lt_voicing = 0.5f;
218 9534 : hSpMusClas->lt_corr = 0.5f;
219 9534 : hSpMusClas->lt_tonality = 0;
220 9534 : set_s( hSpMusClas->lt_corr_pitch, 0, 3 );
221 9534 : hSpMusClas->lt_hangover = 0;
222 9534 : hSpMusClas->lowrate_pitchGain = 0;
223 :
224 9534 : hSpMusClas->lt_music_hangover = 0;
225 9534 : set_f( hSpMusClas->tonality2_buf, 0, HANG_LEN_INIT );
226 9534 : set_f( hSpMusClas->tonality3_buf, 0, HANG_LEN_INIT );
227 9534 : set_f( hSpMusClas->LPCErr_buf, 0, HANG_LEN_INIT );
228 9534 : hSpMusClas->lt_music_state = 0;
229 9534 : hSpMusClas->lt_speech_state = 0;
230 9534 : hSpMusClas->lt_speech_hangover = 0;
231 :
232 9534 : hSpMusClas->lt_dec_thres = 10.0f;
233 9534 : hSpMusClas->ener_RAT = 0.0f;
234 :
235 9534 : hSpMusClas->high_stable_cor = 0;
236 9534 : set_f( hSpMusClas->var_cor_t, 0.0f, VAR_COR_LEN );
237 :
238 9534 : hSpMusClas->lps = 0.0f;
239 9534 : hSpMusClas->lpm = 0.0f;
240 9534 : hSpMusClas->lpn = 0.0f;
241 :
242 9534 : return;
243 : }
244 :
245 :
246 : /*---------------------------------------------------------------------*
247 : * speech_music_classif()
248 : *
249 : * Speech/music classification
250 : *
251 : * The following technologies are used based on the outcome of the sp/mus classifier
252 : * sp_aud_decision1 sp_aud_decision2
253 : * 0 0 use ACELP (+TD BWE)
254 : * 1 0 use ACELP (+FD BWE) or HQ/LR-MDCT depending on bitrate
255 : * 1 1 use GSC (+FD BWE) or HQ/LR-MDCT depending on bitrate
256 : *
257 : * 0 1 exceptionally use GSC (+FD BWE) instead of LR-MDCT at 13.2 kbps (WB/SWB) for sparse spectra
258 : *---------------------------------------------------------------------*/
259 :
260 : /*! r: 1st stage decision (1-music, 0-speech or noise) */
261 3100 : void speech_music_classif(
262 : Encoder_State *st, /* i/o: state structure */
263 : const float *new_inp, /* i : new input signal */
264 : const float *inp, /* i : input signal to locate attach position */
265 : const int16_t localVAD_HE_SAD, /* i : HE-SAD flag without hangover */
266 : const float lsp_new[M], /* i : LSPs in current frame */
267 : const float cor_map_sum, /* i : correlation map sum (from multi-harmonic anal.) */
268 : const float epsP[M + 1], /* i : LP prediciton error */
269 : const float PS[], /* i : energy spectrum */
270 : const float Etot, /* i : total frame energy */
271 : const float old_cor, /* i : max correlation from previous frame */
272 : int16_t *attack_flag, /* o : attack flag (GSC or TC) */
273 : const float non_sta, /* i : unbound non-stationarity for sp/mus classifier */
274 : const float relE, /* i : relative frame energy */
275 : int16_t *high_lpn_flag, /* o : sp/mus LPN flag */
276 : const int16_t flag_spitch /* i : flag to indicate very short stable pitch */
277 : )
278 : {
279 : float voi_fv, cor_map_sum_fv, LPCErr;
280 :
281 : /* 1st stage speech/music classification based on the GMM model */
282 3100 : st->sp_aud_decision1 = sp_mus_classif_1st( st, localVAD_HE_SAD, lsp_new, cor_map_sum, epsP, PS, non_sta, relE, &voi_fv, &cor_map_sum_fv, &LPCErr, high_lpn_flag );
283 :
284 3100 : if ( st->codec_mode == MODE1 || st->sr_core == INT_FS_12k8 )
285 : {
286 :
287 : /* Improvement of the 1st stage decision for mixed/music content */
288 2050 : if ( !st->Opt_SC_VBR && ( st->total_brate != ACELP_24k40 ) )
289 : {
290 2050 : music_mixed_classif_improv( st, new_inp, epsP, Etot, old_cor, cor_map_sum );
291 : }
292 :
293 2050 : st->sp_aud_decision0 = st->sp_aud_decision1;
294 :
295 : /* 2nd stage speech/music classification (rewrite music to speech in onsets) */
296 2050 : st->sp_aud_decision2 = st->sp_aud_decision1;
297 :
298 2050 : if ( st->bwidth > NB )
299 : {
300 2050 : sp_mus_classif_2nd( st, Etot, attack_flag, inp );
301 :
302 2050 : if ( flag_spitch && st->bwidth == WB && st->total_brate < ACELP_13k20 )
303 : {
304 : /* avoid switch to AUDIO/MUSIC class for very short stable high pitch
305 : and/or stable pitch with high correlation at low bitrates*/
306 0 : st->sp_aud_decision2 = 0;
307 : }
308 : }
309 :
310 : /* Context-based improvement of 1st and 2nd stage decision on stable tonal signals */
311 2050 : if ( !st->Opt_SC_VBR && st->total_brate != ACELP_24k40 )
312 : {
313 2050 : tonal_context_improv( st, PS, voi_fv, cor_map_sum_fv, LPCErr );
314 : }
315 :
316 : /* Avoid using LR-MDCT on sparse spectra, use GSC instead at 13.2 kbps (WB/SWB) */
317 2050 : if ( !st->Opt_SC_VBR && st->total_brate == ACELP_13k20 && st->vad_flag == 1 && ( st->bwidth == WB || st->bwidth == SWB ) )
318 : {
319 1042 : detect_sparseness( st, localVAD_HE_SAD, voi_fv );
320 : }
321 :
322 : /* override speech/music classification to ACELP when background noise level reaches certain level */
323 : /* this is a patch against mis-classifications during active noisy speech segments */
324 2050 : if ( st->lp_noise > 12.0f )
325 : {
326 0 : st->sp_aud_decision1 = 0;
327 0 : st->sp_aud_decision2 = 0;
328 : }
329 :
330 : /* set GSC noisy speech flag on unvoiced SWB segments */
331 2050 : st->GSC_noisy_speech = 0;
332 2050 : if ( st->vad_flag == 1 && st->total_brate >= ACELP_13k20 && st->total_brate < ACELP_24k40 &&
333 1042 : st->lp_noise > 12.0f && st->sp_aud_decision1 == 0 && st->bwidth >= SWB &&
334 0 : st->coder_type_raw == UNVOICED )
335 : {
336 0 : st->GSC_noisy_speech = 1;
337 : }
338 :
339 : /* Select AUDIO frames */
340 : #ifdef DEBUGGING
341 : if ( st->codec_mode == MODE1 && ( st->force == 1 || ( st->force == -1 && ( st->sp_aud_decision2 || st->GSC_noisy_speech ) ) ) )
342 : #else
343 2050 : if ( st->codec_mode == MODE1 && ( st->sp_aud_decision2 || st->GSC_noisy_speech ) )
344 : #endif
345 : {
346 634 : st->coder_type = AUDIO;
347 634 : st->hGSCEnc->noise_lev = NOISE_LEVEL_SP0;
348 : }
349 : }
350 : else
351 : {
352 1050 : st->sp_aud_decision0 = st->sp_aud_decision1;
353 : }
354 :
355 :
356 3100 : return;
357 : }
358 :
359 :
360 : /*---------------------------------------------------------------------*
361 : * sp_mus_classif_1st()
362 : *
363 : * 1st stage speech/music classification (based on the GMM model)
364 : *---------------------------------------------------------------------*/
365 :
    /* Overview:                                                                          */
    /*  1. assemble a feature vector FV[] (entries [0]..[11] are written below)           */
    /*  2. scale it with the SF (or SF_8k for NB) slope/offset pairs                      */
    /*  3. accumulate N_MIXTURES mixture likelihoods for speech, noise and music          */
    /*  4. update the sp_mus_state / inact_cnt state machine                              */
    /*  5. derive the final binary decision with hangover and return it                   */
    /* Side effects: updates the feature/decision memories in st->hSpMusClas              */
    /* (last_lsp, last_cor_map_sum, last_non_sta, past_epsP2, past_PS, past_ps_diff,      */
    /* lpm, lps, sp_mus_state, inact_cnt, wdrop, wdlp_0_95_sp, past_dlp, past_dec) and,   */
    /* when st->hHQ_core is not NULL, st->hHQ_core->hq_generic_speech_class.              */
    /* NOTE(review): hSpMusClas->lpn is not written here; lpn is a local only.            */
366 : /*! r: decision flag (1-music, 0-speech or noise) */
367 3100 : static int16_t sp_mus_classif_1st(
368 : Encoder_State *st, /* i/o: state structure */
369 : const int16_t localVAD_HE_SAD, /* i : local VAD HE flag */
370 : const float lsp_new[M], /* i : LSPs in current frame */
371 : const float cor_map_sum, /* i : correlation map sum (from multi-harmonic anal.) */
372 : const float epsP[M + 1], /* i : LP prediction error */
373 : const float PS[], /* i : energy spectrum */
374 : float non_sta, /* i : unbound non-stationarity */
375 : float relE, /* i : relative frame energy */
376 : float *voi_fv, /* o : scaled voicing feature */
377 : float *cor_map_sum_fv, /* o : scaled correlation map feature */
378 : float *LPCErr, /* o : scaled LP prediction error feature */
379 : int16_t *high_lpn_flag /* o : sp/mus LPN flag */
380 : )
381 : {
382 : int16_t i, k, p, dec, vad;
383 : float dlp, ftmp, lepsP1, sum_PS, ps_diff, ps_sta, wrelE, wdrop, wght, mx;
    /* PS_norm[] and dPS[] are only written/read at indices LOWEST_FBIN..HIGHEST_FBIN-1;  */
    /* the fixed size 128 is assumed to be >= HIGHEST_FBIN - TODO confirm.                */
384 3100 : float FV[N_FEATURES], *pFV = FV, PS_norm[128], dPS[128], lsp[M];
385 3100 : float pys, pym, xm[N_FEATURES], py, lps = 0, lpm = 0;
386 : const float *pSF;
387 3100 : float pyn, lpn = 0;
388 :
389 3100 : SP_MUS_CLAS_HANDLE hSpMusClas = st->hSpMusClas;
390 :
391 : /*------------------------------------------------------------------*
392 : * Initialization
393 : *------------------------------------------------------------------*/
394 :
395 3100 : vad = localVAD_HE_SAD;
396 :
397 : /*------------------------------------------------------------------*
398 : * Preparation of the feature vector
399 : *------------------------------------------------------------------*/
400 :
    /* For tc_cnt equal to 1 or 2 only the last (index 2) pitch/voicing value is used,    */
    /* otherwise the mean of the three values.                                            */
401 : /* [0] OL pitch */
402 3100 : if ( st->tc_cnt == 1 || st->tc_cnt == 2 )
403 : {
404 262 : *pFV++ = (float) st->pitch[2];
405 : }
406 : else
407 : {
408 2838 : *pFV++ = (float) ( st->pitch[0] + st->pitch[1] + st->pitch[2] ) / 3.0f;
409 : }
410 :
411 : /* [1] voicing */
412 3100 : if ( st->tc_cnt == 1 || st->tc_cnt == 2 )
413 : {
414 262 : *pFV++ = st->voicing[2];
415 : }
416 : else
417 : {
418 2838 : *pFV++ = (float) ( st->voicing[0] + st->voicing[1] + st->voicing[2] ) / 3.0f;
419 : }
420 :
    /* Each of the following features is the sum of the current-frame value and the       */
    /* value stored from the previous call; the memory is then overwritten with the       */
    /* current value. The LSF features use acos() of LSPs 1..5.                           */
421 : /* [2,3,4,5,6] LSFs */
422 3100 : mvr2r( lsp_new, lsp, M );
423 :
424 3100 : ftmp = (float) acos( lsp[1] );
425 3100 : *pFV++ = ftmp + hSpMusClas->last_lsp[1];
426 3100 : hSpMusClas->last_lsp[1] = ftmp;
427 :
428 3100 : ftmp = (float) acos( lsp[2] );
429 3100 : *pFV++ = ftmp + hSpMusClas->last_lsp[2];
430 3100 : hSpMusClas->last_lsp[2] = ftmp;
431 :
432 3100 : ftmp = (float) acos( lsp[3] );
433 3100 : *pFV++ = ftmp + hSpMusClas->last_lsp[3];
434 3100 : hSpMusClas->last_lsp[3] = ftmp;
435 :
436 3100 : ftmp = (float) acos( lsp[4] );
437 3100 : *pFV++ = ftmp + hSpMusClas->last_lsp[4];
438 3100 : hSpMusClas->last_lsp[4] = ftmp;
439 :
440 3100 : ftmp = (float) acos( lsp[5] );
441 3100 : *pFV++ = ftmp + hSpMusClas->last_lsp[5];
442 3100 : hSpMusClas->last_lsp[5] = ftmp;
443 :
444 : /* [7] cor_map_sum */
445 3100 : *pFV++ = cor_map_sum + hSpMusClas->last_cor_map_sum;
446 3100 : hSpMusClas->last_cor_map_sum = cor_map_sum;
447 :
448 : /* [8] non_sta */
449 3100 : *pFV++ = non_sta + hSpMusClas->last_non_sta;
450 3100 : hSpMusClas->last_non_sta = non_sta;
451 :
452 : /* [9] epsP */
453 3100 : if ( st->bwidth == NB )
454 : {
455 : /* do not take into account (statistics are too different) */
456 0 : *pFV++ = -1.647f;
457 : }
458 : else
459 : {
    /* log ratio epsP[13] / epsP[1]; only epsP[1] is protected by the 1e-5f offset.       */
    /* NOTE(review): logf( epsP[13] ) has no such guard - assumes epsP[13] > 0.           */
460 3100 : lepsP1 = logf( epsP[1] + 1e-5f );
461 3100 : ftmp = logf( epsP[13] ) - lepsP1;
462 3100 : *pFV++ = ftmp + hSpMusClas->past_epsP2;
463 3100 : hSpMusClas->past_epsP2 = ftmp;
464 : }
465 :
    /* The spectrum is normalised by its sum over [LOWEST_FBIN, HIGHEST_FBIN) (plus       */
    /* 1e-5f to avoid division by zero) and compared bin by bin with the normalised       */
    /* spectrum stored in past_PS[] (which is indexed from 0, hence i - LOWEST_FBIN).     */
466 : /* calculation of differential normalized power spectrum */
467 3100 : sum_PS = 1e-5f;
468 210800 : for ( i = LOWEST_FBIN; i < HIGHEST_FBIN; i++ )
469 : {
470 207700 : sum_PS += PS[i];
471 : }
472 :
473 210800 : for ( i = LOWEST_FBIN; i < HIGHEST_FBIN; i++ )
474 : {
475 207700 : PS_norm[i] = PS[i] / sum_PS;
476 207700 : dPS[i] = fabsf( PS_norm[i] - hSpMusClas->past_PS[i - LOWEST_FBIN] );
477 : }
478 :
479 : /* [10] ps_diff (spectral difference) */
480 3100 : ps_diff = 0;
481 210800 : for ( i = LOWEST_FBIN; i < HIGHEST_FBIN; i++ )
482 : {
483 207700 : ps_diff += dPS[i];
484 : }
485 :
486 3100 : ps_diff = logf( ps_diff + 1e-5f );
487 3100 : *pFV++ = ps_diff + hSpMusClas->past_ps_diff;
488 3100 : hSpMusClas->past_ps_diff = ps_diff;
489 :
    /* Sum over bins of max(current, previous) / |current - previous|; unlike the         */
    /* features above, this one is not combined with a previous-frame value.              */
490 : /* [11] ps_sta (spectral stationarity) */
491 3100 : ps_sta = 0;
492 210800 : for ( i = LOWEST_FBIN; i < HIGHEST_FBIN; i++ )
493 : {
494 207700 : mx = PS_norm[i] > hSpMusClas->past_PS[i - LOWEST_FBIN] ? PS_norm[i] : hSpMusClas->past_PS[i - LOWEST_FBIN];
495 207700 : ps_sta += mx / ( dPS[i] + 1e-5f );
496 : }
497 :
498 3100 : *pFV++ = logf( ps_sta + 1e-5f );
499 3100 : mvr2r( &PS_norm[LOWEST_FBIN], hSpMusClas->past_PS, HIGHEST_FBIN - LOWEST_FBIN );
500 :
501 : /*------------------------------------------------------------------*
502 : * Scaling of the feature vector
503 : *------------------------------------------------------------------*/
504 :
    /* The scaling table holds one (gain, offset) pair per feature: FV = gain * FV + offset */
505 3100 : pFV = FV;
506 3100 : if ( st->bwidth == NB )
507 : {
508 0 : pSF = SF_8k;
509 : }
510 : else
511 : {
512 3100 : pSF = SF;
513 : }
514 :
515 40300 : for ( i = 0; i < N_FEATURES; i++, pFV++, pSF += 2 )
516 : {
517 37200 : *pFV = pSF[0] * *pFV + pSF[1];
518 : }
519 :
520 : /* store some scaled parameters for later correction of the 1st stage speech/music classification */
521 3100 : *voi_fv = FV[1];
522 3100 : *cor_map_sum_fv = FV[7];
523 3100 : *LPCErr = FV[9];
524 :
525 : /*------------------------------------------------------------------*
526 : * Calculation of posterior probability
527 : * Log-probability
528 : *------------------------------------------------------------------*/
529 :
    /* accumulators start at 1e-5f so that the logf() calls below never see zero */
530 3100 : pys = pym = pyn = 1e-5f;
531 :
    /* For each class, xm[] is the feature vector minus the mixture mean, and the         */
    /* exponent is lvm_<class>[k] plus dot_product_mat( xm, invV_<class> block ).         */
    /* All three class likelihoods are accumulated for every frame, regardless of vad.    */
532 : /* run loop for all mixtures (for each mixture, calculate the probability of speech or noise and the probability of music) */
533 21700 : for ( k = 0; k < N_MIXTURES; k++ )
534 : {
535 : /* active frames - calculate the probability of speech */
536 241800 : for ( p = 0; p < N_FEATURES; p++ )
537 : {
538 223200 : xm[p] = FV[p] - m_speech[k * N_FEATURES + p];
539 : }
540 :
541 18600 : py = lvm_speech[k] + dot_product_mat( xm, &invV_speech[k * N_FEATURES * N_FEATURES], N_FEATURES );
542 18600 : pys += expf( py );
543 : /* inactive frames - calculate the probability of noise */
544 241800 : for ( p = 0; p < N_FEATURES; p++ )
545 : {
546 223200 : xm[p] = FV[p] - m_noise[k * N_FEATURES + p];
547 : }
548 :
549 18600 : py = lvm_noise[k] + dot_product_mat( xm, &invV_noise[k * N_FEATURES * N_FEATURES], N_FEATURES );
550 18600 : pyn += expf( py );
551 :
552 : /* either active or inactive frames - calculate the probability of music */
553 241800 : for ( p = 0; p < N_FEATURES; p++ )
554 : {
555 223200 : xm[p] = FV[p] - m_music[k * N_FEATURES + p];
556 : }
557 :
558 18600 : py = lvm_music[k] + dot_product_mat( xm, &invV_music[k * N_FEATURES * N_FEATURES], N_FEATURES );
559 18600 : pym += expf( py );
560 : }
561 :
    /* the same constant 0.5 * N_FEATURES * log(PI2) is subtracted from all three classes */
562 : /* calculate log-probability */
563 3100 : lps = logf( pys ) - 0.5f * N_FEATURES * logf( PI2 );
564 3100 : lpm = logf( pym ) - 0.5f * N_FEATURES * logf( PI2 );
565 3100 : lpn = logf( pyn ) - 0.5f * N_FEATURES * logf( PI2 );
566 :
    /* flag frames where noise is the most likely of the three classes */
567 3100 : *high_lpn_flag = 0;
568 3100 : if ( lpn > lps && lpn > lpm )
569 : {
570 43 : *high_lpn_flag = 1;
571 : }
572 :
573 3100 : if ( !vad )
574 : {
    /* on inactive frames the speech log-probability is replaced by 1.2 * lpn, so the     */
    /* values stored and compared below are effectively "noise vs. music"                 */
575 : /* artificially increase log-probability of noise */
576 84 : lps = lpn * 1.2f;
577 : }
578 :
579 3100 : hSpMusClas->lpm = lpm;
580 3100 : hSpMusClas->lps = lps;
581 :
    /* speech class is signalled when lps exceeds lpm by more than 0.5 */
582 : /* determine HQ Generic speech class */
583 3100 : if ( st->hHQ_core != NULL )
584 : {
585 3100 : if ( lps > lpm + 0.5f )
586 : {
587 1414 : st->hHQ_core->hq_generic_speech_class = 1;
588 : }
589 : else
590 : {
591 1686 : st->hHQ_core->hq_generic_speech_class = 0;
592 : }
593 : }
594 :
595 : /*------------------------------------------------------------------*
596 : * State machine (sp_mus_state < 0 .. inactive, > 0 .. entry, = 0 .. active )
597 : *------------------------------------------------------------------*/
598 :
    /* sp_mus_state counts 1..HANG_LEN during/after entry and down to -HANG_LEN while     */
    /* inactive or unstable; inact_cnt is reloaded with 12 on each (re-)entry and counts  */
    /* down otherwise. A zero inact_cnt at entry selects the gradual entry (state 1),     */
    /* a non-zero one jumps directly to HANG_LEN.                                         */
599 3100 : if ( vad )
600 : {
601 3016 : if ( relE < -20 || ( lps <= -5 && lpm <= -5 ) )
602 : {
603 397 : if ( hSpMusClas->sp_mus_state > 0 )
604 : {
605 71 : if ( hSpMusClas->sp_mus_state < HANG_LEN )
606 : {
607 : /* energy is too low but we are in entry period -> reset the inactive counter to allow new entry later */
608 9 : hSpMusClas->inact_cnt = 0;
609 : }
610 :
611 : /* energy is too low -> we are going to instable state */
612 71 : hSpMusClas->sp_mus_state = 0;
613 : }
614 326 : else if ( hSpMusClas->sp_mus_state > -HANG_LEN )
615 : {
616 : /* energy is still too low -> we are still in instable state */
617 154 : hSpMusClas->sp_mus_state--;
618 : }
619 : }
620 2619 : else if ( hSpMusClas->sp_mus_state <= 0 )
621 : {
622 71 : if ( hSpMusClas->inact_cnt == 0 )
623 : {
624 :
625 24 : hSpMusClas->sp_mus_state = 1;
626 : }
627 : else
628 : {
629 :
630 47 : hSpMusClas->sp_mus_state = HANG_LEN;
631 : }
632 :
633 71 : hSpMusClas->inact_cnt = 12;
634 : }
    /* NOTE(review): "sp_mus_state > 0" is always true here because the previous branch   */
    /* already handled sp_mus_state <= 0.                                                 */
635 2548 : else if ( hSpMusClas->sp_mus_state > 0 && hSpMusClas->sp_mus_state < HANG_LEN )
636 : {
637 : /* we are inside an entry period -> increment the counter of entry frames */
638 129 : hSpMusClas->sp_mus_state++;
639 : }
640 :
641 3016 : if ( hSpMusClas->sp_mus_state < 0 && hSpMusClas->inact_cnt > 0 )
642 : {
643 196 : hSpMusClas->inact_cnt--;
644 : }
645 : }
646 : else
647 : {
    /* inactive frame: an interrupted entry period clears inact_cnt and drops straight    */
    /* to -HANG_LEN; a completed entry (state == HANG_LEN) goes to -1; otherwise the      */
    /* state keeps decreasing until it reaches -HANG_LEN. The first if-chain tests        */
    /* sp_mus_state before the second one modifies it.                                    */
648 84 : if ( hSpMusClas->sp_mus_state > 0 && hSpMusClas->sp_mus_state < HANG_LEN )
649 : {
650 0 : hSpMusClas->inact_cnt = 0;
651 : }
652 84 : else if ( hSpMusClas->inact_cnt > 0 )
653 : {
654 26 : hSpMusClas->inact_cnt--;
655 : }
656 :
657 84 : if ( hSpMusClas->sp_mus_state > 0 && hSpMusClas->sp_mus_state < HANG_LEN )
658 : {
659 0 : hSpMusClas->sp_mus_state = -HANG_LEN;
660 : }
661 84 : else if ( hSpMusClas->sp_mus_state > 0 )
662 : {
663 0 : hSpMusClas->sp_mus_state = -1;
664 : }
665 84 : else if ( hSpMusClas->sp_mus_state > -HANG_LEN )
666 : {
667 : /* we are in inactive state */
668 45 : hSpMusClas->sp_mus_state--;
669 : }
670 : }
671 :
672 : /*------------------------------------------------------------------*
673 : * Decision without hangover
674 : * Weighted decision
675 : *------------------------------------------------------------------*/
676 :
    /* NOTE(review): the value of dec set here (and in the !vad branch just below) is     */
    /* overwritten by every branch of the final decision further down; only dlp is        */
    /* actually used afterwards.                                                          */
677 : /* decision without hangover (0 - speech/noise, 1 - music) */
678 3100 : dec = lpm > lps;
679 3100 : dlp = lpm - lps;
680 :
681 3100 : if ( !vad )
682 : {
683 84 : dec = 0;
684 84 : dlp = 0;
685 : }
686 :
    /* wrelE = 1 + relE / 15, clamped to [0.01, 1] */
687 : /* calculate weight based on relE (close to 0.01 in low-E regions, close to 1 in high-E regions) */
688 3100 : wrelE = 1.0f + relE / 15;
689 :
690 3100 : if ( wrelE > 1.0f )
691 : {
692 1164 : wrelE = 1.0f;
693 : }
694 1936 : else if ( wrelE < 0.01f )
695 : {
696 665 : wrelE = 0.01f;
697 : }
698 :
    /* hSpMusClas->wdrop accumulates the size of consecutive decreases of a negative dlp; */
    /* it restarts at -dlp when the previous dlp was positive and is reset to zero as     */
    /* soon as dlp is non-negative or stops decreasing.                                   */
699 : /* calculate weight based on drops of dlp (close to 1 during sudden drops of dlp, close to 0 otherwise) */
700 3100 : if ( dlp < 0 && dlp < hSpMusClas->past_dlp[0] )
701 : {
702 881 : if ( hSpMusClas->past_dlp[0] > 0 )
703 : {
704 280 : hSpMusClas->wdrop = -dlp;
705 : }
706 : else
707 : {
708 601 : hSpMusClas->wdrop += hSpMusClas->past_dlp[0] - dlp;
709 : }
710 : }
711 : else
712 : {
713 2219 : hSpMusClas->wdrop = 0;
714 : }
715 :
    /* local weight: accumulated drop / 20, clamped to [0.1, 1] */
716 3100 : wdrop = hSpMusClas->wdrop / 20;
717 :
718 3100 : if ( wdrop > 1.0f )
719 : {
720 0 : wdrop = 1.0f;
721 : }
722 3100 : else if ( wdrop < 0.1f )
723 : {
724 2627 : wdrop = 0.1f;
725 : }
726 :
727 : /* combine weights into one */
728 3100 : wght = wrelE * wdrop;
729 3100 : if ( wght < 0.01f )
730 : {
731 727 : wght = 0.01f;
732 : }
733 :
    /* first-order recursive smoothing of dlp with the adaptive weight wght */
734 : /* calculate weighted decision */
735 3100 : hSpMusClas->wdlp_0_95_sp = wght * dlp + ( 1 - wght ) * hSpMusClas->wdlp_0_95_sp;
736 :
    /* the smoothed decision is reset once the state machine is fully inactive */
737 3100 : if ( hSpMusClas->sp_mus_state == -HANG_LEN )
738 : {
739 230 : hSpMusClas->wdlp_0_95_sp = 0;
740 : }
741 :
742 : /*------------------------------------------------------------------*
743 : * Final speech/music decision
744 : *------------------------------------------------------------------*/
745 :
746 3100 : if ( !vad && hSpMusClas->sp_mus_state == -HANG_LEN )
747 : {
748 : /* inactive state */
749 43 : dec = 0;
750 : }
751 3057 : else if ( hSpMusClas->sp_mus_state <= 0 )
752 : {
753 : /* transition from active to inactive state or instable state */
754 438 : dec = hSpMusClas->past_dec[0];
755 : }
    /* NOTE(review): as in the state machine above, "sp_mus_state > 0" is implied here.   */
756 2619 : else if ( hSpMusClas->sp_mus_state > 0 && hSpMusClas->sp_mus_state < HANG_LEN )
757 : {
    /* row (sp_mus_state - 1) of w_spmus: column 0 weights the current dlp, the next      */
    /* HANG_LEN - 1 columns weight past_dlp[]; music is chosen if the sum exceeds 2.0     */
758 : /* entry state -> final decision is calculated based on weighted average of past non-binary decisions */
759 138 : ftmp = w_spmus[hSpMusClas->sp_mus_state - 1][0] * dlp;
760 138 : ftmp += dotp( &w_spmus[hSpMusClas->sp_mus_state - 1][1], hSpMusClas->past_dlp, HANG_LEN - 1 );
761 138 : dec = ftmp > 2.0f;
762 : }
763 : else
764 : {
    /* hysteresis: speech->music requires a positive smoothed decision and three          */
    /* consecutive past speech decisions; music->speech requires only a negative          */
    /* smoothed decision                                                                  */
765 : /* stable active state */
766 2481 : if ( hSpMusClas->wdlp_0_95_sp > 0 && hSpMusClas->past_dec[0] == 0 && hSpMusClas->past_dec[1] == 0 && hSpMusClas->past_dec[2] == 0 )
767 : {
768 : /* switching from speech to music */
769 17 : dec = 1;
770 : }
771 2464 : else if ( hSpMusClas->past_dec[0] == 1 && hSpMusClas->wdlp_0_95_sp < 0 )
772 : {
773 : /* switching from music to speech */
774 17 : dec = 0;
775 : }
776 : else
777 : {
778 2447 : dec = hSpMusClas->past_dec[0];
779 : }
780 : }
781 :
782 : /*------------------------------------------------------------------*
783 : * Updates
784 : *------------------------------------------------------------------*/
785 :
    /* Both history buffers are shifted by one position towards higher indices and the    */
    /* newest value is stored at index 0. Source and destination overlap; this relies     */
    /* on mvr2r()/mvs2s() copying overlapping regions correctly - TODO confirm.           */
786 : /* update buffer of past non-binary decisions */
787 3100 : mvr2r( &hSpMusClas->past_dlp[0], &hSpMusClas->past_dlp[1], HANG_LEN - 2 );
788 3100 : hSpMusClas->past_dlp[0] = dlp;
789 :
790 : /* update buffer of past binary decisions */
791 3100 : mvs2s( &hSpMusClas->past_dec[0], &hSpMusClas->past_dec[1], HANG_LEN - 2 );
792 3100 : hSpMusClas->past_dec[0] = dec;
793 :
794 3100 : return dec;
795 : }
796 :
797 :
798 : /*---------------------------------------------------------------------*
799 : * sp_mus_classif_2nd()
800 : *
801 : * 2nd stage speech/music classifier (convert music to speech for onsets)
802 : *---------------------------------------------------------------------*/
803 :
804 2050 : static void sp_mus_classif_2nd(
805 : Encoder_State *st, /* i/o: encoder state structure */
806 : const float Etot, /* i : total frame energy */
807 : int16_t *attack_flag, /* i/o: attack flag (GSC or TC) */
808 : const float *inp /* i : input signal */
809 : )
810 : {
811 : int16_t attack;
812 2050 : SP_MUS_CLAS_HANDLE hSpMusClas = st->hSpMusClas;
813 :
814 : /* initialization */
815 2050 : *attack_flag = 0;
816 :
817 : /* signal stability estimation */
818 2050 : stab_est( Etot, hSpMusClas->gsc_lt_diff_etot, &hSpMusClas->gsc_mem_etot, &hSpMusClas->gsc_nb_thr_3, &hSpMusClas->gsc_nb_thr_1, hSpMusClas->gsc_thres, &hSpMusClas->gsc_last_music_flag, st->vad_flag );
819 :
820 : /* calculate variance of correlation */
821 2050 : var_cor_calc( st->old_corr, &hSpMusClas->mold_corr, hSpMusClas->var_cor_t, &hSpMusClas->high_stable_cor );
822 :
823 : /* attack detection */
824 2050 : attack = attack_det( inp, st->clas, st->localVAD, st->coder_type, st->total_brate, EVS_MONO, st->clas, hSpMusClas->finc_prev, &hSpMusClas->lt_finc, &hSpMusClas->last_strong_attack );
825 :
826 : /* change decision from music to speech in certain special cases */
827 2050 : if ( st->sp_aud_decision1 == 1 )
828 : {
829 677 : if ( hSpMusClas->ener_RAT < 0.18f && hSpMusClas->lt_dec_thres > 15.0f )
830 : {
831 : /* strong music decision but almost no content below 1kHz */
832 0 : st->sp_aud_decision2 = 0;
833 : }
834 677 : else if ( hSpMusClas->high_stable_cor && st->pitch[0] >= 130 )
835 : {
836 : /* prevent GSC in highly correlated signal with low energy variation */
837 : /* this is basically a patch against bassoon-type of music */
838 0 : st->sp_aud_decision2 = 0;
839 :
840 0 : if ( st->codec_mode == MODE1 && st->coder_type == TRANSITION )
841 : {
842 0 : st->coder_type = GENERIC;
843 : }
844 : }
845 677 : else if ( hSpMusClas->gsc_lt_diff_etot[MAX_LT - 1] > 4.5f && ( hSpMusClas->gsc_lt_diff_etot[MAX_LT - 1] - hSpMusClas->gsc_lt_diff_etot[MAX_LT - 2] > 10.0f ) )
846 : {
847 21 : if ( st->tc_cnt == 1 )
848 : {
849 : /* do TC coding instead of GC/VC if onset has been already declared before */
850 0 : st->sp_aud_decision2 = 0;
851 :
852 0 : if ( st->codec_mode == MODE1 )
853 : {
854 0 : st->coder_type = TRANSITION;
855 : }
856 : }
857 : else
858 : {
859 21 : if ( attack >= ATT_3LSUB_POS )
860 : {
861 : /* do TC coding if attack is located in the last subframe */
862 6 : st->sp_aud_decision2 = 0;
863 6 : *attack_flag = attack + 1;
864 :
865 6 : if ( st->codec_mode == MODE1 )
866 : {
867 6 : st->coder_type = TRANSITION;
868 : }
869 : }
870 15 : else if ( attack >= ATT_SEG_LEN / 2 )
871 : {
872 : /* do GSC coding if attack is located after the first quarter of the first subframe */
873 : /* (pre-echo will be treated at the decoder side) */
874 0 : st->sp_aud_decision2 = 1;
875 0 : *attack_flag = 31;
876 : }
877 : }
878 : }
879 : }
880 1373 : else if ( st->localVAD == 1 && st->coder_type == GENERIC && ( ( attack >= ATT_3LSUB_POS && st->total_brate < ACELP_24k40 ) || ( attack >= ATT_3LSUB_POS_16k && st->total_brate >= ACELP_24k40 && st->total_brate < ACELP_48k ) ) )
881 : {
882 : /* do TC coding if attack is located in the last subframe */
883 19 : *attack_flag = attack + 1;
884 :
885 19 : if ( st->codec_mode == MODE1 )
886 : {
887 19 : st->coder_type = TRANSITION;
888 : }
889 : }
890 :
891 2050 : return;
892 : }
893 :
894 :
895 : /*---------------------------------------------------------------------*
896 : * tonal_det()
897 : *
898 : * Tonal detector based on spectral stability and harmonicity
899 : *---------------------------------------------------------------------*/
900 :
901 414383 : static float tonal_det(
902 : const float S[],
903 : int16_t vad_flag,
904 : float tod_S_map_lt[],
905 : float *tod_thr_lt,
906 : float *tod_weight,
907 : float *tod_S_mass_prev,
908 : float *tod_S_mass_lt )
909 : {
910 : int16_t i;
911 : float S_mass, alpha;
912 :
913 : /* update the adaptive weight */
914 414383 : *tod_weight = TON_ALPHA * *tod_weight + ( 1 - TON_ALPHA ) * vad_flag;
915 414383 : if ( *tod_weight > TON_ALPHA )
916 : {
917 268725 : *tod_weight = TON_ALPHA;
918 : }
919 145658 : else if ( *tod_weight < ( 1 - TON_ALPHA ) )
920 : {
921 29279 : *tod_weight = 1 - TON_ALPHA;
922 : }
923 :
924 : /* calculate LT spectral correlation in each band up to 4KHz */
925 414383 : S_mass = 0.0f;
926 33565023 : for ( i = 0; i < TOD_NSPEC; i++ )
927 : {
928 33150640 : tod_S_map_lt[i] = *tod_weight * tod_S_map_lt[i] + ( 1 - *tod_weight ) * S[i];
929 :
930 33150640 : S_mass += tod_S_map_lt[i];
931 : }
932 414383 : S_mass /= TOD_NSPEC;
933 :
934 414383 : if ( S_mass > *tod_S_mass_prev )
935 : {
936 201357 : alpha = 0.7f;
937 : }
938 : else
939 : {
940 213026 : alpha = 0.3f;
941 : }
942 414383 : *tod_S_mass_prev = S_mass;
943 414383 : *tod_S_mass_lt = alpha * *tod_S_mass_lt + ( 1 - alpha ) * S_mass;
944 414383 : S_mass = *tod_S_mass_lt;
945 :
946 : /* updating adaptive decision threshold */
947 414383 : if ( S_mass > *tod_thr_lt )
948 : {
949 3580 : *tod_thr_lt -= THR_MASS_STEP_DN;
950 : }
951 : else
952 : {
953 410803 : *tod_thr_lt += THR_MASS_STEP_UP;
954 : }
955 :
956 414383 : if ( *tod_thr_lt > THR_MASS_MAX )
957 : {
958 409801 : *tod_thr_lt = THR_MASS_MAX;
959 : }
960 :
961 414383 : if ( *tod_thr_lt < THR_MASS_MIN )
962 : {
963 3026 : *tod_thr_lt = THR_MASS_MIN;
964 : }
965 :
966 414383 : return S_mass;
967 : }
968 :
969 : /*---------------------------------------------------------------------*
970 : * var_cor_calc()
971 : *
972 : * Calculate variance of correlation
973 : *---------------------------------------------------------------------*/
974 :
975 416433 : static void var_cor_calc(
976 : const float old_corr,
977 : float *mold_corr,
978 : float var_cor_t[],
979 : int16_t *high_stable_cor )
980 : {
981 : int16_t i;
982 : float var_cor;
983 :
984 : /* update buffer of old correlation values */
985 4164330 : for ( i = VAR_COR_LEN - 1; i > 0; i-- )
986 : {
987 3747897 : var_cor_t[i] = var_cor_t[i - 1];
988 : }
989 416433 : var_cor_t[i] = old_corr;
990 :
991 : /* calculate variance of correlation */
992 416433 : var_cor = var( var_cor_t, VAR_COR_LEN );
993 :
994 : /* set flag in case of highly-correlated stable signal */
995 416433 : if ( *mold_corr > 0.8f && var_cor < 5e-4f )
996 : {
997 8550 : *high_stable_cor = 1;
998 : }
999 : else
1000 : {
1001 407883 : *high_stable_cor = 0;
1002 : }
1003 :
1004 : /* update average correlation */
1005 416433 : *mold_corr = 0.1f * old_corr + 0.9f * *mold_corr;
1006 :
1007 416433 : return;
1008 : }
1009 :
1010 : /*---------------------------------------------------------------------*
1011 : * attack_det()
1012 : *
1013 : * Attack detection
1014 : *---------------------------------------------------------------------*/
1015 :
1016 416433 : static int16_t attack_det(
1017 : const float *inp, /* i : input signal */
1018 : const int16_t last_clas, /* i : last signal clas */
1019 : const int16_t localVAD, /* i : local VAD flag */
1020 : const int16_t coder_type, /* i : coder type */
1021 : const int32_t total_brate, /* i : total bitrate */
1022 : const int16_t element_mode, /* i : IVAS element mode */
1023 : const int16_t clas, /* i : signal class */
1024 : float finc_prev[], /* i/o: previous finc */
1025 : float *lt_finc, /* i/o: long-term mean finc */
1026 : int16_t *last_strong_attack /* i/o: last strong attack flag */
1027 : )
1028 : {
1029 : int16_t i, attack;
1030 : float etmp, etmp2, finc[ATT_NSEG];
1031 : int16_t att_3lsub_pos;
1032 : int16_t attack1;
1033 :
1034 416433 : att_3lsub_pos = ATT_3LSUB_POS;
1035 416433 : if ( total_brate >= ACELP_24k40 )
1036 : {
1037 1000 : att_3lsub_pos = ATT_3LSUB_POS_16k;
1038 : }
1039 :
1040 : /* compute energy per section */
1041 13742289 : for ( i = 0; i < ATT_NSEG; i++ )
1042 : {
1043 13325856 : finc[i] = sum2_f( inp + i * ATT_SEG_LEN, ATT_SEG_LEN );
1044 : }
1045 :
1046 416433 : attack = maximum( finc, ATT_NSEG, &etmp );
1047 416433 : attack1 = attack;
1048 :
1049 416433 : if ( localVAD == 1 && coder_type == GENERIC )
1050 : {
1051 : /* compute mean energy in the first three subframes */
1052 209252 : etmp = mean( finc, att_3lsub_pos );
1053 :
1054 : /* compute mean energy after the attack */
1055 209252 : etmp2 = mean( finc + attack, ATT_NSEG - attack );
1056 :
1057 : /* and compare them */
1058 209252 : if ( etmp * 8 > etmp2 )
1059 : {
1060 : /* stop, if the attack is not sufficiently strong */
1061 202823 : attack = 0;
1062 : }
1063 :
1064 209252 : if ( last_clas == VOICED_CLAS && etmp * 20 > etmp2 )
1065 : {
1066 : /* stop, if the signal was voiced and the attack is not sufficiently strong */
1067 49980 : attack = 0;
1068 : }
1069 :
1070 : /* compare wrt. other sections (reduces miss-classification) */
1071 209252 : if ( attack > 0 )
1072 : {
1073 5824 : etmp2 = finc[attack];
1074 :
1075 119270 : for ( i = 2; i < att_3lsub_pos - 2; i++ )
1076 : {
1077 113892 : if ( finc[i] * 2.0f > etmp2 )
1078 : {
1079 : /* stop, if the attack is not sufficiently strong */
1080 446 : attack = 0;
1081 446 : break;
1082 : }
1083 : }
1084 : }
1085 :
1086 209252 : if ( attack == 0 && element_mode > EVS_MONO && ( clas < VOICED_TRANSITION || clas == ONSET ) )
1087 : {
1088 135235 : mvr2r( finc, finc_prev, attack1 );
1089 :
1090 : /* compute mean energy before the attack */
1091 135235 : etmp = mean( finc_prev, ATT_NSEG );
1092 :
1093 135235 : etmp2 = finc[attack1];
1094 :
1095 135235 : if ( ( etmp * 16 < etmp2 ) || ( etmp * 12 < etmp2 && last_clas == UNVOICED_CLAS ) )
1096 : {
1097 5020 : attack = attack1;
1098 : }
1099 :
1100 135235 : if ( 20 * *lt_finc > etmp2 || *last_strong_attack )
1101 : {
1102 127360 : attack = 0;
1103 : }
1104 : }
1105 :
1106 209252 : *last_strong_attack = attack;
1107 : }
1108 :
1109 : /* compare wrt. other sections (reduces miss-classification) */
1110 207181 : else if ( attack > 0 )
1111 : {
1112 2551047 : for ( i = 2; i < att_3lsub_pos - 2; i++ )
1113 : {
1114 2476273 : if ( i != attack && finc[i] * 1.3f > finc[attack] )
1115 : {
1116 : /* stop, if the attack is not sufficiently strong */
1117 122805 : attack = 0;
1118 122805 : break;
1119 : }
1120 : }
1121 197579 : *last_strong_attack = 0;
1122 : }
1123 :
1124 : /* updates */
1125 416433 : mvr2r( finc, finc_prev, ATT_NSEG );
1126 416433 : *lt_finc = 0.95f * *lt_finc + 0.05f * mean( finc, ATT_NSEG );
1127 :
1128 416433 : return attack;
1129 : }
1130 :
1131 : /*---------------------------------------------------------------------*
1132 : * ivas_smc_gmm()
1133 : *
1134 : * 1st stage of the speech/music classification (based on the GMM model)
1135 : *---------------------------------------------------------------------*/
1136 :
1137 : /*! r: S/M decision (0=speech or noise,1=unclear,2=music) */
1138 1150734 : int16_t ivas_smc_gmm(
1139 : Encoder_State *st, /* i/o: state structure */
1140 : STEREO_CLASSIF_HANDLE hStereoClassif, /* i/o: stereo classifier structure */
1141 : const int16_t localVAD_HE_SAD, /* i : HE-SAD flag without hangover */
1142 : const float Etot, /* i : total frame energy */
1143 : const float lsp_new[M], /* i : LSPs in current frame */
1144 : const float cor_map_sum, /* i : correlation map sum (from multi-harmonic anal.) */
1145 : const float epsP[M + 1], /* i : LP prediciton error */
1146 : const float PS[], /* i : energy spectrum */
1147 : const float non_sta, /* i : unbound non-stationarity */
1148 : const float relE, /* i : relative frame energy */
1149 : int16_t *high_lpn_flag, /* i/o: sp/mus LPN flag */
1150 : const int16_t flag_spitch /* i : flag to indicate very short stable pitch */
1151 : )
1152 : {
1153 : int16_t i, m, dec;
1154 : int16_t flag_odv;
1155 : float lps, lpm, lpn;
1156 : float ps[N_SMC_MIXTURES], pm[N_SMC_MIXTURES], pn[N_SMC_MIXTURES];
1157 : float fvm[N_PCA_COEF], lprob;
1158 : float dlp, ftmp, sum_PS, ps_diff, ps_sta, wrelE, wdrop, wght;
1159 : float wrise;
1160 : float dlp_mean2var;
1161 : float FV[N_SMC_FEATURES], *pFV, PS_norm[128], dPS[128];
1162 : const float *pODV;
1163 : float *pFV_st, smc_st_mean_fact;
1164 : int16_t relE_attack_flag;
1165 : int16_t j, len;
1166 : const float *pt_mel_fb;
1167 : float melS[NB_MEL_BANDS], mfcc[NB_MEL_BANDS];
1168 : int16_t odv_cnt;
1169 : int16_t i_out[N_SMC_FEATURES], *p_out;
1170 :
1171 : /*------------------------------------------------------------------*
1172 : * Initialization
1173 : *------------------------------------------------------------------*/
1174 :
1175 1150734 : SP_MUS_CLAS_HANDLE hSpMusClas = st->hSpMusClas;
1176 :
1177 : /*------------------------------------------------------------------*
1178 : * State machine (sp_mus_state: -8 = INACTIVE, -7:-1 = UNSTABLE, 0:7 = ENTRY, 8 = STABLE )
1179 : *------------------------------------------------------------------*/
1180 :
1181 1150734 : if ( localVAD_HE_SAD )
1182 : {
1183 974933 : if ( relE < -20 )
1184 : {
1185 99681 : if ( hSpMusClas->sp_mus_state > 0 )
1186 : {
1187 10654 : if ( hSpMusClas->sp_mus_state < HANG_LEN )
1188 : {
1189 : /* energy is too low but we are in entry period -> reset the inactive counter to allow new entry later */
1190 2326 : hSpMusClas->inact_cnt = 0;
1191 : }
1192 :
1193 : /* energy is too low -> we are going to instable state */
1194 10654 : hSpMusClas->sp_mus_state = 0;
1195 : }
1196 89027 : else if ( hSpMusClas->sp_mus_state > -HANG_LEN )
1197 : {
1198 : /* energy is still too low -> we are still in instable state */
1199 29447 : hSpMusClas->sp_mus_state--;
1200 : }
1201 : }
1202 875252 : else if ( hSpMusClas->sp_mus_state <= 0 )
1203 : {
1204 22078 : if ( hSpMusClas->inact_cnt == 0 )
1205 : {
1206 :
1207 13799 : hSpMusClas->sp_mus_state = 1;
1208 : }
1209 : else
1210 : {
1211 :
1212 8279 : hSpMusClas->sp_mus_state = HANG_LEN;
1213 : }
1214 :
1215 22078 : hSpMusClas->inact_cnt = 12;
1216 : }
1217 853174 : else if ( hSpMusClas->sp_mus_state > 0 && hSpMusClas->sp_mus_state < HANG_LEN )
1218 : {
1219 : /* we are inside an entry period -> increment the counter of entry frames */
1220 65577 : hSpMusClas->sp_mus_state++;
1221 : }
1222 :
1223 974933 : if ( hSpMusClas->sp_mus_state < 0 && hSpMusClas->inact_cnt > 0 )
1224 : {
1225 30363 : hSpMusClas->inact_cnt--;
1226 : }
1227 : }
1228 : else
1229 : {
1230 175801 : if ( hSpMusClas->sp_mus_state > 0 && hSpMusClas->sp_mus_state < HANG_LEN )
1231 : {
1232 1057 : hSpMusClas->inact_cnt = 0;
1233 : }
1234 174744 : else if ( hSpMusClas->inact_cnt > 0 )
1235 : {
1236 23520 : hSpMusClas->inact_cnt--;
1237 : }
1238 :
1239 175801 : if ( hSpMusClas->sp_mus_state > 0 && hSpMusClas->sp_mus_state < HANG_LEN )
1240 : {
1241 1057 : hSpMusClas->sp_mus_state = -HANG_LEN;
1242 : }
1243 174744 : else if ( hSpMusClas->sp_mus_state > 0 )
1244 : {
1245 3479 : hSpMusClas->sp_mus_state = -1;
1246 : }
1247 171265 : else if ( hSpMusClas->sp_mus_state > -HANG_LEN )
1248 : {
1249 : /* we are in inactive state */
1250 15599 : hSpMusClas->sp_mus_state--;
1251 : }
1252 : }
1253 :
1254 : /* detect attacks based on relE */
1255 1150734 : if ( relE > hSpMusClas->prev_relE )
1256 : {
1257 488181 : hSpMusClas->relE_attack_sum += relE - hSpMusClas->prev_relE;
1258 : }
1259 : else
1260 : {
1261 662553 : hSpMusClas->relE_attack_sum = 0;
1262 : }
1263 1150734 : hSpMusClas->prev_relE = relE;
1264 :
1265 : /* update counter from last VAD 0->1 change */
1266 1150734 : if ( hSpMusClas->prev_vad == 0 && localVAD_HE_SAD == 1 )
1267 : {
1268 15822 : hSpMusClas->vad_0_1_cnt = 1;
1269 : }
1270 1134912 : else if ( localVAD_HE_SAD == 1 && hSpMusClas->vad_0_1_cnt > 0 && hSpMusClas->vad_0_1_cnt < 50 )
1271 : {
1272 248931 : hSpMusClas->vad_0_1_cnt++;
1273 : }
1274 : else
1275 : {
1276 885981 : hSpMusClas->vad_0_1_cnt = 0;
1277 : }
1278 1150734 : hSpMusClas->prev_vad = localVAD_HE_SAD;
1279 :
1280 1150734 : if ( hSpMusClas->sp_mus_state > 0 && hSpMusClas->sp_mus_state < HANG_LEN && hSpMusClas->relE_attack_sum > 5.0f )
1281 : {
1282 23128 : hSpMusClas->relE_attack_cnt++;
1283 :
1284 : /* set flag only in the first X frames in a series */
1285 23128 : if ( hSpMusClas->relE_attack_cnt > 0 && hSpMusClas->relE_attack_cnt < 3 )
1286 : {
1287 16591 : relE_attack_flag = 1;
1288 : }
1289 : else
1290 : {
1291 6537 : relE_attack_flag = 0;
1292 : }
1293 : }
1294 : else
1295 : {
1296 1127606 : hSpMusClas->relE_attack_cnt = 0;
1297 1127606 : relE_attack_flag = 0;
1298 : }
1299 :
1300 1150734 : hSpMusClas->prev_Etot = Etot;
1301 :
1302 : /*------------------------------------------------------------------*
1303 : * Preparation of the feature vector
1304 : *------------------------------------------------------------------*/
1305 :
1306 1150734 : pFV = FV;
1307 :
1308 : /* [0] OL pitch */
1309 1150734 : if ( relE_attack_flag || st->tc_cnt == 1 || st->tc_cnt == 2 )
1310 : {
1311 115893 : *pFV++ = (float) st->pitch[2];
1312 : }
1313 : else
1314 : {
1315 1034841 : *pFV++ = (float) ( st->pitch[0] + st->pitch[1] + st->pitch[2] ) / 3.0f;
1316 : }
1317 :
1318 : /* [1] voicing */
1319 1150734 : if ( relE_attack_flag || st->tc_cnt == 1 || st->tc_cnt == 2 )
1320 : {
1321 115893 : *pFV++ = st->voicing[2];
1322 : }
1323 : else
1324 : {
1325 1034841 : *pFV++ = ( st->voicing[0] + st->voicing[1] + st->voicing[2] ) / 3.0f;
1326 : }
1327 :
1328 : /* [2,3,4,5,6] LSFs */
1329 1150734 : *pFV++ = acosf( lsp_new[2] );
1330 1150734 : *pFV++ = acosf( lsp_new[3] );
1331 1150734 : *pFV++ = acosf( lsp_new[4] );
1332 1150734 : *pFV++ = acosf( lsp_new[5] );
1333 1150734 : *pFV++ = acosf( lsp_new[6] );
1334 :
1335 : /* [7] cor_map_sum */
1336 1150734 : *pFV++ = cor_map_sum;
1337 :
1338 : /* [8] non_sta */
1339 1150734 : *pFV++ = non_sta;
1340 :
1341 : /* [9] epsP */
1342 1150734 : *pFV++ = logf( epsP[14] + 1e-5f ) - logf( epsP[0] + 1e-5f );
1343 :
1344 : /* [10,11,12] MFCCs */
1345 1150734 : set_zero( melS, NB_MEL_BANDS );
1346 1150734 : pt_mel_fb = mel_fb;
1347 47180094 : for ( i = 0; i < NB_MEL_BANDS; i++ )
1348 : {
1349 46029360 : j = mel_fb_start[i];
1350 46029360 : len = mel_fb_len[i];
1351 46029360 : melS[i] = logf( dotp( &PS[j], pt_mel_fb, len ) + 1e-5f );
1352 46029360 : pt_mel_fb += len;
1353 : }
1354 :
1355 1150734 : v_mult_mat( mfcc, melS, dct_mtx, NB_MEL_BANDS, NB_MEL_COEF );
1356 :
1357 1150734 : *pFV++ = mfcc[2];
1358 1150734 : *pFV++ = mfcc[6];
1359 1150734 : *pFV++ = mfcc[12];
1360 :
1361 : /* calculation of differential normalized power spectrum */
1362 1150734 : sum_PS = 1e-5f;
1363 78249912 : for ( i = LOWEST_FBIN; i < HIGHEST_FBIN; i++ )
1364 : {
1365 77099178 : sum_PS += PS[i];
1366 : }
1367 :
1368 78249912 : for ( i = LOWEST_FBIN; i < HIGHEST_FBIN; i++ )
1369 : {
1370 77099178 : PS_norm[i] = PS[i] / sum_PS;
1371 77099178 : dPS[i] = fabsf( PS_norm[i] - hSpMusClas->past_PS[i - LOWEST_FBIN] );
1372 : }
1373 :
1374 : /* [13] ps_diff (spectral difference) */
1375 1150734 : ps_diff = 0;
1376 78249912 : for ( i = LOWEST_FBIN; i < HIGHEST_FBIN; i++ )
1377 : {
1378 77099178 : ps_diff += dPS[i];
1379 : }
1380 :
1381 1150734 : *pFV++ = ps_diff;
1382 :
1383 : /* [14] ps_sta (spectral stationarity) */
1384 1150734 : ps_sta = 0;
1385 78249912 : for ( i = LOWEST_FBIN; i < HIGHEST_FBIN; i++ )
1386 : {
1387 77099178 : if ( PS_norm[i] > hSpMusClas->past_PS[i - LOWEST_FBIN] )
1388 : {
1389 36353323 : ps_sta += PS_norm[i] / ( dPS[i] + 1e-5f );
1390 : }
1391 : else
1392 : {
1393 40745855 : ps_sta += hSpMusClas->past_PS[i - LOWEST_FBIN] / ( dPS[i] + 1e-5f );
1394 : }
1395 : }
1396 :
1397 1150734 : *pFV++ = logf( ps_sta + 1e-5f );
1398 1150734 : mvr2r( &PS_norm[LOWEST_FBIN], hSpMusClas->past_PS, HIGHEST_FBIN - LOWEST_FBIN );
1399 :
1400 : /* save ps_diff and ps_sta features for XTALK and UNCLR classifier */
1401 1150734 : if ( hStereoClassif != NULL )
1402 : {
1403 782031 : if ( st->idchan == 0 )
1404 : {
1405 420855 : hStereoClassif->ps_diff_ch1 = ps_diff;
1406 420855 : hStereoClassif->ps_sta_ch1 = logf( ps_sta + 1e-5f );
1407 : }
1408 : else
1409 : {
1410 361176 : hStereoClassif->ps_diff_ch2 = ps_diff;
1411 361176 : hStereoClassif->ps_sta_ch2 = logf( ps_sta + 1e-5f );
1412 : }
1413 : }
1414 :
1415 : /*------------------------------------------------------------------*
1416 : * Outlier detection based on feature histograms
1417 : *------------------------------------------------------------------*/
1418 :
1419 1150734 : flag_odv = 0;
1420 1150734 : if ( localVAD_HE_SAD )
1421 : {
1422 974933 : pFV = FV;
1423 974933 : pODV = hout_intervals;
1424 974933 : p_out = i_out;
1425 974933 : odv_cnt = 0;
1426 15598928 : for ( i = 0; i < N_SMC_FEATURES; i++ )
1427 : {
1428 14623995 : if ( *pFV < pODV[0] || *pFV > pODV[1] )
1429 : {
1430 2602 : *p_out++ = i;
1431 2602 : odv_cnt++;
1432 : }
1433 :
1434 14623995 : pFV++;
1435 14623995 : pODV += 2;
1436 : }
1437 :
1438 : /* set outlier flag */
1439 974933 : if ( odv_cnt >= 2 )
1440 : {
1441 587 : flag_odv = 1;
1442 :
1443 : /* replace outlying features with values from the previous frame */
1444 2096 : for ( i = 0; i < odv_cnt; i++ )
1445 : {
1446 1509 : FV[i_out[i]] = hSpMusClas->prev_FV[i_out[i]];
1447 : }
1448 : }
1449 : }
1450 :
1451 : /*------------------------------------------------------------------*
1452 : * Adaptive short-term mean filter on feature vector
1453 : *------------------------------------------------------------------*/
1454 :
1455 1150734 : pFV = FV;
1456 1150734 : pFV_st = hSpMusClas->FV_st;
1457 1150734 : smc_st_mean_fact = SMC_ST_MEAN_FACT;
1458 18411744 : for ( i = 0; i < N_SMC_FEATURES; i++ )
1459 : {
1460 17261010 : *pFV_st = smc_st_mean_fact * ( *pFV_st ) + ( 1 - smc_st_mean_fact ) * ( *pFV );
1461 :
1462 17261010 : if ( hSpMusClas->sp_mus_state > 0 && hSpMusClas->sp_mus_state < HANG_LEN && ( relE_attack_flag || flag_odv ) )
1463 : {
1464 : /* strong attack or outlier frame during entry state -> features cannot be trusted but there is also no useful past info -> */
1465 : /* -> do whatever you want because dlp will be reset to 0 anyway */
1466 248910 : pFV++;
1467 248910 : pFV_st++;
1468 : }
1469 17012100 : else if ( hSpMusClas->sp_mus_state == HANG_LEN && ( st->tc_cnt == 1 || st->tc_cnt == 2 ) )
1470 : {
1471 : /* energy attack in stable state -> use current features intead of the long-term average */
1472 1360095 : pFV++;
1473 1360095 : pFV_st++;
1474 : }
1475 : else
1476 : {
1477 15652005 : *pFV++ = *pFV_st++;
1478 : }
1479 : }
1480 :
1481 : /* update */
1482 1150734 : mvr2r( FV, hSpMusClas->prev_FV, N_SMC_FEATURES );
1483 :
1484 : /*------------------------------------------------------------------*
1485 : * Non-linear power transformation (boxcox) on certain features
1486 : *------------------------------------------------------------------*/
1487 :
1488 1150734 : pFV = FV;
1489 18411744 : for ( i = 0; i < N_SMC_FEATURES; i++ )
1490 : {
1491 17261010 : if ( bcox_lmbd[i] != 0 )
1492 : {
1493 3452202 : *pFV -= bcox_add_cnst[i];
1494 3452202 : if ( *pFV < 1 )
1495 : {
1496 112160 : *pFV = 1;
1497 : }
1498 3452202 : *pFV = ( powf( *pFV, bcox_lmbd[i] ) - 1 ) / bcox_lmbd[i];
1499 : }
1500 :
1501 17261010 : pFV++;
1502 : }
1503 :
1504 : /*------------------------------------------------------------------*
1505 : * Scaling of the feature vector
1506 : * PCA
1507 : *------------------------------------------------------------------*/
1508 :
1509 1150734 : pFV = FV;
1510 18411744 : for ( i = 0; i < N_SMC_FEATURES; i++ )
1511 : {
1512 : /* Standard scaler - mean and variance normalization */
1513 17261010 : *pFV = ( *pFV - sm_means[i] ) / sm_scale[i];
1514 17261010 : pFV++;
1515 :
1516 : /* MinMax sclaer - mean and variance normalization */
1517 : /**pFV = *pFV * sm_scale[i] + sm_min[i];*/
1518 : /*pFV++;*/
1519 : }
1520 :
1521 : /* PCA */
1522 1150734 : v_sub( FV, pca_mean_, FV, N_SMC_FEATURES );
1523 1150734 : v_mult_mat( FV, FV, pca_components_, N_SMC_FEATURES, N_PCA_COEF );
1524 :
1525 : /*------------------------------------------------------------------*
1526 : * Calculation of posterior probability
1527 : * Log-probability
1528 : *------------------------------------------------------------------*/
1529 :
1530 : /* run loop for all mixtures (for each mixture, calculate the probability of speech, music and noise) */
1531 1150734 : lps = lpm = lpn = 0;
1532 8055138 : for ( m = 0; m < N_SMC_MIXTURES; m++ )
1533 : {
1534 6904404 : v_sub( FV, &means_speech[m * N_PCA_COEF], fvm, N_PCA_COEF );
1535 6904404 : lprob = dot_product_cholesky( fvm, &prec_chol_speech[m * ( N_PCA_COEF * N_PCA_COEF + N_PCA_COEF ) / 2], N_PCA_COEF );
1536 6904404 : ps[m] = logf( weights_speech[m] ) + log_det_chol_speech[m] - 0.5f * N_PCA_COEF * logf( PI2 ) - 0.5f * lprob;
1537 :
1538 6904404 : v_sub( FV, &means_music[m * N_PCA_COEF], fvm, N_PCA_COEF );
1539 6904404 : lprob = dot_product_cholesky( fvm, &prec_chol_music[m * ( N_PCA_COEF * N_PCA_COEF + N_PCA_COEF ) / 2], N_PCA_COEF );
1540 6904404 : pm[m] = logf( weights_music[m] ) + log_det_chol_music[m] - 0.5f * N_PCA_COEF * logf( PI2 ) - 0.5f * lprob;
1541 :
1542 6904404 : v_sub( FV, &means_noise[m * N_PCA_COEF], fvm, N_PCA_COEF );
1543 6904404 : lprob = dot_product_cholesky( fvm, &prec_chol_noise[m * ( N_PCA_COEF * N_PCA_COEF + N_PCA_COEF ) / 2], N_PCA_COEF );
1544 6904404 : pn[m] = logf( weights_noise[m] ) + log_det_chol_noise[m] - 0.5f * N_PCA_COEF * logf( PI2 ) - 0.5f * lprob;
1545 : }
1546 :
1547 1150734 : lps = logsumexp( ps, N_SMC_MIXTURES );
1548 1150734 : lpm = logsumexp( pm, N_SMC_MIXTURES );
1549 1150734 : lpn = logsumexp( pn, N_SMC_MIXTURES );
1550 :
1551 1150734 : *high_lpn_flag = 0;
1552 1150734 : if ( lpn > lps && lpn > lpm )
1553 : {
1554 141950 : *high_lpn_flag = 1;
1555 : }
1556 :
1557 1150734 : hSpMusClas->lpm = lpm;
1558 1150734 : hSpMusClas->lps = lps;
1559 1150734 : hSpMusClas->lpn = lpn;
1560 :
1561 : /* determine HQ Generic speech class */
1562 1150734 : if ( st->hHQ_core != NULL )
1563 : {
1564 421725 : if ( lps > lpm + 0.5f )
1565 : {
1566 160042 : st->hHQ_core->hq_generic_speech_class = 1;
1567 : }
1568 : else
1569 : {
1570 261683 : st->hHQ_core->hq_generic_speech_class = 0;
1571 : }
1572 : }
1573 :
1574 : /*------------------------------------------------------------------*
1575 : * Decision without hangover
1576 : * Weighted decision
1577 : *------------------------------------------------------------------*/
1578 :
1579 : /* decision without hangover (0 - speech/noise, 1 - music) */
1580 1150734 : if ( !localVAD_HE_SAD || Etot < 10 || ( hSpMusClas->sp_mus_state > 0 && hSpMusClas->sp_mus_state < HANG_LEN && ( relE_attack_flag || flag_odv ) ) )
1581 : {
1582 214380 : dlp = 0;
1583 : }
1584 : else
1585 : {
1586 936354 : dlp = lpm - lps + DLP_BIAS;
1587 :
1588 936354 : if ( dlp > 30.0f )
1589 : {
1590 33541 : dlp = 30.0f;
1591 : }
1592 902813 : else if ( dlp < -30.0f )
1593 : {
1594 0 : dlp = -30.0f;
1595 : }
1596 : }
1597 :
1598 1150734 : dec = dlp > 0;
1599 :
1600 : /* calculate weight based on relE (higher relE -> lower weight, lower relE -> higher weight) */
1601 1150734 : wrelE = lin_interp( relE, 15.0f, 0.9f, -15.0f, 0.99f, 1 );
1602 :
1603 : /* calculate weight based on drops of dlp (close to 1 during sudden drops of dlp, close to 0 otherwise) */
1604 1150734 : hSpMusClas->dlp_mean_ST = 0.8f * hSpMusClas->dlp_mean_ST + 0.2f * dlp;
1605 1150734 : hSpMusClas->lt_dec_thres = hSpMusClas->dlp_mean_ST;
1606 :
1607 1150734 : if ( dlp < 0 && dlp < hSpMusClas->dlp_mean_ST )
1608 : {
1609 258685 : if ( hSpMusClas->dlp_mean_ST > 0 )
1610 : {
1611 77455 : hSpMusClas->wdrop = -dlp;
1612 : }
1613 181230 : else if ( hSpMusClas->wdrop > 0 )
1614 : {
1615 42931 : hSpMusClas->wdrop += hSpMusClas->dlp_mean_ST - dlp;
1616 : }
1617 : }
1618 : else
1619 : {
1620 892049 : hSpMusClas->wdrop = 0;
1621 : }
1622 :
1623 1150734 : wdrop = lin_interp( hSpMusClas->wdrop, 15.0f, 0.7f, 0.0f, 1.0f, 1 );
1624 :
1625 : /* calculate weight based on rises of dlp (close to 1 during sudden rise of dlp, close to 0 otherwise) */
1626 1150734 : if ( hSpMusClas->sp_mus_state == HANG_LEN && hSpMusClas->dlp_mean_ST > 0 && hSpMusClas->dlp_mean_ST > hSpMusClas->past_dlp_mean_ST[0] )
1627 : {
1628 251990 : if ( hSpMusClas->past_dlp_mean_ST[0] < 0 )
1629 : {
1630 14052 : hSpMusClas->wrise = hSpMusClas->dlp_mean_ST;
1631 : }
1632 237938 : else if ( hSpMusClas->wrise > 0 )
1633 : {
1634 34759 : hSpMusClas->wrise += hSpMusClas->dlp_mean_ST - hSpMusClas->past_dlp_mean_ST[0];
1635 : }
1636 : }
1637 : else
1638 : {
1639 898744 : hSpMusClas->wrise = 0;
1640 : }
1641 :
1642 1150734 : wrise = lin_interp( hSpMusClas->wrise, 5.0f, 0.95f, 0.0f, 1.0f, 1 );
1643 :
1644 : /* combine weights into one */
1645 1150734 : wght = wrelE * wdrop * wrise;
1646 :
1647 : /* ratio of delta means vs. delta variances */
1648 1150734 : if ( hSpMusClas->sp_mus_state > 0 && hSpMusClas->sp_mus_state < HANG_LEN )
1649 : {
1650 71537 : hSpMusClas->dlp_mean_LT = dlp;
1651 71537 : hSpMusClas->dlp_var_LT = 0;
1652 : }
1653 :
1654 1150734 : hSpMusClas->dlp_mean_LT = 0.9f * hSpMusClas->dlp_mean_LT + 0.1f * dlp;
1655 1150734 : ftmp = dlp - hSpMusClas->dlp_mean_LT;
1656 1150734 : hSpMusClas->dlp_var_LT = 0.9f * hSpMusClas->dlp_var_LT + 0.1f * ( ftmp * ftmp );
1657 :
1658 1150734 : if ( hSpMusClas->sp_mus_state > 0 && hSpMusClas->sp_mus_state < HANG_LEN )
1659 : {
1660 71537 : dlp_mean2var = 0;
1661 : }
1662 : else
1663 : {
1664 1079197 : dlp_mean2var = fabsf( hSpMusClas->dlp_mean_LT ) / ( sqrtf( fabsf( hSpMusClas->dlp_var_LT ) ) + 1.0f );
1665 : }
1666 :
1667 1150734 : if ( dlp_mean2var > 15.0f )
1668 : {
1669 : /* decrease the weight little bit when the classifier indicates "strong speech" or "strong music" */
1670 3225 : wght *= 0.9f;
1671 : }
1672 :
1673 1150734 : if ( wght > 1.0f )
1674 : {
1675 0 : wght = 1.0f;
1676 : }
1677 1150734 : else if ( wght < 0.01f )
1678 : {
1679 0 : wght = 0.01f;
1680 : }
1681 :
1682 1150734 : if ( Etot < 10 )
1683 : {
1684 : /* silence */
1685 135252 : wght = 0.92f;
1686 : }
1687 :
1688 : /* calculate weighted decision */
1689 1150734 : hSpMusClas->wdlp_0_95_sp = wght * hSpMusClas->wdlp_0_95_sp + ( 1 - wght ) * dlp;
1690 :
1691 : /* xtalk classifier: apply long hysteresis to prevent LRTD on music */
1692 1150734 : hSpMusClas->wdlp_xtalk = 0.995f * hSpMusClas->wdlp_xtalk + 0.005f * dlp;
1693 :
1694 : /*------------------------------------------------------------------*
1695 : * Final speech/music decision
1696 : *------------------------------------------------------------------*/
1697 :
1698 1150734 : if ( flag_spitch )
1699 : {
1700 39748 : hSpMusClas->flag_spitch_cnt = 5;
1701 : }
1702 1110986 : else if ( hSpMusClas->flag_spitch_cnt > 0 )
1703 : {
1704 5871 : hSpMusClas->flag_spitch_cnt--;
1705 : }
1706 :
1707 1150734 : if ( Etot < 10 )
1708 : {
1709 : /* silence */
1710 135252 : dec = 0;
1711 : }
1712 1015482 : else if ( hSpMusClas->sp_mus_state > 0 && hSpMusClas->sp_mus_state < HANG_LEN )
1713 : {
1714 : /* entry state -> final decision is calculated based on weighted average of past non-binary decisions */
1715 71537 : ftmp = w_spmus[hSpMusClas->sp_mus_state - 1][0] * dlp;
1716 71537 : ftmp += dotp( &w_spmus[hSpMusClas->sp_mus_state - 1][1], hSpMusClas->past_dlp, HANG_LEN - 1 );
1717 71537 : if ( ftmp > 2.0f )
1718 : {
1719 35380 : if ( dlp > 2.0f )
1720 : {
1721 24507 : dec = 2;
1722 : }
1723 : else
1724 : {
1725 10873 : dec = 1;
1726 : }
1727 : }
1728 : else
1729 : {
1730 36157 : dec = 0;
1731 : }
1732 : }
1733 : else
1734 : {
1735 : /* stable active state */
1736 943945 : if ( hSpMusClas->past_dec[0] == 0 && hSpMusClas->past_dec[1] == 0 && hSpMusClas->past_dec[2] == 0 &&
1737 471305 : ( ( hSpMusClas->flag_spitch_cnt > 0 && hSpMusClas->wdlp_0_95_sp > 3.4f ) || ( hSpMusClas->flag_spitch_cnt == 0 && hSpMusClas->wdlp_0_95_sp > 2.1f ) ) )
1738 : {
1739 : /* switching from speech to unclear */
1740 1879 : dec = 1;
1741 : }
1742 942066 : else if ( hSpMusClas->past_dec[0] == 0 && hSpMusClas->vad_0_1_cnt < 50 && hSpMusClas->relE_attack_sum == 0.0f && hSpMusClas->wdlp_0_95_sp > 1.0f )
1743 : {
1744 : /* switch from speech to unclear also during slowly rising weak music onsets */
1745 3431 : dec = 1;
1746 : }
1747 938635 : else if ( hSpMusClas->past_dec[0] == 1 && hSpMusClas->wdlp_0_95_sp > 2.5f )
1748 : {
1749 : /* switching from unclear to music */
1750 4227 : dec = 2;
1751 : }
1752 934408 : else if ( hSpMusClas->past_dec[0] == 2 && hSpMusClas->past_dec[1] == 2 && hSpMusClas->past_dec[2] == 2 && hSpMusClas->wdlp_0_95_sp < -1.0f )
1753 : {
1754 : /* switching from music to unclear */
1755 2466 : dec = 1;
1756 : }
1757 931942 : else if ( hSpMusClas->past_dec[0] == 1 && hSpMusClas->wdlp_0_95_sp < -2.5f )
1758 : {
1759 : /* switching from unclear to speech */
1760 2542 : dec = 0;
1761 : }
1762 : else
1763 : {
1764 929400 : dec = hSpMusClas->past_dec[0];
1765 : }
1766 : }
1767 :
1768 : /*------------------------------------------------------------------*
1769 : * raw S/M decision based on smoothed GMM score
1770 : *------------------------------------------------------------------*/
1771 :
1772 1150734 : if ( dec == 0 || st->hSpMusClas->wdlp_0_95_sp <= 0 )
1773 : {
1774 696332 : st->sp_aud_decision0 = 0;
1775 696332 : st->sp_aud_decision1 = 0;
1776 : }
1777 : else
1778 : {
1779 454402 : st->sp_aud_decision0 = 1;
1780 454402 : st->sp_aud_decision1 = 1;
1781 : }
1782 :
1783 : /*------------------------------------------------------------------*
1784 : * Updates
1785 : *------------------------------------------------------------------*/
1786 :
1787 : /* update buffer of past non-binary decisions */
1788 1150734 : mvr2r( &hSpMusClas->past_dlp[0], &hSpMusClas->past_dlp[1], HANG_LEN - 2 );
1789 1150734 : hSpMusClas->past_dlp[0] = dlp;
1790 :
1791 1150734 : mvr2r( &hSpMusClas->past_dlp_mean_ST[0], &hSpMusClas->past_dlp_mean_ST[1], HANG_LEN - 2 );
1792 1150734 : hSpMusClas->past_dlp_mean_ST[0] = hSpMusClas->dlp_mean_ST;
1793 :
1794 : /* update buffer of past binary decisions */
1795 1150734 : mvs2s( &hSpMusClas->past_dec[0], &hSpMusClas->past_dec[1], HANG_LEN - 2 );
1796 1150734 : hSpMusClas->past_dec[0] = dec;
1797 :
1798 : #ifdef DEBUG_MODE_INFO
1799 : dbgwrite( &st->hSpMusClas->wdlp_0_95_sp, sizeof( float ), 1, 1, "res/wdlp_0_95_sp.x" );
1800 : #endif
1801 :
1802 1150734 : return dec;
1803 : }
1804 :
1805 : /*---------------------------------------------------------------------*
1806 : * ivas_smc_mode_selection()
1807 : *
1808 : * 2nd stage speech/music classifier (select coding mode (ACELP, GSC and TCX) based on S/M classification)
1809 : * output (sp_aud_decision1 - sp_aud_decision2 -> coding mode):
1810 : * 0 - 0 -> ACELP
1811 : * 1 - 0 -> GSC
1812 : * 1 - 1 -> TCX
1813 : *---------------------------------------------------------------------*/
1814 :
1815 414383 : void ivas_smc_mode_selection(
1816 : Encoder_State *st, /* i/o: encoder state structure */
1817 : const int32_t element_brate, /* i : element bitrate */
1818 : int16_t smc_dec, /* i : raw decision of the 1st stage classifier*/
1819 : const float relE, /* i : relative frame energy */
1820 : const float Etot, /* i : total frame energy */
1821 : int16_t *attack_flag, /* i/o: attack flag (GSC or TC) */
1822 : const float *inp, /* i : input signal */
1823 : const float S_map[], /* i : short-term correlation map */
1824 : const int16_t flag_spitch /* i : flag to indicate very short stable pitch*/
1825 : )
1826 : {
1827 : int16_t attack;
1828 : float ton;
1829 : int16_t i;
1830 : float S_p2a, S_max, S_ave;
1831 : float thr_sp2a;
1832 :
1833 414383 : SP_MUS_CLAS_HANDLE hSpMusClas = st->hSpMusClas;
1834 :
1835 : /* initialization */
1836 414383 : *attack_flag = 0;
1837 414383 : st->sp_aud_decision2 = 0;
1838 :
1839 : /* signal stability estimation */
1840 414383 : stab_est( Etot, hSpMusClas->gsc_lt_diff_etot, &hSpMusClas->gsc_mem_etot, &hSpMusClas->gsc_nb_thr_3, &hSpMusClas->gsc_nb_thr_1, hSpMusClas->gsc_thres, &hSpMusClas->gsc_last_music_flag, st->vad_flag );
1841 :
1842 : /* calculate variance of correlation */
1843 414383 : var_cor_calc( st->old_corr, &hSpMusClas->mold_corr, hSpMusClas->var_cor_t, &hSpMusClas->high_stable_cor );
1844 :
1845 : /* attack detection */
1846 414383 : attack = attack_det( inp, st->clas, st->localVAD, st->coder_type, 0, st->element_mode, st->clas, hSpMusClas->finc_prev, &hSpMusClas->lt_finc, &hSpMusClas->last_strong_attack );
1847 :
1848 : /* tonal detector */
1849 414383 : ton = tonal_det( S_map, st->vad_flag, hSpMusClas->tod_S_map_lt, &hSpMusClas->tod_thr_lt, &hSpMusClas->tod_weight, &hSpMusClas->tod_S_mass_prev, &hSpMusClas->tod_S_mass_lt );
1850 :
1851 :
1852 : /* calculate spectral peak-to-average ratio */
1853 33565023 : for ( i = 0; i < TOD_NSPEC; i++ )
1854 : {
1855 33150640 : st->hSpMusClas->tod_lt_Bin_E[i] = P2A_FACT * st->hSpMusClas->tod_lt_Bin_E[i] + ( 1 - P2A_FACT ) * st->Bin_E[i];
1856 : }
1857 :
1858 414383 : maximum( st->hSpMusClas->tod_lt_Bin_E, TOD_NSPEC, &S_max );
1859 414383 : S_ave = sum_f( st->hSpMusClas->tod_lt_Bin_E, TOD_NSPEC ) / TOD_NSPEC;
1860 414383 : S_p2a = S_max - S_ave;
1861 :
1862 414383 : if ( element_brate <= IVAS_16k4 )
1863 : {
1864 135734 : thr_sp2a = THR_P2A_HIGH;
1865 : }
1866 : else
1867 : {
1868 278649 : thr_sp2a = THR_P2A;
1869 : }
1870 :
1871 : /* initial 3-way selection of coding modes (ACELP/GSC/TCX) */
1872 414383 : if ( relE > -10.0f && ( S_p2a > thr_sp2a || ton > hSpMusClas->tod_thr_lt ) )
1873 : {
1874 : /* select TCX to encode extremely peaky signals or strongly tonal signals */
1875 19590 : st->sp_aud_decision1 = 1;
1876 19590 : st->sp_aud_decision2 = 1;
1877 : }
1878 394793 : else if ( smc_dec == SPEECH )
1879 : {
1880 : /* select ACELP to encode speech */
1881 153265 : st->sp_aud_decision1 = 0;
1882 153265 : st->sp_aud_decision2 = 0;
1883 : }
1884 241528 : else if ( smc_dec == SPEECH_OR_MUSIC )
1885 : {
1886 : /* select GSC to encode "unclear" segments (classifier's score on the borderline) */
1887 6438 : st->sp_aud_decision1 = 1;
1888 6438 : st->sp_aud_decision2 = 0;
1889 : }
1890 : else
1891 : {
1892 : /* select TCX to encode music */
1893 235090 : st->sp_aud_decision1 = 1;
1894 235090 : st->sp_aud_decision2 = 1;
1895 : }
1896 :
1897 : /* change decision from GSC to ACELP/TCX in some special cases */
1898 414383 : if ( st->sp_aud_decision1 == 1 && st->sp_aud_decision2 == 0 )
1899 : {
1900 6438 : if ( hSpMusClas->ener_RAT < 0.18f && hSpMusClas->lt_dec_thres > 15.0f )
1901 : {
1902 : /* prevent GSC on strong music with almost no content below 1kHz */
1903 4 : st->sp_aud_decision2 = 1;
1904 : }
1905 6434 : else if ( flag_spitch )
1906 : {
1907 : /* prevent GSC on signals with very short and stable high pitch period */
1908 120 : if ( hSpMusClas->wdlp_0_95_sp < 2.5f )
1909 : {
1910 : /* select ACELP instead */
1911 116 : st->sp_aud_decision1 = 0;
1912 : }
1913 : else
1914 : {
1915 : /* select TCX instead */
1916 4 : st->sp_aud_decision2 = 1;
1917 : }
1918 : }
1919 6314 : else if ( hSpMusClas->high_stable_cor && st->pitch[0] >= 130 )
1920 : {
1921 : /* prevent GSC in highly correlated signal with low energy variation */
1922 : /* this is basically a patch against bassoon-type of music */
1923 0 : st->sp_aud_decision2 = 1;
1924 : }
1925 : }
1926 :
1927 : /* change decision from GSC to ACELP TC during attacks/onsets */
1928 414383 : if ( st->sp_aud_decision1 == 1 && st->sp_aud_decision2 == 0 )
1929 : {
1930 6314 : if ( ( hSpMusClas->gsc_lt_diff_etot[MAX_LT - 1] > 4.5f ) &&
1931 559 : ( hSpMusClas->gsc_lt_diff_etot[MAX_LT - 1] - hSpMusClas->gsc_lt_diff_etot[MAX_LT - 2] > 10.0f ) )
1932 : {
1933 123 : if ( st->tc_cnt == 1 )
1934 : {
1935 : /* do ACELP TC coding instead of GC/VC if onset has been already declared before */
1936 47 : st->sp_aud_decision1 = 0;
1937 47 : st->coder_type = TRANSITION;
1938 : }
1939 : else
1940 : {
1941 76 : if ( attack >= ATT_3LSUB_POS )
1942 : {
1943 : /* do ACELP TC coding also if attack is located in the last subframe */
1944 16 : st->sp_aud_decision1 = 0;
1945 16 : *attack_flag = attack + 1;
1946 16 : st->coder_type = TRANSITION;
1947 : }
1948 60 : else if ( attack >= ATT_SEG_LEN / 2 )
1949 : {
1950 : /* do GSC coding if attack is located after the first quarter of the first subframe */
1951 : /* (pre-echo will be treated at the decoder side) */
1952 1 : *attack_flag = 31;
1953 1 : *attack_flag = attack + 1;
1954 : }
1955 : }
1956 : }
1957 : }
1958 :
1959 414383 : if ( st->localVAD == 1 && st->coder_type == GENERIC && attack > 0 /*&& *attack_flag < 32*/ /*&& st->tc_cnt != 2*/ && !( st->sp_aud_decision2 == 1 && ton > 0.65f ) )
1960 : {
1961 : /* change ACELP coder_type to TC if attack has been detected */
1962 6751 : st->sp_aud_decision1 = 0;
1963 6751 : st->sp_aud_decision2 = 0;
1964 :
1965 6751 : st->coder_type = TRANSITION;
1966 6751 : *attack_flag = attack + 1;
1967 : }
1968 :
1969 : #ifdef DEBUGGING
1970 : if ( st->idchan == 0 && st->coder_type != INACTIVE )
1971 : {
1972 : if ( st->force == FORCE_GSC && element_brate < IVAS_24k4 )
1973 : {
1974 : /* enforce GSC */
1975 : st->sp_aud_decision1 = 1;
1976 : st->sp_aud_decision2 = 0;
1977 : }
1978 : else if ( st->force == FORCE_SPEECH && ( st->sp_aud_decision1 == 1 || st->sp_aud_decision2 == 1 ) )
1979 : {
1980 : if ( element_brate < IVAS_24k4 )
1981 : {
1982 : /* convert TCX to GSC */
1983 : st->sp_aud_decision1 = 1;
1984 : st->sp_aud_decision2 = 0;
1985 : }
1986 : else
1987 : {
1988 : /* convert TCX to ACELP */
1989 : st->sp_aud_decision1 = 0;
1990 : st->sp_aud_decision2 = 0;
1991 : }
1992 : }
1993 : else if ( st->force == FORCE_MUSIC )
1994 : {
1995 : /* enforce TCX */
1996 : st->sp_aud_decision1 = 1;
1997 : st->sp_aud_decision2 = 1;
1998 : }
1999 : }
2000 : #endif
2001 :
2002 : /* set GSC noisy speech flag on unvoiced SWB segments */
2003 414383 : st->GSC_noisy_speech = 0;
2004 414383 : if ( st->vad_flag == 1 && element_brate <= IVAS_16k4 && st->lp_noise > 30.0f && st->sp_aud_decision1 == 0 && st->bwidth >= SWB && st->coder_type_raw == UNVOICED )
2005 : {
2006 1230 : st->GSC_noisy_speech = 1;
2007 : }
2008 :
2009 : /* set GSC submode */
2010 414383 : if ( st->element_mode > EVS_MONO && ( st->sp_aud_decision1 == 1 && st->sp_aud_decision2 == 0 ) && st->total_brate > STEREO_GSC_BIT_RATE_ALLOC ) /* below STEREO_GSC_BIT_RATE_ALLOC, fall back on normal GSC */
2011 : {
2012 5237 : st->GSC_IVAS_mode = 1;
2013 5237 : if ( st->hSpMusClas->wdlp_0_95_sp > 0.0f )
2014 : {
2015 : /* music-like content */
2016 3121 : st->GSC_IVAS_mode = 3;
2017 : }
2018 2116 : else if ( st->tc_cnt > 0 )
2019 : {
2020 : /* likely presence of an onset, GSC bit allocation will be more focused on LF */
2021 265 : st->GSC_IVAS_mode = 2;
2022 : }
2023 :
2024 5237 : if ( st->coder_type_raw == UNVOICED && st->sp_aud_decision0 == 0 /*&& st->GSC_IVAS_mode < 3*/ )
2025 : {
2026 113 : st->GSC_noisy_speech = 1;
2027 : }
2028 : else
2029 : {
2030 5124 : st->GSC_noisy_speech = 0;
2031 : }
2032 : }
2033 :
2034 : /* set coder_type to AUDIO when GSC is selected (st->core will be set later in the decision matrix) */
2035 414383 : if ( ( st->sp_aud_decision1 == 1 && st->sp_aud_decision2 == 0 ) || st->GSC_noisy_speech )
2036 : {
2037 7391 : st->coder_type = AUDIO;
2038 7391 : if ( st->hGSCEnc != NULL && st->GSC_noisy_speech == 0 ) /* In case of GSC_noisy_speech, NOISE_LEVEL should remain at NOISE_LEVEL_SP3 */
2039 : {
2040 6048 : st->hGSCEnc->noise_lev = NOISE_LEVEL_SP0;
2041 : }
2042 : }
2043 :
2044 414383 : return;
2045 : }
2046 :
2047 :
2048 : /*------------------------------------------------------------------------*
2049 : * music_mixed_classif_improv()
2050 : *
2051 : * Improve 1st stage speech/music decision for mixed&music signals
2052 : *------------------------------------------------------------------------*/
2053 :
2054 2050 : static void music_mixed_classif_improv(
2055 : Encoder_State *st, /* i/o: Encoder state structure */
2056 : const float *new_inp, /* i : new input signal */
2057 : const float *epsP, /* i : LP prediction error */
2058 : const float etot, /* i : total frame energy */
2059 : const float old_cor, /* i : normalized correlation */
2060 : const float cor_map_sum /* i : correlation map sum */
2061 : )
2062 : {
2063 : int16_t i, dec, len, percus_flag;
2064 : float p2v_map[128], ftmp, ftmp1, lt_diff, log_max_spl, epsP_tilt, max_spl;
2065 :
2066 2050 : SP_MUS_CLAS_HANDLE hSpMusClas = st->hSpMusClas;
2067 :
2068 : /* find sample with maximum absolute amplitude */
2069 2050 : max_spl = 0;
2070 526850 : for ( i = 0; i < L_FRAME; i++ )
2071 : {
2072 524800 : if ( fabs( new_inp[i] ) > max_spl )
2073 : {
2074 18405 : max_spl = fabsf( new_inp[i] );
2075 : }
2076 : }
2077 :
2078 : /* music is considered only appearing in high SNR condition and active signal */
2079 2050 : if ( st->vad_flag == 0 || st->lp_speech - st->lp_noise < 25 )
2080 : {
2081 8 : hSpMusClas->dec_mov = 0.5f;
2082 8 : hSpMusClas->dec_mov1 = 0.5f;
2083 :
2084 8 : if ( st->vad_flag == 0 )
2085 : {
2086 8 : hSpMusClas->onset_cnt = 0;
2087 : }
2088 :
2089 8 : return;
2090 : }
2091 :
2092 2042 : hSpMusClas->onset_cnt++;
2093 :
2094 2042 : if ( hSpMusClas->onset_cnt > 9 )
2095 : {
2096 2006 : hSpMusClas->onset_cnt = 9;
2097 : }
2098 :
2099 2042 : if ( hSpMusClas->onset_cnt == 1 )
2100 : {
2101 4 : set_f( hSpMusClas->buf_flux, -100, BUF_LEN );
2102 : }
2103 :
2104 : /* spectral analysis */
2105 2042 : spec_analysis( st->Bin_E, p2v_map );
2106 :
2107 : /* percussive music detection */
2108 2042 : log_max_spl = 20 * logf( max_spl + 0.0001f );
2109 2042 : lt_diff = log_max_spl - hSpMusClas->mov_log_max_spl;
2110 :
2111 8168 : for ( i = 0; i < 3; i++ )
2112 : {
2113 6126 : hSpMusClas->buf_etot[i] = hSpMusClas->buf_etot[i + 1];
2114 : }
2115 2042 : hSpMusClas->buf_etot[i] = etot;
2116 :
2117 2042 : percus_flag = 0;
2118 2042 : if ( hSpMusClas->buf_etot[1] - hSpMusClas->buf_etot[0] > 6 && hSpMusClas->buf_etot[2] < hSpMusClas->buf_etot[1] && hSpMusClas->buf_etot[1] - st->lp_speech > 3 )
2119 : {
2120 15 : if ( hSpMusClas->buf_etot[1] - hSpMusClas->buf_etot[3] > 3 && hSpMusClas->buf_etot[3] < hSpMusClas->buf_etot[2] && 0.5f * ( 0.5f * ( st->voicing[0] + st->voicing[1] ) + old_cor ) < 0.75f )
2121 : {
2122 4 : if ( hSpMusClas->dec_mov > 0.8f )
2123 : {
2124 0 : percus_flag = 1;
2125 : }
2126 4 : else if ( old_cor < 0.75f && st->voicing[0] < 0.75f && st->voicing[1] < 0.75f && hSpMusClas->old_lt_diff[0] > 10 )
2127 : {
2128 0 : percus_flag = 1;
2129 : }
2130 : }
2131 : }
2132 :
2133 : /* sound attack detection */
2134 2042 : if ( hSpMusClas->buf_etot[3] - hSpMusClas->buf_etot[2] > 6 && hSpMusClas->dec_mov > 0.9f && etot - st->lp_speech > 5 && hSpMusClas->old_lt_diff[0] > 5 )
2135 : {
2136 0 : hSpMusClas->attack_hangover = 3;
2137 : }
2138 :
2139 2042 : if ( st->voicing[0] > 0.9f && st->voicing[1] > 0.9f )
2140 : {
2141 561 : if ( log_max_spl > hSpMusClas->mov_log_max_spl )
2142 : {
2143 8 : hSpMusClas->mov_log_max_spl = 0.75f * hSpMusClas->mov_log_max_spl + ( 1 - 0.75f ) * log_max_spl;
2144 : }
2145 : else
2146 : {
2147 553 : hSpMusClas->mov_log_max_spl = 0.995f * hSpMusClas->mov_log_max_spl + ( 1 - 0.995f ) * log_max_spl;
2148 : }
2149 : }
2150 :
2151 2042 : hSpMusClas->old_lt_diff[0] = hSpMusClas->old_lt_diff[1];
2152 2042 : hSpMusClas->old_lt_diff[1] = lt_diff;
2153 :
2154 : /* calculate and buffer spectral energy fluctuation */
2155 2042 : flux( st->Bin_E, p2v_map, hSpMusClas->old_Bin_E, hSpMusClas->buf_flux, hSpMusClas->attack_hangover, hSpMusClas->dec_mov );
2156 :
2157 2042 : hSpMusClas->attack_hangover--;
2158 2042 : if ( hSpMusClas->attack_hangover < 0 )
2159 : {
2160 2042 : hSpMusClas->attack_hangover = 0;
2161 : }
2162 :
2163 : /* identify flux buffer status */
2164 2042 : len = 0;
2165 117653 : for ( i = BUF_LEN - 1; i >= 0 && hSpMusClas->buf_flux[i] >= 0; i-- )
2166 : {
2167 115611 : len++;
2168 : }
2169 :
2170 : /* reset flux buffer if percussive music is detected */
2171 2042 : if ( percus_flag == 1 )
2172 : {
2173 0 : set_f( &hSpMusClas->buf_flux[BUF_LEN - len], 5, len );
2174 : }
2175 :
2176 : /* calculate and buffer the tilt of residual LP analysis energies */
2177 2042 : ftmp = 0.00001f;
2178 2042 : ftmp1 = 0;
2179 32672 : for ( i = 1; i < 16; i++ )
2180 : {
2181 30630 : ftmp += epsP[i] * epsP[i];
2182 30630 : ftmp1 += epsP[i] * epsP[i + 1];
2183 : }
2184 :
2185 2042 : epsP_tilt = ftmp1 / ftmp;
2186 :
2187 122520 : for ( i = 0; i < BUF_LEN - 1; i++ )
2188 : {
2189 120478 : hSpMusClas->buf_epsP_tilt[i] = hSpMusClas->buf_epsP_tilt[i + 1];
2190 : }
2191 2042 : hSpMusClas->buf_epsP_tilt[i] = epsP_tilt;
2192 :
2193 : /* calculate and buffer highband spectral peakness */
2194 2042 : tonal_dist( p2v_map, hSpMusClas->buf_pkh, hSpMusClas->buf_Ntonal, hSpMusClas->buf_Ntonal2, hSpMusClas->buf_Ntonal_lf );
2195 :
2196 : /* buffer sum of correlation map */
2197 122520 : for ( i = 0; i < BUF_LEN - 1; i++ )
2198 : {
2199 120478 : hSpMusClas->buf_cor_map_sum[i] = hSpMusClas->buf_cor_map_sum[i + 1];
2200 : }
2201 2042 : hSpMusClas->buf_cor_map_sum[i] = cor_map_sum;
2202 :
2203 : /* buffer voicing metric */
2204 20420 : for ( i = 0; i < 9; i++ )
2205 : {
2206 18378 : hSpMusClas->buf_dlp[i] = hSpMusClas->buf_dlp[i + 1];
2207 : }
2208 2042 : hSpMusClas->buf_dlp[i] = hSpMusClas->lps - hSpMusClas->lpm;
2209 :
2210 : /* classification */
2211 2042 : dec = mode_decision( st, len, &hSpMusClas->dec_mov, hSpMusClas->buf_flux, hSpMusClas->buf_epsP_tilt, hSpMusClas->buf_pkh, hSpMusClas->buf_cor_map_sum, hSpMusClas->buf_Ntonal, hSpMusClas->buf_Ntonal2, hSpMusClas->buf_Ntonal_lf, hSpMusClas->buf_dlp );
2212 :
2213 : /* update long term moving average of the classification decisions */
2214 2042 : if ( len > 30 )
2215 : {
2216 1922 : hSpMusClas->dec_mov = 0.97f * hSpMusClas->dec_mov + ( 1 - 0.97f ) * dec;
2217 1922 : hSpMusClas->dec_mov1 = 0.97f * hSpMusClas->dec_mov1 + ( 1 - 0.97f ) * dec;
2218 : }
2219 :
2220 : /* update long-term unvoiced counter */
2221 2042 : if ( ( st->coder_type_raw == UNVOICED || st->coder_type_raw == INACTIVE ) && etot > 1.5f && hSpMusClas->buf_Ntonal2[59] < 2 )
2222 : {
2223 76 : hSpMusClas->UV_cnt1 -= 8;
2224 : }
2225 : else
2226 : {
2227 1966 : hSpMusClas->UV_cnt1++;
2228 : }
2229 :
2230 2042 : if ( hSpMusClas->UV_cnt1 > 300 )
2231 : {
2232 1358 : hSpMusClas->UV_cnt1 = 300;
2233 : }
2234 684 : else if ( hSpMusClas->UV_cnt1 < 0 )
2235 : {
2236 0 : hSpMusClas->UV_cnt1 = 0;
2237 : }
2238 :
2239 2042 : hSpMusClas->LT_UV_cnt1 = 0.9f * hSpMusClas->LT_UV_cnt1 + 0.1f * hSpMusClas->UV_cnt1;
2240 :
2241 : /* revert classification decision due to long-term unvoiced counter */
2242 2042 : if ( dec == 1 && hSpMusClas->dec_mov1 < 0.2f && hSpMusClas->LT_UV_cnt1 < 200 )
2243 : {
2244 0 : dec = 0;
2245 : }
2246 :
2247 : /* overwrite 1st stage speech/music decision to music */
2248 2042 : if ( dec == 1 )
2249 : {
2250 387 : st->sp_aud_decision1 = 1;
2251 : }
2252 :
2253 2042 : return;
2254 : }
2255 :
2256 :
2257 : /*---------------------------------------------------------------------*
2258 : * spec_analysis()
2259 : *
2260 : * Spectral analysis for mixed/music classification improvement
2261 : *---------------------------------------------------------------------*/
2262 :
2263 2042 : static void spec_analysis(
2264 : float *Bin_E, /* i : log energy spectrum of the current frame */
2265 : float *p2v_map /* o : spectral peakiness map */
2266 : )
2267 : {
2268 : int16_t i, k, m;
2269 : float peak[L_FFT / 4 + 1];
2270 : float valley[L_FFT / 4 + 1];
2271 : int16_t peak_idx[L_FFT / 4 + 1];
2272 : int16_t valey_idx[L_FFT / 4 + 1];
2273 : float p2v[L_FFT / 4 + 1];
2274 :
2275 : /* find spectral peaks */
2276 2042 : k = 0;
2277 257292 : for ( i = 1; i < L_FFT / 2 - 2; i++ )
2278 : {
2279 255250 : if ( Bin_E[i] > Bin_E[i - 1] && Bin_E[i] > Bin_E[i + 1] )
2280 : {
2281 68279 : peak[k] = Bin_E[i];
2282 68279 : peak_idx[k] = i;
2283 68279 : k++;
2284 : }
2285 : }
2286 2042 : assert( k + 1 < L_FFT / 4 + 1 );
2287 2042 : peak_idx[k] = -1;
2288 2042 : peak_idx[k + 1] = -1;
2289 :
2290 2042 : if ( k == 0 )
2291 : {
2292 0 : for ( i = 0; i < L_FFT / 2 - 1; i++ )
2293 : {
2294 0 : p2v_map[i] = 0;
2295 : }
2296 :
2297 0 : return;
2298 : }
2299 :
2300 : /* find spectral valleys */
2301 2042 : m = 0;
2302 2042 : if ( Bin_E[0] < Bin_E[1] )
2303 : {
2304 1111 : valley[0] = Bin_E[0];
2305 1111 : valey_idx[0] = 0;
2306 1111 : m++;
2307 : }
2308 :
2309 2042 : k = L_FFT / 2 - 2;
2310 3590 : for ( i = L_FFT / 2 - 3; i >= 0 && Bin_E[i + 1] > Bin_E[i]; i-- )
2311 : {
2312 1548 : k = i;
2313 : }
2314 :
2315 255744 : for ( i = 1; i < k; i++ )
2316 : {
2317 253702 : if ( Bin_E[i] < Bin_E[i - 1] && Bin_E[i] < Bin_E[i + 1] )
2318 : {
2319 67168 : valley[m] = Bin_E[i];
2320 67168 : valey_idx[m] = i;
2321 67168 : m++;
2322 : }
2323 : }
2324 :
2325 2042 : valley[m] = Bin_E[k];
2326 2042 : valey_idx[m] = k;
2327 :
2328 : /* find spectral peak to valley distances */
2329 2042 : k = 0;
2330 70321 : for ( i = 0; i < m; i++ )
2331 : {
2332 68279 : if ( peak_idx[k] > valey_idx[i] && peak_idx[k] < valey_idx[i + 1] )
2333 : {
2334 68279 : p2v[k] = 2 * peak[k] - valley[i] - valley[i + 1];
2335 68279 : k++;
2336 : }
2337 : }
2338 :
2339 261376 : for ( i = 0; i < L_FFT / 2 - 1; i++ )
2340 : {
2341 259334 : p2v_map[i] = 0;
2342 : }
2343 :
2344 70321 : for ( i = 0; i < k; i++ )
2345 : {
2346 68279 : p2v_map[peak_idx[i]] = p2v[i];
2347 : }
2348 :
2349 2042 : return;
2350 : }
2351 :
2352 : /*---------------------------------------------------------------------*
2353 : * flux()
2354 : *
2355 : * Calculation of spectral flux
2356 : *---------------------------------------------------------------------*/
2357 :
2358 2042 : static void flux(
2359 : float *Bin_E, /* i : log energy spectrum of the current frame */
2360 : float *p2v_map, /* i : spectral peakiness map */
2361 : float *old_Bin_E, /* i/o: log energy spectrum of the frame 60ms ago */
2362 : float *buf_flux, /* i/o: buffer storing spectral energy fluctuation */
2363 : int16_t attack_hangover, /* i/o: hangover preventing flux buffering */
2364 : float dec_mov /* i/o: moving average of classifier decision */
2365 : )
2366 : {
2367 : int16_t i;
2368 : float *pt1, *pt2, *pt3, *pt4, *pt5, *pt6;
2369 : float flux;
2370 : int16_t cnt;
2371 :
2372 : /* calculate flux */
2373 2042 : flux = 0;
2374 2042 : cnt = 0;
2375 87806 : for ( i = 0; i < N_OLD_BIN_E; i++ )
2376 : {
2377 85764 : if ( p2v_map[i] != 0 )
2378 : {
2379 21213 : flux += fabsf( Bin_E[i] - old_Bin_E[i] );
2380 21213 : cnt++;
2381 : }
2382 : }
2383 :
2384 2042 : if ( cnt == 0 )
2385 : {
2386 0 : flux = 5;
2387 : }
2388 : else
2389 : {
2390 2042 : flux = flux / (float) cnt;
2391 : }
2392 :
2393 2042 : if ( flux > 20 && dec_mov > 0.8f )
2394 : {
2395 48 : flux = 20;
2396 : }
2397 :
2398 : /* update old Bin_E buffer */
2399 2042 : pt1 = old_Bin_E;
2400 2042 : pt2 = old_Bin_E + N_OLD_BIN_E;
2401 2042 : pt3 = Bin_E;
2402 2042 : pt4 = old_Bin_E + N_OLD_BIN_E;
2403 2042 : pt5 = old_Bin_E + 2 * N_OLD_BIN_E;
2404 2042 : pt6 = old_Bin_E + 2 * N_OLD_BIN_E;
2405 :
2406 87806 : for ( i = 0; i < N_OLD_BIN_E; i++ )
2407 : {
2408 85764 : *pt1++ = *pt2++;
2409 85764 : *pt4++ = *pt5++;
2410 85764 : *pt6++ = *pt3++;
2411 : }
2412 :
2413 : /* update flux buffer */
2414 2042 : if ( attack_hangover <= 0 )
2415 : {
2416 122520 : for ( i = 0; i < BUF_LEN - 1; i++ )
2417 : {
2418 120478 : buf_flux[i] = buf_flux[i + 1];
2419 : }
2420 :
2421 2042 : buf_flux[i] = flux;
2422 : }
2423 :
2424 2042 : return;
2425 : }
2426 :
2427 :
2428 : /*---------------------------------------------------------------------*
2429 : * tonal_dist()
2430 : *
2431 : * Calculation of spectral distance
2432 : *---------------------------------------------------------------------*/
2433 :
2434 2042 : static void tonal_dist(
2435 : float *p2v_map, /* i : spectral peakiness map */
2436 : float *buf_pkh, /* i/o: buffer storing highband spectral peakiness */
2437 : float *buf_Ntonal, /* i/o: buffer storing No.of 1st spectral tone */
2438 : float *buf_Ntonal2, /* i/o: buffer storing No.of 2nd spectral tone */
2439 : float *buf_Ntonal_lf /* i/o: buffer storing low band spectral tone ratio */
2440 : )
2441 : {
2442 : int16_t i;
2443 : float pk;
2444 : int16_t Ntonal;
2445 : int16_t Ntonal2;
2446 : int16_t Ntonal_lf;
2447 :
2448 : /* find number of tonals, number of tonals at low-band,
2449 : spectral peakiness at high-band */
2450 2042 : pk = 0;
2451 2042 : Ntonal = 0;
2452 2042 : Ntonal2 = 0;
2453 2042 : Ntonal_lf = 0;
2454 132730 : for ( i = 0; i < 64; i++ )
2455 : {
2456 130688 : if ( p2v_map[i] > 55 )
2457 : {
2458 9999 : Ntonal++;
2459 : }
2460 :
2461 130688 : if ( p2v_map[i] > 80 )
2462 : {
2463 5854 : Ntonal2++;
2464 5854 : Ntonal_lf++;
2465 : }
2466 : }
2467 :
2468 130688 : for ( i = 64; i < 127; i++ )
2469 : {
2470 128646 : if ( p2v_map[i] != 0 )
2471 : {
2472 35136 : pk += p2v_map[i];
2473 : }
2474 :
2475 128646 : if ( p2v_map[i] > 55 )
2476 : {
2477 4382 : Ntonal++;
2478 : }
2479 :
2480 128646 : if ( p2v_map[i] > 80 )
2481 : {
2482 1596 : Ntonal2++;
2483 : }
2484 : }
2485 :
2486 : /* update buffers */
2487 122520 : for ( i = 0; i < BUF_LEN - 1; i++ )
2488 : {
2489 120478 : buf_pkh[i] = buf_pkh[i + 1];
2490 120478 : buf_Ntonal[i] = buf_Ntonal[i + 1];
2491 120478 : buf_Ntonal2[i] = buf_Ntonal2[i + 1];
2492 120478 : buf_Ntonal_lf[i] = buf_Ntonal_lf[i + 1];
2493 : }
2494 :
2495 2042 : buf_pkh[i] = pk;
2496 2042 : buf_Ntonal[i] = (float) Ntonal;
2497 2042 : buf_Ntonal2[i] = (float) Ntonal2;
2498 2042 : buf_Ntonal_lf[i] = (float) Ntonal_lf;
2499 :
2500 2042 : return;
2501 : }
2502 :
2503 :
2504 : /*---------------------------------------------------------------------*
2505 : * mode_decision()
2506 : *
2507 : * Decision about internal mode of the mixed/music classifier improvement
2508 : *---------------------------------------------------------------------*/
2509 :
2510 2042 : static int16_t mode_decision(
2511 : Encoder_State *st,
2512 : int16_t len, /* i : buffering status */
2513 : float *dec_mov, /* i/o: moving average of classifier decision */
2514 : float *buf_flux, /* i : buffer storing spectral energy fluctuation */
2515 : float *buf_epsP_tilt, /* i : buffer storing LP prediciton error tilt */
2516 : float *buf_pkh, /* i : buffer storing highband spectral peakiness */
2517 : float *buf_cor_map_sum, /* i : buffer storing correlation map sum */
2518 : float *buf_Ntonal, /* i : buffer storing No.of 1st spectral tone */
2519 : float *buf_Ntonal2, /* i : buffer storing No.of 2nd spectral tone */
2520 : float *buf_Ntonal_lf, /* i : buffer storing low band spectral tone ratio */
2521 : float *buf_dlp /* i : buffer storing voicing estimate */
2522 : )
2523 : {
2524 : int16_t mode;
2525 : int16_t i;
2526 : int16_t voiced_cnt;
2527 : float M_pkh;
2528 : float M_cor_map_sum;
2529 : float M_Ntonal;
2530 : float M_flux;
2531 : float V_epsP_tilt;
2532 : float lf_Ntonal_ratio;
2533 :
2534 2042 : mode = *dec_mov > 0.5f;
2535 :
2536 2042 : if ( len <= 5 )
2537 : {
2538 20 : return ( mode );
2539 : }
2540 2022 : else if ( len < 10 )
2541 : {
2542 16 : M_pkh = mean( buf_pkh + BUF_LEN - len, len );
2543 16 : M_cor_map_sum = mean( buf_cor_map_sum + BUF_LEN - len, len );
2544 16 : M_Ntonal = mean( buf_Ntonal + BUF_LEN - len, len );
2545 16 : V_epsP_tilt = var( buf_epsP_tilt + BUF_LEN - len, len );
2546 :
2547 16 : voiced_cnt = 0;
2548 112 : for ( i = 9; i > 3; i-- )
2549 : {
2550 96 : if ( buf_dlp[i] > 0.0f )
2551 : {
2552 4 : voiced_cnt++;
2553 : }
2554 : }
2555 :
2556 16 : if ( ( M_pkh > 1100 || V_epsP_tilt < 0.00008f || M_cor_map_sum > 100 ) && voiced_cnt < 4 )
2557 : {
2558 1 : mode = 1;
2559 : }
2560 15 : else if ( M_Ntonal > 27 && voiced_cnt < 4 )
2561 : {
2562 0 : mode = 1;
2563 : }
2564 : }
2565 : else
2566 : {
2567 2006 : voiced_cnt = 0;
2568 22066 : for ( i = 0; i < 10; i++ )
2569 : {
2570 20060 : if ( buf_dlp[i] > 0.0f )
2571 : {
2572 10018 : voiced_cnt++;
2573 : }
2574 : }
2575 :
2576 2006 : M_flux = mean( &buf_flux[BUF_LEN - 10], 10 );
2577 2006 : M_pkh = mean( buf_pkh + BUF_LEN - 10, 10 );
2578 2006 : M_cor_map_sum = mean( buf_cor_map_sum + BUF_LEN - 10, 10 );
2579 2006 : V_epsP_tilt = var( buf_epsP_tilt + BUF_LEN - 10, 10 );
2580 :
2581 2006 : if ( ( M_flux < 8.5f || ( V_epsP_tilt < 0.001f && M_flux < 12.0f ) || M_pkh > 1050 || M_cor_map_sum > 100 ) && voiced_cnt < 3 && mean( &buf_flux[55], 5 ) < 15 )
2582 : {
2583 240 : mode = 1;
2584 240 : *dec_mov = 1;
2585 240 : return ( mode );
2586 : }
2587 :
2588 1766 : if ( M_flux > 16.0f || ( M_flux > 15 && voiced_cnt > 2 ) || mean( &buf_flux[55], 5 ) > 19.0f || ( buf_flux[59] >= 20 && st->hSpMusClas->lps - st->hSpMusClas->lpm > 0 ) )
2589 : {
2590 1545 : *dec_mov = 0;
2591 1545 : mode = 0;
2592 1545 : return ( mode );
2593 : }
2594 :
2595 4580 : for ( i = 10; i < len; i++ )
2596 : {
2597 4492 : M_flux = mean( &buf_flux[BUF_LEN - i], i );
2598 4492 : M_pkh = mean( buf_pkh + BUF_LEN - i, i );
2599 4492 : M_cor_map_sum = mean( buf_cor_map_sum + BUF_LEN - i, i );
2600 4492 : V_epsP_tilt = var( buf_epsP_tilt + BUF_LEN - i, i );
2601 :
2602 4492 : if ( ( ( M_flux < 12 + 0.05f * ( len - 10 ) && mean( &buf_flux[BUF_LEN - 10], 10 ) < 15 ) || V_epsP_tilt < 0.0001f + 0.000018f * ( len - 10 ) || M_pkh > 1050 - 5.0f * ( len - 10 ) || M_cor_map_sum > 95 - 0.3f * ( len - 10 ) ) && voiced_cnt < 3 )
2603 : {
2604 133 : mode = 1;
2605 133 : return ( mode );
2606 : }
2607 : }
2608 :
2609 88 : if ( len == BUF_LEN )
2610 : {
2611 87 : M_Ntonal = mean( buf_Ntonal, BUF_LEN );
2612 87 : lf_Ntonal_ratio = sum_f( buf_Ntonal_lf, BUF_LEN ) / ( sum_f( buf_Ntonal2, BUF_LEN ) + 0.0001f );
2613 :
2614 87 : if ( M_Ntonal > 18 || lf_Ntonal_ratio < 0.2f )
2615 : {
2616 0 : mode = 1;
2617 : }
2618 87 : else if ( M_Ntonal < 1 )
2619 : {
2620 0 : mode = 0;
2621 : }
2622 : }
2623 : }
2624 :
2625 104 : return ( mode );
2626 : }
2627 :
2628 :
2629 : /*----------------------------------------------------------------------------------*
2630 : * tonal_context_improv()
2631 : *
2632 : * Context-based improvement of 1st/2nd stage speech/music decision on stable tonal signals
2633 : *----------------------------------------------------------------------------------*/
2634 :
2635 2050 : static void tonal_context_improv(
2636 : Encoder_State *st, /* i/o: encoder state structure */
2637 : const float PS[], /* i : energy spectrum */
2638 : const float voi_fv, /* i : scaled voicing feature */
2639 : const float cor_map_sum_fv, /* i : scaled correlation map feature */
2640 : const float LPCErr /* i : scaled LP prediction error feature */
2641 : )
2642 : {
2643 : int16_t lt_pitch_diff;
2644 : float sort_max, sort_avg, sort_val[80];
2645 : float tonality, tonality1, tonality2, tonality3, t2, t3, tL, err, cor, dft;
2646 :
2647 2050 : SP_MUS_CLAS_HANDLE hSpMusClas = st->hSpMusClas;
2648 :
2649 : /* reset in case of codec mode switching */
2650 2050 : if ( st->last_codec_mode == MODE2 )
2651 : {
2652 275 : set_f( hSpMusClas->tonality2_buf, 0, HANG_LEN_INIT );
2653 275 : set_f( hSpMusClas->tonality3_buf, 0, HANG_LEN_INIT );
2654 275 : set_f( hSpMusClas->LPCErr_buf, 0, HANG_LEN_INIT );
2655 275 : hSpMusClas->lt_music_hangover = 0;
2656 275 : hSpMusClas->lt_music_state = 0;
2657 275 : hSpMusClas->lt_speech_state = 0;
2658 275 : hSpMusClas->lt_speech_hangover = 0;
2659 : }
2660 :
2661 : /* estimate maximum tonality in bands [0-1 kHz], [1-2kHz] and [2-4kHz] */
2662 2050 : mvr2r( PS, sort_val, 80 );
2663 :
2664 : /* tonality in band 0-1 kHz */
2665 2050 : v_sort( sort_val, 0, 19 );
2666 2050 : sort_max = sort_val[19];
2667 2050 : sort_avg = sum_f( &sort_val[0], 10 );
2668 2050 : tonality1 = sort_max / sort_avg;
2669 :
2670 : /* tonality in band 1-2 kHz */
2671 2050 : v_sort( sort_val, 20, 39 );
2672 2050 : sort_max = sort_val[39];
2673 2050 : sort_avg = sum_f( &sort_val[20], 10 );
2674 2050 : tonality2 = sort_max / sort_avg;
2675 :
2676 : /* tonality in band 2-4 kHz */
2677 2050 : v_sort( sort_val, 40, 79 );
2678 2050 : sort_max = sort_val[79];
2679 2050 : sort_avg = sum_f( &sort_val[40], 20 );
2680 2050 : tonality3 = sort_max / sort_avg;
2681 :
2682 2050 : tonality = max( max( tonality1, tonality2 ), tonality3 );
2683 :
2684 2050 : if ( st->hVAD->hangover_cnt == 10 && st->vad_flag == 1 )
2685 : {
2686 : /* long-term voicing parameter */
2687 10 : hSpMusClas->lt_voicing = 0.1f * hSpMusClas->lt_voicing + 0.9f * *st->voicing;
2688 :
2689 : /* long-term correlation value */
2690 10 : hSpMusClas->lt_corr = 0.1f * hSpMusClas->lt_corr + 0.9f * st->old_corr;
2691 :
2692 : /* long-term tonality measure */
2693 10 : hSpMusClas->lt_tonality = 0.1f * hSpMusClas->lt_tonality + 0.9f * tonality;
2694 : }
2695 : else
2696 : {
2697 : /* long-term voicing parameter */
2698 2040 : hSpMusClas->lt_voicing = 0.7f * hSpMusClas->lt_voicing + 0.3f * *st->voicing;
2699 :
2700 : /* long-term correlation value */
2701 2040 : hSpMusClas->lt_corr = 0.7f * hSpMusClas->lt_corr + 0.3f * st->old_corr;
2702 :
2703 : /* long-term tonality measure */
2704 2040 : hSpMusClas->lt_tonality = 0.5f * hSpMusClas->lt_tonality + 0.5f * tonality;
2705 : }
2706 :
2707 : /* pitch difference w.r.t to past 3 frames */
2708 2050 : lt_pitch_diff = (int16_t) abs( hSpMusClas->lt_corr_pitch[0] - st->pitch[0] );
2709 2050 : lt_pitch_diff += (int16_t) abs( hSpMusClas->lt_corr_pitch[1] - st->pitch[0] );
2710 2050 : lt_pitch_diff += (int16_t) abs( hSpMusClas->lt_corr_pitch[2] - st->pitch[0] );
2711 :
2712 2050 : hSpMusClas->lt_corr_pitch[0] = hSpMusClas->lt_corr_pitch[1];
2713 2050 : hSpMusClas->lt_corr_pitch[1] = hSpMusClas->lt_corr_pitch[2];
2714 2050 : hSpMusClas->lt_corr_pitch[2] = st->pitch[0];
2715 :
2716 2050 : hSpMusClas->lt_old_mode[0] = hSpMusClas->lt_old_mode[1];
2717 2050 : hSpMusClas->lt_old_mode[1] = hSpMusClas->lt_old_mode[2];
2718 :
2719 2727 : if ( st->sp_aud_decision1 == 1 &&
2720 1224 : ( min( min( tonality1, tonality2 ), tonality3 ) > 50.0f ) &&
2721 51 : ( tonality1 + tonality2 > 200.0f && tonality2 + tonality3 > 200.0f && tonality1 + tonality3 > 200.0f ) &&
2722 33 : ( hSpMusClas->lt_tonality < 20000.0f ) &&
2723 33 : ( ( hSpMusClas->lt_tonality > 1000 && max( hSpMusClas->lt_voicing, *st->voicing ) > 0.99f ) ||
2724 33 : ( hSpMusClas->lt_tonality > 1500 && hSpMusClas->lt_corr > 0.99f ) ||
2725 33 : ( hSpMusClas->lt_tonality > 3000 && hSpMusClas->lowrate_pitchGain > 0.96f ) ||
2726 19 : ( lt_pitch_diff == 0 && hSpMusClas->lowrate_pitchGain > 0.89f ) ) )
2727 : {
2728 0 : if ( sum_s( hSpMusClas->lt_old_mode, 2 ) < 2 )
2729 : {
2730 : /* probably speech - change the decision to speech */
2731 0 : st->sp_aud_decision1 = 0;
2732 0 : st->sp_aud_decision2 = 0;
2733 :
2734 0 : if ( hSpMusClas->lt_hangover == 0 )
2735 : {
2736 0 : hSpMusClas->lt_hangover = 6;
2737 : }
2738 : }
2739 : }
2740 : else
2741 : {
2742 : /* not speech, but still in the hangover period - change the decision to speech */
2743 2050 : if ( hSpMusClas->lt_hangover > 0 )
2744 : {
2745 0 : st->sp_aud_decision1 = 0;
2746 0 : st->sp_aud_decision2 = 0;
2747 0 : hSpMusClas->lt_hangover--;
2748 : }
2749 : }
2750 :
2751 : /* calculate standard deviation of log-tonality */
2752 2050 : mvr2r( hSpMusClas->tonality2_buf + 1, hSpMusClas->tonality2_buf, HANG_LEN_INIT - 1 );
2753 2050 : hSpMusClas->tonality2_buf[HANG_LEN_INIT - 1] = 0.2f * log10f( tonality2 );
2754 2050 : t2 = std_dev( hSpMusClas->tonality2_buf, HANG_LEN_INIT );
2755 :
2756 2050 : mvr2r( hSpMusClas->tonality3_buf + 1, hSpMusClas->tonality3_buf, HANG_LEN_INIT - 1 );
2757 2050 : hSpMusClas->tonality3_buf[HANG_LEN_INIT - 1] = 0.2f * log10f( tonality3 );
2758 2050 : t3 = std_dev( hSpMusClas->tonality3_buf, HANG_LEN_INIT );
2759 :
2760 2050 : tL = 0.2f * log10f( hSpMusClas->lt_tonality );
2761 :
2762 : /* calculate standard deviation of residual LP energy */
2763 2050 : mvr2r( hSpMusClas->LPCErr_buf + 1, hSpMusClas->LPCErr_buf, HANG_LEN_INIT - 1 );
2764 2050 : hSpMusClas->LPCErr_buf[HANG_LEN_INIT - 1] = LPCErr;
2765 2050 : err = std_dev( hSpMusClas->LPCErr_buf, HANG_LEN_INIT );
2766 :
2767 2050 : cor = max( voi_fv - cor_map_sum_fv, 0.0f );
2768 2050 : dft = 0.2f * fabsf( log10f( tonality2 ) - log10f( tonality3 ) );
2769 :
2770 : /* state machine for strong music */
2771 2050 : if ( st->sp_aud_decision1 == 1 && hSpMusClas->lt_music_state == 0 && hSpMusClas->lt_music_hangover == 0 &&
2772 540 : t2 < 0.54f && t2 > 0.26f && t3 > 0.22f && tL < 0.54f && tL > 0.26f && err > 0.5f )
2773 : {
2774 7 : hSpMusClas->lt_music_state = 1;
2775 7 : hSpMusClas->lt_music_hangover = 6;
2776 : }
2777 2043 : else if ( hSpMusClas->lt_music_state == 1 && hSpMusClas->lt_music_hangover == 0 && t2 < 0.34 && t3 < 0.26f && tL < 0.45f )
2778 : {
2779 6 : hSpMusClas->lt_music_state = 0;
2780 6 : hSpMusClas->lt_music_hangover = 6;
2781 : }
2782 :
2783 2050 : if ( hSpMusClas->lt_music_hangover > 0 )
2784 : {
2785 74 : hSpMusClas->lt_music_hangover--;
2786 : }
2787 :
2788 : /* state machine for strong speech */
2789 2050 : if ( st->sp_aud_decision1 == 1 && hSpMusClas->lt_speech_state == 0 && hSpMusClas->lt_speech_hangover == 0 &&
2790 129 : cor > 0.40f && dft < 0.1f && voi_fv > 2 * cor_map_sum_fv + 0.12f &&
2791 17 : t2 < cor && t3 < cor && tL < cor && cor_map_sum_fv < cor && voi_fv > cor && voi_fv > 0.76f )
2792 : {
2793 7 : hSpMusClas->lt_speech_state = 1;
2794 7 : hSpMusClas->lt_speech_hangover = 6;
2795 : }
2796 2043 : else if ( hSpMusClas->lt_speech_state == 1 && hSpMusClas->lt_speech_hangover == 0 && cor < 0.40f )
2797 : {
2798 6 : hSpMusClas->lt_speech_state = 0;
2799 6 : hSpMusClas->lt_speech_hangover = 6;
2800 : }
2801 :
2802 2050 : if ( hSpMusClas->lt_speech_hangover > 0 )
2803 : {
2804 66 : hSpMusClas->lt_speech_hangover--;
2805 : }
2806 :
2807 : /* final decision */
2808 2050 : if ( st->sp_aud_decision1 == 1 && hSpMusClas->lt_speech_state == 1 )
2809 : {
2810 : /* strong speech - probably error in speech/music classification */
2811 37 : st->sp_aud_decision1 = 0;
2812 37 : st->sp_aud_decision2 = 0;
2813 : }
2814 2013 : else if ( st->sp_aud_decision1 == 0 && hSpMusClas->lt_music_state == 1 )
2815 : {
2816 : /* strong music - probably error in speech/music classification */
2817 0 : st->sp_aud_decision1 = 1;
2818 0 : st->sp_aud_decision2 = 1;
2819 : }
2820 :
2821 : /* update the buffer of past decisions */
2822 2050 : hSpMusClas->lt_old_mode[2] = st->sp_aud_decision1;
2823 :
2824 2050 : return;
2825 : }
2826 :
2827 : /*---------------------------------------------------------------------*
2828 : * detect_sparseness()
2829 : *
2830 : *
2831 : *---------------------------------------------------------------------*/
2832 :
2833 1042 : static void detect_sparseness(
2834 : Encoder_State *st, /* i/o: encoder state structure */
2835 : const int16_t localVAD_HE_SAD, /* i : HE-SAD flag without hangover */
2836 : const float voi_fv /* i : scaled voicing feature */
2837 : )
2838 : {
2839 : float sum;
2840 : float ftmp;
2841 : float ftmp1;
2842 : float S1[128];
2843 : int16_t i, j;
2844 1042 : int16_t hb_sp_high_flag = 0;
2845 1042 : int16_t lb_sp_high_flag = 0;
2846 : float sumh;
2847 : float sparse;
2848 : float tmp_buf[4];
2849 1042 : float Mlpe = 0.0f;
2850 1042 : float Mv = 0.0f;
2851 : float Msp;
2852 :
2853 1042 : SP_MUS_CLAS_HANDLE hSpMusClas = st->hSpMusClas;
2854 :
2855 1042 : mvr2r( st->Bin_E, S1, 128 );
2856 :
2857 1042 : sum = 0;
2858 84402 : for ( i = 0; i < 80; i++ )
2859 : {
2860 83360 : if ( S1[i] < 0 )
2861 : {
2862 17654 : S1[i] = 0;
2863 : }
2864 83360 : sum += S1[i];
2865 : }
2866 :
2867 1042 : sumh = 0;
2868 51058 : for ( i = 80; i < 128; i++ )
2869 : {
2870 50016 : if ( S1[i] < 0 )
2871 : {
2872 13266 : S1[i] = 0;
2873 : }
2874 50016 : sumh += S1[i];
2875 : }
2876 :
2877 1042 : sum += sumh;
2878 :
2879 : /* order spectral from max to min */
2880 1042 : order_spectrum( S1, 128 );
2881 :
2882 : /* calculate spectral sparseness in the range 0 - 6.4 kHz */
2883 1042 : j = 0;
2884 1042 : ftmp = 0.0f;
2885 1042 : ftmp1 = 0.75f * sum;
2886 55534 : for ( i = 0; i < 128; i++ )
2887 : {
2888 55528 : ftmp += S1[i];
2889 55528 : if ( ftmp > ftmp1 )
2890 : {
2891 1036 : j = i;
2892 1036 : break;
2893 : }
2894 : }
2895 :
2896 8336 : for ( i = 0; i < HANG_LEN_INIT - 1; i++ )
2897 : {
2898 7294 : hSpMusClas->sparse_buf[i] = hSpMusClas->sparse_buf[i + 1];
2899 : }
2900 :
2901 1042 : sparse = (float) j;
2902 1042 : hSpMusClas->sparse_buf[i] = sparse;
2903 :
2904 1042 : if ( st->bwidth == WB )
2905 : {
2906 0 : Msp = mean( hSpMusClas->sparse_buf, 8 );
2907 :
2908 : /* find long-term smoothed sparseness */
2909 0 : if ( hSpMusClas->last_vad_spa == 0 )
2910 : {
2911 0 : set_f( &hSpMusClas->sparse_buf[0], sparse, HANG_LEN_INIT - 1 );
2912 0 : hSpMusClas->LT_sparse = sparse;
2913 : }
2914 : else
2915 : {
2916 0 : set_f( tmp_buf, 0.0f, 4 );
2917 :
2918 0 : for ( i = 0; i < HANG_LEN_INIT; i++ )
2919 : {
2920 0 : for ( j = 0; j < 4; j++ )
2921 : {
2922 0 : if ( hSpMusClas->sparse_buf[i] > tmp_buf[j] )
2923 : {
2924 0 : mvr2r( &tmp_buf[j], &tmp_buf[j + 1], 3 - j );
2925 0 : tmp_buf[j] = hSpMusClas->sparse_buf[i];
2926 0 : break;
2927 : }
2928 : }
2929 : }
2930 :
2931 0 : ftmp = 0.25f * ( HANG_LEN_INIT * Msp - sum_f( tmp_buf, 4 ) ) - hSpMusClas->LT_sparse;
2932 :
2933 0 : hSpMusClas->LT_sparse = hSpMusClas->LT_sparse + 0.25f * ftmp;
2934 : }
2935 :
2936 : /* find high-band sparseness */
2937 0 : mvr2r( st->Bin_E + 80, S1, 48 );
2938 0 : order_spectrum( S1, 48 );
2939 :
2940 0 : for ( i = 0; i < HANG_LEN_INIT - 1; i++ )
2941 : {
2942 0 : hSpMusClas->hf_spar_buf[i] = hSpMusClas->hf_spar_buf[i + 1];
2943 : }
2944 0 : hSpMusClas->hf_spar_buf[i] = sum_f( S1, 5 ) / ( sumh + 0.1f );
2945 0 : if ( mean( hSpMusClas->hf_spar_buf, 8 ) > 0.2f )
2946 : {
2947 0 : hb_sp_high_flag = 1;
2948 : }
2949 :
2950 : /* find low-band sparseness */
2951 0 : mvr2r( st->Bin_E, S1, 60 );
2952 0 : order_spectrum( S1, 60 );
2953 :
2954 0 : if ( sum_f( S1, 5 ) / sum_f( S1, 60 ) > 0.18f )
2955 : {
2956 0 : lb_sp_high_flag = 1;
2957 : }
2958 :
2959 : /* find smoothed linear prediction efficiency */
2960 0 : for ( i = 0; i < 7; i++ )
2961 : {
2962 0 : hSpMusClas->lpe_buf[i] = hSpMusClas->lpe_buf[i + 1];
2963 : }
2964 :
2965 0 : hSpMusClas->lpe_buf[i] = hSpMusClas->past_epsP2;
2966 0 : Mlpe = mean( hSpMusClas->lpe_buf, 8 );
2967 :
2968 : /* find smoothed voicing */
2969 0 : for ( i = 0; i < HANG_LEN_INIT - 1; i++ )
2970 : {
2971 0 : hSpMusClas->voicing_buf[i] = hSpMusClas->voicing_buf[i + 1];
2972 : }
2973 :
2974 0 : hSpMusClas->voicing_buf[i] = voi_fv;
2975 0 : Mv = mean( hSpMusClas->voicing_buf, 8 );
2976 : }
2977 :
2978 : /* avoid using LR-MDCT on sparse spectra */
2979 1042 : if ( st->sp_aud_decision1 == 1 )
2980 : {
2981 308 : if ( st->bwidth == WB )
2982 : {
2983 0 : ftmp = 90;
2984 : }
2985 : else
2986 : {
2987 308 : ftmp = 91;
2988 : }
2989 308 : if ( sparse > ftmp )
2990 : {
2991 0 : st->sp_aud_decision1 = 0;
2992 0 : st->sp_aud_decision2 = 1;
2993 0 : hSpMusClas->gsc_hangover = 1;
2994 : }
2995 308 : else if ( hSpMusClas->gsc_hangover == 1 )
2996 : {
2997 0 : if ( sparse > 85 )
2998 : {
2999 0 : st->sp_aud_decision1 = 0;
3000 0 : st->sp_aud_decision2 = 1;
3001 : }
3002 0 : else if ( fabs( sparse - mean( &hSpMusClas->sparse_buf[HANG_LEN_INIT - 1 - hSpMusClas->gsc_cnt], hSpMusClas->gsc_cnt ) ) < 7.0f )
3003 : {
3004 0 : st->sp_aud_decision1 = 0;
3005 0 : st->sp_aud_decision2 = 1;
3006 : }
3007 : }
3008 :
3009 308 : if ( st->bwidth == WB )
3010 : {
3011 0 : if ( hSpMusClas->LT_sparse > 60 && sparse > 50 && Mlpe < -1.3f && Mv > 0.85f &&
3012 0 : lb_sp_high_flag == 0 && ( ( hb_sp_high_flag == 0 && sumh > 0.15f * sum ) || sumh <= 0.15f * sum ) )
3013 : {
3014 0 : st->sp_aud_decision1 = 0;
3015 0 : st->sp_aud_decision2 = 1;
3016 0 : hSpMusClas->gsc_hangover = 1;
3017 : }
3018 0 : else if ( hSpMusClas->gsc_hangover == 1 && !( st->sp_aud_decision1 == 0 && st->sp_aud_decision2 == 1 ) )
3019 : {
3020 0 : if ( fabs( sparse - mean( &hSpMusClas->sparse_buf[HANG_LEN_INIT - 1 - hSpMusClas->gsc_cnt], hSpMusClas->gsc_cnt ) ) < 7.0f )
3021 : {
3022 0 : st->sp_aud_decision1 = 0;
3023 0 : st->sp_aud_decision2 = 1;
3024 : }
3025 : }
3026 : }
3027 : }
3028 :
3029 : /* update the counter of consecutive GSC frames with sparse spectrum */
3030 1042 : if ( st->sp_aud_decision1 == 0 && st->sp_aud_decision2 == 1 )
3031 : {
3032 0 : ( hSpMusClas->gsc_cnt )++;
3033 0 : if ( hSpMusClas->gsc_cnt > 7 )
3034 : {
3035 0 : hSpMusClas->gsc_cnt = 7;
3036 : }
3037 : }
3038 : else
3039 : {
3040 1042 : hSpMusClas->gsc_cnt = 0;
3041 1042 : hSpMusClas->gsc_hangover = 0;
3042 : }
3043 :
3044 1042 : hSpMusClas->last_vad_spa = localVAD_HE_SAD;
3045 :
3046 1042 : return;
3047 : }
3048 :
3049 :
/*---------------------------------------------------------------------*
 * order_spectrum()
 *
 * In-place ordering of a vector from the maximum to the minimum value.
 * Each pass over the still unordered middle part selects both its
 * maximum and its minimum and moves them to the head and to the tail
 * of that part, respectively.
 *---------------------------------------------------------------------*/

static void order_spectrum(
    float *vec,        /* i/o: vector to be ordered (max first, min last) */
    const int16_t len  /* i  : number of elements in the vector           */
)
{
    int16_t head, tail, k;
    int16_t pos_max, pos_min;
    float swap_val;

    tail = len - 1;
    for ( head = 0; head < len / 2; head++ )
    {
        /* locate the extremes of the unordered part vec[head..tail] */
        pos_max = head;
        pos_min = head;
        for ( k = head; k <= tail; k++ )
        {
            if ( vec[k] > vec[pos_max] )
            {
                pos_max = k;
            }
            else if ( vec[k] < vec[pos_min] )
            {
                pos_min = k;
            }
        }

        /* maximum goes to the head */
        swap_val = vec[head];
        vec[head] = vec[pos_max];
        vec[pos_max] = swap_val;

        /* the minimum was sitting at the head -> the swap above has moved it to pos_max */
        if ( pos_min == head )
        {
            pos_min = pos_max;
        }

        /* minimum goes to the tail */
        swap_val = vec[tail];
        vec[tail] = vec[pos_min];
        vec[pos_min] = swap_val;

        tail--;
    }

    return;
}
|