Line data Source code
1 : /******************************************************************************************************
2 :
3 : (C) 2022-2025 IVAS codec Public Collaboration with portions copyright Dolby International AB, Ericsson AB,
4 : Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD.,
5 : Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange,
6 : Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other
7 : contributors to this repository. All Rights Reserved.
8 :
9 : This software is protected by copyright law and by international treaties.
10 : The IVAS codec Public Collaboration consisting of Dolby International AB, Ericsson AB,
11 : Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD.,
12 : Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange,
13 : Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other
14 : contributors to this repository retain full ownership rights in their respective contributions in
15 : the software. This notice grants no license of any kind, including but not limited to patent
16 : license, nor is any license granted by implication, estoppel or otherwise.
17 :
18 : Contributors are required to enter into the IVAS codec Public Collaboration agreement before making
19 : contributions.
20 :
21 : This software is provided "AS IS", without any express or implied warranties. The software is in the
22 : development stage. It is intended exclusively for experts who have experience with such software and
23 : solely for the purpose of inspection. All implied warranties of non-infringement, merchantability
24 : and fitness for a particular purpose are hereby disclaimed and excluded.
25 :
26 : Any dispute, controversy or claim arising under or in relation to providing this software shall be
27 : submitted to and settled by the final, binding jurisdiction of the courts of Munich, Germany in
28 : accordance with the laws of the Federal Republic of Germany excluding its conflict of law rules and
29 : the United Nations Convention on Contracts on the International Sales of Goods.
30 :
31 : *******************************************************************************************************/
32 :
33 : /*====================================================================================
34 : EVS Codec 3GPP TS26.443 Nov 04, 2021. Version 12.14.0 / 13.10.0 / 14.6.0 / 15.4.0 / 16.3.0
35 : ====================================================================================*/
36 :
37 : #include <stdint.h>
38 : #include "options.h"
39 : #ifdef DEBUGGING
40 : #include "debug.h"
41 : #endif
42 : #include "cnst.h"
43 : #include "rom_enc.h"
44 : #include "prot.h"
45 : #include "wmc_auto.h"
46 :
47 : /*-------------------------------------------------------------------*
48 : * Local constants
49 : *-------------------------------------------------------------------*/
50 :
51 : #define MAX_DELTA 16 /* half-length of the delta search */
52 : #define COR_BUF_LEN ( L_INTERPOL1 * 2 + MAX_DELTA * 2 + 1 ) /* number of entries of the correlation buffer cor[] in pitch_ol2(): room for 2 * MAX_DELTA search lags plus L_INTERPOL1 extra lags on each side */
53 :
54 : /*-------------------------------------------------------------------*
55 : * pitch_ol2()
56 : *
57 : * Open-loop pitch precision improvement with 1/4 resolution
58 : * The pitch is searched in the interval <pitch_ol-delta, pitch_ol+delta),
59 : * i.e. the value pitch_ol + delta is not a part of the interval; the interval is additionally clamped to pit_min at the lower end and to PIT_MAX at the upper end
60 : *-------------------------------------------------------------------*/
61 :
62 979340 : void pitch_ol2(
63 : const int16_t pit_min, /* i : pit_min value (lower clamp of the integer lag search) */
64 : const int16_t pitch_ol, /* i : pitch to be improved */
65 : float *pitch_fr, /* o : adjusted 1/4 fractional pitch */
66 : float *voicing_fr, /* o : adjusted 1/4 fractional voicing */
67 : const int16_t pos, /* i : position in frame where to calculate the improv. */
68 : const float *wsp, /* i : weighted speech for current frame and look-ahead; addresses down to wsp + pos - (t0_max + L_INTERPOL1) are passed to dotp() */
69 : const int16_t delta /* i : delta for pitch search; NOTE(review): not checked against MAX_DELTA, a larger value can make the fill loop write past cor[] */
70 : )
71 : {
72 : int16_t i, t, t0, t1, step, fraction, t0_min, t0_max, t_min, t_max;
73 : float temp, cor_max, enr_wsp, enr_old, cor[COR_BUF_LEN], *pt_cor, wsp_fr[L_SUBFR];
74 : const float *pt_wsp;
75 : #ifdef FIX_2271_OOB_INDEXING_IN_PIT_OL2
76 : int16_t base_idx;
77 : #endif
78 979340 : t0_min = pitch_ol - delta; /* first integer lag of the search */
79 979340 : t0_max = pitch_ol + delta - 1; /* last integer lag of the search (pitch_ol + delta itself is excluded) */
80 :
81 979340 : if ( t0_min < pit_min )
82 : {
83 105922 : t0_min = pit_min;
84 : }
85 979340 : t_min = t0_min - L_INTERPOL1; /* correlations are also computed for L_INTERPOL1 lags below the search range */
86 :
87 979340 : if ( t0_max > PIT_MAX )
88 : {
89 24484 : t0_max = PIT_MAX;
90 : }
91 979340 : t_max = t0_max + L_INTERPOL1; /* ... and for L_INTERPOL1 lags above the search range */
92 : /* fill cor[]: cor[k] is the dotp() result for lag t_min + k, taken over L_SUBFR samples starting at wsp + pos */
93 979340 : pt_wsp = wsp + pos;
94 979340 : pt_cor = cor;
95 21918046 : for ( t = t_min; t <= t_max; t++ )
96 : {
97 20938706 : *pt_cor++ = dotp( pt_wsp, pt_wsp - t, L_SUBFR );
98 : }
99 : /* integer lag search: t1 becomes the lag in [t0_min, t0_max] with the largest cor value; because of the strict '>' the smallest lag wins on ties */
100 979340 : pt_cor = cor + L_INTERPOL1; /* entry that belongs to lag t0_min */
101 979340 : cor_max = *pt_cor++;
102 979340 : t1 = t0_min;
103 13103986 : for ( t = t0_min + 1; t <= t0_max; t++ )
104 : {
105 12124646 : if ( *pt_cor > cor_max )
106 : {
107 4751290 : cor_max = *pt_cor;
108 4751290 : t1 = t;
109 : }
110 12124646 : pt_cor++;
111 : }
112 :
113 : /*------------------------------------------------------------------*
114 : * Search fractional pitch with 1/4 subsample resolution.
115 : * search the fractions around t0 and choose the one which maximizes
116 : * the interpolated normalized correlation.
117 : *-----------------------------------------------------------------*/
118 :
119 : #ifndef FIX_2271_OOB_INDEXING_IN_PIT_OL2
120 : pt_cor = cor + L_INTERPOL1 - t0_min; /* legacy path: biased pointer so that pt_cor[t] is the entry of lag t; the pointer value itself lies before cor[] whenever t0_min > L_INTERPOL1 */
121 : #endif
122 979340 : t0 = t1;
123 : #ifdef FIX_2271_OOB_INDEXING_IN_PIT_OL2
124 979340 : base_idx = L_INTERPOL1 - t0_min; /* index offset: cor[t + base_idx] is the entry of integer lag t */
125 : #endif
126 979340 : step = 1; /* 1/4 subsample resolution */
127 979340 : fraction = 1;
128 :
129 979340 : if ( t0 == t0_min ) /* Limit case: the best integer lag is t0_min, only fraction 0 at t0_min is evaluated here and lags below t0_min are not tried */
130 : {
131 94111 : fraction = 0;
132 : #ifndef FIX_2271_OOB_INDEXING_IN_PIT_OL2
133 : cor_max = interpolation( &pt_cor[t0], E_ROM_inter4_1, fraction, PIT_UP_SAMP, 4 );
134 : #else
135 94111 : cor_max = interpolation( &cor[t0 + base_idx], E_ROM_inter4_1, fraction, PIT_UP_SAMP, 4 );
136 : #endif
137 : }
138 : else /* Process negative fractions: fractions 1..3 relative to the integer lag t1 - 1 */
139 : {
140 885229 : t0--;
141 : #ifndef FIX_2271_OOB_INDEXING_IN_PIT_OL2
142 : cor_max = interpolation( &pt_cor[t0], E_ROM_inter4_1, fraction, PIT_UP_SAMP, 4 );
143 : #else
144 885229 : cor_max = interpolation( &cor[t0 + base_idx], E_ROM_inter4_1, fraction, PIT_UP_SAMP, 4 );
145 : #endif
146 2655687 : for ( i = ( fraction + step ); i <= 3; i = i + step )
147 : {
148 : #ifndef FIX_2271_OOB_INDEXING_IN_PIT_OL2
149 : temp = interpolation( &pt_cor[t0], E_ROM_inter4_1, i, PIT_UP_SAMP, 4 );
150 : #else
151 1770458 : temp = interpolation( &cor[t0 + base_idx], E_ROM_inter4_1, i, PIT_UP_SAMP, 4 );
152 : #endif
153 1770458 : if ( temp > cor_max )
154 : {
155 1658114 : cor_max = temp;
156 1658114 : fraction = i;
157 : }
158 : }
159 : }
160 :
161 4896700 : for ( i = 0; i <= 3; i = i + step ) /* Process positive fractions: fractions 0..3 relative to t1; only a strictly larger value replaces the current best and moves the integer part to t1 */
162 : {
163 : #ifndef FIX_2271_OOB_INDEXING_IN_PIT_OL2
164 : temp = interpolation( &pt_cor[t1], E_ROM_inter4_1, i, PIT_UP_SAMP, 4 );
165 : #else
166 3917360 : temp = interpolation( &cor[t1 + base_idx], E_ROM_inter4_1, i, PIT_UP_SAMP, 4 );
167 : #endif
168 3917360 : if ( temp > cor_max )
169 : {
170 1104558 : cor_max = temp;
171 1104558 : fraction = i;
172 1104558 : t0 = t1;
173 : }
174 : }
175 :
176 979340 : *pitch_fr = t0 + (float) fraction / 4.0f; /* integer lag plus the selected fraction in quarter samples */
177 979340 : pred_lt4( pt_wsp, wsp_fr, t0, fraction, L_SUBFR, E_ROM_inter4_1, 4, PIT_UP_SAMP ); /* NOTE(review): presumably fills wsp_fr[] with the signal delayed by the fractional lag - confirm against pred_lt4() */
178 : /* voicing: cor_max scaled by inv_sqrt() of the product of both energies; the 0.01f offsets presumably guard against a zero product */
179 979340 : enr_wsp = dotp( pt_wsp, pt_wsp, L_SUBFR ) + 0.01f;
180 979340 : enr_old = dotp( wsp_fr, wsp_fr, L_SUBFR ) + 0.01f;
181 979340 : *voicing_fr = cor_max * inv_sqrt( enr_wsp * enr_old );
182 :
183 979340 : return;
184 : }
185 :
186 :
187 : /*-------------------------------------------------------------------*
188 : * StableHighPitchDetect()
189 : *
190 : * Very short stable pitch detection: updates the smoothed statistics, searches the lags PIT_MIN_DOUBLEEXTEND..PIT_MIN and, when the final decision fires, overwrites pitch[0..2] with the lag found
191 : *-------------------------------------------------------------------*/
192 :
193 851621 : void StableHighPitchDetect(
194 : int16_t *flag_spitch, /* o : flag to indicate very short stable pitch*/
195 : int16_t pitch[], /* i/o: OL pitch buffer (pitch[1] is read; pitch[0..2] are overwritten when *flag_spitch is set) */
196 : const float voicing[], /* i : OL pitch gains (3 entries are averaged) */
197 : const float Bin_E[], /* i : per bin log energy spectrum */
198 : const float wsp[], /* i : weighted speech */
199 : const int16_t localVAD, /* i : local VAD flag */
200 : float *voicing_sm, /* i/o: smoothed open-loop pitch gains */
201 : float *voicing0_sm, /* i/o: smoothed high pitch gains */
202 : float *LF_EnergyRatio_sm, /* i/o: smoothed [0, 300Hz] relative peak energy*/
203 : int16_t *predecision_flag, /* i/o: predecision flag */
204 : float *diff_sm, /* i/o: smoothed pitch frequency difference */
205 : float *energy_sm /* i/o: smoothed energy around pitch frequency */
206 : )
207 : {
208 : int16_t i, pitch_freq_point, pit_min_up;
209 : int16_t T, Tp, pit_min;
210 :
211 : float voicing_m;
212 : float energy0, energy1, ratio, cor_max, diff, sum_energy;
213 : const float *pt_wsp;
214 :
215 851621 : voicing_m = mean( voicing, 3 ); /* mean() result over the three voicing[] entries */
216 851621 : *voicing_sm = 0.75f * ( *voicing_sm ) + 0.25f * voicing_m; /* recursive smoothing, 0.75 old / 0.25 new */
217 :
218 :
219 : /* initial short pitch possibility pre-decision */
220 851621 : pitch_freq_point = (int16_t) ( L_FFT / pitch[1] + 0.5f ); /* NOTE(review): if L_FFT is an integer constant the division truncates before 0.5f is added, so the 0.5f cannot round; pitch[1] == 0 is not guarded - confirm */
221 851621 : diff = 0.0f;
222 851621 : sum_energy = 0.0f;
223 : /* over bins 1 .. 2 * pitch_freq_point - 1: diff sums ( Bin_E[pitch_freq_point] - Bin_E[i] ), sum_energy sums Bin_E[i] */
224 7518314 : for ( i = 1; i < 2 * pitch_freq_point; i++ )
225 : {
226 6666693 : diff += ( Bin_E[pitch_freq_point] - Bin_E[i] );
227 6666693 : sum_energy += Bin_E[i];
228 : }
229 851621 : sum_energy /= ( 2 * pitch_freq_point - 1 ); /* average over the number of accumulated bins */
230 : /* the smoothed values use the un-normalised diff; diff is divided by sum_energy only afterwards */
231 851621 : *diff_sm = 0.2f * diff + 0.8f * *diff_sm;
232 851621 : *energy_sm = 0.2f * sum_energy + 0.8f * *energy_sm;
233 851621 : diff /= sum_energy; /* NOTE(review): sum_energy == 0 is not guarded */
234 :
235 851621 : if ( *diff_sm < -10 && *energy_sm < 38.5 && diff < -0.8 )
236 : {
237 16290 : *predecision_flag = 1;
238 : }
239 :
240 851621 : if ( *diff_sm > 10 && *energy_sm > 83 && diff > 0.5 )
241 : {
242 69627 : *predecision_flag = 0;
243 : }
244 :
245 : /* short pitch possibility pre-decision */
246 851621 : maximum( Bin_E, 7, &energy0 ); /* maximum() over 7 entries starting at Bin_E[0], result in energy0 */
247 851621 : maximum( Bin_E + 8, 7, &energy1 ); /* maximum() over 7 entries starting at Bin_E[8], result in energy1 */
248 851621 : ratio = max( energy1 - energy0, 0 ); /* negative differences are clipped to 0 */
249 851621 : ratio *= max( voicing_m, 0 ); /* weighted by the mean voicing, negative voicing clipped to 0 */
250 :
251 851621 : *LF_EnergyRatio_sm = ( 15 * ( *LF_EnergyRatio_sm ) + ratio ) / 16; /* recursive smoothing, 15/16 old + 1/16 new */
252 :
253 851621 : if ( *LF_EnergyRatio_sm > 35 || ratio > 50 )
254 : {
255 19427 : *predecision_flag = 1;
256 : }
257 : /* this test is evaluated last, so a smoothed ratio below 16 clears the flag even if it was set just above */
258 851621 : if ( *LF_EnergyRatio_sm < 16 )
259 : {
260 815344 : *predecision_flag = 0;
261 : }
262 :
263 : /* short pitch candidate detection */
264 851621 : Tp = pitch[1]; /* start value; replaced in the first loop iteration because of the T == pit_min test */
265 851621 : cor_max = 0;
266 :
267 851621 : pt_wsp = wsp + 3 * L_SUBFR; /* the search works on L_SUBFR samples starting at wsp + 3 * L_SUBFR */
268 851621 : pit_min = PIT_MIN_DOUBLEEXTEND;
269 851621 : pit_min_up = PIT_MIN;
270 :
271 16180799 : for ( T = pit_min; T <= pit_min_up; T++ )
272 : {
273 15329178 : energy1 = dotp( pt_wsp, pt_wsp - T, L_SUBFR ); /* energy1 is reused here to hold the correlation at lag T */
274 :
275 15329178 : if ( energy1 > cor_max || T == pit_min ) /* T == pit_min forces initialisation from the first lag, so cor_max may end up negative */
276 : {
277 4546884 : cor_max = energy1;
278 4546884 : Tp = T;
279 : }
280 : }
281 : /* normalise the best correlation by inv_sqrt() of the product of both segment energies (each offset by 0.01f) */
282 851621 : energy0 = dotp( pt_wsp, pt_wsp, L_SUBFR ) + 0.01f;
283 851621 : energy1 = dotp( pt_wsp - Tp, pt_wsp - Tp, L_SUBFR ) + 0.01f;
284 851621 : cor_max *= inv_sqrt( energy0 * energy1 );
285 851621 : *voicing0_sm = 0.75f * ( *voicing0_sm ) + 0.25f * cor_max; /* recursive smoothing, 0.75 old / 0.25 new */
286 :
287 : /* final short pitch correction */
288 851621 : *flag_spitch = 0;
289 851621 : if ( localVAD && *predecision_flag && *voicing0_sm > 0.65f && *voicing0_sm > 0.7f * ( *voicing_sm ) )
290 : {
291 25438 : *flag_spitch = 1;
292 : /* all three open-loop pitch entries are replaced by the short lag */
293 25438 : pitch[0] = Tp;
294 25438 : pitch[1] = Tp;
295 25438 : pitch[2] = Tp;
296 : }
297 :
298 851621 : return;
299 : }
300 :
301 : /*-------------------------------------------------------------------*
302 : * pitchDoubling_det()
303 : * Multiple pitch doubling detector
304 : * For m = 2, 3, 4 the value pitch_ol[k] / m is refined with pitch_ol2() and kept when the summed voicing of the two related entries increases
305 : *-------------------------------------------------------------------*/
306 :
307 0 : void pitchDoubling_det(
308 : const float *wspeech, /* i : signal passed to pitch_ol2() as its wsp argument */
309 : int16_t *pitch_ol, /* i/o: open-loop pitch values, entries [0] and [1] */
310 : float *pitch_fr, /* i/o: fractional pitch values, entries [0..3]; rewritten only when a candidate wins */
311 : float *voicing_fr ) /* i/o: voicing values, entries [0..3]; rewritten only when a candidate wins */
312 : {
313 : float new_op_fr[2];
314 : float new_voicing[2];
315 : int16_t new_Top[2];
316 : int16_t m, T;
317 :
318 : /*save initial values*/
319 0 : new_Top[0] = pitch_ol[0];
320 0 : new_Top[1] = pitch_ol[1];
321 0 : for ( m = 2; m < 5; m++ ) /* divisors 2, 3 and 4 */
322 : {
323 0 : T = pitch_ol[0] / m; /* candidate always derives from the original pitch_ol[0]; pitch_ol[] is written only after the loop */
324 0 : if ( T >= PIT_MIN_12k8 )
325 : {
326 0 : pitch_ol2( PIT_MIN_SHORTER, T, &new_op_fr[0], &new_voicing[0], 0, wspeech, 2 ); /* refinement at position 0, delta = 2 */
327 0 : pitch_ol2( PIT_MIN_SHORTER, T, &new_op_fr[1], &new_voicing[1], L_SUBFR, wspeech, 2 ); /* refinement at position L_SUBFR, delta = 2 */
328 : /* voicing_fr[] is updated in place, so a later divisor has to beat the best sum found so far */
329 0 : if ( ( new_voicing[0] + new_voicing[1] ) > ( voicing_fr[0] + voicing_fr[1] ) )
330 : {
331 0 : new_Top[0] = T;
332 0 : pitch_fr[0] = new_op_fr[0];
333 0 : pitch_fr[1] = new_op_fr[1];
334 0 : voicing_fr[0] = new_voicing[0];
335 0 : voicing_fr[1] = new_voicing[1];
336 : }
337 : }
338 : /* same procedure for pitch_ol[1], using positions 2 * L_SUBFR and 3 * L_SUBFR and entries [2], [3] */
339 0 : T = pitch_ol[1] / m;
340 0 : if ( T >= PIT_MIN_12k8 )
341 : {
342 0 : pitch_ol2( PIT_MIN_SHORTER, T, &new_op_fr[0], &new_voicing[0], 2 * L_SUBFR, wspeech, 2 );
343 0 : pitch_ol2( PIT_MIN_SHORTER, T, &new_op_fr[1], &new_voicing[1], 3 * L_SUBFR, wspeech, 2 );
344 :
345 0 : if ( ( new_voicing[0] + new_voicing[1] ) > ( voicing_fr[2] + voicing_fr[3] ) )
346 : {
347 0 : new_Top[1] = T;
348 0 : pitch_fr[2] = new_op_fr[0];
349 0 : pitch_fr[3] = new_op_fr[1];
350 0 : voicing_fr[2] = new_voicing[0];
351 0 : voicing_fr[3] = new_voicing[1];
352 : }
353 : }
354 : }
355 0 : pitch_ol[0] = new_Top[0]; /* write back the winning (or unchanged) open-loop pitch values */
356 0 : pitch_ol[1] = new_Top[1];
357 :
358 0 : return;
359 : }
|