Line data Source code
1 : /******************************************************************************************************
2 :
3 : (C) 2022-2025 IVAS codec Public Collaboration with portions copyright Dolby International AB, Ericsson AB,
4 : Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD.,
5 : Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange,
6 : Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other
7 : contributors to this repository. All Rights Reserved.
8 :
9 : This software is protected by copyright law and by international treaties.
10 : The IVAS codec Public Collaboration consisting of Dolby International AB, Ericsson AB,
11 : Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD.,
12 : Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange,
13 : Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other
14 : contributors to this repository retain full ownership rights in their respective contributions in
15 : the software. This notice grants no license of any kind, including but not limited to patent
16 : license, nor is any license granted by implication, estoppel or otherwise.
17 :
18 : Contributors are required to enter into the IVAS codec Public Collaboration agreement before making
19 : contributions.
20 :
21 : This software is provided "AS IS", without any express or implied warranties. The software is in the
22 : development stage. It is intended exclusively for experts who have experience with such software and
23 : solely for the purpose of inspection. All implied warranties of non-infringement, merchantability
24 : and fitness for a particular purpose are hereby disclaimed and excluded.
25 :
26 : Any dispute, controversy or claim arising under or in relation to providing this software shall be
27 : submitted to and settled by the final, binding jurisdiction of the courts of Munich, Germany in
28 : accordance with the laws of the Federal Republic of Germany excluding its conflict of law rules and
29 : the United Nations Convention on Contracts on the International Sales of Goods.
30 :
31 : *******************************************************************************************************/
32 :
33 : /*====================================================================================
34 : EVS Codec 3GPP TS26.443 Nov 04, 2021. Version 12.14.0 / 13.10.0 / 14.6.0 / 15.4.0 / 16.3.0
35 : ====================================================================================*/
36 :
37 : #include <stdint.h>
38 : #include "options.h"
39 : #ifdef DEBUGGING
40 : #include "debug.h"
41 : #endif
42 : #include "cnst.h"
43 : #include "rom_enc.h"
44 : #include "prot.h"
45 : #include "wmc_auto.h"
46 :
47 : /*-------------------------------------------------------------------*
48 : * Local constants
49 : *-------------------------------------------------------------------*/
50 :
#define MAX_DELTA   16 /* half-length of the delta search; also sizes the correlation buffer below */
#define COR_BUF_LEN ( MAX_DELTA * 2 + L_INTERPOL1 * 2 + 1 ) /* 2*MAX_DELTA+1 lags plus L_INTERPOL1 extra lags on each side */
53 :
54 : /*-------------------------------------------------------------------*
55 : * pitch_ol2()
56 : *
57 : * Open-loop pitch precision improvement with 1/4 resolution
58 : * The pitch is searched in the interval <pitch_ol-delta, pitch_ol+delta),
59 : * i.e. the value pitch_ol + delta is not a part of the interval
60 : *-------------------------------------------------------------------*/
61 :
62 14049996 : void pitch_ol2(
63 : const int16_t pit_min, /* i : pit_min value */
64 : const int16_t pitch_ol, /* i : pitch to be improved */
65 : float *pitch_fr, /* o : adjusted 1/4 fractional pitch */
66 : float *voicing_fr, /* o : adjusted 1/4 fractional voicing */
67 : const int16_t pos, /* i : position in frame where to calculate the improv. */
68 : const float *wsp, /* i : weighted speech for current frame and look-ahead */
69 : const int16_t delta /* i : delta for pitch search */
70 : )
71 : {
72 : int16_t i, t, t0, t1, step, fraction, t0_min, t0_max, t_min, t_max;
73 : float temp, cor_max, enr_wsp, enr_old, cor[COR_BUF_LEN], *pt_cor, wsp_fr[L_SUBFR];
74 : const float *pt_wsp;
75 :
76 14049996 : t0_min = pitch_ol - delta;
77 14049996 : t0_max = pitch_ol + delta - 1;
78 :
79 14049996 : if ( t0_min < pit_min )
80 : {
81 1867212 : t0_min = pit_min;
82 : }
83 14049996 : t_min = t0_min - L_INTERPOL1;
84 :
85 14049996 : if ( t0_max > PIT_MAX )
86 : {
87 308338 : t0_max = PIT_MAX;
88 : }
89 14049996 : t_max = t0_max + L_INTERPOL1;
90 :
91 14049996 : pt_wsp = wsp + pos;
92 14049996 : pt_cor = cor;
93 312754314 : for ( t = t_min; t <= t_max; t++ )
94 : {
95 298704318 : *pt_cor++ = dotp( pt_wsp, pt_wsp - t, L_SUBFR );
96 : }
97 :
98 14049996 : pt_cor = cor + L_INTERPOL1;
99 14049996 : cor_max = *pt_cor++;
100 14049996 : t1 = t0_min;
101 186304350 : for ( t = t0_min + 1; t <= t0_max; t++ )
102 : {
103 172254354 : if ( *pt_cor > cor_max )
104 : {
105 62908184 : cor_max = *pt_cor;
106 62908184 : t1 = t;
107 : }
108 172254354 : pt_cor++;
109 : }
110 :
111 : /*------------------------------------------------------------------*
112 : * Search fractional pitch with 1/4 subsample resolution.
113 : * search the fractions around t0 and choose the one which maximizes
114 : * the interpolated normalized correlation.
115 : *-----------------------------------------------------------------*/
116 :
117 14049996 : pt_cor = cor + L_INTERPOL1 - t0_min;
118 14049996 : t0 = t1;
119 :
120 14049996 : step = 1; /* 1/4 subsample resolution */
121 14049996 : fraction = 1;
122 :
123 14049996 : if ( t0 == t0_min ) /* Limit case */
124 : {
125 1422077 : fraction = 0;
126 1422077 : cor_max = interpolation( &pt_cor[t0], E_ROM_inter4_1, fraction, PIT_UP_SAMP, 4 );
127 : }
128 : else /* Process negative fractions */
129 : {
130 12627919 : t0--;
131 12627919 : cor_max = interpolation( &pt_cor[t0], E_ROM_inter4_1, fraction, PIT_UP_SAMP, 4 );
132 37883757 : for ( i = ( fraction + step ); i <= 3; i = i + step )
133 : {
134 25255838 : temp = interpolation( &pt_cor[t0], E_ROM_inter4_1, i, PIT_UP_SAMP, 4 );
135 25255838 : if ( temp > cor_max )
136 : {
137 23734824 : cor_max = temp;
138 23734824 : fraction = i;
139 : }
140 : }
141 : }
142 :
143 70249980 : for ( i = 0; i <= 3; i = i + step ) /* Process positive fractions */
144 : {
145 56199984 : temp = interpolation( &pt_cor[t1], E_ROM_inter4_1, i, PIT_UP_SAMP, 4 );
146 56199984 : if ( temp > cor_max )
147 : {
148 15287878 : cor_max = temp;
149 15287878 : fraction = i;
150 15287878 : t0 = t1;
151 : }
152 : }
153 :
154 14049996 : *pitch_fr = t0 + (float) fraction / 4.0f;
155 14049996 : pred_lt4( pt_wsp, wsp_fr, t0, fraction, L_SUBFR, E_ROM_inter4_1, 4, PIT_UP_SAMP );
156 :
157 14049996 : enr_wsp = dotp( pt_wsp, pt_wsp, L_SUBFR ) + 0.01f;
158 14049996 : enr_old = dotp( wsp_fr, wsp_fr, L_SUBFR ) + 0.01f;
159 14049996 : *voicing_fr = cor_max * inv_sqrt( enr_wsp * enr_old );
160 :
161 14049996 : return;
162 : }
163 :
164 :
165 : /*-------------------------------------------------------------------*
166 : * StableHighPitchDetect()
167 : *
168 : * Very short stable pitch detection
169 : *-------------------------------------------------------------------*/
170 :
171 16209600 : void StableHighPitchDetect(
172 : int16_t *flag_spitch, /* o : flag to indicate very short stable pitch*/
173 : int16_t pitch[], /* i/o: OL pitch buffer */
174 : const float voicing[], /* i : OL pitch gains */
175 : const float Bin_E[], /* i : per bin log energy spectrum */
176 : const float wsp[], /* i : weighted speech */
177 : const int16_t localVAD, /* i : local VAD flag */
178 : float *voicing_sm, /* i/o: smoothed open-loop pitch gains */
179 : float *voicing0_sm, /* i/o: smoothed high pitch gains */
180 : float *LF_EnergyRatio_sm, /* i/o: smoothed [0, 300Hz] relative peak energy*/
181 : int16_t *predecision_flag, /* i/o: predecision flag */
182 : float *diff_sm, /* i/o: smoothed pitch frequency difference */
183 : float *energy_sm /* i/o: smoothed energy around pitch frequency */
184 : )
185 : {
186 : int16_t i, pitch_freq_point, pit_min_up;
187 : int16_t T, Tp, pit_min;
188 :
189 : float voicing_m;
190 : float energy0, energy1, ratio, cor_max, diff, sum_energy;
191 : const float *pt_wsp;
192 :
193 16209600 : voicing_m = mean( voicing, 3 );
194 16209600 : *voicing_sm = 0.75f * ( *voicing_sm ) + 0.25f * voicing_m;
195 :
196 :
197 : /* initial short pitch possibility pre-decision */
198 16209600 : pitch_freq_point = (int16_t) ( L_FFT / pitch[1] + 0.5f );
199 16209600 : diff = 0.0f;
200 16209600 : sum_energy = 0.0f;
201 :
202 148717146 : for ( i = 1; i < 2 * pitch_freq_point; i++ )
203 : {
204 132507546 : diff += ( Bin_E[pitch_freq_point] - Bin_E[i] );
205 132507546 : sum_energy += Bin_E[i];
206 : }
207 16209600 : sum_energy /= ( 2 * pitch_freq_point - 1 );
208 :
209 16209600 : *diff_sm = 0.2f * diff + 0.8f * *diff_sm;
210 16209600 : *energy_sm = 0.2f * sum_energy + 0.8f * *energy_sm;
211 16209600 : diff /= sum_energy;
212 :
213 16209600 : if ( *diff_sm < -10 && *energy_sm < 38.5 && diff < -0.8 )
214 : {
215 295471 : *predecision_flag = 1;
216 : }
217 :
218 16209600 : if ( *diff_sm > 10 && *energy_sm > 83 && diff > 0.5 )
219 : {
220 691019 : *predecision_flag = 0;
221 : }
222 :
223 : /* short pitch possiblity pre-decision */
224 16209600 : maximum( Bin_E, 7, &energy0 );
225 16209600 : maximum( Bin_E + 8, 7, &energy1 );
226 16209600 : ratio = max( energy1 - energy0, 0 );
227 16209600 : ratio *= max( voicing_m, 0 );
228 :
229 16209600 : *LF_EnergyRatio_sm = ( 15 * ( *LF_EnergyRatio_sm ) + ratio ) / 16;
230 :
231 16209600 : if ( *LF_EnergyRatio_sm > 35 || ratio > 50 )
232 : {
233 648786 : *predecision_flag = 1;
234 : }
235 :
236 16209600 : if ( *LF_EnergyRatio_sm < 16 )
237 : {
238 15222056 : *predecision_flag = 0;
239 : }
240 :
241 : /* short pitch candidate detection */
242 16209600 : Tp = pitch[1];
243 16209600 : cor_max = 0;
244 :
245 16209600 : pt_wsp = wsp + 3 * L_SUBFR;
246 16209600 : pit_min = PIT_MIN_DOUBLEEXTEND;
247 16209600 : pit_min_up = PIT_MIN;
248 :
249 307982400 : for ( T = pit_min; T <= pit_min_up; T++ )
250 : {
251 291772800 : energy1 = dotp( pt_wsp, pt_wsp - T, L_SUBFR );
252 :
253 291772800 : if ( energy1 > cor_max || T == pit_min )
254 : {
255 80184261 : cor_max = energy1;
256 80184261 : Tp = T;
257 : }
258 : }
259 :
260 16209600 : energy0 = dotp( pt_wsp, pt_wsp, L_SUBFR ) + 0.01f;
261 16209600 : energy1 = dotp( pt_wsp - Tp, pt_wsp - Tp, L_SUBFR ) + 0.01f;
262 16209600 : cor_max *= inv_sqrt( energy0 * energy1 );
263 16209600 : *voicing0_sm = 0.75f * ( *voicing0_sm ) + 0.25f * cor_max;
264 :
265 : /* final short pitch correction */
266 16209600 : *flag_spitch = 0;
267 16209600 : if ( localVAD && *predecision_flag && *voicing0_sm > 0.65f && *voicing0_sm > 0.7f * ( *voicing_sm ) )
268 : {
269 815365 : *flag_spitch = 1;
270 :
271 815365 : pitch[0] = Tp;
272 815365 : pitch[1] = Tp;
273 815365 : pitch[2] = Tp;
274 : }
275 :
276 16209600 : return;
277 : }
278 :
279 : /*-------------------------------------------------------------------*
280 : * pitchDoubling_det()
281 : * Multiple pitch doubling detector
282 : *
283 : *-------------------------------------------------------------------*/
284 :
285 2371 : void pitchDoubling_det(
286 : const float *wspeech,
287 : int16_t *pitch_ol,
288 : float *pitch_fr,
289 : float *voicing_fr )
290 : {
291 : float new_op_fr[2];
292 : float new_voicing[2];
293 : int16_t new_Top[2];
294 : int16_t m, T;
295 :
296 : /*save initial values*/
297 2371 : new_Top[0] = pitch_ol[0];
298 2371 : new_Top[1] = pitch_ol[1];
299 9484 : for ( m = 2; m < 5; m++ )
300 : {
301 7113 : T = pitch_ol[0] / m;
302 7113 : if ( T >= PIT_MIN_12k8 )
303 : {
304 2784 : pitch_ol2( PIT_MIN_SHORTER, T, &new_op_fr[0], &new_voicing[0], 0, wspeech, 2 );
305 2784 : pitch_ol2( PIT_MIN_SHORTER, T, &new_op_fr[1], &new_voicing[1], L_SUBFR, wspeech, 2 );
306 :
307 2784 : if ( ( new_voicing[0] + new_voicing[1] ) > ( voicing_fr[0] + voicing_fr[1] ) )
308 : {
309 71 : new_Top[0] = T;
310 71 : pitch_fr[0] = new_op_fr[0];
311 71 : pitch_fr[1] = new_op_fr[1];
312 71 : voicing_fr[0] = new_voicing[0];
313 71 : voicing_fr[1] = new_voicing[1];
314 : }
315 : }
316 :
317 7113 : T = pitch_ol[1] / m;
318 7113 : if ( T >= PIT_MIN_12k8 )
319 : {
320 2840 : pitch_ol2( PIT_MIN_SHORTER, T, &new_op_fr[0], &new_voicing[0], 2 * L_SUBFR, wspeech, 2 );
321 2840 : pitch_ol2( PIT_MIN_SHORTER, T, &new_op_fr[1], &new_voicing[1], 3 * L_SUBFR, wspeech, 2 );
322 :
323 2840 : if ( ( new_voicing[0] + new_voicing[1] ) > ( voicing_fr[2] + voicing_fr[3] ) )
324 : {
325 74 : new_Top[1] = T;
326 74 : pitch_fr[2] = new_op_fr[0];
327 74 : pitch_fr[3] = new_op_fr[1];
328 74 : voicing_fr[2] = new_voicing[0];
329 74 : voicing_fr[3] = new_voicing[1];
330 : }
331 : }
332 : }
333 2371 : pitch_ol[0] = new_Top[0];
334 2371 : pitch_ol[1] = new_Top[1];
335 :
336 2371 : return;
337 : }
|