Line data Source code
1 : /******************************************************************************************************
2 :
3 : (C) 2022-2026 IVAS codec Public Collaboration with portions copyright Dolby International AB, Ericsson AB,
4 : Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD.,
5 : Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange,
6 : Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other
7 : contributors to this repository. All Rights Reserved.
8 :
9 : This software is protected by copyright law and by international treaties.
10 : The IVAS codec Public Collaboration consisting of Dolby International AB, Ericsson AB,
11 : Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD.,
12 : Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange,
13 : Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other
14 : contributors to this repository retain full ownership rights in their respective contributions in
15 : the software. This notice grants no license of any kind, including but not limited to patent
16 : license, nor is any license granted by implication, estoppel or otherwise.
17 :
18 : Contributors are required to enter into the IVAS codec Public Collaboration agreement before making
19 : contributions.
20 :
21 : This software is provided "AS IS", without any express or implied warranties. The software is in the
22 : development stage. It is intended exclusively for experts who have experience with such software and
23 : solely for the purpose of inspection. All implied warranties of non-infringement, merchantability
24 : and fitness for a particular purpose are hereby disclaimed and excluded.
25 :
26 : Any dispute, controversy or claim arising under or in relation to providing this software shall be
27 : submitted to and settled by the final, binding jurisdiction of the courts of Munich, Germany in
28 : accordance with the laws of the Federal Republic of Germany excluding its conflict of law rules and
29 : the United Nations Convention on Contracts on the International Sales of Goods.
30 :
31 : *******************************************************************************************************/
32 :
33 : /*====================================================================================
34 : EVS Codec 3GPP TS26.443 Nov 04, 2021. Version 12.14.0 / 13.10.0 / 14.6.0 / 15.4.0 / 16.3.0
35 : ====================================================================================*/
36 :
37 : #include <stdint.h>
38 : #include "options.h"
39 : #ifdef DEBUGGING
40 : #include "debug.h"
41 : #endif
42 : #include "cnst.h"
43 : #include "rom_enc.h"
44 : #include "prot.h"
45 : #include "wmc_auto.h"
46 :
47 : /*-------------------------------------------------------------------*
48 : * Local constants (sizing of the correlation buffer used by pitch_ol2())
49 : *-------------------------------------------------------------------*/
50 :
51 : #define MAX_DELTA 16 /* half-length of the delta search; cor[] in pitch_ol2() is sized for delta <= MAX_DELTA */
52 : #define COR_BUF_LEN ( L_INTERPOL1 * 2 + MAX_DELTA * 2 + 1 ) /* length of cor[] in pitch_ol2(): integer lags of the search range plus L_INTERPOL1 extra lags on each side */
53 :
54 : /*-------------------------------------------------------------------*
55 : * pitch_ol2()
56 : *
57 : * Open-loop pitch precision improvement with 1/4 resolution
58 : * The pitch is searched in the interval <pitch_ol-delta, pitch_ol+delta),
59 : * i.e. the value pitch_ol + delta is not a part of the interval
60 : *-------------------------------------------------------------------*/
61 :
62 2559444 : void pitch_ol2(
63 : const int16_t pit_min, /* i : pit_min value */
64 : const int16_t pitch_ol, /* i : pitch to be improved */
65 : float *pitch_fr, /* o : adjusted 1/4 fractional pitch */
66 : float *voicing_fr, /* o : adjusted 1/4 fractional voicing */
67 : const int16_t pos, /* i : position in frame where to calculate the improv. */
68 : const float *wsp, /* i : weighted speech for current frame and look-ahead */
69 : const int16_t delta /* i : delta for pitch search */
70 : )
71 : {
72 : int16_t i, t, t0, t1, step, fraction, t0_min, t0_max, t_min, t_max;
73 : float temp, cor_max, enr_wsp, enr_old, cor[COR_BUF_LEN], *pt_cor, wsp_fr[L_SUBFR];
74 : const float *pt_wsp;
75 : int16_t base_idx;
76 2559444 : t0_min = pitch_ol - delta;
77 2559444 : t0_max = pitch_ol + delta - 1;
78 :
79 2559444 : if ( t0_min < pit_min )
80 : {
81 263484 : t0_min = pit_min;
82 : }
83 2559444 : t_min = t0_min - L_INTERPOL1;
84 :
85 2559444 : if ( t0_max > PIT_MAX )
86 : {
87 52724 : t0_max = PIT_MAX;
88 : }
89 2559444 : t_max = t0_max + L_INTERPOL1;
90 :
91 2559444 : pt_wsp = wsp + pos;
92 2559444 : pt_cor = cor;
93 57459692 : for ( t = t_min; t <= t_max; t++ )
94 : {
95 54900248 : *pt_cor++ = dotp( pt_wsp, pt_wsp - t, L_SUBFR );
96 : }
97 :
98 2559444 : pt_cor = cor + L_INTERPOL1;
99 2559444 : cor_max = *pt_cor++;
100 2559444 : t1 = t0_min;
101 34424696 : for ( t = t0_min + 1; t <= t0_max; t++ )
102 : {
103 31865252 : if ( *pt_cor > cor_max )
104 : {
105 12611401 : cor_max = *pt_cor;
106 12611401 : t1 = t;
107 : }
108 31865252 : pt_cor++;
109 : }
110 :
111 : /*------------------------------------------------------------------*
112 : * Search fractional pitch with 1/4 subsample resolution.
113 : * search the fractions around t0 and choose the one which maximizes
114 : * the interpolated normalized correlation.
115 : *-----------------------------------------------------------------*/
116 :
117 2559444 : t0 = t1;
118 2559444 : base_idx = L_INTERPOL1 - t0_min;
119 2559444 : step = 1; /* 1/4 subsample resolution */
120 2559444 : fraction = 1;
121 :
122 2559444 : if ( t0 == t0_min ) /* Limit case */
123 : {
124 220527 : fraction = 0;
125 220527 : cor_max = interpolation( &cor[t0 + base_idx], E_ROM_inter4_1, fraction, PIT_UP_SAMP, 4 );
126 : }
127 : else /* Process negative fractions */
128 : {
129 2338917 : t0--;
130 2338917 : cor_max = interpolation( &cor[t0 + base_idx], E_ROM_inter4_1, fraction, PIT_UP_SAMP, 4 );
131 7016751 : for ( i = ( fraction + step ); i <= 3; i = i + step )
132 : {
133 4677834 : temp = interpolation( &cor[t0 + base_idx], E_ROM_inter4_1, i, PIT_UP_SAMP, 4 );
134 4677834 : if ( temp > cor_max )
135 : {
136 4381161 : cor_max = temp;
137 4381161 : fraction = i;
138 : }
139 : }
140 : }
141 :
142 12797220 : for ( i = 0; i <= 3; i = i + step ) /* Process positive fractions */
143 : {
144 10237776 : temp = interpolation( &cor[t1 + base_idx], E_ROM_inter4_1, i, PIT_UP_SAMP, 4 );
145 10237776 : if ( temp > cor_max )
146 : {
147 2967914 : cor_max = temp;
148 2967914 : fraction = i;
149 2967914 : t0 = t1;
150 : }
151 : }
152 :
153 2559444 : *pitch_fr = t0 + (float) fraction / 4.0f;
154 2559444 : pred_lt4( pt_wsp, wsp_fr, t0, fraction, L_SUBFR, E_ROM_inter4_1, 4, PIT_UP_SAMP );
155 :
156 2559444 : enr_wsp = dotp( pt_wsp, pt_wsp, L_SUBFR ) + 0.01f;
157 2559444 : enr_old = dotp( wsp_fr, wsp_fr, L_SUBFR ) + 0.01f;
158 2559444 : *voicing_fr = cor_max * inv_sqrt( enr_wsp * enr_old );
159 :
160 2559444 : return;
161 : }
162 :
163 :
164 : /*-------------------------------------------------------------------*
165 : * StableHighPitchDetect()
166 : *
167 : * Very short stable pitch detection
168 : *-------------------------------------------------------------------*/
169 :
170 3144290 : void StableHighPitchDetect(
171 : int16_t *flag_spitch, /* o : flag to indicate very short stable pitch*/
172 : int16_t pitch[], /* i/o: OL pitch buffer */
173 : const float voicing[], /* i : OL pitch gains */
174 : const float Bin_E[], /* i : per bin log energy spectrum */
175 : const float wsp[], /* i : weighted speech */
176 : const int16_t localVAD, /* i : local VAD flag */
177 : float *voicing_sm, /* i/o: smoothed open-loop pitch gains */
178 : float *voicing0_sm, /* i/o: smoothed high pitch gains */
179 : float *LF_EnergyRatio_sm, /* i/o: smoothed [0, 300Hz] relative peak energy*/
180 : int16_t *predecision_flag, /* i/o: predecision flag */
181 : float *diff_sm, /* i/o: smoothed pitch frequency difference */
182 : float *energy_sm /* i/o: smoothed energy around pitch frequency */
183 : )
184 : {
185 : int16_t i, pitch_freq_point, pit_min_up;
186 : int16_t T, Tp, pit_min;
187 :
188 : float voicing_m;
189 : float energy0, energy1, ratio, cor_max, diff, sum_energy;
190 : const float *pt_wsp;
191 :
192 3144290 : voicing_m = mean( voicing, 3 );
193 3144290 : *voicing_sm = 0.75f * ( *voicing_sm ) + 0.25f * voicing_m;
194 :
195 :
196 : /* initial short pitch possibility pre-decision */
197 3144290 : pitch_freq_point = (int16_t) ( L_FFT / pitch[1] + 0.5f );
198 3144290 : diff = 0.0f;
199 3144290 : sum_energy = 0.0f;
200 :
201 28120926 : for ( i = 1; i < 2 * pitch_freq_point; i++ )
202 : {
203 24976636 : diff += ( Bin_E[pitch_freq_point] - Bin_E[i] );
204 24976636 : sum_energy += Bin_E[i];
205 : }
206 3144290 : sum_energy /= ( 2 * pitch_freq_point - 1 );
207 :
208 3144290 : *diff_sm = 0.2f * diff + 0.8f * *diff_sm;
209 3144290 : *energy_sm = 0.2f * sum_energy + 0.8f * *energy_sm;
210 3144290 : diff /= sum_energy;
211 :
212 3144290 : if ( *diff_sm < -10 && *energy_sm < 38.5 && diff < -0.8 )
213 : {
214 44372 : *predecision_flag = 1;
215 : }
216 :
217 3144290 : if ( *diff_sm > 10 && *energy_sm > 83 && diff > 0.5 )
218 : {
219 297670 : *predecision_flag = 0;
220 : }
221 :
222 : /* short pitch possiblity pre-decision */
223 3144290 : maximum( Bin_E, 7, &energy0 );
224 3144290 : maximum( Bin_E + 8, 7, &energy1 );
225 3144290 : ratio = max( energy1 - energy0, 0 );
226 3144290 : ratio *= max( voicing_m, 0 );
227 :
228 3144290 : *LF_EnergyRatio_sm = ( 15 * ( *LF_EnergyRatio_sm ) + ratio ) / 16;
229 :
230 3144290 : if ( *LF_EnergyRatio_sm > 35 || ratio > 50 )
231 : {
232 32817 : *predecision_flag = 1;
233 : }
234 :
235 3144290 : if ( *LF_EnergyRatio_sm < 16 )
236 : {
237 3079041 : *predecision_flag = 0;
238 : }
239 :
240 : /* short pitch candidate detection */
241 3144290 : Tp = pitch[1];
242 3144290 : cor_max = 0;
243 :
244 3144290 : pt_wsp = wsp + 3 * L_SUBFR;
245 3144290 : pit_min = PIT_MIN_DOUBLEEXTEND;
246 3144290 : pit_min_up = PIT_MIN;
247 :
248 59741510 : for ( T = pit_min; T <= pit_min_up; T++ )
249 : {
250 56597220 : energy1 = dotp( pt_wsp, pt_wsp - T, L_SUBFR );
251 :
252 56597220 : if ( energy1 > cor_max || T == pit_min )
253 : {
254 17248627 : cor_max = energy1;
255 17248627 : Tp = T;
256 : }
257 : }
258 :
259 3144290 : energy0 = dotp( pt_wsp, pt_wsp, L_SUBFR ) + 0.01f;
260 3144290 : energy1 = dotp( pt_wsp - Tp, pt_wsp - Tp, L_SUBFR ) + 0.01f;
261 3144290 : cor_max *= inv_sqrt( energy0 * energy1 );
262 3144290 : *voicing0_sm = 0.75f * ( *voicing0_sm ) + 0.25f * cor_max;
263 :
264 : /* final short pitch correction */
265 3144290 : *flag_spitch = 0;
266 3144290 : if ( localVAD && *predecision_flag && *voicing0_sm > 0.65f && *voicing0_sm > 0.7f * ( *voicing_sm ) )
267 : {
268 42654 : *flag_spitch = 1;
269 :
270 42654 : pitch[0] = Tp;
271 42654 : pitch[1] = Tp;
272 42654 : pitch[2] = Tp;
273 : }
274 :
275 3144290 : return;
276 : }
277 :
278 : /*-------------------------------------------------------------------*
279 : * pitchDoubling_det()
280 : * Multiple pitch doubling detector
281 : *
282 : *-------------------------------------------------------------------*/
283 :
284 360 : void pitchDoubling_det(
285 : const float *wspeech,
286 : int16_t *pitch_ol,
287 : float *pitch_fr,
288 : float *voicing_fr )
289 : {
290 : float new_op_fr[2];
291 : float new_voicing[2];
292 : int16_t new_Top[2];
293 : int16_t m, T;
294 :
295 : /*save initial values*/
296 360 : new_Top[0] = pitch_ol[0];
297 360 : new_Top[1] = pitch_ol[1];
298 1440 : for ( m = 2; m < 5; m++ )
299 : {
300 1080 : T = pitch_ol[0] / m;
301 1080 : if ( T >= PIT_MIN_12k8 )
302 : {
303 102 : pitch_ol2( PIT_MIN_SHORTER, T, &new_op_fr[0], &new_voicing[0], 0, wspeech, 2 );
304 102 : pitch_ol2( PIT_MIN_SHORTER, T, &new_op_fr[1], &new_voicing[1], L_SUBFR, wspeech, 2 );
305 :
306 102 : if ( ( new_voicing[0] + new_voicing[1] ) > ( voicing_fr[0] + voicing_fr[1] ) )
307 : {
308 0 : new_Top[0] = T;
309 0 : pitch_fr[0] = new_op_fr[0];
310 0 : pitch_fr[1] = new_op_fr[1];
311 0 : voicing_fr[0] = new_voicing[0];
312 0 : voicing_fr[1] = new_voicing[1];
313 : }
314 : }
315 :
316 1080 : T = pitch_ol[1] / m;
317 1080 : if ( T >= PIT_MIN_12k8 )
318 : {
319 120 : pitch_ol2( PIT_MIN_SHORTER, T, &new_op_fr[0], &new_voicing[0], 2 * L_SUBFR, wspeech, 2 );
320 120 : pitch_ol2( PIT_MIN_SHORTER, T, &new_op_fr[1], &new_voicing[1], 3 * L_SUBFR, wspeech, 2 );
321 :
322 120 : if ( ( new_voicing[0] + new_voicing[1] ) > ( voicing_fr[2] + voicing_fr[3] ) )
323 : {
324 0 : new_Top[1] = T;
325 0 : pitch_fr[2] = new_op_fr[0];
326 0 : pitch_fr[3] = new_op_fr[1];
327 0 : voicing_fr[2] = new_voicing[0];
328 0 : voicing_fr[3] = new_voicing[1];
329 : }
330 : }
331 : }
332 360 : pitch_ol[0] = new_Top[0];
333 360 : pitch_ol[1] = new_Top[1];
334 :
335 360 : return;
336 : }
|