Line data Source code
1 : /******************************************************************************************************
2 :
3 : (C) 2022-2025 IVAS codec Public Collaboration with portions copyright Dolby International AB, Ericsson AB,
4 : Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD.,
5 : Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange,
6 : Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other
7 : contributors to this repository. All Rights Reserved.
8 :
9 : This software is protected by copyright law and by international treaties.
10 : The IVAS codec Public Collaboration consisting of Dolby International AB, Ericsson AB,
11 : Fraunhofer-Gesellschaft zur Foerderung der angewandten Forschung e.V., Huawei Technologies Co. LTD.,
12 : Koninklijke Philips N.V., Nippon Telegraph and Telephone Corporation, Nokia Technologies Oy, Orange,
13 : Panasonic Holdings Corporation, Qualcomm Technologies, Inc., VoiceAge Corporation, and other
14 : contributors to this repository retain full ownership rights in their respective contributions in
15 : the software. This notice grants no license of any kind, including but not limited to patent
16 : license, nor is any license granted by implication, estoppel or otherwise.
17 :
18 : Contributors are required to enter into the IVAS codec Public Collaboration agreement before making
19 : contributions.
20 :
21 : This software is provided "AS IS", without any express or implied warranties. The software is in the
22 : development stage. It is intended exclusively for experts who have experience with such software and
23 : solely for the purpose of inspection. All implied warranties of non-infringement, merchantability
24 : and fitness for a particular purpose are hereby disclaimed and excluded.
25 :
26 : Any dispute, controversy or claim arising under or in relation to providing this software shall be
27 : submitted to and settled by the final, binding jurisdiction of the courts of Munich, Germany in
28 : accordance with the laws of the Federal Republic of Germany excluding its conflict of law rules and
29 : the United Nations Convention on Contracts on the International Sales of Goods.
30 :
31 : *******************************************************************************************************/
32 :
33 : /*====================================================================================
34 : EVS Codec 3GPP TS26.443 Nov 04, 2021. Version 12.14.0 / 13.10.0 / 14.6.0 / 15.4.0 / 16.3.0
35 : ====================================================================================*/
36 :
37 : #include <stdint.h>
38 : #include "options.h"
39 : #ifdef DEBUGGING
40 : #include "debug.h"
41 : #endif
42 : #include "cnst.h"
43 : #include "rom_enc.h"
44 : #include "prot.h"
45 : #include "wmc_auto.h"
46 :
47 : /*-------------------------------------------------------------------*
48 : * Local constants
49 : *-------------------------------------------------------------------*/
50 :
/* Largest `delta` for which the correlation buffer of pitch_ol2() is big enough. */
#define MAX_DELTA 16 /* half-length of the delta search */
/* Size of the correlation buffer `cor[]` in pitch_ol2(): the integer search covers at most
 * 2 * delta lags (pitch_ol - delta ... pitch_ol + delta - 1) and is extended by L_INTERPOL1
 * lags on each side for the interpolation, i.e. 2 * L_INTERPOL1 + 2 * delta entries are
 * written; the buffer holds one entry more than that when delta == MAX_DELTA. */
#define COR_BUF_LEN ( L_INTERPOL1 * 2 + MAX_DELTA * 2 + 1 )
53 :
54 : /*-------------------------------------------------------------------*
55 : * pitch_ol2()
56 : *
57 : * Open-loop pitch precision improvement with 1/4 resolution
58 : * The pitch is searched in the interval <pitch_ol-delta, pitch_ol+delta),
59 : * i.e. the value pitch_ol + delta is not a part of the interval
60 : *-------------------------------------------------------------------*/
61 :
62 1187304 : void pitch_ol2(
63 : const int16_t pit_min, /* i : pit_min value */
64 : const int16_t pitch_ol, /* i : pitch to be improved */
65 : float *pitch_fr, /* o : adjusted 1/4 fractional pitch */
66 : float *voicing_fr, /* o : adjusted 1/4 fractional voicing */
67 : const int16_t pos, /* i : position in frame where to calculate the improv. */
68 : const float *wsp, /* i : weighted speech for current frame and look-ahead */
69 : const int16_t delta /* i : delta for pitch search */
70 : )
71 : {
72 : int16_t i, t, t0, t1, step, fraction, t0_min, t0_max, t_min, t_max;
73 : float temp, cor_max, enr_wsp, enr_old, cor[COR_BUF_LEN], *pt_cor, wsp_fr[L_SUBFR];
74 : const float *pt_wsp;
75 :
76 1187304 : t0_min = pitch_ol - delta;
77 1187304 : t0_max = pitch_ol + delta - 1;
78 :
79 1187304 : if ( t0_min < pit_min )
80 : {
81 104832 : t0_min = pit_min;
82 : }
83 1187304 : t_min = t0_min - L_INTERPOL1;
84 :
85 1187304 : if ( t0_max > PIT_MAX )
86 : {
87 31382 : t0_max = PIT_MAX;
88 : }
89 1187304 : t_max = t0_max + L_INTERPOL1;
90 :
91 1187304 : pt_wsp = wsp + pos;
92 1187304 : pt_cor = cor;
93 26733326 : for ( t = t_min; t <= t_max; t++ )
94 : {
95 25546022 : *pt_cor++ = dotp( pt_wsp, pt_wsp - t, L_SUBFR );
96 : }
97 :
98 1187304 : pt_cor = cor + L_INTERPOL1;
99 1187304 : cor_max = *pt_cor++;
100 1187304 : t1 = t0_min;
101 16047590 : for ( t = t0_min + 1; t <= t0_max; t++ )
102 : {
103 14860286 : if ( *pt_cor > cor_max )
104 : {
105 5946549 : cor_max = *pt_cor;
106 5946549 : t1 = t;
107 : }
108 14860286 : pt_cor++;
109 : }
110 :
111 : /*------------------------------------------------------------------*
112 : * Search fractional pitch with 1/4 subsample resolution.
113 : * search the fractions around t0 and choose the one which maximizes
114 : * the interpolated normalized correlation.
115 : *-----------------------------------------------------------------*/
116 :
117 1187304 : pt_cor = cor + L_INTERPOL1 - t0_min;
118 1187304 : t0 = t1;
119 :
120 1187304 : step = 1; /* 1/4 subsample resolution */
121 1187304 : fraction = 1;
122 :
123 1187304 : if ( t0 == t0_min ) /* Limit case */
124 : {
125 84554 : fraction = 0;
126 84554 : cor_max = interpolation( &pt_cor[t0], E_ROM_inter4_1, fraction, PIT_UP_SAMP, 4 );
127 : }
128 : else /* Process negative fractions */
129 : {
130 1102750 : t0--;
131 1102750 : cor_max = interpolation( &pt_cor[t0], E_ROM_inter4_1, fraction, PIT_UP_SAMP, 4 );
132 3308250 : for ( i = ( fraction + step ); i <= 3; i = i + step )
133 : {
134 2205500 : temp = interpolation( &pt_cor[t0], E_ROM_inter4_1, i, PIT_UP_SAMP, 4 );
135 2205500 : if ( temp > cor_max )
136 : {
137 2066220 : cor_max = temp;
138 2066220 : fraction = i;
139 : }
140 : }
141 : }
142 :
143 5936520 : for ( i = 0; i <= 3; i = i + step ) /* Process positive fractions */
144 : {
145 4749216 : temp = interpolation( &pt_cor[t1], E_ROM_inter4_1, i, PIT_UP_SAMP, 4 );
146 4749216 : if ( temp > cor_max )
147 : {
148 1364397 : cor_max = temp;
149 1364397 : fraction = i;
150 1364397 : t0 = t1;
151 : }
152 : }
153 :
154 1187304 : *pitch_fr = t0 + (float) fraction / 4.0f;
155 1187304 : pred_lt4( pt_wsp, wsp_fr, t0, fraction, L_SUBFR, E_ROM_inter4_1, 4, PIT_UP_SAMP );
156 :
157 1187304 : enr_wsp = dotp( pt_wsp, pt_wsp, L_SUBFR ) + 0.01f;
158 1187304 : enr_old = dotp( wsp_fr, wsp_fr, L_SUBFR ) + 0.01f;
159 1187304 : *voicing_fr = cor_max * inv_sqrt( enr_wsp * enr_old );
160 :
161 1187304 : return;
162 : }
163 :
164 :
165 : /*-------------------------------------------------------------------*
166 : * StableHighPitchDetect()
167 : *
168 : * Very short stable pitch detection
169 : *-------------------------------------------------------------------*/
170 :
171 1153834 : void StableHighPitchDetect(
172 : int16_t *flag_spitch, /* o : flag to indicate very short stable pitch*/
173 : int16_t pitch[], /* i/o: OL pitch buffer */
174 : const float voicing[], /* i : OL pitch gains */
175 : const float Bin_E[], /* i : per bin log energy spectrum */
176 : const float wsp[], /* i : weighted speech */
177 : const int16_t localVAD, /* i : local VAD flag */
178 : float *voicing_sm, /* i/o: smoothed open-loop pitch gains */
179 : float *voicing0_sm, /* i/o: smoothed high pitch gains */
180 : float *LF_EnergyRatio_sm, /* i/o: smoothed [0, 300Hz] relative peak energy*/
181 : int16_t *predecision_flag, /* i/o: predecision flag */
182 : float *diff_sm, /* i/o: smoothed pitch frequency difference */
183 : float *energy_sm /* i/o: smoothed energy around pitch frequency */
184 : )
185 : {
186 : int16_t i, pitch_freq_point, pit_min_up;
187 : int16_t T, Tp, pit_min;
188 :
189 : float voicing_m;
190 : float energy0, energy1, ratio, cor_max, diff, sum_energy;
191 : const float *pt_wsp;
192 :
193 1153834 : voicing_m = mean( voicing, 3 );
194 1153834 : *voicing_sm = 0.75f * ( *voicing_sm ) + 0.25f * voicing_m;
195 :
196 :
197 : /* initial short pitch possibility pre-decision */
198 1153834 : pitch_freq_point = (int16_t) ( L_FFT / pitch[1] + 0.5f );
199 1153834 : diff = 0.0f;
200 1153834 : sum_energy = 0.0f;
201 :
202 10117144 : for ( i = 1; i < 2 * pitch_freq_point; i++ )
203 : {
204 8963310 : diff += ( Bin_E[pitch_freq_point] - Bin_E[i] );
205 8963310 : sum_energy += Bin_E[i];
206 : }
207 1153834 : sum_energy /= ( 2 * pitch_freq_point - 1 );
208 :
209 1153834 : *diff_sm = 0.2f * diff + 0.8f * *diff_sm;
210 1153834 : *energy_sm = 0.2f * sum_energy + 0.8f * *energy_sm;
211 1153834 : diff /= sum_energy;
212 :
213 1153834 : if ( *diff_sm < -10 && *energy_sm < 38.5 && diff < -0.8 )
214 : {
215 21644 : *predecision_flag = 1;
216 : }
217 :
218 1153834 : if ( *diff_sm > 10 && *energy_sm > 83 && diff > 0.5 )
219 : {
220 88247 : *predecision_flag = 0;
221 : }
222 :
223 : /* short pitch possiblity pre-decision */
224 1153834 : maximum( Bin_E, 7, &energy0 );
225 1153834 : maximum( Bin_E + 8, 7, &energy1 );
226 1153834 : ratio = max( energy1 - energy0, 0 );
227 1153834 : ratio *= max( voicing_m, 0 );
228 :
229 1153834 : *LF_EnergyRatio_sm = ( 15 * ( *LF_EnergyRatio_sm ) + ratio ) / 16;
230 :
231 1153834 : if ( *LF_EnergyRatio_sm > 35 || ratio > 50 )
232 : {
233 28267 : *predecision_flag = 1;
234 : }
235 :
236 1153834 : if ( *LF_EnergyRatio_sm < 16 )
237 : {
238 1104103 : *predecision_flag = 0;
239 : }
240 :
241 : /* short pitch candidate detection */
242 1153834 : Tp = pitch[1];
243 1153834 : cor_max = 0;
244 :
245 1153834 : pt_wsp = wsp + 3 * L_SUBFR;
246 1153834 : pit_min = PIT_MIN_DOUBLEEXTEND;
247 1153834 : pit_min_up = PIT_MIN;
248 :
249 21922846 : for ( T = pit_min; T <= pit_min_up; T++ )
250 : {
251 20769012 : energy1 = dotp( pt_wsp, pt_wsp - T, L_SUBFR );
252 :
253 20769012 : if ( energy1 > cor_max || T == pit_min )
254 : {
255 6157981 : cor_max = energy1;
256 6157981 : Tp = T;
257 : }
258 : }
259 :
260 1153834 : energy0 = dotp( pt_wsp, pt_wsp, L_SUBFR ) + 0.01f;
261 1153834 : energy1 = dotp( pt_wsp - Tp, pt_wsp - Tp, L_SUBFR ) + 0.01f;
262 1153834 : cor_max *= inv_sqrt( energy0 * energy1 );
263 1153834 : *voicing0_sm = 0.75f * ( *voicing0_sm ) + 0.25f * cor_max;
264 :
265 : /* final short pitch correction */
266 1153834 : *flag_spitch = 0;
267 1153834 : if ( localVAD && *predecision_flag && *voicing0_sm > 0.65f && *voicing0_sm > 0.7f * ( *voicing_sm ) )
268 : {
269 37177 : *flag_spitch = 1;
270 :
271 37177 : pitch[0] = Tp;
272 37177 : pitch[1] = Tp;
273 37177 : pitch[2] = Tp;
274 : }
275 :
276 1153834 : return;
277 : }
278 :
279 : /*-------------------------------------------------------------------*
280 : * pitchDoubling_det()
281 : * Multiple pitch doubling detector
282 : *
283 : *-------------------------------------------------------------------*/
284 :
285 0 : void pitchDoubling_det(
286 : const float *wspeech,
287 : int16_t *pitch_ol,
288 : float *pitch_fr,
289 : float *voicing_fr )
290 : {
291 : float new_op_fr[2];
292 : float new_voicing[2];
293 : int16_t new_Top[2];
294 : int16_t m, T;
295 :
296 : /*save initial values*/
297 0 : new_Top[0] = pitch_ol[0];
298 0 : new_Top[1] = pitch_ol[1];
299 0 : for ( m = 2; m < 5; m++ )
300 : {
301 0 : T = pitch_ol[0] / m;
302 0 : if ( T >= PIT_MIN_12k8 )
303 : {
304 0 : pitch_ol2( PIT_MIN_SHORTER, T, &new_op_fr[0], &new_voicing[0], 0, wspeech, 2 );
305 0 : pitch_ol2( PIT_MIN_SHORTER, T, &new_op_fr[1], &new_voicing[1], L_SUBFR, wspeech, 2 );
306 :
307 0 : if ( ( new_voicing[0] + new_voicing[1] ) > ( voicing_fr[0] + voicing_fr[1] ) )
308 : {
309 0 : new_Top[0] = T;
310 0 : pitch_fr[0] = new_op_fr[0];
311 0 : pitch_fr[1] = new_op_fr[1];
312 0 : voicing_fr[0] = new_voicing[0];
313 0 : voicing_fr[1] = new_voicing[1];
314 : }
315 : }
316 :
317 0 : T = pitch_ol[1] / m;
318 0 : if ( T >= PIT_MIN_12k8 )
319 : {
320 0 : pitch_ol2( PIT_MIN_SHORTER, T, &new_op_fr[0], &new_voicing[0], 2 * L_SUBFR, wspeech, 2 );
321 0 : pitch_ol2( PIT_MIN_SHORTER, T, &new_op_fr[1], &new_voicing[1], 3 * L_SUBFR, wspeech, 2 );
322 :
323 0 : if ( ( new_voicing[0] + new_voicing[1] ) > ( voicing_fr[2] + voicing_fr[3] ) )
324 : {
325 0 : new_Top[1] = T;
326 0 : pitch_fr[2] = new_op_fr[0];
327 0 : pitch_fr[3] = new_op_fr[1];
328 0 : voicing_fr[2] = new_voicing[0];
329 0 : voicing_fr[3] = new_voicing[1];
330 : }
331 : }
332 : }
333 0 : pitch_ol[0] = new_Top[0];
334 0 : pitch_ol[1] = new_Top[1];
335 :
336 0 : return;
337 : }
|