Code Coverage Report for src/util/float-to-int16.c


              Hit   Total   Coverage
Lines:        100     100     100.0%
Branches:     256     256     100.0%

1 /*
2 * libnogg: a decoder library for Ogg Vorbis streams
3 * Copyright (c) 2014-2024 Andrew Church <achurch@achurch.org>
4 *
5 * This software may be copied and redistributed under certain conditions;
6 * see the file "COPYING" in the source code distribution for details.
7 * NO WARRANTY is provided with this software.
8 */
9
10 #include "include/nogg.h"
11 #include "src/common.h"
12 #include "src/util/float-to-int16.h"
13 #include "src/x86.h"
14
15 #include <math.h>
16
17 #ifdef ENABLE_ASM_ARM_NEON
18 # include <arm_neon.h>
19 #endif
20
21 /*************************************************************************/
22 /************************** Interface routines ***************************/
23 /*************************************************************************/
24
/**
 * float_to_int16():  Convert floating-point audio samples to 16-bit
 * signed integer PCM.  Each sample is scaled by 32767 and clamped to
 * the symmetric range [-32767,+32767]; rounding is to nearest, with
 * halfway cases rounded away from zero.
 *
 * [Parameters]
 *     dest: Output (int16_t) sample buffer.  Must not overlap src.
 *     src: Input (float) sample buffer.
 *     count: Number of samples to convert.
 */
void float_to_int16(int16_t *__restrict dest, const float *__restrict src,
                    int count)
{
#if defined(ENABLE_ASM_ARM_NEON)
    const float32x4_t k32767 = {32767, 32767, 32767, 32767};
    const float32x4_t k0_5 = {0.5, 0.5, 0.5, 0.5};
    /* Mask covering every bit except the IEEE 754 sign bit. */
    const uint32x4_t k7FFFFFFF = {0x7FFFFFFF, 0x7FFFFFFF,
                                  0x7FFFFFFF, 0x7FFFFFFF};
    /* Vector loop: 8 samples per iteration; the scalar loop at the
     * bottom of the function picks up any remainder. */
    for (; count >= 8; src += 8, dest += 8, count -= 8) {
        const float32x4_t in0 = vld1q_f32(src);
        const float32x4_t in1 = vld1q_f32(src + 4);
        const float32x4_t in0_scaled = vmulq_f32(in0, k32767);
        const float32x4_t in1_scaled = vmulq_f32(in1, k32767);
        /* Split each scaled value into magnitude and sign bit so the
         * clamp and round can operate on the magnitude alone. */
        const uint32x4_t in0_abs = vandq_u32((uint32x4_t)in0_scaled,
                                             k7FFFFFFF);
        const uint32x4_t in1_abs = vandq_u32((uint32x4_t)in1_scaled,
                                             k7FFFFFFF);
        const uint32x4_t in0_sign = vbicq_u32((uint32x4_t)in0_scaled,
                                              k7FFFFFFF);
        const uint32x4_t in1_sign = vbicq_u32((uint32x4_t)in1_scaled,
                                              k7FFFFFFF);
        /* Clamp to 32767.0f with an unsigned integer minimum: for
         * nonnegative IEEE 754 floats, the raw bit patterns order the
         * same way as the values they encode, so vminq_u32 on the bit
         * patterns behaves like a float min here. */
        const uint32x4_t in0_sat = vminq_u32(in0_abs, (uint32x4_t)k32767);
        const uint32x4_t in1_sat = vminq_u32(in1_abs, (uint32x4_t)k32767);
        /* Note that we have to add 0.5 to the absolute values here because
         * vcvt always rounds toward zero. */
        const float32x4_t in0_adj = vaddq_f32((float32x4_t)in0_sat, k0_5);
        const float32x4_t in1_adj = vaddq_f32((float32x4_t)in1_sat, k0_5);
        /* Reattach the sign bits before converting, so negative inputs
         * also round away from zero under truncation. */
        const float32x4_t out0 = (float32x4_t)vorrq_u32((uint32x4_t)in0_adj,
                                                        in0_sign);
        const float32x4_t out1 = (float32x4_t)vorrq_u32((uint32x4_t)in1_adj,
                                                        in1_sign);
        const int32x4_t out0_32 = vcvtq_s32_f32(out0);
        const int32x4_t out1_32 = vcvtq_s32_f32(out1);
#if defined(__GNUC__) && !defined(__clang__) && !defined(__aarch64__)
        /* GCC doesn't seem to be smart enough to put out0_16 and out1_16
         * in paired registers (and it ignores any explicit registers we
         * specify with asm("REG")), so do it manually. */
        int16x8_t out_16;
        __asm__(
            "vmovn.i32 %e0, %q1\n"
            "vmovn.i32 %f0, %q2\n"
            : "=&w" (out_16)
            : "w" (out0_32), "w" (out1_32)
        );
#else
        const int16x4_t out0_16 = vmovn_s32(out0_32);
        const int16x4_t out1_16 = vmovn_s32(out1_32);
        const int16x8_t out_16 = vcombine_s16(out0_16, out1_16);
#endif
        vst1q_s16(dest, out_16);
    }

#elif defined(ENABLE_ASM_X86_AVX2)

    /* Force round-to-nearest and mask invalid-operation exceptions for
     * the duration of the loop; the caller's MXCSR is restored below. */
    const uint32_t saved_mxcsr = _mm_getcsr();
    uint32_t mxcsr = saved_mxcsr;
    mxcsr &= ~(3<<13);  // RC (00 = round to nearest)
    mxcsr |= 1<<7;  // EM_INVALID
    _mm_setcsr(mxcsr);

    const __m256 k32767 = _mm256_set1_ps(32767.0f);
    const __m256 k7FFFFFFF = _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF));
    const __m256 k80000000 = _mm256_castsi256_ps(_mm256_set1_epi32(0x80000000));

    /* Use aligned loads/stores when both buffers are 32-byte aligned;
     * the two loops are otherwise identical (16 samples/iteration). */
    if ((((uintptr_t)src | (uintptr_t)dest) & 31) == 0) {
        for (; count >= 16; src += 16, dest += 16, count -= 16) {
            const __m256 in0 = _mm256_load_ps(src);
            const __m256 in1 = _mm256_load_ps(src+8);
            const __m256 in0_scaled = _mm256_mul_ps(in0, k32767);
            const __m256 in1_scaled = _mm256_mul_ps(in1, k32767);
            /* Clamp the magnitude to 32767 and reattach the sign bit;
             * _mm256_cvtps_epi32 then rounds to nearest per MXCSR. */
            const __m256 in0_abs = _mm256_and_ps(in0_scaled, k7FFFFFFF);
            const __m256 in1_abs = _mm256_and_ps(in1_scaled, k7FFFFFFF);
            const __m256 in0_sign = _mm256_and_ps(in0_scaled, k80000000);
            const __m256 in1_sign = _mm256_and_ps(in1_scaled, k80000000);
            const __m256 in0_sat = _mm256_min_ps(in0_abs, k32767);
            const __m256 in1_sat = _mm256_min_ps(in1_abs, k32767);
            const __m256 out0 = _mm256_or_ps(in0_sat, in0_sign);
            const __m256 out1 = _mm256_or_ps(in1_sat, in1_sign);
            const __m256i out0_32 = _mm256_cvtps_epi32(out0);
            const __m256i out1_32 = _mm256_cvtps_epi32(out1);
            /* packs operates within each 128-bit lane, leaving the four
             * 64-bit groups in 0,2,1,3 order; permute restores them. */
            const __m256i out_16_0213 = _mm256_packs_epi32(out0_32, out1_32);
            const __m256i out_16 =
                _mm256_permute4x64_epi64(out_16_0213, _MM_SHUFFLE(3,1,2,0));
            _mm256_store_si256((void *)dest, out_16);
        }
    } else {
        for (; count >= 16; src += 16, dest += 16, count -= 16) {
            const __m256 in0 = _mm256_loadu_ps(src);
            const __m256 in1 = _mm256_loadu_ps(src+8);
            const __m256 in0_scaled = _mm256_mul_ps(in0, k32767);
            const __m256 in1_scaled = _mm256_mul_ps(in1, k32767);
            const __m256 in0_abs = _mm256_and_ps(in0_scaled, k7FFFFFFF);
            const __m256 in1_abs = _mm256_and_ps(in1_scaled, k7FFFFFFF);
            const __m256 in0_sign = _mm256_and_ps(in0_scaled, k80000000);
            const __m256 in1_sign = _mm256_and_ps(in1_scaled, k80000000);
            const __m256 in0_sat = _mm256_min_ps(in0_abs, k32767);
            const __m256 in1_sat = _mm256_min_ps(in1_abs, k32767);
            const __m256 out0 = _mm256_or_ps(in0_sat, in0_sign);
            const __m256 out1 = _mm256_or_ps(in1_sat, in1_sign);
            const __m256i out0_32 = _mm256_cvtps_epi32(out0);
            const __m256i out1_32 = _mm256_cvtps_epi32(out1);
            const __m256i out_16_0213 = _mm256_packs_epi32(out0_32, out1_32);
            const __m256i out_16 =
                _mm256_permute4x64_epi64(out_16_0213, _MM_SHUFFLE(3,1,2,0));
            _mm256_storeu_si256((void *)dest, out_16);
        }
    }

    _mm_setcsr(saved_mxcsr);

#elif defined(ENABLE_ASM_X86_SSE2)

    /* Force round-to-nearest and mask invalid-operation exceptions for
     * the duration of the loop; the caller's MXCSR is restored below. */
    const uint32_t saved_mxcsr = _mm_getcsr();
    uint32_t mxcsr = saved_mxcsr;
    mxcsr &= ~(3<<13);  // RC (00 = round to nearest)
    mxcsr |= 1<<7;  // EM_INVALID
    _mm_setcsr(mxcsr);

    const __m128 k32767 = _mm_set1_ps(32767.0f);
    const __m128 k7FFFFFFF = CAST_M128(_mm_set1_epi32(0x7FFFFFFF));
    const __m128 k80000000 = CAST_M128(_mm_set1_epi32(0x80000000));

    /* Use aligned loads/stores when both buffers are 16-byte aligned;
     * the two loops are otherwise identical (8 samples/iteration). */
    if ((((uintptr_t)src | (uintptr_t)dest) & 15) == 0) {
        for (; count >= 8; src += 8, dest += 8, count -= 8) {
            const __m128 in0 = _mm_load_ps(src);
            const __m128 in1 = _mm_load_ps(src+4);
            const __m128 in0_scaled = _mm_mul_ps(in0, k32767);
            const __m128 in1_scaled = _mm_mul_ps(in1, k32767);
            /* Clamp the magnitude to 32767 and reattach the sign bit;
             * _mm_cvtps_epi32 then rounds to nearest per MXCSR. */
            const __m128 in0_abs = _mm_and_ps(in0_scaled, k7FFFFFFF);
            const __m128 in1_abs = _mm_and_ps(in1_scaled, k7FFFFFFF);
            const __m128 in0_sign = _mm_and_ps(in0_scaled, k80000000);
            const __m128 in1_sign = _mm_and_ps(in1_scaled, k80000000);
            const __m128 in0_sat = _mm_min_ps(in0_abs, k32767);
            const __m128 in1_sat = _mm_min_ps(in1_abs, k32767);
            const __m128 out0 = _mm_or_ps(in0_sat, in0_sign);
            const __m128 out1 = _mm_or_ps(in1_sat, in1_sign);
            const __m128i out0_32 = _mm_cvtps_epi32(out0);
            const __m128i out1_32 = _mm_cvtps_epi32(out1);
            const __m128i out_16 = _mm_packs_epi32(out0_32, out1_32);
            _mm_store_si128((void *)dest, out_16);
        }
    } else {
        for (; count >= 8; src += 8, dest += 8, count -= 8) {
            const __m128 in0 = _mm_loadu_ps(src);
            const __m128 in1 = _mm_loadu_ps(src+4);
            const __m128 in0_scaled = _mm_mul_ps(in0, k32767);
            const __m128 in1_scaled = _mm_mul_ps(in1, k32767);
            const __m128 in0_abs = _mm_and_ps(in0_scaled, k7FFFFFFF);
            const __m128 in1_abs = _mm_and_ps(in1_scaled, k7FFFFFFF);
            const __m128 in0_sign = _mm_and_ps(in0_scaled, k80000000);
            const __m128 in1_sign = _mm_and_ps(in1_scaled, k80000000);
            const __m128 in0_sat = _mm_min_ps(in0_abs, k32767);
            const __m128 in1_sat = _mm_min_ps(in1_abs, k32767);
            const __m128 out0 = _mm_or_ps(in0_sat, in0_sign);
            const __m128 out1 = _mm_or_ps(in1_sat, in1_sign);
            const __m128i out0_32 = _mm_cvtps_epi32(out0);
            const __m128i out1_32 = _mm_cvtps_epi32(out1);
            const __m128i out_16 = _mm_packs_epi32(out0_32, out1_32);
            _mm_storeu_si128((void *)dest, out_16);
        }
    }

    _mm_setcsr(saved_mxcsr);

#endif // ENABLE_ASM_*

    /* Scalar path: converts all samples when no SIMD variant is built,
     * or the tail left over by a SIMD loop above.  NaN inputs fall
     * through both comparisons and map to +32767. */
    for (int i = 0; i < count; i++) {
        const float sample = src[i];
        if (UNLIKELY(sample < -1.0f)) {
            dest[i] = -32767;
        } else if (LIKELY(sample <= 1.0f)) {
            dest[i] = (int16_t)roundf(sample * 32767.0f);
        } else {
            dest[i] = 32767;
        }
    }
}
202
203 /*-----------------------------------------------------------------------*/
204
205 void float_to_int16_interleave(int16_t *dest, float **src, int channels,
206 int count)
207 {
208 (18/18) for (int i = 0; i < count; i++) {
209 (18/18) for (int c = 0; c < channels; c++, dest++) {
210 const float sample = src[c][i];
211 (18/18) if (UNLIKELY(sample < -1.0f)) {
212 *dest = -32767;
213 (18/18) } else if (LIKELY(sample <= 1.0f)) {
214 *dest = (int16_t)roundf(sample * 32767.0f);
215 } else {
216 *dest = 32767;
217 }
218 }
219 }
220 }
221
222 /*-----------------------------------------------------------------------*/
223
/**
 * float_to_int16_interleave_2():  Specialized 2-channel (stereo) version
 * of float_to_int16_interleave().  Samples from src0 and src1 are scaled
 * by 32767, clamped to [-32767,+32767], rounded to nearest (ties away
 * from zero), and written alternately to dest.
 *
 * [Parameters]
 *     dest: Output buffer (receives count*2 interleaved samples).
 *         Must not overlap src0/src1.
 *     src0: Input buffer for the first (left) channel.
 *     src1: Input buffer for the second (right) channel.
 *     count: Number of samples per channel to convert.
 */
void float_to_int16_interleave_2(
    int16_t *__restrict dest, const float *__restrict src0,
    const float *__restrict src1, int count)
{
#if defined(ENABLE_ASM_ARM_NEON)

    const float32x4_t k32767 = {32767, 32767, 32767, 32767};
    const float32x4_t k0_5 = {0.5, 0.5, 0.5, 0.5};
    /* Mask covering every bit except the IEEE 754 sign bit. */
    const uint32x4_t k7FFFFFFF = {0x7FFFFFFF, 0x7FFFFFFF,
                                  0x7FFFFFFF, 0x7FFFFFFF};
    /* 4 samples per channel per iteration; the scalar loop at the
     * bottom of the function picks up any remainder. */
    for (; count >= 4; src0 += 4, src1 += 4, dest += 8, count -= 4) {
        const float32x4_t in0 = vld1q_f32(src0);
        const float32x4_t in1 = vld1q_f32(src1);
        const float32x4_t in0_scaled = vmulq_f32(in0, k32767);
        const float32x4_t in1_scaled = vmulq_f32(in1, k32767);
        /* Split each scaled value into magnitude and sign bit so the
         * clamp and round can operate on the magnitude alone. */
        const uint32x4_t in0_abs = vandq_u32((uint32x4_t)in0_scaled,
                                             k7FFFFFFF);
        const uint32x4_t in1_abs = vandq_u32((uint32x4_t)in1_scaled,
                                             k7FFFFFFF);
        const uint32x4_t in0_sign = vbicq_u32((uint32x4_t)in0_scaled,
                                              k7FFFFFFF);
        const uint32x4_t in1_sign = vbicq_u32((uint32x4_t)in1_scaled,
                                              k7FFFFFFF);
        /* Clamp to 32767.0f with an unsigned integer minimum: for
         * nonnegative IEEE 754 floats, the raw bit patterns order the
         * same way as the values they encode. */
        const uint32x4_t in0_sat = vminq_u32(in0_abs, (uint32x4_t)k32767);
        const uint32x4_t in1_sat = vminq_u32(in1_abs, (uint32x4_t)k32767);
        /* Note that we have to add 0.5 to the absolute values here because
         * vcvt always rounds toward zero. */
        const float32x4_t in0_adj = vaddq_f32((float32x4_t)in0_sat, k0_5);
        const float32x4_t in1_adj = vaddq_f32((float32x4_t)in1_sat, k0_5);
        const float32x4_t out0 = (float32x4_t)vorrq_u32((uint32x4_t)in0_adj,
                                                        in0_sign);
        const float32x4_t out1 = (float32x4_t)vorrq_u32((uint32x4_t)in1_adj,
                                                        in1_sign);
        const int32x4_t out0_32 = vcvtq_s32_f32(out0);
        const int32x4_t out1_32 = vcvtq_s32_f32(out1);
        /* Interleave the two channels: vzipq yields L0 R0 L1 R1 in
         * val[0] and L2 R2 L3 R3 in val[1]. */
        int32x4x2_t out_32 = vzipq_s32(out0_32, out1_32);
#if defined(__GNUC__) && !defined(__clang__) && !defined(__aarch64__)
        /* Same workaround as in float_to_int16(): GCC won't allocate
         * the two narrowed halves into paired registers on its own. */
        int16x8_t out_16;
        __asm__(
            "vmovn.i32 %e0, %q1\n"
            "vmovn.i32 %f0, %q2\n"
            : "=&w" (out_16)
            : "w" (out_32.val[0]), "w" (out_32.val[1])
        );
#else
        const int16x4_t out0_16 = vmovn_s32(out_32.val[0]);
        const int16x4_t out1_16 = vmovn_s32(out_32.val[1]);
        const int16x8_t out_16 = vcombine_s16(out0_16, out1_16);
#endif
        vst1q_s16(dest, out_16);
    }

#elif defined(ENABLE_ASM_X86_AVX2)

    /* Unlike float_to_int16(), this path requires 32-byte alignment of
     * all three buffers (aligned load/store only, no unaligned loop). */
    ASSERT((((uintptr_t)src0 | (uintptr_t)src1 | (uintptr_t)dest) & 31) == 0);

    /* Force round-to-nearest and mask invalid-operation exceptions for
     * the duration of the loop; the caller's MXCSR is restored below. */
    const uint32_t saved_mxcsr = _mm_getcsr();
    uint32_t mxcsr = saved_mxcsr;
    mxcsr &= ~(3<<13);  // RC (00 = round to nearest)
    mxcsr |= 1<<7;  // EM_INVALID
    _mm_setcsr(mxcsr);

    const __m256 k32767 = _mm256_set1_ps(32767.0f);
    const __m256 k7FFFFFFF = _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF));
    const __m256 k80000000 = _mm256_castsi256_ps(_mm256_set1_epi32(0x80000000));

    for (; count >= 8; src0 += 8, src1 += 8, dest += 16, count -= 8) {
        const __m256 in0 = _mm256_load_ps(src0);
        const __m256 in1 = _mm256_load_ps(src1);
        const __m256 in0_scaled = _mm256_mul_ps(in0, k32767);
        const __m256 in1_scaled = _mm256_mul_ps(in1, k32767);
        /* Clamp the magnitude to 32767 and reattach the sign bit;
         * _mm256_cvtps_epi32 then rounds to nearest per MXCSR. */
        const __m256 in0_abs = _mm256_and_ps(in0_scaled, k7FFFFFFF);
        const __m256 in1_abs = _mm256_and_ps(in1_scaled, k7FFFFFFF);
        const __m256 in0_sign = _mm256_and_ps(in0_scaled, k80000000);
        const __m256 in1_sign = _mm256_and_ps(in1_scaled, k80000000);
        const __m256 in0_sat = _mm256_min_ps(in0_abs, k32767);
        const __m256 in1_sat = _mm256_min_ps(in1_abs, k32767);
        const __m256 out0 = _mm256_or_ps(in0_sat, in0_sign);
        const __m256 out1 = _mm256_or_ps(in1_sat, in1_sign);
        const __m256i out0_32 = _mm256_cvtps_epi32(out0);
        const __m256i out1_32 = _mm256_cvtps_epi32(out1);
        /* unpacklo/unpackhi and packs all operate within each 128-bit
         * lane, so this sequence leaves the interleaved samples already
         * in order -- no cross-lane permute is needed here. */
        const __m256i out_32_lo = _mm256_unpacklo_epi32(out0_32, out1_32);
        const __m256i out_32_hi = _mm256_unpackhi_epi32(out0_32, out1_32);
        const __m256i out_16 = _mm256_packs_epi32(out_32_lo, out_32_hi);
        _mm256_store_si256((void *)dest, out_16);
    }

    _mm_setcsr(saved_mxcsr);

#elif defined(ENABLE_ASM_X86_SSE2)

    /* This path requires 16-byte alignment of all three buffers. */
    ASSERT((((uintptr_t)src0 | (uintptr_t)src1 | (uintptr_t)dest) & 15) == 0);

    /* Force round-to-nearest and mask invalid-operation exceptions for
     * the duration of the loop; the caller's MXCSR is restored below. */
    const uint32_t saved_mxcsr = _mm_getcsr();
    uint32_t mxcsr = saved_mxcsr;
    mxcsr &= ~(3<<13);  // RC (00 = round to nearest)
    mxcsr |= 1<<7;  // EM_INVALID
    _mm_setcsr(mxcsr);

    const __m128 k32767 = _mm_set1_ps(32767.0f);
    const __m128 k7FFFFFFF = CAST_M128(_mm_set1_epi32(0x7FFFFFFF));
    const __m128 k80000000 = CAST_M128(_mm_set1_epi32(0x80000000));

    for (; count >= 4; src0 += 4, src1 += 4, dest += 8, count -= 4) {
        const __m128 in0 = _mm_load_ps(src0);
        const __m128 in1 = _mm_load_ps(src1);
        const __m128 in0_scaled = _mm_mul_ps(in0, k32767);
        const __m128 in1_scaled = _mm_mul_ps(in1, k32767);
        /* Clamp the magnitude to 32767 and reattach the sign bit;
         * _mm_cvtps_epi32 then rounds to nearest per MXCSR. */
        const __m128 in0_abs = _mm_and_ps(in0_scaled, k7FFFFFFF);
        const __m128 in1_abs = _mm_and_ps(in1_scaled, k7FFFFFFF);
        const __m128 in0_sign = _mm_and_ps(in0_scaled, k80000000);
        const __m128 in1_sign = _mm_and_ps(in1_scaled, k80000000);
        const __m128 in0_sat = _mm_min_ps(in0_abs, k32767);
        const __m128 in1_sat = _mm_min_ps(in1_abs, k32767);
        const __m128 out0 = _mm_or_ps(in0_sat, in0_sign);
        const __m128 out1 = _mm_or_ps(in1_sat, in1_sign);
        const __m128i out0_32 = _mm_cvtps_epi32(out0);
        const __m128i out1_32 = _mm_cvtps_epi32(out1);
        /* Interleave the two channels, then narrow with saturation. */
        const __m128i out_32_lo = _mm_unpacklo_epi32(out0_32, out1_32);
        const __m128i out_32_hi = _mm_unpackhi_epi32(out0_32, out1_32);
        const __m128i out_16 = _mm_packs_epi32(out_32_lo, out_32_hi);
        _mm_store_si128((void *)dest, out_16);
    }

    _mm_setcsr(saved_mxcsr);

#endif // ENABLE_ASM_*

    /* Scalar path: converts all samples when no SIMD variant is built,
     * or the tail left over by a SIMD loop above.  NaN inputs fall
     * through both comparisons and map to +32767. */
    for (int i = 0; i < count; i++) {
        const float sample0 = src0[i];
        const float sample1 = src1[i];
        if (UNLIKELY(sample0 < -1.0f)) {
            dest[i*2+0] = -32767;
        } else if (LIKELY(sample0 <= 1.0f)) {
            dest[i*2+0] = (int16_t)roundf(sample0 * 32767.0f);
        } else {
            dest[i*2+0] = 32767;
        }
        if (UNLIKELY(sample1 < -1.0f)) {
            dest[i*2+1] = -32767;
        } else if (LIKELY(sample1 <= 1.0f)) {
            dest[i*2+1] = (int16_t)roundf(sample1 * 32767.0f);
        } else {
            dest[i*2+1] = 32767;
        }
    }
}
371
372 /*************************************************************************/
373 /*************************************************************************/