Code Coverage Report for src/decode/decode.c


Hit Total Coverage
Lines: 1039 1064 97.6%
Branches: 3367 3538 95.1%

1 /*
2 * libnogg: a decoder library for Ogg Vorbis streams
3 * Copyright (c) 2014-2023 Andrew Church <achurch@achurch.org>
4 *
5 * This software may be copied and redistributed under certain conditions;
6 * see the file "COPYING" in the source code distribution for details.
7 * NO WARRANTY is provided with this software.
8 */
9
10 #include "include/nogg.h"
11 #include "src/common.h"
12 #include "src/decode/common.h"
13 #include "src/decode/crc32.h"
14 #include "src/decode/decode.h"
15 #include "src/decode/inlines.h"
16 #include "src/decode/io.h"
17 #include "src/decode/packet.h"
18 #include "src/decode/setup.h"
19 #include "src/util/memory.h"
20 #include "src/x86.h"
21
22 #include <math.h>
23 #include <stdlib.h>
24 #include <string.h>
25
26 #ifdef ENABLE_ASM_ARM_NEON
27 # include <arm_neon.h>
28 /* Note that vectorization is sometimes slower on ARM because of the lack
29 * of fast vector swizzle instructions, so ARM code is deliberately omitted
30 * in those cases. */
31 #endif
32
33 /* Older versions of GCC warn about shadowing functions with variables, so
34 * work around those warnings. */
35 #if defined(__GNUC__) && !defined(__clang__) && __GNUC__==4 && __GNUC_MINOR__<7
36 # define div div_
37 # define floor floor_
38 # define y0 y0_
39 # define y1 y1_
40 #endif
41
42 /*
43 * Note on variable naming: Variables are generally named following usage
44 * in the Vorbis spec, though some variables have been renamed for clarity.
45 * In particular, the Vorbis spec consistently uses "n" for the window size
46 * of a packet, and we follow that usage here.
47 */
48
49 /*************************************************************************/
50 /****************************** Local data *******************************/
51 /*************************************************************************/
52
53 /* Lookup table for type 1 floor curve generation, copied from the Vorbis
54 * specification. */
55 static float floor1_inverse_db_table[256] =
56 {
57 1.0649863e-07f, 1.1341951e-07f, 1.2079015e-07f, 1.2863978e-07f,
58 1.3699951e-07f, 1.4590251e-07f, 1.5538408e-07f, 1.6548181e-07f,
59 1.7623575e-07f, 1.8768855e-07f, 1.9988561e-07f, 2.1287530e-07f,
60 2.2670913e-07f, 2.4144197e-07f, 2.5713223e-07f, 2.7384213e-07f,
61 2.9163793e-07f, 3.1059021e-07f, 3.3077411e-07f, 3.5226968e-07f,
62 3.7516214e-07f, 3.9954229e-07f, 4.2550680e-07f, 4.5315863e-07f,
63 4.8260743e-07f, 5.1396998e-07f, 5.4737065e-07f, 5.8294187e-07f,
64 6.2082472e-07f, 6.6116941e-07f, 7.0413592e-07f, 7.4989464e-07f,
65 7.9862701e-07f, 8.5052630e-07f, 9.0579828e-07f, 9.6466216e-07f,
66 1.0273513e-06f, 1.0941144e-06f, 1.1652161e-06f, 1.2409384e-06f,
67 1.3215816e-06f, 1.4074654e-06f, 1.4989305e-06f, 1.5963394e-06f,
68 1.7000785e-06f, 1.8105592e-06f, 1.9282195e-06f, 2.0535261e-06f,
69 2.1869758e-06f, 2.3290978e-06f, 2.4804557e-06f, 2.6416497e-06f,
70 2.8133190e-06f, 2.9961443e-06f, 3.1908506e-06f, 3.3982101e-06f,
71 3.6190449e-06f, 3.8542308e-06f, 4.1047004e-06f, 4.3714470e-06f,
72 4.6555282e-06f, 4.9580707e-06f, 5.2802740e-06f, 5.6234160e-06f,
73 5.9888572e-06f, 6.3780469e-06f, 6.7925283e-06f, 7.2339451e-06f,
74 7.7040476e-06f, 8.2047000e-06f, 8.7378876e-06f, 9.3057248e-06f,
75 9.9104632e-06f, 1.0554501e-05f, 1.1240392e-05f, 1.1970856e-05f,
76 1.2748789e-05f, 1.3577278e-05f, 1.4459606e-05f, 1.5399272e-05f,
77 1.6400004e-05f, 1.7465768e-05f, 1.8600792e-05f, 1.9809576e-05f,
78 2.1096914e-05f, 2.2467911e-05f, 2.3928002e-05f, 2.5482978e-05f,
79 2.7139006e-05f, 2.8902651e-05f, 3.0780908e-05f, 3.2781225e-05f,
80 3.4911534e-05f, 3.7180282e-05f, 3.9596466e-05f, 4.2169667e-05f,
81 4.4910090e-05f, 4.7828601e-05f, 5.0936773e-05f, 5.4246931e-05f,
82 5.7772202e-05f, 6.1526565e-05f, 6.5524908e-05f, 6.9783085e-05f,
83 7.4317983e-05f, 7.9147585e-05f, 8.4291040e-05f, 8.9768747e-05f,
84 9.5602426e-05f, 0.00010181521f, 0.00010843174f, 0.00011547824f,
85 0.00012298267f, 0.00013097477f, 0.00013948625f, 0.00014855085f,
86 0.00015820453f, 0.00016848555f, 0.00017943469f, 0.00019109536f,
87 0.00020351382f, 0.00021673929f, 0.00023082423f, 0.00024582449f,
88 0.00026179955f, 0.00027881276f, 0.00029693158f, 0.00031622787f,
89 0.00033677814f, 0.00035866388f, 0.00038197188f, 0.00040679456f,
90 0.00043323036f, 0.00046138411f, 0.00049136745f, 0.00052329927f,
91 0.00055730621f, 0.00059352311f, 0.00063209358f, 0.00067317058f,
92 0.00071691700f, 0.00076350630f, 0.00081312324f, 0.00086596457f,
93 0.00092223983f, 0.00098217216f, 0.0010459992f, 0.0011139742f,
94 0.0011863665f, 0.0012634633f, 0.0013455702f, 0.0014330129f,
95 0.0015261382f, 0.0016253153f, 0.0017309374f, 0.0018434235f,
96 0.0019632195f, 0.0020908006f, 0.0022266726f, 0.0023713743f,
97 0.0025254795f, 0.0026895994f, 0.0028643847f, 0.0030505286f,
98 0.0032487691f, 0.0034598925f, 0.0036847358f, 0.0039241906f,
99 0.0041792066f, 0.0044507950f, 0.0047400328f, 0.0050480668f,
100 0.0053761186f, 0.0057254891f, 0.0060975636f, 0.0064938176f,
101 0.0069158225f, 0.0073652516f, 0.0078438871f, 0.0083536271f,
102 0.0088964928f, 0.009474637f, 0.010090352f, 0.010746080f,
103 0.011444421f, 0.012188144f, 0.012980198f, 0.013823725f,
104 0.014722068f, 0.015678791f, 0.016697687f, 0.017782797f,
105 0.018938423f, 0.020169149f, 0.021479854f, 0.022875735f,
106 0.024362330f, 0.025945531f, 0.027631618f, 0.029427276f,
107 0.031339626f, 0.033376252f, 0.035545228f, 0.037855157f,
108 0.040315199f, 0.042935108f, 0.045725273f, 0.048696758f,
109 0.051861348f, 0.055231591f, 0.058820850f, 0.062643361f,
110 0.066714279f, 0.071049749f, 0.075666962f, 0.080584227f,
111 0.085821044f, 0.091398179f, 0.097337747f, 0.10366330f,
112 0.11039993f, 0.11757434f, 0.12521498f, 0.13335215f,
113 0.14201813f, 0.15124727f, 0.16107617f, 0.17154380f,
114 0.18269168f, 0.19456402f, 0.20720788f, 0.22067342f,
115 0.23501402f, 0.25028656f, 0.26655159f, 0.28387361f,
116 0.30232132f, 0.32196786f, 0.34289114f, 0.36517414f,
117 0.38890521f, 0.41417847f, 0.44109412f, 0.46975890f,
118 0.50028648f, 0.53279791f, 0.56742212f, 0.60429640f,
119 0.64356699f, 0.68538959f, 0.72993007f, 0.77736504f,
120 0.82788260f, 0.88168307f, 0.9389798f, 1.0f
121 };
122
123 /*************************************************************************/
124 /************************* Vectorization helpers *************************/
125 /*************************************************************************/
126
127 #ifdef ENABLE_ASM_ARM_NEON
128
129 /* Used to avoid unnecessary typecasts when flipping sign bits. */
130 static inline float32x4_t veorq_f32(uint32x4_t a, float32x4_t b) {
131 return (float32x4_t)veorq_u32(a, (uint32x4_t)b);
132 }
133
134 /* Various kinds of vector swizzles (xyzw = identity). The separate
135 * declarations are needed because some functions are implemented in
136 * terms of others. */
137 static inline float32x4_t vswizzleq_xxyy_f32(float32x4_t a);
138 static inline float32x4_t vswizzleq_xxzz_f32(float32x4_t a);
139 static inline float32x4_t vswizzleq_xyxy_f32(float32x4_t a);
140 static inline float32x4_t vswizzleq_xzxz_f32(float32x4_t a);
141 static inline float32x4_t vswizzleq_yxyx_f32(float32x4_t a);
142 static inline float32x4_t vswizzleq_yxwz_f32(float32x4_t a);
143 static inline float32x4_t vswizzleq_yyww_f32(float32x4_t a);
144 static inline float32x4_t vswizzleq_ywyw_f32(float32x4_t a);
145 static inline float32x4_t vswizzleq_zzxx_f32(float32x4_t a);
146 static inline float32x4_t vswizzleq_zzww_f32(float32x4_t a);
147 static inline float32x4_t vswizzleq_zwxy_f32(float32x4_t a);
148 static inline float32x4_t vswizzleq_zwzw_f32(float32x4_t a);
149 static inline float32x4_t vswizzleq_wzyx_f32(float32x4_t a);
150 static inline float32x4_t vswizzleq_wwyy_f32(float32x4_t a);
151 /* These are all marked UNUSED so Clang doesn't complain about the fact
152 * that we don't currently use all of them. */
153 static UNUSED inline float32x4_t vswizzleq_xxyy_f32(float32x4_t a) {
154 return vzipq_f32(a, a).val[0];
155 }
156 static UNUSED inline float32x4_t vswizzleq_xxzz_f32(float32x4_t a) {
157 const uint32x4_t sel = {~0U, 0, ~0U, 0};
158 /* vbsl operands: vbsl(mask, mask_on_value, mask_off_value) */
159 return vbslq_f32(sel, a, (float32x4_t)vshlq_n_u64((uint64x2_t)a, 32));
160 }
161 static UNUSED inline float32x4_t vswizzleq_xyxy_f32(float32x4_t a) {
162 return (float32x4_t)vdupq_n_u64(((uint64x2_t)a)[0]);
163 }
164 static UNUSED inline float32x4_t vswizzleq_xzxz_f32(float32x4_t a) {
165 return vuzpq_f32(a, a).val[0];
166 }
167 static UNUSED inline float32x4_t vswizzleq_yxyx_f32(float32x4_t a) {
168 return vswizzleq_xyxy_f32(vswizzleq_yxwz_f32(a));
169 }
170 static UNUSED inline float32x4_t vswizzleq_yxwz_f32(float32x4_t a) {
171 const uint32x4_t sel = {~0U, 0, ~0U, 0};
172 return vbslq_f32(sel,
173 (float32x4_t)vshrq_n_u64((uint64x2_t)a, 32),
174 (float32x4_t)vshlq_n_u64((uint64x2_t)a, 32));
175 }
176 static UNUSED inline float32x4_t vswizzleq_yyww_f32(float32x4_t a) {
177 const uint32x4_t sel = {~0U, 0, ~0U, 0};
178 return vbslq_f32(sel, (float32x4_t)vshrq_n_u64((uint64x2_t)a, 32), a);
179 }
180 static UNUSED inline float32x4_t vswizzleq_ywyw_f32(float32x4_t a) {
181 return vuzpq_f32(a, a).val[1];
182 }
183 static UNUSED inline float32x4_t vswizzleq_zzxx_f32(float32x4_t a) {
184 return vswizzleq_zwxy_f32(vswizzleq_xxzz_f32(a));
185 }
186 static UNUSED inline float32x4_t vswizzleq_zzww_f32(float32x4_t a) {
187 return vzipq_f32(a, a).val[1];
188 }
189 static UNUSED inline float32x4_t vswizzleq_zwxy_f32(float32x4_t a) {
190 return vextq_f32(a, a, 2);
191 }
192 static UNUSED inline float32x4_t vswizzleq_zwzw_f32(float32x4_t a) {
193 return (float32x4_t)vdupq_n_u64(((uint64x2_t)a)[1]);
194 }
195 static UNUSED inline float32x4_t vswizzleq_wzyx_f32(float32x4_t a) {
196 return vswizzleq_yxwz_f32(vswizzleq_zwxy_f32(a));
197 }
198 static UNUSED inline float32x4_t vswizzleq_wwyy_f32(float32x4_t a) {
199 return vswizzleq_zwxy_f32(vswizzleq_yyww_f32(a));
200 }
201
202 #endif // ENABLE_ASM_ARM_NEON
203
204 /*-----------------------------------------------------------------------*/
205
206 #if defined(ENABLE_ASM_X86_SSE2) || defined(ENABLE_ASM_X86_AVX2)
207
208 /*
209 * _mm_xor_sign: Simple wrapper function for _mm_xor_ps() used to avoid
210 * unnecessary typecasts when flipping sign bits and work around compiler
211 * over-optimizations.
212 *
213 * Unlike ARM, the x86 architecture includes an exclusive-or instruction
214 * which nominally operates on floating-point data (xorps, with the
215 * corresponding intrinsic _mm_xor_ps()), which has the same effect as
216 * the equivalent integer instruction but avoids cross-domain data
217 * movement on processors with separate integer and floating-point vector
218 * pipelines, so at first glance it would seem like we could simply cast
219 * the sign vector to a float vector and use it that way. But some
220 * compilers, notably GCC 8 and later and Clang 11 and later, aggressively
221 * optimize away signed zero values in vector constants when inexact
222 * floating-point optimizations are enabled, and a mask containing only
223 * sign bits looks like an all-zero vector when interpreted as
224 * floating-point data.
225 *
226 * We could potentially accept the potential cross-domain stall and use
227 * the integer instruction (pxor), which works for GCC, but Clang seems
228 * to "pierce the veil" of the typecast and optimize out the operation
229 * anyway (in fact, this was diagnosed as incorrectly treating an XOR of
230 * two 64-bit sign masks as a vector negation of four 32-bit floats and
231 * applying the transformation "-(x - y) => y - x"). So we interpose a
232 * dummy (empty) inline assembly statement to hide knowledge of the
233 * constant's value from these compilers, thus forcing them to emit the
234 * exclusive-or operation as desired.
235 *
236 * See also:
237 * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=86855
238 * https://github.com/llvm/llvm-project/issues/55758 (fixed in 15.0.0)
239 */
240 static inline __m128 _mm_xor_sign(__m128i sign_mask, __m128 value) {
241 #if IS_GCC(8,0) || (IS_CLANG(11,0) && !IS_CLANG(15,0))
242 __asm__("" : "=x" (value) : "0" (value));
243 #endif
244 return _mm_xor_ps(CAST_M128(sign_mask), value);
245 }
246
247 #endif // ENABLE_ASM_X86_SSE2
248
249 /*-----------------------------------------------------------------------*/
250
251 #ifdef ENABLE_ASM_X86_AVX2
252
253 /*
254 * _mm256_xor_sign: 256-bit version of _mm_xor_sign(), with the same
255 * caveats.
256 */
257 static inline __m256 _mm256_xor_sign(__m256i sign_mask, __m256 value) {
258 #if IS_GCC(8,0) || (IS_CLANG(11,0) && !IS_CLANG(15,0))
259 __asm__("" : "=x" (value) : "0" (value));
260 #endif
261 return _mm256_xor_ps(_mm256_castsi256_ps(sign_mask), value);
262 }
263
264 /**
265 * _mm256_permute4x64_ps: Convenience wrapper for _mm256_permute4x64_pd()
266 * which encapsulates ps/pd casts. Written as a macro because "control"
267 * must be a compile-time constant, which the compiler may not be able
268 * to determine if this is written as a function.
269 */
270 #define _mm256_permute4x64_ps(value, control) \
271 (_mm256_castpd_ps(_mm256_permute4x64_pd(_mm256_castps_pd((value)), \
272 (control))))
273
274 #endif // ENABLE_ASM_X86_AVX2
275
276 /*************************************************************************/
277 /******************* Codebook decoding: scalar context *******************/
278 /*************************************************************************/
279
280 /**
281 * codebook_decode_scalar_raw_slow: Slow path for Huffman decoding,
282 * searching through the lookup tables to decode the next symbol.
283 */
284 static int32_t codebook_decode_scalar_raw_slow(stb_vorbis *handle,
285 const Codebook *book)
286 {
287 /* Fill the bit accumulator far enough to read any valid code. */
288 (18/18) if (handle->valid_bits < 24) {
289 fill_bits(handle);
290 }
291
292 /* Find the code using binary search if we can, linear search otherwise. */
293 (18/18) if (book->sorted_codewords) {
294 const uint32_t code = bit_reverse(handle->acc);
295 int32_t low = 0, high = book->sorted_entries;
296 /* Invariant: sorted_codewords[low] <= code < sorted_codewords[high] */
297 (18/18) while (low+1 < high) {
298 const int32_t mid = (low + high) / 2;
299 (18/18) if (book->sorted_codewords[mid] <= code) {
300 low = mid;
301 } else {
302 high = mid;
303 }
304 }
305 (18/18) const int32_t result = book->sparse ? low : book->sorted_values[low];
306 const int len = book->codeword_lengths[result];
307 handle->acc >>= len;
308 handle->valid_bits -= len;
309 (18/18) if (UNLIKELY(handle->valid_bits < 0)) {
310 return -1;
311 }
312 return result;
313 } else {
314 /* This loop terminates when the code is found. We ensure that all
315 * Huffman trees are completely specified during setup, so the
316 * assertion below will never fail. */
317 for (int i = 0; ; i++) {
318 ASSERT(i < book->entries);
319 (18/18) if (book->codeword_lengths[i] == NO_CODE) {
320 continue;
321 }
322 if (book->codewords[i] ==
323 (18/18) (handle->acc & ((UINT32_C(1) << book->codeword_lengths[i])-1)))
324 {
325 handle->acc >>= book->codeword_lengths[i];
326 handle->valid_bits -= book->codeword_lengths[i];
327 (18/18) if (UNLIKELY(handle->valid_bits < 0)) {
328 return -1;
329 }
330 return i;
331 }
332 }
333 }
334 }
335
336 /*-----------------------------------------------------------------------*/
337
338 /**
339 * codebook_decode_scalar_raw: Read a Huffman code from the packet and
340 * decode it in scalar context using the given codebook, returning the
341 * symbol for non-sparse codebooks and the sorted_values[] index for sparse
342 * codebooks.
343 *
344 * This function contains only the fast path for Huffman lookup so that it
345 * can be easily inlined into callers. The slow path is implemented as a
346 * separate, non-inlined function to avoid code bloat.
347 *
348 * [Parameters]
349 * handle: Stream handle.
350 * book: Codebook to use.
351 * [Return value]
352 * Value read, or -1 if the end of the packet is reached.
353 */
354 static inline int32_t codebook_decode_scalar_raw(stb_vorbis *handle,
355 const Codebook *book)
356 {
357 /* First try the O(1) table. We only fill the bit accumulator if we
358 * don't have enough bits for the fast table, to avoid overhead from
359 * repeatedly reading single bytes from the packet. */
360 (18/18) if (handle->valid_bits < handle->fast_huffman_length) {
361 /* Note that moving this "semi-fast" path out to a separate
362 * function is no faster, and possibly slower (though not
363 * significantly so), than just calling fill_bits() here, at least
364 * on x86_64 and 32-bit ARM. */
365 fill_bits(handle);
366 }
367 const int fast_code =
368 book->fast_huffman[handle->acc & handle->fast_huffman_mask];
369 (18/18) if (fast_code >= 0) {
370 const int bits = book->codeword_lengths[fast_code];
371 handle->acc >>= bits;
372 handle->valid_bits -= bits;
373 (18/18) if (UNLIKELY(handle->valid_bits < 0)) {
374 return -1;
375 }
376 return fast_code;
377 }
378
379 /* Fall back to the slow (table search) method. */
380 return codebook_decode_scalar_raw_slow(handle, book);
381 }
382
383 /*-----------------------------------------------------------------------*/
384
385 /**
386 * codebook_decode_scalar: Read a Huffman code from the packet and decode
387 * it in scalar context using the given codebook.
388 *
389 * [Parameters]
390 * handle: Stream handle.
391 * book: Codebook to use.
392 * [Return value]
393 * Symbol read, or -1 if the end of the packet is reached.
394 */
395 static int32_t codebook_decode_scalar(stb_vorbis *handle, const Codebook *book)
396 {
397 const int32_t value = codebook_decode_scalar_raw(handle, book);
398 (18/18) return book->sparse ? book->sorted_values[value] : value;
399 }
400
401 /*************************************************************************/
402 /********************* Codebook decoding: VQ context *********************/
403 /*************************************************************************/
404
405 /**
406 * codebook_decode_scalar_for_vq: Read a Huffman code from the packet and
407 * return a value appropriate for decoding the code in VQ context. Helper
408 * function for codebook_decode() and related functions.
409 *
410 * [Parameters]
411 * handle: Stream handle.
412 * book: Codebook to use.
413 * [Return value]
414 * Symbol or value read (depending on library configuration and
415 * codebook type), or -1 if the end of the packet is reached.
416 */
417 static inline int32_t codebook_decode_scalar_for_vq(stb_vorbis *handle,
418 const Codebook *book)
419 {
420 (18/18) if (handle->divides_in_codebook) {
421 return codebook_decode_scalar(handle, book);
422 } else {
423 return codebook_decode_scalar_raw(handle, book);
424 }
425 }
426
427 /*-----------------------------------------------------------------------*/
428
429 /**
430 * codebook_decode: Read a Huffman code from the packet and decode it in
431 * VQ context using the given codebook. Decoded vector components are
432 * added componentwise to the values in the output vector.
433 *
434 * If an error occurs, the current packet is flushed.
435 *
436 * [Parameters]
437 * handle: Stream handle.
438 * book: Codebook to use. Must be a vector (not scalar) codebook.
439 * output: Output vector.
440 * len: Number of elements to read. Automatically capped at
441 * book->dimensions.
442 * [Return value]
443 * True on success, false on error or end-of-packet.
444 */
445 static bool codebook_decode(stb_vorbis *handle, const Codebook *book,
446 float *output, int len)
447 {
448 ASSERT(book->lookup_type != 0);
449 ASSERT(book->multiplicands);
450
451 (18/18) if (len > book->dimensions) {
452 len = book->dimensions;
453 }
454
455 const int32_t code = codebook_decode_scalar_for_vq(handle, book);
456 (18/18) if (UNLIKELY(code < 0)) {
457 return false;
458 }
459
460 (18/18) if (book->lookup_type == 1) {
461 int div = 1;
462 (18/18) if (book->sequence_p) {
463 float last = 0;
464 (18/18) for (int i = 0; i < len; i++) {
465 const int32_t offset = (code / div) % book->lookup_values;
466 const float val = book->multiplicands[offset] + last;
467 output[i] += val;
468 last = val;
469 div *= book->lookup_values;
470 }
471 } else {
472 (18/18) for (int i = 0; i < len; i++) {
473 const int32_t offset = (code / div) % book->lookup_values;
474 output[i] += book->multiplicands[offset];
475 div *= book->lookup_values;
476 }
477 }
478 } else {
479 const int32_t offset = code * book->dimensions;
480 int i = 0;
481 #if defined(ENABLE_ASM_ARM_NEON)
482 for (; i+4 <= len; i += 4) {
483 vst1q_f32(&output[i], vaddq_f32(
484 vld1q_f32(&output[i]),
485 vld1q_f32(&book->multiplicands[offset+i])));
486 }
487 #elif defined(ENABLE_ASM_X86_SSE2)
488 (10/10) for (; i+4 <= len; i += 4) {
489 _mm_storeu_ps(&output[i], _mm_add_ps(
490 _mm_loadu_ps(&output[i]),
491 _mm_loadu_ps(&book->multiplicands[offset+i])));
492 }
493 #endif
494 (18/18) for (; i < len; i++) {
495 output[i] += book->multiplicands[offset+i];
496 }
497 }
498
499 return true;
500 }
501
502 /*-----------------------------------------------------------------------*/
503
504 /**
505 * codebook_decode_step: Read a Huffman code from the packet and decode it
506 * in VQ context using the given codebook. Decoded vector components are
507 * added componentwise to values in the output vector at intervals of "step"
508 * array elements. (codebook_decode() is effectively a specialization of
509 * this function with step==1.)
510 *
511 * If an error occurs, the current packet is flushed.
512 *
513 * [Parameters]
514 * handle: Stream handle.
515 * book: Codebook to use. Must be a vector (not scalar) codebook.
516 * output: Output vector.
517 * len: Number of elements to read. Automatically capped at
518 * book->dimensions.
519 * step: Interval between consecutive output elements: the second
520 * element is output[step], the third is output[2*step], etc.
521 * [Return value]
522 * True on success, false on error or end-of-packet.
523 */
524 static bool codebook_decode_step(stb_vorbis *handle, const Codebook *book,
525 float *output, int len, int step)
526 {
527 ASSERT(book->lookup_type != 0);
528 ASSERT(book->multiplicands);
529
530 (18/18) if (len > book->dimensions) {
531 len = book->dimensions;
532 }
533
534 const int32_t code = codebook_decode_scalar_for_vq(handle, book);
535 (18/18) if (UNLIKELY(code < 0)) {
536 return false;
537 }
538
539 (18/18) if (book->lookup_type == 1) {
540 int div = 1;
541 (9/18) if (book->sequence_p) {
542 /* This case appears not to be used by the reference encoder. */
543 float last = 0;
544 (0/18) for (int i = 0; i < len; i++) {
545 const int32_t offset = (code / div) % book->lookup_values;
546 const float val = book->multiplicands[offset] + last;
547 output[i*step] += val;
548 last = val;
549 div *= book->lookup_values;
550 }
551 } else {
552 (18/18) for (int i = 0; i < len; i++) {
553 const int32_t offset = (code / div) % book->lookup_values;
554 output[i*step] += book->multiplicands[offset];
555 div *= book->lookup_values;
556 }
557 }
558 } else {
559 const int32_t offset = code * book->dimensions;
560 (18/18) for (int i = 0; i < len; i++) {
561 output[i*step] += book->multiplicands[offset+i];
562 }
563 }
564
565 return true;
566 }
567
568 /*-----------------------------------------------------------------------*/
569
570 /**
571 * codebook_decode_deinterleave_repeat: Read Huffman codes from the packet
572 * and decode them in VQ context using the given codebook, deinterleaving
573 * across "ch" output channels, and repeat until total_decode vector
574 * elements have been decoded. Decoded vector components are added
575 * componentwise to the values in the output vectors.
576 *
577 * If an error occurs, the current packet is flushed.
578 *
579 * [Parameters]
580 * handle: Stream handle.
581 * book: Codebook to use.
582 * outputs: Output vectors for each channel.
583 * ch: Number of channels.
584 * n: Length of each output vector.
585 * total_offset: Offset within the interleaved output vector at which
586 * to start writing output values.
587 * total_decode: Total number of elements to read.
588 * [Return value]
589 * True on success, false on error or end-of-packet.
590 */
591 static bool codebook_decode_deinterleave_repeat(
592 stb_vorbis *handle, const Codebook *book, float **outputs, int ch, int n,
593 int32_t total_offset, int32_t total_decode)
594 {
595 (9/18) if (total_decode > ch*n - total_offset) {
596 total_decode = ch*n - total_offset;
597 }
598 int c_inter = total_offset % ch;
599 int p_inter = total_offset / ch;
600 int len = book->dimensions;
601
602 (18/18) while (total_decode > 0) {
603 /* Make sure we don't run off the end of the output vectors. */
604 (9/18) if (len > total_decode) {
605 len = total_decode;
606 }
607
608 const int32_t code = codebook_decode_scalar_for_vq(handle, book);
609 (18/18) if (UNLIKELY(code < 0)) {
610 return false;
611 }
612
613 (18/18) if (book->lookup_type == 1) {
614 int div = 1;
615 (9/18) if (book->sequence_p) {
616 /* This case appears not to be used by the reference encoder. */
617 float last = 0;
618 (0/18) for (int i = 0; i < len; i++) {
619 const int32_t offset = (code / div) % book->lookup_values;
620 const float val = book->multiplicands[offset] + last;
621 (0/18) if (outputs[c_inter]) {
622 outputs[c_inter][p_inter] += val;
623 }
624 (0/18) if (++c_inter == ch) {
625 c_inter = 0;
626 p_inter++;
627 }
628 last = val;
629 div *= book->lookup_values;
630 }
631 } else {
632 (18/18) for (int i = 0; i < len; i++) {
633 const int32_t offset = (code / div) % book->lookup_values;
634 (9/18) if (outputs[c_inter]) {
635 outputs[c_inter][p_inter] +=
636 book->multiplicands[offset];
637 }
638 (18/18) if (++c_inter == ch) {
639 c_inter = 0;
640 p_inter++;
641 }
642 div *= book->lookup_values;
643 }
644 }
645 } else { // book->lookup_type == 2
646 const int32_t offset = code * book->dimensions;
647 (18/18) for (int i = 0; i < len; i++) {
648 (18/18) if (outputs[c_inter]) {
649 outputs[c_inter][p_inter] += book->multiplicands[offset+i];
650 }
651 (18/18) if (++c_inter == ch) {
652 c_inter = 0;
653 p_inter++;
654 }
655 }
656 }
657
658 total_decode -= len;
659 }
660
661 return true;
662 }
663
664 /*-----------------------------------------------------------------------*/
665
666 /**
667 * codebook_decode_deinterleave_repeat_2: Read Huffman codes from the
668 * packet and decode them in VQ context using the given codebook,
669 * deinterleaving across 2 output channels. This function is a
670 * specialization of codebook_decode_deinterleave_repeat() for ch==2 and
671 * VORBIS_OPTION_DIVIDES_IN_CODEBOOK disabled.
672 *
673 * If an error occurs, the current packet is flushed.
674 *
675 * [Parameters]
676 * handle: Stream handle.
677 * book: Codebook to use.
678 * outputs: Output vectors for each channel.
679 * len: Length of each output vector.
680 * total_decode: Total number of elements to read.
681 * [Return value]
682 * True on success, false on error or end-of-packet.
683 */
684 static bool codebook_decode_deinterleave_repeat_2(
685 stb_vorbis *handle, const Codebook *book, float **outputs, int n,
686 int32_t total_offset, int32_t total_decode)
687 {
688 (9/18) if (total_decode > 2*n - total_offset) {
689 total_decode = 2*n - total_offset;
690 }
691 float * const output0 = outputs[0];
692 float * const output1 = outputs[1];
693 ASSERT(output0);
694 ASSERT(output1);
695 int c_inter = total_offset % 2;
696 int p_inter = total_offset / 2;
697 int len = book->dimensions;
698
699 (18/18) while (total_decode > 0) {
700 /* Make sure we don't run off the end of the output vectors. */
701 (9/18) if (len > total_decode) {
702 len = total_decode;
703 }
704
705 const int32_t code = codebook_decode_scalar_for_vq(handle, book);
706 (18/18) if (UNLIKELY(code < 0)) {
707 return false;
708 }
709
710 const int32_t offset = code * book->dimensions;
711 int i = 0;
712 (18/18) if (c_inter == 1) {
713 output1[p_inter] += book->multiplicands[offset];
714 c_inter = 0;
715 p_inter++;
716 i++;
717 }
718 (18/18) for (; i+2 <= len; i += 2) {
719 output0[p_inter] += book->multiplicands[offset+i];
720 output1[p_inter] += book->multiplicands[offset+i+1];
721 p_inter++;
722 }
723 (18/18) if (i < len) {
724 output0[p_inter] += book->multiplicands[offset+i];
725 c_inter++;
726 }
727
728 total_decode -= len;
729 }
730
731 return true;
732 }
733
734 /*************************************************************************/
735 /*************************** Floor processing ****************************/
736 /*************************************************************************/
737
738 /**
739 * render_point: Calculate the Y coordinate for point X on the line
740 * (x0,y0)-(x1,y1). Defined by section 9.2.6 in the Vorbis specification.
741 *
742 * [Parameters]
743 * x0, y0: First endpoint of the line.
744 * x1, y1: Second endpoint of the line.
745 * x: X coordinate of point to calculate.
746 * [Return value]
747 * Y value for the given X coordinate.
748 */
749 static CONST_FUNCTION int render_point(int x0, int y0, int x1, int y1, int x)
750 {
751 /* The definition of this function in the spec has separate code
752 * branches for positive and negative dy values, presumably to try and
753 * avoid defining integer division as rounding negative values toward
754 * zero (even though it ends up doing exactly that in the very next
755 * section...), but C already defines integer division to round both
756 * positive and negative values toward zero, so we just do things the
757 * natural way. */
758 return y0 + ((y1 - y0) * (x - x0) / (x1 - x0));
759 }
760
761 /*-----------------------------------------------------------------------*/
762
763 /**
764 * render_line: Render a line for a type 1 floor curve and perform the
765 * dot-product operation with the residue vector. Defined by section
766 * 9.2.7 in the Vorbis specification.
767 *
768 * [Parameters]
769 * x0, y0: First endpoint of the line.
770 * x1, y1: Second endpoint of the line.
771 * output: Output buffer. On entry, this buffer should contain the
772 * residue vector.
773 * n: Window length.
774 */
775 static inline void render_line(int x0, int y0, int x1, int y1, float *output,
776 int n)
777 {
778 /* N.B.: The spec requires a very specific sequence of operations for
779 * this function to ensure that both the encoder and the decoder
780 * generate the same integer-quantized curve. Take care that any
781 * optimizations to this function do not change the output. */
782
783 ASSERT(x0 >= 0);
784 ASSERT(y0 >= 0);
785 ASSERT(x1 > x0);
786
787 const int dy = y1 - y0;
788 const int adx = x1 - x0;
789 const int base = dy / adx;
790 const int ady = abs(dy) - (abs(base) * adx);
791 (18/18) const int sy = (dy < 0 ? base-1 : base+1);
792 int x = x0, y = y0;
793 int err = 0;
794
795 (18/18) if (x1 >= n) {
796 (9/18) if (x0 >= n) {
797 return;
798 }
799 x1 = n;
800 }
801 output[x] *= floor1_inverse_db_table[y];
802 (18/18) for (x++; x < x1; x++) {
803 err += ady;
804 (18/18) if (err >= adx) {
805 err -= adx;
806 y += sy;
807 } else {
808 y += base;
809 }
810 output[x] *= floor1_inverse_db_table[y];
811 }
812 }
813
814 /*-----------------------------------------------------------------------*/
815
816 /**
817 * decode_floor0: Perform type 0 floor decoding.
818 *
819 * [Parameters]
820 * handle: Stream handle.
821 * floor: Floor configuration.
822 * coefficients: Floor coefficient buffer for the current channel.
823 * [Return value]
824 * The floor amplitude (positive), 0 to indicate "unused" status, or
825 * -1 to indicate an undecodable packet.
826 */
827 static int64_t decode_floor0(stb_vorbis *handle, const Floor0 *floor,
828 float *coefficients)
829 {
830 const int amplitude = get_bits(handle, floor->amplitude_bits);
831 (18/18) if (amplitude == 0) {
832 return 0;
833 }
834
835 const int booknumber = get_bits(handle, floor->book_bits);
836 (18/18) if (booknumber >= floor->number_of_books) {
837 return -1;
838 }
839 const Codebook *book = &handle->codebooks[floor->book_list[booknumber]];
840 (18/18) if (book->lookup_type == 0) {
841 /* The specification is unclear on whether a floor 0 configuration
842 * referencing a scalar codebook is a setup-time or decode-time
843 * error; the only applicable rule seems to be 3.3 "If decoder
844 * setup or decode requests such an action [using a codebook of
845 * lookup type 0 in any context expecting a vector return value],
846 * that is an error condition rendering the packet undecodable".
847 * In the spirit of being lenient in what you accept, we treat it
848 * as a decode-time error so that packets which do not use that
849 * floor configuration can still be decoded. */
850 return -1;
851 }
852
853 float last = 0;
854 int coef_count = 0;
855 (18/18) while (coef_count < floor->order) {
856 const int len = min(book->dimensions, floor->order - coef_count);
857 (18/18) for (int i = 0; i < len; i++) {
858 coefficients[coef_count + i] = last;
859 }
860 (18/18) if (!codebook_decode(handle, book, &coefficients[coef_count], len)) {
861 return 0;
862 }
863 coef_count += len;
864 last = coefficients[coef_count - 1];
865 }
866
867 return amplitude;
868 }
869
870 /*-----------------------------------------------------------------------*/
871
872 /**
873 * decode_floor1_partition: Decode a single partition for a type 1 floor.
874 *
875 * [Parameters]
876 * handle: Stream handle.
877 * floor: Floor configuration.
878 * final_Y: final_Y buffer for the current channel.
879 * offset: Current offset in final_Y buffer.
880 * class: Partition class for the current partition.
881 * [Return value]
882 * New value of offset.
883 */
884 static int decode_floor1_partition(stb_vorbis *handle, const Floor1 *floor,
885 int16_t *final_Y, int offset,
886 const int class)
887 {
888 const int cdim = floor->class_dimensions[class];
889 const int cbits = floor->class_subclasses[class];
890 const int csub = (1 << cbits) - 1;
891 int cval = 0;
892 (18/18) if (cbits > 0) {
893 const Codebook *book =
894 &handle->codebooks[floor->class_masterbooks[class]];
895 cval = codebook_decode_scalar(handle, book);
896 }
897 (18/18) for (int j = 0; j < cdim; j++) {
898 const int book_index = floor->subclass_books[class][cval & csub];
899 cval >>= cbits;
900 (18/18) if (book_index >= 0) {
901 const Codebook *book = &handle->codebooks[book_index];
902 final_Y[offset++] = codebook_decode_scalar(handle, book);
903 } else {
904 final_Y[offset++] = 0;
905 }
906 }
907 return offset;
908 }
909
910 /*-----------------------------------------------------------------------*/
911
912 /**
913 * decode_floor1_amplitude: Calculate an element of the final_Y vector
914 * for a type 1 floor.
915 *
916 * [Parameters]
917 * floor: Floor configuration.
918 * final_Y: final_Y buffer for the current channel.
919 * range: Maximum value for final_Y elements.
920 * step2_flag: Array of flags indicating final_Y elements which will
921 * need to be recalculated later.
922 * i: Element of final_Y to calculate.
923 */
924 static void decode_floor1_amplitude(const Floor1 *floor, int16_t *final_Y,
925 const int range, bool *step2_flag,
926 const int i)
927 {
928 const int low = floor->neighbors[i].low;
929 const int high = floor->neighbors[i].high;
930 const int predicted = render_point(floor->X_list[low], final_Y[low],
931 floor->X_list[high], final_Y[high],
932 floor->X_list[i]);
933 const int val = final_Y[i]; // Value read from packet.
934 const int highroom = range - predicted;
935 const int lowroom = predicted;
936 int room;
937 (18/18) if (highroom > lowroom) {
938 room = lowroom * 2;
939 } else {
940 room = highroom * 2;
941 }
942 (18/18) if (val) {
943 step2_flag[low] = true;
944 step2_flag[high] = true;
945 step2_flag[i] = true;
946 (18/18) if (val >= room) {
947 (18/18) if (highroom > lowroom) {
948 /* The spec (7.2.4) suggests that "decoder implementations
949 * guard the values in vector [floor1_final_Y] by clamping
950 * each element to [0, [range])" because "it is possible
951 * to abuse the setup and codebook machinery to produce
952 * negative or over-range results". That can only happen
953 * in these two cases, so we insert tests here. This has
954 * about a 1% performance penalty, but that's better than
955 * corrupt data causing the library to crash. */
956 (9/18) if (UNLIKELY(val > range - 1)) {
957 final_Y[i] = range - 1;
958 } else {
959 final_Y[i] = val;
960 }
961 } else {
962 (9/18) if (UNLIKELY(val > range - 1)) {
963 final_Y[i] = 0;
964 } else {
965 final_Y[i] = (range - 1) - val;
966 }
967 }
968 } else {
969 (18/18) if (val % 2 != 0) {
970 final_Y[i] = predicted - (val+1)/2;
971 } else {
972 final_Y[i] = predicted + val/2;
973 }
974 }
975 } else {
976 step2_flag[i] = false;
977 final_Y[i] = predicted;
978 }
979 }
980
981 /*-----------------------------------------------------------------------*/
982
983 /**
984 * decode_floor1: Perform type 1 floor decoding.
985 *
986 * [Parameters]
987 * handle: Stream handle.
988 * floor: Floor configuration.
989 * final_Y: final_Y buffer for the current channel.
990 * [Return value]
991 * False to indicate "unused" status (4.2.3 step 6), true otherwise.
992 */
993 static bool decode_floor1(stb_vorbis *handle, const Floor1 *floor,
994 int16_t *final_Y)
995 {
996 static const int16_t range_list[4] = {256, 128, 86, 64};
997 static const int8_t range_bits_list[4] = {8, 7, 7, 6};
998 const int range = range_list[floor->floor1_multiplier - 1];
999 const int range_bits = range_bits_list[floor->floor1_multiplier - 1];
1000
1001 (18/18) if (!get_bits(handle, 1)) {
1002 return false; // Channel is zero.
1003 }
1004
1005 /* Floor decode (7.2.3). */
1006 int offset = 2;
1007 final_Y[0] = get_bits(handle, range_bits);
1008 final_Y[1] = get_bits(handle, range_bits);
1009 (18/18) for (int i = 0; i < floor->partitions; i++) {
1010 const int class = floor->partition_class_list[i];
1011 offset = decode_floor1_partition(handle, floor, final_Y, offset, class);
1012 }
1013 (18/18) if (UNLIKELY(handle->valid_bits < 0)) {
1014 return false;
1015 }
1016
1017 /* Amplitude value synthesis (7.2.4 step 1). */
1018 bool step2_flag[256];
1019 step2_flag[0] = true;
1020 step2_flag[1] = true;
1021 (18/18) for (int i = 2; i < floor->values; i++) {
1022 decode_floor1_amplitude(floor, final_Y, range, step2_flag, i);
1023 }
1024
1025 /* Curve synthesis (7.2.4 step 2). We defer final floor computation
1026 * until after synthesis; here, we just clear final_Y values which
1027 * need to be synthesized later. */
1028 (18/18) for (int i = 0; i < floor->values; i++) {
1029 (18/18) if (!step2_flag[i]) {
1030 final_Y[i] = -1;
1031 }
1032 }
1033
1034 return true;
1035 }
1036
1037 /*-----------------------------------------------------------------------*/
1038
1039 /**
1040 * do_floor0_final: Perform the floor curve computation and residue
1041 * product for type 0 floors.
1042 *
1043 * [Parameters]
1044 * handle: Stream handle.
1045 * floor: Floor configuration.
1046 * ch: Channel to operate on.
1047 * n: Frame window size.
1048 * amplitude: Floor amplitude.
1049 */
1050 static void do_floor0_final(stb_vorbis *handle, const Floor0 *floor,
1051 const int ch, const int n, const int64_t amplitude)
1052 {
1053 float *output = handle->channel_buffers[handle->cur_channel_buffer][ch];
1054 float *coefficients = handle->coefficients[ch];
1055 const int16_t *map = floor->map[(n == handle->blocksize[1])];
1056 const float omega_base = M_PIf / floor->bark_map_size;
1057 const float scaled_amplitude = (float)amplitude
1058 / (float)((UINT64_C(1) << floor->amplitude_bits) - 1);
1059 const float lfv_scale = 0.11512925f * floor->amplitude_offset;
1060
1061 (18/18) for (int i = 0; i < floor->order; i++) {
1062 coefficients[i] = 2 * cosf(coefficients[i]);
1063 }
1064
1065 int i = 0;
1066 do {
1067 const float two_cos_omega = 2 * cosf(map[i] * omega_base);
1068 float p = 0.5, q = 0.5;
1069 int j;
1070 (18/18) for (j = 0; j <= (floor->order - 2) / 2; j++) {
1071 p *= coefficients[2*j+1] - two_cos_omega;
1072 q *= coefficients[2*j] - two_cos_omega;
1073 }
1074 (18/18) if (floor->order % 2 != 0) {
1075 q *= coefficients[2*j] - two_cos_omega;
1076 /* The spec gives this as "4 - cos^2(omega)", but we use the
1077 * equality (a^2-b^2) = (a+b)(a-b) to give us constants
1078 * common between both branches of the if statement. */
1079 p *= p * (2 + two_cos_omega) * (2 - two_cos_omega);
1080 q *= q;
1081 } else {
1082 p *= p * (2 - two_cos_omega);
1083 q *= q * (2 + two_cos_omega);
1084 }
1085 const float linear_floor_value =
1086 expf(lfv_scale * ((scaled_amplitude / sqrtf(p + q)) - 1));
1087 const int iteration_condition = map[i];
1088 do {
1089 output[i] *= linear_floor_value;
1090 i++;
1091 (18/18) } while (map[i] == iteration_condition);
1092 (18/18) } while (i < n/2);
1093 }
1094
1095 /*-----------------------------------------------------------------------*/
1096
1097 /**
1098 * do_floor1_final: Perform the final floor curve computation and residue
1099 * product for type 1 floors.
1100 *
1101 * [Parameters]
1102 * handle: Stream handle.
1103 * floor: Floor configuration.
1104 * ch: Channel to operate on.
1105 * n: Frame window size.
1106 */
1107 static void do_floor1_final(stb_vorbis *handle, const Floor1 *floor,
1108 const int ch, const int n)
1109 {
1110 float *output = handle->channel_buffers[handle->cur_channel_buffer][ch];
1111 const int16_t *final_Y = handle->final_Y[ch];
1112 int lx = 0;
1113 int ly = final_Y[0] * floor->floor1_multiplier;
1114 (18/18) for (int i = 1; i < floor->values; i++) {
1115 int j = floor->sorted_order[i];
1116 (18/18) if (final_Y[j] >= 0) {
1117 const int hx = floor->X_list[j];
1118 const int hy = final_Y[j] * floor->floor1_multiplier;
1119 render_line(lx, ly, hx, hy, output, n/2);
1120 lx = hx;
1121 ly = hy;
1122 }
1123 }
1124 (18/18) if (lx < n/2) {
1125 /* Optimization of: render_line(lx, ly, n/2, ly, output, n/2); */
1126 (18/18) for (int i = lx; i < n/2; i++) {
1127 output[i] *= floor1_inverse_db_table[ly];
1128 }
1129 }
1130 }
1131
1132 /*************************************************************************/
1133 /*************************** Residue handling ****************************/
1134 /*************************************************************************/
1135
1136 /**
1137 * decode_residue_partition_0: Decode a single residue partition for
1138 * residue type 0.
1139 *
1140 * [Parameters]
1141 * handle: Stream handle.
1142 * book: Codebook to use for decoding.
1143 * size: Residue partition size.
1144 * offset: Output offset.
1145 * outputs: Output vector array.
1146 * n: Length of residue vectors (half of window size).
1147 * ch: Number of channels of residue data to decode.
1148 * [Return value]
1149 * True on success, false on end of packet.
1150 */
1151 static bool decode_residue_partition_0(
1152 stb_vorbis *handle, const Codebook *book, int size, int offset,
1153 float **output_ptr, UNUSED int n, UNUSED int ch)
1154 {
1155 float *output = *output_ptr;
1156 const int dimensions = book->dimensions;
1157 const int step = size / dimensions;
1158 (18/18) for (int i = 0; i < step; i++) {
1159 (18/18) if (!codebook_decode_step(handle, book, &output[offset+i], size-i,
1160 step)) {
1161 return false;
1162 }
1163 }
1164 return true;
1165 }
1166
1167 /*-----------------------------------------------------------------------*/
1168
1169 /**
1170 * decode_residue_partition_1: Decode a single residue partition for
1171 * residue type 1.
1172 *
1173 * [Parameters]
1174 * handle: Stream handle.
1175 * book: Codebook to use for decoding.
1176 * size: Residue partition size.
1177 * offset: Output offset.
1178 * outputs: Output vector array.
1179 * n: Length of residue vectors (half of window size).
1180 * ch: Number of channels of residue data to decode.
1181 * [Return value]
1182 * True on success, false on end of packet.
1183 */
1184 static bool decode_residue_partition_1(
1185 stb_vorbis *handle, const Codebook *book, int size, int offset,
1186 float **output_ptr, UNUSED int n, UNUSED int ch)
1187 {
1188 float *output = *output_ptr;
1189 const int dimensions = book->dimensions;
1190 (18/18) for (int i = 0; i < size; i += dimensions) {
1191 (18/18) if (!codebook_decode(handle, book, &output[offset+i], size-i)) {
1192 return false;
1193 }
1194 }
1195 return true;
1196 }
1197
1198 /*-----------------------------------------------------------------------*/
1199
1200 /**
1201 * decode_residue_partition_2: Decode a single residue partition for
1202 * residue type 2.
1203 *
1204 * [Parameters]
1205 * handle: Stream handle.
1206 * book: Codebook to use for decoding.
1207 * size: Residue partition size.
1208 * offset: Output offset.
1209 * outputs: Output vector array.
1210 * n: Length of residue vectors (half of window size).
1211 * ch: Number of channels of residue data to decode.
1212 * [Return value]
1213 * True on success, false on end of packet.
1214 */
1215 static bool decode_residue_partition_2(
1216 stb_vorbis *handle, const Codebook *book, int size, int offset,
1217 float **outputs, int n, int ch)
1218 {
1219 return codebook_decode_deinterleave_repeat(
1220 handle, book, outputs, ch, n, offset, size);
1221 }
1222
1223 /*-----------------------------------------------------------------------*/
1224
1225 /**
1226 * decode_residue_partition_2_2ch: Decode a single residue partition for
1227 * residue type 2 with two channels. Both channels must be active for
1228 * decoding.
1229 *
1230 * [Parameters]
1231 * handle: Stream handle.
1232 * book: Codebook to use for decoding.
1233 * size: Residue partition size.
1234 * offset: Output offset.
1235 * outputs: Output vector array.
1236 * n: Length of residue vectors (half of window size).
1237 * ch: Number of channels of residue data to decode.
1238 * [Return value]
1239 * True on success, false on end of packet.
1240 */
1241 static bool decode_residue_partition_2_2ch(
1242 stb_vorbis *handle, const Codebook *book, int size, int offset,
1243 float **outputs, int n, UNUSED int ch)
1244 {
1245 return codebook_decode_deinterleave_repeat_2(
1246 handle, book, outputs, n, offset, size);
1247 }
1248
1249 /*-----------------------------------------------------------------------*/
1250
1251 /**
1252 * decode_residue_partition: Decode a single residue partition.
1253 *
1254 * [Parameters]
1255 * handle: Stream handle.
1256 * res: Residue configuration.
1257 * pass: Residue decoding pass.
1258 * partition_count: Current partition index.
1259 * vqclass: Classification for this partition.
1260 * outputs: Output vector array.
1261 * n: Length of residue vectors (half of window size).
1262 * ch: Number of channels of residue data to decode.
1263 * do_partition: Partition decoding function.
1264 * [Return value]
1265 * True on success, false on end of packet.
1266 */
1267 static inline bool decode_residue_partition(
1268 stb_vorbis *handle, const Residue *res, int pass, int partition_count,
1269 int vqclass, float **outputs, int n, int ch,
1270 bool (*do_partition)(
1271 stb_vorbis *handle, const Codebook *book, int size, int offset,
1272 float **outputs, int n, int ch))
1273 {
1274 const int vqbook = res->residue_books[vqclass][pass];
1275 (18/18) if (vqbook < 0) {
1276 return true;
1277 }
1278
1279 const Codebook *book = &handle->codebooks[vqbook];
1280 (18/18) if (UNLIKELY(book->lookup_type == 0)) {
1281 /* Lookup type 0 is only valid in a scalar context. The spec
1282 * doesn't say what to do about this case; we treat it like
1283 * end-of-packet. */
1284 flush_packet(handle);
1285 handle->valid_bits = -1;
1286 return false;
1287 }
1288
1289 const int32_t offset = res->begin + partition_count * res->part_size;
1290 return (*do_partition)(
1291 handle, book, res->part_size, offset, outputs, n, ch);
1292 }
1293
1294 /*-----------------------------------------------------------------------*/
1295
1296 /**
1297 * decode_residue_common: Common decode processing for all residue vector
1298 * types.
1299 *
1300 * [Parameters]
1301 * handle: Stream handle.
1302 * res: Residue configuration.
1303 * type: Residue type.
1304 * n: Length of residue vectors (half of window size).
1305 * ch: Number of channels of residue data to decode.
1306 * residue_buffers: Pointers to the buffers for each channel in which
1307 * the residue vectors should be stored. A pointer of NULL
1308 * indicates that the given channel should not be decoded (the
1309 * "do not decode" state referred to by the specification).
1310 * do_partition: Function pointer for the partition decoding function
1311 * to use.
1312 */
1313 static void decode_residue_common(
1314 stb_vorbis *handle, const Residue *res, int type, int n, int ch,
1315 float *residue_buffers[],
1316 bool (*do_partition)(
1317 stb_vorbis *handle, const Codebook *book, int size, int offset,
1318 float **outputs, int n, int ch))
1319 {
1320 const Codebook *classbook = &handle->codebooks[res->classbook];
1321 const int classwords = classbook->dimensions;
1322 (18/18) const int actual_size = (type == 2 ? n*ch : n);
1323 const int n_to_read =
1324 min(res->end, actual_size) - min(res->begin, actual_size);
1325 const int partitions_to_read = n_to_read / res->part_size;
1326 (18/18) const int ch_to_read = (type == 2 ? 1 : ch);
1327
1328 (18/18) if (n_to_read <= 0) {
1329 return;
1330 }
1331
1332 (18/18) if (handle->divides_in_residue) {
1333
1334 int **classifications = handle->classifications;
1335
1336 (18/18) for (int pass = 0; pass < 8; pass++) {
1337 int partition_count = 0;
1338 (18/18) while (partition_count < partitions_to_read) {
1339 (18/18) if (pass == 0) {
1340 (18/18) for (int j = 0; j < ch_to_read; j++) {
1341 (18/18) if (residue_buffers[j]) {
1342 int temp =
1343 codebook_decode_scalar(handle, classbook);
1344 (18/18) if (UNLIKELY(temp == EOP)) {
1345 return;
1346 }
1347 (18/18) for (int i = classwords-1; i >= 0; i--) {
1348 classifications[j][i+partition_count] =
1349 temp % res->classifications;
1350 temp /= res->classifications;
1351 }
1352 }
1353 }
1354 }
1355 for (int i = 0;
1356 (36/36) i < classwords && partition_count < partitions_to_read;
1357 i++, partition_count++)
1358 {
1359 (18/18) for (int j = 0; j < ch_to_read; j++) {
1360 (18/18) if (residue_buffers[j]) {
1361 const int vqclass =
1362 classifications[j][partition_count];
1363 (18/18) if (!decode_residue_partition(
1364 handle, res, pass, partition_count, vqclass,
1365 &residue_buffers[j], n, ch, do_partition)) {
1366 return;
1367 }
1368 }
1369 }
1370 }
1371 } // while (partition_count < partitions_to_read)
1372 } // for (int pass = 0; pass < 8; pass++)
1373
1374 } else { // !handle->divides_in_residue
1375
1376 uint8_t ***part_classdata = handle->classifications;
1377
1378 (18/18) for (int pass = 0; pass < 8; pass++) {
1379 int partition_count = 0;
1380 int class_set = 0;
1381 (18/18) while (partition_count < partitions_to_read) {
1382 (18/18) if (pass == 0) {
1383 (18/18) for (int j = 0; j < ch_to_read; j++) {
1384 (18/18) if (residue_buffers[j]) {
1385 const int temp =
1386 codebook_decode_scalar(handle, classbook);
1387 (18/18) if (UNLIKELY(temp == EOP)) {
1388 return;
1389 }
1390 part_classdata[j][class_set] =
1391 res->classdata[temp];
1392 }
1393 }
1394 }
1395 for (int i = 0;
1396 (36/36) i < classwords && partition_count < partitions_to_read;
1397 i++, partition_count++)
1398 {
1399 (18/18) for (int j = 0; j < ch_to_read; j++) {
1400 (18/18) if (residue_buffers[j]) {
1401 const int vqclass =
1402 part_classdata[j][class_set][i];
1403 (18/18) if (!decode_residue_partition(
1404 handle, res, pass, partition_count, vqclass,
1405 &residue_buffers[j], n, ch, do_partition)) {
1406 return;
1407 }
1408 }
1409 }
1410 }
1411 class_set++;
1412 } // while (partition_count < partitions_to_read)
1413 } // for (int pass = 0; pass < 8; pass++)
1414
1415 } // if (handle->divides_in_residue)
1416 }
1417
1418 /*-----------------------------------------------------------------------*/
1419
1420 /**
1421 * decode_residue: Decode the residue vectors for the current frame.
1422 *
1423 * [Parameters]
1424 * handle: Stream handle.
1425 * residue_index: Index of residue configuration to use.
1426 * n: Length of residue vectors (half of window size).
1427 * ch: Number of channels of residue data to decode.
1428 * residue_buffers: Pointers to the buffers for each channel in which
1429 * the residue vectors should be stored. A pointer of NULL
1430 * indicates that the given channel should not be decoded (the
1431 * "do not decode" state referred to by the specification).
1432 */
1433 static void decode_residue(stb_vorbis *handle, int residue_index, int n,
1434 int ch, float *residue_buffers[])
1435 {
1436 Residue *res = &handle->residue_config[residue_index];
1437 const int type = handle->residue_types[residue_index];
1438
1439 (18/18) for (int i = 0; i < ch; i++) {
1440 (18/18) if (residue_buffers[i]) {
1441 memset(residue_buffers[i], 0, sizeof(*residue_buffers[i]) * n);
1442 }
1443 }
1444
1445 /* For residue type 2, if there are multiple channels, we need to
1446 * deinterleave the data after decoding it. */
1447 (36/36) if (type == 2 && ch > 1) {
1448 (36/36) if (ch == 2 && !handle->divides_in_residue
1449 (27/36) && residue_buffers[0] && residue_buffers[1]) {
1450 /* We have slightly optimized handling for this case. */
1451 decode_residue_common(
1452 handle, res, type, n, ch, residue_buffers,
1453 decode_residue_partition_2_2ch);
1454 } else {
1455 decode_residue_common(
1456 handle, res, type, n, ch, residue_buffers,
1457 decode_residue_partition_2);
1458 }
1459 } else {
1460 (18/18) decode_residue_common(
1461 handle, res, type, n, ch, residue_buffers,
1462 type==0 ? decode_residue_partition_0 : decode_residue_partition_1);
1463 }
1464 }
1465
1466 /*************************************************************************/
1467 /************************ Inverse MDCT processing ************************/
1468 /*************************************************************************/
1469
1470 /**
1471 * imdct_setup_step1: Setup and step 1 of the IMDCT.
1472 *
1473 * [Parameters]
1474 * n: Window size.
1475 * A: Twiddle factor A.
1476 * Y: Input buffer (length n/2).
1477 * v: Output buffer (length n/2). Must be distinct from the input buffer.
1478 */
1479 static void imdct_setup_step1(const unsigned int n, const float *A,
1480 const float *Y, float *v)
1481 {
1482 #if 0 // Roughly literal implementation.
1483
1484 for (unsigned int i = 0; i < n/4; i += 2) {
1485 v[(n/2)-i-1] = Y[i*2]*A[i+0] - Y[(i+1)*2]*A[i+1];
1486 v[(n/2)-i-2] = Y[i*2]*A[i+1] + Y[(i+1)*2]*A[i+0];
1487 }
1488 for (unsigned int i = 0; i < n/4; i += 2) {
1489 v[(n/4)-i-1] =
1490 -Y[(n/2-1)-i*2]*A[(n/4)+i+0] - -Y[(n/2-1)-(i+1)*2]*A[(n/4)+i+1];
1491 v[(n/4)-i-2] =
1492 -Y[(n/2-1)-i*2]*A[(n/4)+i+1] + -Y[(n/2-1)-(i+1)*2]*A[(n/4)+i+0];
1493 }
1494
1495 #else // Optimized implementations.
1496
1497 #if defined(ENABLE_ASM_ARM_NEON)
1498 const int step = 4;
1499 const uint32x4_t sign_1010 = (uint32x4_t)vdupq_n_u64(UINT64_C(1)<<63);
1500 #elif defined(ENABLE_ASM_X86_AVX2)
1501 const int step = 8;
1502 const __m256i sign_1010 = _mm256_set1_epi64x(UINT64_C(1)<<63);
1503 const __m256i sign_1111 = _mm256_set1_epi32(UINT32_C(1)<<31);
1504 #elif defined(ENABLE_ASM_X86_SSE2)
1505 const int step = 4;
1506 const __m128i sign_1010 = _mm_set1_epi64x(UINT64_C(1)<<63);
1507 const __m128i sign_1111 = _mm_set1_epi32(UINT32_C(1)<<31);
1508 #else
1509 const int step = 4;
1510 #endif
1511 ASSERT((n/4) % step == 0);
1512
1513 v += n/2;
1514 (18/18) for (int i = 0, j = -step; i < (int)(n/4); i += step, j -= step) {
1515 #if defined(ENABLE_ASM_ARM_NEON)
1516 const float32x4x2_t Y_all = vld2q_f32(&Y[i*2]);
1517 const float32x4_t Y_i2 = vswizzleq_zzxx_f32(Y_all.val[0]);
1518 const float32x4_t Y_i3 = vswizzleq_wwyy_f32(Y_all.val[0]);
1519 const float32x4_t A_i = vld1q_f32(&A[i]);
1520 const float32x4_t A_i3 = vswizzleq_wzyx_f32(A_i);
1521 const float32x4_t A_i2 = vswizzleq_zwxy_f32(A_i);
1522 vst1q_f32(&v[j], vaddq_f32(vmulq_f32(Y_i2, A_i3),
1523 veorq_f32(sign_1010,
1524 vmulq_f32(Y_i3, A_i2))));
1525 #elif defined(ENABLE_ASM_X86_AVX2)
1526 /* AVX2 doesn't include an instruction allowing us to mix two
1527 * registers while also shuffling values between 128-bit lanes
1528 * of each register, so we need a couple of extra temporaries to
1529 * get to our desired element order. */
1530 const __m256 Y_lo = _mm256_moveldup_ps(_mm256_load_ps(&Y[(i+0)*2]));
1531 const __m256 Y_hi = _mm256_moveldup_ps(_mm256_load_ps(&Y[(i+4)*2]));
1532 const __m256 tY_lo = _mm256_permute4x64_ps(Y_lo, _MM_SHUFFLE(0,2,1,3));
1533 const __m256 tY_hi = _mm256_permute4x64_ps(Y_hi, _MM_SHUFFLE(0,2,1,3));
1534 const __m256 Y_i2 = _mm256_permute2f128_ps(tY_lo, tY_hi, 0x13);
1535 const __m256 Y_i3 = _mm256_permute2f128_ps(tY_lo, tY_hi, 0x02);
1536 /* Again, we need an extra operation to swap between 128-bit lanes.
1537 * The use of permute4x64 here should help the compiler optimize to
1538 * a single vpermpd on a memory operand. */
1539 const __m256 A_i = _mm256_permute4x64_ps(_mm256_load_ps(&A[i]),
1540 _MM_SHUFFLE(1,0,3,2));
1541 const __m256 A_i3 = _mm256_permute_ps(A_i, _MM_SHUFFLE(0,1,2,3));
1542 const __m256 A_i2 = _mm256_permute_ps(A_i, _MM_SHUFFLE(1,0,3,2));
1543 _mm256_store_ps(&v[j], _mm256_add_ps(
1544 _mm256_mul_ps(Y_i2, A_i3),
1545 _mm256_xor_sign(sign_1010,
1546 _mm256_mul_ps(Y_i3, A_i2))));
1547 #elif defined(ENABLE_ASM_X86_SSE2)
1548 const __m128 Y_lo = _mm_load_ps(&Y[(i+0)*2]);
1549 const __m128 Y_hi = _mm_load_ps(&Y[(i+2)*2]);
1550 const __m128 Y_i2 = _mm_shuffle_ps(Y_hi, Y_lo, _MM_SHUFFLE(0,0,0,0));
1551 const __m128 Y_i3 = _mm_shuffle_ps(Y_hi, Y_lo, _MM_SHUFFLE(2,2,2,2));
1552 const __m128 A_i = _mm_load_ps(&A[i]);
1553 const __m128 A_i3 = _mm_shuffle_ps(A_i, A_i, _MM_SHUFFLE(0,1,2,3));
1554 const __m128 A_i2 = _mm_shuffle_ps(A_i, A_i, _MM_SHUFFLE(1,0,3,2));
1555 _mm_store_ps(&v[j], _mm_add_ps(_mm_mul_ps(Y_i2, A_i3),
1556 _mm_xor_sign(sign_1010,
1557 _mm_mul_ps(Y_i3, A_i2))));
1558 #else
1559 v[j+3] = Y[(i+0)*2]*A[i+0] - Y[(i+1)*2]*A[i+1];
1560 v[j+2] = Y[(i+0)*2]*A[i+1] + Y[(i+1)*2]*A[i+0];
1561 v[j+1] = Y[(i+2)*2]*A[i+2] - Y[(i+3)*2]*A[i+3];
1562 v[j+0] = Y[(i+2)*2]*A[i+3] + Y[(i+3)*2]*A[i+2];
1563 #endif
1564 }
1565
1566 Y += n/2;
1567 A += n/4;
1568 v -= n/4;
1569 (18/18) for (int i = 0, j = -step; i < (int)(n/4); i += step, j -= step) {
1570 #if defined(ENABLE_ASM_ARM_NEON)
1571 const float32x4x2_t Y_all = vld2q_f32(&Y[j*2]);
1572 const float32x4_t Y_j1 = vnegq_f32(vswizzleq_yyww_f32(Y_all.val[1]));
1573 const float32x4_t Y_j0 = vnegq_f32(vswizzleq_xxzz_f32(Y_all.val[1]));
1574 const float32x4_t A_i = vld1q_f32(&A[i]);
1575 const float32x4_t A_i3 = vswizzleq_wzyx_f32(A_i);
1576 const float32x4_t A_i2 = vswizzleq_zwxy_f32(A_i);
1577 vst1q_f32(&v[j], vaddq_f32(vmulq_f32(Y_j1, A_i3),
1578 veorq_f32(sign_1010,
1579 vmulq_f32(Y_j0, A_i2))));
1580 #elif defined(ENABLE_ASM_X86_AVX2)
1581 const __m256 Y_lo =
1582 _mm256_xor_sign(sign_1111,
1583 _mm256_movehdup_ps(_mm256_load_ps(&Y[(j+0)*2])));
1584 const __m256 Y_hi =
1585 _mm256_xor_sign(sign_1111,
1586 _mm256_movehdup_ps(_mm256_load_ps(&Y[(j+4)*2])));
1587 const __m256 tY_lo = _mm256_permute4x64_ps(Y_lo, _MM_SHUFFLE(2,0,3,1));
1588 const __m256 tY_hi = _mm256_permute4x64_ps(Y_hi, _MM_SHUFFLE(2,0,3,1));
1589 const __m256 Y_j1 = _mm256_permute2f128_ps(tY_lo, tY_hi, 0x20);
1590 const __m256 Y_j0 = _mm256_permute2f128_ps(tY_lo, tY_hi, 0x31);
1591 const __m256 A_i = _mm256_permute4x64_ps(_mm256_load_ps(&A[i]),
1592 _MM_SHUFFLE(1,0,3,2));
1593 const __m256 A_i3 = _mm256_permute_ps(A_i, _MM_SHUFFLE(0,1,2,3));
1594 const __m256 A_i2 = _mm256_permute_ps(A_i, _MM_SHUFFLE(1,0,3,2));
1595 _mm256_store_ps(&v[j], _mm256_add_ps(
1596 _mm256_mul_ps(Y_j1, A_i3),
1597 _mm256_xor_sign(sign_1010,
1598 _mm256_mul_ps(Y_j0, A_i2))));
1599 #elif defined(ENABLE_ASM_X86_SSE2)
1600 const __m128 Y_lo = _mm_xor_sign(sign_1111, _mm_load_ps(&Y[(j+0)*2]));
1601 const __m128 Y_hi = _mm_xor_sign(sign_1111, _mm_load_ps(&Y[(j+2)*2]));
1602 const __m128 Y_j1 = _mm_shuffle_ps(Y_lo, Y_hi, _MM_SHUFFLE(3,3,3,3));
1603 const __m128 Y_j0 = _mm_shuffle_ps(Y_lo, Y_hi, _MM_SHUFFLE(1,1,1,1));
1604 const __m128 A_i = _mm_load_ps(&A[i]);
1605 const __m128 A_i3 = _mm_shuffle_ps(A_i, A_i, _MM_SHUFFLE(0,1,2,3));
1606 const __m128 A_i2 = _mm_shuffle_ps(A_i, A_i, _MM_SHUFFLE(1,0,3,2));
1607 _mm_store_ps(&v[j], _mm_add_ps(_mm_mul_ps(Y_j1, A_i3),
1608 _mm_xor_sign(sign_1010,
1609 _mm_mul_ps(Y_j0, A_i2))));
1610 #else
1611 v[j+3] = -Y[(j+3)*2+1]*A[i+0] - -Y[(j+2)*2+1]*A[i+1];
1612 v[j+2] = -Y[(j+3)*2+1]*A[i+1] + -Y[(j+2)*2+1]*A[i+0];
1613 v[j+1] = -Y[(j+1)*2+1]*A[i+2] - -Y[(j+0)*2+1]*A[i+3];
1614 v[j+0] = -Y[(j+1)*2+1]*A[i+3] + -Y[(j+0)*2+1]*A[i+2];
1615 #endif
1616 }
1617
1618 #endif // Literal vs. optimized implementation.
1619 }
1620
1621 /*-----------------------------------------------------------------------*/
1622
1623 /**
1624 * imdct_step2: Step 2 of the IMDCT.
1625 *
1626 * [Parameters]
1627 * n: Window size.
1628 * A: Twiddle factor A.
1629 * v: Input buffer (length n/2).
1630 * w: Output buffer (length n/2). May be the same as the input buffer.
1631 */
1632 static void imdct_step2(const unsigned int n, const float *A,
1633 const float *v, float *w)
1634 {
1635 const float *v0 = &v[(n/4)];
1636 const float *v1 = &v[0];
1637 float *w0 = &w[(n/4)];
1638 float *w1 = &w[0];
1639
1640 #if defined(ENABLE_ASM_ARM_NEON)
1641 const int step = 4;
1642 const uint32x4_t sign_1010 = (uint32x4_t)vdupq_n_u64(UINT64_C(1)<<63);
1643 #elif defined(ENABLE_ASM_X86_AVX2)
1644 const int step = 8;
1645 const __m256i sign_1010 = _mm256_set1_epi64x(UINT64_C(1)<<63);
1646 const __m256i permute = _mm256_set_epi32(1, 1, 3, 3, 0, 0, 2, 2);
1647 #elif defined(ENABLE_ASM_X86_SSE2)
1648 const int step = 4;
1649 const __m128i sign_1010 = _mm_set1_epi64x(UINT64_C(1)<<63);
1650 #else
1651 const int step = 4;
1652 #endif
1653 ASSERT((n/2) % (2*step) == 0);
1654
1655 (18/18) for (int i = 0, j = n/2 - 2*step; j >= 0; i += step, j -= 2*step) {
1656
1657 #if defined(ENABLE_ASM_ARM_NEON)
1658 const float32x4_t v0_i = vld1q_f32(&v0[i]);
1659 const float32x4_t v1_i = vld1q_f32(&v1[i]);
1660 const float32x4x2_t A_all = vld2q_f32(&A[j]);
1661 vst1q_f32(&w0[i], vaddq_f32(v0_i, v1_i));
1662 const float32x4_t diff = vsubq_f32(v0_i, v1_i);
1663 const float32x4_t diff2 = vswizzleq_yxwz_f32(diff);
1664 const uint32x4_t A_sel = {~0U, 0, ~0U, 0};
1665 const float32x4_t A_40 = vswizzleq_zzxx_f32(vbslq_f32(
1666 A_sel, A_all.val[0],
1667 (float32x4_t)vshlq_n_u64((uint64x2_t)A_all.val[0], 32)));
1668 const float32x4_t A_51 = vswizzleq_zzxx_f32(vbslq_f32(
1669 A_sel, A_all.val[1],
1670 (float32x4_t)vshlq_n_u64((uint64x2_t)A_all.val[1], 32)));
1671 vst1q_f32(&w1[i], vaddq_f32(vmulq_f32(diff, A_40),
1672 veorq_f32(sign_1010,
1673 vmulq_f32(diff2, A_51))));
1674
1675 #elif defined(ENABLE_ASM_X86_AVX2)
1676 const __m256 v0_i = _mm256_load_ps(&v0[i]);
1677 const __m256 v1_i = _mm256_load_ps(&v1[i]);
1678 const __m256 A_0 = _mm256_load_ps(&A[j+0]);
1679 const __m256 A_8 = _mm256_load_ps(&A[j+8]);
1680 _mm256_store_ps(&w0[i], _mm256_add_ps(v0_i, v1_i));
1681 const __m256 diff = _mm256_sub_ps(v0_i, v1_i);
1682 const __m256 diff2 = _mm256_permute_ps(diff, _MM_SHUFFLE(2,3,0,1));
1683 const __m256 A_lo = _mm256_permutevar_ps(
1684 _mm256_permute4x64_ps(A_0, _MM_SHUFFLE(2,0,2,0)), permute);
1685 const __m256 A_hi = _mm256_permutevar_ps(
1686 _mm256_permute4x64_ps(A_8, _MM_SHUFFLE(2,0,2,0)), permute);
1687 const __m256 A_40 = _mm256_permute2f128_ps(A_lo, A_hi, 0x02);
1688 const __m256 A_51 = _mm256_permute2f128_ps(A_lo, A_hi, 0x13);
1689 _mm256_store_ps(&w1[i], _mm256_add_ps(
1690 _mm256_mul_ps(diff, A_40),
1691 _mm256_xor_sign(sign_1010,
1692 _mm256_mul_ps(diff2, A_51))));
1693
1694 #elif defined(ENABLE_ASM_X86_SSE2)
1695 const __m128 v0_i = _mm_load_ps(&v0[i]);
1696 const __m128 v1_i = _mm_load_ps(&v1[i]);
1697 const __m128 A_0 = _mm_load_ps(&A[j+0]);
1698 const __m128 A_4 = _mm_load_ps(&A[j+4]);
1699 _mm_store_ps(&w0[i], _mm_add_ps(v0_i, v1_i));
1700 const __m128 diff = _mm_sub_ps(v0_i, v1_i);
1701 const __m128 diff2 = _mm_shuffle_ps(diff, diff, _MM_SHUFFLE(2,3,0,1));
1702 const __m128 A_40 = _mm_shuffle_ps(A_4, A_0, _MM_SHUFFLE(0,0,0,0));
1703 const __m128 A_51 = _mm_shuffle_ps(A_4, A_0, _MM_SHUFFLE(1,1,1,1));
1704 _mm_store_ps(&w1[i], _mm_add_ps(_mm_mul_ps(diff, A_40),
1705 _mm_xor_sign(sign_1010,
1706 _mm_mul_ps(diff2, A_51))));
1707
1708 #else
1709 float v40_20, v41_21;
1710
1711 v41_21 = v0[i+1] - v1[i+1];
1712 v40_20 = v0[i+0] - v1[i+0];
1713 w0[i+1] = v0[i+1] + v1[i+1];
1714 w0[i+0] = v0[i+0] + v1[i+0];
1715 w1[i+1] = v41_21*A[j+4] - v40_20*A[j+5];
1716 w1[i+0] = v40_20*A[j+4] + v41_21*A[j+5];
1717
1718 v41_21 = v0[i+3] - v1[i+3];
1719 v40_20 = v0[i+2] - v1[i+2];
1720 w0[i+3] = v0[i+3] + v1[i+3];
1721 w0[i+2] = v0[i+2] + v1[i+2];
1722 w1[i+3] = v41_21*A[j+0] - v40_20*A[j+1];
1723 w1[i+2] = v40_20*A[j+0] + v41_21*A[j+1];
1724
1725 #endif // ENABLE_ASM_*
1726
1727 }
1728 }
1729
1730 /*-----------------------------------------------------------------------*/
1731
1732 /**
1733 * imdct_step3_inner_r_loop: Step 3 of the IMDCT for a single iteration
1734 * of the l and s loops.
1735 *
1736 * [Parameters]
1737 * lim: Iteration limit for the r loop.
1738 * A: Twiddle factor A.
1739 * e: Input/output buffer (length n/2, modified in place).
1740 * i_off: Initial offset, calculated as (n/2 - k0*s).
1741 * k0: Constant k0.
1742 * k1: Constant k1.
1743 */
1744 static void imdct_step3_inner_r_loop(const unsigned int lim, const float *A,
1745 float *e, int i_off, int k0, int k1)
1746 {
1747 float *e0 = e + i_off;
1748 float *e2 = e0 - k0/2;
1749
1750 #if defined(ENABLE_ASM_ARM_NEON)
1751 const uint32x4_t sign_1010 = (uint32x4_t)vdupq_n_u64(UINT64_C(1)<<63);
1752 #elif defined(ENABLE_ASM_X86_AVX2)
1753 const __m256i sign_1010 = _mm256_set1_epi64x(UINT64_C(1)<<63);
1754 #elif defined(ENABLE_ASM_X86_SSE2)
1755 const __m128i sign_1010 = _mm_set1_epi64x(UINT64_C(1)<<63);
1756 #endif
1757
1758 (18/18) for (int i = lim/4; i > 0; --i, e0 -= 8, e2 -= 8) {
1759
1760 #if defined(ENABLE_ASM_ARM_NEON)
1761 const float32x4_t e0_4 = vld1q_f32(&e0[-4]);
1762 const float32x4_t e2_4 = vld1q_f32(&e2[-4]);
1763 const float32x4_t e0_8 = vld1q_f32(&e0[-8]);
1764 const float32x4_t e2_8 = vld1q_f32(&e2[-8]);
1765 const float32x4x2_t A_0k =
1766 vuzpq_f32(vld1q_f32(&A[0]), vld1q_f32(&A[k1]));
1767 const float32x4x2_t A_2k =
1768 vuzpq_f32(vld1q_f32(&A[2*k1]), vld1q_f32(&A[3*k1]));
1769 vst1q_f32(&e0[-4], vaddq_f32(e0_4, e2_4));
1770 vst1q_f32(&e0[-8], vaddq_f32(e0_8, e2_8));
1771 const float32x4_t diff_4 = vsubq_f32(e0_4, e2_4);
1772 const float32x4_t diff_8 = vsubq_f32(e0_8, e2_8);
1773 const float32x4_t diff2_4 = vswizzleq_yxwz_f32(diff_4);
1774 const float32x4_t diff2_8 = vswizzleq_yxwz_f32(diff_8);
1775 const uint32x4_t A_sel = {~0U, 0, ~0U, 0};
1776 const float32x4_t A_0 = vswizzleq_zzxx_f32(vbslq_f32(
1777 A_sel, A_0k.val[0],
1778 (float32x4_t)vshlq_n_u64((uint64x2_t)A_0k.val[0], 32)));
1779 const float32x4_t A_1 = vswizzleq_zzxx_f32(vbslq_f32(
1780 A_sel, A_0k.val[1],
1781 (float32x4_t)vshlq_n_u64((uint64x2_t)A_0k.val[1], 32)));
1782 const float32x4_t A_2 = vswizzleq_zzxx_f32(vbslq_f32(
1783 A_sel, A_2k.val[0],
1784 (float32x4_t)vshlq_n_u64((uint64x2_t)A_2k.val[0], 32)));
1785 const float32x4_t A_3 = vswizzleq_zzxx_f32(vbslq_f32(
1786 A_sel, A_2k.val[1],
1787 (float32x4_t)vshlq_n_u64((uint64x2_t)A_2k.val[1], 32)));
1788 vst1q_f32(&e2[-4], vaddq_f32(vmulq_f32(diff_4, A_0),
1789 veorq_f32(sign_1010,
1790 vmulq_f32(diff2_4, A_1))));
1791 vst1q_f32(&e2[-8], vaddq_f32(vmulq_f32(diff_8, A_2),
1792 veorq_f32(sign_1010,
1793 vmulq_f32(diff2_8, A_3))));
1794 A += 4*k1;
1795
1796 #elif defined(ENABLE_ASM_X86_AVX2)
1797 const __m256 e0_8 = _mm256_load_ps(&e0[-8]);
1798 const __m256 e2_8 = _mm256_load_ps(&e2[-8]);
1799 const __m128 A_0k = _mm_load_ps(A);
1800 const __m128 A_1k = _mm_load_ps(&A[k1]);
1801 const __m128 A_2k = _mm_load_ps(&A[2*k1]);
1802 const __m128 A_3k = _mm_load_ps(&A[3*k1]);
1803 _mm256_store_ps(&e0[-8], _mm256_add_ps(e0_8, e2_8));
1804 const __m256 diff = _mm256_sub_ps(e0_8, e2_8);
1805 const __m256 diff2 = _mm256_permute_ps(diff, _MM_SHUFFLE(2,3,0,1));
1806 const __m128 A_0 = _mm_shuffle_ps(A_1k, A_0k, _MM_SHUFFLE(0,0,0,0));
1807 const __m128 A_1 = _mm_shuffle_ps(A_1k, A_0k, _MM_SHUFFLE(1,1,1,1));
1808 const __m128 A_2 = _mm_shuffle_ps(A_3k, A_2k, _MM_SHUFFLE(0,0,0,0));
1809 const __m128 A_3 = _mm_shuffle_ps(A_3k, A_2k, _MM_SHUFFLE(1,1,1,1));
1810 const __m256 A_20 = _mm256_set_m128(A_0, A_2);
1811 const __m256 A_31 = _mm256_set_m128(A_1, A_3);
1812 _mm256_store_ps(&e2[-8], _mm256_add_ps(
1813 _mm256_mul_ps(diff, A_20),
1814 _mm256_xor_sign(sign_1010,
1815 _mm256_mul_ps(diff2, A_31))));
1816 A += 4*k1;
1817
1818 #elif defined(ENABLE_ASM_X86_SSE2)
1819 const __m128 e0_4 = _mm_load_ps(&e0[-4]);
1820 const __m128 e2_4 = _mm_load_ps(&e2[-4]);
1821 const __m128 e0_8 = _mm_load_ps(&e0[-8]);
1822 const __m128 e2_8 = _mm_load_ps(&e2[-8]);
1823 const __m128 A_0k = _mm_load_ps(A);
1824 const __m128 A_1k = _mm_load_ps(&A[k1]);
1825 const __m128 A_2k = _mm_load_ps(&A[2*k1]);
1826 const __m128 A_3k = _mm_load_ps(&A[3*k1]);
1827 _mm_store_ps(&e0[-4], _mm_add_ps(e0_4, e2_4));
1828 _mm_store_ps(&e0[-8], _mm_add_ps(e0_8, e2_8));
1829 const __m128 diff_4 = _mm_sub_ps(e0_4, e2_4);
1830 const __m128 diff_8 = _mm_sub_ps(e0_8, e2_8);
1831 const __m128 diff2_4 =
1832 _mm_shuffle_ps(diff_4, diff_4, _MM_SHUFFLE(2,3,0,1));
1833 const __m128 diff2_8 =
1834 _mm_shuffle_ps(diff_8, diff_8, _MM_SHUFFLE(2,3,0,1));
1835 const __m128 A_0 = _mm_shuffle_ps(A_1k, A_0k, _MM_SHUFFLE(0,0,0,0));
1836 const __m128 A_1 = _mm_shuffle_ps(A_1k, A_0k, _MM_SHUFFLE(1,1,1,1));
1837 const __m128 A_2 = _mm_shuffle_ps(A_3k, A_2k, _MM_SHUFFLE(0,0,0,0));
1838 const __m128 A_3 = _mm_shuffle_ps(A_3k, A_2k, _MM_SHUFFLE(1,1,1,1));
1839 _mm_store_ps(&e2[-4], _mm_add_ps(_mm_mul_ps(diff_4, A_0),
1840 _mm_xor_sign(sign_1010,
1841 _mm_mul_ps(diff2_4, A_1))));
1842 _mm_store_ps(&e2[-8], _mm_add_ps(_mm_mul_ps(diff_8, A_2),
1843 _mm_xor_sign(sign_1010,
1844 _mm_mul_ps(diff2_8, A_3))));
1845 A += 4*k1;
1846
1847 #else
1848 float k00_20, k01_21;
1849
1850 k00_20 = e0[-1] - e2[-1];
1851 k01_21 = e0[-2] - e2[-2];
1852 e0[-1] = e0[-1] + e2[-1];
1853 e0[-2] = e0[-2] + e2[-2];
1854 e2[-1] = k00_20 * A[0] - k01_21 * A[1];
1855 e2[-2] = k01_21 * A[0] + k00_20 * A[1];
1856 A += k1;
1857
1858 k00_20 = e0[-3] - e2[-3];
1859 k01_21 = e0[-4] - e2[-4];
1860 e0[-3] = e0[-3] + e2[-3];
1861 e0[-4] = e0[-4] + e2[-4];
1862 e2[-3] = k00_20 * A[0] - k01_21 * A[1];
1863 e2[-4] = k01_21 * A[0] + k00_20 * A[1];
1864 A += k1;
1865
1866 k00_20 = e0[-5] - e2[-5];
1867 k01_21 = e0[-6] - e2[-6];
1868 e0[-5] = e0[-5] + e2[-5];
1869 e0[-6] = e0[-6] + e2[-6];
1870 e2[-5] = k00_20 * A[0] - k01_21 * A[1];
1871 e2[-6] = k01_21 * A[0] + k00_20 * A[1];
1872 A += k1;
1873
1874 k00_20 = e0[-7] - e2[-7];
1875 k01_21 = e0[-8] - e2[-8];
1876 e0[-7] = e0[-7] + e2[-7];
1877 e0[-8] = e0[-8] + e2[-8];
1878 e2[-7] = k00_20 * A[0] - k01_21 * A[1];
1879 e2[-8] = k01_21 * A[0] + k00_20 * A[1];
1880 A += k1;
1881
1882 #endif // ENABLE_ASM_*
1883
1884 }
1885 }
1886
1887 /*-----------------------------------------------------------------------*/
1888
1889 /**
1890 * imdct_step3_inner_s_loop: Step 3 of the IMDCT for a single iteration
1891 * of the l and r loops.
1892 *
1893 * [Parameters]
1894 * lim: Iteration limit for the s loop.
1895 * A: Twiddle factor A.
1896 * e: Input/output buffer (length n/2, modified in place).
1897 * i_off: Initial offset, calculated as (n/2 - k0*s).
1898 * k0: Constant k0.
1899 * k1: Constant k1.
1900 */
1901 static void imdct_step3_inner_s_loop(const unsigned int lim, const float *A,
1902 float *e, int i_off, int k0, int k1)
1903 {
1904 const float A0 = A[0];
1905 const float A1 = A[0+1];
1906 const float A2 = A[0+k1];
1907 const float A3 = A[0+k1+1];
1908 const float A4 = A[0+k1*2+0];
1909 const float A5 = A[0+k1*2+1];
1910 const float A6 = A[0+k1*3+0];
1911 const float A7 = A[0+k1*3+1];
1912
1913 float *e0 = e + i_off;
1914 float *e2 = e0 - k0/2;
1915
1916 (18/18) for (int i = lim; i > 0; i--, e0 -= k0, e2 -= k0) {
1917
1918 #if defined(ENABLE_ASM_ARM_NEON)
1919 const float32x4_t A_20 = {A2, A2, A0, A0};
1920 const float32x4_t A_31 = {A3, -A3, A1, -A1};
1921 const float32x4_t A_64 = {A6, A6, A4, A4};
1922 const float32x4_t A_75 = {A7, -A7, A5, -A5};
1923 const float32x4_t e0_4 = vld1q_f32(&e0[-4]);
1924 const float32x4_t e2_4 = vld1q_f32(&e2[-4]);
1925 const float32x4_t e0_8 = vld1q_f32(&e0[-8]);
1926 const float32x4_t e2_8 = vld1q_f32(&e2[-8]);
1927 vst1q_f32(&e0[-4], vaddq_f32(e0_4, e2_4));
1928 vst1q_f32(&e0[-8], vaddq_f32(e0_8, e2_8));
1929 const float32x4_t diff_4 = vsubq_f32(e0_4, e2_4);
1930 const float32x4_t diff_8 = vsubq_f32(e0_8, e2_8);
1931 const float32x4_t diff2_4 = vswizzleq_yxwz_f32(diff_4);
1932 const float32x4_t diff2_8 = vswizzleq_yxwz_f32(diff_8);
1933 vst1q_f32(&e2[-4], vaddq_f32(vmulq_f32(diff_4, A_20),
1934 vmulq_f32(diff2_4, A_31)));
1935 vst1q_f32(&e2[-8], vaddq_f32(vmulq_f32(diff_8, A_64),
1936 vmulq_f32(diff2_8, A_75)));
1937
1938 #elif defined(ENABLE_ASM_X86_AVX2)
1939 const __m256 A_6420 = _mm256_set_ps(A0, A0, A2, A2, A4, A4, A6, A6);
1940 const __m256 A_7531 = _mm256_set_ps(-A1, A1, -A3, A3, -A5, A5, -A7, A7);
1941 const __m256 e0_8 = _mm256_load_ps(&e0[-8]);
1942 const __m256 e2_8 = _mm256_load_ps(&e2[-8]);
1943 _mm256_store_ps(&e0[-8], _mm256_add_ps(e0_8, e2_8));
1944 const __m256 diff = _mm256_sub_ps(e0_8, e2_8);
1945 const __m256 diff2 = _mm256_permute_ps(diff, _MM_SHUFFLE(2,3,0,1));
1946 _mm256_store_ps(&e2[-8], _mm256_fmadd_ps(
1947 diff, A_6420,
1948 _mm256_mul_ps(diff2, A_7531)));
1949
1950 #elif defined(ENABLE_ASM_X86_SSE2)
1951 const __m128 A_20 = _mm_set_ps(A0, A0, A2, A2);
1952 const __m128 A_31 = _mm_set_ps(-A1, A1, -A3, A3);
1953 const __m128 A_64 = _mm_set_ps(A4, A4, A6, A6);
1954 const __m128 A_75 = _mm_set_ps(-A5, A5, -A7, A7);
1955 const __m128 e0_4 = _mm_load_ps(&e0[-4]);
1956 const __m128 e2_4 = _mm_load_ps(&e2[-4]);
1957 const __m128 e0_8 = _mm_load_ps(&e0[-8]);
1958 const __m128 e2_8 = _mm_load_ps(&e2[-8]);
1959 _mm_store_ps(&e0[-4], _mm_add_ps(e0_4, e2_4));
1960 _mm_store_ps(&e0[-8], _mm_add_ps(e0_8, e2_8));
1961 const __m128 diff_4 = _mm_sub_ps(e0_4, e2_4);
1962 const __m128 diff_8 = _mm_sub_ps(e0_8, e2_8);
1963 const __m128 diff2_4 =
1964 _mm_shuffle_ps(diff_4, diff_4, _MM_SHUFFLE(2,3,0,1));
1965 const __m128 diff2_8 =
1966 _mm_shuffle_ps(diff_8, diff_8, _MM_SHUFFLE(2,3,0,1));
1967 _mm_store_ps(&e2[-4], _mm_add_ps(_mm_mul_ps(diff_4, A_20),
1968 _mm_mul_ps(diff2_4, A_31)));
1969 _mm_store_ps(&e2[-8], _mm_add_ps(_mm_mul_ps(diff_8, A_64),
1970 _mm_mul_ps(diff2_8, A_75)));
1971
1972 #else
1973 float k00, k11;
1974
1975 k00 = e0[-1] - e2[-1];
1976 k11 = e0[-2] - e2[-2];
1977 e0[-1] = e0[-1] + e2[-1];
1978 e0[-2] = e0[-2] + e2[-2];
1979 e2[-1] = k00 * A0 - k11 * A1;
1980 e2[-2] = k11 * A0 + k00 * A1;
1981
1982 k00 = e0[-3] - e2[-3];
1983 k11 = e0[-4] - e2[-4];
1984 e0[-3] = e0[-3] + e2[-3];
1985 e0[-4] = e0[-4] + e2[-4];
1986 e2[-3] = k00 * A2 - k11 * A3;
1987 e2[-4] = k11 * A2 + k00 * A3;
1988
1989 k00 = e0[-5] - e2[-5];
1990 k11 = e0[-6] - e2[-6];
1991 e0[-5] = e0[-5] + e2[-5];
1992 e0[-6] = e0[-6] + e2[-6];
1993 e2[-5] = k00 * A4 - k11 * A5;
1994 e2[-6] = k11 * A4 + k00 * A5;
1995
1996 k00 = e0[-7] - e2[-7];
1997 k11 = e0[-8] - e2[-8];
1998 e0[-7] = e0[-7] + e2[-7];
1999 e0[-8] = e0[-8] + e2[-8];
2000 e2[-7] = k00 * A6 - k11 * A7;
2001 e2[-8] = k11 * A6 + k00 * A7;
2002
2003 #endif // ENABLE_ASM_*
2004
2005 }
2006 }
2007
2008 /*-----------------------------------------------------------------------*/
2009
2010 /**
2011 * iter_54: Step 3 of the IMDCT for l=ld(n)-5 and l=ld(n)-4 for a sequence
2012 * of 8 elements. Helper function for imdct_step3_inner_s_loop_ld654().
2013 *
2014 * [Parameters]
2015 * z: Pointer to elements to operate on.
2016 */
2017 static inline void iter_54(float *z)
2018 {
2019 #if defined(ENABLE_ASM_X86_SSE2) || defined(ENABLE_ASM_X86_AVX2)
2020 const __m128i sign_0011 =
2021 _mm_set_epi32(0, 0, UINT32_C(1)<<31, UINT32_C(1)<<31);
2022 const __m128i sign_0110 =
2023 _mm_set_epi32(0, UINT32_C(1)<<31, UINT32_C(1)<<31, 0);
2024 const __m128 z_4 = _mm_load_ps(&z[-4]);
2025 const __m128 z_8 = _mm_load_ps(&z[-8]);
2026 const __m128 sum = _mm_add_ps(z_4, z_8);
2027 const __m128 diff = _mm_sub_ps(z_4, z_8);
2028 const __m128 sum_23 = _mm_shuffle_ps(sum, sum, _MM_SHUFFLE(3,2,3,2));
2029 const __m128 sum_01 = _mm_shuffle_ps(sum, sum, _MM_SHUFFLE(1,0,1,0));
2030 const __m128 diff_23 = _mm_shuffle_ps(diff, diff, _MM_SHUFFLE(3,2,3,2));
2031 const __m128 diff_10 = _mm_shuffle_ps(diff, diff, _MM_SHUFFLE(0,1,0,1));
2032 _mm_store_ps(&z[-4], _mm_add_ps(sum_23, _mm_xor_sign(sign_0011, sum_01)));
2033 _mm_store_ps(&z[-8], _mm_add_ps(diff_23, _mm_xor_sign(sign_0110, diff_10)));
2034
2035 #else
2036 const float k00 = z[-1] - z[-5];
2037 const float k11 = z[-2] - z[-6];
2038 const float k22 = z[-3] - z[-7];
2039 const float k33 = z[-4] - z[-8];
2040 const float y0 = z[-1] + z[-5];
2041 const float y1 = z[-2] + z[-6];
2042 const float y2 = z[-3] + z[-7];
2043 const float y3 = z[-4] + z[-8];
2044
2045 z[-1] = y0 + y2; // z1 + z5 + z3 + z7
2046 z[-2] = y1 + y3; // z2 + z6 + z4 + z8
2047 z[-3] = y0 - y2; // z1 + z5 - z3 - z7
2048 z[-4] = y1 - y3; // z2 + z6 - z4 - z8
2049 z[-5] = k00 + k33; // z1 - z5 + z4 - z8
2050 z[-6] = k11 - k22; // z2 - z6 + z3 - z7
2051 z[-7] = k00 - k33; // z1 - z5 - z4 + z8
2052 z[-8] = k11 + k22; // z2 - z6 - z3 + z7
2053
2054 #endif // ENABLE_ASM_*
2055 }
2056
2057 /*-----------------------------------------------------------------------*/
2058
2059 /**
2060 * imdct_step3_inner_s_loop_ld654: Step 3 of the IMDCT for a single
2061 * iteration of the l and r loops at l=log2_n-{6,5,4}.
2062 *
2063 * [Parameters]
2064 * n: Window size.
2065 * A: Twiddle factor A.
2066 * e: Input/output buffer (length n/2, modified in place).
2067 */
2068 static void imdct_step3_inner_s_loop_ld654(
2069 const unsigned int n, const float *A, float *e)
2070 {
2071 const int a_off = n/8;
2072 const float A2 = A[a_off];
2073
2074 (18/18) for (float *z = e + n/2; z > e; z -= 16) {
2075
2076 #if defined(ENABLE_ASM_X86_SSE2) || defined(ENABLE_ASM_X86_AVX2)
2077 # if defined(ENABLE_ASM_X86_AVX2)
2078 const __m256 z_8 = _mm256_load_ps(&z[-8]);
2079 const __m256 z_16 = _mm256_load_ps(&z[-16]);
2080 _mm256_store_ps(&z[-8], _mm256_add_ps(z_8, z_16));
2081 const __m256 diff = _mm256_sub_ps(z_8, z_16);
2082 const __m128 diff_8 = _mm256_castps256_ps128(diff);
2083 const __m128 diff_4 = _mm256_extractf128_ps(diff, 1);
2084 # else
2085 const __m128 z_4 = _mm_load_ps(&z[-4]);
2086 const __m128 z_8 = _mm_load_ps(&z[-8]);
2087 const __m128 z_12 = _mm_load_ps(&z[-12]);
2088 const __m128 z_16 = _mm_load_ps(&z[-16]);
2089 _mm_store_ps(&z[-4], _mm_add_ps(z_4, z_12));
2090 _mm_store_ps(&z[-8], _mm_add_ps(z_8, z_16));
2091 const __m128 diff_4 = _mm_sub_ps(z_4, z_12);
2092 const __m128 diff_8 = _mm_sub_ps(z_8, z_16);
2093 # endif
2094 /* We can't use the same algorithm as imdct_step3_inner_s_loop(),
2095 * since that can lead to a loss of precision for z[-11,-12,-15,-16]
2096 * if the two non-constant inputs to the calculation are of nearly
2097 * equal magnitude and of the appropriate signs to give an
2098 * intermediate sum or difference of much smaller magnitude:
2099 * changing the order of operations to multiply by A2 first can
2100 * introduce an error of at most 1 ULP at the point of the
2101 * multiplication, but if the final result has a smaller magnitude,
2102 * that error will be a greater part of the final result. */
2103 const __m128 temp_4 = _mm_xor_sign(
2104 _mm_set_epi32(0, 0, 0, UINT32_C(1)<<31),
2105 _mm_shuffle_ps(diff_4, _mm_set1_ps(0), _MM_SHUFFLE(3,2,0,1)));
2106 _mm_store_ps(&z[-12], _mm_mul_ps(_mm_add_ps(diff_4, temp_4),
2107 _mm_set_ps(1, 1, A2, A2)));
2108 const __m128 temp1_8 = _mm_xor_sign(
2109 _mm_set_epi32(0, UINT32_C(1)<<31, 0, UINT32_C(1)<<31),
2110 _mm_shuffle_ps(diff_8, diff_8, _MM_SHUFFLE(2,3,0,1)));
2111 const __m128 temp2_8 =
2112 _mm_shuffle_ps(diff_8, _mm_set1_ps(0), _MM_SHUFFLE(3,2,1,0));
2113 _mm_store_ps(&z[-16], _mm_mul_ps(_mm_sub_ps(temp1_8, temp2_8),
2114 _mm_set_ps(1, 1, A2, A2)));
2115
2116 #else
2117 float k00, k11;
2118
2119 k00 = z[ -1] - z[ -9];
2120 k11 = z[ -2] - z[-10];
2121 z[ -1] = z[ -1] + z[ -9];
2122 z[ -2] = z[ -2] + z[-10];
2123 z[ -9] = k00; // k00*1 - k11*0
2124 z[-10] = k11; // k11*1 + k00*0
2125
2126 k00 = z[ -3] - z[-11];
2127 k11 = z[ -4] - z[-12];
2128 z[ -3] = z[ -3] + z[-11];
2129 z[ -4] = z[ -4] + z[-12];
2130 z[-11] = (k00+k11) * A2; // k00*A2 - k11*-A2 (but see asm note above)
2131 z[-12] = (k11-k00) * A2; // k11*A2 + k00*-A2
2132
2133 k00 = z[- 5] - z[-13];
2134 k11 = z[ -6] - z[-14];
2135 z[ -5] = z[ -5] + z[-13];
2136 z[ -6] = z[ -6] + z[-14];
2137 z[-13] = k11; // k00*0 - k11*-1
2138 z[-14] = -k00; // k11*0 + k00*-1
2139
2140 k00 = z[- 7] - z[-15];
2141 k11 = z[ -8] - z[-16];
2142 z[ -7] = z[ -7] + z[-15];
2143 z[ -8] = z[ -8] + z[-16];
2144 z[-15] = (k11-k00) * A2; // k00*-A2 - k11*-A2
2145 z[-16] = -(k00+k11) * A2; // k11*-A2 + k00*-A2
2146
2147 #endif // ENABLE_ASM_*
2148
2149 iter_54(z);
2150 iter_54(z-8);
2151 }
2152 }
2153
2154 /*-----------------------------------------------------------------------*/
2155
2156 /**
2157 * imdct_step456: Steps 4, 5, and 6 of the IMDCT.
2158 *
2159 * [Parameters]
2160 * n: Window size.
2161 * bitrev: Bit-reverse lookup table.
2162 * u: Input buffer (length n/2).
2163 * U: Output buffer (length n/2). Must be distinct from the input buffer.
2164 */
2165 static void imdct_step456(const unsigned int n, const uint16_t *bitrev,
2166 const float *u, float *U)
2167 {
2168 /* stb_vorbis note: "weirdly, I'd have thought reading sequentially and
2169 * writing erratically would have been better than vice-versa, but in fact
2170 * that's not what my testing showed. (That is, with j = bitreverse(i),
2171 * do you read i and write j, or read j and write i.)" */
2172
2173 float *U0 = &U[n/4];
2174 float *U1 = &U[n/2];
2175
2176 (18/18) for (int i = 0, j = -4; i < (int)(n/8); i += 2, j -= 4) {
2177
2178 #if defined(ENABLE_ASM_X86_SSE2)
2179 const __m128 bitrev_0 = _mm_load_ps(&u[bitrev[i+0]]);
2180 const __m128 bitrev_1 = _mm_load_ps(&u[bitrev[i+1]]);
2181 _mm_store_ps(&U0[j], _mm_shuffle_ps(
2182 bitrev_1, bitrev_0, _MM_SHUFFLE(2,3,2,3)));
2183 _mm_store_ps(&U1[j], _mm_shuffle_ps(
2184 bitrev_1, bitrev_0, _MM_SHUFFLE(0,1,0,1)));
2185
2186 #else
2187 int k4;
2188
2189 k4 = bitrev[i+0];
2190 U1[j+3] = u[k4+0];
2191 U1[j+2] = u[k4+1];
2192 U0[j+3] = u[k4+2];
2193 U0[j+2] = u[k4+3];
2194
2195 k4 = bitrev[i+1];
2196 U1[j+1] = u[k4+0];
2197 U1[j+0] = u[k4+1];
2198 U0[j+1] = u[k4+2];
2199 U0[j+0] = u[k4+3];
2200
2201 #endif // ENABLE_ASM_*
2202
2203 }
2204 }
2205
2206 /*-----------------------------------------------------------------------*/
2207
2208 /**
2209 * imdct_step7: Step 7 of the IMDCT.
2210 *
2211 * [Parameters]
2212 * n: Window size.
2213 * C: Twiddle factor C.
2214 * buffer: Input/output buffer (length n/2, modified in place).
2215 */
2216 static void imdct_step7(const unsigned int n, const float *C, float *buffer)
2217 {
2218 #if defined(ENABLE_ASM_ARM_NEON)
2219 const int step = 4;
2220 const uint32x4_t sign_1010 = (uint32x4_t)vdupq_n_u64(UINT64_C(1)<<63);
2221 #elif defined(ENABLE_ASM_X86_AVX2)
2222 const int step = 8;
2223 const __m256i sign_1010 = _mm256_set1_epi64x(UINT64_C(1)<<63);
2224 #elif defined(ENABLE_ASM_X86_SSE2)
2225 const int step = 4;
2226 const __m128i sign_1010 = _mm_set1_epi64x(UINT64_C(1)<<63);
2227 #else
2228 const int step = 4;
2229 #endif
2230 ASSERT((n/2) % (2*step) == 0);
2231
2232 (18/18) for (float *d = buffer, *e = buffer + (n/2) - step; d < e;
2233 C += step, d += step, e -= step)
2234 {
2235
2236 #if defined(ENABLE_ASM_ARM_NEON)
2237 const float32x4_t C_0 = vld1q_f32(C);
2238 const float32x4_t d_0 = vld1q_f32(d);
2239 const float32x4_t e_0 = vld1q_f32(e);
2240 const float32x4_t e_2 = veorq_f32(sign_1010, vswizzleq_zwxy_f32(e_0));
2241 const float32x4_t sub = vsubq_f32(d_0, e_2);
2242 const float32x4_t add = vaddq_f32(d_0, e_2);
2243 const float32x4_t C02 = veorq_f32(sign_1010, vswizzleq_xxzz_f32(C_0));
2244 const float32x4_t C13 = vswizzleq_yyww_f32(C_0);
2245 const float32x4_t mix = vaddq_f32(
2246 vmulq_f32(C13, sub), vmulq_f32(C02, vswizzleq_yxwz_f32(sub)));
2247 const float32x4_t e_temp = vsubq_f32(add, mix);
2248 const float32x4_t e_out =
2249 veorq_f32(sign_1010, vswizzleq_zwxy_f32(e_temp));
2250 vst1q_f32(d, vaddq_f32(add, mix));
2251 vst1q_f32(e, e_out);
2252
2253 #elif defined(ENABLE_ASM_X86_AVX2)
2254 const __m256 C_0 = _mm256_load_ps(C);
2255 const __m256 d_0 = _mm256_load_ps(d);
2256 const __m256 e_0 = _mm256_load_ps(e);
2257 const __m256 e_4 = _mm256_permute4x64_ps(e_0, _MM_SHUFFLE(1,0,3,2));
2258 const __m256 e_6 = _mm256_xor_sign(
2259 sign_1010, _mm256_permute_ps(e_4, _MM_SHUFFLE(1,0,3,2)));
2260 const __m256 sub = _mm256_sub_ps(d_0, e_6);
2261 const __m256 add = _mm256_add_ps(d_0, e_6);
2262 const __m256 C02 = _mm256_xor_sign(
2263 sign_1010, _mm256_permute_ps(C_0, _MM_SHUFFLE(2,2,0,0)));
2264 const __m256 C13 = _mm256_permute_ps(C_0, _MM_SHUFFLE(3,3,1,1));
2265 const __m256 mix = _mm256_add_ps(
2266 _mm256_mul_ps(C13, sub),
2267 _mm256_mul_ps(C02, _mm256_permute_ps(sub, _MM_SHUFFLE(2,3,0,1))));
2268 const __m256 e_temp = _mm256_sub_ps(add, mix);
2269 const __m256 e_swap =
2270 _mm256_permute4x64_ps(e_temp, _MM_SHUFFLE(1,0,3,2));
2271 const __m256 e_out = _mm256_xor_sign(
2272 sign_1010, _mm256_permute_ps(e_swap, _MM_SHUFFLE(1,0,3,2)));
2273 _mm256_store_ps(d, _mm256_add_ps(add, mix));
2274 _mm256_store_ps(e, e_out);
2275
2276 #elif defined(ENABLE_ASM_X86_SSE2)
2277 const __m128 C_0 = _mm_load_ps(C);
2278 const __m128 d_0 = _mm_load_ps(d);
2279 const __m128 e_0 = _mm_load_ps(e);
2280 const __m128 e_2 = _mm_xor_sign(
2281 sign_1010, _mm_shuffle_ps(e_0, e_0, _MM_SHUFFLE(1,0,3,2)));
2282 const __m128 sub = _mm_sub_ps(d_0, e_2);
2283 const __m128 add = _mm_add_ps(d_0, e_2);
2284 const __m128 C02 = _mm_xor_sign(
2285 sign_1010, _mm_shuffle_ps(C_0, C_0, _MM_SHUFFLE(2,2,0,0)));
2286 const __m128 C13 = _mm_shuffle_ps(C_0, C_0, _MM_SHUFFLE(3,3,1,1));
2287 const __m128 mix = _mm_add_ps(
2288 _mm_mul_ps(C13, sub),
2289 _mm_mul_ps(C02, _mm_shuffle_ps(sub, sub, _MM_SHUFFLE(2,3,0,1))));
2290 const __m128 e_temp = _mm_sub_ps(add, mix);
2291 const __m128 e_out = _mm_xor_sign(
2292 sign_1010, _mm_shuffle_ps(e_temp, e_temp, _MM_SHUFFLE(1,0,3,2)));
2293 _mm_store_ps(d, _mm_add_ps(add, mix));
2294 _mm_store_ps(e, e_out);
2295
2296 #else
2297 float sub0 = d[0] - e[2];
2298 float sub1 = d[1] + e[3];
2299 float sub2 = d[2] - e[0];
2300 float sub3 = d[3] + e[1];
2301
2302 float mix0 = C[1]*sub0 + C[0]*sub1;
2303 float mix1 = C[1]*sub1 - C[0]*sub0;
2304 float mix2 = C[3]*sub2 + C[2]*sub3;
2305 float mix3 = C[3]*sub3 - C[2]*sub2;
2306
2307 float add0 = d[0] + e[2];
2308 float add1 = d[1] - e[3];
2309 float add2 = d[2] + e[0];
2310 float add3 = d[3] - e[1];
2311
2312 d[0] = add0 + mix0;
2313 d[1] = add1 + mix1;
2314 d[2] = add2 + mix2;
2315 d[3] = add3 + mix3;
2316
2317 e[0] = add2 - mix2;
2318 e[1] = -(add3 - mix3);
2319 e[2] = add0 - mix0;
2320 e[3] = -(add1 - mix1);
2321
2322 #endif // ENABLE_ASM_*
2323
2324 }
2325 }
2326
2327 /*-----------------------------------------------------------------------*/
2328
2329 /**
2330 * imdct_step8_decode: Step 8 and final decoding for the IMDCT.
2331 *
2332 * [Parameters]
2333 * n: Window size.
2334 * B: Twiddle factor B.
2335 * in: Input buffer (length n/2).
2336 * out: Output buffer (length n).
2337 */
2338 static void imdct_step8_decode(const unsigned int n, const float *B,
2339 const float *in, float *out)
2340 {
2341 /* stb_vorbis note: "this generates pairs of data a la 8 and pushes
2342 * them directly through the decode kernel (pushing rather than
2343 * pulling) to avoid having to make another pass later" */
2344
2345 B += (n/2) - 16;
2346 float *d0 = &out[0];
2347 float *d1 = &out[(n/2)-8];
2348 float *d2 = &out[(n/2)];
2349 float *d3 = &out[n-8];
2350 (18/18) for (const float *e = in + (n/2) - 16; e >= in;
2351 e -= 16, B -= 16, d0 += 8, d1 -= 8, d2 += 8, d3 -= 8)
2352 {
2353
2354 #if defined(ENABLE_ASM_ARM_NEON)
2355 const float32x4x2_t e_0 =
2356 vuzpq_f32(vld1q_f32(&e[0]), vld1q_f32(&e[4]));
2357 const float32x4x2_t e_8 =
2358 vuzpq_f32(vld1q_f32(&e[8]), vld1q_f32(&e[12]));
2359 const float32x4x2_t B_0 =
2360 vuzpq_f32(vld1q_f32(&B[0]), vld1q_f32(&B[4]));
2361 const float32x4x2_t B_8 =
2362 vuzpq_f32(vld1q_f32(&B[8]), vld1q_f32(&B[12]));
2363 const float32x4_t d1_0 =
2364 vsubq_f32(vmulq_f32(e_0.val[1], B_0.val[0]),
2365 vmulq_f32(e_0.val[0], B_0.val[1]));
2366 const float32x4_t d0_0 = vnegq_f32(vswizzleq_wzyx_f32(d1_0));
2367 const float32x4_t d3_0 =
2368 vnegq_f32(vaddq_f32(vmulq_f32(e_0.val[0], B_0.val[0]),
2369 vmulq_f32(e_0.val[1], B_0.val[1])));
2370 const float32x4_t d2_0 = vswizzleq_wzyx_f32(d3_0);
2371 const float32x4_t d1_8 =
2372 vsubq_f32(vmulq_f32(e_8.val[1], B_8.val[0]),
2373 vmulq_f32(e_8.val[0], B_8.val[1]));
2374 const float32x4_t d0_8 = vnegq_f32(vswizzleq_wzyx_f32(d1_8));
2375 const float32x4_t d3_8 =
2376 vnegq_f32(vaddq_f32(vmulq_f32(e_8.val[0], B_8.val[0]),
2377 vmulq_f32(e_8.val[1], B_8.val[1])));
2378 const float32x4_t d2_8 = vswizzleq_wzyx_f32(d3_8);
2379 vst1q_f32(d0, d0_8);
2380 vst1q_f32(d0+4, d0_0);
2381 vst1q_f32(d1+4, d1_8);
2382 vst1q_f32(d1, d1_0);
2383 vst1q_f32(d2, d2_8);
2384 vst1q_f32(d2+4, d2_0);
2385 vst1q_f32(d3+4, d3_8);
2386 vst1q_f32(d3, d3_0);
2387
2388 #elif defined(ENABLE_ASM_X86_AVX2)
2389 const __m256i sign_1111 = _mm256_set1_epi32(UINT32_C(1)<<31);
2390 const __m256i permute_reverse = _mm256_set_epi32(0,1,2,3,4,5,6,7);
2391 /* It's tempting to use the gather-load instructions here to load
2392 * even and odd indices, but that causes a massive slowdown, so
2393 * we just use normal loads and lots of permutes. */
2394 const __m256i permute_even_odd = _mm256_set_epi32(7,5,3,1,6,4,2,0);
2395 const __m256 e_0 = _mm256_load_ps(&e[0]);
2396 const __m256 e_8 = _mm256_load_ps(&e[8]);
2397 const __m256 B_0 = _mm256_load_ps(&B[0]);
2398 const __m256 B_8 = _mm256_load_ps(&B[8]);
2399 const __m256 e_0p = _mm256_permutevar8x32_ps(e_0, permute_even_odd);
2400 const __m256 e_8p = _mm256_permutevar8x32_ps(e_8, permute_even_odd);
2401 const __m256 B_0p = _mm256_permutevar8x32_ps(B_0, permute_even_odd);
2402 const __m256 B_8p = _mm256_permutevar8x32_ps(B_8, permute_even_odd);
2403 const __m256 e_even = _mm256_permute2f128_ps(e_0p, e_8p, 0x20);
2404 const __m256 e_odd = _mm256_permute2f128_ps(e_0p, e_8p, 0x31);
2405 const __m256 B_even = _mm256_permute2f128_ps(B_0p, B_8p, 0x20);
2406 const __m256 B_odd = _mm256_permute2f128_ps(B_0p, B_8p, 0x31);
2407 const __m256 d1_0 = _mm256_fmsub_ps(e_odd, B_even,
2408 _mm256_mul_ps(e_even, B_odd));
2409 const __m256 d0_0 = _mm256_xor_sign(
2410 sign_1111, _mm256_permutevar8x32_ps(d1_0, permute_reverse));
2411 const __m256 d3_0 = _mm256_xor_sign(
2412 sign_1111, _mm256_fmadd_ps(e_even, B_even,
2413 _mm256_mul_ps(e_odd, B_odd)));
2414 const __m256 d2_0 = _mm256_permutevar8x32_ps(d3_0, permute_reverse);
2415 _mm256_store_ps(d0, d0_0);
2416 _mm256_store_ps(d1, d1_0);
2417 _mm256_store_ps(d2, d2_0);
2418 _mm256_store_ps(d3, d3_0);
2419
2420 #elif defined(ENABLE_ASM_X86_SSE2)
2421 const __m128i sign_1111 = _mm_set1_epi32(UINT32_C(1)<<31);
2422 const __m128 e_0 = _mm_load_ps(&e[0]);
2423 const __m128 e_4 = _mm_load_ps(&e[4]);
2424 const __m128 B_0 = _mm_load_ps(&B[0]);
2425 const __m128 B_4 = _mm_load_ps(&B[4]);
2426 const __m128 e_8 = _mm_load_ps(&e[8]);
2427 const __m128 e_12 = _mm_load_ps(&e[12]);
2428 const __m128 B_8 = _mm_load_ps(&B[8]);
2429 const __m128 B_12 = _mm_load_ps(&B[12]);
2430 const __m128 e_even_0 = _mm_shuffle_ps(e_0, e_4, _MM_SHUFFLE(2,0,2,0));
2431 const __m128 e_odd_0 = _mm_shuffle_ps(e_0, e_4, _MM_SHUFFLE(3,1,3,1));
2432 const __m128 B_even_0 = _mm_shuffle_ps(B_0, B_4, _MM_SHUFFLE(2,0,2,0));
2433 const __m128 B_odd_0 = _mm_shuffle_ps(B_0, B_4, _MM_SHUFFLE(3,1,3,1));
2434 const __m128 e_even_8 = _mm_shuffle_ps(e_8, e_12, _MM_SHUFFLE(2,0,2,0));
2435 const __m128 e_odd_8 = _mm_shuffle_ps(e_8, e_12, _MM_SHUFFLE(3,1,3,1));
2436 const __m128 B_even_8 = _mm_shuffle_ps(B_8, B_12, _MM_SHUFFLE(2,0,2,0));
2437 const __m128 B_odd_8 = _mm_shuffle_ps(B_8, B_12, _MM_SHUFFLE(3,1,3,1));
2438 const __m128 d1_0 = _mm_sub_ps(_mm_mul_ps(e_odd_0, B_even_0),
2439 _mm_mul_ps(e_even_0, B_odd_0));
2440 const __m128 d0_0 = _mm_xor_sign(
2441 sign_1111, _mm_shuffle_ps(d1_0, d1_0, _MM_SHUFFLE(0,1,2,3)));
2442 const __m128 d3_0 = _mm_xor_sign(
2443 sign_1111, _mm_add_ps(_mm_mul_ps(e_even_0, B_even_0),
2444 _mm_mul_ps(e_odd_0, B_odd_0)));
2445 const __m128 d2_0 = _mm_shuffle_ps(d3_0, d3_0, _MM_SHUFFLE(0,1,2,3));
2446 const __m128 d1_8 = _mm_sub_ps(_mm_mul_ps(e_odd_8, B_even_8),
2447 _mm_mul_ps(e_even_8, B_odd_8));
2448 const __m128 d0_8 = _mm_xor_sign(
2449 sign_1111, _mm_shuffle_ps(d1_8, d1_8, _MM_SHUFFLE(0,1,2,3)));
2450 const __m128 d3_8 = _mm_xor_sign(
2451 sign_1111, _mm_add_ps(_mm_mul_ps(e_even_8, B_even_8),
2452 _mm_mul_ps(e_odd_8, B_odd_8)));
2453 const __m128 d2_8 = _mm_shuffle_ps(d3_8, d3_8, _MM_SHUFFLE(0,1,2,3));
2454 _mm_store_ps(d0, d0_8);
2455 _mm_store_ps(d0+4, d0_0);
2456 _mm_store_ps(d1+4, d1_8);
2457 _mm_store_ps(d1, d1_0);
2458 _mm_store_ps(d2, d2_8);
2459 _mm_store_ps(d2+4, d2_0);
2460 _mm_store_ps(d3+4, d3_8);
2461 _mm_store_ps(d3, d3_0);
2462
2463 #else
2464 float p0, p1, p2, p3;
2465
2466 p3 = e[14]*B[15] - e[15]*B[14];
2467 p2 = -e[14]*B[14] - e[15]*B[15];
2468 d0[0] = p3;
2469 d1[7] = - p3;
2470 d2[0] = p2;
2471 d3[7] = p2;
2472
2473 p1 = e[12]*B[13] - e[13]*B[12];
2474 p0 = -e[12]*B[12] - e[13]*B[13];
2475 d0[1] = p1;
2476 d1[6] = - p1;
2477 d2[1] = p0;
2478 d3[6] = p0;
2479
2480 p3 = e[10]*B[11] - e[11]*B[10];
2481 p2 = -e[10]*B[10] - e[11]*B[11];
2482 d0[2] = p3;
2483 d1[5] = - p3;
2484 d2[2] = p2;
2485 d3[5] = p2;
2486
2487 p1 = e[8]*B[9] - e[9]*B[8];
2488 p0 = -e[8]*B[8] - e[9]*B[9];
2489 d0[3] = p1;
2490 d1[4] = - p1;
2491 d2[3] = p0;
2492 d3[4] = p0;
2493
2494 p3 = e[6]*B[7] - e[7]*B[6];
2495 p2 = -e[6]*B[6] - e[7]*B[7];
2496 d0[4] = p3;
2497 d1[3] = - p3;
2498 d2[4] = p2;
2499 d3[3] = p2;
2500
2501 p1 = e[4]*B[5] - e[5]*B[4];
2502 p0 = -e[4]*B[4] - e[5]*B[5];
2503 d0[5] = p1;
2504 d1[2] = - p1;
2505 d2[5] = p0;
2506 d3[2] = p0;
2507
2508 p3 = e[2]*B[3] - e[3]*B[2];
2509 p2 = -e[2]*B[2] - e[3]*B[3];
2510 d0[6] = p3;
2511 d1[1] = - p3;
2512 d2[6] = p2;
2513 d3[1] = p2;
2514
2515 p1 = e[0]*B[1] - e[1]*B[0];
2516 p0 = -e[0]*B[0] - e[1]*B[1];
2517 d0[7] = p1;
2518 d1[0] = - p1;
2519 d2[7] = p0;
2520 d3[0] = p0;
2521
2522 #endif // ENABLE_ASM_*
2523
2524 }
2525 }
2526
2527 /*-----------------------------------------------------------------------*/
2528
2529 /**
2530 * inverse_mdct: Perform the inverse MDCT operation on the given buffer.
2531 * The algorithm is taken from "The use of multirate filter banks for
2532 * coding of high quality digital audio", Th. Sporer et. al. (1992), with
2533 * corrections for errors in that paper, and the step numbers listed in
2534 * comments refer to the algorithm steps as described in the paper.
2535 *
2536 * [Parameters]
2537 * handle: Stream handle.
2538 * buffer: Input/output buffer.
2539 * blocktype: 0 if the current frame is a short block, 1 if a long block.
2540 */
2541 static void inverse_mdct(stb_vorbis *handle, float *buffer, int blocktype)
2542 {
2543 const unsigned int n = handle->blocksize[blocktype];
2544 const int log2_n = handle->blocksize_bits[blocktype];
2545 const float *A = handle->A[blocktype];
2546 float *buf2 = handle->imdct_temp_buf;
2547
2548 /* Setup and step 1. Note that step 1 involves subtracting pairs of
2549 * spectral coefficients, but the two items subtracted are actually
2550 * arithmetic inverses of each other(!), e.g.
2551 * u[4*k] - u[n-4*k-1] == 2*u[4*k]
2552 * We omit the factor of 2 here; that propagates linearly through to
2553 * the final step, and it is compensated for by "*0.5f" on the B[]
2554 * values computed during setup. */
2555 imdct_setup_step1(n, A, buffer, buf2);
2556
2557 /* Step 2.
2558 * stb_vorbis note: "this could be in place, but the data ends up in
2559 * the wrong place... _somebody_'s got to swap it, so this is nominated" */
2560 imdct_step2(n, A, buf2, buffer);
2561
2562 /* Step 3.
2563 * stb_vorbis note: "the original step3 loop can be nested r inside s
2564 * or s inside r; it's written originally as s inside r, but this is
2565 * dumb when r iterates many times, and s few. So I have two copies of
2566 * it and switch between them halfway." */
2567
2568 imdct_step3_inner_r_loop(n/16, A, buffer, (n/2) - (n/4)*0, n/4, 8);
2569 imdct_step3_inner_r_loop(n/16, A, buffer, (n/2) - (n/4)*1, n/4, 8);
2570
2571 imdct_step3_inner_r_loop(n/32, A, buffer, (n/2) - (n/8)*0, n/8, 16);
2572 imdct_step3_inner_r_loop(n/32, A, buffer, (n/2) - (n/8)*1, n/8, 16);
2573 imdct_step3_inner_r_loop(n/32, A, buffer, (n/2) - (n/8)*2, n/8, 16);
2574 imdct_step3_inner_r_loop(n/32, A, buffer, (n/2) - (n/8)*3, n/8, 16);
2575
2576 int l = 2;
2577 (18/18) for (; l < (log2_n-3)/2; l++) {
2578 const int k0 = n >> (l+2);
2579 const int lim = 1 << (l+1);
2580 (18/18) for (int i = 0; i < lim; i++) {
2581 imdct_step3_inner_r_loop(n >> (l+4), A, buffer, (n/2) - k0*i,
2582 k0, 1 << (l+3));
2583 }
2584 }
2585
2586 (18/18) for (; l < log2_n-6; l++) {
2587 const int k0 = n >> (l+2), k1 = 1 << (l+3);
2588 const int rlim = n >> (l+6);
2589 const int lim = 1 << (l+1);
2590 const float *A0 = A;
2591 int i_off = n/2;
2592 (18/18) for (int r = rlim; r > 0; r--) {
2593 imdct_step3_inner_s_loop(lim, A0, buffer, i_off, k0, k1);
2594 A0 += k1*4;
2595 i_off -= 8;
2596 }
2597 }
2598
2599 /* stb_vorbis note: "log2_n-6,-5,-4 all interleaved together - the big
2600 * win comes from getting rid of needless flops due to the constants on
2601 * pass 5 & 4 being all 1 and 0; combining them to be simultaneous to
2602 * improve cache made little difference" */
2603 imdct_step3_inner_s_loop_ld654(n, A, buffer);
2604
2605 /* Steps 4, 5, and 6. */
2606 imdct_step456(n, handle->bit_reverse[blocktype], buffer, buf2);
2607
2608 /* Step 7. */
2609 imdct_step7(n, handle->C[blocktype], buf2);
2610
2611 /* Step 8 and final decoding. */
2612 imdct_step8_decode(n, handle->B[blocktype], buf2, buffer);
2613 }
2614
2615 /*************************************************************************/
2616 /******************* Main decoding routine (internal) ********************/
2617 /*************************************************************************/
2618
2619 /**
2620 * vorbis_decode_packet_rest: Perform all decoding operations for a frame
2621 * except mode selection and windowing.
2622 *
2623 * [Parameters]
2624 * handle: Stream handle.
2625 * mode: Mode configuration for this frame.
2626 * left_start: Start position of the left overlap region.
2627 * left_end: End position of the left overlap region.
2628 * right_start: Start position of the right overlap region.
2629 * right_end: End position of the right overlap region.
2630 * len_ret: Pointer to variable to receive the frame length (offset
2631 * within the window of just past the final fully-decoded sample).
2632 * [Return value]
2633 * True on success, false on error.
2634 */
2635 static bool vorbis_decode_packet_rest(
2636 stb_vorbis *handle, const Mode *mode, int left_start, int left_end,
2637 int right_start, int right_end, int *len_ret)
2638 {
2639 const int n = handle->blocksize[mode->blockflag];
2640 const Mapping *map = &handle->mapping[mode->mapping];
2641 float ** const channel_buffers =
2642 handle->channel_buffers[handle->cur_channel_buffer];
2643
2644 /**** Floor processing (4.3.2). ****/
2645
2646 int64_t floor0_amplitude[256];
2647 bool zero_channel[256];
2648 (18/18) for (int ch = 0; ch < handle->channels; ch++) {
2649 const int floor_index = map->submap_floor[map->mux[ch]];
2650 (18/18) if (handle->floor_types[floor_index] == 0) {
2651 Floor0 *floor = &handle->floor_config[floor_index].floor0;
2652 const int64_t amplitude =
2653 decode_floor0(handle, floor, handle->coefficients[ch]);
2654 (18/18) if (UNLIKELY(amplitude < 0)) {
2655 flush_packet(handle);
2656 return error(handle, VORBIS_invalid_packet);
2657 }
2658 zero_channel[ch] = (amplitude == 0);
2659 floor0_amplitude[ch] = amplitude;
2660 } else { // handle->floor_types[floor_index] == 1
2661 Floor1 *floor = &handle->floor_config[floor_index].floor1;
2662 zero_channel[ch] =
2663 !decode_floor1(handle, floor, handle->final_Y[ch]);
2664 }
2665 }
2666
2667 /**** Nonzero vector propagation (4.3.3). ****/
2668 bool really_zero_channel[256];
2669 memcpy(really_zero_channel, zero_channel,
2670 sizeof(zero_channel[0]) * handle->channels);
2671 (18/18) for (int i = 0; i < map->coupling_steps; i++) {
2672 (18/18) if (!zero_channel[map->coupling[i].magnitude]
2673 (18/18) || !zero_channel[map->coupling[i].angle]) {
2674 zero_channel[map->coupling[i].magnitude] = false;
2675 zero_channel[map->coupling[i].angle] = false;
2676 }
2677 }
2678
2679 /**** Residue decoding (4.3.4). ****/
2680 (18/18) for (int i = 0; i < map->submaps; i++) {
2681 float *residue_buffers[256];
2682 int ch = 0;
2683 (18/18) for (int j = 0; j < handle->channels; j++) {
2684 (18/18) if (map->mux[j] == i) {
2685 (18/18) if (zero_channel[j]) {
2686 residue_buffers[ch] = NULL;
2687 } else {
2688 residue_buffers[ch] = channel_buffers[j];
2689 }
2690 ch++;
2691 }
2692 }
2693 decode_residue(handle, map->submap_residue[i], n/2, ch,
2694 residue_buffers);
2695 }
2696
2697 /**** Inverse coupling (4.3.5). ****/
2698 (18/18) for (int i = map->coupling_steps-1; i >= 0; i--) {
2699 float *magnitude = channel_buffers[map->coupling[i].magnitude];
2700 float *angle = channel_buffers[map->coupling[i].angle];
2701 (18/18) for (int j = 0; j < n/2; j++) {
2702 const float M = magnitude[j];
2703 const float A = angle[j];
2704 float new_M, new_A;
2705 (18/18) if (M > 0) {
2706 (18/18) if (A > 0) {
2707 new_M = M;
2708 new_A = M - A;
2709 } else {
2710 new_A = M;
2711 new_M = M + A;
2712 }
2713 } else {
2714 (18/18) if (A > 0) {
2715 new_M = M;
2716 new_A = M + A;
2717 } else {
2718 new_A = M;
2719 new_M = M - A;
2720 }
2721 }
2722 magnitude[j] = new_M;
2723 angle[j] = new_A;
2724 }
2725 }
2726
2727 /**** Floor curve synthesis and residue product (4.3.6). The spec ****
2728 **** uses the term "dot product", but the actual operation is ****
2729 **** component-by-component vector multiplication. ****/
2730 (18/18) for (int i = 0; i < handle->channels; i++) {
2731 (18/18) if (really_zero_channel[i]) {
2732 memset(channel_buffers[i], 0, sizeof(*channel_buffers[i]) * (n/2));
2733 } else {
2734 const int floor_index = map->submap_floor[map->mux[i]];
2735 (18/18) if (handle->floor_types[floor_index] == 0) {
2736 Floor0 *floor = &handle->floor_config[floor_index].floor0;
2737 do_floor0_final(handle, floor, i, n, floor0_amplitude[i]);
2738 } else { // handle->floor_types[floor_index] == 1
2739 Floor1 *floor = &handle->floor_config[floor_index].floor1;
2740 do_floor1_final(handle, floor, i, n);
2741 }
2742 }
2743 }
2744
2745 /**** Inverse MDCT (4.3.7). ****/
2746 (18/18) for (int i = 0; i < handle->channels; i++) {
2747 inverse_mdct(handle, channel_buffers[i], mode->blockflag);
2748 }
2749
2750 /**** Frame length, sample position, and other miscellany. ****/
2751
2752 /* Flush any leftover data in the current packet so the next packet
2753 * doesn't try to read it. */
2754 flush_packet(handle);
2755
2756 /* Deal with discarding samples from the first packet. */
2757 (18/18) if (handle->first_decode) {
2758 /* We don't support streams with nonzero start positions (what the
2759 * spec describes as "The granule (PCM) position of the first page
2760 * need not indicate that the stream started at position zero..."
2761 * in A.2), so we just omit the left half of the window. The value
2762 * we calculate here is negative, but the addition below will bring
2763 * it up to zero again. */
2764 handle->current_loc = -(n/2 - left_start);
2765 handle->current_loc_valid = true;
2766 handle->first_decode = false;
2767 }
2768
2769 /* If this is the last complete frame in a non-final Ogg page, update
2770 * the current sample position based on the Ogg granule position.
2771 * Normally this should have no effect, but it's important for the
2772 * first page in case that page indicates a positive initial offset,
2773 * and it helps resync if a page is missing in the stream. */
2774 (18/18) if (handle->last_seg_index == handle->end_seg_with_known_loc
2775 (18/18) && !(handle->page_flag & PAGEFLAG_last_page)) {
2776 /* For non-final pages, the granule position should be the "last
2777 * complete sample returned by decode" (A.2), thus right_start.
2778 * However, it appears that the reference encoder blindly uses the
2779 * middle of the window (n/2) even on a long/short frame boundary,
2780 * and based on <https://trac.xiph.org/ticket/1169>, the window
2781 * center appears to have been the original intention. */
2782 handle->current_loc =
2783 handle->known_loc_for_packet - (n/2 - left_start);
2784 handle->current_loc_valid = true;
2785 }
2786
2787 /* Update the current sample position for samples returned from this
2788 * frame. */
2789 (18/18) if (handle->current_loc_valid) {
2790 handle->current_loc += (right_start - left_start);
2791 }
2792
2793 /* Return the amount of data stored in the window, including the
2794 * right-side overlap region.*/
2795 *len_ret = right_end;
2796
2797 /* The granule position on the final page of the Ogg stream specifies
2798 * the length of the stream, thus indicating how many samples to
2799 * return from the final frame. However, some streams attempt to
2800 * truncate to within the _second-to-last_ frame instead, so we have
2801 * to check for end-of-stream truncation on every packet in the last
2802 * page. (The specification is ambiguous about whether this is
2803 * actually permitted; while the wording suggests that the truncation
2804 * point must lie within the last frame, there is no explicit language
2805 * specifying that limitation. Since such streams actually exist,
2806 * however, we go ahead and support them.) */
2807 (18/18) if (handle->current_loc_valid
2808 (18/18) && (handle->page_flag & PAGEFLAG_last_page)) {
2809 const uint64_t end_loc = handle->known_loc_for_packet;
2810 /* Be careful of wraparound here! A stream with a bogus position
2811 * in the second-to-last page which has the high bit set could
2812 * cause a simple "end_loc <= handle->current_loc" test to
2813 * improperly pass, so we have to test by taking the difference
2814 * and checking its sign bit. */
2815 const uint64_t frame_end =
2816 handle->current_loc + (right_end - right_start);
2817 (18/18) if (handle->current_loc - end_loc < UINT64_C(1)<<63 // end_loc <= handle->current_loc
2818 (18/18) || (handle->last_seg_index == handle->end_seg_with_known_loc
2819 (18/18) && end_loc - frame_end >= UINT64_C(1)<<63)) // end_loc < frame_end
2820 {
2821 const uint64_t packet_begin_loc =
2822 handle->current_loc - (right_start - left_start);
2823 (18/18) if (end_loc - packet_begin_loc >= UINT64_C(1)<<63) { // end_loc < packet_begin_loc
2824 /* We can't truncate to before the beginning of the current
2825 * frame (that wouldn't make sense anyway, because the
2826 * stream should just have ended at the previous frame);
2827 * just truncate to the beginning of the frame in that case. */
2828 *len_ret = left_start;
2829 } else {
2830 const uint64_t true_len =
2831 end_loc - (handle->current_loc - right_start);
2832 ASSERT(true_len < (uint64_t)*len_ret);
2833 *len_ret = (int)true_len;
2834 }
2835 handle->current_loc = end_loc;
2836 }
2837 }
2838
2839 return true;
2840 }
2841
2842 /*************************************************************************/
2843 /************************** Interface routines ***************************/
2844 /*************************************************************************/
2845
2846 bool vorbis_decode_initial(stb_vorbis *handle, int *left_start_ret,
2847 int *left_end_ret, int *right_start_ret,
2848 int *right_end_ret, int *mode_ret)
2849 {
2850 /* Start reading the next packet, and verify that it's an audio packet. */
2851 (18/18) if (!handle->packet_mode) {
2852 (18/18) if (UNLIKELY(handle->eof)) {
2853 return false;
2854 }
2855 (18/18) if (UNLIKELY(!start_packet(handle))) {
2856 (18/18) if (handle->error == VORBIS_reached_eof) {
2857 /* EOF at page boundary is not an error! */
2858 handle->error = VORBIS__no_error;
2859 }
2860 return false;
2861 }
2862 }
2863 (18/18) if (UNLIKELY(get_bits(handle, 1) != 0)
2864 (18/18) || UNLIKELY(handle->valid_bits < 0)) {
2865 /* Empty or non-audio packet. If not in packet mode, skip it and
2866 * try again. */
2867 flush_packet(handle);
2868 (18/18) if (handle->packet_mode) {
2869 handle->error = VORBIS_invalid_packet;
2870 return false;
2871 } else {
2872 return vorbis_decode_initial(handle, left_start_ret, left_end_ret,
2873 right_start_ret, right_end_ret,
2874 mode_ret);
2875 }
2876 }
2877
2878 /* Read the mode number and window flags for this packet. */
2879 const int mode_index = get_bits(handle, handle->mode_bits);
2880 /* Still in the same byte, so we can't hit EOP here. */
2881 ASSERT(mode_index != EOP);
2882 (18/18) if (mode_index >= handle->mode_count) {
2883 handle->error = VORBIS_invalid_packet;
2884 flush_packet(handle);
2885 return false;
2886 }
2887 *mode_ret = mode_index;
2888 Mode *mode = &handle->mode_config[mode_index];
2889 bool next, prev;
2890 (18/18) if (mode->blockflag) {
2891 prev = get_bits(handle, 1);
2892 next = get_bits(handle, 1);
2893 (18/18) if (UNLIKELY(handle->valid_bits < 0)) {
2894 flush_packet(handle);
2895 return error(handle, VORBIS_invalid_packet);
2896 }
2897 } else {
2898 prev = next = false;
2899 }
2900
2901 /* Set up the window bounds for this frame. */
2902 const int n = handle->blocksize[mode->blockflag];
2903 (36/36) if (mode->blockflag && !prev) {
2904 *left_start_ret = (n - handle->blocksize[0]) / 4;
2905 *left_end_ret = (n + handle->blocksize[0]) / 4;
2906 } else {
2907 *left_start_ret = 0;
2908 *left_end_ret = n/2;
2909 }
2910 (36/36) if (mode->blockflag && !next) {
2911 *right_start_ret = (n*3 - handle->blocksize[0]) / 4;
2912 *right_end_ret = (n*3 + handle->blocksize[0]) / 4;
2913 } else {
2914 *right_start_ret = n/2;
2915 *right_end_ret = n;
2916 }
2917
2918 return true;
2919 }
2920
2921 /*-----------------------------------------------------------------------*/
2922
2923 bool vorbis_decode_packet(stb_vorbis *handle, int *len_ret)
2924 {
2925 const bool first_decode = handle->first_decode;
2926
2927 int mode, left_start, left_end, right_start, right_end, len;
2928 (18/18) if (!vorbis_decode_initial(handle, &left_start, &left_end,
2929 &right_start, &right_end, &mode)
2930 (18/18) || !vorbis_decode_packet_rest(handle, &handle->mode_config[mode],
2931 left_start, left_end, right_start,
2932 right_end, &len)) {
2933 return false;
2934 }
2935
2936 float ** const channel_buffers =
2937 handle->channel_buffers[handle->cur_channel_buffer];
2938 const int prev = handle->previous_length;
2939
2940 /* We deliberately (though harmlessly) deviate from the spec with
2941 * respect to the portion of data to return for a given frame. The
2942 * spec says that we should return the region from the middle of the
2943 * previous frame to the middle of the current frame (4.8), but that
2944 * would require a buffer copy. Instead we return all finalized
2945 * samples from the current frame, i.e., up to right_start. This
2946 * unfortunately requires a bit of fudging around when decoding the
2947 * first frame of a stream. */
2948
2949 /* Mix in data from the previous window's right side. */
2950 (18/18) if (prev > 0) {
2951 const float *weights;
2952 (18/18) if (prev*2 == handle->blocksize[0]) {
2953 weights = handle->window_weights[0];
2954 } else {
2955 ASSERT(prev*2 == handle->blocksize[1]);
2956 weights = handle->window_weights[1];
2957 }
2958 (18/18) for (int i = 0; i < handle->channels; i++) {
2959 (18/18) for (int j = 0; j < prev; j++) {
2960 channel_buffers[i][left_start+j] =
2961 channel_buffers[i][left_start+j] * weights[j] +
2962 handle->previous_window[i][j] * weights[prev-1-j];
2963 }
2964 }
2965 }
2966
2967 /* Point the previous_window pointers at the right side of this window. */
2968 handle->previous_length = right_end - right_start;
2969 (18/18) for (int i = 0; i < handle->channels; i++) {
2970 handle->previous_window[i] = channel_buffers[i] + right_start;
2971 }
2972
2973 /* If this is the first frame, push left_start (the beginning of data
2974 * to return) to the center of the window, since the left half of the
2975 * window contains garbage. */
2976 (18/18) if (first_decode) {
2977 const int n = handle->blocksize[handle->mode_config[mode].blockflag];
2978 left_start = n/2;
2979 }
2980
2981 /* Save this channel's output pointers. */
2982 (18/18) for (int i = 0; i < handle->channels; i++) {
2983 handle->outputs[i] = channel_buffers[i] + left_start;
2984 }
2985
2986 /* Advance to the next set of channel buffers for the next frame's data. */
2987 handle->cur_channel_buffer =
2988 (handle->cur_channel_buffer + 1) % lenof(handle->channel_buffers);
2989
2990 /* Return the final frame length, if requested. */
2991 (18/18) if (len_ret) {
2992 (18/18) if (len < right_start) {
2993 *len_ret = len - left_start;
2994 /* Watch out for trying to truncate into the first packet of
2995 * the stream (which is not returned). */
2996 (18/18) if (*len_ret < 0) {
2997 *len_ret = 0;
2998 }
2999 } else {
3000 *len_ret = right_start - left_start;
3001 ASSERT(*len_ret >= 0);
3002 }
3003 }
3004 return true;
3005 }
3006
3007 /*-----------------------------------------------------------------------*/
3008
3009 bool vorbis_decode_packet_direct(
3010 stb_vorbis *handle, const void *packet, int32_t packet_len, int *len_ret)
3011 {
3012 start_packet_direct(handle, packet, packet_len);
3013 return vorbis_decode_packet(handle, len_ret);
3014 }
3015
3016 /*************************************************************************/
3017 /*************************************************************************/