Code Coverage Report for src/decode/decode.c


Hit Total Coverage
Lines: 1025 1050 97.6%
Branches: 740 778 95.1%

1 /*
2 * libnogg: a decoder library for Ogg Vorbis streams
3 * Copyright (c) 2014-2019 Andrew Church <achurch@achurch.org>
4 *
5 * This software may be copied and redistributed under certain conditions;
6 * see the file "COPYING" in the source code distribution for details.
7 * NO WARRANTY is provided with this software.
8 */
9
10 #include "include/nogg.h"
11 #include "src/common.h"
12 #include "src/decode/common.h"
13 #include "src/decode/crc32.h"
14 #include "src/decode/decode.h"
15 #include "src/decode/inlines.h"
16 #include "src/decode/io.h"
17 #include "src/decode/packet.h"
18 #include "src/decode/setup.h"
19 #include "src/util/memory.h"
20
21 #include <math.h>
22 #include <stdlib.h>
23 #include <string.h>
24
25 #ifdef ENABLE_ASM_ARM_NEON
26 # include <arm_neon.h>
27 /* Note that vectorization is sometimes slower on ARM because of the lack
28 * of fast vector swizzle instructions, so ARM code is deliberately omitted
29 * in those cases. */
30 #endif
31 #ifdef ENABLE_ASM_X86_SSE2
32 # include <emmintrin.h>
33 #endif
34
35 /* Older versions of GCC warn about shadowing functions with variables, so
36 * work around those warnings. */
37 #if defined(__GNUC__) && !defined(__clang__) && __GNUC__==4 && __GNUC_MINOR__<7
38 # define div div_
39 # define floor floor_
40 # define y0 y0_
41 # define y1 y1_
42 #endif
43
44 /*
45 * Note on variable naming: Variables are generally named following usage
46 * in the Vorbis spec, though some variables have been renamed for clarity.
47 * In particular, the Vorbis spec consistently uses "n" for the window size
48 * of a packet, and we follow that usage here.
49 */
50
51 /*************************************************************************/
52 /****************************** Local data *******************************/
53 /*************************************************************************/
54
/* Lookup table for type 1 floor curve generation, copied from the Vorbis
 * specification.  Maps quantized floor amplitudes [0,255] to linear
 * multipliers, ending at exactly 1.0.  Declared const: the table is a
 * spec constant and is only ever read (see render_line()). */
static const float floor1_inverse_db_table[256] =
{
    1.0649863e-07f, 1.1341951e-07f, 1.2079015e-07f, 1.2863978e-07f,
    1.3699951e-07f, 1.4590251e-07f, 1.5538408e-07f, 1.6548181e-07f,
    1.7623575e-07f, 1.8768855e-07f, 1.9988561e-07f, 2.1287530e-07f,
    2.2670913e-07f, 2.4144197e-07f, 2.5713223e-07f, 2.7384213e-07f,
    2.9163793e-07f, 3.1059021e-07f, 3.3077411e-07f, 3.5226968e-07f,
    3.7516214e-07f, 3.9954229e-07f, 4.2550680e-07f, 4.5315863e-07f,
    4.8260743e-07f, 5.1396998e-07f, 5.4737065e-07f, 5.8294187e-07f,
    6.2082472e-07f, 6.6116941e-07f, 7.0413592e-07f, 7.4989464e-07f,
    7.9862701e-07f, 8.5052630e-07f, 9.0579828e-07f, 9.6466216e-07f,
    1.0273513e-06f, 1.0941144e-06f, 1.1652161e-06f, 1.2409384e-06f,
    1.3215816e-06f, 1.4074654e-06f, 1.4989305e-06f, 1.5963394e-06f,
    1.7000785e-06f, 1.8105592e-06f, 1.9282195e-06f, 2.0535261e-06f,
    2.1869758e-06f, 2.3290978e-06f, 2.4804557e-06f, 2.6416497e-06f,
    2.8133190e-06f, 2.9961443e-06f, 3.1908506e-06f, 3.3982101e-06f,
    3.6190449e-06f, 3.8542308e-06f, 4.1047004e-06f, 4.3714470e-06f,
    4.6555282e-06f, 4.9580707e-06f, 5.2802740e-06f, 5.6234160e-06f,
    5.9888572e-06f, 6.3780469e-06f, 6.7925283e-06f, 7.2339451e-06f,
    7.7040476e-06f, 8.2047000e-06f, 8.7378876e-06f, 9.3057248e-06f,
    9.9104632e-06f, 1.0554501e-05f, 1.1240392e-05f, 1.1970856e-05f,
    1.2748789e-05f, 1.3577278e-05f, 1.4459606e-05f, 1.5399272e-05f,
    1.6400004e-05f, 1.7465768e-05f, 1.8600792e-05f, 1.9809576e-05f,
    2.1096914e-05f, 2.2467911e-05f, 2.3928002e-05f, 2.5482978e-05f,
    2.7139006e-05f, 2.8902651e-05f, 3.0780908e-05f, 3.2781225e-05f,
    3.4911534e-05f, 3.7180282e-05f, 3.9596466e-05f, 4.2169667e-05f,
    4.4910090e-05f, 4.7828601e-05f, 5.0936773e-05f, 5.4246931e-05f,
    5.7772202e-05f, 6.1526565e-05f, 6.5524908e-05f, 6.9783085e-05f,
    7.4317983e-05f, 7.9147585e-05f, 8.4291040e-05f, 8.9768747e-05f,
    9.5602426e-05f, 0.00010181521f, 0.00010843174f, 0.00011547824f,
    0.00012298267f, 0.00013097477f, 0.00013948625f, 0.00014855085f,
    0.00015820453f, 0.00016848555f, 0.00017943469f, 0.00019109536f,
    0.00020351382f, 0.00021673929f, 0.00023082423f, 0.00024582449f,
    0.00026179955f, 0.00027881276f, 0.00029693158f, 0.00031622787f,
    0.00033677814f, 0.00035866388f, 0.00038197188f, 0.00040679456f,
    0.00043323036f, 0.00046138411f, 0.00049136745f, 0.00052329927f,
    0.00055730621f, 0.00059352311f, 0.00063209358f, 0.00067317058f,
    0.00071691700f, 0.00076350630f, 0.00081312324f, 0.00086596457f,
    0.00092223983f, 0.00098217216f, 0.0010459992f,  0.0011139742f,
    0.0011863665f,  0.0012634633f,  0.0013455702f,  0.0014330129f,
    0.0015261382f,  0.0016253153f,  0.0017309374f,  0.0018434235f,
    0.0019632195f,  0.0020908006f,  0.0022266726f,  0.0023713743f,
    0.0025254795f,  0.0026895994f,  0.0028643847f,  0.0030505286f,
    0.0032487691f,  0.0034598925f,  0.0036847358f,  0.0039241906f,
    0.0041792066f,  0.0044507950f,  0.0047400328f,  0.0050480668f,
    0.0053761186f,  0.0057254891f,  0.0060975636f,  0.0064938176f,
    0.0069158225f,  0.0073652516f,  0.0078438871f,  0.0083536271f,
    0.0088964928f,  0.009474637f,   0.010090352f,   0.010746080f,
    0.011444421f,   0.012188144f,   0.012980198f,   0.013823725f,
    0.014722068f,   0.015678791f,   0.016697687f,   0.017782797f,
    0.018938423f,   0.020169149f,   0.021479854f,   0.022875735f,
    0.024362330f,   0.025945531f,   0.027631618f,   0.029427276f,
    0.031339626f,   0.033376252f,   0.035545228f,   0.037855157f,
    0.040315199f,   0.042935108f,   0.045725273f,   0.048696758f,
    0.051861348f,   0.055231591f,   0.058820850f,   0.062643361f,
    0.066714279f,   0.071049749f,   0.075666962f,   0.080584227f,
    0.085821044f,   0.091398179f,   0.097337747f,   0.10366330f,
    0.11039993f,    0.11757434f,    0.12521498f,    0.13335215f,
    0.14201813f,    0.15124727f,    0.16107617f,    0.17154380f,
    0.18269168f,    0.19456402f,    0.20720788f,    0.22067342f,
    0.23501402f,    0.25028656f,    0.26655159f,    0.28387361f,
    0.30232132f,    0.32196786f,    0.34289114f,    0.36517414f,
    0.38890521f,    0.41417847f,    0.44109412f,    0.46975890f,
    0.50028648f,    0.53279791f,    0.56742212f,    0.60429640f,
    0.64356699f,    0.68538959f,    0.72993007f,    0.77736504f,
    0.82788260f,    0.88168307f,    0.9389798f,     1.0f
};
124
125 /*************************************************************************/
126 /************************* Vectorization helpers *************************/
127 /*************************************************************************/
128
#ifdef ENABLE_ASM_ARM_NEON

/* Used to avoid unnecessary typecasts when flipping sign bits: XORs the
 * 32-bit mask in each lane of "a" into the bit pattern of the
 * corresponding float lane of "b". */
static inline float32x4_t veorq_f32(uint32x4_t a, float32x4_t b) {
    return (float32x4_t)veorq_u32(a, (uint32x4_t)b);
}

/* Various kinds of vector swizzles (xyzw = identity). The separate
 * declarations are needed because some functions are implemented in
 * terms of others. */
static inline float32x4_t vswizzleq_xxyy_f32(float32x4_t a);
static inline float32x4_t vswizzleq_xxzz_f32(float32x4_t a);
static inline float32x4_t vswizzleq_xyxy_f32(float32x4_t a);
static inline float32x4_t vswizzleq_xzxz_f32(float32x4_t a);
static inline float32x4_t vswizzleq_yxyx_f32(float32x4_t a);
static inline float32x4_t vswizzleq_yxwz_f32(float32x4_t a);
static inline float32x4_t vswizzleq_yyww_f32(float32x4_t a);
static inline float32x4_t vswizzleq_ywyw_f32(float32x4_t a);
static inline float32x4_t vswizzleq_zzxx_f32(float32x4_t a);
static inline float32x4_t vswizzleq_zzww_f32(float32x4_t a);
static inline float32x4_t vswizzleq_zwxy_f32(float32x4_t a);
static inline float32x4_t vswizzleq_zwzw_f32(float32x4_t a);
static inline float32x4_t vswizzleq_wzyx_f32(float32x4_t a);
static inline float32x4_t vswizzleq_wwyy_f32(float32x4_t a);
/* These are all marked UNUSED so Clang doesn't complain about the fact
 * that we don't currently use all of them. */

/* {x,x,y,y}: interleave the low half of "a" with itself. */
static UNUSED inline float32x4_t vswizzleq_xxyy_f32(float32x4_t a) {
    return vzipq_f32(a, a).val[0];
}
/* {x,x,z,z}: keep even lanes from "a"; fill odd lanes by shifting each
 * 64-bit half of "a" left 32 bits, which moves x and z into the odd
 * positions. */
static UNUSED inline float32x4_t vswizzleq_xxzz_f32(float32x4_t a) {
    const uint32x4_t sel = {~0U, 0, ~0U, 0};
    /* vbsl operands: vbsl(mask, mask_on_value, mask_off_value) */
    return vbslq_f32(sel, a, (float32x4_t)vshlq_n_u64((uint64x2_t)a, 32));
}
/* {x,y,x,y}: duplicate the low 64 bits of "a" into both halves. */
static UNUSED inline float32x4_t vswizzleq_xyxy_f32(float32x4_t a) {
    return (float32x4_t)vdupq_n_u64(((uint64x2_t)a)[0]);
}
/* {x,z,x,z}: de-interleave the even lanes of "a". */
static UNUSED inline float32x4_t vswizzleq_xzxz_f32(float32x4_t a) {
    return vuzpq_f32(a, a).val[0];
}
/* {y,x,y,x}: swap within pairs, then duplicate the low half. */
static UNUSED inline float32x4_t vswizzleq_yxyx_f32(float32x4_t a) {
    return vswizzleq_xyxy_f32(vswizzleq_yxwz_f32(a));
}
/* {y,x,w,z}: swap the two lanes within each 64-bit half.  The right
 * shift supplies y/w for the even lanes, the left shift x/z for the
 * odd lanes. */
static UNUSED inline float32x4_t vswizzleq_yxwz_f32(float32x4_t a) {
    const uint32x4_t sel = {~0U, 0, ~0U, 0};
    return vbslq_f32(sel,
                     (float32x4_t)vshrq_n_u64((uint64x2_t)a, 32),
                     (float32x4_t)vshlq_n_u64((uint64x2_t)a, 32));
}
/* {y,y,w,w}: copy each odd lane down into the even lane below it. */
static UNUSED inline float32x4_t vswizzleq_yyww_f32(float32x4_t a) {
    const uint32x4_t sel = {~0U, 0, ~0U, 0};
    return vbslq_f32(sel, (float32x4_t)vshrq_n_u64((uint64x2_t)a, 32), a);
}
/* {y,w,y,w}: de-interleave the odd lanes of "a". */
static UNUSED inline float32x4_t vswizzleq_ywyw_f32(float32x4_t a) {
    return vuzpq_f32(a, a).val[1];
}
/* {z,z,x,x}: duplicate even lanes, then swap the two halves. */
static UNUSED inline float32x4_t vswizzleq_zzxx_f32(float32x4_t a) {
    return vswizzleq_zwxy_f32(vswizzleq_xxzz_f32(a));
}
/* {z,z,w,w}: interleave the high half of "a" with itself. */
static UNUSED inline float32x4_t vswizzleq_zzww_f32(float32x4_t a) {
    return vzipq_f32(a, a).val[1];
}
/* {z,w,x,y}: rotate by two lanes, i.e. swap the 64-bit halves. */
static UNUSED inline float32x4_t vswizzleq_zwxy_f32(float32x4_t a) {
    return vextq_f32(a, a, 2);
}
/* {z,w,z,w}: duplicate the high 64 bits of "a" into both halves. */
static UNUSED inline float32x4_t vswizzleq_zwzw_f32(float32x4_t a) {
    return (float32x4_t)vdupq_n_u64(((uint64x2_t)a)[1]);
}
/* {w,z,y,x}: full lane reversal (swap halves, then swap within pairs). */
static UNUSED inline float32x4_t vswizzleq_wzyx_f32(float32x4_t a) {
    return vswizzleq_yxwz_f32(vswizzleq_zwxy_f32(a));
}
/* {w,w,y,y}: copy odd lanes downward, then swap the two halves. */
static UNUSED inline float32x4_t vswizzleq_wwyy_f32(float32x4_t a) {
    return vswizzleq_zwxy_f32(vswizzleq_yyww_f32(a));
}

#endif  // ENABLE_ASM_ARM_NEON
205
206 /*************************************************************************/
207 /******************* Codebook decoding: scalar context *******************/
208 /*************************************************************************/
209
/**
 * codebook_decode_scalar_raw_slow: Slow path for Huffman decoding,
 * searching through the lookup tables to decode the next symbol.
 *
 * [Parameters]
 *     handle: Stream handle.
 *     book: Codebook to use.
 * [Return value]
 *     Symbol (for non-sparse books) or sorted_values[] index (for
 *     sparse books), or -1 if the packet ended before a full code
 *     could be read.
 */
static int32_t codebook_decode_scalar_raw_slow(stb_vorbis *handle,
                                               const Codebook *book)
{
    /* Fill the bit accumulator far enough to read any valid code. */
    if (handle->valid_bits < 24) {
        fill_bits(handle);
    }

    /* Find the code using binary search if we can, linear search otherwise. */
    if (book->sorted_codewords) {
        /* sorted_codewords[] is compared against the bit-reversed
         * accumulator, hence the bit_reverse() here. */
        const uint32_t code = bit_reverse(handle->acc);
        int32_t low = 0, high = book->sorted_entries;
        /* Invariant: sorted_codewords[low] <= code < sorted_codewords[high] */
        while (low+1 < high) {
            const int32_t mid = (low + high) / 2;
            if (book->sorted_codewords[mid] <= code) {
                low = mid;
            } else {
                high = mid;
            }
        }
        /* For sparse books the sorted index itself is the result; for
         * non-sparse books map it back to the symbol. */
        const int32_t result = book->sparse ? low : book->sorted_values[low];
        const int len = book->codeword_lengths[result];
        handle->acc >>= len;
        handle->valid_bits -= len;
        /* A negative bit count means we consumed bits past the end of
         * the packet. */
        if (UNLIKELY(handle->valid_bits < 0)) {
            return -1;
        }
        return result;
    } else {
        /* This loop terminates when the code is found. We ensure that all
         * Huffman trees are completely specified during setup, so the
         * assertion below will never fail. */
        for (int i = 0; ; i++) {
            ASSERT(i < book->entries);
            if (book->codeword_lengths[i] == NO_CODE) {
                continue;
            }
            /* Compare the low codeword_lengths[i] bits of the accumulator
             * against entry i's codeword. */
            if (book->codewords[i] ==
                (handle->acc & ((UINT32_C(1) << book->codeword_lengths[i])-1)))
            {
                handle->acc >>= book->codeword_lengths[i];
                handle->valid_bits -= book->codeword_lengths[i];
                if (UNLIKELY(handle->valid_bits < 0)) {
                    return -1;
                }
                return i;
            }
        }
    }
}
265
266 /*-----------------------------------------------------------------------*/
267
/**
 * codebook_decode_scalar_raw: Read a Huffman code from the packet and
 * decode it in scalar context using the given codebook, returning the
 * symbol for non-sparse codebooks and the sorted_values[] index for sparse
 * codebooks.
 *
 * This function contains only the fast path for Huffman lookup so that it
 * can be easily inlined into callers. The slow path is implemented as a
 * separate, non-inlined function to avoid code bloat.
 *
 * [Parameters]
 *     handle: Stream handle.
 *     book: Codebook to use.
 * [Return value]
 *     Value read, or -1 if the end of the packet is reached.
 */
static inline int32_t codebook_decode_scalar_raw(stb_vorbis *handle,
                                                 const Codebook *book)
{
    /* First try the O(1) table. We only fill the bit accumulator if we
     * don't have enough bits for the fast table, to avoid overhead from
     * repeatedly reading single bytes from the packet. */
    if (handle->valid_bits < handle->fast_huffman_length) {
        /* Note that moving this "semi-fast" path out to a separate
         * function is no faster, and possibly slower (though not
         * significantly so), than just calling fill_bits() here, at least
         * on x86_64 and 32-bit ARM. */
        fill_bits(handle);
    }
    /* fast_huffman[] maps the next fast_huffman_length accumulator bits
     * directly to a symbol; negative entries indicate codes not covered
     * by the table. */
    const int fast_code =
        book->fast_huffman[handle->acc & handle->fast_huffman_mask];
    if (fast_code >= 0) {
        const int bits = book->codeword_lengths[fast_code];
        handle->acc >>= bits;
        handle->valid_bits -= bits;
        /* Negative valid_bits means the packet ended mid-code. */
        if (UNLIKELY(handle->valid_bits < 0)) {
            return -1;
        }
        return fast_code;
    }

    /* Fall back to the slow (table search) method. */
    return codebook_decode_scalar_raw_slow(handle, book);
}
312
313 /*-----------------------------------------------------------------------*/
314
315 /**
316 * codebook_decode_scalar: Read a Huffman code from the packet and decode
317 * it in scalar context using the given codebook.
318 *
319 * [Parameters]
320 * handle: Stream handle.
321 * book: Codebook to use.
322 * [Return value]
323 * Symbol read, or -1 if the end of the packet is reached.
324 */
325 static int32_t codebook_decode_scalar(stb_vorbis *handle, const Codebook *book)
326 {
327 const int32_t value = codebook_decode_scalar_raw(handle, book);
328 (4/4) return book->sparse ? book->sorted_values[value] : value;
329 }
330
331 /*************************************************************************/
332 /********************* Codebook decoding: VQ context *********************/
333 /*************************************************************************/
334
335 /**
336 * codebook_decode_scalar_for_vq: Read a Huffman code from the packet and
337 * return a value appropriate for decoding the code in VQ context. Helper
338 * function for codebook_decode() and related functions.
339 *
340 * [Parameters]
341 * handle: Stream handle.
342 * book: Codebook to use.
343 * [Return value]
344 * Symbol or value read (depending on library configuration and
345 * codebook type), or -1 if the end of the packet is reached.
346 */
347 static inline int32_t codebook_decode_scalar_for_vq(stb_vorbis *handle,
348 const Codebook *book)
349 {
350 (4/4) if (handle->divides_in_codebook) {
351 return codebook_decode_scalar(handle, book);
352 } else {
353 return codebook_decode_scalar_raw(handle, book);
354 }
355 }
356
357 /*-----------------------------------------------------------------------*/
358
/**
 * codebook_decode: Read a Huffman code from the packet and decode it in
 * VQ context using the given codebook. Decoded vector components are
 * added componentwise to the values in the output vector.
 *
 * If an error occurs, the current packet is flushed.
 *
 * [Parameters]
 *     handle: Stream handle.
 *     book: Codebook to use. Must be a vector (not scalar) codebook.
 *     output: Output vector.
 *     len: Number of elements to read. Automatically capped at
 *         book->dimensions.
 * [Return value]
 *     True on success, false on error or end-of-packet.
 */
static bool codebook_decode(stb_vorbis *handle, const Codebook *book,
                            float *output, int len)
{
    ASSERT(book->lookup_type != 0);
    ASSERT(book->multiplicands);

    if (len > book->dimensions) {
        len = book->dimensions;
    }

    const int32_t code = codebook_decode_scalar_for_vq(handle, book);
    if (UNLIKELY(code < 0)) {
        return false;
    }

    if (book->lookup_type == 1) {
        /* Lookup type 1: each component's multiplicand index is one
         * base-lookup_values digit of the code ("div" walks the digits). */
        int div = 1;
        if (book->sequence_p) {
            /* sequence_p: each component accumulates onto the previous
             * decoded value. */
            float last = 0;
            for (int i = 0; i < len; i++) {
                const int32_t offset = (code / div) % book->lookup_values;
                const float val = book->multiplicands[offset] + last;
                output[i] += val;
                last = val;
                div *= book->lookup_values;
            }
        } else {
            for (int i = 0; i < len; i++) {
                const int32_t offset = (code / div) % book->lookup_values;
                output[i] += book->multiplicands[offset];
                div *= book->lookup_values;
            }
        }
    } else {
        /* Lookup type 2: the code selects a contiguous row of
         * multiplicands, added with SIMD where available. */
        const int32_t offset = code * book->dimensions;
        int i = 0;
#if defined(ENABLE_ASM_ARM_NEON)
        for (; i+4 <= len; i += 4) {
            vst1q_f32(&output[i], vaddq_f32(
                          vld1q_f32(&output[i]),
                          vld1q_f32(&book->multiplicands[offset+i])));
        }
#elif defined(ENABLE_ASM_X86_SSE2)
        for (; i+4 <= len; i += 4) {
            _mm_storeu_ps(&output[i], _mm_add_ps(
                              _mm_loadu_ps(&output[i]),
                              _mm_loadu_ps(&book->multiplicands[offset+i])));
        }
#endif
        /* Scalar tail (and full loop when SIMD is disabled). */
        for (; i < len; i++) {
            output[i] += book->multiplicands[offset+i];
        }
    }

    return true;
}
431
432 /*-----------------------------------------------------------------------*/
433
/**
 * codebook_decode_step: Read a Huffman code from the packet and decode it
 * in VQ context using the given codebook. Decoded vector components are
 * added componentwise to values in the output vector at intervals of "step"
 * array elements. (codebook_decode() is effectively a specialization of
 * this function with step==1.)
 *
 * If an error occurs, the current packet is flushed.
 *
 * [Parameters]
 *     handle: Stream handle.
 *     book: Codebook to use. Must be a vector (not scalar) codebook.
 *     output: Output vector.
 *     len: Number of elements to read. Automatically capped at
 *         book->dimensions.
 *     step: Interval between consecutive output elements: the second
 *         element is output[step], the third is output[2*step], etc.
 * [Return value]
 *     True on success, false on error or end-of-packet.
 */
static bool codebook_decode_step(stb_vorbis *handle, const Codebook *book,
                                 float *output, int len, int step)
{
    ASSERT(book->lookup_type != 0);
    ASSERT(book->multiplicands);

    if (len > book->dimensions) {
        len = book->dimensions;
    }

    const int32_t code = codebook_decode_scalar_for_vq(handle, book);
    if (UNLIKELY(code < 0)) {
        return false;
    }

    if (book->lookup_type == 1) {
        /* Lookup type 1: each component's multiplicand index is one
         * base-lookup_values digit of the code. */
        int div = 1;
        if (book->sequence_p) {
            /* This case appears not to be used by the reference encoder. */
            float last = 0;
            for (int i = 0; i < len; i++) {
                const int32_t offset = (code / div) % book->lookup_values;
                const float val = book->multiplicands[offset] + last;
                output[i*step] += val;
                last = val;
                div *= book->lookup_values;
            }
        } else {
            for (int i = 0; i < len; i++) {
                const int32_t offset = (code / div) % book->lookup_values;
                output[i*step] += book->multiplicands[offset];
                div *= book->lookup_values;
            }
        }
    } else {
        /* Lookup type 2: the code selects a contiguous row of
         * multiplicands. */
        const int32_t offset = code * book->dimensions;
        for (int i = 0; i < len; i++) {
            output[i*step] += book->multiplicands[offset+i];
        }
    }

    return true;
}
497
498 /*-----------------------------------------------------------------------*/
499
/**
 * codebook_decode_deinterleave_repeat: Read Huffman codes from the packet
 * and decode them in VQ context using the given codebook, deinterleaving
 * across "ch" output channels, and repeat until total_decode vector
 * elements have been decoded. Decoded vector components are added
 * componentwise to the values in the output vectors.  A NULL entry in
 * outputs[] discards that channel's values (the interleave position
 * still advances).
 *
 * If an error occurs, the current packet is flushed.
 *
 * [Parameters]
 *     handle: Stream handle.
 *     book: Codebook to use.
 *     outputs: Output vectors for each channel (entries may be NULL).
 *     ch: Number of channels.
 *     n: Length of each output vector.
 *     total_offset: Offset within the interleaved output vector at which
 *         to start writing output values.
 *     total_decode: Total number of elements to read.
 * [Return value]
 *     True on success, false on error or end-of-packet.
 */
static bool codebook_decode_deinterleave_repeat(
    stb_vorbis *handle, const Codebook *book, float **outputs, int ch, int n,
    int32_t total_offset, int32_t total_decode)
{
    /* Clamp the decode count to the space remaining in the interleaved
     * output (ch*n total elements). */
    if (total_decode > ch*n - total_offset) {
        total_decode = ch*n - total_offset;
    }
    /* c_inter = current channel, p_inter = current position within the
     * channel; together they form the interleaved write cursor. */
    int c_inter = total_offset % ch;
    int p_inter = total_offset / ch;
    int len = book->dimensions;

    while (total_decode > 0) {
        /* Make sure we don't run off the end of the output vectors. */
        if (len > total_decode) {
            len = total_decode;
        }

        const int32_t code = codebook_decode_scalar_for_vq(handle, book);
        if (UNLIKELY(code < 0)) {
            return false;
        }

        if (book->lookup_type == 1) {
            int div = 1;
            if (book->sequence_p) {
                /* This case appears not to be used by the reference encoder. */
                float last = 0;
                for (int i = 0; i < len; i++) {
                    const int32_t offset = (code / div) % book->lookup_values;
                    const float val = book->multiplicands[offset] + last;
                    if (outputs[c_inter]) {
                        outputs[c_inter][p_inter] += val;
                    }
                    if (++c_inter == ch) {
                        c_inter = 0;
                        p_inter++;
                    }
                    last = val;
                    div *= book->lookup_values;
                }
            } else {
                for (int i = 0; i < len; i++) {
                    const int32_t offset = (code / div) % book->lookup_values;
                    if (outputs[c_inter]) {
                        outputs[c_inter][p_inter] +=
                            book->multiplicands[offset];
                    }
                    if (++c_inter == ch) {
                        c_inter = 0;
                        p_inter++;
                    }
                    div *= book->lookup_values;
                }
            }
        } else { // book->lookup_type == 2
            const int32_t offset = code * book->dimensions;
            for (int i = 0; i < len; i++) {
                if (outputs[c_inter]) {
                    outputs[c_inter][p_inter] += book->multiplicands[offset+i];
                }
                if (++c_inter == ch) {
                    c_inter = 0;
                    p_inter++;
                }
            }
        }

        total_decode -= len;
    }

    return true;
}
593
594 /*-----------------------------------------------------------------------*/
595
/**
 * codebook_decode_deinterleave_repeat_2: Read Huffman codes from the
 * packet and decode them in VQ context using the given codebook,
 * deinterleaving across 2 output channels. This function is a
 * specialization of codebook_decode_deinterleave_repeat() for ch==2 and
 * VORBIS_OPTION_DIVIDES_IN_CODEBOOK disabled (so only lookup type 2 is
 * handled, and both output pointers must be non-NULL).
 *
 * If an error occurs, the current packet is flushed.
 *
 * [Parameters]
 *     handle: Stream handle.
 *     book: Codebook to use.
 *     outputs: Output vectors for each channel.
 *     n: Length of each output vector.
 *     total_offset: Offset within the interleaved output vector at which
 *         to start writing output values.
 *     total_decode: Total number of elements to read.
 * [Return value]
 *     True on success, false on error or end-of-packet.
 */
static bool codebook_decode_deinterleave_repeat_2(
    stb_vorbis *handle, const Codebook *book, float **outputs, int n,
    int32_t total_offset, int32_t total_decode)
{
    /* Clamp the decode count to the space remaining in the interleaved
     * output (2*n total elements). */
    if (total_decode > 2*n - total_offset) {
        total_decode = 2*n - total_offset;
    }
    float * const output0 = outputs[0];
    float * const output1 = outputs[1];
    ASSERT(output0);
    ASSERT(output1);
    /* c_inter/p_inter: channel and in-channel position of the write
     * cursor within the interleaved output. */
    int c_inter = total_offset % 2;
    int p_inter = total_offset / 2;
    int len = book->dimensions;

    while (total_decode > 0) {
        /* Make sure we don't run off the end of the output vectors. */
        if (len > total_decode) {
            len = total_decode;
        }

        const int32_t code = codebook_decode_scalar_for_vq(handle, book);
        if (UNLIKELY(code < 0)) {
            return false;
        }

        const int32_t offset = code * book->dimensions;
        int i = 0;
        /* Align to channel 0 if the cursor starts on channel 1. */
        if (c_inter == 1) {
            output1[p_inter] += book->multiplicands[offset];
            c_inter = 0;
            p_inter++;
            i++;
        }
        /* Main loop: one element to each channel per iteration. */
        for (; i+2 <= len; i += 2) {
            output0[p_inter] += book->multiplicands[offset+i];
            output1[p_inter] += book->multiplicands[offset+i+1];
            p_inter++;
        }
        /* Odd trailing element lands on channel 0. */
        if (i < len) {
            output0[p_inter] += book->multiplicands[offset+i];
            c_inter++;
        }

        total_decode -= len;
    }

    return true;
}
663
664 /*************************************************************************/
665 /*************************** Floor processing ****************************/
666 /*************************************************************************/
667
668 /**
669 * render_point: Calculate the Y coordinate for point X on the line
670 * (x0,y0)-(x1,y1). Defined by section 9.2.6 in the Vorbis specification.
671 *
672 * [Parameters]
673 * x0, y0: First endpoint of the line.
674 * x1, y1: Second endpoint of the line.
675 * x: X coordinate of point to calculate.
676 * [Return value]
677 * Y value for the given X coordinate.
678 */
679 static CONST_FUNCTION int render_point(int x0, int y0, int x1, int y1, int x)
680 {
681 /* The definition of this function in the spec has separate code
682 * branches for positive and negative dy values, presumably to try and
683 * avoid defining integer division as rounding negative values toward
684 * zero (even though it ends up doing exactly that in the very next
685 * section...), but C already defines integer division to round both
686 * positive and negative values toward zero, so we just do things the
687 * natural way. */
688 return y0 + ((y1 - y0) * (x - x0) / (x1 - x0));
689 }
690
691 /*-----------------------------------------------------------------------*/
692
/**
 * render_line: Render a line for a type 1 floor curve and perform the
 * dot-product operation with the residue vector. Defined by section
 * 9.2.7 in the Vorbis specification.
 *
 * [Parameters]
 *     x0, y0: First endpoint of the line.
 *     x1, y1: Second endpoint of the line.
 *     output: Output buffer. On entry, this buffer should contain the
 *         residue vector.
 *     n: Window length.
 */
static inline void render_line(int x0, int y0, int x1, int y1, float *output,
                               int n)
{
    /* N.B.: The spec requires a very specific sequence of operations for
     * this function to ensure that both the encoder and the decoder
     * generate the same integer-quantized curve. Take care that any
     * optimizations to this function do not change the output. */

    ASSERT(x0 >= 0);
    ASSERT(y0 >= 0);
    ASSERT(x1 > x0);

    /* Bresenham-style setup: "base" is the truncated slope applied on
     * ordinary steps, "ady" the per-step error increment, and "sy" the
     * slope applied on steps where the accumulated error spills over. */
    const int dy = y1 - y0;
    const int adx = x1 - x0;
    const int base = dy / adx;
    const int ady = abs(dy) - (abs(base) * adx);
    const int sy = (dy < 0 ? base-1 : base+1);
    int x = x0, y = y0;
    int err = 0;

    /* Clip the line to the window; nothing to do if it starts past it. */
    if (x1 >= n) {
        if (x0 >= n) {
            return;
        }
        x1 = n;
    }
    output[x] *= floor1_inverse_db_table[y];
    for (x++; x < x1; x++) {
        err += ady;
        if (err >= adx) {
            err -= adx;
            y += sy;
        } else {
            y += base;
        }
        output[x] *= floor1_inverse_db_table[y];
    }
}
743
744 /*-----------------------------------------------------------------------*/
745
/**
 * decode_floor0: Perform type 0 floor decoding.
 *
 * [Parameters]
 *     handle: Stream handle.
 *     floor: Floor configuration.
 *     coefficients: Floor coefficient buffer for the current channel.
 * [Return value]
 *     The floor amplitude (positive), 0 to indicate "unused" status, or
 *     -1 to indicate an undecodable packet.
 */
static int64_t decode_floor0(stb_vorbis *handle, const Floor0 *floor,
                             float *coefficients)
{
    /* An amplitude of zero flags this floor as unused for the frame. */
    const int amplitude = get_bits(handle, floor->amplitude_bits);
    if (amplitude == 0) {
        return 0;
    }

    const int booknumber = get_bits(handle, floor->book_bits);
    if (booknumber >= floor->number_of_books) {
        return -1;
    }
    const Codebook *book = &handle->codebooks[floor->book_list[booknumber]];
    if (book->lookup_type == 0) {
        /* The specification is unclear on whether a floor 0 configuration
         * referencing a scalar codebook is a setup-time or decode-time
         * error; the only applicable rule seems to be 3.3 "If decoder
         * setup or decode requests such an action [using a codebook of
         * lookup type 0 in any context expecting a vector return value],
         * that is an error condition rendering the packet undecodable".
         * In the spirit of being lenient in what you accept, we treat it
         * as a decode-time error so that packets which do not use that
         * floor configuration can still be decoded. */
        return -1;
    }

    /* Read LSP coefficients in book->dimensions-sized groups.  Since
     * codebook_decode() adds componentwise into its output, each group
     * is pre-seeded with the last value of the previous group. */
    float last = 0;
    int coef_count = 0;
    while (coef_count < floor->order) {
        const int len = min(book->dimensions, floor->order - coef_count);
        for (int i = 0; i < len; i++) {
            coefficients[coef_count + i] = last;
        }
        /* End-of-packet mid-decode is reported as "unused" (0), not as
         * an undecodable packet. */
        if (!codebook_decode(handle, book, &coefficients[coef_count], len)) {
            return 0;
        }
        coef_count += len;
        last = coefficients[coef_count - 1];
    }

    return amplitude;
}
799
800 /*-----------------------------------------------------------------------*/
801
802 /**
803 * decode_floor1_partition: Decode a single partition for a type 1 floor.
804 *
805 * [Parameters]
806 * handle: Stream handle.
807 * floor: Floor configuration.
808 * final_Y: final_Y buffer for the current channel.
809 * offset: Current offset in final_Y buffer.
810 * class: Partition class for the current partition.
811 * [Return value]
812 * New value of offset.
813 */
814 static int decode_floor1_partition(stb_vorbis *handle, const Floor1 *floor,
815 int16_t *final_Y, int offset,
816 const int class)
817 {
818 const int cdim = floor->class_dimensions[class];
819 const int cbits = floor->class_subclasses[class];
820 const int csub = (1 << cbits) - 1;
821 int cval = 0;
822 (4/4) if (cbits > 0) {
823 const Codebook *book =
824 &handle->codebooks[floor->class_masterbooks[class]];
825 cval = codebook_decode_scalar(handle, book);
826 }
827 (4/4) for (int j = 0; j < cdim; j++) {
828 const int book_index = floor->subclass_books[class][cval & csub];
829 cval >>= cbits;
830 (4/4) if (book_index >= 0) {
831 const Codebook *book = &handle->codebooks[book_index];
832 final_Y[offset++] = codebook_decode_scalar(handle, book);
833 } else {
834 final_Y[offset++] = 0;
835 }
836 }
837 return offset;
838 }
839
840 /*-----------------------------------------------------------------------*/
841
/**
 * decode_floor1_amplitude: Calculate an element of the final_Y vector
 * for a type 1 floor.
 *
 * [Parameters]
 *     floor: Floor configuration.
 *     final_Y: final_Y buffer for the current channel.  On entry,
 *         final_Y[i] holds the raw value read from the packet; on
 *         return it holds the decoded amplitude.
 *     range: Maximum value for final_Y elements.
 *     step2_flag: Array of flags indicating final_Y elements which will
 *         need to be recalculated later.
 *     i: Element of final_Y to calculate.
 */
static void decode_floor1_amplitude(const Floor1 *floor, int16_t *final_Y,
                                    const int range, bool *step2_flag,
                                    const int i)
{
    /* Predict this point's amplitude from the line through its two
     * already-decoded neighbors (spec 7.2.4). */
    const int low = floor->neighbors[i].low;
    const int high = floor->neighbors[i].high;
    const int predicted = render_point(floor->X_list[low], final_Y[low],
                                       floor->X_list[high], final_Y[high],
                                       floor->X_list[i]);
    const int val = final_Y[i];  // Value read from packet.
    /* Headroom above and below the predicted value; "room" is the span
     * within which val encodes a signed offset from the prediction. */
    const int highroom = range - predicted;
    const int lowroom = predicted;
    int room;
    if (highroom > lowroom) {
        room = lowroom * 2;
    } else {
        room = highroom * 2;
    }
    if (val) {
        /* A nonzero value moves this point and marks it and its
         * neighbors for step-2 recomputation. */
        step2_flag[low] = true;
        step2_flag[high] = true;
        step2_flag[i] = true;
        if (val >= room) {
            /* Offset exceeds the symmetric window: val maps directly
             * toward whichever bound has more headroom. */
            if (highroom > lowroom) {
                /* The spec (7.2.4) suggests that "decoder implementations
                 * guard the values in vector [floor1_final_Y] by clamping
                 * each element to [0, [range])" because "it is possible
                 * to abuse the setup and codebook machinery to produce
                 * negative or over-range results". That can only happen
                 * in these two cases, so we insert tests here. This has
                 * about a 1% performance penalty, but that's better than
                 * corrupt data causing the library to crash. */
                if (UNLIKELY(val > range - 1)) {
                    final_Y[i] = range - 1;
                } else {
                    final_Y[i] = val;
                }
            } else {
                if (UNLIKELY(val > range - 1)) {
                    final_Y[i] = 0;
                } else {
                    final_Y[i] = (range - 1) - val;
                }
            }
        } else {
            /* Within the window: odd values step down from the
             * prediction, even values step up. */
            if (val % 2 != 0) {
                final_Y[i] = predicted - (val+1)/2;
            } else {
                final_Y[i] = predicted + val/2;
            }
        }
    } else {
        /* Zero means "lies on the predicted line": keep the prediction
         * and leave the point eligible for interpolation in step 2. */
        step2_flag[i] = false;
        final_Y[i] = predicted;
    }
}
910
911 /*-----------------------------------------------------------------------*/
912
913 /**
914 * decode_floor1: Perform type 1 floor decoding.
915 *
916 * [Parameters]
917 * handle: Stream handle.
918 * floor: Floor configuration.
919 * final_Y: final_Y buffer for the current channel.
920 * [Return value]
921 * False to indicate "unused" status (4.2.3 step 6), true otherwise.
922 */
923 static bool decode_floor1(stb_vorbis *handle, const Floor1 *floor,
924 int16_t *final_Y)
925 {
926 static const int16_t range_list[4] = {256, 128, 86, 64};
927 static const int8_t range_bits_list[4] = {8, 7, 7, 6};
928 const int range = range_list[floor->floor1_multiplier - 1];
929 const int range_bits = range_bits_list[floor->floor1_multiplier - 1];
930
931 (4/4) if (!get_bits(handle, 1)) {
932 return false; // Channel is zero.
933 }
934
935 /* Floor decode (7.2.3). */
936 int offset = 2;
937 final_Y[0] = get_bits(handle, range_bits);
938 final_Y[1] = get_bits(handle, range_bits);
939 (4/4) for (int i = 0; i < floor->partitions; i++) {
940 const int class = floor->partition_class_list[i];
941 offset = decode_floor1_partition(handle, floor, final_Y, offset, class);
942 }
943 (4/4) if (UNLIKELY(handle->valid_bits < 0)) {
944 return false;
945 }
946
947 /* Amplitude value synthesis (7.2.4 step 1). */
948 bool step2_flag[256];
949 step2_flag[0] = true;
950 step2_flag[1] = true;
951 (4/4) for (int i = 2; i < floor->values; i++) {
952 decode_floor1_amplitude(floor, final_Y, range, step2_flag, i);
953 }
954
955 /* Curve synthesis (7.2.4 step 2). We defer final floor computation
956 * until after synthesis; here, we just clear final_Y values which
957 * need to be synthesized later. */
958 (4/4) for (int i = 0; i < floor->values; i++) {
959 (4/4) if (!step2_flag[i]) {
960 final_Y[i] = -1;
961 }
962 }
963
964 return true;
965 }
966
967 /*-----------------------------------------------------------------------*/
968
969 /**
970 * do_floor0_final: Perform the floor curve computation and residue
971 * product for type 0 floors.
972 *
973 * [Parameters]
974 * handle: Stream handle.
975 * floor: Floor configuration.
976 * ch: Channel to operate on.
977 * n: Frame window size.
978 * amplitude: Floor amplitude.
979 */
static void do_floor0_final(stb_vorbis *handle, const Floor0 *floor,
                            const int ch, const int n, const int64_t amplitude)
{
    float *output = handle->channel_buffers[handle->cur_channel_buffer][ch];
    float *coefficients = handle->coefficients[ch];
    /* Select the precomputed Bark map for this window size; index 1 is
     * used when n matches the long-block size. */
    const int16_t *map = floor->map[(n == handle->blocksize[1])];
    const float omega_base = M_PIf / floor->bark_map_size;
    /* Normalize the packet amplitude to [0,1] over the full range
     * representable in amplitude_bits. */
    const float scaled_amplitude = (float)amplitude
        / (float)((UINT64_C(1) << floor->amplitude_bits) - 1);
    /* 0.11512925 approximates 1/(20*log10(e)), converting dB to a
     * natural-log scale for the expf() below. */
    const float lfv_scale = 0.11512925f * floor->amplitude_offset;

    /* Pre-scale the LSP coefficients so the inner loop can use them
     * directly as (2*cos(coefficient)). */
    for (int i = 0; i < floor->order; i++) {
        coefficients[i] = 2 * cosf(coefficients[i]);
    }

    int i = 0;
    do {
        /* Evaluate the LSP curve at the Bark-mapped frequency for this
         * output bin (Vorbis spec 6.2.3). */
        const float two_cos_omega = 2 * cosf(map[i] * omega_base);
        float p = 0.5, q = 0.5;
        int j;
        for (j = 0; j <= (floor->order - 2) / 2; j++) {
            p *= coefficients[2*j+1] - two_cos_omega;
            q *= coefficients[2*j] - two_cos_omega;
        }
        if (floor->order % 2 != 0) {
            q *= coefficients[2*j] - two_cos_omega;
            /* The spec gives this as "4 - cos^2(omega)", but we use the
             * equality (a^2-b^2) = (a+b)(a-b) to give us constants
             * common between both branches of the if statement. */
            p *= p * (2 + two_cos_omega) * (2 - two_cos_omega);
            q *= q;
        } else {
            p *= p * (2 - two_cos_omega);
            q *= q * (2 + two_cos_omega);
        }
        const float linear_floor_value =
            expf(lfv_scale * ((scaled_amplitude / sqrtf(p + q)) - 1));
        /* Apply the same floor value to every consecutive bin that maps
         * to the same Bark index.  NOTE(review): this reads map[i] at
         * i == n/2 on the last run; presumably the map carries a
         * sentinel entry past n/2 — confirm against the setup code. */
        const int iteration_condition = map[i];
        do {
            output[i] *= linear_floor_value;
            i++;
        } while (map[i] == iteration_condition);
    } while (i < n/2);
}
1024
1025 /*-----------------------------------------------------------------------*/
1026
1027 /**
1028 * do_floor1_final: Perform the final floor curve computation and residue
1029 * product for type 1 floors.
1030 *
1031 * [Parameters]
1032 * handle: Stream handle.
1033 * floor: Floor configuration.
1034 * ch: Channel to operate on.
1035 * n: Frame window size.
1036 */
1037 static void do_floor1_final(stb_vorbis *handle, const Floor1 *floor,
1038 const int ch, const int n)
1039 {
1040 float *output = handle->channel_buffers[handle->cur_channel_buffer][ch];
1041 const int16_t *final_Y = handle->final_Y[ch];
1042 int lx = 0;
1043 int ly = final_Y[0] * floor->floor1_multiplier;
1044 (4/4) for (int i = 1; i < floor->values; i++) {
1045 int j = floor->sorted_order[i];
1046 (4/4) if (final_Y[j] >= 0) {
1047 const int hx = floor->X_list[j];
1048 const int hy = final_Y[j] * floor->floor1_multiplier;
1049 render_line(lx, ly, hx, hy, output, n/2);
1050 lx = hx;
1051 ly = hy;
1052 }
1053 }
1054 (4/4) if (lx < n/2) {
1055 /* Optimization of: render_line(lx, ly, n/2, ly, output, n/2); */
1056 (4/4) for (int i = lx; i < n/2; i++) {
1057 output[i] *= floor1_inverse_db_table[ly];
1058 }
1059 }
1060 }
1061
1062 /*************************************************************************/
1063 /*************************** Residue handling ****************************/
1064 /*************************************************************************/
1065
1066 /**
1067 * decode_residue_partition_0: Decode a single residue partition for
1068 * residue type 0.
1069 *
1070 * [Parameters]
1071 * handle: Stream handle.
1072 * book: Codebook to use for decoding.
1073 * size: Residue partition size.
1074 * offset: Output offset.
1075 * outputs: Output vector array.
1076 * n: Length of residue vectors (half of window size).
1077 * ch: Number of channels of residue data to decode.
1078 * [Return value]
1079 * True on success, false on end of packet.
1080 */
1081 static bool decode_residue_partition_0(
1082 stb_vorbis *handle, const Codebook *book, int size, int offset,
1083 float **output_ptr, UNUSED int n, UNUSED int ch)
1084 {
1085 float *output = *output_ptr;
1086 const int dimensions = book->dimensions;
1087 const int step = size / dimensions;
1088 (4/4) for (int i = 0; i < step; i++) {
1089 (4/4) if (!codebook_decode_step(handle, book, &output[offset+i], size-i,
1090 step)) {
1091 return false;
1092 }
1093 }
1094 return true;
1095 }
1096
1097 /*-----------------------------------------------------------------------*/
1098
1099 /**
1100 * decode_residue_partition_1: Decode a single residue partition for
1101 * residue type 1.
1102 *
1103 * [Parameters]
1104 * handle: Stream handle.
1105 * book: Codebook to use for decoding.
1106 * size: Residue partition size.
1107 * offset: Output offset.
1108 * outputs: Output vector array.
1109 * n: Length of residue vectors (half of window size).
1110 * ch: Number of channels of residue data to decode.
1111 * [Return value]
1112 * True on success, false on end of packet.
1113 */
1114 static bool decode_residue_partition_1(
1115 stb_vorbis *handle, const Codebook *book, int size, int offset,
1116 float **output_ptr, UNUSED int n, UNUSED int ch)
1117 {
1118 float *output = *output_ptr;
1119 const int dimensions = book->dimensions;
1120 (4/4) for (int i = 0; i < size; i += dimensions) {
1121 (4/4) if (!codebook_decode(handle, book, &output[offset+i], size-i)) {
1122 return false;
1123 }
1124 }
1125 return true;
1126 }
1127
1128 /*-----------------------------------------------------------------------*/
1129
1130 /**
1131 * decode_residue_partition_2: Decode a single residue partition for
1132 * residue type 2.
1133 *
1134 * [Parameters]
1135 * handle: Stream handle.
1136 * book: Codebook to use for decoding.
1137 * size: Residue partition size.
1138 * offset: Output offset.
1139 * outputs: Output vector array.
1140 * n: Length of residue vectors (half of window size).
1141 * ch: Number of channels of residue data to decode.
1142 * [Return value]
1143 * True on success, false on end of packet.
1144 */
1145 static bool decode_residue_partition_2(
1146 stb_vorbis *handle, const Codebook *book, int size, int offset,
1147 float **outputs, int n, int ch)
1148 {
1149 return codebook_decode_deinterleave_repeat(
1150 handle, book, outputs, ch, n, offset, size);
1151 }
1152
1153 /*-----------------------------------------------------------------------*/
1154
1155 /**
1156 * decode_residue_partition_2_2ch: Decode a single residue partition for
1157 * residue type 2 with two channels. Both channels must be active for
1158 * decoding.
1159 *
1160 * [Parameters]
1161 * handle: Stream handle.
1162 * book: Codebook to use for decoding.
1163 * size: Residue partition size.
1164 * offset: Output offset.
1165 * outputs: Output vector array.
1166 * n: Length of residue vectors (half of window size).
1167 * ch: Number of channels of residue data to decode.
1168 * [Return value]
1169 * True on success, false on end of packet.
1170 */
1171 static bool decode_residue_partition_2_2ch(
1172 stb_vorbis *handle, const Codebook *book, int size, int offset,
1173 float **outputs, int n, UNUSED int ch)
1174 {
1175 return codebook_decode_deinterleave_repeat_2(
1176 handle, book, outputs, n, offset, size);
1177 }
1178
1179 /*-----------------------------------------------------------------------*/
1180
1181 /**
1182 * decode_residue_partition: Decode a single residue partition.
1183 *
1184 * [Parameters]
1185 * handle: Stream handle.
1186 * res: Residue configuration.
1187 * pass: Residue decoding pass.
1188 * partition_count: Current partition index.
1189 * vqclass: Classification for this partition.
1190 * outputs: Output vector array.
1191 * n: Length of residue vectors (half of window size).
1192 * ch: Number of channels of residue data to decode.
1193 * do_partition: Partition decoding function.
1194 * [Return value]
1195 * True on success, false on end of packet.
1196 */
1197 static inline bool decode_residue_partition(
1198 stb_vorbis *handle, const Residue *res, int pass, int partition_count,
1199 int vqclass, float **outputs, int n, int ch,
1200 bool (*do_partition)(
1201 stb_vorbis *handle, const Codebook *book, int size, int offset,
1202 float **outputs, int n, int ch))
1203 {
1204 const int vqbook = res->residue_books[vqclass][pass];
1205 (4/4) if (vqbook < 0) {
1206 return true;
1207 }
1208
1209 const Codebook *book = &handle->codebooks[vqbook];
1210 (4/4) if (UNLIKELY(book->lookup_type == 0)) {
1211 /* Lookup type 0 is only valid in a scalar context. The spec
1212 * doesn't say what to do about this case; we treat it like
1213 * end-of-packet. */
1214 flush_packet(handle);
1215 handle->valid_bits = -1;
1216 return false;
1217 }
1218
1219 const int32_t offset = res->begin + partition_count * res->part_size;
1220 return (*do_partition)(
1221 handle, book, res->part_size, offset, outputs, n, ch);
1222 }
1223
1224 /*-----------------------------------------------------------------------*/
1225
1226 /**
1227 * decode_residue_common: Common decode processing for all residue vector
1228 * types.
1229 *
1230 * [Parameters]
1231 * handle: Stream handle.
1232 * res: Residue configuration.
1233 * type: Residue type.
1234 * n: Length of residue vectors (half of window size).
1235 * ch: Number of channels of residue data to decode.
1236 * residue_buffers: Pointers to the buffers for each channel in which
1237 * the residue vectors should be stored. A pointer of NULL
1238 * indicates that the given channel should not be decoded (the
1239 * "do not decode" state referred to by the specification).
1240 * do_partition: Function pointer for the partition decoding function
1241 * to use.
1242 */
static void decode_residue_common(
    stb_vorbis *handle, const Residue *res, int type, int n, int ch,
    float *residue_buffers[],
    bool (*do_partition)(
        stb_vorbis *handle, const Codebook *book, int size, int offset,
        float **outputs, int n, int ch))
{
    const Codebook *classbook = &handle->codebooks[res->classbook];
    const int classwords = classbook->dimensions;
    /* For type 2, all channels are interleaved into one vector, so the
     * effective vector length is n*ch and only one "channel" is read. */
    const int actual_size = (type == 2 ? n*ch : n);
    const int n_to_read =
        min(res->end, actual_size) - min(res->begin, actual_size);
    const int partitions_to_read = n_to_read / res->part_size;
    const int ch_to_read = (type == 2 ? 1 : ch);

    if (n_to_read <= 0) {
        return;  // Residue range lies entirely outside the vector.
    }

    if (handle->divides_in_residue) {

        /* Slow path: classifications are unpacked from each classword
         * with division/modulo on every frame.
         * NOTE(review): handle->classifications is used as int** here
         * but as uint8_t*** below — presumably a type-punned allocation;
         * confirm against the setup code before restructuring. */
        int **classifications = handle->classifications;

        for (int pass = 0; pass < 8; pass++) {
            int partition_count = 0;
            while (partition_count < partitions_to_read) {
                if (pass == 0) {
                    /* On the first pass, read and unpack one classword
                     * per active channel. */
                    for (int j = 0; j < ch_to_read; j++) {
                        if (residue_buffers[j]) {
                            int temp =
                                codebook_decode_scalar(handle, classbook);
                            if (UNLIKELY(temp == EOP)) {
                                return;
                            }
                            /* Classes are packed most-significant first,
                             * so unpack in reverse order. */
                            for (int i = classwords-1; i >= 0; i--) {
                                classifications[j][i+partition_count] =
                                    temp % res->classifications;
                                temp /= res->classifications;
                            }
                        }
                    }
                }
                /* Decode one partition per unpacked class, for each
                 * active channel. */
                for (int i = 0;
                     i < classwords && partition_count < partitions_to_read;
                     i++, partition_count++)
                {
                    for (int j = 0; j < ch_to_read; j++) {
                        if (residue_buffers[j]) {
                            const int vqclass =
                                classifications[j][partition_count];
                            if (!decode_residue_partition(
                                    handle, res, pass, partition_count, vqclass,
                                    &residue_buffers[j], n, ch, do_partition)) {
                                return;  // End of packet.
                            }
                        }
                    }
                }
            } // while (partition_count < partitions_to_read)
        } // for (int pass = 0; pass < 8; pass++)

    } else { // !handle->divides_in_residue

        /* Fast path: class sequences were precomputed at setup time in
         * res->classdata, so each classword is a table lookup. */
        uint8_t ***part_classdata = handle->classifications;

        for (int pass = 0; pass < 8; pass++) {
            int partition_count = 0;
            int class_set = 0;
            while (partition_count < partitions_to_read) {
                if (pass == 0) {
                    /* On the first pass, look up the class sequence for
                     * one classword per active channel. */
                    for (int j = 0; j < ch_to_read; j++) {
                        if (residue_buffers[j]) {
                            const int temp =
                                codebook_decode_scalar(handle, classbook);
                            if (UNLIKELY(temp == EOP)) {
                                return;
                            }
                            part_classdata[j][class_set] =
                                res->classdata[temp];
                        }
                    }
                }
                for (int i = 0;
                     i < classwords && partition_count < partitions_to_read;
                     i++, partition_count++)
                {
                    for (int j = 0; j < ch_to_read; j++) {
                        if (residue_buffers[j]) {
                            const int vqclass =
                                part_classdata[j][class_set][i];
                            if (!decode_residue_partition(
                                    handle, res, pass, partition_count, vqclass,
                                    &residue_buffers[j], n, ch, do_partition)) {
                                return;  // End of packet.
                            }
                        }
                    }
                }
                class_set++;
            } // while (partition_count < partitions_to_read)
        } // for (int pass = 0; pass < 8; pass++)

    } // if (handle->divides_in_residue)
}
1347
1348 /*-----------------------------------------------------------------------*/
1349
1350 /**
1351 * decode_residue: Decode the residue vectors for the current frame.
1352 *
1353 * [Parameters]
1354 * handle: Stream handle.
1355 * residue_index: Index of residue configuration to use.
1356 * n: Length of residue vectors (half of window size).
1357 * ch: Number of channels of residue data to decode.
1358 * residue_buffers: Pointers to the buffers for each channel in which
1359 * the residue vectors should be stored. A pointer of NULL
1360 * indicates that the given channel should not be decoded (the
1361 * "do not decode" state referred to by the specification).
1362 */
1363 static void decode_residue(stb_vorbis *handle, int residue_index, int n,
1364 int ch, float *residue_buffers[])
1365 {
1366 Residue *res = &handle->residue_config[residue_index];
1367 const int type = handle->residue_types[residue_index];
1368
1369 (4/4) for (int i = 0; i < ch; i++) {
1370 (4/4) if (residue_buffers[i]) {
1371 memset(residue_buffers[i], 0, sizeof(*residue_buffers[i]) * n);
1372 }
1373 }
1374
1375 /* For residue type 2, if there are multiple channels, we need to
1376 * deinterleave the data after decoding it. */
1377 (8/8) if (type == 2 && ch > 1) {
1378 (8/8) if (ch == 2 && !handle->divides_in_residue
1379 (6/8) && residue_buffers[0] && residue_buffers[1]) {
1380 /* We have slightly optimized handling for this case. */
1381 decode_residue_common(
1382 handle, res, type, n, ch, residue_buffers,
1383 decode_residue_partition_2_2ch);
1384 } else {
1385 decode_residue_common(
1386 handle, res, type, n, ch, residue_buffers,
1387 decode_residue_partition_2);
1388 }
1389 } else {
1390 (4/4) decode_residue_common(
1391 handle, res, type, n, ch, residue_buffers,
1392 type==0 ? decode_residue_partition_0 : decode_residue_partition_1);
1393 }
1394 }
1395
1396 /*************************************************************************/
1397 /************************ Inverse MDCT processing ************************/
1398 /*************************************************************************/
1399
1400 /**
1401 * imdct_setup_step1: Setup and step 1 of the IMDCT.
1402 *
1403 * [Parameters]
1404 * n: Window size.
1405 * A: Twiddle factor A.
1406 * Y: Input buffer (length n/2).
1407 * v: Output buffer (length n/2). Must be distinct from the input buffer.
1408 */
static void imdct_setup_step1(const unsigned int n, const float *A,
                              const float *Y, float *v)
{
#if 0 // Roughly literal implementation.

    for (unsigned int i = 0; i < n/4; i += 2) {
        v[(n/2)-i-1] = Y[i*2]*A[i+0] - Y[(i+1)*2]*A[i+1];
        v[(n/2)-i-2] = Y[i*2]*A[i+1] + Y[(i+1)*2]*A[i+0];
    }
    for (unsigned int i = 0; i < n/4; i += 2) {
        v[(n/4)-i-1] =
            -Y[(n/2-1)-i*2]*A[(n/4)+i+0] - -Y[(n/2-1)-(i+1)*2]*A[(n/4)+i+1];
        v[(n/4)-i-2] =
            -Y[(n/2-1)-i*2]*A[(n/4)+i+1] + -Y[(n/2-1)-(i+1)*2]*A[(n/4)+i+0];
    }

#else // Optimized implementation (reduces general-purpose register pressure).

    /* Sign-flip masks for the vector paths: sign_1010 flips elements
     * 0 and 2 of a 4-float vector; sign_1111 (SSE2 only) flips all four. */
#if defined(ENABLE_ASM_ARM_NEON)
    const uint32x4_t sign_1010 = (uint32x4_t)vdupq_n_u64(UINT64_C(1) << 63);
#elif defined(ENABLE_ASM_X86_SSE2)
    const __m128 sign_1010 = (__m128)_mm_set1_epi64x(UINT64_C(1) << 63);
    const __m128 sign_1111 = (__m128)_mm_set1_epi32(UINT32_C(1) << 31);
#endif

    /* First half: fill v[n/4..n/2) from the even elements of Y,
     * writing backward (the base pointer is biased so j can count down
     * from -4 while i counts up). */
    v += n/2;
    for (int i = 0, j = -4; i < (int)(n/4); i += 4, j -= 4) {
#if defined(ENABLE_ASM_ARM_NEON)
        const float32x4x2_t Y_all = vld2q_f32(&Y[i*2]);
        const float32x4_t Y_i2 = vswizzleq_zzxx_f32(Y_all.val[0]);
        const float32x4_t Y_i3 = vswizzleq_wwyy_f32(Y_all.val[0]);
        const float32x4_t A_i = vld1q_f32(&A[i]);
        const float32x4_t A_i3 = vswizzleq_wzyx_f32(A_i);
        const float32x4_t A_i2 = vswizzleq_zwxy_f32(A_i);
        vst1q_f32(&v[j], vaddq_f32(vmulq_f32(Y_i2, A_i3),
                                   veorq_f32(sign_1010,
                                             vmulq_f32(Y_i3, A_i2))));
#elif defined(ENABLE_ASM_X86_SSE2)
        const __m128 Y_2i_0 = _mm_load_ps(&Y[(i+0)*2]);
        const __m128 Y_2i_4 = _mm_load_ps(&Y[(i+2)*2]);
        const __m128 Y_i2 =
            _mm_shuffle_ps(Y_2i_4, Y_2i_0, _MM_SHUFFLE(0,0,0,0));
        const __m128 Y_i3 =
            _mm_shuffle_ps(Y_2i_4, Y_2i_0, _MM_SHUFFLE(2,2,2,2));
        const __m128 A_i = _mm_load_ps(&A[i]);
        const __m128 A_i3 = _mm_shuffle_ps(A_i, A_i, _MM_SHUFFLE(0,1,2,3));
        const __m128 A_i2 = _mm_shuffle_ps(A_i, A_i, _MM_SHUFFLE(1,0,3,2));
        _mm_store_ps(&v[j], _mm_add_ps(_mm_mul_ps(Y_i2, A_i3),
                                       _mm_xor_ps(sign_1010,
                                                  _mm_mul_ps(Y_i3, A_i2))));
#else
        /* Scalar reference for the vector code above: four outputs
         * (two butterfly pairs) per iteration. */
        v[j+3] = Y[(i+0)*2]*A[i+0] - Y[(i+1)*2]*A[i+1];
        v[j+2] = Y[(i+0)*2]*A[i+1] + Y[(i+1)*2]*A[i+0];
        v[j+1] = Y[(i+2)*2]*A[i+2] - Y[(i+3)*2]*A[i+3];
        v[j+0] = Y[(i+2)*2]*A[i+3] + Y[(i+3)*2]*A[i+2];
#endif
    }

    /* Second half: fill v[0..n/4) from the negated odd elements of Y,
     * read from the top of the buffer downward. */
    Y += n/2;
    A += n/4;
    v -= n/4;
    for (int i = 0, j = -4; i < (int)(n/4); i += 4, j -= 4) {
#if defined(ENABLE_ASM_ARM_NEON)
        const float32x4x2_t Y_all = vld2q_f32(&Y[j*2]);
        const float32x4_t Y_j1 = vnegq_f32(vswizzleq_yyww_f32(Y_all.val[1]));
        const float32x4_t Y_j0 = vnegq_f32(vswizzleq_xxzz_f32(Y_all.val[1]));
        const float32x4_t A_i = vld1q_f32(&A[i]);
        const float32x4_t A_i3 = vswizzleq_wzyx_f32(A_i);
        const float32x4_t A_i2 = vswizzleq_zwxy_f32(A_i);
        vst1q_f32(&v[j], vaddq_f32(vmulq_f32(Y_j1, A_i3),
                                   veorq_f32(sign_1010,
                                             vmulq_f32(Y_j0, A_i2))));
#elif defined(ENABLE_ASM_X86_SSE2)
        const __m128 Y_2j_0 = _mm_xor_ps(sign_1111, _mm_load_ps(&Y[(j+0)*2]));
        const __m128 Y_2j_4 = _mm_xor_ps(sign_1111, _mm_load_ps(&Y[(j+2)*2]));
        const __m128 Y_j1 =
            _mm_shuffle_ps(Y_2j_0, Y_2j_4, _MM_SHUFFLE(3,3,3,3));
        const __m128 Y_j0 =
            _mm_shuffle_ps(Y_2j_0, Y_2j_4, _MM_SHUFFLE(1,1,1,1));
        const __m128 A_i = _mm_load_ps(&A[i]);
        const __m128 A_i3 = _mm_shuffle_ps(A_i, A_i, _MM_SHUFFLE(0,1,2,3));
        const __m128 A_i2 = _mm_shuffle_ps(A_i, A_i, _MM_SHUFFLE(1,0,3,2));
        _mm_store_ps(&v[j], _mm_add_ps(_mm_mul_ps(Y_j1, A_i3),
                                       _mm_xor_ps(sign_1010,
                                                  _mm_mul_ps(Y_j0, A_i2))));
#else
        /* Scalar reference for the vector code above. */
        v[j+3] = -Y[(j+3)*2+1]*A[i+0] - -Y[(j+2)*2+1]*A[i+1];
        v[j+2] = -Y[(j+3)*2+1]*A[i+1] + -Y[(j+2)*2+1]*A[i+0];
        v[j+1] = -Y[(j+1)*2+1]*A[i+2] - -Y[(j+0)*2+1]*A[i+3];
        v[j+0] = -Y[(j+1)*2+1]*A[i+3] + -Y[(j+0)*2+1]*A[i+2];
#endif
    }

#endif // Literal vs. optimized implementation.
}
1504
1505 /*-----------------------------------------------------------------------*/
1506
1507 /**
1508 * imdct_step2: Step 2 of the IMDCT.
1509 *
1510 * [Parameters]
1511 * n: Window size.
1512 * A: Twiddle factor A.
1513 * v: Input buffer (length n/2).
1514 * w: Output buffer (length n/2). May be the same as the input buffer.
1515 */
static void imdct_step2(const unsigned int n, const float *A,
                        const float *v, float *w)
{
    /* Butterfly between the two halves of v: sums go to the upper half
     * of w, twiddled differences to the lower half. */
    const float *v0 = &v[(n/4)];
    const float *v1 = &v[0];
    float *w0 = &w[(n/4)];
    float *w1 = &w[0];
    /* Twiddle factors are consumed from the top of A downward, 8 floats
     * (4 complex pairs) per iteration. */
    const float *AA = &A[(n/2)-8];

    for (int i = 0; AA >= A; i += 4, AA -= 8) {

#if defined(ENABLE_ASM_ARM_NEON)

        /* sign_1010 flips elements 0 and 2 of a 4-float vector. */
        const uint32x4_t sign_1010 = (uint32x4_t)vdupq_n_u64(UINT64_C(1) << 63);
        const float32x4_t v0_i = vld1q_f32(&v0[i]);
        const float32x4_t v1_i = vld1q_f32(&v1[i]);
        const float32x4x2_t AA_all = vld2q_f32(AA);
        vst1q_f32(&w0[i], vaddq_f32(v0_i, v1_i));
        const float32x4_t diff = vsubq_f32(v0_i, v1_i);
        const float32x4_t diff2 = vswizzleq_yxwz_f32(diff);
        const uint32x4_t AA_sel = {~0U, 0, ~0U, 0};
        const float32x4_t AA_40 = vswizzleq_zzxx_f32(vbslq_f32(
            AA_sel, AA_all.val[0],
            (float32x4_t)vshlq_n_u64((uint64x2_t)AA_all.val[0], 32)));
        const float32x4_t AA_51 = vswizzleq_zzxx_f32(vbslq_f32(
            AA_sel, AA_all.val[1],
            (float32x4_t)vshlq_n_u64((uint64x2_t)AA_all.val[1], 32)));
        vst1q_f32(&w1[i], vaddq_f32(vmulq_f32(diff, AA_40),
                                    veorq_f32(sign_1010,
                                              vmulq_f32(diff2, AA_51))));

#elif defined(ENABLE_ASM_X86_SSE2)

        /* sign_1010 flips elements 0 and 2 of a 4-float vector. */
        const __m128 sign_1010 = (__m128)_mm_set1_epi64x(UINT64_C(1) << 63);
        const __m128 v0_i = _mm_load_ps(&v0[i]);
        const __m128 v1_i = _mm_load_ps(&v1[i]);
        const __m128 AA_0 = _mm_load_ps(&AA[0]);
        const __m128 AA_4 = _mm_load_ps(&AA[4]);
        _mm_store_ps(&w0[i], _mm_add_ps(v0_i, v1_i));
        const __m128 diff = _mm_sub_ps(v0_i, v1_i);
        const __m128 diff2 = _mm_shuffle_ps(diff, diff, _MM_SHUFFLE(2,3,0,1));
        const __m128 AA_40 = _mm_shuffle_ps(AA_4, AA_0, _MM_SHUFFLE(0,0,0,0));
        const __m128 AA_51 = _mm_shuffle_ps(AA_4, AA_0, _MM_SHUFFLE(1,1,1,1));
        _mm_store_ps(&w1[i], _mm_add_ps(_mm_mul_ps(diff, AA_40),
                                        _mm_xor_ps(sign_1010,
                                                   _mm_mul_ps(diff2, AA_51))));

#else

        /* Scalar reference for the vector code above: two complex
         * butterflies per iteration, with the twiddle pairs taken from
         * AA[4..5] then AA[0..1]. */
        float v40_20, v41_21;

        v41_21 = v0[i+1] - v1[i+1];
        v40_20 = v0[i+0] - v1[i+0];
        w0[i+1] = v0[i+1] + v1[i+1];
        w0[i+0] = v0[i+0] + v1[i+0];
        w1[i+1] = v41_21*AA[4] - v40_20*AA[5];
        w1[i+0] = v40_20*AA[4] + v41_21*AA[5];

        v41_21 = v0[i+3] - v1[i+3];
        v40_20 = v0[i+2] - v1[i+2];
        w0[i+3] = v0[i+3] + v1[i+3];
        w0[i+2] = v0[i+2] + v1[i+2];
        w1[i+3] = v41_21*AA[0] - v40_20*AA[1];
        w1[i+2] = v40_20*AA[0] + v41_21*AA[1];

#endif // ENABLE_ASM_*

    }
}
1585
1586 /*-----------------------------------------------------------------------*/
1587
1588 /**
1589 * imdct_step3_inner_r_loop: Step 3 of the IMDCT for a single iteration
1590 * of the l and s loops.
1591 *
1592 * [Parameters]
1593 * lim: Iteration limit for the r loop.
1594 * A: Twiddle factor A.
1595 * e: Input/output buffer (length n/2, modified in place).
1596 * i_off: Initial offset, calculated as (n/2 - k0*s).
1597 * k0: Constant k0.
1598 * k1: Constant k1.
1599 */
static void imdct_step3_inner_r_loop(const unsigned int lim, const float *A,
                                     float *e, int i_off, int k0, int k1)
{
    /* e0 walks down from the initial offset; e2 trails it by k0/2.
     * Each iteration processes 8 floats (4 butterflies) from each. */
    float *e0 = e + i_off;
    float *e2 = e0 - k0/2;

    for (int i = lim/4; i > 0; --i, e0 -= 8, e2 -= 8) {

#if defined(ENABLE_ASM_ARM_NEON)

        /* sign_1010 flips elements 0 and 2 of a 4-float vector. */
        const uint32x4_t sign_1010 = (uint32x4_t)vdupq_n_u64(UINT64_C(1) << 63);
        const float32x4_t e0_4 = vld1q_f32(&e0[-4]);
        const float32x4_t e2_4 = vld1q_f32(&e2[-4]);
        const float32x4_t e0_8 = vld1q_f32(&e0[-8]);
        const float32x4_t e2_8 = vld1q_f32(&e2[-8]);
        /* Gather the twiddle pairs from four A rows spaced k1 apart. */
        const float32x4x2_t A_0k =
            vuzpq_f32(vld1q_f32(&A[0]), vld1q_f32(&A[k1]));
        const float32x4x2_t A_2k =
            vuzpq_f32(vld1q_f32(&A[2*k1]), vld1q_f32(&A[3*k1]));
        vst1q_f32(&e0[-4], vaddq_f32(e0_4, e2_4));
        vst1q_f32(&e0[-8], vaddq_f32(e0_8, e2_8));
        const float32x4_t diff_4 = vsubq_f32(e0_4, e2_4);
        const float32x4_t diff_8 = vsubq_f32(e0_8, e2_8);
        const float32x4_t diff2_4 = vswizzleq_yxwz_f32(diff_4);
        const float32x4_t diff2_8 = vswizzleq_yxwz_f32(diff_8);
        const uint32x4_t A_sel = {~0U, 0, ~0U, 0};
        const float32x4_t A_0 = vswizzleq_zzxx_f32(vbslq_f32(
            A_sel, A_0k.val[0],
            (float32x4_t)vshlq_n_u64((uint64x2_t)A_0k.val[0], 32)));
        const float32x4_t A_1 = vswizzleq_zzxx_f32(vbslq_f32(
            A_sel, A_0k.val[1],
            (float32x4_t)vshlq_n_u64((uint64x2_t)A_0k.val[1], 32)));
        const float32x4_t A_2 = vswizzleq_zzxx_f32(vbslq_f32(
            A_sel, A_2k.val[0],
            (float32x4_t)vshlq_n_u64((uint64x2_t)A_2k.val[0], 32)));
        const float32x4_t A_3 = vswizzleq_zzxx_f32(vbslq_f32(
            A_sel, A_2k.val[1],
            (float32x4_t)vshlq_n_u64((uint64x2_t)A_2k.val[1], 32)));
        vst1q_f32(&e2[-4], vaddq_f32(vmulq_f32(diff_4, A_0),
                                     veorq_f32(sign_1010,
                                               vmulq_f32(diff2_4, A_1))));
        vst1q_f32(&e2[-8], vaddq_f32(vmulq_f32(diff_8, A_2),
                                     veorq_f32(sign_1010,
                                               vmulq_f32(diff2_8, A_3))));
        A += 4*k1;

#elif defined(ENABLE_ASM_X86_SSE2)

        /* sign_1010 flips elements 0 and 2 of a 4-float vector. */
        const __m128 sign_1010 = (__m128)_mm_set1_epi64x(UINT64_C(1) << 63);
        const __m128 e0_4 = _mm_load_ps(&e0[-4]);
        const __m128 e2_4 = _mm_load_ps(&e2[-4]);
        const __m128 e0_8 = _mm_load_ps(&e0[-8]);
        const __m128 e2_8 = _mm_load_ps(&e2[-8]);
        /* Gather the twiddle pairs from four A rows spaced k1 apart. */
        const __m128 A_0k = _mm_load_ps(A);
        const __m128 A_1k = _mm_load_ps(&A[k1]);
        const __m128 A_2k = _mm_load_ps(&A[2*k1]);
        const __m128 A_3k = _mm_load_ps(&A[3*k1]);
        _mm_store_ps(&e0[-4], _mm_add_ps(e0_4, e2_4));
        _mm_store_ps(&e0[-8], _mm_add_ps(e0_8, e2_8));
        const __m128 diff_4 = _mm_sub_ps(e0_4, e2_4);
        const __m128 diff_8 = _mm_sub_ps(e0_8, e2_8);
        const __m128 diff2_4 =
            _mm_shuffle_ps(diff_4, diff_4, _MM_SHUFFLE(2,3,0,1));
        const __m128 diff2_8 =
            _mm_shuffle_ps(diff_8, diff_8, _MM_SHUFFLE(2,3,0,1));
        const __m128 A_0 = _mm_shuffle_ps(A_1k, A_0k, _MM_SHUFFLE(0,0,0,0));
        const __m128 A_1 = _mm_shuffle_ps(A_1k, A_0k, _MM_SHUFFLE(1,1,1,1));
        const __m128 A_2 = _mm_shuffle_ps(A_3k, A_2k, _MM_SHUFFLE(0,0,0,0));
        const __m128 A_3 = _mm_shuffle_ps(A_3k, A_2k, _MM_SHUFFLE(1,1,1,1));
        _mm_store_ps(&e2[-4], _mm_add_ps(_mm_mul_ps(diff_4, A_0),
                                         _mm_xor_ps(sign_1010,
                                                    _mm_mul_ps(diff2_4, A_1))));
        _mm_store_ps(&e2[-8], _mm_add_ps(_mm_mul_ps(diff_8, A_2),
                                         _mm_xor_ps(sign_1010,
                                                    _mm_mul_ps(diff2_8, A_3))));
        A += 4*k1;

#else

        /* Scalar reference for the vector code above: one complex
         * butterfly per pair of elements, advancing A by k1 each time. */
        float k00_20, k01_21;

        k00_20 = e0[-1] - e2[-1];
        k01_21 = e0[-2] - e2[-2];
        e0[-1] = e0[-1] + e2[-1];
        e0[-2] = e0[-2] + e2[-2];
        e2[-1] = k00_20 * A[0] - k01_21 * A[1];
        e2[-2] = k01_21 * A[0] + k00_20 * A[1];
        A += k1;

        k00_20 = e0[-3] - e2[-3];
        k01_21 = e0[-4] - e2[-4];
        e0[-3] = e0[-3] + e2[-3];
        e0[-4] = e0[-4] + e2[-4];
        e2[-3] = k00_20 * A[0] - k01_21 * A[1];
        e2[-4] = k01_21 * A[0] + k00_20 * A[1];
        A += k1;

        k00_20 = e0[-5] - e2[-5];
        k01_21 = e0[-6] - e2[-6];
        e0[-5] = e0[-5] + e2[-5];
        e0[-6] = e0[-6] + e2[-6];
        e2[-5] = k00_20 * A[0] - k01_21 * A[1];
        e2[-6] = k01_21 * A[0] + k00_20 * A[1];
        A += k1;

        k00_20 = e0[-7] - e2[-7];
        k01_21 = e0[-8] - e2[-8];
        e0[-7] = e0[-7] + e2[-7];
        e0[-8] = e0[-8] + e2[-8];
        e2[-7] = k00_20 * A[0] - k01_21 * A[1];
        e2[-8] = k01_21 * A[0] + k00_20 * A[1];
        A += k1;

#endif // ENABLE_ASM_*

    }
}
1717
1718 /*-----------------------------------------------------------------------*/
1719
1720 /**
1721 * imdct_step3_inner_s_loop: Step 3 of the IMDCT for a single iteration
1722 * of the l and r loops.
1723 *
1724 * [Parameters]
1725 * lim: Iteration limit for the s loop.
1726 * A: Twiddle factor A.
1727 * e: Input/output buffer (length n/2, modified in place).
1728 * i_off: Initial offset, calculated as (n/2 - k0*s).
1729 * k0: Constant k0.
1730 * k1: Constant k1.
1731 */
static void imdct_step3_inner_s_loop(const unsigned int lim, const float *A,
                                     float *e, int i_off, int k0, int k1)
{
    /* All eight twiddle factors used below are loop-invariant (the s loop
     * uses a fixed stride k1 into A[]), so hoist them into locals. */
    const float A0 = A[0];
    const float A1 = A[0+1];
    const float A2 = A[0+k1];
    const float A3 = A[0+k1+1];
    const float A4 = A[0+k1*2+0];
    const float A5 = A[0+k1*2+1];
    const float A6 = A[0+k1*3+0];
    const float A7 = A[0+k1*3+1];

    /* e0 and e2 point at the two butterfly input rows, k0/2 elements
     * apart; both march downward by k0 on every iteration. */
    float *e0 = e + i_off;
    float *e2 = e0 - k0/2;

    for (int i = lim; i > 0; i--, e0 -= k0, e2 -= k0) {

#if defined(ENABLE_ASM_ARM_NEON)

        /* Vectorized butterfly: element sums are written back to e0[],
         * and the complex-rotated differences to e2[].  sign_1010 flips
         * the sign of alternating lanes to realize the +/- pattern of
         * the complex multiply. */
        const uint32x4_t sign_1010 = (uint32x4_t)vdupq_n_u64(UINT64_C(1) << 63);
        const float32x4_t A_20 = {A2, A2, A0, A0};
        const float32x4_t A_31 = {A3, A3, A1, A1};
        const float32x4_t A_64 = {A6, A6, A4, A4};
        const float32x4_t A_75 = {A7, A7, A5, A5};
        const float32x4_t e0_4 = vld1q_f32(&e0[-4]);
        const float32x4_t e2_4 = vld1q_f32(&e2[-4]);
        const float32x4_t e0_8 = vld1q_f32(&e0[-8]);
        const float32x4_t e2_8 = vld1q_f32(&e2[-8]);
        vst1q_f32(&e0[-4], vaddq_f32(e0_4, e2_4));
        vst1q_f32(&e0[-8], vaddq_f32(e0_8, e2_8));
        const float32x4_t diff_4 = vsubq_f32(e0_4, e2_4);
        const float32x4_t diff_8 = vsubq_f32(e0_8, e2_8);
        const float32x4_t diff2_4 = vswizzleq_yxwz_f32(diff_4);
        const float32x4_t diff2_8 = vswizzleq_yxwz_f32(diff_8);
        vst1q_f32(&e2[-4], vaddq_f32(vmulq_f32(diff_4, A_20),
                                     veorq_f32(sign_1010,
                                               vmulq_f32(diff2_4, A_31))));
        vst1q_f32(&e2[-8], vaddq_f32(vmulq_f32(diff_8, A_64),
                                     veorq_f32(sign_1010,
                                               vmulq_f32(diff2_8, A_75))));

#elif defined(ENABLE_ASM_X86_SSE2)

        /* Same scheme as the NEON path; note _mm_set_ps takes arguments
         * in high-to-low lane order, hence the reversed constants. */
        const __m128 sign_1010 = (__m128)_mm_set1_epi64x(UINT64_C(1) << 63);
        const __m128 A_20 = _mm_set_ps(A0, A0, A2, A2);
        const __m128 A_31 = _mm_set_ps(A1, A1, A3, A3);
        const __m128 A_64 = _mm_set_ps(A4, A4, A6, A6);
        const __m128 A_75 = _mm_set_ps(A5, A5, A7, A7);
        const __m128 e0_4 = _mm_load_ps(&e0[-4]);
        const __m128 e2_4 = _mm_load_ps(&e2[-4]);
        const __m128 e0_8 = _mm_load_ps(&e0[-8]);
        const __m128 e2_8 = _mm_load_ps(&e2[-8]);
        _mm_store_ps(&e0[-4], _mm_add_ps(e0_4, e2_4));
        _mm_store_ps(&e0[-8], _mm_add_ps(e0_8, e2_8));
        const __m128 diff_4 = _mm_sub_ps(e0_4, e2_4);
        const __m128 diff_8 = _mm_sub_ps(e0_8, e2_8);
        const __m128 diff2_4 =
            _mm_shuffle_ps(diff_4, diff_4, _MM_SHUFFLE(2,3,0,1));
        const __m128 diff2_8 =
            _mm_shuffle_ps(diff_8, diff_8, _MM_SHUFFLE(2,3,0,1));
        _mm_store_ps(&e2[-4], _mm_add_ps(_mm_mul_ps(diff_4, A_20),
                                         _mm_xor_ps(sign_1010,
                                                    _mm_mul_ps(diff2_4, A_31))));
        _mm_store_ps(&e2[-8], _mm_add_ps(_mm_mul_ps(diff_8, A_64),
                                         _mm_xor_ps(sign_1010,
                                                    _mm_mul_ps(diff2_8, A_75))));

#else

        /* Scalar butterfly over four complex pairs: sums stay in e0[],
         * the differences rotated by the twiddle factors go to e2[]. */
        float k00, k11;

        k00 = e0[-1] - e2[-1];
        k11 = e0[-2] - e2[-2];
        e0[-1] = e0[-1] + e2[-1];
        e0[-2] = e0[-2] + e2[-2];
        e2[-1] = k00 * A0 - k11 * A1;
        e2[-2] = k11 * A0 + k00 * A1;

        k00 = e0[-3] - e2[-3];
        k11 = e0[-4] - e2[-4];
        e0[-3] = e0[-3] + e2[-3];
        e0[-4] = e0[-4] + e2[-4];
        e2[-3] = k00 * A2 - k11 * A3;
        e2[-4] = k11 * A2 + k00 * A3;

        k00 = e0[-5] - e2[-5];
        k11 = e0[-6] - e2[-6];
        e0[-5] = e0[-5] + e2[-5];
        e0[-6] = e0[-6] + e2[-6];
        e2[-5] = k00 * A4 - k11 * A5;
        e2[-6] = k11 * A4 + k00 * A5;

        k00 = e0[-7] - e2[-7];
        k11 = e0[-8] - e2[-8];
        e0[-7] = e0[-7] + e2[-7];
        e0[-8] = e0[-8] + e2[-8];
        e2[-7] = k00 * A6 - k11 * A7;
        e2[-8] = k11 * A6 + k00 * A7;

#endif  // ENABLE_ASM_*

    }
}
1835
1836 /*-----------------------------------------------------------------------*/
1837
1838 /**
1839 * iter_54: Step 3 of the IMDCT for l=ld(n)-5 and l=ld(n)-4 for a sequence
1840 * of 8 elements. Helper function for imdct_step3_inner_s_loop_ld654().
1841 *
1842 * [Parameters]
1843 * z: Pointer to elements to operate on.
1844 */
static inline void iter_54(float *z)
{
#if defined(ENABLE_ASM_X86_SSE2)

    /* Vectorized radix-4-style pass over z[-8..-1]: compute pairwise
     * sums and differences of the two halves, then combine them with
     * per-lane sign flips matching the scalar expressions below. */
    const __m128 sign_0011 =
        (__m128)_mm_set_epi32(0, 0, UINT32_C(1) << 31, UINT32_C(1) << 31);
    const __m128 sign_0110 =
        (__m128)_mm_set_epi32(0, UINT32_C(1) << 31, UINT32_C(1) << 31, 0);
    const __m128 z_4 = _mm_load_ps(&z[-4]);
    const __m128 z_8 = _mm_load_ps(&z[-8]);
    const __m128 sum = _mm_add_ps(z_4, z_8);
    const __m128 diff = _mm_sub_ps(z_4, z_8);
    const __m128 sum_23 = _mm_shuffle_ps(sum, sum, _MM_SHUFFLE(3,2,3,2));
    const __m128 sum_01 = _mm_shuffle_ps(sum, sum, _MM_SHUFFLE(1,0,1,0));
    const __m128 diff_23 = _mm_shuffle_ps(diff, diff, _MM_SHUFFLE(3,2,3,2));
    const __m128 diff_10 = _mm_shuffle_ps(diff, diff, _MM_SHUFFLE(0,1,0,1));
    _mm_store_ps(&z[-4], _mm_add_ps(sum_23, _mm_xor_ps(sign_0011, sum_01)));
    _mm_store_ps(&z[-8], _mm_add_ps(diff_23, _mm_xor_ps(sign_0110, diff_10)));

#else

    /* k* are the differences and y* the sums of corresponding elements
     * from the two 4-element halves; the comments on each store give the
     * expansion in terms of the original values z1..z8 (= z[-1]..z[-8]). */
    const float k00 = z[-1] - z[-5];
    const float k11 = z[-2] - z[-6];
    const float k22 = z[-3] - z[-7];
    const float k33 = z[-4] - z[-8];
    const float y0 = z[-1] + z[-5];
    const float y1 = z[-2] + z[-6];
    const float y2 = z[-3] + z[-7];
    const float y3 = z[-4] + z[-8];

    z[-1] = y0 + y2;      // z1 + z5 + z3 + z7
    z[-2] = y1 + y3;      // z2 + z6 + z4 + z8
    z[-3] = y0 - y2;      // z1 + z5 - z3 - z7
    z[-4] = y1 - y3;      // z2 + z6 - z4 - z8
    z[-5] = k00 + k33;    // z1 - z5 + z4 - z8
    z[-6] = k11 - k22;    // z2 - z6 + z3 - z7
    z[-7] = k00 - k33;    // z1 - z5 - z4 + z8
    z[-8] = k11 + k22;    // z2 - z6 - z3 + z7

#endif  // ENABLE_ASM_*
}
1886
1887 /*-----------------------------------------------------------------------*/
1888
1889 /**
1890 * imdct_step3_inner_s_loop_ld654: Step 3 of the IMDCT for a single
1891 * iteration of the l and r loops at l=log2_n-{6,5,4}.
1892 *
1893 * [Parameters]
1894 * n: Window size.
1895 * A: Twiddle factor A.
1896 * e: Input/output buffer (length n/2, modified in place).
1897 */
static void imdct_step3_inner_s_loop_ld654(
    const unsigned int n, const float *A, float *e)
{
    /* At these passes the twiddle factors reduce to 0, +/-1, and +/-A2,
     * so the general complex multiply collapses to adds, negations, and
     * a single scale by A2 (see per-line expansions below). */
    const int a_off = n/8;
    const float A2 = A[a_off];

    for (float *z = e + n/2; z > e; z -= 16) {

#if defined(ENABLE_ASM_X86_SSE2)

        const __m128 z_4 = _mm_load_ps(&z[-4]);
        const __m128 z_8 = _mm_load_ps(&z[-8]);
        const __m128 z_12 = _mm_load_ps(&z[-12]);
        const __m128 z_16 = _mm_load_ps(&z[-16]);
        _mm_store_ps(&z[-4], _mm_add_ps(z_4, z_12));
        _mm_store_ps(&z[-8], _mm_add_ps(z_8, z_16));
        const __m128 diff_4 = _mm_sub_ps(z_4, z_12);
        const __m128 diff_8 = _mm_sub_ps(z_8, z_16);
        /* We can't use the same algorithm as imdct_step3_inner_s_loop(),
         * since that can lead to a loss of precision for z[-11,-12,-15,-16]
         * if the two non-constant inputs to the calculation are of nearly
         * equal magnitude and of the appropriate signs to give an
         * intermediate sum or difference of much smaller magnitude:
         * changing the order of operations to multiply by A2 first can
         * introduce an error of at most 1 ULP at the point of the
         * multiplication, but if the final result has a smaller magnitude,
         * that error will be a greater part of the final result. */
        const __m128 temp_4 = _mm_xor_ps(
            _mm_shuffle_ps(diff_4, _mm_set1_ps(0), _MM_SHUFFLE(3,2,0,1)),
            (__m128)_mm_set_epi32(0, 0, 0, UINT32_C(1)<<31));
        _mm_store_ps(&z[-12], _mm_mul_ps(_mm_add_ps(diff_4, temp_4),
                                         _mm_set_ps(1, 1, A2, A2)));
        const __m128 temp1_8 = _mm_xor_ps(
            _mm_shuffle_ps(diff_8, diff_8, _MM_SHUFFLE(2,3,0,1)),
            (__m128)_mm_set_epi32(0, UINT32_C(1)<<31, 0, UINT32_C(1)<<31));
        const __m128 temp2_8 =
            _mm_shuffle_ps(diff_8, _mm_set1_ps(0), _MM_SHUFFLE(3,2,1,0));
        _mm_store_ps(&z[-16], _mm_mul_ps(_mm_sub_ps(temp1_8, temp2_8),
                                         _mm_set_ps(1, 1, A2, A2)));

#else

        float k00, k11;

        /* Twiddle pair (1, 0): rotated difference is just (k00, k11). */
        k00 = z[ -1] - z[ -9];
        k11 = z[ -2] - z[-10];
        z[ -1] = z[ -1] + z[ -9];
        z[ -2] = z[ -2] + z[-10];
        z[ -9] = k00;               // k00*1 - k11*0
        z[-10] = k11;               // k11*1 + k00*0

        /* Twiddle pair (A2, -A2): sum/difference first, then one scale
         * (but see the precision note in the asm path above). */
        k00 = z[ -3] - z[-11];
        k11 = z[ -4] - z[-12];
        z[ -3] = z[ -3] + z[-11];
        z[ -4] = z[ -4] + z[-12];
        z[-11] = (k00+k11) * A2;    // k00*A2 - k11*-A2 (but see asm note above)
        z[-12] = (k11-k00) * A2;    // k11*A2 + k00*-A2

        /* Twiddle pair (0, -1): rotated difference is (k11, -k00). */
        k00 = z[- 5] - z[-13];
        k11 = z[ -6] - z[-14];
        z[ -5] = z[ -5] + z[-13];
        z[ -6] = z[ -6] + z[-14];
        z[-13] = k11;               // k00*0 - k11*-1
        z[-14] = -k00;              // k11*0 + k00*-1

        /* Twiddle pair (-A2, -A2). */
        k00 = z[- 7] - z[-15];
        k11 = z[ -8] - z[-16];
        z[ -7] = z[ -7] + z[-15];
        z[ -8] = z[ -8] + z[-16];
        z[-15] = (k11-k00) * A2;    // k00*-A2 - k11*-A2
        z[-16] = -(k00+k11) * A2;   // k11*-A2 + k00*-A2

#endif  // ENABLE_ASM_*

        /* Passes l=ld(n)-5 and -4 on each 8-element half. */
        iter_54(z);
        iter_54(z-8);
    }
}
1976
1977 /*-----------------------------------------------------------------------*/
1978
1979 /**
1980 * imdct_step456: Steps 4, 5, and 6 of the IMDCT.
1981 *
1982 * [Parameters]
1983 * n: Window size.
1984 * bitrev: Bit-reverse lookup table.
1985 * u: Input buffer (length n/2).
1986 * U: Output buffer (length n/2). Must be distinct from the input buffer.
1987 */
static void imdct_step456(const unsigned int n, const uint16_t *bitrev,
                          const float *u, float *U)
{
    /* stb_vorbis note: "weirdly, I'd have thought reading sequentially and
     * writing erratically would have been better than vice-versa, but in fact
     * that's not what my testing showed. (That is, with j = bitreverse(i),
     * do you read i and write j, or read j and write i.)" */

    /* U0 fills the second quarter of U[] and U1 the second half, both
     * written backward (j decreases) as i walks the bitrev table. */
    float *U0 = &U[n/4];
    float *U1 = &U[n/2];

    for (int i = 0, j = -4; i < (int)(n/8); i += 2, j -= 4) {

#if defined(ENABLE_ASM_X86_SSE2)

        /* Each bitrev entry selects a group of 4 consecutive floats in
         * u[]; the shuffles split each group between U0 and U1 with the
         * element order reversed, matching the scalar path below. */
        const __m128 bitrev_0 = _mm_load_ps(&u[bitrev[i+0]]);
        const __m128 bitrev_1 = _mm_load_ps(&u[bitrev[i+1]]);
        _mm_store_ps(&U0[j], (__m128)_mm_shuffle_ps(
                         bitrev_1, bitrev_0, _MM_SHUFFLE(2,3,2,3)));
        _mm_store_ps(&U1[j], (__m128)_mm_shuffle_ps(
                         bitrev_1, bitrev_0, _MM_SHUFFLE(0,1,0,1)));

#else

        int k4;

        k4 = bitrev[i+0];
        U1[j+3] = u[k4+0];
        U1[j+2] = u[k4+1];
        U0[j+3] = u[k4+2];
        U0[j+2] = u[k4+3];

        k4 = bitrev[i+1];
        U1[j+1] = u[k4+0];
        U1[j+0] = u[k4+1];
        U0[j+1] = u[k4+2];
        U0[j+0] = u[k4+3];

#endif  // ENABLE_ASM_*

    }
}
2030
2031 /*-----------------------------------------------------------------------*/
2032
2033 /**
2034 * imdct_step7: Step 7 of the IMDCT.
2035 *
2036 * [Parameters]
2037 * n: Window size.
2038 * C: Twiddle factor C.
2039 * buffer: Input/output buffer (length n/2, modified in place).
2040 */
static void imdct_step7(const unsigned int n, const float *C, float *buffer)
{
    /* d walks forward from the start of the buffer and e backward from
     * the end; each iteration combines a 4-element group from each end
     * using the C twiddle table, converging at the middle. */
    for (float *d = buffer, *e = buffer + (n/2) - 4; d < e;
         C += 4, d += 4, e -= 4)
    {

#if defined(ENABLE_ASM_ARM_NEON)

        const uint32x4_t sign_1010 = (uint32x4_t)vdupq_n_u64(UINT64_C(1)<<63);
        const float32x4_t C_0 = vld1q_f32(C);
        const float32x4_t d_0 = vld1q_f32(d);
        const float32x4_t e_0 = vld1q_f32(e);
        const float32x4_t e_2 = veorq_f32(sign_1010, vswizzleq_zwxy_f32(e_0));
        const float32x4_t sub = vsubq_f32(d_0, e_2);
        const float32x4_t add = vaddq_f32(d_0, e_2);
        const float32x4_t C02 = veorq_f32(sign_1010, vswizzleq_xxzz_f32(C_0));
        const float32x4_t C13 = vswizzleq_yyww_f32(C_0);
        const float32x4_t mix = vaddq_f32(
            vmulq_f32(C13, sub), vmulq_f32(C02, vswizzleq_yxwz_f32(sub)));
        const float32x4_t e_temp = vsubq_f32(add, mix);
        const float32x4_t e_out =
            veorq_f32(sign_1010, vswizzleq_zwxy_f32(e_temp));
        vst1q_f32(d, vaddq_f32(add, mix));
        vst1q_f32(e, e_out);

#elif defined(ENABLE_ASM_X86_SSE2)

        const __m128 sign_1010 = (__m128)_mm_set1_epi64x(UINT64_C(1) << 63);
        const __m128 C_0 = _mm_load_ps(C);
        const __m128 d_0 = _mm_load_ps(d);
        const __m128 e_0 = _mm_load_ps(e);
        const __m128 e_2 = _mm_xor_ps(
            sign_1010, _mm_shuffle_ps(e_0, e_0, _MM_SHUFFLE(1,0,3,2)));
        const __m128 sub = _mm_sub_ps(d_0, e_2);
        const __m128 add = _mm_add_ps(d_0, e_2);
        const __m128 C02 = _mm_xor_ps(
            sign_1010, _mm_shuffle_ps(C_0, C_0, _MM_SHUFFLE(2,2,0,0)));
        const __m128 C13 = _mm_shuffle_ps(C_0, C_0, _MM_SHUFFLE(3,3,1,1));
        const __m128 mix = _mm_add_ps(
            _mm_mul_ps(C13, sub),
            _mm_mul_ps(C02, _mm_shuffle_ps(sub, sub, _MM_SHUFFLE(2,3,0,1))));
        const __m128 e_temp = _mm_sub_ps(add, mix);
        const __m128 e_out = _mm_xor_ps(
            sign_1010, _mm_shuffle_ps(e_temp, e_temp, _MM_SHUFFLE(1,0,3,2)));
        _mm_store_ps(d, _mm_add_ps(add, mix));
        _mm_store_ps(e, e_out);

#else

        /* Scalar path: the e[] group is consumed with its pairs swapped
         * and alternating signs (see sub0..sub3/add0..add3), then the
         * mixed terms are added to d[] and subtracted (with sign flips)
         * back into e[]. */
        float sub0 = d[0] - e[2];
        float sub1 = d[1] + e[3];
        float sub2 = d[2] - e[0];
        float sub3 = d[3] + e[1];

        float mix0 = C[1]*sub0 + C[0]*sub1;
        float mix1 = C[1]*sub1 - C[0]*sub0;
        float mix2 = C[3]*sub2 + C[2]*sub3;
        float mix3 = C[3]*sub3 - C[2]*sub2;

        float add0 = d[0] + e[2];
        float add1 = d[1] - e[3];
        float add2 = d[2] + e[0];
        float add3 = d[3] - e[1];

        d[0] = add0 + mix0;
        d[1] = add1 + mix1;
        d[2] = add2 + mix2;
        d[3] = add3 + mix3;

        e[0] = add2 - mix2;
        e[1] = -(add3 - mix3);
        e[2] = add0 - mix0;
        e[3] = -(add1 - mix1);

#endif  // ENABLE_ASM_*

    }
}
2119
2120 /*-----------------------------------------------------------------------*/
2121
2122 /**
2123 * imdct_step8_decode: Step 8 and final decoding for the IMDCT.
2124 *
2125 * [Parameters]
2126 * n: Window size.
2127 * B: Twiddle factor B.
2128 * in: Input buffer (length n/2).
2129 * out: Output buffer (length n).
2130 */
static void imdct_step8_decode(const unsigned int n, const float *B,
                               const float *in, float *out)
{
    /* stb_vorbis note: "this generates pairs of data a la 8 and pushes
     * them directly through the decode kernel (pushing rather than
     * pulling) to avoid having to make another pass later" */

    /* The input and the B table are consumed from the end backward.
     * The four destination cursors fill the output window's quarters
     * simultaneously: d0/d2 advance from the start/middle while d1/d3
     * retreat toward them, giving the mirrored symmetry of the window. */
    B += (n/2) - 16;
    float *d0 = &out[0];
    float *d1 = &out[(n/2)-8];
    float *d2 = &out[(n/2)];
    float *d3 = &out[n-8];
    for (const float *e = in + (n/2) - 16; e >= in;
         e -= 16, B -= 16, d0 += 8, d1 -= 8, d2 += 8, d3 -= 8)
    {

#if defined(ENABLE_ASM_ARM_NEON)

        /* vuzpq separates even-indexed and odd-indexed elements so the
         * complex products can be formed lane-wise; the wzyx swizzles
         * produce the reversed copies for the mirrored quarters. */
        const float32x4x2_t e_0 =
            vuzpq_f32(vld1q_f32(&e[0]), vld1q_f32(&e[4]));
        const float32x4x2_t e_8 =
            vuzpq_f32(vld1q_f32(&e[8]), vld1q_f32(&e[12]));
        const float32x4x2_t B_0 =
            vuzpq_f32(vld1q_f32(&B[0]), vld1q_f32(&B[4]));
        const float32x4x2_t B_8 =
            vuzpq_f32(vld1q_f32(&B[8]), vld1q_f32(&B[12]));
        const float32x4_t d1_0 =
            vsubq_f32(vmulq_f32(e_0.val[1], B_0.val[0]),
                      vmulq_f32(e_0.val[0], B_0.val[1]));
        const float32x4_t d0_0 = vnegq_f32(vswizzleq_wzyx_f32(d1_0));
        const float32x4_t d3_0 =
            vnegq_f32(vaddq_f32(vmulq_f32(e_0.val[0], B_0.val[0]),
                                vmulq_f32(e_0.val[1], B_0.val[1])));
        const float32x4_t d2_0 = vswizzleq_wzyx_f32(d3_0);
        const float32x4_t d1_8 =
            vsubq_f32(vmulq_f32(e_8.val[1], B_8.val[0]),
                      vmulq_f32(e_8.val[0], B_8.val[1]));
        const float32x4_t d0_8 = vnegq_f32(vswizzleq_wzyx_f32(d1_8));
        const float32x4_t d3_8 =
            vnegq_f32(vaddq_f32(vmulq_f32(e_8.val[0], B_8.val[0]),
                                vmulq_f32(e_8.val[1], B_8.val[1])));
        const float32x4_t d2_8 = vswizzleq_wzyx_f32(d3_8);
        vst1q_f32(d0, d0_8);
        vst1q_f32(d0+4, d0_0);
        vst1q_f32(d1+4, d1_8);
        vst1q_f32(d1, d1_0);
        vst1q_f32(d2, d2_8);
        vst1q_f32(d2+4, d2_0);
        vst1q_f32(d3+4, d3_8);
        vst1q_f32(d3, d3_0);

#elif defined(ENABLE_ASM_X86_SSE2)

        /* Shuffles with masks (2,0,2,0) and (3,1,3,1) deinterleave the
         * even/odd elements of e[] and B[]; the (0,1,2,3) shuffles give
         * the element-reversed copies for the mirrored quarters, with
         * sign_1111 providing the negations. */
        const __m128 sign_1111 = (__m128)_mm_set1_epi32(UINT32_C(1) << 31);
        const __m128 e_0 = _mm_load_ps(&e[0]);
        const __m128 e_4 = _mm_load_ps(&e[4]);
        const __m128 B_0 = _mm_load_ps(&B[0]);
        const __m128 B_4 = _mm_load_ps(&B[4]);
        const __m128 e_8 = _mm_load_ps(&e[8]);
        const __m128 e_12 = _mm_load_ps(&e[12]);
        const __m128 B_8 = _mm_load_ps(&B[8]);
        const __m128 B_12 = _mm_load_ps(&B[12]);
        const __m128 e_even_0 = _mm_shuffle_ps(e_0, e_4, _MM_SHUFFLE(2,0,2,0));
        const __m128 e_odd_0 = _mm_shuffle_ps(e_0, e_4, _MM_SHUFFLE(3,1,3,1));
        const __m128 B_even_0 = _mm_shuffle_ps(B_0, B_4, _MM_SHUFFLE(2,0,2,0));
        const __m128 B_odd_0 = _mm_shuffle_ps(B_0, B_4, _MM_SHUFFLE(3,1,3,1));
        const __m128 e_even_8 = _mm_shuffle_ps(e_8, e_12, _MM_SHUFFLE(2,0,2,0));
        const __m128 e_odd_8 = _mm_shuffle_ps(e_8, e_12, _MM_SHUFFLE(3,1,3,1));
        const __m128 B_even_8 = _mm_shuffle_ps(B_8, B_12, _MM_SHUFFLE(2,0,2,0));
        const __m128 B_odd_8 = _mm_shuffle_ps(B_8, B_12, _MM_SHUFFLE(3,1,3,1));
        const __m128 d1_0 = _mm_sub_ps(_mm_mul_ps(e_odd_0, B_even_0),
                                       _mm_mul_ps(e_even_0, B_odd_0));
        const __m128 d0_0 = _mm_xor_ps(
            sign_1111, _mm_shuffle_ps(d1_0, d1_0, _MM_SHUFFLE(0,1,2,3)));
        const __m128 d3_0 = _mm_xor_ps(
            sign_1111, _mm_add_ps(_mm_mul_ps(e_even_0, B_even_0),
                                  _mm_mul_ps(e_odd_0, B_odd_0)));
        const __m128 d2_0 = _mm_shuffle_ps(d3_0, d3_0, _MM_SHUFFLE(0,1,2,3));
        const __m128 d1_8 = _mm_sub_ps(_mm_mul_ps(e_odd_8, B_even_8),
                                       _mm_mul_ps(e_even_8, B_odd_8));
        const __m128 d0_8 = _mm_xor_ps(
            sign_1111, _mm_shuffle_ps(d1_8, d1_8, _MM_SHUFFLE(0,1,2,3)));
        const __m128 d3_8 = _mm_xor_ps(
            sign_1111, _mm_add_ps(_mm_mul_ps(e_even_8, B_even_8),
                                  _mm_mul_ps(e_odd_8, B_odd_8)));
        const __m128 d2_8 = _mm_shuffle_ps(d3_8, d3_8, _MM_SHUFFLE(0,1,2,3));
        _mm_store_ps(d0, d0_8);
        _mm_store_ps(d0+4, d0_0);
        _mm_store_ps(d1+4, d1_8);
        _mm_store_ps(d1, d1_0);
        _mm_store_ps(d2, d2_8);
        _mm_store_ps(d2+4, d2_0);
        _mm_store_ps(d3+4, d3_8);
        _mm_store_ps(d3, d3_0);

#else

        /* Scalar path: each (e, B) complex pair yields one cross product
         * (p1/p3) and one negated dot product (p0/p2), each written to
         * two mirrored output positions. */
        float p0, p1, p2, p3;

        p3 =  e[14]*B[15] - e[15]*B[14];
        p2 = -e[14]*B[14] - e[15]*B[15];
        d0[0] =   p3;
        d1[7] = - p3;
        d2[0] =   p2;
        d3[7] =   p2;

        p1 =  e[12]*B[13] - e[13]*B[12];
        p0 = -e[12]*B[12] - e[13]*B[13];
        d0[1] =   p1;
        d1[6] = - p1;
        d2[1] =   p0;
        d3[6] =   p0;

        p3 =  e[10]*B[11] - e[11]*B[10];
        p2 = -e[10]*B[10] - e[11]*B[11];
        d0[2] =   p3;
        d1[5] = - p3;
        d2[2] =   p2;
        d3[5] =   p2;

        p1 =  e[8]*B[9] - e[9]*B[8];
        p0 = -e[8]*B[8] - e[9]*B[9];
        d0[3] =   p1;
        d1[4] = - p1;
        d2[3] =   p0;
        d3[4] =   p0;

        p3 =  e[6]*B[7] - e[7]*B[6];
        p2 = -e[6]*B[6] - e[7]*B[7];
        d0[4] =   p3;
        d1[3] = - p3;
        d2[4] =   p2;
        d3[3] =   p2;

        p1 =  e[4]*B[5] - e[5]*B[4];
        p0 = -e[4]*B[4] - e[5]*B[5];
        d0[5] =   p1;
        d1[2] = - p1;
        d2[5] =   p0;
        d3[2] =   p0;

        p3 =  e[2]*B[3] - e[3]*B[2];
        p2 = -e[2]*B[2] - e[3]*B[3];
        d0[6] =   p3;
        d1[1] = - p3;
        d2[6] =   p2;
        d3[1] =   p2;

        p1 =  e[0]*B[1] - e[1]*B[0];
        p0 = -e[0]*B[0] - e[1]*B[1];
        d0[7] =   p1;
        d1[0] = - p1;
        d2[7] =   p0;
        d3[0] =   p0;

#endif  // ENABLE_ASM_*

    }
}
2290
2291 /*-----------------------------------------------------------------------*/
2292
2293 /**
2294 * inverse_mdct: Perform the inverse MDCT operation on the given buffer.
2295 * The algorithm is taken from "The use of multirate filter banks for
2296 * coding of high quality digital audio", Th. Sporer et. al. (1992), with
2297 * corrections for errors in that paper, and the step numbers listed in
2298 * comments refer to the algorithm steps as described in the paper.
2299 *
2300 * [Parameters]
2301 * handle: Stream handle.
2302 * buffer: Input/output buffer.
2303 * blocktype: 0 if the current frame is a short block, 1 if a long block.
2304 */
2305 static void inverse_mdct(stb_vorbis *handle, float *buffer, int blocktype)
2306 {
2307 const unsigned int n = handle->blocksize[blocktype];
2308 const int log2_n = handle->blocksize_bits[blocktype];
2309 const float *A = handle->A[blocktype];
2310 float *buf2 = handle->imdct_temp_buf;
2311
2312 /* Setup and step 1. Note that step 1 involves subtracting pairs of
2313 * spectral coefficients, but the two items subtracted are actually
2314 * arithmetic inverses of each other(!), e.g.
2315 * u[4*k] - u[n-4*k-1] == 2*u[4*k]
2316 * We omit the factor of 2 here; that propagates linearly through to
2317 * the final step, and it is compensated for by "*0.5f" on the B[]
2318 * values computed during setup. */
2319 imdct_setup_step1(n, A, buffer, buf2);
2320
2321 /* Step 2.
2322 * stb_vorbis note: "this could be in place, but the data ends up in
2323 * the wrong place... _somebody_'s got to swap it, so this is nominated" */
2324 imdct_step2(n, A, buf2, buffer);
2325
2326 /* Step 3.
2327 * stb_vorbis note: "the original step3 loop can be nested r inside s
2328 * or s inside r; it's written originally as s inside r, but this is
2329 * dumb when r iterates many times, and s few. So I have two copies of
2330 * it and switch between them halfway." */
2331
2332 imdct_step3_inner_r_loop(n/16, A, buffer, (n/2) - (n/4)*0, n/4, 8);
2333 imdct_step3_inner_r_loop(n/16, A, buffer, (n/2) - (n/4)*1, n/4, 8);
2334
2335 imdct_step3_inner_r_loop(n/32, A, buffer, (n/2) - (n/8)*0, n/8, 16);
2336 imdct_step3_inner_r_loop(n/32, A, buffer, (n/2) - (n/8)*1, n/8, 16);
2337 imdct_step3_inner_r_loop(n/32, A, buffer, (n/2) - (n/8)*2, n/8, 16);
2338 imdct_step3_inner_r_loop(n/32, A, buffer, (n/2) - (n/8)*3, n/8, 16);
2339
2340 int l = 2;
2341 (4/4) for (; l < (log2_n-3)/2; l++) {
2342 const int k0 = n >> (l+2);
2343 const int lim = 1 << (l+1);
2344 (4/4) for (int i = 0; i < lim; i++) {
2345 imdct_step3_inner_r_loop(n >> (l+4), A, buffer, (n/2) - k0*i,
2346 k0, 1 << (l+3));
2347 }
2348 }
2349
2350 (4/4) for (; l < log2_n-6; l++) {
2351 const int k0 = n >> (l+2), k1 = 1 << (l+3);
2352 const int rlim = n >> (l+6);
2353 const int lim = 1 << (l+1);
2354 const float *A0 = A;
2355 int i_off = n/2;
2356 (4/4) for (int r = rlim; r > 0; r--) {
2357 imdct_step3_inner_s_loop(lim, A0, buffer, i_off, k0, k1);
2358 A0 += k1*4;
2359 i_off -= 8;
2360 }
2361 }
2362
2363 /* stb_vorbis note: "log2_n-6,-5,-4 all interleaved together - the big
2364 * win comes from getting rid of needless flops due to the constants on
2365 * pass 5 & 4 being all 1 and 0; combining them to be simultaneous to
2366 * improve cache made little difference" */
2367 imdct_step3_inner_s_loop_ld654(n, A, buffer);
2368
2369 /* Steps 4, 5, and 6. */
2370 imdct_step456(n, handle->bit_reverse[blocktype], buffer, buf2);
2371
2372 /* Step 7. */
2373 imdct_step7(n, handle->C[blocktype], buf2);
2374
2375 /* Step 8 and final decoding. */
2376 imdct_step8_decode(n, handle->B[blocktype], buf2, buffer);
2377 }
2378
2379 /*************************************************************************/
2380 /******************* Main decoding routine (internal) ********************/
2381 /*************************************************************************/
2382
2383 /**
2384 * vorbis_decode_packet_rest: Perform all decoding operations for a frame
2385 * except mode selection and windowing.
2386 *
2387 * [Parameters]
2388 * handle: Stream handle.
2389 * mode: Mode configuration for this frame.
2390 * left_start: Start position of the left overlap region.
2391 * left_end: End position of the left overlap region.
2392 * right_start: Start position of the right overlap region.
2393 * right_end: End position of the right overlap region.
2394 * len_ret: Pointer to variable to receive the frame length (offset
2395 * within the window of just past the final fully-decoded sample).
2396 * [Return value]
2397 * True on success, false on error.
2398 */
static bool vorbis_decode_packet_rest(
    stb_vorbis *handle, const Mode *mode, int left_start, int left_end,
    int right_start, int right_end, int *len_ret)
{
    const int n = handle->blocksize[mode->blockflag];
    const Mapping *map = &handle->mapping[mode->mapping];
    float ** const channel_buffers =
        handle->channel_buffers[handle->cur_channel_buffer];

    /**** Floor processing (4.3.2). ****/

    int64_t floor0_amplitude[256];
    bool zero_channel[256];
    for (int ch = 0; ch < handle->channels; ch++) {
        const int floor_index = map->submap_floor[map->mux[ch]];
        if (handle->floor_types[floor_index] == 0) {
            Floor0 *floor = &handle->floor_config[floor_index].floor0;
            const int64_t amplitude =
                decode_floor0(handle, floor, handle->coefficients[ch]);
            /* A negative amplitude signals a decode error. */
            if (UNLIKELY(amplitude < 0)) {
                flush_packet(handle);
                return error(handle, VORBIS_invalid_packet);
            }
            /* Amplitude zero means the channel is silent this frame. */
            zero_channel[ch] = (amplitude == 0);
            floor0_amplitude[ch] = amplitude;
        } else {  // handle->floor_types[floor_index] == 1
            Floor1 *floor = &handle->floor_config[floor_index].floor1;
            zero_channel[ch] =
                !decode_floor1(handle, floor, handle->final_Y[ch]);
        }
    }

    /**** Nonzero vector propagation (4.3.3). ****/
    /* Keep a copy of the pre-propagation flags: a channel that was zero
     * before coupling still gets its output cleared below, even though
     * it participates in residue decoding as a coupled channel. */
    bool really_zero_channel[256];
    memcpy(really_zero_channel, zero_channel,
           sizeof(zero_channel[0]) * handle->channels);
    for (int i = 0; i < map->coupling_steps; i++) {
        if (!zero_channel[map->coupling[i].magnitude]
         || !zero_channel[map->coupling[i].angle]) {
            zero_channel[map->coupling[i].magnitude] = false;
            zero_channel[map->coupling[i].angle] = false;
        }
    }

    /**** Residue decoding (4.3.4). ****/
    for (int i = 0; i < map->submaps; i++) {
        float *residue_buffers[256];
        int ch = 0;
        /* Collect the buffers of all channels assigned to this submap;
         * a NULL entry tells decode_residue() the channel is unused. */
        for (int j = 0; j < handle->channels; j++) {
            if (map->mux[j] == i) {
                if (zero_channel[j]) {
                    residue_buffers[ch] = NULL;
                } else {
                    residue_buffers[ch] = channel_buffers[j];
                }
                ch++;
            }
        }
        decode_residue(handle, map->submap_residue[i], n/2, ch,
                       residue_buffers);
    }

    /**** Inverse coupling (4.3.5). ****/
    /* Coupling steps are undone in reverse order of application. */
    for (int i = map->coupling_steps-1; i >= 0; i--) {
        float *magnitude = channel_buffers[map->coupling[i].magnitude];
        float *angle = channel_buffers[map->coupling[i].angle];
        for (int j = 0; j < n/2; j++) {
            const float M = magnitude[j];
            const float A = angle[j];
            float new_M, new_A;
            /* Square-polar mapping per the spec's four sign cases. */
            if (M > 0) {
                if (A > 0) {
                    new_M = M;
                    new_A = M - A;
                } else {
                    new_A = M;
                    new_M = M + A;
                }
            } else {
                if (A > 0) {
                    new_M = M;
                    new_A = M + A;
                } else {
                    new_A = M;
                    new_M = M - A;
                }
            }
            magnitude[j] = new_M;
            angle[j] = new_A;
        }
    }

    /**** Floor curve synthesis and residue product (4.3.6). The spec ****
     **** uses the term "dot product", but the actual operation is    ****
     **** component-by-component vector multiplication.               ****/
    for (int i = 0; i < handle->channels; i++) {
        if (really_zero_channel[i]) {
            memset(channel_buffers[i], 0, sizeof(*channel_buffers[i]) * (n/2));
        } else {
            const int floor_index = map->submap_floor[map->mux[i]];
            if (handle->floor_types[floor_index] == 0) {
                Floor0 *floor = &handle->floor_config[floor_index].floor0;
                do_floor0_final(handle, floor, i, n, floor0_amplitude[i]);
            } else {  // handle->floor_types[floor_index] == 1
                Floor1 *floor = &handle->floor_config[floor_index].floor1;
                do_floor1_final(handle, floor, i, n);
            }
        }
    }

    /**** Inverse MDCT (4.3.7). ****/
    for (int i = 0; i < handle->channels; i++) {
        inverse_mdct(handle, channel_buffers[i], mode->blockflag);
    }

    /**** Frame length, sample position, and other miscellany. ****/

    /* Flush any leftover data in the current packet so the next packet
     * doesn't try to read it. */
    flush_packet(handle);

    /* Deal with discarding samples from the first packet. */
    if (handle->first_decode) {
        /* We don't support streams with nonzero start positions (what the
         * spec describes as "The granule (PCM) position of the first page
         * need not indicate that the stream started at position zero..."
         * in A.2), so we just omit the left half of the window.  The value
         * we calculate here is negative, but the addition below will bring
         * it up to zero again. */
        handle->current_loc = -(n/2 - left_start);
        handle->current_loc_valid = true;
        handle->first_decode = false;
    }

    /* If this is the last complete frame in a non-final Ogg page, update
     * the current sample position based on the Ogg granule position.
     * Normally this should have no effect, but it's important for the
     * first page in case that page indicates a positive initial offset,
     * and it helps resync if a page is missing in the stream. */
    if (handle->last_seg_index == handle->end_seg_with_known_loc
     && !(handle->page_flag & PAGEFLAG_last_page)) {
        /* For non-final pages, the granule position should be the "last
         * complete sample returned by decode" (A.2), thus right_start.
         * However, it appears that the reference encoder blindly uses the
         * middle of the window (n/2) even on a long/short frame boundary,
         * and based on <https://trac.xiph.org/ticket/1169>, the window
         * center appears to have been the original intention. */
        handle->current_loc =
            handle->known_loc_for_packet - (n/2 - left_start);
        handle->current_loc_valid = true;
    }

    /* Update the current sample position for samples returned from this
     * frame. */
    if (handle->current_loc_valid) {
        handle->current_loc += (right_start - left_start);
    }

    /* Return the amount of data stored in the window, including the
     * right-side overlap region. */
    *len_ret = right_end;

    /* The granule position on the final page of the Ogg stream specifies
     * the length of the stream, thus indicating how many samples to
     * return from the final frame.  However, some streams attempt to
     * truncate to within the _second-to-last_ frame instead, so we have
     * to check for end-of-stream truncation on every packet in the last
     * page.  (The specification is ambiguous about whether this is
     * actually permitted; while the wording suggests that the truncation
     * point must lie within the last frame, there is no explicit language
     * specifying that limitation.  Since such streams actually exist,
     * however, we go ahead and support them.) */
    if (handle->current_loc_valid
     && (handle->page_flag & PAGEFLAG_last_page)) {
        const uint64_t end_loc = handle->known_loc_for_packet;
        /* Be careful of wraparound here!  A stream with a bogus position
         * in the second-to-last page which has the high bit set could
         * cause a simple "end_loc <= handle->current_loc" test to
         * improperly pass, so we have to test by taking the difference
         * and checking its sign bit. */
        const uint64_t frame_end =
            handle->current_loc + (right_end - right_start);
        if (handle->current_loc - end_loc < UINT64_C(1)<<63  // end_loc <= handle->current_loc
         || (handle->last_seg_index == handle->end_seg_with_known_loc
             && end_loc - frame_end >= UINT64_C(1)<<63))  // end_loc < frame_end
        {
            const uint64_t packet_begin_loc =
                handle->current_loc - (right_start - left_start);
            if (end_loc - packet_begin_loc >= UINT64_C(1)<<63) {  // end_loc < packet_begin_loc
                /* We can't truncate to before the beginning of the current
                 * frame (that wouldn't make sense anyway, because the
                 * stream should just have ended at the previous frame);
                 * just truncate to the beginning of the frame in that case. */
                *len_ret = left_start;
            } else {
                const uint64_t true_len =
                    end_loc - (handle->current_loc - right_start);
                ASSERT(true_len < (uint64_t)*len_ret);
                *len_ret = (int)true_len;
            }
            handle->current_loc = end_loc;
        }
    }

    return true;
}
2605
2606 /*************************************************************************/
2607 /************************** Interface routines ***************************/
2608 /*************************************************************************/
2609
2610 bool vorbis_decode_initial(stb_vorbis *handle, int *left_start_ret,
2611 int *left_end_ret, int *right_start_ret,
2612 int *right_end_ret, int *mode_ret)
2613 {
2614 /* Start reading the next packet, and verify that it's an audio packet. */
2615 (4/4) if (UNLIKELY(handle->eof)) {
2616 return false;
2617 }
2618 (4/4) if (UNLIKELY(!start_packet(handle))) {
2619 (4/4) if (handle->error == VORBIS_reached_eof) {
2620 /* EOF at page boundary is not an error! */
2621 handle->error = VORBIS__no_error;
2622 }
2623 return false;
2624 }
2625 (4/4) if (UNLIKELY(get_bits(handle, 1) != 0)
2626 (4/4) || UNLIKELY(handle->valid_bits < 0)) {
2627 /* Empty or non-audio packet, so skip it and try again. */
2628 flush_packet(handle);
2629 return vorbis_decode_initial(handle, left_start_ret, left_end_ret,
2630 right_start_ret, right_end_ret, mode_ret);
2631 }
2632
2633 /* Read the mode number and window flags for this packet. */
2634 const int mode_index = get_bits(handle, handle->mode_bits);
2635 /* Still in the same byte, so we can't hit EOP here. */
2636 ASSERT(mode_index != EOP);
2637 (4/4) if (mode_index >= handle->mode_count) {
2638 handle->error = VORBIS_invalid_packet;
2639 flush_packet(handle);
2640 return false;
2641 }
2642 *mode_ret = mode_index;
2643 Mode *mode = &handle->mode_config[mode_index];
2644 bool next, prev;
2645 (4/4) if (mode->blockflag) {
2646 prev = get_bits(handle, 1);
2647 next = get_bits(handle, 1);
2648 (4/4) if (UNLIKELY(handle->valid_bits < 0)) {
2649 flush_packet(handle);
2650 return error(handle, VORBIS_invalid_packet);
2651 }
2652 } else {
2653 prev = next = false;
2654 }
2655
2656 /* Set up the window bounds for this frame. */
2657 const int n = handle->blocksize[mode->blockflag];
2658 (8/8) if (mode->blockflag && !prev) {
2659 *left_start_ret = (n - handle->blocksize[0]) / 4;
2660 *left_end_ret = (n + handle->blocksize[0]) / 4;
2661 } else {
2662 *left_start_ret = 0;
2663 *left_end_ret = n/2;
2664 }
2665 (8/8) if (mode->blockflag && !next) {
2666 *right_start_ret = (n*3 - handle->blocksize[0]) / 4;
2667 *right_end_ret = (n*3 + handle->blocksize[0]) / 4;
2668 } else {
2669 *right_start_ret = n/2;
2670 *right_end_ret = n;
2671 }
2672
2673 return true;
2674 }
2675
2676 /*-----------------------------------------------------------------------*/
2677
2678 bool vorbis_decode_packet(stb_vorbis *handle, int *len_ret)
2679 {
2680 const bool first_decode = handle->first_decode;
2681
2682 int mode, left_start, left_end, right_start, right_end, len;
2683 (4/4) if (!vorbis_decode_initial(handle, &left_start, &left_end,
2684 &right_start, &right_end, &mode)
2685 (4/4) || !vorbis_decode_packet_rest(handle, &handle->mode_config[mode],
2686 left_start, left_end, right_start,
2687 right_end, &len)) {
2688 return false;
2689 }
2690
2691 float ** const channel_buffers =
2692 handle->channel_buffers[handle->cur_channel_buffer];
2693 const int prev = handle->previous_length;
2694
2695 /* We deliberately (though harmlessly) deviate from the spec with
2696 * respect to the portion of data to return for a given frame. The
2697 * spec says that we should return the region from the middle of the
2698 * previous frame to the middle of the current frame (4.8), but that
2699 * would require a buffer copy. Instead we return all finalized
2700 * samples from the current frame, i.e., up to right_start. This
2701 * unfortunately requires a bit of fudging around when decoding the
2702 * first frame of a stream. */
2703
2704 /* Mix in data from the previous window's right side. */
2705 (4/4) if (prev > 0) {
2706 const float *weights;
2707 (4/4) if (prev*2 == handle->blocksize[0]) {
2708 weights = handle->window_weights[0];
2709 } else {
2710 ASSERT(prev*2 == handle->blocksize[1]);
2711 weights = handle->window_weights[1];
2712 }
2713 (4/4) for (int i = 0; i < handle->channels; i++) {
2714 (4/4) for (int j = 0; j < prev; j++) {
2715 channel_buffers[i][left_start+j] =
2716 channel_buffers[i][left_start+j] * weights[j] +
2717 handle->previous_window[i][j] * weights[prev-1-j];
2718 }
2719 }
2720 }
2721
2722 /* Point the previous_window pointers at the right side of this window. */
2723 handle->previous_length = right_end - right_start;
2724 (4/4) for (int i = 0; i < handle->channels; i++) {
2725 handle->previous_window[i] = channel_buffers[i] + right_start;
2726 }
2727
2728 /* If this is the first frame, push left_start (the beginning of data
2729 * to return) to the center of the window, since the left half of the
2730 * window contains garbage. */
2731 (4/4) if (first_decode) {
2732 const int n = handle->blocksize[handle->mode_config[mode].blockflag];
2733 left_start = n/2;
2734 }
2735
2736 /* Save this channel's output pointers. */
2737 (4/4) for (int i = 0; i < handle->channels; i++) {
2738 handle->outputs[i] = channel_buffers[i] + left_start;
2739 }
2740
2741 /* Advance to the next set of channel buffers for the next frame's data. */
2742 handle->cur_channel_buffer =
2743 (handle->cur_channel_buffer + 1) % lenof(handle->channel_buffers);
2744
2745 /* Return the final frame length, if requested. */
2746 (4/4) if (len_ret) {
2747 (4/4) if (len < right_start) {
2748 *len_ret = len - left_start;
2749 /* Watch out for trying to truncate into the first packet of
2750 * the stream (which is not returned). */
2751 (4/4) if (*len_ret < 0) {
2752 *len_ret = 0;
2753 }
2754 } else {
2755 *len_ret = right_start - left_start;
2756 ASSERT(*len_ret >= 0);
2757 }
2758 }
2759 return true;
2760 }
2761
2762 /*************************************************************************/
2763 /*************************************************************************/