/* * libnogg: a decoder library for Ogg Vorbis streams * Copyright (c) 2014-2024 Andrew Church * * This software may be copied and redistributed under certain conditions; * see the file "COPYING" in the source code distribution for details. * NO WARRANTY is provided with this software. */ #include "include/nogg.h" #include "src/common.h" #include "src/decode/common.h" #include "src/decode/crc32.h" #include "src/decode/decode.h" #include "src/decode/inlines.h" #include "src/decode/io.h" #include "src/decode/packet.h" #include "src/decode/setup.h" #include "src/util/memory.h" #include "src/x86.h" #include #include #include #ifdef ENABLE_ASM_ARM_NEON # include /* Note that vectorization is sometimes slower on ARM because of the lack * of fast vector swizzle instructions, so ARM code is deliberately omitted * in those cases. */ #endif /* Older versions of GCC warn about shadowing functions with variables, so * work around those warnings. */ #if defined(__GNUC__) && !defined(__clang__) && __GNUC__==4 && __GNUC_MINOR__<7 # define div div_ # define floor floor_ # define y0 y0_ # define y1 y1_ #endif /* * Note on variable naming: Variables are generally named following usage * in the Vorbis spec, though some variables have been renamed for clarity. * In particular, the Vorbis spec consistently uses "n" for the window size * of a packet, and we follow that usage here. */ /*************************************************************************/ /****************************** Local data *******************************/ /*************************************************************************/ /* Lookup table for type 1 floor curve generation, copied from the Vorbis * specification. */ static float floor1_inverse_db_table[256] = { 1.0649863e-07f, 1.1341951e-07f, 1.2079015e-07f, 1.2863978e-07f, 1.3699951e-07f, 1.4590251e-07f, 1.5538408e-07f, 1.6548181e-07f, 1.7623575e-07f, 1.8768855e-07f, 1.9988561e-07f, 2.1287530e-07f, 2.2670913e-07f, 2.4144197e-07f, 2.5713223e-07f, 2.7384213e-07f, 2.9163793e-07f, 3.1059021e-07f, 3.3077411e-07f, 3.5226968e-07f, 3.7516214e-07f, 3.9954229e-07f, 4.2550680e-07f, 4.5315863e-07f, 4.8260743e-07f, 5.1396998e-07f, 5.4737065e-07f, 5.8294187e-07f, 6.2082472e-07f, 6.6116941e-07f, 7.0413592e-07f, 7.4989464e-07f, 7.9862701e-07f, 8.5052630e-07f, 9.0579828e-07f, 9.6466216e-07f, 1.0273513e-06f, 1.0941144e-06f, 1.1652161e-06f, 1.2409384e-06f, 1.3215816e-06f, 1.4074654e-06f, 1.4989305e-06f, 1.5963394e-06f, 1.7000785e-06f, 1.8105592e-06f, 1.9282195e-06f, 2.0535261e-06f, 2.1869758e-06f, 2.3290978e-06f, 2.4804557e-06f, 2.6416497e-06f, 2.8133190e-06f, 2.9961443e-06f, 3.1908506e-06f, 3.3982101e-06f, 3.6190449e-06f, 3.8542308e-06f, 4.1047004e-06f, 4.3714470e-06f, 4.6555282e-06f, 4.9580707e-06f, 5.2802740e-06f, 5.6234160e-06f, 5.9888572e-06f, 6.3780469e-06f, 6.7925283e-06f, 7.2339451e-06f, 7.7040476e-06f, 8.2047000e-06f, 8.7378876e-06f, 9.3057248e-06f, 9.9104632e-06f, 1.0554501e-05f, 1.1240392e-05f, 1.1970856e-05f, 1.2748789e-05f, 1.3577278e-05f, 1.4459606e-05f, 1.5399272e-05f, 1.6400004e-05f, 1.7465768e-05f, 1.8600792e-05f, 1.9809576e-05f, 2.1096914e-05f, 2.2467911e-05f, 2.3928002e-05f, 2.5482978e-05f, 2.7139006e-05f, 2.8902651e-05f, 3.0780908e-05f, 3.2781225e-05f, 3.4911534e-05f, 3.7180282e-05f, 3.9596466e-05f, 4.2169667e-05f, 4.4910090e-05f, 4.7828601e-05f, 5.0936773e-05f, 5.4246931e-05f, 5.7772202e-05f, 6.1526565e-05f, 6.5524908e-05f, 6.9783085e-05f, 7.4317983e-05f, 7.9147585e-05f, 8.4291040e-05f, 8.9768747e-05f, 9.5602426e-05f, 0.00010181521f, 0.00010843174f, 0.00011547824f, 0.00012298267f, 0.00013097477f, 0.00013948625f, 0.00014855085f, 0.00015820453f, 0.00016848555f, 0.00017943469f, 0.00019109536f, 0.00020351382f, 0.00021673929f, 0.00023082423f, 0.00024582449f, 0.00026179955f, 0.00027881276f, 0.00029693158f, 0.00031622787f, 0.00033677814f, 0.00035866388f, 0.00038197188f, 0.00040679456f, 0.00043323036f, 0.00046138411f, 0.00049136745f, 0.00052329927f, 0.00055730621f, 0.00059352311f, 0.00063209358f, 0.00067317058f, 0.00071691700f, 0.00076350630f, 0.00081312324f, 0.00086596457f, 0.00092223983f, 0.00098217216f, 0.0010459992f, 0.0011139742f, 0.0011863665f, 0.0012634633f, 0.0013455702f, 0.0014330129f, 0.0015261382f, 0.0016253153f, 0.0017309374f, 0.0018434235f, 0.0019632195f, 0.0020908006f, 0.0022266726f, 0.0023713743f, 0.0025254795f, 0.0026895994f, 0.0028643847f, 0.0030505286f, 0.0032487691f, 0.0034598925f, 0.0036847358f, 0.0039241906f, 0.0041792066f, 0.0044507950f, 0.0047400328f, 0.0050480668f, 0.0053761186f, 0.0057254891f, 0.0060975636f, 0.0064938176f, 0.0069158225f, 0.0073652516f, 0.0078438871f, 0.0083536271f, 0.0088964928f, 0.009474637f, 0.010090352f, 0.010746080f, 0.011444421f, 0.012188144f, 0.012980198f, 0.013823725f, 0.014722068f, 0.015678791f, 0.016697687f, 0.017782797f, 0.018938423f, 0.020169149f, 0.021479854f, 0.022875735f, 0.024362330f, 0.025945531f, 0.027631618f, 0.029427276f, 0.031339626f, 0.033376252f, 0.035545228f, 0.037855157f, 0.040315199f, 0.042935108f, 0.045725273f, 0.048696758f, 0.051861348f, 0.055231591f, 0.058820850f, 0.062643361f, 0.066714279f, 0.071049749f, 0.075666962f, 0.080584227f, 0.085821044f, 0.091398179f, 0.097337747f, 0.10366330f, 0.11039993f, 0.11757434f, 0.12521498f, 0.13335215f, 0.14201813f, 0.15124727f, 0.16107617f, 0.17154380f, 0.18269168f, 0.19456402f, 0.20720788f, 0.22067342f, 0.23501402f, 0.25028656f, 0.26655159f, 0.28387361f, 0.30232132f, 0.32196786f, 0.34289114f, 0.36517414f, 0.38890521f, 0.41417847f, 0.44109412f, 0.46975890f, 0.50028648f, 0.53279791f, 0.56742212f, 0.60429640f, 0.64356699f, 0.68538959f, 0.72993007f, 0.77736504f, 0.82788260f, 0.88168307f, 0.9389798f, 1.0f }; /*************************************************************************/ /************************* Vectorization helpers *************************/ /*************************************************************************/ #ifdef ENABLE_ASM_ARM_NEON /* Used to avoid unnecessary typecasts when flipping sign bits. */ static inline float32x4_t veorq_f32(uint32x4_t a, float32x4_t b) { return (float32x4_t)veorq_u32(a, (uint32x4_t)b); } /* Various kinds of vector swizzles (xyzw = identity). The separate * declarations are needed because some functions are implemented in * terms of others. */ static inline float32x4_t vswizzleq_xxyy_f32(float32x4_t a); static inline float32x4_t vswizzleq_xxzz_f32(float32x4_t a); static inline float32x4_t vswizzleq_xyxy_f32(float32x4_t a); static inline float32x4_t vswizzleq_xzxz_f32(float32x4_t a); static inline float32x4_t vswizzleq_yxyx_f32(float32x4_t a); static inline float32x4_t vswizzleq_yxwz_f32(float32x4_t a); static inline float32x4_t vswizzleq_yyww_f32(float32x4_t a); static inline float32x4_t vswizzleq_ywyw_f32(float32x4_t a); static inline float32x4_t vswizzleq_zzxx_f32(float32x4_t a); static inline float32x4_t vswizzleq_zzww_f32(float32x4_t a); static inline float32x4_t vswizzleq_zwxy_f32(float32x4_t a); static inline float32x4_t vswizzleq_zwzw_f32(float32x4_t a); static inline float32x4_t vswizzleq_wzyx_f32(float32x4_t a); static inline float32x4_t vswizzleq_wwyy_f32(float32x4_t a); /* These are all marked UNUSED so Clang doesn't complain about the fact * that we don't currently use all of them. */ static UNUSED inline float32x4_t vswizzleq_xxyy_f32(float32x4_t a) { return vzipq_f32(a, a).val[0]; } static UNUSED inline float32x4_t vswizzleq_xxzz_f32(float32x4_t a) { const uint32x4_t sel = {~0U, 0, ~0U, 0}; /* vbsl operands: vbsl(mask, mask_on_value, mask_off_value) */ return vbslq_f32(sel, a, (float32x4_t)vshlq_n_u64((uint64x2_t)a, 32)); } static UNUSED inline float32x4_t vswizzleq_xyxy_f32(float32x4_t a) { return (float32x4_t)vdupq_n_u64(((uint64x2_t)a)[0]); } static UNUSED inline float32x4_t vswizzleq_xzxz_f32(float32x4_t a) { return vuzpq_f32(a, a).val[0]; } static UNUSED inline float32x4_t vswizzleq_yxyx_f32(float32x4_t a) { return vswizzleq_xyxy_f32(vswizzleq_yxwz_f32(a)); } static UNUSED inline float32x4_t vswizzleq_yxwz_f32(float32x4_t a) { const uint32x4_t sel = {~0U, 0, ~0U, 0}; return vbslq_f32(sel, (float32x4_t)vshrq_n_u64((uint64x2_t)a, 32), (float32x4_t)vshlq_n_u64((uint64x2_t)a, 32)); } static UNUSED inline float32x4_t vswizzleq_yyww_f32(float32x4_t a) { const uint32x4_t sel = {~0U, 0, ~0U, 0}; return vbslq_f32(sel, (float32x4_t)vshrq_n_u64((uint64x2_t)a, 32), a); } static UNUSED inline float32x4_t vswizzleq_ywyw_f32(float32x4_t a) { return vuzpq_f32(a, a).val[1]; } static UNUSED inline float32x4_t vswizzleq_zzxx_f32(float32x4_t a) { return vswizzleq_zwxy_f32(vswizzleq_xxzz_f32(a)); } static UNUSED inline float32x4_t vswizzleq_zzww_f32(float32x4_t a) { return vzipq_f32(a, a).val[1]; } static UNUSED inline float32x4_t vswizzleq_zwxy_f32(float32x4_t a) { return vextq_f32(a, a, 2); } static UNUSED inline float32x4_t vswizzleq_zwzw_f32(float32x4_t a) { return (float32x4_t)vdupq_n_u64(((uint64x2_t)a)[1]); } static UNUSED inline float32x4_t vswizzleq_wzyx_f32(float32x4_t a) { return vswizzleq_yxwz_f32(vswizzleq_zwxy_f32(a)); } static UNUSED inline float32x4_t vswizzleq_wwyy_f32(float32x4_t a) { return vswizzleq_zwxy_f32(vswizzleq_yyww_f32(a)); } #endif // ENABLE_ASM_ARM_NEON /*-----------------------------------------------------------------------*/ #if defined(ENABLE_ASM_X86_SSE2) || defined(ENABLE_ASM_X86_AVX2) /* * _mm_xor_sign: Simple wrapper function for _mm_xor_ps() used to avoid * unnecessary typecasts when flipping sign bits and work around compiler * over-optimizations. * * Unlike ARM, the x86 architecture includes an exclusive-or instruction * which nominally operates on floating-point data (xorps, with the * corresponding intrinsic _mm_xor_ps()), which has the same effect as * the equivalent integer instruction but avoids cross-domain data * movement on processors with separate integer and floating-point vector * pipelines, so at first glance it would seem like we could simply cast * the sign vector to a float vector and use it that way. But some * compilers, notably GCC 8 and later and Clang 11 and later, aggressively * optimize away signed zero values in vector constants when inexact * floating-point optimizations are enabled, and a mask containing only * sign bits looks like an all-zero vector when interpreted as * floating-point data. * * We could potentially accept the potential cross-domain stall and use * the integer instruction (pxor), which works for GCC, but Clang seems * to "pierce the veil" of the typecast and optimize out the operation * anyway (in fact, this was diagnosed as incorrectly treating an XOR of * two 64-bit sign masks as a vector negation of four 32-bit floats and * applying the transformation "-(x - y) => y - x"). So we interpose a * dummy (empty) inline assembly statement to hide knowledge of the * constant's value from these compilers, thus forcing them to emit the * exclusive-or operation as desired. * * See also: * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=86855 * https://github.com/llvm/llvm-project/issues/55758 (fixed in 15.0.0) */ static inline __m128 _mm_xor_sign(__m128i sign_mask, __m128 value) { #if IS_GCC(8,0) || (IS_CLANG(11,0) && !IS_CLANG(15,0)) __asm__("" : "=x" (value) : "0" (value)); #endif return _mm_xor_ps(CAST_M128(sign_mask), value); } #endif // ENABLE_ASM_X86_SSE2 /*-----------------------------------------------------------------------*/ #ifdef ENABLE_ASM_X86_AVX2 /* * _mm256_xor_sign: 256-bit version of _mm_xor_sign(), with the same * caveats. */ static inline __m256 _mm256_xor_sign(__m256i sign_mask, __m256 value) { #if IS_GCC(8,0) || (IS_CLANG(11,0) && !IS_CLANG(15,0)) __asm__("" : "=x" (value) : "0" (value)); #endif return _mm256_xor_ps(_mm256_castsi256_ps(sign_mask), value); } /** * _mm256_permute4x64_ps: Convenience wrapper for _mm256_permute4x64_pd() * which encapsulates ps/pd casts. Written as a macro because "control" * must be a compile-time constant, which the compiler may not be able * to determine if this is written as a function. */ #define _mm256_permute4x64_ps(value, control) \ (_mm256_castpd_ps(_mm256_permute4x64_pd(_mm256_castps_pd((value)), \ (control)))) #endif // ENABLE_ASM_X86_AVX2 /*************************************************************************/ /******************* Codebook decoding: scalar context *******************/ /*************************************************************************/ /** * codebook_decode_scalar_raw_slow: Slow path for Huffman decoding, * searching through the lookup tables to decode the next symbol. */ static int32_t codebook_decode_scalar_raw_slow(stb_vorbis *handle, const Codebook *book) { /* Fill the bit accumulator far enough to read any valid code. */ if (handle->valid_bits < 24) { fill_bits(handle); } /* Find the code using binary search if we can, linear search otherwise. */ if (book->sorted_codewords) { const uint32_t code = bit_reverse(handle->acc); int32_t low = 0, high = book->sorted_entries; /* Invariant: sorted_codewords[low] <= code < sorted_codewords[high] */ while (low+1 < high) { const int32_t mid = (low + high) / 2; if (book->sorted_codewords[mid] <= code) { low = mid; } else { high = mid; } } const int32_t result = book->sparse ? low : book->sorted_values[low]; const int len = book->codeword_lengths[result]; handle->acc >>= len; handle->valid_bits -= len; if (UNLIKELY(handle->valid_bits < 0)) { return -1; } return result; } else { /* This loop terminates when the code is found. We ensure that all * Huffman trees are completely specified during setup, so the * assertion below will never fail. */ for (int i = 0; ; i++) { ASSERT(i < book->entries); if (book->codeword_lengths[i] == NO_CODE) { continue; } if (book->codewords[i] == (handle->acc & ((UINT32_C(1) << book->codeword_lengths[i])-1))) { handle->acc >>= book->codeword_lengths[i]; handle->valid_bits -= book->codeword_lengths[i]; if (UNLIKELY(handle->valid_bits < 0)) { return -1; } return i; } } } } /*-----------------------------------------------------------------------*/ /** * codebook_decode_scalar_raw: Read a Huffman code from the packet and * decode it in scalar context using the given codebook, returning the * symbol for non-sparse codebooks and the sorted_values[] index for sparse * codebooks. * * This function contains only the fast path for Huffman lookup so that it * can be easily inlined into callers. The slow path is implemented as a * separate, non-inlined function to avoid code bloat. * * [Parameters] * handle: Stream handle. * book: Codebook to use. * [Return value] * Value read, or -1 if the end of the packet is reached. */ static inline int32_t codebook_decode_scalar_raw(stb_vorbis *handle, const Codebook *book) { /* First try the O(1) table. We only fill the bit accumulator if we * don't have enough bits for the fast table, to avoid overhead from * repeatedly reading single bytes from the packet. */ if (handle->valid_bits < handle->fast_huffman_length) { /* Note that moving this "semi-fast" path out to a separate * function is no faster, and possibly slower (though not * significantly so), than just calling fill_bits() here, at least * on x86_64 and 32-bit ARM. */ fill_bits(handle); } const int fast_code = book->fast_huffman[handle->acc & handle->fast_huffman_mask]; if (fast_code >= 0) { const int bits = book->codeword_lengths[fast_code]; handle->acc >>= bits; handle->valid_bits -= bits; if (UNLIKELY(handle->valid_bits < 0)) { return -1; } return fast_code; } /* Fall back to the slow (table search) method. */ return codebook_decode_scalar_raw_slow(handle, book); } /*-----------------------------------------------------------------------*/ /** * codebook_decode_scalar: Read a Huffman code from the packet and decode * it in scalar context using the given codebook. * * [Parameters] * handle: Stream handle. * book: Codebook to use. * [Return value] * Symbol read, or -1 if the end of the packet is reached. */ static int32_t codebook_decode_scalar(stb_vorbis *handle, const Codebook *book) { const int32_t value = codebook_decode_scalar_raw(handle, book); return book->sparse ? book->sorted_values[value] : value; } /*************************************************************************/ /********************* Codebook decoding: VQ context *********************/ /*************************************************************************/ /** * codebook_decode_scalar_for_vq: Read a Huffman code from the packet and * return a value appropriate for decoding the code in VQ context. Helper * function for codebook_decode() and related functions. * * [Parameters] * handle: Stream handle. * book: Codebook to use. * [Return value] * Symbol or value read (depending on library configuration and * codebook type), or -1 if the end of the packet is reached. */ static inline int32_t codebook_decode_scalar_for_vq(stb_vorbis *handle, const Codebook *book) { if (handle->divides_in_codebook) { return codebook_decode_scalar(handle, book); } else { return codebook_decode_scalar_raw(handle, book); } } /*-----------------------------------------------------------------------*/ /** * codebook_decode: Read a Huffman code from the packet and decode it in * VQ context using the given codebook. Decoded vector components are * added componentwise to the values in the output vector. * * If an error occurs, the current packet is flushed. * * [Parameters] * handle: Stream handle. * book: Codebook to use. Must be a vector (not scalar) codebook. * output: Output vector. * len: Number of elements to read. Automatically capped at * book->dimensions. * [Return value] * True on success, false on error or end-of-packet. */ static bool codebook_decode(stb_vorbis *handle, const Codebook *book, float *output, int len) { ASSERT(book->lookup_type != 0); ASSERT(book->multiplicands); if (len > book->dimensions) { len = book->dimensions; } const int32_t code = codebook_decode_scalar_for_vq(handle, book); if (UNLIKELY(code < 0)) { return false; } if (book->lookup_type == 1) { int div = 1; if (book->sequence_p) { float last = 0; for (int i = 0; i < len; i++) { const int32_t offset = (code / div) % book->lookup_values; const float val = book->multiplicands[offset] + last; output[i] += val; last = val; div *= book->lookup_values; } } else { for (int i = 0; i < len; i++) { const int32_t offset = (code / div) % book->lookup_values; output[i] += book->multiplicands[offset]; div *= book->lookup_values; } } } else { const int32_t offset = code * book->dimensions; int i = 0; #if defined(ENABLE_ASM_ARM_NEON) for (; i+4 <= len; i += 4) { vst1q_f32(&output[i], vaddq_f32( vld1q_f32(&output[i]), vld1q_f32(&book->multiplicands[offset+i]))); } #elif defined(ENABLE_ASM_X86_SSE2) for (; i+4 <= len; i += 4) { _mm_storeu_ps(&output[i], _mm_add_ps( _mm_loadu_ps(&output[i]), _mm_loadu_ps(&book->multiplicands[offset+i]))); } #endif for (; i < len; i++) { output[i] += book->multiplicands[offset+i]; } } return true; } /*-----------------------------------------------------------------------*/ /** * codebook_decode_step: Read a Huffman code from the packet and decode it * in VQ context using the given codebook. Decoded vector components are * added componentwise to values in the output vector at intervals of "step" * array elements. (codebook_decode() is effectively a specialization of * this function with step==1.) * * If an error occurs, the current packet is flushed. * * [Parameters] * handle: Stream handle. * book: Codebook to use. Must be a vector (not scalar) codebook. * output: Output vector. * len: Number of elements to read. Automatically capped at * book->dimensions. * step: Interval between consecutive output elements: the second * element is output[step], the third is output[2*step], etc. * [Return value] * True on success, false on error or end-of-packet. */ static bool codebook_decode_step(stb_vorbis *handle, const Codebook *book, float *output, int len, int step) { ASSERT(book->lookup_type != 0); ASSERT(book->multiplicands); if (len > book->dimensions) { len = book->dimensions; } const int32_t code = codebook_decode_scalar_for_vq(handle, book); if (UNLIKELY(code < 0)) { return false; } if (book->lookup_type == 1) { int div = 1; if (book->sequence_p) { /* This case appears not to be used by the reference encoder. */ float last = 0; for (int i = 0; i < len; i++) { const int32_t offset = (code / div) % book->lookup_values; const float val = book->multiplicands[offset] + last; output[i*step] += val; last = val; div *= book->lookup_values; } } else { for (int i = 0; i < len; i++) { const int32_t offset = (code / div) % book->lookup_values; output[i*step] += book->multiplicands[offset]; div *= book->lookup_values; } } } else { const int32_t offset = code * book->dimensions; for (int i = 0; i < len; i++) { output[i*step] += book->multiplicands[offset+i]; } } return true; } /*-----------------------------------------------------------------------*/ /** * codebook_decode_deinterleave_repeat: Read Huffman codes from the packet * and decode them in VQ context using the given codebook, deinterleaving * across "ch" output channels, and repeat until total_decode vector * elements have been decoded. Decoded vector components are added * componentwise to the values in the output vectors. * * If an error occurs, the current packet is flushed. * * [Parameters] * handle: Stream handle. * book: Codebook to use. * outputs: Output vectors for each channel. * ch: Number of channels. * n: Length of each output vector. * total_offset: Offset within the interleaved output vector at which * to start writing output values. * total_decode: Total number of elements to read. * [Return value] * True on success, false on error or end-of-packet. */ static bool codebook_decode_deinterleave_repeat( stb_vorbis *handle, const Codebook *book, float **outputs, int ch, int n, int32_t total_offset, int32_t total_decode) { if (total_decode > ch*n - total_offset) { total_decode = ch*n - total_offset; } int c_inter = total_offset % ch; int p_inter = total_offset / ch; int len = book->dimensions; while (total_decode > 0) { /* Make sure we don't run off the end of the output vectors. */ if (len > total_decode) { len = total_decode; } const int32_t code = codebook_decode_scalar_for_vq(handle, book); if (UNLIKELY(code < 0)) { return false; } if (book->lookup_type == 1) { int div = 1; if (book->sequence_p) { /* This case appears not to be used by the reference encoder. */ float last = 0; for (int i = 0; i < len; i++) { const int32_t offset = (code / div) % book->lookup_values; const float val = book->multiplicands[offset] + last; if (outputs[c_inter]) { outputs[c_inter][p_inter] += val; } if (++c_inter == ch) { c_inter = 0; p_inter++; } last = val; div *= book->lookup_values; } } else { for (int i = 0; i < len; i++) { const int32_t offset = (code / div) % book->lookup_values; if (outputs[c_inter]) { outputs[c_inter][p_inter] += book->multiplicands[offset]; } if (++c_inter == ch) { c_inter = 0; p_inter++; } div *= book->lookup_values; } } } else { // book->lookup_type == 2 const int32_t offset = code * book->dimensions; for (int i = 0; i < len; i++) { if (outputs[c_inter]) { outputs[c_inter][p_inter] += book->multiplicands[offset+i]; } if (++c_inter == ch) { c_inter = 0; p_inter++; } } } total_decode -= len; } return true; } /*-----------------------------------------------------------------------*/ /** * codebook_decode_deinterleave_repeat_2: Read Huffman codes from the * packet and decode them in VQ context using the given codebook, * deinterleaving across 2 output channels. This function is a * specialization of codebook_decode_deinterleave_repeat() for ch==2 and * VORBIS_OPTION_DIVIDES_IN_CODEBOOK disabled. * * If an error occurs, the current packet is flushed. * * [Parameters] * handle: Stream handle. * book: Codebook to use. * outputs: Output vectors for each channel. * len: Length of each output vector. * total_decode: Total number of elements to read. * [Return value] * True on success, false on error or end-of-packet. */ static bool codebook_decode_deinterleave_repeat_2( stb_vorbis *handle, const Codebook *book, float **outputs, int n, int32_t total_offset, int32_t total_decode) { if (total_decode > 2*n - total_offset) { total_decode = 2*n - total_offset; } float * const output0 = outputs[0]; float * const output1 = outputs[1]; ASSERT(output0); ASSERT(output1); int c_inter = total_offset % 2; int p_inter = total_offset / 2; int len = book->dimensions; while (total_decode > 0) { /* Make sure we don't run off the end of the output vectors. */ if (len > total_decode) { len = total_decode; } const int32_t code = codebook_decode_scalar_for_vq(handle, book); if (UNLIKELY(code < 0)) { return false; } const int32_t offset = code * book->dimensions; int i = 0; if (c_inter == 1) { output1[p_inter] += book->multiplicands[offset]; c_inter = 0; p_inter++; i++; } for (; i+2 <= len; i += 2) { output0[p_inter] += book->multiplicands[offset+i]; output1[p_inter] += book->multiplicands[offset+i+1]; p_inter++; } if (i < len) { output0[p_inter] += book->multiplicands[offset+i]; c_inter++; } total_decode -= len; } return true; } /*************************************************************************/ /*************************** Floor processing ****************************/ /*************************************************************************/ /** * render_point: Calculate the Y coordinate for point X on the line * (x0,y0)-(x1,y1). Defined by section 9.2.6 in the Vorbis specification. * * [Parameters] * x0, y0: First endpoint of the line. * x1, y1: Second endpoint of the line. * x: X coordinate of point to calculate. * [Return value] * Y value for the given X coordinate. */ static CONST_FUNCTION int render_point(int x0, int y0, int x1, int y1, int x) { /* The definition of this function in the spec has separate code * branches for positive and negative dy values, presumably to try and * avoid defining integer division as rounding negative values toward * zero (even though it ends up doing exactly that in the very next * section...), but C already defines integer division to round both * positive and negative values toward zero, so we just do things the * natural way. */ return y0 + ((y1 - y0) * (x - x0) / (x1 - x0)); } /*-----------------------------------------------------------------------*/ /** * render_line: Render a line for a type 1 floor curve and perform the * dot-product operation with the residue vector. Defined by section * 9.2.7 in the Vorbis specification. * * [Parameters] * x0, y0: First endpoint of the line. * x1, y1: Second endpoint of the line. * output: Output buffer. On entry, this buffer should contain the * residue vector. * n: Window length. */ static inline void render_line(int x0, int y0, int x1, int y1, float *output, int n) { /* N.B.: The spec requires a very specific sequence of operations for * this function to ensure that both the encoder and the decoder * generate the same integer-quantized curve. Take care that any * optimizations to this function do not change the output. */ ASSERT(x0 >= 0); ASSERT(y0 >= 0); ASSERT(x1 > x0); const int dy = y1 - y0; const int adx = x1 - x0; const int base = dy / adx; const int ady = abs(dy) - (abs(base) * adx); const int sy = (dy < 0 ? base-1 : base+1); int x = x0, y = y0; int err = 0; if (x1 >= n) { if (x0 >= n) { return; } x1 = n; } output[x] *= floor1_inverse_db_table[y]; for (x++; x < x1; x++) { err += ady; if (err >= adx) { err -= adx; y += sy; } else { y += base; } output[x] *= floor1_inverse_db_table[y]; } } /*-----------------------------------------------------------------------*/ /** * decode_floor0: Perform type 0 floor decoding. * * [Parameters] * handle: Stream handle. * floor: Floor configuration. * coefficients: Floor coefficient buffer for the current channel. * [Return value] * The floor amplitude (positive), 0 to indicate "unused" status, or * -1 to indicate an undecodable packet. */ static int64_t decode_floor0(stb_vorbis *handle, const Floor0 *floor, float *coefficients) { const int amplitude = get_bits(handle, floor->amplitude_bits); if (amplitude == 0) { return 0; } const int booknumber = get_bits(handle, floor->book_bits); if (booknumber >= floor->number_of_books) { return -1; } const Codebook *book = &handle->codebooks[floor->book_list[booknumber]]; if (book->lookup_type == 0) { /* The specification is unclear on whether a floor 0 configuration * referencing a scalar codebook is a setup-time or decode-time * error; the only applicable rule seems to be 3.3 "If decoder * setup or decode requests such an action [using a codebook of * lookup type 0 in any context expecting a vector return value], * that is an error condition rendering the packet undecodable". * In the spirit of being lenient in what you accept, we treat it * as a decode-time error so that packets which do not use that * floor configuration can still be decoded. */ return -1; } float last = 0; int coef_count = 0; while (coef_count < floor->order) { const int len = min(book->dimensions, floor->order - coef_count); for (int i = 0; i < len; i++) { coefficients[coef_count + i] = last; } if (!codebook_decode(handle, book, &coefficients[coef_count], len)) { return 0; } coef_count += len; last = coefficients[coef_count - 1]; } return amplitude; } /*-----------------------------------------------------------------------*/ /** * decode_floor1_partition: Decode a single partition for a type 1 floor. * * [Parameters] * handle: Stream handle. * floor: Floor configuration. * final_Y: final_Y buffer for the current channel. * offset: Current offset in final_Y buffer. * class: Partition class for the current partition. * [Return value] * New value of offset. */ static int decode_floor1_partition(stb_vorbis *handle, const Floor1 *floor, int16_t *final_Y, int offset, const int class) { const int cdim = floor->class_dimensions[class]; const int cbits = floor->class_subclasses[class]; const int csub = (1 << cbits) - 1; int cval = 0; if (cbits > 0) { const Codebook *book = &handle->codebooks[floor->class_masterbooks[class]]; cval = codebook_decode_scalar(handle, book); } for (int j = 0; j < cdim; j++) { const int book_index = floor->subclass_books[class][cval & csub]; cval >>= cbits; if (book_index >= 0) { const Codebook *book = &handle->codebooks[book_index]; final_Y[offset++] = codebook_decode_scalar(handle, book); } else { final_Y[offset++] = 0; } } return offset; } /*-----------------------------------------------------------------------*/ /** * decode_floor1_amplitude: Calculate an element of the final_Y vector * for a type 1 floor. * * [Parameters] * floor: Floor configuration. * final_Y: final_Y buffer for the current channel. * range: Maximum value for final_Y elements. * step2_flag: Array of flags indicating final_Y elements which will * need to be recalculated later. * i: Element of final_Y to calculate. */ static void decode_floor1_amplitude(const Floor1 *floor, int16_t *final_Y, const int range, bool *step2_flag, const int i) { const int low = floor->neighbors[i].low; const int high = floor->neighbors[i].high; const int predicted = render_point(floor->X_list[low], final_Y[low], floor->X_list[high], final_Y[high], floor->X_list[i]); const int val = final_Y[i]; // Value read from packet. const int highroom = range - predicted; const int lowroom = predicted; int room; if (highroom > lowroom) { room = lowroom * 2; } else { room = highroom * 2; } if (val) { step2_flag[low] = true; step2_flag[high] = true; step2_flag[i] = true; if (val >= room) { if (highroom > lowroom) { /* The spec (7.2.4) suggests that "decoder implementations * guard the values in vector [floor1_final_Y] by clamping * each element to [0, [range])" because "it is possible * to abuse the setup and codebook machinery to produce * negative or over-range results". That can only happen * in these two cases, so we insert tests here. This has * about a 1% performance penalty, but that's better than * corrupt data causing the library to crash. */ if (UNLIKELY(val > range - 1)) { final_Y[i] = range - 1; } else { final_Y[i] = val; } } else { if (UNLIKELY(val > range - 1)) { final_Y[i] = 0; } else { final_Y[i] = (range - 1) - val; } } } else { if (val % 2 != 0) { final_Y[i] = predicted - (val+1)/2; } else { final_Y[i] = predicted + val/2; } } } else { step2_flag[i] = false; final_Y[i] = predicted; } } /*-----------------------------------------------------------------------*/ /** * decode_floor1: Perform type 1 floor decoding. * * [Parameters] * handle: Stream handle. * floor: Floor configuration. * final_Y: final_Y buffer for the current channel. * [Return value] * False to indicate "unused" status (4.2.3 step 6), true otherwise. */ static bool decode_floor1(stb_vorbis *handle, const Floor1 *floor, int16_t *final_Y) { static const int16_t range_list[4] = {256, 128, 86, 64}; static const int8_t range_bits_list[4] = {8, 7, 7, 6}; const int range = range_list[floor->floor1_multiplier - 1]; const int range_bits = range_bits_list[floor->floor1_multiplier - 1]; if (!get_bits(handle, 1)) { return false; // Channel is zero. } /* Floor decode (7.2.3). */ int offset = 2; final_Y[0] = get_bits(handle, range_bits); final_Y[1] = get_bits(handle, range_bits); for (int i = 0; i < floor->partitions; i++) { const int class = floor->partition_class_list[i]; offset = decode_floor1_partition(handle, floor, final_Y, offset, class); } if (UNLIKELY(handle->valid_bits < 0)) { return false; } /* Amplitude value synthesis (7.2.4 step 1). */ bool step2_flag[256]; step2_flag[0] = true; step2_flag[1] = true; for (int i = 2; i < floor->values; i++) { decode_floor1_amplitude(floor, final_Y, range, step2_flag, i); } /* Curve synthesis (7.2.4 step 2). We defer final floor computation * until after synthesis; here, we just clear final_Y values which * need to be synthesized later. */ for (int i = 0; i < floor->values; i++) { if (!step2_flag[i]) { final_Y[i] = -1; } } return true; } /*-----------------------------------------------------------------------*/ /** * do_floor0_final: Perform the floor curve computation and residue * product for type 0 floors. * * [Parameters] * handle: Stream handle. * floor: Floor configuration. * ch: Channel to operate on. * n: Frame window size. * amplitude: Floor amplitude. */ static void do_floor0_final(stb_vorbis *handle, const Floor0 *floor, const int ch, const int n, const int64_t amplitude) { float *output = handle->channel_buffers[handle->cur_channel_buffer][ch]; float *coefficients = handle->coefficients[ch]; const int16_t *map = floor->map[(n == handle->blocksize[1])]; const float omega_base = M_PIf / floor->bark_map_size; const float scaled_amplitude = (float)amplitude / (float)((UINT64_C(1) << floor->amplitude_bits) - 1); const float lfv_scale = 0.11512925f * floor->amplitude_offset; for (int i = 0; i < floor->order; i++) { coefficients[i] = 2 * cosf(coefficients[i]); } int i = 0; do { const float two_cos_omega = 2 * cosf(map[i] * omega_base); float p = 0.5, q = 0.5; int j; for (j = 0; j <= (floor->order - 2) / 2; j++) { p *= coefficients[2*j+1] - two_cos_omega; q *= coefficients[2*j] - two_cos_omega; } if (floor->order % 2 != 0) { q *= coefficients[2*j] - two_cos_omega; /* The spec gives this as "4 - cos^2(omega)", but we use the * equality (a^2-b^2) = (a+b)(a-b) to give us constants * common between both branches of the if statement. */ p *= p * (2 + two_cos_omega) * (2 - two_cos_omega); q *= q; } else { p *= p * (2 - two_cos_omega); q *= q * (2 + two_cos_omega); } const float linear_floor_value = expf(lfv_scale * ((scaled_amplitude / sqrtf(p + q)) - 1)); const int iteration_condition = map[i]; do { output[i] *= linear_floor_value; i++; } while (map[i] == iteration_condition); } while (i < n/2); } /*-----------------------------------------------------------------------*/ /** * do_floor1_final: Perform the final floor curve computation and residue * product for type 1 floors. * * [Parameters] * handle: Stream handle. * floor: Floor configuration. * ch: Channel to operate on. * n: Frame window size. */ static void do_floor1_final(stb_vorbis *handle, const Floor1 *floor, const int ch, const int n) { float *output = handle->channel_buffers[handle->cur_channel_buffer][ch]; const int16_t *final_Y = handle->final_Y[ch]; int lx = 0; int ly = final_Y[0] * floor->floor1_multiplier; for (int i = 1; i < floor->values; i++) { int j = floor->sorted_order[i]; if (final_Y[j] >= 0) { const int hx = floor->X_list[j]; const int hy = final_Y[j] * floor->floor1_multiplier; render_line(lx, ly, hx, hy, output, n/2); lx = hx; ly = hy; } } if (lx < n/2) { /* Optimization of: render_line(lx, ly, n/2, ly, output, n/2); */ for (int i = lx; i < n/2; i++) { output[i] *= floor1_inverse_db_table[ly]; } } } /*************************************************************************/ /*************************** Residue handling ****************************/ /*************************************************************************/ /** * decode_residue_partition_0: Decode a single residue partition for * residue type 0. * * [Parameters] * handle: Stream handle. * book: Codebook to use for decoding. * size: Residue partition size. * offset: Output offset. * outputs: Output vector array. * n: Length of residue vectors (half of window size). * ch: Number of channels of residue data to decode. * [Return value] * True on success, false on end of packet. */ static bool decode_residue_partition_0( stb_vorbis *handle, const Codebook *book, int size, int offset, float **output_ptr, UNUSED int n, UNUSED int ch) { float *output = *output_ptr; const int dimensions = book->dimensions; const int step = size / dimensions; for (int i = 0; i < step; i++) { if (!codebook_decode_step(handle, book, &output[offset+i], size-i, step)) { return false; } } return true; } /*-----------------------------------------------------------------------*/ /** * decode_residue_partition_1: Decode a single residue partition for * residue type 1. * * [Parameters] * handle: Stream handle. * book: Codebook to use for decoding. * size: Residue partition size. * offset: Output offset. * outputs: Output vector array. * n: Length of residue vectors (half of window size). * ch: Number of channels of residue data to decode. * [Return value] * True on success, false on end of packet. */ static bool decode_residue_partition_1( stb_vorbis *handle, const Codebook *book, int size, int offset, float **output_ptr, UNUSED int n, UNUSED int ch) { float *output = *output_ptr; const int dimensions = book->dimensions; for (int i = 0; i < size; i += dimensions) { if (!codebook_decode(handle, book, &output[offset+i], size-i)) { return false; } } return true; } /*-----------------------------------------------------------------------*/ /** * decode_residue_partition_2: Decode a single residue partition for * residue type 2. * * [Parameters] * handle: Stream handle. * book: Codebook to use for decoding. * size: Residue partition size. * offset: Output offset. * outputs: Output vector array. * n: Length of residue vectors (half of window size). * ch: Number of channels of residue data to decode. * [Return value] * True on success, false on end of packet. */ static bool decode_residue_partition_2( stb_vorbis *handle, const Codebook *book, int size, int offset, float **outputs, int n, int ch) { return codebook_decode_deinterleave_repeat( handle, book, outputs, ch, n, offset, size); } /*-----------------------------------------------------------------------*/ /** * decode_residue_partition_2_2ch: Decode a single residue partition for * residue type 2 with two channels. Both channels must be active for * decoding. * * [Parameters] * handle: Stream handle. * book: Codebook to use for decoding. * size: Residue partition size. * offset: Output offset. * outputs: Output vector array. * n: Length of residue vectors (half of window size). * ch: Number of channels of residue data to decode. * [Return value] * True on success, false on end of packet. */ static bool decode_residue_partition_2_2ch( stb_vorbis *handle, const Codebook *book, int size, int offset, float **outputs, int n, UNUSED int ch) { return codebook_decode_deinterleave_repeat_2( handle, book, outputs, n, offset, size); } /*-----------------------------------------------------------------------*/ /** * decode_residue_partition: Decode a single residue partition. * * [Parameters] * handle: Stream handle. * res: Residue configuration. * pass: Residue decoding pass. * partition_count: Current partition index. * vqclass: Classification for this partition. * outputs: Output vector array. * n: Length of residue vectors (half of window size). * ch: Number of channels of residue data to decode. * do_partition: Partition decoding function. * [Return value] * True on success, false on end of packet. */ static inline bool decode_residue_partition( stb_vorbis *handle, const Residue *res, int pass, int partition_count, int vqclass, float **outputs, int n, int ch, bool (*do_partition)( stb_vorbis *handle, const Codebook *book, int size, int offset, float **outputs, int n, int ch)) { const int vqbook = res->residue_books[vqclass][pass]; if (vqbook < 0) { return true; } const Codebook *book = &handle->codebooks[vqbook]; if (UNLIKELY(book->lookup_type == 0)) { /* Lookup type 0 is only valid in a scalar context. The spec * doesn't say what to do about this case; we treat it like * end-of-packet. */ flush_packet(handle); handle->valid_bits = -1; return false; } const int32_t offset = res->begin + partition_count * res->part_size; return (*do_partition)( handle, book, res->part_size, offset, outputs, n, ch); } /*-----------------------------------------------------------------------*/ /** * decode_residue_common: Common decode processing for all residue vector * types. * * [Parameters] * handle: Stream handle. * res: Residue configuration. * type: Residue type. * n: Length of residue vectors (half of window size). * ch: Number of channels of residue data to decode. * residue_buffers: Pointers to the buffers for each channel in which * the residue vectors should be stored. A pointer of NULL * indicates that the given channel should not be decoded (the * "do not decode" state referred to by the specification). * do_partition: Function pointer for the partition decoding function * to use. */ static void decode_residue_common( stb_vorbis *handle, const Residue *res, int type, int n, int ch, float *residue_buffers[], bool (*do_partition)( stb_vorbis *handle, const Codebook *book, int size, int offset, float **outputs, int n, int ch)) { const Codebook *classbook = &handle->codebooks[res->classbook]; const int classwords = classbook->dimensions; const int actual_size = (type == 2 ? n*ch : n); const int n_to_read = min(res->end, actual_size) - min(res->begin, actual_size); const int partitions_to_read = n_to_read / res->part_size; const int ch_to_read = (type == 2 ? 1 : ch); if (n_to_read <= 0) { return; } if (handle->divides_in_residue) { int **classifications = handle->classifications; for (int pass = 0; pass < 8; pass++) { int partition_count = 0; while (partition_count < partitions_to_read) { if (pass == 0) { for (int j = 0; j < ch_to_read; j++) { if (residue_buffers[j]) { int temp = codebook_decode_scalar(handle, classbook); if (UNLIKELY(temp == EOP)) { return; } for (int i = classwords-1; i >= 0; i--) { classifications[j][i+partition_count] = temp % res->classifications; temp /= res->classifications; } } } } for (int i = 0; i < classwords && partition_count < partitions_to_read; i++, partition_count++) { for (int j = 0; j < ch_to_read; j++) { if (residue_buffers[j]) { const int vqclass = classifications[j][partition_count]; if (!decode_residue_partition( handle, res, pass, partition_count, vqclass, &residue_buffers[j], n, ch, do_partition)) { return; } } } } } // while (partition_count < partitions_to_read) } // for (int pass = 0; pass < 8; pass++) } else { // !handle->divides_in_residue uint8_t ***part_classdata = handle->classifications; for (int pass = 0; pass < 8; pass++) { int partition_count = 0; int class_set = 0; while (partition_count < partitions_to_read) { if (pass == 0) { for (int j = 0; j < ch_to_read; j++) { if (residue_buffers[j]) { const int temp = codebook_decode_scalar(handle, classbook); if (UNLIKELY(temp == EOP)) { return; } part_classdata[j][class_set] = res->classdata[temp]; } } } for (int i = 0; i < classwords && partition_count < partitions_to_read; i++, partition_count++) { for (int j = 0; j < ch_to_read; j++) { if (residue_buffers[j]) { const int vqclass = part_classdata[j][class_set][i]; if (!decode_residue_partition( handle, res, pass, partition_count, vqclass, &residue_buffers[j], n, ch, do_partition)) { return; } } } } class_set++; } // while (partition_count < partitions_to_read) } // for (int pass = 0; pass < 8; pass++) } // if (handle->divides_in_residue) } /*-----------------------------------------------------------------------*/ /** * decode_residue: Decode the residue vectors for the current frame. * * [Parameters] * handle: Stream handle. * residue_index: Index of residue configuration to use. * n: Length of residue vectors (half of window size). * ch: Number of channels of residue data to decode. * residue_buffers: Pointers to the buffers for each channel in which * the residue vectors should be stored. A pointer of NULL * indicates that the given channel should not be decoded (the * "do not decode" state referred to by the specification). */ static void decode_residue(stb_vorbis *handle, int residue_index, int n, int ch, float *residue_buffers[]) { Residue *res = &handle->residue_config[residue_index]; const int type = handle->residue_types[residue_index]; for (int i = 0; i < ch; i++) { if (residue_buffers[i]) { memset(residue_buffers[i], 0, sizeof(*residue_buffers[i]) * n); } } /* For residue type 2, if there are multiple channels, we need to * deinterleave the data after decoding it. */ if (type == 2 && ch > 1) { if (ch == 2 && !handle->divides_in_residue && residue_buffers[0] && residue_buffers[1]) { /* We have slightly optimized handling for this case. */ decode_residue_common( handle, res, type, n, ch, residue_buffers, decode_residue_partition_2_2ch); } else { decode_residue_common( handle, res, type, n, ch, residue_buffers, decode_residue_partition_2); } } else { decode_residue_common( handle, res, type, n, ch, residue_buffers, type==0 ? decode_residue_partition_0 : decode_residue_partition_1); } } /*************************************************************************/ /************************ Inverse MDCT processing ************************/ /*************************************************************************/ /** * imdct_setup_step1: Setup and step 1 of the IMDCT. * * [Parameters] * n: Window size. * A: Twiddle factor A. * Y: Input buffer (length n/2). * v: Output buffer (length n/2). Must be distinct from the input buffer. */ static void imdct_setup_step1(const unsigned int n, const float *A, const float *Y, float *v) { #if 0 // Roughly literal implementation. for (unsigned int i = 0; i < n/4; i += 2) { v[(n/2)-i-1] = Y[i*2]*A[i+0] - Y[(i+1)*2]*A[i+1]; v[(n/2)-i-2] = Y[i*2]*A[i+1] + Y[(i+1)*2]*A[i+0]; } for (unsigned int i = 0; i < n/4; i += 2) { v[(n/4)-i-1] = -Y[(n/2-1)-i*2]*A[(n/4)+i+0] - -Y[(n/2-1)-(i+1)*2]*A[(n/4)+i+1]; v[(n/4)-i-2] = -Y[(n/2-1)-i*2]*A[(n/4)+i+1] + -Y[(n/2-1)-(i+1)*2]*A[(n/4)+i+0]; } #else // Optimized implementations. #if defined(ENABLE_ASM_ARM_NEON) const int step = 4; const uint32x4_t sign_1010 = (uint32x4_t)vdupq_n_u64(UINT64_C(1)<<63); #elif defined(ENABLE_ASM_X86_AVX2) const int step = 8; const __m256i sign_1010 = _mm256_set1_epi64x(UINT64_C(1)<<63); const __m256i sign_1111 = _mm256_set1_epi32(UINT32_C(1)<<31); #elif defined(ENABLE_ASM_X86_SSE2) const int step = 4; const __m128i sign_1010 = _mm_set1_epi64x(UINT64_C(1)<<63); const __m128i sign_1111 = _mm_set1_epi32(UINT32_C(1)<<31); #else const int step = 4; #endif ASSERT((n/4) % step == 0); v += n/2; for (int i = 0, j = -step; i < (int)(n/4); i += step, j -= step) { #if defined(ENABLE_ASM_ARM_NEON) const float32x4x2_t Y_all = vld2q_f32(&Y[i*2]); const float32x4_t Y_i2 = vswizzleq_zzxx_f32(Y_all.val[0]); const float32x4_t Y_i3 = vswizzleq_wwyy_f32(Y_all.val[0]); const float32x4_t A_i = vld1q_f32(&A[i]); const float32x4_t A_i3 = vswizzleq_wzyx_f32(A_i); const float32x4_t A_i2 = vswizzleq_zwxy_f32(A_i); vst1q_f32(&v[j], vaddq_f32(vmulq_f32(Y_i2, A_i3), veorq_f32(sign_1010, vmulq_f32(Y_i3, A_i2)))); #elif defined(ENABLE_ASM_X86_AVX2) /* AVX2 doesn't include an instruction allowing us to mix two * registers while also shuffling values between 128-bit lanes * of each register, so we need a couple of extra temporaries to * get to our desired element order. */ const __m256 Y_lo = _mm256_moveldup_ps(_mm256_load_ps(&Y[(i+0)*2])); const __m256 Y_hi = _mm256_moveldup_ps(_mm256_load_ps(&Y[(i+4)*2])); const __m256 tY_lo = _mm256_permute4x64_ps(Y_lo, _MM_SHUFFLE(0,2,1,3)); const __m256 tY_hi = _mm256_permute4x64_ps(Y_hi, _MM_SHUFFLE(0,2,1,3)); const __m256 Y_i2 = _mm256_permute2f128_ps(tY_lo, tY_hi, 0x13); const __m256 Y_i3 = _mm256_permute2f128_ps(tY_lo, tY_hi, 0x02); /* Again, we need an extra operation to swap between 128-bit lanes. * The use of permute4x64 here should help the compiler optimize to * a single vpermpd on a memory operand. */ const __m256 A_i = _mm256_permute4x64_ps(_mm256_load_ps(&A[i]), _MM_SHUFFLE(1,0,3,2)); const __m256 A_i3 = _mm256_permute_ps(A_i, _MM_SHUFFLE(0,1,2,3)); const __m256 A_i2 = _mm256_permute_ps(A_i, _MM_SHUFFLE(1,0,3,2)); _mm256_store_ps(&v[j], _mm256_add_ps( _mm256_mul_ps(Y_i2, A_i3), _mm256_xor_sign(sign_1010, _mm256_mul_ps(Y_i3, A_i2)))); #elif defined(ENABLE_ASM_X86_SSE2) const __m128 Y_lo = _mm_load_ps(&Y[(i+0)*2]); const __m128 Y_hi = _mm_load_ps(&Y[(i+2)*2]); const __m128 Y_i2 = _mm_shuffle_ps(Y_hi, Y_lo, _MM_SHUFFLE(0,0,0,0)); const __m128 Y_i3 = _mm_shuffle_ps(Y_hi, Y_lo, _MM_SHUFFLE(2,2,2,2)); const __m128 A_i = _mm_load_ps(&A[i]); const __m128 A_i3 = _mm_shuffle_ps(A_i, A_i, _MM_SHUFFLE(0,1,2,3)); const __m128 A_i2 = _mm_shuffle_ps(A_i, A_i, _MM_SHUFFLE(1,0,3,2)); _mm_store_ps(&v[j], _mm_add_ps(_mm_mul_ps(Y_i2, A_i3), _mm_xor_sign(sign_1010, _mm_mul_ps(Y_i3, A_i2)))); #else v[j+3] = Y[(i+0)*2]*A[i+0] - Y[(i+1)*2]*A[i+1]; v[j+2] = Y[(i+0)*2]*A[i+1] + Y[(i+1)*2]*A[i+0]; v[j+1] = Y[(i+2)*2]*A[i+2] - Y[(i+3)*2]*A[i+3]; v[j+0] = Y[(i+2)*2]*A[i+3] + Y[(i+3)*2]*A[i+2]; #endif } Y += n/2; A += n/4; v -= n/4; for (int i = 0, j = -step; i < (int)(n/4); i += step, j -= step) { #if defined(ENABLE_ASM_ARM_NEON) const float32x4x2_t Y_all = vld2q_f32(&Y[j*2]); const float32x4_t Y_j1 = vnegq_f32(vswizzleq_yyww_f32(Y_all.val[1])); const float32x4_t Y_j0 = vnegq_f32(vswizzleq_xxzz_f32(Y_all.val[1])); const float32x4_t A_i = vld1q_f32(&A[i]); const float32x4_t A_i3 = vswizzleq_wzyx_f32(A_i); const float32x4_t A_i2 = vswizzleq_zwxy_f32(A_i); vst1q_f32(&v[j], vaddq_f32(vmulq_f32(Y_j1, A_i3), veorq_f32(sign_1010, vmulq_f32(Y_j0, A_i2)))); #elif defined(ENABLE_ASM_X86_AVX2) const __m256 Y_lo = _mm256_xor_sign(sign_1111, _mm256_movehdup_ps(_mm256_load_ps(&Y[(j+0)*2]))); const __m256 Y_hi = _mm256_xor_sign(sign_1111, _mm256_movehdup_ps(_mm256_load_ps(&Y[(j+4)*2]))); const __m256 tY_lo = _mm256_permute4x64_ps(Y_lo, _MM_SHUFFLE(2,0,3,1)); const __m256 tY_hi = _mm256_permute4x64_ps(Y_hi, _MM_SHUFFLE(2,0,3,1)); const __m256 Y_j1 = _mm256_permute2f128_ps(tY_lo, tY_hi, 0x20); const __m256 Y_j0 = _mm256_permute2f128_ps(tY_lo, tY_hi, 0x31); const __m256 A_i = _mm256_permute4x64_ps(_mm256_load_ps(&A[i]), _MM_SHUFFLE(1,0,3,2)); const __m256 A_i3 = _mm256_permute_ps(A_i, _MM_SHUFFLE(0,1,2,3)); const __m256 A_i2 = _mm256_permute_ps(A_i, _MM_SHUFFLE(1,0,3,2)); _mm256_store_ps(&v[j], _mm256_add_ps( _mm256_mul_ps(Y_j1, A_i3), _mm256_xor_sign(sign_1010, _mm256_mul_ps(Y_j0, A_i2)))); #elif defined(ENABLE_ASM_X86_SSE2) const __m128 Y_lo = _mm_xor_sign(sign_1111, _mm_load_ps(&Y[(j+0)*2])); const __m128 Y_hi = _mm_xor_sign(sign_1111, _mm_load_ps(&Y[(j+2)*2])); const __m128 Y_j1 = _mm_shuffle_ps(Y_lo, Y_hi, _MM_SHUFFLE(3,3,3,3)); const __m128 Y_j0 = _mm_shuffle_ps(Y_lo, Y_hi, _MM_SHUFFLE(1,1,1,1)); const __m128 A_i = _mm_load_ps(&A[i]); const __m128 A_i3 = _mm_shuffle_ps(A_i, A_i, _MM_SHUFFLE(0,1,2,3)); const __m128 A_i2 = _mm_shuffle_ps(A_i, A_i, _MM_SHUFFLE(1,0,3,2)); _mm_store_ps(&v[j], _mm_add_ps(_mm_mul_ps(Y_j1, A_i3), _mm_xor_sign(sign_1010, _mm_mul_ps(Y_j0, A_i2)))); #else v[j+3] = -Y[(j+3)*2+1]*A[i+0] - -Y[(j+2)*2+1]*A[i+1]; v[j+2] = -Y[(j+3)*2+1]*A[i+1] + -Y[(j+2)*2+1]*A[i+0]; v[j+1] = -Y[(j+1)*2+1]*A[i+2] - -Y[(j+0)*2+1]*A[i+3]; v[j+0] = -Y[(j+1)*2+1]*A[i+3] + -Y[(j+0)*2+1]*A[i+2]; #endif } #endif // Literal vs. optimized implementation. } /*-----------------------------------------------------------------------*/ /** * imdct_step2: Step 2 of the IMDCT. * * [Parameters] * n: Window size. * A: Twiddle factor A. * v: Input buffer (length n/2). * w: Output buffer (length n/2). May be the same as the input buffer. */ static void imdct_step2(const unsigned int n, const float *A, const float *v, float *w) { const float *v0 = &v[(n/4)]; const float *v1 = &v[0]; float *w0 = &w[(n/4)]; float *w1 = &w[0]; #if defined(ENABLE_ASM_ARM_NEON) const int step = 4; const uint32x4_t sign_1010 = (uint32x4_t)vdupq_n_u64(UINT64_C(1)<<63); #elif defined(ENABLE_ASM_X86_AVX2) const int step = 8; const __m256i sign_1010 = _mm256_set1_epi64x(UINT64_C(1)<<63); const __m256i permute = _mm256_set_epi32(1, 1, 3, 3, 0, 0, 2, 2); #elif defined(ENABLE_ASM_X86_SSE2) const int step = 4; const __m128i sign_1010 = _mm_set1_epi64x(UINT64_C(1)<<63); #else const int step = 4; #endif ASSERT((n/2) % (2*step) == 0); for (int i = 0, j = n/2 - 2*step; j >= 0; i += step, j -= 2*step) { #if defined(ENABLE_ASM_ARM_NEON) const float32x4_t v0_i = vld1q_f32(&v0[i]); const float32x4_t v1_i = vld1q_f32(&v1[i]); const float32x4x2_t A_all = vld2q_f32(&A[j]); vst1q_f32(&w0[i], vaddq_f32(v0_i, v1_i)); const float32x4_t diff = vsubq_f32(v0_i, v1_i); const float32x4_t diff2 = vswizzleq_yxwz_f32(diff); const uint32x4_t A_sel = {~0U, 0, ~0U, 0}; const float32x4_t A_40 = vswizzleq_zzxx_f32(vbslq_f32( A_sel, A_all.val[0], (float32x4_t)vshlq_n_u64((uint64x2_t)A_all.val[0], 32))); const float32x4_t A_51 = vswizzleq_zzxx_f32(vbslq_f32( A_sel, A_all.val[1], (float32x4_t)vshlq_n_u64((uint64x2_t)A_all.val[1], 32))); vst1q_f32(&w1[i], vaddq_f32(vmulq_f32(diff, A_40), veorq_f32(sign_1010, vmulq_f32(diff2, A_51)))); #elif defined(ENABLE_ASM_X86_AVX2) const __m256 v0_i = _mm256_load_ps(&v0[i]); const __m256 v1_i = _mm256_load_ps(&v1[i]); const __m256 A_0 = _mm256_load_ps(&A[j+0]); const __m256 A_8 = _mm256_load_ps(&A[j+8]); _mm256_store_ps(&w0[i], _mm256_add_ps(v0_i, v1_i)); const __m256 diff = _mm256_sub_ps(v0_i, v1_i); const __m256 diff2 = _mm256_permute_ps(diff, _MM_SHUFFLE(2,3,0,1)); const __m256 A_lo = _mm256_permutevar_ps( _mm256_permute4x64_ps(A_0, _MM_SHUFFLE(2,0,2,0)), permute); const __m256 A_hi = _mm256_permutevar_ps( _mm256_permute4x64_ps(A_8, _MM_SHUFFLE(2,0,2,0)), permute); const __m256 A_40 = _mm256_permute2f128_ps(A_lo, A_hi, 0x02); const __m256 A_51 = _mm256_permute2f128_ps(A_lo, A_hi, 0x13); _mm256_store_ps(&w1[i], _mm256_add_ps( _mm256_mul_ps(diff, A_40), _mm256_xor_sign(sign_1010, _mm256_mul_ps(diff2, A_51)))); #elif defined(ENABLE_ASM_X86_SSE2) const __m128 v0_i = _mm_load_ps(&v0[i]); const __m128 v1_i = _mm_load_ps(&v1[i]); const __m128 A_0 = _mm_load_ps(&A[j+0]); const __m128 A_4 = _mm_load_ps(&A[j+4]); _mm_store_ps(&w0[i], _mm_add_ps(v0_i, v1_i)); const __m128 diff = _mm_sub_ps(v0_i, v1_i); const __m128 diff2 = _mm_shuffle_ps(diff, diff, _MM_SHUFFLE(2,3,0,1)); const __m128 A_40 = _mm_shuffle_ps(A_4, A_0, _MM_SHUFFLE(0,0,0,0)); const __m128 A_51 = _mm_shuffle_ps(A_4, A_0, _MM_SHUFFLE(1,1,1,1)); _mm_store_ps(&w1[i], _mm_add_ps(_mm_mul_ps(diff, A_40), _mm_xor_sign(sign_1010, _mm_mul_ps(diff2, A_51)))); #else float v40_20, v41_21; v41_21 = v0[i+1] - v1[i+1]; v40_20 = v0[i+0] - v1[i+0]; w0[i+1] = v0[i+1] + v1[i+1]; w0[i+0] = v0[i+0] + v1[i+0]; w1[i+1] = v41_21*A[j+4] - v40_20*A[j+5]; w1[i+0] = v40_20*A[j+4] + v41_21*A[j+5]; v41_21 = v0[i+3] - v1[i+3]; v40_20 = v0[i+2] - v1[i+2]; w0[i+3] = v0[i+3] + v1[i+3]; w0[i+2] = v0[i+2] + v1[i+2]; w1[i+3] = v41_21*A[j+0] - v40_20*A[j+1]; w1[i+2] = v40_20*A[j+0] + v41_21*A[j+1]; #endif // ENABLE_ASM_* } } /*-----------------------------------------------------------------------*/ /** * imdct_step3_inner_r_loop: Step 3 of the IMDCT for a single iteration * of the l and s loops. * * [Parameters] * lim: Iteration limit for the r loop. * A: Twiddle factor A. * e: Input/output buffer (length n/2, modified in place). * i_off: Initial offset, calculated as (n/2 - k0*s). * k0: Constant k0. * k1: Constant k1. */ static void imdct_step3_inner_r_loop(const unsigned int lim, const float *A, float *e, int i_off, int k0, int k1) { float *e0 = e + i_off; float *e2 = e0 - k0/2; #if defined(ENABLE_ASM_ARM_NEON) const uint32x4_t sign_1010 = (uint32x4_t)vdupq_n_u64(UINT64_C(1)<<63); #elif defined(ENABLE_ASM_X86_AVX2) const __m256i sign_1010 = _mm256_set1_epi64x(UINT64_C(1)<<63); #elif defined(ENABLE_ASM_X86_SSE2) const __m128i sign_1010 = _mm_set1_epi64x(UINT64_C(1)<<63); #endif for (int i = lim/4; i > 0; --i, e0 -= 8, e2 -= 8) { #if defined(ENABLE_ASM_ARM_NEON) const float32x4_t e0_4 = vld1q_f32(&e0[-4]); const float32x4_t e2_4 = vld1q_f32(&e2[-4]); const float32x4_t e0_8 = vld1q_f32(&e0[-8]); const float32x4_t e2_8 = vld1q_f32(&e2[-8]); const float32x4x2_t A_0k = vuzpq_f32(vld1q_f32(&A[0]), vld1q_f32(&A[k1])); const float32x4x2_t A_2k = vuzpq_f32(vld1q_f32(&A[2*k1]), vld1q_f32(&A[3*k1])); vst1q_f32(&e0[-4], vaddq_f32(e0_4, e2_4)); vst1q_f32(&e0[-8], vaddq_f32(e0_8, e2_8)); const float32x4_t diff_4 = vsubq_f32(e0_4, e2_4); const float32x4_t diff_8 = vsubq_f32(e0_8, e2_8); const float32x4_t diff2_4 = vswizzleq_yxwz_f32(diff_4); const float32x4_t diff2_8 = vswizzleq_yxwz_f32(diff_8); const uint32x4_t A_sel = {~0U, 0, ~0U, 0}; const float32x4_t A_0 = vswizzleq_zzxx_f32(vbslq_f32( A_sel, A_0k.val[0], (float32x4_t)vshlq_n_u64((uint64x2_t)A_0k.val[0], 32))); const float32x4_t A_1 = vswizzleq_zzxx_f32(vbslq_f32( A_sel, A_0k.val[1], (float32x4_t)vshlq_n_u64((uint64x2_t)A_0k.val[1], 32))); const float32x4_t A_2 = vswizzleq_zzxx_f32(vbslq_f32( A_sel, A_2k.val[0], (float32x4_t)vshlq_n_u64((uint64x2_t)A_2k.val[0], 32))); const float32x4_t A_3 = vswizzleq_zzxx_f32(vbslq_f32( A_sel, A_2k.val[1], (float32x4_t)vshlq_n_u64((uint64x2_t)A_2k.val[1], 32))); vst1q_f32(&e2[-4], vaddq_f32(vmulq_f32(diff_4, A_0), veorq_f32(sign_1010, vmulq_f32(diff2_4, A_1)))); vst1q_f32(&e2[-8], vaddq_f32(vmulq_f32(diff_8, A_2), veorq_f32(sign_1010, vmulq_f32(diff2_8, A_3)))); A += 4*k1; #elif defined(ENABLE_ASM_X86_AVX2) const __m256 e0_8 = _mm256_load_ps(&e0[-8]); const __m256 e2_8 = _mm256_load_ps(&e2[-8]); const __m128 A_0k = _mm_load_ps(A); const __m128 A_1k = _mm_load_ps(&A[k1]); const __m128 A_2k = _mm_load_ps(&A[2*k1]); const __m128 A_3k = _mm_load_ps(&A[3*k1]); _mm256_store_ps(&e0[-8], _mm256_add_ps(e0_8, e2_8)); const __m256 diff = _mm256_sub_ps(e0_8, e2_8); const __m256 diff2 = _mm256_permute_ps(diff, _MM_SHUFFLE(2,3,0,1)); const __m128 A_0 = _mm_shuffle_ps(A_1k, A_0k, _MM_SHUFFLE(0,0,0,0)); const __m128 A_1 = _mm_shuffle_ps(A_1k, A_0k, _MM_SHUFFLE(1,1,1,1)); const __m128 A_2 = _mm_shuffle_ps(A_3k, A_2k, _MM_SHUFFLE(0,0,0,0)); const __m128 A_3 = _mm_shuffle_ps(A_3k, A_2k, _MM_SHUFFLE(1,1,1,1)); const __m256 A_20 = _mm256_set_m128(A_0, A_2); const __m256 A_31 = _mm256_set_m128(A_1, A_3); _mm256_store_ps(&e2[-8], _mm256_add_ps( _mm256_mul_ps(diff, A_20), _mm256_xor_sign(sign_1010, _mm256_mul_ps(diff2, A_31)))); A += 4*k1; #elif defined(ENABLE_ASM_X86_SSE2) const __m128 e0_4 = _mm_load_ps(&e0[-4]); const __m128 e2_4 = _mm_load_ps(&e2[-4]); const __m128 e0_8 = _mm_load_ps(&e0[-8]); const __m128 e2_8 = _mm_load_ps(&e2[-8]); const __m128 A_0k = _mm_load_ps(A); const __m128 A_1k = _mm_load_ps(&A[k1]); const __m128 A_2k = _mm_load_ps(&A[2*k1]); const __m128 A_3k = _mm_load_ps(&A[3*k1]); _mm_store_ps(&e0[-4], _mm_add_ps(e0_4, e2_4)); _mm_store_ps(&e0[-8], _mm_add_ps(e0_8, e2_8)); const __m128 diff_4 = _mm_sub_ps(e0_4, e2_4); const __m128 diff_8 = _mm_sub_ps(e0_8, e2_8); const __m128 diff2_4 = _mm_shuffle_ps(diff_4, diff_4, _MM_SHUFFLE(2,3,0,1)); const __m128 diff2_8 = _mm_shuffle_ps(diff_8, diff_8, _MM_SHUFFLE(2,3,0,1)); const __m128 A_0 = _mm_shuffle_ps(A_1k, A_0k, _MM_SHUFFLE(0,0,0,0)); const __m128 A_1 = _mm_shuffle_ps(A_1k, A_0k, _MM_SHUFFLE(1,1,1,1)); const __m128 A_2 = _mm_shuffle_ps(A_3k, A_2k, _MM_SHUFFLE(0,0,0,0)); const __m128 A_3 = _mm_shuffle_ps(A_3k, A_2k, _MM_SHUFFLE(1,1,1,1)); _mm_store_ps(&e2[-4], _mm_add_ps(_mm_mul_ps(diff_4, A_0), _mm_xor_sign(sign_1010, _mm_mul_ps(diff2_4, A_1)))); _mm_store_ps(&e2[-8], _mm_add_ps(_mm_mul_ps(diff_8, A_2), _mm_xor_sign(sign_1010, _mm_mul_ps(diff2_8, A_3)))); A += 4*k1; #else float k00_20, k01_21; k00_20 = e0[-1] - e2[-1]; k01_21 = e0[-2] - e2[-2]; e0[-1] = e0[-1] + e2[-1]; e0[-2] = e0[-2] + e2[-2]; e2[-1] = k00_20 * A[0] - k01_21 * A[1]; e2[-2] = k01_21 * A[0] + k00_20 * A[1]; A += k1; k00_20 = e0[-3] - e2[-3]; k01_21 = e0[-4] - e2[-4]; e0[-3] = e0[-3] + e2[-3]; e0[-4] = e0[-4] + e2[-4]; e2[-3] = k00_20 * A[0] - k01_21 * A[1]; e2[-4] = k01_21 * A[0] + k00_20 * A[1]; A += k1; k00_20 = e0[-5] - e2[-5]; k01_21 = e0[-6] - e2[-6]; e0[-5] = e0[-5] + e2[-5]; e0[-6] = e0[-6] + e2[-6]; e2[-5] = k00_20 * A[0] - k01_21 * A[1]; e2[-6] = k01_21 * A[0] + k00_20 * A[1]; A += k1; k00_20 = e0[-7] - e2[-7]; k01_21 = e0[-8] - e2[-8]; e0[-7] = e0[-7] + e2[-7]; e0[-8] = e0[-8] + e2[-8]; e2[-7] = k00_20 * A[0] - k01_21 * A[1]; e2[-8] = k01_21 * A[0] + k00_20 * A[1]; A += k1; #endif // ENABLE_ASM_* } } /*-----------------------------------------------------------------------*/ /** * imdct_step3_inner_s_loop: Step 3 of the IMDCT for a single iteration * of the l and r loops. * * [Parameters] * lim: Iteration limit for the s loop. * A: Twiddle factor A. * e: Input/output buffer (length n/2, modified in place). * i_off: Initial offset, calculated as (n/2 - k0*s). * k0: Constant k0. * k1: Constant k1. */ static void imdct_step3_inner_s_loop(const unsigned int lim, const float *A, float *e, int i_off, int k0, int k1) { const float A0 = A[0]; const float A1 = A[0+1]; const float A2 = A[0+k1]; const float A3 = A[0+k1+1]; const float A4 = A[0+k1*2+0]; const float A5 = A[0+k1*2+1]; const float A6 = A[0+k1*3+0]; const float A7 = A[0+k1*3+1]; float *e0 = e + i_off; float *e2 = e0 - k0/2; for (int i = lim; i > 0; i--, e0 -= k0, e2 -= k0) { #if defined(ENABLE_ASM_ARM_NEON) const float32x4_t A_20 = {A2, A2, A0, A0}; const float32x4_t A_31 = {A3, -A3, A1, -A1}; const float32x4_t A_64 = {A6, A6, A4, A4}; const float32x4_t A_75 = {A7, -A7, A5, -A5}; const float32x4_t e0_4 = vld1q_f32(&e0[-4]); const float32x4_t e2_4 = vld1q_f32(&e2[-4]); const float32x4_t e0_8 = vld1q_f32(&e0[-8]); const float32x4_t e2_8 = vld1q_f32(&e2[-8]); vst1q_f32(&e0[-4], vaddq_f32(e0_4, e2_4)); vst1q_f32(&e0[-8], vaddq_f32(e0_8, e2_8)); const float32x4_t diff_4 = vsubq_f32(e0_4, e2_4); const float32x4_t diff_8 = vsubq_f32(e0_8, e2_8); const float32x4_t diff2_4 = vswizzleq_yxwz_f32(diff_4); const float32x4_t diff2_8 = vswizzleq_yxwz_f32(diff_8); vst1q_f32(&e2[-4], vaddq_f32(vmulq_f32(diff_4, A_20), vmulq_f32(diff2_4, A_31))); vst1q_f32(&e2[-8], vaddq_f32(vmulq_f32(diff_8, A_64), vmulq_f32(diff2_8, A_75))); #elif defined(ENABLE_ASM_X86_AVX2) const __m256 A_6420 = _mm256_set_ps(A0, A0, A2, A2, A4, A4, A6, A6); const __m256 A_7531 = _mm256_set_ps(-A1, A1, -A3, A3, -A5, A5, -A7, A7); const __m256 e0_8 = _mm256_load_ps(&e0[-8]); const __m256 e2_8 = _mm256_load_ps(&e2[-8]); _mm256_store_ps(&e0[-8], _mm256_add_ps(e0_8, e2_8)); const __m256 diff = _mm256_sub_ps(e0_8, e2_8); const __m256 diff2 = _mm256_permute_ps(diff, _MM_SHUFFLE(2,3,0,1)); _mm256_store_ps(&e2[-8], _mm256_fmadd_ps( diff, A_6420, _mm256_mul_ps(diff2, A_7531))); #elif defined(ENABLE_ASM_X86_SSE2) const __m128 A_20 = _mm_set_ps(A0, A0, A2, A2); const __m128 A_31 = _mm_set_ps(-A1, A1, -A3, A3); const __m128 A_64 = _mm_set_ps(A4, A4, A6, A6); const __m128 A_75 = _mm_set_ps(-A5, A5, -A7, A7); const __m128 e0_4 = _mm_load_ps(&e0[-4]); const __m128 e2_4 = _mm_load_ps(&e2[-4]); const __m128 e0_8 = _mm_load_ps(&e0[-8]); const __m128 e2_8 = _mm_load_ps(&e2[-8]); _mm_store_ps(&e0[-4], _mm_add_ps(e0_4, e2_4)); _mm_store_ps(&e0[-8], _mm_add_ps(e0_8, e2_8)); const __m128 diff_4 = _mm_sub_ps(e0_4, e2_4); const __m128 diff_8 = _mm_sub_ps(e0_8, e2_8); const __m128 diff2_4 = _mm_shuffle_ps(diff_4, diff_4, _MM_SHUFFLE(2,3,0,1)); const __m128 diff2_8 = _mm_shuffle_ps(diff_8, diff_8, _MM_SHUFFLE(2,3,0,1)); _mm_store_ps(&e2[-4], _mm_add_ps(_mm_mul_ps(diff_4, A_20), _mm_mul_ps(diff2_4, A_31))); _mm_store_ps(&e2[-8], _mm_add_ps(_mm_mul_ps(diff_8, A_64), _mm_mul_ps(diff2_8, A_75))); #else float k00, k11; k00 = e0[-1] - e2[-1]; k11 = e0[-2] - e2[-2]; e0[-1] = e0[-1] + e2[-1]; e0[-2] = e0[-2] + e2[-2]; e2[-1] = k00 * A0 - k11 * A1; e2[-2] = k11 * A0 + k00 * A1; k00 = e0[-3] - e2[-3]; k11 = e0[-4] - e2[-4]; e0[-3] = e0[-3] + e2[-3]; e0[-4] = e0[-4] + e2[-4]; e2[-3] = k00 * A2 - k11 * A3; e2[-4] = k11 * A2 + k00 * A3; k00 = e0[-5] - e2[-5]; k11 = e0[-6] - e2[-6]; e0[-5] = e0[-5] + e2[-5]; e0[-6] = e0[-6] + e2[-6]; e2[-5] = k00 * A4 - k11 * A5; e2[-6] = k11 * A4 + k00 * A5; k00 = e0[-7] - e2[-7]; k11 = e0[-8] - e2[-8]; e0[-7] = e0[-7] + e2[-7]; e0[-8] = e0[-8] + e2[-8]; e2[-7] = k00 * A6 - k11 * A7; e2[-8] = k11 * A6 + k00 * A7; #endif // ENABLE_ASM_* } } /*-----------------------------------------------------------------------*/ /** * iter_54: Step 3 of the IMDCT for l=ld(n)-5 and l=ld(n)-4 for a sequence * of 8 elements. Helper function for imdct_step3_inner_s_loop_ld654(). * * [Parameters] * z: Pointer to elements to operate on. */ static inline void iter_54(float *z) { #if defined(ENABLE_ASM_X86_SSE2) || defined(ENABLE_ASM_X86_AVX2) const __m128i sign_0011 = _mm_set_epi32(0, 0, UINT32_C(1)<<31, UINT32_C(1)<<31); const __m128i sign_0110 = _mm_set_epi32(0, UINT32_C(1)<<31, UINT32_C(1)<<31, 0); const __m128 z_4 = _mm_load_ps(&z[-4]); const __m128 z_8 = _mm_load_ps(&z[-8]); const __m128 sum = _mm_add_ps(z_4, z_8); const __m128 diff = _mm_sub_ps(z_4, z_8); const __m128 sum_23 = _mm_shuffle_ps(sum, sum, _MM_SHUFFLE(3,2,3,2)); const __m128 sum_01 = _mm_shuffle_ps(sum, sum, _MM_SHUFFLE(1,0,1,0)); const __m128 diff_23 = _mm_shuffle_ps(diff, diff, _MM_SHUFFLE(3,2,3,2)); const __m128 diff_10 = _mm_shuffle_ps(diff, diff, _MM_SHUFFLE(0,1,0,1)); _mm_store_ps(&z[-4], _mm_add_ps(sum_23, _mm_xor_sign(sign_0011, sum_01))); _mm_store_ps(&z[-8], _mm_add_ps(diff_23, _mm_xor_sign(sign_0110, diff_10))); #else const float k00 = z[-1] - z[-5]; const float k11 = z[-2] - z[-6]; const float k22 = z[-3] - z[-7]; const float k33 = z[-4] - z[-8]; const float y0 = z[-1] + z[-5]; const float y1 = z[-2] + z[-6]; const float y2 = z[-3] + z[-7]; const float y3 = z[-4] + z[-8]; z[-1] = y0 + y2; // z1 + z5 + z3 + z7 z[-2] = y1 + y3; // z2 + z6 + z4 + z8 z[-3] = y0 - y2; // z1 + z5 - z3 - z7 z[-4] = y1 - y3; // z2 + z6 - z4 - z8 z[-5] = k00 + k33; // z1 - z5 + z4 - z8 z[-6] = k11 - k22; // z2 - z6 + z3 - z7 z[-7] = k00 - k33; // z1 - z5 - z4 + z8 z[-8] = k11 + k22; // z2 - z6 - z3 + z7 #endif // ENABLE_ASM_* } /*-----------------------------------------------------------------------*/ /** * imdct_step3_inner_s_loop_ld654: Step 3 of the IMDCT for a single * iteration of the l and r loops at l=log2_n-{6,5,4}. * * [Parameters] * n: Window size. * A: Twiddle factor A. * e: Input/output buffer (length n/2, modified in place). */ static void imdct_step3_inner_s_loop_ld654( const unsigned int n, const float *A, float *e) { const int a_off = n/8; const float A2 = A[a_off]; for (float *z = e + n/2; z > e; z -= 16) { #if defined(ENABLE_ASM_X86_SSE2) || defined(ENABLE_ASM_X86_AVX2) # if defined(ENABLE_ASM_X86_AVX2) const __m256 z_8 = _mm256_load_ps(&z[-8]); const __m256 z_16 = _mm256_load_ps(&z[-16]); _mm256_store_ps(&z[-8], _mm256_add_ps(z_8, z_16)); const __m256 diff = _mm256_sub_ps(z_8, z_16); const __m128 diff_8 = _mm256_castps256_ps128(diff); const __m128 diff_4 = _mm256_extractf128_ps(diff, 1); # else const __m128 z_4 = _mm_load_ps(&z[-4]); const __m128 z_8 = _mm_load_ps(&z[-8]); const __m128 z_12 = _mm_load_ps(&z[-12]); const __m128 z_16 = _mm_load_ps(&z[-16]); _mm_store_ps(&z[-4], _mm_add_ps(z_4, z_12)); _mm_store_ps(&z[-8], _mm_add_ps(z_8, z_16)); const __m128 diff_4 = _mm_sub_ps(z_4, z_12); const __m128 diff_8 = _mm_sub_ps(z_8, z_16); # endif /* We can't use the same algorithm as imdct_step3_inner_s_loop(), * since that can lead to a loss of precision for z[-11,-12,-15,-16] * if the two non-constant inputs to the calculation are of nearly * equal magnitude and of the appropriate signs to give an * intermediate sum or difference of much smaller magnitude: * changing the order of operations to multiply by A2 first can * introduce an error of at most 1 ULP at the point of the * multiplication, but if the final result has a smaller magnitude, * that error will be a greater part of the final result. */ const __m128 temp_4 = _mm_xor_sign( _mm_set_epi32(0, 0, 0, UINT32_C(1)<<31), _mm_shuffle_ps(diff_4, _mm_set1_ps(0), _MM_SHUFFLE(3,2,0,1))); _mm_store_ps(&z[-12], _mm_mul_ps(_mm_add_ps(diff_4, temp_4), _mm_set_ps(1, 1, A2, A2))); const __m128 temp1_8 = _mm_xor_sign( _mm_set_epi32(0, UINT32_C(1)<<31, 0, UINT32_C(1)<<31), _mm_shuffle_ps(diff_8, diff_8, _MM_SHUFFLE(2,3,0,1))); const __m128 temp2_8 = _mm_shuffle_ps(diff_8, _mm_set1_ps(0), _MM_SHUFFLE(3,2,1,0)); _mm_store_ps(&z[-16], _mm_mul_ps(_mm_sub_ps(temp1_8, temp2_8), _mm_set_ps(1, 1, A2, A2))); #else float k00, k11; k00 = z[ -1] - z[ -9]; k11 = z[ -2] - z[-10]; z[ -1] = z[ -1] + z[ -9]; z[ -2] = z[ -2] + z[-10]; z[ -9] = k00; // k00*1 - k11*0 z[-10] = k11; // k11*1 + k00*0 k00 = z[ -3] - z[-11]; k11 = z[ -4] - z[-12]; z[ -3] = z[ -3] + z[-11]; z[ -4] = z[ -4] + z[-12]; z[-11] = (k00+k11) * A2; // k00*A2 - k11*-A2 (but see asm note above) z[-12] = (k11-k00) * A2; // k11*A2 + k00*-A2 k00 = z[- 5] - z[-13]; k11 = z[ -6] - z[-14]; z[ -5] = z[ -5] + z[-13]; z[ -6] = z[ -6] + z[-14]; z[-13] = k11; // k00*0 - k11*-1 z[-14] = -k00; // k11*0 + k00*-1 k00 = z[- 7] - z[-15]; k11 = z[ -8] - z[-16]; z[ -7] = z[ -7] + z[-15]; z[ -8] = z[ -8] + z[-16]; z[-15] = (k11-k00) * A2; // k00*-A2 - k11*-A2 z[-16] = -(k00+k11) * A2; // k11*-A2 + k00*-A2 #endif // ENABLE_ASM_* iter_54(z); iter_54(z-8); } } /*-----------------------------------------------------------------------*/ /** * imdct_step456: Steps 4, 5, and 6 of the IMDCT. * * [Parameters] * n: Window size. * bitrev: Bit-reverse lookup table. * u: Input buffer (length n/2). * U: Output buffer (length n/2). Must be distinct from the input buffer. */ static void imdct_step456(const unsigned int n, const uint16_t *bitrev, const float *u, float *U) { /* stb_vorbis note: "weirdly, I'd have thought reading sequentially and * writing erratically would have been better than vice-versa, but in fact * that's not what my testing showed. (That is, with j = bitreverse(i), * do you read i and write j, or read j and write i.)" */ float *U0 = &U[n/4]; float *U1 = &U[n/2]; for (int i = 0, j = -4; i < (int)(n/8); i += 2, j -= 4) { #if defined(ENABLE_ASM_X86_SSE2) const __m128 bitrev_0 = _mm_load_ps(&u[bitrev[i+0]]); const __m128 bitrev_1 = _mm_load_ps(&u[bitrev[i+1]]); _mm_store_ps(&U0[j], _mm_shuffle_ps( bitrev_1, bitrev_0, _MM_SHUFFLE(2,3,2,3))); _mm_store_ps(&U1[j], _mm_shuffle_ps( bitrev_1, bitrev_0, _MM_SHUFFLE(0,1,0,1))); #else int k4; k4 = bitrev[i+0]; U1[j+3] = u[k4+0]; U1[j+2] = u[k4+1]; U0[j+3] = u[k4+2]; U0[j+2] = u[k4+3]; k4 = bitrev[i+1]; U1[j+1] = u[k4+0]; U1[j+0] = u[k4+1]; U0[j+1] = u[k4+2]; U0[j+0] = u[k4+3]; #endif // ENABLE_ASM_* } } /*-----------------------------------------------------------------------*/ /** * imdct_step7: Step 7 of the IMDCT. * * [Parameters] * n: Window size. * C: Twiddle factor C. * buffer: Input/output buffer (length n/2, modified in place). */ static void imdct_step7(const unsigned int n, const float *C, float *buffer) { #if defined(ENABLE_ASM_ARM_NEON) const int step = 4; const uint32x4_t sign_1010 = (uint32x4_t)vdupq_n_u64(UINT64_C(1)<<63); #elif defined(ENABLE_ASM_X86_AVX2) const int step = 8; const __m256i sign_1010 = _mm256_set1_epi64x(UINT64_C(1)<<63); #elif defined(ENABLE_ASM_X86_SSE2) const int step = 4; const __m128i sign_1010 = _mm_set1_epi64x(UINT64_C(1)<<63); #else const int step = 4; #endif ASSERT((n/2) % (2*step) == 0); for (float *d = buffer, *e = buffer + (n/2) - step; d < e; C += step, d += step, e -= step) { #if defined(ENABLE_ASM_ARM_NEON) const float32x4_t C_0 = vld1q_f32(C); const float32x4_t d_0 = vld1q_f32(d); const float32x4_t e_0 = vld1q_f32(e); const float32x4_t e_2 = veorq_f32(sign_1010, vswizzleq_zwxy_f32(e_0)); const float32x4_t sub = vsubq_f32(d_0, e_2); const float32x4_t add = vaddq_f32(d_0, e_2); const float32x4_t C02 = veorq_f32(sign_1010, vswizzleq_xxzz_f32(C_0)); const float32x4_t C13 = vswizzleq_yyww_f32(C_0); const float32x4_t mix = vaddq_f32( vmulq_f32(C13, sub), vmulq_f32(C02, vswizzleq_yxwz_f32(sub))); const float32x4_t e_temp = vsubq_f32(add, mix); const float32x4_t e_out = veorq_f32(sign_1010, vswizzleq_zwxy_f32(e_temp)); vst1q_f32(d, vaddq_f32(add, mix)); vst1q_f32(e, e_out); #elif defined(ENABLE_ASM_X86_AVX2) const __m256 C_0 = _mm256_load_ps(C); const __m256 d_0 = _mm256_load_ps(d); const __m256 e_0 = _mm256_load_ps(e); const __m256 e_4 = _mm256_permute4x64_ps(e_0, _MM_SHUFFLE(1,0,3,2)); const __m256 e_6 = _mm256_xor_sign( sign_1010, _mm256_permute_ps(e_4, _MM_SHUFFLE(1,0,3,2))); const __m256 sub = _mm256_sub_ps(d_0, e_6); const __m256 add = _mm256_add_ps(d_0, e_6); const __m256 C02 = _mm256_xor_sign( sign_1010, _mm256_permute_ps(C_0, _MM_SHUFFLE(2,2,0,0))); const __m256 C13 = _mm256_permute_ps(C_0, _MM_SHUFFLE(3,3,1,1)); const __m256 mix = _mm256_add_ps( _mm256_mul_ps(C13, sub), _mm256_mul_ps(C02, _mm256_permute_ps(sub, _MM_SHUFFLE(2,3,0,1)))); const __m256 e_temp = _mm256_sub_ps(add, mix); const __m256 e_swap = _mm256_permute4x64_ps(e_temp, _MM_SHUFFLE(1,0,3,2)); const __m256 e_out = _mm256_xor_sign( sign_1010, _mm256_permute_ps(e_swap, _MM_SHUFFLE(1,0,3,2))); _mm256_store_ps(d, _mm256_add_ps(add, mix)); _mm256_store_ps(e, e_out); #elif defined(ENABLE_ASM_X86_SSE2) const __m128 C_0 = _mm_load_ps(C); const __m128 d_0 = _mm_load_ps(d); const __m128 e_0 = _mm_load_ps(e); const __m128 e_2 = _mm_xor_sign( sign_1010, _mm_shuffle_ps(e_0, e_0, _MM_SHUFFLE(1,0,3,2))); const __m128 sub = _mm_sub_ps(d_0, e_2); const __m128 add = _mm_add_ps(d_0, e_2); const __m128 C02 = _mm_xor_sign( sign_1010, _mm_shuffle_ps(C_0, C_0, _MM_SHUFFLE(2,2,0,0))); const __m128 C13 = _mm_shuffle_ps(C_0, C_0, _MM_SHUFFLE(3,3,1,1)); const __m128 mix = _mm_add_ps( _mm_mul_ps(C13, sub), _mm_mul_ps(C02, _mm_shuffle_ps(sub, sub, _MM_SHUFFLE(2,3,0,1)))); const __m128 e_temp = _mm_sub_ps(add, mix); const __m128 e_out = _mm_xor_sign( sign_1010, _mm_shuffle_ps(e_temp, e_temp, _MM_SHUFFLE(1,0,3,2))); _mm_store_ps(d, _mm_add_ps(add, mix)); _mm_store_ps(e, e_out); #else float sub0 = d[0] - e[2]; float sub1 = d[1] + e[3]; float sub2 = d[2] - e[0]; float sub3 = d[3] + e[1]; float mix0 = C[1]*sub0 + C[0]*sub1; float mix1 = C[1]*sub1 - C[0]*sub0; float mix2 = C[3]*sub2 + C[2]*sub3; float mix3 = C[3]*sub3 - C[2]*sub2; float add0 = d[0] + e[2]; float add1 = d[1] - e[3]; float add2 = d[2] + e[0]; float add3 = d[3] - e[1]; d[0] = add0 + mix0; d[1] = add1 + mix1; d[2] = add2 + mix2; d[3] = add3 + mix3; e[0] = add2 - mix2; e[1] = -(add3 - mix3); e[2] = add0 - mix0; e[3] = -(add1 - mix1); #endif // ENABLE_ASM_* } } /*-----------------------------------------------------------------------*/ /** * imdct_step8_decode: Step 8 and final decoding for the IMDCT. * * [Parameters] * n: Window size. * B: Twiddle factor B. * in: Input buffer (length n/2). * out: Output buffer (length n). */ static void imdct_step8_decode(const unsigned int n, const float *B, const float *in, float *out) { /* stb_vorbis note: "this generates pairs of data a la 8 and pushes * them directly through the decode kernel (pushing rather than * pulling) to avoid having to make another pass later" */ B += (n/2) - 16; float *d0 = &out[0]; float *d1 = &out[(n/2)-8]; float *d2 = &out[(n/2)]; float *d3 = &out[n-8]; for (const float *e = in + (n/2) - 16; e >= in; e -= 16, B -= 16, d0 += 8, d1 -= 8, d2 += 8, d3 -= 8) { #if defined(ENABLE_ASM_ARM_NEON) const float32x4x2_t e_0 = vuzpq_f32(vld1q_f32(&e[0]), vld1q_f32(&e[4])); const float32x4x2_t e_8 = vuzpq_f32(vld1q_f32(&e[8]), vld1q_f32(&e[12])); const float32x4x2_t B_0 = vuzpq_f32(vld1q_f32(&B[0]), vld1q_f32(&B[4])); const float32x4x2_t B_8 = vuzpq_f32(vld1q_f32(&B[8]), vld1q_f32(&B[12])); const float32x4_t d1_0 = vsubq_f32(vmulq_f32(e_0.val[1], B_0.val[0]), vmulq_f32(e_0.val[0], B_0.val[1])); const float32x4_t d0_0 = vnegq_f32(vswizzleq_wzyx_f32(d1_0)); const float32x4_t d3_0 = vnegq_f32(vaddq_f32(vmulq_f32(e_0.val[0], B_0.val[0]), vmulq_f32(e_0.val[1], B_0.val[1]))); const float32x4_t d2_0 = vswizzleq_wzyx_f32(d3_0); const float32x4_t d1_8 = vsubq_f32(vmulq_f32(e_8.val[1], B_8.val[0]), vmulq_f32(e_8.val[0], B_8.val[1])); const float32x4_t d0_8 = vnegq_f32(vswizzleq_wzyx_f32(d1_8)); const float32x4_t d3_8 = vnegq_f32(vaddq_f32(vmulq_f32(e_8.val[0], B_8.val[0]), vmulq_f32(e_8.val[1], B_8.val[1]))); const float32x4_t d2_8 = vswizzleq_wzyx_f32(d3_8); vst1q_f32(d0, d0_8); vst1q_f32(d0+4, d0_0); vst1q_f32(d1+4, d1_8); vst1q_f32(d1, d1_0); vst1q_f32(d2, d2_8); vst1q_f32(d2+4, d2_0); vst1q_f32(d3+4, d3_8); vst1q_f32(d3, d3_0); #elif defined(ENABLE_ASM_X86_AVX2) const __m256i sign_1111 = _mm256_set1_epi32(UINT32_C(1)<<31); const __m256i permute_reverse = _mm256_set_epi32(0,1,2,3,4,5,6,7); /* It's tempting to use the gather-load instructions here to load * even and odd indices, but that causes a massive slowdown, so * we just use normal loads and lots of permutes. */ const __m256i permute_even_odd = _mm256_set_epi32(7,5,3,1,6,4,2,0); const __m256 e_0 = _mm256_load_ps(&e[0]); const __m256 e_8 = _mm256_load_ps(&e[8]); const __m256 B_0 = _mm256_load_ps(&B[0]); const __m256 B_8 = _mm256_load_ps(&B[8]); const __m256 e_0p = _mm256_permutevar8x32_ps(e_0, permute_even_odd); const __m256 e_8p = _mm256_permutevar8x32_ps(e_8, permute_even_odd); const __m256 B_0p = _mm256_permutevar8x32_ps(B_0, permute_even_odd); const __m256 B_8p = _mm256_permutevar8x32_ps(B_8, permute_even_odd); const __m256 e_even = _mm256_permute2f128_ps(e_0p, e_8p, 0x20); const __m256 e_odd = _mm256_permute2f128_ps(e_0p, e_8p, 0x31); const __m256 B_even = _mm256_permute2f128_ps(B_0p, B_8p, 0x20); const __m256 B_odd = _mm256_permute2f128_ps(B_0p, B_8p, 0x31); const __m256 d1_0 = _mm256_fmsub_ps(e_odd, B_even, _mm256_mul_ps(e_even, B_odd)); const __m256 d0_0 = _mm256_xor_sign( sign_1111, _mm256_permutevar8x32_ps(d1_0, permute_reverse)); const __m256 d3_0 = _mm256_xor_sign( sign_1111, _mm256_fmadd_ps(e_even, B_even, _mm256_mul_ps(e_odd, B_odd))); const __m256 d2_0 = _mm256_permutevar8x32_ps(d3_0, permute_reverse); _mm256_store_ps(d0, d0_0); _mm256_store_ps(d1, d1_0); _mm256_store_ps(d2, d2_0); _mm256_store_ps(d3, d3_0); #elif defined(ENABLE_ASM_X86_SSE2) const __m128i sign_1111 = _mm_set1_epi32(UINT32_C(1)<<31); const __m128 e_0 = _mm_load_ps(&e[0]); const __m128 e_4 = _mm_load_ps(&e[4]); const __m128 B_0 = _mm_load_ps(&B[0]); const __m128 B_4 = _mm_load_ps(&B[4]); const __m128 e_8 = _mm_load_ps(&e[8]); const __m128 e_12 = _mm_load_ps(&e[12]); const __m128 B_8 = _mm_load_ps(&B[8]); const __m128 B_12 = _mm_load_ps(&B[12]); const __m128 e_even_0 = _mm_shuffle_ps(e_0, e_4, _MM_SHUFFLE(2,0,2,0)); const __m128 e_odd_0 = _mm_shuffle_ps(e_0, e_4, _MM_SHUFFLE(3,1,3,1)); const __m128 B_even_0 = _mm_shuffle_ps(B_0, B_4, _MM_SHUFFLE(2,0,2,0)); const __m128 B_odd_0 = _mm_shuffle_ps(B_0, B_4, _MM_SHUFFLE(3,1,3,1)); const __m128 e_even_8 = _mm_shuffle_ps(e_8, e_12, _MM_SHUFFLE(2,0,2,0)); const __m128 e_odd_8 = _mm_shuffle_ps(e_8, e_12, _MM_SHUFFLE(3,1,3,1)); const __m128 B_even_8 = _mm_shuffle_ps(B_8, B_12, _MM_SHUFFLE(2,0,2,0)); const __m128 B_odd_8 = _mm_shuffle_ps(B_8, B_12, _MM_SHUFFLE(3,1,3,1)); const __m128 d1_0 = _mm_sub_ps(_mm_mul_ps(e_odd_0, B_even_0), _mm_mul_ps(e_even_0, B_odd_0)); const __m128 d0_0 = _mm_xor_sign( sign_1111, _mm_shuffle_ps(d1_0, d1_0, _MM_SHUFFLE(0,1,2,3))); const __m128 d3_0 = _mm_xor_sign( sign_1111, _mm_add_ps(_mm_mul_ps(e_even_0, B_even_0), _mm_mul_ps(e_odd_0, B_odd_0))); const __m128 d2_0 = _mm_shuffle_ps(d3_0, d3_0, _MM_SHUFFLE(0,1,2,3)); const __m128 d1_8 = _mm_sub_ps(_mm_mul_ps(e_odd_8, B_even_8), _mm_mul_ps(e_even_8, B_odd_8)); const __m128 d0_8 = _mm_xor_sign( sign_1111, _mm_shuffle_ps(d1_8, d1_8, _MM_SHUFFLE(0,1,2,3))); const __m128 d3_8 = _mm_xor_sign( sign_1111, _mm_add_ps(_mm_mul_ps(e_even_8, B_even_8), _mm_mul_ps(e_odd_8, B_odd_8))); const __m128 d2_8 = _mm_shuffle_ps(d3_8, d3_8, _MM_SHUFFLE(0,1,2,3)); _mm_store_ps(d0, d0_8); _mm_store_ps(d0+4, d0_0); _mm_store_ps(d1+4, d1_8); _mm_store_ps(d1, d1_0); _mm_store_ps(d2, d2_8); _mm_store_ps(d2+4, d2_0); _mm_store_ps(d3+4, d3_8); _mm_store_ps(d3, d3_0); #else float p0, p1, p2, p3; p3 = e[14]*B[15] - e[15]*B[14]; p2 = -e[14]*B[14] - e[15]*B[15]; d0[0] = p3; d1[7] = - p3; d2[0] = p2; d3[7] = p2; p1 = e[12]*B[13] - e[13]*B[12]; p0 = -e[12]*B[12] - e[13]*B[13]; d0[1] = p1; d1[6] = - p1; d2[1] = p0; d3[6] = p0; p3 = e[10]*B[11] - e[11]*B[10]; p2 = -e[10]*B[10] - e[11]*B[11]; d0[2] = p3; d1[5] = - p3; d2[2] = p2; d3[5] = p2; p1 = e[8]*B[9] - e[9]*B[8]; p0 = -e[8]*B[8] - e[9]*B[9]; d0[3] = p1; d1[4] = - p1; d2[3] = p0; d3[4] = p0; p3 = e[6]*B[7] - e[7]*B[6]; p2 = -e[6]*B[6] - e[7]*B[7]; d0[4] = p3; d1[3] = - p3; d2[4] = p2; d3[3] = p2; p1 = e[4]*B[5] - e[5]*B[4]; p0 = -e[4]*B[4] - e[5]*B[5]; d0[5] = p1; d1[2] = - p1; d2[5] = p0; d3[2] = p0; p3 = e[2]*B[3] - e[3]*B[2]; p2 = -e[2]*B[2] - e[3]*B[3]; d0[6] = p3; d1[1] = - p3; d2[6] = p2; d3[1] = p2; p1 = e[0]*B[1] - e[1]*B[0]; p0 = -e[0]*B[0] - e[1]*B[1]; d0[7] = p1; d1[0] = - p1; d2[7] = p0; d3[0] = p0; #endif // ENABLE_ASM_* } } /*-----------------------------------------------------------------------*/ /** * inverse_mdct: Perform the inverse MDCT operation on the given buffer. * The algorithm is taken from "The use of multirate filter banks for * coding of high quality digital audio", Th. Sporer et. al. (1992), with * corrections for errors in that paper, and the step numbers listed in * comments refer to the algorithm steps as described in the paper. * * [Parameters] * handle: Stream handle. * buffer: Input/output buffer. * blocktype: 0 if the current frame is a short block, 1 if a long block. */ static void inverse_mdct(stb_vorbis *handle, float *buffer, int blocktype) { const unsigned int n = handle->blocksize[blocktype]; const int log2_n = handle->blocksize_bits[blocktype]; const float *A = handle->A[blocktype]; float *buf2 = handle->imdct_temp_buf; /* Setup and step 1. Note that step 1 involves subtracting pairs of * spectral coefficients, but the two items subtracted are actually * arithmetic inverses of each other(!), e.g. * u[4*k] - u[n-4*k-1] == 2*u[4*k] * We omit the factor of 2 here; that propagates linearly through to * the final step, and it is compensated for by "*0.5f" on the B[] * values computed during setup. */ imdct_setup_step1(n, A, buffer, buf2); /* Step 2. * stb_vorbis note: "this could be in place, but the data ends up in * the wrong place... _somebody_'s got to swap it, so this is nominated" */ imdct_step2(n, A, buf2, buffer); /* Step 3. * stb_vorbis note: "the original step3 loop can be nested r inside s * or s inside r; it's written originally as s inside r, but this is * dumb when r iterates many times, and s few. So I have two copies of * it and switch between them halfway." */ imdct_step3_inner_r_loop(n/16, A, buffer, (n/2) - (n/4)*0, n/4, 8); imdct_step3_inner_r_loop(n/16, A, buffer, (n/2) - (n/4)*1, n/4, 8); imdct_step3_inner_r_loop(n/32, A, buffer, (n/2) - (n/8)*0, n/8, 16); imdct_step3_inner_r_loop(n/32, A, buffer, (n/2) - (n/8)*1, n/8, 16); imdct_step3_inner_r_loop(n/32, A, buffer, (n/2) - (n/8)*2, n/8, 16); imdct_step3_inner_r_loop(n/32, A, buffer, (n/2) - (n/8)*3, n/8, 16); int l = 2; for (; l < (log2_n-3)/2; l++) { const int k0 = n >> (l+2); const int lim = 1 << (l+1); for (int i = 0; i < lim; i++) { imdct_step3_inner_r_loop(n >> (l+4), A, buffer, (n/2) - k0*i, k0, 1 << (l+3)); } } for (; l < log2_n-6; l++) { const int k0 = n >> (l+2), k1 = 1 << (l+3); const int rlim = n >> (l+6); const int lim = 1 << (l+1); const float *A0 = A; int i_off = n/2; for (int r = rlim; r > 0; r--) { imdct_step3_inner_s_loop(lim, A0, buffer, i_off, k0, k1); A0 += k1*4; i_off -= 8; } } /* stb_vorbis note: "log2_n-6,-5,-4 all interleaved together - the big * win comes from getting rid of needless flops due to the constants on * pass 5 & 4 being all 1 and 0; combining them to be simultaneous to * improve cache made little difference" */ imdct_step3_inner_s_loop_ld654(n, A, buffer); /* Steps 4, 5, and 6. */ imdct_step456(n, handle->bit_reverse[blocktype], buffer, buf2); /* Step 7. */ imdct_step7(n, handle->C[blocktype], buf2); /* Step 8 and final decoding. */ imdct_step8_decode(n, handle->B[blocktype], buf2, buffer); } /*************************************************************************/ /******************* Main decoding routine (internal) ********************/ /*************************************************************************/ /** * vorbis_decode_packet_rest: Perform all decoding operations for a frame * except mode selection and windowing. * * [Parameters] * handle: Stream handle. * mode: Mode configuration for this frame. * left_start: Start position of the left overlap region. * left_end: End position of the left overlap region. * right_start: Start position of the right overlap region. * right_end: End position of the right overlap region. * len_ret: Pointer to variable to receive the frame length (offset * within the window of just past the final fully-decoded sample). * [Return value] * True on success, false on error. */ static bool vorbis_decode_packet_rest( stb_vorbis *handle, const Mode *mode, int left_start, int left_end, int right_start, int right_end, int *len_ret) { const int n = handle->blocksize[mode->blockflag]; const Mapping *map = &handle->mapping[mode->mapping]; float ** const channel_buffers = handle->channel_buffers[handle->cur_channel_buffer]; /**** Floor processing (4.3.2). ****/ int64_t floor0_amplitude[256]; bool zero_channel[256]; for (int ch = 0; ch < handle->channels; ch++) { const int floor_index = map->submap_floor[map->mux[ch]]; if (handle->floor_types[floor_index] == 0) { Floor0 *floor = &handle->floor_config[floor_index].floor0; const int64_t amplitude = decode_floor0(handle, floor, handle->coefficients[ch]); if (UNLIKELY(amplitude < 0)) { flush_packet(handle); return error(handle, VORBIS_invalid_packet); } zero_channel[ch] = (amplitude == 0); floor0_amplitude[ch] = amplitude; } else { // handle->floor_types[floor_index] == 1 Floor1 *floor = &handle->floor_config[floor_index].floor1; zero_channel[ch] = !decode_floor1(handle, floor, handle->final_Y[ch]); } } /**** Nonzero vector propagation (4.3.3). ****/ bool really_zero_channel[256]; memcpy(really_zero_channel, zero_channel, sizeof(zero_channel[0]) * handle->channels); for (int i = 0; i < map->coupling_steps; i++) { if (!zero_channel[map->coupling[i].magnitude] || !zero_channel[map->coupling[i].angle]) { zero_channel[map->coupling[i].magnitude] = false; zero_channel[map->coupling[i].angle] = false; } } /**** Residue decoding (4.3.4). ****/ for (int i = 0; i < map->submaps; i++) { float *residue_buffers[256]; int ch = 0; for (int j = 0; j < handle->channels; j++) { if (map->mux[j] == i) { if (zero_channel[j]) { residue_buffers[ch] = NULL; } else { residue_buffers[ch] = channel_buffers[j]; } ch++; } } decode_residue(handle, map->submap_residue[i], n/2, ch, residue_buffers); } /**** Inverse coupling (4.3.5). ****/ for (int i = map->coupling_steps-1; i >= 0; i--) { float *magnitude = channel_buffers[map->coupling[i].magnitude]; float *angle = channel_buffers[map->coupling[i].angle]; for (int j = 0; j < n/2; j++) { const float M = magnitude[j]; const float A = angle[j]; float new_M, new_A; if (M > 0) { if (A > 0) { new_M = M; new_A = M - A; } else { new_A = M; new_M = M + A; } } else { if (A > 0) { new_M = M; new_A = M + A; } else { new_A = M; new_M = M - A; } } magnitude[j] = new_M; angle[j] = new_A; } } /**** Floor curve synthesis and residue product (4.3.6). The spec **** **** uses the term "dot product", but the actual operation is **** **** component-by-component vector multiplication. ****/ for (int i = 0; i < handle->channels; i++) { if (really_zero_channel[i]) { memset(channel_buffers[i], 0, sizeof(*channel_buffers[i]) * (n/2)); } else { const int floor_index = map->submap_floor[map->mux[i]]; if (handle->floor_types[floor_index] == 0) { Floor0 *floor = &handle->floor_config[floor_index].floor0; do_floor0_final(handle, floor, i, n, floor0_amplitude[i]); } else { // handle->floor_types[floor_index] == 1 Floor1 *floor = &handle->floor_config[floor_index].floor1; do_floor1_final(handle, floor, i, n); } } } /**** Inverse MDCT (4.3.7). ****/ for (int i = 0; i < handle->channels; i++) { inverse_mdct(handle, channel_buffers[i], mode->blockflag); } /**** Frame length, sample position, and other miscellany. ****/ /* Flush any leftover data in the current packet so the next packet * doesn't try to read it. */ flush_packet(handle); /* Deal with discarding samples from the first packet. */ if (handle->first_decode) { /* We don't support streams with nonzero start positions (what the * spec describes as "The granule (PCM) position of the first page * need not indicate that the stream started at position zero..." * in A.2), so we just omit the left half of the window. The value * we calculate here is negative, but the addition below will bring * it up to zero again. */ handle->current_loc = -(n/2 - left_start); handle->current_loc_valid = true; handle->first_decode = false; } /* If this is the last complete frame in a non-final Ogg page, update * the current sample position based on the Ogg granule position. * Normally this should have no effect, but it's important for the * first page in case that page indicates a positive initial offset, * and it helps resync if a page is missing in the stream. */ if (handle->last_seg_index == handle->end_seg_with_known_loc && !(handle->page_flag & PAGEFLAG_last_page)) { /* For non-final pages, the granule position should be the "last * complete sample returned by decode" (A.2), thus right_start. * However, it appears that the reference encoder blindly uses the * middle of the window (n/2) even on a long/short frame boundary, * and based on , the window * center appears to have been the original intention. */ handle->current_loc = handle->known_loc_for_packet - (n/2 - left_start); handle->current_loc_valid = true; } /* Update the current sample position for samples returned from this * frame. */ if (handle->current_loc_valid) { handle->current_loc += (right_start - left_start); } /* Return the amount of data stored in the window, including the * right-side overlap region.*/ *len_ret = right_end; /* The granule position on the final page of the Ogg stream specifies * the length of the stream, thus indicating how many samples to * return from the final frame. However, some streams attempt to * truncate to within the _second-to-last_ frame instead, so we have * to check for end-of-stream truncation on every packet in the last * page. (The specification is ambiguous about whether this is * actually permitted; while the wording suggests that the truncation * point must lie within the last frame, there is no explicit language * specifying that limitation. Since such streams actually exist, * however, we go ahead and support them.) */ if (handle->current_loc_valid && (handle->page_flag & PAGEFLAG_last_page)) { const uint64_t end_loc = handle->known_loc_for_packet; /* Be careful of wraparound here! A stream with a bogus position * in the second-to-last page which has the high bit set could * cause a simple "end_loc <= handle->current_loc" test to * improperly pass, so we have to test by taking the difference * and checking its sign bit. */ const uint64_t frame_end = handle->current_loc + (right_end - right_start); if (handle->current_loc - end_loc < UINT64_C(1)<<63 // end_loc <= handle->current_loc || (handle->last_seg_index == handle->end_seg_with_known_loc && end_loc - frame_end >= UINT64_C(1)<<63)) // end_loc < frame_end { const uint64_t packet_begin_loc = handle->current_loc - (right_start - left_start); if (end_loc - packet_begin_loc >= UINT64_C(1)<<63) { // end_loc < packet_begin_loc /* We can't truncate to before the beginning of the current * frame (that wouldn't make sense anyway, because the * stream should just have ended at the previous frame); * just truncate to the beginning of the frame in that case. */ *len_ret = left_start; } else { const uint64_t true_len = end_loc - (handle->current_loc - right_start); ASSERT(true_len < (uint64_t)*len_ret); *len_ret = (int)true_len; } handle->current_loc = end_loc; } } return true; } /*************************************************************************/ /************************** Interface routines ***************************/ /*************************************************************************/ bool vorbis_decode_initial(stb_vorbis *handle, int *left_start_ret, int *left_end_ret, int *right_start_ret, int *right_end_ret, int *mode_ret) { /* Start reading the next packet, and verify that it's an audio packet. */ if (!handle->packet_mode) { if (UNLIKELY(handle->eof)) { return false; } if (UNLIKELY(!start_packet(handle))) { if (handle->error == VORBIS_reached_eof) { /* EOF at page boundary is not an error! */ handle->error = VORBIS__no_error; } return false; } } if (UNLIKELY(get_bits(handle, 1) != 0) || UNLIKELY(handle->valid_bits < 0)) { /* Empty or non-audio packet. If not in packet mode, skip it and * try again. */ flush_packet(handle); if (handle->packet_mode) { handle->error = VORBIS_invalid_packet; return false; } else { return vorbis_decode_initial(handle, left_start_ret, left_end_ret, right_start_ret, right_end_ret, mode_ret); } } /* Read the mode number and window flags for this packet. */ const int mode_index = get_bits(handle, handle->mode_bits); /* Still in the same byte, so we can't hit EOP here. */ ASSERT(mode_index != EOP); if (mode_index >= handle->mode_count) { handle->error = VORBIS_invalid_packet; flush_packet(handle); return false; } *mode_ret = mode_index; Mode *mode = &handle->mode_config[mode_index]; bool next, prev; if (mode->blockflag) { prev = get_bits(handle, 1); next = get_bits(handle, 1); if (UNLIKELY(handle->valid_bits < 0)) { flush_packet(handle); return error(handle, VORBIS_invalid_packet); } } else { prev = next = false; } /* Set up the window bounds for this frame. */ const int n = handle->blocksize[mode->blockflag]; if (mode->blockflag && !prev) { *left_start_ret = (n - handle->blocksize[0]) / 4; *left_end_ret = (n + handle->blocksize[0]) / 4; } else { *left_start_ret = 0; *left_end_ret = n/2; } if (mode->blockflag && !next) { *right_start_ret = (n*3 - handle->blocksize[0]) / 4; *right_end_ret = (n*3 + handle->blocksize[0]) / 4; } else { *right_start_ret = n/2; *right_end_ret = n; } return true; } /*-----------------------------------------------------------------------*/ bool vorbis_decode_packet(stb_vorbis *handle, int *len_ret) { const bool first_decode = handle->first_decode; int mode, left_start, left_end, right_start, right_end, len; if (!vorbis_decode_initial(handle, &left_start, &left_end, &right_start, &right_end, &mode) || !vorbis_decode_packet_rest(handle, &handle->mode_config[mode], left_start, left_end, right_start, right_end, &len)) { return false; } float ** const channel_buffers = handle->channel_buffers[handle->cur_channel_buffer]; const int prev = handle->previous_length; /* We deliberately (though harmlessly) deviate from the spec with * respect to the portion of data to return for a given frame. The * spec says that we should return the region from the middle of the * previous frame to the middle of the current frame (4.8), but that * would require a buffer copy. Instead we return all finalized * samples from the current frame, i.e., up to right_start. This * unfortunately requires a bit of fudging around when decoding the * first frame of a stream. */ /* Mix in data from the previous window's right side. */ if (prev > 0) { const float *weights; if (prev*2 == handle->blocksize[0]) { weights = handle->window_weights[0]; } else { ASSERT(prev*2 == handle->blocksize[1]); weights = handle->window_weights[1]; } for (int i = 0; i < handle->channels; i++) { for (int j = 0; j < prev; j++) { channel_buffers[i][left_start+j] = channel_buffers[i][left_start+j] * weights[j] + handle->previous_window[i][j] * weights[prev-1-j]; } } } /* Point the previous_window pointers at the right side of this window. */ handle->previous_length = right_end - right_start; for (int i = 0; i < handle->channels; i++) { handle->previous_window[i] = channel_buffers[i] + right_start; } /* If this is the first frame, push left_start (the beginning of data * to return) to the center of the window, since the left half of the * window contains garbage. */ if (first_decode) { const int n = handle->blocksize[handle->mode_config[mode].blockflag]; left_start = n/2; } /* Save this channel's output pointers. */ for (int i = 0; i < handle->channels; i++) { handle->outputs[i] = channel_buffers[i] + left_start; } /* Advance to the next set of channel buffers for the next frame's data. */ handle->cur_channel_buffer = (handle->cur_channel_buffer + 1) % lenof(handle->channel_buffers); /* Return the final frame length, if requested. */ if (len_ret) { if (len < right_start) { *len_ret = len - left_start; /* Watch out for trying to truncate into the first packet of * the stream (which is not returned). */ if (*len_ret < 0) { *len_ret = 0; } } else { *len_ret = right_start - left_start; ASSERT(*len_ret >= 0); } } return true; } /*-----------------------------------------------------------------------*/ bool vorbis_decode_packet_direct( stb_vorbis *handle, const void *packet, int32_t packet_len, int *len_ret) { start_packet_direct(handle, packet, packet_len); return vorbis_decode_packet(handle, len_ret); } /*************************************************************************/ /*************************************************************************/