Code Coverage Report for src/decode/decode.c

1 /*

2 * libnogg: a decoder library for Ogg Vorbis streams

4 *

5 * This software may be copied and redistributed under certain conditions;

6 * see the file "COPYING" in the source code distribution for details.

7 * NO WARRANTY is provided with this software.

8 */

9

10 #include "include/nogg.h"

11 #include "src/common.h"

12 #include "src/decode/common.h"

13 #include "src/decode/crc32.h"

14 #include "src/decode/decode.h"

15 #include "src/decode/inlines.h"

16 #include "src/decode/io.h"

17 #include "src/decode/packet.h"

18 #include "src/decode/setup.h"

19 #include "src/util/memory.h"

20 #include "src/x86.h"

21

22 #include <math.h>

23 #include <stdlib.h>

24 #include <string.h>

25

26 #ifdef ENABLE_ASM_ARM_NEON

27 # include <arm_neon.h>

28 /* Note that vectorization is sometimes slower on ARM because of the lack

29 * of fast vector swizzle instructions, so ARM code is deliberately omitted

30 * in those cases. */

31 #endif

32

33 /* Older versions of GCC warn about shadowing functions with variables, so

34 * work around those warnings. */

35 #if defined(__GNUC__) && !defined(__clang__) && __GNUC__==4 && __GNUC_MINOR__<7

36 # define div div_

37 # define floor floor_

38 # define y0 y0_

39 # define y1 y1_

40 #endif

41

42 /*

43 * Note on variable naming: Variables are generally named following usage

44 * in the Vorbis spec, though some variables have been renamed for clarity.

45 * In particular, the Vorbis spec consistently uses "n" for the window size

46 * of a packet, and we follow that usage here.

47 */

48

49 /*************************************************************************/

50 /****************************** Local data *******************************/

51 /*************************************************************************/

52

53 /* Lookup table for type 1 floor curve generation, copied from the Vorbis

54 * specification. */

55 static float floor1_inverse_db_table[256] =

56 {

57 1.0649863e-07f, 1.1341951e-07f, 1.2079015e-07f, 1.2863978e-07f,

58 1.3699951e-07f, 1.4590251e-07f, 1.5538408e-07f, 1.6548181e-07f,

59 1.7623575e-07f, 1.8768855e-07f, 1.9988561e-07f, 2.1287530e-07f,

60 2.2670913e-07f, 2.4144197e-07f, 2.5713223e-07f, 2.7384213e-07f,

61 2.9163793e-07f, 3.1059021e-07f, 3.3077411e-07f, 3.5226968e-07f,

62 3.7516214e-07f, 3.9954229e-07f, 4.2550680e-07f, 4.5315863e-07f,

63 4.8260743e-07f, 5.1396998e-07f, 5.4737065e-07f, 5.8294187e-07f,

64 6.2082472e-07f, 6.6116941e-07f, 7.0413592e-07f, 7.4989464e-07f,

65 7.9862701e-07f, 8.5052630e-07f, 9.0579828e-07f, 9.6466216e-07f,

66 1.0273513e-06f, 1.0941144e-06f, 1.1652161e-06f, 1.2409384e-06f,

67 1.3215816e-06f, 1.4074654e-06f, 1.4989305e-06f, 1.5963394e-06f,

68 1.7000785e-06f, 1.8105592e-06f, 1.9282195e-06f, 2.0535261e-06f,

69 2.1869758e-06f, 2.3290978e-06f, 2.4804557e-06f, 2.6416497e-06f,

70 2.8133190e-06f, 2.9961443e-06f, 3.1908506e-06f, 3.3982101e-06f,

71 3.6190449e-06f, 3.8542308e-06f, 4.1047004e-06f, 4.3714470e-06f,

72 4.6555282e-06f, 4.9580707e-06f, 5.2802740e-06f, 5.6234160e-06f,

73 5.9888572e-06f, 6.3780469e-06f, 6.7925283e-06f, 7.2339451e-06f,

74 7.7040476e-06f, 8.2047000e-06f, 8.7378876e-06f, 9.3057248e-06f,

75 9.9104632e-06f, 1.0554501e-05f, 1.1240392e-05f, 1.1970856e-05f,

76 1.2748789e-05f, 1.3577278e-05f, 1.4459606e-05f, 1.5399272e-05f,

77 1.6400004e-05f, 1.7465768e-05f, 1.8600792e-05f, 1.9809576e-05f,

78 2.1096914e-05f, 2.2467911e-05f, 2.3928002e-05f, 2.5482978e-05f,

79 2.7139006e-05f, 2.8902651e-05f, 3.0780908e-05f, 3.2781225e-05f,

80 3.4911534e-05f, 3.7180282e-05f, 3.9596466e-05f, 4.2169667e-05f,

81 4.4910090e-05f, 4.7828601e-05f, 5.0936773e-05f, 5.4246931e-05f,

82 5.7772202e-05f, 6.1526565e-05f, 6.5524908e-05f, 6.9783085e-05f,

83 7.4317983e-05f, 7.9147585e-05f, 8.4291040e-05f, 8.9768747e-05f,

84 9.5602426e-05f, 0.00010181521f, 0.00010843174f, 0.00011547824f,

85 0.00012298267f, 0.00013097477f, 0.00013948625f, 0.00014855085f,

86 0.00015820453f, 0.00016848555f, 0.00017943469f, 0.00019109536f,

87 0.00020351382f, 0.00021673929f, 0.00023082423f, 0.00024582449f,

88 0.00026179955f, 0.00027881276f, 0.00029693158f, 0.00031622787f,

89 0.00033677814f, 0.00035866388f, 0.00038197188f, 0.00040679456f,

90 0.00043323036f, 0.00046138411f, 0.00049136745f, 0.00052329927f,

91 0.00055730621f, 0.00059352311f, 0.00063209358f, 0.00067317058f,

92 0.00071691700f, 0.00076350630f, 0.00081312324f, 0.00086596457f,

93 0.00092223983f, 0.00098217216f, 0.0010459992f, 0.0011139742f,

94 0.0011863665f, 0.0012634633f, 0.0013455702f, 0.0014330129f,

95 0.0015261382f, 0.0016253153f, 0.0017309374f, 0.0018434235f,

96 0.0019632195f, 0.0020908006f, 0.0022266726f, 0.0023713743f,

97 0.0025254795f, 0.0026895994f, 0.0028643847f, 0.0030505286f,

98 0.0032487691f, 0.0034598925f, 0.0036847358f, 0.0039241906f,

99 0.0041792066f, 0.0044507950f, 0.0047400328f, 0.0050480668f,

100 0.0053761186f, 0.0057254891f, 0.0060975636f, 0.0064938176f,

101 0.0069158225f, 0.0073652516f, 0.0078438871f, 0.0083536271f,

102 0.0088964928f, 0.009474637f, 0.010090352f, 0.010746080f,

103 0.011444421f, 0.012188144f, 0.012980198f, 0.013823725f,

104 0.014722068f, 0.015678791f, 0.016697687f, 0.017782797f,

105 0.018938423f, 0.020169149f, 0.021479854f, 0.022875735f,

106 0.024362330f, 0.025945531f, 0.027631618f, 0.029427276f,

107 0.031339626f, 0.033376252f, 0.035545228f, 0.037855157f,

108 0.040315199f, 0.042935108f, 0.045725273f, 0.048696758f,

109 0.051861348f, 0.055231591f, 0.058820850f, 0.062643361f,

110 0.066714279f, 0.071049749f, 0.075666962f, 0.080584227f,

111 0.085821044f, 0.091398179f, 0.097337747f, 0.10366330f,

112 0.11039993f, 0.11757434f, 0.12521498f, 0.13335215f,

113 0.14201813f, 0.15124727f, 0.16107617f, 0.17154380f,

114 0.18269168f, 0.19456402f, 0.20720788f, 0.22067342f,

115 0.23501402f, 0.25028656f, 0.26655159f, 0.28387361f,

116 0.30232132f, 0.32196786f, 0.34289114f, 0.36517414f,

117 0.38890521f, 0.41417847f, 0.44109412f, 0.46975890f,

118 0.50028648f, 0.53279791f, 0.56742212f, 0.60429640f,

119 0.64356699f, 0.68538959f, 0.72993007f, 0.77736504f,

120 0.82788260f, 0.88168307f, 0.9389798f, 1.0f

121 };

122

123 /*************************************************************************/

124 /************************* Vectorization helpers *************************/

125 /*************************************************************************/

126

127 #ifdef ENABLE_ASM_ARM_NEON

128

129 /* Used to avoid unnecessary typecasts when flipping sign bits. */

130 static inline float32x4_t veorq_f32(uint32x4_t a, float32x4_t b) {

131 return (float32x4_t)veorq_u32(a, (uint32x4_t)b);

132 }

133

134 /* Various kinds of vector swizzles (xyzw = identity). The separate

135 * declarations are needed because some functions are implemented in

136 * terms of others. */

137 static inline float32x4_t vswizzleq_xxyy_f32(float32x4_t a);

138 static inline float32x4_t vswizzleq_xxzz_f32(float32x4_t a);

139 static inline float32x4_t vswizzleq_xyxy_f32(float32x4_t a);

140 static inline float32x4_t vswizzleq_xzxz_f32(float32x4_t a);

141 static inline float32x4_t vswizzleq_yxyx_f32(float32x4_t a);

142 static inline float32x4_t vswizzleq_yxwz_f32(float32x4_t a);

143 static inline float32x4_t vswizzleq_yyww_f32(float32x4_t a);

144 static inline float32x4_t vswizzleq_ywyw_f32(float32x4_t a);

145 static inline float32x4_t vswizzleq_zzxx_f32(float32x4_t a);

146 static inline float32x4_t vswizzleq_zzww_f32(float32x4_t a);

147 static inline float32x4_t vswizzleq_zwxy_f32(float32x4_t a);

148 static inline float32x4_t vswizzleq_zwzw_f32(float32x4_t a);

149 static inline float32x4_t vswizzleq_wzyx_f32(float32x4_t a);

150 static inline float32x4_t vswizzleq_wwyy_f32(float32x4_t a);

151 /* These are all marked UNUSED so Clang doesn't complain about the fact

152 * that we don't currently use all of them. */

153 static UNUSED inline float32x4_t vswizzleq_xxyy_f32(float32x4_t a) {

154 return vzipq_f32(a, a).val[0];

155 }

156 static UNUSED inline float32x4_t vswizzleq_xxzz_f32(float32x4_t a) {

157 const uint32x4_t sel = {~0U, 0, ~0U, 0};

158 /* vbsl operands: vbsl(mask, mask_on_value, mask_off_value) */

159 return vbslq_f32(sel, a, (float32x4_t)vshlq_n_u64((uint64x2_t)a, 32));

160 }

161 static UNUSED inline float32x4_t vswizzleq_xyxy_f32(float32x4_t a) {

162 return (float32x4_t)vdupq_n_u64(((uint64x2_t)a)[0]);

163 }

164 static UNUSED inline float32x4_t vswizzleq_xzxz_f32(float32x4_t a) {

165 return vuzpq_f32(a, a).val[0];

166 }

167 static UNUSED inline float32x4_t vswizzleq_yxyx_f32(float32x4_t a) {

168 return vswizzleq_xyxy_f32(vswizzleq_yxwz_f32(a));

169 }

170 static UNUSED inline float32x4_t vswizzleq_yxwz_f32(float32x4_t a) {

171 const uint32x4_t sel = {~0U, 0, ~0U, 0};

172 return vbslq_f32(sel,

173 (float32x4_t)vshrq_n_u64((uint64x2_t)a, 32),

174 (float32x4_t)vshlq_n_u64((uint64x2_t)a, 32));

175 }

176 static UNUSED inline float32x4_t vswizzleq_yyww_f32(float32x4_t a) {

177 const uint32x4_t sel = {~0U, 0, ~0U, 0};

178 return vbslq_f32(sel, (float32x4_t)vshrq_n_u64((uint64x2_t)a, 32), a);

179 }

180 static UNUSED inline float32x4_t vswizzleq_ywyw_f32(float32x4_t a) {

181 return vuzpq_f32(a, a).val[1];

182 }

183 static UNUSED inline float32x4_t vswizzleq_zzxx_f32(float32x4_t a) {

184 return vswizzleq_zwxy_f32(vswizzleq_xxzz_f32(a));

185 }

186 static UNUSED inline float32x4_t vswizzleq_zzww_f32(float32x4_t a) {

187 return vzipq_f32(a, a).val[1];

188 }

189 static UNUSED inline float32x4_t vswizzleq_zwxy_f32(float32x4_t a) {

190 return vextq_f32(a, a, 2);

191 }

192 static UNUSED inline float32x4_t vswizzleq_zwzw_f32(float32x4_t a) {

193 return (float32x4_t)vdupq_n_u64(((uint64x2_t)a)[1]);

194 }

195 static UNUSED inline float32x4_t vswizzleq_wzyx_f32(float32x4_t a) {

196 return vswizzleq_yxwz_f32(vswizzleq_zwxy_f32(a));

197 }

198 static UNUSED inline float32x4_t vswizzleq_wwyy_f32(float32x4_t a) {

199 return vswizzleq_zwxy_f32(vswizzleq_yyww_f32(a));

200 }

201

202 #endif // ENABLE_ASM_ARM_NEON

203

204 /*-----------------------------------------------------------------------*/

205

206 #if defined(ENABLE_ASM_X86_SSE2) || defined(ENABLE_ASM_X86_AVX2)

207

208 /*

209 * _mm_xor_sign: Simple wrapper function for _mm_xor_ps() used to avoid

210 * unnecessary typecasts when flipping sign bits and work around compiler

211 * over-optimizations.

212 *

213 * Unlike ARM, the x86 architecture includes an exclusive-or instruction

214 * which nominally operates on floating-point data (xorps, with the

215 * corresponding intrinsic _mm_xor_ps()), which has the same effect as

216 * the equivalent integer instruction but avoids cross-domain data

217 * movement on processors with separate integer and floating-point vector

218 * pipelines, so at first glance it would seem like we could simply cast

219 * the sign vector to a float vector and use it that way. But some

220 * compilers, notably GCC 8 and later and Clang 11 and later, aggressively

221 * optimize away signed zero values in vector constants when inexact

222 * floating-point optimizations are enabled, and a mask containing only

223 * sign bits looks like an all-zero vector when interpreted as

224 * floating-point data.

225 *

226 * We could potentially accept the potential cross-domain stall and use

227 * the integer instruction (pxor), which works for GCC, but Clang seems

228 * to "pierce the veil" of the typecast and optimize out the operation

229 * anyway (in fact, this was diagnosed as incorrectly treating an XOR of

230 * two 64-bit sign masks as a vector negation of four 32-bit floats and

231 * applying the transformation "-(x - y) => y - x"). So we interpose a

232 * dummy (empty) inline assembly statement to hide knowledge of the

233 * constant's value from these compilers, thus forcing them to emit the

234 * exclusive-or operation as desired.

235 *

236 * See also:

237 * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=86855

238 * https://github.com/llvm/llvm-project/issues/55758 (fixed in 15.0.0)

239 */

240 static inline __m128 _mm_xor_sign(__m128i sign_mask, __m128 value) {

241 #if IS_GCC(8,0) || (IS_CLANG(11,0) && !IS_CLANG(15,0))

242 __asm__("" : "=x" (value) : "0" (value));

243 #endif

244 return _mm_xor_ps(CAST_M128(sign_mask), value);

245 }

246

247 #endif // ENABLE_ASM_X86_SSE2

248

249 /*-----------------------------------------------------------------------*/

250

251 #ifdef ENABLE_ASM_X86_AVX2

252

253 /*

254 * _mm256_xor_sign: 256-bit version of _mm_xor_sign(), with the same

255 * caveats.

256 */

257 static inline __m256 _mm256_xor_sign(__m256i sign_mask, __m256 value) {

258 #if IS_GCC(8,0) || (IS_CLANG(11,0) && !IS_CLANG(15,0))

259 __asm__("" : "=x" (value) : "0" (value));

260 #endif

261 return _mm256_xor_ps(_mm256_castsi256_ps(sign_mask), value);

262 }

263

264 /**

265 * _mm256_permute4x64_ps: Convenience wrapper for _mm256_permute4x64_pd()

266 * which encapsulates ps/pd casts. Written as a macro because "control"

267 * must be a compile-time constant, which the compiler may not be able

268 * to determine if this is written as a function.

269 */

270 #define _mm256_permute4x64_ps(value, control) \

271 (_mm256_castpd_ps(_mm256_permute4x64_pd(_mm256_castps_pd((value)), \

272 (control))))

273

274 #endif // ENABLE_ASM_X86_AVX2

275

276 /*************************************************************************/

277 /******************* Codebook decoding: scalar context *******************/

278 /*************************************************************************/

279

280 /**

281 * codebook_decode_scalar_raw_slow: Slow path for Huffman decoding,

282 * searching through the lookup tables to decode the next symbol.

283 */

284 static int32_t codebook_decode_scalar_raw_slow(stb_vorbis *handle,

285 const Codebook *book)

286 {

287 /* Fill the bit accumulator far enough to read any valid code. */

	Hit	Total	Coverage
Lines:	1039	1064	97.6%
Branches:	3367	3538	95.1%

288 (18/18) if (handle->valid_bits < 24) {

289 fill_bits(handle);

290 }

291

292 /* Find the code using binary search if we can, linear search otherwise. */

293 (18/18) if (book->sorted_codewords) {

294 const uint32_t code = bit_reverse(handle->acc);

295 int32_t low = 0, high = book->sorted_entries;

296 /* Invariant: sorted_codewords[low] <= code < sorted_codewords[high] */

297 (18/18) while (low+1 < high) {

298 const int32_t mid = (low + high) / 2;

299 (18/18) if (book->sorted_codewords[mid] <= code) {

300 low = mid;

301 } else {

302 high = mid;

303 }

304 }

305 (18/18) const int32_t result = book->sparse ? low : book->sorted_values[low];

306 const int len = book->codeword_lengths[result];

307 handle->acc >>= len;

308 handle->valid_bits -= len;

309 (18/18) if (UNLIKELY(handle->valid_bits < 0)) {

310 return -1;

311 }

312 return result;

313 } else {

314 /* This loop terminates when the code is found. We ensure that all

315 * Huffman trees are completely specified during setup, so the

316 * assertion below will never fail. */

317 for (int i = 0; ; i++) {

318 ASSERT(i < book->entries);

319 (18/18) if (book->codeword_lengths[i] == NO_CODE) {

320 continue;

321 }

322 if (book->codewords[i] ==

323 (18/18) (handle->acc & ((UINT32_C(1) << book->codeword_lengths[i])-1)))

324 {

325 handle->acc >>= book->codeword_lengths[i];

326 handle->valid_bits -= book->codeword_lengths[i];

327 (18/18) if (UNLIKELY(handle->valid_bits < 0)) {

328 return -1;

329 }

330 return i;

331 }

332 }

333 }

334 }

335

336 /*-----------------------------------------------------------------------*/

337

338 /**

339 * codebook_decode_scalar_raw: Read a Huffman code from the packet and

340 * decode it in scalar context using the given codebook, returning the

341 * symbol for non-sparse codebooks and the sorted_values[] index for sparse

342 * codebooks.

343 *

344 * This function contains only the fast path for Huffman lookup so that it

345 * can be easily inlined into callers. The slow path is implemented as a

346 * separate, non-inlined function to avoid code bloat.

347 *

348 * [Parameters]

349 * handle: Stream handle.

350 * book: Codebook to use.

351 * [Return value]

352 * Value read, or -1 if the end of the packet is reached.

353 */

354 static inline int32_t codebook_decode_scalar_raw(stb_vorbis *handle,

355 const Codebook *book)

356 {

357 /* First try the O(1) table. We only fill the bit accumulator if we

358 * don't have enough bits for the fast table, to avoid overhead from

359 * repeatedly reading single bytes from the packet. */

360 (18/18) if (handle->valid_bits < handle->fast_huffman_length) {

361 /* Note that moving this "semi-fast" path out to a separate

362 * function is no faster, and possibly slower (though not

363 * significantly so), than just calling fill_bits() here, at least

364 * on x86_64 and 32-bit ARM. */

365 fill_bits(handle);

366 }

367 const int fast_code =

368 book->fast_huffman[handle->acc & handle->fast_huffman_mask];

369 (18/18) if (fast_code >= 0) {

370 const int bits = book->codeword_lengths[fast_code];

371 handle->acc >>= bits;

372 handle->valid_bits -= bits;

373 (18/18) if (UNLIKELY(handle->valid_bits < 0)) {

374 return -1;

375 }

376 return fast_code;

377 }

378

379 /* Fall back to the slow (table search) method. */

380 return codebook_decode_scalar_raw_slow(handle, book);

381 }

382

383 /*-----------------------------------------------------------------------*/

384

385 /**

386 * codebook_decode_scalar: Read a Huffman code from the packet and decode

387 * it in scalar context using the given codebook.

388 *

389 * [Parameters]

390 * handle: Stream handle.

391 * book: Codebook to use.

392 * [Return value]

393 * Symbol read, or -1 if the end of the packet is reached.

394 */

395 static int32_t codebook_decode_scalar(stb_vorbis *handle, const Codebook *book)

396 {

397 const int32_t value = codebook_decode_scalar_raw(handle, book);

398 (18/18) return book->sparse ? book->sorted_values[value] : value;

399 }

400

401 /*************************************************************************/

402 /********************* Codebook decoding: VQ context *********************/

403 /*************************************************************************/

404

405 /**

406 * codebook_decode_scalar_for_vq: Read a Huffman code from the packet and

407 * return a value appropriate for decoding the code in VQ context. Helper

408 * function for codebook_decode() and related functions.

409 *

410 * [Parameters]

411 * handle: Stream handle.

412 * book: Codebook to use.

413 * [Return value]

414 * Symbol or value read (depending on library configuration and

415 * codebook type), or -1 if the end of the packet is reached.

416 */

417 static inline int32_t codebook_decode_scalar_for_vq(stb_vorbis *handle,

418 const Codebook *book)

419 {

420 (18/18) if (handle->divides_in_codebook) {

421 return codebook_decode_scalar(handle, book);

422 } else {

423 return codebook_decode_scalar_raw(handle, book);

424 }

425 }

426

427 /*-----------------------------------------------------------------------*/

428

429 /**

430 * codebook_decode: Read a Huffman code from the packet and decode it in

431 * VQ context using the given codebook. Decoded vector components are

432 * added componentwise to the values in the output vector.

433 *

434 * If an error occurs, the current packet is flushed.

435 *

436 * [Parameters]

437 * handle: Stream handle.

438 * book: Codebook to use. Must be a vector (not scalar) codebook.

439 * output: Output vector.

440 * len: Number of elements to read. Automatically capped at

441 * book->dimensions.

442 * [Return value]

443 * True on success, false on error or end-of-packet.

444 */

445 static bool codebook_decode(stb_vorbis *handle, const Codebook *book,

446 float *output, int len)

447 {

448 ASSERT(book->lookup_type != 0);

449 ASSERT(book->multiplicands);

450

451 (18/18) if (len > book->dimensions) {

452 len = book->dimensions;

453 }

454

455 const int32_t code = codebook_decode_scalar_for_vq(handle, book);

456 (18/18) if (UNLIKELY(code < 0)) {

457 return false;

458 }

459

460 (18/18) if (book->lookup_type == 1) {

461 int div = 1;

462 (18/18) if (book->sequence_p) {

463 float last = 0;

464 (18/18) for (int i = 0; i < len; i++) {

465 const int32_t offset = (code / div) % book->lookup_values;

466 const float val = book->multiplicands[offset] + last;

467 output[i] += val;

468 last = val;

469 div *= book->lookup_values;

470 }

471 } else {

472 (18/18) for (int i = 0; i < len; i++) {

473 const int32_t offset = (code / div) % book->lookup_values;

474 output[i] += book->multiplicands[offset];

475 div *= book->lookup_values;

476 }

477 }

478 } else {

479 const int32_t offset = code * book->dimensions;

480 int i = 0;

481 #if defined(ENABLE_ASM_ARM_NEON)

482 for (; i+4 <= len; i += 4) {

483 vst1q_f32(&output[i], vaddq_f32(

484 vld1q_f32(&output[i]),

485 vld1q_f32(&book->multiplicands[offset+i])));

486 }

487 #elif defined(ENABLE_ASM_X86_SSE2)

488 (10/10) for (; i+4 <= len; i += 4) {

489 _mm_storeu_ps(&output[i], _mm_add_ps(

490 _mm_loadu_ps(&output[i]),

491 _mm_loadu_ps(&book->multiplicands[offset+i])));

492 }

493 #endif

494 (18/18) for (; i < len; i++) {

495 output[i] += book->multiplicands[offset+i];

496 }

497 }

498

499 return true;

500 }

501

502 /*-----------------------------------------------------------------------*/

503

504 /**

505 * codebook_decode_step: Read a Huffman code from the packet and decode it

506 * in VQ context using the given codebook. Decoded vector components are

507 * added componentwise to values in the output vector at intervals of "step"

508 * array elements. (codebook_decode() is effectively a specialization of

509 * this function with step==1.)

510 *

511 * If an error occurs, the current packet is flushed.

512 *

513 * [Parameters]

514 * handle: Stream handle.

515 * book: Codebook to use. Must be a vector (not scalar) codebook.

516 * output: Output vector.

517 * len: Number of elements to read. Automatically capped at

518 * book->dimensions.

519 * step: Interval between consecutive output elements: the second

520 * element is output[step], the third is output[2*step], etc.

521 * [Return value]

522 * True on success, false on error or end-of-packet.

523 */

524 static bool codebook_decode_step(stb_vorbis *handle, const Codebook *book,

525 float *output, int len, int step)

526 {

527 ASSERT(book->lookup_type != 0);

528 ASSERT(book->multiplicands);

529

530 (18/18) if (len > book->dimensions) {

531 len = book->dimensions;

532 }

533

534 const int32_t code = codebook_decode_scalar_for_vq(handle, book);

535 (18/18) if (UNLIKELY(code < 0)) {

536 return false;

537 }

538

539 (18/18) if (book->lookup_type == 1) {

540 int div = 1;

541 (9/18) if (book->sequence_p) {

542 /* This case appears not to be used by the reference encoder. */

543 float last = 0;

544 (0/18) for (int i = 0; i < len; i++) {

545 const int32_t offset = (code / div) % book->lookup_values;

546 const float val = book->multiplicands[offset] + last;

547 output[i*step] += val;

548 last = val;

549 div *= book->lookup_values;

550 }

551 } else {

552 (18/18) for (int i = 0; i < len; i++) {

553 const int32_t offset = (code / div) % book->lookup_values;

554 output[i*step] += book->multiplicands[offset];

555 div *= book->lookup_values;

556 }

557 }

558 } else {

559 const int32_t offset = code * book->dimensions;

560 (18/18) for (int i = 0; i < len; i++) {

561 output[i*step] += book->multiplicands[offset+i];

562 }

563 }

564

565 return true;

566 }

567

568 /*-----------------------------------------------------------------------*/

569

570 /**

571 * codebook_decode_deinterleave_repeat: Read Huffman codes from the packet

572 * and decode them in VQ context using the given codebook, deinterleaving

573 * across "ch" output channels, and repeat until total_decode vector

574 * elements have been decoded. Decoded vector components are added

575 * componentwise to the values in the output vectors.

576 *

577 * If an error occurs, the current packet is flushed.

578 *

579 * [Parameters]

580 * handle: Stream handle.

581 * book: Codebook to use.

582 * outputs: Output vectors for each channel.

583 * ch: Number of channels.

584 * n: Length of each output vector.

585 * total_offset: Offset within the interleaved output vector at which

586 * to start writing output values.

587 * total_decode: Total number of elements to read.

588 * [Return value]

589 * True on success, false on error or end-of-packet.

590 */

591 static bool codebook_decode_deinterleave_repeat(

592 stb_vorbis *handle, const Codebook *book, float **outputs, int ch, int n,

593 int32_t total_offset, int32_t total_decode)

594 {

595 (9/18) if (total_decode > ch*n - total_offset) {

596 total_decode = ch*n - total_offset;

597 }

598 int c_inter = total_offset % ch;

599 int p_inter = total_offset / ch;

600 int len = book->dimensions;

601

602 (18/18) while (total_decode > 0) {

603 /* Make sure we don't run off the end of the output vectors. */

604 (9/18) if (len > total_decode) {

605 len = total_decode;

606 }

607

608 const int32_t code = codebook_decode_scalar_for_vq(handle, book);

609 (18/18) if (UNLIKELY(code < 0)) {

610 return false;

611 }

612

613 (18/18) if (book->lookup_type == 1) {

614 int div = 1;

615 (9/18) if (book->sequence_p) {

616 /* This case appears not to be used by the reference encoder. */

617 float last = 0;

618 (0/18) for (int i = 0; i < len; i++) {

619 const int32_t offset = (code / div) % book->lookup_values;

620 const float val = book->multiplicands[offset] + last;

621 (0/18) if (outputs[c_inter]) {

622 outputs[c_inter][p_inter] += val;

623 }

624 (0/18) if (++c_inter == ch) {

625 c_inter = 0;

626 p_inter++;

627 }

628 last = val;

629 div *= book->lookup_values;

630 }

631 } else {

632 (18/18) for (int i = 0; i < len; i++) {

633 const int32_t offset = (code / div) % book->lookup_values;

634 (9/18) if (outputs[c_inter]) {

635 outputs[c_inter][p_inter] +=

636 book->multiplicands[offset];

637 }

638 (18/18) if (++c_inter == ch) {

639 c_inter = 0;

640 p_inter++;

641 }

642 div *= book->lookup_values;

643 }

644 }

645 } else { // book->lookup_type == 2

646 const int32_t offset = code * book->dimensions;

647 (18/18) for (int i = 0; i < len; i++) {

648 (18/18) if (outputs[c_inter]) {

649 outputs[c_inter][p_inter] += book->multiplicands[offset+i];

650 }

651 (18/18) if (++c_inter == ch) {

652 c_inter = 0;

653 p_inter++;

654 }

655 }

656 }

657

658 total_decode -= len;

659 }

660

661 return true;

662 }

663

664 /*-----------------------------------------------------------------------*/

665

666 /**

667 * codebook_decode_deinterleave_repeat_2: Read Huffman codes from the

668 * packet and decode them in VQ context using the given codebook,

669 * deinterleaving across 2 output channels. This function is a

670 * specialization of codebook_decode_deinterleave_repeat() for ch==2 and

671 * VORBIS_OPTION_DIVIDES_IN_CODEBOOK disabled.

672 *

673 * If an error occurs, the current packet is flushed.

674 *

675 * [Parameters]

676 * handle: Stream handle.

677 * book: Codebook to use.

678 * outputs: Output vectors for each channel.

679 * len: Length of each output vector.

680 * total_decode: Total number of elements to read.

681 * [Return value]

682 * True on success, false on error or end-of-packet.

683 */

684 static bool codebook_decode_deinterleave_repeat_2(

685 stb_vorbis *handle, const Codebook *book, float **outputs, int n,

686 int32_t total_offset, int32_t total_decode)

687 {

688 (9/18) if (total_decode > 2*n - total_offset) {

689 total_decode = 2*n - total_offset;

690 }

691 float * const output0 = outputs[0];

692 float * const output1 = outputs[1];

693 ASSERT(output0);

694 ASSERT(output1);

695 int c_inter = total_offset % 2;

696 int p_inter = total_offset / 2;

697 int len = book->dimensions;

698

699 (18/18) while (total_decode > 0) {

700 /* Make sure we don't run off the end of the output vectors. */

701 (9/18) if (len > total_decode) {

702 len = total_decode;

703 }

704

705 const int32_t code = codebook_decode_scalar_for_vq(handle, book);

706 (18/18) if (UNLIKELY(code < 0)) {

707 return false;

708 }

709

710 const int32_t offset = code * book->dimensions;

711 int i = 0;

712 (18/18) if (c_inter == 1) {

713 output1[p_inter] += book->multiplicands[offset];

714 c_inter = 0;

715 p_inter++;

716 i++;

717 }

718 (18/18) for (; i+2 <= len; i += 2) {

719 output0[p_inter] += book->multiplicands[offset+i];

720 output1[p_inter] += book->multiplicands[offset+i+1];

721 p_inter++;

722 }

723 (18/18) if (i < len) {

724 output0[p_inter] += book->multiplicands[offset+i];

725 c_inter++;

726 }

727

728 total_decode -= len;

729 }

730

731 return true;

732 }

733

734 /*************************************************************************/

735 /*************************** Floor processing ****************************/

736 /*************************************************************************/

737

738 /**

739 * render_point: Calculate the Y coordinate for point X on the line

740 * (x0,y0)-(x1,y1). Defined by section 9.2.6 in the Vorbis specification.

741 *

742 * [Parameters]

743 * x0, y0: First endpoint of the line.

744 * x1, y1: Second endpoint of the line.

745 * x: X coordinate of point to calculate.

746 * [Return value]

747 * Y value for the given X coordinate.

748 */

749 static CONST_FUNCTION int render_point(int x0, int y0, int x1, int y1, int x)

750 {

751 /* The definition of this function in the spec has separate code

752 * branches for positive and negative dy values, presumably to try and

753 * avoid defining integer division as rounding negative values toward

754 * zero (even though it ends up doing exactly that in the very next

755 * section...), but C already defines integer division to round both

756 * positive and negative values toward zero, so we just do things the

757 * natural way. */

758 return y0 + ((y1 - y0) * (x - x0) / (x1 - x0));

759 }

760

761 /*-----------------------------------------------------------------------*/

762

763 /**

764 * render_line: Render a line for a type 1 floor curve and perform the

765 * dot-product operation with the residue vector. Defined by section

766 * 9.2.7 in the Vorbis specification.

767 *

768 * [Parameters]

769 * x0, y0: First endpoint of the line.

770 * x1, y1: Second endpoint of the line.

771 * output: Output buffer. On entry, this buffer should contain the

772 * residue vector.

773 * n: Window length.

774 */

775 static inline void render_line(int x0, int y0, int x1, int y1, float *output,

776 int n)

777 {

778 /* N.B.: The spec requires a very specific sequence of operations for

779 * this function to ensure that both the encoder and the decoder

780 * generate the same integer-quantized curve. Take care that any

781 * optimizations to this function do not change the output. */

782

783 ASSERT(x0 >= 0);

784 ASSERT(y0 >= 0);

785 ASSERT(x1 > x0);

786

787 const int dy = y1 - y0;

788 const int adx = x1 - x0;

789 const int base = dy / adx;

790 const int ady = abs(dy) - (abs(base) * adx);

791 (18/18) const int sy = (dy < 0 ? base-1 : base+1);

792 int x = x0, y = y0;

793 int err = 0;

794

795 (18/18) if (x1 >= n) {

796 (9/18) if (x0 >= n) {

797 return;

798 }

799 x1 = n;

800 }

801 output[x] *= floor1_inverse_db_table[y];

802 (18/18) for (x++; x < x1; x++) {

803 err += ady;

804 (18/18) if (err >= adx) {

805 err -= adx;

806 y += sy;

807 } else {

808 y += base;

809 }

810 output[x] *= floor1_inverse_db_table[y];

811 }

812 }

813

814 /*-----------------------------------------------------------------------*/

815

816 /**

817 * decode_floor0: Perform type 0 floor decoding.

818 *

819 * [Parameters]

820 * handle: Stream handle.

821 * floor: Floor configuration.

822 * coefficients: Floor coefficient buffer for the current channel.

823 * [Return value]

824 * The floor amplitude (positive), 0 to indicate "unused" status, or

825 * -1 to indicate an undecodable packet.

826 */

827 static int64_t decode_floor0(stb_vorbis *handle, const Floor0 *floor,

828 float *coefficients)

829 {

830 const int amplitude = get_bits(handle, floor->amplitude_bits);

831 (18/18) if (amplitude == 0) {

832 return 0;

833 }

834

835 const int booknumber = get_bits(handle, floor->book_bits);

836 (18/18) if (booknumber >= floor->number_of_books) {

837 return -1;

838 }

839 const Codebook *book = &handle->codebooks[floor->book_list[booknumber]];

840 (18/18) if (book->lookup_type == 0) {

841 /* The specification is unclear on whether a floor 0 configuration

842 * referencing a scalar codebook is a setup-time or decode-time

843 * error; the only applicable rule seems to be 3.3 "If decoder

844 * setup or decode requests such an action [using a codebook of

845 * lookup type 0 in any context expecting a vector return value],

846 * that is an error condition rendering the packet undecodable".

847 * In the spirit of being lenient in what you accept, we treat it

848 * as a decode-time error so that packets which do not use that

849 * floor configuration can still be decoded. */

850 return -1;

851 }

852

853 float last = 0;

854 int coef_count = 0;

855 (18/18) while (coef_count < floor->order) {

856 const int len = min(book->dimensions, floor->order - coef_count);

857 (18/18) for (int i = 0; i < len; i++) {

858 coefficients[coef_count + i] = last;

859 }

860 (18/18) if (!codebook_decode(handle, book, &coefficients[coef_count], len)) {

861 return 0;

862 }

863 coef_count += len;

864 last = coefficients[coef_count - 1];

865 }

866

867 return amplitude;

868 }

869

870 /*-----------------------------------------------------------------------*/

871

872 /**

873 * decode_floor1_partition: Decode a single partition for a type 1 floor.

874 *

875 * [Parameters]

876 * handle: Stream handle.

877 * floor: Floor configuration.

878 * final_Y: final_Y buffer for the current channel.

879 * offset: Current offset in final_Y buffer.

880 * class: Partition class for the current partition.

881 * [Return value]

882 * New value of offset.

883 */

884 static int decode_floor1_partition(stb_vorbis *handle, const Floor1 *floor,

885 int16_t *final_Y, int offset,

886 const int class)

887 {

888 const int cdim = floor->class_dimensions[class];

889 const int cbits = floor->class_subclasses[class];

890 const int csub = (1 << cbits) - 1;

891 int cval = 0;

892 (18/18) if (cbits > 0) {

893 const Codebook *book =

894 &handle->codebooks[floor->class_masterbooks[class]];

895 cval = codebook_decode_scalar(handle, book);

896 }

897 (18/18) for (int j = 0; j < cdim; j++) {

898 const int book_index = floor->subclass_books[class][cval & csub];

899 cval >>= cbits;

900 (18/18) if (book_index >= 0) {

901 const Codebook *book = &handle->codebooks[book_index];

902 final_Y[offset++] = codebook_decode_scalar(handle, book);

903 } else {

904 final_Y[offset++] = 0;

905 }

906 }

907 return offset;

908 }

909

910 /*-----------------------------------------------------------------------*/

911

912 /**

913 * decode_floor1_amplitude: Calculate an element of the final_Y vector

914 * for a type 1 floor.

915 *

916 * [Parameters]

917 * floor: Floor configuration.

918 * final_Y: final_Y buffer for the current channel.

919 * range: Maximum value for final_Y elements.

920 * step2_flag: Array of flags indicating final_Y elements which will

921 * need to be recalculated later.

922 * i: Element of final_Y to calculate.

923 */

924 static void decode_floor1_amplitude(const Floor1 *floor, int16_t *final_Y,

925 const int range, bool *step2_flag,

926 const int i)

927 {

928 const int low = floor->neighbors[i].low;

929 const int high = floor->neighbors[i].high;

930 const int predicted = render_point(floor->X_list[low], final_Y[low],

931 floor->X_list[high], final_Y[high],

932 floor->X_list[i]);

933 const int val = final_Y[i]; // Value read from packet.

934 const int highroom = range - predicted;

935 const int lowroom = predicted;

936 int room;

937 (18/18) if (highroom > lowroom) {

938 room = lowroom * 2;

939 } else {

940 room = highroom * 2;

941 }

942 (18/18) if (val) {

943 step2_flag[low] = true;

944 step2_flag[high] = true;

945 step2_flag[i] = true;

946 (18/18) if (val >= room) {

947 (18/18) if (highroom > lowroom) {

948 /* The spec (7.2.4) suggests that "decoder implementations

949 * guard the values in vector [floor1_final_Y] by clamping

950 * each element to [0, [range])" because "it is possible

951 * to abuse the setup and codebook machinery to produce

952 * negative or over-range results". That can only happen

953 * in these two cases, so we insert tests here. This has

954 * about a 1% performance penalty, but that's better than

955 * corrupt data causing the library to crash. */

956 (9/18) if (UNLIKELY(val > range - 1)) {

957 final_Y[i] = range - 1;

958 } else {

959 final_Y[i] = val;

960 }

961 } else {

962 (9/18) if (UNLIKELY(val > range - 1)) {

963 final_Y[i] = 0;

964 } else {

965 final_Y[i] = (range - 1) - val;

966 }

967 }

968 } else {

969 (18/18) if (val % 2 != 0) {

970 final_Y[i] = predicted - (val+1)/2;

971 } else {

972 final_Y[i] = predicted + val/2;

973 }

974 }

975 } else {

976 step2_flag[i] = false;

977 final_Y[i] = predicted;

978 }

979 }

980

981 /*-----------------------------------------------------------------------*/

982

983 /**

984 * decode_floor1: Perform type 1 floor decoding.

985 *

986 * [Parameters]

987 * handle: Stream handle.

988 * floor: Floor configuration.

989 * final_Y: final_Y buffer for the current channel.

990 * [Return value]

991 * False to indicate "unused" status (4.2.3 step 6), true otherwise.

992 */

993 static bool decode_floor1(stb_vorbis *handle, const Floor1 *floor,

994 int16_t *final_Y)

995 {

996 static const int16_t range_list[4] = {256, 128, 86, 64};

997 static const int8_t range_bits_list[4] = {8, 7, 7, 6};

998 const int range = range_list[floor->floor1_multiplier - 1];

999 const int range_bits = range_bits_list[floor->floor1_multiplier - 1];

1000

1001 (18/18) if (!get_bits(handle, 1)) {

1002 return false; // Channel is zero.

1003 }

1004

1005 /* Floor decode (7.2.3). */

1006 int offset = 2;

1007 final_Y[0] = get_bits(handle, range_bits);

1008 final_Y[1] = get_bits(handle, range_bits);

1009 (18/18) for (int i = 0; i < floor->partitions; i++) {

1010 const int class = floor->partition_class_list[i];

1011 offset = decode_floor1_partition(handle, floor, final_Y, offset, class);

1012 }

1013 (18/18) if (UNLIKELY(handle->valid_bits < 0)) {

1014 return false;

1015 }

1016

1017 /* Amplitude value synthesis (7.2.4 step 1). */

1018 bool step2_flag[256];

1019 step2_flag[0] = true;

1020 step2_flag[1] = true;

1021 (18/18) for (int i = 2; i < floor->values; i++) {

1022 decode_floor1_amplitude(floor, final_Y, range, step2_flag, i);

1023 }

1024

1025 /* Curve synthesis (7.2.4 step 2). We defer final floor computation

1026 * until after synthesis; here, we just clear final_Y values which

1027 * need to be synthesized later. */

1028 (18/18) for (int i = 0; i < floor->values; i++) {

1029 (18/18) if (!step2_flag[i]) {

1030 final_Y[i] = -1;

1031 }

1032 }

1033

1034 return true;

1035 }

1036

1037 /*-----------------------------------------------------------------------*/

1038

1039 /**

1040 * do_floor0_final: Perform the floor curve computation and residue

1041 * product for type 0 floors.

1042 *

1043 * [Parameters]

1044 * handle: Stream handle.

1045 * floor: Floor configuration.

1046 * ch: Channel to operate on.

1047 * n: Frame window size.

1048 * amplitude: Floor amplitude.

1049 */

1050 static void do_floor0_final(stb_vorbis *handle, const Floor0 *floor,

1051 const int ch, const int n, const int64_t amplitude)

1052 {

1053 float *output = handle->channel_buffers[handle->cur_channel_buffer][ch];

1054 float *coefficients = handle->coefficients[ch];

1055 const int16_t *map = floor->map[(n == handle->blocksize[1])];

1056 const float omega_base = M_PIf / floor->bark_map_size;

1057 const float scaled_amplitude = (float)amplitude

1058 / (float)((UINT64_C(1) << floor->amplitude_bits) - 1);

1059 const float lfv_scale = 0.11512925f * floor->amplitude_offset;

1060

1061 (18/18) for (int i = 0; i < floor->order; i++) {

1062 coefficients[i] = 2 * cosf(coefficients[i]);

1063 }

1064

1065 int i = 0;

1066 do {

1067 const float two_cos_omega = 2 * cosf(map[i] * omega_base);

1068 float p = 0.5, q = 0.5;

1069 int j;

1070 (18/18) for (j = 0; j <= (floor->order - 2) / 2; j++) {

1071 p *= coefficients[2*j+1] - two_cos_omega;

1072 q *= coefficients[2*j] - two_cos_omega;

1073 }

1074 (18/18) if (floor->order % 2 != 0) {

1075 q *= coefficients[2*j] - two_cos_omega;

1076 /* The spec gives this as "4 - cos^2(omega)", but we use the

1077 * equality (a^2-b^2) = (a+b)(a-b) to give us constants

1078 * common between both branches of the if statement. */

1079 p *= p * (2 + two_cos_omega) * (2 - two_cos_omega);

1080 q *= q;

1081 } else {

1082 p *= p * (2 - two_cos_omega);

1083 q *= q * (2 + two_cos_omega);

1084 }

1085 const float linear_floor_value =

1086 expf(lfv_scale * ((scaled_amplitude / sqrtf(p + q)) - 1));

1087 const int iteration_condition = map[i];

1088 do {

1089 output[i] *= linear_floor_value;

1090 i++;

1091 (18/18) } while (map[i] == iteration_condition);

1092 (18/18) } while (i < n/2);

1093 }

1094

1095 /*-----------------------------------------------------------------------*/

1096

1097 /**

1098 * do_floor1_final: Perform the final floor curve computation and residue

1099 * product for type 1 floors.

1100 *

1101 * [Parameters]

1102 * handle: Stream handle.

1103 * floor: Floor configuration.

1104 * ch: Channel to operate on.

1105 * n: Frame window size.

1106 */

1107 static void do_floor1_final(stb_vorbis *handle, const Floor1 *floor,

1108 const int ch, const int n)

1109 {

1110 float *output = handle->channel_buffers[handle->cur_channel_buffer][ch];

1111 const int16_t *final_Y = handle->final_Y[ch];

1112 int lx = 0;

1113 int ly = final_Y[0] * floor->floor1_multiplier;

1114 (18/18) for (int i = 1; i < floor->values; i++) {

1115 int j = floor->sorted_order[i];

1116 (18/18) if (final_Y[j] >= 0) {

1117 const int hx = floor->X_list[j];

1118 const int hy = final_Y[j] * floor->floor1_multiplier;

1119 render_line(lx, ly, hx, hy, output, n/2);

1120 lx = hx;

1121 ly = hy;

1122 }

1123 }

1124 (18/18) if (lx < n/2) {

1125 /* Optimization of: render_line(lx, ly, n/2, ly, output, n/2); */

1126 (18/18) for (int i = lx; i < n/2; i++) {

1127 output[i] *= floor1_inverse_db_table[ly];

1128 }

1129 }

1130 }

1131

1132 /*************************************************************************/

1133 /*************************** Residue handling ****************************/

1134 /*************************************************************************/

1135

1136 /**

1137 * decode_residue_partition_0: Decode a single residue partition for

1138 * residue type 0.

1139 *

1140 * [Parameters]

1141 * handle: Stream handle.

1142 * book: Codebook to use for decoding.

1143 * size: Residue partition size.

1144 * offset: Output offset.

1145 * outputs: Output vector array.

1146 * n: Length of residue vectors (half of window size).

1147 * ch: Number of channels of residue data to decode.

1148 * [Return value]

1149 * True on success, false on end of packet.

1150 */

1151 static bool decode_residue_partition_0(

1152 stb_vorbis *handle, const Codebook *book, int size, int offset,

1153 float **output_ptr, UNUSED int n, UNUSED int ch)

1154 {

1155 float *output = *output_ptr;

1156 const int dimensions = book->dimensions;

1157 const int step = size / dimensions;

1158 (18/18) for (int i = 0; i < step; i++) {

1159 (18/18) if (!codebook_decode_step(handle, book, &output[offset+i], size-i,

1160 step)) {

1161 return false;

1162 }

1163 }

1164 return true;

1165 }

1166

1167 /*-----------------------------------------------------------------------*/

1168

1169 /**

1170 * decode_residue_partition_1: Decode a single residue partition for

1171 * residue type 1.

1172 *

1173 * [Parameters]

1174 * handle: Stream handle.

1175 * book: Codebook to use for decoding.

1176 * size: Residue partition size.

1177 * offset: Output offset.

1178 * outputs: Output vector array.

1179 * n: Length of residue vectors (half of window size).

1180 * ch: Number of channels of residue data to decode.

1181 * [Return value]

1182 * True on success, false on end of packet.

1183 */

1184 static bool decode_residue_partition_1(

1185 stb_vorbis *handle, const Codebook *book, int size, int offset,

1186 float **output_ptr, UNUSED int n, UNUSED int ch)

1187 {

1188 float *output = *output_ptr;

1189 const int dimensions = book->dimensions;

1190 (18/18) for (int i = 0; i < size; i += dimensions) {

1191 (18/18) if (!codebook_decode(handle, book, &output[offset+i], size-i)) {

1192 return false;

1193 }

1194 }

1195 return true;

1196 }

1197

1198 /*-----------------------------------------------------------------------*/

1199

1200 /**

1201 * decode_residue_partition_2: Decode a single residue partition for

1202 * residue type 2.

1203 *

1204 * [Parameters]

1205 * handle: Stream handle.

1206 * book: Codebook to use for decoding.

1207 * size: Residue partition size.

1208 * offset: Output offset.

1209 * outputs: Output vector array.

1210 * n: Length of residue vectors (half of window size).

1211 * ch: Number of channels of residue data to decode.

1212 * [Return value]

1213 * True on success, false on end of packet.

1214 */

1215 static bool decode_residue_partition_2(

1216 stb_vorbis *handle, const Codebook *book, int size, int offset,

1217 float **outputs, int n, int ch)

1218 {

1219 return codebook_decode_deinterleave_repeat(

1220 handle, book, outputs, ch, n, offset, size);

1221 }

1222

1223 /*-----------------------------------------------------------------------*/

1224

1225 /**

1226 * decode_residue_partition_2_2ch: Decode a single residue partition for

1227 * residue type 2 with two channels. Both channels must be active for

1228 * decoding.

1229 *

1230 * [Parameters]

1231 * handle: Stream handle.

1232 * book: Codebook to use for decoding.

1233 * size: Residue partition size.

1234 * offset: Output offset.

1235 * outputs: Output vector array.

1236 * n: Length of residue vectors (half of window size).

1237 * ch: Number of channels of residue data to decode.

1238 * [Return value]

1239 * True on success, false on end of packet.

1240 */

1241 static bool decode_residue_partition_2_2ch(

1242 stb_vorbis *handle, const Codebook *book, int size, int offset,

1243 float **outputs, int n, UNUSED int ch)

1244 {

1245 return codebook_decode_deinterleave_repeat_2(

1246 handle, book, outputs, n, offset, size);

1247 }

1248

1249 /*-----------------------------------------------------------------------*/

1250

1251 /**

1252 * decode_residue_partition: Decode a single residue partition.

1253 *

1254 * [Parameters]

1255 * handle: Stream handle.

1256 * res: Residue configuration.

1257 * pass: Residue decoding pass.

1258 * partition_count: Current partition index.

1259 * vqclass: Classification for this partition.

1260 * outputs: Output vector array.

1261 * n: Length of residue vectors (half of window size).

1262 * ch: Number of channels of residue data to decode.

1263 * do_partition: Partition decoding function.

1264 * [Return value]

1265 * True on success, false on end of packet.

1266 */

1267 static inline bool decode_residue_partition(

1268 stb_vorbis *handle, const Residue *res, int pass, int partition_count,

1269 int vqclass, float **outputs, int n, int ch,

1270 bool (*do_partition)(

1271 stb_vorbis *handle, const Codebook *book, int size, int offset,

1272 float **outputs, int n, int ch))

1273 {

1274 const int vqbook = res->residue_books[vqclass][pass];

1275 (18/18) if (vqbook < 0) {

1276 return true;

1277 }

1278

1279 const Codebook *book = &handle->codebooks[vqbook];

1280 (18/18) if (UNLIKELY(book->lookup_type == 0)) {

1281 /* Lookup type 0 is only valid in a scalar context. The spec

1282 * doesn't say what to do about this case; we treat it like

1283 * end-of-packet. */

1284 flush_packet(handle);

1285 handle->valid_bits = -1;

1286 return false;

1287 }

1288

1289 const int32_t offset = res->begin + partition_count * res->part_size;

1290 return (*do_partition)(

1291 handle, book, res->part_size, offset, outputs, n, ch);

1292 }

1293

1294 /*-----------------------------------------------------------------------*/

1295

1296 /**

1297 * decode_residue_common: Common decode processing for all residue vector

1298 * types.

1299 *

1300 * [Parameters]

1301 * handle: Stream handle.

1302 * res: Residue configuration.

1303 * type: Residue type.

1304 * n: Length of residue vectors (half of window size).

1305 * ch: Number of channels of residue data to decode.

1306 * residue_buffers: Pointers to the buffers for each channel in which

1307 * the residue vectors should be stored. A pointer of NULL

1308 * indicates that the given channel should not be decoded (the

1309 * "do not decode" state referred to by the specification).

1310 * do_partition: Function pointer for the partition decoding function

1311 * to use.

1312 */

1313 static void decode_residue_common(

1314 stb_vorbis *handle, const Residue *res, int type, int n, int ch,

1315 float *residue_buffers[],

1316 bool (*do_partition)(

1317 stb_vorbis *handle, const Codebook *book, int size, int offset,

1318 float **outputs, int n, int ch))

1319 {

1320 const Codebook *classbook = &handle->codebooks[res->classbook];

1321 const int classwords = classbook->dimensions;

1322 (18/18) const int actual_size = (type == 2 ? n*ch : n);

1323 const int n_to_read =

1324 min(res->end, actual_size) - min(res->begin, actual_size);

1325 const int partitions_to_read = n_to_read / res->part_size;

1326 (18/18) const int ch_to_read = (type == 2 ? 1 : ch);

1327

1328 (18/18) if (n_to_read <= 0) {

1329 return;

1330 }

1331

1332 (18/18) if (handle->divides_in_residue) {

1333

1334 int **classifications = handle->classifications;

1335

1336 (18/18) for (int pass = 0; pass < 8; pass++) {

1337 int partition_count = 0;

1338 (18/18) while (partition_count < partitions_to_read) {

1339 (18/18) if (pass == 0) {

1340 (18/18) for (int j = 0; j < ch_to_read; j++) {

1341 (18/18) if (residue_buffers[j]) {

1342 int temp =

1343 codebook_decode_scalar(handle, classbook);

1344 (18/18) if (UNLIKELY(temp == EOP)) {

1345 return;

1346 }

1347 (18/18) for (int i = classwords-1; i >= 0; i--) {

1348 classifications[j][i+partition_count] =

1349 temp % res->classifications;

1350 temp /= res->classifications;

1351 }

1352 }

1353 }

1354 }

1355 for (int i = 0;

1356 (36/36) i < classwords && partition_count < partitions_to_read;

1357 i++, partition_count++)

1358 {

1359 (18/18) for (int j = 0; j < ch_to_read; j++) {

1360 (18/18) if (residue_buffers[j]) {

1361 const int vqclass =

1362 classifications[j][partition_count];

1363 (18/18) if (!decode_residue_partition(

1364 handle, res, pass, partition_count, vqclass,

1365 &residue_buffers[j], n, ch, do_partition)) {

1366 return;

1367 }

1368 }

1369 }

1370 }

1371 } // while (partition_count < partitions_to_read)

1372 } // for (int pass = 0; pass < 8; pass++)

1373

1374 } else { // !handle->divides_in_residue

1375

1376 uint8_t ***part_classdata = handle->classifications;

1377

1378 (18/18) for (int pass = 0; pass < 8; pass++) {

1379 int partition_count = 0;

1380 int class_set = 0;

1381 (18/18) while (partition_count < partitions_to_read) {

1382 (18/18) if (pass == 0) {

1383 (18/18) for (int j = 0; j < ch_to_read; j++) {

1384 (18/18) if (residue_buffers[j]) {

1385 const int temp =

1386 codebook_decode_scalar(handle, classbook);

1387 (18/18) if (UNLIKELY(temp == EOP)) {

1388 return;

1389 }

1390 part_classdata[j][class_set] =

1391 res->classdata[temp];

1392 }

1393 }

1394 }

1395 for (int i = 0;

1396 (36/36) i < classwords && partition_count < partitions_to_read;

1397 i++, partition_count++)

1398 {

1399 (18/18) for (int j = 0; j < ch_to_read; j++) {

1400 (18/18) if (residue_buffers[j]) {

1401 const int vqclass =

1402 part_classdata[j][class_set][i];

1403 (18/18) if (!decode_residue_partition(

1404 handle, res, pass, partition_count, vqclass,

1405 &residue_buffers[j], n, ch, do_partition)) {

1406 return;

1407 }

1408 }

1409 }

1410 }

1411 class_set++;

1412 } // while (partition_count < partitions_to_read)

1413 } // for (int pass = 0; pass < 8; pass++)

1414

1415 } // if (handle->divides_in_residue)

1416 }

1417

1418 /*-----------------------------------------------------------------------*/

1419

1420 /**

1421 * decode_residue: Decode the residue vectors for the current frame.

1422 *

1423 * [Parameters]

1424 * handle: Stream handle.

1425 * residue_index: Index of residue configuration to use.

1426 * n: Length of residue vectors (half of window size).

1427 * ch: Number of channels of residue data to decode.

1428 * residue_buffers: Pointers to the buffers for each channel in which

1429 * the residue vectors should be stored. A pointer of NULL

1430 * indicates that the given channel should not be decoded (the

1431 * "do not decode" state referred to by the specification).

1432 */

1433 static void decode_residue(stb_vorbis *handle, int residue_index, int n,

1434 int ch, float *residue_buffers[])

1435 {

1436 Residue *res = &handle->residue_config[residue_index];

1437 const int type = handle->residue_types[residue_index];

1438

1439 (18/18) for (int i = 0; i < ch; i++) {

1440 (18/18) if (residue_buffers[i]) {

1441 memset(residue_buffers[i], 0, sizeof(*residue_buffers[i]) * n);

1442 }

1443 }

1444

1445 /* For residue type 2, if there are multiple channels, we need to

1446 * deinterleave the data after decoding it. */

1447 (36/36) if (type == 2 && ch > 1) {

1448 (36/36) if (ch == 2 && !handle->divides_in_residue

1449 (27/36) && residue_buffers[0] && residue_buffers[1]) {

1450 /* We have slightly optimized handling for this case. */

1451 decode_residue_common(

1452 handle, res, type, n, ch, residue_buffers,

1453 decode_residue_partition_2_2ch);

1454 } else {

1455 decode_residue_common(

1456 handle, res, type, n, ch, residue_buffers,

1457 decode_residue_partition_2);

1458 }

1459 } else {

1460 (18/18) decode_residue_common(

1461 handle, res, type, n, ch, residue_buffers,

1462 type==0 ? decode_residue_partition_0 : decode_residue_partition_1);

1463 }

1464 }

1465

1466 /*************************************************************************/

1467 /************************ Inverse MDCT processing ************************/

1468 /*************************************************************************/

1469

1470 /**

1471 * imdct_setup_step1: Setup and step 1 of the IMDCT.

1472 *

1473 * [Parameters]

1474 * n: Window size.

1475 * A: Twiddle factor A.

1476 * Y: Input buffer (length n/2).

1477 * v: Output buffer (length n/2). Must be distinct from the input buffer.

1478 */

1479 static void imdct_setup_step1(const unsigned int n, const float *A,

1480 const float *Y, float *v)

1481 {

1482 #if 0 // Roughly literal implementation.

1483

1484 for (unsigned int i = 0; i < n/4; i += 2) {

1485 v[(n/2)-i-1] = Y[i*2]*A[i+0] - Y[(i+1)*2]*A[i+1];

1486 v[(n/2)-i-2] = Y[i*2]*A[i+1] + Y[(i+1)*2]*A[i+0];

1487 }

1488 for (unsigned int i = 0; i < n/4; i += 2) {

1489 v[(n/4)-i-1] =

1490 -Y[(n/2-1)-i*2]*A[(n/4)+i+0] - -Y[(n/2-1)-(i+1)*2]*A[(n/4)+i+1];

1491 v[(n/4)-i-2] =

1492 -Y[(n/2-1)-i*2]*A[(n/4)+i+1] + -Y[(n/2-1)-(i+1)*2]*A[(n/4)+i+0];

1493 }

1494

1495 #else // Optimized implementations.

1496

1497 #if defined(ENABLE_ASM_ARM_NEON)

1498 const int step = 4;

1499 const uint32x4_t sign_1010 = (uint32x4_t)vdupq_n_u64(UINT64_C(1)<<63);

1500 #elif defined(ENABLE_ASM_X86_AVX2)

1501 const int step = 8;

1502 const __m256i sign_1010 = _mm256_set1_epi64x(UINT64_C(1)<<63);

1503 const __m256i sign_1111 = _mm256_set1_epi32(UINT32_C(1)<<31);

1504 #elif defined(ENABLE_ASM_X86_SSE2)

1505 const int step = 4;

1506 const __m128i sign_1010 = _mm_set1_epi64x(UINT64_C(1)<<63);

1507 const __m128i sign_1111 = _mm_set1_epi32(UINT32_C(1)<<31);

1508 #else

1509 const int step = 4;

1510 #endif

1511 ASSERT((n/4) % step == 0);

1512

1513 v += n/2;

1514 (18/18) for (int i = 0, j = -step; i < (int)(n/4); i += step, j -= step) {

1515 #if defined(ENABLE_ASM_ARM_NEON)

1516 const float32x4x2_t Y_all = vld2q_f32(&Y[i*2]);

1517 const float32x4_t Y_i2 = vswizzleq_zzxx_f32(Y_all.val[0]);

1518 const float32x4_t Y_i3 = vswizzleq_wwyy_f32(Y_all.val[0]);

1519 const float32x4_t A_i = vld1q_f32(&A[i]);

1520 const float32x4_t A_i3 = vswizzleq_wzyx_f32(A_i);

1521 const float32x4_t A_i2 = vswizzleq_zwxy_f32(A_i);

1522 vst1q_f32(&v[j], vaddq_f32(vmulq_f32(Y_i2, A_i3),

1523 veorq_f32(sign_1010,

1524 vmulq_f32(Y_i3, A_i2))));

1525 #elif defined(ENABLE_ASM_X86_AVX2)

1526 /* AVX2 doesn't include an instruction allowing us to mix two

1527 * registers while also shuffling values between 128-bit lanes

1528 * of each register, so we need a couple of extra temporaries to

1529 * get to our desired element order. */

1530 const __m256 Y_lo = _mm256_moveldup_ps(_mm256_load_ps(&Y[(i+0)*2]));

1531 const __m256 Y_hi = _mm256_moveldup_ps(_mm256_load_ps(&Y[(i+4)*2]));

1532 const __m256 tY_lo = _mm256_permute4x64_ps(Y_lo, _MM_SHUFFLE(0,2,1,3));

1533 const __m256 tY_hi = _mm256_permute4x64_ps(Y_hi, _MM_SHUFFLE(0,2,1,3));

1534 const __m256 Y_i2 = _mm256_permute2f128_ps(tY_lo, tY_hi, 0x13);

1535 const __m256 Y_i3 = _mm256_permute2f128_ps(tY_lo, tY_hi, 0x02);

1536 /* Again, we need an extra operation to swap between 128-bit lanes.

1537 * The use of permute4x64 here should help the compiler optimize to

1538 * a single vpermpd on a memory operand. */

1539 const __m256 A_i = _mm256_permute4x64_ps(_mm256_load_ps(&A[i]),

1540 _MM_SHUFFLE(1,0,3,2));

1541 const __m256 A_i3 = _mm256_permute_ps(A_i, _MM_SHUFFLE(0,1,2,3));

1542 const __m256 A_i2 = _mm256_permute_ps(A_i, _MM_SHUFFLE(1,0,3,2));

1543 _mm256_store_ps(&v[j], _mm256_add_ps(

1544 _mm256_mul_ps(Y_i2, A_i3),

1545 _mm256_xor_sign(sign_1010,

1546 _mm256_mul_ps(Y_i3, A_i2))));

1547 #elif defined(ENABLE_ASM_X86_SSE2)

1548 const __m128 Y_lo = _mm_load_ps(&Y[(i+0)*2]);

1549 const __m128 Y_hi = _mm_load_ps(&Y[(i+2)*2]);

1550 const __m128 Y_i2 = _mm_shuffle_ps(Y_hi, Y_lo, _MM_SHUFFLE(0,0,0,0));

1551 const __m128 Y_i3 = _mm_shuffle_ps(Y_hi, Y_lo, _MM_SHUFFLE(2,2,2,2));

1552 const __m128 A_i = _mm_load_ps(&A[i]);

1553 const __m128 A_i3 = _mm_shuffle_ps(A_i, A_i, _MM_SHUFFLE(0,1,2,3));

1554 const __m128 A_i2 = _mm_shuffle_ps(A_i, A_i, _MM_SHUFFLE(1,0,3,2));

1555 _mm_store_ps(&v[j], _mm_add_ps(_mm_mul_ps(Y_i2, A_i3),

1556 _mm_xor_sign(sign_1010,

1557 _mm_mul_ps(Y_i3, A_i2))));

1558 #else

1559 v[j+3] = Y[(i+0)*2]*A[i+0] - Y[(i+1)*2]*A[i+1];

1560 v[j+2] = Y[(i+0)*2]*A[i+1] + Y[(i+1)*2]*A[i+0];

1561 v[j+1] = Y[(i+2)*2]*A[i+2] - Y[(i+3)*2]*A[i+3];

1562 v[j+0] = Y[(i+2)*2]*A[i+3] + Y[(i+3)*2]*A[i+2];

1563 #endif

1564 }

1565

1566 Y += n/2;

1567 A += n/4;

1568 v -= n/4;

1569 (18/18) for (int i = 0, j = -step; i < (int)(n/4); i += step, j -= step) {

1570 #if defined(ENABLE_ASM_ARM_NEON)

1571 const float32x4x2_t Y_all = vld2q_f32(&Y[j*2]);

1572 const float32x4_t Y_j1 = vnegq_f32(vswizzleq_yyww_f32(Y_all.val[1]));

1573 const float32x4_t Y_j0 = vnegq_f32(vswizzleq_xxzz_f32(Y_all.val[1]));

1574 const float32x4_t A_i = vld1q_f32(&A[i]);

1575 const float32x4_t A_i3 = vswizzleq_wzyx_f32(A_i);

1576 const float32x4_t A_i2 = vswizzleq_zwxy_f32(A_i);

1577 vst1q_f32(&v[j], vaddq_f32(vmulq_f32(Y_j1, A_i3),

1578 veorq_f32(sign_1010,

1579 vmulq_f32(Y_j0, A_i2))));

1580 #elif defined(ENABLE_ASM_X86_AVX2)

1581 const __m256 Y_lo =

1582 _mm256_xor_sign(sign_1111,

1583 _mm256_movehdup_ps(_mm256_load_ps(&Y[(j+0)*2])));

1584 const __m256 Y_hi =

1585 _mm256_xor_sign(sign_1111,

1586 _mm256_movehdup_ps(_mm256_load_ps(&Y[(j+4)*2])));

1587 const __m256 tY_lo = _mm256_permute4x64_ps(Y_lo, _MM_SHUFFLE(2,0,3,1));

1588 const __m256 tY_hi = _mm256_permute4x64_ps(Y_hi, _MM_SHUFFLE(2,0,3,1));

1589 const __m256 Y_j1 = _mm256_permute2f128_ps(tY_lo, tY_hi, 0x20);

1590 const __m256 Y_j0 = _mm256_permute2f128_ps(tY_lo, tY_hi, 0x31);

1591 const __m256 A_i = _mm256_permute4x64_ps(_mm256_load_ps(&A[i]),

1592 _MM_SHUFFLE(1,0,3,2));

1593 const __m256 A_i3 = _mm256_permute_ps(A_i, _MM_SHUFFLE(0,1,2,3));

1594 const __m256 A_i2 = _mm256_permute_ps(A_i, _MM_SHUFFLE(1,0,3,2));

1595 _mm256_store_ps(&v[j], _mm256_add_ps(

1596 _mm256_mul_ps(Y_j1, A_i3),

1597 _mm256_xor_sign(sign_1010,

1598 _mm256_mul_ps(Y_j0, A_i2))));

1599 #elif defined(ENABLE_ASM_X86_SSE2)

1600 const __m128 Y_lo = _mm_xor_sign(sign_1111, _mm_load_ps(&Y[(j+0)*2]));

1601 const __m128 Y_hi = _mm_xor_sign(sign_1111, _mm_load_ps(&Y[(j+2)*2]));

1602 const __m128 Y_j1 = _mm_shuffle_ps(Y_lo, Y_hi, _MM_SHUFFLE(3,3,3,3));

1603 const __m128 Y_j0 = _mm_shuffle_ps(Y_lo, Y_hi, _MM_SHUFFLE(1,1,1,1));

1604 const __m128 A_i = _mm_load_ps(&A[i]);

1605 const __m128 A_i3 = _mm_shuffle_ps(A_i, A_i, _MM_SHUFFLE(0,1,2,3));

1606 const __m128 A_i2 = _mm_shuffle_ps(A_i, A_i, _MM_SHUFFLE(1,0,3,2));

1607 _mm_store_ps(&v[j], _mm_add_ps(_mm_mul_ps(Y_j1, A_i3),

1608 _mm_xor_sign(sign_1010,

1609 _mm_mul_ps(Y_j0, A_i2))));

1610 #else

1611 v[j+3] = -Y[(j+3)*2+1]*A[i+0] - -Y[(j+2)*2+1]*A[i+1];

1612 v[j+2] = -Y[(j+3)*2+1]*A[i+1] + -Y[(j+2)*2+1]*A[i+0];

1613 v[j+1] = -Y[(j+1)*2+1]*A[i+2] - -Y[(j+0)*2+1]*A[i+3];

1614 v[j+0] = -Y[(j+1)*2+1]*A[i+3] + -Y[(j+0)*2+1]*A[i+2];

1615 #endif

1616 }

1617

1618 #endif // Literal vs. optimized implementation.

1619 }

1620

1621 /*-----------------------------------------------------------------------*/

1622

1623 /**

1624 * imdct_step2: Step 2 of the IMDCT.

1625 *

1626 * [Parameters]

1627 * n: Window size.

1628 * A: Twiddle factor A.

1629 * v: Input buffer (length n/2).

1630 * w: Output buffer (length n/2). May be the same as the input buffer.

1631 */

1632 static void imdct_step2(const unsigned int n, const float *A,

1633 const float *v, float *w)

1634 {

1635 const float *v0 = &v[(n/4)];

1636 const float *v1 = &v[0];

1637 float *w0 = &w[(n/4)];

1638 float *w1 = &w[0];

1639

1640 #if defined(ENABLE_ASM_ARM_NEON)

1641 const int step = 4;

1642 const uint32x4_t sign_1010 = (uint32x4_t)vdupq_n_u64(UINT64_C(1)<<63);

1643 #elif defined(ENABLE_ASM_X86_AVX2)

1644 const int step = 8;

1645 const __m256i sign_1010 = _mm256_set1_epi64x(UINT64_C(1)<<63);

1646 const __m256i permute = _mm256_set_epi32(1, 1, 3, 3, 0, 0, 2, 2);

1647 #elif defined(ENABLE_ASM_X86_SSE2)

1648 const int step = 4;

1649 const __m128i sign_1010 = _mm_set1_epi64x(UINT64_C(1)<<63);

1650 #else

1651 const int step = 4;

1652 #endif

1653 ASSERT((n/2) % (2*step) == 0);

1654

1655 (18/18) for (int i = 0, j = n/2 - 2*step; j >= 0; i += step, j -= 2*step) {

1656

1657 #if defined(ENABLE_ASM_ARM_NEON)

1658 const float32x4_t v0_i = vld1q_f32(&v0[i]);

1659 const float32x4_t v1_i = vld1q_f32(&v1[i]);

1660 const float32x4x2_t A_all = vld2q_f32(&A[j]);

1661 vst1q_f32(&w0[i], vaddq_f32(v0_i, v1_i));

1662 const float32x4_t diff = vsubq_f32(v0_i, v1_i);

1663 const float32x4_t diff2 = vswizzleq_yxwz_f32(diff);

1664 const uint32x4_t A_sel = {~0U, 0, ~0U, 0};

1665 const float32x4_t A_40 = vswizzleq_zzxx_f32(vbslq_f32(

1666 A_sel, A_all.val[0],

1667 (float32x4_t)vshlq_n_u64((uint64x2_t)A_all.val[0], 32)));

1668 const float32x4_t A_51 = vswizzleq_zzxx_f32(vbslq_f32(

1669 A_sel, A_all.val[1],

1670 (float32x4_t)vshlq_n_u64((uint64x2_t)A_all.val[1], 32)));

1671 vst1q_f32(&w1[i], vaddq_f32(vmulq_f32(diff, A_40),

1672 veorq_f32(sign_1010,

1673 vmulq_f32(diff2, A_51))));

1674

1675 #elif defined(ENABLE_ASM_X86_AVX2)

1676 const __m256 v0_i = _mm256_load_ps(&v0[i]);

1677 const __m256 v1_i = _mm256_load_ps(&v1[i]);

1678 const __m256 A_0 = _mm256_load_ps(&A[j+0]);

1679 const __m256 A_8 = _mm256_load_ps(&A[j+8]);

1680 _mm256_store_ps(&w0[i], _mm256_add_ps(v0_i, v1_i));

1681 const __m256 diff = _mm256_sub_ps(v0_i, v1_i);

1682 const __m256 diff2 = _mm256_permute_ps(diff, _MM_SHUFFLE(2,3,0,1));

1683 const __m256 A_lo = _mm256_permutevar_ps(

1684 _mm256_permute4x64_ps(A_0, _MM_SHUFFLE(2,0,2,0)), permute);

1685 const __m256 A_hi = _mm256_permutevar_ps(

1686 _mm256_permute4x64_ps(A_8, _MM_SHUFFLE(2,0,2,0)), permute);

1687 const __m256 A_40 = _mm256_permute2f128_ps(A_lo, A_hi, 0x02);

1688 const __m256 A_51 = _mm256_permute2f128_ps(A_lo, A_hi, 0x13);

1689 _mm256_store_ps(&w1[i], _mm256_add_ps(

1690 _mm256_mul_ps(diff, A_40),

1691 _mm256_xor_sign(sign_1010,

1692 _mm256_mul_ps(diff2, A_51))));

1693

1694 #elif defined(ENABLE_ASM_X86_SSE2)

1695 const __m128 v0_i = _mm_load_ps(&v0[i]);

1696 const __m128 v1_i = _mm_load_ps(&v1[i]);

1697 const __m128 A_0 = _mm_load_ps(&A[j+0]);

1698 const __m128 A_4 = _mm_load_ps(&A[j+4]);

1699 _mm_store_ps(&w0[i], _mm_add_ps(v0_i, v1_i));

1700 const __m128 diff = _mm_sub_ps(v0_i, v1_i);

1701 const __m128 diff2 = _mm_shuffle_ps(diff, diff, _MM_SHUFFLE(2,3,0,1));

1702 const __m128 A_40 = _mm_shuffle_ps(A_4, A_0, _MM_SHUFFLE(0,0,0,0));

1703 const __m128 A_51 = _mm_shuffle_ps(A_4, A_0, _MM_SHUFFLE(1,1,1,1));

1704 _mm_store_ps(&w1[i], _mm_add_ps(_mm_mul_ps(diff, A_40),

1705 _mm_xor_sign(sign_1010,

1706 _mm_mul_ps(diff2, A_51))));

1707

1708 #else

1709 float v40_20, v41_21;

1710

1711 v41_21 = v0[i+1] - v1[i+1];

1712 v40_20 = v0[i+0] - v1[i+0];

1713 w0[i+1] = v0[i+1] + v1[i+1];

1714 w0[i+0] = v0[i+0] + v1[i+0];

1715 w1[i+1] = v41_21*A[j+4] - v40_20*A[j+5];

1716 w1[i+0] = v40_20*A[j+4] + v41_21*A[j+5];

1717

1718 v41_21 = v0[i+3] - v1[i+3];

1719 v40_20 = v0[i+2] - v1[i+2];

1720 w0[i+3] = v0[i+3] + v1[i+3];

1721 w0[i+2] = v0[i+2] + v1[i+2];

1722 w1[i+3] = v41_21*A[j+0] - v40_20*A[j+1];

1723 w1[i+2] = v40_20*A[j+0] + v41_21*A[j+1];

1724

1725 #endif // ENABLE_ASM_*

1726

1727 }

1728 }

1729

1730 /*-----------------------------------------------------------------------*/

1731

1732 /**

1733 * imdct_step3_inner_r_loop: Step 3 of the IMDCT for a single iteration

1734 * of the l and s loops.

1735 *

1736 * [Parameters]

1737 * lim: Iteration limit for the r loop.

1738 * A: Twiddle factor A.

1739 * e: Input/output buffer (length n/2, modified in place).

1740 * i_off: Initial offset, calculated as (n/2 - k0*s).

1741 * k0: Constant k0.

1742 * k1: Constant k1.

1743 */

1744 static void imdct_step3_inner_r_loop(const unsigned int lim, const float *A,

1745 float *e, int i_off, int k0, int k1)

1746 {

1747 float *e0 = e + i_off;

1748 float *e2 = e0 - k0/2;

1749

1750 #if defined(ENABLE_ASM_ARM_NEON)

1751 const uint32x4_t sign_1010 = (uint32x4_t)vdupq_n_u64(UINT64_C(1)<<63);

1752 #elif defined(ENABLE_ASM_X86_AVX2)

1753 const __m256i sign_1010 = _mm256_set1_epi64x(UINT64_C(1)<<63);

1754 #elif defined(ENABLE_ASM_X86_SSE2)

1755 const __m128i sign_1010 = _mm_set1_epi64x(UINT64_C(1)<<63);

1756 #endif

1757

1758 (18/18) for (int i = lim/4; i > 0; --i, e0 -= 8, e2 -= 8) {

1759

1760 #if defined(ENABLE_ASM_ARM_NEON)

1761 const float32x4_t e0_4 = vld1q_f32(&e0[-4]);

1762 const float32x4_t e2_4 = vld1q_f32(&e2[-4]);

1763 const float32x4_t e0_8 = vld1q_f32(&e0[-8]);

1764 const float32x4_t e2_8 = vld1q_f32(&e2[-8]);

1765 const float32x4x2_t A_0k =

1766 vuzpq_f32(vld1q_f32(&A[0]), vld1q_f32(&A[k1]));

1767 const float32x4x2_t A_2k =

1768 vuzpq_f32(vld1q_f32(&A[2*k1]), vld1q_f32(&A[3*k1]));

1769 vst1q_f32(&e0[-4], vaddq_f32(e0_4, e2_4));

1770 vst1q_f32(&e0[-8], vaddq_f32(e0_8, e2_8));

1771 const float32x4_t diff_4 = vsubq_f32(e0_4, e2_4);

1772 const float32x4_t diff_8 = vsubq_f32(e0_8, e2_8);

1773 const float32x4_t diff2_4 = vswizzleq_yxwz_f32(diff_4);

1774 const float32x4_t diff2_8 = vswizzleq_yxwz_f32(diff_8);

1775 const uint32x4_t A_sel = {~0U, 0, ~0U, 0};

1776 const float32x4_t A_0 = vswizzleq_zzxx_f32(vbslq_f32(

1777 A_sel, A_0k.val[0],

1778 (float32x4_t)vshlq_n_u64((uint64x2_t)A_0k.val[0], 32)));

1779 const float32x4_t A_1 = vswizzleq_zzxx_f32(vbslq_f32(

1780 A_sel, A_0k.val[1],

1781 (float32x4_t)vshlq_n_u64((uint64x2_t)A_0k.val[1], 32)));

1782 const float32x4_t A_2 = vswizzleq_zzxx_f32(vbslq_f32(

1783 A_sel, A_2k.val[0],

1784 (float32x4_t)vshlq_n_u64((uint64x2_t)A_2k.val[0], 32)));

1785 const float32x4_t A_3 = vswizzleq_zzxx_f32(vbslq_f32(

1786 A_sel, A_2k.val[1],

1787 (float32x4_t)vshlq_n_u64((uint64x2_t)A_2k.val[1], 32)));

1788 vst1q_f32(&e2[-4], vaddq_f32(vmulq_f32(diff_4, A_0),

1789 veorq_f32(sign_1010,

1790 vmulq_f32(diff2_4, A_1))));

1791 vst1q_f32(&e2[-8], vaddq_f32(vmulq_f32(diff_8, A_2),

1792 veorq_f32(sign_1010,

1793 vmulq_f32(diff2_8, A_3))));

1794 A += 4*k1;

1795

1796 #elif defined(ENABLE_ASM_X86_AVX2)

1797 const __m256 e0_8 = _mm256_load_ps(&e0[-8]);

1798 const __m256 e2_8 = _mm256_load_ps(&e2[-8]);

1799 const __m128 A_0k = _mm_load_ps(A);

1800 const __m128 A_1k = _mm_load_ps(&A[k1]);

1801 const __m128 A_2k = _mm_load_ps(&A[2*k1]);

1802 const __m128 A_3k = _mm_load_ps(&A[3*k1]);

1803 _mm256_store_ps(&e0[-8], _mm256_add_ps(e0_8, e2_8));

1804 const __m256 diff = _mm256_sub_ps(e0_8, e2_8);

1805 const __m256 diff2 = _mm256_permute_ps(diff, _MM_SHUFFLE(2,3,0,1));

1806 const __m128 A_0 = _mm_shuffle_ps(A_1k, A_0k, _MM_SHUFFLE(0,0,0,0));

1807 const __m128 A_1 = _mm_shuffle_ps(A_1k, A_0k, _MM_SHUFFLE(1,1,1,1));

1808 const __m128 A_2 = _mm_shuffle_ps(A_3k, A_2k, _MM_SHUFFLE(0,0,0,0));

1809 const __m128 A_3 = _mm_shuffle_ps(A_3k, A_2k, _MM_SHUFFLE(1,1,1,1));

1810 const __m256 A_20 = _mm256_set_m128(A_0, A_2);

1811 const __m256 A_31 = _mm256_set_m128(A_1, A_3);

1812 _mm256_store_ps(&e2[-8], _mm256_add_ps(

1813 _mm256_mul_ps(diff, A_20),

1814 _mm256_xor_sign(sign_1010,

1815 _mm256_mul_ps(diff2, A_31))));

1816 A += 4*k1;

1817

1818 #elif defined(ENABLE_ASM_X86_SSE2)

1819 const __m128 e0_4 = _mm_load_ps(&e0[-4]);

1820 const __m128 e2_4 = _mm_load_ps(&e2[-4]);

1821 const __m128 e0_8 = _mm_load_ps(&e0[-8]);

1822 const __m128 e2_8 = _mm_load_ps(&e2[-8]);

1823 const __m128 A_0k = _mm_load_ps(A);

1824 const __m128 A_1k = _mm_load_ps(&A[k1]);

1825 const __m128 A_2k = _mm_load_ps(&A[2*k1]);

1826 const __m128 A_3k = _mm_load_ps(&A[3*k1]);

1827 _mm_store_ps(&e0[-4], _mm_add_ps(e0_4, e2_4));

1828 _mm_store_ps(&e0[-8], _mm_add_ps(e0_8, e2_8));

1829 const __m128 diff_4 = _mm_sub_ps(e0_4, e2_4);

1830 const __m128 diff_8 = _mm_sub_ps(e0_8, e2_8);

1831 const __m128 diff2_4 =

1832 _mm_shuffle_ps(diff_4, diff_4, _MM_SHUFFLE(2,3,0,1));

1833 const __m128 diff2_8 =

1834 _mm_shuffle_ps(diff_8, diff_8, _MM_SHUFFLE(2,3,0,1));

1835 const __m128 A_0 = _mm_shuffle_ps(A_1k, A_0k, _MM_SHUFFLE(0,0,0,0));

1836 const __m128 A_1 = _mm_shuffle_ps(A_1k, A_0k, _MM_SHUFFLE(1,1,1,1));

1837 const __m128 A_2 = _mm_shuffle_ps(A_3k, A_2k, _MM_SHUFFLE(0,0,0,0));

1838 const __m128 A_3 = _mm_shuffle_ps(A_3k, A_2k, _MM_SHUFFLE(1,1,1,1));

1839 _mm_store_ps(&e2[-4], _mm_add_ps(_mm_mul_ps(diff_4, A_0),

1840 _mm_xor_sign(sign_1010,

1841 _mm_mul_ps(diff2_4, A_1))));

1842 _mm_store_ps(&e2[-8], _mm_add_ps(_mm_mul_ps(diff_8, A_2),

1843 _mm_xor_sign(sign_1010,

1844 _mm_mul_ps(diff2_8, A_3))));

1845 A += 4*k1;

1846

1847 #else

1848 float k00_20, k01_21;

1849

1850 k00_20 = e0[-1] - e2[-1];

1851 k01_21 = e0[-2] - e2[-2];

1852 e0[-1] = e0[-1] + e2[-1];

1853 e0[-2] = e0[-2] + e2[-2];

1854 e2[-1] = k00_20 * A[0] - k01_21 * A[1];

1855 e2[-2] = k01_21 * A[0] + k00_20 * A[1];

1856 A += k1;

1857

1858 k00_20 = e0[-3] - e2[-3];

1859 k01_21 = e0[-4] - e2[-4];

1860 e0[-3] = e0[-3] + e2[-3];

1861 e0[-4] = e0[-4] + e2[-4];

1862 e2[-3] = k00_20 * A[0] - k01_21 * A[1];

1863 e2[-4] = k01_21 * A[0] + k00_20 * A[1];

1864 A += k1;

1865

1866 k00_20 = e0[-5] - e2[-5];

1867 k01_21 = e0[-6] - e2[-6];

1868 e0[-5] = e0[-5] + e2[-5];

1869 e0[-6] = e0[-6] + e2[-6];

1870 e2[-5] = k00_20 * A[0] - k01_21 * A[1];

1871 e2[-6] = k01_21 * A[0] + k00_20 * A[1];

1872 A += k1;

1873

1874 k00_20 = e0[-7] - e2[-7];

1875 k01_21 = e0[-8] - e2[-8];

1876 e0[-7] = e0[-7] + e2[-7];

1877 e0[-8] = e0[-8] + e2[-8];

1878 e2[-7] = k00_20 * A[0] - k01_21 * A[1];

1879 e2[-8] = k01_21 * A[0] + k00_20 * A[1];

1880 A += k1;

1881

1882 #endif // ENABLE_ASM_*

1883

1884 }

1885 }

1886

1887 /*-----------------------------------------------------------------------*/

1888

1889 /**

1890 * imdct_step3_inner_s_loop: Step 3 of the IMDCT for a single iteration

1891 * of the l and r loops.

1892 *

1893 * [Parameters]

1894 * lim: Iteration limit for the s loop.

1895 * A: Twiddle factor A.

1896 * e: Input/output buffer (length n/2, modified in place).

1897 * i_off: Initial offset, calculated as (n/2 - k0*s).

1898 * k0: Constant k0.

1899 * k1: Constant k1.

1900 */

1901 static void imdct_step3_inner_s_loop(const unsigned int lim, const float *A,

1902 float *e, int i_off, int k0, int k1)

1903 {

1904 const float A0 = A[0];

1905 const float A1 = A[0+1];

1906 const float A2 = A[0+k1];

1907 const float A3 = A[0+k1+1];

1908 const float A4 = A[0+k1*2+0];

1909 const float A5 = A[0+k1*2+1];

1910 const float A6 = A[0+k1*3+0];

1911 const float A7 = A[0+k1*3+1];

1912

1913 float *e0 = e + i_off;

1914 float *e2 = e0 - k0/2;

1915

1916 (18/18) for (int i = lim; i > 0; i--, e0 -= k0, e2 -= k0) {

1917

1918 #if defined(ENABLE_ASM_ARM_NEON)

1919 const float32x4_t A_20 = {A2, A2, A0, A0};

1920 const float32x4_t A_31 = {A3, -A3, A1, -A1};

1921 const float32x4_t A_64 = {A6, A6, A4, A4};

1922 const float32x4_t A_75 = {A7, -A7, A5, -A5};

1923 const float32x4_t e0_4 = vld1q_f32(&e0[-4]);

1924 const float32x4_t e2_4 = vld1q_f32(&e2[-4]);

1925 const float32x4_t e0_8 = vld1q_f32(&e0[-8]);

1926 const float32x4_t e2_8 = vld1q_f32(&e2[-8]);

1927 vst1q_f32(&e0[-4], vaddq_f32(e0_4, e2_4));

1928 vst1q_f32(&e0[-8], vaddq_f32(e0_8, e2_8));

1929 const float32x4_t diff_4 = vsubq_f32(e0_4, e2_4);

1930 const float32x4_t diff_8 = vsubq_f32(e0_8, e2_8);

1931 const float32x4_t diff2_4 = vswizzleq_yxwz_f32(diff_4);

1932 const float32x4_t diff2_8 = vswizzleq_yxwz_f32(diff_8);

1933 vst1q_f32(&e2[-4], vaddq_f32(vmulq_f32(diff_4, A_20),

1934 vmulq_f32(diff2_4, A_31)));

1935 vst1q_f32(&e2[-8], vaddq_f32(vmulq_f32(diff_8, A_64),

1936 vmulq_f32(diff2_8, A_75)));

1937

1938 #elif defined(ENABLE_ASM_X86_AVX2)

1939 const __m256 A_6420 = _mm256_set_ps(A0, A0, A2, A2, A4, A4, A6, A6);

1940 const __m256 A_7531 = _mm256_set_ps(-A1, A1, -A3, A3, -A5, A5, -A7, A7);

1941 const __m256 e0_8 = _mm256_load_ps(&e0[-8]);

1942 const __m256 e2_8 = _mm256_load_ps(&e2[-8]);

1943 _mm256_store_ps(&e0[-8], _mm256_add_ps(e0_8, e2_8));

1944 const __m256 diff = _mm256_sub_ps(e0_8, e2_8);

1945 const __m256 diff2 = _mm256_permute_ps(diff, _MM_SHUFFLE(2,3,0,1));

1946 _mm256_store_ps(&e2[-8], _mm256_fmadd_ps(

1947 diff, A_6420,

1948 _mm256_mul_ps(diff2, A_7531)));

1949

1950 #elif defined(ENABLE_ASM_X86_SSE2)

1951 const __m128 A_20 = _mm_set_ps(A0, A0, A2, A2);

1952 const __m128 A_31 = _mm_set_ps(-A1, A1, -A3, A3);

1953 const __m128 A_64 = _mm_set_ps(A4, A4, A6, A6);

1954 const __m128 A_75 = _mm_set_ps(-A5, A5, -A7, A7);

1955 const __m128 e0_4 = _mm_load_ps(&e0[-4]);

1956 const __m128 e2_4 = _mm_load_ps(&e2[-4]);

1957 const __m128 e0_8 = _mm_load_ps(&e0[-8]);

1958 const __m128 e2_8 = _mm_load_ps(&e2[-8]);

1959 _mm_store_ps(&e0[-4], _mm_add_ps(e0_4, e2_4));

1960 _mm_store_ps(&e0[-8], _mm_add_ps(e0_8, e2_8));

1961 const __m128 diff_4 = _mm_sub_ps(e0_4, e2_4);

1962 const __m128 diff_8 = _mm_sub_ps(e0_8, e2_8);

1963 const __m128 diff2_4 =

1964 _mm_shuffle_ps(diff_4, diff_4, _MM_SHUFFLE(2,3,0,1));

1965 const __m128 diff2_8 =

1966 _mm_shuffle_ps(diff_8, diff_8, _MM_SHUFFLE(2,3,0,1));

1967 _mm_store_ps(&e2[-4], _mm_add_ps(_mm_mul_ps(diff_4, A_20),

1968 _mm_mul_ps(diff2_4, A_31)));

1969 _mm_store_ps(&e2[-8], _mm_add_ps(_mm_mul_ps(diff_8, A_64),

1970 _mm_mul_ps(diff2_8, A_75)));

1971

1972 #else

1973 float k00, k11;

1974

1975 k00 = e0[-1] - e2[-1];

1976 k11 = e0[-2] - e2[-2];

1977 e0[-1] = e0[-1] + e2[-1];

1978 e0[-2] = e0[-2] + e2[-2];

1979 e2[-1] = k00 * A0 - k11 * A1;

1980 e2[-2] = k11 * A0 + k00 * A1;

1981

1982 k00 = e0[-3] - e2[-3];

1983 k11 = e0[-4] - e2[-4];

1984 e0[-3] = e0[-3] + e2[-3];

1985 e0[-4] = e0[-4] + e2[-4];

1986 e2[-3] = k00 * A2 - k11 * A3;

1987 e2[-4] = k11 * A2 + k00 * A3;

1988

1989 k00 = e0[-5] - e2[-5];

1990 k11 = e0[-6] - e2[-6];

1991 e0[-5] = e0[-5] + e2[-5];

1992 e0[-6] = e0[-6] + e2[-6];

1993 e2[-5] = k00 * A4 - k11 * A5;

1994 e2[-6] = k11 * A4 + k00 * A5;

1995

1996 k00 = e0[-7] - e2[-7];

1997 k11 = e0[-8] - e2[-8];

1998 e0[-7] = e0[-7] + e2[-7];

1999 e0[-8] = e0[-8] + e2[-8];

2000 e2[-7] = k00 * A6 - k11 * A7;

2001 e2[-8] = k11 * A6 + k00 * A7;

2002

2003 #endif // ENABLE_ASM_*

2004

2005 }

2006 }

2007

2008 /*-----------------------------------------------------------------------*/

2009

2010 /**

2011 * iter_54: Step 3 of the IMDCT for l=ld(n)-5 and l=ld(n)-4 for a sequence

2012 * of 8 elements. Helper function for imdct_step3_inner_s_loop_ld654().

2013 *

2014 * [Parameters]

2015 * z: Pointer to elements to operate on.

2016 */

2017 static inline void iter_54(float *z)

2018 {

2019 #if defined(ENABLE_ASM_X86_SSE2) || defined(ENABLE_ASM_X86_AVX2)

2020 const __m128i sign_0011 =

2021 _mm_set_epi32(0, 0, UINT32_C(1)<<31, UINT32_C(1)<<31);

2022 const __m128i sign_0110 =

2023 _mm_set_epi32(0, UINT32_C(1)<<31, UINT32_C(1)<<31, 0);

2024 const __m128 z_4 = _mm_load_ps(&z[-4]);

2025 const __m128 z_8 = _mm_load_ps(&z[-8]);

2026 const __m128 sum = _mm_add_ps(z_4, z_8);

2027 const __m128 diff = _mm_sub_ps(z_4, z_8);

2028 const __m128 sum_23 = _mm_shuffle_ps(sum, sum, _MM_SHUFFLE(3,2,3,2));

2029 const __m128 sum_01 = _mm_shuffle_ps(sum, sum, _MM_SHUFFLE(1,0,1,0));

2030 const __m128 diff_23 = _mm_shuffle_ps(diff, diff, _MM_SHUFFLE(3,2,3,2));

2031 const __m128 diff_10 = _mm_shuffle_ps(diff, diff, _MM_SHUFFLE(0,1,0,1));

2032 _mm_store_ps(&z[-4], _mm_add_ps(sum_23, _mm_xor_sign(sign_0011, sum_01)));

2033 _mm_store_ps(&z[-8], _mm_add_ps(diff_23, _mm_xor_sign(sign_0110, diff_10)));

2034

2035 #else

2036 const float k00 = z[-1] - z[-5];

2037 const float k11 = z[-2] - z[-6];

2038 const float k22 = z[-3] - z[-7];

2039 const float k33 = z[-4] - z[-8];

2040 const float y0 = z[-1] + z[-5];

2041 const float y1 = z[-2] + z[-6];

2042 const float y2 = z[-3] + z[-7];

2043 const float y3 = z[-4] + z[-8];

2044

2045 z[-1] = y0 + y2; // z1 + z5 + z3 + z7

2046 z[-2] = y1 + y3; // z2 + z6 + z4 + z8

2047 z[-3] = y0 - y2; // z1 + z5 - z3 - z7

2048 z[-4] = y1 - y3; // z2 + z6 - z4 - z8

2049 z[-5] = k00 + k33; // z1 - z5 + z4 - z8

2050 z[-6] = k11 - k22; // z2 - z6 + z3 - z7

2051 z[-7] = k00 - k33; // z1 - z5 - z4 + z8

2052 z[-8] = k11 + k22; // z2 - z6 - z3 + z7

2053

2054 #endif // ENABLE_ASM_*

2055 }

2056

2057 /*-----------------------------------------------------------------------*/

2058

2059 /**

2060 * imdct_step3_inner_s_loop_ld654: Step 3 of the IMDCT for a single

2061 * iteration of the l and r loops at l=log2_n-{6,5,4}.

2062 *

2063 * [Parameters]

2064 * n: Window size.

2065 * A: Twiddle factor A.

2066 * e: Input/output buffer (length n/2, modified in place).

2067 */

2068 static void imdct_step3_inner_s_loop_ld654(

2069 const unsigned int n, const float *A, float *e)

2070 {

2071 const int a_off = n/8;

2072 const float A2 = A[a_off];

2073

2074 (18/18) for (float *z = e + n/2; z > e; z -= 16) {

2075

2076 #if defined(ENABLE_ASM_X86_SSE2) || defined(ENABLE_ASM_X86_AVX2)

2077 # if defined(ENABLE_ASM_X86_AVX2)

2078 const __m256 z_8 = _mm256_load_ps(&z[-8]);

2079 const __m256 z_16 = _mm256_load_ps(&z[-16]);

2080 _mm256_store_ps(&z[-8], _mm256_add_ps(z_8, z_16));

2081 const __m256 diff = _mm256_sub_ps(z_8, z_16);

2082 const __m128 diff_8 = _mm256_castps256_ps128(diff);

2083 const __m128 diff_4 = _mm256_extractf128_ps(diff, 1);

2084 # else

2085 const __m128 z_4 = _mm_load_ps(&z[-4]);

2086 const __m128 z_8 = _mm_load_ps(&z[-8]);

2087 const __m128 z_12 = _mm_load_ps(&z[-12]);

2088 const __m128 z_16 = _mm_load_ps(&z[-16]);

2089 _mm_store_ps(&z[-4], _mm_add_ps(z_4, z_12));

2090 _mm_store_ps(&z[-8], _mm_add_ps(z_8, z_16));

2091 const __m128 diff_4 = _mm_sub_ps(z_4, z_12);

2092 const __m128 diff_8 = _mm_sub_ps(z_8, z_16);

2093 # endif

2094 /* We can't use the same algorithm as imdct_step3_inner_s_loop(),

2095 * since that can lead to a loss of precision for z[-11,-12,-15,-16]

2096 * if the two non-constant inputs to the calculation are of nearly

2097 * equal magnitude and of the appropriate signs to give an

2098 * intermediate sum or difference of much smaller magnitude:

2099 * changing the order of operations to multiply by A2 first can

2100 * introduce an error of at most 1 ULP at the point of the

2101 * multiplication, but if the final result has a smaller magnitude,

2102 * that error will be a greater part of the final result. */

2103 const __m128 temp_4 = _mm_xor_sign(

2104 _mm_set_epi32(0, 0, 0, UINT32_C(1)<<31),

2105 _mm_shuffle_ps(diff_4, _mm_set1_ps(0), _MM_SHUFFLE(3,2,0,1)));

2106 _mm_store_ps(&z[-12], _mm_mul_ps(_mm_add_ps(diff_4, temp_4),

2107 _mm_set_ps(1, 1, A2, A2)));

2108 const __m128 temp1_8 = _mm_xor_sign(

2109 _mm_set_epi32(0, UINT32_C(1)<<31, 0, UINT32_C(1)<<31),

2110 _mm_shuffle_ps(diff_8, diff_8, _MM_SHUFFLE(2,3,0,1)));

2111 const __m128 temp2_8 =

2112 _mm_shuffle_ps(diff_8, _mm_set1_ps(0), _MM_SHUFFLE(3,2,1,0));

2113 _mm_store_ps(&z[-16], _mm_mul_ps(_mm_sub_ps(temp1_8, temp2_8),

2114 _mm_set_ps(1, 1, A2, A2)));

2115

2116 #else

2117 float k00, k11;

2118

2119 k00 = z[ -1] - z[ -9];

2120 k11 = z[ -2] - z[-10];

2121 z[ -1] = z[ -1] + z[ -9];

2122 z[ -2] = z[ -2] + z[-10];

2123 z[ -9] = k00; // k00*1 - k11*0

2124 z[-10] = k11; // k11*1 + k00*0

2125

2126 k00 = z[ -3] - z[-11];

2127 k11 = z[ -4] - z[-12];

2128 z[ -3] = z[ -3] + z[-11];

2129 z[ -4] = z[ -4] + z[-12];

2130 z[-11] = (k00+k11) * A2; // k00*A2 - k11*-A2 (but see asm note above)

2131 z[-12] = (k11-k00) * A2; // k11*A2 + k00*-A2

2132

2133 k00 = z[- 5] - z[-13];

2134 k11 = z[ -6] - z[-14];

2135 z[ -5] = z[ -5] + z[-13];

2136 z[ -6] = z[ -6] + z[-14];

2137 z[-13] = k11; // k00*0 - k11*-1

2138 z[-14] = -k00; // k11*0 + k00*-1

2139

2140 k00 = z[- 7] - z[-15];

2141 k11 = z[ -8] - z[-16];

2142 z[ -7] = z[ -7] + z[-15];

2143 z[ -8] = z[ -8] + z[-16];

2144 z[-15] = (k11-k00) * A2; // k00*-A2 - k11*-A2

2145 z[-16] = -(k00+k11) * A2; // k11*-A2 + k00*-A2

2146

2147 #endif // ENABLE_ASM_*

2148

2149 iter_54(z);

2150 iter_54(z-8);

2151 }

2152 }

2153

2154 /*-----------------------------------------------------------------------*/

2155

2156 /**

2157 * imdct_step456: Steps 4, 5, and 6 of the IMDCT.

2158 *

2159 * [Parameters]

2160 * n: Window size.

2161 * bitrev: Bit-reverse lookup table.

2162 * u: Input buffer (length n/2).

2163 * U: Output buffer (length n/2). Must be distinct from the input buffer.

2164 */

2165 static void imdct_step456(const unsigned int n, const uint16_t *bitrev,

2166 const float *u, float *U)

2167 {

2168 /* stb_vorbis note: "weirdly, I'd have thought reading sequentially and

2169 * writing erratically would have been better than vice-versa, but in fact

2170 * that's not what my testing showed. (That is, with j = bitreverse(i),

2171 * do you read i and write j, or read j and write i.)" */

2172

2173 float *U0 = &U[n/4];

2174 float *U1 = &U[n/2];

2175

2176 (18/18) for (int i = 0, j = -4; i < (int)(n/8); i += 2, j -= 4) {

2177

2178 #if defined(ENABLE_ASM_X86_SSE2)

2179 const __m128 bitrev_0 = _mm_load_ps(&u[bitrev[i+0]]);

2180 const __m128 bitrev_1 = _mm_load_ps(&u[bitrev[i+1]]);

2181 _mm_store_ps(&U0[j], _mm_shuffle_ps(

2182 bitrev_1, bitrev_0, _MM_SHUFFLE(2,3,2,3)));

2183 _mm_store_ps(&U1[j], _mm_shuffle_ps(

2184 bitrev_1, bitrev_0, _MM_SHUFFLE(0,1,0,1)));

2185

2186 #else

2187 int k4;

2188

2189 k4 = bitrev[i+0];

2190 U1[j+3] = u[k4+0];

2191 U1[j+2] = u[k4+1];

2192 U0[j+3] = u[k4+2];

2193 U0[j+2] = u[k4+3];

2194

2195 k4 = bitrev[i+1];

2196 U1[j+1] = u[k4+0];

2197 U1[j+0] = u[k4+1];

2198 U0[j+1] = u[k4+2];

2199 U0[j+0] = u[k4+3];

2200

2201 #endif // ENABLE_ASM_*

2202

2203 }

2204 }

2205

2206 /*-----------------------------------------------------------------------*/

2207

2208 /**

2209 * imdct_step7: Step 7 of the IMDCT.

2210 *

2211 * [Parameters]

2212 * n: Window size.

2213 * C: Twiddle factor C.

2214 * buffer: Input/output buffer (length n/2, modified in place).

2215 */

2216 static void imdct_step7(const unsigned int n, const float *C, float *buffer)

2217 {

2218 #if defined(ENABLE_ASM_ARM_NEON)

2219 const int step = 4;

2220 const uint32x4_t sign_1010 = (uint32x4_t)vdupq_n_u64(UINT64_C(1)<<63);

2221 #elif defined(ENABLE_ASM_X86_AVX2)

2222 const int step = 8;

2223 const __m256i sign_1010 = _mm256_set1_epi64x(UINT64_C(1)<<63);

2224 #elif defined(ENABLE_ASM_X86_SSE2)

2225 const int step = 4;

2226 const __m128i sign_1010 = _mm_set1_epi64x(UINT64_C(1)<<63);

2227 #else

2228 const int step = 4;

2229 #endif

2230 ASSERT((n/2) % (2*step) == 0);

2231

2232 (18/18) for (float *d = buffer, *e = buffer + (n/2) - step; d < e;

2233 C += step, d += step, e -= step)

2234 {

2235

2236 #if defined(ENABLE_ASM_ARM_NEON)

2237 const float32x4_t C_0 = vld1q_f32(C);

2238 const float32x4_t d_0 = vld1q_f32(d);

2239 const float32x4_t e_0 = vld1q_f32(e);

2240 const float32x4_t e_2 = veorq_f32(sign_1010, vswizzleq_zwxy_f32(e_0));

2241 const float32x4_t sub = vsubq_f32(d_0, e_2);

2242 const float32x4_t add = vaddq_f32(d_0, e_2);

2243 const float32x4_t C02 = veorq_f32(sign_1010, vswizzleq_xxzz_f32(C_0));

2244 const float32x4_t C13 = vswizzleq_yyww_f32(C_0);

2245 const float32x4_t mix = vaddq_f32(

2246 vmulq_f32(C13, sub), vmulq_f32(C02, vswizzleq_yxwz_f32(sub)));

2247 const float32x4_t e_temp = vsubq_f32(add, mix);

2248 const float32x4_t e_out =

2249 veorq_f32(sign_1010, vswizzleq_zwxy_f32(e_temp));

2250 vst1q_f32(d, vaddq_f32(add, mix));

2251 vst1q_f32(e, e_out);

2252

2253 #elif defined(ENABLE_ASM_X86_AVX2)

2254 const __m256 C_0 = _mm256_load_ps(C);

2255 const __m256 d_0 = _mm256_load_ps(d);

2256 const __m256 e_0 = _mm256_load_ps(e);

2257 const __m256 e_4 = _mm256_permute4x64_ps(e_0, _MM_SHUFFLE(1,0,3,2));

2258 const __m256 e_6 = _mm256_xor_sign(

2259 sign_1010, _mm256_permute_ps(e_4, _MM_SHUFFLE(1,0,3,2)));

2260 const __m256 sub = _mm256_sub_ps(d_0, e_6);

2261 const __m256 add = _mm256_add_ps(d_0, e_6);

2262 const __m256 C02 = _mm256_xor_sign(

2263 sign_1010, _mm256_permute_ps(C_0, _MM_SHUFFLE(2,2,0,0)));

2264 const __m256 C13 = _mm256_permute_ps(C_0, _MM_SHUFFLE(3,3,1,1));

2265 const __m256 mix = _mm256_add_ps(

2266 _mm256_mul_ps(C13, sub),

2267 _mm256_mul_ps(C02, _mm256_permute_ps(sub, _MM_SHUFFLE(2,3,0,1))));

2268 const __m256 e_temp = _mm256_sub_ps(add, mix);

2269 const __m256 e_swap =

2270 _mm256_permute4x64_ps(e_temp, _MM_SHUFFLE(1,0,3,2));

2271 const __m256 e_out = _mm256_xor_sign(

2272 sign_1010, _mm256_permute_ps(e_swap, _MM_SHUFFLE(1,0,3,2)));

2273 _mm256_store_ps(d, _mm256_add_ps(add, mix));

2274 _mm256_store_ps(e, e_out);

2275

2276 #elif defined(ENABLE_ASM_X86_SSE2)

2277 const __m128 C_0 = _mm_load_ps(C);

2278 const __m128 d_0 = _mm_load_ps(d);

2279 const __m128 e_0 = _mm_load_ps(e);

2280 const __m128 e_2 = _mm_xor_sign(

2281 sign_1010, _mm_shuffle_ps(e_0, e_0, _MM_SHUFFLE(1,0,3,2)));

2282 const __m128 sub = _mm_sub_ps(d_0, e_2);

2283 const __m128 add = _mm_add_ps(d_0, e_2);

2284 const __m128 C02 = _mm_xor_sign(

2285 sign_1010, _mm_shuffle_ps(C_0, C_0, _MM_SHUFFLE(2,2,0,0)));

2286 const __m128 C13 = _mm_shuffle_ps(C_0, C_0, _MM_SHUFFLE(3,3,1,1));

2287 const __m128 mix = _mm_add_ps(

2288 _mm_mul_ps(C13, sub),

2289 _mm_mul_ps(C02, _mm_shuffle_ps(sub, sub, _MM_SHUFFLE(2,3,0,1))));

2290 const __m128 e_temp = _mm_sub_ps(add, mix);

2291 const __m128 e_out = _mm_xor_sign(

2292 sign_1010, _mm_shuffle_ps(e_temp, e_temp, _MM_SHUFFLE(1,0,3,2)));

2293 _mm_store_ps(d, _mm_add_ps(add, mix));

2294 _mm_store_ps(e, e_out);

2295

2296 #else

2297 float sub0 = d[0] - e[2];

2298 float sub1 = d[1] + e[3];

2299 float sub2 = d[2] - e[0];

2300 float sub3 = d[3] + e[1];

2301

2302 float mix0 = C[1]*sub0 + C[0]*sub1;

2303 float mix1 = C[1]*sub1 - C[0]*sub0;

2304 float mix2 = C[3]*sub2 + C[2]*sub3;

2305 float mix3 = C[3]*sub3 - C[2]*sub2;

2306

2307 float add0 = d[0] + e[2];

2308 float add1 = d[1] - e[3];

2309 float add2 = d[2] + e[0];

2310 float add3 = d[3] - e[1];

2311

2312 d[0] = add0 + mix0;

2313 d[1] = add1 + mix1;

2314 d[2] = add2 + mix2;

2315 d[3] = add3 + mix3;

2316

2317 e[0] = add2 - mix2;

2318 e[1] = -(add3 - mix3);

2319 e[2] = add0 - mix0;

2320 e[3] = -(add1 - mix1);

2321

2322 #endif // ENABLE_ASM_*

2323

2324 }

2325 }

2326

2327 /*-----------------------------------------------------------------------*/

2328

2329 /**

2330 * imdct_step8_decode: Step 8 and final decoding for the IMDCT.

2331 *

2332 * [Parameters]

2333 * n: Window size.

2334 * B: Twiddle factor B.

2335 * in: Input buffer (length n/2).

2336 * out: Output buffer (length n).

2337 */

2338 static void imdct_step8_decode(const unsigned int n, const float *B,

2339 const float *in, float *out)

2340 {

2341 /* stb_vorbis note: "this generates pairs of data a la 8 and pushes

2342 * them directly through the decode kernel (pushing rather than

2343 * pulling) to avoid having to make another pass later" */

2344

2345 B += (n/2) - 16;

2346 float *d0 = &out[0];

2347 float *d1 = &out[(n/2)-8];

2348 float *d2 = &out[(n/2)];

2349 float *d3 = &out[n-8];

2350 (18/18) for (const float *e = in + (n/2) - 16; e >= in;

2351 e -= 16, B -= 16, d0 += 8, d1 -= 8, d2 += 8, d3 -= 8)

2352 {

2353

2354 #if defined(ENABLE_ASM_ARM_NEON)

2355 const float32x4x2_t e_0 =

2356 vuzpq_f32(vld1q_f32(&e[0]), vld1q_f32(&e[4]));

2357 const float32x4x2_t e_8 =

2358 vuzpq_f32(vld1q_f32(&e[8]), vld1q_f32(&e[12]));

2359 const float32x4x2_t B_0 =

2360 vuzpq_f32(vld1q_f32(&B[0]), vld1q_f32(&B[4]));

2361 const float32x4x2_t B_8 =

2362 vuzpq_f32(vld1q_f32(&B[8]), vld1q_f32(&B[12]));

2363 const float32x4_t d1_0 =

2364 vsubq_f32(vmulq_f32(e_0.val[1], B_0.val[0]),

2365 vmulq_f32(e_0.val[0], B_0.val[1]));

2366 const float32x4_t d0_0 = vnegq_f32(vswizzleq_wzyx_f32(d1_0));

2367 const float32x4_t d3_0 =

2368 vnegq_f32(vaddq_f32(vmulq_f32(e_0.val[0], B_0.val[0]),

2369 vmulq_f32(e_0.val[1], B_0.val[1])));

2370 const float32x4_t d2_0 = vswizzleq_wzyx_f32(d3_0);

2371 const float32x4_t d1_8 =

2372 vsubq_f32(vmulq_f32(e_8.val[1], B_8.val[0]),

2373 vmulq_f32(e_8.val[0], B_8.val[1]));

2374 const float32x4_t d0_8 = vnegq_f32(vswizzleq_wzyx_f32(d1_8));

2375 const float32x4_t d3_8 =

2376 vnegq_f32(vaddq_f32(vmulq_f32(e_8.val[0], B_8.val[0]),

2377 vmulq_f32(e_8.val[1], B_8.val[1])));

2378 const float32x4_t d2_8 = vswizzleq_wzyx_f32(d3_8);

2379 vst1q_f32(d0, d0_8);

2380 vst1q_f32(d0+4, d0_0);

2381 vst1q_f32(d1+4, d1_8);

2382 vst1q_f32(d1, d1_0);

2383 vst1q_f32(d2, d2_8);

2384 vst1q_f32(d2+4, d2_0);

2385 vst1q_f32(d3+4, d3_8);

2386 vst1q_f32(d3, d3_0);

2387

2388 #elif defined(ENABLE_ASM_X86_AVX2)

2389 const __m256i sign_1111 = _mm256_set1_epi32(UINT32_C(1)<<31);

2390 const __m256i permute_reverse = _mm256_set_epi32(0,1,2,3,4,5,6,7);

2391 /* It's tempting to use the gather-load instructions here to load

2392 * even and odd indices, but that causes a massive slowdown, so

2393 * we just use normal loads and lots of permutes. */

2394 const __m256i permute_even_odd = _mm256_set_epi32(7,5,3,1,6,4,2,0);

2395 const __m256 e_0 = _mm256_load_ps(&e[0]);

2396 const __m256 e_8 = _mm256_load_ps(&e[8]);

2397 const __m256 B_0 = _mm256_load_ps(&B[0]);

2398 const __m256 B_8 = _mm256_load_ps(&B[8]);

2399 const __m256 e_0p = _mm256_permutevar8x32_ps(e_0, permute_even_odd);

2400 const __m256 e_8p = _mm256_permutevar8x32_ps(e_8, permute_even_odd);

2401 const __m256 B_0p = _mm256_permutevar8x32_ps(B_0, permute_even_odd);

2402 const __m256 B_8p = _mm256_permutevar8x32_ps(B_8, permute_even_odd);

2403 const __m256 e_even = _mm256_permute2f128_ps(e_0p, e_8p, 0x20);

2404 const __m256 e_odd = _mm256_permute2f128_ps(e_0p, e_8p, 0x31);

2405 const __m256 B_even = _mm256_permute2f128_ps(B_0p, B_8p, 0x20);

2406 const __m256 B_odd = _mm256_permute2f128_ps(B_0p, B_8p, 0x31);

2407 const __m256 d1_0 = _mm256_fmsub_ps(e_odd, B_even,

2408 _mm256_mul_ps(e_even, B_odd));

2409 const __m256 d0_0 = _mm256_xor_sign(

2410 sign_1111, _mm256_permutevar8x32_ps(d1_0, permute_reverse));

2411 const __m256 d3_0 = _mm256_xor_sign(

2412 sign_1111, _mm256_fmadd_ps(e_even, B_even,

2413 _mm256_mul_ps(e_odd, B_odd)));

2414 const __m256 d2_0 = _mm256_permutevar8x32_ps(d3_0, permute_reverse);

2415 _mm256_store_ps(d0, d0_0);

2416 _mm256_store_ps(d1, d1_0);

2417 _mm256_store_ps(d2, d2_0);

2418 _mm256_store_ps(d3, d3_0);

2419

2420 #elif defined(ENABLE_ASM_X86_SSE2)

2421 const __m128i sign_1111 = _mm_set1_epi32(UINT32_C(1)<<31);

2422 const __m128 e_0 = _mm_load_ps(&e[0]);

2423 const __m128 e_4 = _mm_load_ps(&e[4]);

2424 const __m128 B_0 = _mm_load_ps(&B[0]);

2425 const __m128 B_4 = _mm_load_ps(&B[4]);

2426 const __m128 e_8 = _mm_load_ps(&e[8]);

2427 const __m128 e_12 = _mm_load_ps(&e[12]);

2428 const __m128 B_8 = _mm_load_ps(&B[8]);

2429 const __m128 B_12 = _mm_load_ps(&B[12]);

2430 const __m128 e_even_0 = _mm_shuffle_ps(e_0, e_4, _MM_SHUFFLE(2,0,2,0));

2431 const __m128 e_odd_0 = _mm_shuffle_ps(e_0, e_4, _MM_SHUFFLE(3,1,3,1));

2432 const __m128 B_even_0 = _mm_shuffle_ps(B_0, B_4, _MM_SHUFFLE(2,0,2,0));

2433 const __m128 B_odd_0 = _mm_shuffle_ps(B_0, B_4, _MM_SHUFFLE(3,1,3,1));

2434 const __m128 e_even_8 = _mm_shuffle_ps(e_8, e_12, _MM_SHUFFLE(2,0,2,0));

2435 const __m128 e_odd_8 = _mm_shuffle_ps(e_8, e_12, _MM_SHUFFLE(3,1,3,1));

2436 const __m128 B_even_8 = _mm_shuffle_ps(B_8, B_12, _MM_SHUFFLE(2,0,2,0));

2437 const __m128 B_odd_8 = _mm_shuffle_ps(B_8, B_12, _MM_SHUFFLE(3,1,3,1));

2438 const __m128 d1_0 = _mm_sub_ps(_mm_mul_ps(e_odd_0, B_even_0),

2439 _mm_mul_ps(e_even_0, B_odd_0));

2440 const __m128 d0_0 = _mm_xor_sign(

2441 sign_1111, _mm_shuffle_ps(d1_0, d1_0, _MM_SHUFFLE(0,1,2,3)));

2442 const __m128 d3_0 = _mm_xor_sign(

2443 sign_1111, _mm_add_ps(_mm_mul_ps(e_even_0, B_even_0),

2444 _mm_mul_ps(e_odd_0, B_odd_0)));

2445 const __m128 d2_0 = _mm_shuffle_ps(d3_0, d3_0, _MM_SHUFFLE(0,1,2,3));

2446 const __m128 d1_8 = _mm_sub_ps(_mm_mul_ps(e_odd_8, B_even_8),

2447 _mm_mul_ps(e_even_8, B_odd_8));

2448 const __m128 d0_8 = _mm_xor_sign(

2449 sign_1111, _mm_shuffle_ps(d1_8, d1_8, _MM_SHUFFLE(0,1,2,3)));

2450 const __m128 d3_8 = _mm_xor_sign(

2451 sign_1111, _mm_add_ps(_mm_mul_ps(e_even_8, B_even_8),

2452 _mm_mul_ps(e_odd_8, B_odd_8)));

2453 const __m128 d2_8 = _mm_shuffle_ps(d3_8, d3_8, _MM_SHUFFLE(0,1,2,3));

2454 _mm_store_ps(d0, d0_8);

2455 _mm_store_ps(d0+4, d0_0);

2456 _mm_store_ps(d1+4, d1_8);

2457 _mm_store_ps(d1, d1_0);

2458 _mm_store_ps(d2, d2_8);

2459 _mm_store_ps(d2+4, d2_0);

2460 _mm_store_ps(d3+4, d3_8);

2461 _mm_store_ps(d3, d3_0);

2462

2463 #else

2464 float p0, p1, p2, p3;

2465

2466 p3 = e[14]*B[15] - e[15]*B[14];

2467 p2 = -e[14]*B[14] - e[15]*B[15];

2468 d0[0] = p3;

2469 d1[7] = - p3;

2470 d2[0] = p2;

2471 d3[7] = p2;

2472

2473 p1 = e[12]*B[13] - e[13]*B[12];

2474 p0 = -e[12]*B[12] - e[13]*B[13];

2475 d0[1] = p1;

2476 d1[6] = - p1;

2477 d2[1] = p0;

2478 d3[6] = p0;

2479

2480 p3 = e[10]*B[11] - e[11]*B[10];

2481 p2 = -e[10]*B[10] - e[11]*B[11];

2482 d0[2] = p3;

2483 d1[5] = - p3;

2484 d2[2] = p2;

2485 d3[5] = p2;

2486

2487 p1 = e[8]*B[9] - e[9]*B[8];

2488 p0 = -e[8]*B[8] - e[9]*B[9];

2489 d0[3] = p1;

2490 d1[4] = - p1;

2491 d2[3] = p0;

2492 d3[4] = p0;

2493

2494 p3 = e[6]*B[7] - e[7]*B[6];

2495 p2 = -e[6]*B[6] - e[7]*B[7];

2496 d0[4] = p3;

2497 d1[3] = - p3;

2498 d2[4] = p2;

2499 d3[3] = p2;

2500

2501 p1 = e[4]*B[5] - e[5]*B[4];

2502 p0 = -e[4]*B[4] - e[5]*B[5];

2503 d0[5] = p1;

2504 d1[2] = - p1;

2505 d2[5] = p0;

2506 d3[2] = p0;

2507

2508 p3 = e[2]*B[3] - e[3]*B[2];

2509 p2 = -e[2]*B[2] - e[3]*B[3];

2510 d0[6] = p3;

2511 d1[1] = - p3;

2512 d2[6] = p2;

2513 d3[1] = p2;

2514

2515 p1 = e[0]*B[1] - e[1]*B[0];

2516 p0 = -e[0]*B[0] - e[1]*B[1];

2517 d0[7] = p1;

2518 d1[0] = - p1;

2519 d2[7] = p0;

2520 d3[0] = p0;

2521

2522 #endif // ENABLE_ASM_*

2523

2524 }

2525 }

2526

2527 /*-----------------------------------------------------------------------*/

2528

2529 /**

2530 * inverse_mdct: Perform the inverse MDCT operation on the given buffer.

2531 * The algorithm is taken from "The use of multirate filter banks for

2532 * coding of high quality digital audio", Th. Sporer et. al. (1992), with

2533 * corrections for errors in that paper, and the step numbers listed in

2534 * comments refer to the algorithm steps as described in the paper.

2535 *

2536 * [Parameters]

2537 * handle: Stream handle.

2538 * buffer: Input/output buffer.

2539 * blocktype: 0 if the current frame is a short block, 1 if a long block.

2540 */

2541 static void inverse_mdct(stb_vorbis *handle, float *buffer, int blocktype)

2542 {

2543 const unsigned int n = handle->blocksize[blocktype];

2544 const int log2_n = handle->blocksize_bits[blocktype];

2545 const float *A = handle->A[blocktype];

2546 float *buf2 = handle->imdct_temp_buf;

2547

2548 /* Setup and step 1. Note that step 1 involves subtracting pairs of

2549 * spectral coefficients, but the two items subtracted are actually

2550 * arithmetic inverses of each other(!), e.g.

2551 * u[4*k] - u[n-4*k-1] == 2*u[4*k]

2552 * We omit the factor of 2 here; that propagates linearly through to

2553 * the final step, and it is compensated for by "*0.5f" on the B[]

2554 * values computed during setup. */

2555 imdct_setup_step1(n, A, buffer, buf2);

2556

2557 /* Step 2.

2558 * stb_vorbis note: "this could be in place, but the data ends up in

2559 * the wrong place... _somebody_'s got to swap it, so this is nominated" */

2560 imdct_step2(n, A, buf2, buffer);

2561

2562 /* Step 3.

2563 * stb_vorbis note: "the original step3 loop can be nested r inside s

2564 * or s inside r; it's written originally as s inside r, but this is

2565 * dumb when r iterates many times, and s few. So I have two copies of

2566 * it and switch between them halfway." */

2567

2568 imdct_step3_inner_r_loop(n/16, A, buffer, (n/2) - (n/4)*0, n/4, 8);

2569 imdct_step3_inner_r_loop(n/16, A, buffer, (n/2) - (n/4)*1, n/4, 8);

2570

2571 imdct_step3_inner_r_loop(n/32, A, buffer, (n/2) - (n/8)*0, n/8, 16);

2572 imdct_step3_inner_r_loop(n/32, A, buffer, (n/2) - (n/8)*1, n/8, 16);

2573 imdct_step3_inner_r_loop(n/32, A, buffer, (n/2) - (n/8)*2, n/8, 16);

2574 imdct_step3_inner_r_loop(n/32, A, buffer, (n/2) - (n/8)*3, n/8, 16);

2575

2576 int l = 2;

2577 (18/18) for (; l < (log2_n-3)/2; l++) {

2578 const int k0 = n >> (l+2);

2579 const int lim = 1 << (l+1);

2580 (18/18) for (int i = 0; i < lim; i++) {

2581 imdct_step3_inner_r_loop(n >> (l+4), A, buffer, (n/2) - k0*i,

2582 k0, 1 << (l+3));

2583 }

2584 }

2585

2586 (18/18) for (; l < log2_n-6; l++) {

2587 const int k0 = n >> (l+2), k1 = 1 << (l+3);

2588 const int rlim = n >> (l+6);

2589 const int lim = 1 << (l+1);

2590 const float *A0 = A;

2591 int i_off = n/2;

2592 (18/18) for (int r = rlim; r > 0; r--) {

2593 imdct_step3_inner_s_loop(lim, A0, buffer, i_off, k0, k1);

2594 A0 += k1*4;

2595 i_off -= 8;

2596 }

2597 }

2598

2599 /* stb_vorbis note: "log2_n-6,-5,-4 all interleaved together - the big

2600 * win comes from getting rid of needless flops due to the constants on

2601 * pass 5 & 4 being all 1 and 0; combining them to be simultaneous to

2602 * improve cache made little difference" */

2603 imdct_step3_inner_s_loop_ld654(n, A, buffer);

2604

2605 /* Steps 4, 5, and 6. */

2606 imdct_step456(n, handle->bit_reverse[blocktype], buffer, buf2);

2607

2608 /* Step 7. */

2609 imdct_step7(n, handle->C[blocktype], buf2);

2610

2611 /* Step 8 and final decoding. */

2612 imdct_step8_decode(n, handle->B[blocktype], buf2, buffer);

2613 }

2614

2615 /*************************************************************************/

2616 /******************* Main decoding routine (internal) ********************/

2617 /*************************************************************************/

2618

2619 /**

2620 * vorbis_decode_packet_rest: Perform all decoding operations for a frame

2621 * except mode selection and windowing.

2622 *

2623 * [Parameters]

2624 * handle: Stream handle.

2625 * mode: Mode configuration for this frame.

2626 * left_start: Start position of the left overlap region.

2627 * left_end: End position of the left overlap region.

2628 * right_start: Start position of the right overlap region.

2629 * right_end: End position of the right overlap region.

2630 * len_ret: Pointer to variable to receive the frame length (offset

2631 * within the window of just past the final fully-decoded sample).

2632 * [Return value]

2633 * True on success, false on error.

2634 */

2635 static bool vorbis_decode_packet_rest(

2636 stb_vorbis *handle, const Mode *mode, int left_start, int left_end,

2637 int right_start, int right_end, int *len_ret)

2638 {

2639 const int n = handle->blocksize[mode->blockflag];

2640 const Mapping *map = &handle->mapping[mode->mapping];

2641 float ** const channel_buffers =

2642 handle->channel_buffers[handle->cur_channel_buffer];

2643

2644 /**** Floor processing (4.3.2). ****/

2645

2646 int64_t floor0_amplitude[256];

2647 bool zero_channel[256];

2648 (18/18) for (int ch = 0; ch < handle->channels; ch++) {

2649 const int floor_index = map->submap_floor[map->mux[ch]];

2650 (18/18) if (handle->floor_types[floor_index] == 0) {

2651 Floor0 *floor = &handle->floor_config[floor_index].floor0;

2652 const int64_t amplitude =

2653 decode_floor0(handle, floor, handle->coefficients[ch]);

2654 (18/18) if (UNLIKELY(amplitude < 0)) {

2655 flush_packet(handle);

2656 return error(handle, VORBIS_invalid_packet);

2657 }

2658 zero_channel[ch] = (amplitude == 0);

2659 floor0_amplitude[ch] = amplitude;

2660 } else { // handle->floor_types[floor_index] == 1

2661 Floor1 *floor = &handle->floor_config[floor_index].floor1;

2662 zero_channel[ch] =

2663 !decode_floor1(handle, floor, handle->final_Y[ch]);

2664 }

2665 }

2666

2667 /**** Nonzero vector propagation (4.3.3). ****/

2668 bool really_zero_channel[256];

2669 memcpy(really_zero_channel, zero_channel,

2670 sizeof(zero_channel[0]) * handle->channels);

2671 (18/18) for (int i = 0; i < map->coupling_steps; i++) {

2672 (18/18) if (!zero_channel[map->coupling[i].magnitude]

2673 (18/18) || !zero_channel[map->coupling[i].angle]) {

2674 zero_channel[map->coupling[i].magnitude] = false;

2675 zero_channel[map->coupling[i].angle] = false;

2676 }

2677 }

2678

2679 /**** Residue decoding (4.3.4). ****/

2680 (18/18) for (int i = 0; i < map->submaps; i++) {

2681 float *residue_buffers[256];

2682 int ch = 0;

2683 (18/18) for (int j = 0; j < handle->channels; j++) {

2684 (18/18) if (map->mux[j] == i) {

2685 (18/18) if (zero_channel[j]) {

2686 residue_buffers[ch] = NULL;

2687 } else {

2688 residue_buffers[ch] = channel_buffers[j];

2689 }

2690 ch++;

2691 }

2692 }

2693 decode_residue(handle, map->submap_residue[i], n/2, ch,

2694 residue_buffers);

2695 }

2696

2697 /**** Inverse coupling (4.3.5). ****/

2698 (18/18) for (int i = map->coupling_steps-1; i >= 0; i--) {

2699 float *magnitude = channel_buffers[map->coupling[i].magnitude];

2700 float *angle = channel_buffers[map->coupling[i].angle];

2701 (18/18) for (int j = 0; j < n/2; j++) {

2702 const float M = magnitude[j];

2703 const float A = angle[j];

2704 float new_M, new_A;

2705 (18/18) if (M > 0) {

2706 (18/18) if (A > 0) {

2707 new_M = M;

2708 new_A = M - A;

2709 } else {

2710 new_A = M;

2711 new_M = M + A;

2712 }

2713 } else {

2714 (18/18) if (A > 0) {

2715 new_M = M;

2716 new_A = M + A;

2717 } else {

2718 new_A = M;

2719 new_M = M - A;

2720 }

2721 }

2722 magnitude[j] = new_M;

2723 angle[j] = new_A;

2724 }

2725 }

2726

2727 /**** Floor curve synthesis and residue product (4.3.6). The spec ****

2728 **** uses the term "dot product", but the actual operation is ****

2729 **** component-by-component vector multiplication. ****/

2730 (18/18) for (int i = 0; i < handle->channels; i++) {

2731 (18/18) if (really_zero_channel[i]) {

2732 memset(channel_buffers[i], 0, sizeof(*channel_buffers[i]) * (n/2));

2733 } else {

2734 const int floor_index = map->submap_floor[map->mux[i]];

2735 (18/18) if (handle->floor_types[floor_index] == 0) {

2736 Floor0 *floor = &handle->floor_config[floor_index].floor0;

2737 do_floor0_final(handle, floor, i, n, floor0_amplitude[i]);

2738 } else { // handle->floor_types[floor_index] == 1

2739 Floor1 *floor = &handle->floor_config[floor_index].floor1;

2740 do_floor1_final(handle, floor, i, n);

2741 }

2742 }

2743 }

2744

2745 /**** Inverse MDCT (4.3.7). ****/

2746 (18/18) for (int i = 0; i < handle->channels; i++) {

2747 inverse_mdct(handle, channel_buffers[i], mode->blockflag);

2748 }

2749

2750 /**** Frame length, sample position, and other miscellany. ****/

2751

2752 /* Flush any leftover data in the current packet so the next packet

2753 * doesn't try to read it. */

2754 flush_packet(handle);

2755

2756 /* Deal with discarding samples from the first packet. */

2757 (18/18) if (handle->first_decode) {

2758 /* We don't support streams with nonzero start positions (what the

2759 * spec describes as "The granule (PCM) position of the first page

2760 * need not indicate that the stream started at position zero..."

2761 * in A.2), so we just omit the left half of the window. The value

2762 * we calculate here is negative, but the addition below will bring

2763 * it up to zero again. */

2764 handle->current_loc = -(n/2 - left_start);

2765 handle->current_loc_valid = true;

2766 handle->first_decode = false;

2767 }

2768

2769 /* If this is the last complete frame in a non-final Ogg page, update

2770 * the current sample position based on the Ogg granule position.

2771 * Normally this should have no effect, but it's important for the

2772 * first page in case that page indicates a positive initial offset,

2773 * and it helps resync if a page is missing in the stream. */

2774 (18/18) if (handle->last_seg_index == handle->end_seg_with_known_loc

2775 (18/18) && !(handle->page_flag & PAGEFLAG_last_page)) {

2776 /* For non-final pages, the granule position should be the "last

2777 * complete sample returned by decode" (A.2), thus right_start.

2778 * However, it appears that the reference encoder blindly uses the

2779 * middle of the window (n/2) even on a long/short frame boundary,

2780 * and based on <https://trac.xiph.org/ticket/1169>, the window

2781 * center appears to have been the original intention. */

2782 handle->current_loc =

2783 handle->known_loc_for_packet - (n/2 - left_start);

2784 handle->current_loc_valid = true;

2785 }

2786

2787 /* Update the current sample position for samples returned from this

2788 * frame. */

2789 (18/18) if (handle->current_loc_valid) {

2790 handle->current_loc += (right_start - left_start);

2791 }

2792

2793 /* Return the amount of data stored in the window, including the

2794 * right-side overlap region.*/

2795 *len_ret = right_end;

2796

2797 /* The granule position on the final page of the Ogg stream specifies

2798 * the length of the stream, thus indicating how many samples to

2799 * return from the final frame. However, some streams attempt to

2800 * truncate to within the _second-to-last_ frame instead, so we have

2801 * to check for end-of-stream truncation on every packet in the last

2802 * page. (The specification is ambiguous about whether this is

2803 * actually permitted; while the wording suggests that the truncation

2804 * point must lie within the last frame, there is no explicit language

2805 * specifying that limitation. Since such streams actually exist,

2806 * however, we go ahead and support them.) */

2807 (18/18) if (handle->current_loc_valid

2808 (18/18) && (handle->page_flag & PAGEFLAG_last_page)) {

2809 const uint64_t end_loc = handle->known_loc_for_packet;

2810 /* Be careful of wraparound here! A stream with a bogus position

2811 * in the second-to-last page which has the high bit set could

2812 * cause a simple "end_loc <= handle->current_loc" test to

2813 * improperly pass, so we have to test by taking the difference

2814 * and checking its sign bit. */

2815 const uint64_t frame_end =

2816 handle->current_loc + (right_end - right_start);

2817 (18/18) if (handle->current_loc - end_loc < UINT64_C(1)<<63 // end_loc <= handle->current_loc

2818 (18/18) || (handle->last_seg_index == handle->end_seg_with_known_loc

2819 (18/18) && end_loc - frame_end >= UINT64_C(1)<<63)) // end_loc < frame_end

2820 {

2821 const uint64_t packet_begin_loc =

2822 handle->current_loc - (right_start - left_start);

2823 (18/18) if (end_loc - packet_begin_loc >= UINT64_C(1)<<63) { // end_loc < packet_begin_loc

2824 /* We can't truncate to before the beginning of the current

2825 * frame (that wouldn't make sense anyway, because the

2826 * stream should just have ended at the previous frame);

2827 * just truncate to the beginning of the frame in that case. */

2828 *len_ret = left_start;

2829 } else {

2830 const uint64_t true_len =

2831 end_loc - (handle->current_loc - right_start);

2832 ASSERT(true_len < (uint64_t)*len_ret);

2833 *len_ret = (int)true_len;

2834 }

2835 handle->current_loc = end_loc;

2836 }

2837 }

2838

2839 return true;

2840 }

2841

2842 /*************************************************************************/

2843 /************************** Interface routines ***************************/

2844 /*************************************************************************/

2845

2846 bool vorbis_decode_initial(stb_vorbis *handle, int *left_start_ret,

2847 int *left_end_ret, int *right_start_ret,

2848 int *right_end_ret, int *mode_ret)

2849 {

2850 /* Start reading the next packet, and verify that it's an audio packet. */

2851 (18/18) if (!handle->packet_mode) {

2852 (18/18) if (UNLIKELY(handle->eof)) {

2853 return false;

2854 }

2855 (18/18) if (UNLIKELY(!start_packet(handle))) {

2856 (18/18) if (handle->error == VORBIS_reached_eof) {

2857 /* EOF at page boundary is not an error! */

2858 handle->error = VORBIS__no_error;

2859 }

2860 return false;

2861 }

2862 }

2863 (18/18) if (UNLIKELY(get_bits(handle, 1) != 0)

2864 (18/18) || UNLIKELY(handle->valid_bits < 0)) {

2865 /* Empty or non-audio packet. If not in packet mode, skip it and

2866 * try again. */

2867 flush_packet(handle);

2868 (18/18) if (handle->packet_mode) {

2869 handle->error = VORBIS_invalid_packet;

2870 return false;

2871 } else {

2872 return vorbis_decode_initial(handle, left_start_ret, left_end_ret,

2873 right_start_ret, right_end_ret,

2874 mode_ret);

2875 }

2876 }

2877

2878 /* Read the mode number and window flags for this packet. */

2879 const int mode_index = get_bits(handle, handle->mode_bits);

2880 /* Still in the same byte, so we can't hit EOP here. */

2881 ASSERT(mode_index != EOP);

2882 (18/18) if (mode_index >= handle->mode_count) {

2883 handle->error = VORBIS_invalid_packet;

2884 flush_packet(handle);

2885 return false;

2886 }

2887 *mode_ret = mode_index;

2888 Mode *mode = &handle->mode_config[mode_index];

2889 bool next, prev;

2890 (18/18) if (mode->blockflag) {

2891 prev = get_bits(handle, 1);

2892 next = get_bits(handle, 1);

2893 (18/18) if (UNLIKELY(handle->valid_bits < 0)) {

2894 flush_packet(handle);

2895 return error(handle, VORBIS_invalid_packet);

2896 }

2897 } else {

2898 prev = next = false;

2899 }

2900

2901 /* Set up the window bounds for this frame. */

2902 const int n = handle->blocksize[mode->blockflag];

2903 (36/36) if (mode->blockflag && !prev) {

2904 *left_start_ret = (n - handle->blocksize[0]) / 4;

2905 *left_end_ret = (n + handle->blocksize[0]) / 4;

2906 } else {

2907 *left_start_ret = 0;

2908 *left_end_ret = n/2;

2909 }

2910 (36/36) if (mode->blockflag && !next) {

2911 *right_start_ret = (n*3 - handle->blocksize[0]) / 4;

2912 *right_end_ret = (n*3 + handle->blocksize[0]) / 4;

2913 } else {

2914 *right_start_ret = n/2;

2915 *right_end_ret = n;

2916 }

2917

2918 return true;

2919 }

2920

2921 /*-----------------------------------------------------------------------*/

2922

2923 bool vorbis_decode_packet(stb_vorbis *handle, int *len_ret)

2924 {

2925 const bool first_decode = handle->first_decode;

2926

2927 int mode, left_start, left_end, right_start, right_end, len;

2928 (18/18) if (!vorbis_decode_initial(handle, &left_start, &left_end,

2929 &right_start, &right_end, &mode)

2930 (18/18) || !vorbis_decode_packet_rest(handle, &handle->mode_config[mode],

2931 left_start, left_end, right_start,

2932 right_end, &len)) {

2933 return false;

2934 }

2935

2936 float ** const channel_buffers =

2937 handle->channel_buffers[handle->cur_channel_buffer];

2938 const int prev = handle->previous_length;

2939

2940 /* We deliberately (though harmlessly) deviate from the spec with

2941 * respect to the portion of data to return for a given frame. The

2942 * spec says that we should return the region from the middle of the

2943 * previous frame to the middle of the current frame (4.8), but that

2944 * would require a buffer copy. Instead we return all finalized

2945 * samples from the current frame, i.e., up to right_start. This

2946 * unfortunately requires a bit of fudging around when decoding the

2947 * first frame of a stream. */

2948

2949 /* Mix in data from the previous window's right side. */

2950 (18/18) if (prev > 0) {

2951 const float *weights;

2952 (18/18) if (prev*2 == handle->blocksize[0]) {

2953 weights = handle->window_weights[0];

2954 } else {

2955 ASSERT(prev*2 == handle->blocksize[1]);

2956 weights = handle->window_weights[1];

2957 }

2958 (18/18) for (int i = 0; i < handle->channels; i++) {

2959 (18/18) for (int j = 0; j < prev; j++) {

2960 channel_buffers[i][left_start+j] =

2961 channel_buffers[i][left_start+j] * weights[j] +

2962 handle->previous_window[i][j] * weights[prev-1-j];

2963 }

2964 }

2965 }

2966

2967 /* Point the previous_window pointers at the right side of this window. */

2968 handle->previous_length = right_end - right_start;

2969 (18/18) for (int i = 0; i < handle->channels; i++) {

2970 handle->previous_window[i] = channel_buffers[i] + right_start;

2971 }

2972

2973 /* If this is the first frame, push left_start (the beginning of data

2974 * to return) to the center of the window, since the left half of the

2975 * window contains garbage. */

2976 (18/18) if (first_decode) {

2977 const int n = handle->blocksize[handle->mode_config[mode].blockflag];

2978 left_start = n/2;

2979 }

2980

2981 /* Save this channel's output pointers. */

2982 (18/18) for (int i = 0; i < handle->channels; i++) {

2983 handle->outputs[i] = channel_buffers[i] + left_start;

2984 }

2985

2986 /* Advance to the next set of channel buffers for the next frame's data. */

2987 handle->cur_channel_buffer =

2988 (handle->cur_channel_buffer + 1) % lenof(handle->channel_buffers);

2989

2990 /* Return the final frame length, if requested. */

2991 (18/18) if (len_ret) {

2992 (18/18) if (len < right_start) {

2993 *len_ret = len - left_start;

2994 /* Watch out for trying to truncate into the first packet of

2995 * the stream (which is not returned). */

2996 (18/18) if (*len_ret < 0) {

2997 *len_ret = 0;

2998 }

2999 } else {

3000 *len_ret = right_start - left_start;

3001 ASSERT(*len_ret >= 0);

3002 }

3003 }

3004 return true;

3005 }

3006

3007 /*-----------------------------------------------------------------------*/

3008

3009 bool vorbis_decode_packet_direct(

3010 stb_vorbis *handle, const void *packet, int32_t packet_len, int *len_ret)

3011 {

3012 start_packet_direct(handle, packet, packet_len);

3013 return vorbis_decode_packet(handle, len_ret);

3014 }

3015

3016 /*************************************************************************/

3017 /*************************************************************************/