/* * System Interface Library for games * Copyright (c) 2007-2020 Andrew Church * Released under the GNU GPL version 3 or later; NO WARRANTY is provided. * See the file COPYING.txt for details. * * src/sound/mixer.c: Software mixer implementation. */ #include "src/base.h" #include "src/math.h" #include "src/memory.h" #include "src/mutex.h" #include "src/sound.h" #include "src/sound/mixer.h" #include "src/time.h" // For MIX_TIMING. #ifdef SIL_ARCH_ARM_NEON # include #endif /*************************************************************************/ /************************* Configuration options *************************/ /*************************************************************************/ /** * MIX_TIMING: Define to report the audio processing speed via DLOG(). */ // #define MIX_TIMING /** * MIX_ACCUM_BUFLEN: The size of the sample accumulation buffer used by * mix(). Consequently, this is the largest number of samples that can be * processed at once in sound_mixer_get_pcm(); larger buffer sizes will * require multiple calls to mix(). */ #define MIX_ACCUM_BUFLEN 1024 /*************************************************************************/ /****************************** Local data *******************************/ /*************************************************************************/ /* Have we been initialized? */ static int mixer_initted; /* Total number of channels allocated. */ static int num_channels; /* Output sampling rate. */ static int mix_rate; /* Base volume multiplier. */ static float base_volume; /*-----------------------------------------------------------------------*/ /* Mutual exclusion semaphore for [un]lock_mixer(). */ static int mixer_mutex; /* Data for each sound channel. */ typedef struct MixerChannelInfo MixerChannelInfo; struct MixerChannelInfo { SoundDecodeCallback decode_func; void *decode_handle; uint8_t stereo; // True = stereo, false = monaural uint8_t playing; // Flag: Are we playing? uint8_t fade_cut; // True = stop playback when fade volume hits zero int32_t volume; // sound_mixer_setvol(1.0) ⇒ 1 << VOLUME_BITS int32_t fade_rate; // Amount to add to volume per sample played int32_t fade_target; // Fade target volume (same units as .volume) int32_t pan; // sound_mixer_setpan(-1.0) ⇒ 0 // sound_mixer_setpan(+1.0) ⇒ 1 << PAN_BITS int16_t stereo_pan_l; // Left channel mult. for stereo pan (0...PAN_MULT) int16_t stereo_pan_r; // Right channel mult. for stereo pan (0...PAN_MULT) int16_t *pcm_buffer; // PCM data buffer }; #define VOLUME_BITS 24 #define PAN_BITS 8 #define VOLUME_MULT (1 << VOLUME_BITS) #define PAN_MULT (1 << PAN_BITS) static MixerChannelInfo *mixer_channels; /* Copy of mixer_channels[] used by mix() to avoid holding the lock for * long periods of time. */ static MixerChannelInfo *copy_channels; /* Maximum volume (to avoid integer overflow). */ #define VOLUME_MAX (0x7FFFFFFF >> VOLUME_BITS) /* Shift amounts used in the mixer routine to obtain the final sample value * after multiplication by volume and pan factors. */ #define MONO_SHIFT (VOLUME_BITS + (PAN_BITS-1)) #define STEREO_SHIFT (VOLUME_BITS + PAN_BITS) /*************************************************************************/ /* Local routine declarations. */ /** * lock_mixer: Lock the sound system against state changes by other * threads. Lock calls may _not_ be nested. */ static void lock_mixer(void); /** * unlock_mixer: Unlock the sound system after lock_mixer(). */ static void unlock_mixer(void); /** * mix: Mix samples for all active channels into the given buffer. * * [Parameters] * buffer: Buffer into which to mix. * samples: Number of samples to mix. */ static void mix(void *buffer, int samples); /*************************************************************************/ /***************** Interface: General control functions ******************/ /*************************************************************************/ int sound_mixer_init(int num_channels_, int mix_rate_) { if (UNLIKELY(num_channels_ <= 0) || UNLIKELY(mix_rate_ <= 0)) { DLOG("Invalid parameters: %d %d", num_channels_, mix_rate_); goto error_return; } if (mixer_initted) { DLOG("Tried to init twice!"); goto error_return; } num_channels = num_channels_; mix_rate = mix_rate_; base_volume = 1.0; mixer_channels = mem_alloc(num_channels * sizeof(*mixer_channels), 0, 0); if (UNLIKELY(!mixer_channels)) { DLOG("Failed to allocate %d channels", num_channels); goto error_return; } copy_channels = mem_alloc(num_channels * sizeof(*mixer_channels), 0, 0); if (UNLIKELY(!copy_channels)) { DLOG("Failed to allocate %d channels", num_channels); goto error_free_mixer_channels; } mixer_mutex = mutex_create(MUTEX_SIMPLE, MUTEX_UNLOCKED); if (UNLIKELY(!mixer_mutex)) { DLOG("Failed to create sound mutex"); goto error_free_copy_channels; } int size = num_channels * (MIX_ACCUM_BUFLEN*4); #ifdef SIL_ARCH_MIPS_32 size += 4; // Ensure mix() can overrun by 1 sample (see asm comments). #endif int16_t *pcm_buffer = mem_alloc(size, 0, 0); if (UNLIKELY(!pcm_buffer)) { DLOG("Failed to allocate source PCM buffer"); goto error_destroy_mixer_mutex; } for (int i = 0; i < num_channels; i++, pcm_buffer += MIX_ACCUM_BUFLEN*2) { mixer_channels[i].decode_func = NULL; mixer_channels[i].decode_handle = NULL; mixer_channels[i].stereo = 0; mixer_channels[i].playing = 0; mixer_channels[i].fade_cut = 0; mixer_channels[i].volume = 1 << VOLUME_BITS; mixer_channels[i].fade_rate = 0; mixer_channels[i].fade_target = 0; mixer_channels[i].pan = PAN_MULT/2; mixer_channels[i].stereo_pan_l = PAN_MULT; mixer_channels[i].stereo_pan_r = PAN_MULT; mixer_channels[i].pcm_buffer = pcm_buffer; } mixer_initted = 1; return 1; error_destroy_mixer_mutex: mutex_destroy(mixer_mutex); mixer_mutex = 0; error_free_copy_channels: mem_free(copy_channels); copy_channels = NULL; error_free_mixer_channels: mem_free(mixer_channels); mixer_channels = NULL; error_return: return 0; } /*-----------------------------------------------------------------------*/ void sound_mixer_set_base_volume(float volume) { base_volume = bound(volume, 0, 15); } /*-----------------------------------------------------------------------*/ void sound_mixer_get_pcm(void *buffer, int samples) { if (UNLIKELY(!buffer) || UNLIKELY(samples < 0)) { DLOG("Invalid parameters: %p %d", buffer, samples); return; } if (UNLIKELY(!mixer_initted)) { mem_clear(buffer, samples*4); return; } uint8_t *buffer8 = (uint8_t *)buffer; while (samples > 0) { const int tomix = ubound(samples, MIX_ACCUM_BUFLEN); mix(buffer8, tomix); buffer8 += tomix*4; samples -= tomix; } } /*-----------------------------------------------------------------------*/ void sound_mixer_cleanup(void) { if (!mixer_initted) { return; } for (int i = 1; i <= num_channels; i++) { sound_mixer_reset(i); } mutex_destroy(mixer_mutex); mixer_mutex = 0; mem_free(mixer_channels[0].pcm_buffer); mem_free(mixer_channels); mem_free(copy_channels); mixer_channels = copy_channels = NULL; mixer_initted = 0; } /*************************************************************************/ /*********** Interface: Sound playback/manipulation functions ************/ /*************************************************************************/ int sound_mixer_setdata(int channel, SoundDecodeCallback decode_func, void *handle, int is_stereo) { if (UNLIKELY(channel < 1) || UNLIKELY(channel > num_channels) || UNLIKELY(!decode_func)) { DLOG("Invalid parameters: %d %p %p %d", channel, decode_func, handle, is_stereo); return 0; } const int index = channel - 1; if (mixer_channels[index].decode_func) { DLOG("Channel %d is already in use", channel); return 0; } mixer_channels[index].decode_func = decode_func; mixer_channels[index].decode_handle = handle; mixer_channels[index].stereo = is_stereo ? 1 : 0; return 1; } /*-----------------------------------------------------------------------*/ void sound_mixer_setvol(int channel, float volume) { if (UNLIKELY(channel < 1) || UNLIKELY(channel > num_channels)) { DLOG("Invalid parameters: %d %g", channel, volume); return; } const int index = channel - 1; lock_mixer(); if (volume < 0) { mixer_channels[index].volume = 0; } else if (volume >= VOLUME_MAX) { mixer_channels[index].volume = iroundf(VOLUME_MAX * VOLUME_MULT); } else { mixer_channels[index].volume = iroundf(volume * VOLUME_MULT); } mixer_channels[index].fade_rate = 0; mixer_channels[index].fade_cut = 0; unlock_mixer(); } /*-----------------------------------------------------------------------*/ void sound_mixer_setpan(int channel, float pan) { if (UNLIKELY(channel < 1) || UNLIKELY(channel > num_channels)) { DLOG("Invalid parameters: %d %g", channel, pan); return; } const int index = channel - 1; lock_mixer(); if (pan < -1) { mixer_channels[index].pan = 0; mixer_channels[index].stereo_pan_l = PAN_MULT; mixer_channels[index].stereo_pan_r = 0; } else if (pan > 1) { mixer_channels[index].pan = PAN_MULT; mixer_channels[index].stereo_pan_l = 0; mixer_channels[index].stereo_pan_r = PAN_MULT; } else { mixer_channels[index].pan = iroundf(((pan+1)/2) * PAN_MULT); const float pan_l = 1 - pan; const float pan_r = 1 + pan; if (pan_l < pan_r) { mixer_channels[index].stereo_pan_l = (int16_t)iroundf((pan_l/pan_r) * PAN_MULT); mixer_channels[index].stereo_pan_r = PAN_MULT; } else { mixer_channels[index].stereo_pan_l = PAN_MULT; mixer_channels[index].stereo_pan_r = (int16_t)iroundf((pan_r/pan_l) * PAN_MULT); } } unlock_mixer(); } /*-----------------------------------------------------------------------*/ void sound_mixer_setfade(int channel, float target, float length, int cut) { if (UNLIKELY(channel < 1) || UNLIKELY(channel > num_channels) || UNLIKELY(length < 0)) { DLOG("Invalid parameters: %d %g %g", channel, target, length); return; } const int index = channel - 1; if (!mixer_channels[index].decode_func) { DLOG("Channel %d has no data", channel); return; } lock_mixer(); if (length == 0) { mixer_channels[index].fade_rate = 0; mixer_channels[index].fade_cut = 0; } else { target = bound(target, 0, VOLUME_MAX); const float delta_volume = target - ((float)mixer_channels[index].volume / VOLUME_MULT); const float samples = lbound(roundf(length * mix_rate), 1); mixer_channels[index].fade_rate = iroundf((delta_volume / samples) * VOLUME_MULT); mixer_channels[index].fade_target = iroundf(target * VOLUME_MULT); mixer_channels[index].fade_cut = (cut != 0); } unlock_mixer(); } /*-----------------------------------------------------------------------*/ void sound_mixer_start(int channel) { if (UNLIKELY(channel < 1) || UNLIKELY(channel > num_channels)) { DLOG("Invalid parameters: %d", channel); return; } const int index = channel - 1; lock_mixer(); if (mixer_channels[index].decode_func != NULL) { mixer_channels[index].playing = 1; } unlock_mixer(); } /*-----------------------------------------------------------------------*/ void sound_mixer_stop(int channel) { if (UNLIKELY(channel < 1) || UNLIKELY(channel > num_channels)) { DLOG("Invalid parameters: %d", channel); return; } const int index = channel - 1; lock_mixer(); mixer_channels[index].playing = 0; unlock_mixer(); } /*-----------------------------------------------------------------------*/ void sound_mixer_reset(int channel) { if (UNLIKELY(channel < 1) || UNLIKELY(channel > num_channels)) { DLOG("Invalid parameters: %d", channel); return; } const int index = channel - 1; if (mixer_channels[index].decode_func) { lock_mixer(); mixer_channels[index].playing = 0; mixer_channels[index].decode_func = NULL; mixer_channels[index].decode_handle = NULL; mixer_channels[index].stereo = 0; mixer_channels[index].fade_cut = 0; mixer_channels[index].volume = VOLUME_MULT; mixer_channels[index].fade_rate = 0; mixer_channels[index].pan = PAN_MULT/2; mixer_channels[index].stereo_pan_l = PAN_MULT; mixer_channels[index].stereo_pan_r = PAN_MULT; unlock_mixer(); } } /*-----------------------------------------------------------------------*/ int sound_mixer_status(int channel) { if (UNLIKELY(channel < 1) || UNLIKELY(channel > num_channels)) { DLOG("Invalid parameters: %d", channel); return 0; } const int index = channel - 1; return mixer_channels[index].playing; } /*************************************************************************/ /**************************** Local routines *****************************/ /*************************************************************************/ static void lock_mixer(void) { mutex_lock(mixer_mutex); } /*-----------------------------------------------------------------------*/ static void unlock_mixer(void) { mutex_unlock(mixer_mutex); } /*-----------------------------------------------------------------------*/ #if defined(SIL_ARCH_ARM_32) && defined(__GNUC__) && !defined(__clang__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 3)) /* This is needed to avoid running out of registers when building with -O0. * Since the heavy code is all in assembly, the reduction in optimization * level for optimized builds should be insignificant. */ __attribute__((optimize(1))) #endif static void mix(void *buffer, int samples) { PRECOND(buffer != NULL, return); PRECOND(samples <= MIX_ACCUM_BUFLEN, return); /* Intermediate buffer for accumulating samples in 32 bits, so we can * clamp to [-0x8000,0x7FFF] instead of overflowing. */ static int32_t accum_buffer[MIX_ACCUM_BUFLEN*2]; #ifdef MIX_TIMING static double timing_lock = 0; // Total time spent waiting for the lock. static double timing_lock_max = 0;// Max time spent waiting for the lock. static double timing_decode = 0; // Time spent decoding audio. static double timing_render = 0; // Time spent rendering audio. static int timing_samples = 0; // Total number of samples rendered. static int timing_channels = 0; // Sum of channels active on each call. static int timing_count = 0; // Number of calls to this function. const double time_lock = time_now(); #endif lock_mixer(); #ifdef MIX_TIMING const double time_decode = time_now(); #endif for (int ch = 0; ch < num_channels; ch++) { if (mixer_channels[ch].playing) { if (mixer_channels[ch].fade_rate) { const int32_t samples_left = (mixer_channels[ch].fade_target - mixer_channels[ch].volume) / mixer_channels[ch].fade_rate; if (samples >= samples_left) { mixer_channels[ch].volume = mixer_channels[ch].fade_target; mixer_channels[ch].fade_rate = 0; } else { mixer_channels[ch].volume += mixer_channels[ch].fade_rate * samples; } } if (mixer_channels[ch].volume==0 && mixer_channels[ch].fade_cut) { mixer_channels[ch].playing = 0; continue; } const int got_data = (*mixer_channels[ch].decode_func)( mixer_channels[ch].decode_handle, mixer_channels[ch].pcm_buffer, samples); if (!got_data) { mixer_channels[ch].playing = 0; } } } for (int ch = 0; ch < num_channels; ch++) { if (mixer_channels[ch].playing) { copy_channels[ch] = mixer_channels[ch]; } else { copy_channels[ch].playing = 0; } } unlock_mixer(); #ifdef MIX_TIMING const double time_render = time_now(); #endif mem_clear(accum_buffer, samples*8); #if defined(SIL_ARCH_ARM_NEON) && defined(__GNUC__) MixerChannelInfo *channel = copy_channels; const unsigned long base_volume_scaled = iroundf(base_volume * (1<<27)); for (int i = 0; i < num_channels; i++, channel++) { /* Skip out early if this channel isn't playing or is silent. */ if (!channel->playing || channel->volume == 0) { continue; } const int64_t volume = ((uint64_t)channel->volume * base_volume_scaled) >> 27; const int16_t *src = channel->pcm_buffer; int32_t *dest = accum_buffer; int32_t * const dest_limit = dest + samples*2; if (channel->stereo) { const long volume_l = (long)((volume * channel->stereo_pan_l) >> PAN_BITS); const long volume_r = (long)((volume * channel->stereo_pan_r) >> PAN_BITS); /* This needs to be a literal constant for vrsraq below. */ #define stereo_shift (STEREO_SHIFT - PAN_BITS - 16) const int stereo_round = 1 << (stereo_shift - 1); /* If we use the obvious initializer here, Clang goes through * ridiculous contortions involving neg and mvn just to get a * loop counter it can increment instead of decrement. The sub * instruction is presumably no slower than the add instruction * and certainly faster for <=3 loops than adding several extra * initialization instructions, so we hack around to force * Clang to Do The Right Thing. */ //for (int j = samples & 3; j != 0; j--) { for (long j = samples & 3; j != 0; __extension__({__asm__("sub %0, %0, #1" : "=r" (j) : "0" (j));})) { /* Clang wants to use multiple registers and recompute * the value of src and dest on each iteration(!), so we * explicitly use the more logical (and faster) * post-indexed addressing mode. */ //int32_t left = *src++; //int32_t right = *src++; long left, right; __asm__("ldrsh %0, [%1], #4" : "=r" (left), "=r" (src) : "1" (src)); __asm__("ldrsh %0, [%1, #-2]" : "=r" (right) : "r" (src)); #ifdef SIL_ARCH_ARM_64 register long acc_left __asm__("x0") = dest[0]; register long acc_right __asm__("x1") = dest[1]; #else long acc_left = dest[0]; long acc_right = dest[1]; #endif left = (left * (int64_t)volume_l) >> 16; right = (right * (int64_t)volume_r) >> 16; acc_left += ((left + stereo_round) >> stereo_shift); acc_right += ((right + stereo_round) >> stereo_shift); //*dest++ = acc_left; //*dest++ = acc_right; #ifdef SIL_ARCH_ARM_64 __asm__("stp w0, w1, [%0], #8" : "=r" (dest) : "r" (acc_left), "r" (acc_right), "0" (dest)); #else __asm__("str %1, [%0], #8" : "=r" (dest) : "r" (acc_left), "0" (dest)); __asm__("str %1, [%0, #-4]" : "=r" (dest) : "r" (acc_right), "0" (dest)); #endif } const int32x4_t volume_l_vec = vdupq_n_s32(volume_l); const int32x4_t volume_r_vec = vdupq_n_s32(volume_r); while (dest < dest_limit) { #ifndef SIL_ARCH_ARM_64 const int16x4x2_t sample_16 = vld2_s16(src); src += 8; #else /* Clang doesn't seem to know about post-indexed addressing * mode for {ld,st}[1234], so we need to help it out. */ int16x4x2_t sample_16; __asm__("ld2 {%0.4h, %1.4h}, [%2], #16" : "=w" (sample_16.val[0]), "=w" (sample_16.val[1]), "=r" (src) : "2" (src), "m" (*src)); #endif int32x4x2_t accum = vld2q_s32(dest); int32x4_t left = vshll_n_s16(sample_16.val[0], 15); int32x4_t right = vshll_n_s16(sample_16.val[1], 15); left = vqdmulhq_s32(left, volume_l_vec); right = vqdmulhq_s32(right, volume_r_vec); accum.val[0] = vrsraq_n_s32(accum.val[0], left, stereo_shift); accum.val[1] = vrsraq_n_s32(accum.val[1], right, stereo_shift); #ifndef SIL_ARCH_ARM_64 vst2q_s32(dest, accum); dest += 8; #else __asm__("st2 {%2.4s, %3.4s}, [%0], #32" : "=r" (dest), "=m" (*dest) : "w" (accum.val[0]), "w" (accum.val[1]), "0" (dest)); #endif } #undef stereo_shift } else { // !channel->stereo const long volume_l = (long)((volume * (PAN_MULT - channel->pan)) >> PAN_BITS); const long volume_r = (long)((volume * channel->pan) >> PAN_BITS); #define mono_shift (MONO_SHIFT - PAN_BITS - 16) const int mono_round = 1 << (mono_shift - 1); /* See notes above for why all the inline assembly. */ //for (int j = samples & 3; j != 0; j--) { for (long j = samples & 3; j != 0; __extension__({__asm__("sub %0, %0, #1" : "=r" (j) : "0" (j));})) { //int16_t sample = *src++; long sample; __asm__("ldrsh %0, [%1], #2" : "=r" (sample), "=r" (src) : "1" (src)); #ifdef SIL_ARCH_ARM_64 register long acc_left __asm__("x0") = dest[0]; register long acc_right __asm__("x1") = dest[1]; #else long acc_left = dest[0]; long acc_right = dest[1]; #endif int32_t left = (sample * (int64_t)volume_l) >> 16; int32_t right = (sample * (int64_t)volume_r) >> 16; acc_left += ((left + mono_round) >> mono_shift); acc_right += ((right + mono_round) >> mono_shift); //*dest++ = acc_left; //*dest++ = acc_right; #ifdef SIL_ARCH_ARM_64 __asm__("stp w0, w1, [%0], #8" : "=r" (dest) : "r" (acc_left), "r" (acc_right), "0" (dest)); #else __asm__("str %1, [%0], #8" : "=r" (dest) : "r" (acc_left), "0" (dest)); __asm__("str %1, [%0, #-4]" : "=r" (dest) : "r" (acc_right), "0" (dest)); #endif } const int32x4_t volume_l_vec = vdupq_n_s32(volume_l); const int32x4_t volume_r_vec = vdupq_n_s32(volume_r); while (dest < dest_limit) { /* This gets converted to an ldr instruction, with which * Clang does know to use post-indexed addressing. */ const int16x4_t sample_16 = vld1_s16(src); src += 4; int32x4x2_t accum = vld2q_s32(dest); const int32x4_t sample_32 = vshll_n_s16(sample_16, 15); const int32x4_t left = vqdmulhq_s32(sample_32, volume_l_vec); const int32x4_t right = vqdmulhq_s32(sample_32, volume_r_vec); accum.val[0] = vrsraq_n_s32(accum.val[0], left, mono_shift); accum.val[1] = vrsraq_n_s32(accum.val[1], right, mono_shift); #ifndef SIL_ARCH_ARM_64 vst2q_s32(dest, accum); dest += 8; #else __asm__("st2 {%2.4s, %3.4s}, [%0], #32" : "=r" (dest), "=m" (*dest) : "w" (accum.val[0]), "w" (accum.val[1]), "0" (dest)); #endif } #undef mono_shift } } int32_t *src = accum_buffer; int16_t *dest = buffer; int16_t * const dest_limit = dest + samples*2; if (samples & 1) { #ifdef SIL_ARCH_ARM_64 __asm__("ldr d0, [%[src]], #8 \n" "sqxtn v0.4h, v0.4s \n" "str s0, [%[dest]], #4\n" : [src] "=r" (src), [dest] "=r" (dest), "=m" (*dest) : "0" (src), "1" (dest), "m" (*src) : "v0"); #else __asm__("ldmia %[src]!, {r0-r1} \n" "ssat r0, #16, r0 \n" "ssat r1, #16, r1 \n" "pkhbt r0, r0, r1, lsl #16\n" "str r0, [%[dest]], #4 \n" : [src] "=r" (src), [dest] "=r" (dest), "=m" (*dest) : "0" (src), "1" (dest), "m" (*src) : "r0", "r1"); #endif } for (int count = samples & 6; count != 0; count -= 2) { register int32x4_t in = vld1q_s32(src); src += 4; register int16x4_t out = vqmovn_s32(in); vst1_s16(dest, out); dest += 4; } while (dest < dest_limit) { register int32x4_t in0 = vld1q_s32(src); register int32x4_t in1 = vld1q_s32(src+4); register int32x4_t in2 = vld1q_s32(src+8); register int32x4_t in3 = vld1q_s32(src+12); src += 16; #ifdef SIL_ARCH_ARM_64 register int16x8_t out0 = vqmovn_high_s32(vqmovn_s32(in0), in1); register int16x8_t out1 = vqmovn_high_s32(vqmovn_s32(in2), in3); vst1q_s16(dest, out0); vst1q_s16(dest+8, out1); #else register int16x4_t out0 = vqmovn_s32(in0); register int16x4_t out1 = vqmovn_s32(in1); register int16x4_t out2 = vqmovn_s32(in2); register int16x4_t out3 = vqmovn_s32(in3); vst1_s16(dest, out0); vst1_s16(dest+4, out1); vst1_s16(dest+8, out2); vst1_s16(dest+12, out3); #endif dest += 16; } #elif defined(SIL_ARCH_ARM_32) && defined(__GNUC__) /* We modify some of the input registers, so we need to assign outputs * to those registers to ensure that the compiler doesn't try to reuse * the values from those registers (which it thinks are unmodified) or * put the values in registers that we clobber (since GCC-style asm() * assumes that all inputs are consumed before any clobbers occur). */ void * volatile buf_dummy; void * volatile buf_limit_dummy; void * volatile channel_dummy; volatile int i_dummy; __asm__( "@ Register usage is as follows: \n" "@ r0: Temporary; left output sample \n" "@ r1: Temporary; right output sample \n" "@ r2: Sum of left samples \n" "@ r3: Sum of right samples \n" "@ r4: (Mono) Left sample multiplier = volume * (1-pan) \n" "@ (Stereo) Left sample multiplier = volume * pan_l \n" "@ r5: (Mono) Right sample multiplier = volume * pan \n" "@ (Stereo) Right sample multiplier = volume * pan_r \n" "@ r6: Input sample \n" "@ r7: Unused (compiler might use r7 as a frame pointer) \n" "@ r8: Channel PCM data pointer \n" "@ Additionally, the following registers are allocated by the \n" "@ compiler: \n" "@ %[buf]: Output buffer pointer \n" "@ %[buf_limit]: Output buffer limit pointer \n" "@ %[channel]: Pointer to current channel's MixerChannelInfo\n" "@ %[i]: Channel loop counter (counts down to 0) \n" " \n" "@ Save the base output pointer on the stack. \n" "push {%[buf]} \n" " \n" "@@@@@@@@ Process each channel in sequence. @@@@@@@@ \n" " \n" "1: @chanloop \n" "@ (Hmm, using an actual label seems to cause the iOS linker to \n" "@ randomly rearrange pieces of the routine...) \n" " \n" "@ Increment the channel pointer and index, and check whether \n" "@ we've reached the end of the array. \n" "add %[channel], %[channel], %[sizeof_ch] \n" "subs %[i], %[i], #1 \n" "beq 9f @chanloop_end \n" "@ Skip out early if this channel isn't playing or is silent. \n" "ldrb r0, [%[channel], %[playing]] \n" "ldr r1, [%[channel], %[volume]] \n" "teq r0, #0 \n" "beq 1b @chanloop \n" "teq r1, #0 \n" "beq 1b @chanloop \n" "umull r0, r1, r1, %[base_volume_scaled] \n" " \n" "@@@@@@@@ Setup for monaural channels. @@@@@@@@ \n" " \n" "@ Load this channel's data. \n" "ldrb r0, [%[channel], %[stereo]] \n" "ldr %[buf], [sp] \n" "ldr r8, [%[channel], %[pcm_buffer]] \n" "teq r0, #0 \n" "bne 5f @stereo_setup \n" "ldr r5, [%[channel], %[pan]] \n" "@ The -5 here is because multiplying by base_volume_scaled \n" "@ already cut 5 bits from the value. \n" "asr r1, r1, %[pan_bits] - 5 \n" "rsb r4, r5, %[pan_mult] \n" "mul r5, r5, r1 \n" "mul r4, r4, r1 \n" " \n" "@@@@@@@@ Playback loop for monaural channels. @@@@@@@@ \n" " \n" "2: @monoloop \n" "@ Load the sample from the channel's input buffer. \n" "ldrsh r6, [r8], #2 \n" "lsl r6, r6, #16 \n" " \n" "@ Add the sample to the left and right accumulated values. \n" "smmul r0, r6, r4 @ Left output sample \n" "ldmia %[buf], {r2,r3} @ Accumulated sample values \n" "smmul r1, r6, r5 @ Right output sample \n" "add r0, r0, %[mono_round] \n" "add r1, r1, %[mono_round] \n" "add r2, r2, r0, asr %[mono_shift] \n" "add r3, r3, r1, asr %[mono_shift] \n" "stmia %[buf]!, {r2,r3} \n" " \n" "@ Process the next sample (if any are left). \n" "cmp %[buf], %[buf_limit] \n" "bne 2b @monoloop \n" " \n" "@ Continue with the next channel. \n" "b 1b @chanloop \n" " \n" "@@@@@@@@ Setup for stereo channels. @@@@@@@@ \n" " \n" "5: @stereo_setup \n" "asr r1, r1, %[pan_bits] - 5 @ r1 holds channel volume from above.\n" "ldrsh r4, [%[channel], %[stereo_pan_l]] \n" "ldrsh r5, [%[channel], %[stereo_pan_r]] \n" "mul r4, r4, r1 \n" "mul r5, r5, r1 \n" " \n" "@@@@@@@@ Playback loop for stereo channels. @@@@@@@@ \n" " \n" "6: @stereoloop \n" "ldrsh r6, [r8], #4 \n" "lsl r6, r6, #16 \n" "smmul r0, r4, r6 \n" "ldrsh r6, [r8, #-2] \n" "ldmia %[buf], {r2,r3} \n" "lsl r6, r6, #16 \n" "smmul r1, r5, r6 \n" "add r0, r0, %[stereo_round] \n" "add r1, r1, %[stereo_round] \n" "add r2, r2, r0, asr %[stereo_shift] \n" "add r3, r3, r1, asr %[stereo_shift] \n" "stmia %[buf]!, {r2,r3} \n" " \n" "cmp %[buf], %[buf_limit] \n" "bne 6b @stereoloop \n" "b 1b @chanloop \n" " \n" "@@@@@@@@ End of loops. @@@@@@@@ \n" " \n" "9: @chanloop_end \n" "pop {%[buf]} \n" : "=r" (buf_dummy), "=r" (buf_limit_dummy), "=r" (channel_dummy), "=r" (i_dummy) /* Technically, we need to declare accum_buffer as an output, * but if we do that GCC runs out of registers. We get the * correct code even without the output declaration, so we just * drop it. */ /* , "=m" (accum_buffer) */ : [buf] "0" (accum_buffer), [buf_limit] "1" (accum_buffer + samples*2), /* These two are updated at the top of the loop; adjust so the * first loop iteration sees the first channel. */ [channel] "2" (copy_channels - 1), [i] "3" (num_channels + 1), [base_volume_scaled] "r" (iroundf(base_volume * (1 << 27))), [pan_bits] "I" (PAN_BITS), [pan_mult] "I" (PAN_MULT), [mono_shift] "I" (MONO_SHIFT - PAN_BITS - 16), [stereo_shift] "I" (STEREO_SHIFT - PAN_BITS - 16), [mono_round] "I" (1 << (MONO_SHIFT - PAN_BITS - 17)), [stereo_round] "I" (1 << (STEREO_SHIFT - PAN_BITS - 17)), [stereo] "I" (offsetof(MixerChannelInfo,stereo)), [volume] "I" (offsetof(MixerChannelInfo,volume)), [pan] "I" (offsetof(MixerChannelInfo,pan)), [stereo_pan_l] "I" (offsetof(MixerChannelInfo,stereo_pan_l)), [stereo_pan_r] "I" (offsetof(MixerChannelInfo,stereo_pan_r)), [playing] "I" (offsetof(MixerChannelInfo,playing)), [pcm_buffer] "I" (offsetof(MixerChannelInfo,pcm_buffer)), [sizeof_ch] "I" (sizeof(copy_channels[0])) : "r0", "r1", "r2", "r3", "r4", "r5", "r6", "r8", "cc" ); void * volatile mix_dummy; void * volatile accum_dummy; /* We use a bit of magic here to efficiently handle non-multiple-of-4 * sample counts: we set up a sequence of three single sample copies * (the maximum possible), then jump (using an ADD PC, PC, Rn) over * the appropriate number of instructions so we copy exactly the * desired number of samples. */ __asm__( "add r9, %[mix_buffer], %[count], lsl #2\n" "and r8, %[count], #3 \n" "add r6, r8, r8, lsl #2 @ r6 = r8*5 \n" "@ 5*3-1: 5 insns * 3 samples - 1 because the ADD insn reads PC+8.\n" "rsb r5, r6, #(5*3-1) \n" "add pc, pc, r5, lsl #2 \n" "3: \n" "ldmia %[accum_buffer]!, {r0-r1} \n" "ssat r0, #16, r0 \n" "ssat r1, #16, r1 \n" "pkhbt r0, r0, r1, lsl #16 \n" "stmia %[mix_buffer]!, {r0} \n" "2: \n" "ldmia %[accum_buffer]!, {r0-r1} \n" "ssat r0, #16, r0 \n" "ssat r1, #16, r1 \n" "pkhbt r0, r0, r1, lsl #16 \n" "stmia %[mix_buffer]!, {r0} \n" "1: \n" "ldmia %[accum_buffer]!, {r0-r1} \n" "ssat r0, #16, r0 \n" "ssat r1, #16, r1 \n" "pkhbt r0, r0, r1, lsl #16 \n" "stmia %[mix_buffer]!, {r0} \n" "0: \n" "cmp %[mix_buffer], r9 \n" "bcs 9f \n" "4: \n" "ldmia %[accum_buffer]!, {r0-r6,r8} \n" "ssat r0, #16, r0 \n" "ssat r1, #16, r1 \n" "ssat r2, #16, r2 \n" "ssat r3, #16, r3 \n" "ssat r4, #16, r4 \n" "ssat r5, #16, r5 \n" "ssat r6, #16, r6 \n" "ssat r8, #16, r8 \n" "pkhbt r0, r0, r1, lsl #16 \n" "pkhbt r1, r2, r3, lsl #16 \n" "pkhbt r2, r4, r5, lsl #16 \n" "pkhbt r3, r6, r8, lsl #16 \n" "stmia %[mix_buffer]!, {r0-r3} \n" "cmp %[mix_buffer], r9 \n" "bcc 0b \n" "9: \n" : "=r" (mix_dummy), "=r" (accum_dummy) /* Normally we also need "=m" (buffer) to indicate that the * contents of buffer are modified. However, (1) buffer is a * "void *" here, so the extra specification doesn't actually * give the compiler any useful information, and (2) including * the specification causes the version of Clang included with * Xcode 4.5.2 to choke on register allocation. So we drop * the 13-character specification and include this 8-line * explanation instead. */ : [mix_buffer] "0" (buffer), [accum_buffer] "1" (accum_buffer), [count] "r" (samples), "m" (accum_buffer[0]) : "r0", "r1", "r2", "r3", "r4", "r5", "r6", "r8", "r9", "cc" ); #elif defined(SIL_ARCH_MIPS_32) && defined(__GNUC__) __asm__( ".set push; .set noreorder\n" "# Register usage is as follows: \n" "# t0-t3, t8-t9: Temporaries \n" "# t4: (Mono) Left sample multiplier = volume * (1-pan) \n" "# (Stereo) Channel volume * left channel pan \n" "# t5: (Mono) Right sample multiplier = volume * pan \n" "# (Stereo) Channel volume * right channel pan \n" "# t6: (Mono) Current sample value \n" "# (Stereo) Current left sample value \n" "# t7: (Mono) Unused \n" "# (Stereo) Current right sample value \n" "# s0: Sample accumulator buffer pointer \n" "# s1: Sample accumulator buffer limit pointer \n" "# s2: Channel PCM data pointer \n" "# s4: Base volume multiplier (5.27 fixed point) \n" "# s5: Number of channels \n" "# s6: Pointer to current channel's MixerChannelInfo structure\n" "# s7: Index of current channel \n" " \n" "# Set up registers. \n" "sll $t8, %[count], 3 \n" "addu $s1, %[buf], $t8 \n" "li $s7, -1 # $s6/$s7 are advanced at the top of the loop. \n" "addiu $s6, %[copy_channels], -%[sizeof_ch] \n" "move $s5, %[num_channels] \n" "round.w.s $f0, %[base_volume_scaled] \n" "mfc1 $s4, $f0 \n" " \n" "# Clear the output buffer to zero. \n" "addiu $s0, %[buf], 32 \n" "0: sw $zero, -32($s0) \n" "sw $zero, -28($s0) \n" "sw $zero, -24($s0) \n" "sw $zero, -20($s0) \n" "sw $zero, -16($s0) \n" "sw $zero, -12($s0) \n" "sw $zero, -8($s0) \n" "addiu $s0, $s0, 32 \n" "sltu $t9, $s1, $s0 \n" "beqz $t9, 0b \n" "sw $zero, -(4+32)($s0) \n" " \n" "######## Process each channel in sequence. ######## \n" " \n" "chanloop: \n" "# Increment the channel pointer and index, and check whether \n" "# we've reached the end of the array. Also load some values \n" "# we'll use later; note that the overrun on the last channel \n" "# is deliberate and safe. \n" "lbu $t0, %[sizeof_ch]+%[playing]($s6) \n" "lw $t1, %[sizeof_ch]+%[volume]($s6) \n" "addiu $s7, $s7, 1 \n" "sltu $t8, $s7, $s5 \n" "multu $t1, $s4 \n" "beqz $t8, chanloop_end \n" "addiu $s6, $s6, %[sizeof_ch] \n" "# Skip out early if this channel isn't playing or is silent. \n" "beqz $t0, chanloop \n" "lbu $t2, %[stereo]($s6) \n" "beqz $t1, chanloop \n" "# We deliberately pull out the scaled volume after the test to \n" "# leave an extra cycle for the ALU in case it needs it. This \n" "# should normally make no difference since there's little sense\n" "# in setting the base volume to zero. \n" "mfhi $t1 \n" "# Scale the volume down into $t9 (so volume*pan fits in 32 \n" "# bits), and go to the appropriate loop (mono or stereo). The \n" "# -5 in the shift count is because we already cut off 5 bits \n" "# when scaling by the base volume. \n" "bnez $t2, stereo_setup \n" "srl $t9, $t1, %[pan_bits] - 5 \n" " \n" "######## Setup for monaural channels. ######## \n" " \n" "# Load this channel's data. \n" "lw $t5, %[pan]($s6) \n" "li $t4, 1<<%[pan_bits] \n" "multu $t9, $t5 \n" "subu $t4, $t4, $t5 \n" "mflo $t5 \n" "move $s0, %[buf] \n" "lw $s2, %[pcm_buffer]($s6) \n" "multu $t9, $t4 \n" "mflo $t4 \n" "# Load the first sample. \n" "lh $t6, 0($s2) \n" " \n" "######## Playback loop for monaural channels. ######## \n" " \n" "# Scale the sample up to 16.16 fixed point. \n" "sll $t6, $t6, 16 \n" "monoloop: \n" "# Add the sample to the left and right accumulated values. \n" "mult $t6, $t4 # Left output sample (to $t2) \n" "lw $t0, 0($s0) # $t0: Sum of left samples \n" "lw $t1, 4($s0) # $t1: Sum of right samples \n" "addiu $s0, $s0, 8 # Advance the output pointer. \n" "mfhi $t2 \n" "addiu $t2, $t2, 1 << (%[mono_shift] - %[pan_bits] - 17) \n" "sra $t2, $t2, %[mono_shift] - %[pan_bits] - 16 \n" "mult $t6, $t5 # Right output sample (to $t3) \n" "addu $t2, $t2, $t0 \n" "sw $t2, -8($s0) \n" "mfhi $t3 \n" "addiu $t3, $t3, 1 << (%[mono_shift] - %[pan_bits] - 17) \n" "sra $t3, $t3, %[mono_shift] - %[pan_bits] - 16 \n" "addu $t3, $t3, $t1 \n" "sw $t3, -4($s0) \n" " \n" "# Load the next input sample. This is a copy of the first \n" "# instruction in the loop; we do the load here to avoid load \n" "# delay stalls. Note that this overruns the input buffer on \n" "# the final sample; we allocate an extra stereo sample's worth \n" "# of space at init time to ensure this is safe. \n" "lh $t6, 2($s2) \n" "# Advance the input pointer to the next sample. \n" "addiu $s2, $s2, 2 \n" "# Process the next sample, if any are left. \n" "bne $s0, $s1, monoloop \n" "sll $t6, $t6, 16 # First insn copied to the delay slot. \n" "b chanloop \n" "nop \n" " \n" "######## Setup for stereo channels. ######## \n" " \n" "stereo_setup: \n" "# Load this channel's data. \n" "lh $t6, %[stereo_pan_l]($s6) \n" "lh $t7, %[stereo_pan_r]($s6) \n" "mult $t6, $t9 \n" "mflo $t4 \n" "lw $s2, %[pcm_buffer]($s6) \n" "move $s0, %[buf] \n" "mult $t7, $t9 \n" "mflo $t5 \n" "# Load the first sample for each audio channel. \n" "lh $t6, 0($s2) \n" "lh $t7, 2($s2) \n" " \n" "######## Playback loop for stereo channels. ######## \n" " \n" "sll $t6, $t6, 16 \n" "stereoloop: \n" "sll $t7, $t7, 16 \n" "mult $t6, $t4 # Left output sample (to $t2) \n" "lw $t0, 0($s0) # $t0: Sum of left samples \n" "lw $t1, 4($s0) # $t1: Sum of right samples \n" "addiu $s0, $s0, 8 # Advance the output pointer. \n" "mfhi $t2 \n" "addiu $t2, $t2, 1 << (%[stereo_shift] - %[pan_bits] - 17) \n" "sra $t2, $t2, %[stereo_shift] - %[pan_bits] - 16 \n" "mult $t7, $t5 # Right output sample (to $t3) \n" "addu $t2, $t2, $t0 \n" "sw $t2, -8($s0) \n" "mfhi $t3 \n" "addiu $t3, $t3, 1 << (%[stereo_shift] - %[pan_bits] - 17) \n" "sra $t3, $t3, %[stereo_shift] - %[pan_bits] - 16 \n" "addu $t3, $t3, $t1 \n" "sw $t3, -4($s0) \n" " \n" "lh $t6, 4($s2) \n" "lh $t7, 6($s2) \n" "addiu $s2, $s2, 4 \n" "bne $s0, $s1, stereoloop \n" "sll $t6, $t6, 16 # First insn copied to the delay slot. \n" "b chanloop \n" "nop \n" " \n" "######## End of loops. ######## \n" " \n" "chanloop_end: \n" ".set pop" : "=m" (accum_buffer) : [buf] "r" (accum_buffer), [count] "r" (samples), [copy_channels] "r" (copy_channels), [num_channels] "r" (num_channels), [base_volume_scaled] "f" (base_volume * (1 << 27)), [pan_bits] "i" (PAN_BITS), [mono_shift] "i" (MONO_SHIFT), [stereo_shift] "i" (STEREO_SHIFT), [stereo] "I" (offsetof(MixerChannelInfo,stereo)), [volume] "I" (offsetof(MixerChannelInfo,volume)), [pan] "I" (offsetof(MixerChannelInfo,pan)), [stereo_pan_l] "I" (offsetof(MixerChannelInfo,stereo_pan_l)), [stereo_pan_r] "I" (offsetof(MixerChannelInfo,stereo_pan_r)), [playing] "I" (offsetof(MixerChannelInfo,playing)), [pcm_buffer] "I" (offsetof(MixerChannelInfo,pcm_buffer)), [sizeof_ch] "I" (sizeof(copy_channels[0])) : "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9", "s0", "s1", "s2", "s4", "s5", "s6", "s7", "$f0" ); __asm__( ".set push; .set noreorder\n" "move $a0, %[accum_buffer] \n" "move $a1, %[mix_buffer] \n" "sll $t9, %[count], 3 \n" "addu $t9, $a0, $t9 \n" "lw $t0, 0($a0) \n" "andi $t3, %[count], 1 \n" "lw $t1, 4($a0) \n" "beqzl $t3, 1f \n" "lw $t2, 8($a0) \n" #ifdef SIL_PLATFORM_PSP /* PSP prefers having RAW dependencies adjacent (EX->EX forwarding). */ "min $t0, $t0, %[x7FFF] \n" "max $t0, $t0, %[x8000] \n" "min $t1, $t1, %[x7FFF] \n" "max $t1, $t1, %[x8000] \n" #else "slt $t4, $t0, %[x7FFF] \n" "slt $t5, $t1, %[x7FFF] \n" "movz $t0, %[x7FFF], $t4 \n" "movz $t1, %[x7FFF], $t5 \n" "slt $t4, $t0, %[x8000] \n" "slt $t5, $t1, %[x8000] \n" "movn $t0, %[x8000], $t4 \n" "movn $t1, %[x8000], $t5 \n" #endif "add $a0, $a0, 8 \n" #if defined(SIL_ARCH_MIPS_MIPS32R2) || defined(SIL_PLATFORM_PSP) "ins $t0, $t1, 16, 16 \n" #else "sll $t1, $t1, 16 \n" "andi $t0, $t0, 0xFFFF \n" "or $t0, $t0, $t1 \n" #endif "add $a1, $a1, 4 \n" "beq $a0, $t9, 9f \n" "sw $t0, -4($a1) \n" "lw $t0, 0($a0) \n" "0: lw $t1, 4($a0) \n" "lw $t2, 8($a0) \n" "1: lw $t3, 12($a0) \n" "addiu $a0, $a0, 16 \n" #ifdef SIL_PLATFORM_PSP "min $t0, $t0, %[x7FFF] \n" "max $t0, $t0, %[x8000] \n" "min $t1, $t1, %[x7FFF] \n" "max $t1, $t1, %[x8000] \n" "min $t2, $t2, %[x7FFF] \n" "max $t2, $t2, %[x8000] \n" "min $t3, $t3, %[x7FFF] \n" "max $t3, $t3, %[x8000] \n" #else "slt $t4, $t0, %[x7FFF] \n" "slt $t5, $t1, %[x7FFF] \n" "movz $t0, %[x7FFF], $t4 \n" "movz $t1, %[x7FFF], $t5 \n" "slt $t4, $t2, %[x7FFF] \n" "slt $t5, $t3, %[x7FFF] \n" "movz $t2, %[x7FFF], $t4 \n" "movz $t3, %[x7FFF], $t5 \n" "slt $t4, $t0, %[x8000] \n" "slt $t5, $t1, %[x8000] \n" "movn $t0, %[x8000], $t4 \n" "movn $t1, %[x8000], $t5 \n" "slt $t4, $t2, %[x8000] \n" "slt $t5, $t3, %[x8000] \n" "movn $t2, %[x8000], $t4 \n" "movn $t3, %[x8000], $t5 \n" #endif #if defined(SIL_ARCH_MIPS_MIPS32R2) || defined(SIL_PLATFORM_PSP) "ins $t0, $t1, 16, 16 \n" "ins $t2, $t3, 16, 16 \n" #else "sll $t1, $t1, 16 \n" "sll $t3, $t3, 16 \n" "andi $t0, $t0, 0xFFFF \n" "andi $t2, $t2, 0xFFFF \n" "or $t0, $t0, $t1 \n" "or $t2, $t2, $t3 \n" #endif "sw $t0, 0($a1) \n" "addiu $a1, $a1, 8 \n" "sw $t2, -4($a1) \n" "bnel $a0, $t9, 0b \n" "lw $t0, 0($a0) \n" "9: .set pop" : "=m" (((int16_t *)buffer)[0]) : [mix_buffer] "r" (buffer), [accum_buffer] "r" (accum_buffer), [count] "r" (samples), [x7FFF] "r" (0x7FFF), [x8000] "r" (-0x8000), "m" (accum_buffer[0]) : "a0", "a1", "t0", "t1", "t2", "t3", "t9" #ifndef SIL_PLATFORM_PSP , "t4", "t5" #endif ); #else // No assembly version available. for (int ch = 0; ch < num_channels; ch++) { if (!copy_channels[ch].playing || copy_channels[ch].volume == 0) { continue; } const int64_t volume = iroundf(copy_channels[ch].volume * base_volume); int32_t pan_l, pan_r; if (copy_channels[ch].stereo) { pan_l = copy_channels[ch].stereo_pan_l; pan_r = copy_channels[ch].stereo_pan_r; } else { pan_l = PAN_MULT - copy_channels[ch].pan; pan_r = copy_channels[ch].pan; } for (int i = 0; i < samples; i++) { int32_t left = accum_buffer[i*2+0]; int32_t right = accum_buffer[i*2+1]; if (copy_channels[ch].stereo) { left += ((copy_channels[ch].pcm_buffer[i*2+0] * pan_l) * volume + ((int64_t)1 << (STEREO_SHIFT-1))) >> STEREO_SHIFT; right += ((copy_channels[ch].pcm_buffer[i*2+1] * pan_r) * volume + ((int64_t)1 << (STEREO_SHIFT-1))) >> STEREO_SHIFT; } else { const int32_t sample = copy_channels[ch].pcm_buffer[i]; left += ((sample * pan_l) * volume + ((int64_t)1 << (MONO_SHIFT-1))) >> MONO_SHIFT; right += ((sample * pan_r) * volume + ((int64_t)1 << (MONO_SHIFT-1))) >> MONO_SHIFT; } accum_buffer[i*2+0] = left; accum_buffer[i*2+1] = right; } } // for (int ch = 0 ... num_channels-1) int16_t *buffer16 = (int16_t *)buffer; for (int i = 0; i < samples*2; i++) { const int32_t sample = accum_buffer[i]; if (UNLIKELY(sample > 0x7FFF)) { buffer16[i] = 0x7FFF; } else if (UNLIKELY(sample < -0x8000)) { buffer16[i] = -0x8000; } else { buffer16[i] = sample; } } #endif // SIL_ARCH_* #ifdef MIX_TIMING const double time_end = time_now(); timing_lock += time_decode - time_lock; timing_lock_max = max(timing_lock_max, time_decode - time_lock); timing_decode += time_render - time_decode; timing_render += time_end - time_render; timing_samples += samples; for (int ch = 0; ch < num_channels; ch++) { if (copy_channels[ch].playing && copy_channels[ch].volume != 0) { timing_channels++; } } timing_count++; if (timing_samples >= mix_rate) { const double total_time = (double)timing_samples / (double)mix_rate; DLOG("Mixer stats (%d calls):\n" " Avg. channels: %.2f\n" " Lock wait: %5.2fms/call (%.2f%% CPU), max %5.2fms\n" " Audio decode: %5.2fms/call (%.2f%% CPU)\n" " Audio render: %5.2fms/call (%.2f%% CPU), %.3fus/sample", timing_count, (float)timing_channels / (float)timing_count, (timing_lock / timing_count) * 1000, (timing_lock / total_time) * 100, timing_lock_max * 1000, (timing_decode / timing_count) * 1000, (timing_decode / total_time) * 100, (timing_render / timing_count) * 1000, (timing_render / total_time) * 100, (timing_render / timing_samples) * 1000000); timing_lock = 0; timing_lock_max = 0; timing_decode = 0; timing_render = 0; timing_samples = 0; timing_channels = 0; timing_count = 0; } #endif } /*************************************************************************/ /*************************************************************************/