/*
 * x86-optimized AC-3 DSP functions
 * Copyright (c) 2011 Justin Ruggles
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/attributes.h"
#include "libavutil/mem.h"
#include "libavutil/x86/asm.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/ac3.h"
#include "libavcodec/ac3dsp.h"

/*
 * Prototypes for the external assembly implementations (one variant per
 * instruction-set level). The fastest supported variant is selected at
 * runtime in ff_ac3dsp_init_x86() below.
 */
void ff_ac3_exponent_min_mmx   (uint8_t *exp, int num_reuse_blocks, int nb_coefs);
void ff_ac3_exponent_min_mmxext(uint8_t *exp, int num_reuse_blocks, int nb_coefs);
void ff_ac3_exponent_min_sse2  (uint8_t *exp, int num_reuse_blocks, int nb_coefs);

int ff_ac3_max_msb_abs_int16_mmx   (const int16_t *src, int len);
int ff_ac3_max_msb_abs_int16_mmxext(const int16_t *src, int len);
int ff_ac3_max_msb_abs_int16_sse2  (const int16_t *src, int len);
int ff_ac3_max_msb_abs_int16_ssse3 (const int16_t *src, int len);

void ff_ac3_lshift_int16_mmx (int16_t *src, unsigned int len, unsigned int shift);
void ff_ac3_lshift_int16_sse2(int16_t *src, unsigned int len, unsigned int shift);

void ff_ac3_rshift_int32_mmx (int32_t *src, unsigned int len, unsigned int shift);
void ff_ac3_rshift_int32_sse2(int32_t *src, unsigned int len, unsigned int shift);

void ff_float_to_fixed24_3dnow(int32_t *dst, const float *src, unsigned int len);
void ff_float_to_fixed24_sse  (int32_t *dst, const float *src, unsigned int len);
void ff_float_to_fixed24_sse2 (int32_t *dst, const float *src, unsigned int len);

int ff_ac3_compute_mantissa_size_sse2(uint16_t mant_cnt[6][16]);

void ff_ac3_extract_exponents_3dnow(uint8_t *exp, int32_t *coef, int nb_coefs);
void ff_ac3_extract_exponents_sse2 (uint8_t *exp, int32_t *coef, int nb_coefs);
void ff_ac3_extract_exponents_ssse3(uint8_t *exp, int32_t *coef, int nb_coefs);

/* "round" variants apply rounding when scaling; the plain variants are
 * bit-exact (see the bit_exact selection logic in ff_ac3dsp_init_x86). */
void ff_apply_window_int16_round_mmxext(int16_t *output, const int16_t *input,
                                        const int16_t *window, unsigned int len);
void ff_apply_window_int16_round_sse2(int16_t *output, const int16_t *input,
                                      const int16_t *window, unsigned int len);
void ff_apply_window_int16_mmxext(int16_t *output, const int16_t *input,
                                  const int16_t *window, unsigned int len);
void ff_apply_window_int16_sse2(int16_t *output, const int16_t *input,
                                const int16_t *window, unsigned int len);
void ff_apply_window_int16_ssse3(int16_t *output, const int16_t *input,
                                 const int16_t *window, unsigned int len);
void ff_apply_window_int16_ssse3_atom(int16_t *output, const int16_t *input,
                                      const int16_t *window, unsigned int len);

/* ICC on 32-bit cannot satisfy the register pressure of the inline asm
 * below even when HAVE_7REGS is set, so force-disable it for that build. */
#if ARCH_X86_32 && defined(__INTEL_COMPILER)
#   undef HAVE_7REGS
#   define HAVE_7REGS 0
#endif

#if HAVE_SSE_INLINE && HAVE_7REGS

/* Conditional-emission helpers: IF1(x) emits x, IF0(x) emits nothing.
 * They are passed as the mono/stereo arguments of the macros below to
 * build the 5->1 and 5->2 downmix loops from one template. */
#define IF1(x) x
#define IF0(x)

/*
 * Specialized 5-channel downmix loop (SSE inline asm).
 *
 * Exactly one of mono/stereo is IF1. The three mixing coefficients are
 * loaded from matrix offsets 0, 8 and 24 bytes and broadcast across all
 * four lanes (shufps $0); the five channel pointers were pre-advanced by
 * len, so the negative byte counter i runs from -len*4 up to 0 and the
 * loop processes 4 floats per channel per iteration, writing the result
 * back in place over channel 0 (and channel 1 in the stereo case).
 */
#define MIX5(mono, stereo)                                      \
    __asm__ volatile (                                          \
        "movss           0(%1), %%xmm5          \n"             \
        "movss           8(%1), %%xmm6          \n"             \
        "movss          24(%1), %%xmm7          \n"             \
        "shufps     $0, %%xmm5, %%xmm5          \n"             \
        "shufps     $0, %%xmm6, %%xmm6          \n"             \
        "shufps     $0, %%xmm7, %%xmm7          \n"             \
        "1:                                     \n"             \
        "movaps       (%0, %2), %%xmm0          \n"             \
        "movaps       (%0, %3), %%xmm1          \n"             \
        "movaps       (%0, %4), %%xmm2          \n"             \
        "movaps       (%0, %5), %%xmm3          \n"             \
        "movaps       (%0, %6), %%xmm4          \n"             \
        "mulps          %%xmm5, %%xmm0          \n"             \
        "mulps          %%xmm6, %%xmm1          \n"             \
        "mulps          %%xmm5, %%xmm2          \n"             \
        "mulps          %%xmm7, %%xmm3          \n"             \
        "mulps          %%xmm7, %%xmm4          \n"             \
 stereo("addps          %%xmm1, %%xmm0          \n")            \
        "addps          %%xmm1, %%xmm2          \n"             \
        "addps          %%xmm3, %%xmm0          \n"             \
        "addps          %%xmm4, %%xmm2          \n"             \
   mono("addps          %%xmm2, %%xmm0          \n")            \
        "movaps         %%xmm0, (%0, %2)        \n"             \
 stereo("movaps         %%xmm2, (%0, %3)        \n")            \
        "add               $16, %0              \n"             \
        "jl                 1b                  \n"             \
        : "+&r"(i)                                              \
        : "r"(matrix),                                          \
          "r"(samples[0] + len),                                \
          "r"(samples[1] + len),                                \
          "r"(samples[2] + len),                                \
          "r"(samples[3] + len),                                \
          "r"(samples[4] + len)                                 \
        : XMM_CLOBBERS("%xmm0", "%xmm1", "%xmm2", "%xmm3",      \
                       "%xmm4", "%xmm5", "%xmm6", "%xmm7",)     \
          "memory"                                              \
    );

/*
 * Generic downmix loop for arbitrary channel counts (SSE inline asm).
 *
 * Operands: %0 = i (negative byte counter over samples), %1/%2/%3 = the
 * scratch registers j/k/m, %4 = matrix_simd + in_ch (32 bytes of broadcast
 * coefficients per input channel: [ch][0] and [ch][1]), %5 = -4*(in_ch-1),
 * %6 = samp + in_ch, %7 = sizeof(float *), %8 = sizeof(float *)/4.
 *
 * The outer loop (label 1) handles one group of 4 output samples: channel
 * 0 is accumulated first using xmm4/xmm5, then the inner loop (label 2)
 * walks channels 1..in_ch-1 via the negative index k (scaled by %c8 for
 * pointer size, and by 8 to index the 32-byte coefficient entries).
 *
 * NOTE(review): xmm4/xmm5 are never loaded here — they hold channel 0's
 * broadcast coefficients left over from the final iteration of the
 * matrix_simd fill loop in ac3_downmix_sse (which loads offsets 0 and 4
 * last). This relies on the registers surviving between the two asm
 * statements; confirm before reordering any code between them.
 */
#define MIX_MISC(stereo)                                        \
    __asm__ volatile (                                          \
        "mov             %5, %2                 \n"             \
        "1:                                     \n"             \
        "mov -%c7(%6, %2, %c8), %3              \n"             \
        "movaps     (%3, %0), %%xmm0            \n"             \
 stereo("movaps       %%xmm0, %%xmm1            \n")            \
        "mulps        %%xmm4, %%xmm0            \n"             \
 stereo("mulps        %%xmm5, %%xmm1            \n")            \
        "2:                                     \n"             \
        "mov   (%6, %2, %c8), %1                \n"             \
        "movaps     (%1, %0), %%xmm2            \n"             \
 stereo("movaps       %%xmm2, %%xmm3            \n")            \
        "mulps   (%4, %2, 8), %%xmm2            \n"             \
 stereo("mulps 16(%4, %2, 8), %%xmm3            \n")            \
        "addps        %%xmm2, %%xmm0            \n"             \
 stereo("addps        %%xmm3, %%xmm1            \n")            \
        "add              $4, %2                \n"             \
        "jl               2b                    \n"             \
        "mov             %5, %2                 \n"             \
 stereo("mov   (%6, %2, %c8), %1                \n")            \
        "movaps       %%xmm0, (%3, %0)          \n"             \
 stereo("movaps       %%xmm1, (%1, %0)          \n")            \
        "add             $16, %0                \n"             \
        "jl               1b                    \n"             \
        : "+&r"(i), "=&r"(j), "=&r"(k), "=&r"(m)                \
        : "r"(matrix_simd + in_ch),                             \
          "g"((intptr_t) - 4 * (in_ch - 1)),                    \
          "r"(samp + in_ch),                                    \
          "i"(sizeof(float *)), "i"(sizeof(float *)/4)          \
        : "memory"                                              \
    );

/**
 * Downmix multichannel float audio in-place using SSE.
 *
 * @param samples array of in_ch channel-plane pointers; the first out_ch
 *                planes are overwritten with the downmixed result
 * @param matrix  per-input-channel pair of (left, right) mix coefficients
 * @param out_ch  number of output channels (1 or 2 use the fast paths)
 * @param in_ch   number of input channels
 * @param len     number of samples per channel (assumed a multiple of 4,
 *                with 16-byte-aligned planes, as the asm uses movaps —
 *                TODO confirm against callers)
 */
static void ac3_downmix_sse(float **samples, float (*matrix)[2],
                            int out_ch, int in_ch, int len)
{
    /* Reinterpret the float coefficients as ints so the special-case
     * checks below are exact bitwise tests (zero entries / identical
     * entries) rather than floating-point compares.
     * NOTE(review): this is a type-pun read; kept as-is. */
    int (*matrix_cmp)[2] = (int(*)[2])matrix;
    intptr_t i, j, k, m;

    /* Channel pointers are advanced by len inside the asm, so i counts
     * bytes from -len*4 up to 0. */
    i = -len * sizeof(float);
    if (in_ch == 5 && out_ch == 2 &&
        !(matrix_cmp[0][1] | matrix_cmp[2][0]  |
          matrix_cmp[3][1] | matrix_cmp[4][0]  |
          (matrix_cmp[1][0] ^ matrix_cmp[1][1]) |
          (matrix_cmp[0][0] ^ matrix_cmp[2][1]))) {
        /* 5->2 downmix with the common symmetric coefficient pattern:
         * only 3 distinct coefficients are needed. */
        MIX5(IF0, IF1);
    } else if (in_ch == 5 && out_ch == 1 &&
               matrix_cmp[0][0] == matrix_cmp[2][0] &&
               matrix_cmp[3][0] == matrix_cmp[4][0]) {
        /* 5->1 downmix with matching L/R and surround coefficients. */
        MIX5(IF1, IF0);
    } else {
        /* Generic path: broadcast each coefficient pair into a pair of
         * 4-lane vectors (matrix_simd[ch][0..1]) for MIX_MISC. */
        DECLARE_ALIGNED(16, float, matrix_simd)[AC3_MAX_CHANNELS][2][4];
        float *samp[AC3_MAX_CHANNELS];

        for (j = 0; j < in_ch; j++)
            samp[j] = samples[j] + len;

        /* Fill matrix_simd back-to-front; j counts bytes over the
         * in_ch coefficient pairs. The final iteration (offsets 0 and 4)
         * leaves channel 0's broadcast coefficients in xmm4/xmm5, which
         * MIX_MISC reads without reloading. */
        j = 2 * in_ch * sizeof(float);
        __asm__ volatile (
            "1:                                 \n"
            "sub             $8, %0             \n"
            "movss     (%2, %0), %%xmm4         \n"
            "movss    4(%2, %0), %%xmm5         \n"
            "shufps      $0, %%xmm4, %%xmm4     \n"
            "shufps      $0, %%xmm5, %%xmm5     \n"
            "movaps      %%xmm4,   (%1, %0, 4)  \n"
            "movaps      %%xmm5, 16(%1, %0, 4)  \n"
            "jg 1b                              \n"
            : "+&r"(j)
            : "r"(matrix_simd), "r"(matrix)
            : "memory"
        );
        if (out_ch == 2) {
            MIX_MISC(IF1);
        } else {
            MIX_MISC(IF0);
        }
    }
}

#endif /* HAVE_SSE_INLINE && HAVE_7REGS */

/**
 * Initialize the AC-3 DSP function pointers with the fastest x86
 * implementations supported by the host CPU.
 *
 * The checks run from oldest to newest instruction set, so a later
 * (faster) variant overrides an earlier one when available.
 *
 * @param c         context whose function pointers are filled in
 * @param bit_exact nonzero to restrict selection to bit-exact variants
 *                  (skips the rounding window and 3DNow! float conversion)
 */
av_cold void ff_ac3dsp_init_x86(AC3DSPContext *c, int bit_exact)
{
    int cpu_flags = av_get_cpu_flags();

    if (EXTERNAL_MMX(cpu_flags)) {
        c->ac3_exponent_min = ff_ac3_exponent_min_mmx;
        c->ac3_max_msb_abs_int16 = ff_ac3_max_msb_abs_int16_mmx;
        c->ac3_lshift_int16 = ff_ac3_lshift_int16_mmx;
        c->ac3_rshift_int32 = ff_ac3_rshift_int32_mmx;
    }
    if (EXTERNAL_AMD3DNOW(cpu_flags)) {
        if (!bit_exact) {
            /* 3DNow! float->fixed conversion is not bit-exact. */
            c->float_to_fixed24 = ff_float_to_fixed24_3dnow;
        }
    }
    if (EXTERNAL_MMXEXT(cpu_flags)) {
        c->ac3_exponent_min = ff_ac3_exponent_min_mmxext;
        c->ac3_max_msb_abs_int16 = ff_ac3_max_msb_abs_int16_mmxext;
        if (bit_exact) {
            c->apply_window_int16 = ff_apply_window_int16_mmxext;
        } else {
            c->apply_window_int16 = ff_apply_window_int16_round_mmxext;
        }
    }
    if (EXTERNAL_SSE(cpu_flags)) {
        c->float_to_fixed24 = ff_float_to_fixed24_sse;
    }
    if (EXTERNAL_SSE2(cpu_flags)) {
        c->ac3_exponent_min = ff_ac3_exponent_min_sse2;
        c->ac3_max_msb_abs_int16 = ff_ac3_max_msb_abs_int16_sse2;
        c->float_to_fixed24 = ff_float_to_fixed24_sse2;
        c->compute_mantissa_size = ff_ac3_compute_mantissa_size_sse2;
        c->extract_exponents = ff_ac3_extract_exponents_sse2;
        /* On "SSE2-slow" CPUs the 128-bit shift/window kernels are slower
         * than the MMX(EXT) fallbacks, so keep the earlier pointers. */
        if (!(cpu_flags & AV_CPU_FLAG_SSE2SLOW)) {
            c->ac3_lshift_int16 = ff_ac3_lshift_int16_sse2;
            c->ac3_rshift_int32 = ff_ac3_rshift_int32_sse2;
        }
        if (bit_exact) {
            c->apply_window_int16 = ff_apply_window_int16_sse2;
        } else if (!(cpu_flags & AV_CPU_FLAG_SSE2SLOW)) {
            c->apply_window_int16 = ff_apply_window_int16_round_sse2;
        }
    }
    if (EXTERNAL_SSSE3(cpu_flags)) {
        c->ac3_max_msb_abs_int16 = ff_ac3_max_msb_abs_int16_ssse3;
        if (cpu_flags & AV_CPU_FLAG_ATOM) {
            /* Atom-tuned variant; presumably avoids shuffles that are
             * slow on Atom — note the generic SSSE3 extract_exponents is
             * deliberately NOT selected on Atom. */
            c->apply_window_int16 = ff_apply_window_int16_ssse3_atom;
        } else {
            c->extract_exponents = ff_ac3_extract_exponents_ssse3;
            c->apply_window_int16 = ff_apply_window_int16_ssse3;
        }
    }

#if HAVE_SSE_INLINE && HAVE_7REGS
    if (INLINE_SSE(cpu_flags)) {
        c->downmix = ac3_downmix_sse;
    }
#endif
}