;******************************************************************************
;* optimized audio functions
;* Copyright (c) 2008 Loren Merritt
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_TEXT

;-----------------------------------------------------------------------------
; int ff_scalarproduct_int16(int16_t *v1, int16_t *v2, int order)
;
; Sums v1[i]*v2[i] over the whole vector and returns the 32-bit total.
; Both vectors are walked with a single negative index: the pointers are
; advanced to one-past-the-end and the (byte) index counts up from -order*2
; to 0, so the loop needs only one add+branch per iteration.
;
; NOTE(review): the loop consumes 2*mmsize bytes per iteration with no
; scalar tail, so order is presumably required to be a multiple of the
; unroll width (8 int16s for MMX, 16 for SSE2) — confirm against callers.
; Loads from v1 use movu, but the pmaddwd memory operands on v2 imply the
; usual alignment requirement for mmsize == 16 — verify v2 alignment.
;-----------------------------------------------------------------------------
%macro SCALARPRODUCT 0
; int ff_scalarproduct_int16(int16_t *v1, int16_t *v2, int order)
cglobal scalarproduct_int16, 3,3,3, v1, v2, order
    shl orderq, 1               ; element count -> byte count (int16)
    add v1q, orderq             ; point both vectors one-past-the-end
    add v2q, orderq
    neg orderq                  ; index runs from -bytes up to 0
    pxor m2, m2                 ; m2 = dword accumulator, start at 0
.loop:
    movu    m0, [v1q + orderq]
    movu    m1, [v1q + orderq + mmsize]
    pmaddwd m0, [v2q + orderq]          ; pairwise i16*i16 -> adjacent-sum i32
    pmaddwd m1, [v2q + orderq + mmsize]
    paddd   m2, m0
    paddd   m2, m1
    add     orderq, mmsize*2
    jl .loop                    ; loop while index still negative (SF from add)
    HADDD   m2, m0              ; horizontal reduce dword lanes into low dword
    movd    eax, m2             ; return value in eax
%if mmsize == 8
    emms                        ; MMX build: restore x87 state before returning
%endif
    RET
%endmacro

INIT_MMX mmxext
SCALARPRODUCT
INIT_XMM sse2
SCALARPRODUCT


;-----------------------------------------------------------------------------
; void ff_vector_clip_int32(int32_t *dst, const int32_t *src, int32_t min,
;                           int32_t max, unsigned int len)
;
; Clamps every element of src into [min, max] and stores it to dst.
; All loads/stores use mova, so src and dst must be mmsize-aligned and len
; is presumably a multiple of the per-iteration element count — confirm
; against the C-side contract.
;-----------------------------------------------------------------------------

; %1 = number of xmm registers used
; %2 = number of inline load/process/store loops per asm loop
; %3 = process 4*mmsize (%3=0) or 8*mmsize (%3=1) bytes per loop
;      (%3=1 uses m7-m10 and is only instantiated when m8 exists, i.e. x86-64)
; %4 = CLIPD function takes min/max as float instead of int (CLIPD_SSE2)
; %5 = suffix appended to the symbol name (e.g. _int)
%macro VECTOR_CLIP_INT32 4-5
cglobal vector_clip_int32%5, 5,5,%1, dst, src, min, max, len
%if %4
    ; float-compare variant: reinterpret the int bounds as scalar floats
    cvtsi2ss m4, minm
    cvtsi2ss m5, maxm
%else
    movd m4, minm
    movd m5, maxm
%endif
    SPLATD m4                   ; broadcast min to every dword lane
    SPLATD m5                   ; broadcast max to every dword lane
.loop:
%assign %%i 0
%rep %2
    ; load 4 (or 8, if %3) vectors, clip each, store them back
    mova m0, [srcq+mmsize*(0+%%i)]
    mova m1, [srcq+mmsize*(1+%%i)]
    mova m2, [srcq+mmsize*(2+%%i)]
    mova m3, [srcq+mmsize*(3+%%i)]
%if %3
    mova m7, [srcq+mmsize*(4+%%i)]
    mova m8, [srcq+mmsize*(5+%%i)]
    mova m9, [srcq+mmsize*(6+%%i)]
    mova m10, [srcq+mmsize*(7+%%i)]
%endif
    CLIPD m0, m4, m5, m6        ; clamp into [m4, m5]; m6 is scratch
    CLIPD m1, m4, m5, m6
    CLIPD m2, m4, m5, m6
    CLIPD m3, m4, m5, m6
%if %3
    CLIPD m7, m4, m5, m6
    CLIPD m8, m4, m5, m6
    CLIPD m9, m4, m5, m6
    CLIPD m10, m4, m5, m6
%endif
    mova [dstq+mmsize*(0+%%i)], m0
    mova [dstq+mmsize*(1+%%i)], m1
    mova [dstq+mmsize*(2+%%i)], m2
    mova [dstq+mmsize*(3+%%i)], m3
%if %3
    mova [dstq+mmsize*(4+%%i)], m7
    mova [dstq+mmsize*(5+%%i)], m8
    mova [dstq+mmsize*(6+%%i)], m9
    mova [dstq+mmsize*(7+%%i)], m10
%endif
%assign %%i %%i+4*(%3+1)
%endrep
    ; advance by the bytes processed per .loop iteration; len counts elements
    ; (mmsize bytes hold mmsize/4 dwords, hence the factor-4 difference)
    add srcq, mmsize*4*(%2+%3)
    add dstq, mmsize*4*(%2+%3)
    sub lend, mmsize*(%2+%3)
    jg .loop
    REP_RET
%endmacro

INIT_MMX mmx
%define CLIPD CLIPD_MMX
VECTOR_CLIP_INT32 0, 1, 0, 0
INIT_XMM sse2
; SSE2 has no packed-i32 min/max, so two variants exist: an integer one
; (_int suffix) and one that compares the bit patterns as floats.
VECTOR_CLIP_INT32 6, 1, 0, 0, _int
%define CLIPD CLIPD_SSE2
VECTOR_CLIP_INT32 6, 2, 0, 1
INIT_XMM sse4
%define CLIPD CLIPD_SSE41
%ifdef m8
; x86-64: 16 xmm registers available, unroll twice as wide
VECTOR_CLIP_INT32 11, 1, 1, 0
%else
VECTOR_CLIP_INT32 6, 1, 0, 0
%endif

;-----------------------------------------------------
;void ff_vector_clipf(float *dst, const float *src,
;                     float min, float max, int len)
;
; Clamps len floats from src into [min, max] and writes them to dst.
; mova loads/stores require 16-byte-aligned src/dst; the loop handles
; 4*mmsize bytes (16 floats) per iteration with no tail, so len is
; presumably a multiple of 16 — confirm against callers.
;-----------------------------------------------------
INIT_XMM sse
%if UNIX64
; SysV: min/max already arrive in xmm0/xmm1, only 3 GPR args needed
cglobal vector_clipf, 3,3,6, dst, src, len
%else
cglobal vector_clipf, 5,5,6, dst, src, min, max, len
%endif
%if WIN64
    ; Win64 passes the float args positionally in xmm2/xmm3; move to m0/m1
    SWAP 0, 2
    SWAP 1, 3
%elif ARCH_X86_32
    ; x86-32: all args on the stack; load the scalar bounds
    movss m0, minm
    movss m1, maxm
%endif
    SPLATD m0                   ; broadcast min to all 4 lanes
    SPLATD m1                   ; broadcast max to all 4 lanes
    shl lend, 2                 ; element count -> byte count (float)
    add srcq, lenq              ; point one-past-the-end, index negatively
    add dstq, lenq
    neg lenq
.loop:
    mova m2, [srcq+lenq+mmsize*0]
    mova m3, [srcq+lenq+mmsize*1]
    mova m4, [srcq+lenq+mmsize*2]
    mova m5, [srcq+lenq+mmsize*3]
    maxps m2, m0                ; clamp from below: x = max(x, min)
    maxps m3, m0
    maxps m4, m0
    maxps m5, m0
    minps m2, m1                ; clamp from above: x = min(x, max)
    minps m3, m1
    minps m4, m1
    minps m5, m1
    mova [dstq+lenq+mmsize*0], m2
    mova [dstq+lenq+mmsize*1], m3
    mova [dstq+lenq+mmsize*2], m4
    mova [dstq+lenq+mmsize*3], m5
    add lenq, mmsize*4
    jl .loop                    ; loop while the negative index is below 0
    REP_RET