;*****************************************************************************
;* SSE2-optimized weighted prediction code
;*****************************************************************************
;* Copyright (c) 2004-2005 Michael Niedermayer, Loren Merritt
;* Copyright (C) 2010 Eli Friedman <eli.friedman@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION .text

;-----------------------------------------------------------------------------
; biweight pred:
;
; void ff_h264_biweight_16_sse2(uint8_t *dst, uint8_t *src, int stride,
;                               int height, int log2_denom, int weightd,
;                               int weights, int offset);
; and
; void ff_h264_weight_16_sse2(uint8_t *dst, int stride, int height,
;                             int log2_denom, int weight, int offset);
;
; Register roles (x86inc numbering, rN = N-th argument of the prototype):
;   weight:   r0 = dst, r1 = stride, r2 = height, r3 = log2_denom,
;             r4 = weight, r5 = offset
;   biweight: r0 = dst, r1 = src, r2 = stride, r3 = height, r4 = log2_denom,
;             r5 = weightd, r6 = weights, r7m = offset
;
; All scalar arguments are C ints.  On x86-64 the upper 32 bits of the
; register holding an int are unspecified, so integer arithmetic on them
; uses the 32-bit ("d") register names and the stride is sign-extended
; before it is added to a pointer.
;-----------------------------------------------------------------------------

; SIGN_EXTEND_STRIDE reg, regd
; Sign-extends the int stride to pointer width on x86-64.  Expands to nothing
; on x86-32, where the register already is pointer-sized.
%macro SIGN_EXTEND_STRIDE 2
%if ARCH_X86_64
    movsxd     %1, %2
%endif
%endmacro

; WEIGHT_SETUP
; In:  r3d = log2_denom, r4d = weight, r5d = offset
; Out: m3 = weight in every word lane
;      m5 = ((2*offset + 1) << log2_denom) >> 1 in every word lane
;      m6 = log2_denom (shift count), m7 = 0
; Clobbers r5d.
%macro WEIGHT_SETUP 0
    add        r5d, r5d
    inc        r5d                  ; r5d = 2*offset + 1
    movd       m3, r4d
    movd       m5, r5d
    movd       m6, r3d
    pslld      m5, m6
    psrld      m5, 1                ; offset term plus half of the divisor
%if mmsize == 16
    pshuflw    m3, m3, 0
    pshuflw    m5, m5, 0
    punpcklqdq m3, m3
    punpcklqdq m5, m5
%else
    pshufw     m3, m3, 0
    pshufw     m5, m5, 0
%endif
    pxor       m7, m7               ; zero register for byte -> word unpacking
%endmacro

; WEIGHT_OP off0, off1
; Loads mmsize/2 bytes from [r0+off0] and from [r0+off1], widens them to
; words, computes (pixel * m3 + m5) >> m6 with signed-saturating addition and
; packs both halves into m0 with unsigned saturation.
; Clobbers m0, m1.  Expects the state produced by WEIGHT_SETUP.
%macro WEIGHT_OP 2
    movh       m0, [r0+%1]
    movh       m1, [r0+%2]
    punpcklbw  m0, m7
    punpcklbw  m1, m7
    pmullw     m0, m3
    pmullw     m1, m3
    paddsw     m0, m5
    paddsw     m1, m5
    psraw      m0, m6
    psraw      m1, m6
    packuswb   m0, m1
%endmacro

; 16-pixel-wide rows, processed as two 8-byte MMX stores per row.
INIT_MMX mmxext
cglobal h264_weight_16, 6, 6, 0
    SIGN_EXTEND_STRIDE r1, r1d
    WEIGHT_SETUP
.nextrow:
    WEIGHT_OP  0,  4
    mova       [r0  ], m0
    WEIGHT_OP  8, 12
    mova       [r0+8], m0
    add        r0, r1
    dec        r2d
    jnz .nextrow
    REP_RET

; WEIGHT_FUNC_MM width, xmm_regs
; One full register (mmsize bytes) per row, one row per iteration.
%macro WEIGHT_FUNC_MM 2
cglobal h264_weight_%1, 6, 6, %2
    SIGN_EXTEND_STRIDE r1, r1d
    WEIGHT_SETUP
.nextrow:
    WEIGHT_OP  0, mmsize/2
    mova       [r0], m0
    add        r0, r1
    dec        r2d
    jnz .nextrow
    REP_RET
%endmacro

INIT_MMX mmxext
WEIGHT_FUNC_MM  8, 0
INIT_XMM sse2
WEIGHT_FUNC_MM 16, 8

; WEIGHT_FUNC_HALF_MM width, xmm_regs
; Rows are half a register wide, so two rows are processed per iteration:
; the low half of m0 holds row y, the high half row y+1.
; r3 is reused as 2*stride once log2_denom has been copied into m6.
%macro WEIGHT_FUNC_HALF_MM 2
cglobal h264_weight_%1, 6, 6, %2
    SIGN_EXTEND_STRIDE r1, r1d
    WEIGHT_SETUP
    sar        r2d, 1               ; iterations = height / 2
    lea        r3, [r1*2]
.nextrow:
    WEIGHT_OP  0, r1
    movh       [r0], m0
%if mmsize == 16
    movhps     [r0+r1], m0
%else
    psrlq      m0, 32
    movh       [r0+r1], m0
%endif
    add        r0, r3
    dec        r2d
    jnz .nextrow
    REP_RET
%endmacro

INIT_MMX mmxext
WEIGHT_FUNC_HALF_MM 4, 0
INIT_XMM sse2
WEIGHT_FUNC_HALF_MM 8, 8

; BIWEIGHT_SETUP
; In:  r4d = log2_denom, r5d = weightd, r6d = weights, r7m = offset
; Out: m5 = (((offset + 1) | 1) << (log2_denom + 1)) >> 1 in every word lane
;      m6 = log2_denom + 1 (shift count)
;      non-SSSE3: m3 = weightd, m4 = weights in every word lane, m7 = 0
;      SSSE3:     m4 = (weightd, weights) byte pairs for pmaddubsw
; Clobbers r4d, r5d, r6d, off_regd (r7d on x86-64, r3d on x86-32 - callers
; reload r3d from r3m afterwards) and, on SSSE3, m0.
;
; A weight of 128 does not fit the signed byte that pmaddubsw expects, so
; whenever either weight equals 128 both weights and the offset term are
; halved and the shift is reduced by one.
; NOTE(review): this assumes 128 only occurs together with an even
; counterpart weight (otherwise halving drops a bit) - confirm against the
; caller.
%macro BIWEIGHT_SETUP 0
%if ARCH_X86_64
%define off_regd r7d
%else
%define off_regd r3d
%endif
    mov        off_regd, r7m
    add        off_regd, 1
    or         off_regd, 1
    add        r4d, 1
    cmp        r6d, 128
    je .nonnormal
    cmp        r5d, 128
    jne .normal
.nonnormal:
    sar        r5d, 1
    sar        r6d, 1
    sar        off_regd, 1
    sub        r4d, 1
.normal:
%if cpuflag(ssse3)
    movd       m4, r5d
    movd       m0, r6d
%else
    movd       m3, r5d
    movd       m4, r6d
%endif
    movd       m5, off_regd
    movd       m6, r4d
    pslld      m5, m6
    psrld      m5, 1
%if cpuflag(ssse3)
    punpcklbw  m4, m0               ; low word = (weightd, weights) byte pair
    pshuflw    m4, m4, 0
    pshuflw    m5, m5, 0
    punpcklqdq m4, m4
    punpcklqdq m5, m5

%else
%if mmsize == 16
    pshuflw    m3, m3, 0
    pshuflw    m4, m4, 0
    pshuflw    m5, m5, 0
    punpcklqdq m3, m3
    punpcklqdq m4, m4
    punpcklqdq m5, m5
%else
    pshufw     m3, m3, 0
    pshufw     m4, m4, 0
    pshufw     m5, m5, 0
%endif
    pxor       m7, m7               ; zero register for byte -> word unpacking
%endif
%endmacro

; BIWEIGHT_STEPA acc, tmp, off
; m<acc> = dst[off..] * m3 + src[off..] * m4 as words (signed-saturating sum).
; Clobbers m<tmp>.
%macro BIWEIGHT_STEPA 3
    movh       m%1, [r0+%3]
    movh       m%2, [r1+%3]
    punpcklbw  m%1, m7
    punpcklbw  m%2, m7
    pmullw     m%1, m3
    pmullw     m%2, m4
    paddsw     m%1, m%2
%endmacro

; BIWEIGHT_STEPB
; Adds the offset term m5 to the sums in m0/m1, shifts right by m6 and packs
; both into m0 with unsigned saturation.
%macro BIWEIGHT_STEPB 0
    paddsw     m0, m5
    paddsw     m1, m5
    psraw      m0, m6
    psraw      m1, m6
    packuswb   m0, m1
%endmacro

; 16-pixel-wide rows, processed as two 8-byte MMX stores per row.
INIT_MMX mmxext
cglobal h264_biweight_16, 7, 8, 0
    SIGN_EXTEND_STRIDE r2, r2d
    BIWEIGHT_SETUP
    movifnidn r3d, r3m
.nextrow:
    BIWEIGHT_STEPA 0, 1, 0
    BIWEIGHT_STEPA 1, 2, 4
    BIWEIGHT_STEPB
    mova       [r0], m0
    BIWEIGHT_STEPA 0, 1, 8
    BIWEIGHT_STEPA 1, 2, 12
    BIWEIGHT_STEPB
    mova       [r0+8], m0
    add        r0, r2
    add        r1, r2
    dec        r3d
    jnz .nextrow
    REP_RET

; BIWEIGHT_FUNC_MM width, xmm_regs
; One full register (mmsize bytes) per row, one row per iteration.
%macro BIWEIGHT_FUNC_MM 2
cglobal h264_biweight_%1, 7, 8, %2
    SIGN_EXTEND_STRIDE r2, r2d
    BIWEIGHT_SETUP
    movifnidn r3d, r3m
.nextrow:
    BIWEIGHT_STEPA 0, 1, 0
    BIWEIGHT_STEPA 1, 2, mmsize/2
    BIWEIGHT_STEPB
    mova       [r0], m0
    add        r0, r2
    add        r1, r2
    dec        r3d
    jnz .nextrow
    REP_RET
%endmacro

INIT_MMX mmxext
BIWEIGHT_FUNC_MM  8, 0
INIT_XMM sse2
BIWEIGHT_FUNC_MM 16, 8

; BIWEIGHT_FUNC_HALF_MM width, xmm_regs
; Rows are half a register wide, so two rows are processed per iteration.
; r4 is reused as 2*stride once log2_denom has been copied into m6.
%macro BIWEIGHT_FUNC_HALF_MM 2
cglobal h264_biweight_%1, 7, 8, %2
    SIGN_EXTEND_STRIDE r2, r2d
    BIWEIGHT_SETUP
    movifnidn r3d, r3m
    sar        r3d, 1               ; iterations = height / 2
    lea        r4, [r2*2]
.nextrow:
    BIWEIGHT_STEPA 0, 1, 0
    BIWEIGHT_STEPA 1, 2, r2
    BIWEIGHT_STEPB
    movh       [r0], m0
%if mmsize == 16
    movhps     [r0+r2], m0
%else
    psrlq      m0, 32
    movh       [r0+r2], m0
%endif
    add        r0, r4
    add        r1, r4
    dec        r3d
    jnz .nextrow
    REP_RET
%endmacro

INIT_MMX mmxext
BIWEIGHT_FUNC_HALF_MM 4, 0
INIT_XMM sse2
BIWEIGHT_FUNC_HALF_MM 8, 8

; BIWEIGHT_SSSE3_OP
; m0 and m2 hold interleaved (dst, src) byte pairs.  pmaddubsw multiplies
; each pair by the (weightd, weights) signed bytes in m4 and sums it into one
; word; the offset term m5 is then added, the result shifted right by m6 and
; both registers packed into m0 with unsigned saturation.
%macro BIWEIGHT_SSSE3_OP 0
    pmaddubsw  m0, m4
    pmaddubsw  m2, m4
    paddsw     m0, m5
    paddsw     m2, m5
    psraw      m0, m6
    psraw      m2, m6
    packuswb   m0, m2
%endmacro

; NOTE(review): punpcklbw with a memory operand and the mova store need
; 16-byte aligned src/dst rows - caller contract is not visible here.
INIT_XMM ssse3
cglobal h264_biweight_16, 7, 8, 8
    SIGN_EXTEND_STRIDE r2, r2d
    BIWEIGHT_SETUP
    movifnidn r3d, r3m

.nextrow:
    movh       m0, [r0]
    movh       m2, [r0+8]
    movh       m3, [r1+8]
    punpcklbw  m0, [r1]
    punpcklbw  m2, m3
    BIWEIGHT_SSSE3_OP
    mova       [r0], m0
    add        r0, r2
    add        r1, r2
    dec        r3d
    jnz .nextrow
    REP_RET

; 8-pixel-wide rows: two rows per iteration, one in each half of m0.
INIT_XMM ssse3
cglobal h264_biweight_8, 7, 8, 8
    SIGN_EXTEND_STRIDE r2, r2d
    BIWEIGHT_SETUP
    movifnidn r3d, r3m
    sar        r3d, 1               ; iterations = height / 2
    lea        r4, [r2*2]

.nextrow:
    movh       m0, [r0]
    movh       m1, [r1]
    movh       m2, [r0+r2]
    movh       m3, [r1+r2]
    punpcklbw  m0, m1
    punpcklbw  m2, m3
    BIWEIGHT_SSSE3_OP
    movh       [r0], m0
    movhps     [r0+r2], m0
    add        r0, r4
    add        r1, r4
    dec        r3d
    jnz .nextrow
    REP_RET