;******************************************************************************
;* Copyright (c) 2010 David Conrad
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA
; Packed constants used as memory operands by the kernels below.
pw_3:   times 8  dw 3       ; half-pel filter: outer multiplier
pw_7:   times 8  dw 7       ; half-pel filter: tap multiplier
pw_16:  times 8  dw 16      ; half-pel filter: rounding term added before >> 5
pw_32:  times 8  dw 32      ; add_rect_clamped: rounding term added before >> 6
pb_128: times 16 db 128     ; put_signed_rect_clamped: byte bias added after packing

section .text

;------------------------------------------------------------------------------
; UNPACK_ADD lo, hi, srcA, srcB, movA, movB
;   %1 (lo)   register receiving the word sums of the low-half bytes
;   %2 (hi)   register receiving the word sums of the high-half bytes
;   %3, %4    memory operands holding the two byte vectors to add
;   %5, %6    move suffix for each load ("a" or "u" in this file)
;
; Loads both byte vectors, widens each byte to a word by interleaving with m7,
; and adds the two widened vectors.
; Requires: m7 == 0 (every caller clears it with pxor before the loop).
; Clobbers: m4, m5.
;------------------------------------------------------------------------------
%macro UNPACK_ADD 6
    mov%5       %1, %3
    mov%6       m5, %4
    mova        m4, %1          ; m4 = copy of srcA, used for the high half
    mova        %2, m5          ; %2 = copy of srcB, used for the high half
    punpcklbw   %1, m7          ; low bytes of srcA  -> words
    punpcklbw   m5, m7          ; low bytes of srcB  -> words
    punpckhbw   m4, m7          ; high bytes of srcA -> words
    punpckhbw   %2, m7          ; high bytes of srcB -> words
    paddw       %1, m5          ; lo = srcA.lo + srcB.lo
    paddw       %2, m4          ; hi = srcB.hi + srcA.hi
%endmacro

;------------------------------------------------------------------------------
; HPEL_FILTER suffix
;   Emits dirac_hpel_filter_v_<suffix> and dirac_hpel_filter_h_<suffix>.
;
; Both functions evaluate, for mmsize pixels per iteration:
;   ( 3*(7*(s[0]+s[1]) + (s[-2]+s[3])) - 7*(s[-1]+s[2]) - (s[-3]+s[4]) + 16 ) >> 5
; and pack the result to bytes with unsigned saturation (packuswb).
; For the vertical filter s[k] is the sample k rows away (k*stride bytes);
; for the horizontal filter s[k] is the sample k bytes away.
;------------------------------------------------------------------------------
%macro HPEL_FILTER 1
; dirac_hpel_filter_v_sse2(uint8_t *dst, uint8_t *src, int stride, int width);
; All loads and the store use aligned moves (mova).
; NOTE(review): strideq is used without sign-extending the int argument,
; unlike PUT_RECT/ADD_RECT below, which movsxd their strides on x86-64.
; Confirm callers guarantee clean upper bits.
cglobal dirac_hpel_filter_v_%1, 4,6,8, dst, src, stride, width, src0, stridex3
    mov         src0q, srcq
    lea         stridex3q, [3*strideq]
    sub         src0q, stridex3q        ; src0 = src - 3*stride (row -3)
    pxor        m7, m7                  ; zero register required by UNPACK_ADD
.loop:
    ; 7*(src[0] + src[1])
    UNPACK_ADD  m0, m1, [srcq], [srcq + strideq], a,a
    pmullw      m0, [pw_7]
    pmullw      m1, [pw_7]

    ; 3*( ... + src[-2] + src[3])
    UNPACK_ADD  m2, m3, [src0q + strideq], [srcq + stridex3q], a,a
    paddw       m0, m2
    paddw       m1, m3
    pmullw      m0, [pw_3]
    pmullw      m1, [pw_3]

    ; ... - 7*(src[-1] + src[2])
    UNPACK_ADD  m2, m3, [src0q + strideq*2], [srcq + strideq*2], a,a
    pmullw      m2, [pw_7]
    pmullw      m3, [pw_7]
    psubw       m0, m2
    psubw       m1, m3

    ; ... - (src[-3] + src[4])
    UNPACK_ADD  m2, m3, [src0q], [srcq + strideq*4], a,a
    psubw       m0, m2
    psubw       m1, m3

    ; round, scale down and pack to unsigned bytes
    paddw       m0, [pw_16]
    paddw       m1, [pw_16]
    psraw       m0, 5
    psraw       m1, 5
    packuswb    m0, m1
    mova        [dstq], m0
    add         dstq, mmsize
    add         srcq, mmsize
    add         src0q, mmsize
    sub         widthd, mmsize          ; loop while pixels remain (width > 0)
    jg          .loop
    RET

; dirac_hpel_filter_h_sse2(uint8_t *dst, uint8_t *src, int width);
; Walks the row from the last mmsize-sized block down to offset 0.
; Source loads are unaligned (movu); the store to dst uses mova.
cglobal dirac_hpel_filter_h_%1, 3,3,8, dst, src, width
    dec         widthd
    pxor        m7, m7                  ; zero register required by UNPACK_ADD
    and         widthd, ~(mmsize-1)     ; width = offset of the last block: (width-1) & ~(mmsize-1)
.loop:
    ; 7*(src[0] + src[1])
    UNPACK_ADD  m0, m1, [srcq + widthq], [srcq + widthq + 1], u,u
    pmullw      m0, [pw_7]
    pmullw      m1, [pw_7]

    ; 3*( ... + src[-2] + src[3])
    UNPACK_ADD  m2, m3, [srcq + widthq - 2], [srcq + widthq + 3], u,u
    paddw       m0, m2
    paddw       m1, m3
    pmullw      m0, [pw_3]
    pmullw      m1, [pw_3]

    ; ... - 7*(src[-1] + src[2])
    UNPACK_ADD  m2, m3, [srcq + widthq - 1], [srcq + widthq + 2], u,u
    pmullw      m2, [pw_7]
    pmullw      m3, [pw_7]
    psubw       m0, m2
    psubw       m1, m3

    ; ... - (src[-3] + src[4])
    UNPACK_ADD  m2, m3, [srcq + widthq - 3], [srcq + widthq + 4], u,u
    psubw       m0, m2
    psubw       m1, m3

    ; round, scale down and pack to unsigned bytes
    paddw       m0, [pw_16]
    paddw       m1, [pw_16]
    psraw       m0, 5
    psraw       m1, 5
    packuswb    m0, m1
    mova        [dstq + widthq], m0
    sub         widthd, mmsize          ; step to the previous block; offset 0 is still processed
    jge         .loop
    RET
%endmacro

;------------------------------------------------------------------------------
; PUT_RECT suffix
;   Emits put_signed_rect_clamped_<suffix>.
;
; Packs int16 samples to bytes with signed saturation (packsswb) and adds 128
; to every byte, writing two rows per outer iteration.
; The width is rounded up to a multiple of mmsize, and each row is processed
; from its last block back to offset 0.
; src rows are src_stride*2 bytes apart; dst rows are dst_stride bytes apart.
;------------------------------------------------------------------------------
%macro PUT_RECT 1
; void put_rect_clamped(uint8_t *dst, int dst_stride, int16_t *src, int src_stride, int width, int height)
cglobal put_signed_rect_clamped_%1, 5,9,3, dst, dst_stride, src, src_stride, w, dst2, src2
    mova        m0, [pb_128]
    add         wd, (mmsize-1)
    and         wd, ~(mmsize-1)         ; round width up to a multiple of mmsize

%if ARCH_X86_64
    movsxd      dst_strideq, dst_strided
    movsxd      src_strideq, src_strided
    mov         r7d, r5m                ; height is only loaded here (cglobal loads 5 args)
    mov         r8d, wd                 ; keep the rounded width for reloading per row pair
    %define wspill r8d
    %define hd r7d
%else
    mov         r4m, wd                 ; spill the rounded width back to its argument slot
    %define wspill r4m
    %define hd r5mp                     ; height stays in its argument slot
%endif

.loopy:
    lea         src2q, [srcq+src_strideq*2]     ; second source row of the pair
    lea         dst2q, [dstq+dst_strideq]       ; second destination row of the pair
.loopx:
    sub         wd, mmsize              ; sets the flags tested by jg below (SIMD ops leave them alone)
    mova        m1, [srcq +2*wq]
    mova        m2, [src2q+2*wq]
    packsswb    m1, [srcq +2*wq+mmsize]
    packsswb    m2, [src2q+2*wq+mmsize]
    paddb       m1, m0
    paddb       m2, m0
    mova        [dstq +wq], m1
    mova        [dst2q+wq], m2
    jg          .loopx

    lea         srcq, [srcq+src_strideq*4]      ; advance two source rows
    lea         dstq, [dstq+dst_strideq*2]      ; advance two destination rows
    sub         hd, 2
    mov         wd, wspill              ; mov preserves the flags from "sub hd, 2"
    jg          .loopy
    RET
%endm

;------------------------------------------------------------------------------
; ADD_RECT suffix
;   Emits add_rect_clamped_<suffix>.
;
; For each sample: dst = packuswb( ((src + 32) >> 6) + idwt ), i.e. the sum is
; saturated to an unsigned byte. One row per outer iteration.
; The width is rounded up to a multiple of mmsize, and each row is processed
; from its last block back to offset 0.
; src rows are 2*stride bytes apart, dst rows stride bytes apart and idwt rows
; 2*idwt_stride bytes apart.
;------------------------------------------------------------------------------
%macro ADD_RECT 1
; void add_rect_clamped(uint8_t *dst, uint16_t *src, int stride, int16_t *idwt, int idwt_stride, int width, int height)
cglobal add_rect_clamped_%1, 7,9,3, dst, src, stride, idwt, idwt_stride, w, h
    mova        m0, [pw_32]
    add         wd, (mmsize-1)
    and         wd, ~(mmsize-1)         ; round width up to a multiple of mmsize

%if ARCH_X86_64
    movsxd      strideq, strided
    movsxd      idwt_strideq, idwt_strided
    mov         r8d, wd                 ; keep the rounded width for reloading per row
    %define wspill r8d
%else
    mov         r5m, wd                 ; spill the rounded width back to its argument slot
    %define wspill r5m
%endif

; .loop is the target of both the column loop and the row loop.
.loop:
    sub         wd, mmsize              ; sets the flags tested by the first jg below
    movu        m1, [srcq +2*wq] ; FIXME: ensure alignment
    paddw       m1, m0
    psraw       m1, 6
    movu        m2, [srcq +2*wq+mmsize] ; FIXME: ensure alignment
    paddw       m2, m0
    psraw       m2, 6
    paddw       m1, [idwtq+2*wq]
    paddw       m2, [idwtq+2*wq+mmsize]
    packuswb    m1, m2
    mova        [dstq +wq], m1
    jg          .loop                   ; more columns in this row

    lea         srcq, [srcq + 2*strideq]
    add         dstq, strideq
    lea         idwtq, [idwtq+ 2*idwt_strideq]
    sub         hd, 1
    mov         wd, wspill              ; mov preserves the flags from "sub hd, 1"
    jg          .loop                   ; more rows
    RET
%endm

;------------------------------------------------------------------------------
; ADD_OBMC width, suffix
;   Emits add_dirac_obmc<width>_<suffix>.
;
; For each of yblen rows and each of <width> pixels:
;   dst[x] += src[x] * obmc_weight[x]      (16-bit wrapping arithmetic)
; The row body is unrolled width/mmsize times.
; src rows are stride bytes apart, dst rows 2*stride bytes apart, and the
; weight pointer advances by a fixed 32 bytes per row whatever the width.
;
; NOTE(review): strideq is used without sign-extending the int argument,
; unlike PUT_RECT/ADD_RECT above. Confirm callers guarantee clean upper bits.
; NOTE(review): cglobal requests 6 arguments while the prototype lists 5.
; This looks harmless but may be unintentional; verify before changing.
;------------------------------------------------------------------------------
%macro ADD_OBMC 2
; void add_obmc(uint16_t *dst, uint8_t *src, int stride, uint8_t *obmc_weight, int yblen)
cglobal add_dirac_obmc%1_%2, 6,6,5, dst, src, stride, obmc, yblen
    pxor        m4, m4                  ; zero register for byte -> word unpacking
.loop:
%assign i 0
%rep %1 / mmsize
    mova        m0, [srcq+i]
    mova        m1, m0
    punpcklbw   m0, m4                  ; src low bytes  -> words
    punpckhbw   m1, m4                  ; src high bytes -> words
    mova        m2, [obmcq+i]
    mova        m3, m2
    punpcklbw   m2, m4                  ; weight low bytes  -> words
    punpckhbw   m3, m4                  ; weight high bytes -> words
    pmullw      m0, m2
    pmullw      m1, m3
    movu        m2, [dstq+2*i]
    movu        m3, [dstq+2*i+mmsize]
    paddw       m0, m2
    paddw       m1, m3
    movu        [dstq+2*i], m0
    movu        [dstq+2*i+mmsize], m1
%assign i i+mmsize
%endrep
    lea         srcq, [srcq+strideq]
    lea         dstq, [dstq+2*strideq]
    add         obmcq, 32               ; fixed 32-byte step between weight rows
    sub         yblend, 1
    jg          .loop
    RET
%endm

; MMX instantiations. Everything except the 8-pixel OBMC kernel is emitted
; only when not building for x86-64.
INIT_MMX
%if ARCH_X86_64 == 0
PUT_RECT mmx
ADD_RECT mmx

HPEL_FILTER mmx
ADD_OBMC 32, mmx
ADD_OBMC 16, mmx
%endif
ADD_OBMC 8, mmx

; SSE2 instantiations, emitted for every target.
INIT_XMM
PUT_RECT sse2
ADD_RECT sse2

HPEL_FILTER sse2
ADD_OBMC 32, sse2
ADD_OBMC 16, sse2