;*****************************************************************************
;* MMX optimized DSP utils
;*****************************************************************************
;* Copyright (c) 2000, 2001 Fabrice Bellard
;* Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
;*
;* This file is part of Libav.
;*
;* Libav is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* Libav is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with Libav; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;*****************************************************************************

%include "x86inc.asm"
%include "x86util.asm"

SECTION .text

%macro DIFF_PIXELS_1 4
    movh            %1, %3
    movh            %2, %4
    punpcklbw       %2, %1
    punpcklbw       %1, %1
    psubw           %1, %2
%endmacro

; %1=uint8_t *pix1, %2=uint8_t *pix2, %3=static offset, %4=stride, %5=stride*3
; %6=temporary storage location
; this macro requires $mmsize stack space (aligned) on %6 (except on SSE+x86-64)
%macro DIFF_PIXELS_8 6
    DIFF_PIXELS_1   m0, m7, [%1     +%3], [%2     +%3]
    DIFF_PIXELS_1   m1, m7, [%1+%4  +%3], [%2+%4  +%3]
    DIFF_PIXELS_1   m2, m7, [%1+%4*2+%3], [%2+%4*2+%3]
    add             %1, %5
    add             %2, %5
    DIFF_PIXELS_1   m3, m7, [%1     +%3], [%2     +%3]
    DIFF_PIXELS_1   m4, m7, [%1+%4  +%3], [%2+%4  +%3]
    DIFF_PIXELS_1   m5, m7, [%1+%4*2+%3], [%2+%4*2+%3]
    DIFF_PIXELS_1   m6, m7, [%1+%5  +%3], [%2+%5  +%3]
%ifdef m8
    DIFF_PIXELS_1   m7, m8, [%1+%4*4+%3], [%2+%4*4+%3]
%else
    mova          [%6], m0
    DIFF_PIXELS_1   m7, m0, [%1+%4*4+%3], [%2+%4*4+%3]
    mova            m0, [%6]
%endif
    sub             %1, %5
    sub             %2, %5
%endmacro

%macro HADAMARD8 0
    SUMSUB_BADC w, 0, 1, 2, 3
    SUMSUB_BADC w, 4, 5, 6, 7
    SUMSUB_BADC w, 0, 2, 1, 3
    SUMSUB_BADC w, 4, 6, 5, 7
    SUMSUB_BADC w, 0, 4, 1, 5
    SUMSUB_BADC w, 2, 6, 3, 7
%endmacro

%macro ABS1_SUM 3
    ABS1            %1, %2
    paddusw         %3, %1
%endmacro

%macro ABS2_SUM 6
    ABS2            %1, %2, %3, %4
    paddusw         %5, %1
    paddusw         %6, %2
%endmacro

%macro ABS_SUM_8x8_64 1
    ABS2            m0, m1, m8, m9
    ABS2_SUM        m2, m3, m8, m9, m0, m1
    ABS2_SUM        m4, m5, m8, m9, m0, m1
    ABS2_SUM        m6, m7, m8, m9, m0, m1
    paddusw         m0, m1
%endmacro

%macro ABS_SUM_8x8_32 1
    mova          [%1], m7
    ABS1            m0, m7
    ABS1            m1, m7
    ABS1_SUM        m2, m7, m0
    ABS1_SUM        m3, m7, m1
    ABS1_SUM        m4, m7, m0
    ABS1_SUM        m5, m7, m1
    ABS1_SUM        m6, m7, m0
    mova            m2, [%1]
    ABS1_SUM        m2, m7, m1
    paddusw         m0, m1
%endmacro

; FIXME: HSUM_* saturates at 64k, while an 8x8 hadamard or dct block can get
; up to about 100k on extreme inputs. But that's very unlikely to occur in
; natural video, and it's even more unlikely to not have any alternative
; mvs/modes with lower cost.
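
; For reference, the HSUM_* macros below horizontally reduce a register of
; packed unsigned words to one scalar by repeatedly folding the upper half
; onto the lower half with saturating adds. A rough C sketch of the value
; they compute (illustrative only; hsum_sat is a hypothetical helper, not
; part of this file, and the SIMD tree reduction may saturate in a slightly
; different order):
;
;     static unsigned hsum_sat(const uint16_t *v, int n)
;     {
;         unsigned sum = 0;
;         for (int i = 0; i < n; i++) {
;             sum += v[i];
;             if (sum > 0xFFFF)
;                 sum = 0xFFFF; /* paddusw saturates at 64k, see FIXME above */
;         }
;         return sum;
;     }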
%macro HSUM_MMX 3
    mova            %2, %1
    psrlq           %1, 32
    paddusw         %1, %2
    mova            %2, %1
    psrlq           %1, 16
    paddusw         %1, %2
    movd            %3, %1
%endmacro

%macro HSUM_MMX2 3
    pshufw          %2, %1, 0xE
    paddusw         %1, %2
    pshufw          %2, %1, 0x1
    paddusw         %1, %2
    movd            %3, %1
%endmacro

%macro HSUM_SSE2 3
    movhlps         %2, %1
    paddusw         %1, %2
    pshuflw         %2, %1, 0xE
    paddusw         %1, %2
    pshuflw         %2, %1, 0x1
    paddusw         %1, %2
    movd            %3, %1
%endmacro

%macro STORE4 5
    mova [%1+mmsize*0], %2
    mova [%1+mmsize*1], %3
    mova [%1+mmsize*2], %4
    mova [%1+mmsize*3], %5
%endmacro

%macro LOAD4 5
    mova            %2, [%1+mmsize*0]
    mova            %3, [%1+mmsize*1]
    mova            %4, [%1+mmsize*2]
    mova            %5, [%1+mmsize*3]
%endmacro

%macro hadamard8_16_wrapper 3
cglobal hadamard8_diff_%1, 4, 4, %2
%ifndef m8
    %assign pad %3*mmsize-(4+stack_offset&(mmsize-1))
    SUB             rsp, pad
%endif
    call hadamard8x8_diff_%1
%ifndef m8
    ADD             rsp, pad
%endif
    RET

cglobal hadamard8_diff16_%1, 5, 6, %2
%ifndef m8
    %assign pad %3*mmsize-(4+stack_offset&(mmsize-1))
    SUB             rsp, pad
%endif

    call hadamard8x8_diff_%1
    mov             r5d, eax

    add             r1, 8
    add             r2, 8
    call hadamard8x8_diff_%1
    add             r5d, eax

    cmp             r4d, 16
    jne .done

    lea             r1, [r1+r3*8-8]
    lea             r2, [r2+r3*8-8]
    call hadamard8x8_diff_%1
    add             r5d, eax

    add             r1, 8
    add             r2, 8
    call hadamard8x8_diff_%1
    add             r5d, eax

.done:
    mov             eax, r5d
%ifndef m8
    ADD             rsp, pad
%endif
    RET
%endmacro

%macro HADAMARD8_DIFF_MMX 1
ALIGN 16
; int hadamard8_diff_##cpu(void *s, uint8_t *src1, uint8_t *src2,
;                          int stride, int h)
; r0 = void *s = unused, int h = unused (always 8)
; note how r1, r2 and r3 are not clobbered in this function, so 16x16
; can simply call this four times (2x2); that's also why we access
; rsp+gprsize everywhere, which is the rsp of the calling func
hadamard8x8_diff_%1:
    lea             r0, [r3*3]

    ; first 4x8 pixels
    DIFF_PIXELS_8   r1, r2, 0, r3, r0, rsp+gprsize+0x60
    HADAMARD8
    mova [rsp+gprsize+0x60], m7
    TRANSPOSE4x4W 0, 1, 2, 3, 7
    STORE4 rsp+gprsize, m0, m1, m2, m3
    mova            m7, [rsp+gprsize+0x60]
    TRANSPOSE4x4W 4, 5, 6, 7, 0
    STORE4 rsp+gprsize+0x40, m4, m5, m6, m7

    ; second 4x8 pixels
    DIFF_PIXELS_8   r1, r2, 4, r3, r0, rsp+gprsize+0x60
    HADAMARD8
    mova [rsp+gprsize+0x60], m7
    TRANSPOSE4x4W 0, 1, 2, 3, 7
    STORE4 rsp+gprsize+0x20, m0, m1, m2, m3
    mova            m7, [rsp+gprsize+0x60]
    TRANSPOSE4x4W 4, 5, 6, 7, 0

    LOAD4 rsp+gprsize+0x40, m0, m1, m2, m3
    HADAMARD8
    ABS_SUM_8x8_32 rsp+gprsize+0x60
    mova [rsp+gprsize+0x60], m0

    LOAD4 rsp+gprsize     , m0, m1, m2, m3
    LOAD4 rsp+gprsize+0x20, m4, m5, m6, m7
    HADAMARD8
    ABS_SUM_8x8_32 rsp+gprsize
    paddusw         m0, [rsp+gprsize+0x60]

    HSUM            m0, m1, eax
    and             rax, 0xFFFF
    ret

hadamard8_16_wrapper %1, 0, 14
%endmacro

%macro HADAMARD8_DIFF_SSE2 2
hadamard8x8_diff_%1:
    lea             r0, [r3*3]
    DIFF_PIXELS_8   r1, r2, 0, r3, r0, rsp+gprsize
    HADAMARD8
%ifdef ARCH_X86_64
    TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, 8
%else
    TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, [rsp+gprsize], [rsp+mmsize+gprsize]
%endif
    HADAMARD8
    ABS_SUM_8x8 rsp+gprsize
    HSUM_SSE2       m0, m1, eax
    and             eax, 0xFFFF
    ret

hadamard8_16_wrapper %1, %2, 3
%endmacro

INIT_MMX
%define ABS1 ABS1_MMX
%define HSUM HSUM_MMX
HADAMARD8_DIFF_MMX mmx
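
; For reference, hadamard8_diff16_* above just composes four 8x8 SATD calls
; over the quadrants of a 16x16 block (or two calls when h == 8). Roughly, in
; C (illustrative only; function names are placeholders, not real symbols):
;
;     int hadamard8_diff16(void *s, uint8_t *p1, uint8_t *p2, int stride, int h)
;     {
;         int sum = hadamard8_diff(s, p1,     p2,     stride, 8)
;                 + hadamard8_diff(s, p1 + 8, p2 + 8, stride, 8);
;         if (h == 16)
;             sum += hadamard8_diff(s, p1 + stride * 8,     p2 + stride * 8,     stride, 8)
;                  + hadamard8_diff(s, p1 + stride * 8 + 8, p2 + stride * 8 + 8, stride, 8);
;         return sum;
;     }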
%define ABS1 ABS1_MMX2
%define HSUM HSUM_MMX2
HADAMARD8_DIFF_MMX mmx2

INIT_XMM
%define ABS2 ABS2_MMX2
%ifdef ARCH_X86_64
%define ABS_SUM_8x8 ABS_SUM_8x8_64
%else
%define ABS_SUM_8x8 ABS_SUM_8x8_32
%endif
HADAMARD8_DIFF_SSE2 sse2, 10

%define ABS2        ABS2_SSSE3
%define ABS_SUM_8x8 ABS_SUM_8x8_64
HADAMARD8_DIFF_SSE2 ssse3, 9

INIT_XMM
; int sse16_sse2(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
cglobal sse16_sse2, 5, 5, 8
    shr             r4d, 1
    pxor            m0, m0         ; mm0 = 0
    pxor            m7, m7         ; mm7 holds the sum

.next2lines: ; FIXME why are these unaligned movs? pix1[] is aligned
    movu            m1, [r1   ]    ; mm1 = pix1[0][0-15]
    movu            m2, [r2   ]    ; mm2 = pix2[0][0-15]
    movu            m3, [r1+r3]    ; mm3 = pix1[1][0-15]
    movu            m4, [r2+r3]    ; mm4 = pix2[1][0-15]

    ; todo: mm1-mm2, mm3-mm4
    ; algo: subtract mm1 from mm2 with saturation and vice versa
    ;       OR the results to get the absolute difference
    mova            m5, m1
    mova            m6, m3
    psubusb         m1, m2
    psubusb         m3, m4
    psubusb         m2, m5
    psubusb         m4, m6

    por             m2, m1
    por             m4, m3

    ; now convert to 16-bit vectors so we can square them
    mova            m1, m2
    mova            m3, m4

    punpckhbw       m2, m0
    punpckhbw       m4, m0
    punpcklbw       m1, m0         ; mm1 now spread over (mm1, mm2)
    punpcklbw       m3, m0         ; mm3 now spread over (mm3, mm4)

    pmaddwd         m2, m2
    pmaddwd         m4, m4
    pmaddwd         m1, m1
    pmaddwd         m3, m3

    lea             r1, [r1+r3*2]  ; pix1 += 2*line_size
    lea             r2, [r2+r3*2]  ; pix2 += 2*line_size

    paddd           m1, m2
    paddd           m3, m4
    paddd           m7, m1
    paddd           m7, m3

    dec             r4
    jnz .next2lines

    mova            m1, m7
    psrldq          m7, 8          ; shift hi qword to lo
    paddd           m7, m1
    mova            m1, m7
    psrldq          m7, 4          ; shift hi dword to lo
    paddd           m7, m1
    movd            eax, m7        ; return value
    RET
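
; For reference, a rough C equivalent of sse16_sse2 above (illustrative only;
; the function name is a placeholder, not a real symbol in this file):
;
;     int sse16(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
;     {
;         int sum = 0;
;         for (int y = 0; y < h; y++) {
;             for (int x = 0; x < 16; x++) {
;                 int d = pix1[x] - pix2[x];
;                 sum += d * d;
;             }
;             pix1 += line_size;
;             pix2 += line_size;
;         }
;         return sum;
;     }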