;******************************************************************************
;* MMX optimized discrete wavelet transform
;* Copyright (c) 2010 David Conrad
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA
pw_1:    times 8 dw 1
pw_2:    times 8 dw 2
pw_8:    times 8 dw 8
pw_16:   times 8 dw 16
pw_1991: times 4 dw 9,-1

section .text

; %1 -= (%2 + %3 + 2) >> 2; %4 is pw_2
%macro COMPOSE_53iL0 4
    paddw   %2, %3
    paddw   %2, %4
    psraw   %2, 2
    psubw   %1, %2
%endm

; m1 = %1 + ((-m0 + 9*m1 + 9*%2 - %3 + 8) >> 4)
; if %4 is supplied, %1 is loaded unaligned from there
; m2: clobbered  m3: pw_8  m4: pw_1991
%macro COMPOSE_DD97iH0 3-4
    paddw   m0, %3
    paddw   m1, %2
    psubw   m0, m3
    mova    m2, m1
    punpcklwd m1, m0
    punpckhwd m2, m0
    pmaddwd m1, m4
    pmaddwd m2, m4
%if %0 > 3
    movu    %1, %4
%endif
    psrad   m1, 4
    psrad   m2, 4
    packssdw m1, m2
    paddw   m1, %1
%endm
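; For reference, a rough per-element C sketch of the two lifting steps above
; (the ref_* names are illustrative only, not FFmpeg API; the assembly
; processes mmsize/2 coefficients per iteration instead of one):
;
;   static void ref_compose53iL0(int16_t *b0, int16_t *b1, int16_t *b2, int w)
;   {
;       for (int i = 0; i < w; i++)                     // COMPOSE_53iL0
;           b1[i] -= (b0[i] + b2[i] + 2) >> 2;
;   }
;
;   static void ref_compose_dd97iH0(int16_t *b0, int16_t *b1, int16_t *b2,
;                                   int16_t *b3, int16_t *b4, int w)
;   {
;       for (int i = 0; i < w; i++)                     // COMPOSE_DD97iH0
;           b2[i] += (-b0[i] + 9*b1[i] + 9*b3[i] - b4[i] + 8) >> 4;
;   }
;
; pw_1991 packs (9,-1) word pairs so that, after punpck{l,h}wd interleaves the
; two sums, one pmaddwd yields 9*(b1[i]+b3[i]) - (b0[i]+b4[i]-8) per dword.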
%macro COMPOSE_VERTICAL 1
; void vertical_compose53iL0(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2,
;                            int width)
cglobal vertical_compose53iL0_%1, 4,4,1, b0, b1, b2, width
    mova    m2, [pw_2]
%if ARCH_X86_64
    mov     widthd, widthd  ; zero-extend the 32-bit width argument
%endif
.loop:
    sub     widthq, mmsize/2
    mova    m1, [b0q+2*widthq]
    mova    m0, [b1q+2*widthq]
    COMPOSE_53iL0 m0, m1, [b2q+2*widthq], m2
    mova    [b1q+2*widthq], m0
    jg      .loop
    REP_RET

; void vertical_compose_dirac53iH0(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2,
;                                  int width)
cglobal vertical_compose_dirac53iH0_%1, 4,4,1, b0, b1, b2, width
    mova    m1, [pw_1]
%if ARCH_X86_64
    mov     widthd, widthd
%endif
.loop:
    sub     widthq, mmsize/2
    ; b1[i] += (b0[i] + b2[i] + 1) >> 1
    mova    m0, [b0q+2*widthq]
    paddw   m0, [b2q+2*widthq]
    paddw   m0, m1
    psraw   m0, 1
    paddw   m0, [b1q+2*widthq]
    mova    [b1q+2*widthq], m0
    jg      .loop
    REP_RET

; void vertical_compose_dd97iH0(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2,
;                               IDWTELEM *b3, IDWTELEM *b4, int width)
cglobal vertical_compose_dd97iH0_%1, 6,6,5, b0, b1, b2, b3, b4, width
    mova    m3, [pw_8]
    mova    m4, [pw_1991]
%if ARCH_X86_64
    mov     widthd, widthd
%endif
.loop:
    sub     widthq, mmsize/2
    mova    m0, [b0q+2*widthq]
    mova    m1, [b1q+2*widthq]
    COMPOSE_DD97iH0 [b2q+2*widthq], [b3q+2*widthq], [b4q+2*widthq]
    mova    [b2q+2*widthq], m1
    jg      .loop
    REP_RET

; void vertical_compose_dd137iL0(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2,
;                                IDWTELEM *b3, IDWTELEM *b4, int width)
cglobal vertical_compose_dd137iL0_%1, 6,6,6, b0, b1, b2, b3, b4, width
    mova    m3, [pw_16]
    mova    m4, [pw_1991]
%if ARCH_X86_64
    mov     widthd, widthd
%endif
.loop:
    sub     widthq, mmsize/2
    ; b2[i] -= (-b0[i] + 9*b1[i] + 9*b3[i] - b4[i] + 16) >> 5
    mova    m0, [b0q+2*widthq]
    mova    m1, [b1q+2*widthq]
    mova    m5, [b2q+2*widthq]
    paddw   m0, [b4q+2*widthq]
    paddw   m1, [b3q+2*widthq]
    psubw   m0, m3
    mova    m2, m1
    punpcklwd m1, m0
    punpckhwd m2, m0
    pmaddwd m1, m4
    pmaddwd m2, m4
    psrad   m1, 5
    psrad   m2, 5
    packssdw m1, m2
    psubw   m5, m1
    mova    [b2q+2*widthq], m5
    jg      .loop
    REP_RET

; void vertical_compose_haar(IDWTELEM *b0, IDWTELEM *b1, int width)
cglobal vertical_compose_haar_%1, 3,4,3, b0, b1, width
    mova    m3, [pw_1]
%if ARCH_X86_64
    mov     widthd, widthd
%endif
.loop:
    sub     widthq, mmsize/2
    ; b0[i] -= (b1[i] + 1) >> 1; b1[i] += b0[i]
    mova    m1, [b1q+2*widthq]
    mova    m0, [b0q+2*widthq]
    mova    m2, m1
    paddw   m1, m3
    psraw   m1, 1
    psubw   m0, m1
    mova    [b0q+2*widthq], m0
    paddw   m2, m0
    mova    [b1q+2*widthq], m2
    jg      .loop
    REP_RET
%endmacro

; extend the left and right edges of the tmp array by %1 and %2 samples
; respectively, using %3 as a scratch register
%macro EDGE_EXTENSION 3
    mov     %3, [tmpq]
%assign %%i 1
%rep %1
    mov     [tmpq-2*%%i], %3
    %assign %%i %%i+1
%endrep
    mov     %3, [tmpq+2*w2q-2]
%assign %%i 0
%rep %2
    mov     [tmpq+2*w2q+2*%%i], %3
    %assign %%i %%i+1
%endrep
%endmacro


%macro HAAR_HORIZONTAL 2
; void horizontal_compose_haari(IDWTELEM *b, IDWTELEM *tmp, int width)
cglobal horizontal_compose_haar%2i_%1, 3,6,4, b, tmp, w, x, w2, b_w2
    mov     w2d, wd
    xor     xq, xq
    shr     w2d, 1
    lea     b_w2q, [bq+wq]
    mova    m3, [pw_1]
.lowpass_loop:
    movu    m1, [b_w2q + 2*xq]
    mova    m0, [bq + 2*xq]
    paddw   m1, m3
    psraw   m1, 1
    psubw   m0, m1
    mova    [tmpq + 2*xq], m0
    add     xq, mmsize/2
    cmp     xq, w2q
    jl      .lowpass_loop

    xor     xq, xq
    and     w2q, ~(mmsize/2 - 1)
    cmp     w2q, mmsize/2
    jl      .end

.highpass_loop:
    movu    m1, [b_w2q + 2*xq]
    mova    m0, [tmpq + 2*xq]
    paddw   m1, m0

    ; shift and interleave
%if %2 == 1
    paddw   m0, m3
    paddw   m1, m3
    psraw   m0, 1
    psraw   m1, 1
%endif
    mova    m2, m0
    punpcklwd m0, m1
    punpckhwd m2, m1
    mova    [bq+4*xq], m0
    mova    [bq+4*xq+mmsize], m2

    add     xq, mmsize/2
    cmp     xq, w2q
    jl      .highpass_loop
.end:
    REP_RET
%endmacro


INIT_XMM
; void horizontal_compose_dd97i(IDWTELEM *b, IDWTELEM *tmp, int width)
cglobal horizontal_compose_dd97i_ssse3, 3,6,8, b, tmp, w, x, w2, b_w2
    mov     w2d, wd
    xor     xd, xd
    shr     w2d, 1
    lea     b_w2q, [bq+wq]
    movu    m4, [bq+wq]
    mova    m7, [pw_2]
    pslldq  m4, 14
.lowpass_loop:
    movu    m1, [b_w2q + 2*xq]
    mova    m0, [bq + 2*xq]
    mova    m2, m1
    palignr m1, m4, 14
    mova    m4, m2
    COMPOSE_53iL0 m0, m1, m2, m7
    mova    [tmpq + 2*xq], m0
    add     xd, mmsize/2
    cmp     xd, w2d
    jl      .lowpass_loop

    EDGE_EXTENSION 1, 2, xw
    ; leave the last few values (up to 7 for SSE, 3 for MMX) to the C code
    xor     xd, xd
    and     w2d, ~(mmsize/2 - 1)
    cmp     w2d, mmsize/2
    jl      .end

    mova    m7, [tmpq-mmsize]
    mova    m0, [tmpq]
    mova    m5, [pw_1]
    mova    m3, [pw_8]
    mova    m4, [pw_1991]
.highpass_loop:
    mova    m6, m0
    palignr m0, m7, 14
    mova    m7, [tmpq + 2*xq + 16]
    mova    m1, m7
    mova    m2, m7
    palignr m1, m6, 2
    palignr m2, m6, 4
    COMPOSE_DD97iH0 m0, m6, m2, [b_w2q + 2*xq]
    mova    m0, m7
    mova    m7, m6

    ; shift and interleave
    paddw   m6, m5
    paddw   m1, m5
    psraw   m6, 1
    psraw   m1, 1
    mova    m2, m6
    punpcklwd m6, m1
    punpckhwd m2, m1
    mova    [bq+4*xq], m6
    mova    [bq+4*xq+mmsize], m2

    add     xd, mmsize/2
    cmp     xd, w2d
    jl      .highpass_loop
.end:
    REP_RET

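; For reference, a rough per-element C sketch of the HAAR_HORIZONTAL macro
; above, instantiated below for mmx and sse2 (illustrative names only; the
; real code stages the low-pass result in tmp, interleaves only the aligned
; low/high pairs back into b, and leaves the unaligned tail to C):
;
;   static void ref_compose_haari(int16_t *b, int16_t *tmp, int w, int shift)
;   {
;       int w2 = w >> 1;
;       for (int i = 0; i < w2; i++)
;           tmp[i] = b[i] - ((b[w2 + i] + 1) >> 1);     // low-pass
;       for (int i = 0; i < w2; i++) {
;           int lo = tmp[i];
;           int hi = tmp[i] + b[w2 + i];                // high-pass
;           b[2*i    ] = shift ? (lo + 1) >> 1 : lo;    // shift==1 for the
;           b[2*i + 1] = shift ? (hi + 1) >> 1 : hi;    // haar1i variant
;       }
;   }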
%if ARCH_X86_64 == 0
INIT_MMX
COMPOSE_VERTICAL mmx
HAAR_HORIZONTAL mmx, 0
HAAR_HORIZONTAL mmx, 1
%endif

INIT_XMM
COMPOSE_VERTICAL sse2
HAAR_HORIZONTAL sse2, 0
HAAR_HORIZONTAL sse2, 1
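; A hedged sketch of how the C side might select these kernels (hypothetical
; context struct and pointer names; the real wiring lives in FFmpeg's DWT C
; code, not in this file). cglobal applies the ff_ prefix, so for example the
; sse2 Haar kernel is reachable from C as:
;
;   void ff_vertical_compose_haar_sse2(IDWTELEM *b0, IDWTELEM *b1, int width);
;
;   if (cpu_flags & AV_CPU_FLAG_SSE2)        // hypothetical init function
;       d->vertical_compose_haar = ff_vertical_compose_haar_sse2;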