;*****************************************************************************
;* MMX optimized DSP utils
;*****************************************************************************
;* Copyright (c) 2000, 2001 Fabrice Bellard
;* Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
;*
;* This file is part of Libav.
;*
;* Libav is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* Libav is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with Libav; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;*****************************************************************************

%include "x86inc.asm"
%include "x86util.asm"

SECTION .text

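; DIFF_PIXELS_1: %1 = result (signed words, pix1 - pix2), %2 = temporary,
; %3 = pix1 load address, %4 = pix2 load address.
; Both registers get the pix1 bytes unpacked into their high byte, so each
; word holds p1*256+p1 resp. p1*256+p2; the identical high bytes cancel in
; the psubw, yielding the signed byte difference without a zero register.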
%macro DIFF_PIXELS_1 4
    movh            %1, %3
    movh            %2, %4
    punpcklbw       %2, %1
    punpcklbw       %1, %1
    psubw           %1, %2
%endmacro

; %1=uint8_t *pix1, %2=uint8_t *pix2, %3=static offset, %4=stride, %5=stride*3
; %6=temporary storage location
; this macro requires mmsize bytes of (aligned) stack space at %6
; (not needed when m8 is available, i.e. when using xmm registers on x86-64)
%macro DIFF_PIXELS_8 6
    DIFF_PIXELS_1   m0, m7, [%1     +%3], [%2     +%3]
    DIFF_PIXELS_1   m1, m7, [%1+%4  +%3], [%2+%4  +%3]
    DIFF_PIXELS_1   m2, m7, [%1+%4*2+%3], [%2+%4*2+%3]
    add             %1, %5
    add             %2, %5
    DIFF_PIXELS_1   m3, m7, [%1     +%3], [%2     +%3]
    DIFF_PIXELS_1   m4, m7, [%1+%4  +%3], [%2+%4  +%3]
    DIFF_PIXELS_1   m5, m7, [%1+%4*2+%3], [%2+%4*2+%3]
    DIFF_PIXELS_1   m6, m7, [%1+%5  +%3], [%2+%5  +%3]
%ifdef m8
    DIFF_PIXELS_1   m7, m8, [%1+%4*4+%3], [%2+%4*4+%3]
%else
    mova          [%6], m0   ; no spare register: spill m0 so it can be the temporary
    DIFF_PIXELS_1   m7, m0, [%1+%4*4+%3], [%2+%4*4+%3]
    mova            m0, [%6] ; and restore it afterwards
%endif
    sub             %1, %5
    sub             %2, %5
%endmacro

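; HADAMARD8: three rounds of SUMSUB_BADC butterflies over m0..m7, i.e. an
; (unnormalized) 8-point Hadamard transform applied independently to every
; word lane, with the eight transform inputs taken vertically across the
; eight registers.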
%macro HADAMARD8 0
    SUMSUB_BADC       w, 0, 1, 2, 3
    SUMSUB_BADC       w, 4, 5, 6, 7
    SUMSUB_BADC       w, 0, 2, 1, 3
    SUMSUB_BADC       w, 4, 6, 5, 7
    SUMSUB_BADC       w, 0, 4, 1, 5
    SUMSUB_BADC       w, 2, 6, 3, 7
%endmacro

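; ABS1_SUM: %1 = |%1| (%2 is a temporary), then accumulate %1 into %3 with
; unsigned saturation; ABS2_SUM does the same for two registers at once,
; accumulating %1/%2 into %5/%6 (%3 and %4 are temporaries).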
%macro ABS1_SUM 3
    ABS1            %1, %2
    paddusw         %3, %1
%endmacro

%macro ABS2_SUM 6
    ABS2            %1, %2, %3, %4
    paddusw         %5, %1
    paddusw         %6, %2
%endmacro

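; ABS_SUM_8x8_64/_32: saturating sum of |m0..m7| into m0. The _64 variant has
; m8/m9 to spare as ABS2 temporaries and ignores %1; the _32 variant has no
; free register, so it spills m7 to [%1], reuses m7 as the ABS1 temporary and
; reloads the spilled value into m2 (already summed by then) at the end.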
%macro ABS_SUM_8x8_64 1 ; %1 is unused, kept for interface parity with the _32 variant
    ABS2            m0, m1, m8, m9
    ABS2_SUM        m2, m3, m8, m9, m0, m1
    ABS2_SUM        m4, m5, m8, m9, m0, m1
    ABS2_SUM        m6, m7, m8, m9, m0, m1
    paddusw         m0, m1
%endmacro

%macro ABS_SUM_8x8_32 1 ; %1 = scratch slot used to spill m7
    mova          [%1], m7
    ABS1            m0, m7
    ABS1            m1, m7
    ABS1_SUM        m2, m7, m0
    ABS1_SUM        m3, m7, m1
    ABS1_SUM        m4, m7, m0
    ABS1_SUM        m5, m7, m1
    ABS1_SUM        m6, m7, m0
    mova            m2, [%1]
    ABS1_SUM        m2, m7, m1
    paddusw         m0, m1
%endmacro

; FIXME: HSUM_* saturates at 64k, while an 8x8 hadamard or dct block can reach
; about 100k on extreme inputs. Such inputs are very unlikely to occur in natural
; video, and it is even more unlikely that no alternative mv/mode would have a
; lower cost.
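; HSUM_*: horizontally add the words of %1 with unsigned saturation and move
; the result to the GPR %3 (only the low 16 bits of %3 are meaningful); %2 is
; used as a temporary.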
%macro HSUM_MMX 3
    mova            %2, %1
    psrlq           %1, 32
    paddusw         %1, %2
    mova            %2, %1
    psrlq           %1, 16
    paddusw         %1, %2
    movd            %3, %1
%endmacro

%macro HSUM_MMX2 3
    pshufw          %2, %1, 0xE
    paddusw         %1, %2
    pshufw          %2, %1, 0x1
    paddusw         %1, %2
    movd            %3, %1
%endmacro

%macro HSUM_SSE2 3
    movhlps         %2, %1
    paddusw         %1, %2
    pshuflw         %2, %1, 0xE
    paddusw         %1, %2
    pshuflw         %2, %1, 0x1
    paddusw         %1, %2
    movd            %3, %1
%endmacro

%macro STORE4 5
    mova [%1+mmsize*0], %2
    mova [%1+mmsize*1], %3
    mova [%1+mmsize*2], %4
    mova [%1+mmsize*3], %5
%endmacro

%macro LOAD4 5
    mova            %2, [%1+mmsize*0]
    mova            %3, [%1+mmsize*1]
    mova            %4, [%1+mmsize*2]
    mova            %5, [%1+mmsize*3]
%endmacro

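; hadamard8_16_wrapper %1=cpu suffix, %2=number of xmm registers for cglobal,
; %3=stack space the 8x8 core needs, in units of mmsize.
; Emits the public hadamard8_diff_%1 (a single 8x8 block) and
; hadamard8_diff16_%1 (top two 8x8 blocks for h=8, all four for h=16) entry
; points, both thin wrappers around the corresponding hadamard8x8_diff_%1 core.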
%macro hadamard8_16_wrapper 3
cglobal hadamard8_diff_%1, 4, 4, %2
%ifndef m8
    %assign pad %3*mmsize-(4+stack_offset&(mmsize-1))
    SUB            rsp, pad
%endif
    call hadamard8x8_diff_%1
%ifndef m8
    ADD            rsp, pad
%endif
    RET

cglobal hadamard8_diff16_%1, 5, 6, %2
%ifndef m8
    %assign pad %3*mmsize-(4+stack_offset&(mmsize-1))
    SUB            rsp, pad
%endif

    call hadamard8x8_diff_%1
    mov            r5d, eax

    add             r1, 8
    add             r2, 8
    call hadamard8x8_diff_%1
    add            r5d, eax

    cmp            r4d, 16  ; if h != 16, only the top two 8x8 blocks are summed
    jne .done

    lea             r1, [r1+r3*8-8]  ; advance to the bottom-left 8x8 block
    lea             r2, [r2+r3*8-8]
    call hadamard8x8_diff_%1
    add            r5d, eax

    add             r1, 8
    add             r2, 8
    call hadamard8x8_diff_%1
    add            r5d, eax

.done:
    mov            eax, r5d
%ifndef m8
    ADD            rsp, pad
%endif
    RET
%endmacro

%macro HADAMARD8_DIFF_MMX 1
ALIGN 16
; int hadamard8_diff_##cpu(void *s, uint8_t *src1, uint8_t *src2,
;                          int stride, int h)
; r0 = void *s = unused, int h = unused (always 8)
; note that r1, r2 and r3 are not clobbered by this function, so the 16x16
; version can simply call it four times (2x2 blocks); that's also why the
; scratch space is addressed as rsp+gprsize everywhere: it skips our return
; address and refers to the stack frame of the calling function.
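; What the core computes, as a rough C-level sketch (hadamard8() stands for
; the 8-point butterfly network of the HADAMARD8 macro; this is not the exact
; reference code in dsputil.c):
;   int16_t d[8][8];   // d[y][x] = src1[y*stride+x] - src2[y*stride+x]
;   for (x = 0; x < 8; x++) hadamard8(column x of d);
;   for (y = 0; y < 8; y++) hadamard8(row    y of d);
;   return the sum of |d[y][x]| over the block, saturated/masked to 16 bits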
hadamard8x8_diff_%1:
    lea                          r0, [r3*3]

    ; first 4x8 pixels
    DIFF_PIXELS_8                r1, r2,  0, r3, r0, rsp+gprsize+0x60
    HADAMARD8
    mova         [rsp+gprsize+0x60], m7
    TRANSPOSE4x4W                 0,  1,  2,  3,  7
    STORE4              rsp+gprsize, m0, m1, m2, m3
    mova                         m7, [rsp+gprsize+0x60]
    TRANSPOSE4x4W                 4,  5,  6,  7,  0
    STORE4         rsp+gprsize+0x40, m4, m5, m6, m7

    ; second 4x8 pixels
    DIFF_PIXELS_8                r1, r2,  4, r3, r0, rsp+gprsize+0x60
    HADAMARD8
    mova         [rsp+gprsize+0x60], m7
    TRANSPOSE4x4W                 0,  1,  2,  3,  7
    STORE4         rsp+gprsize+0x20, m0, m1, m2, m3
    mova                         m7, [rsp+gprsize+0x60]
    TRANSPOSE4x4W                 4,  5,  6,  7,  0

    LOAD4          rsp+gprsize+0x40, m0, m1, m2, m3
    HADAMARD8
    ABS_SUM_8x8_32 rsp+gprsize+0x60
    mova         [rsp+gprsize+0x60], m0

    LOAD4          rsp+gprsize     , m0, m1, m2, m3
    LOAD4          rsp+gprsize+0x20, m4, m5, m6, m7
    HADAMARD8
    ABS_SUM_8x8_32 rsp+gprsize
    paddusw                      m0, [rsp+gprsize+0x60]

    HSUM                         m0, m1, eax
    and                         rax, 0xFFFF
    ret

hadamard8_16_wrapper %1, 0, 14
%endmacro

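; SSE2/SSSE3 version of the 8x8 core: with 16-byte registers each 8-pixel row
; fits in a single register, so the block is transformed in one pass and only
; a full 8x8 word transpose is needed in between; %1=cpu suffix, %2=number of
; xmm registers to declare in the wrapper.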
%macro HADAMARD8_DIFF_SSE2 2
hadamard8x8_diff_%1:
    lea                          r0, [r3*3]
    DIFF_PIXELS_8                r1, r2,  0, r3, r0, rsp+gprsize
    HADAMARD8
%ifdef ARCH_X86_64
    TRANSPOSE8x8W                 0,  1,  2,  3,  4,  5,  6,  7,  8
%else
    TRANSPOSE8x8W                 0,  1,  2,  3,  4,  5,  6,  7, [rsp+gprsize], [rsp+mmsize+gprsize]
%endif
    HADAMARD8
    ABS_SUM_8x8         rsp+gprsize
    HSUM_SSE2                    m0, m1, eax
    and                         eax, 0xFFFF
    ret

hadamard8_16_wrapper %1, %2, 3
%endmacro

INIT_MMX
%define ABS1 ABS1_MMX
%define HSUM HSUM_MMX
HADAMARD8_DIFF_MMX mmx

%define ABS1 ABS1_MMX2
%define HSUM HSUM_MMX2
HADAMARD8_DIFF_MMX mmx2

INIT_XMM
%define ABS2 ABS2_MMX2
%ifdef ARCH_X86_64
%define ABS_SUM_8x8 ABS_SUM_8x8_64
%else
%define ABS_SUM_8x8 ABS_SUM_8x8_32
%endif
HADAMARD8_DIFF_SSE2 sse2, 10

%define ABS2        ABS2_SSSE3
%define ABS_SUM_8x8 ABS_SUM_8x8_64
HADAMARD8_DIFF_SSE2 ssse3, 9

INIT_XMM
; sse16_sse2(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
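; Roughly equivalent C (a sketch of the scalar version; h is assumed to be
; even, since two lines are summed per loop iteration):
;   int sum = 0;
;   for (int y = 0; y < h; y++, pix1 += line_size, pix2 += line_size)
;       for (int x = 0; x < 16; x++)
;           sum += (pix1[x] - pix2[x]) * (pix1[x] - pix2[x]);
;   return sum;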
cglobal sse16_sse2, 5, 5, 8
    shr      r4d, 1          ; two lines are processed per iteration
    pxor      m0, m0         ; mm0 = 0
    pxor      m7, m7         ; mm7 holds the sum

.next2lines: ; FIXME why are these unaligned movs? pix1[] is aligned
    movu      m1, [r1   ]    ; mm1 = pix1[0][0-15]
    movu      m2, [r2   ]    ; mm2 = pix2[0][0-15]
    movu      m3, [r1+r3]    ; mm3 = pix1[1][0-15]
    movu      m4, [r2+r3]    ; mm4 = pix2[1][0-15]

    ; todo: mm1-mm2, mm3-mm4
    ; algo: subtract mm1 from mm2 with saturation and vice versa
    ;       OR the result to get the absolute difference
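    ;       e.g. a=3, b=10: (a -us b) = 0, (b -us a) = 7, OR of the two = 7 = |a-b|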
    mova      m5, m1
    mova      m6, m3
    psubusb   m1, m2
    psubusb   m3, m4
    psubusb   m2, m5
    psubusb   m4, m6

    por       m2, m1
    por       m4, m3

    ; now convert to 16-bit vectors so we can square them
    mova      m1, m2
    mova      m3, m4

    punpckhbw m2, m0
    punpckhbw m4, m0
    punpcklbw m1, m0         ; mm1 now spread over (mm1,mm2)
    punpcklbw m3, m0         ; mm4 now spread over (mm3,mm4)

    pmaddwd   m2, m2
    pmaddwd   m4, m4
    pmaddwd   m1, m1
    pmaddwd   m3, m3

    lea       r1, [r1+r3*2]  ; pix1 += 2*line_size
    lea       r2, [r2+r3*2]  ; pix2 += 2*line_size

    paddd     m1, m2
    paddd     m3, m4
    paddd     m7, m1
    paddd     m7, m3

    dec       r4
    jnz .next2lines

    mova      m1, m7
    psrldq    m7, 8          ; shift hi qword to lo
    paddd     m7, m1
    mova      m1, m7
    psrldq    m7, 4          ; shift hi dword to lo
    paddd     m7, m1
    movd     eax, m7         ; return value
    RET
