/*
 * MMX optimized motion estimation
 * Copyright (c) 2001 Fabrice Bellard
 * Copyright (c) 2002-2004 Michael Niedermayer
 *
 * mostly by Michael Niedermayer <michaelni@gmx.at>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include <assert.h>

#include "libavutil/x86_cpu.h"
#include "libavcodec/dsputil.h"

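/* Rounding constants for the half-pel averaging code below: round_tab[1]
 * is added before a >>1 average and round_tab[2] (addressed as
 * 16+round_tab in sad8_4_mmx()) before a >>2 average. */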
DECLARE_ASM_CONST(8, uint64_t, round_tab[3])={
0x0000000000000000ULL,
0x0001000100010001ULL,
0x0002000200020002ULL,
};

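/* All-ones byte constant; subtracted via psubusb in sad8_4_mmx2() to
 * compensate for the rounding-up bias that accumulates when the 4-point
 * half-pel average is approximated with chained pavgb instructions. */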
DECLARE_ASM_CONST(8, uint64_t, bone)= 0x0101010101010101LL;

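/* SAD of one 8-pixel-wide column of height h, plain MMX.
 * The caller must zero mm7 (unpack helper) and mm6 (accumulator);
 * per-row word sums are accumulated into mm6. */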
static inline void sad8_1_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h)
{
    x86_reg len= -(stride*h);
    __asm__ volatile(
        ASMALIGN(4)
        "1:                             \n\t"
        "movq (%1, %%"REG_a"), %%mm0    \n\t"
        "movq (%2, %%"REG_a"), %%mm2    \n\t"
        "movq (%2, %%"REG_a"), %%mm4    \n\t"
        "add %3, %%"REG_a"              \n\t"
        "psubusb %%mm0, %%mm2           \n\t"
        "psubusb %%mm4, %%mm0           \n\t"
        "movq (%1, %%"REG_a"), %%mm1    \n\t"
        "movq (%2, %%"REG_a"), %%mm3    \n\t"
        "movq (%2, %%"REG_a"), %%mm5    \n\t"
        "psubusb %%mm1, %%mm3           \n\t"
        "psubusb %%mm5, %%mm1           \n\t"
        "por %%mm2, %%mm0               \n\t"
        "por %%mm1, %%mm3               \n\t"
        "movq %%mm0, %%mm1              \n\t"
        "movq %%mm3, %%mm2              \n\t"
        "punpcklbw %%mm7, %%mm0         \n\t"
        "punpckhbw %%mm7, %%mm1         \n\t"
        "punpcklbw %%mm7, %%mm3         \n\t"
        "punpckhbw %%mm7, %%mm2         \n\t"
        "paddw %%mm1, %%mm0             \n\t"
        "paddw %%mm3, %%mm2             \n\t"
        "paddw %%mm2, %%mm0             \n\t"
        "paddw %%mm0, %%mm6             \n\t"
        "add %3, %%"REG_a"              \n\t"
        " js 1b                         \n\t"
        : "+a" (len)
        : "r" (blk1 - len), "r" (blk2 - len), "r" ((x86_reg)stride)
    );
}

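/* Same as sad8_1_mmx() but using the MMXEXT psadbw instruction;
 * the caller must zero the mm6 accumulator. */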
static inline void sad8_1_mmx2(uint8_t *blk1, uint8_t *blk2, int stride, int h)
{
    __asm__ volatile(
        ASMALIGN(4)
        "1:                             \n\t"
        "movq (%1), %%mm0               \n\t"
        "movq (%1, %3), %%mm1           \n\t"
        "psadbw (%2), %%mm0             \n\t"
        "psadbw (%2, %3), %%mm1         \n\t"
        "paddw %%mm0, %%mm6             \n\t"
        "paddw %%mm1, %%mm6             \n\t"
        "lea (%1,%3,2), %1              \n\t"
        "lea (%2,%3,2), %2              \n\t"
        "sub $2, %0                     \n\t"
        " jg 1b                         \n\t"
        : "+r" (h), "+r" (blk1), "+r" (blk2)
        : "r" ((x86_reg)stride)
    );
}

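/* SAD of a 16-pixel-wide block using SSE2 psadbw on unaligned loads.
 * The horizontal reduction is done in a second asm block and relies on
 * xmm6 surviving between the two statements. */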
static int sad16_sse2(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)
{
    int ret;
    __asm__ volatile(
        "pxor %%xmm6, %%xmm6            \n\t"
        ASMALIGN(4)
        "1:                             \n\t"
        "movdqu (%1), %%xmm0            \n\t"
        "movdqu (%1, %3), %%xmm1        \n\t"
        "psadbw (%2), %%xmm0            \n\t"
        "psadbw (%2, %3), %%xmm1        \n\t"
        "paddw %%xmm0, %%xmm6           \n\t"
        "paddw %%xmm1, %%xmm6           \n\t"
        "lea (%1,%3,2), %1              \n\t"
        "lea (%2,%3,2), %2              \n\t"
        "sub $2, %0                     \n\t"
        " jg 1b                         \n\t"
        : "+r" (h), "+r" (blk1), "+r" (blk2)
        : "r" ((x86_reg)stride)
    );
    __asm__ volatile(
        "movhlps %%xmm6, %%xmm0         \n\t"
        "paddw   %%xmm0, %%xmm6         \n\t"
        "movd    %%xmm6, %0             \n\t"
        : "=r"(ret)
    );
    return ret;
}

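/* SAD between blk2 and the horizontal (x) half-pel average of blk1,
 * computed with pavgb + psadbw; accumulates into mm6. */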
static inline void sad8_x2a_mmx2(uint8_t *blk1, uint8_t *blk2, int stride, int h)
{
    __asm__ volatile(
        ASMALIGN(4)
        "1:                             \n\t"
        "movq (%1), %%mm0               \n\t"
        "movq (%1, %3), %%mm1           \n\t"
        "pavgb 1(%1), %%mm0             \n\t"
        "pavgb 1(%1, %3), %%mm1         \n\t"
        "psadbw (%2), %%mm0             \n\t"
        "psadbw (%2, %3), %%mm1         \n\t"
        "paddw %%mm0, %%mm6             \n\t"
        "paddw %%mm1, %%mm6             \n\t"
        "lea (%1,%3,2), %1              \n\t"
        "lea (%2,%3,2), %2              \n\t"
        "sub $2, %0                     \n\t"
        " jg 1b                         \n\t"
        : "+r" (h), "+r" (blk1), "+r" (blk2)
        : "r" ((x86_reg)stride)
    );
}

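/* SAD between blk2 and the vertical (y) half-pel average of blk1;
 * mm0 carries the previous row across iterations. Accumulates into mm6. */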
static inline void sad8_y2a_mmx2(uint8_t *blk1, uint8_t *blk2, int stride, int h)
{
    __asm__ volatile(
        "movq (%1), %%mm0               \n\t"
        "add %3, %1                     \n\t"
        ASMALIGN(4)
        "1:                             \n\t"
        "movq (%1), %%mm1               \n\t"
        "movq (%1, %3), %%mm2           \n\t"
        "pavgb %%mm1, %%mm0             \n\t"
        "pavgb %%mm2, %%mm1             \n\t"
        "psadbw (%2), %%mm0             \n\t"
        "psadbw (%2, %3), %%mm1         \n\t"
        "paddw %%mm0, %%mm6             \n\t"
        "paddw %%mm1, %%mm6             \n\t"
        "movq %%mm2, %%mm0              \n\t"
        "lea (%1,%3,2), %1              \n\t"
        "lea (%2,%3,2), %2              \n\t"
        "sub $2, %0                     \n\t"
        " jg 1b                         \n\t"
        : "+r" (h), "+r" (blk1), "+r" (blk2)
        : "r" ((x86_reg)stride)
    );
}

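/* SAD between blk2 and the 4-point (xy) half-pel average of blk1,
 * approximated with chained pavgb; the psubusb of bone corrects the
 * rounding bias. Accumulates into mm6. */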
static inline void sad8_4_mmx2(uint8_t *blk1, uint8_t *blk2, int stride, int h)
{
    __asm__ volatile(
        "movq "MANGLE(bone)", %%mm5     \n\t"
        "movq (%1), %%mm0               \n\t"
        "pavgb 1(%1), %%mm0             \n\t"
        "add %3, %1                     \n\t"
        ASMALIGN(4)
        "1:                             \n\t"
        "movq (%1), %%mm1               \n\t"
        "movq (%1,%3), %%mm2            \n\t"
        "pavgb 1(%1), %%mm1             \n\t"
        "pavgb 1(%1,%3), %%mm2          \n\t"
        "psubusb %%mm5, %%mm1           \n\t"
        "pavgb %%mm1, %%mm0             \n\t"
        "pavgb %%mm2, %%mm1             \n\t"
        "psadbw (%2), %%mm0             \n\t"
        "psadbw (%2,%3), %%mm1          \n\t"
        "paddw %%mm0, %%mm6             \n\t"
        "paddw %%mm1, %%mm6             \n\t"
        "movq %%mm2, %%mm0              \n\t"
        "lea (%1,%3,2), %1              \n\t"
        "lea (%2,%3,2), %2              \n\t"
        "sub $2, %0                     \n\t"
        " jg 1b                         \n\t"
        : "+r" (h), "+r" (blk1), "+r" (blk2)
        : "r" ((x86_reg)stride)
    );
}

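/* SAD between blk2 and the average of blk1a and blk1b (shared by the
 * x and y half-pel cases), plain MMX. The caller must zero mm7 and mm6
 * and load round_tab[1] into mm5. */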
static inline void sad8_2_mmx(uint8_t *blk1a, uint8_t *blk1b, uint8_t *blk2, int stride, int h)
{
    x86_reg len= -(stride*h);
    __asm__ volatile(
        ASMALIGN(4)
        "1:                             \n\t"
        "movq (%1, %%"REG_a"), %%mm0    \n\t"
        "movq (%2, %%"REG_a"), %%mm1    \n\t"
        "movq (%1, %%"REG_a"), %%mm2    \n\t"
        "movq (%2, %%"REG_a"), %%mm3    \n\t"
        "punpcklbw %%mm7, %%mm0         \n\t"
        "punpcklbw %%mm7, %%mm1         \n\t"
        "punpckhbw %%mm7, %%mm2         \n\t"
        "punpckhbw %%mm7, %%mm3         \n\t"
        "paddw %%mm0, %%mm1             \n\t"
        "paddw %%mm2, %%mm3             \n\t"
        "movq (%3, %%"REG_a"), %%mm4    \n\t"
        "movq (%3, %%"REG_a"), %%mm2    \n\t"
        "paddw %%mm5, %%mm1             \n\t"
        "paddw %%mm5, %%mm3             \n\t"
        "psrlw $1, %%mm1                \n\t"
        "psrlw $1, %%mm3                \n\t"
        "packuswb %%mm3, %%mm1          \n\t"
        "psubusb %%mm1, %%mm4           \n\t"
        "psubusb %%mm2, %%mm1           \n\t"
        "por %%mm4, %%mm1               \n\t"
        "movq %%mm1, %%mm0              \n\t"
        "punpcklbw %%mm7, %%mm0         \n\t"
        "punpckhbw %%mm7, %%mm1         \n\t"
        "paddw %%mm1, %%mm0             \n\t"
        "paddw %%mm0, %%mm6             \n\t"
        "add %4, %%"REG_a"              \n\t"
        " js 1b                         \n\t"
        : "+a" (len)
        : "r" (blk1a - len), "r" (blk1b - len), "r" (blk2 - len), "r" ((x86_reg)stride)
    );
}

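/* SAD between blk2 and the 4-point (xy) half-pel average of blk1, plain
 * MMX; rounding uses round_tab[2]. The caller passes blk1 and blk1+stride
 * and must zero mm7 and mm6. */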
static inline void sad8_4_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h)
{
    x86_reg len= -(stride*h);
    __asm__ volatile(
        "movq (%1, %%"REG_a"), %%mm0    \n\t"
        "movq 1(%1, %%"REG_a"), %%mm2   \n\t"
        "movq %%mm0, %%mm1              \n\t"
        "movq %%mm2, %%mm3              \n\t"
        "punpcklbw %%mm7, %%mm0         \n\t"
        "punpckhbw %%mm7, %%mm1         \n\t"
        "punpcklbw %%mm7, %%mm2         \n\t"
        "punpckhbw %%mm7, %%mm3         \n\t"
        "paddw %%mm2, %%mm0             \n\t"
        "paddw %%mm3, %%mm1             \n\t"
        ASMALIGN(4)
        "1:                             \n\t"
        "movq (%2, %%"REG_a"), %%mm2    \n\t"
        "movq 1(%2, %%"REG_a"), %%mm4   \n\t"
        "movq %%mm2, %%mm3              \n\t"
        "movq %%mm4, %%mm5              \n\t"
        "punpcklbw %%mm7, %%mm2         \n\t"
        "punpckhbw %%mm7, %%mm3         \n\t"
        "punpcklbw %%mm7, %%mm4         \n\t"
        "punpckhbw %%mm7, %%mm5         \n\t"
        "paddw %%mm4, %%mm2             \n\t"
        "paddw %%mm5, %%mm3             \n\t"
        "movq 16+"MANGLE(round_tab)", %%mm5 \n\t"
        "paddw %%mm2, %%mm0             \n\t"
        "paddw %%mm3, %%mm1             \n\t"
        "paddw %%mm5, %%mm0             \n\t"
        "paddw %%mm5, %%mm1             \n\t"
        "movq (%3, %%"REG_a"), %%mm4    \n\t"
        "movq (%3, %%"REG_a"), %%mm5    \n\t"
        "psrlw $2, %%mm0                \n\t"
        "psrlw $2, %%mm1                \n\t"
        "packuswb %%mm1, %%mm0          \n\t"
        "psubusb %%mm0, %%mm4           \n\t"
        "psubusb %%mm5, %%mm0           \n\t"
        "por %%mm4, %%mm0               \n\t"
        "movq %%mm0, %%mm4              \n\t"
        "punpcklbw %%mm7, %%mm0         \n\t"
        "punpckhbw %%mm7, %%mm4         \n\t"
        "paddw %%mm0, %%mm6             \n\t"
        "paddw %%mm4, %%mm6             \n\t"
        "movq  %%mm2, %%mm0             \n\t"
        "movq  %%mm3, %%mm1             \n\t"
        "add %4, %%"REG_a"              \n\t"
        " js 1b                         \n\t"
        : "+a" (len)
        : "r" (blk1 - len), "r" (blk1 - len + stride), "r" (blk2 - len), "r" ((x86_reg)stride)
    );
}

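/* Horizontal reduction of the word sums accumulated in mm6 by the plain
 * MMX routines; only the low 16 bits of the result are meaningful. */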
static inline int sum_mmx(void)
{
    int ret;
    __asm__ volatile(
        "movq %%mm6, %%mm0              \n\t"
        "psrlq $32, %%mm6               \n\t"
        "paddw %%mm0, %%mm6             \n\t"
        "movq %%mm6, %%mm0              \n\t"
        "psrlq $16, %%mm6               \n\t"
        "paddw %%mm0, %%mm6             \n\t"
        "movd %%mm6, %0                 \n\t"
        : "=r" (ret)
    );
    return ret&0xFFFF;
}

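/* With psadbw the accumulator in mm6 already holds the final sum,
 * so it is simply read back. */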
static inline int sum_mmx2(void)
{
    int ret;
    __asm__ volatile(
        "movd %%mm6, %0                 \n\t"
        : "=r" (ret)
    );
    return ret;
}

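/* The plain MMX x/y half-pel variants are thin wrappers around sad8_2_mmx(). */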
static inline void sad8_x2a_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h)
{
    sad8_2_mmx(blk1, blk1+1, blk2, stride, h);
}
static inline void sad8_y2a_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h)
{
    sad8_2_mmx(blk1, blk1+stride, blk2, stride, h);
}


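/* Instantiate the pix_abs/sad entry points for a given instruction-set
 * suffix. The prologue asm clears the mm6 accumulator and mm7, and loads
 * round_tab[1] into mm5 where the half-pel loops expect it. */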
#define PIX_SAD(suf)\
static int sad8_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\
{\
    assert(h==8);\
    __asm__ volatile("pxor %%mm7, %%mm7     \n\t"\
                 "pxor %%mm6, %%mm6     \n\t":);\
\
    sad8_1_ ## suf(blk1, blk2, stride, 8);\
\
    return sum_ ## suf();\
}\
static int sad8_x2_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\
{\
    assert(h==8);\
    __asm__ volatile("pxor %%mm7, %%mm7     \n\t"\
                 "pxor %%mm6, %%mm6     \n\t"\
                 "movq %0, %%mm5        \n\t"\
                 :: "m"(round_tab[1]) \
                 );\
\
    sad8_x2a_ ## suf(blk1, blk2, stride, 8);\
\
    return sum_ ## suf();\
}\
\
static int sad8_y2_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\
{\
    assert(h==8);\
    __asm__ volatile("pxor %%mm7, %%mm7     \n\t"\
                 "pxor %%mm6, %%mm6     \n\t"\
                 "movq %0, %%mm5        \n\t"\
                 :: "m"(round_tab[1]) \
                 );\
\
    sad8_y2a_ ## suf(blk1, blk2, stride, 8);\
\
    return sum_ ## suf();\
}\
\
static int sad8_xy2_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\
{\
    assert(h==8);\
    __asm__ volatile("pxor %%mm7, %%mm7     \n\t"\
                 "pxor %%mm6, %%mm6     \n\t"\
                 ::);\
\
    sad8_4_ ## suf(blk1, blk2, stride, 8);\
\
    return sum_ ## suf();\
}\
\
static int sad16_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\
{\
    __asm__ volatile("pxor %%mm7, %%mm7     \n\t"\
                 "pxor %%mm6, %%mm6     \n\t":);\
\
    sad8_1_ ## suf(blk1  , blk2  , stride, h);\
    sad8_1_ ## suf(blk1+8, blk2+8, stride, h);\
\
    return sum_ ## suf();\
}\
static int sad16_x2_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\
{\
    __asm__ volatile("pxor %%mm7, %%mm7     \n\t"\
                 "pxor %%mm6, %%mm6     \n\t"\
                 "movq %0, %%mm5        \n\t"\
                 :: "m"(round_tab[1]) \
                 );\
\
    sad8_x2a_ ## suf(blk1  , blk2  , stride, h);\
    sad8_x2a_ ## suf(blk1+8, blk2+8, stride, h);\
\
    return sum_ ## suf();\
}\
static int sad16_y2_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\
{\
    __asm__ volatile("pxor %%mm7, %%mm7     \n\t"\
                 "pxor %%mm6, %%mm6     \n\t"\
                 "movq %0, %%mm5        \n\t"\
                 :: "m"(round_tab[1]) \
                 );\
\
    sad8_y2a_ ## suf(blk1  , blk2  , stride, h);\
    sad8_y2a_ ## suf(blk1+8, blk2+8, stride, h);\
\
    return sum_ ## suf();\
}\
static int sad16_xy2_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\
{\
    __asm__ volatile("pxor %%mm7, %%mm7     \n\t"\
                 "pxor %%mm6, %%mm6     \n\t"\
                 ::);\
\
    sad8_4_ ## suf(blk1  , blk2  , stride, h);\
    sad8_4_ ## suf(blk1+8, blk2+8, stride, h);\
\
    return sum_ ## suf();\
}\

PIX_SAD(mmx)
PIX_SAD(mmx2)

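/* Install the MMX/MMXEXT/SSE2 SAD functions according to the detected CPU
 * flags. The pavgb-based MMXEXT half-pel versions are only installed when
 * CODEC_FLAG_BITEXACT is not requested, and the SSE2 version is skipped on
 * CPUs that also report 3DNow!. */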
void dsputil_init_pix_mmx(DSPContext* c, AVCodecContext *avctx)
{
    if (mm_flags & FF_MM_MMX) {
        c->pix_abs[0][0] = sad16_mmx;
        c->pix_abs[0][1] = sad16_x2_mmx;
        c->pix_abs[0][2] = sad16_y2_mmx;
        c->pix_abs[0][3] = sad16_xy2_mmx;
        c->pix_abs[1][0] = sad8_mmx;
        c->pix_abs[1][1] = sad8_x2_mmx;
        c->pix_abs[1][2] = sad8_y2_mmx;
        c->pix_abs[1][3] = sad8_xy2_mmx;

        c->sad[0]= sad16_mmx;
        c->sad[1]= sad8_mmx;
    }
    if (mm_flags & FF_MM_MMXEXT) {
        c->pix_abs[0][0] = sad16_mmx2;
        c->pix_abs[1][0] = sad8_mmx2;

        c->sad[0]= sad16_mmx2;
        c->sad[1]= sad8_mmx2;

        if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
            c->pix_abs[0][1] = sad16_x2_mmx2;
            c->pix_abs[0][2] = sad16_y2_mmx2;
            c->pix_abs[0][3] = sad16_xy2_mmx2;
            c->pix_abs[1][1] = sad8_x2_mmx2;
            c->pix_abs[1][2] = sad8_y2_mmx2;
            c->pix_abs[1][3] = sad8_xy2_mmx2;
        }
    }
    if ((mm_flags & FF_MM_SSE2) && !(mm_flags & FF_MM_3DNOW)) {
        c->sad[0]= sad16_sse2;
    }
}