/*
 * MMX optimized motion estimation
 * Copyright (c) 2001 Fabrice Bellard
 * Copyright (c) 2002-2004 Michael Niedermayer
 *
 * mostly by Michael Niedermayer <michaelni@gmx.at>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/attributes.h"
#include "libavutil/avassert.h"
#include "libavutil/mem.h"
#include "libavutil/x86/asm.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/mpegvideo.h"
#include "dsputil_x86.h"

#if HAVE_INLINE_ASM

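/* Rounding constants for the plain-MMX half-pel averaging paths:
 * round_tab[1] is added before the >> 1 of the two-tap average,
 * round_tab[2] before the >> 2 of the four-tap (xy2) average. */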
DECLARE_ASM_CONST(8, uint64_t, round_tab)[3] = {
    0x0000000000000000ULL,
    0x0001000100010001ULL,
    0x0002000200020002ULL,
};

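/* 0x01 in every byte; subtracted in sad8_4_mmxext() to reduce the upward
 * rounding bias of the chained pavgb averages (see below). */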
DECLARE_ASM_CONST(8, uint64_t, bone) = 0x0101010101010101LL;

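/* SAD of an 8-pixel-wide, h-row block using only MMX: |a - b| is built from
 * two saturating byte subtractions ORed together, then widened to words and
 * accumulated in %mm6.  Callers must zero %mm6 and %mm7 beforehand (the
 * PIX_SAD() wrappers below do this). */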
static inline void sad8_1_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h)
{
    x86_reg len = -(x86_reg)stride * h;
    __asm__ volatile (
        ".p2align 4                     \n\t"
        "1:                             \n\t"
        "movq (%1, %%"REG_a"), %%mm0    \n\t"
        "movq (%2, %%"REG_a"), %%mm2    \n\t"
        "movq (%2, %%"REG_a"), %%mm4    \n\t"
        "add %3, %%"REG_a"              \n\t"
        "psubusb %%mm0, %%mm2           \n\t"
        "psubusb %%mm4, %%mm0           \n\t"
        "movq (%1, %%"REG_a"), %%mm1    \n\t"
        "movq (%2, %%"REG_a"), %%mm3    \n\t"
        "movq (%2, %%"REG_a"), %%mm5    \n\t"
        "psubusb %%mm1, %%mm3           \n\t"
        "psubusb %%mm5, %%mm1           \n\t"
        "por %%mm2, %%mm0               \n\t"
        "por %%mm1, %%mm3               \n\t"
        "movq %%mm0, %%mm1              \n\t"
        "movq %%mm3, %%mm2              \n\t"
        "punpcklbw %%mm7, %%mm0         \n\t"
        "punpckhbw %%mm7, %%mm1         \n\t"
        "punpcklbw %%mm7, %%mm3         \n\t"
        "punpckhbw %%mm7, %%mm2         \n\t"
        "paddw %%mm1, %%mm0             \n\t"
        "paddw %%mm3, %%mm2             \n\t"
        "paddw %%mm2, %%mm0             \n\t"
        "paddw %%mm0, %%mm6             \n\t"
        "add %3, %%"REG_a"              \n\t"
        " js 1b                         \n\t"
        : "+a" (len)
        : "r" (blk1 - len), "r" (blk2 - len), "r" ((x86_reg) stride));
}

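/* Same as sad8_1_mmx(), but letting psadbw do the work; two rows per
 * iteration, partial sums accumulated in %mm6. */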
static inline void sad8_1_mmxext(uint8_t *blk1, uint8_t *blk2,
                                 int stride, int h)
{
    __asm__ volatile (
        ".p2align 4                     \n\t"
        "1:                             \n\t"
        "movq (%1), %%mm0               \n\t"
        "movq (%1, %3), %%mm1           \n\t"
        "psadbw (%2), %%mm0             \n\t"
        "psadbw (%2, %3), %%mm1         \n\t"
        "paddw %%mm0, %%mm6             \n\t"
        "paddw %%mm1, %%mm6             \n\t"
        "lea (%1,%3,2), %1              \n\t"
        "lea (%2,%3,2), %2              \n\t"
        "sub $2, %0                     \n\t"
        " jg 1b                         \n\t"
        : "+r" (h), "+r" (blk1), "+r" (blk2)
        : "r" ((x86_reg) stride));
}

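/* 16-pixel-wide SAD using SSE2.  blk1 is loaded unaligned (movdqu), while
 * blk2 is used directly as a psadbw memory operand and must therefore be
 * 16-byte aligned.  Returns the sum directly instead of going through %mm6. */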
static int sad16_sse2(MpegEncContext *v, uint8_t *blk2, uint8_t *blk1,
                      int stride, int h)
{
    int ret;
    __asm__ volatile (
        "pxor %%xmm2, %%xmm2            \n\t"
        ".p2align 4                     \n\t"
        "1:                             \n\t"
        "movdqu (%1), %%xmm0            \n\t"
        "movdqu (%1, %4), %%xmm1        \n\t"
        "psadbw (%2), %%xmm0            \n\t"
        "psadbw (%2, %4), %%xmm1        \n\t"
        "paddw %%xmm0, %%xmm2           \n\t"
        "paddw %%xmm1, %%xmm2           \n\t"
        "lea (%1,%4,2), %1              \n\t"
        "lea (%2,%4,2), %2              \n\t"
        "sub $2, %0                     \n\t"
        " jg 1b                         \n\t"
        "movhlps %%xmm2, %%xmm0         \n\t"
        "paddw   %%xmm0, %%xmm2         \n\t"
        "movd    %%xmm2, %3             \n\t"
        : "+r" (h), "+r" (blk1), "+r" (blk2), "=r" (ret)
        : "r" ((x86_reg) stride));
    return ret;
}

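/* SAD against the reference averaged with its right neighbour
 * (horizontal half-pel, pavgb rounding). */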
static inline void sad8_x2a_mmxext(uint8_t *blk1, uint8_t *blk2,
                                   int stride, int h)
{
    __asm__ volatile (
        ".p2align 4                     \n\t"
        "1:                             \n\t"
        "movq (%1), %%mm0               \n\t"
        "movq (%1, %3), %%mm1           \n\t"
        "pavgb 1(%1), %%mm0             \n\t"
        "pavgb 1(%1, %3), %%mm1         \n\t"
        "psadbw (%2), %%mm0             \n\t"
        "psadbw (%2, %3), %%mm1         \n\t"
        "paddw %%mm0, %%mm6             \n\t"
        "paddw %%mm1, %%mm6             \n\t"
        "lea (%1,%3,2), %1              \n\t"
        "lea (%2,%3,2), %2              \n\t"
        "sub $2, %0                     \n\t"
        " jg 1b                         \n\t"
        : "+r" (h), "+r" (blk1), "+r" (blk2)
        : "r" ((x86_reg) stride));
}

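/* SAD against the reference averaged with the next row
 * (vertical half-pel, pavgb rounding). */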
static inline void sad8_y2a_mmxext(uint8_t *blk1, uint8_t *blk2,
                                   int stride, int h)
{
    __asm__ volatile (
        "movq (%1), %%mm0               \n\t"
        "add %3, %1                     \n\t"
        ".p2align 4                     \n\t"
        "1:                             \n\t"
        "movq (%1), %%mm1               \n\t"
        "movq (%1, %3), %%mm2           \n\t"
        "pavgb %%mm1, %%mm0             \n\t"
        "pavgb %%mm2, %%mm1             \n\t"
        "psadbw (%2), %%mm0             \n\t"
        "psadbw (%2, %3), %%mm1         \n\t"
        "paddw %%mm0, %%mm6             \n\t"
        "paddw %%mm1, %%mm6             \n\t"
        "movq %%mm2, %%mm0              \n\t"
        "lea (%1,%3,2), %1              \n\t"
        "lea (%2,%3,2), %2              \n\t"
        "sub $2, %0                     \n\t"
        " jg 1b                         \n\t"
        : "+r" (h), "+r" (blk1), "+r" (blk2)
        : "r" ((x86_reg) stride));
}

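/* SAD against the reference averaged both horizontally and vertically
 * (xy2 half-pel).  The psubusb of "bone" reduces the upward bias of the
 * chained pavgb averages, but the result is still only approximate, which
 * is why ff_dsputil_init_pix_mmx() skips the mmxext xy2 versions when
 * CODEC_FLAG_BITEXACT is set. */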
static inline void sad8_4_mmxext(uint8_t *blk1, uint8_t *blk2,
                                 int stride, int h)
{
    __asm__ volatile (
        "movq "MANGLE(bone)", %%mm5     \n\t"
        "movq (%1), %%mm0               \n\t"
        "pavgb 1(%1), %%mm0             \n\t"
        "add %3, %1                     \n\t"
        ".p2align 4                     \n\t"
        "1:                             \n\t"
        "movq (%1), %%mm1               \n\t"
        "movq (%1,%3), %%mm2            \n\t"
        "pavgb 1(%1), %%mm1             \n\t"
        "pavgb 1(%1,%3), %%mm2          \n\t"
        "psubusb %%mm5, %%mm1           \n\t"
        "pavgb %%mm1, %%mm0             \n\t"
        "pavgb %%mm2, %%mm1             \n\t"
        "psadbw (%2), %%mm0             \n\t"
        "psadbw (%2,%3), %%mm1          \n\t"
        "paddw %%mm0, %%mm6             \n\t"
        "paddw %%mm1, %%mm6             \n\t"
        "movq %%mm2, %%mm0              \n\t"
        "lea (%1,%3,2), %1              \n\t"
        "lea (%2,%3,2), %2              \n\t"
        "sub $2, %0                     \n\t"
        " jg 1b                         \n\t"
        : "+r" (h), "+r" (blk1), "+r" (blk2)
        : "r" ((x86_reg) stride)
          NAMED_CONSTRAINTS_ADD(bone));
}

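/* MMX-only helper: SAD between blk2 and the rounded average of blk1a and
 * blk1b.  %mm5 must hold round_tab[1] and %mm7 must be zero; used for the
 * x2 and y2 half-pel cases. */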
static inline void sad8_2_mmx(uint8_t *blk1a, uint8_t *blk1b, uint8_t *blk2,
                              int stride, int h)
{
    x86_reg len = -(x86_reg)stride * h;
    __asm__ volatile (
        ".p2align 4                     \n\t"
        "1:                             \n\t"
        "movq (%1, %%"REG_a"), %%mm0    \n\t"
        "movq (%2, %%"REG_a"), %%mm1    \n\t"
        "movq (%1, %%"REG_a"), %%mm2    \n\t"
        "movq (%2, %%"REG_a"), %%mm3    \n\t"
        "punpcklbw %%mm7, %%mm0         \n\t"
        "punpcklbw %%mm7, %%mm1         \n\t"
        "punpckhbw %%mm7, %%mm2         \n\t"
        "punpckhbw %%mm7, %%mm3         \n\t"
        "paddw %%mm0, %%mm1             \n\t"
        "paddw %%mm2, %%mm3             \n\t"
        "movq (%3, %%"REG_a"), %%mm4    \n\t"
        "movq (%3, %%"REG_a"), %%mm2    \n\t"
        "paddw %%mm5, %%mm1             \n\t"
        "paddw %%mm5, %%mm3             \n\t"
        "psrlw $1, %%mm1                \n\t"
        "psrlw $1, %%mm3                \n\t"
        "packuswb %%mm3, %%mm1          \n\t"
        "psubusb %%mm1, %%mm4           \n\t"
        "psubusb %%mm2, %%mm1           \n\t"
        "por %%mm4, %%mm1               \n\t"
        "movq %%mm1, %%mm0              \n\t"
        "punpcklbw %%mm7, %%mm0         \n\t"
        "punpckhbw %%mm7, %%mm1         \n\t"
        "paddw %%mm1, %%mm0             \n\t"
        "paddw %%mm0, %%mm6             \n\t"
        "add %4, %%"REG_a"              \n\t"
        " js 1b                         \n\t"
        : "+a" (len)
        : "r" (blk1a - len), "r" (blk1b - len), "r" (blk2 - len),
          "r" ((x86_reg) stride));
}

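/* MMX-only xy2 case: exact four-tap average, (a + b + c + d + 2) >> 2, using
 * round_tab[2], so it is kept even for bit-exact encoding (see
 * ff_dsputil_init_pix_mmx()). */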
static inline void sad8_4_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h)
{
    x86_reg len = -(x86_reg)stride * h;
    __asm__ volatile (
        "movq  (%1, %%"REG_a"), %%mm0   \n\t"
        "movq 1(%1, %%"REG_a"), %%mm2   \n\t"
        "movq %%mm0, %%mm1              \n\t"
        "movq %%mm2, %%mm3              \n\t"
        "punpcklbw %%mm7, %%mm0         \n\t"
        "punpckhbw %%mm7, %%mm1         \n\t"
        "punpcklbw %%mm7, %%mm2         \n\t"
        "punpckhbw %%mm7, %%mm3         \n\t"
        "paddw %%mm2, %%mm0             \n\t"
        "paddw %%mm3, %%mm1             \n\t"
        ".p2align 4                     \n\t"
        "1:                             \n\t"
        "movq  (%2, %%"REG_a"), %%mm2   \n\t"
        "movq 1(%2, %%"REG_a"), %%mm4   \n\t"
        "movq %%mm2, %%mm3              \n\t"
        "movq %%mm4, %%mm5              \n\t"
        "punpcklbw %%mm7, %%mm2         \n\t"
        "punpckhbw %%mm7, %%mm3         \n\t"
        "punpcklbw %%mm7, %%mm4         \n\t"
        "punpckhbw %%mm7, %%mm5         \n\t"
        "paddw %%mm4, %%mm2             \n\t"
        "paddw %%mm5, %%mm3             \n\t"
        "movq %5, %%mm5                 \n\t"
        "paddw %%mm2, %%mm0             \n\t"
        "paddw %%mm3, %%mm1             \n\t"
        "paddw %%mm5, %%mm0             \n\t"
        "paddw %%mm5, %%mm1             \n\t"
        "movq (%3, %%"REG_a"), %%mm4    \n\t"
        "movq (%3, %%"REG_a"), %%mm5    \n\t"
        "psrlw $2, %%mm0                \n\t"
        "psrlw $2, %%mm1                \n\t"
        "packuswb %%mm1, %%mm0          \n\t"
        "psubusb %%mm0, %%mm4           \n\t"
        "psubusb %%mm5, %%mm0           \n\t"
        "por %%mm4, %%mm0               \n\t"
        "movq %%mm0, %%mm4              \n\t"
        "punpcklbw %%mm7, %%mm0         \n\t"
        "punpckhbw %%mm7, %%mm4         \n\t"
        "paddw %%mm0, %%mm6             \n\t"
        "paddw %%mm4, %%mm6             \n\t"
        "movq  %%mm2, %%mm0             \n\t"
        "movq  %%mm3, %%mm1             \n\t"
        "add %4, %%"REG_a"              \n\t"
        " js 1b                         \n\t"
        : "+a" (len)
        : "r" (blk1 - len), "r" (blk1 - len + stride), "r" (blk2 - len),
          "r" ((x86_reg) stride), "m" (round_tab[2]));
}

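/* Fold the four 16-bit partial sums in %mm6 down to one scalar; the
 * word-wise adds leave garbage above the low word, hence the & 0xFFFF. */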
static inline int sum_mmx(void)
{
    int ret;
    __asm__ volatile (
        "movq %%mm6, %%mm0              \n\t"
        "psrlq $32, %%mm6               \n\t"
        "paddw %%mm0, %%mm6             \n\t"
        "movq %%mm6, %%mm0              \n\t"
        "psrlq $16, %%mm6               \n\t"
        "paddw %%mm0, %%mm6             \n\t"
        "movd %%mm6, %0                 \n\t"
        : "=r" (ret));
    return ret & 0xFFFF;
}

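/* With psadbw the running total already sits in the low word of %mm6. */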
static inline int sum_mmxext(void)
{
    int ret;
    __asm__ volatile (
        "movd %%mm6, %0                 \n\t"
        : "=r" (ret));
    return ret;
}

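/* The MMX-only x2/y2 cases are just sad8_2_mmx() with the second reference
 * shifted by one pixel or one row. */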
static inline void sad8_x2a_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h)
{
    sad8_2_mmx(blk1, blk1 + 1, blk2, stride, h);
}

static inline void sad8_y2a_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h)
{
    sad8_2_mmx(blk1, blk1 + stride, blk2, stride, h);
}

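/* Generate the DSPContext entry points for one instruction-set suffix:
 * 8x8 and 16x16 SAD with no, x2, y2 and xy2 half-pel offsets.  Each wrapper
 * clears %mm6/%mm7 (and loads round_tab[1] into %mm5 where the MMX averaging
 * helpers need it), runs the 8-pixel-wide kernels (twice for the 16-wide
 * versions) and returns the accumulated sum. */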
#define PIX_SAD(suf)                                                    \
static int sad8_ ## suf(MpegEncContext *v, uint8_t *blk2,               \
                        uint8_t *blk1, int stride, int h)               \
{                                                                       \
    av_assert2(h == 8);                                                 \
    __asm__ volatile (                                                  \
        "pxor %%mm7, %%mm7     \n\t"                                    \
        "pxor %%mm6, %%mm6     \n\t"                                    \
        :);                                                             \
                                                                        \
    sad8_1_ ## suf(blk1, blk2, stride, 8);                              \
                                                                        \
    return sum_ ## suf();                                               \
}                                                                       \
                                                                        \
static int sad8_x2_ ## suf(MpegEncContext *v, uint8_t *blk2,            \
                           uint8_t *blk1, int stride, int h)            \
{                                                                       \
    av_assert2(h == 8);                                                 \
    __asm__ volatile (                                                  \
        "pxor %%mm7, %%mm7     \n\t"                                    \
        "pxor %%mm6, %%mm6     \n\t"                                    \
        "movq %0, %%mm5        \n\t"                                    \
        :: "m" (round_tab[1]));                                         \
                                                                        \
    sad8_x2a_ ## suf(blk1, blk2, stride, 8);                            \
                                                                        \
    return sum_ ## suf();                                               \
}                                                                       \
                                                                        \
static int sad8_y2_ ## suf(MpegEncContext *v, uint8_t *blk2,            \
                           uint8_t *blk1, int stride, int h)            \
{                                                                       \
    av_assert2(h == 8);                                                 \
    __asm__ volatile (                                                  \
        "pxor %%mm7, %%mm7     \n\t"                                    \
        "pxor %%mm6, %%mm6     \n\t"                                    \
        "movq %0, %%mm5        \n\t"                                    \
        :: "m" (round_tab[1]));                                         \
                                                                        \
    sad8_y2a_ ## suf(blk1, blk2, stride, 8);                            \
                                                                        \
    return sum_ ## suf();                                               \
}                                                                       \
                                                                        \
static int sad8_xy2_ ## suf(MpegEncContext *v, uint8_t *blk2,           \
                            uint8_t *blk1, int stride, int h)           \
{                                                                       \
    av_assert2(h == 8);                                                 \
    __asm__ volatile (                                                  \
        "pxor %%mm7, %%mm7     \n\t"                                    \
        "pxor %%mm6, %%mm6     \n\t"                                    \
        ::);                                                            \
                                                                        \
    sad8_4_ ## suf(blk1, blk2, stride, 8);                              \
                                                                        \
    return sum_ ## suf();                                               \
}                                                                       \
                                                                        \
static int sad16_ ## suf(MpegEncContext *v, uint8_t *blk2,              \
                         uint8_t *blk1, int stride, int h)              \
{                                                                       \
    __asm__ volatile (                                                  \
        "pxor %%mm7, %%mm7     \n\t"                                    \
        "pxor %%mm6, %%mm6     \n\t"                                    \
        :);                                                             \
                                                                        \
    sad8_1_ ## suf(blk1,     blk2,     stride, h);                      \
    sad8_1_ ## suf(blk1 + 8, blk2 + 8, stride, h);                      \
                                                                        \
    return sum_ ## suf();                                               \
}                                                                       \
                                                                        \
static int sad16_x2_ ## suf(MpegEncContext *v, uint8_t *blk2,           \
                            uint8_t *blk1, int stride, int h)           \
{                                                                       \
    __asm__ volatile (                                                  \
        "pxor %%mm7, %%mm7     \n\t"                                    \
        "pxor %%mm6, %%mm6     \n\t"                                    \
        "movq %0, %%mm5        \n\t"                                    \
        :: "m" (round_tab[1]));                                         \
                                                                        \
    sad8_x2a_ ## suf(blk1,     blk2,     stride, h);                    \
    sad8_x2a_ ## suf(blk1 + 8, blk2 + 8, stride, h);                    \
                                                                        \
    return sum_ ## suf();                                               \
}                                                                       \
                                                                        \
static int sad16_y2_ ## suf(MpegEncContext *v, uint8_t *blk2,           \
                            uint8_t *blk1, int stride, int h)           \
{                                                                       \
    __asm__ volatile (                                                  \
        "pxor %%mm7, %%mm7     \n\t"                                    \
        "pxor %%mm6, %%mm6     \n\t"                                    \
        "movq %0, %%mm5        \n\t"                                    \
        :: "m" (round_tab[1]));                                         \
                                                                        \
    sad8_y2a_ ## suf(blk1,     blk2,     stride, h);                    \
    sad8_y2a_ ## suf(blk1 + 8, blk2 + 8, stride, h);                    \
                                                                        \
    return sum_ ## suf();                                               \
}                                                                       \
                                                                        \
static int sad16_xy2_ ## suf(MpegEncContext *v, uint8_t *blk2,          \
                             uint8_t *blk1, int stride, int h)          \
{                                                                       \
    __asm__ volatile (                                                  \
        "pxor %%mm7, %%mm7     \n\t"                                    \
        "pxor %%mm6, %%mm6     \n\t"                                    \
        ::);                                                            \
                                                                        \
    sad8_4_ ## suf(blk1,     blk2,     stride, h);                      \
    sad8_4_ ## suf(blk1 + 8, blk2 + 8, stride, h);                      \
                                                                        \
    return sum_ ## suf();                                               \
}                                                                       \

PIX_SAD(mmx)
PIX_SAD(mmxext)

#endif /* HAVE_INLINE_ASM */

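/* Install the SAD / pix_abs implementations matching the detected CPU
 * capabilities. */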
av_cold void ff_dsputil_init_pix_mmx(DSPContext *c, AVCodecContext *avctx)
{
#if HAVE_INLINE_ASM
    int cpu_flags = av_get_cpu_flags();

    if (INLINE_MMX(cpu_flags)) {
        c->pix_abs[0][0] = sad16_mmx;
        c->pix_abs[0][1] = sad16_x2_mmx;
        c->pix_abs[0][2] = sad16_y2_mmx;
        c->pix_abs[0][3] = sad16_xy2_mmx;
        c->pix_abs[1][0] = sad8_mmx;
        c->pix_abs[1][1] = sad8_x2_mmx;
        c->pix_abs[1][2] = sad8_y2_mmx;
        c->pix_abs[1][3] = sad8_xy2_mmx;

        c->sad[0] = sad16_mmx;
        c->sad[1] = sad8_mmx;
    }
    if (INLINE_MMXEXT(cpu_flags)) {
        c->pix_abs[0][0] = sad16_mmxext;
        c->pix_abs[1][0] = sad8_mmxext;

        c->sad[0] = sad16_mmxext;
        c->sad[1] = sad8_mmxext;

        c->pix_abs[0][1] = sad16_x2_mmxext;
        c->pix_abs[0][2] = sad16_y2_mmxext;
        c->pix_abs[1][1] = sad8_x2_mmxext;
        c->pix_abs[1][2] = sad8_y2_mmxext;
        if (!(avctx->flags & CODEC_FLAG_BITEXACT)) {
            c->pix_abs[0][3] = sad16_xy2_mmxext;
            c->pix_abs[1][3] = sad8_xy2_mmxext;
        }
    }
    if (INLINE_SSE2(cpu_flags) && !(cpu_flags & AV_CPU_FLAG_SSE2SLOW) &&
        avctx->codec_id != AV_CODEC_ID_SNOW) {
        c->sad[0] = sad16_sse2;
    }
#endif /* HAVE_INLINE_ASM */
}
