/*
 * MMX optimized DSP utils
 * Copyright (c) 2000, 2001 Fabrice Bellard
 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
 *
 * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/asm.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/dsputil.h"
#include "libavcodec/mpegvideo.h"
#include "dsputil_x86.h"

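/* Prototypes for comparison functions implemented in external assembly;
 * they are only hooked up below when the corresponding EXTERNAL_*()
 * cpuflag checks succeed. */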
int ff_sum_abs_dctelem_mmx(int16_t *block);
int ff_sum_abs_dctelem_mmxext(int16_t *block);
int ff_sum_abs_dctelem_sse2(int16_t *block);
int ff_sum_abs_dctelem_ssse3(int16_t *block);
int ff_sse8_mmx(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                int line_size, int h);
int ff_sse16_mmx(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                 int line_size, int h);
int ff_sse16_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                  int line_size, int h);
int ff_hf_noise8_mmx(uint8_t *pix1, int lsize, int h);
int ff_hf_noise16_mmx(uint8_t *pix1, int lsize, int h);

#define hadamard_func(cpu)                                              \
    int ff_hadamard8_diff_ ## cpu(MpegEncContext *s, uint8_t *src1,     \
                                  uint8_t *src2, int stride, int h);    \
    int ff_hadamard8_diff16_ ## cpu(MpegEncContext *s, uint8_t *src1,   \
                                    uint8_t *src2, int stride, int h);

hadamard_func(mmx)
hadamard_func(mmxext)
hadamard_func(sse2)
hadamard_func(ssse3)

#if HAVE_YASM
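/* Noise-preserving SSE (NSSE): the plain SSE score is adjusted by the
 * difference in high-frequency noise between the two blocks, weighted by
 * avctx->nsse_weight (8 when no context is available). */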
static int nsse16_mmx(MpegEncContext *c, uint8_t *pix1, uint8_t *pix2,
                      int line_size, int h)
{
    int score1, score2;

    if (c)
        score1 = c->dsp.sse[0](c, pix1, pix2, line_size, h);
    else
        score1 = ff_sse16_mmx(c, pix1, pix2, line_size, h);
    score2 = ff_hf_noise16_mmx(pix1, line_size, h) +
             ff_hf_noise8_mmx(pix1 + 8, line_size, h) -
             ff_hf_noise16_mmx(pix2, line_size, h) -
             ff_hf_noise8_mmx(pix2 + 8, line_size, h);

    if (c)
        return score1 + FFABS(score2) * c->avctx->nsse_weight;
    else
        return score1 + FFABS(score2) * 8;
}

static int nsse8_mmx(MpegEncContext *c, uint8_t *pix1, uint8_t *pix2,
                     int line_size, int h)
{
    int score1 = ff_sse8_mmx(c, pix1, pix2, line_size, h);
    int score2 = ff_hf_noise8_mmx(pix1, line_size, h) -
                 ff_hf_noise8_mmx(pix2, line_size, h);

    if (c)
        return score1 + FFABS(score2) * c->avctx->nsse_weight;
    else
        return score1 + FFABS(score2) * 8;
}

#endif /* HAVE_YASM */

#if HAVE_INLINE_ASM

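/* Vertical SAD of an intra 16x16 block: sums |pix[x] - pix[x + line_size]|
 * over h lines, i.e. the absolute difference between vertically adjacent
 * pixels of the same block. Installed as the intra vsad[4] comparator. */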
static int vsad_intra16_mmx(MpegEncContext *v, uint8_t *pix, uint8_t *dummy,
                            int line_size, int h)
{
    int tmp;

    av_assert2((((uintptr_t) pix) & 7) == 0);
    av_assert2((line_size & 7) == 0);

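/* SUM(in0, in1, out0, out1): loads the next 16-pixel row into out0/out1,
 * computes the bytewise absolute difference against the previous row held
 * in in0/in1 (two saturating subtractions combined with por), widens the
 * bytes to words with the zero register %%mm7 and accumulates into %%mm6. */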
#define SUM(in0, in1, out0, out1)               \
    "movq (%0), %%mm2\n"                        \
    "movq 8(%0), %%mm3\n"                       \
    "add %2, %0\n"                              \
    "movq %%mm2, " #out0 "\n"                   \
    "movq %%mm3, " #out1 "\n"                   \
    "psubusb " #in0 ", %%mm2\n"                 \
    "psubusb " #in1 ", %%mm3\n"                 \
    "psubusb " #out0 ", " #in0 "\n"             \
    "psubusb " #out1 ", " #in1 "\n"             \
    "por %%mm2, " #in0 "\n"                     \
    "por %%mm3, " #in1 "\n"                     \
    "movq " #in0 ", %%mm2\n"                    \
    "movq " #in1 ", %%mm3\n"                    \
    "punpcklbw %%mm7, " #in0 "\n"               \
    "punpcklbw %%mm7, " #in1 "\n"               \
    "punpckhbw %%mm7, %%mm2\n"                  \
    "punpckhbw %%mm7, %%mm3\n"                  \
    "paddw " #in1 ", " #in0 "\n"                \
    "paddw %%mm3, %%mm2\n"                      \
    "paddw %%mm2, " #in0 "\n"                   \
    "paddw " #in0 ", %%mm6\n"

    __asm__ volatile (
        "movl %3, %%ecx\n"
        "pxor %%mm6, %%mm6\n"
        "pxor %%mm7, %%mm7\n"
        "movq  (%0), %%mm0\n"
        "movq 8(%0), %%mm1\n"
        "add %2, %0\n"
        "jmp 2f\n"
        "1:\n"

        SUM(%%mm4, %%mm5, %%mm0, %%mm1)
        "2:\n"
        SUM(%%mm0, %%mm1, %%mm4, %%mm5)

        "subl $2, %%ecx\n"
        "jnz 1b\n"

        "movq  %%mm6, %%mm0\n"
        "psrlq $32,   %%mm6\n"
        "paddw %%mm6, %%mm0\n"
        "movq  %%mm0, %%mm6\n"
        "psrlq $16,   %%mm0\n"
        "paddw %%mm6, %%mm0\n"
        "movd  %%mm0, %1\n"
        : "+r" (pix), "=r" (tmp)
        : "r" ((x86_reg) line_size), "m" (h)
        : "%ecx");

    return tmp & 0xFFFF;
}
#undef SUM

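/* Same as vsad_intra16_mmx(), but the per-row absolute difference and
 * horizontal sum are done with the MMXEXT psadbw instruction, so no final
 * word-by-word reduction is needed. */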
static int vsad_intra16_mmxext(MpegEncContext *v, uint8_t *pix, uint8_t *dummy,
                               int line_size, int h)
{
    int tmp;

    av_assert2((((uintptr_t) pix) & 7) == 0);
    av_assert2((line_size & 7) == 0);

#define SUM(in0, in1, out0, out1)               \
    "movq (%0), " #out0 "\n"                    \
    "movq 8(%0), " #out1 "\n"                   \
    "add %2, %0\n"                              \
    "psadbw " #out0 ", " #in0 "\n"              \
    "psadbw " #out1 ", " #in1 "\n"              \
    "paddw " #in1 ", " #in0 "\n"                \
    "paddw " #in0 ", %%mm6\n"

    __asm__ volatile (
        "movl %3, %%ecx\n"
        "pxor %%mm6, %%mm6\n"
        "pxor %%mm7, %%mm7\n"
        "movq (%0), %%mm0\n"
        "movq 8(%0), %%mm1\n"
        "add %2, %0\n"
        "jmp 2f\n"
        "1:\n"

        SUM(%%mm4, %%mm5, %%mm0, %%mm1)
        "2:\n"
        SUM(%%mm0, %%mm1, %%mm4, %%mm5)

        "subl $2, %%ecx\n"
        "jnz 1b\n"

        "movd %%mm6, %1\n"
        : "+r" (pix), "=r" (tmp)
        : "r" ((x86_reg) line_size), "m" (h)
        : "%ecx");

    return tmp;
}
#undef SUM

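/* Vertical SAD between two blocks (the inter vsad[0] metric): for each pair
 * of vertically adjacent lines it sums
 * |(pix1 - pix2)[x] - (pix1 - pix2)[x + line_size]|.
 * The signed byte differences are biased by 0x80 (psubb, then pxor with the
 * all-0x80 constant built in %%mm7) so the unsigned saturating-subtract/por
 * trick can take their absolute difference. */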
static int vsad16_mmx(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                      int line_size, int h)
{
    int tmp;

    av_assert2((((uintptr_t) pix1) & 7) == 0);
    av_assert2((((uintptr_t) pix2) & 7) == 0);
    av_assert2((line_size & 7) == 0);

#define SUM(in0, in1, out0, out1)       \
    "movq (%0), %%mm2\n"                \
    "movq (%1), " #out0 "\n"            \
    "movq 8(%0), %%mm3\n"               \
    "movq 8(%1), " #out1 "\n"           \
    "add %3, %0\n"                      \
    "add %3, %1\n"                      \
    "psubb " #out0 ", %%mm2\n"          \
    "psubb " #out1 ", %%mm3\n"          \
    "pxor %%mm7, %%mm2\n"               \
    "pxor %%mm7, %%mm3\n"               \
    "movq %%mm2, " #out0 "\n"           \
    "movq %%mm3, " #out1 "\n"           \
    "psubusb " #in0 ", %%mm2\n"         \
    "psubusb " #in1 ", %%mm3\n"         \
    "psubusb " #out0 ", " #in0 "\n"     \
    "psubusb " #out1 ", " #in1 "\n"     \
    "por %%mm2, " #in0 "\n"             \
    "por %%mm3, " #in1 "\n"             \
    "movq " #in0 ", %%mm2\n"            \
    "movq " #in1 ", %%mm3\n"            \
    "punpcklbw %%mm7, " #in0 "\n"       \
    "punpcklbw %%mm7, " #in1 "\n"       \
    "punpckhbw %%mm7, %%mm2\n"          \
    "punpckhbw %%mm7, %%mm3\n"          \
    "paddw " #in1 ", " #in0 "\n"        \
    "paddw %%mm3, %%mm2\n"              \
    "paddw %%mm2, " #in0 "\n"           \
    "paddw " #in0 ", %%mm6\n"

    __asm__ volatile (
        "movl %4, %%ecx\n"
        "pxor %%mm6, %%mm6\n"
        "pcmpeqw %%mm7, %%mm7\n"
        "psllw $15, %%mm7\n"
        "packsswb %%mm7, %%mm7\n"
        "movq (%0), %%mm0\n"
        "movq (%1), %%mm2\n"
        "movq 8(%0), %%mm1\n"
        "movq 8(%1), %%mm3\n"
        "add %3, %0\n"
        "add %3, %1\n"
        "psubb %%mm2, %%mm0\n"
        "psubb %%mm3, %%mm1\n"
        "pxor %%mm7, %%mm0\n"
        "pxor %%mm7, %%mm1\n"
        "jmp 2f\n"
        "1:\n"

        SUM(%%mm4, %%mm5, %%mm0, %%mm1)
        "2:\n"
        SUM(%%mm0, %%mm1, %%mm4, %%mm5)

        "subl $2, %%ecx\n"
        "jnz 1b\n"

        "movq %%mm6, %%mm0\n"
        "psrlq $32, %%mm6\n"
        "paddw %%mm6, %%mm0\n"
        "movq %%mm0, %%mm6\n"
        "psrlq $16, %%mm0\n"
        "paddw %%mm6, %%mm0\n"
        "movd %%mm0, %2\n"
        : "+r" (pix1), "+r" (pix2), "=r" (tmp)
        : "r" ((x86_reg) line_size), "m" (h)
        : "%ecx");

    return tmp & 0x7FFF;
}
#undef SUM

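/* MMXEXT variant of vsad16_mmx(): the biased per-byte differences are fed
 * directly to psadbw, which handles the absolute value and horizontal sum. */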
static int vsad16_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                         int line_size, int h)
{
    int tmp;

    av_assert2((((uintptr_t) pix1) & 7) == 0);
    av_assert2((((uintptr_t) pix2) & 7) == 0);
    av_assert2((line_size & 7) == 0);

#define SUM(in0, in1, out0, out1)               \
    "movq (%0), " #out0 "\n"                    \
    "movq (%1), %%mm2\n"                        \
    "movq 8(%0), " #out1 "\n"                   \
    "movq 8(%1), %%mm3\n"                       \
    "add %3, %0\n"                              \
    "add %3, %1\n"                              \
    "psubb %%mm2, " #out0 "\n"                  \
    "psubb %%mm3, " #out1 "\n"                  \
    "pxor %%mm7, " #out0 "\n"                   \
    "pxor %%mm7, " #out1 "\n"                   \
    "psadbw " #out0 ", " #in0 "\n"              \
    "psadbw " #out1 ", " #in1 "\n"              \
    "paddw " #in1 ", " #in0 "\n"                \
    "paddw " #in0 ", %%mm6\n"

    __asm__ volatile (
        "movl %4, %%ecx\n"
        "pxor %%mm6, %%mm6\n"
        "pcmpeqw %%mm7, %%mm7\n"
        "psllw $15, %%mm7\n"
        "packsswb %%mm7, %%mm7\n"
        "movq (%0), %%mm0\n"
        "movq (%1), %%mm2\n"
        "movq 8(%0), %%mm1\n"
        "movq 8(%1), %%mm3\n"
        "add %3, %0\n"
        "add %3, %1\n"
        "psubb %%mm2, %%mm0\n"
        "psubb %%mm3, %%mm1\n"
        "pxor %%mm7, %%mm0\n"
        "pxor %%mm7, %%mm1\n"
        "jmp 2f\n"
        "1:\n"

        SUM(%%mm4, %%mm5, %%mm0, %%mm1)
        "2:\n"
        SUM(%%mm0, %%mm1, %%mm4, %%mm5)

        "subl $2, %%ecx\n"
        "jnz 1b\n"

        "movd %%mm6, %2\n"
        : "+r" (pix1), "+r" (pix2), "=r" (tmp)
        : "r" ((x86_reg) line_size), "m" (h)
        : "%ecx");

    return tmp;
}
#undef SUM

#endif /* HAVE_INLINE_ASM */

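/* Wire up the optimized comparison functions. Blocks for more capable CPU
 * flags intentionally overwrite the pointers set by earlier blocks, so the
 * fastest available implementation ends up in the DSPContext. */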
av_cold void ff_dsputil_init_x86(DSPContext *c, AVCodecContext *avctx)
{
    int cpu_flags = av_get_cpu_flags();

#if HAVE_INLINE_ASM
    if (INLINE_MMX(cpu_flags)) {
        c->vsad[4] = vsad_intra16_mmx;

        if (!(avctx->flags & CODEC_FLAG_BITEXACT)) {
            c->vsad[0] = vsad16_mmx;
        }
    }

    if (INLINE_MMXEXT(cpu_flags)) {
        c->vsad[4] = vsad_intra16_mmxext;

        if (!(avctx->flags & CODEC_FLAG_BITEXACT)) {
            c->vsad[0] = vsad16_mmxext;
        }
    }
#endif /* HAVE_INLINE_ASM */

    if (EXTERNAL_MMX(cpu_flags)) {
        c->hadamard8_diff[0] = ff_hadamard8_diff16_mmx;
        c->hadamard8_diff[1] = ff_hadamard8_diff_mmx;
        c->sum_abs_dctelem   = ff_sum_abs_dctelem_mmx;
        c->sse[0]            = ff_sse16_mmx;
        c->sse[1]            = ff_sse8_mmx;
#if HAVE_YASM
        c->nsse[0]           = nsse16_mmx;
        c->nsse[1]           = nsse8_mmx;
#endif
    }

    if (EXTERNAL_MMXEXT(cpu_flags)) {
        c->hadamard8_diff[0] = ff_hadamard8_diff16_mmxext;
        c->hadamard8_diff[1] = ff_hadamard8_diff_mmxext;
        c->sum_abs_dctelem   = ff_sum_abs_dctelem_mmxext;
    }

    if (EXTERNAL_SSE2(cpu_flags)) {
        c->sse[0]            = ff_sse16_sse2;
        c->sum_abs_dctelem   = ff_sum_abs_dctelem_sse2;

#if HAVE_ALIGNED_STACK
        c->hadamard8_diff[0] = ff_hadamard8_diff16_sse2;
        c->hadamard8_diff[1] = ff_hadamard8_diff_sse2;
#endif
    }

    if (EXTERNAL_SSSE3(cpu_flags)) {
        c->sum_abs_dctelem   = ff_sum_abs_dctelem_ssse3;
#if HAVE_ALIGNED_STACK
        c->hadamard8_diff[0] = ff_hadamard8_diff16_ssse3;
        c->hadamard8_diff[1] = ff_hadamard8_diff_ssse3;
#endif
    }

    ff_dsputil_init_pix_mmx(c, avctx);
}