/*
 * MMX optimized DSP utils
 * Copyright (c) 2000, 2001 Fabrice Bellard
 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
 *
 * This file is part of Libav.
 *
 * Libav is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * Libav is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with Libav; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 *
 * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
 */

#include "libavutil/cpu.h"
#include "libavutil/x86_cpu.h"
#include "libavcodec/dsputil.h"
#include "libavcodec/mpegvideo.h"
#include "libavcodec/mathops.h"
#include "dsputil_mmx.h"


static void get_pixels_mmx(DCTELEM *block, const uint8_t *pixels, int line_size)
{
    __asm__ volatile(
        "mov $-128, %%"REG_a"           \n\t"
        "pxor %%mm7, %%mm7              \n\t"
        ".p2align 4                     \n\t"
        "1:                             \n\t"
        "movq (%0), %%mm0               \n\t"
        "movq (%0, %2), %%mm2           \n\t"
        "movq %%mm0, %%mm1              \n\t"
        "movq %%mm2, %%mm3              \n\t"
        "punpcklbw %%mm7, %%mm0         \n\t"
        "punpckhbw %%mm7, %%mm1         \n\t"
        "punpcklbw %%mm7, %%mm2         \n\t"
        "punpckhbw %%mm7, %%mm3         \n\t"
        "movq %%mm0, (%1, %%"REG_a")    \n\t"
        "movq %%mm1, 8(%1, %%"REG_a")   \n\t"
        "movq %%mm2, 16(%1, %%"REG_a")  \n\t"
        "movq %%mm3, 24(%1, %%"REG_a")  \n\t"
        "add %3, %0                     \n\t"
        "add $32, %%"REG_a"             \n\t"
        "js 1b                          \n\t"
        : "+r" (pixels)
        : "r" (block+64), "r" ((x86_reg)line_size), "r" ((x86_reg)line_size*2)
        : "%"REG_a
    );
}
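
/* For reference, a plain C sketch of what get_pixels_mmx computes: it widens
 * an 8x8 block of unsigned bytes into 16-bit DCT coefficients. Illustrative
 * only, not compiled:
 *
 *     for (int y = 0; y < 8; y++) {
 *         for (int x = 0; x < 8; x++)
 *             block[y * 8 + x] = pixels[x];
 *         pixels += line_size;
 *     }
 */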

static void get_pixels_sse2(DCTELEM *block, const uint8_t *pixels, int line_size)
{
    __asm__ volatile(
        "pxor %%xmm4,      %%xmm4         \n\t"
        "movq (%0),        %%xmm0         \n\t"
        "movq (%0, %2),    %%xmm1         \n\t"
        "movq (%0, %2,2),  %%xmm2         \n\t"
        "movq (%0, %3),    %%xmm3         \n\t"
        "lea (%0,%2,4), %0                \n\t"
        "punpcklbw %%xmm4, %%xmm0         \n\t"
        "punpcklbw %%xmm4, %%xmm1         \n\t"
        "punpcklbw %%xmm4, %%xmm2         \n\t"
        "punpcklbw %%xmm4, %%xmm3         \n\t"
        "movdqa %%xmm0,      (%1)         \n\t"
        "movdqa %%xmm1,    16(%1)         \n\t"
        "movdqa %%xmm2,    32(%1)         \n\t"
        "movdqa %%xmm3,    48(%1)         \n\t"
        "movq (%0),        %%xmm0         \n\t"
        "movq (%0, %2),    %%xmm1         \n\t"
        "movq (%0, %2,2),  %%xmm2         \n\t"
        "movq (%0, %3),    %%xmm3         \n\t"
        "punpcklbw %%xmm4, %%xmm0         \n\t"
        "punpcklbw %%xmm4, %%xmm1         \n\t"
        "punpcklbw %%xmm4, %%xmm2         \n\t"
        "punpcklbw %%xmm4, %%xmm3         \n\t"
        "movdqa %%xmm0,    64(%1)         \n\t"
        "movdqa %%xmm1,    80(%1)         \n\t"
        "movdqa %%xmm2,    96(%1)         \n\t"
        "movdqa %%xmm3,   112(%1)         \n\t"
        : "+r" (pixels)
        : "r" (block), "r" ((x86_reg)line_size), "r" ((x86_reg)line_size*3)
    );
}
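
/* The SSE2 version performs the same widening as get_pixels_mmx, but a single
 * 128-bit store covers a whole row of the output block. Rows are addressed
 * four at a time as (%0), (%0,%2), (%0,%2,2) and (%0,%3), with %3 holding
 * 3*line_size, so the whole 8x8 block needs no loop at all. */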

static inline void diff_pixels_mmx(DCTELEM *block, const uint8_t *s1, const uint8_t *s2, int stride)
{
    __asm__ volatile(
        "pxor %%mm7, %%mm7              \n\t"
        "mov $-128, %%"REG_a"           \n\t"
        ".p2align 4                     \n\t"
        "1:                             \n\t"
        "movq (%0), %%mm0               \n\t"
        "movq (%1), %%mm2               \n\t"
        "movq %%mm0, %%mm1              \n\t"
        "movq %%mm2, %%mm3              \n\t"
        "punpcklbw %%mm7, %%mm0         \n\t"
        "punpckhbw %%mm7, %%mm1         \n\t"
        "punpcklbw %%mm7, %%mm2         \n\t"
        "punpckhbw %%mm7, %%mm3         \n\t"
        "psubw %%mm2, %%mm0             \n\t"
        "psubw %%mm3, %%mm1             \n\t"
        "movq %%mm0, (%2, %%"REG_a")    \n\t"
        "movq %%mm1, 8(%2, %%"REG_a")   \n\t"
        "add %3, %0                     \n\t"
        "add %3, %1                     \n\t"
        "add $16, %%"REG_a"             \n\t"
        "jnz 1b                         \n\t"
        : "+r" (s1), "+r" (s2)
        : "r" (block+64), "r" ((x86_reg)stride)
        : "%"REG_a
    );
}
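
/* Scalar sketch of diff_pixels_mmx (illustrative only): the widened
 * difference of two 8x8 blocks:
 *
 *     for (int y = 0; y < 8; y++) {
 *         for (int x = 0; x < 8; x++)
 *             block[y * 8 + x] = s1[x] - s2[x];
 *         s1 += stride;
 *         s2 += stride;
 *     }
 */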

static int pix_sum16_mmx(uint8_t * pix, int line_size){
    const int h=16;
    int sum;
    x86_reg index= -line_size*h;

    __asm__ volatile(
                "pxor %%mm7, %%mm7              \n\t"
                "pxor %%mm6, %%mm6              \n\t"
                "1:                             \n\t"
                "movq (%2, %1), %%mm0           \n\t"
                "movq (%2, %1), %%mm1           \n\t"
                "movq 8(%2, %1), %%mm2          \n\t"
                "movq 8(%2, %1), %%mm3          \n\t"
                "punpcklbw %%mm7, %%mm0         \n\t"
                "punpckhbw %%mm7, %%mm1         \n\t"
                "punpcklbw %%mm7, %%mm2         \n\t"
                "punpckhbw %%mm7, %%mm3         \n\t"
                "paddw %%mm0, %%mm1             \n\t"
                "paddw %%mm2, %%mm3             \n\t"
                "paddw %%mm1, %%mm3             \n\t"
                "paddw %%mm3, %%mm6             \n\t"
                "add %3, %1                     \n\t"
                " js 1b                         \n\t"
                "movq %%mm6, %%mm5              \n\t"
                "psrlq $32, %%mm6               \n\t"
                "paddw %%mm5, %%mm6             \n\t"
                "movq %%mm6, %%mm5              \n\t"
                "psrlq $16, %%mm6               \n\t"
                "paddw %%mm5, %%mm6             \n\t"
                "movd %%mm6, %0                 \n\t"
                "andl $0xFFFF, %0               \n\t"
                : "=&r" (sum), "+r" (index)
                : "r" (pix - index), "r" ((x86_reg)line_size)
    );

    return sum;
}
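
/* Scalar sketch of pix_sum16_mmx (illustrative only): the sum of all pixels
 * of a 16x16 block, which always fits in 16 bits (16*16*255 < 65536):
 *
 *     int sum = 0;
 *     for (int y = 0; y < 16; y++)
 *         for (int x = 0; x < 16; x++)
 *             sum += pix[y * line_size + x];
 */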

static int pix_norm1_mmx(uint8_t *pix, int line_size) {
    int tmp;
  __asm__ volatile (
      "movl $16,%%ecx\n"
      "pxor %%mm0,%%mm0\n"
      "pxor %%mm7,%%mm7\n"
      "1:\n"
      "movq (%0),%%mm2\n"       /* mm2 = pix[0-7] */
      "movq 8(%0),%%mm3\n"      /* mm3 = pix[8-15] */

      "movq %%mm2,%%mm1\n"      /* mm1 = mm2 = pix[0-7] */

      "punpckhbw %%mm0,%%mm1\n" /* mm1 = [pix4-7] */
      "punpcklbw %%mm0,%%mm2\n" /* mm2 = [pix0-3] */

      "movq %%mm3,%%mm4\n"      /* mm4 = mm3 = pix[8-15] */
      "punpckhbw %%mm0,%%mm3\n" /* mm3 = [pix12-15] */
      "punpcklbw %%mm0,%%mm4\n" /* mm4 = [pix8-11] */

      "pmaddwd %%mm1,%%mm1\n"   /* mm1 = (pix4^2+pix5^2,pix6^2+pix7^2) */
      "pmaddwd %%mm2,%%mm2\n"   /* mm2 = (pix0^2+pix1^2,pix2^2+pix3^2) */

      "pmaddwd %%mm3,%%mm3\n"
      "pmaddwd %%mm4,%%mm4\n"

      "paddd %%mm1,%%mm2\n"     /* mm2 = (pix0^2+pix1^2+pix4^2+pix5^2,
                                          pix2^2+pix3^2+pix6^2+pix7^2) */
      "paddd %%mm3,%%mm4\n"
      "paddd %%mm2,%%mm7\n"

      "add %2, %0\n"
      "paddd %%mm4,%%mm7\n"
      "dec %%ecx\n"
      "jnz 1b\n"

      "movq %%mm7,%%mm1\n"
      "psrlq $32, %%mm7\n"      /* shift hi dword to lo */
      "paddd %%mm7,%%mm1\n"
      "movd %%mm1,%1\n"
      : "+r" (pix), "=r"(tmp) : "r" ((x86_reg)line_size) : "%ecx" );
    return tmp;
}
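
/* Scalar sketch of pix_norm1_mmx (illustrative only): the sum of squares of
 * all pixels in a 16x16 block:
 *
 *     int sum = 0;
 *     for (int y = 0; y < 16; y++)
 *         for (int x = 0; x < 16; x++)
 *             sum += pix[y * line_size + x] * pix[y * line_size + x];
 */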

static int sse8_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    int tmp;
  __asm__ volatile (
      "movl %4,%%ecx\n"
      "shr $1,%%ecx\n"
      "pxor %%mm0,%%mm0\n"      /* mm0 = 0 */
      "pxor %%mm7,%%mm7\n"      /* mm7 holds the sum */
      "1:\n"
      "movq (%0),%%mm1\n"       /* mm1 = pix1[0][0-7] */
      "movq (%1),%%mm2\n"       /* mm2 = pix2[0][0-7] */
      "movq (%0,%3),%%mm3\n"    /* mm3 = pix1[1][0-7] */
      "movq (%1,%3),%%mm4\n"    /* mm4 = pix2[1][0-7] */

      /* todo: mm1-mm2, mm3-mm4 */
      /* algo: subtract mm1 from mm2 with saturation and vice versa */
      /*       OR the results to get absolute difference */
      "movq %%mm1,%%mm5\n"
      "movq %%mm3,%%mm6\n"
      "psubusb %%mm2,%%mm1\n"
      "psubusb %%mm4,%%mm3\n"
      "psubusb %%mm5,%%mm2\n"
      "psubusb %%mm6,%%mm4\n"

      "por %%mm1,%%mm2\n"
      "por %%mm3,%%mm4\n"

      /* now convert to 16-bit vectors so we can square them */
      "movq %%mm2,%%mm1\n"
      "movq %%mm4,%%mm3\n"

      "punpckhbw %%mm0,%%mm2\n"
      "punpckhbw %%mm0,%%mm4\n"
      "punpcklbw %%mm0,%%mm1\n" /* row 0 abs. differences now in (mm1,mm2) as words */
      "punpcklbw %%mm0,%%mm3\n" /* row 1 abs. differences now in (mm3,mm4) as words */

      "pmaddwd %%mm2,%%mm2\n"
      "pmaddwd %%mm4,%%mm4\n"
      "pmaddwd %%mm1,%%mm1\n"
      "pmaddwd %%mm3,%%mm3\n"

      "lea (%0,%3,2), %0\n"     /* pix1 += 2*line_size */
      "lea (%1,%3,2), %1\n"     /* pix2 += 2*line_size */

      "paddd %%mm2,%%mm1\n"
      "paddd %%mm4,%%mm3\n"
      "paddd %%mm1,%%mm7\n"
      "paddd %%mm3,%%mm7\n"

      "decl %%ecx\n"
      "jnz 1b\n"

      "movq %%mm7,%%mm1\n"
      "psrlq $32, %%mm7\n"      /* shift hi dword to lo */
      "paddd %%mm7,%%mm1\n"
      "movd %%mm1,%2\n"
      : "+r" (pix1), "+r" (pix2), "=r"(tmp)
      : "r" ((x86_reg)line_size) , "m" (h)
      : "%ecx");
    return tmp;
}
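
/* Scalar sketch of sse8_mmx (illustrative only): the sum of squared
 * differences over an 8-wide, h-high block:
 *
 *     int sum = 0;
 *     for (int y = 0; y < h; y++)
 *         for (int x = 0; x < 8; x++) {
 *             int d = pix1[y * line_size + x] - pix2[y * line_size + x];
 *             sum += d * d;
 *         }
 */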

static int sse16_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    int tmp;
  __asm__ volatile (
      "movl %4,%%ecx\n"
      "pxor %%mm0,%%mm0\n"      /* mm0 = 0 */
      "pxor %%mm7,%%mm7\n"      /* mm7 holds the sum */
      "1:\n"
      "movq (%0),%%mm1\n"       /* mm1 = pix1[0-7] */
      "movq (%1),%%mm2\n"       /* mm2 = pix2[0-7] */
      "movq 8(%0),%%mm3\n"      /* mm3 = pix1[8-15] */
      "movq 8(%1),%%mm4\n"      /* mm4 = pix2[8-15] */

      /* todo: mm1-mm2, mm3-mm4 */
      /* algo: subtract mm1 from mm2 with saturation and vice versa */
      /*       OR the results to get absolute difference */
      "movq %%mm1,%%mm5\n"
      "movq %%mm3,%%mm6\n"
      "psubusb %%mm2,%%mm1\n"
      "psubusb %%mm4,%%mm3\n"
      "psubusb %%mm5,%%mm2\n"
      "psubusb %%mm6,%%mm4\n"

      "por %%mm1,%%mm2\n"
      "por %%mm3,%%mm4\n"

      /* now convert to 16-bit vectors so we can square them */
      "movq %%mm2,%%mm1\n"
      "movq %%mm4,%%mm3\n"

      "punpckhbw %%mm0,%%mm2\n"
      "punpckhbw %%mm0,%%mm4\n"
      "punpcklbw %%mm0,%%mm1\n" /* abs. differences of pix[0-7] now in (mm1,mm2) as words */
      "punpcklbw %%mm0,%%mm3\n" /* abs. differences of pix[8-15] now in (mm3,mm4) as words */

      "pmaddwd %%mm2,%%mm2\n"
      "pmaddwd %%mm4,%%mm4\n"
      "pmaddwd %%mm1,%%mm1\n"
      "pmaddwd %%mm3,%%mm3\n"

      "add %3,%0\n"
      "add %3,%1\n"

      "paddd %%mm2,%%mm1\n"
      "paddd %%mm4,%%mm3\n"
      "paddd %%mm1,%%mm7\n"
      "paddd %%mm3,%%mm7\n"

      "decl %%ecx\n"
      "jnz 1b\n"

      "movq %%mm7,%%mm1\n"
      "psrlq $32, %%mm7\n"      /* shift hi dword to lo */
      "paddd %%mm7,%%mm1\n"
      "movd %%mm1,%2\n"
      : "+r" (pix1), "+r" (pix2), "=r"(tmp)
      : "r" ((x86_reg)line_size) , "m" (h)
      : "%ecx");
    return tmp;
}
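
/* Both sse8_mmx and sse16_mmx obtain absolute differences without a dedicated
 * instruction, via unsigned saturating subtraction:
 *
 *     |a - b| == sat(a - b) | sat(b - a)
 *
 * since at least one of the two saturated results is always zero. */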

int ff_sse16_sse2(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h);

static int hf_noise8_mmx(uint8_t * pix1, int line_size, int h) {
    int tmp;
  __asm__ volatile (
      "movl %3,%%ecx\n"
      "pxor %%mm7,%%mm7\n"
      "pxor %%mm6,%%mm6\n"

      "movq (%0),%%mm0\n"
      "movq %%mm0, %%mm1\n"
      "psllq $8, %%mm0\n"
      "psrlq $8, %%mm1\n"
      "psrlq $8, %%mm0\n"
      "movq %%mm0, %%mm2\n"
      "movq %%mm1, %%mm3\n"
      "punpcklbw %%mm7,%%mm0\n"
      "punpcklbw %%mm7,%%mm1\n"
      "punpckhbw %%mm7,%%mm2\n"
      "punpckhbw %%mm7,%%mm3\n"
      "psubw %%mm1, %%mm0\n"
      "psubw %%mm3, %%mm2\n"

      "add %2,%0\n"

      "movq (%0),%%mm4\n"
      "movq %%mm4, %%mm1\n"
      "psllq $8, %%mm4\n"
      "psrlq $8, %%mm1\n"
      "psrlq $8, %%mm4\n"
      "movq %%mm4, %%mm5\n"
      "movq %%mm1, %%mm3\n"
      "punpcklbw %%mm7,%%mm4\n"
      "punpcklbw %%mm7,%%mm1\n"
      "punpckhbw %%mm7,%%mm5\n"
      "punpckhbw %%mm7,%%mm3\n"
      "psubw %%mm1, %%mm4\n"
      "psubw %%mm3, %%mm5\n"
      "psubw %%mm4, %%mm0\n"
      "psubw %%mm5, %%mm2\n"
      "pxor %%mm3, %%mm3\n"
      "pxor %%mm1, %%mm1\n"
      "pcmpgtw %%mm0, %%mm3\n\t"
      "pcmpgtw %%mm2, %%mm1\n\t"
      "pxor %%mm3, %%mm0\n"
      "pxor %%mm1, %%mm2\n"
      "psubw %%mm3, %%mm0\n"
      "psubw %%mm1, %%mm2\n"
      "paddw %%mm0, %%mm2\n"
      "paddw %%mm2, %%mm6\n"

      "add %2,%0\n"
      "1:\n"

      "movq (%0),%%mm0\n"
      "movq %%mm0, %%mm1\n"
      "psllq $8, %%mm0\n"
      "psrlq $8, %%mm1\n"
      "psrlq $8, %%mm0\n"
      "movq %%mm0, %%mm2\n"
      "movq %%mm1, %%mm3\n"
      "punpcklbw %%mm7,%%mm0\n"
      "punpcklbw %%mm7,%%mm1\n"
      "punpckhbw %%mm7,%%mm2\n"
      "punpckhbw %%mm7,%%mm3\n"
      "psubw %%mm1, %%mm0\n"
      "psubw %%mm3, %%mm2\n"
      "psubw %%mm0, %%mm4\n"
      "psubw %%mm2, %%mm5\n"
      "pxor %%mm3, %%mm3\n"
      "pxor %%mm1, %%mm1\n"
      "pcmpgtw %%mm4, %%mm3\n\t"
      "pcmpgtw %%mm5, %%mm1\n\t"
      "pxor %%mm3, %%mm4\n"
      "pxor %%mm1, %%mm5\n"
      "psubw %%mm3, %%mm4\n"
      "psubw %%mm1, %%mm5\n"
      "paddw %%mm4, %%mm5\n"
      "paddw %%mm5, %%mm6\n"

      "add %2,%0\n"

      "movq (%0),%%mm4\n"
      "movq %%mm4, %%mm1\n"
      "psllq $8, %%mm4\n"
      "psrlq $8, %%mm1\n"
      "psrlq $8, %%mm4\n"
      "movq %%mm4, %%mm5\n"
      "movq %%mm1, %%mm3\n"
      "punpcklbw %%mm7,%%mm4\n"
      "punpcklbw %%mm7,%%mm1\n"
      "punpckhbw %%mm7,%%mm5\n"
      "punpckhbw %%mm7,%%mm3\n"
      "psubw %%mm1, %%mm4\n"
      "psubw %%mm3, %%mm5\n"
      "psubw %%mm4, %%mm0\n"
      "psubw %%mm5, %%mm2\n"
      "pxor %%mm3, %%mm3\n"
      "pxor %%mm1, %%mm1\n"
      "pcmpgtw %%mm0, %%mm3\n\t"
      "pcmpgtw %%mm2, %%mm1\n\t"
      "pxor %%mm3, %%mm0\n"
      "pxor %%mm1, %%mm2\n"
      "psubw %%mm3, %%mm0\n"
      "psubw %%mm1, %%mm2\n"
      "paddw %%mm0, %%mm2\n"
      "paddw %%mm2, %%mm6\n"

      "add %2,%0\n"
      "subl $2, %%ecx\n"
      " jnz 1b\n"

      "movq %%mm6, %%mm0\n"
      "punpcklwd %%mm7,%%mm0\n"
      "punpckhwd %%mm7,%%mm6\n"
      "paddd %%mm0, %%mm6\n"

      "movq %%mm6,%%mm0\n"
      "psrlq $32, %%mm6\n"
      "paddd %%mm6,%%mm0\n"
      "movd %%mm0,%1\n"
      : "+r" (pix1), "=r"(tmp)
      : "r" ((x86_reg)line_size) , "g" (h-2)
      : "%ecx");
    return tmp;
}
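
/* Roughly what hf_noise8_mmx measures, in scalar form (illustrative only):
 * the sum of absolute second-order differences, i.e. how much the horizontal
 * gradient changes from one line to the next:
 *
 *     int score = 0;
 *     for (int y = 0; y < h - 1; y++) {
 *         for (int x = 0; x < 7; x++)
 *             score += FFABS((pix1[x] - pix1[x + 1]) -
 *                            (pix1[x + line_size] - pix1[x + line_size + 1]));
 *         pix1 += line_size;
 *     }
 */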

static int hf_noise16_mmx(uint8_t * pix1, int line_size, int h) {
    int tmp;
    uint8_t * pix= pix1;
  __asm__ volatile (
      "movl %3,%%ecx\n"
      "pxor %%mm7,%%mm7\n"
      "pxor %%mm6,%%mm6\n"

      "movq (%0),%%mm0\n"
      "movq 1(%0),%%mm1\n"
      "movq %%mm0, %%mm2\n"
      "movq %%mm1, %%mm3\n"
      "punpcklbw %%mm7,%%mm0\n"
      "punpcklbw %%mm7,%%mm1\n"
      "punpckhbw %%mm7,%%mm2\n"
      "punpckhbw %%mm7,%%mm3\n"
      "psubw %%mm1, %%mm0\n"
      "psubw %%mm3, %%mm2\n"

      "add %2,%0\n"

      "movq (%0),%%mm4\n"
      "movq 1(%0),%%mm1\n"
      "movq %%mm4, %%mm5\n"
      "movq %%mm1, %%mm3\n"
      "punpcklbw %%mm7,%%mm4\n"
      "punpcklbw %%mm7,%%mm1\n"
      "punpckhbw %%mm7,%%mm5\n"
      "punpckhbw %%mm7,%%mm3\n"
      "psubw %%mm1, %%mm4\n"
      "psubw %%mm3, %%mm5\n"
      "psubw %%mm4, %%mm0\n"
      "psubw %%mm5, %%mm2\n"
      "pxor %%mm3, %%mm3\n"
      "pxor %%mm1, %%mm1\n"
      "pcmpgtw %%mm0, %%mm3\n\t"
      "pcmpgtw %%mm2, %%mm1\n\t"
      "pxor %%mm3, %%mm0\n"
      "pxor %%mm1, %%mm2\n"
      "psubw %%mm3, %%mm0\n"
      "psubw %%mm1, %%mm2\n"
      "paddw %%mm0, %%mm2\n"
      "paddw %%mm2, %%mm6\n"

      "add %2,%0\n"
      "1:\n"

      "movq (%0),%%mm0\n"
      "movq 1(%0),%%mm1\n"
      "movq %%mm0, %%mm2\n"
      "movq %%mm1, %%mm3\n"
      "punpcklbw %%mm7,%%mm0\n"
      "punpcklbw %%mm7,%%mm1\n"
      "punpckhbw %%mm7,%%mm2\n"
      "punpckhbw %%mm7,%%mm3\n"
      "psubw %%mm1, %%mm0\n"
      "psubw %%mm3, %%mm2\n"
      "psubw %%mm0, %%mm4\n"
      "psubw %%mm2, %%mm5\n"
      "pxor %%mm3, %%mm3\n"
      "pxor %%mm1, %%mm1\n"
      "pcmpgtw %%mm4, %%mm3\n\t"
      "pcmpgtw %%mm5, %%mm1\n\t"
      "pxor %%mm3, %%mm4\n"
      "pxor %%mm1, %%mm5\n"
      "psubw %%mm3, %%mm4\n"
      "psubw %%mm1, %%mm5\n"
      "paddw %%mm4, %%mm5\n"
      "paddw %%mm5, %%mm6\n"

      "add %2,%0\n"

      "movq (%0),%%mm4\n"
      "movq 1(%0),%%mm1\n"
      "movq %%mm4, %%mm5\n"
      "movq %%mm1, %%mm3\n"
      "punpcklbw %%mm7,%%mm4\n"
      "punpcklbw %%mm7,%%mm1\n"
      "punpckhbw %%mm7,%%mm5\n"
      "punpckhbw %%mm7,%%mm3\n"
      "psubw %%mm1, %%mm4\n"
      "psubw %%mm3, %%mm5\n"
      "psubw %%mm4, %%mm0\n"
      "psubw %%mm5, %%mm2\n"
      "pxor %%mm3, %%mm3\n"
      "pxor %%mm1, %%mm1\n"
      "pcmpgtw %%mm0, %%mm3\n\t"
      "pcmpgtw %%mm2, %%mm1\n\t"
      "pxor %%mm3, %%mm0\n"
      "pxor %%mm1, %%mm2\n"
      "psubw %%mm3, %%mm0\n"
      "psubw %%mm1, %%mm2\n"
      "paddw %%mm0, %%mm2\n"
      "paddw %%mm2, %%mm6\n"

      "add %2,%0\n"
      "subl $2, %%ecx\n"
      " jnz 1b\n"

      "movq %%mm6, %%mm0\n"
      "punpcklwd %%mm7,%%mm0\n"
      "punpckhwd %%mm7,%%mm6\n"
      "paddd %%mm0, %%mm6\n"

      "movq %%mm6,%%mm0\n"
      "psrlq $32, %%mm6\n"
      "paddd %%mm6,%%mm0\n"
      "movd %%mm0,%1\n"
      : "+r" (pix1), "=r"(tmp)
      : "r" ((x86_reg)line_size) , "g" (h-2)
      : "%ecx");
    /* the asm above covers columns 0..8; the right half is handled below */
    return tmp + hf_noise8_mmx(pix+8, line_size, h);
}

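/* NSSE = SSE + nsse_weight * |hf_noise(pix1) - hf_noise(pix2)|: a comparison
 * that charges extra for removing high-frequency "noise" (texture, grain)
 * rather than merely mismatching it. When no context is available the weight
 * defaults to 8. */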
static int nsse16_mmx(void *p, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    MpegEncContext *c = p;
    int score1, score2;

    if(c) score1 = c->dsp.sse[0](c, pix1, pix2, line_size, h);
    else  score1 = sse16_mmx(c, pix1, pix2, line_size, h);
    score2= hf_noise16_mmx(pix1, line_size, h) - hf_noise16_mmx(pix2, line_size, h);

    if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
    else  return score1 + FFABS(score2)*8;
}

static int nsse8_mmx(void *p, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    MpegEncContext *c = p;
    int score1= sse8_mmx(c, pix1, pix2, line_size, h);
    int score2= hf_noise8_mmx(pix1, line_size, h) - hf_noise8_mmx(pix2, line_size, h);

    if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
    else  return score1 + FFABS(score2)*8;
}

static int vsad_intra16_mmx(void *v, uint8_t * pix, uint8_t * dummy, int line_size, int h) {
    int tmp;

    assert(((uintptr_t)pix & 7) == 0);
    assert((line_size & 7) == 0);

#define SUM(in0, in1, out0, out1) \
      "movq (%0), %%mm2\n"\
      "movq 8(%0), %%mm3\n"\
      "add %2,%0\n"\
      "movq %%mm2, " #out0 "\n"\
      "movq %%mm3, " #out1 "\n"\
      "psubusb " #in0 ", %%mm2\n"\
      "psubusb " #in1 ", %%mm3\n"\
      "psubusb " #out0 ", " #in0 "\n"\
      "psubusb " #out1 ", " #in1 "\n"\
      "por %%mm2, " #in0 "\n"\
      "por %%mm3, " #in1 "\n"\
      "movq " #in0 ", %%mm2\n"\
      "movq " #in1 ", %%mm3\n"\
      "punpcklbw %%mm7, " #in0 "\n"\
      "punpcklbw %%mm7, " #in1 "\n"\
      "punpckhbw %%mm7, %%mm2\n"\
      "punpckhbw %%mm7, %%mm3\n"\
      "paddw " #in1 ", " #in0 "\n"\
      "paddw %%mm3, %%mm2\n"\
      "paddw %%mm2, " #in0 "\n"\
      "paddw " #in0 ", %%mm6\n"


  __asm__ volatile (
      "movl %3,%%ecx\n"
      "pxor %%mm6,%%mm6\n"
      "pxor %%mm7,%%mm7\n"
      "movq (%0),%%mm0\n"
      "movq 8(%0),%%mm1\n"
      "add %2,%0\n"
      "jmp 2f\n"
      "1:\n"

      SUM(%%mm4, %%mm5, %%mm0, %%mm1)
      "2:\n"
      SUM(%%mm0, %%mm1, %%mm4, %%mm5)

      "subl $2, %%ecx\n"
      "jnz 1b\n"

      "movq %%mm6,%%mm0\n"
      "psrlq $32, %%mm6\n"
      "paddw %%mm6,%%mm0\n"
      "movq %%mm0,%%mm6\n"
      "psrlq $16, %%mm0\n"
      "paddw %%mm6,%%mm0\n"
      "movd %%mm0,%1\n"
      : "+r" (pix), "=r"(tmp)
      : "r" ((x86_reg)line_size) , "m" (h)
      : "%ecx");
    return tmp & 0xFFFF;
}
#undef SUM
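
/* Scalar sketch of vsad_intra16_mmx (illustrative only): the sum of absolute
 * vertical differences within one 16-wide, h-high block:
 *
 *     int sum = 0;
 *     for (int y = 1; y < h; y++)
 *         for (int x = 0; x < 16; x++)
 *             sum += FFABS(pix[y * line_size + x] - pix[(y - 1) * line_size + x]);
 *     return sum & 0xFFFF;
 */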

static int vsad_intra16_mmx2(void *v, uint8_t * pix, uint8_t * dummy, int line_size, int h) {
    int tmp;

    assert(((uintptr_t)pix & 7) == 0);
    assert((line_size & 7) == 0);

/* psadbw computes the sum of absolute differences of 8 byte pairs in one
 * instruction, replacing the whole unpack/subtract/accumulate sequence of
 * the plain MMX version above. */
#define SUM(in0, in1, out0, out1) \
      "movq (%0), " #out0 "\n"\
      "movq 8(%0), " #out1 "\n"\
      "add %2,%0\n"\
      "psadbw " #out0 ", " #in0 "\n"\
      "psadbw " #out1 ", " #in1 "\n"\
      "paddw " #in1 ", " #in0 "\n"\
      "paddw " #in0 ", %%mm6\n"

  __asm__ volatile (
      "movl %3,%%ecx\n"
      "pxor %%mm6,%%mm6\n"
      "pxor %%mm7,%%mm7\n"
      "movq (%0),%%mm0\n"
      "movq 8(%0),%%mm1\n"
      "add %2,%0\n"
      "jmp 2f\n"
      "1:\n"

      SUM(%%mm4, %%mm5, %%mm0, %%mm1)
      "2:\n"
      SUM(%%mm0, %%mm1, %%mm4, %%mm5)

      "subl $2, %%ecx\n"
      "jnz 1b\n"

      "movd %%mm6,%1\n"
      : "+r" (pix), "=r"(tmp)
      : "r" ((x86_reg)line_size) , "m" (h)
      : "%ecx");
    return tmp;
}
#undef SUM

static int vsad16_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    int tmp;

    assert(((uintptr_t)pix1 & 7) == 0);
    assert(((uintptr_t)pix2 & 7) == 0);
    assert((line_size & 7) == 0);

#define SUM(in0, in1, out0, out1) \
      "movq (%0),%%mm2\n"\
      "movq (%1)," #out0 "\n"\
      "movq 8(%0),%%mm3\n"\
      "movq 8(%1)," #out1 "\n"\
      "add %3,%0\n"\
      "add %3,%1\n"\
      "psubb " #out0 ", %%mm2\n"\
      "psubb " #out1 ", %%mm3\n"\
      "pxor %%mm7, %%mm2\n"\
      "pxor %%mm7, %%mm3\n"\
      "movq %%mm2, " #out0 "\n"\
      "movq %%mm3, " #out1 "\n"\
      "psubusb " #in0 ", %%mm2\n"\
      "psubusb " #in1 ", %%mm3\n"\
      "psubusb " #out0 ", " #in0 "\n"\
      "psubusb " #out1 ", " #in1 "\n"\
      "por %%mm2, " #in0 "\n"\
      "por %%mm3, " #in1 "\n"\
      "movq " #in0 ", %%mm2\n"\
      "movq " #in1 ", %%mm3\n"\
      "punpcklbw %%mm7, " #in0 "\n"\
      "punpcklbw %%mm7, " #in1 "\n"\
      "punpckhbw %%mm7, %%mm2\n"\
      "punpckhbw %%mm7, %%mm3\n"\
      "paddw " #in1 ", " #in0 "\n"\
      "paddw %%mm3, %%mm2\n"\
      "paddw %%mm2, " #in0 "\n"\
      "paddw " #in0 ", %%mm6\n"


  __asm__ volatile (
      "movl %4,%%ecx\n"
      "pxor %%mm6,%%mm6\n"
      "pcmpeqw %%mm7,%%mm7\n"   /* mm7 = all ones */
      "psllw $15, %%mm7\n"      /* mm7 = 0x8000 in each word */
      "packsswb %%mm7, %%mm7\n" /* mm7 = 0x80 in each byte */
      "movq (%0),%%mm0\n"
      "movq (%1),%%mm2\n"
      "movq 8(%0),%%mm1\n"
      "movq 8(%1),%%mm3\n"
      "add %3,%0\n"
      "add %3,%1\n"
      "psubb %%mm2, %%mm0\n"
      "psubb %%mm3, %%mm1\n"
      "pxor %%mm7, %%mm0\n"     /* bias the signed differences to unsigned */
      "pxor %%mm7, %%mm1\n"
      "jmp 2f\n"
      "1:\n"

      SUM(%%mm4, %%mm5, %%mm0, %%mm1)
      "2:\n"
      SUM(%%mm0, %%mm1, %%mm4, %%mm5)

      "subl $2, %%ecx\n"
      "jnz 1b\n"

      "movq %%mm6,%%mm0\n"
      "psrlq $32, %%mm6\n"
      "paddw %%mm6,%%mm0\n"
      "movq %%mm0,%%mm6\n"
      "psrlq $16, %%mm0\n"
      "paddw %%mm6,%%mm0\n"
      "movd %%mm0,%2\n"
      : "+r" (pix1), "+r" (pix2), "=r"(tmp)
      : "r" ((x86_reg)line_size) , "m" (h)
      : "%ecx");
    return tmp & 0x7FFF;
}
#undef SUM
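
/* Scalar sketch of vsad16_mmx (illustrative only): a vertical SAD computed on
 * the per-pixel difference d[y][x] = pix1[y][x] - pix2[y][x]:
 *
 *     int sum = 0;
 *     for (int y = 1; y < h; y++)
 *         for (int x = 0; x < 16; x++)
 *             sum += FFABS(d[y][x] - d[y - 1][x]);
 *     return sum & 0x7FFF;
 *
 * The 0x80 XOR above biases the signed byte differences into unsigned range
 * so the saturating-subtract absolute-difference trick remains valid. */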

static int vsad16_mmx2(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    int tmp;

    assert(((uintptr_t)pix1 & 7) == 0);
    assert(((uintptr_t)pix2 & 7) == 0);
    assert((line_size & 7) == 0);

#define SUM(in0, in1, out0, out1) \
      "movq (%0)," #out0 "\n"\
      "movq (%1),%%mm2\n"\
      "movq 8(%0)," #out1 "\n"\
      "movq 8(%1),%%mm3\n"\
      "add %3,%0\n"\
      "add %3,%1\n"\
      "psubb %%mm2, " #out0 "\n"\
      "psubb %%mm3, " #out1 "\n"\
      "pxor %%mm7, " #out0 "\n"\
      "pxor %%mm7, " #out1 "\n"\
      "psadbw " #out0 ", " #in0 "\n"\
      "psadbw " #out1 ", " #in1 "\n"\
      "paddw " #in1 ", " #in0 "\n"\
      "paddw " #in0 ", %%mm6\n"

  __asm__ volatile (
      "movl %4,%%ecx\n"
      "pxor %%mm6,%%mm6\n"
      "pcmpeqw %%mm7,%%mm7\n"   /* mm7 = all ones */
      "psllw $15, %%mm7\n"      /* mm7 = 0x8000 in each word */
      "packsswb %%mm7, %%mm7\n" /* mm7 = 0x80 in each byte */
      "movq (%0),%%mm0\n"
      "movq (%1),%%mm2\n"
      "movq 8(%0),%%mm1\n"
      "movq 8(%1),%%mm3\n"
      "add %3,%0\n"
      "add %3,%1\n"
      "psubb %%mm2, %%mm0\n"
      "psubb %%mm3, %%mm1\n"
      "pxor %%mm7, %%mm0\n"     /* bias the signed differences to unsigned */
      "pxor %%mm7, %%mm1\n"
      "jmp 2f\n"
      "1:\n"

      SUM(%%mm4, %%mm5, %%mm0, %%mm1)
      "2:\n"
      SUM(%%mm0, %%mm1, %%mm4, %%mm5)

      "subl $2, %%ecx\n"
      "jnz 1b\n"

      "movd %%mm6,%2\n"
      : "+r" (pix1), "+r" (pix2), "=r"(tmp)
      : "r" ((x86_reg)line_size) , "m" (h)
      : "%ecx");
    return tmp;
}
#undef SUM

static void diff_bytes_mmx(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
    x86_reg i=0;
    __asm__ volatile(
        "1:                             \n\t"
        "movq  (%2, %0), %%mm0          \n\t"
        "movq  (%1, %0), %%mm1          \n\t"
        "psubb %%mm0, %%mm1             \n\t"
        "movq %%mm1, (%3, %0)           \n\t"
        "movq 8(%2, %0), %%mm0          \n\t"
        "movq 8(%1, %0), %%mm1          \n\t"
        "psubb %%mm0, %%mm1             \n\t"
        "movq %%mm1, 8(%3, %0)          \n\t"
        "add $16, %0                    \n\t"
        "cmp %4, %0                     \n\t"
        " jb 1b                         \n\t"
        : "+r" (i)
        : "r"(src1), "r"(src2), "r"(dst), "r"((x86_reg)w-15)
    );
    for(; i<w; i++)
        dst[i+0] = src1[i+0]-src2[i+0];
}

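/* sub_hfyu_median_prediction: subtract the HuffYUV median predictor from each
 * byte of src2:
 *
 *     pred   = mid_pred(L, T, L + T - LT)
 *     dst[i] = src2[i] - pred
 *
 * with T = src1[i] (top), L = src2[i-1] (left), LT = src1[i-1] (top-left).
 * The first byte and the *left/*left_top bookkeeping are done in C after the
 * asm loop. */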
static void sub_hfyu_median_prediction_mmx2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int w, int *left, int *left_top){
    x86_reg i=0;
    uint8_t l, lt;

    __asm__ volatile(
        "1:                             \n\t"
        "movq  -1(%1, %0), %%mm0        \n\t" // LT
        "movq  (%1, %0), %%mm1          \n\t" // T
        "movq  -1(%2, %0), %%mm2        \n\t" // L
        "movq  (%2, %0), %%mm3          \n\t" // X
        "movq %%mm2, %%mm4              \n\t" // L
        "psubb %%mm0, %%mm2             \n\t"
        "paddb %%mm1, %%mm2             \n\t" // L + T - LT
        "movq %%mm4, %%mm5              \n\t" // L
        "pmaxub %%mm1, %%mm4            \n\t" // max(T, L)
        "pminub %%mm5, %%mm1            \n\t" // min(T, L)
        "pminub %%mm2, %%mm4            \n\t"
        "pmaxub %%mm1, %%mm4            \n\t"
        "psubb %%mm4, %%mm3             \n\t" // dst - pred
        "movq %%mm3, (%3, %0)           \n\t"
        "add $8, %0                     \n\t"
        "cmp %4, %0                     \n\t"
        " jb 1b                         \n\t"
        : "+r" (i)
        : "r"(src1), "r"(src2), "r"(dst), "r"((x86_reg)w)
    );

    l= *left;
    lt= *left_top;

    dst[0]= src2[0] - mid_pred(l, src1[0], (l + src1[0] - lt)&0xFF);

    *left_top= src1[w-1];
    *left    = src2[w-1];
}

/* MMABS(a, z): replace the signed words in a with their absolute values,
 * using z as scratch. The MMX variant relies on |a| = (a ^ sign) - sign,
 * the MMX2 variant on |a| = max(a, -a); SSSE3 has a dedicated instruction. */
#define MMABS_MMX(a,z)\
    "pxor " #z ", " #z "              \n\t"\
    "pcmpgtw " #a ", " #z "           \n\t"\
    "pxor " #z ", " #a "              \n\t"\
    "psubw " #z ", " #a "             \n\t"

#define MMABS_MMX2(a,z)\
    "pxor " #z ", " #z "              \n\t"\
    "psubw " #a ", " #z "             \n\t"\
    "pmaxsw " #z ", " #a "            \n\t"

#define MMABS_SSSE3(a,z)\
    "pabsw " #a ", " #a "             \n\t"

#define MMABS_SUM(a,z, sum)\
    MMABS(a,z)\
    "paddusw " #a ", " #sum "         \n\t"

/* FIXME: HSUM_* saturates at 64k, while an 8x8 hadamard or dct block can get up to
 * about 100k on extreme inputs. But that's very unlikely to occur in natural video,
 * and it's even more unlikely to not have any alternative mvs/modes with lower cost. */
#define HSUM_MMX(a, t, dst)\
    "movq "#a", "#t"                  \n\t"\
    "psrlq $32, "#a"                  \n\t"\
    "paddusw "#t", "#a"               \n\t"\
    "movq "#a", "#t"                  \n\t"\
    "psrlq $16, "#a"                  \n\t"\
    "paddusw "#t", "#a"               \n\t"\
    "movd "#a", "#dst"                \n\t"\

#define HSUM_MMX2(a, t, dst)\
    "pshufw $0x0E, "#a", "#t"         \n\t"\
    "paddusw "#t", "#a"               \n\t"\
    "pshufw $0x01, "#a", "#t"         \n\t"\
    "paddusw "#t", "#a"               \n\t"\
    "movd "#a", "#dst"                \n\t"\

#define HSUM_SSE2(a, t, dst)\
    "movhlps "#a", "#t"               \n\t"\
    "paddusw "#t", "#a"               \n\t"\
    "pshuflw $0x0E, "#a", "#t"        \n\t"\
    "paddusw "#t", "#a"               \n\t"\
    "pshuflw $0x01, "#a", "#t"        \n\t"\
    "paddusw "#t", "#a"               \n\t"\
    "movd "#a", "#dst"                \n\t"\

#define hadamard_func(cpu) \
int ff_hadamard8_diff_##cpu  (void *s, uint8_t *src1, uint8_t *src2, \
                              int stride, int h); \
int ff_hadamard8_diff16_##cpu(void *s, uint8_t *src1, uint8_t *src2, \
                              int stride, int h);

hadamard_func(mmx)
hadamard_func(mmx2)
hadamard_func(sse2)
hadamard_func(ssse3)

#define DCT_SAD4(m,mm,o)\
    "mov"#m" "#o"+ 0(%1), "#mm"2      \n\t"\
    "mov"#m" "#o"+16(%1), "#mm"3      \n\t"\
    "mov"#m" "#o"+32(%1), "#mm"4      \n\t"\
    "mov"#m" "#o"+48(%1), "#mm"5      \n\t"\
    MMABS_SUM(mm##2, mm##6, mm##0)\
    MMABS_SUM(mm##3, mm##7, mm##1)\
    MMABS_SUM(mm##4, mm##6, mm##0)\
    MMABS_SUM(mm##5, mm##7, mm##1)\

#define DCT_SAD_MMX\
    "pxor %%mm0, %%mm0                \n\t"\
    "pxor %%mm1, %%mm1                \n\t"\
    DCT_SAD4(q, %%mm, 0)\
    DCT_SAD4(q, %%mm, 8)\
    DCT_SAD4(q, %%mm, 64)\
    DCT_SAD4(q, %%mm, 72)\
    "paddusw %%mm1, %%mm0             \n\t"\
    HSUM(%%mm0, %%mm1, %0)

#define DCT_SAD_SSE2\
    "pxor %%xmm0, %%xmm0              \n\t"\
    "pxor %%xmm1, %%xmm1              \n\t"\
    DCT_SAD4(dqa, %%xmm, 0)\
    DCT_SAD4(dqa, %%xmm, 64)\
    "paddusw %%xmm1, %%xmm0           \n\t"\
    HSUM(%%xmm0, %%xmm1, %0)

#define DCT_SAD_FUNC(cpu) \
static int sum_abs_dctelem_##cpu(DCTELEM *block){\
    int sum;\
    __asm__ volatile(\
        DCT_SAD\
        :"=r"(sum)\
        :"r"(block)\
    );\
    return sum&0xFFFF;\
}

#define DCT_SAD       DCT_SAD_MMX
#define HSUM(a,t,dst) HSUM_MMX(a,t,dst)
#define MMABS(a,z)    MMABS_MMX(a,z)
DCT_SAD_FUNC(mmx)
#undef MMABS
#undef HSUM

#define HSUM(a,t,dst) HSUM_MMX2(a,t,dst)
#define MMABS(a,z)    MMABS_MMX2(a,z)
DCT_SAD_FUNC(mmx2)
#undef HSUM
#undef DCT_SAD

#define DCT_SAD       DCT_SAD_SSE2
#define HSUM(a,t,dst) HSUM_SSE2(a,t,dst)
DCT_SAD_FUNC(sse2)
#undef MMABS

#if HAVE_SSSE3
#define MMABS(a,z)    MMABS_SSSE3(a,z)
DCT_SAD_FUNC(ssse3)
#undef MMABS
#endif
#undef HSUM
#undef DCT_SAD

static int ssd_int8_vs_int16_mmx(const int8_t *pix1, const int16_t *pix2, int size){
    int sum;
    x86_reg i=size;
    __asm__ volatile(
        "pxor %%mm4, %%mm4 \n"
        "1: \n"
        "sub $8, %0 \n"
        "movq (%2,%0), %%mm2 \n"
        "movq (%3,%0,2), %%mm0 \n"
        "movq 8(%3,%0,2), %%mm1 \n"
        "punpckhbw %%mm2, %%mm3 \n" /* stale mm3 bits are shifted out below */
        "punpcklbw %%mm2, %%mm2 \n"
        "psraw $8, %%mm3 \n"        /* sign-extend pix1[4-7] to words */
        "psraw $8, %%mm2 \n"        /* sign-extend pix1[0-3] to words */
        "psubw %%mm3, %%mm1 \n"
        "psubw %%mm2, %%mm0 \n"
        "pmaddwd %%mm1, %%mm1 \n"
        "pmaddwd %%mm0, %%mm0 \n"
        "paddd %%mm1, %%mm4 \n"
        "paddd %%mm0, %%mm4 \n"
        "jg 1b \n"                  /* MMX ops leave EFLAGS alone, so this tests the sub above */
        "movq %%mm4, %%mm3 \n"
        "psrlq $32, %%mm3 \n"
        "paddd %%mm3, %%mm4 \n"
        "movd %%mm4, %1 \n"
        :"+r"(i), "=r"(sum)
        :"r"(pix1), "r"(pix2)
    );
    return sum;
}
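
/* Scalar sketch of ssd_int8_vs_int16_mmx (illustrative only):
 *
 *     int sum = 0;
 *     for (int i = 0; i < size; i++) {
 *         int d = pix1[i] - pix2[i];
 *         sum += d * d;
 *     }
 */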

#define PHADDD(a, t)\
    "movq "#a", "#t"                  \n\t"\
    "psrlq $32, "#a"                  \n\t"\
    "paddd "#t", "#a"                 \n\t"
/*
   pmulhw:   dst[0-15] = (src[0-15] * dst[0-15])[16-31]
   pmulhrw:  dst[0-15] = (src[0-15] * dst[0-15] + 0x8000)[16-31]
   pmulhrsw: dst[0-15] = (src[0-15] * dst[0-15] + 0x4000)[15-30]
 */
#define PMULHRW(x, y, s, o)\
    "pmulhw " #s ", "#x "            \n\t"\
    "pmulhw " #s ", "#y "            \n\t"\
    "paddw " #o ", "#x "             \n\t"\
    "paddw " #o ", "#y "             \n\t"\
    "psraw $1, "#x "                 \n\t"\
    "psraw $1, "#y "                 \n\t"
#define DEF(x) x ## _mmx
#define SET_RND MOVQ_WONE
#define SCALE_OFFSET 1

#include "dsputil_mmx_qns_template.c"

#undef DEF
#undef SET_RND
#undef SCALE_OFFSET
#undef PMULHRW

#define DEF(x) x ## _3dnow
#define SET_RND(x)
#define SCALE_OFFSET 0
#define PMULHRW(x, y, s, o)\
    "pmulhrw " #s ", "#x "           \n\t"\
    "pmulhrw " #s ", "#y "           \n\t"

#include "dsputil_mmx_qns_template.c"

#undef DEF
#undef SET_RND
#undef SCALE_OFFSET
#undef PMULHRW

#if HAVE_SSSE3
#undef PHADDD
#define DEF(x) x ## _ssse3
#define SET_RND(x)
#define SCALE_OFFSET -1
#define PHADDD(a, t)\
    "pshufw $0x0E, "#a", "#t"         \n\t"\
    "paddd "#t", "#a"                 \n\t" /* faster than phaddd on core2 */
#define PMULHRW(x, y, s, o)\
    "pmulhrsw " #s ", "#x "          \n\t"\
    "pmulhrsw " #s ", "#y "          \n\t"

#include "dsputil_mmx_qns_template.c"

#undef DEF
#undef SET_RND
#undef SCALE_OFFSET
#undef PMULHRW
#undef PHADDD
#endif //HAVE_SSSE3


void dsputilenc_init_mmx(DSPContext* c, AVCodecContext *avctx)
{
    int mm_flags = av_get_cpu_flags();
    int bit_depth = avctx->bits_per_raw_sample;

    if (mm_flags & AV_CPU_FLAG_MMX) {
        const int dct_algo = avctx->dct_algo;
        if (bit_depth <= 8 &&
            (dct_algo==FF_DCT_AUTO || dct_algo==FF_DCT_MMX)) {
            if(mm_flags & AV_CPU_FLAG_SSE2){
                c->fdct = ff_fdct_sse2;
            }else if(mm_flags & AV_CPU_FLAG_MMX2){
                c->fdct = ff_fdct_mmx2;
            }else{
                c->fdct = ff_fdct_mmx;
            }
        }

        if (bit_depth <= 8)
            c->get_pixels = get_pixels_mmx;
        c->diff_pixels = diff_pixels_mmx;
        c->pix_sum = pix_sum16_mmx;

        c->diff_bytes= diff_bytes_mmx;
        c->sum_abs_dctelem= sum_abs_dctelem_mmx;

#if HAVE_YASM
        c->hadamard8_diff[0]= ff_hadamard8_diff16_mmx;
        c->hadamard8_diff[1]= ff_hadamard8_diff_mmx;
#endif

        c->pix_norm1 = pix_norm1_mmx;
        c->sse[0] = (HAVE_YASM && mm_flags & AV_CPU_FLAG_SSE2) ? ff_sse16_sse2 : sse16_mmx;
        c->sse[1] = sse8_mmx;
        c->vsad[4]= vsad_intra16_mmx;

        c->nsse[0] = nsse16_mmx;
        c->nsse[1] = nsse8_mmx;
        if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
            c->vsad[0] = vsad16_mmx;
            c->try_8x8basis= try_8x8basis_mmx;
        }
        c->add_8x8basis= add_8x8basis_mmx;

        c->ssd_int8_vs_int16 = ssd_int8_vs_int16_mmx;

        if (mm_flags & AV_CPU_FLAG_MMX2) {
            c->sum_abs_dctelem= sum_abs_dctelem_mmx2;
#if HAVE_YASM
            c->hadamard8_diff[0]= ff_hadamard8_diff16_mmx2;
            c->hadamard8_diff[1]= ff_hadamard8_diff_mmx2;
#endif
            c->vsad[4]= vsad_intra16_mmx2;

            if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
                c->vsad[0] = vsad16_mmx2;
            }

            c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_mmx2;
        }

        if(mm_flags & AV_CPU_FLAG_SSE2){
            if (bit_depth <= 8)
                c->get_pixels = get_pixels_sse2;
            c->sum_abs_dctelem= sum_abs_dctelem_sse2;
#if HAVE_YASM && HAVE_ALIGNED_STACK
            c->hadamard8_diff[0]= ff_hadamard8_diff16_sse2;
            c->hadamard8_diff[1]= ff_hadamard8_diff_sse2;
#endif
        }

#if HAVE_SSSE3
        if(mm_flags & AV_CPU_FLAG_SSSE3){
            if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
                c->try_8x8basis= try_8x8basis_ssse3;
            }
            c->add_8x8basis= add_8x8basis_ssse3;
            c->sum_abs_dctelem= sum_abs_dctelem_ssse3;
#if HAVE_YASM && HAVE_ALIGNED_STACK
            c->hadamard8_diff[0]= ff_hadamard8_diff16_ssse3;
            c->hadamard8_diff[1]= ff_hadamard8_diff_ssse3;
#endif
        }
#endif

        if(mm_flags & AV_CPU_FLAG_3DNOW){
            if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
                c->try_8x8basis= try_8x8basis_3dnow;
            }
            c->add_8x8basis= add_8x8basis_3dnow;
        }
    }

    dsputil_init_pix_mmx(c, avctx);
}