/*
 * MMX optimized DSP utils
 * Copyright (c) 2000, 2001 Fabrice Bellard
 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 *
 * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
 */

#include "libavutil/x86_cpu.h"
#include "libavcodec/dsputil.h"
#include "libavcodec/mpegvideo.h"
#include "libavcodec/mathops.h"
#include "dsputil_mmx.h"


static void get_pixels_mmx(DCTELEM *block, const uint8_t *pixels, int line_size)
{
    __asm__ volatile(
        "mov $-128, %%"REG_a"           \n\t"
        "pxor %%mm7, %%mm7              \n\t"
        ASMALIGN(4)
        "1:                             \n\t"
        "movq (%0), %%mm0               \n\t"
        "movq (%0, %2), %%mm2           \n\t"
        "movq %%mm0, %%mm1              \n\t"
        "movq %%mm2, %%mm3              \n\t"
        "punpcklbw %%mm7, %%mm0         \n\t"
        "punpckhbw %%mm7, %%mm1         \n\t"
        "punpcklbw %%mm7, %%mm2         \n\t"
        "punpckhbw %%mm7, %%mm3         \n\t"
        "movq %%mm0, (%1, %%"REG_a")    \n\t"
        "movq %%mm1, 8(%1, %%"REG_a")   \n\t"
        "movq %%mm2, 16(%1, %%"REG_a")  \n\t"
        "movq %%mm3, 24(%1, %%"REG_a")  \n\t"
        "add %3, %0                     \n\t"
        "add $32, %%"REG_a"             \n\t"
        "js 1b                          \n\t"
        : "+r" (pixels)
        : "r" (block+64), "r" ((x86_reg)line_size), "r" ((x86_reg)line_size*2)
        : "%"REG_a
    );
}
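
/* Illustrative scalar sketch (not compiled, name made up here) of what
 * get_pixels_mmx() above computes: an 8x8 block of unsigned bytes widened
 * to 16-bit DCT coefficients. */
#if 0
static void get_pixels_c_sketch(DCTELEM *block, const uint8_t *pixels, int line_size)
{
    int i, j;
    for (i = 0; i < 8; i++) {
        for (j = 0; j < 8; j++)
            block[8*i + j] = pixels[j];
        pixels += line_size;
    }
}
#endif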

static void get_pixels_sse2(DCTELEM *block, const uint8_t *pixels, int line_size)
{
    __asm__ volatile(
        "pxor %%xmm7,      %%xmm7         \n\t"
        "movq (%0),        %%xmm0         \n\t"
        "movq (%0, %2),    %%xmm1         \n\t"
        "movq (%0, %2,2),  %%xmm2         \n\t"
        "movq (%0, %3),    %%xmm3         \n\t"
        "lea (%0,%2,4), %0                \n\t"
        "punpcklbw %%xmm7, %%xmm0         \n\t"
        "punpcklbw %%xmm7, %%xmm1         \n\t"
        "punpcklbw %%xmm7, %%xmm2         \n\t"
        "punpcklbw %%xmm7, %%xmm3         \n\t"
        "movdqa %%xmm0,      (%1)         \n\t"
        "movdqa %%xmm1,    16(%1)         \n\t"
        "movdqa %%xmm2,    32(%1)         \n\t"
        "movdqa %%xmm3,    48(%1)         \n\t"
        "movq (%0),        %%xmm0         \n\t"
        "movq (%0, %2),    %%xmm1         \n\t"
        "movq (%0, %2,2),  %%xmm2         \n\t"
        "movq (%0, %3),    %%xmm3         \n\t"
        "punpcklbw %%xmm7, %%xmm0         \n\t"
        "punpcklbw %%xmm7, %%xmm1         \n\t"
        "punpcklbw %%xmm7, %%xmm2         \n\t"
        "punpcklbw %%xmm7, %%xmm3         \n\t"
        "movdqa %%xmm0,    64(%1)         \n\t"
        "movdqa %%xmm1,    80(%1)         \n\t"
        "movdqa %%xmm2,    96(%1)         \n\t"
        "movdqa %%xmm3,   112(%1)         \n\t"
        : "+r" (pixels)
        : "r" (block), "r" ((x86_reg)line_size), "r" ((x86_reg)line_size*3)
    );
}

static inline void diff_pixels_mmx(DCTELEM *block, const uint8_t *s1, const uint8_t *s2, int stride)
{
    __asm__ volatile(
        "pxor %%mm7, %%mm7              \n\t"
        "mov $-128, %%"REG_a"           \n\t"
        ASMALIGN(4)
        "1:                             \n\t"
        "movq (%0), %%mm0               \n\t"
        "movq (%1), %%mm2               \n\t"
        "movq %%mm0, %%mm1              \n\t"
        "movq %%mm2, %%mm3              \n\t"
        "punpcklbw %%mm7, %%mm0         \n\t"
        "punpckhbw %%mm7, %%mm1         \n\t"
        "punpcklbw %%mm7, %%mm2         \n\t"
        "punpckhbw %%mm7, %%mm3         \n\t"
        "psubw %%mm2, %%mm0             \n\t"
        "psubw %%mm3, %%mm1             \n\t"
        "movq %%mm0, (%2, %%"REG_a")    \n\t"
        "movq %%mm1, 8(%2, %%"REG_a")   \n\t"
        "add %3, %0                     \n\t"
        "add %3, %1                     \n\t"
        "add $16, %%"REG_a"             \n\t"
        "jnz 1b                         \n\t"
        : "+r" (s1), "+r" (s2)
        : "r" (block+64), "r" ((x86_reg)stride)
        : "%"REG_a
    );
}
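
/* Illustrative scalar sketch (not compiled, name made up here) of
 * diff_pixels_mmx() above: an 8x8 block of byte differences s1 - s2,
 * widened to 16 bits. */
#if 0
static void diff_pixels_c_sketch(DCTELEM *block, const uint8_t *s1, const uint8_t *s2, int stride)
{
    int i, j;
    for (i = 0; i < 8; i++) {
        for (j = 0; j < 8; j++)
            block[8*i + j] = s1[j] - s2[j];
        s1 += stride;
        s2 += stride;
    }
}
#endif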

static int pix_sum16_mmx(uint8_t * pix, int line_size){
    const int h=16;
    int sum;
    x86_reg index= -line_size*h;

    __asm__ volatile(
                "pxor %%mm7, %%mm7              \n\t"
                "pxor %%mm6, %%mm6              \n\t"
                "1:                             \n\t"
                "movq (%2, %1), %%mm0           \n\t"
                "movq (%2, %1), %%mm1           \n\t"
                "movq 8(%2, %1), %%mm2          \n\t"
                "movq 8(%2, %1), %%mm3          \n\t"
                "punpcklbw %%mm7, %%mm0         \n\t"
                "punpckhbw %%mm7, %%mm1         \n\t"
                "punpcklbw %%mm7, %%mm2         \n\t"
                "punpckhbw %%mm7, %%mm3         \n\t"
                "paddw %%mm0, %%mm1             \n\t"
                "paddw %%mm2, %%mm3             \n\t"
                "paddw %%mm1, %%mm3             \n\t"
                "paddw %%mm3, %%mm6             \n\t"
                "add %3, %1                     \n\t"
                " js 1b                         \n\t"
                "movq %%mm6, %%mm5              \n\t"
                "psrlq $32, %%mm6               \n\t"
                "paddw %%mm5, %%mm6             \n\t"
                "movq %%mm6, %%mm5              \n\t"
                "psrlq $16, %%mm6               \n\t"
                "paddw %%mm5, %%mm6             \n\t"
                "movd %%mm6, %0                 \n\t"
                "andl $0xFFFF, %0               \n\t"
                : "=&r" (sum), "+r" (index)
                : "r" (pix - index), "r" ((x86_reg)line_size)
        );

        return sum;
}
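
/* Illustrative scalar sketch (not compiled, name made up here) of
 * pix_sum16_mmx() above: the sum of all 256 pixels of a 16x16 block.
 * The maximum, 16*16*255 = 65280, still fits in 16 bits, which is why the
 * asm can accumulate in 16-bit lanes and mask the result with 0xFFFF. */
#if 0
static int pix_sum16_c_sketch(uint8_t *pix, int line_size)
{
    int x, y, sum = 0;
    for (y = 0; y < 16; y++) {
        for (x = 0; x < 16; x++)
            sum += pix[x];
        pix += line_size;
    }
    return sum;
}
#endif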

static int pix_norm1_mmx(uint8_t *pix, int line_size) {
    int tmp;
  __asm__ volatile (
      "movl $16,%%ecx\n"
      "pxor %%mm0,%%mm0\n"
      "pxor %%mm7,%%mm7\n"
      "1:\n"
      "movq (%0),%%mm2\n"       /* mm2 = pix[0-7] */
      "movq 8(%0),%%mm3\n"      /* mm3 = pix[8-15] */

      "movq %%mm2,%%mm1\n"      /* mm1 = mm2 = pix[0-7] */

      "punpckhbw %%mm0,%%mm1\n" /* mm1 = [pix4-7] */
      "punpcklbw %%mm0,%%mm2\n" /* mm2 = [pix0-3] */

      "movq %%mm3,%%mm4\n"      /* mm4 = mm3 = pix[8-15] */
      "punpckhbw %%mm0,%%mm3\n" /* mm3 = [pix12-15] */
      "punpcklbw %%mm0,%%mm4\n" /* mm4 = [pix8-11] */

      "pmaddwd %%mm1,%%mm1\n"   /* mm1 = (pix4^2+pix5^2,pix6^2+pix7^2) */
      "pmaddwd %%mm2,%%mm2\n"   /* mm2 = (pix0^2+pix1^2,pix2^2+pix3^2) */

      "pmaddwd %%mm3,%%mm3\n"
      "pmaddwd %%mm4,%%mm4\n"

      "paddd %%mm1,%%mm2\n"     /* mm2 = (pix0^2+pix1^2+pix4^2+pix5^2,
                                          pix2^2+pix3^2+pix6^2+pix7^2) */
      "paddd %%mm3,%%mm4\n"
      "paddd %%mm2,%%mm7\n"

      "add %2, %0\n"
      "paddd %%mm4,%%mm7\n"
      "dec %%ecx\n"
      "jnz 1b\n"

      "movq %%mm7,%%mm1\n"
      "psrlq $32, %%mm7\n"      /* shift hi dword to lo */
      "paddd %%mm7,%%mm1\n"
      "movd %%mm1,%1\n"
      : "+r" (pix), "=r"(tmp) : "r" ((x86_reg)line_size) : "%ecx" );
    return tmp;
}
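
/* Illustrative scalar sketch (not compiled, name made up here) of
 * pix_norm1_mmx() above: the sum of squared pixel values over a 16x16
 * block. */
#if 0
static int pix_norm1_c_sketch(uint8_t *pix, int line_size)
{
    int x, y, sum = 0;
    for (y = 0; y < 16; y++) {
        for (x = 0; x < 16; x++)
            sum += pix[x] * pix[x];
        pix += line_size;
    }
    return sum;
}
#endif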

static int sse8_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    int tmp;
  __asm__ volatile (
      "movl %4,%%ecx\n"
      "shr $1,%%ecx\n"
      "pxor %%mm0,%%mm0\n"      /* mm0 = 0 */
      "pxor %%mm7,%%mm7\n"      /* mm7 holds the sum */
      "1:\n"
      "movq (%0),%%mm1\n"       /* mm1 = pix1[0][0-7] */
      "movq (%1),%%mm2\n"       /* mm2 = pix2[0][0-7] */
      "movq (%0,%3),%%mm3\n"    /* mm3 = pix1[1][0-7] */
      "movq (%1,%3),%%mm4\n"    /* mm4 = pix2[1][0-7] */

      /* todo: mm1-mm2, mm3-mm4 */
      /* algo: subtract mm1 from mm2 with saturation and vice versa */
      /*       OR the results to get absolute difference */
      "movq %%mm1,%%mm5\n"
      "movq %%mm3,%%mm6\n"
      "psubusb %%mm2,%%mm1\n"
      "psubusb %%mm4,%%mm3\n"
      "psubusb %%mm5,%%mm2\n"
      "psubusb %%mm6,%%mm4\n"

      "por %%mm1,%%mm2\n"
      "por %%mm3,%%mm4\n"

      /* now convert to 16-bit vectors so we can square them */
      "movq %%mm2,%%mm1\n"
      "movq %%mm4,%%mm3\n"

      "punpckhbw %%mm0,%%mm2\n"
      "punpckhbw %%mm0,%%mm4\n"
      "punpcklbw %%mm0,%%mm1\n" /* mm1 now spread over (mm1,mm2) */
      "punpcklbw %%mm0,%%mm3\n" /* mm4 now spread over (mm3,mm4) */

      "pmaddwd %%mm2,%%mm2\n"
      "pmaddwd %%mm4,%%mm4\n"
      "pmaddwd %%mm1,%%mm1\n"
      "pmaddwd %%mm3,%%mm3\n"

      "lea (%0,%3,2), %0\n"     /* pix1 += 2*line_size */
      "lea (%1,%3,2), %1\n"     /* pix2 += 2*line_size */

      "paddd %%mm2,%%mm1\n"
      "paddd %%mm4,%%mm3\n"
      "paddd %%mm1,%%mm7\n"
      "paddd %%mm3,%%mm7\n"

      "decl %%ecx\n"
      "jnz 1b\n"

      "movq %%mm7,%%mm1\n"
      "psrlq $32, %%mm7\n"      /* shift hi dword to lo */
      "paddd %%mm7,%%mm1\n"
      "movd %%mm1,%2\n"
      : "+r" (pix1), "+r" (pix2), "=r"(tmp)
      : "r" ((x86_reg)line_size) , "m" (h)
      : "%ecx");
    return tmp;
}

static int sse16_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    int tmp;
  __asm__ volatile (
      "movl %4,%%ecx\n"
      "pxor %%mm0,%%mm0\n"      /* mm0 = 0 */
      "pxor %%mm7,%%mm7\n"      /* mm7 holds the sum */
      "1:\n"
      "movq (%0),%%mm1\n"       /* mm1 = pix1[0-7] */
      "movq (%1),%%mm2\n"       /* mm2 = pix2[0-7] */
      "movq 8(%0),%%mm3\n"      /* mm3 = pix1[8-15] */
      "movq 8(%1),%%mm4\n"      /* mm4 = pix2[8-15] */

      /* todo: mm1-mm2, mm3-mm4 */
      /* algo: subtract mm1 from mm2 with saturation and vice versa */
      /*       OR the results to get absolute difference */
      "movq %%mm1,%%mm5\n"
      "movq %%mm3,%%mm6\n"
      "psubusb %%mm2,%%mm1\n"
      "psubusb %%mm4,%%mm3\n"
      "psubusb %%mm5,%%mm2\n"
      "psubusb %%mm6,%%mm4\n"

      "por %%mm1,%%mm2\n"
      "por %%mm3,%%mm4\n"

      /* now convert to 16-bit vectors so we can square them */
      "movq %%mm2,%%mm1\n"
      "movq %%mm4,%%mm3\n"

      "punpckhbw %%mm0,%%mm2\n"
      "punpckhbw %%mm0,%%mm4\n"
      "punpcklbw %%mm0,%%mm1\n" /* mm1 now spread over (mm1,mm2) */
      "punpcklbw %%mm0,%%mm3\n" /* mm4 now spread over (mm3,mm4) */

      "pmaddwd %%mm2,%%mm2\n"
      "pmaddwd %%mm4,%%mm4\n"
      "pmaddwd %%mm1,%%mm1\n"
      "pmaddwd %%mm3,%%mm3\n"

      "add %3,%0\n"
      "add %3,%1\n"

      "paddd %%mm2,%%mm1\n"
      "paddd %%mm4,%%mm3\n"
      "paddd %%mm1,%%mm7\n"
      "paddd %%mm3,%%mm7\n"

      "decl %%ecx\n"
      "jnz 1b\n"

      "movq %%mm7,%%mm1\n"
      "psrlq $32, %%mm7\n"      /* shift hi dword to lo */
      "paddd %%mm7,%%mm1\n"
      "movd %%mm1,%2\n"
      : "+r" (pix1), "+r" (pix2), "=r"(tmp)
      : "r" ((x86_reg)line_size) , "m" (h)
      : "%ecx");
    return tmp;
}
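
/* Illustrative scalar sketch (not compiled, name made up here) of the
 * sse8/sse16 functions above: the sum of squared differences over a
 * w x h block.  The asm gets |a-b| without a packed absolute-difference
 * instruction by computing both saturated differences a-b and b-a (each
 * clamped at 0) and ORing them, since one of the two is always zero. */
#if 0
static int sse_c_sketch(uint8_t *pix1, uint8_t *pix2, int line_size, int w, int h)
{
    int x, y, sum = 0;
    for (y = 0; y < h; y++) {
        for (x = 0; x < w; x++) {
            int d = pix1[x] - pix2[x];
            sum += d * d;
        }
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}
#endif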

static int sse16_sse2(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    int tmp;
  __asm__ volatile (
      "shr $1,%2\n"
      "pxor %%xmm0,%%xmm0\n"    /* mm0 = 0 */
      "pxor %%xmm7,%%xmm7\n"    /* mm7 holds the sum */
      "1:\n"
      "movdqu (%0),%%xmm1\n"    /* mm1 = pix1[0][0-15] */
      "movdqu (%1),%%xmm2\n"    /* mm2 = pix2[0][0-15] */
      "movdqu (%0,%4),%%xmm3\n" /* mm3 = pix1[1][0-15] */
      "movdqu (%1,%4),%%xmm4\n" /* mm4 = pix2[1][0-15] */

      /* todo: mm1-mm2, mm3-mm4 */
      /* algo: subtract mm1 from mm2 with saturation and vice versa */
      /*       OR the results to get absolute difference */
      "movdqa %%xmm1,%%xmm5\n"
      "movdqa %%xmm3,%%xmm6\n"
      "psubusb %%xmm2,%%xmm1\n"
      "psubusb %%xmm4,%%xmm3\n"
      "psubusb %%xmm5,%%xmm2\n"
      "psubusb %%xmm6,%%xmm4\n"

      "por %%xmm1,%%xmm2\n"
      "por %%xmm3,%%xmm4\n"

      /* now convert to 16-bit vectors so we can square them */
      "movdqa %%xmm2,%%xmm1\n"
      "movdqa %%xmm4,%%xmm3\n"

      "punpckhbw %%xmm0,%%xmm2\n"
      "punpckhbw %%xmm0,%%xmm4\n"
      "punpcklbw %%xmm0,%%xmm1\n"  /* mm1 now spread over (mm1,mm2) */
      "punpcklbw %%xmm0,%%xmm3\n"  /* mm4 now spread over (mm3,mm4) */

      "pmaddwd %%xmm2,%%xmm2\n"
      "pmaddwd %%xmm4,%%xmm4\n"
      "pmaddwd %%xmm1,%%xmm1\n"
      "pmaddwd %%xmm3,%%xmm3\n"

      "lea (%0,%4,2), %0\n"        /* pix1 += 2*line_size */
      "lea (%1,%4,2), %1\n"        /* pix2 += 2*line_size */

      "paddd %%xmm2,%%xmm1\n"
      "paddd %%xmm4,%%xmm3\n"
      "paddd %%xmm1,%%xmm7\n"
      "paddd %%xmm3,%%xmm7\n"

      "decl %2\n"
      "jnz 1b\n"

      "movdqa %%xmm7,%%xmm1\n"
      "psrldq $8, %%xmm7\n"        /* shift hi qword to lo */
      "paddd %%xmm1,%%xmm7\n"
      "movdqa %%xmm7,%%xmm1\n"
      "psrldq $4, %%xmm7\n"        /* shift hi dword to lo */
      "paddd %%xmm1,%%xmm7\n"
      "movd %%xmm7,%3\n"
      : "+r" (pix1), "+r" (pix2), "+r"(h), "=r"(tmp)
      : "r" ((x86_reg)line_size));
    return tmp;
}

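/* Rough high-frequency "noise" measure used by the nsse comparators below:
 * the sum of absolute differences between the horizontal gradients of
 * vertically adjacent rows, over an 8-pixel-wide block (hf_noise16_mmx()
 * below covers 16 pixels and also runs this on the right half). */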
static int hf_noise8_mmx(uint8_t * pix1, int line_size, int h) {
    int tmp;
  __asm__ volatile (
      "movl %3,%%ecx\n"
      "pxor %%mm7,%%mm7\n"
      "pxor %%mm6,%%mm6\n"

      "movq (%0),%%mm0\n"
      "movq %%mm0, %%mm1\n"
      "psllq $8, %%mm0\n"
      "psrlq $8, %%mm1\n"
      "psrlq $8, %%mm0\n"
      "movq %%mm0, %%mm2\n"
      "movq %%mm1, %%mm3\n"
      "punpcklbw %%mm7,%%mm0\n"
      "punpcklbw %%mm7,%%mm1\n"
      "punpckhbw %%mm7,%%mm2\n"
      "punpckhbw %%mm7,%%mm3\n"
      "psubw %%mm1, %%mm0\n"
      "psubw %%mm3, %%mm2\n"

      "add %2,%0\n"

      "movq (%0),%%mm4\n"
      "movq %%mm4, %%mm1\n"
      "psllq $8, %%mm4\n"
      "psrlq $8, %%mm1\n"
      "psrlq $8, %%mm4\n"
      "movq %%mm4, %%mm5\n"
      "movq %%mm1, %%mm3\n"
      "punpcklbw %%mm7,%%mm4\n"
      "punpcklbw %%mm7,%%mm1\n"
      "punpckhbw %%mm7,%%mm5\n"
      "punpckhbw %%mm7,%%mm3\n"
      "psubw %%mm1, %%mm4\n"
      "psubw %%mm3, %%mm5\n"
      "psubw %%mm4, %%mm0\n"
      "psubw %%mm5, %%mm2\n"
      "pxor %%mm3, %%mm3\n"
      "pxor %%mm1, %%mm1\n"
      "pcmpgtw %%mm0, %%mm3\n\t"
      "pcmpgtw %%mm2, %%mm1\n\t"
      "pxor %%mm3, %%mm0\n"
      "pxor %%mm1, %%mm2\n"
      "psubw %%mm3, %%mm0\n"
      "psubw %%mm1, %%mm2\n"
      "paddw %%mm0, %%mm2\n"
      "paddw %%mm2, %%mm6\n"

      "add %2,%0\n"
      "1:\n"

      "movq (%0),%%mm0\n"
      "movq %%mm0, %%mm1\n"
      "psllq $8, %%mm0\n"
      "psrlq $8, %%mm1\n"
      "psrlq $8, %%mm0\n"
      "movq %%mm0, %%mm2\n"
      "movq %%mm1, %%mm3\n"
      "punpcklbw %%mm7,%%mm0\n"
      "punpcklbw %%mm7,%%mm1\n"
      "punpckhbw %%mm7,%%mm2\n"
      "punpckhbw %%mm7,%%mm3\n"
      "psubw %%mm1, %%mm0\n"
      "psubw %%mm3, %%mm2\n"
      "psubw %%mm0, %%mm4\n"
      "psubw %%mm2, %%mm5\n"
      "pxor %%mm3, %%mm3\n"
      "pxor %%mm1, %%mm1\n"
      "pcmpgtw %%mm4, %%mm3\n\t"
      "pcmpgtw %%mm5, %%mm1\n\t"
      "pxor %%mm3, %%mm4\n"
      "pxor %%mm1, %%mm5\n"
      "psubw %%mm3, %%mm4\n"
      "psubw %%mm1, %%mm5\n"
      "paddw %%mm4, %%mm5\n"
      "paddw %%mm5, %%mm6\n"

      "add %2,%0\n"

      "movq (%0),%%mm4\n"
      "movq %%mm4, %%mm1\n"
      "psllq $8, %%mm4\n"
      "psrlq $8, %%mm1\n"
      "psrlq $8, %%mm4\n"
      "movq %%mm4, %%mm5\n"
      "movq %%mm1, %%mm3\n"
      "punpcklbw %%mm7,%%mm4\n"
      "punpcklbw %%mm7,%%mm1\n"
      "punpckhbw %%mm7,%%mm5\n"
      "punpckhbw %%mm7,%%mm3\n"
      "psubw %%mm1, %%mm4\n"
      "psubw %%mm3, %%mm5\n"
      "psubw %%mm4, %%mm0\n"
      "psubw %%mm5, %%mm2\n"
      "pxor %%mm3, %%mm3\n"
      "pxor %%mm1, %%mm1\n"
      "pcmpgtw %%mm0, %%mm3\n\t"
      "pcmpgtw %%mm2, %%mm1\n\t"
      "pxor %%mm3, %%mm0\n"
      "pxor %%mm1, %%mm2\n"
      "psubw %%mm3, %%mm0\n"
      "psubw %%mm1, %%mm2\n"
      "paddw %%mm0, %%mm2\n"
      "paddw %%mm2, %%mm6\n"

      "add %2,%0\n"
      "subl $2, %%ecx\n"
      " jnz 1b\n"

      "movq %%mm6, %%mm0\n"
      "punpcklwd %%mm7,%%mm0\n"
      "punpckhwd %%mm7,%%mm6\n"
      "paddd %%mm0, %%mm6\n"

      "movq %%mm6,%%mm0\n"
      "psrlq $32, %%mm6\n"
      "paddd %%mm6,%%mm0\n"
      "movd %%mm0,%1\n"
      : "+r" (pix1), "=r"(tmp)
      : "r" ((x86_reg)line_size) , "g" (h-2)
      : "%ecx");
      return tmp;
}

static int hf_noise16_mmx(uint8_t * pix1, int line_size, int h) {
    int tmp;
    uint8_t * pix= pix1;
  __asm__ volatile (
      "movl %3,%%ecx\n"
      "pxor %%mm7,%%mm7\n"
      "pxor %%mm6,%%mm6\n"

      "movq (%0),%%mm0\n"
      "movq 1(%0),%%mm1\n"
      "movq %%mm0, %%mm2\n"
      "movq %%mm1, %%mm3\n"
      "punpcklbw %%mm7,%%mm0\n"
      "punpcklbw %%mm7,%%mm1\n"
      "punpckhbw %%mm7,%%mm2\n"
      "punpckhbw %%mm7,%%mm3\n"
      "psubw %%mm1, %%mm0\n"
      "psubw %%mm3, %%mm2\n"

      "add %2,%0\n"

      "movq (%0),%%mm4\n"
      "movq 1(%0),%%mm1\n"
      "movq %%mm4, %%mm5\n"
      "movq %%mm1, %%mm3\n"
      "punpcklbw %%mm7,%%mm4\n"
      "punpcklbw %%mm7,%%mm1\n"
      "punpckhbw %%mm7,%%mm5\n"
      "punpckhbw %%mm7,%%mm3\n"
      "psubw %%mm1, %%mm4\n"
      "psubw %%mm3, %%mm5\n"
      "psubw %%mm4, %%mm0\n"
      "psubw %%mm5, %%mm2\n"
      "pxor %%mm3, %%mm3\n"
      "pxor %%mm1, %%mm1\n"
      "pcmpgtw %%mm0, %%mm3\n\t"
      "pcmpgtw %%mm2, %%mm1\n\t"
      "pxor %%mm3, %%mm0\n"
      "pxor %%mm1, %%mm2\n"
      "psubw %%mm3, %%mm0\n"
      "psubw %%mm1, %%mm2\n"
      "paddw %%mm0, %%mm2\n"
      "paddw %%mm2, %%mm6\n"

      "add %2,%0\n"
      "1:\n"

      "movq (%0),%%mm0\n"
      "movq 1(%0),%%mm1\n"
      "movq %%mm0, %%mm2\n"
      "movq %%mm1, %%mm3\n"
      "punpcklbw %%mm7,%%mm0\n"
      "punpcklbw %%mm7,%%mm1\n"
      "punpckhbw %%mm7,%%mm2\n"
      "punpckhbw %%mm7,%%mm3\n"
      "psubw %%mm1, %%mm0\n"
      "psubw %%mm3, %%mm2\n"
      "psubw %%mm0, %%mm4\n"
      "psubw %%mm2, %%mm5\n"
      "pxor %%mm3, %%mm3\n"
      "pxor %%mm1, %%mm1\n"
      "pcmpgtw %%mm4, %%mm3\n\t"
      "pcmpgtw %%mm5, %%mm1\n\t"
      "pxor %%mm3, %%mm4\n"
      "pxor %%mm1, %%mm5\n"
      "psubw %%mm3, %%mm4\n"
      "psubw %%mm1, %%mm5\n"
      "paddw %%mm4, %%mm5\n"
      "paddw %%mm5, %%mm6\n"

      "add %2,%0\n"

      "movq (%0),%%mm4\n"
      "movq 1(%0),%%mm1\n"
      "movq %%mm4, %%mm5\n"
      "movq %%mm1, %%mm3\n"
      "punpcklbw %%mm7,%%mm4\n"
      "punpcklbw %%mm7,%%mm1\n"
      "punpckhbw %%mm7,%%mm5\n"
      "punpckhbw %%mm7,%%mm3\n"
      "psubw %%mm1, %%mm4\n"
      "psubw %%mm3, %%mm5\n"
      "psubw %%mm4, %%mm0\n"
      "psubw %%mm5, %%mm2\n"
      "pxor %%mm3, %%mm3\n"
      "pxor %%mm1, %%mm1\n"
      "pcmpgtw %%mm0, %%mm3\n\t"
      "pcmpgtw %%mm2, %%mm1\n\t"
      "pxor %%mm3, %%mm0\n"
      "pxor %%mm1, %%mm2\n"
      "psubw %%mm3, %%mm0\n"
      "psubw %%mm1, %%mm2\n"
      "paddw %%mm0, %%mm2\n"
      "paddw %%mm2, %%mm6\n"

      "add %2,%0\n"
      "subl $2, %%ecx\n"
      " jnz 1b\n"

      "movq %%mm6, %%mm0\n"
      "punpcklwd %%mm7,%%mm0\n"
      "punpckhwd %%mm7,%%mm6\n"
      "paddd %%mm0, %%mm6\n"

      "movq %%mm6,%%mm0\n"
      "psrlq $32, %%mm6\n"
      "paddd %%mm6,%%mm0\n"
      "movd %%mm0,%1\n"
      : "+r" (pix1), "=r"(tmp)
      : "r" ((x86_reg)line_size) , "g" (h-2)
      : "%ecx");
      return tmp + hf_noise8_mmx(pix+8, line_size, h);
}

static int nsse16_mmx(void *p, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    MpegEncContext *c = p;
    int score1, score2;

    if(c) score1 = c->dsp.sse[0](c, pix1, pix2, line_size, h);
    else  score1 = sse16_mmx(c, pix1, pix2, line_size, h);
    score2= hf_noise16_mmx(pix1, line_size, h) - hf_noise16_mmx(pix2, line_size, h);

    if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
    else  return score1 + FFABS(score2)*8;
}

static int nsse8_mmx(void *p, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    MpegEncContext *c = p;
    int score1= sse8_mmx(c, pix1, pix2, line_size, h);
    int score2= hf_noise8_mmx(pix1, line_size, h) - hf_noise8_mmx(pix2, line_size, h);

    if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
    else  return score1 + FFABS(score2)*8;
}

static int vsad_intra16_mmx(void *v, uint8_t * pix, uint8_t * dummy, int line_size, int h) {
    int tmp;

    assert( (((int)pix) & 7) == 0);
    assert((line_size &7) ==0);

#define SUM(in0, in1, out0, out1) \
      "movq (%0), %%mm2\n"\
      "movq 8(%0), %%mm3\n"\
      "add %2,%0\n"\
      "movq %%mm2, " #out0 "\n"\
      "movq %%mm3, " #out1 "\n"\
      "psubusb " #in0 ", %%mm2\n"\
      "psubusb " #in1 ", %%mm3\n"\
      "psubusb " #out0 ", " #in0 "\n"\
      "psubusb " #out1 ", " #in1 "\n"\
      "por %%mm2, " #in0 "\n"\
      "por %%mm3, " #in1 "\n"\
      "movq " #in0 ", %%mm2\n"\
      "movq " #in1 ", %%mm3\n"\
      "punpcklbw %%mm7, " #in0 "\n"\
      "punpcklbw %%mm7, " #in1 "\n"\
      "punpckhbw %%mm7, %%mm2\n"\
      "punpckhbw %%mm7, %%mm3\n"\
      "paddw " #in1 ", " #in0 "\n"\
      "paddw %%mm3, %%mm2\n"\
      "paddw %%mm2, " #in0 "\n"\
      "paddw " #in0 ", %%mm6\n"


  __asm__ volatile (
      "movl %3,%%ecx\n"
      "pxor %%mm6,%%mm6\n"
      "pxor %%mm7,%%mm7\n"
      "movq (%0),%%mm0\n"
      "movq 8(%0),%%mm1\n"
      "add %2,%0\n"
      "jmp 2f\n"
      "1:\n"

      SUM(%%mm4, %%mm5, %%mm0, %%mm1)
      "2:\n"
      SUM(%%mm0, %%mm1, %%mm4, %%mm5)

      "subl $2, %%ecx\n"
      "jnz 1b\n"

      "movq %%mm6,%%mm0\n"
      "psrlq $32, %%mm6\n"
      "paddw %%mm6,%%mm0\n"
      "movq %%mm0,%%mm6\n"
      "psrlq $16, %%mm0\n"
      "paddw %%mm6,%%mm0\n"
      "movd %%mm0,%1\n"
      : "+r" (pix), "=r"(tmp)
      : "r" ((x86_reg)line_size) , "m" (h)
      : "%ecx");
    return tmp & 0xFFFF;
}
#undef SUM
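
/* Illustrative scalar sketch (not compiled, name made up here) of
 * vsad_intra16_mmx() above and vsad_intra16_mmx2() below: the sum of
 * absolute differences between vertically adjacent rows of a 16-pixel-wide
 * block.  The MMX version truncates the result to 16 bits. */
#if 0
static int vsad_intra16_c_sketch(uint8_t *pix, int line_size, int h)
{
    int x, y, sum = 0;
    for (y = 1; y < h; y++) {
        for (x = 0; x < 16; x++)
            sum += FFABS(pix[y*line_size + x] - pix[(y-1)*line_size + x]);
    }
    return sum;
}
#endif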

static int vsad_intra16_mmx2(void *v, uint8_t * pix, uint8_t * dummy, int line_size, int h) {
    int tmp;

    assert( (((int)pix) & 7) == 0);
    assert((line_size &7) ==0);

#define SUM(in0, in1, out0, out1) \
      "movq (%0), " #out0 "\n"\
      "movq 8(%0), " #out1 "\n"\
      "add %2,%0\n"\
      "psadbw " #out0 ", " #in0 "\n"\
      "psadbw " #out1 ", " #in1 "\n"\
      "paddw " #in1 ", " #in0 "\n"\
      "paddw " #in0 ", %%mm6\n"

  __asm__ volatile (
      "movl %3,%%ecx\n"
      "pxor %%mm6,%%mm6\n"
      "pxor %%mm7,%%mm7\n"
      "movq (%0),%%mm0\n"
      "movq 8(%0),%%mm1\n"
      "add %2,%0\n"
      "jmp 2f\n"
      "1:\n"

      SUM(%%mm4, %%mm5, %%mm0, %%mm1)
      "2:\n"
      SUM(%%mm0, %%mm1, %%mm4, %%mm5)

      "subl $2, %%ecx\n"
      "jnz 1b\n"

      "movd %%mm6,%1\n"
      : "+r" (pix), "=r"(tmp)
      : "r" ((x86_reg)line_size) , "m" (h)
      : "%ecx");
    return tmp;
}
#undef SUM

static int vsad16_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    int tmp;

    assert( (((int)pix1) & 7) == 0);
    assert( (((int)pix2) & 7) == 0);
    assert((line_size &7) ==0);

#define SUM(in0, in1, out0, out1) \
      "movq (%0),%%mm2\n"\
      "movq (%1)," #out0 "\n"\
      "movq 8(%0),%%mm3\n"\
      "movq 8(%1)," #out1 "\n"\
      "add %3,%0\n"\
      "add %3,%1\n"\
      "psubb " #out0 ", %%mm2\n"\
      "psubb " #out1 ", %%mm3\n"\
      "pxor %%mm7, %%mm2\n"\
      "pxor %%mm7, %%mm3\n"\
      "movq %%mm2, " #out0 "\n"\
      "movq %%mm3, " #out1 "\n"\
      "psubusb " #in0 ", %%mm2\n"\
      "psubusb " #in1 ", %%mm3\n"\
      "psubusb " #out0 ", " #in0 "\n"\
      "psubusb " #out1 ", " #in1 "\n"\
      "por %%mm2, " #in0 "\n"\
      "por %%mm3, " #in1 "\n"\
      "movq " #in0 ", %%mm2\n"\
      "movq " #in1 ", %%mm3\n"\
      "punpcklbw %%mm7, " #in0 "\n"\
      "punpcklbw %%mm7, " #in1 "\n"\
      "punpckhbw %%mm7, %%mm2\n"\
      "punpckhbw %%mm7, %%mm3\n"\
      "paddw " #in1 ", " #in0 "\n"\
      "paddw %%mm3, %%mm2\n"\
      "paddw %%mm2, " #in0 "\n"\
      "paddw " #in0 ", %%mm6\n"


  __asm__ volatile (
      "movl %4,%%ecx\n"
      "pxor %%mm6,%%mm6\n"
      "pcmpeqw %%mm7,%%mm7\n"
      "psllw $15, %%mm7\n"
      "packsswb %%mm7, %%mm7\n"
      "movq (%0),%%mm0\n"
      "movq (%1),%%mm2\n"
      "movq 8(%0),%%mm1\n"
      "movq 8(%1),%%mm3\n"
      "add %3,%0\n"
      "add %3,%1\n"
      "psubb %%mm2, %%mm0\n"
      "psubb %%mm3, %%mm1\n"
      "pxor %%mm7, %%mm0\n"
      "pxor %%mm7, %%mm1\n"
      "jmp 2f\n"
      "1:\n"

      SUM(%%mm4, %%mm5, %%mm0, %%mm1)
      "2:\n"
      SUM(%%mm0, %%mm1, %%mm4, %%mm5)

      "subl $2, %%ecx\n"
      "jnz 1b\n"

      "movq %%mm6,%%mm0\n"
      "psrlq $32, %%mm6\n"
      "paddw %%mm6,%%mm0\n"
      "movq %%mm0,%%mm6\n"
      "psrlq $16, %%mm0\n"
      "paddw %%mm6,%%mm0\n"
      "movd %%mm0,%2\n"
      : "+r" (pix1), "+r" (pix2), "=r"(tmp)
      : "r" ((x86_reg)line_size) , "m" (h)
      : "%ecx");
    return tmp & 0x7FFF;
}
#undef SUM

static int vsad16_mmx2(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    int tmp;

    assert( (((int)pix1) & 7) == 0);
    assert( (((int)pix2) & 7) == 0);
    assert((line_size &7) ==0);

#define SUM(in0, in1, out0, out1) \
      "movq (%0)," #out0 "\n"\
      "movq (%1),%%mm2\n"\
      "movq 8(%0)," #out1 "\n"\
      "movq 8(%1),%%mm3\n"\
      "add %3,%0\n"\
      "add %3,%1\n"\
      "psubb %%mm2, " #out0 "\n"\
      "psubb %%mm3, " #out1 "\n"\
      "pxor %%mm7, " #out0 "\n"\
      "pxor %%mm7, " #out1 "\n"\
      "psadbw " #out0 ", " #in0 "\n"\
      "psadbw " #out1 ", " #in1 "\n"\
      "paddw " #in1 ", " #in0 "\n"\
      "paddw " #in0 ", %%mm6\n"

  __asm__ volatile (
      "movl %4,%%ecx\n"
      "pxor %%mm6,%%mm6\n"
      "pcmpeqw %%mm7,%%mm7\n"
      "psllw $15, %%mm7\n"
      "packsswb %%mm7, %%mm7\n"
      "movq (%0),%%mm0\n"
      "movq (%1),%%mm2\n"
      "movq 8(%0),%%mm1\n"
      "movq 8(%1),%%mm3\n"
      "add %3,%0\n"
      "add %3,%1\n"
      "psubb %%mm2, %%mm0\n"
      "psubb %%mm3, %%mm1\n"
      "pxor %%mm7, %%mm0\n"
      "pxor %%mm7, %%mm1\n"
      "jmp 2f\n"
      "1:\n"

      SUM(%%mm4, %%mm5, %%mm0, %%mm1)
      "2:\n"
      SUM(%%mm0, %%mm1, %%mm4, %%mm5)

      "subl $2, %%ecx\n"
      "jnz 1b\n"

      "movd %%mm6,%2\n"
      : "+r" (pix1), "+r" (pix2), "=r"(tmp)
      : "r" ((x86_reg)line_size) , "m" (h)
      : "%ecx");
    return tmp;
}
#undef SUM

static void diff_bytes_mmx(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
    x86_reg i=0;
    __asm__ volatile(
        "1:                             \n\t"
        "movq  (%2, %0), %%mm0          \n\t"
        "movq  (%1, %0), %%mm1          \n\t"
        "psubb %%mm0, %%mm1             \n\t"
        "movq %%mm1, (%3, %0)           \n\t"
        "movq 8(%2, %0), %%mm0          \n\t"
        "movq 8(%1, %0), %%mm1          \n\t"
        "psubb %%mm0, %%mm1             \n\t"
        "movq %%mm1, 8(%3, %0)          \n\t"
        "add $16, %0                    \n\t"
        "cmp %4, %0                     \n\t"
        " jb 1b                         \n\t"
        : "+r" (i)
        : "r"(src1), "r"(src2), "r"(dst), "r"((x86_reg)w-15)
    );
    for(; i<w; i++)
        dst[i+0] = src1[i+0]-src2[i+0];
}

static void sub_hfyu_median_prediction_mmx2(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top){
    x86_reg i=0;
    uint8_t l, lt;

    __asm__ volatile(
        "1:                             \n\t"
        "movq  -1(%1, %0), %%mm0        \n\t" // LT
        "movq  (%1, %0), %%mm1          \n\t" // T
        "movq  -1(%2, %0), %%mm2        \n\t" // L
        "movq  (%2, %0), %%mm3          \n\t" // X
        "movq %%mm2, %%mm4              \n\t" // L
        "psubb %%mm0, %%mm2             \n\t"
        "paddb %%mm1, %%mm2             \n\t" // L + T - LT
        "movq %%mm4, %%mm5              \n\t" // L
        "pmaxub %%mm1, %%mm4            \n\t" // max(T, L)
        "pminub %%mm5, %%mm1            \n\t" // min(T, L)
        "pminub %%mm2, %%mm4            \n\t"
        "pmaxub %%mm1, %%mm4            \n\t"
        "psubb %%mm4, %%mm3             \n\t" // dst - pred
        "movq %%mm3, (%3, %0)           \n\t"
        "add $8, %0                     \n\t"
        "cmp %4, %0                     \n\t"
        " jb 1b                         \n\t"
        : "+r" (i)
        : "r"(src1), "r"(src2), "r"(dst), "r"((x86_reg)w)
    );

    l= *left;
    lt= *left_top;

    dst[0]= src2[0] - mid_pred(l, src1[0], (l + src1[0] - lt)&0xFF);

    *left_top= src1[w-1];
    *left    = src2[w-1];
}
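
/* Illustrative scalar sketch (not compiled, names made up here) of the
 * median predictor that sub_hfyu_median_prediction_mmx2() above subtracts:
 * for each byte of the current row (src2), the prediction is the median of
 * the left neighbour L, the top neighbour T (from src1) and the gradient
 * L + T - LT, as in the mid_pred() call handling dst[0]. */
#if 0
static void sub_hfyu_median_prediction_c_sketch(uint8_t *dst, const uint8_t *top, const uint8_t *cur, int w)
{
    int i;
    for (i = 1; i < w; i++) {
        const int l  = cur[i-1];
        const int t  = top[i];
        const int lt = top[i-1];
        dst[i] = cur[i] - mid_pred(l, t, (l + t - lt) & 0xFF);
    }
}
#endif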

#define DIFF_PIXELS_1(m,a,t,p1,p2)\
    "mov"#m" "#p1", "#a"              \n\t"\
    "mov"#m" "#p2", "#t"              \n\t"\
    "punpcklbw "#a", "#t"             \n\t"\
    "punpcklbw "#a", "#a"             \n\t"\
    "psubw     "#t", "#a"             \n\t"\

#define DIFF_PIXELS_8(m0,m1,mm,p1,p2,stride,temp) {\
    uint8_t *p1b=p1, *p2b=p2;\
    __asm__ volatile(\
        DIFF_PIXELS_1(m0, mm##0, mm##7, (%1), (%2))\
        DIFF_PIXELS_1(m0, mm##1, mm##7, (%1,%3), (%2,%3))\
        DIFF_PIXELS_1(m0, mm##2, mm##7, (%1,%3,2), (%2,%3,2))\
        "add %4, %1                   \n\t"\
        "add %4, %2                   \n\t"\
        DIFF_PIXELS_1(m0, mm##3, mm##7, (%1), (%2))\
        DIFF_PIXELS_1(m0, mm##4, mm##7, (%1,%3), (%2,%3))\
        DIFF_PIXELS_1(m0, mm##5, mm##7, (%1,%3,2), (%2,%3,2))\
        DIFF_PIXELS_1(m0, mm##6, mm##7, (%1,%4), (%2,%4))\
        "mov"#m1" "#mm"0, %0          \n\t"\
        DIFF_PIXELS_1(m0, mm##7, mm##0, (%1,%3,4), (%2,%3,4))\
        "mov"#m1" %0, "#mm"0          \n\t"\
        : "+m"(temp), "+r"(p1b), "+r"(p2b)\
        : "r"((x86_reg)stride), "r"((x86_reg)stride*3)\
    );\
}
    //the "+m"(temp) is needed as gcc 2.95 sometimes fails to compile "=m"(temp)

#define DIFF_PIXELS_4x8(p1,p2,stride,temp) DIFF_PIXELS_8(d, q,   %%mm,  p1, p2, stride, temp)
#define DIFF_PIXELS_8x8(p1,p2,stride,temp) DIFF_PIXELS_8(q, dqa, %%xmm, p1, p2, stride, temp)

#define LBUTTERFLY2(a1,b1,a2,b2)\
    "paddw " #b1 ", " #a1 "           \n\t"\
    "paddw " #b2 ", " #a2 "           \n\t"\
    "paddw " #b1 ", " #b1 "           \n\t"\
    "paddw " #b2 ", " #b2 "           \n\t"\
    "psubw " #a1 ", " #b1 "           \n\t"\
    "psubw " #a2 ", " #b2 "           \n\t"

#define HADAMARD8(m0, m1, m2, m3, m4, m5, m6, m7)\
        LBUTTERFLY2(m0, m1, m2, m3)\
        LBUTTERFLY2(m4, m5, m6, m7)\
        LBUTTERFLY2(m0, m2, m1, m3)\
        LBUTTERFLY2(m4, m6, m5, m7)\
        LBUTTERFLY2(m0, m4, m1, m5)\
        LBUTTERFLY2(m2, m6, m3, m7)\

#define HADAMARD48 HADAMARD8(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm6, %%mm7)

#define MMABS_MMX(a,z)\
    "pxor " #z ", " #z "              \n\t"\
    "pcmpgtw " #a ", " #z "           \n\t"\
    "pxor " #z ", " #a "              \n\t"\
    "psubw " #z ", " #a "             \n\t"

#define MMABS_MMX2(a,z)\
    "pxor " #z ", " #z "              \n\t"\
    "psubw " #a ", " #z "             \n\t"\
    "pmaxsw " #z ", " #a "            \n\t"

#define MMABS_SSSE3(a,z)\
    "pabsw " #a ", " #a "             \n\t"

#define MMABS_SUM(a,z, sum)\
    MMABS(a,z)\
    "paddusw " #a ", " #sum "         \n\t"

#define MMABS_SUM_8x8_NOSPILL\
    MMABS(%%xmm0, %%xmm8)\
    MMABS(%%xmm1, %%xmm9)\
    MMABS_SUM(%%xmm2, %%xmm8, %%xmm0)\
    MMABS_SUM(%%xmm3, %%xmm9, %%xmm1)\
    MMABS_SUM(%%xmm4, %%xmm8, %%xmm0)\
    MMABS_SUM(%%xmm5, %%xmm9, %%xmm1)\
    MMABS_SUM(%%xmm6, %%xmm8, %%xmm0)\
    MMABS_SUM(%%xmm7, %%xmm9, %%xmm1)\
    "paddusw %%xmm1, %%xmm0           \n\t"

#if ARCH_X86_64
#define MMABS_SUM_8x8_SSE2 MMABS_SUM_8x8_NOSPILL
#else
#define MMABS_SUM_8x8_SSE2\
    "movdqa %%xmm7, (%1)              \n\t"\
    MMABS(%%xmm0, %%xmm7)\
    MMABS(%%xmm1, %%xmm7)\
    MMABS_SUM(%%xmm2, %%xmm7, %%xmm0)\
    MMABS_SUM(%%xmm3, %%xmm7, %%xmm1)\
    MMABS_SUM(%%xmm4, %%xmm7, %%xmm0)\
    MMABS_SUM(%%xmm5, %%xmm7, %%xmm1)\
    MMABS_SUM(%%xmm6, %%xmm7, %%xmm0)\
    "movdqa (%1), %%xmm2              \n\t"\
    MMABS_SUM(%%xmm2, %%xmm7, %%xmm1)\
    "paddusw %%xmm1, %%xmm0           \n\t"
#endif

/* FIXME: HSUM_* saturates at 64k, while an 8x8 hadamard or dct block can get up to
 * about 100k on extreme inputs. But that's very unlikely to occur in natural video,
 * and it's even more unlikely to not have any alternative mvs/modes with lower cost. */
#define HSUM_MMX(a, t, dst)\
    "movq "#a", "#t"                  \n\t"\
    "psrlq $32, "#a"                  \n\t"\
    "paddusw "#t", "#a"               \n\t"\
    "movq "#a", "#t"                  \n\t"\
    "psrlq $16, "#a"                  \n\t"\
    "paddusw "#t", "#a"               \n\t"\
    "movd "#a", "#dst"                \n\t"\

#define HSUM_MMX2(a, t, dst)\
    "pshufw $0x0E, "#a", "#t"         \n\t"\
    "paddusw "#t", "#a"               \n\t"\
    "pshufw $0x01, "#a", "#t"         \n\t"\
    "paddusw "#t", "#a"               \n\t"\
    "movd "#a", "#dst"                \n\t"\

#define HSUM_SSE2(a, t, dst)\
    "movhlps "#a", "#t"               \n\t"\
    "paddusw "#t", "#a"               \n\t"\
    "pshuflw $0x0E, "#a", "#t"        \n\t"\
    "paddusw "#t", "#a"               \n\t"\
    "pshuflw $0x01, "#a", "#t"        \n\t"\
    "paddusw "#t", "#a"               \n\t"\
    "movd "#a", "#dst"                \n\t"\

#define HADAMARD8_DIFF_MMX(cpu) \
static int hadamard8_diff_##cpu(void *s, uint8_t *src1, uint8_t *src2, int stride, int h){\
    DECLARE_ALIGNED_8(uint64_t, temp[13]);\
    int sum;\
\
    assert(h==8);\
\
    DIFF_PIXELS_4x8(src1, src2, stride, temp[0]);\
\
    __asm__ volatile(\
        HADAMARD48\
\
        "movq %%mm7, 96(%1)             \n\t"\
\
        TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)\
        STORE4(8,  0(%1), %%mm0, %%mm3, %%mm7, %%mm2)\
\
        "movq 96(%1), %%mm7             \n\t"\
        TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)\
        STORE4(8, 64(%1), %%mm4, %%mm7, %%mm0, %%mm6)\
\
        : "=r" (sum)\
        : "r"(temp)\
    );\
\
    DIFF_PIXELS_4x8(src1+4, src2+4, stride, temp[4]);\
\
    __asm__ volatile(\
        HADAMARD48\
\
        "movq %%mm7, 96(%1)             \n\t"\
\
        TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)\
        STORE4(8, 32(%1), %%mm0, %%mm3, %%mm7, %%mm2)\
\
        "movq 96(%1), %%mm7             \n\t"\
        TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)\
        "movq %%mm7, %%mm5              \n\t"/*FIXME remove*/\
        "movq %%mm6, %%mm7              \n\t"\
        "movq %%mm0, %%mm6              \n\t"\
\
        LOAD4(8, 64(%1), %%mm0, %%mm1, %%mm2, %%mm3)\
\
        HADAMARD48\
        "movq %%mm7, 64(%1)             \n\t"\
        MMABS(%%mm0, %%mm7)\
        MMABS(%%mm1, %%mm7)\
        MMABS_SUM(%%mm2, %%mm7, %%mm0)\
        MMABS_SUM(%%mm3, %%mm7, %%mm1)\
        MMABS_SUM(%%mm4, %%mm7, %%mm0)\
        MMABS_SUM(%%mm5, %%mm7, %%mm1)\
        MMABS_SUM(%%mm6, %%mm7, %%mm0)\
        "movq 64(%1), %%mm2             \n\t"\
        MMABS_SUM(%%mm2, %%mm7, %%mm1)\
        "paddusw %%mm1, %%mm0           \n\t"\
        "movq %%mm0, 64(%1)             \n\t"\
\
        LOAD4(8,  0(%1), %%mm0, %%mm1, %%mm2, %%mm3)\
        LOAD4(8, 32(%1), %%mm4, %%mm5, %%mm6, %%mm7)\
\
        HADAMARD48\
        "movq %%mm7, (%1)               \n\t"\
        MMABS(%%mm0, %%mm7)\
        MMABS(%%mm1, %%mm7)\
        MMABS_SUM(%%mm2, %%mm7, %%mm0)\
        MMABS_SUM(%%mm3, %%mm7, %%mm1)\
        MMABS_SUM(%%mm4, %%mm7, %%mm0)\
        MMABS_SUM(%%mm5, %%mm7, %%mm1)\
        MMABS_SUM(%%mm6, %%mm7, %%mm0)\
        "movq (%1), %%mm2               \n\t"\
        MMABS_SUM(%%mm2, %%mm7, %%mm1)\
        "paddusw 64(%1), %%mm0          \n\t"\
        "paddusw %%mm1, %%mm0           \n\t"\
\
        HSUM(%%mm0, %%mm1, %0)\
\
        : "=r" (sum)\
        : "r"(temp)\
    );\
    return sum&0xFFFF;\
}\
WRAPPER8_16_SQ(hadamard8_diff_##cpu, hadamard8_diff16_##cpu)

#define HADAMARD8_DIFF_SSE2(cpu) \
static int hadamard8_diff_##cpu(void *s, uint8_t *src1, uint8_t *src2, int stride, int h){\
    DECLARE_ALIGNED_16(uint64_t, temp[4]);\
    int sum;\
\
    assert(h==8);\
\
    DIFF_PIXELS_8x8(src1, src2, stride, temp[0]);\
\
    __asm__ volatile(\
        HADAMARD8(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm6, %%xmm7)\
        TRANSPOSE8(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm6, %%xmm7, (%1))\
        HADAMARD8(%%xmm0, %%xmm5, %%xmm7, %%xmm3, %%xmm6, %%xmm4, %%xmm2, %%xmm1)\
        MMABS_SUM_8x8\
        HSUM_SSE2(%%xmm0, %%xmm1, %0)\
        : "=r" (sum)\
        : "r"(temp)\
    );\
    return sum&0xFFFF;\
}\
WRAPPER8_16_SQ(hadamard8_diff_##cpu, hadamard8_diff16_##cpu)

#define MMABS(a,z)         MMABS_MMX(a,z)
#define HSUM(a,t,dst)      HSUM_MMX(a,t,dst)
HADAMARD8_DIFF_MMX(mmx)
#undef MMABS
#undef HSUM

#define MMABS(a,z)         MMABS_MMX2(a,z)
#define MMABS_SUM_8x8      MMABS_SUM_8x8_SSE2
#define HSUM(a,t,dst)      HSUM_MMX2(a,t,dst)
HADAMARD8_DIFF_MMX(mmx2)
HADAMARD8_DIFF_SSE2(sse2)
#undef MMABS
#undef MMABS_SUM_8x8
#undef HSUM

#if HAVE_SSSE3
#define MMABS(a,z)         MMABS_SSSE3(a,z)
#define MMABS_SUM_8x8      MMABS_SUM_8x8_NOSPILL
HADAMARD8_DIFF_SSE2(ssse3)
#undef MMABS
#undef MMABS_SUM_8x8
#endif

#define DCT_SAD4(m,mm,o)\
    "mov"#m" "#o"+ 0(%1), "#mm"2      \n\t"\
    "mov"#m" "#o"+16(%1), "#mm"3      \n\t"\
    "mov"#m" "#o"+32(%1), "#mm"4      \n\t"\
    "mov"#m" "#o"+48(%1), "#mm"5      \n\t"\
    MMABS_SUM(mm##2, mm##6, mm##0)\
    MMABS_SUM(mm##3, mm##7, mm##1)\
    MMABS_SUM(mm##4, mm##6, mm##0)\
    MMABS_SUM(mm##5, mm##7, mm##1)\

#define DCT_SAD_MMX\
    "pxor %%mm0, %%mm0                \n\t"\
    "pxor %%mm1, %%mm1                \n\t"\
    DCT_SAD4(q, %%mm, 0)\
    DCT_SAD4(q, %%mm, 8)\
    DCT_SAD4(q, %%mm, 64)\
    DCT_SAD4(q, %%mm, 72)\
    "paddusw %%mm1, %%mm0             \n\t"\
    HSUM(%%mm0, %%mm1, %0)

#define DCT_SAD_SSE2\
    "pxor %%xmm0, %%xmm0              \n\t"\
    "pxor %%xmm1, %%xmm1              \n\t"\
    DCT_SAD4(dqa, %%xmm, 0)\
    DCT_SAD4(dqa, %%xmm, 64)\
    "paddusw %%xmm1, %%xmm0           \n\t"\
    HSUM(%%xmm0, %%xmm1, %0)

#define DCT_SAD_FUNC(cpu) \
static int sum_abs_dctelem_##cpu(DCTELEM *block){\
    int sum;\
    __asm__ volatile(\
        DCT_SAD\
        :"=r"(sum)\
        :"r"(block)\
    );\
    return sum&0xFFFF;\
}
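
/* Illustrative scalar sketch (not compiled, name made up here) of the
 * sum_abs_dctelem_*() functions generated above: the sum of absolute values
 * of all 64 coefficients, truncated to 16 bits (the asm additionally
 * saturates the per-lane accumulation; see the FIXME above HSUM_MMX). */
#if 0
static int sum_abs_dctelem_c_sketch(DCTELEM *block)
{
    int i, sum = 0;
    for (i = 0; i < 64; i++)
        sum += FFABS(block[i]);
    return sum & 0xFFFF;
}
#endif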

#define DCT_SAD       DCT_SAD_MMX
#define HSUM(a,t,dst) HSUM_MMX(a,t,dst)
#define MMABS(a,z)    MMABS_MMX(a,z)
DCT_SAD_FUNC(mmx)
#undef MMABS
#undef HSUM

#define HSUM(a,t,dst) HSUM_MMX2(a,t,dst)
#define MMABS(a,z)    MMABS_MMX2(a,z)
DCT_SAD_FUNC(mmx2)
#undef HSUM
#undef DCT_SAD

#define DCT_SAD       DCT_SAD_SSE2
#define HSUM(a,t,dst) HSUM_SSE2(a,t,dst)
DCT_SAD_FUNC(sse2)
#undef MMABS

#if HAVE_SSSE3
#define MMABS(a,z)    MMABS_SSSE3(a,z)
DCT_SAD_FUNC(ssse3)
#undef MMABS
#endif
#undef HSUM
#undef DCT_SAD

static int ssd_int8_vs_int16_mmx(const int8_t *pix1, const int16_t *pix2, int size){
    int sum;
    x86_reg i=size;
    __asm__ volatile(
        "pxor %%mm4, %%mm4 \n"
        "1: \n"
        "sub $8, %0 \n"
        "movq (%2,%0), %%mm2 \n"
        "movq (%3,%0,2), %%mm0 \n"
        "movq 8(%3,%0,2), %%mm1 \n"
        "punpckhbw %%mm2, %%mm3 \n"
        "punpcklbw %%mm2, %%mm2 \n"
        "psraw $8, %%mm3 \n"
        "psraw $8, %%mm2 \n"
        "psubw %%mm3, %%mm1 \n"
        "psubw %%mm2, %%mm0 \n"
        "pmaddwd %%mm1, %%mm1 \n"
        "pmaddwd %%mm0, %%mm0 \n"
        "paddd %%mm1, %%mm4 \n"
        "paddd %%mm0, %%mm4 \n"
        "jg 1b \n"
        "movq %%mm4, %%mm3 \n"
        "psrlq $32, %%mm3 \n"
        "paddd %%mm3, %%mm4 \n"
        "movd %%mm4, %1 \n"
        :"+r"(i), "=r"(sum)
        :"r"(pix1), "r"(pix2)
    );
    return sum;
}
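
/* Illustrative scalar sketch (not compiled, name made up here) of
 * ssd_int8_vs_int16_mmx() above: the sum of squared differences between an
 * int8 and an int16 array; size is assumed to be a multiple of 8, as in the
 * asm. */
#if 0
static int ssd_int8_vs_int16_c_sketch(const int8_t *pix1, const int16_t *pix2, int size)
{
    int i, sum = 0;
    for (i = 0; i < size; i++) {
        int d = pix1[i] - pix2[i];
        sum += d * d;
    }
    return sum;
}
#endif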

#define PHADDD(a, t)\
    "movq "#a", "#t"                  \n\t"\
    "psrlq $32, "#a"                  \n\t"\
    "paddd "#t", "#a"                 \n\t"
/*
   pmulhw: dst[0-15]=(src[0-15]*dst[0-15])[16-31]
   pmulhrw: dst[0-15]=(src[0-15]*dst[0-15] + 0x8000)[16-31]
   pmulhrsw: dst[0-15]=(src[0-15]*dst[0-15] + 0x4000)[15-30]
 */
#define PMULHRW(x, y, s, o)\
    "pmulhw " #s ", "#x "            \n\t"\
    "pmulhw " #s ", "#y "            \n\t"\
    "paddw " #o ", "#x "             \n\t"\
    "paddw " #o ", "#y "             \n\t"\
    "psraw $1, "#x "                 \n\t"\
    "psraw $1, "#y "                 \n\t"
#define DEF(x) x ## _mmx
#define SET_RND MOVQ_WONE
#define SCALE_OFFSET 1

#include "dsputil_mmx_qns_template.c"

#undef DEF
#undef SET_RND
#undef SCALE_OFFSET
#undef PMULHRW

#define DEF(x) x ## _3dnow
#define SET_RND(x)
#define SCALE_OFFSET 0
#define PMULHRW(x, y, s, o)\
    "pmulhrw " #s ", "#x "           \n\t"\
    "pmulhrw " #s ", "#y "           \n\t"

#include "dsputil_mmx_qns_template.c"

#undef DEF
#undef SET_RND
#undef SCALE_OFFSET
#undef PMULHRW

#if HAVE_SSSE3
#undef PHADDD
#define DEF(x) x ## _ssse3
#define SET_RND(x)
#define SCALE_OFFSET -1
#define PHADDD(a, t)\
    "pshufw $0x0E, "#a", "#t"         \n\t"\
    "paddd "#t", "#a"                 \n\t" /* faster than phaddd on core2 */
#define PMULHRW(x, y, s, o)\
    "pmulhrsw " #s ", "#x "          \n\t"\
    "pmulhrsw " #s ", "#y "          \n\t"

#include "dsputil_mmx_qns_template.c"

#undef DEF
#undef SET_RND
#undef SCALE_OFFSET
#undef PMULHRW
#undef PHADDD
#endif //HAVE_SSSE3


/* FLAC specific */
void ff_flac_compute_autocorr_sse2(const int32_t *data, int len, int lag,
                                   double *autoc);


void dsputilenc_init_mmx(DSPContext* c, AVCodecContext *avctx)
{
    if (mm_flags & FF_MM_MMX) {
        const int dct_algo = avctx->dct_algo;
        if(dct_algo==FF_DCT_AUTO || dct_algo==FF_DCT_MMX){
            if(mm_flags & FF_MM_SSE2){
                c->fdct = ff_fdct_sse2;
            }else if(mm_flags & FF_MM_MMXEXT){
                c->fdct = ff_fdct_mmx2;
            }else{
                c->fdct = ff_fdct_mmx;
            }
        }

        c->get_pixels = get_pixels_mmx;
        c->diff_pixels = diff_pixels_mmx;
        c->pix_sum = pix_sum16_mmx;

        c->diff_bytes= diff_bytes_mmx;
        c->sum_abs_dctelem= sum_abs_dctelem_mmx;

        c->hadamard8_diff[0]= hadamard8_diff16_mmx;
        c->hadamard8_diff[1]= hadamard8_diff_mmx;

        c->pix_norm1 = pix_norm1_mmx;
        c->sse[0] = (mm_flags & FF_MM_SSE2) ? sse16_sse2 : sse16_mmx;
        c->sse[1] = sse8_mmx;
        c->vsad[4]= vsad_intra16_mmx;

        c->nsse[0] = nsse16_mmx;
        c->nsse[1] = nsse8_mmx;
        if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
            c->vsad[0] = vsad16_mmx;
        }

        if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
            c->try_8x8basis= try_8x8basis_mmx;
        }
        c->add_8x8basis= add_8x8basis_mmx;

        c->ssd_int8_vs_int16 = ssd_int8_vs_int16_mmx;


        if (mm_flags & FF_MM_MMXEXT) {
            c->sum_abs_dctelem= sum_abs_dctelem_mmx2;
            c->hadamard8_diff[0]= hadamard8_diff16_mmx2;
            c->hadamard8_diff[1]= hadamard8_diff_mmx2;
            c->vsad[4]= vsad_intra16_mmx2;

            if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
                c->vsad[0] = vsad16_mmx2;
            }

            c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_mmx2;
        }

        if(mm_flags & FF_MM_SSE2){
            c->get_pixels = get_pixels_sse2;
            c->sum_abs_dctelem= sum_abs_dctelem_sse2;
            c->hadamard8_diff[0]= hadamard8_diff16_sse2;
            c->hadamard8_diff[1]= hadamard8_diff_sse2;
            if (CONFIG_FLAC_ENCODER)
                c->flac_compute_autocorr = ff_flac_compute_autocorr_sse2;
        }

#if HAVE_SSSE3
        if(mm_flags & FF_MM_SSSE3){
            if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
                c->try_8x8basis= try_8x8basis_ssse3;
            }
            c->add_8x8basis= add_8x8basis_ssse3;
            c->sum_abs_dctelem= sum_abs_dctelem_ssse3;
            c->hadamard8_diff[0]= hadamard8_diff16_ssse3;
            c->hadamard8_diff[1]= hadamard8_diff_ssse3;
        }
#endif

        if(mm_flags & FF_MM_3DNOW){
            if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
                c->try_8x8basis= try_8x8basis_3dnow;
            }
            c->add_8x8basis= add_8x8basis_3dnow;
        }
    }

    dsputil_init_pix_mmx(c, avctx);
}