1/*
2 * DSP utils mmx functions are compiled twice for rnd/no_rnd
3 * Copyright (c) 2000, 2001 Fabrice Bellard
4 * Copyright (c) 2003-2004 Michael Niedermayer <michaelni@gmx.at>
5 *
6 * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
7 * mostly rewritten by Michael Niedermayer <michaelni@gmx.at>
8 * and improved by Zdenek Kabelac <kabi@users.sf.net>
9 *
10 * This file is part of FFmpeg.
11 *
12 * FFmpeg is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU Lesser General Public
14 * License as published by the Free Software Foundation; either
15 * version 2.1 of the License, or (at your option) any later version.
16 *
17 * FFmpeg is distributed in the hope that it will be useful,
18 * but WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
20 * Lesser General Public License for more details.
21 *
22 * You should have received a copy of the GNU Lesser General Public
23 * License along with FFmpeg; if not, write to the Free Software
24 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
25 */
26
27// put_pixels
28static void DEF(put, pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
29{
30    MOVQ_BFE(mm6);
31    __asm__ volatile(
32        "lea    (%3, %3), %%"REG_a"     \n\t"
33        ASMALIGN(3)
34        "1:                             \n\t"
35        "movq   (%1), %%mm0             \n\t"
36        "movq   1(%1), %%mm1            \n\t"
37        "movq   (%1, %3), %%mm2         \n\t"
38        "movq   1(%1, %3), %%mm3        \n\t"
39        PAVGBP(%%mm0, %%mm1, %%mm4,   %%mm2, %%mm3, %%mm5)
40        "movq   %%mm4, (%2)             \n\t"
41        "movq   %%mm5, (%2, %3)         \n\t"
42        "add    %%"REG_a", %1           \n\t"
43        "add    %%"REG_a", %2           \n\t"
44        "movq   (%1), %%mm0             \n\t"
45        "movq   1(%1), %%mm1            \n\t"
46        "movq   (%1, %3), %%mm2         \n\t"
47        "movq   1(%1, %3), %%mm3        \n\t"
48        PAVGBP(%%mm0, %%mm1, %%mm4,   %%mm2, %%mm3, %%mm5)
49        "movq   %%mm4, (%2)             \n\t"
50        "movq   %%mm5, (%2, %3)         \n\t"
51        "add    %%"REG_a", %1           \n\t"
52        "add    %%"REG_a", %2           \n\t"
53        "subl   $4, %0                  \n\t"
54        "jnz    1b                      \n\t"
55        :"+g"(h), "+S"(pixels), "+D"(block)
56        :"r"((x86_reg)line_size)
57        :REG_a, "memory");
58}
59
60static void av_unused DEF(put, pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
61{
62    MOVQ_BFE(mm6);
63    __asm__ volatile(
64        "testl $1, %0                   \n\t"
65        " jz 1f                         \n\t"
66        "movq   (%1), %%mm0             \n\t"
67        "movq   (%2), %%mm1             \n\t"
68        "add    %4, %1                  \n\t"
69        "add    $8, %2                  \n\t"
70        PAVGB(%%mm0, %%mm1, %%mm4, %%mm6)
71        "movq   %%mm4, (%3)             \n\t"
72        "add    %5, %3                  \n\t"
73        "decl   %0                      \n\t"
74        ASMALIGN(3)
75        "1:                             \n\t"
76        "movq   (%1), %%mm0             \n\t"
77        "movq   (%2), %%mm1             \n\t"
78        "add    %4, %1                  \n\t"
79        "movq   (%1), %%mm2             \n\t"
80        "movq   8(%2), %%mm3            \n\t"
81        "add    %4, %1                  \n\t"
82        PAVGBP(%%mm0, %%mm1, %%mm4,   %%mm2, %%mm3, %%mm5)
83        "movq   %%mm4, (%3)             \n\t"
84        "add    %5, %3                  \n\t"
85        "movq   %%mm5, (%3)             \n\t"
86        "add    %5, %3                  \n\t"
87        "movq   (%1), %%mm0             \n\t"
88        "movq   16(%2), %%mm1           \n\t"
89        "add    %4, %1                  \n\t"
90        "movq   (%1), %%mm2             \n\t"
91        "movq   24(%2), %%mm3           \n\t"
92        "add    %4, %1                  \n\t"
93        "add    $32, %2                 \n\t"
94        PAVGBP(%%mm0, %%mm1, %%mm4,   %%mm2, %%mm3, %%mm5)
95        "movq   %%mm4, (%3)             \n\t"
96        "add    %5, %3                  \n\t"
97        "movq   %%mm5, (%3)             \n\t"
98        "add    %5, %3                  \n\t"
99        "subl   $4, %0                  \n\t"
100        "jnz    1b                      \n\t"
101#ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used
102        :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
103#else
104        :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
105#endif
106        :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride)
107        :"memory");
108}
109
110static void DEF(put, pixels16_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
111{
112    MOVQ_BFE(mm6);
113    __asm__ volatile(
114        "lea        (%3, %3), %%"REG_a" \n\t"
115        ASMALIGN(3)
116        "1:                             \n\t"
117        "movq   (%1), %%mm0             \n\t"
118        "movq   1(%1), %%mm1            \n\t"
119        "movq   (%1, %3), %%mm2         \n\t"
120        "movq   1(%1, %3), %%mm3        \n\t"
121        PAVGBP(%%mm0, %%mm1, %%mm4,   %%mm2, %%mm3, %%mm5)
122        "movq   %%mm4, (%2)             \n\t"
123        "movq   %%mm5, (%2, %3)         \n\t"
124        "movq   8(%1), %%mm0            \n\t"
125        "movq   9(%1), %%mm1            \n\t"
126        "movq   8(%1, %3), %%mm2        \n\t"
127        "movq   9(%1, %3), %%mm3        \n\t"
128        PAVGBP(%%mm0, %%mm1, %%mm4,   %%mm2, %%mm3, %%mm5)
129        "movq   %%mm4, 8(%2)            \n\t"
130        "movq   %%mm5, 8(%2, %3)        \n\t"
131        "add    %%"REG_a", %1           \n\t"
132        "add    %%"REG_a", %2           \n\t"
133        "movq   (%1), %%mm0             \n\t"
134        "movq   1(%1), %%mm1            \n\t"
135        "movq   (%1, %3), %%mm2         \n\t"
136        "movq   1(%1, %3), %%mm3        \n\t"
137        PAVGBP(%%mm0, %%mm1, %%mm4,   %%mm2, %%mm3, %%mm5)
138        "movq   %%mm4, (%2)             \n\t"
139        "movq   %%mm5, (%2, %3)         \n\t"
140        "movq   8(%1), %%mm0            \n\t"
141        "movq   9(%1), %%mm1            \n\t"
142        "movq   8(%1, %3), %%mm2        \n\t"
143        "movq   9(%1, %3), %%mm3        \n\t"
144        PAVGBP(%%mm0, %%mm1, %%mm4,   %%mm2, %%mm3, %%mm5)
145        "movq   %%mm4, 8(%2)            \n\t"
146        "movq   %%mm5, 8(%2, %3)        \n\t"
147        "add    %%"REG_a", %1           \n\t"
148        "add    %%"REG_a", %2           \n\t"
149        "subl   $4, %0                  \n\t"
150        "jnz    1b                      \n\t"
151        :"+g"(h), "+S"(pixels), "+D"(block)
152        :"r"((x86_reg)line_size)
153        :REG_a, "memory");
154}
155
156static void av_unused DEF(put, pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
157{
158    MOVQ_BFE(mm6);
159    __asm__ volatile(
160        "testl $1, %0                   \n\t"
161        " jz 1f                         \n\t"
162        "movq   (%1), %%mm0             \n\t"
163        "movq   (%2), %%mm1             \n\t"
164        "movq   8(%1), %%mm2            \n\t"
165        "movq   8(%2), %%mm3            \n\t"
166        "add    %4, %1                  \n\t"
167        "add    $16, %2                 \n\t"
168        PAVGBP(%%mm0, %%mm1, %%mm4,   %%mm2, %%mm3, %%mm5)
169        "movq   %%mm4, (%3)             \n\t"
170        "movq   %%mm5, 8(%3)            \n\t"
171        "add    %5, %3                  \n\t"
172        "decl   %0                      \n\t"
173        ASMALIGN(3)
174        "1:                             \n\t"
175        "movq   (%1), %%mm0             \n\t"
176        "movq   (%2), %%mm1             \n\t"
177        "movq   8(%1), %%mm2            \n\t"
178        "movq   8(%2), %%mm3            \n\t"
179        "add    %4, %1                  \n\t"
180        PAVGBP(%%mm0, %%mm1, %%mm4,   %%mm2, %%mm3, %%mm5)
181        "movq   %%mm4, (%3)             \n\t"
182        "movq   %%mm5, 8(%3)            \n\t"
183        "add    %5, %3                  \n\t"
184        "movq   (%1), %%mm0             \n\t"
185        "movq   16(%2), %%mm1           \n\t"
186        "movq   8(%1), %%mm2            \n\t"
187        "movq   24(%2), %%mm3           \n\t"
188        "add    %4, %1                  \n\t"
189        PAVGBP(%%mm0, %%mm1, %%mm4,   %%mm2, %%mm3, %%mm5)
190        "movq   %%mm4, (%3)             \n\t"
191        "movq   %%mm5, 8(%3)            \n\t"
192        "add    %5, %3                  \n\t"
193        "add    $32, %2                 \n\t"
194        "subl   $2, %0                  \n\t"
195        "jnz    1b                      \n\t"
196#ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used
197        :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
198#else
199        :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
200#endif
201        :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride)
202        :"memory");
203}
204
205static void DEF(put, pixels8_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
206{
207    MOVQ_BFE(mm6);
208    __asm__ volatile(
209        "lea (%3, %3), %%"REG_a"        \n\t"
210        "movq (%1), %%mm0               \n\t"
211        ASMALIGN(3)
212        "1:                             \n\t"
213        "movq   (%1, %3), %%mm1         \n\t"
214        "movq   (%1, %%"REG_a"),%%mm2   \n\t"
215        PAVGBP(%%mm1, %%mm0, %%mm4,   %%mm2, %%mm1, %%mm5)
216        "movq   %%mm4, (%2)             \n\t"
217        "movq   %%mm5, (%2, %3)         \n\t"
218        "add    %%"REG_a", %1           \n\t"
219        "add    %%"REG_a", %2           \n\t"
220        "movq   (%1, %3), %%mm1         \n\t"
221        "movq   (%1, %%"REG_a"),%%mm0   \n\t"
222        PAVGBP(%%mm1, %%mm2, %%mm4,   %%mm0, %%mm1, %%mm5)
223        "movq   %%mm4, (%2)             \n\t"
224        "movq   %%mm5, (%2, %3)         \n\t"
225        "add    %%"REG_a", %1           \n\t"
226        "add    %%"REG_a", %2           \n\t"
227        "subl   $4, %0                  \n\t"
228        "jnz    1b                      \n\t"
229        :"+g"(h), "+S"(pixels), "+D"(block)
230        :"r"((x86_reg)line_size)
231        :REG_a, "memory");
232}
233
234static void DEF(put, pixels8_xy2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
235{
236    MOVQ_ZERO(mm7);
237    SET_RND(mm6); // =2 for rnd  and  =1 for no_rnd version
238    __asm__ volatile(
239        "movq   (%1), %%mm0             \n\t"
240        "movq   1(%1), %%mm4            \n\t"
241        "movq   %%mm0, %%mm1            \n\t"
242        "movq   %%mm4, %%mm5            \n\t"
243        "punpcklbw %%mm7, %%mm0         \n\t"
244        "punpcklbw %%mm7, %%mm4         \n\t"
245        "punpckhbw %%mm7, %%mm1         \n\t"
246        "punpckhbw %%mm7, %%mm5         \n\t"
247        "paddusw %%mm0, %%mm4           \n\t"
248        "paddusw %%mm1, %%mm5           \n\t"
249        "xor    %%"REG_a", %%"REG_a"    \n\t"
250        "add    %3, %1                  \n\t"
251        ASMALIGN(3)
252        "1:                             \n\t"
253        "movq   (%1, %%"REG_a"), %%mm0  \n\t"
254        "movq   1(%1, %%"REG_a"), %%mm2 \n\t"
255        "movq   %%mm0, %%mm1            \n\t"
256        "movq   %%mm2, %%mm3            \n\t"
257        "punpcklbw %%mm7, %%mm0         \n\t"
258        "punpcklbw %%mm7, %%mm2         \n\t"
259        "punpckhbw %%mm7, %%mm1         \n\t"
260        "punpckhbw %%mm7, %%mm3         \n\t"
261        "paddusw %%mm2, %%mm0           \n\t"
262        "paddusw %%mm3, %%mm1           \n\t"
263        "paddusw %%mm6, %%mm4           \n\t"
264        "paddusw %%mm6, %%mm5           \n\t"
265        "paddusw %%mm0, %%mm4           \n\t"
266        "paddusw %%mm1, %%mm5           \n\t"
267        "psrlw  $2, %%mm4               \n\t"
268        "psrlw  $2, %%mm5               \n\t"
269        "packuswb  %%mm5, %%mm4         \n\t"
270        "movq   %%mm4, (%2, %%"REG_a")  \n\t"
271        "add    %3, %%"REG_a"           \n\t"
272
273        "movq   (%1, %%"REG_a"), %%mm2  \n\t" // 0 <-> 2   1 <-> 3
274        "movq   1(%1, %%"REG_a"), %%mm4 \n\t"
275        "movq   %%mm2, %%mm3            \n\t"
276        "movq   %%mm4, %%mm5            \n\t"
277        "punpcklbw %%mm7, %%mm2         \n\t"
278        "punpcklbw %%mm7, %%mm4         \n\t"
279        "punpckhbw %%mm7, %%mm3         \n\t"
280        "punpckhbw %%mm7, %%mm5         \n\t"
281        "paddusw %%mm2, %%mm4           \n\t"
282        "paddusw %%mm3, %%mm5           \n\t"
283        "paddusw %%mm6, %%mm0           \n\t"
284        "paddusw %%mm6, %%mm1           \n\t"
285        "paddusw %%mm4, %%mm0           \n\t"
286        "paddusw %%mm5, %%mm1           \n\t"
287        "psrlw  $2, %%mm0               \n\t"
288        "psrlw  $2, %%mm1               \n\t"
289        "packuswb  %%mm1, %%mm0         \n\t"
290        "movq   %%mm0, (%2, %%"REG_a")  \n\t"
291        "add    %3, %%"REG_a"           \n\t"
292
293        "subl   $2, %0                  \n\t"
294        "jnz    1b                      \n\t"
295        :"+g"(h), "+S"(pixels)
296        :"D"(block), "r"((x86_reg)line_size)
297        :REG_a, "memory");
298}
299
300// avg_pixels
301static void av_unused DEF(avg, pixels4)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
302{
303    MOVQ_BFE(mm6);
304    JUMPALIGN();
305    do {
306        __asm__ volatile(
307             "movd  %0, %%mm0           \n\t"
308             "movd  %1, %%mm1           \n\t"
309             PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
310             "movd  %%mm2, %0           \n\t"
311             :"+m"(*block)
312             :"m"(*pixels)
313             :"memory");
314        pixels += line_size;
315        block += line_size;
316    }
317    while (--h);
318}
319
// in case more speed is needed - unrolling would certainly help
321static void DEF(avg, pixels8)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
322{
323    MOVQ_BFE(mm6);
324    JUMPALIGN();
325    do {
326        __asm__ volatile(
327             "movq  %0, %%mm0           \n\t"
328             "movq  %1, %%mm1           \n\t"
329             PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
330             "movq  %%mm2, %0           \n\t"
331             :"+m"(*block)
332             :"m"(*pixels)
333             :"memory");
334        pixels += line_size;
335        block += line_size;
336    }
337    while (--h);
338}
339
340static void DEF(avg, pixels16)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
341{
342    MOVQ_BFE(mm6);
343    JUMPALIGN();
344    do {
345        __asm__ volatile(
346             "movq  %0, %%mm0           \n\t"
347             "movq  %1, %%mm1           \n\t"
348             PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
349             "movq  %%mm2, %0           \n\t"
350             "movq  8%0, %%mm0          \n\t"
351             "movq  8%1, %%mm1          \n\t"
352             PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
353             "movq  %%mm2, 8%0          \n\t"
354             :"+m"(*block)
355             :"m"(*pixels)
356             :"memory");
357        pixels += line_size;
358        block += line_size;
359    }
360    while (--h);
361}
362
363static void DEF(avg, pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
364{
365    MOVQ_BFE(mm6);
366    JUMPALIGN();
367    do {
368        __asm__ volatile(
369            "movq  %1, %%mm0            \n\t"
370            "movq  1%1, %%mm1           \n\t"
371            "movq  %0, %%mm3            \n\t"
372            PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
373            PAVGB(%%mm3, %%mm2, %%mm0, %%mm6)
374            "movq  %%mm0, %0            \n\t"
375            :"+m"(*block)
376            :"m"(*pixels)
377            :"memory");
378        pixels += line_size;
379        block += line_size;
380    } while (--h);
381}
382
383static av_unused void DEF(avg, pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
384{
385    MOVQ_BFE(mm6);
386    JUMPALIGN();
387    do {
388        __asm__ volatile(
389            "movq  %1, %%mm0            \n\t"
390            "movq  %2, %%mm1            \n\t"
391            "movq  %0, %%mm3            \n\t"
392            PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
393            PAVGB(%%mm3, %%mm2, %%mm0, %%mm6)
394            "movq  %%mm0, %0            \n\t"
395            :"+m"(*dst)
396            :"m"(*src1), "m"(*src2)
397            :"memory");
398        dst += dstStride;
399        src1 += src1Stride;
400        src2 += 8;
401    } while (--h);
402}
403
404static void DEF(avg, pixels16_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
405{
406    MOVQ_BFE(mm6);
407    JUMPALIGN();
408    do {
409        __asm__ volatile(
410            "movq  %1, %%mm0            \n\t"
411            "movq  1%1, %%mm1           \n\t"
412            "movq  %0, %%mm3            \n\t"
413            PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
414            PAVGB(%%mm3, %%mm2, %%mm0, %%mm6)
415            "movq  %%mm0, %0            \n\t"
416            "movq  8%1, %%mm0           \n\t"
417            "movq  9%1, %%mm1           \n\t"
418            "movq  8%0, %%mm3           \n\t"
419            PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
420            PAVGB(%%mm3, %%mm2, %%mm0, %%mm6)
421            "movq  %%mm0, 8%0           \n\t"
422            :"+m"(*block)
423            :"m"(*pixels)
424            :"memory");
425        pixels += line_size;
426        block += line_size;
427    } while (--h);
428}
429
430static av_unused void DEF(avg, pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
431{
432    MOVQ_BFE(mm6);
433    JUMPALIGN();
434    do {
435        __asm__ volatile(
436            "movq  %1, %%mm0            \n\t"
437            "movq  %2, %%mm1            \n\t"
438            "movq  %0, %%mm3            \n\t"
439            PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
440            PAVGB(%%mm3, %%mm2, %%mm0, %%mm6)
441            "movq  %%mm0, %0            \n\t"
442            "movq  8%1, %%mm0           \n\t"
443            "movq  8%2, %%mm1           \n\t"
444            "movq  8%0, %%mm3           \n\t"
445            PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
446            PAVGB(%%mm3, %%mm2, %%mm0, %%mm6)
447            "movq  %%mm0, 8%0           \n\t"
448            :"+m"(*dst)
449            :"m"(*src1), "m"(*src2)
450            :"memory");
451        dst += dstStride;
452        src1 += src1Stride;
453        src2 += 16;
454    } while (--h);
455}
456
457static void DEF(avg, pixels8_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
458{
459    MOVQ_BFE(mm6);
460    __asm__ volatile(
461        "lea    (%3, %3), %%"REG_a"     \n\t"
462        "movq   (%1), %%mm0             \n\t"
463        ASMALIGN(3)
464        "1:                             \n\t"
465        "movq   (%1, %3), %%mm1         \n\t"
466        "movq   (%1, %%"REG_a"), %%mm2  \n\t"
467        PAVGBP(%%mm1, %%mm0, %%mm4,   %%mm2, %%mm1, %%mm5)
468        "movq   (%2), %%mm3             \n\t"
469        PAVGB(%%mm3, %%mm4, %%mm0, %%mm6)
470        "movq   (%2, %3), %%mm3         \n\t"
471        PAVGB(%%mm3, %%mm5, %%mm1, %%mm6)
472        "movq   %%mm0, (%2)             \n\t"
473        "movq   %%mm1, (%2, %3)         \n\t"
474        "add    %%"REG_a", %1           \n\t"
475        "add    %%"REG_a", %2           \n\t"
476
477        "movq   (%1, %3), %%mm1         \n\t"
478        "movq   (%1, %%"REG_a"), %%mm0  \n\t"
479        PAVGBP(%%mm1, %%mm2, %%mm4,   %%mm0, %%mm1, %%mm5)
480        "movq   (%2), %%mm3             \n\t"
481        PAVGB(%%mm3, %%mm4, %%mm2, %%mm6)
482        "movq   (%2, %3), %%mm3         \n\t"
483        PAVGB(%%mm3, %%mm5, %%mm1, %%mm6)
484        "movq   %%mm2, (%2)             \n\t"
485        "movq   %%mm1, (%2, %3)         \n\t"
486        "add    %%"REG_a", %1           \n\t"
487        "add    %%"REG_a", %2           \n\t"
488
489        "subl   $4, %0                  \n\t"
490        "jnz    1b                      \n\t"
491        :"+g"(h), "+S"(pixels), "+D"(block)
492        :"r"((x86_reg)line_size)
493        :REG_a, "memory");
494}
495
496// this routine is 'slightly' suboptimal but mostly unused
497static void DEF(avg, pixels8_xy2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
498{
499    MOVQ_ZERO(mm7);
500    SET_RND(mm6); // =2 for rnd  and  =1 for no_rnd version
501    __asm__ volatile(
502        "movq   (%1), %%mm0             \n\t"
503        "movq   1(%1), %%mm4            \n\t"
504        "movq   %%mm0, %%mm1            \n\t"
505        "movq   %%mm4, %%mm5            \n\t"
506        "punpcklbw %%mm7, %%mm0         \n\t"
507        "punpcklbw %%mm7, %%mm4         \n\t"
508        "punpckhbw %%mm7, %%mm1         \n\t"
509        "punpckhbw %%mm7, %%mm5         \n\t"
510        "paddusw %%mm0, %%mm4           \n\t"
511        "paddusw %%mm1, %%mm5           \n\t"
512        "xor    %%"REG_a", %%"REG_a"    \n\t"
513        "add    %3, %1                  \n\t"
514        ASMALIGN(3)
515        "1:                             \n\t"
516        "movq   (%1, %%"REG_a"), %%mm0  \n\t"
517        "movq   1(%1, %%"REG_a"), %%mm2 \n\t"
518        "movq   %%mm0, %%mm1            \n\t"
519        "movq   %%mm2, %%mm3            \n\t"
520        "punpcklbw %%mm7, %%mm0         \n\t"
521        "punpcklbw %%mm7, %%mm2         \n\t"
522        "punpckhbw %%mm7, %%mm1         \n\t"
523        "punpckhbw %%mm7, %%mm3         \n\t"
524        "paddusw %%mm2, %%mm0           \n\t"
525        "paddusw %%mm3, %%mm1           \n\t"
526        "paddusw %%mm6, %%mm4           \n\t"
527        "paddusw %%mm6, %%mm5           \n\t"
528        "paddusw %%mm0, %%mm4           \n\t"
529        "paddusw %%mm1, %%mm5           \n\t"
530        "psrlw  $2, %%mm4               \n\t"
531        "psrlw  $2, %%mm5               \n\t"
532                "movq   (%2, %%"REG_a"), %%mm3  \n\t"
533        "packuswb  %%mm5, %%mm4         \n\t"
534                "pcmpeqd %%mm2, %%mm2   \n\t"
535                "paddb %%mm2, %%mm2     \n\t"
536                PAVGB(%%mm3, %%mm4, %%mm5, %%mm2)
537                "movq   %%mm5, (%2, %%"REG_a")  \n\t"
538        "add    %3, %%"REG_a"                \n\t"
539
540        "movq   (%1, %%"REG_a"), %%mm2  \n\t" // 0 <-> 2   1 <-> 3
541        "movq   1(%1, %%"REG_a"), %%mm4 \n\t"
542        "movq   %%mm2, %%mm3            \n\t"
543        "movq   %%mm4, %%mm5            \n\t"
544        "punpcklbw %%mm7, %%mm2         \n\t"
545        "punpcklbw %%mm7, %%mm4         \n\t"
546        "punpckhbw %%mm7, %%mm3         \n\t"
547        "punpckhbw %%mm7, %%mm5         \n\t"
548        "paddusw %%mm2, %%mm4           \n\t"
549        "paddusw %%mm3, %%mm5           \n\t"
550        "paddusw %%mm6, %%mm0           \n\t"
551        "paddusw %%mm6, %%mm1           \n\t"
552        "paddusw %%mm4, %%mm0           \n\t"
553        "paddusw %%mm5, %%mm1           \n\t"
554        "psrlw  $2, %%mm0               \n\t"
555        "psrlw  $2, %%mm1               \n\t"
556                "movq   (%2, %%"REG_a"), %%mm3  \n\t"
557        "packuswb  %%mm1, %%mm0         \n\t"
558                "pcmpeqd %%mm2, %%mm2   \n\t"
559                "paddb %%mm2, %%mm2     \n\t"
560                PAVGB(%%mm3, %%mm0, %%mm1, %%mm2)
561                "movq   %%mm1, (%2, %%"REG_a")  \n\t"
562        "add    %3, %%"REG_a"           \n\t"
563
564        "subl   $2, %0                  \n\t"
565        "jnz    1b                      \n\t"
566        :"+g"(h), "+S"(pixels)
567        :"D"(block), "r"((x86_reg)line_size)
568        :REG_a, "memory");
569}
570
571//FIXME optimize
572static void DEF(put, pixels16_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
573    DEF(put, pixels8_y2)(block  , pixels  , line_size, h);
574    DEF(put, pixels8_y2)(block+8, pixels+8, line_size, h);
575}
576
577static void DEF(put, pixels16_xy2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
578    DEF(put, pixels8_xy2)(block  , pixels  , line_size, h);
579    DEF(put, pixels8_xy2)(block+8, pixels+8, line_size, h);
580}
581
582static void DEF(avg, pixels16_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
583    DEF(avg, pixels8_y2)(block  , pixels  , line_size, h);
584    DEF(avg, pixels8_y2)(block+8, pixels+8, line_size, h);
585}
586
587static void DEF(avg, pixels16_xy2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
588    DEF(avg, pixels8_xy2)(block  , pixels  , line_size, h);
589    DEF(avg, pixels8_xy2)(block+8, pixels+8, line_size, h);
590}
591