1/*
2 * SIMD-optimized halfpel functions are compiled twice for rnd/no_rnd
3 * Copyright (c) 2000, 2001 Fabrice Bellard
4 * Copyright (c) 2003-2004 Michael Niedermayer <michaelni@gmx.at>
5 *
6 * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
7 * mostly rewritten by Michael Niedermayer <michaelni@gmx.at>
8 * and improved by Zdenek Kabelac <kabi@users.sf.net>
9 *
10 * This file is part of FFmpeg.
11 *
12 * FFmpeg is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU Lesser General Public
14 * License as published by the Free Software Foundation; either
15 * version 2.1 of the License, or (at your option) any later version.
16 *
17 * FFmpeg is distributed in the hope that it will be useful,
18 * but WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
20 * Lesser General Public License for more details.
21 *
22 * You should have received a copy of the GNU Lesser General Public
23 * License along with FFmpeg; if not, write to the Free Software
24 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
25 */
26
27#include <stddef.h>
28#include <stdint.h>
29
30// put_pixels
/*
 * Horizontal half-pel "put" for an 8-byte-wide block.
 *
 * For every row, the 8 bytes at pixels[0..7] and the 8 bytes at pixels[1..8]
 * are combined with PAVGBP and the 8-byte result is stored to block; the
 * destination is overwritten without being read. Nine source bytes are
 * therefore read per row.
 *
 * block     - destination; advanced by line_size per row.
 * pixels    - source; advanced by line_size per row.
 * line_size - byte stride, used for both source and destination.
 * h         - number of rows. The loop handles four rows per iteration and
 *             only exits when the counter reaches exactly zero, so h must be
 *             a positive multiple of 4.
 *
 * NOTE(review): PAVGBP and MOVQ_BFE are defined outside this file. PAVGBP
 * presumably computes two byte-wise averages at once (rounding or not,
 * depending on which of the two instantiations is being compiled), and
 * MOVQ_BFE presumably loads a constant into mm6 that the averaging macros
 * rely on -- confirm against the macro definitions.
 */
static void DEF(put, pixels8_x2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
{
    MOVQ_BFE(mm6);
    __asm__ volatile(
        /* REG_a = 2 * line_size: the pointers advance two rows at a time. */
        "lea    (%3, %3), %%"REG_a"     \n\t"
        ".p2align 3                     \n\t"
        "1:                             \n\t"
        /* Rows 0 and 1: load each row at offsets 0 and 1. */
        "movq   (%1), %%mm0             \n\t"
        "movq   1(%1), %%mm1            \n\t"
        "movq   (%1, %3), %%mm2         \n\t"
        "movq   1(%1, %3), %%mm3        \n\t"
        /* mm4 <- combine(mm0, mm1) for row 0, mm5 <- combine(mm2, mm3) for row 1. */
        PAVGBP(%%mm0, %%mm1, %%mm4,   %%mm2, %%mm3, %%mm5)
        "movq   %%mm4, (%2)             \n\t"
        "movq   %%mm5, (%2, %3)         \n\t"
        "add    %%"REG_a", %1           \n\t"
        "add    %%"REG_a", %2           \n\t"
        /* Rows 2 and 3: identical sequence on the advanced pointers. */
        "movq   (%1), %%mm0             \n\t"
        "movq   1(%1), %%mm1            \n\t"
        "movq   (%1, %3), %%mm2         \n\t"
        "movq   1(%1, %3), %%mm3        \n\t"
        PAVGBP(%%mm0, %%mm1, %%mm4,   %%mm2, %%mm3, %%mm5)
        "movq   %%mm4, (%2)             \n\t"
        "movq   %%mm5, (%2, %3)         \n\t"
        "add    %%"REG_a", %1           \n\t"
        "add    %%"REG_a", %2           \n\t"
        /* Four rows done; loop until the row counter hits zero. */
        "subl   $4, %0                  \n\t"
        "jnz    1b                      \n\t"
        /* %0 = h (counter), %1 = pixels, %2 = block; all are modified. */
        :"+g"(h), "+S"(pixels), "+D"(block)
        /* %3 = line_size widened to a native register width. */
        :"r"((x86_reg)line_size)
        /* The scratch register and the pixel stores are not visible as operands. */
        :REG_a, "memory");
}
62
/*
 * Horizontal half-pel "put" for a 16-byte-wide block.
 *
 * Same operation as the 8-byte-wide x2 variant, applied to two adjacent
 * 8-byte halves of every row (offsets 0 and 8): each output byte i is
 * PAVGBP's combination of pixels[i] and pixels[i + 1]. Seventeen source
 * bytes are read per row; the destination is overwritten without being read.
 *
 * block     - destination; advanced by line_size per row.
 * pixels    - source; advanced by line_size per row.
 * line_size - byte stride, used for both source and destination.
 * h         - number of rows. Four rows are handled per loop iteration and
 *             the loop only exits when the counter reaches exactly zero, so
 *             h must be a positive multiple of 4.
 *
 * NOTE(review): PAVGBP and MOVQ_BFE are defined outside this file;
 * presumably a paired byte-wise average and a constant load into mm6 used by
 * it -- confirm against the macro definitions.
 */
static void DEF(put, pixels16_x2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
{
    MOVQ_BFE(mm6);
    __asm__ volatile(
        /* REG_a = 2 * line_size: the pointers advance two rows at a time. */
        "lea        (%3, %3), %%"REG_a" \n\t"
        ".p2align 3                     \n\t"
        "1:                             \n\t"
        /* Rows 0 and 1, left 8 bytes. */
        "movq   (%1), %%mm0             \n\t"
        "movq   1(%1), %%mm1            \n\t"
        "movq   (%1, %3), %%mm2         \n\t"
        "movq   1(%1, %3), %%mm3        \n\t"
        PAVGBP(%%mm0, %%mm1, %%mm4,   %%mm2, %%mm3, %%mm5)
        "movq   %%mm4, (%2)             \n\t"
        "movq   %%mm5, (%2, %3)         \n\t"
        /* Rows 0 and 1, right 8 bytes. */
        "movq   8(%1), %%mm0            \n\t"
        "movq   9(%1), %%mm1            \n\t"
        "movq   8(%1, %3), %%mm2        \n\t"
        "movq   9(%1, %3), %%mm3        \n\t"
        PAVGBP(%%mm0, %%mm1, %%mm4,   %%mm2, %%mm3, %%mm5)
        "movq   %%mm4, 8(%2)            \n\t"
        "movq   %%mm5, 8(%2, %3)        \n\t"
        "add    %%"REG_a", %1           \n\t"
        "add    %%"REG_a", %2           \n\t"
        /* Rows 2 and 3, left 8 bytes. */
        "movq   (%1), %%mm0             \n\t"
        "movq   1(%1), %%mm1            \n\t"
        "movq   (%1, %3), %%mm2         \n\t"
        "movq   1(%1, %3), %%mm3        \n\t"
        PAVGBP(%%mm0, %%mm1, %%mm4,   %%mm2, %%mm3, %%mm5)
        "movq   %%mm4, (%2)             \n\t"
        "movq   %%mm5, (%2, %3)         \n\t"
        /* Rows 2 and 3, right 8 bytes. */
        "movq   8(%1), %%mm0            \n\t"
        "movq   9(%1), %%mm1            \n\t"
        "movq   8(%1, %3), %%mm2        \n\t"
        "movq   9(%1, %3), %%mm3        \n\t"
        PAVGBP(%%mm0, %%mm1, %%mm4,   %%mm2, %%mm3, %%mm5)
        "movq   %%mm4, 8(%2)            \n\t"
        "movq   %%mm5, 8(%2, %3)        \n\t"
        "add    %%"REG_a", %1           \n\t"
        "add    %%"REG_a", %2           \n\t"
        /* Four rows done; loop until the row counter hits zero. */
        "subl   $4, %0                  \n\t"
        "jnz    1b                      \n\t"
        /* %0 = h (counter), %1 = pixels, %2 = block; all are modified. */
        :"+g"(h), "+S"(pixels), "+D"(block)
        /* %3 = line_size widened to a native register width. */
        :"r"((x86_reg)line_size)
        /* The scratch register and the pixel stores are not visible as operands. */
        :REG_a, "memory");
}
108
/*
 * Vertical half-pel "put" for an 8-byte-wide block.
 *
 * Each output row r is PAVGBP's combination of source rows r and r + 1, so
 * h + 1 source rows are read in total. The destination is overwritten
 * without being read.
 *
 * To avoid reloading, the most recently read source row is kept in a
 * register across the two halves of the loop body: it lives in mm0 on entry
 * to the first half and in mm2 on entry to the second half.
 *
 * block     - destination; advanced by line_size per row.
 * pixels    - source; advanced by line_size per row.
 * line_size - byte stride, used for both source and destination.
 * h         - number of rows. Four rows are handled per loop iteration and
 *             the loop only exits when the counter reaches exactly zero, so
 *             h must be a positive multiple of 4.
 *
 * NOTE(review): PAVGBP and MOVQ_BFE are defined outside this file;
 * presumably a paired byte-wise average and a constant load into mm6 used by
 * it -- confirm against the macro definitions.
 */
static void DEF(put, pixels8_y2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
{
    MOVQ_BFE(mm6);
    __asm__ volatile(
        /* REG_a = 2 * line_size: the pointers advance two rows at a time. */
        "lea (%3, %3), %%"REG_a"        \n\t"
        /* Prime the carried row: mm0 = source row 0. */
        "movq (%1), %%mm0               \n\t"
        ".p2align 3                     \n\t"
        "1:                             \n\t"
        /* mm1 = row + 1, mm2 = row + 2 (becomes the carried row). */
        "movq   (%1, %3), %%mm1         \n\t"
        "movq   (%1, %%"REG_a"),%%mm2   \n\t"
        /* mm4 <- combine(row + 1, row), mm5 <- combine(row + 2, row + 1). */
        PAVGBP(%%mm1, %%mm0, %%mm4,   %%mm2, %%mm1, %%mm5)
        "movq   %%mm4, (%2)             \n\t"
        "movq   %%mm5, (%2, %3)         \n\t"
        "add    %%"REG_a", %1           \n\t"
        "add    %%"REG_a", %2           \n\t"
        /* Second half: carried row is in mm2; the new carried row goes to mm0. */
        "movq   (%1, %3), %%mm1         \n\t"
        "movq   (%1, %%"REG_a"),%%mm0   \n\t"
        PAVGBP(%%mm1, %%mm2, %%mm4,   %%mm0, %%mm1, %%mm5)
        "movq   %%mm4, (%2)             \n\t"
        "movq   %%mm5, (%2, %3)         \n\t"
        "add    %%"REG_a", %1           \n\t"
        "add    %%"REG_a", %2           \n\t"
        /* Four rows done; loop until the row counter hits zero. */
        "subl   $4, %0                  \n\t"
        "jnz    1b                      \n\t"
        /* %0 = h (counter), %1 = pixels, %2 = block; all are modified. */
        :"+g"(h), "+S"(pixels), "+D"(block)
        /* %3 = line_size widened to a native register width. */
        :"r"((x86_reg)line_size)
        /* The scratch register and the pixel stores are not visible as operands. */
        :REG_a, "memory");
}
137
138static void DEF(avg, pixels16_x2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
139{
140    MOVQ_BFE(mm6);
141    JUMPALIGN();
142    do {
143        __asm__ volatile(
144            "movq  %1, %%mm0            \n\t"
145            "movq  1%1, %%mm1           \n\t"
146            "movq  %0, %%mm3            \n\t"
147            PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
148            PAVGB_MMX(%%mm3, %%mm2, %%mm0, %%mm6)
149            "movq  %%mm0, %0            \n\t"
150            "movq  8%1, %%mm0           \n\t"
151            "movq  9%1, %%mm1           \n\t"
152            "movq  8%0, %%mm3           \n\t"
153            PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
154            PAVGB_MMX(%%mm3, %%mm2, %%mm0, %%mm6)
155            "movq  %%mm0, 8%0           \n\t"
156            :"+m"(*block)
157            :"m"(*pixels)
158            :"memory");
159        pixels += line_size;
160        block += line_size;
161    } while (--h);
162}
163
/*
 * Vertical half-pel "avg" for an 8-byte-wide block.
 *
 * Each output row r is built in two steps: source rows r and r + 1 are
 * combined with PAVGBP, then that result is combined with the bytes already
 * in block using PAVGB_MMX and written back. h + 1 source rows are read in
 * total, and the destination is both read and written.
 *
 * The most recently read source row is carried in a register across the two
 * halves of the loop body: mm0 on entry to the first half, mm2 on entry to
 * the second. The PAVGB_MMX result registers in each half are chosen so the
 * carried row is never overwritten before it is used.
 *
 * block     - destination, read-modify-write; advanced by line_size per row.
 * pixels    - source; advanced by line_size per row.
 * line_size - byte stride, used for both source and destination.
 * h         - number of rows. Four rows are handled per loop iteration and
 *             the loop only exits when the counter reaches exactly zero, so
 *             h must be a positive multiple of 4.
 *
 * NOTE(review): PAVGBP, PAVGB_MMX and MOVQ_BFE are defined outside this
 * file. They presumably compute byte-wise averages (third register argument
 * of PAVGB_MMX being the output, since that is what gets stored) using the
 * constant MOVQ_BFE places in mm6 -- confirm against the macro definitions.
 */
static void DEF(avg, pixels8_y2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
{
    MOVQ_BFE(mm6);
    __asm__ volatile(
        /* REG_a = 2 * line_size: the pointers advance two rows at a time. */
        "lea    (%3, %3), %%"REG_a"     \n\t"
        /* Prime the carried row: mm0 = source row 0. */
        "movq   (%1), %%mm0             \n\t"
        ".p2align 3                     \n\t"
        "1:                             \n\t"
        /* mm1 = row + 1, mm2 = row + 2 (becomes the carried row). */
        "movq   (%1, %3), %%mm1         \n\t"
        "movq   (%1, %%"REG_a"), %%mm2  \n\t"
        /* mm4 <- combine(row + 1, row), mm5 <- combine(row + 2, row + 1). */
        PAVGBP(%%mm1, %%mm0, %%mm4,   %%mm2, %%mm1, %%mm5)
        /* Blend each interpolated row with the existing destination row. */
        "movq   (%2), %%mm3             \n\t"
        PAVGB_MMX(%%mm3, %%mm4, %%mm0, %%mm6)
        "movq   (%2, %3), %%mm3         \n\t"
        PAVGB_MMX(%%mm3, %%mm5, %%mm1, %%mm6)
        "movq   %%mm0, (%2)             \n\t"
        "movq   %%mm1, (%2, %3)         \n\t"
        "add    %%"REG_a", %1           \n\t"
        "add    %%"REG_a", %2           \n\t"

        /* Second half: carried row is in mm2; the new carried row goes to mm0. */
        "movq   (%1, %3), %%mm1         \n\t"
        "movq   (%1, %%"REG_a"), %%mm0  \n\t"
        PAVGBP(%%mm1, %%mm2, %%mm4,   %%mm0, %%mm1, %%mm5)
        /* Results go to mm2/mm1 here so that mm0 survives into the next iteration. */
        "movq   (%2), %%mm3             \n\t"
        PAVGB_MMX(%%mm3, %%mm4, %%mm2, %%mm6)
        "movq   (%2, %3), %%mm3         \n\t"
        PAVGB_MMX(%%mm3, %%mm5, %%mm1, %%mm6)
        "movq   %%mm2, (%2)             \n\t"
        "movq   %%mm1, (%2, %3)         \n\t"
        "add    %%"REG_a", %1           \n\t"
        "add    %%"REG_a", %2           \n\t"

        /* Four rows done; loop until the row counter hits zero. */
        "subl   $4, %0                  \n\t"
        "jnz    1b                      \n\t"
        /* %0 = h (counter), %1 = pixels, %2 = block; all are modified. */
        :"+g"(h), "+S"(pixels), "+D"(block)
        /* %3 = line_size widened to a native register width. */
        :"r"((x86_reg)line_size)
        /* The scratch register and the pixel accesses are not visible as operands. */
        :REG_a, "memory");
}
202