1/*
2 * SIMD-optimized halfpel functions are compiled twice for rnd/no_rnd
3 * Copyright (c) 2000, 2001 Fabrice Bellard
4 * Copyright (c) 2003-2004 Michael Niedermayer <michaelni@gmx.at>
5 *
6 * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
7 * mostly rewritten by Michael Niedermayer <michaelni@gmx.at>
8 * and improved by Zdenek Kabelac <kabi@users.sf.net>
9 *
10 * This file is part of FFmpeg.
11 *
12 * FFmpeg is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU Lesser General Public
14 * License as published by the Free Software Foundation; either
15 * version 2.1 of the License, or (at your option) any later version.
16 *
17 * FFmpeg is distributed in the hope that it will be useful,
18 * but WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
20 * Lesser General Public License for more details.
21 *
22 * You should have received a copy of the GNU Lesser General Public
23 * License along with FFmpeg; if not, write to the Free Software
24 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
25 */
26
27#include <stddef.h>
28#include <stdint.h>
29
30#include "inline_asm.h"
31
32// put_pixels
/**
 * Write an 8-pixel-wide, h-row block in which every output byte is built
 * from a 2x2 neighbourhood of the source:
 *
 *     (p[y][x] + p[y][x+1] + p[y+1][x] + p[y+1][x+1] + rnd) >> 2
 *
 * where rnd is the per-word constant SET_RND() places in mm6. The formula
 * assumes MOVQ_ZERO() clears mm7, as its name suggests; that macro is not
 * defined in this part of the file -- TODO confirm.
 *
 * The function name is generated by the DEF() macro, which is defined
 * outside this part of the file.
 *
 * @param block     destination; 8 bytes are stored per row, for h rows
 * @param pixels    source; 9 bytes (offsets 0..8) are read per row, for
 *                  h + 1 rows
 * @param line_size byte distance between consecutive rows; the same value
 *                  is used to step through both block and pixels
 * @param h         number of rows to write. Two rows are produced per loop
 *                  iteration and the loop only ends when h, decremented by
 *                  2, reaches exactly zero, so h has to be a positive even
 *                  number.
 *
 * NOTE(review): mm0-mm7 are modified but the clobber list only names REG_a
 * and "memory"; mm6/mm7 are expected to still hold the values set by the two
 * macros that precede the asm statement.
 */
33STATIC void DEF(put, pixels8_xy2)(uint8_t *block, const uint8_t *pixels,
34                                  ptrdiff_t line_size, int h)
35{
36    MOVQ_ZERO(mm7);
37    SET_RND(mm6); // =2 for rnd  and  =1 for no_rnd version
38    __asm__ volatile(
          // Prologue: horizontal sums of the first source row.
          // mm0 = bytes 0..7, mm4 = bytes 1..8; each is widened to 16-bit
          // words (low half / high half) by interleaving with mm7.
39        "movq   (%1), %%mm0             \n\t"
40        "movq   1(%1), %%mm4            \n\t"
41        "movq   %%mm0, %%mm1            \n\t"
42        "movq   %%mm4, %%mm5            \n\t"
43        "punpcklbw %%mm7, %%mm0         \n\t"
44        "punpcklbw %%mm7, %%mm4         \n\t"
45        "punpckhbw %%mm7, %%mm1         \n\t"
46        "punpckhbw %%mm7, %%mm5         \n\t"
          // mm4 (low 4 words) / mm5 (high 4 words) = p[x] + p[x+1] of row 0.
47        "paddusw %%mm0, %%mm4           \n\t"
48        "paddusw %%mm1, %%mm5           \n\t"
          // REG_a is the running byte offset of the current row (starts at 0);
          // pixels is advanced by one row so (%1, REG_a) addresses the row
          // below the one whose sums are already held in registers.
49        "xor    %%"REG_a", %%"REG_a"    \n\t"
50        "add    %3, %1                  \n\t"
51        ".p2align 3                     \n\t"
52        "1:                             \n\t"
          // First half of the iteration: the previous row's sums are in
          // mm4/mm5; compute the new row's sums into mm0/mm1.
53        "movq   (%1, %%"REG_a"), %%mm0  \n\t"
54        "movq   1(%1, %%"REG_a"), %%mm2 \n\t"
55        "movq   %%mm0, %%mm1            \n\t"
56        "movq   %%mm2, %%mm3            \n\t"
57        "punpcklbw %%mm7, %%mm0         \n\t"
58        "punpcklbw %%mm7, %%mm2         \n\t"
59        "punpckhbw %%mm7, %%mm1         \n\t"
60        "punpckhbw %%mm7, %%mm3         \n\t"
61        "paddusw %%mm2, %%mm0           \n\t"
62        "paddusw %%mm3, %%mm1           \n\t"
          // (previous row sums + rounding constant + new row sums) >> 2,
          // then pack the 8 words back to 8 bytes and store one output row.
63        "paddusw %%mm6, %%mm4           \n\t"
64        "paddusw %%mm6, %%mm5           \n\t"
65        "paddusw %%mm0, %%mm4           \n\t"
66        "paddusw %%mm1, %%mm5           \n\t"
67        "psrlw  $2, %%mm4               \n\t"
68        "psrlw  $2, %%mm5               \n\t"
69        "packuswb  %%mm5, %%mm4         \n\t"
70        "movq   %%mm4, (%2, %%"REG_a")  \n\t"
          // Step to the next row.
71        "add    %3, %%"REG_a"           \n\t"
72
          // Second half: same computation with the register roles swapped.
          // mm0/mm1 (sums from the first half) now act as the previous row,
          // and the new row's sums are rebuilt in mm4/mm5 so they are ready
          // for the next loop iteration.
73        "movq   (%1, %%"REG_a"), %%mm2  \n\t" // 0 <-> 2   1 <-> 3
74        "movq   1(%1, %%"REG_a"), %%mm4 \n\t"
75        "movq   %%mm2, %%mm3            \n\t"
76        "movq   %%mm4, %%mm5            \n\t"
77        "punpcklbw %%mm7, %%mm2         \n\t"
78        "punpcklbw %%mm7, %%mm4         \n\t"
79        "punpckhbw %%mm7, %%mm3         \n\t"
80        "punpckhbw %%mm7, %%mm5         \n\t"
81        "paddusw %%mm2, %%mm4           \n\t"
82        "paddusw %%mm3, %%mm5           \n\t"
83        "paddusw %%mm6, %%mm0           \n\t"
84        "paddusw %%mm6, %%mm1           \n\t"
85        "paddusw %%mm4, %%mm0           \n\t"
86        "paddusw %%mm5, %%mm1           \n\t"
87        "psrlw  $2, %%mm0               \n\t"
88        "psrlw  $2, %%mm1               \n\t"
89        "packuswb  %%mm1, %%mm0         \n\t"
90        "movq   %%mm0, (%2, %%"REG_a")  \n\t"
91        "add    %3, %%"REG_a"           \n\t"
92
          // Two rows were written; loop until h reaches exactly zero.
93        "subl   $2, %0                  \n\t"
94        "jnz    1b                      \n\t"
          // %0 = h (updated), %1 = pixels (updated), %2 = block,
          // %3 = line_size widened to x86_reg.
95        :"+g"(h), "+S"(pixels)
96        :"D"(block), "r"((x86_reg)line_size)
97        :REG_a, "memory");
98}
99
100// avg_pixels
101// this routine is 'slightly' suboptimal but mostly unused
/**
 * Combine an 8-pixel-wide, h-row 2x2-filtered source block with the bytes
 * already present in the destination.
 *
 * For every output position the filtered value
 *
 *     (p[y][x] + p[y][x+1] + p[y+1][x] + p[y+1][x+1] + rnd) >> 2
 *
 * is computed (rnd being the per-word constant SET_RND() places in mm6),
 * and is then merged with the existing destination row through the
 * PAVGB_MMX() macro before being stored back. PAVGB_MMX(), MOVQ_ZERO() and
 * SET_RND() are not defined in this part of the file; presumably PAVGB_MMX
 * is a byte-wise average of its first two operands and MOVQ_ZERO clears
 * mm7 -- TODO confirm.
 *
 * The function name is generated by the DEF() macro, which is defined
 * outside this part of the file.
 *
 * @param block     destination; 8 bytes per row are read and then
 *                  overwritten, for h rows
 * @param pixels    source; 9 bytes (offsets 0..8) are read per row, for
 *                  h + 1 rows
 * @param line_size byte distance between consecutive rows; the same value
 *                  is used to step through both block and pixels
 * @param h         number of rows to process. Two rows are produced per
 *                  loop iteration and the loop only ends when h, decremented
 *                  by 2, reaches exactly zero, so h has to be a positive
 *                  even number.
 *
 * NOTE(review): mm0-mm7 are modified but the clobber list only names REG_a
 * and "memory"; mm6/mm7 are expected to still hold the values set by the two
 * macros that precede the asm statement.
 */
102STATIC void DEF(avg, pixels8_xy2)(uint8_t *block, const uint8_t *pixels,
103                                  ptrdiff_t line_size, int h)
104{
105    MOVQ_ZERO(mm7);
106    SET_RND(mm6); // =2 for rnd  and  =1 for no_rnd version
107    __asm__ volatile(
           // Prologue: horizontal sums of the first source row.
           // mm0 = bytes 0..7, mm4 = bytes 1..8; each is widened to 16-bit
           // words (low half / high half) by interleaving with mm7.
108        "movq   (%1), %%mm0             \n\t"
109        "movq   1(%1), %%mm4            \n\t"
110        "movq   %%mm0, %%mm1            \n\t"
111        "movq   %%mm4, %%mm5            \n\t"
112        "punpcklbw %%mm7, %%mm0         \n\t"
113        "punpcklbw %%mm7, %%mm4         \n\t"
114        "punpckhbw %%mm7, %%mm1         \n\t"
115        "punpckhbw %%mm7, %%mm5         \n\t"
           // mm4 (low 4 words) / mm5 (high 4 words) = p[x] + p[x+1] of row 0.
116        "paddusw %%mm0, %%mm4           \n\t"
117        "paddusw %%mm1, %%mm5           \n\t"
           // REG_a is the running byte offset of the current row (starts at 0);
           // pixels is advanced by one row so (%1, REG_a) addresses the row
           // below the one whose sums are already held in registers.
118        "xor    %%"REG_a", %%"REG_a"    \n\t"
119        "add    %3, %1                  \n\t"
120        ".p2align 3                     \n\t"
121        "1:                             \n\t"
           // First half of the iteration: the previous row's sums are in
           // mm4/mm5; compute the new row's sums into mm0/mm1.
122        "movq   (%1, %%"REG_a"), %%mm0  \n\t"
123        "movq   1(%1, %%"REG_a"), %%mm2 \n\t"
124        "movq   %%mm0, %%mm1            \n\t"
125        "movq   %%mm2, %%mm3            \n\t"
126        "punpcklbw %%mm7, %%mm0         \n\t"
127        "punpcklbw %%mm7, %%mm2         \n\t"
128        "punpckhbw %%mm7, %%mm1         \n\t"
129        "punpckhbw %%mm7, %%mm3         \n\t"
130        "paddusw %%mm2, %%mm0           \n\t"
131        "paddusw %%mm3, %%mm1           \n\t"
           // (previous row sums + rounding constant + new row sums) >> 2.
132        "paddusw %%mm6, %%mm4           \n\t"
133        "paddusw %%mm6, %%mm5           \n\t"
134        "paddusw %%mm0, %%mm4           \n\t"
135        "paddusw %%mm1, %%mm5           \n\t"
136        "psrlw  $2, %%mm4               \n\t"
137        "psrlw  $2, %%mm5               \n\t"
           // mm3 = the 8 bytes currently stored in this destination row.
138                "movq   (%2, %%"REG_a"), %%mm3  \n\t"
           // mm4 = filtered row packed back to 8 bytes.
139        "packuswb  %%mm5, %%mm4         \n\t"
           // mm2 = all bits set, then each byte added to itself (wrapping),
           // which leaves 0xFE in every byte; passed to PAVGB_MMX as its
           // last argument.
140                "pcmpeqd %%mm2, %%mm2   \n\t"
141                "paddb %%mm2, %%mm2     \n\t"
           // Merge destination (mm3) with the filtered row (mm4); the result
           // is taken from mm5 and written back to the destination row.
142                PAVGB_MMX(%%mm3, %%mm4, %%mm5, %%mm2)
143                "movq   %%mm5, (%2, %%"REG_a")  \n\t"
           // Step to the next row.
144        "add    %3, %%"REG_a"                \n\t"
145
           // Second half: same computation with the register roles swapped.
           // mm0/mm1 (sums from the first half) now act as the previous row,
           // and the new row's sums are rebuilt in mm4/mm5 so they are ready
           // for the next loop iteration.
146        "movq   (%1, %%"REG_a"), %%mm2  \n\t" // 0 <-> 2   1 <-> 3
147        "movq   1(%1, %%"REG_a"), %%mm4 \n\t"
148        "movq   %%mm2, %%mm3            \n\t"
149        "movq   %%mm4, %%mm5            \n\t"
150        "punpcklbw %%mm7, %%mm2         \n\t"
151        "punpcklbw %%mm7, %%mm4         \n\t"
152        "punpckhbw %%mm7, %%mm3         \n\t"
153        "punpckhbw %%mm7, %%mm5         \n\t"
154        "paddusw %%mm2, %%mm4           \n\t"
155        "paddusw %%mm3, %%mm5           \n\t"
156        "paddusw %%mm6, %%mm0           \n\t"
157        "paddusw %%mm6, %%mm1           \n\t"
158        "paddusw %%mm4, %%mm0           \n\t"
159        "paddusw %%mm5, %%mm1           \n\t"
160        "psrlw  $2, %%mm0               \n\t"
161        "psrlw  $2, %%mm1               \n\t"
           // Reload the destination row and rebuild the 0xFE byte mask; mm2
           // and mm3 were used as scratch for the source row above.
162                "movq   (%2, %%"REG_a"), %%mm3  \n\t"
163        "packuswb  %%mm1, %%mm0         \n\t"
164                "pcmpeqd %%mm2, %%mm2   \n\t"
165                "paddb %%mm2, %%mm2     \n\t"
           // Merge destination (mm3) with the filtered row (mm0); the result
           // is taken from mm1, leaving mm4/mm5 intact for the next iteration.
166                PAVGB_MMX(%%mm3, %%mm0, %%mm1, %%mm2)
167                "movq   %%mm1, (%2, %%"REG_a")  \n\t"
168        "add    %3, %%"REG_a"           \n\t"
169
           // Two rows were written; loop until h reaches exactly zero.
170        "subl   $2, %0                  \n\t"
171        "jnz    1b                      \n\t"
           // %0 = h (updated), %1 = pixels (updated), %2 = block,
           // %3 = line_size widened to x86_reg.
172        :"+g"(h), "+S"(pixels)
173        :"D"(block), "r"((x86_reg)line_size)
174        :REG_a, "memory");
175}
176