/*
 * Alpha optimized DSP utils
 * Copyright (c) 2002 Falk Hueffner <falk@debian.org>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavcodec/dsputil.h"
#include "asm.h"

/* Convert an 8x8 block of unsigned bytes into 16-bit DCT coefficients;
   unpkbw zero-extends four bytes at a time into 16-bit lanes. */
void get_pixels_mvi(DCTELEM *restrict block,
                    const uint8_t *restrict pixels, int line_size)
{
    int h = 8;

    do {
        uint64_t p;

        p = ldq(pixels);
        stq(unpkbw(p),       block);
        stq(unpkbw(p >> 32), block + 4);

        pixels += line_size;
        block += 8;
    } while (--h);
}
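
#if 0
/* Scalar sketch of what the MVI loop above computes (illustrative
 * only, not part of the original code): each source byte is
 * zero-extended into one 16-bit DCTELEM, which is what the two
 * unpkbw/stq pairs do for the low and high halves of each quadword. */
static void get_pixels_ref(DCTELEM *restrict block,
                           const uint8_t *restrict pixels, int line_size)
{
    int i, j;

    for (i = 0; i < 8; i++) {
        for (j = 0; j < 8; j++)
            block[i * 8 + j] = pixels[j];
        pixels += line_size;
    }
}
#endif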

/* Compute the byte-wise differences s1 - s2 of two 8x8 blocks as
   16-bit signed values.  The packed 64-bit subtraction below repairs
   inter-byte borrows via cmpbge/zap and derives the sign-extension
   bytes from the same compare mask. */
void diff_pixels_mvi(DCTELEM *block, const uint8_t *s1, const uint8_t *s2,
                     int stride)
{
    int h = 8;
    uint64_t mask = 0x4040;

    mask |= mask << 16;
    mask |= mask << 32;
    do {
        uint64_t x, y, c, d, a;
        uint64_t signs;

        x = ldq(s1);
        y = ldq(s2);
        c = cmpbge(x, y);       /* bit set per byte where x >= y      */
        d = x - y;
        a = zap(mask, c);       /* We use 0x4040404040404040 here...  */
        d += 4 * a;             /* ...so we can use s4addq here.      */
        signs = zap(-1, c);     /* 0xff per byte where x < y          */

        stq(unpkbw(d)       | (unpkbw(signs)       << 8), block);
        stq(unpkbw(d >> 32) | (unpkbw(signs >> 32) << 8), block + 4);

        s1 += stride;
        s2 += stride;
        block += 8;
    } while (--h);
}
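
#if 0
/* Illustrative scalar equivalent of diff_pixels_mvi() (a sketch, not
 * part of the original code): each 16-bit result is simply the signed
 * difference s1[j] - s2[j]; the packed version above gets the same
 * values by repairing borrows and pasting 0x00/0xff high bytes. */
static void diff_pixels_ref(DCTELEM *block, const uint8_t *s1,
                            const uint8_t *s2, int stride)
{
    int i, j;

    for (i = 0; i < 8; i++) {
        for (j = 0; j < 8; j++)
            block[i * 8 + j] = s1[j] - s2[j];
        s1 += stride;
        s2 += stride;
    }
}
#endif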

/* Byte-wise rounded average: per lane, (a + b + 1) >> 1.  The xor is
   masked with 0xfe so the shift cannot leak a bit across lanes. */
static inline uint64_t avg2(uint64_t a, uint64_t b)
{
    return (a | b) - (((a ^ b) & BYTE_VEC(0xfe)) >> 1);
}

/* Byte-wise rounded average of four values: per lane,
   (l1 + l2 + l3 + l4 + 2) >> 2.  The top six bits of each byte are
   shifted down before summing so the partial sums cannot overflow a
   lane; the low two bits are summed separately with the rounding
   constant 2. */
static inline uint64_t avg4(uint64_t l1, uint64_t l2, uint64_t l3, uint64_t l4)
{
    uint64_t r1 = ((l1 & ~BYTE_VEC(0x03)) >> 2)
                + ((l2 & ~BYTE_VEC(0x03)) >> 2)
                + ((l3 & ~BYTE_VEC(0x03)) >> 2)
                + ((l4 & ~BYTE_VEC(0x03)) >> 2);
    uint64_t r2 = ((  (l1 & BYTE_VEC(0x03))
                    + (l2 & BYTE_VEC(0x03))
                    + (l3 & BYTE_VEC(0x03))
                    + (l4 & BYTE_VEC(0x03))
                    + BYTE_VEC(0x02)) >> 2) & BYTE_VEC(0x03);
    return r1 + r2;
}
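
#if 0
/* Per-byte sketch of the identities used above (illustrative only):
 * avg2 relies on a + b == (a ^ b) + 2 * (a & b), which gives
 * (a | b) - ((a ^ b) >> 1) == (a + b + 1) >> 1 for unsigned bytes;
 * avg4 is the same rounded average extended to four inputs. */
static inline unsigned avg2_ref(unsigned a, unsigned b)
{
    return (a + b + 1) >> 1;
}

static inline unsigned avg4_ref(unsigned a, unsigned b,
                                unsigned c, unsigned d)
{
    return (a + b + c + d + 2) >> 2;
}
#endif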

/* Sum of absolute differences over an 8-pixel-wide block; perr()
   accumulates the absolute byte differences of two quadwords. */
int pix_abs8x8_mvi(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int result = 0;

    if ((size_t) pix2 & 0x7) {
        /* works only when pix2 is actually unaligned */
        do {                    /* do 8 pixels at a time */
            uint64_t p1, p2;

            p1  = ldq(pix1);
            p2  = uldq(pix2);
            result += perr(p1, p2);

            pix1 += line_size;
            pix2 += line_size;
        } while (--h);
    } else {
        do {
            uint64_t p1, p2;

            p1 = ldq(pix1);
            p2 = ldq(pix2);
            result += perr(p1, p2);

            pix1 += line_size;
            pix2 += line_size;
        } while (--h);
    }

    return result;
}
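
#if 0
/* Illustrative scalar equivalent of the SAD above (a sketch, not part
 * of the original code); FFABS is available via dsputil.h. */
static int pix_abs8x8_ref(uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int result = 0, j;

    do {
        for (j = 0; j < 8; j++)
            result += FFABS(pix1[j] - pix2[j]);
        pix1 += line_size;
        pix2 += line_size;
    } while (--h);
    return result;
}
#endif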

#if 0                           /* now done in assembly */
int pix_abs16x16_mvi(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    int result = 0;
    int h = 16;

    if ((size_t) pix2 & 0x7) {
        /* works only when pix2 is actually unaligned */
        do {                    /* do 16 pixels at a time */
            uint64_t p1_l, p1_r, p2_l, p2_r;
            uint64_t t;

            p1_l  = ldq(pix1);
            p1_r  = ldq(pix1 + 8);
            t     = ldq_u(pix2 + 8);
            p2_l  = extql(ldq_u(pix2), pix2) | extqh(t, pix2);
            p2_r  = extql(t, pix2) | extqh(ldq_u(pix2 + 16), pix2);
            pix1 += line_size;
            pix2 += line_size;

            result += perr(p1_l, p2_l)
                    + perr(p1_r, p2_r);
        } while (--h);
    } else {
        do {
            uint64_t p1_l, p1_r, p2_l, p2_r;

            p1_l = ldq(pix1);
            p1_r = ldq(pix1 + 8);
            p2_l = ldq(pix2);
            p2_r = ldq(pix2 + 8);
            pix1 += line_size;
            pix2 += line_size;

            result += perr(p1_l, p2_l)
                    + perr(p1_r, p2_r);
        } while (--h);
    }

    return result;
}
#endif

/* SAD of a 16-pixel-wide block against the horizontally half-pel
   interpolated reference: each reference byte is averaged with its
   right-hand neighbour before the comparison. */
int pix_abs16x16_x2_mvi(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int result = 0;
    uint64_t disalign = (size_t) pix2 & 0x7;

    switch (disalign) {
    case 0:
        do {
            uint64_t p1_l, p1_r, p2_l, p2_r;
            uint64_t l, r;

            p1_l = ldq(pix1);
            p1_r = ldq(pix1 + 8);
            l    = ldq(pix2);
            r    = ldq(pix2 + 8);
            p2_l = avg2(l, (l >> 8) | ((uint64_t) r << 56));
            p2_r = avg2(r, (r >> 8) | ((uint64_t) pix2[16] << 56));
            pix1 += line_size;
            pix2 += line_size;

            result += perr(p1_l, p2_l)
                    + perr(p1_r, p2_r);
        } while (--h);
        break;
    case 7:
        /* |.......l|lllllllr|rrrrrrr*|
           This case is special because disalign1 would be 8, which
           gets treated as 0 by extqh.  At least it is a bit faster
           that way :)  */
        do {
            uint64_t p1_l, p1_r, p2_l, p2_r;
            uint64_t l, m, r;

            p1_l = ldq(pix1);
            p1_r = ldq(pix1 + 8);
            l     = ldq_u(pix2);
            m     = ldq_u(pix2 + 8);
            r     = ldq_u(pix2 + 16);
            p2_l  = avg2(extql(l, disalign) | extqh(m, disalign), m);
            p2_r  = avg2(extql(m, disalign) | extqh(r, disalign), r);
            pix1 += line_size;
            pix2 += line_size;

            result += perr(p1_l, p2_l)
                    + perr(p1_r, p2_r);
        } while (--h);
        break;
    default:
        do {
            uint64_t disalign1 = disalign + 1;
            uint64_t p1_l, p1_r, p2_l, p2_r;
            uint64_t l, m, r;

            p1_l  = ldq(pix1);
            p1_r  = ldq(pix1 + 8);
            l     = ldq_u(pix2);
            m     = ldq_u(pix2 + 8);
            r     = ldq_u(pix2 + 16);
            p2_l  = avg2(extql(l, disalign) | extqh(m, disalign),
                         extql(l, disalign1) | extqh(m, disalign1));
            p2_r  = avg2(extql(m, disalign) | extqh(r, disalign),
                         extql(m, disalign1) | extqh(r, disalign1));
            pix1 += line_size;
            pix2 += line_size;

            result += perr(p1_l, p2_l)
                    + perr(p1_r, p2_r);
        } while (--h);
        break;
    }
    return result;
}
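
#if 0
/* Illustrative scalar equivalent of the horizontal half-pel SAD above
 * (a sketch, not part of the original code): the reference byte is the
 * rounded average of two horizontally adjacent bytes of pix2. */
static int pix_abs16x16_x2_ref(uint8_t *pix1, uint8_t *pix2,
                               int line_size, int h)
{
    int result = 0, j;

    do {
        for (j = 0; j < 16; j++)
            result += FFABS(pix1[j] - ((pix2[j] + pix2[j + 1] + 1) >> 1));
        pix1 += line_size;
        pix2 += line_size;
    } while (--h);
    return result;
}
#endif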

/* SAD of a 16-pixel-wide block against the vertically half-pel
   interpolated reference: each reference byte is averaged with the
   byte one line below, so each iteration reuses the previous line's
   quadwords in p2_l/p2_r. */
int pix_abs16x16_y2_mvi(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int result = 0;

    if ((size_t) pix2 & 0x7) {
        uint64_t t, p2_l, p2_r;
        t     = ldq_u(pix2 + 8);
        p2_l  = extql(ldq_u(pix2), pix2) | extqh(t, pix2);
        p2_r  = extql(t, pix2) | extqh(ldq_u(pix2 + 16), pix2);

        do {
            uint64_t p1_l, p1_r, np2_l, np2_r;
            uint64_t t;

            p1_l  = ldq(pix1);
            p1_r  = ldq(pix1 + 8);
            pix2 += line_size;
            t     = ldq_u(pix2 + 8);
            np2_l = extql(ldq_u(pix2), pix2) | extqh(t, pix2);
            np2_r = extql(t, pix2) | extqh(ldq_u(pix2 + 16), pix2);

            result += perr(p1_l, avg2(p2_l, np2_l))
                    + perr(p1_r, avg2(p2_r, np2_r));

            pix1 += line_size;
            p2_l  = np2_l;
            p2_r  = np2_r;
        } while (--h);
    } else {
        uint64_t p2_l, p2_r;
        p2_l = ldq(pix2);
        p2_r = ldq(pix2 + 8);
        do {
            uint64_t p1_l, p1_r, np2_l, np2_r;

            p1_l = ldq(pix1);
            p1_r = ldq(pix1 + 8);
            pix2 += line_size;
            np2_l = ldq(pix2);
            np2_r = ldq(pix2 + 8);

            result += perr(p1_l, avg2(p2_l, np2_l))
                    + perr(p1_r, avg2(p2_r, np2_r));

            pix1 += line_size;
            p2_l  = np2_l;
            p2_r  = np2_r;
        } while (--h);
    }
    return result;
}
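
#if 0
/* Illustrative scalar equivalent of the vertical half-pel SAD above
 * (a sketch, not part of the original code): the reference byte is
 * averaged with the byte one line below. */
static int pix_abs16x16_y2_ref(uint8_t *pix1, uint8_t *pix2,
                               int line_size, int h)
{
    int result = 0, j;

    do {
        for (j = 0; j < 16; j++)
            result += FFABS(pix1[j] -
                            ((pix2[j] + pix2[j + line_size] + 1) >> 1));
        pix1 += line_size;
        pix2 += line_size;
    } while (--h);
    return result;
}
#endif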

/* SAD of a 16-pixel-wide block against the diagonally half-pel
   interpolated reference: each reference byte is the rounded average
   of a 2x2 neighbourhood of pix2, computed with avg4(). */
int pix_abs16x16_xy2_mvi(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int result = 0;

    uint64_t p1_l, p1_r;
    uint64_t p2_l, p2_r, p2_x;

    p1_l = ldq(pix1);
    p1_r = ldq(pix1 + 8);

    if ((size_t) pix2 & 0x7) { /* could be optimized a lot */
        p2_l = uldq(pix2);
        p2_r = uldq(pix2 + 8);
        p2_x = (uint64_t) pix2[16] << 56;
    } else {
        p2_l = ldq(pix2);
        p2_r = ldq(pix2 + 8);
        p2_x = ldq(pix2 + 16) << 56;
    }

    do {
        uint64_t np1_l, np1_r;
        uint64_t np2_l, np2_r, np2_x;

        pix1 += line_size;
        pix2 += line_size;

        np1_l = ldq(pix1);
        np1_r = ldq(pix1 + 8);

        if ((size_t) pix2 & 0x7) { /* could be optimized a lot */
            np2_l = uldq(pix2);
            np2_r = uldq(pix2 + 8);
            np2_x = (uint64_t) pix2[16] << 56;
        } else {
            np2_l = ldq(pix2);
            np2_r = ldq(pix2 + 8);
            np2_x = ldq(pix2 + 16) << 56;
        }

        result += perr(p1_l,
                       avg4( p2_l, ( p2_l >> 8) | ((uint64_t)  p2_r << 56),
                            np2_l, (np2_l >> 8) | ((uint64_t) np2_r << 56)))
                + perr(p1_r,
                       avg4( p2_r, ( p2_r >> 8) | ((uint64_t)  p2_x),
                            np2_r, (np2_r >> 8) | ((uint64_t) np2_x)));

        p1_l = np1_l;
        p1_r = np1_r;
        p2_l = np2_l;
        p2_r = np2_r;
        p2_x = np2_x;
    } while (--h);

    return result;
}
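
#if 0
/* Illustrative scalar equivalent of the diagonal half-pel SAD above
 * (a sketch, not part of the original code): the reference byte is the
 * rounded average of a 2x2 block of pix2, matching avg4(). */
static int pix_abs16x16_xy2_ref(uint8_t *pix1, uint8_t *pix2,
                                int line_size, int h)
{
    int result = 0, j;

    do {
        for (j = 0; j < 16; j++)
            result += FFABS(pix1[j] -
                            ((pix2[j]             + pix2[j + 1] +
                              pix2[j + line_size] + pix2[j + line_size + 1] +
                              2) >> 2));
        pix1 += line_size;
        pix2 += line_size;
    } while (--h);
    return result;
}
#endif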