/*
 * Copyright (c) 2007 Luca Barbato <lu_zero@gentoo.org>
 *
 * This file is part of Libav.
 *
 * Libav is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * Libav is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with Libav; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

/**
 * @file
 * Miscellaneous integer operations.
 */

#include "config.h"
#if HAVE_ALTIVEC_H
#include <altivec.h>
#endif

#include "libavcodec/dsputil.h"
#include "dsputil_altivec.h"
#include "types_altivec.h"
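
/**
 * Sum of squared differences between an int8_t and an int16_t array:
 * returns the sum over i of (pix1[i] - pix2[i]) * (pix1[i] - pix2[i])
 * for the first @p size elements.
 */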
static int ssd_int8_vs_int16_altivec(const int8_t *pix1, const int16_t *pix2,
                                     int size)
{
    int i, size16;
    vector signed char vpix1;
    vector signed short vpix2, vdiff, vpix1l, vpix1h;
    union {
        vector signed int vscore;
        int32_t score[4];
    } u;
    u.vscore = vec_splat_s32(0);
//XXX lazy way, fix it later

#define vec_unaligned_load(b) \
    vec_perm(vec_ld(0, b), vec_ld(15, b), vec_lvsl(0, b))

    size16 = size >> 4;
    while (size16) {
        // score += (pix1[i] - pix2[i]) * (pix1[i] - pix2[i]);
        // load 16 bytes of pix1 and the first batch of 8 pix2 values
        vpix1 = vec_unaligned_load(pix1);
        vpix2 = vec_unaligned_load(pix2);
        pix2 += 8;
        // sign-extend the 8-bit pix1 values to 16 bits
        vpix1h = vec_unpackh(vpix1);
        vdiff  = vec_sub(vpix1h, vpix2);
        vpix1l = vec_unpackl(vpix1);
        // load another batch of 8 values from pix2
        vpix2 = vec_unaligned_load(pix2);
        // accumulate the squared differences
        u.vscore = vec_msum(vdiff, vdiff, u.vscore);
        vdiff  = vec_sub(vpix1l, vpix2);
        u.vscore = vec_msum(vdiff, vdiff, u.vscore);
        pix1 += 16;
        pix2 += 8;
        size16--;
    }
    // reduce the four partial sums into element 3 of the vector
    u.vscore = vec_sums(u.vscore, vec_splat_s32(0));

    // handle the remaining (size % 16) elements scalarly
    size %= 16;
    for (i = 0; i < size; i++) {
        u.score[3] += (pix1[i] - pix2[i]) * (pix1[i] - pix2[i]);
    }
    return u.score[3];
}
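
/**
 * Scalar product of two int16_t vectors: partial sums of products are
 * right-shifted by @p shift bits before being accumulated into the result.
 * Eight elements are processed per iteration, so @p order should be a
 * multiple of 8; @p v1 may be unaligned, @p v2 is expected to be
 * 16-byte aligned.
 */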
static int32_t scalarproduct_int16_altivec(const int16_t *v1, const int16_t *v2,
                                           int order, const int shift)
{
    int i;
    LOAD_ZERO;
    register vec_s16 vec1, *pv;
    register vec_s32 res = vec_splat_s32(0), t;
    register vec_u32 shifts;
    int32_t ires;

    // Build a vector with every element equal to shift; vec_splat_u32()
    // only takes a 5-bit immediate, so the value is composed bit by bit.
    shifts = zero_u32v;
    if (shift & 0x10) shifts = vec_add(shifts, vec_sl(vec_splat_u32(0x08), vec_splat_u32(0x1)));
    if (shift & 0x08) shifts = vec_add(shifts, vec_splat_u32(0x08));
    if (shift & 0x04) shifts = vec_add(shifts, vec_splat_u32(0x04));
    if (shift & 0x02) shifts = vec_add(shifts, vec_splat_u32(0x02));
    if (shift & 0x01) shifts = vec_add(shifts, vec_splat_u32(0x01));

    for (i = 0; i < order; i += 8) {
        // realign v1, multiply-sum against v2, shift and accumulate
        pv   = (vec_s16 *)v1;
        vec1 = vec_perm(pv[0], pv[1], vec_lvsl(0, v1));
        t    = vec_msum(vec1, vec_ld(0, v2), zero_s32v);
        t    = vec_sr(t, shifts);
        res  = vec_sums(t, res);
        v1  += 8;
        v2  += 8;
    }
    // the final sum ends up in element 3; splat it and store it to ires
    res = vec_splat(res, 3);
    vec_ste(res, 0, &ires);
    return ires;
}
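
/**
 * Scalar product of @p v1 and @p v2, updating @p v1 in place as
 * v1[i] += v3[i] * mul; the result uses the original v1 values.
 * Sixteen elements are processed per iteration, so @p order must be a
 * multiple of 16.  @p v1 is expected to be 16-byte aligned; @p v2 and
 * @p v3 are realigned with the same permute vector and are therefore
 * expected to share the same alignment offset.
 */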
static int32_t scalarproduct_and_madd_int16_altivec(int16_t *v1, const int16_t *v2,
                                                    const int16_t *v3, int order, int mul)
{
    LOAD_ZERO;
    vec_s16 *pv1 = (vec_s16 *)v1;
    vec_s16 *pv2 = (vec_s16 *)v2;
    vec_s16 *pv3 = (vec_s16 *)v3;
    register vec_s16 muls = {mul,mul,mul,mul,mul,mul,mul,mul};
    register vec_s16 t0, t1, i0, i1;
    register vec_s16 i2 = pv2[0], i3 = pv3[0];
    register vec_s32 res = zero_s32v;
    register vec_u8 align = vec_lvsl(0, v2);
    int32_t ires;

    order >>= 4;                    // 16 elements per iteration
    do {
        // realign two vectors worth of v2 and accumulate v1 * v2
        t0 = vec_perm(i2, pv2[1], align);
        i2 = pv2[2];
        t1 = vec_perm(pv2[1], i2, align);
        i0 = pv1[0];
        i1 = pv1[1];
        res = vec_msum(t0, i0, res);
        res = vec_msum(t1, i1, res);
        // realign v3 and update v1 in place: v1 += v3 * mul
        t0 = vec_perm(i3, pv3[1], align);
        i3 = pv3[2];
        t1 = vec_perm(pv3[1], i3, align);
        pv1[0] = vec_mladd(t0, muls, i0);
        pv1[1] = vec_mladd(t1, muls, i1);
        pv1 += 2;
        pv2 += 2;
        pv3 += 2;
    } while (--order);
    // reduce the partial sums and store the result
    res = vec_splat(vec_sums(res, zero_s32v), 3);
    vec_ste(res, 0, &ires);
    return ires;
}
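
/** Install the AltiVec integer ops in the DSPContext. */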
void int_init_altivec(DSPContext *c, AVCodecContext *avctx)
{
    c->ssd_int8_vs_int16            = ssd_int8_vs_int16_altivec;
    c->scalarproduct_int16          = scalarproduct_int16_altivec;
    c->scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_altivec;
}