/*
 * Blackfin Pixel Operations
 * Copyright (C) 2007 Marc Hoffman <marc.hoffman@analog.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
21#include "config_bfin.h"
22
/*
 * put_pixels_clamped (block, dest, line_size)
 *
 * Clamps 16-bit coefficients read from `block` to the range 0..255 and
 * stores them as packed bytes at `dest`.
 *
 *   R0 = block      -> I0 (read pointer, advanced one 32-bit word at a time)
 *   R1 = dest       -> I1 (write pointer)
 *   R2 = line_size  -> M1 = line_size - 4 (advance applied after the second
 *                      word of a row has been stored)
 *
 * The hardware loop runs 8 times; every iteration converts 8 coefficients
 * (4 words in, 2 words out).  The loop is software pipelined: the loads for
 * the following group are issued before the current group is stored, so one
 * extra pair of words is read from `block` after the last row.
 *
 * R7:4 are saved and restored; R0-R3, I0, I1, M1, P0 and LC0 are clobbered.
 */
DEFUN(put_pixels_clamped,mL1,
        (DCTELEM *block, uint8_t *dest, int line_size)):
    [--SP] = (R7:4);
    R4 = 0;          // lower clamp bound (both halfwords)
    R5.l = 0x00ff;
    R5.h = 0x00ff;   // upper clamp bound 255 (both halfwords)
    I0 = R0;         // block
    I1 = R1;         // dest
    R2 += -4;        // line_size - 4
    M1 = R2;
    P0 = 8;          // row count
    R0 = [I0++];     // prime the pipeline: first four coefficients
    R1 = [I0++];
    R2 = MAX(R0, R4) (V);
    LSETUP (ppc$0,ppc$1) LC0=P0;
ppc$0: R2 = MIN(R2, R5) (V);
       R3 = MAX(R1, R4) (V);
       R3 = MIN(R3, R5) (V)      || R0 = [I0++];
       R6 = BYTEPACK (R2,R3)     || R1 = [I0++];
       R2 = MAX(R0, R4) (V)      || [I1++] = R6;      // store pixels 0..3
       R2 = MIN(R2, R5) (V);
       R3 = MAX(R1, R4) (V);
       R3 = MIN(R3, R5) (V)      || R0 = [I0++];
       R6 = BYTEPACK (R2,R3)     || R1 = [I0++];
ppc$1: R2 = MAX(R0, R4) (V)      || [I1++M1] = R6;    // store pixels 4..7, next row

    (R7:4) = [SP++];
    RTS;
DEFUN_END(put_pixels_clamped)
52
/*
 * add_pixels_clamped (block, dest, line_size)
 *
 * Combines the 16-bit values read from `block` with the bytes already at
 * `dest` using BYTEOP3P and writes the resulting bytes back to `dest`.
 *
 *   R0 = block      -> I3 (coefficient read pointer)
 *   R1 = dest       -> I1 (pixel read pointer) and I2 (pixel write pointer)
 *   R2 = line_size  -> M0 = line_size - 4 (row advance after the second word)
 *
 * The hardware loop (LC1) runs 8 times and handles two 32-bit output words
 * per iteration.  The halfword loads through I3 (with the I3-- / I3 += 4 /
 * I3 += M3 adjustments) re-order the coefficients into R1:0 for the LO and
 * HI variants of BYTEOP3P; the two partial results are merged with a
 * saturating add before being stored.
 *
 * NOTE(review): I0 is cleared and the pixel words are shifted by 8 bits in
 * R2/R3 before each BYTEOP3P pair; this presumably sets up the byte lanes
 * expected by the LO/HI forms -- confirm against the BYTEOP3P description.
 *
 * R7:4 are saved and restored.
 */
DEFUN(add_pixels_clamped,mL1,
        (DCTELEM *block, uint8_t *dest, int line_size)):
    [-- SP] = (R7:4);
    R4 = 0;
    I0 = 0;
    R2 += -4;        // line_size - 4
    M0 = R2;
    I1 = R1;         // dest (read)
    I3 = R0;         // block
    I2 = R1;         // dest (write)
    P0 = 8;          // row count
    M3 = 2;
    R0 = [I3++]  || R2 = [I1];
    R2 = R2 << 8                      || R0.H = W[I3--]  || R3 = [I1++];
    R3 = R3 >> 8                      || R1.L = W[I3]    || I3 += 4;
    R6 = BYTEOP3P(R1:0, R3:2) (LO)    || R1.H = W[I3++]  || R2 = [I1];

    LSETUP(apc$2,apc$3) LC1 = P0;
apc$2: R7 = BYTEOP3P(R1:0, R3:2) (HI, R) || R0 = [I3++]     || R3 = [I1++M0];
       R2 = R2 << 8                      || R0.H = W[I3--];
       R3 = R3 >> 8                      || R1.L = W[I3]    || I3 += 4;
       R6 = R6 + R7 (S)                  || R1.H = W[I3];
       R6 = BYTEOP3P(R1:0, R3:2) (LO)    || I3+=M3          || [I2++]=R6;
       R7 = BYTEOP3P(R1:0, R3:2) (HI, R) || R0 = [I3++]     || R2 = [I1];
       R2 = R2 << 8                      || R0.H = W[I3--]  || R3 = [I1++];
       R3 = R3 >> 8                      || R1.L = W[I3]    || I3 += 4;
       R6 = R6 + R7 (S)                  || R1.H = W[I3++];
apc$3: R6 = BYTEOP3P(R1:0, R3:2) (LO)    || [I2++M0] = R6   || R2 = [I1];

    (R7:4) = [SP++];
    RTS;
DEFUN_END(add_pixels_clamped)
85
86
/*
  motion compensation
  primitives

     * Halfpel motion compensation with rounding (a+b+1)>>1.
     * This is an array[4][4] of motion compensation functions for 4
     * horizontal blocksizes (8,16) and the 4 halfpel positions
     * *pixels_tab[ 0->16xH 1->8xH ][ xhalfpel + 2*yhalfpel ]
     * @param block destination where the result is stored
     * @param pixels source
     * @param line_size number of bytes in a horizontal line of block
     * @param h height

*/
101
/*
 * put_pixels8uc (block, s0, s1, dest_size, line_size, h)
 *
 * Writes h rows of 8 bytes to `block`, each byte being the BYTEOP1P
 * (rounding) average of the corresponding bytes of `s0` and `s1`.
 *
 *   R0 = block            -> I3 (destination)
 *   R1 = s0               -> I0
 *   R2 = s1               -> I1
 *   [SP+12] = dest_size   -> M3 = dest_size - 4 (destination row advance)
 *   [SP+16] = line_size   -> M0 = line_size - 8 (source row advance)
 *   [SP+20] = h           -> loop count
 *
 * The sources may be unaligned: DISALGNEXCPT suppresses the alignment
 * exception of the loads issued in parallel with it, and BYTEOP1P picks the
 * bytes out of the R1:0 / R3:2 register pairs according to I0 / I1.
 * The stack arguments are fetched before R7:6 are pushed, so the offsets
 * are relative to the entry value of SP.
 */
DEFUN(put_pixels8uc,mL1,
        (uint8_t *block, const uint8_t *s0, const uint8_t *s1,
                 int dest_size, int line_size, int h)):
        i3=r0;        // dest
        i0=r1;        // src0
        i1=r2;        // src1
        r0=[sp+12];   // dest_size
        r2=[sp+16];   // line_size
        p0=[sp+20];   // h
        [--sp] = (r7:6);
        r0+=-4;
        m3=r0;        // dest_size - 4
        r2+=-8;
        m0=r2;        // line_size - 8
        LSETUP(pp8$0,pp8$1) LC0=P0;
        DISALGNEXCPT                || R0 = [I0++]  || R2  =[I1++];   // prime first words

pp8$0:  DISALGNEXCPT                || R1 = [I0++]  || R3  =[I1++];
        R6 = BYTEOP1P(R1:0,R3:2)    || R0 = [I0++M0]|| R2  =[I1++M0]; // pixels 0..3, step sources to next row
        R7 = BYTEOP1P(R1:0,R3:2)(R) || R0 = [I0++]  || [I3++] = R6 ;  // pixels 4..7, preload next row
pp8$1:  DISALGNEXCPT                || R2 = [I1++]  || [I3++M3] = R7;

        (r7:6) = [sp++];
        RTS;
DEFUN_END(put_pixels8uc)
127
/*
 * put_pixels16uc (block, s0, s1, dest_size, line_size, h)
 *
 * 16-byte-wide variant of the averaging copy: every output byte is the
 * BYTEOP1P (rounding) average of the corresponding bytes of `s0` and `s1`.
 *
 *   R0 = block            -> I3 (destination)
 *   R1 = s0               -> I0
 *   R2 = s1               -> I1
 *   [FP+20] = dest_size   -> M3 = dest_size - 12 (destination row advance)
 *   [FP+24] = line_size   -> M0 = line_size - 16 (source row advance)
 *   [FP+28] = h           -> loop count
 *
 * Sources may be unaligned (DISALGNEXCPT + byte alignment taken from I0/I1).
 * Each loop iteration produces one row as four 32-bit stores.
 */
DEFUN(put_pixels16uc,mL1,
        (uint8_t *block, const uint8_t *s0, const uint8_t *s1,
                 int dest_size, int line_size, int h)):
        link 0;
        [--sp] = (r7:6);
        i3=r0;        // dest
        i0=r1;        // src0
        i1=r2;        // src1
        r0=[fp+20];   // dest_size
        r2=[fp+24];   // line_size
        p0=[fp+28];   // h


        r0+=-12;
        m3=r0;        // dest_size - 12
        r2+=-16;
        m0=r2;        // line_size - 16

        LSETUP(pp16$0,pp16$1) LC0=P0;
         DISALGNEXCPT                || R0 = [I0++]   || R2  =[I1++];

pp16$0:  DISALGNEXCPT                || R1 = [I0++]   || R3  =[I1++];
         R6 = BYTEOP1P(R1:0,R3:2)    || R0 = [I0++]   || R2  =[I1++];
         R7 = BYTEOP1P(R1:0,R3:2)(R) || R1 = [I0++]   || R3  =[I1++];
         [I3++] = R6;
         R6 = BYTEOP1P(R1:0,R3:2)    || R0 = [I0++M0] || R2  =[I1++M0];
         R7 = BYTEOP1P(R1:0,R3:2)(R) || R0 = [I0++]   || [I3++] = R7 ;
         [I3++] = R6;
pp16$1:  DISALGNEXCPT                || R2 = [I1++]   || [I3++M3] = R7;

        (r7:6) = [sp++];
        unlink;
        RTS;
DEFUN_END(put_pixels16uc)
162
163
164
165
166
167
/*
 * put_pixels8uc_nornd (block, s0, s1, line_size, h)
 *
 * Same data flow as the rounding 8-wide averaging copy, but BYTEOP1P is
 * used with the (T) option, i.e. the average is truncated instead of
 * rounded.  Source and destination share one stride.
 *
 *   R0 = block            -> I3 (destination)
 *   R1 = s0               -> I0
 *   R2 = s1               -> I1
 *   [SP+12] = line_size   -> M3 = line_size - 4 (destination row advance)
 *                            M0 = line_size - 8 (source row advance)
 *   [SP+16] = h           -> loop count
 *
 * The stack arguments are read before R7:6 are pushed.
 */
DEFUN(put_pixels8uc_nornd,mL1,
        (uint8_t *block, const uint8_t *s0, const uint8_t *s1,
                 int line_size, int h)):
        i3=r0;        // dest
        i0=r1;        // src0
        i1=r2;        // src1
        r2=[sp+12];   // line_size
        p0=[sp+16];   // h
        [--sp] = (r7:6);
        r2+=-4;
        m3=r2;        // line_size - 4
        r2+=-4;
        m0=r2;        // line_size - 8
        LSETUP(pp8$2,pp8$3) LC0=P0;
        DISALGNEXCPT                || R0 = [I0++]  || R2  =[I1++];

pp8$2:  DISALGNEXCPT                || R1 = [I0++]  || R3  =[I1++];
        R6 = BYTEOP1P(R1:0,R3:2)(T)  || R0 = [I0++M0]|| R2  =[I1++M0];
        R7 = BYTEOP1P(R1:0,R3:2)(T,R) || R0 = [I0++]  || [I3++] = R6 ;
pp8$3:  DISALGNEXCPT                || R2 = [I1++]  || [I3++M3] = R7;

        (r7:6) = [sp++];
        RTS;
DEFUN_END(put_pixels8uc_nornd)
192
/*
 * put_pixels16uc_nornd (block, s0, s1, line_size, h)
 *
 * 16-byte-wide truncating average of `s0` and `s1` (BYTEOP1P with the (T)
 * option), written to `block`.  Source and destination share one stride.
 *
 *   R0 = block            -> I3 (destination)
 *   R1 = s0               -> I0
 *   R2 = s1               -> I1
 *   [SP+12] = line_size   -> M3 = line_size - 12 (destination row advance)
 *                            M0 = line_size - 16 (source row advance)
 *   [SP+16] = h           -> loop count
 *
 * The stack arguments are read before R7:6 are pushed.
 */
DEFUN(put_pixels16uc_nornd,mL1,
        (uint8_t *block, const uint8_t *s0, const uint8_t *s1,
                 int line_size, int h)):
        i3=r0;        // dest
        i0=r1;        // src0
        i1=r2;        // src1
        r2=[sp+12];   // line_size
        p0=[sp+16];   // h

        [--sp] = (r7:6);
        r2+=-12;
        m3=r2;        // line_size - 12
        r2+=-4;
        m0=r2;        // line_size - 16

        LSETUP(pp16$2,pp16$3) LC0=P0;
        DISALGNEXCPT                || R0 = [I0++]   || R2  =[I1++];

pp16$2:
        DISALGNEXCPT                || R1 = [I0++]   || R3  =[I1++];
        R6 = BYTEOP1P(R1:0,R3:2)(T)    || R0 = [I0++]   || R2  =[I1++];
        R7 = BYTEOP1P(R1:0,R3:2)(T,R) || R1 = [I0++]   || R3  =[I1++];
        [I3++] = R6;

        R6 = BYTEOP1P(R1:0,R3:2)(T)    || R0 = [I0++M0] || R2  =[I1++M0];
        R7 = BYTEOP1P(R1:0,R3:2)(T,R) || R0 = [I0++]   || [I3++] = R7 ;
        [I3++] = R6;
pp16$3: DISALGNEXCPT                || R2 = [I1++]   || [I3++M3] = R7;

        (r7:6) = [sp++];

        RTS;
DEFUN_END(put_pixels16uc_nornd)
226
/*
 * z_put_pixels16_xy2 (block, s0, dest_size, line_size, h)
 *
 * 16-byte-wide x+y half-pel interpolation with rounding, built on BYTEOP2P.
 * Two source rows are read at once: I0 walks `s0`, I1 walks `s0 + line_size`.
 *
 *   R0 = block            -> I3 (destination), also kept in B3
 *   R1 = s0               -> I0 / I1, also kept in B0 / B1
 *   R2 = dest_size        -> M2 = dest_size - 12 (destination row advance)
 *   [FP+20] = line_size   -> M3 = line_size, M0 = line_size - 16
 *   [FP+24] = h           -> loop count for both passes
 *
 * The work is done in two passes over the block:
 *   pass 1 (RNDL): results are produced in the low byte of each halfword
 *                  and stored straight to the destination;
 *   pass 2 (RNDH): the pointers are restored from B0/B1/B3, the sources
 *                  are advanced by one byte, the results are produced in
 *                  the high byte of each halfword and merged (+|+) with
 *                  the words written by pass 1.
 *
 * B0, B1 and B3 are used purely as scratch storage for the start pointers.
 * R7:4 are saved and restored.
 */
DEFUN(z_put_pixels16_xy2,mL1,
        (uint8_t *block, const uint8_t *s0,
                 int dest_size, int line_size, int h)):
        link 0;
        [--sp] = (r7:4);
        i3=r0;        // dest
        i0=r1;        // src0--> pixels
        i1=r1;        // src1--> pixels + line_size
        r2+=-12;
        m2=r2;        // m2 = dest_size - 12
        r2=[fp+20];
        m3=r2;        // line_size
        p0=[fp+24];   // h
        r2+=-16;
        i1+=m3;       /* src1 + line_size */
        m0=r2;        /* line_size - 16 */

        B0 = I0;
        B1 = I1;
        B3 = I3;

        DISALGNEXCPT                       || R0 = [I0++] || R2  =[I1++];

        LSETUP(LS$16E,LE$16E) LC0=P0;
LS$16E: DISALGNEXCPT                       || R1 = [I0++] || R3  =[I1++];
        R4 = BYTEOP2P (R3:2,R1:0) (RNDL)   || R0 = [I0++] || R2  =[I1++];
        R5 = BYTEOP2P (R3:2,R1:0) (RNDL,R) || R1 = [I0++] || [I3++] = R4 ;
        DISALGNEXCPT                       || R3 = [I1++] || [I3++] = R5;
        R4 = BYTEOP2P (R3:2,R1:0) (RNDL)   || R0 = [I0++M0]|| R2  = [I1++M0];
        R5 = BYTEOP2P (R3:2,R1:0) (RNDL,R) || R0 = [I0++] || [I3++] = R4 ;
LE$16E: DISALGNEXCPT                       || R2 = [I1++] || [I3++M2] = R5;

        M1 = 1;
        I3 = B3;      // rewind destination and sources for the second pass
        I1 = B1;
        I0 = B0;

        I0 += M1;     // second pass starts one byte further into the sources
        I1 += M1;

        DISALGNEXCPT                       || R0 = [I0++] || R2  =[I1++];
        LSETUP(LS$16O,LE$16O) LC0=P0;
LS$16O: DISALGNEXCPT                       || R1 = [I0++] || R3  =[I1++];
        R4 = BYTEOP2P (R3:2,R1:0) (RNDH)   || R0 = [I0++] || R2  =[I1++];
        R5 = BYTEOP2P (R3:2,R1:0) (RNDH,R) || R1 = [I0++] || R6  =[I3++];
        R4 = R4 +|+ R6                       || R7 = [I3--];
        R5 = R5 +|+ R7                       || [I3++] = R4;
        DISALGNEXCPT                       || R3  =[I1++] || [I3++] = R5;
        R4 = BYTEOP2P (R3:2,R1:0) (RNDH)   || R0 = [I0++M0]|| R2  = [I1++M0];
        R5 = BYTEOP2P (R3:2,R1:0) (RNDH,R) || R0 = [I0++] || R6 = [I3++];
        R4 = R4 +|+ R6                       || R7 = [I3--];
        R5 = R5 +|+ R7                       || [I3++] = R4;
LE$16O: DISALGNEXCPT                       || R2 = [I1++] || [I3++M2] = R5;

        (r7:4) = [sp++];
        unlink;
        rts;
DEFUN_END(z_put_pixels16_xy2)
285
/*
 * put_pixels16_xy2_nornd (block, s0, line_size, h)
 *
 * 16-byte-wide x+y half-pel interpolation without rounding: BYTEOP2P is
 * used in its truncating TL / TH forms.  Source and destination share one
 * stride.  I0 walks `s0`, I1 walks `s0 + line_size`.
 *
 *   R0 = block            -> I3 (destination), also kept in B3
 *   R1 = s0               -> I0 / I1, also kept in B0 / B1
 *   R2 = line_size        -> M3 = line_size
 *                            M2 = line_size - 12 (destination row advance)
 *                            M0 = line_size - 16 (source row advance)
 *   [FP+20] = h           -> loop count for both passes
 *
 * Pass 1 (TL) stores the low-byte results; pass 2 (TH) restarts from the
 * saved pointers with the sources advanced by one byte and merges its
 * high-byte results (+|+) into the words written by pass 1.
 *
 * R7:4 are saved and restored.
 */
DEFUN(put_pixels16_xy2_nornd,mL1,
        (uint8_t *block, const uint8_t *s0,
                 int line_size, int h)):
        link 0;
        [--sp] = (r7:4);
        i3=r0;        // dest
        i0=r1;        // src0--> pixels
        i1=r1;        // src1--> pixels + line_size
        m3=r2;        // line_size
        r2+=-12;
        m2=r2;        // line_size - 12
        r2+=-4;
        i1+=m3;       /* src1 + line_size */
        m0=r2;        /* line_size - 16 */
        p0=[fp+20];   // h

        B0=I0;
        B1=I1;
        B3=I3;

        DISALGNEXCPT                       || R0 = [I0++] || R2  =[I1++];

        LSETUP(LS$16ET,LE$16ET) LC0=P0;
LS$16ET:DISALGNEXCPT                       || R1 = [I0++] || R3  =[I1++];
        R4 = BYTEOP2P (R3:2,R1:0) (TL)     || R0 = [I0++] || R2  =[I1++];
        R5 = BYTEOP2P (R3:2,R1:0) (TL,R)   || R1 = [I0++] || [I3++] = R4 ;
        DISALGNEXCPT                       || R3 = [I1++] || [I3++] = R5;
        R4 = BYTEOP2P (R3:2,R1:0) (TL)     || R0 = [I0++M0]|| R2  = [I1++M0];
        R5 = BYTEOP2P (R3:2,R1:0) (TL,R)   || R0 = [I0++] || [I3++] = R4 ;
LE$16ET:DISALGNEXCPT                       || R2 = [I1++] || [I3++M2] = R5;

        M1 = 1;
        I3=B3;        // rewind destination and sources for the second pass
        I1=B1;
        I0=B0;

        I0 += M1;     // second pass starts one byte further into the sources
        I1 += M1;

        DISALGNEXCPT                       || R0 = [I0++] || R2  =[I1++];
        LSETUP(LS$16OT,LE$16OT) LC0=P0;
LS$16OT:DISALGNEXCPT                       || R1 = [I0++] || R3  =[I1++];
        R4 = BYTEOP2P (R3:2,R1:0) (TH)     || R0 = [I0++] || R2  =[I1++];
        R5 = BYTEOP2P (R3:2,R1:0) (TH,R)   || R1 = [I0++] || R6  =[I3++];
        R4 = R4 +|+ R6                                    || R7 = [I3--];
        R5 = R5 +|+ R7                                    || [I3++] = R4;
        DISALGNEXCPT                       || R3  =[I1++] || [I3++] = R5;
        R4 = BYTEOP2P (R3:2,R1:0) (TH)     || R0 = [I0++M0]|| R2  = [I1++M0];
        R5 = BYTEOP2P (R3:2,R1:0) (TH,R)   || R0 = [I0++] || R6 = [I3++];
        R4 = R4 +|+ R6                                    || R7 = [I3--];
        R5 = R5 +|+ R7                                    || [I3++] = R4;
LE$16OT:DISALGNEXCPT                       || R2 = [I1++] || [I3++M2] = R5;

        (r7:4) = [sp++];
        unlink;
        rts;
DEFUN_END(put_pixels16_xy2_nornd)
343
/*
 * z_put_pixels8_xy2 (block, s0, dest_size, line_size, h)
 *
 * 8-byte-wide x+y half-pel interpolation with rounding, built on BYTEOP2P.
 * I0 walks `s0`, I1 walks `s0 + line_size`.
 *
 *   R0 = block            -> I3 (destination), also kept in B3
 *   R1 = s0               -> I0 / I1, also kept in B0 / B1
 *   R2 = dest_size        -> M2 = dest_size - 4 (destination row advance)
 *   [FP+20] = line_size   -> M3 = line_size, M0 = line_size - 8
 *   [FP+24] = h           -> loop count for both passes
 *
 * Pass 1 (RNDL) stores the low-byte results; pass 2 (RNDH) restarts from
 * the saved pointers with the sources advanced by one byte and merges its
 * high-byte results (+|+) into the words written by pass 1.
 *
 * R7:4 are saved and restored.
 */
DEFUN(z_put_pixels8_xy2,mL1,
        (uint8_t *block, const uint8_t *s0,
                 int dest_size, int line_size, int h)):
        link 0;
        [--sp] = (r7:4);
        i3=r0;        // dest
        i0=r1;        // src0--> pixels
        i1=r1;        // src1--> pixels + line_size
        r2+=-4;
        m2=r2;        // m2 = dest_size - 4
        r2=[fp+20];
        m3=r2;        // line_size
        p0=[fp+24];   // h
        r2+=-8;
        i1+=m3;       /* src1 + line_size */
        m0=r2;        /* line_size - 8 */

        b0 = I0;
        b1 = I1;
        b3 = I3;

        LSETUP(LS$8E,LE$8E) LC0=P0;
        DISALGNEXCPT                       || R0 = [I0++]   || R2  =[I1++];
LS$8E:  DISALGNEXCPT                       || R1 = [I0++]   || R3  =[I1++];
        R4 = BYTEOP2P (R3:2,R1:0) (RNDL)   || R0 = [I0++M0] || R2  =[I1++M0];
        R5 = BYTEOP2P (R3:2,R1:0) (RNDL,R) || R0 = [I0++]   || [I3++] = R4 ;
LE$8E:  DISALGNEXCPT                       || R2 = [I1++]   || [I3++M2] = R5;

        M1 = 1;
        I3 = b3;      // rewind destination and sources for the second pass
        I1 = b1;
        I0 = b0;

        I0 += M1;     // second pass starts one byte further into the sources
        I1 += M1;

        LSETUP(LS$8O,LE$8O) LC0=P0;
        DISALGNEXCPT                       || R0 = [I0++]   || R2  =[I1++];
LS$8O:  DISALGNEXCPT                       || R1 = [I0++]   || R3  =[I1++];
        R4 = BYTEOP2P (R3:2,R1:0) (RNDH)   || R0 = [I0++M0] || R2  =[I1++M0];
        R5 = BYTEOP2P (R3:2,R1:0) (RNDH,R) || R0 = [I0++]   || R6  =[I3++];
        R4 = R4 +|+ R6                                      || R7 = [I3--];
        R5 = R5 +|+ R7                                      || [I3++] = R4;
LE$8O:  DISALGNEXCPT                       || R2  =[I1++]   || [I3++M2] = R5;

        (r7:4) = [sp++];
        unlink;
        rts;
DEFUN_END(z_put_pixels8_xy2)
393
/*
 * put_pixels8_xy2_nornd (block, s0, line_size, h)
 *
 * 8-byte-wide x+y half-pel interpolation without rounding: BYTEOP2P is
 * used in its truncating TL / TH forms.  Source and destination share one
 * stride.  I0 walks `s0`, I1 walks `s0 + line_size`.
 *
 *   R0 = block            -> I3 (destination), also kept in B3
 *   R1 = s0               -> I0 / I1, also kept in B0 / B1
 *   R2 = line_size        -> M3 = line_size
 *                            M2 = line_size - 4 (destination row advance)
 *                            M0 = line_size - 8 (source row advance)
 *   [FP+20] = h           -> loop count for both passes
 *
 * Pass 1 (TL) stores the low-byte results; pass 2 (TH) restarts from the
 * saved pointers with the sources advanced by one byte and merges its
 * high-byte results (+|+) into the words written by pass 1.
 *
 * R7:4 are saved and restored.
 */
DEFUN(put_pixels8_xy2_nornd,mL1,
        (uint8_t *block, const uint8_t *s0, int line_size, int h)):
        link 0;
        [--sp] = (r7:4);
        i3=r0;        // dest
        i0=r1;        // src0--> pixels
        i1=r1;        // src1--> pixels + line_size
        m3=r2;        // line_size
        r2+=-4;
        m2=r2;        // line_size - 4
        r2+=-4;
        i1+=m3;       /* src1 + line_size */
        m0=r2;        /* line_size - 8 */
        p0=[fp+20];   // h


        b0 = I0;
        b1 = I1;
        b3 = I3;

        LSETUP(LS$8ET,LE$8ET) LC0=P0;
        DISALGNEXCPT                       || R0 = [I0++]   || R2  =[I1++];

LS$8ET: DISALGNEXCPT                       || R1 = [I0++]   || R3 = [I1++];
        R4 = BYTEOP2P (R3:2,R1:0) (TL)     || R0 = [I0++M0] || R2 = [I1++M0];
        R5 = BYTEOP2P (R3:2,R1:0) (TL,R)   || R0 = [I0++]   || [I3++] = R4 ;
LE$8ET: DISALGNEXCPT                       || R2 = [I1++]   || [I3++M2] = R5;

        M1 = 1;
        I3 = b3;      // rewind destination and sources for the second pass
        I1 = b1;
        I0 = b0;

        I0 += M1;     // second pass starts one byte further into the sources
        I1 += M1;

        LSETUP(LS$8OT,LE$8OT) LC0=P0;
        DISALGNEXCPT                       || R0 = [I0++]   || R2 = [I1++];

LS$8OT: DISALGNEXCPT                       || R1 = [I0++]   || R3 = [I1++];
        R4 = BYTEOP2P (R3:2,R1:0) (TH)     || R0 = [I0++M0] || R2 = [I1++M0];
        R5 = BYTEOP2P (R3:2,R1:0) (TH,R)   || R0 = [I0++]   || R6 = [I3++];
        R4 = R4 +|+ R6                                      || R7 = [I3--];
        R5 = R5 +|+ R7                                      || [I3++] = R4;
LE$8OT: DISALGNEXCPT                       || R2  =[I1++]   || [I3++M2] = R5;

        (r7:4) = [sp++];
        unlink;
        rts;
DEFUN_END(put_pixels8_xy2_nornd)

/*
 * diff_pixels (block, s1, s2, stride)
 *
 * Writes the byte-wise differences produced by BYTEOP16M for an 8x8 area
 * of `s1` and `s2` to `block` as 16-bit values, 8 values (four 32-bit
 * stores) per row.
 *
 *   R0 = block          -> I3 (destination, written contiguously)
 *   R1 = s1             -> I0
 *   R2 = s2             -> I1
 *   [FP+20] = stride    -> M0 = stride - 8 (source row advance)
 *
 * The loop count is fixed at 8 rows.  Sources may be unaligned
 * (DISALGNEXCPT + byte alignment taken from I0/I1).
 * R7:4 are saved and restored.
 */
DEFUN(diff_pixels,mL1,
       (DCTELEM *block, uint8_t *s1, uint8_t *s2, int stride)):
        link 0;
        [--sp] = (r7:4);
        p0=8;
        i3=r0;        // block
        i0=r1;        // s1
        i1=r2;        // s2
        r2=[fp+20];   // stride
        r2+=-8;
        m0=r2;        // stride - 8


        LSETUP(.LS0,.LE0) LC0=P0;
        DISALGNEXCPT                       || R0 = [I0++]   || R2  =[I1++];

.LS0:   DISALGNEXCPT                       || R1 = [I0++]   || R3 = [I1++];
        (R5,R4) = BYTEOP16M (R1:0,R3:2)    || R0 = [I0++M0] || R2 = [I1++M0];
        (R7,R6) = BYTEOP16M (R1:0,R3:2) (R)|| R0 = [I0++]   || [I3++] = R4;
        DISALGNEXCPT                       || R2 = [I1++]   || [I3++] = R5;
        [i3++]=r6;
.LE0:  [i3++]=r7;

        (r7:4) = [sp++];
        unlink;
        rts;
DEFUN_END(diff_pixels)
471
/*
 * pix_sum (p, stride)
 *
 * Returns in R0 the sum of a 16x16 block of bytes, i.e.
 *
 *    for (i = 0; i < 16; i++) {
 *        for (j = 0; j < 16; j++) {
 *          sum += pix[j];
 *        }
 *        pix += line_size;
 *    }
 *
 *   R0 = p        -> I0 (row n) and I1 (row n + 1)
 *   R1 = stride   -> M1 = stride, M0 = 2 * stride - 16
 *
 * Two rows are handled per iteration (8 iterations): BYTEOP16P adds the
 * bytes of the I0 row to the bytes of the I1 row, and the four 16-bit
 * sums are accumulated as two halfword lanes in R6.  The two lanes are
 * added at the end and the upper half of R0 is cleared, so the result is
 * a 16-bit quantity.
 *
 * R7:4 are saved and restored.
 */
DEFUN(pix_sum,mL1,
        (uint8_t *p, int stride)):
        link 0;
        [--sp] = (r7:4);
        p0=8;
        i0=r0;        // s1
        i1=r0;
        m1=r1;        // stride
        r1=r1+r1;
        r1+=-16;      // 2 * stride - 16
        m0=r1;
        i1+=m1;       // I1 = p + stride

        r6=0;

        LSETUP(LS$PS,LE$PS) LC0=P0;
        DISALGNEXCPT                       || R0 = [I0++]   || R2  =[I1++];

LS$PS:  DISALGNEXCPT                       || R1 = [I0++]   || R3 = [I1++];
        (R5,R4) = BYTEOP16P (R3:2,R1:0)    || R0 = [I0++]   || R2 = [I1++];
        r6=r6+|+r5;
        r6=r6+|+r4;
        (R5,R4) = BYTEOP16P (R3:2,R1:0) (R)|| R1 = [I0++]   || R3 = [I1++];
        r6=r6+|+r5;
        r6=r6+|+r4;
        (R5,R4) = BYTEOP16P (R3:2,R1:0)    || R0 = [I0++m0] || R2 = [I1++m0];
        r6=r6+|+r5;
        r6=r6+|+r4;
        (R5,R4) = BYTEOP16P (R3:2,R1:0) (R)|| R0 = [I0++]   || R2 = [I1++];
        r6=r6+|+r5;
LE$PS:  r6=r6+|+r4;
        r0.l=r6.l+r6.h;   // fold the two halfword lanes
        r0.h=0;

        (r7:4) = [sp++];
        unlink;
        rts;
DEFUN_END(pix_sum)
518
519
/*
 * get_pixels (block, pixels, line_size)
 *
 * Expands an 8x8 area of bytes at `pixels` into 16-bit values stored
 * contiguously at `block` (8 values, i.e. four 32-bit stores, per row).
 *
 *   R0 = block       -> I3 (destination)
 *   R1 = pixels      -> I0 (source)
 *   R2 = line_size   -> M0 = line_size - 8 (source row advance)
 *
 * The loop count is fixed at 8 rows.  The source may be unaligned:
 * DISALGNEXCPT suppresses the alignment exception and BYTEUNPACK selects
 * the bytes from R1:0 according to I0.  The two priming loads sit between
 * LSETUP and the loop start label and execute once before the loop body.
 *
 * R7:4 are saved and restored.
 */
DEFUN(get_pixels,mL1,
        (DCTELEM *restrict block, const uint8_t *pixels, int line_size)):
        [--sp] = (r7:4);
        i3=r0;        // dest
        i0=r1;        // src0
        p0=8;
        r2+=-8;
        m0=r2;        // line_size - 8
        LSETUP(gp8$0,gp8$1) LC0=P0;

        DISALGNEXCPT                   || R0 = [I0++];
        DISALGNEXCPT                   || R1 = [I0++];

gp8$0:  (R7,R6) = byteunpack R1:0      || R0 = [I0++M0];
        (R5,R4) = byteunpack R1:0 (R)  || R0 = [I0++]    || [I3++]=R6;
        DISALGNEXCPT                   || R1 = [I0++]    || [I3++]=R7;
        [I3++]=R4;
gp8$1:  [I3++]=R5;


        (r7:4) = [sp++];
        RTS;
DEFUN_END(get_pixels)
543
544
/* sad = sad16x16 (ubyte *mb, ubyte *refwin, srcwidth, refwinwidth, h) */
/* 91 cycles */
/*
 * z_sad16x16 (blk1, blk2, dsz, line_size, h)
 *
 * Returns in R0 the sum of absolute differences between h rows of 16
 * bytes at `blk1` and `blk2`.
 *
 *   R0 = blk1            -> I0
 *   R1 = blk2            -> I1
 *   R2 = dsz             -> M0 = dsz - 16       (row advance for blk1)
 *   [SP+20] = line_size  -> M1 = line_size - 16 (row advance for blk2)
 *   [SP+24] = h          -> loop count
 *
 * The stack offsets are taken after `link 0`, which has pushed two words.
 * SAA accumulates into the four halfword lanes of A0 and A1; the lanes are
 * summed after the loop.  No callee-saved data registers are used.
 */
DEFUN(z_sad16x16,mL1,
        (uint8_t *blk1, uint8_t *blk2, int dsz, int line_size, int h)):
        link 0;
        I0 = R0;
        I1 = R1;

        A1 = A0 = 0;
        R0 = [sp+20]; // rwidth
        P2 = [sp+24]; // height
        R3 = 16;
        R0 = R0 - R3;
        R3 = R2 - R3;
        M1 = R0;      // line_size - 16
        M0 = R3;      // dsz - 16

        DISALGNEXCPT         || R0 = [I0++]    || R2 = [I1++];
        LSETUP (s$16, e$16) LC0=P2;
s$16:   DISALGNEXCPT         || R1 = [I0++]    || R3 = [I1++];
        SAA (R1:0,R3:2)      || R0 = [I0++]    || R2 = [I1++];
        SAA (R1:0,R3:2) (R)  || R1 = [I0++]    || R3 = [I1++];
        SAA (R1:0,R3:2)      || R0 = [I0++M0]  || R2 = [I1++M1];
e$16:   SAA (R1:0,R3:2) (R)  || R0 = [I0++]    || R2 = [I1++];

        R3=A1.L+A1.H,  R2=A0.L+A0.H ;   // fold the four accumulator lanes
        R0 = R2 + R3 ;
        unlink;
        RTS;
DEFUN_END(z_sad16x16)
575
/* sad = sad8x8 (ubyte *mb, ubyte *refwin, int srcwidth, int refwinwidth, int h) */
/* 36 cycles */
/*
 * z_sad8x8 (blk1, blk2, dsz, line_size, h)
 *
 * Returns in R0 the sum of absolute differences between h rows of 8 bytes
 * at `blk1` and `blk2`.
 *
 *   R0 = blk1            -> I0
 *   R1 = blk2            -> I1
 *   R2 = dsz             -> M0 = dsz - 8       (row advance for blk1)
 *   [SP+12] = line_size  -> M1 = line_size - 8 (row advance for blk2)
 *   [SP+16] = h          -> loop count
 *
 * No stack frame is created, so the stack offsets are relative to the
 * entry value of SP.  The two priming loads sit between LSETUP and the
 * loop start label and execute once before the loop body.
 * SAA accumulates into the four halfword lanes of A0 and A1; the lanes are
 * summed after the loop.
 */
DEFUN(z_sad8x8,mL1,
        (uint8_t *blk1, uint8_t *blk2, int dsz, int line_size, int h)):
        I0 = R0;
        I1 = R1;

        A1 = A0 = 0;
        r0 = [sp+12]; // rwidth
        P2 = [sp+16]; //height
        R3 = 8;
        R0 = R0 - R3;
        R3 = R2 - R3;
        M0 = R3;      // dsz - 8
        M1 = R0;      // line_size - 8

        LSETUP (s$8, e$8) LC0=P2;
        DISALGNEXCPT         || R0 = [I0++]   || R2 = [I1++];
        DISALGNEXCPT         || R1 = [I0++]   || R3 = [I1++];
s$8:    SAA (R1:0,R3:2)      || R0 = [I0++M0] || R2 = [I1++M1];
        SAA (R1:0,R3:2) (R)  || R0 = [I0++]   || R2 = [I1++];
e$8:    DISALGNEXCPT         || R1 = [I0++]   || R3 = [I1++];

        R3=A1.L+A1.H,  R2=A0.L+A0.H ;   // fold the four accumulator lanes
        R0 = R2 + R3 ;
        RTS;
DEFUN_END(z_sad8x8)
603
/*
 * pix_norm1 (pix, line_size)
 *
 * Returns in R0 the sum of the squares of the bytes of a 16x16 block.
 *
 *   R0 = pix         -> P1 -> I0 (source)
 *   R1 = line_size   -> M0 = line_size - 16 (source row advance)
 *
 * The hardware loop (LC1) runs 16 times, one row per iteration.  Each
 * BYTEUNPACK expands four bytes into four halfwords, which are squared and
 * accumulated into A0 / A1; the two accumulators are added with saturation
 * at the end.
 *
 * R7:4 and P5:3 are saved and restored.
 */
DEFUN(pix_norm1,mL1,
        (uint8_t * pix, int line_size)):
        [--SP]=(R7:4,P5:3);

        // Fetch the input arguments.
        P1 = R0;  // pix
        P0 = R1;  // line_size
        P5 = 16;  // loop ctr.
        P0 -= P5;
        M0 = P0;  // M0 = line_size-16;
        // Now for the real work.
        A1 = A0 = 0;
        lsetup(_pix_norm1_blkfn_loopStart, _pix_norm1_blkfn_loopEnd) LC1 = P5;
        I0 = P1;
        DISALGNEXCPT || r0 = [i0++];

_pix_norm1_blkfn_loopStart:
        // unpack and square the 16 bytes of the current row, four at a time
        DISALGNEXCPT || r1 = [i0++];

        (r5, r4) = byteunpack r1:0 || r0 = [i0++];
        a1 += r5.h * r5.h, a0 += r5.l * r5.l (is);
        a1 += r4.h * r4.h, a0 += r4.l * r4.l (is);
        (r5, r4) = byteunpack r1:0(r) || r1 = [i0++];
        a1 += r5.h * r5.h, a0 += r5.l * r5.l (is);
        a1 += r4.h * r4.h, a0 += r4.l * r4.l (is);
        (r5, r4) = byteunpack r1:0 || r0 = [i0++M0];
        a1 += r5.h * r5.h, a0 += r5.l * r5.l (is);
        a1 += r4.h * r4.h, a0 += r4.l * r4.l (is);
        (r5, r4) = byteunpack r1:0(r) || r0 = [i0++];
        a1 += r5.h * r5.h, a0 += r5.l * r5.l (is);
_pix_norm1_blkfn_loopEnd:
        a1 += r4.h * r4.h, a0 += r4.l * r4.l (is);


// Clean up at the end:
        R2 = A0, R3 = A1;
        R0 = R2 + R3 (S);

        (R7:4,P5:3)=[SP++];

        RTS;
DEFUN_END(pix_norm1)
647
/*
 * sse4 (v, pix1, pix2, line_size, h)
 *
 * Returns in R0 the sum of squared differences between h rows of 4 bytes
 * at `pix1` and `pix2`.  The first argument (R0 on entry) is not used.
 *
 *   R1 = pix1            -> I0
 *   R2 = pix2            -> I1
 *   [FP+20] = line_size  -> M0 = line_size - 4 (row advance)
 *   [FP+24] = h          -> loop count
 *
 * Sources may be unaligned, so each row needs two word loads per source:
 * the first word (R0 / R2) and the following word (R1 / R3).  The second
 * load advances the pointer to the start of the next row (++M0).  The
 * first word of the next row is then fetched in parallel with the last
 * MAC of the loop; without that reload every row after the first would
 * pair a stale first word with the new second word and the pointers would
 * fall 4 bytes further behind on each row.
 *
 * R7:6 are saved and restored.
 */
DEFUN(sse4,mL1,
        (void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)):
        link 0;
        [--sp] = (r7:6);
        p0=[fp+24];   // h
        i0=r1;        // pix1
        i1=r2;        // pix2
        r2=[fp+20];   // line_size
        r2+=-4;
        m0=r2;        // line_size - 4

        a0=a1=0;
        LSETUP(.S40,.E40) LC0=P0;
        DISALGNEXCPT                       || R0 = [I0++]   || R2  =[I1++];

.S40:   DISALGNEXCPT                       || R1 = [I0++M0] || R3 = [I1++M0];
        (R7,R6) = BYTEOP16M (R1:0,R3:2);
        a0 += r7.l * r7.l, a1 += r7.h * r7.h (is);
.E40:   a0 += r6.l * r6.l, a1 += r6.h * r6.h (is) || R0 = [I0++]   || R2  =[I1++];
        a0 += a1;
        r0 = a0;

        (r7:6) = [sp++];
        unlink;
        rts;
DEFUN_END(sse4)
674
/*
 * sse8 (v, pix1, pix2, line_size, h)
 *
 * Returns in R0 the sum of squared differences between h rows of 8 bytes
 * at `pix1` and `pix2`.  The first argument (R0 on entry) is not used.
 *
 *   R1 = pix1            -> I0
 *   R2 = pix2            -> I1
 *   [FP+20] = line_size  -> M0 = line_size - 8 (row advance)
 *   [FP+24] = h          -> loop count
 *
 * BYTEOP16M yields four 16-bit differences per call; each is squared and
 * accumulated into A0 / A1, which are added together after the loop.
 * Sources may be unaligned (DISALGNEXCPT + byte alignment from I0/I1).
 *
 * R7:6 are saved and restored.
 */
DEFUN(sse8,mL1,
        (void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)):
        link 0;
        [--sp] = (r7:6);
        p0=[fp+24];   // h
        i0=r1;        // pix1
        i1=r2;        // pix2
        r2=[fp+20];   // line_size
        r2+=-8;
        m0=r2;        // line_size - 8

        a0=a1=0;
        LSETUP(.S80,.E80) LC0=P0;
        DISALGNEXCPT                       || R0 = [I0++]   || R2  =[I1++];

.S80:   DISALGNEXCPT                       || R1 = [I0++]   || R3 = [I1++];
        (R7,R6) = BYTEOP16M (R1:0,R3:2)    || R0 = [I0++M0] || R2 = [I1++M0];
        a0 += r7.l * r7.l, a1 += r7.h * r7.h (is);
        a0 += r6.l * r6.l, a1 += r6.h * r6.h (is);
        (R7,R6) = BYTEOP16M (R1:0,R3:2) (R)|| R0 = [I0++]   || R2 = [I1++];
        a0 += r7.l * r7.l, a1 += r7.h * r7.h (is);
.E80:   a0 += r6.l * r6.l, a1 += r6.h * r6.h (is);
        a0 += a1;
        r0 = a0;

        (r7:6) = [sp++];
        unlink;
        rts;
DEFUN_END(sse8)
704
/*
 * sse16 (v, pix1, pix2, line_size, h)
 *
 * Returns in R0 the sum of squared differences between h rows of 16 bytes
 * at `pix1` and `pix2`.  The first argument (R0 on entry) is not used.
 *
 *   R1 = pix1            -> I0
 *   R2 = pix2            -> I1
 *   [FP+20] = line_size  -> M0 = line_size - 16 (row advance)
 *   [FP+24] = h          -> loop count
 *
 * BYTEOP16M yields four 16-bit differences per call (four calls per row);
 * each difference is squared and accumulated into A0 / A1, which are added
 * together after the loop.  Sources may be unaligned (DISALGNEXCPT + byte
 * alignment from I0/I1).
 *
 * R7:6 are saved and restored.
 */
DEFUN(sse16,mL1,
        (void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)):
        link 0;
        [--sp] = (r7:6);
        p0=[fp+24];   // h
        i0=r1;        // pix1
        i1=r2;        // pix2
        r2=[fp+20];   // line_size
        r2+=-16;
        m0=r2;        // line_size - 16

        a0=a1=0;
        DISALGNEXCPT                       || R0 = [I0++]   || R2  =[I1++];
        LSETUP(.S160,.E160) LC0=P0;

.S160:  DISALGNEXCPT                       || R1 = [I0++]   || R3 = [I1++];
        (R7,R6) = BYTEOP16M (R1:0,R3:2)    || R0 = [I0++]   || R2 = [I1++];
        a0 += r7.l * r7.l, a1 += r7.h * r7.h (is);
        a0 += r6.l * r6.l, a1 += r6.h * r6.h (is);
        (R7,R6) = BYTEOP16M (R1:0,R3:2) (R)|| R1 = [I0++]   || R3 = [I1++];
        a0 += r7.l * r7.l, a1 += r7.h * r7.h (is);
        a0 += r6.l * r6.l, a1 += r6.h * r6.h (is);
        (R7,R6) = BYTEOP16M (R1:0,R3:2)    || R0 = [I0++M0] || R2 = [I1++M0];
        a0 += r7.l * r7.l, a1 += r7.h * r7.h (is);
        a0 += r6.l * r6.l, a1 += r6.h * r6.h (is);
        (R7,R6) = BYTEOP16M (R1:0,R3:2) (R)|| R0 = [I0++]   || R2 = [I1++];
        a0 += r7.l * r7.l, a1 += r7.h * r7.h (is);
.E160:  a0 += r6.l * r6.l, a1 += r6.h * r6.h (is);
        a0 += a1;
        r0 = a0;

        (r7:6) = [sp++];
        unlink;
        rts;
DEFUN_END(sse16)
740
741
742