/*
 * Copyright (C) 2007 Marc Hoffman <marc.hoffman@analog.com>
 *                    April 20, 2007
 *
 * Blackfin video color space converter operations
 * convert I420 YV12 to RGB in various formats
 *
 * This file is part of Libav.
 *
 * Libav is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * Libav is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with Libav; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */


/*
YUV420 to RGB565 conversion. This routine takes a YUV 420 planar macroblock
and converts it to RGB565. R:5 bits, G:6 bits, B:5 bits, packed into shorts.


The following calculation is used for the conversion:

  r = clipz((y-oy)*cy  + crv*(v-128))
  g = clipz((y-oy)*cy  + cgv*(v-128) + cgu*(u-128))
  b = clipz((y-oy)*cy  + cbu*(u-128))

y,u,v are prescaled by a factor of 4, i.e. left-shifted to gain precision.


New factorization to eliminate the truncation error which was
occurring due to the byteop3p.


1) Use byteop16m to subtract quad bytes. It operates on U8 data, so
   the offsets need to be renormalized to 8 bits.

2) Scale operands up by a factor of 4, not 8, because Blackfin
   multiplies include a shift.

3) Compute into the accumulators cy*yx0, cy*yx1.

4) Compute each of the linear equations:
     r = clipz((y - oy) * cy  + crv * (v - 128))

     g = clipz((y - oy) * cy  + cgv * (v - 128) + cgu * (u - 128))

     b = clipz((y - oy) * cy  + cbu * (u - 128))

   Reuse of the accumulators requires that we actually multiply
   twice: once with an addition and a second time with a subtraction.

   Because of this we need to compute the equations in the order R, B,
   then G, saving the writes for B in the case of 24/32 bit color
   formats.

   API: yuv2rgb_kind (uint8_t *Y, uint8_t *U, uint8_t *V, int *out,
                      int dW, uint32_t *coeffs);

       A          B
       ---        ---
       i2 = cb    i3 = cr
       i1 = coeff i0 = y

Where coeffs have the following layout in memory (11 words, see COEFF_LEN).

uint32_t oy,oc,zero,cy,crv,rmask,cbu,bmask,cgu,cgv,gmask;

coeffs is a pointer to oy.

The {rgb} masks are only utilized by the 565 and 555 packing algorithms. Note
the data replication is used to simplify the internal algorithms for the dual
MAC architecture of Blackfin.

All routines are exported with _ff_bfin_ as a symbol prefix.

Rough performance gain compared against -O3:

2779809/1484290 187.28%

which translates to ~33c/pel to ~57c/pel for the reference vs 17.5
c/pel for the optimized implementations. Not sure why there is such a
huge variation in the reference code on Blackfin; I guess it must have
to do with the memory system.
*/

/*
 * Section selection.
 *   mL3 - the ordinary .text section.
 *   mL1 - .l1.text when building FDPIC with CONFIG_SRAM enabled,
 *         otherwise an alias for mL3.
 *   MEM - section passed to DEFUN for the yuv2rgb*_line routines.
 */
#define mL3 .text
#if defined(__FDPIC__) && CONFIG_SRAM
#define mL1 .l1.text
#else
#define mL1 mL3
#endif
#define MEM mL1

/*
 * DEFUN(fname, where, interface)
 *   Switches to section `where`, exports _ff_bfin_<fname>, marks it as a
 *   function, aligns it with ".align 8" and emits the symbol name. The
 *   invocation site supplies the trailing ':' that turns it into a label.
 *   `interface` is not expanded; it only documents the C prototype next to
 *   the definition.
 */
#define DEFUN(fname,where,interface) \
        .section where;              \
        .global _ff_bfin_ ## fname;  \
        .type _ff_bfin_ ## fname, STT_FUNC; \
        .align 8;                    \
        _ff_bfin_ ## fname

/*
 * DEFUN_END(fname)
 *   Records the size of _ff_bfin_<fname> as the distance from its label to
 *   the current location.
 */
#define DEFUN_END(fname) \
        .size _ff_bfin_ ## fname, . - _ff_bfin_ ## fname


.text

/* Size in bytes of the coefficient table (11 32-bit words). Loaded into l1
 * so that i1 wraps around the table as a circular buffer. */
#define COEFF_LEN        11*4
/* Post-modify (in bytes) loaded into m0 and applied by the gmask load in the
 * first half of each loop iteration; the load that follows it is the cy word. */
#define COEFF_REL_CY_OFF 4*4

/* Offsets from fp (after "link 0") of the stack-passed arguments of the
 * yuv2rgb*_line routines: out, dW and coeffs. */
#define ARG_OUT   20
#define ARG_W     24
#define ARG_COEFF 28

/*
 * _ff_bfin_yuv2rgb565_line
 *
 * Converts one line of planar YUV 4:2:0 samples into 16-bit packed pixels
 * (5/6/5 bit fields, see the diagram inside the loop).
 *
 * In:   r0 = Y, r1 = U, r2 = V            (copied to i0, i2, i3)
 *       [fp+ARG_OUT]   = out              (p1)
 *       [fp+ARG_W]     = dW               (loop count is dW>>2)
 *       [fp+ARG_COEFF] = coeffs           (i1/b1, circular, l1 = COEFF_LEN)
 *
 * Each loop iteration consumes 4 Y bytes, 2 U bytes and 2 V bytes and stores
 * two 32-bit words (four 16-bit pixels) through p1.
 *
 * The input for the next iteration is loaded at the end of the current one,
 * including the last, so 4 Y / 2 U / 2 V bytes beyond the converted data are
 * read.
 *
 * r7:4 are saved and restored; l1 is reset to 0 before returning.
 *
 * NOTE(review): dW < 4 yields a loop count of 0 - confirm how LSETUP treats a
 * zero count on the target core.
 */
DEFUN(yuv2rgb565_line,MEM,
   (uint8_t *Y, uint8_t *U, uint8_t *V, int *out, int dW, uint32_t *coeffs)):
        link 0;
        [--sp] = (r7:4);
        p1 = [fp+ARG_OUT];
        r3 = [fp+ARG_W];

        i0 = r0;
        i2 = r1;
        i3 = r2;

        // i1 walks the coefficient table as a circular buffer of COEFF_LEN bytes
        r0 = [fp+ARG_COEFF];
        i1 = r0;
        b1 = i1;
        l1 = COEFF_LEN;
        m0 = COEFF_REL_CY_OFF;
        p0 = r3;

        // pre-load the first group of samples
        r0   = [i0++];         // 4Y
        r1.l = w[i2++];        // 2u
        r1.h = w[i3++];        // 2v
        p0 = p0>>2;            // 4 pixels per iteration

        lsetup (.L0565, .L1565) lc0 = p0;

        /*
           uint32_t oy,oc,zero,cy,crv,rmask,cbu,bmask,cgu,cgv,gmask
           r0 -- used to load 4ys
           r1 -- used to load 2us,2vs
           r4 -- y3,y2
           r5 -- y1,y0
           r6 -- u1,u0
           r7 -- v1,v0
        */
                                                              r2=[i1++]; // oy
.L0565:
        /*
        rrrrrrrr gggggggg bbbbbbbb
         5432109876543210
                    bbbbb >>3
              gggggggg    <<3
         rrrrrrrr         <<8
         rrrrrggggggbbbbb

        NOTE(review): below, the channel commented "R" (crv/rmask) is the one
        shifted >>3 and the channel commented "B" (cbu/bmask) is shifted <<8,
        the reverse of the diagram; presumably the caller arranges the
        coefficient and mask slots accordingly - confirm.
        */
        (r4,r5) = byteop16m (r1:0, r3:2)                   || r3=[i1++]; // oc
        (r7,r6) = byteop16m (r1:0, r3:2) (r);
        r5 = r5 << 2 (v);                                                // y1,y0
        r4 = r4 << 2 (v);                                                // y3,y2
        r6 = r6 << 2 (v)                                   || r0=[i1++]; // u1,u0, r0=zero
        r7 = r7 << 2 (v)                                   || r1=[i1++]; // v1,v0  r1=cy

        /* ---- pixels 0,1: y1,y0 with u0,v0 ---- */
        /* Y' = y*cy */
        a1 = r1.h*r5.h, a0 = r1.l*r5.l                     || r1=[i1++]; // crv

        /* R = Y+ crv*(Cr-128); the second MAC restores the accumulators to Y' */
        r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l);
                a1 -= r1.h*r7.l,          a0 -= r1.l*r7.l  || r5=[i1++]; // rmask
        r2 = byteop3p(r3:2, r1:0)(LO)                      || r1=[i1++]; // cbu
        r2 = r2 >> 3 (v);
        r3 = r2 & r5;

        /* B = Y+ cbu*(Cb-128); the second MAC restores the accumulators to Y' */
        r2.h = (a1 += r1.h*r6.l), r2.l = (a0 += r1.l*r6.l);
                a1 -= r1.h*r6.l,          a0 -= r1.l*r6.l  || r5=[i1++]; // bmask
        r2 = byteop3p(r3:2, r1:0)(LO)                      || r1=[i1++]; // cgu
        r2 = r2 << 8 (v);
        r2 = r2 & r5;
        r3 = r3 | r2;

        /* G = Y+ cgu*(Cb-128)+cgv*(Cr-128) */
                a1 += r1.h*r6.l,          a0 += r1.l*r6.l  || r1=[i1++]; // cgv
        r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l);
        r2 = byteop3p(r3:2, r1:0)(LO)                      || r5=[i1++m0]; // gmask, then step i1 to cy
        r2 = r2 << 3 (v);
        r2 = r2 & r5;
        r3 = r3 | r2;
        [p1++]=r3                                          || r1=[i1++]; // store 2 pixels, r1=cy

        /* ---- pixels 2,3: y3,y2 with u1,v1 ---- */
        /* Y' = y*cy */

        a1 = r1.h*r4.h, a0 = r1.l*r4.l                     || r1=[i1++]; // crv

        /* R = Y+ crv*(Cr-128) */
        r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h);
                a1 -= r1.h*r7.h,          a0 -= r1.l*r7.h  || r5=[i1++]; // rmask
        r2 = byteop3p(r3:2, r1:0)(LO)                      || r1=[i1++]; // cbu
        r2 = r2 >> 3 (v);
        r3 = r2 & r5;

        /* B = Y+ cbu*(Cb-128) */
        r2.h = (a1 += r1.h*r6.h), r2.l = (a0 += r1.l*r6.h);
                a1 -= r1.h*r6.h,          a0 -= r1.l*r6.h  || r5=[i1++]; // bmask
        r2 = byteop3p(r3:2, r1:0)(LO)                      || r1=[i1++]; // cgu
        r2 = r2 << 8 (v);
        r2 = r2 & r5;
        r3 = r3 | r2;

        /* G = Y+ cgu*(Cb-128)+cgv*(Cr-128) */
                a1 += r1.h*r6.h,          a0 += r1.l*r6.h  || r1=[i1++]; // cgv
        r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h) || r5=[i1++]; // gmask
        /* the loads for the next iteration are issued alongside the tail */
        r2 = byteop3p(r3:2, r1:0)(LO)                      || r0   =  [i0++];        // 4Y
        r2 = r2 << 3 (v)                                   || r1.l = w[i2++];        // 2u
        r2 = r2 & r5;
        r3 = r3 | r2;
        [p1++]=r3                                          || r1.h = w[i3++];        // 2v
.L1565:                                                       r2=[i1++]; // oy

        l1 = 0;                // leave circular addressing disabled for the caller

        (r7:4) = [sp++];
        unlink;
        rts;
DEFUN_END(yuv2rgb565_line)

/*
 * _ff_bfin_yuv2rgb555_line
 *
 * Converts one line of planar YUV 4:2:0 samples into 16-bit packed pixels
 * (5/5/5 bit fields, see the diagram inside the loop). Identical in structure
 * to the 565 routine; only the shift amounts differ (7 and 2 instead of 8
 * and 3).
 *
 * In:   r0 = Y, r1 = U, r2 = V            (copied to i0, i2, i3)
 *       [fp+ARG_OUT]   = out              (p1)
 *       [fp+ARG_W]     = dW               (loop count is dW>>2)
 *       [fp+ARG_COEFF] = coeffs           (i1/b1, circular, l1 = COEFF_LEN)
 *
 * Each loop iteration consumes 4 Y bytes, 2 U bytes and 2 V bytes and stores
 * two 32-bit words (four 16-bit pixels) through p1.
 *
 * The input for the next iteration is loaded at the end of the current one,
 * including the last, so 4 Y / 2 U / 2 V bytes beyond the converted data are
 * read.
 *
 * r7:4 are saved and restored; l1 is reset to 0 before returning.
 *
 * NOTE(review): dW < 4 yields a loop count of 0 - confirm how LSETUP treats a
 * zero count on the target core.
 */
DEFUN(yuv2rgb555_line,MEM,
   (uint8_t *Y, uint8_t *U, uint8_t *V, int *out, int dW, uint32_t *coeffs)):
        link 0;
        [--sp] = (r7:4);
        p1 = [fp+ARG_OUT];
        r3 = [fp+ARG_W];

        i0 = r0;
        i2 = r1;
        i3 = r2;

        // i1 walks the coefficient table as a circular buffer of COEFF_LEN bytes
        r0 = [fp+ARG_COEFF];
        i1 = r0;
        b1 = i1;
        l1 = COEFF_LEN;
        m0 = COEFF_REL_CY_OFF;
        p0 = r3;

        // pre-load the first group of samples
        r0   = [i0++];         // 4Y
        r1.l = w[i2++];        // 2u
        r1.h = w[i3++];        // 2v
        p0 = p0>>2;            // 4 pixels per iteration

        lsetup (.L0555, .L1555) lc0 = p0;

        /*
           uint32_t oy,oc,zero,cy,crv,rmask,cbu,bmask,cgu,cgv,gmask
           r0 -- used to load 4ys
           r1 -- used to load 2us,2vs
           r4 -- y3,y2
           r5 -- y1,y0
           r6 -- u1,u0
           r7 -- v1,v0
        */
                                                              r2=[i1++]; // oy
.L0555:
        /*
        rrrrrrrr gggggggg bbbbbbbb
         5432109876543210
                    bbbbb >>3
               gggggggg   <<2
          rrrrrrrr        <<7
         xrrrrrgggggbbbbb

        NOTE(review): below, the channel commented "R" (crv/rmask) is the one
        shifted >>3 and the channel commented "B" (cbu/bmask) is shifted <<7,
        the reverse of the diagram; presumably the caller arranges the
        coefficient and mask slots accordingly - confirm.
        */

        (r4,r5) = byteop16m (r1:0, r3:2)                   || r3=[i1++]; // oc
        (r7,r6) = byteop16m (r1:0, r3:2) (r);
        r5 = r5 << 2 (v);                                                // y1,y0
        r4 = r4 << 2 (v);                                                // y3,y2
        r6 = r6 << 2 (v)                                   || r0=[i1++]; // u1,u0, r0=zero
        r7 = r7 << 2 (v)                                   || r1=[i1++]; // v1,v0  r1=cy

        /* ---- pixels 0,1: y1,y0 with u0,v0 ---- */
        /* Y' = y*cy */
        a1 = r1.h*r5.h, a0 = r1.l*r5.l                     || r1=[i1++]; // crv

        /* R = Y+ crv*(Cr-128); the second MAC restores the accumulators to Y' */
        r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l);
                a1 -= r1.h*r7.l,          a0 -= r1.l*r7.l  || r5=[i1++]; // rmask
        r2 = byteop3p(r3:2, r1:0)(LO)                      || r1=[i1++]; // cbu
        r2 = r2 >> 3 (v);
        r3 = r2 & r5;

        /* B = Y+ cbu*(Cb-128); the second MAC restores the accumulators to Y' */
        r2.h = (a1 += r1.h*r6.l), r2.l = (a0 += r1.l*r6.l);
                a1 -= r1.h*r6.l,          a0 -= r1.l*r6.l  || r5=[i1++]; // bmask
        r2 = byteop3p(r3:2, r1:0)(LO)                      || r1=[i1++]; // cgu
        r2 = r2 << 7 (v);
        r2 = r2 & r5;
        r3 = r3 | r2;

        /* G = Y+ cgu*(Cb-128)+cgv*(Cr-128) */
                a1 += r1.h*r6.l,          a0 += r1.l*r6.l  || r1=[i1++]; // cgv
        r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l);
        r2 = byteop3p(r3:2, r1:0)(LO)                      || r5=[i1++m0]; // gmask, then step i1 to cy
        r2 = r2 << 2 (v);
        r2 = r2 & r5;
        r3 = r3 | r2;
        [p1++]=r3                                          || r1=[i1++]; // store 2 pixels, r1=cy

        /* ---- pixels 2,3: y3,y2 with u1,v1 ---- */
        /* Y' = y*cy */

        a1 = r1.h*r4.h, a0 = r1.l*r4.l                     || r1=[i1++]; // crv

        /* R = Y+ crv*(Cr-128) */
        r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h);
                a1 -= r1.h*r7.h,          a0 -= r1.l*r7.h  || r5=[i1++]; // rmask
        r2 = byteop3p(r3:2, r1:0)(LO)                      || r1=[i1++]; // cbu
        r2 = r2 >> 3 (v);
        r3 = r2 & r5;

        /* B = Y+ cbu*(Cb-128) */
        r2.h = (a1 += r1.h*r6.h), r2.l = (a0 += r1.l*r6.h);
                a1 -= r1.h*r6.h,          a0 -= r1.l*r6.h  || r5=[i1++]; // bmask
        r2 = byteop3p(r3:2, r1:0)(LO)                      || r1=[i1++]; // cgu
        r2 = r2 << 7 (v);
        r2 = r2 & r5;
        r3 = r3 | r2;

        /* G = Y+ cgu*(Cb-128)+cgv*(Cr-128) */
                a1 += r1.h*r6.h,          a0 += r1.l*r6.h  || r1=[i1++]; // cgv
        r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h) || r5=[i1++]; // gmask
        /* the loads for the next iteration are issued alongside the tail */
        r2 = byteop3p(r3:2, r1:0)(LO)                      || r0=[i0++];     // 4Y
        r2 = r2 << 2 (v)                                   || r1.l=w[i2++];  // 2u
        r2 = r2 & r5;
        r3 = r3 | r2;
        [p1++]=r3                                          || r1.h=w[i3++]; // 2v

.L1555:                                                       r2=[i1++]; // oy

        l1 = 0;                // leave circular addressing disabled for the caller

        (r7:4) = [sp++];
        unlink;
        rts;
DEFUN_END(yuv2rgb555_line)

/*
 * _ff_bfin_yuv2rgb24_line
 *
 * Converts one line of planar YUV 4:2:0 samples into 3-byte-per-pixel output.
 *
 * In:   r0 = Y, r1 = U, r2 = V            (copied to i0, i2, i3)
 *       [fp+ARG_OUT]   = out              (p1; p2 = out + 3)
 *       [fp+ARG_W]     = dW               (loop count is dW>>2)
 *       [fp+ARG_COEFF] = coeffs           (i1/b1, circular, l1 = COEFF_LEN)
 *
 * Two pixels are produced at a time: p1 writes the bytes of the first pixel
 * and p2 (3 bytes ahead) the bytes of the second one, after which both
 * pointers skip 3 bytes. For every pixel the bytes are stored in the order
 * "R", "G", "B" as labelled in the comments below. Each loop iteration
 * therefore writes 12 bytes (4 pixels).
 *
 * The rmask/bmask/gmask words are still loaded into r5 so that i1 steps
 * through the table in the same way as in the 16-bit routines, but their
 * values are not used here.
 *
 * The input for the next iteration is loaded at the end of the current one,
 * including the last, so 4 Y / 2 U / 2 V bytes beyond the converted data are
 * read.
 *
 * r7:4 are saved and restored; l1 is reset to 0 before returning.
 *
 * NOTE(review): dW < 4 yields a loop count of 0 - confirm how LSETUP treats a
 * zero count on the target core.
 */
DEFUN(yuv2rgb24_line,MEM,
   (uint8_t *Y, uint8_t *U, uint8_t *V, int *out, int dW, uint32_t *coeffs)):
        link 0;
        [--sp] = (r7:4);
        p1 = [fp+ARG_OUT];
        r3 = [fp+ARG_W];
        p2 = p1;
        p2 += 3;               // p2 addresses the second pixel of each pair

        i0 = r0;
        i2 = r1;
        i3 = r2;

        r0 = [fp+ARG_COEFF]; // coeff buffer
        i1 = r0;
        b1 = i1;
        l1 = COEFF_LEN;
        m0 = COEFF_REL_CY_OFF;
        p0 = r3;

        // pre-load the first group of samples
        r0   = [i0++];         // 4Y
        r1.l = w[i2++];        // 2u
        r1.h = w[i3++];        // 2v
        p0 = p0>>2;            // 4 pixels per iteration

        lsetup (.L0888, .L1888) lc0 = p0;

        /*
           uint32_t oy,oc,zero,cy,crv,rmask,cbu,bmask,cgu,cgv,gmask
           r0 -- used to load 4ys
           r1 -- used to load 2us,2vs
           r4 -- y3,y2
           r5 -- y1,y0
           r6 -- u1,u0
           r7 -- v1,v0
        */
                                                              r2=[i1++]; // oy
.L0888:
        (r4,r5) = byteop16m (r1:0, r3:2)                   || r3=[i1++]; // oc
        (r7,r6) = byteop16m (r1:0, r3:2) (r);
        r5 = r5 << 2 (v);               // y1,y0
        r4 = r4 << 2 (v);               // y3,y2
        r6 = r6 << 2 (v) || r0=[i1++];  // u1,u0, r0=zero
        r7 = r7 << 2 (v) || r1=[i1++];  // v1,v0  r1=cy

        /* ---- pixels 0,1: y1,y0 with u0,v0 ---- */
        /* Y' = y*cy */
        a1 = r1.h*r5.h, a0 = r1.l*r5.l                     || r1=[i1++]; // crv

        /* R = Y+ crv*(Cr-128); the second MAC restores the accumulators to Y' */
        r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l);
                a1 -= r1.h*r7.l,          a0 -= r1.l*r7.l  || r5=[i1++]; // rmask
        r2 = byteop3p(r3:2, r1:0)(LO)                      || r1=[i1++]; // cbu
        r2=r2>>16 || B[p1++]=r2;                                         // R of pixel 0
                     B[p2++]=r2;                                         // R of pixel 1

        /* B = Y+ cbu*(Cb-128); kept in r3 until G has been written */
        r2.h = (a1 += r1.h*r6.l), r2.l = (a0 += r1.l*r6.l);
                a1 -= r1.h*r6.l,          a0 -= r1.l*r6.l  || r5=[i1++]; // bmask
        r3 = byteop3p(r3:2, r1:0)(LO)                      || r1=[i1++]; // cgu

        /* G = Y+ cgu*(Cb-128)+cgv*(Cr-128) */
                a1 += r1.h*r6.l,          a0 += r1.l*r6.l  || r1=[i1++]; // cgv
        r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l);
        r2 = byteop3p(r3:2, r1:0)(LO)                      || r5=[i1++m0]; // gmask, then step i1 to cy

        r2=r2>>16 || B[p1++]=r2;                                         // G of pixel 0
                     B[p2++]=r2;                                         // G of pixel 1

        r3=r3>>16 || B[p1++]=r3;                                         // B of pixel 0
                     B[p2++]=r3                            || r1=[i1++]; // B of pixel 1, r1=cy

        p1+=3;                 // skip the pixel already written through p2
        p2+=3;

        /* ---- pixels 2,3: y3,y2 with u1,v1 ---- */
        /* Y' = y*cy */
        a1 = r1.h*r4.h, a0 = r1.l*r4.l                     || r1=[i1++]; // crv

        /* R = Y+ crv*(Cr-128) */
        r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h);
                a1 -= r1.h*r7.h,          a0 -= r1.l*r7.h  || r5=[i1++]; // rmask
        r2 = byteop3p(r3:2, r1:0)(LO)                      || r1=[i1++]; // cbu
        r2=r2>>16 || B[p1++]=r2;
        B[p2++]=r2;

        /* B = Y+ cbu*(Cb-128) */
        r2.h = (a1 += r1.h*r6.h), r2.l = (a0 += r1.l*r6.h);
                a1 -= r1.h*r6.h,          a0 -= r1.l*r6.h  || r5=[i1++]; // bmask
        r3 = byteop3p(r3:2, r1:0)(LO)                      || r1=[i1++]; // cgu

        /* G = Y+ cgu*(Cb-128)+cgv*(Cr-128) */
                a1 += r1.h*r6.h,          a0 += r1.l*r6.h  || r1=[i1++]; // cgv
        r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h);
        r2 = byteop3p(r3:2, r1:0)(LO)                      || r5=[i1++]; // gmask
        /* the loads for the next iteration are issued alongside the stores */
        r2=r2>>16 || B[p1++]=r2 || r0 = [i0++];    // 4y
                     B[p2++]=r2 || r1.l = w[i2++]; // 2u
        r3=r3>>16 || B[p1++]=r3 || r1.h = w[i3++]; // 2v
                     B[p2++]=r3 || r2=[i1++];      // oy

        p1+=3;
.L1888: p2+=3;

        l1 = 0;                // leave circular addressing disabled for the caller

        (r7:4) = [sp++];
        unlink;
        rts;
DEFUN_END(yuv2rgb24_line)



/* Offsets from fp (after "link 0") of the stack-passed arguments of
 * uyvytoyv12 / yuyvtoyv12; src, ydst and udst arrive in r0, r1, r2. */
#define ARG_vdst        20
#define ARG_width       24
#define ARG_height      28
#define ARG_lumStride   32
#define ARG_chromStride 36
#define ARG_srcStride   40

/*
 * _ff_bfin_uyvytoyv12
 *
 * Splits packed UYVY into separate Y, U and V planes with vertically
 * subsampled chroma. Two source lines are handled per outer iteration:
 * both luma lines are written, and the chroma bytes of the two lines are
 * combined with byteop1p (byte-wise average) into one U and one V line.
 *
 * In:   r0 = src, r1 = ydst, r2 = udst
 *       stack: vdst, width, height, lumStride, chromStride, srcStride
 *
 * The outer loop runs height>>1 times, the inner loop width>>2 times
 * (4 pixels = 8 source bytes per line per pass). The pointer adjustments
 * below assume width is a multiple of 4.
 *
 * The source is read one 8-byte group ahead, so 8 bytes past the end of each
 * source line are read (the -8 in m0 compensates for the pointer position).
 *
 * r7:4 and p5:4 are saved and restored.
 */
DEFUN(uyvytoyv12, mL3,  (const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
                         int width, int height,
                         int lumStride, int chromStride, int srcStride)):
        link 0;
        [--sp] = (r7:4,p5:4);

        p0 = r1;       // Y top even

        i2 = r2; // *u
        r2 = [fp + ARG_vdst];
        i3 = r2; // *v

        r1 = [fp + ARG_srcStride];
        r2 = r0 + r1;
        i0 = r0;  // uyvy_T even
        i1 = r2;  // uyvy_B odd

        p2 = [fp + ARG_lumStride];
        p1 = p0 + p2;  // Y bot odd

        p5 = [fp + ARG_width];
        p4 = [fp + ARG_height];
        r0 = p5;
        // The inner loop already advances p0/p1 by width, so stepping to the
        // next pair of luma lines needs 2*lumStride - width (adding only
        // lumStride was correct solely for lumStride == width).
        p2 = p2 << 1;
        p2 -= p5;
        p4 = p4 >> 1;
        p5 = p5 >> 2;

        r2 = r0 << 1;
        r1 = r1 << 1;
        r1 = r1 - r2;  // srcStride + (srcStride - 2*width)
        r1 += -8;  // i0,i1 is pre read need to correct
        m0 = r1;

        r2 = [fp + ARG_chromStride];
        r0 = r0 >> 1;
        r2 = r2 - r0;  // chromStride - width/2
        m1 = r2;

        /*   I0,I1 - src input line pointers
         *   p0,p1 - luma output line pointers
         *   I2    - dstU
         *   I3    - dstV
         */

        lsetup (0f, 1f) lc1 = p4;   // H/2
0:        r0 = [i0++] || r2 = [i1++];
          r1 = [i0++] || r3 = [i1++];
          r4 = byteop1p(r1:0, r3:2);            // top/bottom combined, first word
          r5 = byteop1p(r1:0, r3:2) (r);        // top/bottom combined, second word
          lsetup (2f, 3f) lc0 = p5; // W/4
2:          r0 = r0 >> 8(v);                    // move Y into the low byte of each half
            r1 = r1 >> 8(v);
            r2 = r2 >> 8(v);
            r3 = r3 >> 8(v);
            r0 = bytepack(r0, r1);
            r2 = bytepack(r2, r3)         ||  [p0++] = r0;    // yyyy
            r6 = pack(r5.l, r4.l)         ||  [p1++] = r2;    // yyyy
            r7 = pack(r5.h, r4.h)         ||  r0 = [i0++] || r2 = [i1++];
            r6 = bytepack(r6, r7)         ||  r1 = [i0++] || r3 = [i1++];
            r4 = byteop1p(r1:0, r3:2)     ||  w[i2++] = r6.l; // uu
3:          r5 = byteop1p(r1:0, r3:2) (r) ||  w[i3++] = r6.h; // vv

          i0 += m0;
          i1 += m0;
          i2 += m1;
          i3 += m1;
          p0 = p0 + p2;
1:        p1 = p1 + p2;

        (r7:4,p5:4) = [sp++];
        unlink;
        rts;
DEFUN_END(uyvytoyv12)

/*
 * _ff_bfin_yuyvtoyv12
 *
 * Splits packed YUYV into separate Y, U and V planes with vertically
 * subsampled chroma. Two source lines are handled per outer iteration:
 * both luma lines are written, and the chroma bytes of the two lines are
 * combined with byteop1p (byte-wise average) into one U and one V line.
 *
 * In:   r0 = src, r1 = ydst, r2 = udst
 *       stack: vdst, width, height, lumStride, chromStride, srcStride
 *
 * The outer loop runs height>>1 times, the inner loop width>>2 times
 * (4 pixels = 8 source bytes per line per pass). The pointer adjustments
 * below assume width is a multiple of 4.
 *
 * The source is read one 8-byte group ahead, so 8 bytes past the end of each
 * source line are read (the -8 in m0 compensates for the pointer position).
 *
 * r7:4 and p5:4 are saved and restored.
 */
DEFUN(yuyvtoyv12, mL3,  (const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
                         int width, int height,
                         int lumStride, int chromStride, int srcStride)):
        link 0;
        [--sp] = (r7:4,p5:4);

        p0 = r1;       // Y top even

        i2 = r2; // *u
        r2 = [fp + ARG_vdst];
        i3 = r2; // *v

        r1 = [fp + ARG_srcStride];
        r2 = r0 + r1;

        i0 = r0;  // yuyv_T even
        i1 = r2;  // yuyv_B odd

        p2 = [fp + ARG_lumStride];
        p1 = p0 + p2;  // Y bot odd

        p5 = [fp + ARG_width];
        p4 = [fp + ARG_height];
        r0 = p5;
        // The inner loop already advances p0/p1 by width, so stepping to the
        // next pair of luma lines needs 2*lumStride - width (adding only
        // lumStride was correct solely for lumStride == width).
        p2 = p2 << 1;
        p2 -= p5;
        p4 = p4 >> 1;
        p5 = p5 >> 2;

        r2 = r0 << 1;
        r1 = r1 << 1;
        r1 = r1 - r2;  // srcStride + (srcStride - 2*width)
        r1 += -8;  // i0,i1 is pre read need to correct
        m0 = r1;

        r2 = [fp + ARG_chromStride];
        r0 = r0 >> 1;
        r2 = r2 - r0;  // chromStride - width/2
        m1 = r2;

        /*   I0,I1 - src input line pointers
         *   p0,p1 - luma output line pointers
         *   I2    - dstU
         *   I3    - dstV
         */

        lsetup (0f, 1f) lc1 = p4;   // H/2
0:        r0 = [i0++] || r2 = [i1++];
          r1 = [i0++] || r3 = [i1++];
          r4 = bytepack(r0, r1);                // Y bytes of the top line
          r5 = bytepack(r2, r3);                // Y bytes of the bottom line
          lsetup (2f, 3f) lc0 = p5; // W/4
2:          r0 = r0 >> 8(v) || [p0++] = r4;  // yyyy-even
            r1 = r1 >> 8(v) || [p1++] = r5;  // yyyy-odd
            r2 = r2 >> 8(v);                 // chroma now in the low byte of each half
            r3 = r3 >> 8(v);
            r4 = byteop1p(r1:0, r3:2);       // top/bottom chroma combined
            r5 = byteop1p(r1:0, r3:2) (r);
            r6 = pack(r5.l, r4.l);
            r7 = pack(r5.h, r4.h)         ||  r0 = [i0++] || r2 = [i1++];
            r6 = bytepack(r6, r7)         ||  r1 = [i0++] || r3 = [i1++];
            r4 = bytepack(r0, r1)         ||  w[i2++] = r6.l; // uu
3:          r5 = bytepack(r2, r3)         ||  w[i3++] = r6.h; // vv

          i0 += m0;
          i1 += m0;
          i2 += m1;
          i3 += m1;
          p0 = p0 + p2;
1:        p1 = p1 + p2;

        (r7:4,p5:4) = [sp++];
        unlink;
        rts;
DEFUN_END(yuyvtoyv12)
