1/*
2 * Copyright (C) 2007 Marc Hoffman <marc.hoffman@analog.com>
3 *                    April 20, 2007
4 *
5 * Blackfin video color space converter operations
6 * convert I420 YV12 to RGB in various formats
7 *
8 * This file is part of FFmpeg.
9 *
10 * FFmpeg is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU Lesser General Public
12 * License as published by the Free Software Foundation; either
13 * version 2.1 of the License, or (at your option) any later version.
14 *
15 * FFmpeg is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
18 * Lesser General Public License for more details.
19 *
20 * You should have received a copy of the GNU Lesser General Public
21 * License along with FFmpeg; if not, write to the Free Software
22 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
23 */
24
25
26/*
27YUV420 to RGB565 conversion. This routine takes a YUV 420 planar macroblock
28and converts it to RGB565. R:5 bits, G:6 bits, B:5 bits.. packed into shorts.
29
30
31The following calculation is used for the conversion:
32
33  r = clipz((y-oy)*cy  + crv*(v-128))
34  g = clipz((y-oy)*cy  + cgv*(v-128) + cgu*(u-128))
35  b = clipz((y-oy)*cy  + cbu*(u-128))
36
37y,u,v are prescaled by a factor of 4 i.e. left-shifted to gain precision.
38
39
40New factorization to eliminate the truncation error which was
41occurring due to the byteop3p.
42
43
441) Use byteop16m to subtract quad bytes. We use this in U8 mode,
45   so the offsets need to be renormalized to 8 bits.
46
472) Scale operands up by a factor of 4 not 8 because Blackfin
48   multiplies include a shift.
49
503) Compute into the accumulators cy*yx0, cy*yx1.
51
524) Compute each of the linear equations:
53     r = clipz((y - oy) * cy  + crv * (v - 128))
54
55     g = clipz((y - oy) * cy  + cgv * (v - 128) + cgu * (u - 128))
56
57     b = clipz((y - oy) * cy  + cbu * (u - 128))
58
59   Reuse of the accumulators requires that we actually multiply
60   twice once with addition and the second time with a subtraction.
61
62   Because of this we need to compute the equations in the order R B
63   then G saving the writes for B in the case of 24/32 bit color
64   formats.
65
66   API: yuv2rgb_kind (uint8_t *Y, uint8_t *U, uint8_t *V, int *out,
67                      int dW, uint32_t *coeffs);
68
69       A          B
70       ---        ---
71       i2 = cb    i3 = cr
72       i1 = coeff i0 = y
73
74Where coeffs have the following layout in memory.
75
76uint32_t oy,oc,zero,cy,crv,rmask,cbu,bmask,cgu,cgv,gmask;
77
78coeffs is a pointer to oy.
79
80The {rgb} masks are only utilized by the 565 and 555 packing algorithms. Note
81that data replication is used to simplify the internal algorithms for the
82dual-MAC architecture of Blackfin.
83
84All routines are exported with _ff_bfin_ as a symbol prefix.
85
86Rough performance gain compared against -O3:
87
882779809/1484290 187.28%
89
90which translates to ~33c/pel to ~57c/pel for the reference vs 17.5
91c/pel for the optimized implementations. Not sure why there is such a
92huge variation in the reference code on Blackfin; I guess it must have
93to do with the memory system.
94*/
95
/*
 * Section selection.
 *   mL3 - plain .text.
 *   mL1 - .l1.text when __FDPIC__ is defined, otherwise the same as mL3.
 *   MEM - alias of mL1; passed as the section argument to DEFUN by the
 *         yuv2rgb*_line routines in this file.
 */
96#define mL3 .text
97#ifdef __FDPIC__
98#define mL1 .l1.text
99#else
100#define mL1 mL3
101#endif
102#define MEM mL1
103
/*
 * DEFUN(fname, where, interface): open a function.
 * Switches to section `where`, exports the symbol _ff_bfin_<fname> with type
 * STT_FUNC, applies .align 8 and emits the symbol name. The expansion does
 * not end with a colon: the use site appends ':' to turn it into a label.
 * `interface` does not appear in the expansion; it only documents the
 * C-level prototype at the use site.
 */
104#define DEFUN(fname,where,interface) \
105        .section where;              \
106        .global _ff_bfin_ ## fname;  \
107        .type _ff_bfin_ ## fname, STT_FUNC; \
108        .align 8;                    \
109        _ff_bfin_ ## fname
110
/*
 * DEFUN_END(fname): close a function opened with DEFUN by recording the
 * symbol size as (current location - start of _ff_bfin_<fname>).
 */
111#define DEFUN_END(fname) \
112        .size _ff_bfin_ ## fname, . - _ff_bfin_ ## fname
113
114
115.text
116
/*
 * Coefficient table geometry used by the yuv2rgb*_line routines.
 *   COEFF_LEN        - value loaded into L1 (circular-buffer length of I1):
 *                      11 words of 4 bytes.
 *   COEFF_REL_CY_OFF - value loaded into M0 and used as the post-modify of
 *                      the gmask load in the first half of each loop pass.
 *                      gmask is the 11th word (byte offset 40); 40 + 16
 *                      wraps modulo 44 to byte offset 12, i.e. the 4th word
 *                      (cy), so the next coefficient load restarts at cy.
 */
117#define COEFF_LEN        11*4
118#define COEFF_REL_CY_OFF 4*4
119
/*
 * Frame-pointer offsets of the stack-passed arguments of the yuv2rgb*_line
 * routines (read after `link 0`): out, dW and coeffs. Y, U and V are taken
 * from r0, r1 and r2.
 */
120#define ARG_OUT   20
121#define ARG_W     24
122#define ARG_COEFF 28
123
/*
 * yuv2rgb565_line -- convert one line of planar YUV to packed 16-bit pixels
 * (5/6/5 bit fields, see the bit diagram inside the loop).
 *
 * Inputs:  r0 = Y, r1 = U, r2 = V; out, dW and coeffs are read from the
 *          stack at fp+ARG_OUT, fp+ARG_W and fp+ARG_COEFF.
 * Output:  pixels are stored through p1 (out), one 32-bit word (two pixels)
 *          per store.
 *
 * The hardware loop runs dW>>2 times. Each pass consumes 4 Y bytes, 2 U bytes
 * and 2 V bytes and stores two 32-bit words. The Y/U/V data for the next pass
 * is loaded at the end of the current pass (and once before the loop), so one
 * extra group is read after the last pass.
 *
 * Coefficients are streamed through I1, configured as a circular buffer
 * (B1 = coeffs, L1 = COEFF_LEN). The first half of a pass reads
 * oy ... gmask, then jumps back to cy via the M0 post-modify; the second half
 * reads cy ... gmask and wraps to oy for the next pass.
 *
 * r7:4 are saved on entry and restored on exit; L1 is reset to 0 on exit.
 */
124DEFUN(yuv2rgb565_line,MEM,
125   (uint8_t *Y, uint8_t *U, uint8_t *V, int *out, int dW, uint32_t *coeffs)):
126        link 0;
127        [--sp] = (r7:4);                // save r4-r7 (restored before rts)
128        p1 = [fp+ARG_OUT];              // p1 = out, the store pointer
129        r3 = [fp+ARG_W];                // r3 = dW
130
131        i0 = r0;                        // i0 -> Y
132        i2 = r1;                        // i2 -> U
133        i3 = r2;                        // i3 -> V
134
135        r0 = [fp+ARG_COEFF];
136        i1 = r0;                        // i1 -> coeffs (first word is oy)
137        b1 = i1;                        // circular buffer base = coeffs
138        l1 = COEFF_LEN;                 // circular buffer length: i1 wraps inside the table
139        m0 = COEFF_REL_CY_OFF;          // post-modify that takes i1 from gmask back to cy
140        p0 = r3;
141
        // Preload the first group of samples before entering the loop.
142        r0   = [i0++];         // 4Y (32-bit load)
143        r1.l = w[i2++];        // 2u
144        r1.h = w[i3++];        // 2v
145        p0 = p0>>2;                     // loop count = dW/4
146
147        lsetup (.L0565, .L1565) lc0 = p0;
148
149        /*
150           uint32_t oy,oc,zero,cy,crv,rmask,cbu,bmask,cgu,cgv,gmask
151           r0 -- used to load 4ys
152           r1 -- used to load 2us,2vs
153           r4 -- y3,y2
154           r5 -- y1,y0
155           r6 -- u1,u0
156           r7 -- v1,v0
157        */
158                                                              r2=[i1++]; // oy
159.L0565:
160        /*
161        rrrrrrrr gggggggg bbbbbbbb
162         5432109876543210
163                    bbbbb >>3
164              gggggggg    <<3
165         rrrrrrrr         <<8
166         rrrrrggggggbbbbb
167        */
        // Quad byte subtract: luma minus oy into r4/r5, then (with oc now in
        // r3) chroma minus oc into r6/r7, each as 16-bit values.
168        (r4,r5) = byteop16m (r1:0, r3:2)                   || r3=[i1++]; // oc
169        (r7,r6) = byteop16m (r1:0, r3:2) (r);
        // Prescale all operands by 4 (see the file header).
170        r5 = r5 << 2 (v);                                                // y1,y0
171        r4 = r4 << 2 (v);                                                // y3,y2
172        r6 = r6 << 2 (v)                                   || r0=[i1++]; // u1,u0, r0=zero
173        r7 = r7 << 2 (v)                                   || r1=[i1++]; // v1,v0  r1=cy
        // ---- first pixel pair: y1,y0 with u0,v0 ----
174        /* Y' = y*cy */
175        a1 = r1.h*r5.h, a0 = r1.l*r5.l                     || r1=[i1++]; // crv
176
177        /* R = Y+ crv*(Cr-128) */
        // Add the chroma term to extract the result, then subtract it again
        // so a1/a0 hold Y' once more for the next component.
178        r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l);
179                a1 -= r1.h*r7.l,          a0 -= r1.l*r7.l  || r5=[i1++]; // rmask
        // byteop3p: the clipz() step of the header formula; r0 holds the
        // `zero` coefficient. NOTE(review): exact BYTEOP3P operand semantics
        // should be confirmed against the Blackfin reference.
180        r2 = byteop3p(r3:2, r1:0)(LO)                      || r1=[i1++]; // cbu
181        r2 = r2 >> 3 (v);
182        r3 = r2 & r5;                   // r3 starts the packed pixel pair
183
184        /* B = Y+ cbu*(Cb-128) */
185        r2.h = (a1 += r1.h*r6.l), r2.l = (a0 += r1.l*r6.l);
186                a1 -= r1.h*r6.l,          a0 -= r1.l*r6.l  || r5=[i1++]; // bmask
187        r2 = byteop3p(r3:2, r1:0)(LO)                      || r1=[i1++]; // cgu
188        r2 = r2 << 8 (v);
189        r2 = r2 & r5;
190        r3 = r3 | r2;
191
192        /* G = Y+ cgu*(Cb-128)+cgv*(Cr-128) */
        // Both chroma terms are accumulated; no subtract-back is needed
        // because the accumulators are reloaded for the next pixel pair.
193                a1 += r1.h*r6.l,          a0 += r1.l*r6.l  || r1=[i1++]; // cgv
194        r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l);
        // The gmask load post-modifies i1 by m0, wrapping it back to cy.
195        r2 = byteop3p(r3:2, r1:0)(LO)                      || r5=[i1++m0]; // gmask
196        r2 = r2 << 3 (v);
197        r2 = r2 & r5;
198        r3 = r3 | r2;
199        [p1++]=r3                                          || r1=[i1++]; // cy
200
        // ---- second pixel pair: y3,y2 with u1,v1 ----
201        /* Y' = y*cy */
202
203        a1 = r1.h*r4.h, a0 = r1.l*r4.l                     || r1=[i1++]; // crv
204
205        /* R = Y+ crv*(Cr-128) */
206        r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h);
207                a1 -= r1.h*r7.h,          a0 -= r1.l*r7.h  || r5=[i1++]; // rmask
208        r2 = byteop3p(r3:2, r1:0)(LO)                      || r1=[i1++]; // cbu
209        r2 = r2 >> 3 (v);
210        r3 = r2 & r5;
211
212        /* B = Y+ cbu*(Cb-128) */
213        r2.h = (a1 += r1.h*r6.h), r2.l = (a0 += r1.l*r6.h);
214                a1 -= r1.h*r6.h,          a0 -= r1.l*r6.h  || r5=[i1++]; // bmask
215        r2 = byteop3p(r3:2, r1:0)(LO)                      || r1=[i1++]; // cgu
216        r2 = r2 << 8 (v);
217        r2 = r2 & r5;
218        r3 = r3 | r2;
219
220        /* G = Y+ cgu*(Cb-128)+cgv*(Cr-128) */
221                a1 += r1.h*r6.h,          a0 += r1.l*r6.h  || r1=[i1++]; // cgv
        // gmask is the last table word: this plain post-increment wraps i1
        // to oy. The remaining parallel slots fetch the next pass's samples.
222        r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h) || r5=[i1++]; // gmask
223        r2 = byteop3p(r3:2, r1:0)(LO)                      || r0   =  [i0++];        // 4Y
224        r2 = r2 << 3 (v)                                   || r1.l = w[i2++];        // 2u
225        r2 = r2 & r5;
226        r3 = r3 | r2;
227        [p1++]=r3                                          || r1.h = w[i3++];        // 2v
228.L1565:                                                       r2=[i1++]; // oy
229
230        l1 = 0;                         // turn circular addressing on i1 off again
231
232        (r7:4) = [sp++];
233        unlink;
234        rts;
235DEFUN_END(yuv2rgb565_line)
236
/*
 * yuv2rgb555_line -- convert one line of planar YUV to packed 16-bit pixels
 * with three 5-bit fields (see the bit diagram inside the loop).
 *
 * Inputs:  r0 = Y, r1 = U, r2 = V; out, dW and coeffs are read from the
 *          stack at fp+ARG_OUT, fp+ARG_W and fp+ARG_COEFF.
 * Output:  pixels are stored through p1 (out), one 32-bit word (two pixels)
 *          per store.
 *
 * The hardware loop runs dW>>2 times; each pass consumes 4 Y bytes, 2 U bytes
 * and 2 V bytes and stores two 32-bit words. Samples for the next pass are
 * loaded at the end of the current one (and once before the loop).
 *
 * The structure matches the 565 routine in this file; only the shift amounts
 * applied before masking differ (>>3, <<7, <<2). The field positions finally
 * kept are determined by the mask words supplied in the coefficient table.
 *
 * Coefficients are streamed through I1 as a circular buffer
 * (B1 = coeffs, L1 = COEFF_LEN). r7:4 are saved/restored; L1 is reset to 0
 * on exit.
 */
237DEFUN(yuv2rgb555_line,MEM,
238   (uint8_t *Y, uint8_t *U, uint8_t *V, int *out, int dW, uint32_t *coeffs)):
239        link 0;
240        [--sp] = (r7:4);                // save r4-r7 (restored before rts)
241        p1 = [fp+ARG_OUT];              // p1 = out, the store pointer
242        r3 = [fp+ARG_W];                // r3 = dW
243
244        i0 = r0;                        // i0 -> Y
245        i2 = r1;                        // i2 -> U
246        i3 = r2;                        // i3 -> V
247
248        r0 = [fp+ARG_COEFF];
249        i1 = r0;                        // i1 -> coeffs (first word is oy)
250        b1 = i1;                        // circular buffer base = coeffs
251        l1 = COEFF_LEN;                 // circular buffer length: i1 wraps inside the table
252        m0 = COEFF_REL_CY_OFF;          // post-modify that takes i1 from gmask back to cy
253        p0 = r3;
254
        // Preload the first group of samples before entering the loop.
255        r0   = [i0++];         // 4Y (32-bit load)
256        r1.l = w[i2++];        // 2u
257        r1.h = w[i3++];        // 2v
258        p0 = p0>>2;                     // loop count = dW/4
259
260        lsetup (.L0555, .L1555) lc0 = p0;
261
262        /*
263           uint32_t oy,oc,zero,cy,crv,rmask,cbu,bmask,cgu,cgv,gmask
264           r0 -- used to load 4ys
265           r1 -- used to load 2us,2vs
266           r4 -- y3,y2
267           r5 -- y1,y0
268           r6 -- u1,u0
269           r7 -- v1,v0
270        */
271                                                              r2=[i1++]; // oy
272.L0555:
273        /*
274        rrrrrrrr gggggggg bbbbbbbb
275         5432109876543210
276                    bbbbb >>3
277               gggggggg   <<2
278          rrrrrrrr        <<7
279         xrrrrrgggggbbbbb
280        */
281
        // Quad byte subtract: luma minus oy into r4/r5, then (with oc now in
        // r3) chroma minus oc into r6/r7, each as 16-bit values.
282        (r4,r5) = byteop16m (r1:0, r3:2)                   || r3=[i1++]; // oc
283        (r7,r6) = byteop16m (r1:0, r3:2) (r);
        // Prescale all operands by 4 (see the file header).
284        r5 = r5 << 2 (v);                                                // y1,y0
285        r4 = r4 << 2 (v);                                                // y3,y2
286        r6 = r6 << 2 (v)                                   || r0=[i1++]; // u1,u0, r0=zero
287        r7 = r7 << 2 (v)                                   || r1=[i1++]; // v1,v0  r1=cy
        // ---- first pixel pair: y1,y0 with u0,v0 ----
288        /* Y' = y*cy */
289        a1 = r1.h*r5.h, a0 = r1.l*r5.l                     || r1=[i1++]; // crv
290
291        /* R = Y+ crv*(Cr-128) */
        // Add the chroma term to extract the result, then subtract it again
        // so a1/a0 hold Y' once more for the next component.
292        r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l);
293                a1 -= r1.h*r7.l,          a0 -= r1.l*r7.l  || r5=[i1++]; // rmask
        // byteop3p: the clipz() step of the header formula; r0 holds the
        // `zero` coefficient. NOTE(review): exact BYTEOP3P operand semantics
        // should be confirmed against the Blackfin reference.
294        r2 = byteop3p(r3:2, r1:0)(LO)                      || r1=[i1++]; // cbu
295        r2 = r2 >> 3 (v);
296        r3 = r2 & r5;                   // r3 starts the packed pixel pair
297
298        /* B = Y+ cbu*(Cb-128) */
299        r2.h = (a1 += r1.h*r6.l), r2.l = (a0 += r1.l*r6.l);
300                a1 -= r1.h*r6.l,          a0 -= r1.l*r6.l  || r5=[i1++]; // bmask
301        r2 = byteop3p(r3:2, r1:0)(LO)                      || r1=[i1++]; // cgu
302        r2 = r2 << 7 (v);
303        r2 = r2 & r5;
304        r3 = r3 | r2;
305
306        /* G = Y+ cgu*(Cb-128)+cgv*(Cr-128) */
        // Both chroma terms are accumulated; no subtract-back is needed
        // because the accumulators are reloaded for the next pixel pair.
307                a1 += r1.h*r6.l,          a0 += r1.l*r6.l  || r1=[i1++]; // cgv
308        r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l);
        // The gmask load post-modifies i1 by m0, wrapping it back to cy.
309        r2 = byteop3p(r3:2, r1:0)(LO)                      || r5=[i1++m0]; // gmask
310        r2 = r2 << 2 (v);
311        r2 = r2 & r5;
312        r3 = r3 | r2;
313        [p1++]=r3                                          || r1=[i1++]; // cy
314
        // ---- second pixel pair: y3,y2 with u1,v1 ----
315        /* Y' = y*cy */
316
317        a1 = r1.h*r4.h, a0 = r1.l*r4.l                     || r1=[i1++]; // crv
318
319        /* R = Y+ crv*(Cr-128) */
320        r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h);
321                a1 -= r1.h*r7.h,          a0 -= r1.l*r7.h  || r5=[i1++]; // rmask
322        r2 = byteop3p(r3:2, r1:0)(LO)                      || r1=[i1++]; // cbu
323        r2 = r2 >> 3 (v);
324        r3 = r2 & r5;
325
326        /* B = Y+ cbu*(Cb-128) */
327        r2.h = (a1 += r1.h*r6.h), r2.l = (a0 += r1.l*r6.h);
328                a1 -= r1.h*r6.h,          a0 -= r1.l*r6.h  || r5=[i1++]; // bmask
329        r2 = byteop3p(r3:2, r1:0)(LO)                      || r1=[i1++]; // cgu
330        r2 = r2 << 7 (v);
331        r2 = r2 & r5;
332        r3 = r3 | r2;
333
334        /* G = Y+ cgu*(Cb-128)+cgv*(Cr-128) */
335                a1 += r1.h*r6.h,          a0 += r1.l*r6.h  || r1=[i1++]; // cgv
        // gmask is the last table word: this plain post-increment wraps i1
        // to oy. The remaining parallel slots fetch the next pass's samples.
336        r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h) || r5=[i1++]; // gmask
337        r2 = byteop3p(r3:2, r1:0)(LO)                      || r0=[i0++];     // 4Y
338        r2 = r2 << 2 (v)                                   || r1.l=w[i2++];  // 2u
339        r2 = r2 & r5;
340        r3 = r3 | r2;
341        [p1++]=r3                                          || r1.h=w[i3++]; // 2v
342
343.L1555:                                                       r2=[i1++]; // oy
344
345        l1 = 0;                         // turn circular addressing on i1 off again
346
347        (r7:4) = [sp++];
348        unlink;
349        rts;
350DEFUN_END(yuv2rgb555_line)
351
/*
 * yuv2rgb24_line -- convert one line of planar YUV to 3-bytes-per-pixel
 * output.
 *
 * Inputs:  r0 = Y, r1 = U, r2 = V; out, dW and coeffs are read from the
 *          stack at fp+ARG_OUT, fp+ARG_W and fp+ARG_COEFF.
 * Output:  bytes are stored through two pointers: p1 = out and p2 = out + 3.
 *          Each computed component yields one byte per pixel of a pair; p1
 *          receives the byte for one pixel and p2 the byte for its
 *          neighbour. After the three components of a pair are written both
 *          pointers skip 3 bytes to step over the neighbouring pixel.
 *
 * Store order per pixel is the R result, then the G result, then the B
 * result (B is computed second but kept in r3 until G has been written).
 *
 * The hardware loop runs dW>>2 times; each pass consumes 4 Y bytes, 2 U bytes
 * and 2 V bytes and writes 12 output bytes. Samples for the next pass are
 * loaded at the end of the current one (and once before the loop).
 *
 * The mask words of the coefficient table are still loaded (into r5) to step
 * I1 through the table, but they are never applied in this routine.
 *
 * r7:4 are saved/restored; L1 is reset to 0 on exit.
 */
352DEFUN(yuv2rgb24_line,MEM,
353   (uint8_t *Y, uint8_t *U, uint8_t *V, int *out, int dW, uint32_t *coeffs)):
354        link 0;
355        [--sp] = (r7:4);                // save r4-r7 (restored before rts)
356        p1 = [fp+ARG_OUT];              // p1 = out
357        r3 = [fp+ARG_W];                // r3 = dW
358        p2 = p1;
359        p2 += 3;                        // p2 = out + 3: the neighbouring pixel
360
361        i0 = r0;                        // i0 -> Y
362        i2 = r1;                        // i2 -> U
363        i3 = r2;                        // i3 -> V
364
365        r0 = [fp+ARG_COEFF]; // coeff buffer
366        i1 = r0;                        // i1 -> coeffs (first word is oy)
367        b1 = i1;                        // circular buffer base = coeffs
368        l1 = COEFF_LEN;                 // circular buffer length: i1 wraps inside the table
369        m0 = COEFF_REL_CY_OFF;          // post-modify that takes i1 from gmask back to cy
370        p0 = r3;
371
        // Preload the first group of samples before entering the loop.
372        r0   = [i0++];         // 4Y (32-bit load)
373        r1.l = w[i2++];        // 2u
374        r1.h = w[i3++];        // 2v
375        p0 = p0>>2;                     // loop count = dW/4
376
377        lsetup (.L0888, .L1888) lc0 = p0;
378
379        /*
380           uint32_t oy,oc,zero,cy,crv,rmask,cbu,bmask,cgu,cgv,gmask
381           r0 -- used to load 4ys
382           r1 -- used to load 2us,2vs
383           r4 -- y3,y2
384           r5 -- y1,y0
385           r6 -- u1,u0
386           r7 -- v1,v0
387        */
388                                                              r2=[i1++]; // oy
389.L0888:
        // Quad byte subtract: luma minus oy into r4/r5, then (with oc now in
        // r3) chroma minus oc into r6/r7, each as 16-bit values.
390        (r4,r5) = byteop16m (r1:0, r3:2)                   || r3=[i1++]; // oc
391        (r7,r6) = byteop16m (r1:0, r3:2) (r);
        // Prescale all operands by 4 (see the file header).
392        r5 = r5 << 2 (v);               // y1,y0
393        r4 = r4 << 2 (v);               // y3,y2
394        r6 = r6 << 2 (v) || r0=[i1++];  // u1,u0, r0=zero
395        r7 = r7 << 2 (v) || r1=[i1++];  // v1,v0  r1=cy
396
        // ---- first pixel pair: y1,y0 with u0,v0 ----
397        /* Y' = y*cy */
398        a1 = r1.h*r5.h, a0 = r1.l*r5.l                     || r1=[i1++]; // crv
399
400        /* R = Y+ crv*(Cr-128) */
        // Add the chroma term to extract the result, then subtract it again
        // so a1/a0 hold Y' once more for the next component.
401        r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l);
402                a1 -= r1.h*r7.l,          a0 -= r1.l*r7.l  || r5=[i1++]; // rmask
403        r2 = byteop3p(r3:2, r1:0)(LO)                      || r1=[i1++]; // cbu
        // Store the low byte of r2 through p1 while shifting the upper half
        // down, then store that byte through p2 for the neighbouring pixel.
404        r2=r2>>16 || B[p1++]=r2;
405                     B[p2++]=r2;
406
407        /* B = Y+ cbu*(Cb-128) */
408        r2.h = (a1 += r1.h*r6.l), r2.l = (a0 += r1.l*r6.l);
409                a1 -= r1.h*r6.l,          a0 -= r1.l*r6.l  || r5=[i1++]; // bmask
        // The B result is parked in r3 so that G can be written before it.
410        r3 = byteop3p(r3:2, r1:0)(LO)                      || r1=[i1++]; // cgu
411
412        /* G = Y+ cgu*(Cb-128)+cgv*(Cr-128) */
413                a1 += r1.h*r6.l,          a0 += r1.l*r6.l  || r1=[i1++]; // cgv
414        r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l);
        // The gmask load post-modifies i1 by m0, wrapping it back to cy.
415        r2 = byteop3p(r3:2, r1:0)(LO)                      || r5=[i1++m0]; // gmask, oy,cy,zero
416
417        r2=r2>>16 || B[p1++]=r2;
418                     B[p2++]=r2;
419
420        r3=r3>>16 || B[p1++]=r3;
421                     B[p2++]=r3                            || r1=[i1++]; // cy
422
        // Each pointer has written its own 3 bytes; skip the other pixel.
423        p1+=3;
424        p2+=3;
        // ---- second pixel pair: y3,y2 with u1,v1 ----
425        /* Y' = y*cy */
426        a1 = r1.h*r4.h, a0 = r1.l*r4.l                     || r1=[i1++]; // crv
427
428        /* R = Y+ crv*(Cr-128) */
429        r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h);
430                a1 -= r1.h*r7.h,          a0 -= r1.l*r7.h  || r5=[i1++]; // rmask
431        r2 = byteop3p(r3:2, r1:0)(LO)                      || r1=[i1++]; // cbu
432        r2=r2>>16 || B[p1++]=r2;
433        B[p2++]=r2;
434
435        /* B = Y+ cbu*(Cb-128) */
436        r2.h = (a1 += r1.h*r6.h), r2.l = (a0 += r1.l*r6.h);
437                a1 -= r1.h*r6.h,          a0 -= r1.l*r6.h  || r5=[i1++]; // bmask
438        r3 = byteop3p(r3:2, r1:0)(LO)                      || r1=[i1++]; // cgu
439
440        /* G = Y+ cgu*(Cb-128)+cgv*(Cr-128) */
441                a1 += r1.h*r6.h,          a0 += r1.l*r6.h  || r1=[i1++]; // cgv
442        r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h);
        // gmask is the last table word: this plain post-increment wraps i1
        // to oy. The stores below share their issue slots with the loads of
        // the next pass's samples and of oy.
443        r2 = byteop3p(r3:2, r1:0)(LO)                      || r5=[i1++]; // gmask
444        r2=r2>>16 || B[p1++]=r2 || r0 = [i0++];    // 4y
445                     B[p2++]=r2 || r1.l = w[i2++]; // 2u
446        r3=r3>>16 || B[p1++]=r3 || r1.h = w[i3++]; // 2v
447                     B[p2++]=r3 || r2=[i1++];      // oy
448
449        p1+=3;
450.L1888: p2+=3;
451
452        l1 = 0;                         // turn circular addressing on i1 off again
453
454        (r7:4) = [sp++];
455        unlink;
456        rts;
457DEFUN_END(yuv2rgb24_line)
458
459
460
/*
 * Frame-pointer offsets of the stack-passed arguments of uyvytoyv12 and
 * yuyvtoyv12 (read after `link 0`). The first three arguments (src, ydst,
 * udst) are taken from r0, r1 and r2 instead.
 */
461#define ARG_vdst        20
462#define ARG_width       24
463#define ARG_height      28
464#define ARG_lumStride   32
465#define ARG_chromStride 36
466#define ARG_srcStride   40
467
/*
 * uyvytoyv12 -- split packed UYVY into separate Y, U and V planes, handling
 * two source lines per outer-loop pass. Luma of both lines is written out
 * (p0 = top line, p1 = bottom line); chroma is produced once per line pair
 * from byteop1p of the top and bottom source words (byteop1p is the Blackfin
 * quad byte average -- NOTE(review): confirm rounding mode in the reference).
 *
 * Inputs:  r0 = src, r1 = ydst, r2 = udst; vdst, width, height, lumStride,
 *          chromStride and srcStride are read from the stack (ARG_* offsets).
 *
 * Loops:   outer hardware loop (lc1) runs height>>1 times, inner hardware
 *          loop (lc0) runs width>>2 times. Each inner pass consumes 8 source
 *          bytes per line and writes 4 Y bytes per line, 2 U bytes and
 *          2 V bytes. Source words are read one pass ahead, so 8 bytes per
 *          line are read beyond the last group that is actually converted.
 *
 * NOTE(review): row stepping. After a row pair i0/i1 have advanced by
 * 2*width + 8 and are then bumped by m0 = srcStride - 8; p0/p1 have advanced
 * by width and are then bumped by lumStride. Both land on the next row pair
 * only if srcStride == 2*width and lumStride == width. The chroma step
 * (m1 = chromStride - width/2) has no such restriction. Confirm that callers
 * always pass unpadded source/luma strides.
 *
 * r7:4 and p5:4 are saved on entry and restored on exit.
 */
468DEFUN(uyvytoyv12, mL3,  (const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
469                         long width, long height,
470                         long lumStride, long chromStride, long srcStride)):
471        link 0;
472        [--sp] = (r7:4,p5:4);
473
474        p0 = r1;       // Y top even
475
476        i2 = r2; // *u
477        r2 = [fp + ARG_vdst];
478        i3 = r2; // *v
479
480        r1 = [fp + ARG_srcStride];
481        r2 = r0 + r1;  // address of the second (odd) source line
482        r1 += -8;  // i0,i1 is pre read need to correct
483        m0 = r1;   // m0 = srcStride - 8, end-of-row bump for i0/i1
484
485        i0 = r0;  // uyvy_T even
486        i1 = r2;  // uyvy_B odd
487
488        p2 = [fp + ARG_lumStride];
489        p1 = p0 + p2;  // Y bot odd
490
491        p5 = [fp + ARG_width];
492        p4 = [fp + ARG_height];
493        r0 = p5;       // keep a copy of width for the chroma bump below
494        p4 = p4 >> 1;  // outer count: one pass per pair of lines
495        p5 = p5 >> 2;  // inner count: 4 pixels per pass
496
497        r2 = [fp + ARG_chromStride];
498        r0 = r0 >> 1;
499        r2 = r2 - r0;
500        m1 = r2;       // m1 = chromStride - width/2, end-of-row bump for i2/i3
501
502        /*   I0,I1 - src input line pointers
503         *   p0,p1 - luma output line pointers
504         *   I2    - dstU
505         *   I3    - dstV
506         */
507
508        lsetup (0f, 1f) lc1 = p4;   // H/2
          // Pre-read the first 8 bytes of both lines: r1:0 = top, r3:2 = bottom.
5090:        r0 = [i0++] || r2 = [i1++];
510          r1 = [i0++] || r3 = [i1++];
          // Combine top and bottom words byte-wise; r4 from the first word
          // pair, r5 (reversed form) from the second.
511          r4 = byteop1p(r1:0, r3:2);
512          r5 = byteop1p(r1:0, r3:2) (r);
513          lsetup (2f, 3f) lc0 = p5; // W/4
            // In UYVY luma sits in the odd bytes: shifting each 16-bit half
            // right by 8 moves it to the low byte for bytepack.
5142:          r0 = r0 >> 8(v);
515            r1 = r1 >> 8(v);
516            r2 = r2 >> 8(v);
517            r3 = r3 >> 8(v);
518            r0 = bytepack(r0, r1);
519            r2 = bytepack(r2, r3)         ||  [p0++] = r0;    // yyyy
            // Gather the chroma bytes of r4/r5: the low halves carry U, the
            // high halves carry V. The next source words load in parallel.
520            r6 = pack(r5.l, r4.l)         ||  [p1++] = r2;    // yyyy
521            r7 = pack(r5.h, r4.h)         ||  r0 = [i0++] || r2 = [i1++];
522            r6 = bytepack(r6, r7)         ||  r1 = [i0++] || r3 = [i1++];
            // Start combining the next group while storing this group's chroma.
523            r4 = byteop1p(r1:0, r3:2)     ||  w[i2++] = r6.l; // uu
5243:          r5 = byteop1p(r1:0, r3:2) (r) ||  w[i3++] = r6.h; // vv
525
          // End of a row pair: step every pointer on (see the row stepping
          // NOTE(review) in the function header).
526          i0 += m0;
527          i1 += m0;
528          i2 += m1;
529          i3 += m1;
530          p0 = p0 + p2;
5311:        p1 = p1 + p2;
532
533        (r7:4,p5:4) = [sp++];
534        unlink;
535        rts;
536DEFUN_END(uyvytoyv12)
537
/*
 * yuyvtoyv12 -- split packed YUYV into separate Y, U and V planes, handling
 * two source lines per outer-loop pass. Luma of both lines is written out
 * (p0 = top line, p1 = bottom line); chroma is produced once per line pair
 * from byteop1p of the top and bottom source words (byteop1p is the Blackfin
 * quad byte average -- NOTE(review): confirm rounding mode in the reference).
 *
 * Inputs:  r0 = src, r1 = ydst, r2 = udst; vdst, width, height, lumStride,
 *          chromStride and srcStride are read from the stack (ARG_* offsets).
 *
 * Loops:   outer hardware loop (lc1) runs height>>1 times, inner hardware
 *          loop (lc0) runs width>>2 times. Each inner pass consumes 8 source
 *          bytes per line and writes 4 Y bytes per line, 2 U bytes and
 *          2 V bytes. Source words are read one pass ahead, so 8 bytes per
 *          line are read beyond the last group that is actually converted.
 *
 * Compared with the UYVY variant in this file the luma is taken from the
 * even bytes (bytepack directly on the loaded words) and the chroma from the
 * odd bytes (after the >>8 vector shift).
 *
 * NOTE(review): row stepping. After a row pair i0/i1 have advanced by
 * 2*width + 8 and are then bumped by m0 = srcStride - 8; p0/p1 have advanced
 * by width and are then bumped by lumStride. Both land on the next row pair
 * only if srcStride == 2*width and lumStride == width. The chroma step
 * (m1 = chromStride - width/2) has no such restriction. Confirm that callers
 * always pass unpadded source/luma strides.
 *
 * r7:4 and p5:4 are saved on entry and restored on exit.
 */
538DEFUN(yuyvtoyv12, mL3,  (const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
539                         long width, long height,
540                         long lumStride, long chromStride, long srcStride)):
541        link 0;
542        [--sp] = (r7:4,p5:4);
543
544        p0 = r1;       // Y top even
545
546        i2 = r2; // *u
547        r2 = [fp + ARG_vdst];
548        i3 = r2; // *v
549
550        r1 = [fp + ARG_srcStride];
551        r2 = r0 + r1;  // address of the second (odd) source line
552        r1 += -8;  // i0,i1 is pre read need to correct
553        m0 = r1;   // m0 = srcStride - 8, end-of-row bump for i0/i1
554
555        i0 = r0;  // yuyv_T even (top source line)
556        i1 = r2;  // yuyv_B odd (bottom source line)
557
558        p2 = [fp + ARG_lumStride];
559        p1 = p0 + p2;  // Y bot odd
560
561        p5 = [fp + ARG_width];
562        p4 = [fp + ARG_height];
563        r0 = p5;       // keep a copy of width for the chroma bump below
564        p4 = p4 >> 1;  // outer count: one pass per pair of lines
565        p5 = p5 >> 2;  // inner count: 4 pixels per pass
566
567        r2 = [fp + ARG_chromStride];
568        r0 = r0 >> 1;
569        r2 = r2 - r0;
570        m1 = r2;       // m1 = chromStride - width/2, end-of-row bump for i2/i3
571
572        /*   I0,I1 - src input line pointers
573         *   p0,p1 - luma output line pointers
574         *   I2    - dstU
575         *   I3    - dstV
576         */
577
578        lsetup (0f, 1f) lc1 = p4;   // H/2
          // Pre-read the first 8 bytes of both lines: r1:0 = top, r3:2 = bottom.
5790:        r0 = [i0++] || r2 = [i1++];
580          r1 = [i0++] || r3 = [i1++];
          // In YUYV luma sits in the even bytes, so bytepack can collect it
          // straight from the loaded words.
581          r4 = bytepack(r0, r1);
582          r5 = bytepack(r2, r3);
583          lsetup (2f, 3f) lc0 = p5; // W/4
            // Shift each 16-bit half right by 8 so the chroma bytes move to
            // the low byte, storing the already packed luma in parallel.
5842:          r0 = r0 >> 8(v) || [p0++] = r4;  // yyyy-even
585            r1 = r1 >> 8(v) || [p1++] = r5;  // yyyy-odd
586            r2 = r2 >> 8(v);
587            r3 = r3 >> 8(v);
            // Combine top and bottom chroma byte-wise; r4 from the first
            // word pair, r5 (reversed form) from the second.
588            r4 = byteop1p(r1:0, r3:2);
589            r5 = byteop1p(r1:0, r3:2) (r);
            // Gather the chroma bytes: the low halves carry U, the high
            // halves carry V. The next source words load in parallel.
590            r6 = pack(r5.l, r4.l);
591            r7 = pack(r5.h, r4.h)         ||  r0 = [i0++] || r2 = [i1++];
592            r6 = bytepack(r6, r7)         ||  r1 = [i0++] || r3 = [i1++];
            // Pack the next group's luma while storing this group's chroma.
593            r4 = bytepack(r0, r1)         ||  w[i2++] = r6.l; // uu
5943:          r5 = bytepack(r2, r3)         ||  w[i3++] = r6.h; // vv
595
          // End of a row pair: step every pointer on (see the row stepping
          // NOTE(review) in the function header).
596          i0 += m0;
597          i1 += m0;
598          i2 += m1;
599          i3 += m1;
600          p0 = p0 + p2;
6011:        p1 = p1 + p2;
602
603        (r7:4,p5:4) = [sp++];
604        unlink;
605        rts;
606DEFUN_END(yuyvtoyv12)
607