FLAT objects cannot have multiple sections, so using the L1 attributes breaks
[ffmpeg.git] / libswscale / internal_bfin.S
1 /*
2  * Copyright (C) 2007 Marc Hoffman <marc.hoffman@analog.com>
3  *                    April 20, 2007
4  *
5  * Blackfin Video Color Space Converters Operations
6  *  convert I420 YV12 to RGB in various formats,
7  *
8  * This file is part of FFmpeg.
9  *
10  * FFmpeg is free software; you can redistribute it and/or
11  * modify it under the terms of the GNU Lesser General Public
12  * License as published by the Free Software Foundation; either
13  * version 2.1 of the License, or (at your option) any later version.
14  *
15  * FFmpeg is distributed in the hope that it will be useful,
16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
18  * Lesser General Public License for more details.
19  *
20  * You should have received a copy of the GNU Lesser General Public
21  * License along with FFmpeg; if not, write to the Free Software
22  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
23  */
24
25
26 /*
27     YUV420 to RGB565 conversion.  This routine takes a YUV 420 planar macroblock
28     and converts it to RGB565.  R:5 bits, G:6 bits, B:5 bits.. packed into shorts
29
30
31     The following calculation is used for the conversion:
32
33       r = clipz((y-oy)*cy  + crv*(v-128))
34       g = clipz((y-oy)*cy  + cgv*(v-128) + cgu*(u-128))
35       b = clipz((y-oy)*cy  + cbu*(u-128))
36
37     y,u,v are pre scaled by a factor of 4 i.e. left shifted to gain precision.
38
39
40     New factorization to eliminate the truncation error which was
41     occurring due to the byteop3p.
42
43
44   1) use byteop16m to subtract quad bytes; it operates on unsigned 8-bit
45    (U8) values, so the offsets need to be renormalized to 8 bits.
46
47   2) scale operands up by a factor of 4 not 8 because Blackfin
48      multiplies include a shift.
49
50   3) compute into the accumulators cy*yx0, cy*yx1
51
52   4) compute each of the linear equations
53       r = clipz((y-oy)*cy  + crv*(v-128))
54
55       g = clipz((y-oy)*cy  + cgv*(v-128) + cgu*(u-128))
56
57       b = clipz((y-oy)*cy  + cbu*(u-128))
58
59      reuse of the accumulators requires that we actually multiply
60      twice: once with an addition and the second time with a subtraction.
61
62      because of this we need to compute the equations in the order R B
63      then G saving the writes for B in the case of 24/32 bit color
64      formats.
65
66     api: yuv2rgb_kind (uint8_t *Y, uint8_t *U, uint8_t *V, int *out,
67                        int dW, uint32_t *coeffs);
68
69         A          B
70         ---        ---
71         i2 = cb    i3 = cr
72         i1 = coeff i0 = y
73
74   Where coeffs have the following layout in memory.
75
76   uint32_t oy,oc,zero,cy,crv,rmask,cbu,bmask,cgu,cgv,gmask;
77
78   coeffs is a pointer to oy.
79
80   The {rgb} masks are only utilized by the 16-bit (565/555) packing routines.
81   Note that the data replication is used to simplify the internal algorithms
82   for the dual-MAC architecture of Blackfin.
83
84   All routines are exported with _ff_bfin_ as a symbol prefix
85
86   rough performance gain compared against -O3:
87
88   2779809/1484290 187.28%
89
90   which translates to ~33c/pel to ~57c/pel for the reference vs 17.5
91   c/pel for the optimized implementations. It is not clear why there is
92   such a huge variation in the reference code on Blackfin; it presumably
93   has to do with the memory system.
94
95 */
96
/*
 * Section selection for the routines below.
 *   mL3 - the ordinary .text section.
 *   mL1 - .l1.text when building for FDPIC (presumably on-chip L1
 *         instruction memory - confirm against the linker script);
 *         otherwise it falls back to mL3 so only one section is used.
 *   MEM - the section used by the yuv2rgb line converters.
 */
97 #define mL3 .text
98 #ifdef __FDPIC__
99 #define mL1 .l1.text
100 #else
101 #define mL1 mL3
102 #endif
103 #define MEM mL1
104
/*
 * DEFUN(fname, where, interface): open a function definition.
 *   fname     - name without prefix; the symbol emitted is _ff_bfin_<fname>.
 *   where     - section to place the function in (mL1 / mL3 / MEM).
 *   interface - C prototype, documentation only; it is not used in the
 *               expansion.
 * The expansion switches section, exports the symbol as a function, aligns
 * it with ".align 8" and ends with the bare label name, so the invocation
 * must be followed by ':'.
 *
 * DEFUN_END(fname): record the symbol size as the distance from the label
 * to the current location.
 */
105 #define DEFUN(fname,where,interface) \
106         .section where;              \
107         .global _ff_bfin_ ## fname;  \
108         .type _ff_bfin_ ## fname, STT_FUNC; \
109         .align 8;                    \
110         _ff_bfin_ ## fname
111
112 #define DEFUN_END(fname) \
113         .size _ff_bfin_ ## fname, . - _ff_bfin_ ## fname
114
115
116 .text
117
/*
 * COEFF_LEN        - byte length of the coefficient table (11 32-bit words);
 *                    loaded into l1 so that i1 wraps around the table.
 * COEFF_REL_CY_OFF - post-modify (in bytes) applied after the last table
 *                    entry is read; with the wrap-around this makes the next
 *                    load fetch cy instead of restarting at oy.
 */
118 #define COEFF_LEN        11*4
119 #define COEFF_REL_CY_OFF 4*4
120
/*
 * fp-relative offsets (after "link 0") at which the yuv2rgb line routines
 * read their 4th, 5th and 6th arguments: out, dW and coeffs.
 */
121 #define ARG_OUT   20
122 #define ARG_W     24
123 #define ARG_COEFF 28
124
/*
 * _ff_bfin_yuv2rgb565_line: convert a run of planar Y/U/V samples into
 * packed 16-bit pixels (field layout as in the diagram inside the loop).
 *
 * Arguments:
 *   Y, U, V - arrive in r0, r1, r2 and are moved to i0, i2, i3.
 *   out     - read from [fp+ARG_OUT] into p1; written one 32-bit word
 *             (two pixels) at a time.
 *   dW      - read from [fp+ARG_W]; the loop runs dW>>2 times, 4 pixels per
 *             iteration, so a remainder of dW % 4 pixels is not converted.
 *   coeffs  - read from [fp+ARG_COEFF]; addressed through i1 with b1/l1 set
 *             so the pointer wraps around the COEFF_LEN-byte table.
 *
 * Each iteration consumes 4 Y bytes, 2 U bytes and 2 V bytes. The loads for
 * the next iteration are issued at the end of the current one, so the
 * routine reads one group (4 Y, 2 U, 2 V bytes) beyond the data it
 * actually converts.
 *
 * NOTE(review): which colour channel ends up in which bit field depends on
 * the coefficient/mask values and on the order in which the caller passes
 * the chroma planes; neither is visible in this file.
 *
 * l1 is reset to 0 before returning so i1 no longer wraps.
 */
125 DEFUN(yuv2rgb565_line,MEM,
126    (uint8_t *Y, uint8_t *U, uint8_t *V, int *out, int dW, uint32_t *coeffs)):
127         link 0;
128         [--sp] = (r7:4);       // save r4-r7, restored before rts
129         p1 = [fp+ARG_OUT];     // p1 = out
130         r3 = [fp+ARG_W];       // r3 = dW
131
132         i0 = r0;               // i0 = Y
133         i2 = r1;               // i2 = U
134         i3 = r2;               // i3 = V
135
136         r0 = [fp+ARG_COEFF];
137         i1 = r0;               // i1 walks the coefficient table
138         b1 = i1;               // table base
139         l1 = COEFF_LEN;        // table length: i1 wraps inside the table
140         m0 = COEFF_REL_CY_OFF; // used once per iteration to skip to cy
141         p0 = r3;
142
143         r0   = [i0++];         // 4Y (one 32-bit word)
144         r1.l = w[i2++];        // 2u
145         r1.h = w[i3++];        // 2v
146         p0 = p0>>2;            // loop count: 4 pixels per iteration
147
148         lsetup (.L0565, .L1565) lc0 = p0;
149
150         /*
151            uint32_t oy,oc,zero,cy,crv,rmask,cbu,bmask,cgu,cgv,gmask
152            r0 -- used to load 4ys
153            r1 -- used to load 2us,2vs
154            r4 -- y3,y2
155            r5 -- y1,y0
156            r6 -- u1,u0
157            r7 -- v1,v0
158         */
159                                                               r2=[i1++]; // oy
160 .L0565:
161         /*
162         rrrrrrrr gggggggg bbbbbbbb
163          5432109876543210
164                     bbbbb >>3
165               gggggggg    <<3
166          rrrrrrrr         <<8
167          rrrrrggggggbbbbb
168         */
169         (r4,r5) = byteop16m (r1:0, r3:2)                   || r3=[i1++]; // oc
170         (r7,r6) = byteop16m (r1:0, r3:2) (r);
171         r5 = r5 << 2 (v);                                                // y1,y0
172         r4 = r4 << 2 (v);                                                // y3,y2
173         r6 = r6 << 2 (v)                                   || r0=[i1++]; // u1,u0, r0=zero
174         r7 = r7 << 2 (v)                                   || r1=[i1++]; // v1,v0  r1=cy
175         /* Y' = y*cy  (first pixel pair: y1,y0 with u0,v0) */
176         a1 = r1.h*r5.h, a0 = r1.l*r5.l                     || r1=[i1++]; // crv
177
178         /* R = Y+ crv*(Cr-128); the following -= restores Y' in a1/a0 */
179         r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l);
180                 a1 -= r1.h*r7.l,          a0 -= r1.l*r7.l  || r5=[i1++]; // rmask
181         r2 = byteop3p(r3:2, r1:0)(LO)                      || r1=[i1++]; // cbu
182         r2 = r2 >> 3 (v);
183         r3 = r2 & r5;
184
185         /* B = Y+ cbu*(Cb-128); the following -= restores Y' in a1/a0 */
186         r2.h = (a1 += r1.h*r6.l), r2.l = (a0 += r1.l*r6.l);
187                 a1 -= r1.h*r6.l,          a0 -= r1.l*r6.l  || r5=[i1++]; // bmask
188         r2 = byteop3p(r3:2, r1:0)(LO)                      || r1=[i1++]; // cgu
189         r2 = r2 << 8 (v);
190         r2 = r2 & r5;
191         r3 = r3 | r2;
192
193         /* G = Y+ cgu*(Cb-128)+cgv*(Cr-128) */
194                 a1 += r1.h*r6.l,          a0 += r1.l*r6.l  || r1=[i1++]; // cgv
195         r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l);
196         r2 = byteop3p(r3:2, r1:0)(LO)                      || r5=[i1++m0]; // gmask; +m0 wraps i1 to cy
197         r2 = r2 << 3 (v);
198         r2 = r2 & r5;
199         r3 = r3 | r2;
200         [p1++]=r3                                          || r1=[i1++]; // cy; two pixels stored
201
202         /* Y' = y*cy  (second pixel pair: y3,y2 with u1,v1) */
203
204         a1 = r1.h*r4.h, a0 = r1.l*r4.l                     || r1=[i1++]; // crv
205
206         /* R = Y+ crv*(Cr-128) */
207         r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h);
208                 a1 -= r1.h*r7.h,          a0 -= r1.l*r7.h  || r5=[i1++]; // rmask
209         r2 = byteop3p(r3:2, r1:0)(LO)                      || r1=[i1++]; // cbu
210         r2 = r2 >> 3 (v);
211         r3 = r2 & r5;
212
213         /* B = Y+ cbu*(Cb-128) */
214         r2.h = (a1 += r1.h*r6.h), r2.l = (a0 += r1.l*r6.h);
215                 a1 -= r1.h*r6.h,          a0 -= r1.l*r6.h  || r5=[i1++]; // bmask
216         r2 = byteop3p(r3:2, r1:0)(LO)                      || r1=[i1++]; // cgu
217         r2 = r2 << 8 (v);
218         r2 = r2 & r5;
219         r3 = r3 | r2;
220
221         /* G = Y+ cgu*(Cb-128)+cgv*(Cr-128) */
222                 a1 += r1.h*r6.h,          a0 += r1.l*r6.h  || r1=[i1++]; // cgv
223         r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h) || r5=[i1++]; // gmask
224         r2 = byteop3p(r3:2, r1:0)(LO)                      || r0   =  [i0++];        // 4Y for the next iteration
225         r2 = r2 << 3 (v)                                   || r1.l = w[i2++];        // 2u
226         r2 = r2 & r5;
227         r3 = r3 | r2;
228         [p1++]=r3                                          || r1.h = w[i3++];        // 2v
229 .L1565:                                                       r2=[i1++]; // oy
230
231         l1 = 0;                // i1 back to non-wrapping addressing
232
233         (r7:4) = [sp++];
234         unlink;
235         rts;
236 DEFUN_END(yuv2rgb565_line)
237
/*
 * _ff_bfin_yuv2rgb555_line: convert a run of planar Y/U/V samples into
 * packed 16-bit pixels with three 5-bit fields (layout as in the diagram
 * inside the loop). The instruction sequence matches the 565 routine except
 * for the field shifts, which are >>3, <<7 and <<2 here.
 *
 * Arguments:
 *   Y, U, V - arrive in r0, r1, r2 and are moved to i0, i2, i3.
 *   out     - read from [fp+ARG_OUT] into p1; written one 32-bit word
 *             (two pixels) at a time.
 *   dW      - read from [fp+ARG_W]; the loop runs dW>>2 times, 4 pixels per
 *             iteration, so a remainder of dW % 4 pixels is not converted.
 *   coeffs  - read from [fp+ARG_COEFF]; addressed through i1 with b1/l1 set
 *             so the pointer wraps around the COEFF_LEN-byte table.
 *
 * The loads for the next iteration are issued at the end of the current one,
 * so the routine reads one group (4 Y, 2 U, 2 V bytes) beyond the data it
 * actually converts.
 *
 * NOTE(review): which colour channel ends up in which bit field depends on
 * the coefficient/mask values and on the order in which the caller passes
 * the chroma planes; neither is visible in this file.
 *
 * l1 is reset to 0 before returning so i1 no longer wraps.
 */
238 DEFUN(yuv2rgb555_line,MEM,
239    (uint8_t *Y, uint8_t *U, uint8_t *V, int *out, int dW, uint32_t *coeffs)):
240         link 0;
241         [--sp] = (r7:4);       // save r4-r7, restored before rts
242         p1 = [fp+ARG_OUT];     // p1 = out
243         r3 = [fp+ARG_W];       // r3 = dW
244
245         i0 = r0;               // i0 = Y
246         i2 = r1;               // i2 = U
247         i3 = r2;               // i3 = V
248
249         r0 = [fp+ARG_COEFF];
250         i1 = r0;               // i1 walks the coefficient table
251         b1 = i1;               // table base
252         l1 = COEFF_LEN;        // table length: i1 wraps inside the table
253         m0 = COEFF_REL_CY_OFF; // used once per iteration to skip to cy
254         p0 = r3;
255
256         r0   = [i0++];         // 4Y (one 32-bit word)
257         r1.l = w[i2++];        // 2u
258         r1.h = w[i3++];        // 2v
259         p0 = p0>>2;            // loop count: 4 pixels per iteration
260
261         lsetup (.L0555, .L1555) lc0 = p0;
262
263         /*
264            uint32_t oy,oc,zero,cy,crv,rmask,cbu,bmask,cgu,cgv,gmask
265            r0 -- used to load 4ys
266            r1 -- used to load 2us,2vs
267            r4 -- y3,y2
268            r5 -- y1,y0
269            r6 -- u1,u0
270            r7 -- v1,v0
271         */
272                                                               r2=[i1++]; // oy
273 .L0555:
274         /*
275         rrrrrrrr gggggggg bbbbbbbb
276          5432109876543210
277                     bbbbb >>3
278                gggggggg   <<2
279           rrrrrrrr        <<7
280          xrrrrrgggggbbbbb
281         */
282
283         (r4,r5) = byteop16m (r1:0, r3:2)                   || r3=[i1++]; // oc
284         (r7,r6) = byteop16m (r1:0, r3:2) (r);
285         r5 = r5 << 2 (v);                                                // y1,y0
286         r4 = r4 << 2 (v);                                                // y3,y2
287         r6 = r6 << 2 (v)                                   || r0=[i1++]; // u1,u0, r0=zero
288         r7 = r7 << 2 (v)                                   || r1=[i1++]; // v1,v0  r1=cy
289         /* Y' = y*cy  (first pixel pair: y1,y0 with u0,v0) */
290         a1 = r1.h*r5.h, a0 = r1.l*r5.l                     || r1=[i1++]; // crv
291
292         /* R = Y+ crv*(Cr-128); the following -= restores Y' in a1/a0 */
293         r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l);
294                 a1 -= r1.h*r7.l,          a0 -= r1.l*r7.l  || r5=[i1++]; // rmask
295         r2 = byteop3p(r3:2, r1:0)(LO)                      || r1=[i1++]; // cbu
296         r2 = r2 >> 3 (v);
297         r3 = r2 & r5;
298
299         /* B = Y+ cbu*(Cb-128); the following -= restores Y' in a1/a0 */
300         r2.h = (a1 += r1.h*r6.l), r2.l = (a0 += r1.l*r6.l);
301                 a1 -= r1.h*r6.l,          a0 -= r1.l*r6.l  || r5=[i1++]; // bmask
302         r2 = byteop3p(r3:2, r1:0)(LO)                      || r1=[i1++]; // cgu
303         r2 = r2 << 7 (v);
304         r2 = r2 & r5;
305         r3 = r3 | r2;
306
307         /* G = Y+ cgu*(Cb-128)+cgv*(Cr-128) */
308                 a1 += r1.h*r6.l,          a0 += r1.l*r6.l  || r1=[i1++]; // cgv
309         r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l);
310         r2 = byteop3p(r3:2, r1:0)(LO)                      || r5=[i1++m0]; // gmask; +m0 wraps i1 to cy
311         r2 = r2 << 2 (v);
312         r2 = r2 & r5;
313         r3 = r3 | r2;
314         [p1++]=r3                                          || r1=[i1++]; // cy; two pixels stored
315
316         /* Y' = y*cy  (second pixel pair: y3,y2 with u1,v1) */
317
318         a1 = r1.h*r4.h, a0 = r1.l*r4.l                     || r1=[i1++]; // crv
319
320         /* R = Y+ crv*(Cr-128) */
321         r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h);
322                 a1 -= r1.h*r7.h,          a0 -= r1.l*r7.h  || r5=[i1++]; // rmask
323         r2 = byteop3p(r3:2, r1:0)(LO)                      || r1=[i1++]; // cbu
324         r2 = r2 >> 3 (v);
325         r3 = r2 & r5;
326
327         /* B = Y+ cbu*(Cb-128) */
328         r2.h = (a1 += r1.h*r6.h), r2.l = (a0 += r1.l*r6.h);
329                 a1 -= r1.h*r6.h,          a0 -= r1.l*r6.h  || r5=[i1++]; // bmask
330         r2 = byteop3p(r3:2, r1:0)(LO)                      || r1=[i1++]; // cgu
331         r2 = r2 << 7 (v);
332         r2 = r2 & r5;
333         r3 = r3 | r2;
334
335         /* G = Y+ cgu*(Cb-128)+cgv*(Cr-128) */
336                 a1 += r1.h*r6.h,          a0 += r1.l*r6.h  || r1=[i1++]; // cgv
337         r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h) || r5=[i1++]; // gmask
338         r2 = byteop3p(r3:2, r1:0)(LO)                      || r0=[i0++];     // 4Y for the next iteration
339         r2 = r2 << 2 (v)                                   || r1.l=w[i2++];  // 2u
340         r2 = r2 & r5;
341         r3 = r3 | r2;
342         [p1++]=r3                                          || r1.h=w[i3++]; // 2v
343
344 .L1555:                                                       r2=[i1++]; // oy
345
346         l1 = 0;                // i1 back to non-wrapping addressing
347
348         (r7:4) = [sp++];
349         unlink;
350         rts;
351 DEFUN_END(yuv2rgb555_line)
352
/*
 * _ff_bfin_yuv2rgb24_line: convert a run of planar Y/U/V samples into
 * 3-byte-per-pixel output.
 *
 * Arguments:
 *   Y, U, V - arrive in r0, r1, r2 and are moved to i0, i2, i3.
 *   out     - read from [fp+ARG_OUT]; written byte by byte through two
 *             pointers, p1 = out and p2 = out+3, so p1 writes the first
 *             pixel of each pair and p2 the second.
 *   dW      - read from [fp+ARG_W]; the loop runs dW>>2 times, 4 pixels
 *             (12 output bytes) per iteration, so a remainder of dW % 4
 *             pixels is not converted.
 *   coeffs  - read from [fp+ARG_COEFF]; addressed through i1 with b1/l1 set
 *             so the pointer wraps around the COEFF_LEN-byte table. The mask
 *             entries are loaded to keep i1 in step but are not applied.
 *
 * Byte order within a pixel is: the crv*Cr result (labelled R), then G, then
 * the cbu*Cb result (labelled B, held in r3 until G has been written).
 * NOTE(review): whether this is RGB or BGR in memory depends on how the
 * caller orders the chroma planes and coefficients - not visible here.
 *
 * The loads for the next iteration are issued at the end of the current one,
 * so the routine reads one group (4 Y, 2 U, 2 V bytes) beyond the data it
 * actually converts.
 *
 * l1 is reset to 0 before returning so i1 no longer wraps.
 */
353 DEFUN(yuv2rgb24_line,MEM,
354    (uint8_t *Y, uint8_t *U, uint8_t *V, int *out, int dW, uint32_t *coeffs)):
355         link 0;
356         [--sp] = (r7:4);       // save r4-r7, restored before rts
357         p1 = [fp+ARG_OUT];     // p1 = out (first pixel of each pair)
358         r3 = [fp+ARG_W];       // r3 = dW
359         p2 = p1;
360         p2 += 3;               // p2 = out+3 (second pixel of each pair)
361
362         i0 = r0;               // i0 = Y
363         i2 = r1;               // i2 = U
364         i3 = r2;               // i3 = V
365
366         r0 = [fp+ARG_COEFF]; // coeff buffer
367         i1 = r0;
368         b1 = i1;               // table base
369         l1 = COEFF_LEN;        // table length: i1 wraps inside the table
370         m0 = COEFF_REL_CY_OFF; // used once per iteration to skip to cy
371         p0 = r3;
372
373         r0   = [i0++];         // 4Y (one 32-bit word)
374         r1.l = w[i2++];        // 2u
375         r1.h = w[i3++];        // 2v
376         p0 = p0>>2;            // loop count: 4 pixels per iteration
377
378         lsetup (.L0888, .L1888) lc0 = p0;
379
380         /*
381            uint32_t oy,oc,zero,cy,crv,rmask,cbu,bmask,cgu,cgv,gmask
382            r0 -- used to load 4ys
383            r1 -- used to load 2us,2vs
384            r4 -- y3,y2
385            r5 -- y1,y0
386            r6 -- u1,u0
387            r7 -- v1,v0
388         */
389                                                               r2=[i1++]; // oy
390 .L0888:
391         (r4,r5) = byteop16m (r1:0, r3:2)                   || r3=[i1++]; // oc
392         (r7,r6) = byteop16m (r1:0, r3:2) (r);
393         r5 = r5 << 2 (v);               // y1,y0
394         r4 = r4 << 2 (v);               // y3,y2
395         r6 = r6 << 2 (v) || r0=[i1++];  // u1,u0, r0=zero
396         r7 = r7 << 2 (v) || r1=[i1++];  // v1,v0  r1=cy
397
398         /* Y' = y*cy  (first pixel pair: y1,y0 with u0,v0) */
399         a1 = r1.h*r5.h, a0 = r1.l*r5.l                     || r1=[i1++]; // crv
400
401         /* R = Y+ crv*(Cr-128); the following -= restores Y' in a1/a0 */
402         r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l);
403                 a1 -= r1.h*r7.l,          a0 -= r1.l*r7.l  || r5=[i1++]; // rmask
404         r2 = byteop3p(r3:2, r1:0)(LO)                      || r1=[i1++]; // cbu
405         r2=r2>>16 || B[p1++]=r2;        // low-half byte to pixel 0, then bring down the high half
406                      B[p2++]=r2;        // high-half byte to pixel 1
407
408         /* B = Y+ cbu*(Cb-128); kept in r3 and written after G */
409         r2.h = (a1 += r1.h*r6.l), r2.l = (a0 += r1.l*r6.l);
410                 a1 -= r1.h*r6.l,          a0 -= r1.l*r6.l  || r5=[i1++]; // bmask
411         r3 = byteop3p(r3:2, r1:0)(LO)                      || r1=[i1++]; // cgu
412
413         /* G = Y+ cgu*(Cb-128)+cgv*(Cr-128) */
414                 a1 += r1.h*r6.l,          a0 += r1.l*r6.l  || r1=[i1++]; // cgv
415         r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l);
416         r2 = byteop3p(r3:2, r1:0)(LO)                      || r5=[i1++m0]; // gmask; +m0 wraps i1 past oy,oc,zero to cy
417
418         r2=r2>>16 || B[p1++]=r2;
419                      B[p2++]=r2;
420
421         r3=r3>>16 || B[p1++]=r3;
422                      B[p2++]=r3                            || r1=[i1++]; // cy
423
424         p1+=3;                          // skip the pixel written through p2
425         p2+=3;                          // skip the pixel p1 writes next
426         /* Y' = y*cy  (second pixel pair: y3,y2 with u1,v1) */
427         a1 = r1.h*r4.h, a0 = r1.l*r4.l                     || r1=[i1++]; // crv
428
429         /* R = Y+ crv*(Cr-128) */
430         r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h);
431                 a1 -= r1.h*r7.h,          a0 -= r1.l*r7.h  || r5=[i1++]; // rmask
432         r2 = byteop3p(r3:2, r1:0)(LO)                      || r1=[i1++]; // cbu
433         r2=r2>>16 || B[p1++]=r2;
434         B[p2++]=r2;
435
436         /* B = Y+ cbu*(Cb-128) */
437         r2.h = (a1 += r1.h*r6.h), r2.l = (a0 += r1.l*r6.h);
438                 a1 -= r1.h*r6.h,          a0 -= r1.l*r6.h  || r5=[i1++]; // bmask
439         r3 = byteop3p(r3:2, r1:0)(LO)                      || r1=[i1++]; // cgu
440
441         /* G = Y+ cgu*(Cb-128)+cgv*(Cr-128) */
442                 a1 += r1.h*r6.h,          a0 += r1.l*r6.h  || r1=[i1++]; // cgv
443         r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h);
444         r2 = byteop3p(r3:2, r1:0)(LO)                      || r5=[i1++]; // gmask
445         r2=r2>>16 || B[p1++]=r2 || r0 = [i0++];    // 4y for the next iteration
446                      B[p2++]=r2 || r1.l = w[i2++]; // 2u
447         r3=r3>>16 || B[p1++]=r3 || r1.h = w[i3++]; // 2v
448                      B[p2++]=r3 || r2=[i1++];      // oy
449
450         p1+=3;
451 .L1888: p2+=3;
452
453         l1 = 0;                // i1 back to non-wrapping addressing
454
455         (r7:4) = [sp++];
456         unlink;
457         rts;
458 DEFUN_END(yuv2rgb24_line)
459
460
461
/*
 * fp-relative offsets (after "link 0") at which uyvytoyv12 / yuyvtoyv12
 * read their 4th to 9th arguments. src, ydst and udst arrive in r0-r2.
 */
462 #define ARG_vdst        20
463 #define ARG_width       24
464 #define ARG_height      28
465 #define ARG_lumStride   32
466 #define ARG_chromStride 36
467 #define ARG_srcStride   40
468
/*
 * _ff_bfin_uyvytoyv12: split packed UYVY into separate Y, U and V planes,
 * two source lines per outer iteration.
 *
 * Arguments:
 *   src, ydst, udst - arrive in r0, r1, r2.
 *   vdst, width, height, lumStride, chromStride, srcStride - read from the
 *                     stack at the ARG_* offsets.
 *
 * Loop structure: the outer loop runs height>>1 times; the inner loop runs
 * width>>2 times and handles 8 source bytes (4 pixels) from each of the two
 * lines per pass. Luma is written for both lines (p0 = top, p1 = bottom);
 * chroma is taken from the byteop1p combination of the top and bottom line
 * (a byte-wise average per the Blackfin ISA) and written once, two U bytes
 * and two V bytes per pass.
 *
 * The source words for the next pass are loaded before the current pass
 * finishes, so 8 bytes past the end of each source line are read.
 *
 * NOTE(review): the end-of-row pointer updates only land on the next line
 * pair when the buffers are tightly packed. After the inner loop i0/i1 have
 * advanced 2*width+8 bytes and are then bumped by srcStride-8, i.e. they
 * move 2*width+srcStride in total, which equals 2*srcStride only if
 * srcStride == 2*width. Likewise p0/p1 advance width+lumStride, which equals
 * 2*lumStride only if lumStride == width. The chroma pointers are advanced
 * by chromStride-(width>>1) and are correct for any chromStride. Confirm
 * that callers never pass padded luma/source strides.
 */
469 DEFUN(uyvytoyv12, mL3,  (const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
470                          long width, long height,
471                          long lumStride, long chromStride, long srcStride)):
472         link 0;
473         [--sp] = (r7:4,p5:4);  // save r4-r7 and p4-p5, restored before rts
474
475         p0 = r1;       // Y top even
476
477         i2 = r2; // *u
478         r2 = [fp + ARG_vdst];
479         i3 = r2; // *v
480
481         r1 = [fp + ARG_srcStride];
482         r2 = r0 + r1;  // start of the second source line
483         r1 += -8;  // i0,i1 is pre read need to correct
484         m0 = r1;
485
486         i0 = r0;  // uyvy_T even
487         i1 = r2;  // uyvy_B odd
488
489         p2 = [fp + ARG_lumStride];
490         p1 = p0 + p2;  // Y bot odd
491
492         p5 = [fp + ARG_width];
493         p4 = [fp + ARG_height];
494         r0 = p5;
495         p4 = p4 >> 1;  // outer count: pairs of lines
496         p5 = p5 >> 2;  // inner count: groups of 4 pixels
497
498         r2 = [fp + ARG_chromStride];
499         r0 = r0 >> 1;  // chroma bytes written per line = width/2
500         r2 = r2 - r0;
501         m1 = r2;       // chroma end-of-row adjustment
502
503         /*   I0,I1 - src input line pointers
504          *   p0,p1 - luma output line pointers
505          *   I2    - dstU
506          *   I3    - dstV
507          */
508
509         lsetup (0f, 1f) lc1 = p4;   // H/2
510 0:        r0 = [i0++] || r2 = [i1++];
511           r1 = [i0++] || r3 = [i1++];
512           r4 = byteop1p(r1:0, r3:2);        // top/bottom combination, first word
513           r5 = byteop1p(r1:0, r3:2) (r);    // top/bottom combination, second word
514           lsetup (2f, 3f) lc0 = p5; // W/4
515 2:          r0 = r0 >> 8(v);                // keep the upper byte of each half (Y in UYVY)
516             r1 = r1 >> 8(v);
517             r2 = r2 >> 8(v);
518             r3 = r3 >> 8(v);
519             r0 = bytepack(r0, r1);
520             r2 = bytepack(r2, r3)         ||  [p0++] = r0;    // yyyy
521             r6 = pack(r5.l, r4.l)         ||  [p1++] = r2;    // yyyy
522             r7 = pack(r5.h, r4.h)         ||  r0 = [i0++] || r2 = [i1++];
523             r6 = bytepack(r6, r7)         ||  r1 = [i0++] || r3 = [i1++];
524             r4 = byteop1p(r1:0, r3:2)     ||  w[i2++] = r6.l; // uu
525 3:          r5 = byteop1p(r1:0, r3:2) (r) ||  w[i3++] = r6.h; // vv
526
527           i0 += m0;
528           i1 += m0;
529           i2 += m1;
530           i3 += m1;
531           p0 = p0 + p2;
532 1:        p1 = p1 + p2;
533
534         (r7:4,p5:4) = [sp++];
535         unlink;
536         rts;
537 DEFUN_END(uyvytoyv12)
538
/*
 * _ff_bfin_yuyvtoyv12: split packed YUYV into separate Y, U and V planes,
 * two source lines per outer iteration.
 *
 * Arguments:
 *   src, ydst, udst - arrive in r0, r1, r2.
 *   vdst, width, height, lumStride, chromStride, srcStride - read from the
 *                     stack at the ARG_* offsets.
 *
 * Loop structure: the outer loop runs height>>1 times; the inner loop runs
 * width>>2 times and handles 8 source bytes (4 pixels) from each of the two
 * lines per pass. Luma is packed directly from the loaded words and written
 * for both lines (p0 = top, p1 = bottom); the words are then shifted right
 * by 8 per half so the chroma bytes line up, combined top/bottom with
 * byteop1p (a byte-wise average per the Blackfin ISA) and written once, two
 * U bytes and two V bytes per pass.
 *
 * The source words for the next pass are loaded before the current pass
 * finishes, so 8 bytes past the end of each source line are read.
 *
 * NOTE(review): the end-of-row pointer updates only land on the next line
 * pair when the buffers are tightly packed. After the inner loop i0/i1 have
 * advanced 2*width+8 bytes and are then bumped by srcStride-8, i.e. they
 * move 2*width+srcStride in total, which equals 2*srcStride only if
 * srcStride == 2*width. Likewise p0/p1 advance width+lumStride, which equals
 * 2*lumStride only if lumStride == width. The chroma pointers are advanced
 * by chromStride-(width>>1) and are correct for any chromStride. Confirm
 * that callers never pass padded luma/source strides.
 */
539 DEFUN(yuyvtoyv12, mL3,  (const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
540                          long width, long height,
541                          long lumStride, long chromStride, long srcStride)):
542         link 0;
543         [--sp] = (r7:4,p5:4);  // save r4-r7 and p4-p5, restored before rts
544
545         p0 = r1;       // Y top even
546
547         i2 = r2; // *u
548         r2 = [fp + ARG_vdst];
549         i3 = r2; // *v
550
551         r1 = [fp + ARG_srcStride];
552         r2 = r0 + r1;  // start of the second source line
553         r1 += -8;  // i0,i1 is pre read need to correct
554         m0 = r1;
555
556         i0 = r0;  // yuyv top (even) line
557         i1 = r2;  // yuyv bottom (odd) line
558
559         p2 = [fp + ARG_lumStride];
560         p1 = p0 + p2;  // Y bot odd
561
562         p5 = [fp + ARG_width];
563         p4 = [fp + ARG_height];
564         r0 = p5;
565         p4 = p4 >> 1;  // outer count: pairs of lines
566         p5 = p5 >> 2;  // inner count: groups of 4 pixels
567
568         r2 = [fp + ARG_chromStride];
569         r0 = r0 >> 1;  // chroma bytes written per line = width/2
570         r2 = r2 - r0;
571         m1 = r2;       // chroma end-of-row adjustment
572
573         /*   I0,I1 - src input line pointers
574          *   p0,p1 - luma output line pointers
575          *   I2    - dstU
576          *   I3    - dstV
577          */
578
579         lsetup (0f, 1f) lc1 = p4;   // H/2
580 0:        r0 = [i0++] || r2 = [i1++];
581           r1 = [i0++] || r3 = [i1++];
582           r4 = bytepack(r0, r1);            // luma of the top line
583           r5 = bytepack(r2, r3);            // luma of the bottom line
584           lsetup (2f, 3f) lc0 = p5; // W/4
585 2:          r0 = r0 >> 8(v) || [p0++] = r4;  // yyyy-even
586             r1 = r1 >> 8(v) || [p1++] = r5;  // yyyy-odd
587             r2 = r2 >> 8(v);
588             r3 = r3 >> 8(v);
589             r4 = byteop1p(r1:0, r3:2);      // top/bottom chroma combination, first word
590             r5 = byteop1p(r1:0, r3:2) (r);  // top/bottom chroma combination, second word
591             r6 = pack(r5.l, r4.l);
592             r7 = pack(r5.h, r4.h)         ||  r0 = [i0++] || r2 = [i1++];
593             r6 = bytepack(r6, r7)         ||  r1 = [i0++] || r3 = [i1++];
594             r4 = bytepack(r0, r1)         ||  w[i2++] = r6.l; // uu
595 3:          r5 = bytepack(r2, r3)         ||  w[i3++] = r6.h; // vv
596
597           i0 += m0;
598           i1 += m0;
599           i2 += m1;
600           i3 += m1;
601           p0 = p0 + p2;
602 1:        p1 = p1 + p2;
603
604         (r7:4,p5:4) = [sp++];
605         unlink;
606         rts;
607 DEFUN_END(yuyvtoyv12)