swscale: remove if (c->dstFormat) branch from yuv2packed[12X]().
authorRonald S. Bultje <rsbultje@gmail.com>
Tue, 24 May 2011 20:02:47 +0000 (16:02 -0400)
committerRonald S. Bultje <rsbultje@gmail.com>
Thu, 26 May 2011 13:30:14 +0000 (09:30 -0400)
This allows cutting up the function in much smaller and easier-
to-maintain chunks.

libswscale/x86/swscale_template.c

index 355a1f5..2b6d7e4 100644 (file)
@@ -901,15 +901,13 @@ static inline void RENAME(yuv2yuv1)(SwsContext *c, const int16_t *lumSrc, const
 /**
  * vertical scale YV12 to RGB
  */
-static inline void RENAME(yuv2packedX_ar)(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize,
+static inline void RENAME(yuv2rgb32_X_ar)(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize,
                                        const int16_t *chrFilter, const int16_t **chrSrc, int chrFilterSize,
                                        const int16_t **alpSrc, uint8_t *dest, long dstW, long dstY)
 {
     x86_reg dummy=0;
     x86_reg dstW_reg = dstW;
 
-            switch(c->dstFormat) {
-            case PIX_FMT_RGB32:
                 if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
                     YSCALEYUV2PACKEDX_ACCURATE
                     YSCALEYUV2RGBX
@@ -932,8 +930,15 @@ static inline void RENAME(yuv2packedX_ar)(SwsContext *c, const int16_t *lumFilte
 
                     YSCALEYUV2PACKEDX_END
                 }
-                return;
-            case PIX_FMT_BGR24:
+}
+
+static inline void RENAME(yuv2bgr24_X_ar)(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize,
+                                          const int16_t *chrFilter, const int16_t **chrSrc, int chrFilterSize,
+                                          const int16_t **alpSrc, uint8_t *dest, long dstW, long dstY)
+{
+    x86_reg dummy=0;
+    x86_reg dstW_reg = dstW;
+
                 YSCALEYUV2PACKEDX_ACCURATE
                 YSCALEYUV2RGBX
                 "pxor %%mm7, %%mm7 \n\t"
@@ -947,8 +952,16 @@ static inline void RENAME(yuv2packedX_ar)(SwsContext *c, const int16_t *lumFilte
                 "r" (dest), "m" (dstW_reg)
                 : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
                 );
-                return;
-            case PIX_FMT_RGB555:
+}
+
+
+static inline void RENAME(yuv2rgb555_X_ar)(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize,
+                                          const int16_t *chrFilter, const int16_t **chrSrc, int chrFilterSize,
+                                          const int16_t **alpSrc, uint8_t *dest, long dstW, long dstY)
+{
+    x86_reg dummy=0;
+    x86_reg dstW_reg = dstW;
+
                 YSCALEYUV2PACKEDX_ACCURATE
                 YSCALEYUV2RGBX
                 "pxor %%mm7, %%mm7 \n\t"
@@ -961,8 +974,15 @@ static inline void RENAME(yuv2packedX_ar)(SwsContext *c, const int16_t *lumFilte
 
                 WRITERGB15(%4, %5, %%REGa)
                 YSCALEYUV2PACKEDX_END
-                return;
-            case PIX_FMT_RGB565:
+}
+
+static inline void RENAME(yuv2rgb565_X_ar)(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize,
+                                          const int16_t *chrFilter, const int16_t **chrSrc, int chrFilterSize,
+                                          const int16_t **alpSrc, uint8_t *dest, long dstW, long dstY)
+{
+    x86_reg dummy=0;
+    x86_reg dstW_reg = dstW;
+
                 YSCALEYUV2PACKEDX_ACCURATE
                 YSCALEYUV2RGBX
                 "pxor %%mm7, %%mm7 \n\t"
@@ -975,8 +995,15 @@ static inline void RENAME(yuv2packedX_ar)(SwsContext *c, const int16_t *lumFilte
 
                 WRITERGB16(%4, %5, %%REGa)
                 YSCALEYUV2PACKEDX_END
-                return;
-            case PIX_FMT_YUYV422:
+}
+
+static inline void RENAME(yuv2yuyv422_X_ar)(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize,
+                                          const int16_t *chrFilter, const int16_t **chrSrc, int chrFilterSize,
+                                          const int16_t **alpSrc, uint8_t *dest, long dstW, long dstY)
+{
+    x86_reg dummy=0;
+    x86_reg dstW_reg = dstW;
+
                 YSCALEYUV2PACKEDX_ACCURATE
                 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
 
@@ -986,23 +1013,15 @@ static inline void RENAME(yuv2packedX_ar)(SwsContext *c, const int16_t *lumFilte
                 "psraw $3, %%mm7    \n\t"
                 WRITEYUY2(%4, %5, %%REGa)
                 YSCALEYUV2PACKEDX_END
-                return;
-            }
-
-    yuv2packedXinC(c, lumFilter, lumSrc, lumFilterSize,
-                   chrFilter, chrSrc, chrFilterSize,
-                   alpSrc, dest, dstW, dstY);
 }
 
-static inline void RENAME(yuv2packedX)(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize,
+static inline void RENAME(yuv2rgb32_X)(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize,
                                        const int16_t *chrFilter, const int16_t **chrSrc, int chrFilterSize,
                                        const int16_t **alpSrc, uint8_t *dest, long dstW, long dstY)
 {
     x86_reg dummy=0;
     x86_reg dstW_reg = dstW;
 
-        switch(c->dstFormat) {
-            case PIX_FMT_RGB32:
                 if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
                     YSCALEYUV2PACKEDX
                     YSCALEYUV2RGBX
@@ -1019,8 +1038,15 @@ static inline void RENAME(yuv2packedX)(SwsContext *c, const int16_t *lumFilter,
                     WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
                     YSCALEYUV2PACKEDX_END
                 }
-                return;
-            case PIX_FMT_BGR24:
+}
+
+static inline void RENAME(yuv2bgr24_X)(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize,
+                                       const int16_t *chrFilter, const int16_t **chrSrc, int chrFilterSize,
+                                       const int16_t **alpSrc, uint8_t *dest, long dstW, long dstY)
+{
+    x86_reg dummy=0;
+    x86_reg dstW_reg = dstW;
+
                 YSCALEYUV2PACKEDX
                 YSCALEYUV2RGBX
                 "pxor                    %%mm7, %%mm7       \n\t"
@@ -1033,8 +1059,15 @@ static inline void RENAME(yuv2packedX)(SwsContext *c, const int16_t *lumFilter,
                 "r" (dest),  "m" (dstW_reg)
                 : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
                 );
-                return;
-            case PIX_FMT_RGB555:
+}
+
+static inline void RENAME(yuv2rgb555_X)(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize,
+                                       const int16_t *chrFilter, const int16_t **chrSrc, int chrFilterSize,
+                                       const int16_t **alpSrc, uint8_t *dest, long dstW, long dstY)
+{
+    x86_reg dummy=0;
+    x86_reg dstW_reg = dstW;
+
                 YSCALEYUV2PACKEDX
                 YSCALEYUV2RGBX
                 "pxor %%mm7, %%mm7 \n\t"
@@ -1047,8 +1080,15 @@ static inline void RENAME(yuv2packedX)(SwsContext *c, const int16_t *lumFilter,
 
                 WRITERGB15(%4, %5, %%REGa)
                 YSCALEYUV2PACKEDX_END
-                return;
-            case PIX_FMT_RGB565:
+}
+
+static inline void RENAME(yuv2rgb565_X)(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize,
+                                       const int16_t *chrFilter, const int16_t **chrSrc, int chrFilterSize,
+                                       const int16_t **alpSrc, uint8_t *dest, long dstW, long dstY)
+{
+    x86_reg dummy=0;
+    x86_reg dstW_reg = dstW;
+
                 YSCALEYUV2PACKEDX
                 YSCALEYUV2RGBX
                 "pxor %%mm7, %%mm7 \n\t"
@@ -1061,8 +1101,15 @@ static inline void RENAME(yuv2packedX)(SwsContext *c, const int16_t *lumFilter,
 
                 WRITERGB16(%4, %5, %%REGa)
                 YSCALEYUV2PACKEDX_END
-                return;
-            case PIX_FMT_YUYV422:
+}
+
+static inline void RENAME(yuv2yuyv422_X)(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize,
+                                       const int16_t *chrFilter, const int16_t **chrSrc, int chrFilterSize,
+                                       const int16_t **alpSrc, uint8_t *dest, long dstW, long dstY)
+{
+    x86_reg dummy=0;
+    x86_reg dstW_reg = dstW;
+
                 YSCALEYUV2PACKEDX
                 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
 
@@ -1072,23 +1119,14 @@ static inline void RENAME(yuv2packedX)(SwsContext *c, const int16_t *lumFilter,
                 "psraw $3, %%mm7    \n\t"
                 WRITEYUY2(%4, %5, %%REGa)
                 YSCALEYUV2PACKEDX_END
-                return;
-            }
-
-    yuv2packedXinC(c, lumFilter, lumSrc, lumFilterSize,
-                   chrFilter, chrSrc, chrFilterSize,
-                   alpSrc, dest, dstW, dstY);
 }
 
 /**
  * vertical bilinear scale YV12 to RGB
  */
-static inline void RENAME(yuv2packed2)(SwsContext *c, const uint16_t *buf0, const uint16_t *buf1, const uint16_t *uvbuf0, const uint16_t *uvbuf1,
+static inline void RENAME(yuv2rgb32_2)(SwsContext *c, const uint16_t *buf0, const uint16_t *buf1, const uint16_t *uvbuf0, const uint16_t *uvbuf1,
                           const uint16_t *abuf0, const uint16_t *abuf1, uint8_t *dest, int dstW, int yalpha, int uvalpha, int y)
 {
-        switch(c->dstFormat) {
-        //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
-        case PIX_FMT_RGB32:
             if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
 #if ARCH_X86_64
                 __asm__ volatile(
@@ -1145,8 +1183,12 @@ static inline void RENAME(yuv2packed2)(SwsContext *c, const uint16_t *buf0, cons
                     "a" (&c->redDither)
                 );
             }
-            return;
-        case PIX_FMT_BGR24:
+}
+
+static inline void RENAME(yuv2bgr24_2)(SwsContext *c, const uint16_t *buf0, const uint16_t *buf1, const uint16_t *uvbuf0, const uint16_t *uvbuf1,
+                                       const uint16_t *abuf0, const uint16_t *abuf1, uint8_t *dest, int dstW, int yalpha, int uvalpha, int y)
+{
+            //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
             __asm__ volatile(
                 "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
                 "mov        %4, %%"REG_b"               \n\t"
@@ -1159,8 +1201,12 @@ static inline void RENAME(yuv2packed2)(SwsContext *c, const uint16_t *buf0, cons
                 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                 "a" (&c->redDither)
             );
-            return;
-        case PIX_FMT_RGB555:
+}
+
+static inline void RENAME(yuv2rgb555_2)(SwsContext *c, const uint16_t *buf0, const uint16_t *buf1, const uint16_t *uvbuf0, const uint16_t *uvbuf1,
+                                       const uint16_t *abuf0, const uint16_t *abuf1, uint8_t *dest, int dstW, int yalpha, int uvalpha, int y)
+{
+            //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
             __asm__ volatile(
                 "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
                 "mov        %4, %%"REG_b"               \n\t"
@@ -1181,8 +1227,12 @@ static inline void RENAME(yuv2packed2)(SwsContext *c, const uint16_t *buf0, cons
                 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                 "a" (&c->redDither)
             );
-            return;
-        case PIX_FMT_RGB565:
+}
+
+static inline void RENAME(yuv2rgb565_2)(SwsContext *c, const uint16_t *buf0, const uint16_t *buf1, const uint16_t *uvbuf0, const uint16_t *uvbuf1,
+                                       const uint16_t *abuf0, const uint16_t *abuf1, uint8_t *dest, int dstW, int yalpha, int uvalpha, int y)
+{
+            //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
             __asm__ volatile(
                 "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
                 "mov        %4, %%"REG_b"               \n\t"
@@ -1202,8 +1252,12 @@ static inline void RENAME(yuv2packed2)(SwsContext *c, const uint16_t *buf0, cons
                 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                 "a" (&c->redDither)
             );
-            return;
-        case PIX_FMT_YUYV422:
+}
+
+static inline void RENAME(yuv2yuyv422_2)(SwsContext *c, const uint16_t *buf0, const uint16_t *buf1, const uint16_t *uvbuf0, const uint16_t *uvbuf1,
+                                       const uint16_t *abuf0, const uint16_t *abuf1, uint8_t *dest, int dstW, int yalpha, int uvalpha, int y)
+{
+            //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
             __asm__ volatile(
                 "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
                 "mov %4, %%"REG_b"                        \n\t"
@@ -1215,24 +1269,17 @@ static inline void RENAME(yuv2packed2)(SwsContext *c, const uint16_t *buf0, cons
                 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                 "a" (&c->redDither)
             );
-            return;
-        }
-
-    yuv2packed2_c(c, buf0, buf1, uvbuf0, uvbuf1, abuf0, abuf1,
-                  dest, dstW, yalpha, uvalpha, y);
 }
 
 /**
  * YV12 to RGB without scaling or interpolating
  */
-static inline void RENAME(yuv2packed1)(SwsContext *c, const uint16_t *buf0, const uint16_t *uvbuf0, const uint16_t *uvbuf1,
+static inline void RENAME(yuv2rgb32_1)(SwsContext *c, const uint16_t *buf0, const uint16_t *uvbuf0, const uint16_t *uvbuf1,
                           const uint16_t *abuf0, uint8_t *dest, int dstW, int uvalpha, enum PixelFormat dstFormat, int flags, int y)
 {
         const uint16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
 
         if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
-            switch(dstFormat) {
-            case PIX_FMT_RGB32:
                 if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
                     __asm__ volatile(
                         "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
@@ -1262,8 +1309,45 @@ static inline void RENAME(yuv2packed1)(SwsContext *c, const uint16_t *buf0, cons
                         "a" (&c->redDither)
                     );
                 }
-                return;
-            case PIX_FMT_BGR24:
+        } else {
+                if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
+                    __asm__ volatile(
+                        "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
+                        "mov        %4, %%"REG_b"               \n\t"
+                        "push %%"REG_BP"                        \n\t"
+                        YSCALEYUV2RGB1b(%%REGBP, %5)
+                        YSCALEYUV2RGB1_ALPHA(%%REGBP)
+                        WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
+                        "pop %%"REG_BP"                         \n\t"
+                        "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
+
+                        :: "c" (buf0), "d" (abuf0), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
+                        "a" (&c->redDither)
+                    );
+                } else {
+                    __asm__ volatile(
+                        "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
+                        "mov        %4, %%"REG_b"               \n\t"
+                        "push %%"REG_BP"                        \n\t"
+                        YSCALEYUV2RGB1b(%%REGBP, %5)
+                        "pcmpeqd %%mm7, %%mm7                   \n\t"
+                        WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
+                        "pop %%"REG_BP"                         \n\t"
+                        "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
+
+                        :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
+                        "a" (&c->redDither)
+                    );
+                }
+        }
+}
+
+static inline void RENAME(yuv2bgr24_1)(SwsContext *c, const uint16_t *buf0, const uint16_t *uvbuf0, const uint16_t *uvbuf1,
+                                       const uint16_t *abuf0, uint8_t *dest, int dstW, int uvalpha, enum PixelFormat dstFormat, int flags, int y)
+{
+        const uint16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
+
+        if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
                 __asm__ volatile(
                     "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
                     "mov        %4, %%"REG_b"               \n\t"
@@ -1277,29 +1361,29 @@ static inline void RENAME(yuv2packed1)(SwsContext *c, const uint16_t *buf0, cons
                     :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                     "a" (&c->redDither)
                 );
-                return;
-            case PIX_FMT_RGB555:
+        } else {
                 __asm__ volatile(
                     "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
                     "mov        %4, %%"REG_b"               \n\t"
                     "push %%"REG_BP"                        \n\t"
-                    YSCALEYUV2RGB1(%%REGBP, %5)
+                    YSCALEYUV2RGB1b(%%REGBP, %5)
                     "pxor    %%mm7, %%mm7                   \n\t"
-                    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
-#ifdef DITHER1XBPP
-                    "paddusb "BLUE_DITHER"(%5), %%mm2      \n\t"
-                    "paddusb "GREEN_DITHER"(%5), %%mm4      \n\t"
-                    "paddusb "RED_DITHER"(%5), %%mm5      \n\t"
-#endif
-                    WRITERGB15(%%REGb, 8280(%5), %%REGBP)
+                    WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
                     "pop %%"REG_BP"                         \n\t"
                     "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
 
                     :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                     "a" (&c->redDither)
                 );
-                return;
-            case PIX_FMT_RGB565:
+        }
+}
+
+static inline void RENAME(yuv2rgb555_1)(SwsContext *c, const uint16_t *buf0, const uint16_t *uvbuf0, const uint16_t *uvbuf1,
+                                       const uint16_t *abuf0, uint8_t *dest, int dstW, int uvalpha, enum PixelFormat dstFormat, int flags, int y)
+{
+        const uint16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
+
+        if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
                 __asm__ volatile(
                     "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
                     "mov        %4, %%"REG_b"               \n\t"
@@ -1312,79 +1396,63 @@ static inline void RENAME(yuv2packed1)(SwsContext *c, const uint16_t *buf0, cons
                     "paddusb "GREEN_DITHER"(%5), %%mm4      \n\t"
                     "paddusb "RED_DITHER"(%5), %%mm5      \n\t"
 #endif
-
-                    WRITERGB16(%%REGb, 8280(%5), %%REGBP)
+                    WRITERGB15(%%REGb, 8280(%5), %%REGBP)
                     "pop %%"REG_BP"                         \n\t"
                     "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
 
                     :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                     "a" (&c->redDither)
                 );
-                return;
-            case PIX_FMT_YUYV422:
+        } else {
                 __asm__ volatile(
                     "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
                     "mov        %4, %%"REG_b"               \n\t"
                     "push %%"REG_BP"                        \n\t"
-                    YSCALEYUV2PACKED1(%%REGBP, %5)
-                    WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
+                    YSCALEYUV2RGB1b(%%REGBP, %5)
+                    "pxor    %%mm7, %%mm7                   \n\t"
+                    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
+#ifdef DITHER1XBPP
+                    "paddusb "BLUE_DITHER"(%5), %%mm2      \n\t"
+                    "paddusb "GREEN_DITHER"(%5), %%mm4      \n\t"
+                    "paddusb "RED_DITHER"(%5), %%mm5      \n\t"
+#endif
+                    WRITERGB15(%%REGb, 8280(%5), %%REGBP)
                     "pop %%"REG_BP"                         \n\t"
                     "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
 
                     :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                     "a" (&c->redDither)
                 );
-                return;
-            }
-        } else {
-            switch(dstFormat) {
-            case PIX_FMT_RGB32:
-                if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
-                    __asm__ volatile(
-                        "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
-                        "mov        %4, %%"REG_b"               \n\t"
-                        "push %%"REG_BP"                        \n\t"
-                        YSCALEYUV2RGB1b(%%REGBP, %5)
-                        YSCALEYUV2RGB1_ALPHA(%%REGBP)
-                        WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
-                        "pop %%"REG_BP"                         \n\t"
-                        "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
+        }
+}
 
-                        :: "c" (buf0), "d" (abuf0), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
-                        "a" (&c->redDither)
-                    );
-                } else {
-                    __asm__ volatile(
-                        "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
-                        "mov        %4, %%"REG_b"               \n\t"
-                        "push %%"REG_BP"                        \n\t"
-                        YSCALEYUV2RGB1b(%%REGBP, %5)
-                        "pcmpeqd %%mm7, %%mm7                   \n\t"
-                        WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
-                        "pop %%"REG_BP"                         \n\t"
-                        "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
+static inline void RENAME(yuv2rgb565_1)(SwsContext *c, const uint16_t *buf0, const uint16_t *uvbuf0, const uint16_t *uvbuf1,
+                                       const uint16_t *abuf0, uint8_t *dest, int dstW, int uvalpha, enum PixelFormat dstFormat, int flags, int y)
+{
+        const uint16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
 
-                        :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
-                        "a" (&c->redDither)
-                    );
-                }
-                return;
-            case PIX_FMT_BGR24:
+        if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
                 __asm__ volatile(
                     "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
                     "mov        %4, %%"REG_b"               \n\t"
                     "push %%"REG_BP"                        \n\t"
-                    YSCALEYUV2RGB1b(%%REGBP, %5)
+                    YSCALEYUV2RGB1(%%REGBP, %5)
                     "pxor    %%mm7, %%mm7                   \n\t"
-                    WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
+                    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
+#ifdef DITHER1XBPP
+                    "paddusb "BLUE_DITHER"(%5), %%mm2      \n\t"
+                    "paddusb "GREEN_DITHER"(%5), %%mm4      \n\t"
+                    "paddusb "RED_DITHER"(%5), %%mm5      \n\t"
+#endif
+
+                    WRITERGB16(%%REGb, 8280(%5), %%REGBP)
                     "pop %%"REG_BP"                         \n\t"
                     "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
 
                     :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                     "a" (&c->redDither)
                 );
-                return;
-            case PIX_FMT_RGB555:
+        } else {
                 __asm__ volatile(
                     "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
                     "mov        %4, %%"REG_b"               \n\t"
@@ -1397,37 +1465,36 @@ static inline void RENAME(yuv2packed1)(SwsContext *c, const uint16_t *buf0, cons
                     "paddusb "GREEN_DITHER"(%5), %%mm4      \n\t"
                     "paddusb "RED_DITHER"(%5), %%mm5      \n\t"
 #endif
-                    WRITERGB15(%%REGb, 8280(%5), %%REGBP)
+
+                    WRITERGB16(%%REGb, 8280(%5), %%REGBP)
                     "pop %%"REG_BP"                         \n\t"
                     "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
 
                     :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                     "a" (&c->redDither)
                 );
-                return;
-            case PIX_FMT_RGB565:
+        }
+}
+
+static inline void RENAME(yuv2yuyv422_1)(SwsContext *c, const uint16_t *buf0, const uint16_t *uvbuf0, const uint16_t *uvbuf1,
+                                       const uint16_t *abuf0, uint8_t *dest, int dstW, int uvalpha, enum PixelFormat dstFormat, int flags, int y)
+{
+        const uint16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
+
+        if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
                 __asm__ volatile(
                     "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
                     "mov        %4, %%"REG_b"               \n\t"
                     "push %%"REG_BP"                        \n\t"
-                    YSCALEYUV2RGB1b(%%REGBP, %5)
-                    "pxor    %%mm7, %%mm7                   \n\t"
-                    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
-#ifdef DITHER1XBPP
-                    "paddusb "BLUE_DITHER"(%5), %%mm2      \n\t"
-                    "paddusb "GREEN_DITHER"(%5), %%mm4      \n\t"
-                    "paddusb "RED_DITHER"(%5), %%mm5      \n\t"
-#endif
-
-                    WRITERGB16(%%REGb, 8280(%5), %%REGBP)
+                    YSCALEYUV2PACKED1(%%REGBP, %5)
+                    WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
                     "pop %%"REG_BP"                         \n\t"
                     "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
 
                     :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                     "a" (&c->redDither)
                 );
-                return;
-            case PIX_FMT_YUYV422:
+        } else {
                 __asm__ volatile(
                     "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
                     "mov        %4, %%"REG_b"               \n\t"
@@ -1440,12 +1507,7 @@ static inline void RENAME(yuv2packed1)(SwsContext *c, const uint16_t *buf0, cons
                     :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                     "a" (&c->redDither)
                 );
-                return;
-            }
         }
-
-    yuv2packed1_c(c, buf0, uvbuf0, uvbuf1, abuf0, dest,
-                  dstW, uvalpha, dstFormat, flags, y);
 }
 
 //FIXME yuy2* can read up to 7 samples too much
@@ -2148,14 +2210,50 @@ static void RENAME(sws_init_swScale)(SwsContext *c)
         if (c->flags & SWS_ACCURATE_RND) {
             c->yuv2yuv1     = RENAME(yuv2yuv1_ar    );
             c->yuv2yuvX     = RENAME(yuv2yuvX_ar    );
-            c->yuv2packedX  = RENAME(yuv2packedX_ar );
+            switch (c->dstFormat) {
+            case PIX_FMT_RGB32:   c->yuv2packedX = RENAME(yuv2rgb32_X_ar);   break;
+            case PIX_FMT_BGR24:   c->yuv2packedX = RENAME(yuv2bgr24_X_ar);   break;
+            case PIX_FMT_RGB555:  c->yuv2packedX = RENAME(yuv2rgb555_X_ar);  break;
+            case PIX_FMT_RGB565:  c->yuv2packedX = RENAME(yuv2rgb565_X_ar);  break;
+            case PIX_FMT_YUYV422: c->yuv2packedX = RENAME(yuv2yuyv422_X_ar); break;
+            default: break;
+            }
         } else {
             c->yuv2yuv1     = RENAME(yuv2yuv1    );
             c->yuv2yuvX     = RENAME(yuv2yuvX    );
-            c->yuv2packedX  = RENAME(yuv2packedX );
+            switch (c->dstFormat) {
+            case PIX_FMT_RGB32:   c->yuv2packedX = RENAME(yuv2rgb32_X);   break;
+            case PIX_FMT_BGR24:   c->yuv2packedX = RENAME(yuv2bgr24_X);   break;
+            case PIX_FMT_RGB555:  c->yuv2packedX = RENAME(yuv2rgb555_X);  break;
+            case PIX_FMT_RGB565:  c->yuv2packedX = RENAME(yuv2rgb565_X);  break;
+            case PIX_FMT_YUYV422: c->yuv2packedX = RENAME(yuv2yuyv422_X); break;
+            default: break;
+            }
+        }
+        switch (c->dstFormat) {
+        case PIX_FMT_RGB32:
+                c->yuv2packed1 = RENAME(yuv2rgb32_1);
+                c->yuv2packed2 = RENAME(yuv2rgb32_2);
+                break;
+        case PIX_FMT_BGR24:
+                c->yuv2packed1 = RENAME(yuv2bgr24_1);
+                c->yuv2packed2 = RENAME(yuv2bgr24_2);
+                break;
+        case PIX_FMT_RGB555:
+                c->yuv2packed1 = RENAME(yuv2rgb555_1);
+                c->yuv2packed2 = RENAME(yuv2rgb555_2);
+                break;
+        case PIX_FMT_RGB565:
+                c->yuv2packed1 = RENAME(yuv2rgb565_1);
+                c->yuv2packed2 = RENAME(yuv2rgb565_2);
+                break;
+        case PIX_FMT_YUYV422:
+                c->yuv2packed1 = RENAME(yuv2yuyv422_1);
+                c->yuv2packed2 = RENAME(yuv2yuyv422_2);
+                break;
+        default:
+                break;
         }
-        c->yuv2packed1  = RENAME(yuv2packed1 );
-        c->yuv2packed2  = RENAME(yuv2packed2 );
     }
 
     c->hScale       = RENAME(hScale      );