Merge remote-tracking branch 'qatar/master'
[ffmpeg.git] / libswscale / x86 / swscale_template.c
index 678060f..ffc01c5 100644 (file)
@@ -1,20 +1,20 @@
 /*
  * Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
@@ -858,7 +858,7 @@ static inline void RENAME(yuv2yuv1)(SwsContext *c, const int16_t *lumSrc, const
                                     uint8_t *dest, uint8_t *uDest, uint8_t *vDest, uint8_t *aDest, long dstW, long chrDstW)
 {
         long p= 4;
-        const uint8_t *src[4]= {alpSrc + dstW, lumSrc + dstW, chrSrc + chrDstW, chrSrc + VOFW + chrDstW};
+        const int16_t *src[4]= {alpSrc + dstW, lumSrc + dstW, chrSrc + chrDstW, chrSrc + VOFW + chrDstW};
         uint8_t *dst[4]= {aDest, dest, uDest, vDest};
         x86_reg counter[4]= {dstW, dstW, chrDstW, chrDstW};
 
@@ -1085,8 +1085,8 @@ static inline void RENAME(yuv2packed2)(SwsContext *c, const uint16_t *buf0, cons
                     : "%r8"
                 );
 #else
-                *(const uint16_t **)(&c->u_temp)=abuf0;
-                *(const uint16_t **)(&c->v_temp)=abuf1;
+                c->u_temp=(intptr_t)abuf0;
+                c->v_temp=(intptr_t)abuf1;
                 __asm__ volatile(
                     "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
                     "mov        %4, %%"REG_b"               \n\t"
@@ -1904,6 +1904,163 @@ static inline void RENAME(hScale)(int16_t *dst, int dstW, const uint8_t *src, in
     }
 }
 
+static inline void RENAME(hScale16)(int16_t *dst, int dstW, const uint16_t *src, int srcW, int xInc,
+                                    const int16_t *filter, const int16_t *filterPos, long filterSize, int shift)
+{ // Horizontal FIR scaling of 16-bit samples: dst[i] = (sum over j of src[filterPos[i]+j] * filter[filterSize*i+j]) >> shift, limited to the int16 maximum. srcW and xInc are not referenced in this body.
+    int i, j; // only used by the plain-C fallback at the end
+
+    assert(filterSize % 4 == 0 && filterSize>0); // the MMX paths consume taps four at a time (one movq of int16 coefficients)
+    if (filterSize==4 && shift<15) { // Always true for upscaling, sometimes for down, too.
+        x86_reg counter= -2*dstW; // negative byte offset into the int16 arrays; counts up towards zero
+        filter-= counter*2; // i.e. filter += 4*dstW: points just past the last coefficient (4 taps per output)
+        filterPos-= counter/2; // i.e. filterPos += dstW: points just past the last entry
+        dst-= counter/2; // i.e. dst += dstW: points just past the last output
+        __asm__ volatile(
+            "movd                   %5, %%mm7       \n\t" // mm7 = shift count used by psrad
+#if defined(PIC)
+            "push            %%"REG_b"              \n\t"
+#endif
+            "push           %%"REG_BP"              \n\t" // we use 7 regs here ...
+            "mov             %%"REG_a", %%"REG_BP"  \n\t" // REG_BP takes over as loop offset so REG_a can hold a source index
+            ".p2align                4              \n\t"
+            "1:                                     \n\t"
+            "movzwl   (%2, %%"REG_BP"), %%eax       \n\t" // eax = filterPos[i]
+            "movzwl  2(%2, %%"REG_BP"), %%ebx       \n\t" // ebx = filterPos[i+1]
+            "movq  (%1, %%"REG_BP", 4), %%mm1       \n\t" // mm1 = the 4 coefficients of output i
+            "movq 8(%1, %%"REG_BP", 4), %%mm3       \n\t" // mm3 = the 4 coefficients of output i+1
+            "movq      (%3, %%"REG_a", 2), %%mm0    \n\t" // mm0 = 4 samples starting at src[filterPos[i]]
+            "movq      (%3, %%"REG_b", 2), %%mm2    \n\t" // mm2 = 4 samples starting at src[filterPos[i+1]]
+            "pmaddwd             %%mm1, %%mm0       \n\t" // two 32-bit partial sums for output i. NOTE(review): pmaddwd multiplies signed 16-bit words while src is uint16_t - confirm samples >= 0x8000 cannot reach the MMX paths
+            "pmaddwd             %%mm2, %%mm3       \n\t" // two 32-bit partial sums for output i+1
+            "movq                %%mm0, %%mm4       \n\t"
+            "punpckldq           %%mm3, %%mm0       \n\t" // mm0 = low partial sums of i and i+1
+            "punpckhdq           %%mm3, %%mm4       \n\t" // mm4 = high partial sums of i and i+1
+            "paddd               %%mm4, %%mm0       \n\t" // mm0 = { full sum for i, full sum for i+1 }
+            "psrad               %%mm7, %%mm0       \n\t" // arithmetic shift right by 'shift'
+            "packssdw            %%mm0, %%mm0       \n\t" // saturate both sums to signed 16 bit
+            "movd                %%mm0, (%4, %%"REG_BP")    \n\t" // store dst[i] and dst[i+1] in one 32-bit write
+            "add                    $4, %%"REG_BP"  \n\t" // advance by two int16 outputs
+            " jnc                   1b              \n\t" // repeat until the add carries, i.e. the offset reaches or passes zero. NOTE(review): two outputs per pass, so an odd dstW touches one element past the end - confirm buffers are padded
+
+            "pop            %%"REG_BP"              \n\t"
+#if defined(PIC)
+            "pop             %%"REG_b"              \n\t"
+#endif
+            : "+a" (counter) // %0 = counter, pinned to REG_a
+            : "c" (filter), "d" (filterPos), "S" (src), "D" (dst), "m"(shift) // %1..%5
+#if !defined(PIC)
+            : "%"REG_b // under PIC, REG_b is saved/restored by the push/pop above instead of being listed here
+#endif
+        ); // NOTE(review): neither "memory" nor the MMX registers are declared as clobbered although dst is written - confirm this is acceptable
+    } else if (filterSize==8 && shift<15) {
+        x86_reg counter= -2*dstW; // negative byte offset, as in the 4-tap path
+        filter-= counter*4; // i.e. filter += 8*dstW: points just past the last coefficient (8 taps per output)
+        filterPos-= counter/2; // i.e. filterPos += dstW
+        dst-= counter/2; // i.e. dst += dstW
+        __asm__ volatile(
+            "movd                   %5, %%mm7       \n\t" // mm7 = shift count used by psrad
+#if defined(PIC)
+            "push            %%"REG_b"              \n\t"
+#endif
+            "push            %%"REG_BP"             \n\t" // we use 7 regs here ...
+            "mov              %%"REG_a", %%"REG_BP" \n\t" // REG_BP takes over as loop offset
+            ".p2align                 4             \n\t"
+            "1:                                     \n\t"
+            "movzwl    (%2, %%"REG_BP"), %%eax      \n\t" // eax = filterPos[i]
+            "movzwl   2(%2, %%"REG_BP"), %%ebx      \n\t" // ebx = filterPos[i+1]
+            "movq   (%1, %%"REG_BP", 8), %%mm1      \n\t" // mm1 = coefficients 0-3 of output i
+            "movq 16(%1, %%"REG_BP", 8), %%mm3      \n\t" // mm3 = coefficients 0-3 of output i+1 (8 int16 further on)
+            "movq       (%3, %%"REG_a", 2), %%mm0   \n\t" // mm0 = samples 0-3 for output i
+            "movq       (%3, %%"REG_b", 2), %%mm2   \n\t" // mm2 = samples 0-3 for output i+1
+            "pmaddwd              %%mm1, %%mm0      \n\t" // partial sums, output i, taps 0-3
+            "pmaddwd              %%mm2, %%mm3      \n\t" // partial sums, output i+1, taps 0-3
+
+            "movq  8(%1, %%"REG_BP", 8), %%mm1      \n\t" // mm1 = coefficients 4-7 of output i
+            "movq 24(%1, %%"REG_BP", 8), %%mm5      \n\t" // mm5 = coefficients 4-7 of output i+1
+            "movq      8(%3, %%"REG_a", 2), %%mm4   \n\t" // mm4 = samples 4-7 for output i
+            "movq      8(%3, %%"REG_b", 2), %%mm2   \n\t" // mm2 = samples 4-7 for output i+1
+            "pmaddwd              %%mm1, %%mm4      \n\t" // partial sums, output i, taps 4-7
+            "pmaddwd              %%mm2, %%mm5      \n\t" // partial sums, output i+1, taps 4-7
+            "paddd                %%mm4, %%mm0      \n\t" // accumulate both halves for output i
+            "paddd                %%mm5, %%mm3      \n\t" // accumulate both halves for output i+1
+            "movq                 %%mm0, %%mm4      \n\t"
+            "punpckldq            %%mm3, %%mm0      \n\t" // mm0 = low partial sums of i and i+1
+            "punpckhdq            %%mm3, %%mm4      \n\t" // mm4 = high partial sums of i and i+1
+            "paddd                %%mm4, %%mm0      \n\t" // mm0 = { full sum for i, full sum for i+1 }
+            "psrad                %%mm7, %%mm0      \n\t" // arithmetic shift right by 'shift'
+            "packssdw             %%mm0, %%mm0      \n\t" // saturate both sums to signed 16 bit
+            "movd                 %%mm0, (%4, %%"REG_BP")   \n\t" // store dst[i] and dst[i+1]
+            "add                     $4, %%"REG_BP" \n\t" // advance by two int16 outputs
+            " jnc                    1b             \n\t" // repeat until the offset reaches or passes zero
+
+            "pop             %%"REG_BP"             \n\t"
+#if defined(PIC)
+            "pop             %%"REG_b"              \n\t"
+#endif
+            : "+a" (counter) // %0 = counter, pinned to REG_a
+            : "c" (filter), "d" (filterPos), "S" (src), "D" (dst), "m"(shift) // %1..%5
+#if !defined(PIC)
+            : "%"REG_b // under PIC, REG_b is saved/restored by the push/pop above instead of being listed here
+#endif
+        );
+    } else if (shift<15){
+        const uint16_t *offset = src+filterSize; // end address for the inner tap loop, which walks REG_c up from src
+        x86_reg counter= -2*dstW; // negative byte offset, as in the fixed-size paths
+        //filter-= counter*filterSize/2;
+        filterPos-= counter/2; // i.e. filterPos += dstW
+        dst-= counter/2; // i.e. dst += dstW
+        __asm__ volatile(
+            "movd                   %7, %%mm7       \n\t" // mm7 = shift count used by psrad
+            ".p2align                  4            \n\t"
+            "1:                                     \n\t"
+            "mov                      %2, %%"REG_c" \n\t" // REG_c = filterPos (already moved to its end)
+            "movzwl      (%%"REG_c", %0), %%eax     \n\t" // eax = filterPos[i]
+            "movzwl     2(%%"REG_c", %0), %%edx     \n\t" // edx = filterPos[i+1]
+            "mov                      %5, %%"REG_c" \n\t" // REG_c = src, used as the tap cursor
+            "pxor                  %%mm4, %%mm4     \n\t" // clear accumulator for output i
+            "pxor                  %%mm5, %%mm5     \n\t" // clear accumulator for output i+1
+            "2:                                     \n\t"
+            "movq                   (%1), %%mm1     \n\t" // mm1 = next 4 coefficients of output i
+            "movq               (%1, %6), %%mm3     \n\t" // mm3 = matching coefficients of output i+1 (%6 = filterSize*2 bytes ahead)
+            "movq (%%"REG_c", %%"REG_a", 2), %%mm0     \n\t" // mm0 = 4 samples for output i
+            "movq (%%"REG_c", %%"REG_d", 2), %%mm2     \n\t" // mm2 = 4 samples for output i+1
+            "pmaddwd               %%mm1, %%mm0     \n\t"
+            "pmaddwd               %%mm2, %%mm3     \n\t"
+            "paddd                 %%mm3, %%mm5     \n\t" // accumulate output i+1
+            "paddd                 %%mm0, %%mm4     \n\t" // accumulate output i
+            "add                      $8, %1        \n\t" // next 4 coefficients
+            "add                      $8, %%"REG_c" \n\t" // next 4 samples
+            "cmp                      %4, %%"REG_c" \n\t" // reached src+filterSize?
+            " jb                      2b            \n\t"
+            "add                      %6, %1        \n\t" // skip the coefficients of output i+1, which were read via (%1, %6)
+            "movq                  %%mm4, %%mm0     \n\t"
+            "punpckldq             %%mm5, %%mm4     \n\t" // mm4 = low partial sums of i and i+1
+            "punpckhdq             %%mm5, %%mm0     \n\t" // mm0 = high partial sums of i and i+1
+            "paddd                 %%mm0, %%mm4     \n\t" // mm4 = { full sum for i, full sum for i+1 }
+            "psrad                 %%mm7, %%mm4     \n\t" // arithmetic shift right by 'shift'
+            "packssdw              %%mm4, %%mm4     \n\t" // saturate both sums to signed 16 bit
+            "mov                      %3, %%"REG_a" \n\t" // REG_a = dst (already moved to its end)
+            "movd                  %%mm4, (%%"REG_a", %0)   \n\t" // store dst[i] and dst[i+1]
+            "add                      $4, %0        \n\t" // advance by two int16 outputs
+            " jnc                     1b            \n\t" // repeat until the offset reaches or passes zero
+
+            : "+r" (counter), "+r" (filter) // %0, %1: filter is advanced in place by the loop
+            : "m" (filterPos), "m" (dst), "m"(offset), // %2, %3, %4
+            "m" (src), "r" ((x86_reg)filterSize*2), "m"(shift) // %5, %6 = bytes per output's coefficient row, %7
+            : "%"REG_a, "%"REG_c, "%"REG_d
+        );
+    } else // shift >= 15: plain C implementation
+    for (i=0; i<dstW; i++) {
+        int srcPos= filterPos[i];
+        int val=0;
+        for (j=0; j<filterSize; j++) {
+            val += ((int)src[srcPos + j])*filter[filterSize*i + j];
+        }
+        dst[i] = FFMIN(val>>shift, (1<<15)-1); // the cubic equation does overflow ...
+    }
+}
+
+
 #if COMPILE_TEMPLATE_MMX2
 static inline void RENAME(hyscale_fast)(SwsContext *c, int16_t *dst,
                                         long dstWidth, const uint8_t *src, int srcW,
@@ -2154,19 +2311,19 @@ static void RENAME(sws_init_swScale)(SwsContext *c)
     }
 #endif /* COMPILE_TEMPLATE_MMX2 */
 
-    switch(srcFormat) {
+     switch(srcFormat) {
         case PIX_FMT_YUYV422  : c->chrToYV12 = RENAME(yuy2ToUV); break;
         case PIX_FMT_UYVY422  : c->chrToYV12 = RENAME(uyvyToUV); break;
         case PIX_FMT_NV12     : c->chrToYV12 = RENAME(nv12ToUV); break;
         case PIX_FMT_NV21     : c->chrToYV12 = RENAME(nv21ToUV); break;
-        case PIX_FMT_YUV420P16BE:
-        case PIX_FMT_YUV422P16BE:
-        case PIX_FMT_YUV444P16BE: c->chrToYV12 = RENAME(BEToUV); break;
+        case PIX_FMT_GRAY16LE :
+        case PIX_FMT_YUV420P9LE:
+        case PIX_FMT_YUV422P10LE:
+        case PIX_FMT_YUV420P10LE:
         case PIX_FMT_YUV420P16LE:
         case PIX_FMT_YUV422P16LE:
-        case PIX_FMT_YUV444P16LE: c->chrToYV12 = RENAME(LEToUV); break;
-        default: break;
-    }
+        case PIX_FMT_YUV444P16LE: c->hScale16= RENAME(hScale16); break;
+    }   
     if (!c->chrSrcHSubSample) {
         switch(srcFormat) {
         case PIX_FMT_BGR24  : c->chrToYV12 = RENAME(bgr24ToUV); break;
@@ -2177,16 +2334,10 @@ static void RENAME(sws_init_swScale)(SwsContext *c)
 
     switch (srcFormat) {
     case PIX_FMT_YUYV422  :
-    case PIX_FMT_YUV420P16BE:
-    case PIX_FMT_YUV422P16BE:
-    case PIX_FMT_YUV444P16BE:
     case PIX_FMT_Y400A    :
-    case PIX_FMT_GRAY16BE : c->lumToYV12 = RENAME(yuy2ToY); break;
+                            c->lumToYV12 = RENAME(yuy2ToY); break;
     case PIX_FMT_UYVY422  :
-    case PIX_FMT_YUV420P16LE:
-    case PIX_FMT_YUV422P16LE:
-    case PIX_FMT_YUV444P16LE:
-    case PIX_FMT_GRAY16LE : c->lumToYV12 = RENAME(uyvyToY); break;
+                            c->lumToYV12 = RENAME(uyvyToY); break;
     case PIX_FMT_BGR24    : c->lumToYV12 = RENAME(bgr24ToY); break;
     case PIX_FMT_RGB24    : c->lumToYV12 = RENAME(rgb24ToY); break;
     default: break;