optimize
author: Michael Niedermayer <michaelni@gmx.at>
Tue, 18 Feb 2003 19:22:34 +0000 (19:22 +0000)
committer: Michael Niedermayer <michaelni@gmx.at>
Tue, 18 Feb 2003 19:22:34 +0000 (19:22 +0000)
Originally committed as revision 9455 to svn://svn.mplayerhq.hu/mplayer/trunk/postproc

postproc/rgb2rgb.c
postproc/rgb2rgb_template.c

index be21af0..c07301c 100644 (file)
@@ -28,6 +28,11 @@ static const uint64_t mask32b  __attribute__((aligned(8))) = 0x000000FF000000FFU
 static const uint64_t mask32g  __attribute__((aligned(8))) = 0x0000FF000000FF00ULL;
 static const uint64_t mask32r  __attribute__((aligned(8))) = 0x00FF000000FF0000ULL;
 static const uint64_t mask32   __attribute__((aligned(8))) = 0x00FFFFFF00FFFFFFULL;
+static const uint64_t mask3216br __attribute__((aligned(8)))=0x00F800F800F800F8ULL;
+static const uint64_t mask3216g  __attribute__((aligned(8)))=0x0000FC000000FC00ULL;
+static const uint64_t mask3215g  __attribute__((aligned(8)))=0x0000F8000000F800ULL;
+static const uint64_t mul3216  __attribute__((aligned(8))) = 0x2000000420000004ULL;
+static const uint64_t mul3215  __attribute__((aligned(8))) = 0x2000000820000008ULL;
 static const uint64_t mask24b  __attribute__((aligned(8))) = 0x00FF0000FF0000FFULL;
 static const uint64_t mask24g  __attribute__((aligned(8))) = 0xFF0000FF0000FF00ULL;
 static const uint64_t mask24r  __attribute__((aligned(8))) = 0x0000FF0000FF0000ULL;
index e299b0c..01ba6ed 100644 (file)
@@ -318,12 +318,46 @@ static inline void RENAME(rgb32to16)(const uint8_t *src, uint8_t *dst, unsigned
        uint16_t *d = (uint16_t *)dst;
        end = s + src_size;
 #ifdef HAVE_MMX
+       mm_end = end - 15;
+#if 1 //is faster only if multiplies are reasonable fast (FIXME figure out on which cpus this is faster, on Athlon its slightly faster)
+       asm volatile(
+               "movq %3, %%mm5                 \n\t"
+               "movq %4, %%mm6                 \n\t"
+               "movq %5, %%mm7                 \n\t"
+               ".balign 16                     \n\t"
+               "1:                             \n\t"
+               PREFETCH" 32(%1)                \n\t"
+               "movd   (%1), %%mm0             \n\t"
+               "movd   4(%1), %%mm3            \n\t"
+               "punpckldq 8(%1), %%mm0         \n\t"
+               "punpckldq 12(%1), %%mm3        \n\t"
+               "movq %%mm0, %%mm1              \n\t"
+               "movq %%mm3, %%mm4              \n\t"
+               "pand %%mm6, %%mm0              \n\t"
+               "pand %%mm6, %%mm3              \n\t"
+               "pmaddwd %%mm7, %%mm0           \n\t"
+               "pmaddwd %%mm7, %%mm3           \n\t"
+               "pand %%mm5, %%mm1              \n\t"
+               "pand %%mm5, %%mm4              \n\t"
+               "por %%mm1, %%mm0               \n\t"   
+               "por %%mm4, %%mm3               \n\t"
+               "psrld $5, %%mm0                \n\t"
+               "pslld $11, %%mm3               \n\t"
+               "por %%mm3, %%mm0               \n\t"
+               MOVNTQ" %%mm0, (%0)             \n\t"
+               "addl $16, %1                   \n\t"
+               "addl $8, %0                    \n\t"
+               "cmpl %2, %1                    \n\t"
+               " jb 1b                         \n\t"
+               : "+r" (d), "+r"(s)
+               : "r" (mm_end), "m" (mask3216g), "m" (mask3216br), "m" (mul3216)
+       );
+#else
        __asm __volatile(PREFETCH"      %0"::"m"(*src):"memory");
        __asm __volatile(
            "movq       %0, %%mm7\n\t"
            "movq       %1, %%mm6\n\t"
            ::"m"(red_16mask),"m"(green_16mask));
-       mm_end = end - 15;
        while(s < mm_end)
        {
            __asm __volatile(
@@ -359,6 +393,7 @@ static inline void RENAME(rgb32to16)(const uint8_t *src, uint8_t *dst, unsigned
                d += 4;
                s += 16;
        }
+#endif
        __asm __volatile(SFENCE:::"memory");
        __asm __volatile(EMMS:::"memory");
 #endif
@@ -441,12 +476,46 @@ static inline void RENAME(rgb32to15)(const uint8_t *src, uint8_t *dst, unsigned
        uint16_t *d = (uint16_t *)dst;
        end = s + src_size;
 #ifdef HAVE_MMX
+       mm_end = end - 15;
+#if 1 //is faster only if multiplies are reasonable fast (FIXME figure out on which cpus this is faster, on Athlon its slightly faster)
+       asm volatile(
+               "movq %3, %%mm5                 \n\t"
+               "movq %4, %%mm6                 \n\t"
+               "movq %5, %%mm7                 \n\t"
+               ".balign 16                     \n\t"
+               "1:                             \n\t"
+               PREFETCH" 32(%1)                \n\t"
+               "movd   (%1), %%mm0             \n\t"
+               "movd   4(%1), %%mm3            \n\t"
+               "punpckldq 8(%1), %%mm0         \n\t"
+               "punpckldq 12(%1), %%mm3        \n\t"
+               "movq %%mm0, %%mm1              \n\t"
+               "movq %%mm3, %%mm4              \n\t"
+               "pand %%mm6, %%mm0              \n\t"
+               "pand %%mm6, %%mm3              \n\t"
+               "pmaddwd %%mm7, %%mm0           \n\t"
+               "pmaddwd %%mm7, %%mm3           \n\t"
+               "pand %%mm5, %%mm1              \n\t"
+               "pand %%mm5, %%mm4              \n\t"
+               "por %%mm1, %%mm0               \n\t"   
+               "por %%mm4, %%mm3               \n\t"
+               "psrld $6, %%mm0                \n\t"
+               "pslld $10, %%mm3               \n\t"
+               "por %%mm3, %%mm0               \n\t"
+               MOVNTQ" %%mm0, (%0)             \n\t"
+               "addl $16, %1                   \n\t"
+               "addl $8, %0                    \n\t"
+               "cmpl %2, %1                    \n\t"
+               " jb 1b                         \n\t"
+               : "+r" (d), "+r"(s)
+               : "r" (mm_end), "m" (mask3215g), "m" (mask3216br), "m" (mul3215)
+       );
+#else
        __asm __volatile(PREFETCH"      %0"::"m"(*src):"memory");
        __asm __volatile(
            "movq       %0, %%mm7\n\t"
            "movq       %1, %%mm6\n\t"
            ::"m"(red_15mask),"m"(green_15mask));
-       mm_end = end - 15;
        while(s < mm_end)
        {
            __asm __volatile(
@@ -482,6 +551,7 @@ static inline void RENAME(rgb32to15)(const uint8_t *src, uint8_t *dst, unsigned
                d += 4;
                s += 16;
        }
+#endif
        __asm __volatile(SFENCE:::"memory");
        __asm __volatile(EMMS:::"memory");
 #endif