avcodec/mips/h264dsp_mmi: Version 2 of the optimizations for loongson mmi
author ZhouXiaoyong <zhouxiaoyong@loongson.cn>
Fri, 13 May 2016 10:03:27 +0000 (18:03 +0800)
committer Michael Niedermayer <michael@niedermayer.cc>
Tue, 17 May 2016 02:18:42 +0000 (04:18 +0200)
1. no longer use the register names directly, and optimize the code format
2. to be compatible with O32, specify the type of address variables with mips_reg and handle the address variables with the PTR_ operators
3. optimize some unaligned loads and stores
4. use uld and mtc1 to work around the cpu 3A2000 gslwlc1 bug (a bug in the gslwlc1 extension instruction under the O32 ABI)

Signed-off-by: Michael Niedermayer <michael@niedermayer.cc>
libavcodec/mips/h264dsp_mmi.c

index 14c4a43..a62bbab 100644 (file)
 
 #include "libavcodec/bit_depth_template.c"
 #include "h264dsp_mips.h"
+#include "libavutil/mips/asmdefs.h"
 
 void ff_h264_add_pixels4_8_mmi(uint8_t *dst, int16_t *src, int stride)
 {
+    double ftmp[9];
+    uint64_t low32;
+
     __asm__ volatile (
-        "xor $f0, $f0, $f0              \r\n"
-        "ldc1 $f2, 0(%[src])            \r\n"
-        "ldc1 $f4, 8(%[src])            \r\n"
-        "ldc1 $f6, 16(%[src])           \r\n"
-        "ldc1 $f8, 24(%[src])           \r\n"
-        "lwc1 $f10, 0(%[dst0])          \r\n"
-        "lwc1 $f12, 0(%[dst1])          \r\n"
-        "lwc1 $f14, 0(%[dst2])          \r\n"
-        "lwc1 $f16, 0(%[dst3])          \r\n"
-        "punpcklbh $f10, $f10, $f0      \r\n"
-        "punpcklbh $f12, $f12, $f0      \r\n"
-        "punpcklbh $f14, $f14, $f0      \r\n"
-        "punpcklbh $f16, $f16, $f0      \r\n"
-        "paddh $f2, $f2, $f10           \r\n"
-        "paddh $f4, $f4, $f12           \r\n"
-        "paddh $f6, $f6, $f14           \r\n"
-        "paddh $f8, $f8, $f16           \r\n"
-        "packushb $f2, $f2, $f0         \r\n"
-        "packushb $f4, $f4, $f0         \r\n"
-        "packushb $f6, $f6, $f0         \r\n"
-        "packushb $f8, $f8, $f0         \r\n"
-        "swc1 $f2, 0(%[dst0])           \r\n"
-        "swc1 $f4, 0(%[dst1])           \r\n"
-        "swc1 $f6, 0(%[dst2])           \r\n"
-        "swc1 $f8, 0(%[dst3])           \r\n"
-        ::[dst0]"r"(dst),[dst1]"r"(dst+stride),[dst2]"r"(dst+2*stride),
-          [dst3]"r"(dst+3*stride),[src]"r"(src)
-        : "$f0","$f2","$f4","$f6","$f8","$f10","$f12","$f14","$f16"
+        "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
+        "ldc1       %[ftmp1],   0x00(%[src])                            \n\t"
+        "ldc1       %[ftmp2],   0x08(%[src])                            \n\t"
+        "ldc1       %[ftmp3],   0x10(%[src])                            \n\t"
+        "ldc1       %[ftmp4],   0x18(%[src])                            \n\t"
+        "uld        %[low32],   0x00(%[dst0])                           \n\t"
+        "mtc1       %[low32],   %[ftmp5]                                \n\t"
+        "uld        %[low32],   0x00(%[dst1])                           \n\t"
+        "mtc1       %[low32],   %[ftmp6]                                \n\t"
+        "uld        %[low32],   0x00(%[dst2])                           \n\t"
+        "mtc1       %[low32],   %[ftmp7]                                \n\t"
+        "uld        %[low32],   0x00(%[dst3])                           \n\t"
+        "mtc1       %[low32],   %[ftmp8]                                \n\t"
+        "punpcklbh  %[ftmp5],   %[ftmp5],       %[ftmp0]                \n\t"
+        "punpcklbh  %[ftmp6],   %[ftmp6],       %[ftmp0]                \n\t"
+        "punpcklbh  %[ftmp7],   %[ftmp7],       %[ftmp0]                \n\t"
+        "punpcklbh  %[ftmp8],   %[ftmp8],       %[ftmp0]                \n\t"
+        "paddh      %[ftmp1],   %[ftmp1],       %[ftmp5]                \n\t"
+        "paddh      %[ftmp2],   %[ftmp2],       %[ftmp6]                \n\t"
+        "paddh      %[ftmp3],   %[ftmp3],       %[ftmp7]                \n\t"
+        "paddh      %[ftmp4],   %[ftmp4],       %[ftmp8]                \n\t"
+        "packushb   %[ftmp1],   %[ftmp1],       %[ftmp0]                \n\t"
+        "packushb   %[ftmp2],   %[ftmp2],       %[ftmp0]                \n\t"
+        "packushb   %[ftmp3],   %[ftmp3],       %[ftmp0]                \n\t"
+        "packushb   %[ftmp4],   %[ftmp4],       %[ftmp0]                \n\t"
+        "gsswlc1    %[ftmp1],   0x03(%[dst0])                           \n\t"
+        "gsswrc1    %[ftmp1],   0x00(%[dst0])                           \n\t"
+        "gsswlc1    %[ftmp2],   0x03(%[dst1])                           \n\t"
+        "gsswrc1    %[ftmp2],   0x00(%[dst1])                           \n\t"
+        "gsswlc1    %[ftmp3],   0x03(%[dst2])                           \n\t"
+        "gsswrc1    %[ftmp3],   0x00(%[dst2])                           \n\t"
+        "gsswlc1    %[ftmp4],   0x03(%[dst3])                           \n\t"
+        "gsswrc1    %[ftmp4],   0x00(%[dst3])                           \n\t"
+        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
+          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
+          [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
+          [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
+          [ftmp8]"=&f"(ftmp[8]),
+          [low32]"=&r"(low32)
+        : [dst0]"r"(dst),                   [dst1]"r"(dst+stride),
+          [dst2]"r"(dst+2*stride),          [dst3]"r"(dst+3*stride),
+          [src]"r"(src)
+        : "memory"
     );
 
     memset(src, 0, 32);
@@ -64,79 +83,94 @@ void ff_h264_add_pixels4_8_mmi(uint8_t *dst, int16_t *src, int stride)
 
 void ff_h264_idct_add_8_mmi(uint8_t *dst, int16_t *block, int stride)
 {
+    double ftmp[12];
+    uint64_t tmp[1];
+    uint64_t low32;
+
     __asm__ volatile (
-        "dli $8, 1                              \r\n"
-        "ldc1 $f0, 0(%[block])                  \r\n"
-        "dmtc1 $8, $f16                         \r\n"
-        "ldc1 $f2, 8(%[block])                  \r\n"
-        "dli $8, 6                              \r\n"
-        "ldc1 $f4, 16(%[block])                 \r\n"
-        "dmtc1 $8, $f18                         \r\n"
-        "psrah $f8, $f2, $f16                   \r\n"
-        "ldc1 $f6, 24(%[block])                 \r\n"
-        "psrah $f10, $f6, $f16                  \r\n"
-        "psubh $f8, $f8, $f6                    \r\n"
-        "paddh $f10, $f10, $f2                  \r\n"
-        "paddh $f20, $f4, $f0                   \r\n"
-        "psubh $f0, $f0, $f4                    \r\n"
-        "paddh $f22, $f10, $f20                 \r\n"
-        "psubh $f4, $f20, $f10                  \r\n"
-        "paddh $f20, $f8, $f0                   \r\n"
-        "psubh $f0, $f0, $f8                    \r\n"
-        "punpckhhw $f2, $f22, $f20              \r\n"
-        "punpcklhw $f10, $f22, $f20             \r\n"
-        "punpckhhw $f8, $f0, $f4                \r\n"
-        "punpcklhw $f0, $f0, $f4                \r\n"
-        "punpckhwd $f4, $f10, $f0               \r\n"
-        "punpcklwd $f10, $f10, $f0              \r\n"
-        "punpcklwd $f20, $f2, $f8               \r\n"
-        "punpckhwd $f0, $f2, $f8                \r\n"
-        "paddh $f10, $f10, %[ff_pw_32]          \r\n"
-        "psrah $f8, $f4, $f16                   \r\n"
-        "psrah $f6, $f0, $f16                   \r\n"
-        "psubh $f8, $f8, $f0                    \r\n"
-        "paddh $f6, $f6, $f4                    \r\n"
-        "paddh $f2, $f20, $f10                  \r\n"
-        "psubh $f10, $f10, $f20                 \r\n"
-        "paddh $f20, $f6, $f2                   \r\n"
-        "psubh $f2, $f2, $f6                    \r\n"
-        "paddh $f22, $f8, $f10                  \r\n"
-        "xor $f14, $f14, $f14                   \r\n"
-        "psubh $f10, $f10, $f8                  \r\n"
-        "sdc1 $f14, 0(%[block])                 \r\n"
-        "sdc1 $f14, 8(%[block])                 \r\n"
-        "sdc1 $f14, 16(%[block])                \r\n"
-        "sdc1 $f14, 24(%[block])                \r\n"
-        "lwc1 $f4, 0(%[dst])                    \r\n"
-        "psrah $f6, $f20, $f18                  \r\n"
-        "gslwxc1 $f0, 0(%[dst], %[stride])      \r\n"
-        "psrah $f8, $f22, $f18                  \r\n"
-        "punpcklbh $f4, $f4, $f14               \r\n"
-        "punpcklbh $f0, $f0, $f14               \r\n"
-        "paddh $f4, $f4, $f6                    \r\n"
-        "paddh $f0, $f0, $f8                    \r\n"
-        "packushb $f4, $f4, $f14                \r\n"
-        "packushb $f0, $f0, $f14                \r\n"
-        "swc1 $f4, 0(%[dst])                    \r\n"
-        "gsswxc1 $f0, 0(%[dst], %[stride])      \r\n"
-        "daddu %[dst], %[dst], %[stride]        \r\n"
-        "daddu %[dst], %[dst], %[stride]        \r\n"
-        "lwc1 $f4, 0(%[dst])                    \r\n"
-        "psrah $f10, $f10, $f18                 \r\n"
-        "gslwxc1 $f0, 0(%[dst], %[stride])      \r\n"
-        "psrah $f2, $f2, $f18                   \r\n"
-        "punpcklbh $f4, $f4, $f14               \r\n"
-        "punpcklbh $f0, $f0, $f14               \r\n"
-        "paddh $f4, $f4, $f10                   \r\n"
-        "paddh $f0, $f0, $f2                    \r\n"
-        "packushb $f4, $f4, $f14                \r\n"
-        "swc1 $f4, 0(%[dst])                    \r\n"
-        "packushb $f0, $f0, $f14                \r\n"
-        "gsswxc1 $f0, 0(%[dst], %[stride])      \r\n"
-        ::[dst]"r"(dst),[block]"r"(block),[stride]"r"((uint64_t)stride),
-          [ff_pw_32]"f"(ff_pw_32)
-        : "$8","$f0","$f2","$f4","$f6","$f8","$f10","$f12","$f14","$f16",
-          "$f18","$f20","$f22"
+        "dli        %[tmp0],    0x01                                    \n\t"
+        "ldc1       %[ftmp0],   0x00(%[block])                          \n\t"
+        "mtc1       %[tmp0],    %[ftmp8]                                \n\t"
+        "ldc1       %[ftmp1],   0x08(%[block])                          \n\t"
+        "dli        %[tmp0],    0x06                                    \n\t"
+        "ldc1       %[ftmp2],   0x10(%[block])                          \n\t"
+        "mtc1       %[tmp0],    %[ftmp9]                                \n\t"
+        "psrah      %[ftmp4],   %[ftmp1],       %[ftmp8]                \n\t"
+        "ldc1       %[ftmp3],   0x18(%[block])                          \n\t"
+        "psrah      %[ftmp5],   %[ftmp3],       %[ftmp8]                \n\t"
+        "psubh      %[ftmp4],   %[ftmp4],       %[ftmp3]                \n\t"
+        "paddh      %[ftmp5],   %[ftmp5],       %[ftmp1]                \n\t"
+        "paddh      %[ftmp10],  %[ftmp2],       %[ftmp0]                \n\t"
+        "psubh      %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
+        "paddh      %[ftmp11],  %[ftmp5],       %[ftmp10]               \n\t"
+        "psubh      %[ftmp2],   %[ftmp10],      %[ftmp5]                \n\t"
+        "paddh      %[ftmp10],  %[ftmp4],       %[ftmp0]                \n\t"
+        "psubh      %[ftmp0],   %[ftmp0],       %[ftmp4]                \n\t"
+        "punpckhhw  %[ftmp1],   %[ftmp11],      %[ftmp10]               \n\t"
+        "punpcklhw  %[ftmp5],   %[ftmp11],      %[ftmp10]               \n\t"
+        "punpckhhw  %[ftmp4],   %[ftmp0],       %[ftmp2]                \n\t"
+        "punpcklhw  %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
+        "punpckhwd  %[ftmp2],   %[ftmp5],       %[ftmp0]                \n\t"
+        "punpcklwd  %[ftmp5],   %[ftmp5],       %[ftmp0]                \n\t"
+        "punpcklwd  %[ftmp10],  %[ftmp1],       %[ftmp4]                \n\t"
+        "punpckhwd  %[ftmp0],   %[ftmp1],       %[ftmp4]                \n\t"
+        "paddh      %[ftmp5],   %[ftmp5],       %[ff_pw_32]             \n\t"
+        "psrah      %[ftmp4],   %[ftmp2],       %[ftmp8]                \n\t"
+        "psrah      %[ftmp3],   %[ftmp0],       %[ftmp8]                \n\t"
+        "psubh      %[ftmp4],   %[ftmp4],       %[ftmp0]                \n\t"
+        "paddh      %[ftmp3],   %[ftmp3],       %[ftmp2]                \n\t"
+        "paddh      %[ftmp1],   %[ftmp10],      %[ftmp5]                \n\t"
+        "psubh      %[ftmp5],   %[ftmp5],       %[ftmp10]               \n\t"
+        "paddh      %[ftmp10],  %[ftmp3],       %[ftmp1]                \n\t"
+        "psubh      %[ftmp1],   %[ftmp1],       %[ftmp3]                \n\t"
+        "paddh      %[ftmp11],  %[ftmp4],       %[ftmp5]                \n\t"
+        "xor        %[ftmp7],   %[ftmp7],       %[ftmp7]                \n\t"
+        "psubh      %[ftmp5],   %[ftmp5],       %[ftmp4]                \n\t"
+        "sdc1       %[ftmp7],   0x00(%[block])                          \n\t"
+        "sdc1       %[ftmp7],   0x08(%[block])                          \n\t"
+        "sdc1       %[ftmp7],   0x10(%[block])                          \n\t"
+        "sdc1       %[ftmp7],   0x18(%[block])                          \n\t"
+        "uld        %[low32],   0x00(%[dst])                            \n\t"
+        "mtc1       %[low32],   %[ftmp2]                                \n\t"
+        "psrah      %[ftmp3],   %[ftmp10],      %[ftmp9]                \n\t"
+        "gslwxc1    %[ftmp0],   0x00(%[dst],    %[stride])              \n\t"
+        "psrah      %[ftmp4],   %[ftmp11],      %[ftmp9]                \n\t"
+        "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp7]                \n\t"
+        "punpcklbh  %[ftmp0],   %[ftmp0],       %[ftmp7]                \n\t"
+        "paddh      %[ftmp2],   %[ftmp2],       %[ftmp3]                \n\t"
+        "paddh      %[ftmp0],   %[ftmp0],       %[ftmp4]                \n\t"
+        "packushb   %[ftmp2],   %[ftmp2],       %[ftmp7]                \n\t"
+        "packushb   %[ftmp0],   %[ftmp0],       %[ftmp7]                \n\t"
+        "gsswlc1    %[ftmp2],   0x03(%[dst])                            \n\t"
+        "gsswrc1    %[ftmp2],   0x00(%[dst])                            \n\t"
+        "gsswxc1    %[ftmp0],   0x00(%[dst],    %[stride])              \n\t"
+        PTR_ADDU   "%[dst],     %[dst],         %[stride]               \n\t"
+        PTR_ADDU   "%[dst],     %[dst],         %[stride]               \n\t"
+        "uld        %[low32],   0x00(%[dst])                            \n\t"
+        "mtc1       %[low32],   %[ftmp2]                                \n\t"
+        "psrah      %[ftmp5],   %[ftmp5],       %[ftmp9]                \n\t"
+        "gslwxc1    %[ftmp0],   0x00(%[dst],    %[stride])              \n\t"
+        "psrah      %[ftmp1],   %[ftmp1],       %[ftmp9]                \n\t"
+        "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp7]                \n\t"
+        "punpcklbh  %[ftmp0],   %[ftmp0],       %[ftmp7]                \n\t"
+        "paddh      %[ftmp2],   %[ftmp2],       %[ftmp5]                \n\t"
+        "paddh      %[ftmp0],   %[ftmp0],       %[ftmp1]                \n\t"
+        "packushb   %[ftmp2],   %[ftmp2],       %[ftmp7]                \n\t"
+        "gsswlc1    %[ftmp2],   0x03(%[dst])                            \n\t"
+        "gsswrc1    %[ftmp2],   0x00(%[dst])                            \n\t"
+        "packushb   %[ftmp0],   %[ftmp0],       %[ftmp7]                \n\t"
+        "gsswxc1    %[ftmp0],   0x00(%[dst],    %[stride])              \n\t"
+        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
+          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
+          [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
+          [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
+          [ftmp8]"=&f"(ftmp[8]),            [ftmp9]"=&f"(ftmp[9]),
+          [ftmp10]"=&f"(ftmp[10]),          [ftmp11]"=&f"(ftmp[11]),
+          [tmp0]"=&r"(tmp[0]),
+          [low32]"=&r"(low32)
+        : [dst]"r"(dst),                    [block]"r"(block),
+          [stride]"r"((mips_reg)stride),    [ff_pw_32]"f"(ff_pw_32)
+        : "memory"
     );
 
     memset(block, 0, 32);
@@ -144,448 +178,482 @@ void ff_h264_idct_add_8_mmi(uint8_t *dst, int16_t *block, int stride)
 
 void ff_h264_idct8_add_8_mmi(uint8_t *dst, int16_t *block, int stride)
 {
+    double ftmp[16];
+    uint64_t tmp[8];
+    mips_reg addr[1];
+    uint64_t low32;
+
     __asm__ volatile (
-        "lhu $10, 0x0(%[block])                     \r\n"
-        "daddiu $29, $29, -0x20                     \r\n"
-        "daddiu $10, $10, 0x20                      \r\n"
-        "ldc1 $f2, 0x10(%[block])                   \r\n"
-        "sh $10, 0x0(%[block])                      \r\n"
-        "ldc1 $f4, 0x20(%[block])                   \r\n"
-        "dli $10, 0x1                               \r\n"
-        "ldc1 $f6, 0x30(%[block])                   \r\n"
-        "dmtc1 $10, $f16                            \r\n"
-        "ldc1 $f10, 0x50(%[block])                  \r\n"
-        "ldc1 $f12, 0x60(%[block])                  \r\n"
-        "ldc1 $f14, 0x70(%[block])                  \r\n"
-        "mov.d $f0, $f2                             \r\n"
-        "psrah $f2, $f2, $f16                       \r\n"
-        "psrah $f8, $f10, $f16                      \r\n"
-        "paddh $f2, $f2, $f0                        \r\n"
-        "paddh $f8, $f8, $f10                       \r\n"
-        "paddh $f2, $f2, $f10                       \r\n"
-        "paddh $f8, $f8, $f14                       \r\n"
-        "paddh $f2, $f2, $f6                        \r\n"
-        "psubh $f8, $f8, $f0                        \r\n"
-        "psubh $f0, $f0, $f6                        \r\n"
-        "psubh $f10, $f10, $f6                      \r\n"
-        "psrah $f6, $f6, $f16                       \r\n"
-        "paddh $f0, $f0, $f14                       \r\n"
-        "psubh $f10, $f10, $f14                     \r\n"
-        "psrah $f14, $f14, $f16                     \r\n"
-        "psubh $f0, $f0, $f6                        \r\n"
-        "dli $10, 0x2                               \r\n"
-        "psubh $f10, $f10, $f14                     \r\n"
-        "dmtc1 $10, $f18                            \r\n"
-        "mov.d $f14, $f2                            \r\n"
-        "psrah $f2, $f2, $f18                       \r\n"
-        "psrah $f6, $f8, $f18                       \r\n"
-        "paddh $f6, $f6, $f0                        \r\n"
-        "psrah $f0, $f0, $f18                       \r\n"
-        "paddh $f2, $f2, $f10                       \r\n"
-        "psrah $f10, $f10, $f18                     \r\n"
-        "psubh $f0, $f0, $f8                        \r\n"
-        "psubh $f14, $f14, $f10                     \r\n"
-        "mov.d $f10, $f12                           \r\n"
-        "psrah $f12, $f12, $f16                     \r\n"
-        "psrah $f8, $f4, $f16                       \r\n"
-        "paddh $f12, $f12, $f4                      \r\n"
-        "psubh $f8, $f8, $f10                       \r\n"
-        "ldc1 $f4, 0x0(%[block])                    \r\n"
-        "ldc1 $f10, 0x40(%[block])                  \r\n"
-        "paddh $f10, $f10, $f4                      \r\n"
-        "paddh $f4, $f4, $f4                        \r\n"
-        "paddh $f12, $f12, $f10                     \r\n"
-        "psubh $f4, $f4, $f10                       \r\n"
-        "paddh $f10, $f10, $f10                     \r\n"
-        "paddh $f8, $f8, $f4                        \r\n"
-        "psubh $f10, $f10, $f12                     \r\n"
-        "paddh $f4, $f4, $f4                        \r\n"
-        "paddh $f14, $f14, $f12                     \r\n"
-        "psubh $f4, $f4, $f8                        \r\n"
-        "paddh $f12, $f12, $f12                     \r\n"
-        "paddh $f0, $f0, $f8                        \r\n"
-        "psubh $f12, $f12, $f14                     \r\n"
-        "paddh $f8, $f8, $f8                        \r\n"
-        "paddh $f6, $f6, $f4                        \r\n"
-        "psubh $f8, $f8, $f0                        \r\n"
-        "paddh $f4, $f4, $f4                        \r\n"
-        "paddh $f2, $f2, $f10                       \r\n"
-        "psubh $f4, $f4, $f6                        \r\n"
-        "paddh $f10, $f10, $f10                     \r\n"
-        "sdc1 $f12, 0x0(%[block])                   \r\n"
-        "psubh $f10, $f10, $f2                      \r\n"
-        "punpckhhw $f12, $f14, $f0                  \r\n"
-        "punpcklhw $f14, $f14, $f0                  \r\n"
-        "punpckhhw $f0, $f6, $f2                    \r\n"
-        "punpcklhw $f6, $f6, $f2                    \r\n"
-        "punpckhwd $f2, $f14, $f6                   \r\n"
-        "punpcklwd $f14, $f14, $f6                  \r\n"
-        "punpckhwd $f6, $f12, $f0                   \r\n"
-        "punpcklwd $f12, $f12, $f0                  \r\n"
-        "ldc1 $f0, 0x0(%[block])                    \r\n"
-        "sdc1 $f14, 0x0($29)                        \r\n"
-        "sdc1 $f2, 0x10($29)                        \r\n"
-        "dmfc1 $8, $f12                             \r\n"
-        "dmfc1 $11, $f6                             \r\n"
-        "punpckhhw $f6, $f10, $f4                   \r\n"
-        "punpcklhw $f10, $f10, $f4                  \r\n"
-        "punpckhhw $f4, $f8, $f0                    \r\n"
-        "punpcklhw $f8, $f8, $f0                    \r\n"
-        "punpckhwd $f0, $f10, $f8                   \r\n"
-        "punpcklwd $f10, $f10, $f8                  \r\n"
-        "punpckhwd $f8, $f6, $f4                    \r\n"
-        "punpcklwd $f6, $f6, $f4                    \r\n"
-        "sdc1 $f10, 0x8($29)                        \r\n"
-        "sdc1 $f0, 0x18($29)                        \r\n"
-        "dmfc1 $9, $f6                              \r\n"
-        "dmfc1 $12, $f8                             \r\n"
-        "ldc1 $f2, 0x18(%[block])                   \r\n"
-        "ldc1 $f12, 0x28(%[block])                  \r\n"
-        "ldc1 $f4, 0x38(%[block])                   \r\n"
-        "ldc1 $f0, 0x58(%[block])                   \r\n"
-        "ldc1 $f6, 0x68(%[block])                   \r\n"
-        "ldc1 $f8, 0x78(%[block])                   \r\n"
-        "mov.d $f14, $f2                            \r\n"
-        "psrah $f10, $f0, $f16                      \r\n"
-        "psrah $f2, $f2, $f16                       \r\n"
-        "paddh $f10, $f10, $f0                      \r\n"
-        "paddh $f2, $f2, $f14                       \r\n"
-        "paddh $f10, $f10, $f8                      \r\n"
-        "paddh $f2, $f2, $f0                        \r\n"
-        "psubh $f10, $f10, $f14                     \r\n"
-        "paddh $f2, $f2, $f4                        \r\n"
-        "psubh $f14, $f14, $f4                      \r\n"
-        "psubh $f0, $f0, $f4                        \r\n"
-        "psrah $f4, $f4, $f16                       \r\n"
-        "paddh $f14, $f14, $f8                      \r\n"
-        "psubh $f0, $f0, $f8                        \r\n"
-        "psrah $f8, $f8, $f16                       \r\n"
-        "psubh $f14, $f14, $f4                      \r\n"
-        "psubh $f0, $f0, $f8                        \r\n"
-        "mov.d $f8, $f2                             \r\n"
-        "psrah $f4, $f10, $f18                      \r\n"
-        "psrah $f2, $f2, $f18                       \r\n"
-        "paddh $f4, $f4, $f14                       \r\n"
-        "psrah $f14, $f14, $f18                     \r\n"
-        "paddh $f2, $f2, $f0                        \r\n"
-        "psrah $f0, $f0, $f18                       \r\n"
-        "psubh $f14, $f14, $f10                     \r\n"
-        "psubh $f8, $f8, $f0                        \r\n"
-        "mov.d $f0, $f6                             \r\n"
-        "psrah $f6, $f6, $f16                       \r\n"
-        "psrah $f10, $f12, $f16                     \r\n"
-        "paddh $f6, $f6, $f12                       \r\n"
-        "psubh $f10, $f10, $f0                      \r\n"
-        "ldc1 $f12, 0x8(%[block])                   \r\n"
-        "ldc1 $f0, 0x48(%[block])                   \r\n"
-        "paddh $f0, $f0, $f12                       \r\n"
-        "paddh $f12, $f12, $f12                     \r\n"
-        "paddh $f6, $f6, $f0                        \r\n"
-        "psubh $f12, $f12, $f0                      \r\n"
-        "paddh $f0, $f0, $f0                        \r\n"
-        "paddh $f10, $f10, $f12                     \r\n"
-        "psubh $f0, $f0, $f6                        \r\n"
-        "paddh $f12, $f12, $f12                     \r\n"
-        "paddh $f8, $f8, $f6                        \r\n"
-        "psubh $f12, $f12, $f10                     \r\n"
-        "paddh $f6, $f6, $f6                        \r\n"
-        "paddh $f14, $f14, $f10                     \r\n"
-        "psubh $f6, $f6, $f8                        \r\n"
-        "paddh $f10, $f10, $f10                     \r\n"
-        "paddh $f4, $f4, $f12                       \r\n"
-        "psubh $f10, $f10, $f14                     \r\n"
-        "paddh $f12, $f12, $f12                     \r\n"
-        "paddh $f2, $f2, $f0                        \r\n"
-        "psubh $f12, $f12, $f4                      \r\n"
-        "paddh $f0, $f0, $f0                        \r\n"
-        "sdc1 $f6, 0x8(%[block])                    \r\n"
-        "psubh $f0, $f0, $f2                        \r\n"
-        "punpckhhw $f6, $f8, $f14                   \r\n"
-        "punpcklhw $f8, $f8, $f14                   \r\n"
-        "punpckhhw $f14, $f4, $f2                   \r\n"
-        "punpcklhw $f4, $f4, $f2                    \r\n"
-        "punpckhwd $f2, $f8, $f4                    \r\n"
-        "punpcklwd $f8, $f8, $f4                    \r\n"
-        "punpckhwd $f4, $f6, $f14                   \r\n"
-        "punpcklwd $f6, $f6, $f14                   \r\n"
-        "ldc1 $f14, 0x8(%[block])                   \r\n"
-        "dmfc1 $13, $f8                             \r\n"
-        "dmfc1 $15, $f2                             \r\n"
-        "mov.d $f24, $f6                            \r\n"
-        "mov.d $f28, $f4                            \r\n"
-        "punpckhhw $f4, $f0, $f12                   \r\n"
-        "punpcklhw $f0, $f0, $f12                   \r\n"
-        "punpckhhw $f12, $f10, $f14                 \r\n"
-        "punpcklhw $f10, $f10, $f14                 \r\n"
-        "punpckhwd $f14, $f0, $f10                  \r\n"
-        "punpcklwd $f0, $f0, $f10                   \r\n"
-        "punpckhwd $f10, $f4, $f12                  \r\n"
-        "punpcklwd $f4, $f4, $f12                   \r\n"
-        "dmfc1 $14, $f0                             \r\n"
-        "mov.d $f22, $f14                           \r\n"
-        "mov.d $f26, $f4                            \r\n"
-        "mov.d $f30, $f10                           \r\n"
-        "daddiu $10, %[dst], 0x4                    \r\n"
-        "dmtc1 $15, $f14                            \r\n"
-        "dmtc1 $11, $f12                            \r\n"
-        "ldc1 $f2, 0x10($29)                        \r\n"
-        "dmtc1 $8, $f6                              \r\n"
-        "mov.d $f8, $f2                             \r\n"
-        "psrah $f2, $f2, $f16                       \r\n"
-        "psrah $f0, $f14, $f16                      \r\n"
-        "paddh $f2, $f2, $f8                        \r\n"
-        "paddh $f0, $f0, $f14                       \r\n"
-        "paddh $f2, $f2, $f14                       \r\n"
-        "paddh $f0, $f0, $f28                       \r\n"
-        "paddh $f2, $f2, $f12                       \r\n"
-        "psubh $f0, $f0, $f8                        \r\n"
-        "psubh $f8, $f8, $f12                       \r\n"
-        "psubh $f14, $f14, $f12                     \r\n"
-        "psrah $f12, $f12, $f16                     \r\n"
-        "paddh $f8, $f8, $f28                       \r\n"
-        "psubh $f14, $f14, $f28                     \r\n"
-        "psrah $f10, $f28, $f16                     \r\n"
-        "psubh $f8, $f8, $f12                       \r\n"
-        "psubh $f14, $f14, $f10                     \r\n"
-        "mov.d $f10, $f2                            \r\n"
-        "psrah $f2, $f2, $f18                       \r\n"
-        "psrah $f12, $f0, $f18                      \r\n"
-        "paddh $f2, $f2, $f14                       \r\n"
-        "paddh $f12, $f12, $f8                      \r\n"
-        "psrah $f8, $f8, $f18                       \r\n"
-        "psrah $f14, $f14, $f18                     \r\n"
-        "psubh $f8, $f8, $f0                        \r\n"
-        "psubh $f10, $f10, $f14                     \r\n"
-        "mov.d $f14, $f24                           \r\n"
-        "psrah $f4, $f24, $f16                      \r\n"
-        "psrah $f0, $f6, $f16                       \r\n"
-        "paddh $f4, $f4, $f6                        \r\n"
-        "psubh $f0, $f0, $f14                       \r\n"
-        "ldc1 $f6, 0x0($29)                         \r\n"
-        "dmtc1 $13, $f14                            \r\n"
-        "paddh $f14, $f14, $f6                      \r\n"
-        "paddh $f6, $f6, $f6                        \r\n"
-        "paddh $f4, $f4, $f14                       \r\n"
-        "psubh $f6, $f6, $f14                       \r\n"
-        "paddh $f14, $f14, $f14                     \r\n"
-        "paddh $f0, $f0, $f6                        \r\n"
-        "psubh $f14, $f14, $f4                      \r\n"
-        "paddh $f6, $f6, $f6                        \r\n"
-        "paddh $f10, $f10, $f4                      \r\n"
-        "psubh $f6, $f6, $f0                        \r\n"
-        "paddh $f4, $f4, $f4                        \r\n"
-        "paddh $f8, $f8, $f0                        \r\n"
-        "psubh $f4, $f4, $f10                       \r\n"
-        "paddh $f0, $f0, $f0                        \r\n"
-        "paddh $f12, $f12, $f6                      \r\n"
-        "psubh $f0, $f0, $f8                        \r\n"
-        "paddh $f6, $f6, $f6                        \r\n"
-        "paddh $f2, $f2, $f14                       \r\n"
-        "psubh $f6, $f6, $f12                       \r\n"
-        "paddh $f14, $f14, $f14                     \r\n"
-        "sdc1 $f6, 0x0($29)                         \r\n"
-        "psubh $f14, $f14, $f2                      \r\n"
-        "sdc1 $f0, 0x10($29)                        \r\n"
-        "dmfc1 $8, $f4                              \r\n"
-        "xor $f4, $f4, $f4                          \r\n"
-        "sdc1 $f4, 0x0(%[block])                    \r\n"
-        "sdc1 $f4, 0x8(%[block])                    \r\n"
-        "sdc1 $f4, 0x10(%[block])                   \r\n"
-        "sdc1 $f4, 0x18(%[block])                   \r\n"
-        "sdc1 $f4, 0x20(%[block])                   \r\n"
-        "sdc1 $f4, 0x28(%[block])                   \r\n"
-        "sdc1 $f4, 0x30(%[block])                   \r\n"
-        "sdc1 $f4, 0x38(%[block])                   \r\n"
-        "sdc1 $f4, 0x40(%[block])                   \r\n"
-        "sdc1 $f4, 0x48(%[block])                   \r\n"
-        "sdc1 $f4, 0x50(%[block])                   \r\n"
-        "sdc1 $f4, 0x58(%[block])                   \r\n"
-        "sdc1 $f4, 0x60(%[block])                   \r\n"
-        "sdc1 $f4, 0x68(%[block])                   \r\n"
-        "sdc1 $f4, 0x70(%[block])                   \r\n"
-        "sdc1 $f4, 0x78(%[block])                   \r\n"
-        "dli $11, 0x6                               \r\n"
-        "lwc1 $f6, 0x0(%[dst])                      \r\n"
-        "dmtc1 $11, $f20                            \r\n"
-        "gslwxc1 $f0, 0x0(%[dst], %[stride])        \r\n"
-        "psrah $f10, $f10, $f20                     \r\n"
-        "psrah $f8, $f8, $f20                       \r\n"
-        "punpcklbh $f6, $f6, $f4                    \r\n"
-        "punpcklbh $f0, $f0, $f4                    \r\n"
-        "paddh $f6, $f6, $f10                       \r\n"
-        "paddh $f0, $f0, $f8                        \r\n"
-        "packushb $f6, $f6, $f4                     \r\n"
-        "packushb $f0, $f0, $f4                     \r\n"
-        "swc1 $f6, 0x0(%[dst])                      \r\n"
-        "gsswxc1 $f0, 0x0(%[dst], %[stride])        \r\n"
-        "daddu %[dst], %[dst], %[stride]            \r\n"
-        "daddu %[dst], %[dst], %[stride]            \r\n"
-        "lwc1 $f6, 0x0(%[dst])                      \r\n"
-        "gslwxc1 $f0, 0x0(%[dst], %[stride])        \r\n"
-        "psrah $f12, $f12, $f20                     \r\n"
-        "psrah $f2, $f2, $f20                       \r\n"
-        "punpcklbh $f6, $f6, $f4                    \r\n"
-        "punpcklbh $f0, $f0, $f4                    \r\n"
-        "paddh $f6, $f6, $f12                       \r\n"
-        "paddh $f0, $f0, $f2                        \r\n"
-        "packushb $f6, $f6, $f4                     \r\n"
-        "packushb $f0, $f0, $f4                     \r\n"
-        "swc1 $f6, 0x0(%[dst])                      \r\n"
-        "gsswxc1 $f0, 0x0(%[dst], %[stride])        \r\n"
-        "ldc1 $f10, 0x0($29)                        \r\n"
-        "ldc1 $f8, 0x10($29)                        \r\n"
-        "dmtc1 $8, $f12                             \r\n"
-        "daddu %[dst], %[dst], %[stride]            \r\n"
-        "daddu %[dst], %[dst], %[stride]            \r\n"
-        "lwc1 $f6, 0x0(%[dst])                      \r\n"
-        "gslwxc1 $f0, 0x0(%[dst], %[stride])        \r\n"
-        "psrah $f14, $f14, $f20                     \r\n"
-        "psrah $f10, $f10, $f20                     \r\n"
-        "punpcklbh $f6, $f6, $f4                    \r\n"
-        "punpcklbh $f0, $f0, $f4                    \r\n"
-        "paddh $f6, $f6, $f14                       \r\n"
-        "paddh $f0, $f0, $f10                       \r\n"
-        "packushb $f6, $f6, $f4                     \r\n"
-        "packushb $f0, $f0, $f4                     \r\n"
-        "swc1 $f6, 0x0(%[dst])                      \r\n"
-        "gsswxc1 $f0, 0x0(%[dst], %[stride])        \r\n"
-        "daddu %[dst], %[dst], %[stride]            \r\n"
-        "daddu %[dst], %[dst], %[stride]            \r\n"
-        "lwc1 $f6, 0x0(%[dst])                      \r\n"
-        "gslwxc1 $f0, 0x0(%[dst], %[stride])        \r\n"
-        "psrah $f8, $f8, $f20                       \r\n"
-        "psrah $f12, $f12, $f20                     \r\n"
-        "punpcklbh $f6, $f6, $f4                    \r\n"
-        "punpcklbh $f0, $f0, $f4                    \r\n"
-        "paddh $f6, $f6, $f8                        \r\n"
-        "paddh $f0, $f0, $f12                       \r\n"
-        "packushb $f6, $f6, $f4                     \r\n"
-        "packushb $f0, $f0, $f4                     \r\n"
-        "swc1 $f6, 0x0(%[dst])                      \r\n"
-        "gsswxc1 $f0, 0x0(%[dst], %[stride])        \r\n"
-        "dmtc1 $12, $f2                             \r\n"
-        "dmtc1 $9, $f12                             \r\n"
-        "ldc1 $f8, 0x18($29)                        \r\n"
-        "mov.d $f10, $f8                            \r\n"
-        "psrah $f8, $f8, $f16                       \r\n"
-        "psrah $f14, $f22, $f16                     \r\n"
-        "paddh $f14, $f14, $f22                     \r\n"
-        "paddh $f8, $f8, $f10                       \r\n"
-        "paddh $f14, $f14, $f30                     \r\n"
-        "paddh $f8, $f8, $f22                       \r\n"
-        "psubh $f14, $f14, $f10                     \r\n"
-        "paddh $f8, $f8, $f2                        \r\n"
-        "psubh $f10, $f10, $f2                      \r\n"
-        "psubh $f6, $f22, $f2                       \r\n"
-        "psrah $f2, $f2, $f16                       \r\n"
-        "paddh $f10, $f10, $f30                     \r\n"
-        "psubh $f6, $f6, $f30                       \r\n"
-        "psrah $f4, $f30, $f16                      \r\n"
-        "psubh $f10, $f10, $f2                      \r\n"
-        "psubh $f6, $f6, $f4                        \r\n"
-        "mov.d $f4, $f8                             \r\n"
-        "psrah $f8, $f8, $f18                       \r\n"
-        "psrah $f2, $f14, $f18                      \r\n"
-        "paddh $f8, $f8, $f6                        \r\n"
-        "paddh $f2, $f2, $f10                       \r\n"
-        "psrah $f10, $f10, $f18                     \r\n"
-        "psrah $f6, $f6, $f18                       \r\n"
-        "psubh $f10, $f10, $f14                     \r\n"
-        "psubh $f4, $f4, $f6                        \r\n"
-        "mov.d $f6, $f26                            \r\n"
-        "psrah $f0, $f26, $f16                      \r\n"
-        "psrah $f14, $f12, $f16                     \r\n"
-        "paddh $f0, $f0, $f12                       \r\n"
-        "psubh $f14, $f14, $f6                      \r\n"
-        "ldc1 $f12, 0x8($29)                        \r\n"
-        "dmtc1 $14, $f6                             \r\n"
-        "paddh $f6, $f6, $f12                       \r\n"
-        "paddh $f12, $f12, $f12                     \r\n"
-        "paddh $f0, $f0, $f6                        \r\n"
-        "psubh $f12, $f12, $f6                      \r\n"
-        "paddh $f6, $f6, $f6                        \r\n"
-        "paddh $f14, $f14, $f12                     \r\n"
-        "psubh $f6, $f6, $f0                        \r\n"
-        "paddh $f12, $f12, $f12                     \r\n"
-        "paddh $f4, $f4, $f0                        \r\n"
-        "psubh $f12, $f12, $f14                     \r\n"
-        "paddh $f0, $f0, $f0                        \r\n"
-        "paddh $f10, $f10, $f14                     \r\n"
-        "psubh $f0, $f0, $f4                        \r\n"
-        "paddh $f14, $f14, $f14                     \r\n"
-        "paddh $f2, $f2, $f12                       \r\n"
-        "psubh $f14, $f14, $f10                     \r\n"
-        "paddh $f12, $f12, $f12                     \r\n"
-        "paddh $f8, $f8, $f6                        \r\n"
-        "psubh $f12, $f12, $f2                      \r\n"
-        "paddh $f6, $f6, $f6                        \r\n"
-        "sdc1 $f12, 0x8($29)                        \r\n"
-        "psubh $f6, $f6, $f8                        \r\n"
-        "sdc1 $f14, 0x18($29)                       \r\n"
-        "dmfc1 $9, $f0                              \r\n"
-        "xor $f0, $f0, $f0                          \r\n"
-        "lwc1 $f12, 0x0($10)                        \r\n"
-        "gslwxc1 $f14, 0x0($10, %[stride])          \r\n"
-        "psrah $f4, $f4, $f20                       \r\n"
-        "psrah $f10, $f10, $f20                     \r\n"
-        "punpcklbh $f12, $f12, $f0                  \r\n"
-        "punpcklbh $f14, $f14, $f0                  \r\n"
-        "paddh $f12, $f12, $f4                      \r\n"
-        "paddh $f14, $f14, $f10                     \r\n"
-        "packushb $f12, $f12, $f0                   \r\n"
-        "packushb $f14, $f14, $f0                   \r\n"
-        "swc1 $f12, 0x0($10)                        \r\n"
-        "gsswxc1 $f14, 0x0($10, %[stride])          \r\n"
-        "daddu $10, $10, %[stride]                  \r\n"
-        "daddu $10, $10, %[stride]                  \r\n"
-        "lwc1 $f12, 0x0($10)                        \r\n"
-        "gslwxc1 $f14, 0x0($10, %[stride])          \r\n"
-        "psrah $f2, $f2, $f20                       \r\n"
-        "psrah $f8, $f8, $f20                       \r\n"
-        "punpcklbh $f12, $f12, $f0                  \r\n"
-        "punpcklbh $f14, $f14, $f0                  \r\n"
-        "paddh $f12, $f12, $f2                      \r\n"
-        "paddh $f14, $f14, $f8                      \r\n"
-        "packushb $f12, $f12, $f0                   \r\n"
-        "packushb $f14, $f14, $f0                   \r\n"
-        "swc1 $f12, 0x0($10)                        \r\n"
-        "gsswxc1 $f14, 0x0($10, %[stride])          \r\n"
-        "ldc1 $f4, 0x8($29)                         \r\n"
-        "ldc1 $f10, 0x18($29)                       \r\n"
-        "daddu $10, $10, %[stride]                  \r\n"
-        "dmtc1 $9, $f2                              \r\n"
-        "daddu $10, $10, %[stride]                  \r\n"
-        "lwc1 $f12, 0x0($10)                        \r\n"
-        "gslwxc1 $f14, 0x0($10, %[stride])          \r\n"
-        "psrah $f6, $f6, $f20                       \r\n"
-        "psrah $f4, $f4, $f20                       \r\n"
-        "punpcklbh $f12, $f12, $f0                  \r\n"
-        "punpcklbh $f14, $f14, $f0                  \r\n"
-        "paddh $f12, $f12, $f6                      \r\n"
-        "paddh $f14, $f14, $f4                      \r\n"
-        "packushb $f12, $f12, $f0                   \r\n"
-        "packushb $f14, $f14, $f0                   \r\n"
-        "swc1 $f12, 0x0($10)                        \r\n"
-        "gsswxc1 $f14, 0x0($10, %[stride])          \r\n"
-        "daddu $10, $10, %[stride]                  \r\n"
-        "daddu $10, $10, %[stride]                  \r\n"
-        "lwc1 $f12, 0x0($10)                        \r\n"
-        "gslwxc1 $f14, 0x0($10, %[stride])          \r\n"
-        "psrah $f10, $f10, $f20                     \r\n"
-        "psrah $f2, $f2, $f20                       \r\n"
-        "punpcklbh $f12, $f12, $f0                  \r\n"
-        "punpcklbh $f14, $f14, $f0                  \r\n"
-        "paddh $f12, $f12, $f10                     \r\n"
-        "paddh $f14, $f14, $f2                      \r\n"
-        "packushb $f12, $f12, $f0                   \r\n"
-        "packushb $f14, $f14, $f0                   \r\n"
-        "swc1 $f12, 0x0($10)                        \r\n"
-        "gsswxc1 $f14, 0x0($10, %[stride])          \r\n"
-        "daddiu $29, $29, 0x20                      \r\n"
-        ::[dst]"r"(dst),[block]"r"(block),[stride]"r"((uint64_t)stride)
-        :"$8","$9","$10","$11","$12","$13","$14","$15","$29","$f0","$f2","$f4",
-         "$f8","$f10","$f12","$f14","$f16","$f18","$f20","$f22","$f24","$f26",
-         "$f28","$f30"
+        "lhu       %[tmp0],     0x00(%[block])                          \n\t"
+        PTR_ADDI  "$29,         $29,            -0x20                   \n\t"
+        PTR_ADDIU "%[tmp0],     %[tmp0],        0x20                    \n\t"
+        "ldc1      %[ftmp1],    0x10(%[block])                          \n\t"
+        "sh        %[tmp0],     0x00(%[block])                          \n\t"
+        "ldc1      %[ftmp2],    0x20(%[block])                          \n\t"
+        "dli       %[tmp0],     0x01                                    \n\t"
+        "ldc1      %[ftmp3],    0x30(%[block])                          \n\t"
+        "mtc1      %[tmp0],     %[ftmp8]                                \n\t"
+        "ldc1      %[ftmp5],    0x50(%[block])                          \n\t"
+        "ldc1      %[ftmp6],    0x60(%[block])                          \n\t"
+        "ldc1      %[ftmp7],    0x70(%[block])                          \n\t"
+        "mov.d     %[ftmp0],    %[ftmp1]                                \n\t"
+        "psrah     %[ftmp1],    %[ftmp1],       %[ftmp8]                \n\t"
+        "psrah     %[ftmp4],    %[ftmp5],       %[ftmp8]                \n\t"
+        "paddh     %[ftmp1],    %[ftmp1],       %[ftmp0]                \n\t"
+        "paddh     %[ftmp4],    %[ftmp4],       %[ftmp5]                \n\t"
+        "paddh     %[ftmp1],    %[ftmp1],       %[ftmp5]                \n\t"
+        "paddh     %[ftmp4],    %[ftmp4],       %[ftmp7]                \n\t"
+        "paddh     %[ftmp1],    %[ftmp1],       %[ftmp3]                \n\t"
+        "psubh     %[ftmp4],    %[ftmp4],       %[ftmp0]                \n\t"
+        "psubh     %[ftmp0],    %[ftmp0],       %[ftmp3]                \n\t"
+        "psubh     %[ftmp5],    %[ftmp5],       %[ftmp3]                \n\t"
+        "psrah     %[ftmp3],    %[ftmp3],       %[ftmp8]                \n\t"
+        "paddh     %[ftmp0],    %[ftmp0],       %[ftmp7]                \n\t"
+        "psubh     %[ftmp5],    %[ftmp5],       %[ftmp7]                \n\t"
+        "psrah     %[ftmp7],    %[ftmp7],       %[ftmp8]                \n\t"
+        "psubh     %[ftmp0],    %[ftmp0],       %[ftmp3]                \n\t"
+        "dli       %[tmp0],     0x02                                    \n\t"
+        "psubh     %[ftmp5],    %[ftmp5],       %[ftmp7]                \n\t"
+        "mtc1      %[tmp0],     %[ftmp9]                                \n\t"
+        "mov.d     %[ftmp7],    %[ftmp1]                                \n\t"
+        "psrah     %[ftmp1],    %[ftmp1],       %[ftmp9]                \n\t"
+        "psrah     %[ftmp3],    %[ftmp4],       %[ftmp9]                \n\t"
+        "paddh     %[ftmp3],    %[ftmp3],       %[ftmp0]                \n\t"
+        "psrah     %[ftmp0],    %[ftmp0],       %[ftmp9]                \n\t"
+        "paddh     %[ftmp1],    %[ftmp1],       %[ftmp5]                \n\t"
+        "psrah     %[ftmp5],    %[ftmp5],       %[ftmp9]                \n\t"
+        "psubh     %[ftmp0],    %[ftmp0],       %[ftmp4]                \n\t"
+        "psubh     %[ftmp7],    %[ftmp7],       %[ftmp5]                \n\t"
+        "mov.d     %[ftmp5],    %[ftmp6]                                \n\t"
+        "psrah     %[ftmp6],    %[ftmp6],       %[ftmp8]                \n\t"
+        "psrah     %[ftmp4],    %[ftmp2],       %[ftmp8]                \n\t"
+        "paddh     %[ftmp6],    %[ftmp6],       %[ftmp2]                \n\t"
+        "psubh     %[ftmp4],    %[ftmp4],       %[ftmp5]                \n\t"
+        "ldc1      %[ftmp2],    0x00(%[block])                          \n\t"
+        "ldc1      %[ftmp5],    0x40(%[block])                          \n\t"
+        "paddh     %[ftmp5],    %[ftmp5],       %[ftmp2]                \n\t"
+        "paddh     %[ftmp2],    %[ftmp2],       %[ftmp2]                \n\t"
+        "paddh     %[ftmp6],    %[ftmp6],       %[ftmp5]                \n\t"
+        "psubh     %[ftmp2],    %[ftmp2],       %[ftmp5]                \n\t"
+        "paddh     %[ftmp5],    %[ftmp5],       %[ftmp5]                \n\t"
+        "paddh     %[ftmp4],    %[ftmp4],       %[ftmp2]                \n\t"
+        "psubh     %[ftmp5],    %[ftmp5],       %[ftmp6]                \n\t"
+        "paddh     %[ftmp2],    %[ftmp2],       %[ftmp2]                \n\t"
+        "paddh     %[ftmp7],    %[ftmp7],       %[ftmp6]                \n\t"
+        "psubh     %[ftmp2],    %[ftmp2],       %[ftmp4]                \n\t"
+        "paddh     %[ftmp6],    %[ftmp6],       %[ftmp6]                \n\t"
+        "paddh     %[ftmp0],    %[ftmp0],       %[ftmp4]                \n\t"
+        "psubh     %[ftmp6],    %[ftmp6],       %[ftmp7]                \n\t"
+        "paddh     %[ftmp4],    %[ftmp4],       %[ftmp4]                \n\t"
+        "paddh     %[ftmp3],    %[ftmp3],       %[ftmp2]                \n\t"
+        "psubh     %[ftmp4],    %[ftmp4],       %[ftmp0]                \n\t"
+        "paddh     %[ftmp2],    %[ftmp2],       %[ftmp2]                \n\t"
+        "paddh     %[ftmp1],    %[ftmp1],       %[ftmp5]                \n\t"
+        "psubh     %[ftmp2],    %[ftmp2],       %[ftmp3]                \n\t"
+        "paddh     %[ftmp5],    %[ftmp5],       %[ftmp5]                \n\t"
+        "sdc1      %[ftmp6],    0x00(%[block])                          \n\t"
+        "psubh     %[ftmp5],    %[ftmp5],       %[ftmp1]                \n\t"
+        "punpckhhw %[ftmp6],    %[ftmp7],       %[ftmp0]                \n\t"
+        "punpcklhw %[ftmp7],    %[ftmp7],       %[ftmp0]                \n\t"
+        "punpckhhw %[ftmp0],    %[ftmp3],       %[ftmp1]                \n\t"
+        "punpcklhw %[ftmp3],    %[ftmp3],       %[ftmp1]                \n\t"
+        "punpckhwd %[ftmp1],    %[ftmp7],       %[ftmp3]                \n\t"
+        "punpcklwd %[ftmp7],    %[ftmp7],       %[ftmp3]                \n\t"
+        "punpckhwd %[ftmp3],    %[ftmp6],       %[ftmp0]                \n\t"
+        "punpcklwd %[ftmp6],    %[ftmp6],       %[ftmp0]                \n\t"
+        "ldc1      %[ftmp0],    0x00(%[block])                          \n\t"
+        "sdc1      %[ftmp7],    0x00($29)                               \n\t"
+        "sdc1      %[ftmp1],    0x10($29)                               \n\t"
+        "dmfc1     %[tmp1],     %[ftmp6]                                \n\t"
+        "dmfc1     %[tmp3],     %[ftmp3]                                \n\t"
+        "punpckhhw %[ftmp3],    %[ftmp5],       %[ftmp2]                \n\t"
+        "punpcklhw %[ftmp5],    %[ftmp5],       %[ftmp2]                \n\t"
+        "punpckhhw %[ftmp2],    %[ftmp4],       %[ftmp0]                \n\t"
+        "punpcklhw %[ftmp4],    %[ftmp4],       %[ftmp0]                \n\t"
+        "punpckhwd %[ftmp0],    %[ftmp5],       %[ftmp4]                \n\t"
+        "punpcklwd %[ftmp5],    %[ftmp5],       %[ftmp4]                \n\t"
+        "punpckhwd %[ftmp4],    %[ftmp3],       %[ftmp2]                \n\t"
+        "punpcklwd %[ftmp3],    %[ftmp3],       %[ftmp2]                \n\t"
+        "sdc1      %[ftmp5],    0x08($29)                               \n\t"
+        "sdc1      %[ftmp0],    0x18($29)                               \n\t"
+        "dmfc1     %[tmp2],     %[ftmp3]                                \n\t"
+        "dmfc1     %[tmp4],     %[ftmp4]                                \n\t"
+        "ldc1      %[ftmp1],    0x18(%[block])                          \n\t"
+        "ldc1      %[ftmp6],    0x28(%[block])                          \n\t"
+        "ldc1      %[ftmp2],    0x38(%[block])                          \n\t"
+        "ldc1      %[ftmp0],    0x58(%[block])                          \n\t"
+        "ldc1      %[ftmp3],    0x68(%[block])                          \n\t"
+        "ldc1      %[ftmp4],    0x78(%[block])                          \n\t"
+        "mov.d     %[ftmp7],    %[ftmp1]                                \n\t"
+        "psrah     %[ftmp5],    %[ftmp0],       %[ftmp8]                \n\t"
+        "psrah     %[ftmp1],    %[ftmp1],       %[ftmp8]                \n\t"
+        "paddh     %[ftmp5],    %[ftmp5],       %[ftmp0]                \n\t"
+        "paddh     %[ftmp1],    %[ftmp1],       %[ftmp7]                \n\t"
+        "paddh     %[ftmp5],    %[ftmp5],       %[ftmp4]                \n\t"
+        "paddh     %[ftmp1],    %[ftmp1],       %[ftmp0]                \n\t"
+        "psubh     %[ftmp5],    %[ftmp5],       %[ftmp7]                \n\t"
+        "paddh     %[ftmp1],    %[ftmp1],       %[ftmp2]                \n\t"
+        "psubh     %[ftmp7],    %[ftmp7],       %[ftmp2]                \n\t"
+        "psubh     %[ftmp0],    %[ftmp0],       %[ftmp2]                \n\t"
+        "psrah     %[ftmp2],    %[ftmp2],       %[ftmp8]                \n\t"
+        "paddh     %[ftmp7],    %[ftmp7],       %[ftmp4]                \n\t"
+        "psubh     %[ftmp0],    %[ftmp0],       %[ftmp4]                \n\t"
+        "psrah     %[ftmp4],    %[ftmp4],       %[ftmp8]                \n\t"
+        "psubh     %[ftmp7],    %[ftmp7],       %[ftmp2]                \n\t"
+        "psubh     %[ftmp0],    %[ftmp0],       %[ftmp4]                \n\t"
+        "mov.d     %[ftmp4],    %[ftmp1]                                \n\t"
+        "psrah     %[ftmp2],    %[ftmp5],       %[ftmp9]                \n\t"
+        "psrah     %[ftmp1],    %[ftmp1],       %[ftmp9]                \n\t"
+        "paddh     %[ftmp2],    %[ftmp2],       %[ftmp7]                \n\t"
+        "psrah     %[ftmp7],    %[ftmp7],       %[ftmp9]                \n\t"
+        "paddh     %[ftmp1],    %[ftmp1],       %[ftmp0]                \n\t"
+        "psrah     %[ftmp0],    %[ftmp0],       %[ftmp9]                \n\t"
+        "psubh     %[ftmp7],    %[ftmp7],       %[ftmp5]                \n\t"
+        "psubh     %[ftmp4],    %[ftmp4],       %[ftmp0]                \n\t"
+        "mov.d     %[ftmp0],    %[ftmp3]                                \n\t"
+        "psrah     %[ftmp3],    %[ftmp3],       %[ftmp8]                \n\t"
+        "psrah     %[ftmp5],    %[ftmp6],       %[ftmp8]                \n\t"
+        "paddh     %[ftmp3],    %[ftmp3],       %[ftmp6]                \n\t"
+        "psubh     %[ftmp5],    %[ftmp5],       %[ftmp0]                \n\t"
+        "ldc1      %[ftmp6],    0x08(%[block])                          \n\t"
+        "ldc1      %[ftmp0],    0x48(%[block])                          \n\t"
+        "paddh     %[ftmp0],    %[ftmp0],       %[ftmp6]                \n\t"
+        "paddh     %[ftmp6],    %[ftmp6],       %[ftmp6]                \n\t"
+        "paddh     %[ftmp3],    %[ftmp3],       %[ftmp0]                \n\t"
+        "psubh     %[ftmp6],    %[ftmp6],       %[ftmp0]                \n\t"
+        "paddh     %[ftmp0],    %[ftmp0],       %[ftmp0]                \n\t"
+        "paddh     %[ftmp5],    %[ftmp5],       %[ftmp6]                \n\t"
+        "psubh     %[ftmp0],    %[ftmp0],       %[ftmp3]                \n\t"
+        "paddh     %[ftmp6],    %[ftmp6],       %[ftmp6]                \n\t"
+        "paddh     %[ftmp4],    %[ftmp4],       %[ftmp3]                \n\t"
+        "psubh     %[ftmp6],    %[ftmp6],       %[ftmp5]                \n\t"
+        "paddh     %[ftmp3],    %[ftmp3],       %[ftmp3]                \n\t"
+        "paddh     %[ftmp7],    %[ftmp7],       %[ftmp5]                \n\t"
+        "psubh     %[ftmp3],    %[ftmp3],       %[ftmp4]                \n\t"
+        "paddh     %[ftmp5],    %[ftmp5],       %[ftmp5]                \n\t"
+        "paddh     %[ftmp2],    %[ftmp2],       %[ftmp6]                \n\t"
+        "psubh     %[ftmp5],    %[ftmp5],       %[ftmp7]                \n\t"
+        "paddh     %[ftmp6],    %[ftmp6],       %[ftmp6]                \n\t"
+        "paddh     %[ftmp1],    %[ftmp1],       %[ftmp0]                \n\t"
+        "psubh     %[ftmp6],    %[ftmp6],       %[ftmp2]                \n\t"
+        "paddh     %[ftmp0],    %[ftmp0],       %[ftmp0]                \n\t"
+        "sdc1      %[ftmp3],    0x08(%[block])                          \n\t"
+        "psubh     %[ftmp0],    %[ftmp0],       %[ftmp1]                \n\t"
+        "punpckhhw %[ftmp3],    %[ftmp4],       %[ftmp7]                \n\t"
+        "punpcklhw %[ftmp4],    %[ftmp4],       %[ftmp7]                \n\t"
+        "punpckhhw %[ftmp7],    %[ftmp2],       %[ftmp1]                \n\t"
+        "punpcklhw %[ftmp2],    %[ftmp2],       %[ftmp1]                \n\t"
+        "punpckhwd %[ftmp1],    %[ftmp4],       %[ftmp2]                \n\t"
+        "punpcklwd %[ftmp4],    %[ftmp4],       %[ftmp2]                \n\t"
+        "punpckhwd %[ftmp2],    %[ftmp3],       %[ftmp7]                \n\t"
+        "punpcklwd %[ftmp3],    %[ftmp3],       %[ftmp7]                \n\t"
+        "ldc1      %[ftmp7],    0x08(%[block])                          \n\t"
+        "dmfc1     %[tmp5],     %[ftmp4]                                \n\t"
+        "dmfc1     %[tmp7],     %[ftmp1]                                \n\t"
+        "mov.d     %[ftmp12],   %[ftmp3]                                \n\t"
+        "mov.d     %[ftmp14],   %[ftmp2]                                \n\t"
+        "punpckhhw %[ftmp2],    %[ftmp0],       %[ftmp6]                \n\t"
+        "punpcklhw %[ftmp0],    %[ftmp0],       %[ftmp6]                \n\t"
+        "punpckhhw %[ftmp6],    %[ftmp5],       %[ftmp7]                \n\t"
+        "punpcklhw %[ftmp5],    %[ftmp5],       %[ftmp7]                \n\t"
+        "punpckhwd %[ftmp7],    %[ftmp0],       %[ftmp5]                \n\t"
+        "punpcklwd %[ftmp0],    %[ftmp0],       %[ftmp5]                \n\t"
+        "punpckhwd %[ftmp5],    %[ftmp2],       %[ftmp6]                \n\t"
+        "punpcklwd %[ftmp2],    %[ftmp2],       %[ftmp6]                \n\t"
+        "dmfc1     %[tmp6],     %[ftmp0]                                \n\t"
+        "mov.d     %[ftmp11],   %[ftmp7]                                \n\t"
+        "mov.d     %[ftmp13],   %[ftmp2]                                \n\t"
+        "mov.d     %[ftmp15],   %[ftmp5]                                \n\t"
+        PTR_ADDIU "%[addr0],    %[dst],         0x04                    \n\t"
+        "dmtc1     %[tmp7],     %[ftmp7]                                \n\t"
+        "dmtc1     %[tmp3],     %[ftmp6]                                \n\t"
+        "ldc1      %[ftmp1],    0x10($29)                               \n\t"
+        "dmtc1     %[tmp1],     %[ftmp3]                                \n\t"
+        "mov.d     %[ftmp4],    %[ftmp1]                                \n\t"
+        "psrah     %[ftmp1],    %[ftmp1],       %[ftmp8]                \n\t"
+        "psrah     %[ftmp0],    %[ftmp7],       %[ftmp8]                \n\t"
+        "paddh     %[ftmp1],    %[ftmp1],       %[ftmp4]                \n\t"
+        "paddh     %[ftmp0],    %[ftmp0],       %[ftmp7]                \n\t"
+        "paddh     %[ftmp1],    %[ftmp1],       %[ftmp7]                \n\t"
+        "paddh     %[ftmp0],    %[ftmp0],       %[ftmp14]               \n\t"
+        "paddh     %[ftmp1],    %[ftmp1],       %[ftmp6]                \n\t"
+        "psubh     %[ftmp0],    %[ftmp0],       %[ftmp4]                \n\t"
+        "psubh     %[ftmp4],    %[ftmp4],       %[ftmp6]                \n\t"
+        "psubh     %[ftmp7],    %[ftmp7],       %[ftmp6]                \n\t"
+        "psrah     %[ftmp6],    %[ftmp6],       %[ftmp8]                \n\t"
+        "paddh     %[ftmp4],    %[ftmp4],       %[ftmp14]               \n\t"
+        "psubh     %[ftmp7],    %[ftmp7],       %[ftmp14]               \n\t"
+        "psrah     %[ftmp5],    %[ftmp14],      %[ftmp8]                \n\t"
+        "psubh     %[ftmp4],    %[ftmp4],       %[ftmp6]                \n\t"
+        "psubh     %[ftmp7],    %[ftmp7],       %[ftmp5]                \n\t"
+        "mov.d     %[ftmp5],    %[ftmp1]                                \n\t"
+        "psrah     %[ftmp1],    %[ftmp1],       %[ftmp9]                \n\t"
+        "psrah     %[ftmp6],    %[ftmp0],       %[ftmp9]                \n\t"
+        "paddh     %[ftmp1],    %[ftmp1],       %[ftmp7]                \n\t"
+        "paddh     %[ftmp6],    %[ftmp6],       %[ftmp4]                \n\t"
+        "psrah     %[ftmp4],    %[ftmp4],       %[ftmp9]                \n\t"
+        "psrah     %[ftmp7],    %[ftmp7],       %[ftmp9]                \n\t"
+        "psubh     %[ftmp4],    %[ftmp4],       %[ftmp0]                \n\t"
+        "psubh     %[ftmp5],    %[ftmp5],       %[ftmp7]                \n\t"
+        "mov.d     %[ftmp7],    %[ftmp12]                               \n\t"
+        "psrah     %[ftmp2],    %[ftmp12],      %[ftmp8]                \n\t"
+        "psrah     %[ftmp0],    %[ftmp3],       %[ftmp8]                \n\t"
+        "paddh     %[ftmp2],    %[ftmp2],       %[ftmp3]                \n\t"
+        "psubh     %[ftmp0],    %[ftmp0],       %[ftmp7]                \n\t"
+        "ldc1      %[ftmp3],    0x00($29)                               \n\t"
+        "dmtc1     %[tmp5],     %[ftmp7]                                \n\t"
+        "paddh     %[ftmp7],    %[ftmp7],       %[ftmp3]                \n\t"
+        "paddh     %[ftmp3],    %[ftmp3],       %[ftmp3]                \n\t"
+        "paddh     %[ftmp2],    %[ftmp2],       %[ftmp7]                \n\t"
+        "psubh     %[ftmp3],    %[ftmp3],       %[ftmp7]                \n\t"
+        "paddh     %[ftmp7],    %[ftmp7],       %[ftmp7]                \n\t"
+        "paddh     %[ftmp0],    %[ftmp0],       %[ftmp3]                \n\t"
+        "psubh     %[ftmp7],    %[ftmp7],       %[ftmp2]                \n\t"
+        "paddh     %[ftmp3],    %[ftmp3],       %[ftmp3]                \n\t"
+        "paddh     %[ftmp5],    %[ftmp5],       %[ftmp2]                \n\t"
+        "psubh     %[ftmp3],    %[ftmp3],       %[ftmp0]                \n\t"
+        "paddh     %[ftmp2],    %[ftmp2],       %[ftmp2]                \n\t"
+        "paddh     %[ftmp4],    %[ftmp4],       %[ftmp0]                \n\t"
+        "psubh     %[ftmp2],    %[ftmp2],       %[ftmp5]                \n\t"
+        "paddh     %[ftmp0],    %[ftmp0],       %[ftmp0]                \n\t"
+        "paddh     %[ftmp6],    %[ftmp6],       %[ftmp3]                \n\t"
+        "psubh     %[ftmp0],    %[ftmp0],       %[ftmp4]                \n\t"
+        "paddh     %[ftmp3],    %[ftmp3],       %[ftmp3]                \n\t"
+        "paddh     %[ftmp1],    %[ftmp1],       %[ftmp7]                \n\t"
+        "psubh     %[ftmp3],    %[ftmp3],       %[ftmp6]                \n\t"
+        "paddh     %[ftmp7],    %[ftmp7],       %[ftmp7]                \n\t"
+        "sdc1      %[ftmp3],    0x00($29)                               \n\t"
+        "psubh     %[ftmp7],    %[ftmp7],       %[ftmp1]                \n\t"
+        "sdc1      %[ftmp0],    0x10($29)                               \n\t"
+        "dmfc1     %[tmp1],     %[ftmp2]                                \n\t"
+        "xor       %[ftmp2],    %[ftmp2],       %[ftmp2]                \n\t"
+        "sdc1      %[ftmp2],    0x00(%[block])                          \n\t"
+        "sdc1      %[ftmp2],    0x08(%[block])                          \n\t"
+        "sdc1      %[ftmp2],    0x10(%[block])                          \n\t"
+        "sdc1      %[ftmp2],    0x18(%[block])                          \n\t"
+        "sdc1      %[ftmp2],    0x20(%[block])                          \n\t"
+        "sdc1      %[ftmp2],    0x28(%[block])                          \n\t"
+        "sdc1      %[ftmp2],    0x30(%[block])                          \n\t"
+        "sdc1      %[ftmp2],    0x38(%[block])                          \n\t"
+        "sdc1      %[ftmp2],    0x40(%[block])                          \n\t"
+        "sdc1      %[ftmp2],    0x48(%[block])                          \n\t"
+        "sdc1      %[ftmp2],    0x50(%[block])                          \n\t"
+        "sdc1      %[ftmp2],    0x58(%[block])                          \n\t"
+        "sdc1      %[ftmp2],    0x60(%[block])                          \n\t"
+        "sdc1      %[ftmp2],    0x68(%[block])                          \n\t"
+        "sdc1      %[ftmp2],    0x70(%[block])                          \n\t"
+        "sdc1      %[ftmp2],    0x78(%[block])                          \n\t"
+        "dli       %[tmp3],     0x06                                    \n\t"
+        "uld       %[low32],    0x00(%[dst])                            \n\t"
+        "mtc1      %[low32],    %[ftmp3]                                \n\t"
+        "mtc1      %[tmp3],     %[ftmp10]                               \n\t"
+        "gslwxc1   %[ftmp0],    0x00(%[dst],    %[stride])              \n\t"
+        "psrah     %[ftmp5],    %[ftmp5],       %[ftmp10]               \n\t"
+        "psrah     %[ftmp4],    %[ftmp4],       %[ftmp10]               \n\t"
+        "punpcklbh %[ftmp3],    %[ftmp3],       %[ftmp2]                \n\t"
+        "punpcklbh %[ftmp0],    %[ftmp0],       %[ftmp2]                \n\t"
+        "paddh     %[ftmp3],    %[ftmp3],       %[ftmp5]                \n\t"
+        "paddh     %[ftmp0],    %[ftmp0],       %[ftmp4]                \n\t"
+        "packushb  %[ftmp3],    %[ftmp3],       %[ftmp2]                \n\t"
+        "packushb  %[ftmp0],    %[ftmp0],       %[ftmp2]                \n\t"
+        "gsswlc1   %[ftmp3],    0x03(%[dst])                            \n\t"
+        "gsswrc1   %[ftmp3],    0x00(%[dst])                            \n\t"
+        "gsswxc1   %[ftmp0],    0x00(%[dst],    %[stride])              \n\t"
+        PTR_ADDU  "%[dst],      %[dst],         %[stride]               \n\t"
+        PTR_ADDU  "%[dst],      %[dst],         %[stride]               \n\t"
+        "uld       %[low32],    0x00(%[dst])                            \n\t"
+        "mtc1      %[low32],    %[ftmp3]                                \n\t"
+        "gslwxc1   %[ftmp0],    0x00(%[dst],    %[stride])              \n\t"
+        "psrah     %[ftmp6],    %[ftmp6],       %[ftmp10]               \n\t"
+        "psrah     %[ftmp1],    %[ftmp1],       %[ftmp10]               \n\t"
+        "punpcklbh %[ftmp3],    %[ftmp3],       %[ftmp2]                \n\t"
+        "punpcklbh %[ftmp0],    %[ftmp0],       %[ftmp2]                \n\t"
+        "paddh     %[ftmp3],    %[ftmp3],       %[ftmp6]                \n\t"
+        "paddh     %[ftmp0],    %[ftmp0],       %[ftmp1]                \n\t"
+        "packushb  %[ftmp3],    %[ftmp3],       %[ftmp2]                \n\t"
+        "packushb  %[ftmp0],    %[ftmp0],       %[ftmp2]                \n\t"
+        "gsswlc1   %[ftmp3],    0x03(%[dst])                            \n\t"
+        "gsswrc1   %[ftmp3],    0x00(%[dst])                            \n\t"
+        "gsswxc1   %[ftmp0],    0x00(%[dst],    %[stride])              \n\t"
+        "ldc1      %[ftmp5],    0x00($29)                               \n\t"
+        "ldc1      %[ftmp4],    0x10($29)                               \n\t"
+        "dmtc1     %[tmp1],     %[ftmp6]                                \n\t"
+        PTR_ADDU  "%[dst],      %[dst],         %[stride]               \n\t"
+        PTR_ADDU  "%[dst],      %[dst],         %[stride]               \n\t"
+        "uld       %[low32],    0x00(%[dst])                            \n\t"
+        "mtc1      %[low32],    %[ftmp3]                                \n\t"
+        "gslwxc1   %[ftmp0],    0x00(%[dst],    %[stride])              \n\t"
+        "psrah     %[ftmp7],    %[ftmp7],       %[ftmp10]               \n\t"
+        "psrah     %[ftmp5],    %[ftmp5],       %[ftmp10]               \n\t"
+        "punpcklbh %[ftmp3],    %[ftmp3],       %[ftmp2]                \n\t"
+        "punpcklbh %[ftmp0],    %[ftmp0],       %[ftmp2]                \n\t"
+        "paddh     %[ftmp3],    %[ftmp3],       %[ftmp7]                \n\t"
+        "paddh     %[ftmp0],    %[ftmp0],       %[ftmp5]                \n\t"
+        "packushb  %[ftmp3],    %[ftmp3],       %[ftmp2]                \n\t"
+        "packushb  %[ftmp0],    %[ftmp0],       %[ftmp2]                \n\t"
+        "gsswlc1   %[ftmp3],    0x03(%[dst])                            \n\t"
+        "gsswrc1   %[ftmp3],    0x00(%[dst])                            \n\t"
+        "gsswxc1   %[ftmp0],    0x00(%[dst],    %[stride])              \n\t"
+        PTR_ADDU  "%[dst],      %[dst],         %[stride]               \n\t"
+        PTR_ADDU  "%[dst],      %[dst],         %[stride]               \n\t"
+        "uld       %[low32],    0x00(%[dst])                            \n\t"
+        "mtc1      %[low32],    %[ftmp3]                                \n\t"
+        "gslwxc1   %[ftmp0],    0x00(%[dst],    %[stride])              \n\t"
+        "psrah     %[ftmp4],    %[ftmp4],       %[ftmp10]               \n\t"
+        "psrah     %[ftmp6],    %[ftmp6],       %[ftmp10]               \n\t"
+        "punpcklbh %[ftmp3],    %[ftmp3],       %[ftmp2]                \n\t"
+        "punpcklbh %[ftmp0],    %[ftmp0],       %[ftmp2]                \n\t"
+        "paddh     %[ftmp3],    %[ftmp3],       %[ftmp4]                \n\t"
+        "paddh     %[ftmp0],    %[ftmp0],       %[ftmp6]                \n\t"
+        "packushb  %[ftmp3],    %[ftmp3],       %[ftmp2]                \n\t"
+        "packushb  %[ftmp0],    %[ftmp0],       %[ftmp2]                \n\t"
+        "gsswlc1   %[ftmp3],    0x03(%[dst])                            \n\t"
+        "gsswrc1   %[ftmp3],    0x00(%[dst])                            \n\t"
+        "gsswxc1   %[ftmp0],    0x00(%[dst],    %[stride])              \n\t"
+        "dmtc1     %[tmp4],     %[ftmp1]                                \n\t"
+        "dmtc1     %[tmp2],     %[ftmp6]                                \n\t"
+        "ldc1      %[ftmp4],    0x18($29)                               \n\t"
+        "mov.d     %[ftmp5],    %[ftmp4]                                \n\t"
+        "psrah     %[ftmp4],    %[ftmp4],       %[ftmp8]                \n\t"
+        "psrah     %[ftmp7],    %[ftmp11],      %[ftmp8]                \n\t"
+        "paddh     %[ftmp7],    %[ftmp7],       %[ftmp11]               \n\t"
+        "paddh     %[ftmp4],    %[ftmp4],       %[ftmp5]                \n\t"
+        "paddh     %[ftmp7],    %[ftmp7],       %[ftmp15]               \n\t"
+        "paddh     %[ftmp4],    %[ftmp4],       %[ftmp11]               \n\t"
+        "psubh     %[ftmp7],    %[ftmp7],       %[ftmp5]                \n\t"
+        "paddh     %[ftmp4],    %[ftmp4],       %[ftmp1]                \n\t"
+        "psubh     %[ftmp5],    %[ftmp5],       %[ftmp1]                \n\t"
+        "psubh     %[ftmp3],    %[ftmp11],      %[ftmp1]                \n\t"
+        "psrah     %[ftmp1],    %[ftmp1],       %[ftmp8]                \n\t"
+        "paddh     %[ftmp5],    %[ftmp5],       %[ftmp15]               \n\t"
+        "psubh     %[ftmp3],    %[ftmp3],       %[ftmp15]               \n\t"
+        "psrah     %[ftmp2],    %[ftmp15],      %[ftmp8]                \n\t"
+        "psubh     %[ftmp5],    %[ftmp5],       %[ftmp1]                \n\t"
+        "psubh     %[ftmp3],    %[ftmp3],       %[ftmp2]                \n\t"
+        "mov.d     %[ftmp2],    %[ftmp4]                                \n\t"
+        "psrah     %[ftmp4],    %[ftmp4],       %[ftmp9]                \n\t"
+        "psrah     %[ftmp1],    %[ftmp7],       %[ftmp9]                \n\t"
+        "paddh     %[ftmp4],    %[ftmp4],       %[ftmp3]                \n\t"
+        "paddh     %[ftmp1],    %[ftmp1],       %[ftmp5]                \n\t"
+        "psrah     %[ftmp5],    %[ftmp5],       %[ftmp9]                \n\t"
+        "psrah     %[ftmp3],    %[ftmp3],       %[ftmp9]                \n\t"
+        "psubh     %[ftmp5],    %[ftmp5],       %[ftmp7]                \n\t"
+        "psubh     %[ftmp2],    %[ftmp2],       %[ftmp3]                \n\t"
+        "mov.d     %[ftmp3],    %[ftmp13]                               \n\t"
+        "psrah     %[ftmp0],    %[ftmp13],      %[ftmp8]                \n\t"
+        "psrah     %[ftmp7],    %[ftmp6],       %[ftmp8]                \n\t"
+        "paddh     %[ftmp0],    %[ftmp0],       %[ftmp6]                \n\t"
+        "psubh     %[ftmp7],    %[ftmp7],       %[ftmp3]                \n\t"
+        "ldc1      %[ftmp6],    0x08($29)                               \n\t"
+        "dmtc1     %[tmp6],     %[ftmp3]                                \n\t"
+        "paddh     %[ftmp3],    %[ftmp3],       %[ftmp6]                \n\t"
+        "paddh     %[ftmp6],    %[ftmp6],       %[ftmp6]                \n\t"
+        "paddh     %[ftmp0],    %[ftmp0],       %[ftmp3]                \n\t"
+        "psubh     %[ftmp6],    %[ftmp6],       %[ftmp3]                \n\t"
+        "paddh     %[ftmp3],    %[ftmp3],       %[ftmp3]                \n\t"
+        "paddh     %[ftmp7],    %[ftmp7],       %[ftmp6]                \n\t"
+        "psubh     %[ftmp3],    %[ftmp3],       %[ftmp0]                \n\t"
+        "paddh     %[ftmp6],    %[ftmp6],       %[ftmp6]                \n\t"
+        "paddh     %[ftmp2],    %[ftmp2],       %[ftmp0]                \n\t"
+        "psubh     %[ftmp6],    %[ftmp6],       %[ftmp7]                \n\t"
+        "paddh     %[ftmp0],    %[ftmp0],       %[ftmp0]                \n\t"
+        "paddh     %[ftmp5],    %[ftmp5],       %[ftmp7]                \n\t"
+        "psubh     %[ftmp0],    %[ftmp0],       %[ftmp2]                \n\t"
+        "paddh     %[ftmp7],    %[ftmp7],       %[ftmp7]                \n\t"
+        "paddh     %[ftmp1],    %[ftmp1],       %[ftmp6]                \n\t"
+        "psubh     %[ftmp7],    %[ftmp7],       %[ftmp5]                \n\t"
+        "paddh     %[ftmp6],    %[ftmp6],       %[ftmp6]                \n\t"
+        "paddh     %[ftmp4],    %[ftmp4],       %[ftmp3]                \n\t"
+        "psubh     %[ftmp6],    %[ftmp6],       %[ftmp1]                \n\t"
+        "paddh     %[ftmp3],    %[ftmp3],       %[ftmp3]                \n\t"
+        "sdc1      %[ftmp6],    0x08($29)                               \n\t"
+        "psubh     %[ftmp3],    %[ftmp3],       %[ftmp4]                \n\t"
+        "sdc1      %[ftmp7],    0x18($29)                               \n\t"
+        "dmfc1     %[tmp2],     %[ftmp0]                                \n\t"
+        "xor       %[ftmp0],    %[ftmp0],       %[ftmp0]                \n\t"
+        "uld       %[low32],    0x00(%[addr0])                          \n\t"
+        "mtc1      %[low32],    %[ftmp6]                                \n\t"
+        "gslwxc1   %[ftmp7],    0x00(%[addr0],  %[stride])              \n\t"
+        "psrah     %[ftmp2],    %[ftmp2],       %[ftmp10]               \n\t"
+        "psrah     %[ftmp5],    %[ftmp5],       %[ftmp10]               \n\t"
+        "punpcklbh %[ftmp6],    %[ftmp6],       %[ftmp0]                \n\t"
+        "punpcklbh %[ftmp7],    %[ftmp7],       %[ftmp0]                \n\t"
+        "paddh     %[ftmp6],    %[ftmp6],       %[ftmp2]                \n\t"
+        "paddh     %[ftmp7],    %[ftmp7],       %[ftmp5]                \n\t"
+        "packushb  %[ftmp6],    %[ftmp6],       %[ftmp0]                \n\t"
+        "packushb  %[ftmp7],    %[ftmp7],       %[ftmp0]                \n\t"
+        "gsswlc1   %[ftmp6],    0x03(%[addr0])                          \n\t"
+        "gsswrc1   %[ftmp6],    0x00(%[addr0])                          \n\t"
+        "gsswxc1   %[ftmp7],    0x00(%[addr0],  %[stride])              \n\t"
+        PTR_ADDU  "%[addr0],    %[addr0],       %[stride]               \n\t"
+        PTR_ADDU  "%[addr0],    %[addr0],       %[stride]               \n\t"
+        "uld       %[low32],    0x00(%[addr0])                          \n\t"
+        "mtc1      %[low32],    %[ftmp6]                                \n\t"
+        "gslwxc1   %[ftmp7],    0x00(%[addr0],  %[stride])              \n\t"
+        "psrah     %[ftmp1],    %[ftmp1],       %[ftmp10]               \n\t"
+        "psrah     %[ftmp4],    %[ftmp4],       %[ftmp10]               \n\t"
+        "punpcklbh %[ftmp6],    %[ftmp6],       %[ftmp0]                \n\t"
+        "punpcklbh %[ftmp7],    %[ftmp7],       %[ftmp0]                \n\t"
+        "paddh     %[ftmp6],    %[ftmp6],       %[ftmp1]                \n\t"
+        "paddh     %[ftmp7],    %[ftmp7],       %[ftmp4]                \n\t"
+        "packushb  %[ftmp6],    %[ftmp6],       %[ftmp0]                \n\t"
+        "packushb  %[ftmp7],    %[ftmp7],       %[ftmp0]                \n\t"
+        "gsswlc1   %[ftmp6],    0x03(%[addr0])                          \n\t"
+        "gsswrc1   %[ftmp6],    0x00(%[addr0])                          \n\t"
+        "gsswxc1   %[ftmp7],    0x00(%[addr0],  %[stride])              \n\t"
+        "ldc1      %[ftmp2],    0x08($29)                               \n\t"
+        "ldc1      %[ftmp5],    0x18($29)                               \n\t"
+        PTR_ADDU  "%[addr0],    %[addr0],       %[stride]               \n\t"
+        "dmtc1     %[tmp2],     %[ftmp1]                                \n\t"
+        PTR_ADDU  "%[addr0],    %[addr0],       %[stride]               \n\t"
+        "uld       %[low32],    0x00(%[addr0])                          \n\t"
+        "mtc1      %[low32],    %[ftmp6]                                \n\t"
+        "gslwxc1   %[ftmp7],    0x00(%[addr0],  %[stride])              \n\t"
+        "psrah     %[ftmp3],    %[ftmp3],       %[ftmp10]               \n\t"
+        "psrah     %[ftmp2],    %[ftmp2],       %[ftmp10]               \n\t"
+        "punpcklbh %[ftmp6],    %[ftmp6],       %[ftmp0]                \n\t"
+        "punpcklbh %[ftmp7],    %[ftmp7],       %[ftmp0]                \n\t"
+        "paddh     %[ftmp6],    %[ftmp6],       %[ftmp3]                \n\t"
+        "paddh     %[ftmp7],    %[ftmp7],       %[ftmp2]                \n\t"
+        "packushb  %[ftmp6],    %[ftmp6],       %[ftmp0]                \n\t"
+        "packushb  %[ftmp7],    %[ftmp7],       %[ftmp0]                \n\t"
+        "gsswlc1   %[ftmp6],    0x03(%[addr0])                          \n\t"
+        "gsswrc1   %[ftmp6],    0x00(%[addr0])                          \n\t"
+        "gsswxc1   %[ftmp7],    0x00(%[addr0],  %[stride])              \n\t"
+        PTR_ADDU  "%[addr0],    %[addr0],       %[stride]               \n\t"
+        PTR_ADDU  "%[addr0],    %[addr0],       %[stride]               \n\t"
+        "uld       %[low32],    0x00(%[addr0])                          \n\t"
+        "mtc1      %[low32],    %[ftmp6]                                \n\t"
+        "gslwxc1   %[ftmp7],    0x00(%[addr0],  %[stride])              \n\t"
+        "psrah     %[ftmp5],    %[ftmp5],       %[ftmp10]               \n\t"
+        "psrah     %[ftmp1],    %[ftmp1],       %[ftmp10]               \n\t"
+        "punpcklbh %[ftmp6],    %[ftmp6],       %[ftmp0]                \n\t"
+        "punpcklbh %[ftmp7],    %[ftmp7],       %[ftmp0]                \n\t"
+        "paddh     %[ftmp6],    %[ftmp6],       %[ftmp5]                \n\t"
+        "paddh     %[ftmp7],    %[ftmp7],       %[ftmp1]                \n\t"
+        "packushb  %[ftmp6],    %[ftmp6],       %[ftmp0]                \n\t"
+        "packushb  %[ftmp7],    %[ftmp7],       %[ftmp0]                \n\t"
+        "gsswlc1   %[ftmp6],    0x03(%[addr0])                          \n\t"
+        "gsswrc1   %[ftmp6],    0x00(%[addr0])                          \n\t"
+        "gsswxc1   %[ftmp7],    0x00(%[addr0],  %[stride])              \n\t"
+        PTR_ADDIU "$29,         $29,            0x20                    \n\t"
+        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
+          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
+          [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
+          [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
+          [ftmp8]"=&f"(ftmp[8]),            [ftmp9]"=&f"(ftmp[9]),
+          [ftmp10]"=&f"(ftmp[10]),          [ftmp11]"=&f"(ftmp[11]),
+          [ftmp12]"=&f"(ftmp[12]),          [ftmp13]"=&f"(ftmp[13]),
+          [ftmp14]"=&f"(ftmp[14]),          [ftmp15]"=&f"(ftmp[15]),
+          [tmp0]"=&r"(tmp[0]),              [tmp1]"=&r"(tmp[1]),
+          [tmp2]"=&r"(tmp[2]),              [tmp3]"=&r"(tmp[3]),
+          [tmp4]"=&r"(tmp[4]),              [tmp5]"=&r"(tmp[5]),
+          [tmp6]"=&r"(tmp[6]),              [tmp7]"=&r"(tmp[7]),
+          [addr0]"=&r"(addr[0]),
+          [low32]"=&r"(low32)
+        : [dst]"r"(dst),                    [block]"r"(block),
+          [stride]"r"((mips_reg)stride)
+        : "$29","memory"
     );
 
     memset(block, 0, 128);
@@ -593,91 +661,134 @@ void ff_h264_idct8_add_8_mmi(uint8_t *dst, int16_t *block, int stride)
 
 void ff_h264_idct_dc_add_8_mmi(uint8_t *dst, int16_t *block, int stride)
 { /* Adds the rounded DC coefficient block[0] to four rows of four pixels at dst (rows stride bytes apart) and clears block[0]. */
+    int dc = (block[0] + 32) >> 6; /* DC term with rounding: (coef + 32) / 64 */
+    double ftmp[6];                /* C-side homes for the six FP/MMI scratch operands ftmp0..ftmp5 */
+    uint64_t low32;                /* integer scratch register used to stage each row before mtc1 */
+
+    block[0] = 0;                  /* the coefficient is consumed; reset it in C instead of inside the asm */
+
     __asm__ volatile (
-        "lh $8, 0x0(%[block])                       \r\n"
-        "sd $0, 0x0(%[block])                       \r\n"
-        "daddiu $8, $8, 0x20                        \r\n"
-        "daddu $10, %[stride], %[stride]            \r\n"
-        "dsra $8, $8, 0x6                           \r\n"
-        "xor $f2, $f2, $f2                          \r\n"
-        "mtc1 $8, $f0                               \r\n"
-        "pshufh $f0, $f0, $f2                       \r\n"
-        "daddu $8, $10, %[stride]                   \r\n"
-        "psubh $f2, $f2, $f0                        \r\n"
-        "packushb $f0, $f0, $f0                     \r\n"
-        "packushb $f2, $f2, $f2                     \r\n"
-        "lwc1 $f4, 0x0(%[dst])                      \r\n"
-        "gslwxc1 $f6, 0x0(%[dst], %[stride])        \r\n"
-        "gslwxc1 $f8, 0x0(%[dst], $10)              \r\n"
-        "gslwxc1 $f10, 0x0(%[dst], $8)              \r\n"
-        "paddusb $f4, $f4, $f0                      \r\n"
-        "paddusb $f6, $f6, $f0                      \r\n"
-        "paddusb $f8, $f8, $f0                      \r\n"
-        "paddusb $f10, $f10, $f0                    \r\n"
-        "psubusb $f4, $f4, $f2                      \r\n"
-        "psubusb $f6, $f6, $f2                      \r\n"
-        "psubusb $f8, $f8, $f2                      \r\n"
-        "psubusb $f10, $f10, $f2                    \r\n"
-        "swc1 $f4, 0x0(%[dst])                      \r\n"
-        "gsswxc1 $f6, 0x0(%[dst], %[stride])        \r\n"
-        "gsswxc1 $f8, 0x0(%[dst], $10)              \r\n"
-        "gsswxc1 $f10, 0x0(%[dst], $8)              \r\n"
-        ::[dst]"r"(dst),[block]"r"(block),[stride]"r"((uint64_t)stride)
-        : "$8","$10","$f0","$f2","$f4","$f6","$f8","$f10"
+        "mtc1       %[dc],      %[ftmp5]                                \n\t" /* move dc into an FP register */
+        "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t" /* ftmp0 = 0 (zero source for unpack/pack and shuffle control) */
+        "pshufh     %[ftmp5],   %[ftmp5],       %[ftmp0]                \n\t" /* control 0: replicate the low halfword (dc) into all four 16-bit lanes */
+        "uld        %[low32],   0x00(%[dst0])                           \n\t" /* unaligned load of row 0; NOTE(review): uld is a 64-bit load but only the low 32 bits are used below - confirm the extra bytes read are always accessible */
+        "mtc1       %[low32],   %[ftmp1]                                \n\t" /* low 32 bits (4 pixels) -> ftmp1 */
+        "uld        %[low32],   0x00(%[dst1])                           \n\t" /* row 1 */
+        "mtc1       %[low32],   %[ftmp2]                                \n\t"
+        "uld        %[low32],   0x00(%[dst2])                           \n\t" /* row 2 */
+        "mtc1       %[low32],   %[ftmp3]                                \n\t"
+        "uld        %[low32],   0x00(%[dst3])                           \n\t" /* row 3 */
+        "mtc1       %[low32],   %[ftmp4]                                \n\t"
+        "punpcklbh  %[ftmp1],   %[ftmp1],       %[ftmp0]                \n\t" /* interleave low bytes with zero: widen 4 pixels to 16-bit lanes */
+        "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp0]                \n\t"
+        "punpcklbh  %[ftmp3],   %[ftmp3],       %[ftmp0]                \n\t"
+        "punpcklbh  %[ftmp4],   %[ftmp4],       %[ftmp0]                \n\t"
+        "paddsh     %[ftmp1],   %[ftmp1],       %[ftmp5]                \n\t" /* pixel + dc per halfword, signed saturating add */
+        "paddsh     %[ftmp2],   %[ftmp2],       %[ftmp5]                \n\t"
+        "paddsh     %[ftmp3],   %[ftmp3],       %[ftmp5]                \n\t"
+        "paddsh     %[ftmp4],   %[ftmp4],       %[ftmp5]                \n\t"
+        "packushb   %[ftmp1],   %[ftmp1],       %[ftmp0]                \n\t" /* narrow back to bytes with unsigned saturation; result sits in the low 32 bits */
+        "packushb   %[ftmp2],   %[ftmp2],       %[ftmp0]                \n\t"
+        "packushb   %[ftmp3],   %[ftmp3],       %[ftmp0]                \n\t"
+        "packushb   %[ftmp4],   %[ftmp4],       %[ftmp0]                \n\t"
+        "gsswlc1    %[ftmp1],   0x03(%[dst0])                           \n\t" /* left/right store pair at offsets 3 and 0: unaligned 32-bit store of row 0 */
+        "gsswrc1    %[ftmp1],   0x00(%[dst0])                           \n\t"
+        "gsswlc1    %[ftmp2],   0x03(%[dst1])                           \n\t" /* row 1 */
+        "gsswrc1    %[ftmp2],   0x00(%[dst1])                           \n\t"
+        "gsswlc1    %[ftmp3],   0x03(%[dst2])                           \n\t" /* row 2 */
+        "gsswrc1    %[ftmp3],   0x00(%[dst2])                           \n\t"
+        "gsswlc1    %[ftmp4],   0x03(%[dst3])                           \n\t" /* row 3 */
+        "gsswrc1    %[ftmp4],   0x00(%[dst3])                           \n\t"
+        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]), /* early-clobber outputs: scratch registers chosen by the compiler */
+          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
+          [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
+          [low32]"=&r"(low32)
+        : [dst0]"r"(dst),                   [dst1]"r"(dst+stride), /* the four row addresses are computed in C and passed in as inputs */
+          [dst2]"r"(dst+2*stride),          [dst3]"r"(dst+3*stride),
+          [dc]"r"(dc)
+        : "memory" /* the asm writes through the dst pointers, which are not declared as outputs */
     );
 }
 
 void ff_h264_idct8_dc_add_8_mmi(uint8_t *dst, int16_t *block, int stride)
 { /* Adds the rounded DC coefficient block[0] to eight rows of eight pixels at dst (rows stride bytes apart) and clears block[0]. */
+    int dc = (block[0] + 32) >> 6; /* DC term with rounding: (coef + 32) / 64 */
+    double ftmp[10];               /* C-side homes for the ten FP/MMI scratch operands ftmp0..ftmp9 */
+
+    block[0] = 0;                  /* the coefficient is consumed; reset it in C instead of inside the asm */
+
     __asm__ volatile (
-        "lh $8, 0x0(%[block])                       \r\n"
-        "sd $0, 0x0(%[block])                       \r\n"
-        "daddiu $8, $8, 0x20                        \r\n"
-        "daddu $10, %[stride], %[stride]            \r\n"
-        "dsra $8, $8, 0x6                           \r\n"
-        "xor $f2, $f2, $f2                          \r\n"
-        "mtc1 $8, $f0                               \r\n"
-        "pshufh $f0, $f0, $f2                       \r\n"
-        "daddu $8, $10, %[stride]                   \r\n"
-        "psubh $f2, $f2, $f0                        \r\n"
-        "packushb $f0, $f0, $f0                     \r\n"
-        "packushb $f2, $f2, $f2                     \r\n"
-        "ldc1 $f4, 0x0(%[dst])                      \r\n"
-        "gsldxc1 $f6, 0x0(%[dst], %[stride])        \r\n"
-        "gsldxc1 $f8, 0x0(%[dst], $10)              \r\n"
-        "gsldxc1 $f10, 0x0(%[dst], $8)              \r\n"
-        "paddusb $f4, $f4, $f0                      \r\n"
-        "paddusb $f6, $f6, $f0                      \r\n"
-        "paddusb $f8, $f8, $f0                      \r\n"
-        "paddusb $f10, $f10, $f0                    \r\n"
-        "psubusb $f4, $f4, $f2                      \r\n"
-        "psubusb $f6, $f6, $f2                      \r\n"
-        "psubusb $f8, $f8, $f2                      \r\n"
-        "psubusb $f10, $f10, $f2                    \r\n"
-        "sdc1 $f4, 0x0(%[dst])                      \r\n"
-        "gssdxc1 $f6, 0x0(%[dst], %[stride])        \r\n"
-        "gssdxc1 $f8, 0x0(%[dst], $10)              \r\n"
-        "daddu $9, $10, $10                         \r\n"
-        "gssdxc1 $f10, 0x0(%[dst], $8)              \r\n"
-        "daddu %[dst], %[dst], $9                   \r\n"
-        "ldc1 $f4, 0x0(%[dst])                      \r\n"
-        "gsldxc1 $f6, 0x0(%[dst], %[stride])        \r\n"
-        "gsldxc1 $f8, 0x0(%[dst], $10)              \r\n"
-        "gsldxc1 $f10, 0x0(%[dst], $8)              \r\n"
-        "paddusb $f4, $f4, $f0                      \r\n"
-        "paddusb $f6, $f6, $f0                      \r\n"
-        "paddusb $f8, $f8, $f0                      \r\n"
-        "paddusb $f10, $f10, $f0                    \r\n"
-        "psubusb $f4, $f4, $f2                      \r\n"
-        "psubusb $f6, $f6, $f2                      \r\n"
-        "psubusb $f8, $f8, $f2                      \r\n"
-        "psubusb $f10, $f10, $f2                    \r\n"
-        "sdc1 $f4, 0x0(%[dst])                      \r\n"
-        "gssdxc1 $f6, 0x0(%[dst], %[stride])        \r\n"
-        "gssdxc1 $f8, 0x0(%[dst], $10)              \r\n"
-        "gssdxc1 $f10, 0x0(%[dst], $8)              \r\n"
-        ::[dst]"r"(dst),[block]"r"(block),[stride]"r"((uint64_t)stride)
-        : "$8","$9","$10","$f0","$f2","$f4","$f6","$f8","$f10"
+        "mtc1       %[dc],      %[ftmp5]                                \n\t" /* move dc into an FP register */
+        "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t" /* ftmp0 = 0 (zero source for unpack and shuffle control) */
+        "pshufh     %[ftmp5],   %[ftmp5],       %[ftmp0]                \n\t" /* control 0: replicate the low halfword (dc) into all four 16-bit lanes */
+        "ldc1       %[ftmp1],   0x00(%[dst0])                           \n\t" /* load 8 pixels of rows 0..3; NOTE(review): ldc1/sdc1 presumably require 8-byte aligned rows - confirm against callers */
+        "ldc1       %[ftmp2],   0x00(%[dst1])                           \n\t"
+        "ldc1       %[ftmp3],   0x00(%[dst2])                           \n\t"
+        "ldc1       %[ftmp4],   0x00(%[dst3])                           \n\t"
+        "punpckhbh  %[ftmp6],   %[ftmp1],       %[ftmp0]                \n\t" /* high 4 pixels of row 0 widened to 16-bit lanes */
+        "punpcklbh  %[ftmp1],   %[ftmp1],       %[ftmp0]                \n\t" /* low 4 pixels of row 0 widened to 16-bit lanes */
+        "punpckhbh  %[ftmp7],   %[ftmp2],       %[ftmp0]                \n\t" /* row 1, same split */
+        "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp0]                \n\t"
+        "punpckhbh  %[ftmp8],   %[ftmp3],       %[ftmp0]                \n\t" /* row 2 */
+        "punpcklbh  %[ftmp3],   %[ftmp3],       %[ftmp0]                \n\t"
+        "punpckhbh  %[ftmp9],   %[ftmp4],       %[ftmp0]                \n\t" /* row 3 */
+        "punpcklbh  %[ftmp4],   %[ftmp4],       %[ftmp0]                \n\t"
+        "paddsh     %[ftmp6],   %[ftmp6],       %[ftmp5]                \n\t" /* pixel + dc per halfword, signed saturating add (both halves of each row) */
+        "paddsh     %[ftmp1],   %[ftmp1],       %[ftmp5]                \n\t"
+        "paddsh     %[ftmp7],   %[ftmp7],       %[ftmp5]                \n\t"
+        "paddsh     %[ftmp2],   %[ftmp2],       %[ftmp5]                \n\t"
+        "paddsh     %[ftmp8],   %[ftmp8],       %[ftmp5]                \n\t"
+        "paddsh     %[ftmp3],   %[ftmp3],       %[ftmp5]                \n\t"
+        "paddsh     %[ftmp9],   %[ftmp9],       %[ftmp5]                \n\t"
+        "paddsh     %[ftmp4],   %[ftmp4],       %[ftmp5]                \n\t"
+        "packushb   %[ftmp1],   %[ftmp1],       %[ftmp6]                \n\t" /* recombine low and high halves into 8 bytes with unsigned saturation */
+        "packushb   %[ftmp2],   %[ftmp2],       %[ftmp7]                \n\t"
+        "packushb   %[ftmp3],   %[ftmp3],       %[ftmp8]                \n\t"
+        "packushb   %[ftmp4],   %[ftmp4],       %[ftmp9]                \n\t"
+        "sdc1       %[ftmp1],   0x00(%[dst0])                           \n\t" /* write back rows 0..3 */
+        "sdc1       %[ftmp2],   0x00(%[dst1])                           \n\t"
+        "sdc1       %[ftmp3],   0x00(%[dst2])                           \n\t"
+        "sdc1       %[ftmp4],   0x00(%[dst3])                           \n\t"
+
+        "ldc1       %[ftmp1],   0x00(%[dst4])                           \n\t" /* second half: identical sequence applied to rows 4..7 */
+        "ldc1       %[ftmp2],   0x00(%[dst5])                           \n\t"
+        "ldc1       %[ftmp3],   0x00(%[dst6])                           \n\t"
+        "ldc1       %[ftmp4],   0x00(%[dst7])                           \n\t"
+        "punpckhbh  %[ftmp6],   %[ftmp1],       %[ftmp0]                \n\t" /* widen high/low 4 pixels of each row to 16-bit lanes */
+        "punpcklbh  %[ftmp1],   %[ftmp1],       %[ftmp0]                \n\t"
+        "punpckhbh  %[ftmp7],   %[ftmp2],       %[ftmp0]                \n\t"
+        "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp0]                \n\t"
+        "punpckhbh  %[ftmp8],   %[ftmp3],       %[ftmp0]                \n\t"
+        "punpcklbh  %[ftmp3],   %[ftmp3],       %[ftmp0]                \n\t"
+        "punpckhbh  %[ftmp9],   %[ftmp4],       %[ftmp0]                \n\t"
+        "punpcklbh  %[ftmp4],   %[ftmp4],       %[ftmp0]                \n\t"
+        "paddsh     %[ftmp6],   %[ftmp6],       %[ftmp5]                \n\t" /* pixel + dc, signed saturating add */
+        "paddsh     %[ftmp1],   %[ftmp1],       %[ftmp5]                \n\t"
+        "paddsh     %[ftmp7],   %[ftmp7],       %[ftmp5]                \n\t"
+        "paddsh     %[ftmp2],   %[ftmp2],       %[ftmp5]                \n\t"
+        "paddsh     %[ftmp8],   %[ftmp8],       %[ftmp5]                \n\t"
+        "paddsh     %[ftmp3],   %[ftmp3],       %[ftmp5]                \n\t"
+        "paddsh     %[ftmp9],   %[ftmp9],       %[ftmp5]                \n\t"
+        "paddsh     %[ftmp4],   %[ftmp4],       %[ftmp5]                \n\t"
+        "packushb   %[ftmp1],   %[ftmp1],       %[ftmp6]                \n\t" /* pack both halves back to 8 bytes with unsigned saturation */
+        "packushb   %[ftmp2],   %[ftmp2],       %[ftmp7]                \n\t"
+        "packushb   %[ftmp3],   %[ftmp3],       %[ftmp8]                \n\t"
+        "packushb   %[ftmp4],   %[ftmp4],       %[ftmp9]                \n\t"
+        "sdc1       %[ftmp1],   0x00(%[dst4])                           \n\t" /* write back rows 4..7 */
+        "sdc1       %[ftmp2],   0x00(%[dst5])                           \n\t"
+        "sdc1       %[ftmp3],   0x00(%[dst6])                           \n\t"
+        "sdc1       %[ftmp4],   0x00(%[dst7])                           \n\t"
+        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]), /* early-clobber outputs: scratch registers chosen by the compiler */
+          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
+          [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
+          [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
+          [ftmp8]"=&f"(ftmp[8]),            [ftmp9]"=&f"(ftmp[9])
+        : [dst0]"r"(dst),                   [dst1]"r"(dst+stride), /* all eight row addresses are computed in C and passed in as inputs */
+          [dst2]"r"(dst+2*stride),          [dst3]"r"(dst+3*stride),
+          [dst4]"r"(dst+4*stride),          [dst5]"r"(dst+5*stride),
+          [dst6]"r"(dst+6*stride),          [dst7]"r"(dst+7*stride),
+          [dc]"r"(dc)
+        : "memory" /* the asm writes through the dst pointers, which are not declared as outputs */
     );
 }
 
@@ -775,212 +886,222 @@ void ff_h264_idct_add8_422_8_mmi(uint8_t **dest, const int *block_offset,
 /*
  * Reads 16 int16_t coefficients from input (four 64-bit loads at byte
  * offsets 0x00..0x18), runs two passes of add/subtract butterflies with a
  * 4x4 halfword transpose in between, scales the results by qmul and writes
  * 16 halfwords to output at byte offsets 0x00, 0x20, ..., 0x1e0.
  *
  * Scaling is selected on qmul:
  *   - qmul <= 0x7fff (fall-through): qmul + 0x800000 is fed to pmaddhw and
  *     the products are shifted right arithmetically by 8.
  *   - qmul >  0x7fff (label 1): qmul + 0x800000 is first shifted right by
  *     tmp1 and the products are then shifted right by 8 - tmp1.
  * Each coefficient is paired with a halfword of ff_pw_1 before pmaddhw
  * (presumably all ones, so the high half of qmul + 0x800000 acts as a
  * rounding term -- confirm against the definition of ff_pw_1).
  *
  * The asm runs under ".set noreorder", so the instruction written after
  * each branch/jump is its delay slot. output, input and qmul are
  * read-write operands; %[input] is reused as an integer scratch register
  * once the coefficients have been loaded.
  */
 void ff_h264_luma_dc_dequant_idct_8_mmi(int16_t *output, int16_t *input,
         int qmul)
 {
+    double ftmp[10];
+    uint64_t tmp[2];
+
     __asm__ volatile (
-        ".set noreorder                                 \r\n"
-        "dli $10, 0x8                                   \r\n"
-        "ldc1 $f6, 0x18(%[input])                       \r\n"
-        "dmtc1 $10, $f16                                \r\n"
-        "ldc1 $f4, 0x10(%[input])                       \r\n"
-        "dli $10, 0x20                                  \r\n"
-        "ldc1 $f2, 0x8(%[input])                        \r\n"
-        "dmtc1 $10, $f18                                \r\n"
-        "ldc1 $f0, 0x0(%[input])                        \r\n"
-        "mov.d $f8, $f6                                 \r\n"
-        "paddh $f6, $f6, $f4                            \r\n"
-        "psubh $f4, $f4, $f8                            \r\n"
-        "mov.d $f8, $f2                                 \r\n"
-        "paddh $f2, $f2, $f0                            \r\n"
-        "psubh $f0, $f0, $f8                            \r\n"
-        "mov.d $f8, $f6                                 \r\n"
-        "paddh $f6, $f6, $f2                            \r\n"
-        "psubh $f2, $f2, $f8                            \r\n"
-        "mov.d $f8, $f4                                 \r\n"
-        "paddh $f4, $f4, $f0                            \r\n"
-        "psubh $f0, $f0, $f8                            \r\n"
-        "mov.d $f8, $f6                                 \r\n"
-        "punpcklhw $f6, $f6, $f2                        \r\n"
-        "punpckhhw $f8, $f8, $f2                        \r\n"
-        "punpckhhw $f2, $f0, $f4                        \r\n"
-        "punpcklhw $f0, $f0, $f4                        \r\n"
-        "punpckhwd $f4, $f6, $f0                        \r\n"
-        "punpcklwd $f6, $f6, $f0                        \r\n"
-        "mov.d $f0, $f8                                 \r\n"
-        "punpcklwd $f8, $f8, $f2                        \r\n"
-        "punpckhwd $f0, $f0, $f2                        \r\n"
-        "mov.d $f2, $f0                                 \r\n"
-        "paddh $f0, $f0, $f8                            \r\n"
-        "psubh $f8, $f8, $f2                            \r\n"
-        "mov.d $f2, $f4                                 \r\n"
-        "paddh $f4, $f4, $f6                            \r\n"
-        "psubh $f6, $f6, $f2                            \r\n"
-        "mov.d $f2, $f0                                 \r\n"
-        "paddh $f0, $f0, $f4                            \r\n"
-        "psubh $f4, $f4, $f2                            \r\n"
-        "mov.d $f2, $f8                                 \r\n"
-        "daddiu $10, %[qmul], -0x7fff                   \r\n"
-        "paddh $f8, $f8, $f6                            \r\n"
-        "bgtz $10, 1f                                   \r\n"
-        "psubh $f6, $f6, $f2                            \r\n"
-        "ori $10, $0, 0x80                              \r\n"
-        "dsll $10, $10, 0x10                            \r\n"
-        "punpckhhw $f2, $f0, %[ff_pw_1]                 \r\n"
-        "daddu %[qmul], %[qmul], $10                    \r\n"
-        "punpcklhw $f0, $f0, %[ff_pw_1]                 \r\n"
-        "punpckhhw $f10, $f4, %[ff_pw_1]                \r\n"
-        "punpcklhw $f4, $f4, %[ff_pw_1]                 \r\n"
-        "mtc1 %[qmul], $f14                             \r\n"
-        "punpcklwd $f14, $f14, $f14                     \r\n"
-        "pmaddhw $f0, $f0, $f14                         \r\n"
-        "pmaddhw $f4, $f4, $f14                         \r\n"
-        "pmaddhw $f2, $f2, $f14                         \r\n"
-        "pmaddhw $f10, $f10, $f14                       \r\n"
-        "psraw $f0, $f0, $f16                           \r\n"
-        "psraw $f4, $f4, $f16                           \r\n"
-        "psraw $f2, $f2, $f16                           \r\n"
-        "psraw $f10, $f10, $f16                         \r\n"
-        "packsswh $f0, $f0, $f2                         \r\n"
-        "packsswh $f4, $f4, $f10                        \r\n"
-        "mfc1 $9, $f0                                   \r\n"
-        "dsrl $f0, $f0, $f18                            \r\n"
-        "mfc1 %[input], $f0                             \r\n"
-        "sh $9, 0x0(%[output])                          \r\n"
-        "sh %[input], 0x80(%[output])                   \r\n"
-        "dsrl $9, $9, 0x10                              \r\n"
-        "dsrl %[input], %[input], 0x10                  \r\n"
-        "sh $9, 0x20(%[output])                         \r\n"
-        "sh %[input], 0xa0(%[output])                   \r\n"
-        "mfc1 $9, $f4                                   \r\n"
-        "dsrl $f4, $f4, $f18                            \r\n"
-        "mfc1 %[input], $f4                             \r\n"
-        "sh $9, 0x40(%[output])                         \r\n"
-        "sh %[input], 0xc0(%[output])                   \r\n"
-        "dsrl $9, $9, 0x10                              \r\n"
-        "dsrl %[input], %[input], 0x10                  \r\n"
-        "sh $9, 0x60(%[output])                         \r\n"
-        "sh %[input], 0xe0(%[output])                   \r\n"
-        "punpckhhw $f2, $f6, %[ff_pw_1]                 \r\n"
-        "punpcklhw $f6, $f6, %[ff_pw_1]                 \r\n"
-        "punpckhhw $f10, $f8, %[ff_pw_1]                \r\n"
-        "punpcklhw $f8, $f8, %[ff_pw_1]                 \r\n"
-        "mtc1 %[qmul], $f14                             \r\n"
-        "punpcklwd $f14, $f14, $f14                     \r\n"
-        "pmaddhw $f6, $f6, $f14                         \r\n"
-        "pmaddhw $f8, $f8, $f14                         \r\n"
-        "pmaddhw $f2, $f2, $f14                         \r\n"
-        "pmaddhw $f10, $f10, $f14                       \r\n"
-        "psraw $f6, $f6, $f16                           \r\n"
-        "psraw $f8, $f8, $f16                           \r\n"
-        "psraw $f2, $f2, $f16                           \r\n"
-        "psraw $f10, $f10, $f16                         \r\n"
-        "packsswh $f6, $f6, $f2                         \r\n"
-        "packsswh $f8, $f8, $f10                        \r\n"
-        "mfc1 $9, $f6                                   \r\n"
-        "dsrl $f6, $f6, $f18                            \r\n"
-        "mfc1 %[input], $f6                             \r\n"
-        "sh $9, 0x100(%[output])                        \r\n"
-        "sh %[input], 0x180(%[output])                  \r\n"
-        "dsrl $9, $9, 0x10                              \r\n"
-        "dsrl %[input], %[input], 0x10                  \r\n"
-        "sh $9, 0x120(%[output])                        \r\n"
-        "sh %[input], 0x1a0(%[output])                  \r\n"
-        "mfc1 $9, $f8                                   \r\n"
-        "dsrl $f8, $f8, $f18                            \r\n"
-        "mfc1 %[input], $f8                             \r\n"
-        "sh $9, 0x140(%[output])                        \r\n"
-        "sh %[input], 0x1c0(%[output])                  \r\n"
-        "dsrl $9, $9, 0x10                              \r\n"
-        "dsrl %[input], %[input], 0x10                  \r\n"
-        "sh $9, 0x160(%[output])                        \r\n"
-        "jr $31                                         \r\n"
-        "sh %[input], 0x1e0(%[output])                  \r\n"
-        "1:                                             \r\n"
-        "ori $10, $0, 0x1f                              \r\n"
-        "clz $9, %[qmul]                                \r\n"
-        "ori %[input], $0, 0x7                          \r\n"
-        "dsubu $9, $10, $9                              \r\n"
-        "ori $10, $0, 0x80                              \r\n"
-        "dsll $10, $10, 0x10                            \r\n"
-        "daddu %[qmul], %[qmul], $10                    \r\n"
-        "dsubu $10, $9, %[input]                        \r\n"
-        "movn $9, %[input], $10                         \r\n"
-        "daddiu %[input], %[input], 0x1                 \r\n"
-        "andi $10, $9, 0xff                             \r\n"
-        "dsrlv %[qmul], %[qmul], $10                    \r\n"
-        "dsubu %[input], %[input], $9                   \r\n"
-        "mtc1 %[input], $f12                            \r\n"
-        "punpckhhw $f2, $f0, %[ff_pw_1]                 \r\n"
-        "punpcklhw $f0, $f0, %[ff_pw_1]                 \r\n"
-        "punpckhhw $f10, $f4, %[ff_pw_1]                \r\n"
-        "punpcklhw $f4, $f4, %[ff_pw_1]                 \r\n"
-        "mtc1 %[qmul], $f14                             \r\n"
-        "punpcklwd $f14, $f14, $f14                     \r\n"
-        "pmaddhw $f0, $f0, $f14                         \r\n"
-        "pmaddhw $f4, $f4, $f14                         \r\n"
-        "pmaddhw $f2, $f2, $f14                         \r\n"
-        "pmaddhw $f10, $f10, $f14                       \r\n"
-        "psraw $f0, $f0, $f12                           \r\n"
-        "psraw $f4, $f4, $f12                           \r\n"
-        "psraw $f2, $f2, $f12                           \r\n"
-        "psraw $f10, $f10, $f12                         \r\n"
-        "packsswh $f0, $f0, $f2                         \r\n"
-        "packsswh $f4, $f4, $f10                        \r\n"
-        "mfc1 $9, $f0                                   \r\n"
-        "dsrl $f0, $f0, $f18                            \r\n"
-        "sh $9, 0x0(%[output])                          \r\n"
-        "mfc1 %[input], $f0                             \r\n"
-        "dsrl $9, $9, 0x10                              \r\n"
-        "sh %[input], 0x80(%[output])                   \r\n"
-        "sh $9, 0x20(%[output])                         \r\n"
-        "dsrl %[input], %[input], 0x10                  \r\n"
-        "mfc1 $9, $f4                                   \r\n"
-        "sh %[input], 0xa0(%[output])                   \r\n"
-        "dsrl $f4, $f4, $f18                            \r\n"
-        "sh $9, 0x40(%[output])                         \r\n"
-        "mfc1 %[input], $f4                             \r\n"
-        "dsrl $9, $9, 0x10                              \r\n"
-        "sh %[input], 0xc0(%[output])                   \r\n"
-        "sh $9, 0x60(%[output])                         \r\n"
-        "dsrl %[input], %[input], 0x10                  \r\n"
-        "sh %[input], 0xe0(%[output])                   \r\n"
-        "punpckhhw $f2, $f6, %[ff_pw_1]                 \r\n"
-        "punpcklhw $f6, $f6, %[ff_pw_1]                 \r\n"
-        "punpckhhw $f10, $f8, %[ff_pw_1]                \r\n"
-        "punpcklhw $f8, $f8, %[ff_pw_1]                 \r\n"
-        "mtc1 %[qmul], $f14                             \r\n"
-        "punpcklwd $f14, $f14, $f14                     \r\n"
-        "pmaddhw $f6, $f6, $f14                         \r\n"
-        "pmaddhw $f8, $f8, $f14                         \r\n"
-        "pmaddhw $f2, $f2, $f14                         \r\n"
-        "pmaddhw $f10, $f10, $f14                       \r\n"
-        "psraw $f6, $f6, $f12                           \r\n"
-        "psraw $f8, $f8, $f12                           \r\n"
-        "psraw $f2, $f2, $f12                           \r\n"
-        "psraw $f10, $f10, $f12                         \r\n"
-        "packsswh $f6, $f6, $f2                         \r\n"
-        "packsswh $f8, $f8, $f10                        \r\n"
-        "mfc1 $9, $f6                                   \r\n"
-        "dsrl $f6, $f6, $f18                            \r\n"
-        "mfc1 %[input], $f6                             \r\n"
-        "sh $9, 0x100(%[output])                        \r\n"
-        "sh %[input], 0x180(%[output])                  \r\n"
-        "dsrl $9, $9, 0x10                              \r\n"
-        "dsrl %[input], %[input], 0x10                  \r\n"
-        "sh $9, 0x120(%[output])                        \r\n"
-        "sh %[input], 0x1a0(%[output])                  \r\n"
-        "mfc1 $9, $f8                                   \r\n"
-        "dsrl $f8, $f8, $f18                            \r\n"
-        "mfc1 %[input], $f8                             \r\n"
-        "sh $9, 0x140(%[output])                        \r\n"
-        "sh %[input], 0x1c0(%[output])                  \r\n"
-        "dsrl $9, $9, 0x10                              \r\n"
-        "dsrl %[input], %[input], 0x10                  \r\n"
-        "sh $9, 0x160(%[output])                        \r\n"
-        "sh %[input], 0x1e0(%[output])                  \r\n"
-        ".set reorder                                   \r\n"
-        ::[output]"r"(output),[input]"r"(input),[qmul]"r"((uint64_t)qmul),
-          [ff_pw_1]"f"(ff_pw_1)
-        : "$9","$10","$f0","$f2","$f4","$f6","$f8","$f10","$f12","$f14","$f16",
-          "$f18"
+        ".set       noreorder                                           \n\t"
+        "dli        %[tmp0],    0x08                                    \n\t"
+        "ldc1       %[ftmp3],   0x18(%[input])                          \n\t"
+        "mtc1       %[tmp0],    %[ftmp8]                                \n\t" /* ftmp8 = 8: psraw count on the fall-through path */
+        "ldc1       %[ftmp2],   0x10(%[input])                          \n\t"
+        "dli        %[tmp0],    0x20                                    \n\t"
+        "ldc1       %[ftmp1],   0x08(%[input])                          \n\t"
+        "mtc1       %[tmp0],    %[ftmp9]                                \n\t" /* ftmp9 = 32: dsrl count used before each mfc1 of the upper word */
+        "ldc1       %[ftmp0],   0x00(%[input])                          \n\t"
+        "mov.d      %[ftmp4],   %[ftmp3]                                \n\t" /* first pass of add/subtract butterflies (ftmp4 is the temporary) */
+        "paddh      %[ftmp3],   %[ftmp3],       %[ftmp2]                \n\t"
+        "psubh      %[ftmp2],   %[ftmp2],       %[ftmp4]                \n\t"
+        "mov.d      %[ftmp4],   %[ftmp1]                                \n\t"
+        "paddh      %[ftmp1],   %[ftmp1],       %[ftmp0]                \n\t"
+        "psubh      %[ftmp0],   %[ftmp0],       %[ftmp4]                \n\t"
+        "mov.d      %[ftmp4],   %[ftmp3]                                \n\t"
+        "paddh      %[ftmp3],   %[ftmp3],       %[ftmp1]                \n\t"
+        "psubh      %[ftmp1],   %[ftmp1],       %[ftmp4]                \n\t"
+        "mov.d      %[ftmp4],   %[ftmp2]                                \n\t"
+        "paddh      %[ftmp2],   %[ftmp2],       %[ftmp0]                \n\t"
+        "psubh      %[ftmp0],   %[ftmp0],       %[ftmp4]                \n\t"
+        "mov.d      %[ftmp4],   %[ftmp3]                                \n\t" /* interleave halfwords, then words: 4x4 transpose */
+        "punpcklhw  %[ftmp3],   %[ftmp3],       %[ftmp1]                \n\t"
+        "punpckhhw  %[ftmp4],   %[ftmp4],       %[ftmp1]                \n\t"
+        "punpckhhw  %[ftmp1],   %[ftmp0],       %[ftmp2]                \n\t"
+        "punpcklhw  %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
+        "punpckhwd  %[ftmp2],   %[ftmp3],       %[ftmp0]                \n\t"
+        "punpcklwd  %[ftmp3],   %[ftmp3],       %[ftmp0]                \n\t"
+        "mov.d      %[ftmp0],   %[ftmp4]                                \n\t"
+        "punpcklwd  %[ftmp4],   %[ftmp4],       %[ftmp1]                \n\t"
+        "punpckhwd  %[ftmp0],   %[ftmp0],       %[ftmp1]                \n\t"
+        "mov.d      %[ftmp1],   %[ftmp0]                                \n\t" /* second pass of add/subtract butterflies (ftmp1 is the temporary) */
+        "paddh      %[ftmp0],   %[ftmp0],       %[ftmp4]                \n\t"
+        "psubh      %[ftmp4],   %[ftmp4],       %[ftmp1]                \n\t"
+        "mov.d      %[ftmp1],   %[ftmp2]                                \n\t"
+        "paddh      %[ftmp2],   %[ftmp2],       %[ftmp3]                \n\t"
+        "psubh      %[ftmp3],   %[ftmp3],       %[ftmp1]                \n\t"
+        "mov.d      %[ftmp1],   %[ftmp0]                                \n\t"
+        "paddh      %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
+        "psubh      %[ftmp2],   %[ftmp2],       %[ftmp1]                \n\t"
+        "mov.d      %[ftmp1],   %[ftmp4]                                \n\t"
+        "daddi      %[tmp0],    %[qmul],        -0x7fff                 \n\t" /* tmp0 = qmul - 0x7fff */
+        "paddh      %[ftmp4],   %[ftmp4],       %[ftmp3]                \n\t"
+        "bgtz       %[tmp0],    1f                                      \n\t" /* qmul > 0x7fff: continue at label 1 */
+        "psubh      %[ftmp3],   %[ftmp3],       %[ftmp1]                \n\t" /* branch delay slot: executed on both paths */
+        "ori        %[tmp0],    $0,             0x80                    \n\t"
+        "dsll       %[tmp0],    %[tmp0],        0x10                    \n\t" /* tmp0 = 0x800000 */
+        "punpckhhw  %[ftmp1],   %[ftmp0],       %[ff_pw_1]              \n\t" /* pair each coefficient with a halfword of ff_pw_1 for pmaddhw */
+        "daddu      %[qmul],    %[qmul],        %[tmp0]                 \n\t" /* qmul += 0x800000 */
+        "punpcklhw  %[ftmp0],   %[ftmp0],       %[ff_pw_1]              \n\t"
+        "punpckhhw  %[ftmp5],   %[ftmp2],       %[ff_pw_1]              \n\t"
+        "punpcklhw  %[ftmp2],   %[ftmp2],       %[ff_pw_1]              \n\t"
+        "mtc1       %[qmul],    %[ftmp7]                                \n\t"
+        "punpcklwd  %[ftmp7],   %[ftmp7],       %[ftmp7]                \n\t" /* replicate the low word of qmul into both words of ftmp7 */
+        "pmaddhw    %[ftmp0],   %[ftmp0],       %[ftmp7]                \n\t"
+        "pmaddhw    %[ftmp2],   %[ftmp2],       %[ftmp7]                \n\t"
+        "pmaddhw    %[ftmp1],   %[ftmp1],       %[ftmp7]                \n\t"
+        "pmaddhw    %[ftmp5],   %[ftmp5],       %[ftmp7]                \n\t"
+        "psraw      %[ftmp0],   %[ftmp0],       %[ftmp8]                \n\t"
+        "psraw      %[ftmp2],   %[ftmp2],       %[ftmp8]                \n\t"
+        "psraw      %[ftmp1],   %[ftmp1],       %[ftmp8]                \n\t"
+        "psraw      %[ftmp5],   %[ftmp5],       %[ftmp8]                \n\t"
+        "packsswh   %[ftmp0],   %[ftmp0],       %[ftmp1]                \n\t"
+        "packsswh   %[ftmp2],   %[ftmp2],       %[ftmp5]                \n\t"
+        "dmfc1      %[tmp1],    %[ftmp0]                                \n\t"
+        "dsrl       %[ftmp0],   %[ftmp0],       %[ftmp9]                \n\t"
+        "mfc1       %[input],   %[ftmp0]                                \n\t" /* %[input] is reused as integer scratch from here on */
+        "sh         %[tmp1],    0x00(%[output])                         \n\t"
+        "sh         %[input],   0x80(%[output])                         \n\t"
+        "dsrl       %[tmp1],    %[tmp1],        0x10                    \n\t"
+        PTR_SRL    "%[input],   %[input],       0x10                    \n\t"
+        "sh         %[tmp1],    0x20(%[output])                         \n\t"
+        "sh         %[input],   0xa0(%[output])                         \n\t"
+        "dmfc1      %[tmp1],    %[ftmp2]                                \n\t"
+        "dsrl       %[ftmp2],   %[ftmp2],       %[ftmp9]                \n\t"
+        "mfc1       %[input],   %[ftmp2]                                \n\t"
+        "sh         %[tmp1],    0x40(%[output])                         \n\t"
+        "sh         %[input],   0xc0(%[output])                         \n\t"
+        "dsrl       %[tmp1],    %[tmp1],        0x10                    \n\t"
+        PTR_SRL    "%[input],   %[input],       0x10                    \n\t"
+        "sh         %[tmp1],    0x60(%[output])                         \n\t"
+        "sh         %[input],   0xe0(%[output])                         \n\t"
+        "punpckhhw  %[ftmp1],   %[ftmp3],       %[ff_pw_1]              \n\t"
+        "punpcklhw  %[ftmp3],   %[ftmp3],       %[ff_pw_1]              \n\t"
+        "punpckhhw  %[ftmp5],   %[ftmp4],       %[ff_pw_1]              \n\t"
+        "punpcklhw  %[ftmp4],   %[ftmp4],       %[ff_pw_1]              \n\t"
+        "mtc1       %[qmul],    %[ftmp7]                                \n\t"
+        "punpcklwd  %[ftmp7],   %[ftmp7],       %[ftmp7]                \n\t"
+        "pmaddhw    %[ftmp3],   %[ftmp3],       %[ftmp7]                \n\t"
+        "pmaddhw    %[ftmp4],   %[ftmp4],       %[ftmp7]                \n\t"
+        "pmaddhw    %[ftmp1],   %[ftmp1],       %[ftmp7]                \n\t"
+        "pmaddhw    %[ftmp5],   %[ftmp5],       %[ftmp7]                \n\t"
+        "psraw      %[ftmp3],   %[ftmp3],       %[ftmp8]                \n\t"
+        "psraw      %[ftmp4],   %[ftmp4],       %[ftmp8]                \n\t"
+        "psraw      %[ftmp1],   %[ftmp1],       %[ftmp8]                \n\t"
+        "psraw      %[ftmp5],   %[ftmp5],       %[ftmp8]                \n\t"
+        "packsswh   %[ftmp3],   %[ftmp3],       %[ftmp1]                \n\t"
+        "packsswh   %[ftmp4],   %[ftmp4],       %[ftmp5]                \n\t"
+        "dmfc1      %[tmp1],    %[ftmp3]                                \n\t"
+        "dsrl       %[ftmp3],   %[ftmp3],       %[ftmp9]                \n\t"
+        "mfc1       %[input],   %[ftmp3]                                \n\t"
+        "sh         %[tmp1],    0x100(%[output])                        \n\t"
+        "sh         %[input],   0x180(%[output])                        \n\t"
+        "dsrl       %[tmp1],    %[tmp1],        0x10                    \n\t"
+        PTR_SRL    "%[input],   %[input],       0x10                    \n\t"
+        "sh         %[tmp1],    0x120(%[output])                        \n\t"
+        "sh         %[input],   0x1a0(%[output])                        \n\t"
+        "dmfc1      %[tmp1],    %[ftmp4]                                \n\t"
+        "dsrl       %[ftmp4],   %[ftmp4],       %[ftmp9]                \n\t"
+        "mfc1       %[input],   %[ftmp4]                                \n\t"
+        "sh         %[tmp1],    0x140(%[output])                        \n\t"
+        "sh         %[input],   0x1c0(%[output])                        \n\t"
+        "dsrl       %[tmp1],    %[tmp1],        0x10                    \n\t"
+        PTR_SRL    "%[input],   %[input],       0x10                    \n\t"
+        "sh         %[tmp1],    0x160(%[output])                        \n\t"
+        "j          2f                                                  \n\t" /* done: jump over the qmul > 0x7fff path */
+        "sh         %[input],   0x1e0(%[output])                        \n\t" /* jump delay slot: this last store still executes */
+        "1:                                                             \n\t"
+        "ori        %[tmp0],    $0,             0x1f                    \n\t"
+        "clz        %[tmp1],    %[qmul]                                 \n\t"
+        "ori        %[input],   $0,             0x07                    \n\t"
+        "dsubu      %[tmp1],    %[tmp0],        %[tmp1]                 \n\t" /* tmp1 = 31 - clz(qmul) */
+        "ori        %[tmp0],    $0,             0x80                    \n\t"
+        "dsll       %[tmp0],    %[tmp0],        0x10                    \n\t"
+        "daddu      %[qmul],    %[qmul],        %[tmp0]                 \n\t" /* qmul += 0x800000 */
+        "dsubu      %[tmp0],    %[tmp1],        %[input]                \n\t"
+        "movn       %[tmp1],    %[input],       %[tmp0]                 \n\t" /* tmp1 = 7 unless it already equals 7 */
+        PTR_ADDIU  "%[input],   %[input],       0x01                    \n\t"
+        "andi       %[tmp0],    %[tmp1],        0xff                    \n\t"
+        "srlv       %[qmul],    %[qmul],        %[tmp0]                 \n\t" /* qmul >>= tmp1 */
+        PTR_SUBU   "%[input],   %[input],       %[tmp1]                 \n\t" /* input = 8 - tmp1 */
+        "mtc1       %[input],   %[ftmp6]                                \n\t" /* ftmp6: psraw count for this path */
+        "punpckhhw  %[ftmp1],   %[ftmp0],       %[ff_pw_1]              \n\t"
+        "punpcklhw  %[ftmp0],   %[ftmp0],       %[ff_pw_1]              \n\t"
+        "punpckhhw  %[ftmp5],   %[ftmp2],       %[ff_pw_1]              \n\t"
+        "punpcklhw  %[ftmp2],   %[ftmp2],       %[ff_pw_1]              \n\t"
+        "mtc1       %[qmul],    %[ftmp7]                                \n\t"
+        "punpcklwd  %[ftmp7],   %[ftmp7],       %[ftmp7]                \n\t"
+        "pmaddhw    %[ftmp0],   %[ftmp0],       %[ftmp7]                \n\t"
+        "pmaddhw    %[ftmp2],   %[ftmp2],       %[ftmp7]                \n\t"
+        "pmaddhw    %[ftmp1],   %[ftmp1],       %[ftmp7]                \n\t"
+        "pmaddhw    %[ftmp5],   %[ftmp5],       %[ftmp7]                \n\t"
+        "psraw      %[ftmp0],   %[ftmp0],       %[ftmp6]                \n\t"
+        "psraw      %[ftmp2],   %[ftmp2],       %[ftmp6]                \n\t"
+        "psraw      %[ftmp1],   %[ftmp1],       %[ftmp6]                \n\t"
+        "psraw      %[ftmp5],   %[ftmp5],       %[ftmp6]                \n\t"
+        "packsswh   %[ftmp0],   %[ftmp0],       %[ftmp1]                \n\t"
+        "packsswh   %[ftmp2],   %[ftmp2],       %[ftmp5]                \n\t"
+        "dmfc1      %[tmp1],    %[ftmp0]                                \n\t"
+        "dsrl       %[ftmp0],   %[ftmp0],       %[ftmp9]                \n\t"
+        "sh         %[tmp1],    0x00(%[output])                         \n\t"
+        "mfc1       %[input],   %[ftmp0]                                \n\t"
+        "dsrl       %[tmp1],    %[tmp1],        0x10                    \n\t"
+        "sh         %[input],   0x80(%[output])                         \n\t"
+        "sh         %[tmp1],    0x20(%[output])                         \n\t"
+        PTR_SRL    "%[input],   %[input],       0x10                    \n\t"
+        "dmfc1      %[tmp1],    %[ftmp2]                                \n\t"
+        "sh         %[input],   0xa0(%[output])                         \n\t"
+        "dsrl       %[ftmp2],   %[ftmp2],       %[ftmp9]                \n\t"
+        "sh         %[tmp1],    0x40(%[output])                         \n\t"
+        "mfc1       %[input],   %[ftmp2]                                \n\t"
+        "dsrl       %[tmp1],    %[tmp1],        0x10                    \n\t"
+        "sh         %[input],   0xc0(%[output])                         \n\t"
+        "sh         %[tmp1],    0x60(%[output])                         \n\t"
+        PTR_SRL    "%[input],   %[input],       0x10                    \n\t"
+        "sh         %[input],   0xe0(%[output])                         \n\t"
+        "punpckhhw  %[ftmp1],   %[ftmp3],       %[ff_pw_1]              \n\t"
+        "punpcklhw  %[ftmp3],   %[ftmp3],       %[ff_pw_1]              \n\t"
+        "punpckhhw  %[ftmp5],   %[ftmp4],       %[ff_pw_1]              \n\t"
+        "punpcklhw  %[ftmp4],   %[ftmp4],       %[ff_pw_1]              \n\t"
+        "mtc1       %[qmul],    %[ftmp7]                                \n\t"
+        "punpcklwd  %[ftmp7],   %[ftmp7],       %[ftmp7]                \n\t"
+        "pmaddhw    %[ftmp3],   %[ftmp3],       %[ftmp7]                \n\t"
+        "pmaddhw    %[ftmp4],   %[ftmp4],       %[ftmp7]                \n\t"
+        "pmaddhw    %[ftmp1],   %[ftmp1],       %[ftmp7]                \n\t"
+        "pmaddhw    %[ftmp5],   %[ftmp5],       %[ftmp7]                \n\t"
+        "psraw      %[ftmp3],   %[ftmp3],       %[ftmp6]                \n\t"
+        "psraw      %[ftmp4],   %[ftmp4],       %[ftmp6]                \n\t"
+        "psraw      %[ftmp1],   %[ftmp1],       %[ftmp6]                \n\t"
+        "psraw      %[ftmp5],   %[ftmp5],       %[ftmp6]                \n\t"
+        "packsswh   %[ftmp3],   %[ftmp3],       %[ftmp1]                \n\t"
+        "packsswh   %[ftmp4],   %[ftmp4],       %[ftmp5]                \n\t"
+        "dmfc1      %[tmp1],    %[ftmp3]                                \n\t"
+        "dsrl       %[ftmp3],   %[ftmp3],       %[ftmp9]                \n\t"
+        "mfc1       %[input],   %[ftmp3]                                \n\t"
+        "sh         %[tmp1],    0x100(%[output])                        \n\t"
+        "sh         %[input],   0x180(%[output])                        \n\t"
+        "dsrl       %[tmp1],    %[tmp1],        0x10                    \n\t"
+        PTR_SRL    "%[input],   %[input],       0x10                    \n\t"
+        "sh         %[tmp1],    0x120(%[output])                        \n\t"
+        "sh         %[input],   0x1a0(%[output])                        \n\t"
+        "dmfc1      %[tmp1],    %[ftmp4]                                \n\t"
+        "dsrl       %[ftmp4],   %[ftmp4],       %[ftmp9]                \n\t"
+        "mfc1       %[input],   %[ftmp4]                                \n\t"
+        "sh         %[tmp1],    0x140(%[output])                        \n\t"
+        "sh         %[input],   0x1c0(%[output])                        \n\t"
+        "dsrl       %[tmp1],    %[tmp1],        0x10                    \n\t"
+        PTR_SRL    "%[input],   %[input],       0x10                    \n\t"
+        "sh         %[tmp1],    0x160(%[output])                        \n\t"
+        "sh         %[input],   0x1e0(%[output])                        \n\t"
+        "2:                                                             \n\t"
+        ".set       reorder                                             \n\t"
+        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
+          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
+          [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
+          [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
+          [ftmp8]"=&f"(ftmp[8]),            [ftmp9]"=&f"(ftmp[9]),
+          [tmp0]"=&r"(tmp[0]),              [tmp1]"=&r"(tmp[1]),
+          [output]"+&r"(output),            [input]"+&r"(input),
+          [qmul]"+&r"(qmul)
+        : [ff_pw_1]"f"(ff_pw_1)
+        : "memory"
     );
 }
 
@@ -1031,10 +1152,11 @@ void ff_h264_chroma_dc_dequant_idct_8_mmi(int16_t *block, int qmul)
     block[48]= ((d-b)*qmul) >> 7;
 }
 
-void ff_h264_weight_pixels16_8_mmi(uint8_t *block, int stride,
-        int height, int log2_denom, int weight, int offset)
+void ff_h264_weight_pixels16_8_mmi(uint8_t *block, int stride, int height,
+        int log2_denom, int weight, int offset) /* weights a 16-byte-wide block in place: each pass of the loop below handles one row of `height` */
 {
     int y;
+    double ftmp[8]; /* backing storage for the asm's FP scratch operands ftmp0..ftmp7 */
 
     offset <<= log2_denom;
 
@@ -1043,97 +1165,110 @@ void ff_h264_weight_pixels16_8_mmi(uint8_t *block, int stride,
 
     for (y=0; y<height; y++, block+=stride) {
         __asm__ volatile (
-            "ldc1 $f2, %0                   \r\n"
-            "ldc1 $f4, %1                   \r\n"
-            "dmtc1 $0, $f20                 \r\n"
-            "mtc1 %2, $f6                   \r\n"
-            "mtc1 %3, $f8                   \r\n"
-            "mtc1 %4, $f10                  \r\n"
-            "pshufh $f6, $f6, $f20          \r\n"
-            "pshufh $f8, $f8, $f20          \r\n"
-            "punpckhbh $f14, $f2, $f20      \r\n"
-            "punpckhbh $f16, $f4, $f20      \r\n"
-            "punpcklbh $f2, $f2, $f20       \r\n"
-            "punpcklbh $f4, $f4, $f20       \r\n"
-            "pmullh $f14, $f14, $f6         \r\n"
-            "pmullh $f16, $f16, $f6         \r\n"
-            "pmullh $f2, $f2, $f6           \r\n"
-            "pmullh $f4, $f4, $f6           \r\n"
-            "paddsh $f14, $f14, $f8         \r\n"
-            "paddsh $f16, $f16, $f8         \r\n"
-            "paddsh $f2, $f2, $f8           \r\n"
-            "paddsh $f4, $f4, $f8           \r\n"
-            "psrah $f14, $f14, $f10         \r\n"
-            "psrah $f16, $f16, $f10         \r\n"
-            "psrah $f2, $f2, $f10           \r\n"
-            "psrah $f4, $f4, $f10           \r\n"
-            "packushb $f2, $f2, $f14        \r\n"
-            "packushb $f4, $f4, $f16        \r\n"
-            "sdc1 $f2, %0                   \r\n"
-            "sdc1 $f4, %1                   \r\n"
-            : "=m"(*block),"=m"(*(block + 8))
-            : "r"(weight),"r"(offset),"r"(log2_denom)
+            "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t" /* ftmp0 = 0: zero source for unpacking and shuffle selector */
+            "ldc1       %[ftmp1],   0x00(%[block0])                     \n\t" /* ftmp1 = row bytes 0..7 */
+            "ldc1       %[ftmp2],   0x00(%[block1])                     \n\t" /* ftmp2 = row bytes 8..15 (block1 is block+8) */
+            "mtc1       %[weight],  %[ftmp3]                            \n\t"
+            "mtc1       %[offset],  %[ftmp4]                            \n\t"
+            "mtc1       %[log2_denom],              %[ftmp5]            \n\t" /* ftmp5 is the shift count for psrah below */
+            "pshufh     %[ftmp3],   %[ftmp3],       %[ftmp0]            \n\t" /* zero selector: replicate weight into every 16-bit lane */
+            "pshufh     %[ftmp4],   %[ftmp4],       %[ftmp0]            \n\t" /* likewise for offset */
+            "punpckhbh  %[ftmp6],   %[ftmp1],       %[ftmp0]            \n\t" /* upper 4 bytes of each half-row widened to 16-bit lanes */
+            "punpckhbh  %[ftmp7],   %[ftmp2],       %[ftmp0]            \n\t"
+            "punpcklbh  %[ftmp1],   %[ftmp1],       %[ftmp0]            \n\t" /* lower 4 bytes of each half-row widened to 16-bit lanes */
+            "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp0]            \n\t"
+            "pmullh     %[ftmp6],   %[ftmp6],       %[ftmp3]            \n\t" /* pixel * weight, low 16 bits of each product */
+            "pmullh     %[ftmp7],   %[ftmp7],       %[ftmp3]            \n\t"
+            "pmullh     %[ftmp1],   %[ftmp1],       %[ftmp3]            \n\t"
+            "pmullh     %[ftmp2],   %[ftmp2],       %[ftmp3]            \n\t"
+            "paddsh     %[ftmp6],   %[ftmp6],       %[ftmp4]            \n\t" /* + offset, signed-saturating per lane */
+            "paddsh     %[ftmp7],   %[ftmp7],       %[ftmp4]            \n\t"
+            "paddsh     %[ftmp1],   %[ftmp1],       %[ftmp4]            \n\t"
+            "paddsh     %[ftmp2],   %[ftmp2],       %[ftmp4]            \n\t"
+            "psrah      %[ftmp6],   %[ftmp6],       %[ftmp5]            \n\t" /* arithmetic >> log2_denom */
+            "psrah      %[ftmp7],   %[ftmp7],       %[ftmp5]            \n\t"
+            "psrah      %[ftmp1],   %[ftmp1],       %[ftmp5]            \n\t"
+            "psrah      %[ftmp2],   %[ftmp2],       %[ftmp5]            \n\t"
+            "packushb   %[ftmp1],   %[ftmp1],       %[ftmp6]            \n\t" /* narrow back to 8 bytes with unsigned saturation */
+            "packushb   %[ftmp2],   %[ftmp2],       %[ftmp7]            \n\t"
+            "sdc1       %[ftmp1],   0x00(%[block0])                     \n\t" /* overwrite row bytes 0..7 */
+            "sdc1       %[ftmp2],   0x00(%[block1])                     \n\t" /* overwrite row bytes 8..15 */
+            : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]), /* early-clobber FP outputs, used only as scratch */
+              [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
+              [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
+              [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7])
+            : [block0]"r"(block),           [block1]"r"(block+8),
+              [weight]"r"(weight),          [offset]"r"(offset),
+              [log2_denom]"r"(log2_denom)
+            : "memory" /* the row is read and written through pointer operands */
         );
     }
 }
 
-void ff_h264_biweight_pixels16_8_mmi(uint8_t *dst, uint8_t *src,
-        int stride, int height, int log2_denom, int weightd, int weights,
-        int offset)
+void ff_h264_biweight_pixels16_8_mmi(uint8_t *dst, uint8_t *src, int stride,
+        int height, int log2_denom, int weightd, int weights, int offset) /* blends src into dst over a 16-byte-wide block: per row, dst = (src*weights + dst*weightd + offset) >> (log2_denom+1), clamped to a byte */
 {
     int y;
+    double ftmp[9]; /* backing storage for the asm's FP scratch operands ftmp0..ftmp8 */
 
     offset = ((offset + 1) | 1) << log2_denom;
 
     for (y=0; y<height; y++, dst+=stride, src+=stride) {
         __asm__ volatile (
-            "ldc1 $f2, %2                   \r\n"
-            "ldc1 $f4, %3                   \r\n"
-            "dmtc1 $0, $f20                 \r\n"
-            "mtc1 %6, $f6                   \r\n"
-            "mtc1 %7, $f8                   \r\n"
-            "mtc1 %8, $f10                  \r\n"
-            "mtc1 %9, $f12                  \r\n"
-            "pshufh $f6, $f6, $f20          \r\n"
-            "pshufh $f8, $f8, $f20          \r\n"
-            "pshufh $f10, $f10, $f20        \r\n"
-            "punpckhbh $f14, $f2, $f20      \r\n"
-            "punpckhbh $f16, $f4, $f20      \r\n"
-            "punpcklbh $f2, $f2, $f20       \r\n"
-            "punpcklbh $f4, $f4, $f20       \r\n"
-            "pmullh $f14, $f14, $f6         \r\n"
-            "pmullh $f16, $f16, $f8         \r\n"
-            "pmullh $f2, $f2, $f6           \r\n"
-            "pmullh $f4, $f4, $f8           \r\n"
-            "paddsh $f14, $f14, $f10        \r\n"
-            "paddsh $f2, $f2, $f10          \r\n"
-            "paddsh $f14, $f14, $f16        \r\n"
-            "paddsh $f2, $f2, $f4           \r\n"
-            "psrah $f14, $f14, $f12         \r\n"
-            "psrah $f2, $f2, $f12           \r\n"
-            "packushb $f2, $f2, $f14        \r\n"
-            "sdc1 $f2, %0                   \r\n"
-            "ldc1 $f2, %4                   \r\n"
-            "ldc1 $f4, %5                   \r\n"
-            "punpckhbh $f14, $f2, $f20      \r\n"
-            "punpckhbh $f16, $f4, $f20      \r\n"
-            "punpcklbh $f2, $f2, $f20       \r\n"
-            "punpcklbh $f4, $f4, $f20       \r\n"
-            "pmullh $f14, $f14, $f6         \r\n"
-            "pmullh $f16, $f16, $f8         \r\n"
-            "pmullh $f2, $f2, $f6           \r\n"
-            "pmullh $f4, $f4, $f8           \r\n"
-            "paddsh $f14, $f14, $f10        \r\n"
-            "paddsh $f2, $f2, $f10          \r\n"
-            "paddsh $f14, $f14, $f16        \r\n"
-            "paddsh $f2, $f2, $f4           \r\n"
-            "psrah $f14, $f14, $f12         \r\n"
-            "psrah $f2, $f2, $f12           \r\n"
-            "packushb $f2, $f2, $f14        \r\n"
-            "sdc1 $f2, %1                   \r\n"
-            : "=m"(*dst),"=m"(*(dst+8))
-            : "m"(*src),"m"(*dst),"m"(*(src+8)),"m"(*(dst+8)),
-              "r"(weights),"r"(weightd),"r"(offset),"r"(log2_denom+1)
+            "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t" /* ftmp0 = 0: zero source for unpacking and shuffle selector */
+            "ldc1       %[ftmp1],   0x00(%[src0])                       \n\t" /* first half: src bytes 0..7 */
+            "ldc1       %[ftmp2],   0x00(%[dst0])                       \n\t" /* first half: dst bytes 0..7 */
+            "mtc1       %[weights], %[ftmp3]                            \n\t"
+            "mtc1       %[weightd], %[ftmp4]                            \n\t"
+            "mtc1       %[offset],  %[ftmp5]                            \n\t"
+            "mtc1       %[log2_denom],              %[ftmp6]            \n\t" /* operand is bound to log2_denom+1; used as the psrah shift count */
+            "pshufh     %[ftmp3],   %[ftmp3],       %[ftmp0]            \n\t" /* zero selector: replicate weights into every 16-bit lane */
+            "pshufh     %[ftmp4],   %[ftmp4],       %[ftmp0]            \n\t" /* likewise for weightd */
+            "pshufh     %[ftmp5],   %[ftmp5],       %[ftmp0]            \n\t" /* likewise for offset */
+            "punpckhbh  %[ftmp7],   %[ftmp1],       %[ftmp0]            \n\t" /* upper 4 src bytes widened to 16-bit lanes */
+            "punpckhbh  %[ftmp8],   %[ftmp2],       %[ftmp0]            \n\t" /* upper 4 dst bytes widened to 16-bit lanes */
+            "punpcklbh  %[ftmp1],   %[ftmp1],       %[ftmp0]            \n\t" /* lower 4 src bytes */
+            "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp0]            \n\t" /* lower 4 dst bytes */
+            "pmullh     %[ftmp7],   %[ftmp7],       %[ftmp3]            \n\t" /* src * weights */
+            "pmullh     %[ftmp8],   %[ftmp8],       %[ftmp4]            \n\t" /* dst * weightd */
+            "pmullh     %[ftmp1],   %[ftmp1],       %[ftmp3]            \n\t"
+            "pmullh     %[ftmp2],   %[ftmp2],       %[ftmp4]            \n\t"
+            "paddsh     %[ftmp7],   %[ftmp7],       %[ftmp5]            \n\t" /* + offset, signed-saturating */
+            "paddsh     %[ftmp1],   %[ftmp1],       %[ftmp5]            \n\t"
+            "paddsh     %[ftmp7],   %[ftmp7],       %[ftmp8]            \n\t" /* + weighted dst, signed-saturating */
+            "paddsh     %[ftmp1],   %[ftmp1],       %[ftmp2]            \n\t"
+            "psrah      %[ftmp7],   %[ftmp7],       %[ftmp6]            \n\t" /* arithmetic >> (log2_denom+1) */
+            "psrah      %[ftmp1],   %[ftmp1],       %[ftmp6]            \n\t"
+            "packushb   %[ftmp1],   %[ftmp1],       %[ftmp7]            \n\t" /* narrow back to 8 bytes with unsigned saturation */
+            "sdc1       %[ftmp1],   0x00(%[dst0])                       \n\t" /* store dst bytes 0..7 */
+            "ldc1       %[ftmp1],   0x00(%[src1])                       \n\t" /* second half: src bytes 8..15, same sequence as above */
+            "ldc1       %[ftmp2],   0x00(%[dst1])                       \n\t" /* second half: dst bytes 8..15 */
+            "punpckhbh  %[ftmp7],   %[ftmp1],       %[ftmp0]            \n\t"
+            "punpckhbh  %[ftmp8],   %[ftmp2],       %[ftmp0]            \n\t"
+            "punpcklbh  %[ftmp1],   %[ftmp1],       %[ftmp0]            \n\t"
+            "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp0]            \n\t"
+            "pmullh     %[ftmp7],   %[ftmp7],       %[ftmp3]            \n\t"
+            "pmullh     %[ftmp8],   %[ftmp8],       %[ftmp4]            \n\t"
+            "pmullh     %[ftmp1],   %[ftmp1],       %[ftmp3]            \n\t"
+            "pmullh     %[ftmp2],   %[ftmp2],       %[ftmp4]            \n\t"
+            "paddsh     %[ftmp7],   %[ftmp7],       %[ftmp5]            \n\t"
+            "paddsh     %[ftmp1],   %[ftmp1],       %[ftmp5]            \n\t"
+            "paddsh     %[ftmp7],   %[ftmp7],       %[ftmp8]            \n\t"
+            "paddsh     %[ftmp1],   %[ftmp1],       %[ftmp2]            \n\t"
+            "psrah      %[ftmp7],   %[ftmp7],       %[ftmp6]            \n\t"
+            "psrah      %[ftmp1],   %[ftmp1],       %[ftmp6]            \n\t"
+            "packushb   %[ftmp1],   %[ftmp1],       %[ftmp7]            \n\t"
+            "sdc1       %[ftmp1],   0x00(%[dst1])                       \n\t" /* store dst bytes 8..15 */
+            : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]), /* early-clobber FP outputs, used only as scratch */
+              [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
+              [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
+              [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
+              [ftmp8]"=&f"(ftmp[8])
+            : [dst0]"r"(dst),               [dst1]"r"(dst+8),
+              [src0]"r"(src),               [src1]"r"(src+8),
+              [weights]"r"(weights),        [weightd]"r"(weightd),
+              [offset]"r"(offset),          [log2_denom]"r"(log2_denom+1)
+            : "memory" /* src is read and dst is read/written through pointer operands */
         );
     }
 }
@@ -1142,6 +1277,7 @@ void ff_h264_weight_pixels8_8_mmi(uint8_t *block, int stride, int height,
         int log2_denom, int weight, int offset)
 {
     int y;
+    double ftmp[6]; /* backing storage for FP scratch operands ftmp0..ftmp5 of the asm that weights one 8-byte row in place per loop pass */
 
     offset <<= log2_denom;
 
@@ -1150,68 +1286,78 @@ void ff_h264_weight_pixels8_8_mmi(uint8_t *block, int stride, int height,
 
     for (y=0; y<height; y++, block+=stride) {
         __asm__ volatile (
-            "ldc1 $f2, %0                   \r\n"
-            "mtc1 %1, $f6                   \r\n"
-            "mtc1 %2, $f8                   \r\n"
-            "mtc1 %3, $f10                  \r\n"
-            "dmtc1 $0, $f20                 \r\n"
-            "pshufh $f6, $f6, $f20          \r\n"
-            "pshufh $f8, $f8, $f20          \r\n"
-            "punpckhbh $f14, $f2, $f20      \r\n"
-            "punpcklbh $f2, $f2, $f20       \r\n"
-            "pmullh $f14, $f14, $f6         \r\n"
-            "pmullh $f2, $f2, $f6           \r\n"
-            "paddsh $f14, $f14, $f8         \r\n"
-            "paddsh $f2, $f2, $f8           \r\n"
-            "psrah $f14, $f14, $f10         \r\n"
-            "psrah $f2, $f2, $f10           \r\n"
-            "packushb $f2, $f2, $f14        \r\n"
-            "sdc1 $f2, %0                   \r\n"
-            : "=m"(*block)
-            : "r"(weight),"r"(offset),"r"(log2_denom)
+            "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t" /* ftmp0 = 0: zero source for unpacking and shuffle selector */
+            "ldc1       %[ftmp1],   0x00(%[block])                      \n\t" /* ftmp1 = the 8 bytes of this row */
+            "mtc1       %[weight],  %[ftmp2]                            \n\t"
+            "mtc1       %[offset],  %[ftmp3]                            \n\t"
+            "mtc1       %[log2_denom],              %[ftmp5]            \n\t" /* ftmp5 is the shift count for psrah below */
+            "pshufh     %[ftmp2],   %[ftmp2],       %[ftmp0]            \n\t" /* zero selector: replicate weight into every 16-bit lane */
+            "pshufh     %[ftmp3],   %[ftmp3],       %[ftmp0]            \n\t" /* likewise for offset */
+            "punpckhbh  %[ftmp4],   %[ftmp1],       %[ftmp0]            \n\t" /* upper 4 bytes widened to 16-bit lanes */
+            "punpcklbh  %[ftmp1],   %[ftmp1],       %[ftmp0]            \n\t" /* lower 4 bytes widened to 16-bit lanes */
+            "pmullh     %[ftmp4],   %[ftmp4],       %[ftmp2]            \n\t" /* pixel * weight, low 16 bits of each product */
+            "pmullh     %[ftmp1],   %[ftmp1],       %[ftmp2]            \n\t"
+            "paddsh     %[ftmp4],   %[ftmp4],       %[ftmp3]            \n\t" /* + offset, signed-saturating */
+            "paddsh     %[ftmp1],   %[ftmp1],       %[ftmp3]            \n\t"
+            "psrah      %[ftmp4],   %[ftmp4],       %[ftmp5]            \n\t" /* arithmetic >> log2_denom */
+            "psrah      %[ftmp1],   %[ftmp1],       %[ftmp5]            \n\t"
+            "packushb   %[ftmp1],   %[ftmp1],       %[ftmp4]            \n\t" /* narrow back to 8 bytes with unsigned saturation */
+            "sdc1       %[ftmp1],   0x00(%[block])                      \n\t" /* overwrite the row */
+            : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]), /* early-clobber FP outputs, used only as scratch */
+              [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
+              [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5])
+            : [block]"r"(block),            [weight]"r"(weight),
+              [offset]"r"(offset),          [log2_denom]"r"(log2_denom)
+            : "memory" /* the row is read and written through a pointer operand */
         );
     }
 }
 
-void ff_h264_biweight_pixels8_8_mmi(uint8_t *dst, uint8_t *src,
-        int stride, int height, int log2_denom, int weightd, int weights,
-        int offset)
+void ff_h264_biweight_pixels8_8_mmi(uint8_t *dst, uint8_t *src, int stride,
+        int height, int log2_denom, int weightd, int weights, int offset) /* blends src into dst over an 8-byte-wide block: per row, dst = (src*weights + dst*weightd + offset) >> (log2_denom+1), clamped to a byte */
 {
     int y;
+    double ftmp[9]; /* backing storage for the asm's FP scratch operands ftmp0..ftmp8 */
 
     offset = ((offset + 1) | 1) << log2_denom;
 
     for (y=0; y<height; y++, dst+=stride, src+=stride) {
         __asm__ volatile (
-            "ldc1 $f2, %1                   \r\n"
-            "ldc1 $f4, %2                   \r\n"
-            "dmtc1 $0, $f20                 \r\n"
-            "mtc1 %3, $f6                   \r\n"
-            "mtc1 %4, $f8                   \r\n"
-            "mtc1 %5, $f10                  \r\n"
-            "mtc1 %6, $f12                  \r\n"
-            "pshufh $f6, $f6, $f20          \r\n"
-            "pshufh $f8, $f8, $f20          \r\n"
-            "pshufh $f10, $f10, $f20        \r\n"
-            "punpckhbh $f14, $f2, $f20      \r\n"
-            "punpckhbh $f16, $f4, $f20      \r\n"
-            "punpcklbh $f2, $f2, $f20       \r\n"
-            "punpcklbh $f4, $f4, $f20       \r\n"
-            "pmullh $f14, $f14, $f6         \r\n"
-            "pmullh $f16, $f16, $f8         \r\n"
-            "pmullh $f2, $f2, $f6           \r\n"
-            "pmullh $f4, $f4, $f8           \r\n"
-            "paddsh $f14, $f14, $f10        \r\n"
-            "paddsh $f2, $f2, $f10          \r\n"
-            "paddsh $f14, $f14, $f16        \r\n"
-            "paddsh $f2, $f2, $f4           \r\n"
-            "psrah $f14, $f14, $f12         \r\n"
-            "psrah $f2, $f2, $f12           \r\n"
-            "packushb $f2, $f2, $f14        \r\n"
-            "sdc1 $f2, %0                   \r\n"
-            : "=m"(*dst)
-            : "m"(*src),"m"(*dst),"r"(weights),
-              "r"(weightd),"r"(offset),"r"(log2_denom+1)
+            "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t" /* ftmp0 = 0: zero source for unpacking and shuffle selector */
+            "ldc1       %[ftmp1],   0x00(%[src])                        \n\t" /* ftmp1 = 8 src bytes of this row */
+            "ldc1       %[ftmp2],   0x00(%[dst])                        \n\t" /* ftmp2 = 8 dst bytes of this row */
+            "mtc1       %[weights], %[ftmp3]                            \n\t"
+            "mtc1       %[weightd], %[ftmp4]                            \n\t"
+            "mtc1       %[offset],  %[ftmp5]                            \n\t"
+            "mtc1       %[log2_denom],              %[ftmp6]            \n\t" /* operand is bound to log2_denom+1; used as the psrah shift count */
+            "pshufh     %[ftmp3],   %[ftmp3],       %[ftmp0]            \n\t" /* zero selector: replicate weights into every 16-bit lane */
+            "pshufh     %[ftmp4],   %[ftmp4],       %[ftmp0]            \n\t" /* likewise for weightd */
+            "pshufh     %[ftmp5],   %[ftmp5],       %[ftmp0]            \n\t" /* likewise for offset */
+            "punpckhbh  %[ftmp7],   %[ftmp1],       %[ftmp0]            \n\t" /* upper 4 src bytes widened to 16-bit lanes */
+            "punpckhbh  %[ftmp8],   %[ftmp2],       %[ftmp0]            \n\t" /* upper 4 dst bytes widened to 16-bit lanes */
+            "punpcklbh  %[ftmp1],   %[ftmp1],       %[ftmp0]            \n\t" /* lower 4 src bytes */
+            "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp0]            \n\t" /* lower 4 dst bytes */
+            "pmullh     %[ftmp7],   %[ftmp7],       %[ftmp3]            \n\t" /* src * weights */
+            "pmullh     %[ftmp8],   %[ftmp8],       %[ftmp4]            \n\t" /* dst * weightd */
+            "pmullh     %[ftmp1],   %[ftmp1],       %[ftmp3]            \n\t"
+            "pmullh     %[ftmp2],   %[ftmp2],       %[ftmp4]            \n\t"
+            "paddsh     %[ftmp7],   %[ftmp7],       %[ftmp5]            \n\t" /* + offset, signed-saturating */
+            "paddsh     %[ftmp1],   %[ftmp1],       %[ftmp5]            \n\t"
+            "paddsh     %[ftmp7],   %[ftmp7],       %[ftmp8]            \n\t" /* + weighted dst, signed-saturating */
+            "paddsh     %[ftmp1],   %[ftmp1],       %[ftmp2]            \n\t"
+            "psrah      %[ftmp7],   %[ftmp7],       %[ftmp6]            \n\t" /* arithmetic >> (log2_denom+1) */
+            "psrah      %[ftmp1],   %[ftmp1],       %[ftmp6]            \n\t"
+            "packushb   %[ftmp1],   %[ftmp1],       %[ftmp7]            \n\t" /* narrow back to 8 bytes with unsigned saturation */
+            "sdc1       %[ftmp1],   0x00(%[dst])                        \n\t" /* store the blended row to dst */
+            : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]), /* early-clobber FP outputs, used only as scratch */
+              [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
+              [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
+              [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
+              [ftmp8]"=&f"(ftmp[8])
+            : [dst]"r"(dst),                [src]"r"(src),
+              [weights]"r"(weights),        [weightd]"r"(weightd),
+              [offset]"r"(offset),          [log2_denom]"r"(log2_denom+1)
+            : "memory" /* src is read and dst is read/written through pointer operands */
         );
     }
 }
@@ -1220,6 +1366,8 @@ void ff_h264_weight_pixels4_8_mmi(uint8_t *block, int stride, int height,
         int log2_denom, int weight, int offset)
 {
     int y;
+    double ftmp[5]; /* backing storage for FP scratch operands ftmp0..ftmp4 of the asm that weights one 4-byte row in place per loop pass */
+    uint64_t low32; /* GPR staging slot: the row is loaded here, then moved to an FP register with mtc1 */
 
     offset <<= log2_denom;
 
@@ -1228,745 +1376,883 @@ void ff_h264_weight_pixels4_8_mmi(uint8_t *block, int stride, int height,
 
     for (y=0; y<height; y++, block+=stride) {
         __asm__ volatile (
-            "lwc1 $f2, %0                   \r\n"
-            "mtc1 %1, $f6                   \r\n"
-            "mtc1 %2, $f8                   \r\n"
-            "mtc1 %3, $f10                  \r\n"
-            "dmtc1 $0, $f20                 \r\n"
-            "pshufh $f6, $f6, $f20          \r\n"
-            "pshufh $f8, $f8, $f20          \r\n"
-            "punpcklbh $f2, $f2, $f20       \r\n"
-            "pmullh $f2, $f2, $f6           \r\n"
-            "paddsh $f2, $f2, $f8           \r\n"
-            "psrah $f2, $f2, $f10           \r\n"
-            "packushb $f2, $f2, $f20        \r\n"
-            "swc1 $f2, %0                   \r\n"
-            : "=m"(*block)
-            : "r"(weight),"r"(offset),"r"(log2_denom)
+            "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t" /* ftmp0 = 0: zero source for unpacking and shuffle selector */
+            "ulw        %[low32],   0x00(%[block])                      \n\t" /* unaligned 32-bit load: only the 4 bytes of this row (uld would also read 4 bytes past it) */
+            "mtc1       %[low32],   %[ftmp1]                            \n\t" /* move the 4 pixels into the low word of ftmp1 */
+            "mtc1       %[weight],  %[ftmp2]                            \n\t"
+            "mtc1       %[offset],  %[ftmp3]                            \n\t"
+            "mtc1       %[log2_denom],              %[ftmp4]            \n\t" /* ftmp4 is the shift count for psrah below */
+            "pshufh     %[ftmp2],   %[ftmp2],       %[ftmp0]            \n\t" /* zero selector: replicate weight into every 16-bit lane */
+            "pshufh     %[ftmp3],   %[ftmp3],       %[ftmp0]            \n\t" /* likewise for offset */
+            "punpcklbh  %[ftmp1],   %[ftmp1],       %[ftmp0]            \n\t" /* 4 bytes widened to four 16-bit lanes */
+            "pmullh     %[ftmp1],   %[ftmp1],       %[ftmp2]            \n\t" /* pixel * weight, low 16 bits of each product */
+            "paddsh     %[ftmp1],   %[ftmp1],       %[ftmp3]            \n\t" /* + offset, signed-saturating */
+            "psrah      %[ftmp1],   %[ftmp1],       %[ftmp4]            \n\t" /* arithmetic >> log2_denom */
+            "packushb   %[ftmp1],   %[ftmp1],       %[ftmp0]            \n\t" /* narrow to bytes with unsigned saturation; result is in the low word */
+            "gsswlc1    %[ftmp1],   0x03(%[block])                      \n\t" /* left/right pair: unaligned 32-bit store of the 4 result bytes */
+            "gsswrc1    %[ftmp1],   0x00(%[block])                      \n\t"
+            : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]), /* early-clobber outputs, used only as scratch */
+              [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
+              [ftmp4]"=&f"(ftmp[4]),
+              [low32]"=&r"(low32)
+            : [block]"r"(block),            [weight]"r"(weight),
+              [offset]"r"(offset),          [log2_denom]"r"(log2_denom)
+            : "memory" /* the row is read and written through a pointer operand */
         );
     }
 }
 
-void ff_h264_biweight_pixels4_8_mmi(uint8_t *dst, uint8_t *src,
-        int stride, int height, int log2_denom, int weightd, int weights,
-        int offset)
+void ff_h264_biweight_pixels4_8_mmi(uint8_t *dst, uint8_t *src, int stride,
+        int height, int log2_denom, int weightd, int weights, int offset) /* blends src into dst over a 4-byte-wide block: per row, dst = (src*weights + dst*weightd + offset) >> (log2_denom+1), clamped to a byte */
 {
     int y;
+    double ftmp[7]; /* backing storage for the asm's FP scratch operands ftmp0..ftmp6 */
+    uint64_t low32; /* GPR staging slot: each row is loaded here, then moved to an FP register with mtc1 */
 
     offset = ((offset + 1) | 1) << log2_denom;
 
     for (y=0; y<height; y++, dst+=stride, src+=stride) {
         __asm__ volatile (
-            "lwc1 $f2, %1                   \r\n"
-            "lwc1 $f4, %2                   \r\n"
-            "dmtc1 $0, $f20                 \r\n"
-            "mtc1 %3, $f6                   \r\n"
-            "mtc1 %4, $f8                   \r\n"
-            "mtc1 %5, $f10                  \r\n"
-            "mtc1 %6, $f12                  \r\n"
-            "pshufh $f6, $f6, $f20          \r\n"
-            "pshufh $f8, $f8, $f20          \r\n"
-            "pshufh $f10, $f10, $f20        \r\n"
-            "punpcklbh $f2, $f2, $f20       \r\n"
-            "punpcklbh $f4, $f4, $f20       \r\n"
-            "pmullh $f2, $f2, $f6           \r\n"
-            "pmullh $f4, $f4, $f8           \r\n"
-            "paddsh $f2, $f2, $f10          \r\n"
-            "paddsh $f2, $f2, $f4           \r\n"
-            "psrah $f2, $f2, $f12           \r\n"
-            "packushb $f2, $f2, $f20        \r\n"
-            "swc1 $f2, %0                   \r\n"
-            : "=m"(*dst)
-            : "m"(*src),"m"(*dst),"r"(weights),
-              "r"(weightd),"r"(offset),"r"(log2_denom+1)
+            "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t" /* ftmp0 = 0: zero source for unpacking and shuffle selector */
+            "ulw        %[low32],   0x00(%[src])                        \n\t" /* unaligned 32-bit load: only the 4 src bytes of this row (uld would also read 4 bytes past it) */
+            "mtc1       %[low32],   %[ftmp1]                            \n\t"
+            "ulw        %[low32],   0x00(%[dst])                        \n\t" /* same for the 4 dst bytes */
+            "mtc1       %[low32],   %[ftmp2]                            \n\t"
+            "mtc1       %[weights], %[ftmp3]                            \n\t" /* operand named as in the 8- and 16-wide biweight variants */
+            "mtc1       %[weightd], %[ftmp4]                            \n\t"
+            "mtc1       %[offset],  %[ftmp5]                            \n\t"
+            "mtc1       %[log2_denom],              %[ftmp6]            \n\t" /* operand is bound to log2_denom+1; used as the psrah shift count */
+            "pshufh     %[ftmp3],   %[ftmp3],       %[ftmp0]            \n\t" /* zero selector: replicate weights into every 16-bit lane */
+            "pshufh     %[ftmp4],   %[ftmp4],       %[ftmp0]            \n\t" /* likewise for weightd */
+            "pshufh     %[ftmp5],   %[ftmp5],       %[ftmp0]            \n\t" /* likewise for offset */
+            "punpcklbh  %[ftmp1],   %[ftmp1],       %[ftmp0]            \n\t" /* 4 src bytes widened to 16-bit lanes */
+            "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp0]            \n\t" /* 4 dst bytes widened to 16-bit lanes */
+            "pmullh     %[ftmp1],   %[ftmp1],       %[ftmp3]            \n\t" /* src * weights */
+            "pmullh     %[ftmp2],   %[ftmp2],       %[ftmp4]            \n\t" /* dst * weightd */
+            "paddsh     %[ftmp1],   %[ftmp1],       %[ftmp5]            \n\t" /* + offset, signed-saturating */
+            "paddsh     %[ftmp1],   %[ftmp1],       %[ftmp2]            \n\t" /* + weighted dst, signed-saturating */
+            "psrah      %[ftmp1],   %[ftmp1],       %[ftmp6]            \n\t" /* arithmetic >> (log2_denom+1) */
+            "packushb   %[ftmp1],   %[ftmp1],       %[ftmp0]            \n\t" /* narrow to bytes with unsigned saturation; result is in the low word */
+            "gsswlc1    %[ftmp1],   0x03(%[dst])                        \n\t" /* left/right pair: unaligned 32-bit store of the 4 result bytes */
+            "gsswrc1    %[ftmp1],   0x00(%[dst])                        \n\t"
+            : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]), /* early-clobber outputs, used only as scratch */
+              [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
+              [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
+              [ftmp6]"=&f"(ftmp[6]),
+              [low32]"=&r"(low32)
+            : [dst]"r"(dst),                [src]"r"(src),
+              [weights]"r"(weights),        [weightd]"r"(weightd),
+              [offset]"r"(offset),          [log2_denom]"r"(log2_denom+1)
+            : "memory" /* src is read and dst is read/written through pointer operands */
         );
     }
 }
 
-static void inline chroma_inter_body_mmi(uint8_t *pix, int stride,
-        int alpha, int beta, int8_t *tc0)
-{
-    __asm__ volatile (
-        "xor $f16, $f16, $f16                           \r\n"
-        "mtc1 %[alpha], $f8                             \r\n"
-        "mtc1 %[beta], $f10                             \r\n"
-        "pshufh $f8, $f8, $f16                          \r\n"
-        "pshufh $f10, $f10, $f16                        \r\n"
-        "packushb $f8, $f8, $f8                         \r\n"
-        "packushb $f10, $f10, $f10                      \r\n"
-        "psubusb $f12, $f4, $f2                         \r\n"
-        "psubusb $f14, $f2, $f4                         \r\n"
-        "or $f14, $f14, $f12                            \r\n"
-        "psubusb $f14, $f14, $f8                        \r\n"
-        "psubusb $f12, $f2, $f0                         \r\n"
-        "psubusb $f8, $f0, $f2                          \r\n"
-        "or $f8, $f8, $f12                              \r\n"
-        "psubusb $f8, $f8, $f10                         \r\n"
-        "or $f14, $f14, $f8                             \r\n"
-        "psubusb $f12, $f4, $f6                         \r\n"
-        "psubusb $f8, $f6, $f4                          \r\n"
-        "or $f8, $f8, $f12                              \r\n"
-        "psubusb $f8, $f8, $f10                         \r\n"
-        "or $f14, $f14, $f8                             \r\n"
-        "xor $f12, $f12, $f12                           \r\n"
-        "pcmpeqb $f14, $f14, $f12                       \r\n"
-        "lwc1 $f12, 0x0(%[tc0])                         \r\n"
-        "punpcklbh $f12, $f12, $f12                     \r\n"
-        "and $f14, $f14, $f12                           \r\n"
-        "pcmpeqb $f8, $f8, $f8                          \r\n"
-        "xor $f10, $f2, $f4                             \r\n"
-        "xor $f6, $f6, $f8                              \r\n"
-        "and $f10, $f10, %[ff_pb_1]                     \r\n"
-        "pavgb $f6, $f6, $f0                            \r\n"
-        "xor $f8, $f8, $f2                              \r\n"
-        "pavgb $f6, $f6, %[ff_pb_3]                     \r\n"
-        "pavgb $f8, $f8, $f4                            \r\n"
-        "pavgb $f6, $f6, $f10                           \r\n"
-        "paddusb $f6, $f6, $f8                          \r\n"
-        "psubusb $f12, %[ff_pb_A1], $f6                 \r\n"
-        "psubusb $f6, $f6, %[ff_pb_A1]                  \r\n"
-        "pminub $f12, $f12, $f14                        \r\n"
-        "pminub $f6, $f6, $f14                          \r\n"
-        "psubusb $f2, $f2, $f12                         \r\n"
-        "psubusb $f4, $f4, $f6                          \r\n"
-        "paddusb $f2, $f2, $f6                          \r\n"
-        "paddusb $f4, $f4, $f12                         \r\n"
-        ::[pix]"r"(pix),[stride]"r"((int64_t)stride),
-          [alpha]"r"((int64_t)alpha),[beta]"r"((int64_t)beta),[tc0]"r"(tc0),
-          [ff_pb_1]"f"(ff_pb_1),[ff_pb_3]"f"(ff_pb_3),[ff_pb_A1]"f"(ff_pb_A1)
-        : "$f0","$f2","$f4","$f6","$f8","$f10","$f12","$f14","$f16"
-    );
-}
-
-static void inline chroma_intra_body_mmi(uint8_t *pix, int stride,
-        int alpha, int beta)
-{
-    __asm__ volatile (
-        "xor $f16, $f16, $f16                           \r\n"
-        "mtc1 %[alpha], $f8                             \r\n"
-        "mtc1 %[beta], $f10                             \r\n"
-        "pshufh $f8, $f8, $f16                          \r\n"
-        "pshufh $f10, $f10, $f16                        \r\n"
-        "packushb $f8, $f8, $f8                         \r\n"
-        "packushb $f10, $f10, $f10                      \r\n"
-        "psubusb $f12, $f4, $f2                         \r\n"
-        "psubusb $f14, $f2, $f4                         \r\n"
-        "or $f14, $f14, $f12                            \r\n"
-        "psubusb $f14, $f14, $f8                        \r\n"
-        "psubusb $f12, $f2, $f0                         \r\n"
-        "psubusb $f8, $f0, $f2                          \r\n"
-        "or $f8, $f8, $f12                              \r\n"
-        "psubusb $f8, $f8, $f10                         \r\n"
-        "or $f14, $f14, $f8                             \r\n"
-        "psubusb $f12, $f4, $f6                         \r\n"
-        "psubusb $f8, $f6, $f4                          \r\n"
-        "or $f8, $f8, $f12                              \r\n"
-        "psubusb $f8, $f8, $f10                         \r\n"
-        "or $f14, $f14, $f8                             \r\n"
-        "xor $f12, $f12, $f12                           \r\n"
-        "pcmpeqb $f14, $f14, $f12                       \r\n"
-        "mov.d $f10, $f2                                \r\n"
-        "mov.d $f12, $f4                                \r\n"
-        "xor $f8, $f2, $f6                              \r\n"
-        "and $f8, $f8, %[ff_pb_1]                       \r\n"
-        "pavgb $f2, $f2, $f6                            \r\n"
-        "psubusb $f2, $f2, $f8                          \r\n"
-        "pavgb $f2, $f2, $f0                            \r\n"
-        "xor $f8, $f4, $f0                              \r\n"
-        "and $f8, $f8, %[ff_pb_1]                       \r\n"
-        "pavgb $f4, $f4, $f0                            \r\n"
-        "psubusb $f4, $f4, $f8                          \r\n"
-        "pavgb $f4, $f4, $f6                            \r\n"
-        "psubb $f2, $f2, $f10                           \r\n"
-        "psubb $f4, $f4, $f12                           \r\n"
-        "and $f2, $f2, $f14                             \r\n"
-        "and $f4, $f4, $f14                             \r\n"
-        "paddb $f2, $f2, $f10                           \r\n"
-        "paddb $f4, $f4, $f12                           \r\n"
-        ::[pix]"r"(pix),[stride]"r"((int64_t)stride),
-          [alpha]"r"((int64_t)alpha),[beta]"r"((int64_t)beta),
-          [ff_pb_1]"f"(ff_pb_1)
-        : "$f0","$f2","$f4","$f8","$f10","$f12","$f14","$f16"
-    );
-}
-
+/**
+ * Apply the tc0-clipped luma deblocking filter to eight horizontally adjacent
+ * pixels, filtering across the boundary between rows pix - stride and pix.
+ *
+ * Rows pix - 3*stride .. pix + 2*stride are read (8 bytes each); rows
+ * pix - 2*stride .. pix + stride are written back.
+ *
+ * @param pix    first of the eight bytes of the row just below the edge
+ * @param stride distance in bytes between consecutive rows
+ * @param alpha  threshold for the difference across the edge; the filter is
+ *               applied only where the absolute difference is <= alpha - 1
+ * @param beta   threshold for the differences on either side of the edge,
+ *               compared the same way against beta - 1
+ * @param tc0    clipping strengths, read as one unaligned 32-bit word; the two
+ *               byte-unpack steps spread the low two bytes over four columns
+ *               each, and a negative byte leaves its columns untouched
+ */
 void ff_deblock_v8_luma_8_mmi(uint8_t *pix, int stride, int alpha, int beta,
         int8_t *tc0)
 {
+    /* Backing variables for the scratch registers the asm asks GCC to allocate. */
+    double ftmp[12];
+    mips_reg addr[2];
+    uint64_t low32;
+
     __asm__ volatile (
-        "daddu $8, %[stride], %[stride]                 \r\n"
-        "xor $f16, $f16, $f16                           \r\n"
-        "daddu $9, %[stride], $8                        \r\n"
-        "daddiu %[alpha], %[alpha], -0x1                \r\n"
-        "dsubu $9, $0, $9                               \r\n"
-        "daddiu %[beta], %[beta], -0x1                  \r\n"
-        "daddu $9, $9, %[pix]                           \r\n"
-        "ldc1 $f4, 0x0(%[pix])                          \r\n"
-        "gsldxc1 $f0, 0x0($9, %[stride])                \r\n"
-        "gsldxc1 $f2, 0x0($9, $8)                       \r\n"
-        "gsldxc1 $f6, 0x0(%[pix], %[stride])            \r\n"
-        "mtc1 %[alpha], $f8                             \r\n"
-        "mtc1 %[beta], $f10                             \r\n"
-        "pshufh $f8, $f8, $f16                          \r\n"
-        "pshufh $f10, $f10, $f16                        \r\n"
-        "packushb $f8, $f8, $f8                         \r\n"
-        "packushb $f10, $f10, $f10                      \r\n"
-        "psubusb $f12, $f4, $f2                         \r\n"
-        "psubusb $f14, $f2, $f4                         \r\n"
-        "or $f14, $f14, $f12                            \r\n"
-        "psubusb $f12, $f2, $f0                         \r\n"
-        "psubusb $f14, $f14, $f8                        \r\n"
-        "psubusb $f8, $f0, $f2                          \r\n"
-        "or $f8, $f8, $f12                              \r\n"
-        "psubusb $f12, $f4, $f6                         \r\n"
-        "psubusb $f8, $f8, $f10                         \r\n"
-        "or $f14, $f14, $f8                             \r\n"
-        "psubusb $f8, $f6, $f4                          \r\n"
-        "or $f8, $f8, $f12                              \r\n"
-        "psubusb $f8, $f8, $f10                         \r\n"
-        "or $f14, $f14, $f8                             \r\n"
-        "pcmpeqb $f14, $f14, $f16                       \r\n"
-        "pcmpeqb $f6, $f6, $f6                          \r\n"
-        "gslwlc1 $f8, 0x3(%[tc0])                       \r\n"
-        "gslwrc1 $f8, 0x0(%[tc0])                       \r\n"
-        "punpcklbh $f8, $f8, $f8                        \r\n"
-        "punpcklbh $f18, $f8, $f8                       \r\n"
-        "pcmpgtb $f8, $f18, $f6                         \r\n"
-        "ldc1 $f6, 0x0($9)                              \r\n"
-        "and $f20, $f8, $f14                            \r\n"
-        "psubusb $f14, $f6, $f2                         \r\n"
-        "psubusb $f12, $f2, $f6                         \r\n"
-        "psubusb $f14, $f14, $f10                       \r\n"
-        "psubusb $f12, $f12, $f10                       \r\n"
-        "pcmpeqb $f12, $f12, $f14                       \r\n"
-        "and $f12, $f12, $f20                           \r\n"
-        "and $f8, $f20, $f18                            \r\n"
-        "psubb $f14, $f8, $f12                          \r\n"
-        "and $f12, $f12, $f8                            \r\n"
-        "pavgb $f8, $f2, $f4                            \r\n"
-        "ldc1 $f22, 0x0($9)                             \r\n"
-        "pavgb $f6, $f6, $f8                            \r\n"
-        "xor $f8, $f8, $f22                             \r\n"
-        "and $f8, $f8, %[ff_pb_1]                       \r\n"
-        "psubusb $f6, $f6, $f8                          \r\n"
-        "psubusb $f8, $f0, $f12                         \r\n"
-        "paddusb $f12, $f12, $f0                        \r\n"
-        "pmaxub $f6, $f6, $f8                           \r\n"
-        "pminub $f6, $f6, $f12                          \r\n"
-        "gssdxc1 $f6, 0x0($9, %[stride])                \r\n"
-        "gsldxc1 $f8, 0x0(%[pix], $8)                   \r\n"
-        "psubusb $f6, $f8, $f4                          \r\n"
-        "psubusb $f12, $f4, $f8                         \r\n"
-        "psubusb $f6, $f6, $f10                         \r\n"
-        "psubusb $f12, $f12, $f10                       \r\n"
-        "pcmpeqb $f12, $f12, $f6                        \r\n"
-        "and $f12, $f12, $f20                           \r\n"
-        "psubb $f14, $f14, $f12                         \r\n"
-        "and $f10, $f18, $f12                           \r\n"
-        "gsldxc1 $f6, 0x0(%[pix], %[stride])            \r\n"
-        "pavgb $f12, $f2, $f4                           \r\n"
-        "gsldxc1 $f22, 0x0(%[pix], $8)                  \r\n"
-        "pavgb $f8, $f8, $f12                           \r\n"
-        "xor $f12, $f12, $f22                           \r\n"
-        "and $f12, $f12, %[ff_pb_1]                     \r\n"
-        "psubusb $f8, $f8, $f12                         \r\n"
-        "psubusb $f12, $f6, $f10                        \r\n"
-        "paddusb $f10, $f10, $f6                        \r\n"
-        "pmaxub $f8, $f8, $f12                          \r\n"
-        "pminub $f8, $f8, $f10                          \r\n"
-        "gssdxc1 $f8, 0x0(%[pix], %[stride])            \r\n"
-        "xor $f10, $f2, $f4                             \r\n"
-        "pcmpeqb $f8, $f8, $f8                          \r\n"
-        "and $f10, $f10, %[ff_pb_1]                     \r\n"
-        "xor $f6, $f6, $f8                              \r\n"
-        "xor $f8, $f8, $f2                              \r\n"
-        "pavgb $f6, $f6, $f0                            \r\n"
-        "pavgb $f6, $f6, %[ff_pb_3]                     \r\n"
-        "pavgb $f8, $f8, $f4                            \r\n"
-        "pavgb $f6, $f6, $f10                           \r\n"
-        "paddusb $f6, $f6, $f8                          \r\n"
-        "psubusb $f12, %[ff_pb_A1], $f6                 \r\n"
-        "psubusb $f6, $f6, %[ff_pb_A1]                  \r\n"
-        "pminub $f12, $f12, $f14                        \r\n"
-        "pminub $f6, $f6, $f14                          \r\n"
-        "psubusb $f2, $f2, $f12                         \r\n"
-        "psubusb $f4, $f4, $f6                          \r\n"
-        "paddusb $f2, $f2, $f6                          \r\n"
-        "paddusb $f4, $f4, $f12                         \r\n"
-        "gssdxc1 $f2, 0x0($9, $8)                       \r\n"
-        "sdc1 $f4, 0x0(%[pix])                          \r\n"
-        ::[pix]"r"(pix),[stride]"r"((int64_t)stride),
-          [alpha]"r"((int64_t)alpha),[beta]"r"((int64_t)beta),[tc0]"r"(tc0),
-          [ff_pb_1]"f"(ff_pb_1),[ff_pb_3]"f"(ff_pb_3),[ff_pb_A1]"f"(ff_pb_A1)
-        : "$8","$9","$f0","$f2","$f4","$f6","$f8","$f10","$f12","$f14","$f16",
-          "$f18","$f20","$f22"
+        /* addr0 = 2 * stride, addr1 = pix - 3 * stride, ftmp0 = 0 */
+        PTR_ADDU   "%[addr0],   %[stride],      %[stride]               \n\t"
+        "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
+        PTR_ADDU   "%[addr1],   %[stride],      %[addr0]                \n\t"
+        PTR_SUBU   "%[addr1],   $0,             %[addr1]                \n\t"
+        PTR_ADDU   "%[addr1],   %[addr1],       %[pix]                  \n\t"
+        /* ftmp3 = row 0, ftmp1 = row -2, ftmp2 = row -1, ftmp4 = row +1 */
+        "ldc1       %[ftmp3],   0x00(%[pix])                            \n\t"
+        "gsldxc1    %[ftmp1],   0x00(%[addr1],  %[stride])              \n\t"
+        "gsldxc1    %[ftmp2],   0x00(%[addr1],  %[addr0])               \n\t"
+        "gsldxc1    %[ftmp4],   0x00(%[pix],    %[stride])              \n\t"
+        /* broadcast alpha - 1 / beta - 1 into every byte (unsigned saturated) */
+        "mtc1       %[alpha],   %[ftmp5]                                \n\t"
+        "mtc1       %[beta],    %[ftmp6]                                \n\t"
+        "pshufh     %[ftmp5],   %[ftmp5],       %[ftmp0]                \n\t"
+        "pshufh     %[ftmp6],   %[ftmp6],       %[ftmp0]                \n\t"
+        "packushb   %[ftmp5],   %[ftmp5],       %[ftmp5]                \n\t"
+        "packushb   %[ftmp6],   %[ftmp6],       %[ftmp6]                \n\t"
+        /* ftmp8 = 0xff per byte where all three threshold tests pass */
+        "psubusb    %[ftmp7],   %[ftmp3],       %[ftmp2]                \n\t"
+        "psubusb    %[ftmp8],   %[ftmp2],       %[ftmp3]                \n\t"
+        "or         %[ftmp8],   %[ftmp8],       %[ftmp7]                \n\t"
+        "psubusb    %[ftmp7],   %[ftmp2],       %[ftmp1]                \n\t"
+        "psubusb    %[ftmp8],   %[ftmp8],       %[ftmp5]                \n\t"
+        "psubusb    %[ftmp5],   %[ftmp1],       %[ftmp2]                \n\t"
+        "or         %[ftmp5],   %[ftmp5],       %[ftmp7]                \n\t"
+        "psubusb    %[ftmp7],   %[ftmp3],       %[ftmp4]                \n\t"
+        "psubusb    %[ftmp5],   %[ftmp5],       %[ftmp6]                \n\t"
+        "or         %[ftmp8],   %[ftmp8],       %[ftmp5]                \n\t"
+        "psubusb    %[ftmp5],   %[ftmp4],       %[ftmp3]                \n\t"
+        "or         %[ftmp5],   %[ftmp5],       %[ftmp7]                \n\t"
+        "psubusb    %[ftmp5],   %[ftmp5],       %[ftmp6]                \n\t"
+        "or         %[ftmp8],   %[ftmp8],       %[ftmp5]                \n\t"
+        "pcmpeqb    %[ftmp8],   %[ftmp8],       %[ftmp0]                \n\t"
+        "pcmpeqb    %[ftmp4],   %[ftmp4],       %[ftmp4]                \n\t"
+        /*
+         * Only the low 32 bits reach the FP register through mtc1, so a
+         * 32-bit unaligned load is sufficient and does not read beyond the
+         * four tc0 bytes.
+         */
+        "ulw        %[low32],   0x00(%[tc0])                            \n\t"
+        "mtc1       %[low32],   %[ftmp5]                                \n\t"
+        "punpcklbh  %[ftmp5],   %[ftmp5],       %[ftmp5]                \n\t"
+        "punpcklbh  %[ftmp9],   %[ftmp5],       %[ftmp5]                \n\t"
+        /* ftmp10 = threshold mask restricted to columns whose tc0 byte is > -1 */
+        "pcmpgtb    %[ftmp5],   %[ftmp9],       %[ftmp4]                \n\t"
+        "ldc1       %[ftmp4],   0x00(%[addr1])                          \n\t"
+        "and        %[ftmp10],  %[ftmp5],       %[ftmp8]                \n\t"
+        "psubusb    %[ftmp8],   %[ftmp4],       %[ftmp2]                \n\t"
+        "psubusb    %[ftmp7],   %[ftmp2],       %[ftmp4]                \n\t"
+        "psubusb    %[ftmp8],   %[ftmp8],       %[ftmp6]                \n\t"
+        "psubusb    %[ftmp7],   %[ftmp7],       %[ftmp6]                \n\t"
+        "pcmpeqb    %[ftmp7],   %[ftmp7],       %[ftmp8]                \n\t"
+        "and        %[ftmp7],   %[ftmp7],       %[ftmp10]               \n\t"
+        "and        %[ftmp5],   %[ftmp10],      %[ftmp9]                \n\t"
+        "psubb      %[ftmp8],   %[ftmp5],       %[ftmp7]                \n\t"
+        "and        %[ftmp7],   %[ftmp7],       %[ftmp5]                \n\t"
+        "pavgb      %[ftmp5],   %[ftmp2],       %[ftmp3]                \n\t"
+        "ldc1       %[ftmp11],  0x00(%[addr1])                          \n\t"
+        "pavgb      %[ftmp4],   %[ftmp4],       %[ftmp5]                \n\t"
+        "xor        %[ftmp5],   %[ftmp5],       %[ftmp11]               \n\t"
+        "and        %[ftmp5],   %[ftmp5],       %[ff_pb_1]              \n\t"
+        "psubusb    %[ftmp4],   %[ftmp4],       %[ftmp5]                \n\t"
+        "psubusb    %[ftmp5],   %[ftmp1],       %[ftmp7]                \n\t"
+        "paddusb    %[ftmp7],   %[ftmp7],       %[ftmp1]                \n\t"
+        "pmaxub     %[ftmp4],   %[ftmp4],       %[ftmp5]                \n\t"
+        "pminub     %[ftmp4],   %[ftmp4],       %[ftmp7]                \n\t"
+        /* write back the filtered row at pix - 2 * stride */
+        "gssdxc1    %[ftmp4],   0x00(%[addr1],  %[stride])              \n\t"
+        "gsldxc1    %[ftmp5],   0x00(%[pix],    %[addr0])               \n\t"
+        "psubusb    %[ftmp4],   %[ftmp5],       %[ftmp3]                \n\t"
+        "psubusb    %[ftmp7],   %[ftmp3],       %[ftmp5]                \n\t"
+        "psubusb    %[ftmp4],   %[ftmp4],       %[ftmp6]                \n\t"
+        "psubusb    %[ftmp7],   %[ftmp7],       %[ftmp6]                \n\t"
+        "pcmpeqb    %[ftmp7],   %[ftmp7],       %[ftmp4]                \n\t"
+        "and        %[ftmp7],   %[ftmp7],       %[ftmp10]               \n\t"
+        "psubb      %[ftmp8],   %[ftmp8],       %[ftmp7]                \n\t"
+        "and        %[ftmp6],   %[ftmp9],       %[ftmp7]                \n\t"
+        "gsldxc1    %[ftmp4],   0x00(%[pix],    %[stride])              \n\t"
+        "pavgb      %[ftmp7],   %[ftmp2],       %[ftmp3]                \n\t"
+        "gsldxc1    %[ftmp11],  0x00(%[pix],    %[addr0])               \n\t"
+        "pavgb      %[ftmp5],   %[ftmp5],       %[ftmp7]                \n\t"
+        "xor        %[ftmp7],   %[ftmp7],       %[ftmp11]               \n\t"
+        "and        %[ftmp7],   %[ftmp7],       %[ff_pb_1]              \n\t"
+        "psubusb    %[ftmp5],   %[ftmp5],       %[ftmp7]                \n\t"
+        "psubusb    %[ftmp7],   %[ftmp4],       %[ftmp6]                \n\t"
+        "paddusb    %[ftmp6],   %[ftmp6],       %[ftmp4]                \n\t"
+        "pmaxub     %[ftmp5],   %[ftmp5],       %[ftmp7]                \n\t"
+        "pminub     %[ftmp5],   %[ftmp5],       %[ftmp6]                \n\t"
+        /* write back the filtered row at pix + stride */
+        "gssdxc1    %[ftmp5],   0x00(%[pix],    %[stride])              \n\t"
+        "xor        %[ftmp6],   %[ftmp2],       %[ftmp3]                \n\t"
+        "pcmpeqb    %[ftmp5],   %[ftmp5],       %[ftmp5]                \n\t"
+        "and        %[ftmp6],   %[ftmp6],       %[ff_pb_1]              \n\t"
+        "xor        %[ftmp4],   %[ftmp4],       %[ftmp5]                \n\t"
+        "xor        %[ftmp5],   %[ftmp5],       %[ftmp2]                \n\t"
+        "pavgb      %[ftmp4],   %[ftmp4],       %[ftmp1]                \n\t"
+        "pavgb      %[ftmp4],   %[ftmp4],       %[ff_pb_3]              \n\t"
+        "pavgb      %[ftmp5],   %[ftmp5],       %[ftmp3]                \n\t"
+        "pavgb      %[ftmp4],   %[ftmp4],       %[ftmp6]                \n\t"
+        "paddusb    %[ftmp4],   %[ftmp4],       %[ftmp5]                \n\t"
+        "psubusb    %[ftmp7],   %[ff_pb_A1],    %[ftmp4]                \n\t"
+        "psubusb    %[ftmp4],   %[ftmp4],       %[ff_pb_A1]             \n\t"
+        "pminub     %[ftmp7],   %[ftmp7],       %[ftmp8]                \n\t"
+        "pminub     %[ftmp4],   %[ftmp4],       %[ftmp8]                \n\t"
+        "psubusb    %[ftmp2],   %[ftmp2],       %[ftmp7]                \n\t"
+        "psubusb    %[ftmp3],   %[ftmp3],       %[ftmp4]                \n\t"
+        "paddusb    %[ftmp2],   %[ftmp2],       %[ftmp4]                \n\t"
+        "paddusb    %[ftmp3],   %[ftmp3],       %[ftmp7]                \n\t"
+        /* write back the two rows adjacent to the edge: pix - stride and pix */
+        "gssdxc1    %[ftmp2],   0x00(%[addr1],  %[addr0])               \n\t"
+        "sdc1       %[ftmp3],   0x00(%[pix])                            \n\t"
+        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
+          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
+          [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
+          [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
+          [ftmp8]"=&f"(ftmp[8]),            [ftmp9]"=&f"(ftmp[9]),
+          [ftmp10]"=&f"(ftmp[10]),          [ftmp11]"=&f"(ftmp[11]),
+          [addr0]"=&r"(addr[0]),            [addr1]"=&r"(addr[1]),
+          [low32]"=&r"(low32)
+        /*
+         * alpha - 1 and beta - 1 are computed here in C so that the asm body
+         * never writes to an input-only operand.
+         */
+        : [pix]"r"(pix),                    [stride]"r"((mips_reg)stride),
+          [alpha]"r"((mips_reg)(alpha - 1)), [beta]"r"((mips_reg)(beta - 1)),
+          [tc0]"r"(tc0),                    [ff_pb_1]"f"(ff_pb_1),
+          [ff_pb_3]"f"(ff_pb_3),            [ff_pb_A1]"f"(ff_pb_A1)
+        : "memory"
     );
 }
 
-void ff_deblock_v8_luma_intra_8_mmi(uint8_t *pix, int stride, int alpha,
+static void deblock_v8_luma_intra_8_mmi(uint8_t *pix, int stride, int alpha,
         int beta)
 {
-    uint64_t stack[0xa];
-
-    __asm__ volatile (
-        "ori $8, $0, 0x1                                \r\n"
-        "xor $f30, $f30, $f30                           \r\n"
-        "dmtc1 $8, $f16                                 \r\n"
-        "dsll $8, %[stride], 2                          \r\n"
-        "daddu $10, %[stride], %[stride]                \r\n"
-        "daddiu %[alpha], %[alpha], -0x1                \r\n"
-        "dsll $f20, $f16, $f16                          \r\n"
-        "bltz %[alpha], 1f                              \r\n"
-        "daddu $9, $10, %[stride]                       \r\n"
-        "daddiu %[beta], %[beta], -0x1                  \r\n"
-        "bltz %[beta], 1f                               \r\n"
-        "dsubu $8, $0, $8                               \r\n"
-        "daddu $8, $8, %[pix]                           \r\n"
-        "ldc1 $f4, 0x0(%[pix])                          \r\n"
-        "gsldxc1 $f0, 0x0($8, $10)                      \r\n"
-        "gsldxc1 $f2, 0x0($8, $9)                       \r\n"
-        "gsldxc1 $f6, 0x0(%[pix], %[stride])            \r\n"
-        "mtc1 %[alpha], $f8                             \r\n"
-        "mtc1 %[beta], $f10                             \r\n"
-        "pshufh $f8, $f8, $f30                          \r\n"
-        "pshufh $f10, $f10, $f30                        \r\n"
-        "packushb $f8, $f8, $f8                         \r\n"
-        "psubusb $f12, $f4, $f2                         \r\n"
-        "psubusb $f14, $f2, $f4                         \r\n"
-        "packushb $f10, $f10, $f10                      \r\n"
-        "or $f14, $f14, $f12                            \r\n"
-        "sdc1 $f8, 0x10+%[stack]                        \r\n"
-        "psubusb $f14, $f14, $f8                        \r\n"
-        "psubusb $f12, $f2, $f0                         \r\n"
-        "psubusb $f8, $f0, $f2                          \r\n"
-        "or $f8, $f8, $f12                              \r\n"
-        "psubusb $f8, $f8, $f10                         \r\n"
-        "or $f14, $f14, $f8                             \r\n"
-        "psubusb $f12, $f4, $f6                         \r\n"
-        "psubusb $f8, $f6, $f4                          \r\n"
-        "or $f8, $f8, $f12                              \r\n"
-        "psubusb $f8, $f8, $f10                         \r\n"
-        "or $f14, $f14, $f8                             \r\n"
-        "xor $f12, $f12, $f12                           \r\n"
-        "ldc1 $f8, 0x10+%[stack]                        \r\n"
-        "pcmpeqb $f14, $f14, $f12                       \r\n"
-        "sdc1 $f14, 0x20+%[stack]                       \r\n"
-        "pavgb $f8, $f8, $f30                           \r\n"
-        "psubusb $f14, $f4, $f2                         \r\n"
-        "pavgb $f8, $f8, %[ff_pb_1]                     \r\n"
-        "psubusb $f12, $f2, $f4                         \r\n"
-        "psubusb $f14, $f14, $f8                        \r\n"
-        "psubusb $f12, $f12, $f8                        \r\n"
-        "ldc1 $f28, 0x20+%[stack]                       \r\n"
-        "pcmpeqb $f12, $f12, $f14                       \r\n"
-        "and $f12, $f12, $f28                           \r\n"
-        "gsldxc1 $f28, 0x0($8, %[stride])               \r\n"
-        "psubusb $f14, $f28, $f2                        \r\n"
-        "psubusb $f8, $f2, $f28                         \r\n"
-        "psubusb $f14, $f14, $f10                       \r\n"
-        "psubusb $f8, $f8, $f10                         \r\n"
-        "pcmpeqb $f8, $f8, $f14                         \r\n"
-        "and $f8, $f8, $f12                             \r\n"
-        "gsldxc1 $f26, 0x0(%[pix], $10)                 \r\n"
-        "sdc1 $f8, 0x30+%[stack]                        \r\n"
-        "psubusb $f14, $f26, $f4                        \r\n"
-        "psubusb $f8, $f4, $f26                         \r\n"
-        "psubusb $f14, $f14, $f10                       \r\n"
-        "psubusb $f8, $f8, $f10                         \r\n"
-        "pcmpeqb $f8, $f8, $f14                         \r\n"
-        "and $f8, $f8, $f12                             \r\n"
-        "sdc1 $f8, 0x40+%[stack]                        \r\n"
-        "pavgb $f8, $f28, $f0                           \r\n"
-        "pavgb $f10, $f2, $f4                           \r\n"
-        "pavgb $f8, $f8, $f10                           \r\n"
-        "sdc1 $f10, 0x10+%[stack]                       \r\n"
-        "paddb $f12, $f28, $f0                          \r\n"
-        "paddb $f14, $f2, $f4                           \r\n"
-        "paddb $f12, $f12, $f14                         \r\n"
-        "mov.d $f14, $f12                               \r\n"
-        "sdc1 $f12, 0x0+%[stack]                        \r\n"
-        "psrlh $f12, $f12, $f16                         \r\n"
-        "pavgb $f12, $f12, $f30                         \r\n"
-        "xor $f12, $f12, $f8                            \r\n"
-        "and $f12, $f12, %[ff_pb_1]                     \r\n"
-        "psubb $f8, $f8, $f12                           \r\n"
-        "pavgb $f10, $f28, $f6                          \r\n"
-        "psubb $f12, $f28, $f6                          \r\n"
-        "paddb $f14, $f14, $f14                         \r\n"
-        "psubb $f14, $f14, $f12                         \r\n"
-        "and $f12, $f12, %[ff_pb_1]                     \r\n"
-        "psubb $f10, $f10, $f12                         \r\n"
-        "ldc1 $f24, 0x10+%[stack]                       \r\n"
-        "pavgb $f10, $f10, $f0                          \r\n"
-        "psrlh $f14, $f14, $f20                         \r\n"
-        "pavgb $f10, $f10, $f24                         \r\n"
-        "pavgb $f14, $f14, $f30                         \r\n"
-        "xor $f14, $f14, $f10                           \r\n"
-        "and $f14, $f14, %[ff_pb_1]                     \r\n"
-        "psubb $f10, $f10, $f14                         \r\n"
-        "xor $f14, $f2, $f6                             \r\n"
-        "pavgb $f12, $f2, $f6                           \r\n"
-        "and $f14, $f14, %[ff_pb_1]                     \r\n"
-        "psubb $f12, $f12, $f14                         \r\n"
-        "ldc1 $f24, 0x30+%[stack]                       \r\n"
-        "pavgb $f12, $f12, $f0                          \r\n"
-        "ldc1 $f22, 0x20+%[stack]                       \r\n"
-        "xor $f10, $f10, $f12                           \r\n"
-        "xor $f12, $f12, $f2                            \r\n"
-        "and $f10, $f10, $f24                           \r\n"
-        "and $f12, $f12, $f22                           \r\n"
-        "xor $f10, $f10, $f12                           \r\n"
-        "xor $f10, $f10, $f2                            \r\n"
-        "gssdxc1 $f10, 0x0($8, $9)                      \r\n"
-        "ldc1 $f10, 0x0($8)                             \r\n"
-        "paddb $f12, $f28, $f10                         \r\n"
-        "pavgb $f10, $f10, $f28                         \r\n"
-        "ldc1 $f22, 0x0+%[stack]                        \r\n"
-        "pavgb $f10, $f10, $f8                          \r\n"
-        "paddb $f12, $f12, $f12                         \r\n"
-        "paddb $f12, $f12, $f22                         \r\n"
-        "psrlh $f12, $f12, $f20                         \r\n"
-        "pavgb $f12, $f12, $f30                         \r\n"
-        "xor $f12, $f12, $f10                           \r\n"
-        "and $f12, $f12, %[ff_pb_1]                     \r\n"
-        "ldc1 $f22, 0x30+%[stack]                       \r\n"
-        "psubb $f10, $f10, $f12                         \r\n"
-        "xor $f8, $f8, $f0                              \r\n"
-        "xor $f10, $f10, $f28                           \r\n"
-        "and $f8, $f8, $f22                             \r\n"
-        "and $f10, $f10, $f22                           \r\n"
-        "xor $f8, $f8, $f0                              \r\n"
-        "xor $f10, $f10, $f28                           \r\n"
-        "gssdxc1 $f8, 0x0($8, $10)                      \r\n"
-        "gssdxc1 $f10, 0x0($8, %[stride])               \r\n"
-        "pavgb $f8, $f26, $f6                           \r\n"
-        "pavgb $f10, $f4, $f2                           \r\n"
-        "pavgb $f8, $f8, $f10                           \r\n"
-        "sdc1 $f10, 0x10+%[stack]                       \r\n"
-        "paddb $f12, $f26, $f6                          \r\n"
-        "paddb $f14, $f4, $f2                           \r\n"
-        "paddb $f12, $f12, $f14                         \r\n"
-        "mov.d $f14, $f12                               \r\n"
-        "sdc1 $f12, 0x0+%[stack]                        \r\n"
-        "psrlh $f12, $f12, $f16                         \r\n"
-        "pavgb $f12, $f12, $f30                         \r\n"
-        "xor $f12, $f12, $f8                            \r\n"
-        "and $f12, $f12, %[ff_pb_1]                     \r\n"
-        "psubb $f8, $f8, $f12                           \r\n"
-        "pavgb $f10, $f26, $f0                          \r\n"
-        "paddb $f14, $f14, $f14                         \r\n"
-        "psubb $f12, $f26, $f0                          \r\n"
-        "psubb $f14, $f14, $f12                         \r\n"
-        "and $f12, $f12, %[ff_pb_1]                     \r\n"
-        "psubb $f10, $f10, $f12                         \r\n"
-        "ldc1 $f22, 0x10+%[stack]                       \r\n"
-        "pavgb $f10, $f10, $f6                          \r\n"
-        "pavgb $f10, $f10, $f22                         \r\n"
-        "psrlh $f14, $f14, $f20                         \r\n"
-        "pavgb $f14, $f14, $f30                         \r\n"
-        "xor $f14, $f14, $f10                           \r\n"
-        "and $f14, $f14, %[ff_pb_1]                     \r\n"
-        "psubb $f10, $f10, $f14                         \r\n"
-        "xor $f14, $f4, $f0                             \r\n"
-        "pavgb $f12, $f4, $f0                           \r\n"
-        "and $f14, $f14, %[ff_pb_1]                     \r\n"
-        "ldc1 $f22, 0x40+%[stack]                       \r\n"
-        "psubb $f12, $f12, $f14                         \r\n"
-        "ldc1 $f24, 0x20+%[stack]                       \r\n"
-        "pavgb $f12, $f12, $f6                          \r\n"
-        "xor $f10, $f10, $f12                           \r\n"
-        "xor $f12, $f12, $f4                            \r\n"
-        "and $f10, $f10, $f22                           \r\n"
-        "and $f12, $f12, $f24                           \r\n"
-        "xor $f10, $f10, $f12                           \r\n"
-        "xor $f10, $f10, $f4                            \r\n"
-        "sdc1 $f10, 0x0(%[pix])                         \r\n"
-        "gsldxc1 $f10, 0x0(%[pix], $9)                  \r\n"
-        "paddb $f12, $f26, $f10                         \r\n"
-        "pavgb $f10, $f10, $f26                         \r\n"
-        "ldc1 $f22, 0x0+%[stack]                        \r\n"
-        "pavgb $f10, $f10, $f8                          \r\n"
-        "paddb $f12, $f12, $f12                         \r\n"
-        "paddb $f12, $f12, $f22                         \r\n"
-        "psrlh $f12, $f12, $f20                         \r\n"
-        "pavgb $f12, $f12, $f30                         \r\n"
-        "xor $f12, $f12, $f10                           \r\n"
-        "and $f12, $f12, %[ff_pb_1]                     \r\n"
-        "ldc1 $f22, 0x40+%[stack]                       \r\n"
-        "psubb $f10, $f10, $f12                         \r\n"
-        "xor $f8, $f8, $f6                              \r\n"
-        "xor $f10, $f10, $f26                           \r\n"
-        "and $f8, $f8, $f22                             \r\n"
-        "and $f10, $f10, $f22                           \r\n"
-        "xor $f8, $f8, $f6                              \r\n"
-        "xor $f10, $f10, $f26                           \r\n"
-        "gssdxc1 $f8, 0x0(%[pix], %[stride])            \r\n"
-        "gssdxc1 $f10, 0x0(%[pix], $10)                 \r\n"
-        "1:                                             \r\n"
-        ::[pix]"r"(pix),[stride]"r"((int64_t)stride),
-          [alpha]"r"((int64_t)alpha),[beta]"r"((int64_t)beta),
-          [stack]"m"(stack[0]),[ff_pb_1]"f"(ff_pb_1)
-        : "$8","$9","$10","$f0","$f2","$f4","$f6","$f8","$f10","$f12","$f14",
-          "$f16","$f18","$f20","$f22","$f24","$f26","$f28","$f30"
-    );
+    DECLARE_ALIGNED(8, const uint64_t, stack[0x0a]);
+    double ftmp[16];
+    uint64_t tmp[1];
+    mips_reg addr[3];
+
+__asm__ volatile (
+"ori        %[tmp0],    $0,             0x01                    \n\t"
+"xor        %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
+"mtc1       %[tmp0],    %[ftmp9]                                \n\t"
+PTR_SLL    "%[addr0],   %[stride],      0x02                    \n\t"
+PTR_ADDU   "%[addr2],   %[stride],      %[stride]               \n\t"
+PTR_ADDIU  "%[alpha],   %[alpha],       -0x01                   \n\t"
+PTR_SLL    "%[ftmp11],  %[ftmp9],       %[ftmp9]                \n\t"
+"bltz       %[alpha],   1f                                      \n\t"
+PTR_ADDU   "%[addr1],   %[addr2],       %[stride]               \n\t"
+PTR_ADDIU  "%[beta],    %[beta],        -0x01                   \n\t"
+"bltz       %[beta],    1f                                      \n\t"
+PTR_SUBU   "%[addr0],   $0,             %[addr0]                \n\t"
+PTR_ADDU   "%[addr0],   %[addr0],       %[pix]                  \n\t"
+"ldc1       %[ftmp3],   0x00(%[pix])                            \n\t"
+"gsldxc1    %[ftmp1],   0x00(%[addr0],  %[addr2])               \n\t"
+"gsldxc1    %[ftmp2],   0x00(%[addr0],  %[addr1])               \n\t"
+"gsldxc1    %[ftmp4],   0x00(%[pix],    %[stride])              \n\t"
+"mtc1       %[alpha],   %[ftmp5]                                \n\t"
+"mtc1       %[beta],    %[ftmp6]                                \n\t"
+"pshufh     %[ftmp5],   %[ftmp5],       %[ftmp0]                \n\t"
+"pshufh     %[ftmp6],   %[ftmp6],       %[ftmp0]                \n\t"
+"packushb   %[ftmp5],   %[ftmp5],       %[ftmp5]                \n\t"
+"psubusb    %[ftmp7],   %[ftmp3],       %[ftmp2]                \n\t"
+"psubusb    %[ftmp8],   %[ftmp2],       %[ftmp3]                \n\t"
+"packushb   %[ftmp6],   %[ftmp6],       %[ftmp6]                \n\t"
+"or         %[ftmp8],   %[ftmp8],       %[ftmp7]                \n\t"
+"sdc1       %[ftmp5],   0x10+%[stack]                           \n\t"
+"psubusb    %[ftmp8],   %[ftmp8],       %[ftmp5]                \n\t"
+"psubusb    %[ftmp7],   %[ftmp2],       %[ftmp1]                \n\t"
+"psubusb    %[ftmp5],   %[ftmp1],       %[ftmp2]                \n\t"
+"or         %[ftmp5],   %[ftmp5],       %[ftmp7]                \n\t"
+"psubusb    %[ftmp5],   %[ftmp5],       %[ftmp6]                \n\t"
+"or         %[ftmp8],   %[ftmp8],       %[ftmp5]                \n\t"
+"psubusb    %[ftmp7],   %[ftmp3],       %[ftmp4]                \n\t"
+"psubusb    %[ftmp5],   %[ftmp4],       %[ftmp3]                \n\t"
+"or         %[ftmp5],   %[ftmp5],       %[ftmp7]                \n\t"
+"psubusb    %[ftmp5],   %[ftmp5],       %[ftmp6]                \n\t"
+"or         %[ftmp8],   %[ftmp8],       %[ftmp5]                \n\t"
+"xor        %[ftmp7],   %[ftmp7],       %[ftmp7]                \n\t"
+"ldc1       %[ftmp5],   0x10+%[stack]                           \n\t"
+"pcmpeqb    %[ftmp8],   %[ftmp8],       %[ftmp7]                \n\t"
+"ldc1       %[ftmp10],  %[ff_pb_1]                              \n\t"
+"sdc1       %[ftmp8],   0x20+%[stack]                           \n\t"
+"pavgb      %[ftmp5],   %[ftmp5],       %[ftmp0]                \n\t"
+"psubusb    %[ftmp8],   %[ftmp3],       %[ftmp2]                \n\t"
+"pavgb      %[ftmp5],   %[ftmp5],       %[ftmp10]               \n\t"
+"psubusb    %[ftmp7],   %[ftmp2],       %[ftmp3]                \n\t"
+"psubusb    %[ftmp8],   %[ftmp8],       %[ftmp5]                \n\t"
+"psubusb    %[ftmp7],   %[ftmp7],       %[ftmp5]                \n\t"
+"ldc1       %[ftmp15],  0x20+%[stack]                           \n\t"
+"pcmpeqb    %[ftmp7],   %[ftmp7],       %[ftmp8]                \n\t"
+"and        %[ftmp7],   %[ftmp7],       %[ftmp15]               \n\t"
+"gsldxc1    %[ftmp15],  0x00(%[addr0],  %[stride])              \n\t"
+"psubusb    %[ftmp8],   %[ftmp15],      %[ftmp2]                \n\t"
+"psubusb    %[ftmp5],   %[ftmp2],       %[ftmp15]               \n\t"
+"psubusb    %[ftmp8],   %[ftmp8],       %[ftmp6]                \n\t"
+"psubusb    %[ftmp5],   %[ftmp5],       %[ftmp6]                \n\t"
+"pcmpeqb    %[ftmp5],   %[ftmp5],       %[ftmp8]                \n\t"
+"and        %[ftmp5],   %[ftmp5],       %[ftmp7]                \n\t"
+"gsldxc1    %[ftmp14],  0x00(%[pix],    %[addr2])               \n\t"
+"sdc1       %[ftmp5],   0x30+%[stack]                           \n\t"
+"psubusb    %[ftmp8],   %[ftmp14],      %[ftmp3]                \n\t"
+"psubusb    %[ftmp5],   %[ftmp3],       %[ftmp14]               \n\t"
+"psubusb    %[ftmp8],   %[ftmp8],       %[ftmp6]                \n\t"
+"psubusb    %[ftmp5],   %[ftmp5],       %[ftmp6]                \n\t"
+"pcmpeqb    %[ftmp5],   %[ftmp5],       %[ftmp8]                \n\t"
+"and        %[ftmp5],   %[ftmp5],       %[ftmp7]                \n\t"
+"sdc1       %[ftmp5],   0x40+%[stack]                           \n\t"
+"pavgb      %[ftmp5],   %[ftmp15],      %[ftmp1]                \n\t"
+"pavgb      %[ftmp6],   %[ftmp2],       %[ftmp3]                \n\t"
+"pavgb      %[ftmp5],   %[ftmp5],       %[ftmp6]                \n\t"
+"sdc1       %[ftmp6],   0x10+%[stack]                           \n\t"
+"paddb      %[ftmp7],   %[ftmp15],      %[ftmp1]                \n\t"
+"paddb      %[ftmp8],   %[ftmp2],       %[ftmp3]                \n\t"
+"paddb      %[ftmp7],   %[ftmp7],       %[ftmp8]                \n\t"
+"mov.d      %[ftmp8],   %[ftmp7]                                \n\t"
+"sdc1       %[ftmp7],   0x00+%[stack]                           \n\t"
+"psrlh      %[ftmp7],   %[ftmp7],       %[ftmp9]                \n\t"
+"pavgb      %[ftmp7],   %[ftmp7],       %[ftmp0]                \n\t"
+"xor        %[ftmp7],   %[ftmp7],       %[ftmp5]                \n\t"
+"and        %[ftmp7],   %[ftmp7],       %[ftmp10]               \n\t"
+"psubb      %[ftmp5],   %[ftmp5],       %[ftmp7]                \n\t"
+"pavgb      %[ftmp6],   %[ftmp15],      %[ftmp4]                \n\t"
+"psubb      %[ftmp7],   %[ftmp15],      %[ftmp4]                \n\t"
+"paddb      %[ftmp8],   %[ftmp8],       %[ftmp8]                \n\t"
+"psubb      %[ftmp8],   %[ftmp8],       %[ftmp7]                \n\t"
+"and        %[ftmp7],   %[ftmp7],       %[ftmp10]               \n\t"
+"psubb      %[ftmp6],   %[ftmp6],       %[ftmp7]                \n\t"
+"ldc1       %[ftmp13],  0x10+%[stack]                           \n\t"
+"pavgb      %[ftmp6],   %[ftmp6],       %[ftmp1]                \n\t"
+"psrlh      %[ftmp8],   %[ftmp8],       %[ftmp11]               \n\t"
+"pavgb      %[ftmp6],   %[ftmp6],       %[ftmp13]               \n\t"
+"pavgb      %[ftmp8],   %[ftmp8],       %[ftmp0]                \n\t"
+"xor        %[ftmp8],   %[ftmp8],       %[ftmp6]                \n\t"
+"and        %[ftmp8],   %[ftmp8],       %[ftmp10]               \n\t"
+"psubb      %[ftmp6],   %[ftmp6],       %[ftmp8]                \n\t"
+"xor        %[ftmp8],   %[ftmp2],       %[ftmp4]                \n\t"
+"pavgb      %[ftmp7],   %[ftmp2],       %[ftmp4]                \n\t"
+"and        %[ftmp8],   %[ftmp8],       %[ftmp10]               \n\t"
+"psubb      %[ftmp7],   %[ftmp7],       %[ftmp8]                \n\t"
+"ldc1       %[ftmp13],  0x30+%[stack]                           \n\t"
+"pavgb      %[ftmp7],   %[ftmp7],       %[ftmp1]                \n\t"
+"ldc1       %[ftmp12],  0x20+%[stack]                           \n\t"
+"xor        %[ftmp6],   %[ftmp6],       %[ftmp7]                \n\t"
+"xor        %[ftmp7],   %[ftmp7],       %[ftmp2]                \n\t"
+"and        %[ftmp6],   %[ftmp6],       %[ftmp13]               \n\t"
+"and        %[ftmp7],   %[ftmp7],       %[ftmp12]               \n\t"
+"xor        %[ftmp6],   %[ftmp6],       %[ftmp7]                \n\t"
+"xor        %[ftmp6],   %[ftmp6],       %[ftmp2]                \n\t"
+"gssdxc1    %[ftmp6],   0x00(%[addr0],  %[addr1])               \n\t"
+"ldc1       %[ftmp6],   0x00(%[addr0])                          \n\t"
+"paddb      %[ftmp7],   %[ftmp15],      %[ftmp6]                \n\t"
+"pavgb      %[ftmp6],   %[ftmp6],       %[ftmp15]               \n\t"
+"ldc1       %[ftmp12],  0x00+%[stack]                           \n\t"
+"pavgb      %[ftmp6],   %[ftmp6],       %[ftmp5]                \n\t"
+"paddb      %[ftmp7],   %[ftmp7],       %[ftmp7]                \n\t"
+"paddb      %[ftmp7],   %[ftmp7],       %[ftmp12]               \n\t"
+"psrlh      %[ftmp7],   %[ftmp7],       %[ftmp11]               \n\t"
+"pavgb      %[ftmp7],   %[ftmp7],       %[ftmp0]                \n\t"
+"xor        %[ftmp7],   %[ftmp7],       %[ftmp6]                \n\t"
+"and        %[ftmp7],   %[ftmp7],       %[ftmp10]               \n\t"
+"ldc1       %[ftmp12],  0x30+%[stack]                           \n\t"
+"psubb      %[ftmp6],   %[ftmp6],       %[ftmp7]                \n\t"
+"xor        %[ftmp5],   %[ftmp5],       %[ftmp1]                \n\t"
+"xor        %[ftmp6],   %[ftmp6],       %[ftmp15]               \n\t"
+"and        %[ftmp5],   %[ftmp5],       %[ftmp12]               \n\t"
+"and        %[ftmp6],   %[ftmp6],       %[ftmp12]               \n\t"
+"xor        %[ftmp5],   %[ftmp5],       %[ftmp1]                \n\t"
+"xor        %[ftmp6],   %[ftmp6],       %[ftmp15]               \n\t"
+"gssdxc1    %[ftmp5],   0x00(%[addr0],  %[addr2])               \n\t"
+"gssdxc1    %[ftmp6],   0x00(%[addr0],  %[stride])              \n\t"
+"pavgb      %[ftmp5],   %[ftmp14],      %[ftmp4]                \n\t"
+"pavgb      %[ftmp6],   %[ftmp3],       %[ftmp2]                \n\t"
+"pavgb      %[ftmp5],   %[ftmp5],       %[ftmp6]                \n\t"
+"sdc1       %[ftmp6],   0x10+%[stack]                           \n\t"
+"paddb      %[ftmp7],   %[ftmp14],      %[ftmp4]                \n\t"
+"paddb      %[ftmp8],   %[ftmp3],       %[ftmp2]                \n\t"
+"paddb      %[ftmp7],   %[ftmp7],       %[ftmp8]                \n\t"
+"mov.d      %[ftmp8],   %[ftmp7]                                \n\t"
+"sdc1       %[ftmp7],   0x00+%[stack]                           \n\t"
+"psrlh      %[ftmp7],   %[ftmp7],       %[ftmp9]                \n\t"
+"pavgb      %[ftmp7],   %[ftmp7],       %[ftmp0]                \n\t"
+"xor        %[ftmp7],   %[ftmp7],       %[ftmp5]                \n\t"
+"and        %[ftmp7],   %[ftmp7],       %[ftmp10]               \n\t"
+"psubb      %[ftmp5],   %[ftmp5],       %[ftmp7]                \n\t"
+"pavgb      %[ftmp6],   %[ftmp14],      %[ftmp1]                \n\t"
+"paddb      %[ftmp8],   %[ftmp8],       %[ftmp8]                \n\t"
+"psubb      %[ftmp7],   %[ftmp14],      %[ftmp1]                \n\t"
+"psubb      %[ftmp8],   %[ftmp8],       %[ftmp7]                \n\t"
+"and        %[ftmp7],   %[ftmp7],       %[ftmp10]               \n\t"
+"psubb      %[ftmp6],   %[ftmp6],       %[ftmp7]                \n\t"
+"ldc1       %[ftmp12],  0x10+%[stack]                           \n\t"
+"pavgb      %[ftmp6],   %[ftmp6],       %[ftmp4]                \n\t"
+"pavgb      %[ftmp6],   %[ftmp6],       %[ftmp12]               \n\t"
+"psrlh      %[ftmp8],   %[ftmp8],       %[ftmp11]               \n\t"
+"pavgb      %[ftmp8],   %[ftmp8],       %[ftmp0]                \n\t"
+"xor        %[ftmp8],   %[ftmp8],       %[ftmp6]                \n\t"
+"and        %[ftmp8],   %[ftmp8],       %[ftmp10]               \n\t"
+"psubb      %[ftmp6],   %[ftmp6],       %[ftmp8]                \n\t"
+"xor        %[ftmp8],   %[ftmp3],       %[ftmp1]                \n\t"
+"pavgb      %[ftmp7],   %[ftmp3],       %[ftmp1]                \n\t"
+"and        %[ftmp8],   %[ftmp8],       %[ftmp10]               \n\t"
+"ldc1       %[ftmp12],  0x40+%[stack]                           \n\t"
+"psubb      %[ftmp7],   %[ftmp7],       %[ftmp8]                \n\t"
+"ldc1       %[ftmp13],  0x20+%[stack]                           \n\t"
+"pavgb      %[ftmp7],   %[ftmp7],       %[ftmp4]                \n\t"
+"xor        %[ftmp6],   %[ftmp6],       %[ftmp7]                \n\t"
+"xor        %[ftmp7],   %[ftmp7],       %[ftmp3]                \n\t"
+"and        %[ftmp6],   %[ftmp6],       %[ftmp12]               \n\t"
+"and        %[ftmp7],   %[ftmp7],       %[ftmp13]               \n\t"
+"xor        %[ftmp6],   %[ftmp6],       %[ftmp7]                \n\t"
+"xor        %[ftmp6],   %[ftmp6],       %[ftmp3]                \n\t"
+"sdc1       %[ftmp6],   0x00(%[pix])                            \n\t"
+"gsldxc1    %[ftmp6],   0x00(%[pix],    %[addr1])               \n\t"
+"paddb      %[ftmp7],   %[ftmp14],      %[ftmp6]                \n\t"
+"pavgb      %[ftmp6],   %[ftmp6],       %[ftmp14]               \n\t"
+"ldc1       %[ftmp12],  0x00+%[stack]                           \n\t"
+"pavgb      %[ftmp6],   %[ftmp6],       %[ftmp5]                \n\t"
+"paddb      %[ftmp7],   %[ftmp7],       %[ftmp7]                \n\t"
+"paddb      %[ftmp7],   %[ftmp7],       %[ftmp12]               \n\t"
+"psrlh      %[ftmp7],   %[ftmp7],       %[ftmp11]               \n\t"
+"pavgb      %[ftmp7],   %[ftmp7],       %[ftmp0]                \n\t"
+"xor        %[ftmp7],   %[ftmp7],       %[ftmp6]                \n\t"
+"and        %[ftmp7],   %[ftmp7],       %[ftmp10]               \n\t"
+"ldc1       %[ftmp12],  0x40+%[stack]                           \n\t"
+"psubb      %[ftmp6],   %[ftmp6],       %[ftmp7]                \n\t"
+"xor        %[ftmp5],   %[ftmp5],       %[ftmp4]                \n\t"
+"xor        %[ftmp6],   %[ftmp6],       %[ftmp14]               \n\t"
+"and        %[ftmp5],   %[ftmp5],       %[ftmp12]               \n\t"
+"and        %[ftmp6],   %[ftmp6],       %[ftmp12]               \n\t"
+"xor        %[ftmp5],   %[ftmp5],       %[ftmp4]                \n\t"
+"xor        %[ftmp6],   %[ftmp6],       %[ftmp14]               \n\t"
+"gssdxc1    %[ftmp5],   0x00(%[pix],    %[stride])              \n\t"
+"gssdxc1    %[ftmp6],   0x00(%[pix],    %[addr2])               \n\t"
+"1:                                                             \n\t"
+        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
+          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
+          [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
+          [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
+          [ftmp8]"=&f"(ftmp[8]),            [ftmp9]"=&f"(ftmp[9]),
+          [ftmp10]"=&f"(ftmp[10]),          [ftmp11]"=&f"(ftmp[11]),
+          [ftmp12]"=&f"(ftmp[12]),          [ftmp13]"=&f"(ftmp[13]),
+          [ftmp14]"=&f"(ftmp[14]),          [ftmp15]"=&f"(ftmp[15]),
+  [tmp0]"=&r"(tmp[0]),
+  [addr0]"=&r"(addr[0]),            [addr1]"=&r"(addr[1]),
+  [addr2]"=&r"(addr[2]),
+  [alpha]"+&r"(alpha),              [beta]"+&r"(beta)
+        : [pix]"r"(pix),                    [stride]"r"((mips_reg)stride),
+  [stack]"m"(stack[0]),             [ff_pb_1]"m"(ff_pb_1)
+: "memory"
+);
 }
 
 void ff_deblock_v_chroma_8_mmi(uint8_t *pix, int stride, int alpha, int beta,
         int8_t *tc0)
 {
-    __asm__ volatile (
-        "daddiu %[alpha], %[alpha], -0x1                \r\n"
-        "daddiu %[beta], %[beta], -0x1                  \r\n"
-        "or $16, $0, %[pix]                             \r\n"
-        "dsubu $16, $16, %[stride]                      \r\n"
-        "dsubu $16, $16, %[stride]                      \r\n"
-        "ldc1 $f0, 0x0($16)                             \r\n"
-        "gsldxc1 $f2, 0x0($16, %[stride])               \r\n"
-        "ldc1 $f4, 0x0(%[pix])                          \r\n"
-        "gsldxc1 $f6, 0x0(%[pix], %[stride])            \r\n"
-        : [pix]"+r"(pix),[stride]"+r"(stride),[alpha]"+r"(alpha),
-          [beta]"+r"(beta)
-        : [tc0]"r"(tc0)
-        : "$16","$f2","$f4"
-    );
-
-    chroma_inter_body_mmi(pix, stride, alpha, beta, tc0);
+    double ftmp[9];
+    mips_reg addr[1];
+    uint64_t low32;
 
     __asm__ volatile (
-        "gssdxc1 $f2, 0x0($16, %[stride])               \r\n"
-        "sdc1 $f4, 0x0(%[pix])                          \r\n"
-        ::[pix]"r"(pix),[stride]"r"((int64_t)stride)
-        : "$16","$f2","$f4"
+        "addi       %[alpha],   %[alpha],       -0x01                   \n\t"
+        "addi       %[beta],    %[beta],        -0x01                   \n\t"
+        "or         %[addr0],   $0,             %[pix]                  \n\t"
+        PTR_SUBU   "%[addr0],   %[addr0],       %[stride]               \n\t"
+        PTR_SUBU   "%[addr0],   %[addr0],       %[stride]               \n\t"
+        "ldc1       %[ftmp1],   0x00(%[addr0])                          \n\t"
+        "gsldxc1    %[ftmp2],   0x00(%[addr0],  %[stride])              \n\t"
+        "ldc1       %[ftmp3],   0x00(%[pix])                            \n\t"
+        "gsldxc1    %[ftmp4],   0x00(%[pix],    %[stride])              \n\t"
+
+        "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
+        "mtc1       %[alpha],   %[ftmp5]                                \n\t"
+        "mtc1       %[beta],    %[ftmp6]                                \n\t"
+        "pshufh     %[ftmp5],   %[ftmp5],       %[ftmp0]                \n\t"
+        "pshufh     %[ftmp6],   %[ftmp6],       %[ftmp0]                \n\t"
+        "packushb   %[ftmp5],   %[ftmp5],       %[ftmp5]                \n\t"
+        "packushb   %[ftmp6],   %[ftmp6],       %[ftmp6]                \n\t"
+        "psubusb    %[ftmp7],   %[ftmp3],       %[ftmp2]                \n\t"
+        "psubusb    %[ftmp8],   %[ftmp2],       %[ftmp3]                \n\t"
+        "or         %[ftmp8],   %[ftmp8],       %[ftmp7]                \n\t"
+        "psubusb    %[ftmp8],   %[ftmp8],       %[ftmp5]                \n\t"
+        "psubusb    %[ftmp7],   %[ftmp2],       %[ftmp1]                \n\t"
+        "psubusb    %[ftmp5],   %[ftmp1],       %[ftmp2]                \n\t"
+        "or         %[ftmp5],   %[ftmp5],       %[ftmp7]                \n\t"
+        "psubusb    %[ftmp5],   %[ftmp5],       %[ftmp6]                \n\t"
+        "or         %[ftmp8],   %[ftmp8],       %[ftmp5]                \n\t"
+        "psubusb    %[ftmp7],   %[ftmp3],       %[ftmp4]                \n\t"
+        "psubusb    %[ftmp5],   %[ftmp4],       %[ftmp3]                \n\t"
+        "or         %[ftmp5],   %[ftmp5],       %[ftmp7]                \n\t"
+        "psubusb    %[ftmp5],   %[ftmp5],       %[ftmp6]                \n\t"
+        "or         %[ftmp8],   %[ftmp8],       %[ftmp5]                \n\t"
+        "xor        %[ftmp7],   %[ftmp7],       %[ftmp7]                \n\t"
+        "pcmpeqb    %[ftmp8],   %[ftmp8],       %[ftmp7]                \n\t"
+        "uld        %[low32],   0x00(%[tc0])                            \n\t"
+        "mtc1       %[low32],   %[ftmp7]                                \n\t"
+        "punpcklbh  %[ftmp7],   %[ftmp7],       %[ftmp7]                \n\t"
+        "and        %[ftmp8],   %[ftmp8],       %[ftmp7]                \n\t"
+        "pcmpeqb    %[ftmp5],   %[ftmp5],       %[ftmp5]                \n\t"
+        "xor        %[ftmp6],   %[ftmp2],       %[ftmp3]                \n\t"
+        "xor        %[ftmp4],   %[ftmp4],       %[ftmp5]                \n\t"
+        "and        %[ftmp6],   %[ftmp6],       %[ff_pb_1]              \n\t"
+        "pavgb      %[ftmp4],   %[ftmp4],       %[ftmp1]                \n\t"
+        "xor        %[ftmp5],   %[ftmp5],       %[ftmp2]                \n\t"
+        "pavgb      %[ftmp4],   %[ftmp4],       %[ff_pb_3]              \n\t"
+        "pavgb      %[ftmp5],   %[ftmp5],       %[ftmp3]                \n\t"
+        "pavgb      %[ftmp4],   %[ftmp4],       %[ftmp6]                \n\t"
+        "paddusb    %[ftmp4],   %[ftmp4],       %[ftmp5]                \n\t"
+        "psubusb    %[ftmp7],   %[ff_pb_A1],    %[ftmp4]                \n\t"
+        "psubusb    %[ftmp4],   %[ftmp4],       %[ff_pb_A1]             \n\t"
+        "pminub     %[ftmp7],   %[ftmp7],       %[ftmp8]                \n\t"
+        "pminub     %[ftmp4],   %[ftmp4],       %[ftmp8]                \n\t"
+        "psubusb    %[ftmp2],   %[ftmp2],       %[ftmp7]                \n\t"
+        "psubusb    %[ftmp3],   %[ftmp3],       %[ftmp4]                \n\t"
+        "paddusb    %[ftmp2],   %[ftmp2],       %[ftmp4]                \n\t"
+        "paddusb    %[ftmp3],   %[ftmp3],       %[ftmp7]                \n\t"
+
+        "gssdxc1    %[ftmp2],   0x00(%[addr0],  %[stride])              \n\t"
+        "sdc1       %[ftmp3],   0x00(%[pix])                            \n\t"
+        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
+          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
+          [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
+          [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
+          [ftmp8]"=&f"(ftmp[8]),
+          [addr0]"=&r"(addr[0]),
+          [low32]"=&r"(low32)
+        : [pix]"r"(pix),                    [stride]"r"((mips_reg)stride),
+          [alpha]"r"(alpha),                [beta]"r"(beta),
+          [tc0]"r"(tc0),                    [ff_pb_1]"f"(ff_pb_1),
+          [ff_pb_3]"f"(ff_pb_3),            [ff_pb_A1]"f"(ff_pb_A1)
+        : "memory"
     );
 }
 
 void ff_deblock_v_chroma_intra_8_mmi(uint8_t *pix, int stride, int alpha,
         int beta)
 {
-    __asm__ volatile (
-        "daddiu %[alpha], %[alpha], -0x1                \r\n"
-        "daddiu %[beta], %[beta], -0x1                  \r\n"
-        "or $16, $0, %[pix]                             \r\n"
-        "dsubu $16, $16, %[stride]                      \r\n"
-        "dsubu $16, $16, %[stride]                      \r\n"
-        "ldc1 $f0, 0x0($16)                             \r\n"
-        "gsldxc1 $f2, 0x0($16, %[stride])               \r\n"
-        "ldc1 $f4, 0x0(%[pix])                          \r\n"
-        "gsldxc1 $f6, 0x0(%[pix], %[stride])            \r\n"
-        : [pix]"+r"(pix),[stride]"+r"(stride),[alpha]"+r"(alpha),
-          [beta]"+r"(beta)
-        ::"$16","$f0","$f2","$f4","$f6"
-    );
-
-    chroma_intra_body_mmi(pix, stride, alpha, beta);
+    double ftmp[9];
+    mips_reg addr[1];
 
     __asm__ volatile (
-        "gssdxc1 $f2, 0x0($16, %[stride])               \r\n"
-        "sdc1 $f4, 0x0(%[pix])                          \r\n"
-        ::[pix]"r"(pix),[stride]"r"((int64_t)stride)
-        : "$16","$f2","$f4"
+        "addi       %[alpha],   %[alpha],       -0x01                   \n\t"
+        "addi       %[beta],    %[beta],        -0x01                   \n\t"
+        "or         %[addr0],   $0,             %[pix]                  \n\t"
+        PTR_SUBU   "%[addr0],   %[addr0],       %[stride]               \n\t"
+        PTR_SUBU   "%[addr0],   %[addr0],       %[stride]               \n\t"
+        "ldc1       %[ftmp1],   0x00(%[addr0])                          \n\t"
+        "gsldxc1    %[ftmp2],   0x00(%[addr0],  %[stride])              \n\t"
+        "ldc1       %[ftmp3],   0x00(%[pix])                            \n\t"
+        "gsldxc1    %[ftmp4],   0x00(%[pix],    %[stride])              \n\t"
+
+        "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
+        "mtc1       %[alpha],   %[ftmp5]                                \n\t"
+        "mtc1       %[beta],    %[ftmp6]                                \n\t"
+        "pshufh     %[ftmp5],   %[ftmp5],       %[ftmp0]                \n\t"
+        "pshufh     %[ftmp6],   %[ftmp6],       %[ftmp0]                \n\t"
+        "packushb   %[ftmp5],   %[ftmp5],       %[ftmp5]                \n\t"
+        "packushb   %[ftmp6],   %[ftmp6],       %[ftmp6]                \n\t"
+        "psubusb    %[ftmp7],   %[ftmp3],       %[ftmp2]                \n\t"
+        "psubusb    %[ftmp8],   %[ftmp2],       %[ftmp3]                \n\t"
+        "or         %[ftmp8],   %[ftmp8],       %[ftmp7]                \n\t"
+        "psubusb    %[ftmp8],   %[ftmp8],       %[ftmp5]                \n\t"
+        "psubusb    %[ftmp7],   %[ftmp2],       %[ftmp1]                \n\t"
+        "psubusb    %[ftmp5],   %[ftmp1],       %[ftmp2]                \n\t"
+        "or         %[ftmp5],   %[ftmp5],       %[ftmp7]                \n\t"
+        "psubusb    %[ftmp5],   %[ftmp5],       %[ftmp6]                \n\t"
+        "or         %[ftmp8],   %[ftmp8],       %[ftmp5]                \n\t"
+        "psubusb    %[ftmp7],   %[ftmp3],       %[ftmp4]                \n\t"
+        "psubusb    %[ftmp5],   %[ftmp4],       %[ftmp3]                \n\t"
+        "or         %[ftmp5],   %[ftmp5],       %[ftmp7]                \n\t"
+        "psubusb    %[ftmp5],   %[ftmp5],       %[ftmp6]                \n\t"
+        "or         %[ftmp8],   %[ftmp8],       %[ftmp5]                \n\t"
+        "xor        %[ftmp7],   %[ftmp7],       %[ftmp7]                \n\t"
+        "pcmpeqb    %[ftmp8],   %[ftmp8],       %[ftmp7]                \n\t"
+        "mov.d      %[ftmp6],   %[ftmp2]                                \n\t"
+        "mov.d      %[ftmp7],   %[ftmp3]                                \n\t"
+        "xor        %[ftmp5],   %[ftmp2],       %[ftmp4]                \n\t"
+        "and        %[ftmp5],   %[ftmp5],       %[ff_pb_1]              \n\t"
+        "pavgb      %[ftmp2],   %[ftmp2],       %[ftmp4]                \n\t"
+        "psubusb    %[ftmp2],   %[ftmp2],       %[ftmp5]                \n\t"
+        "pavgb      %[ftmp2],   %[ftmp2],       %[ftmp1]                \n\t"
+        "xor        %[ftmp5],   %[ftmp3],       %[ftmp1]                \n\t"
+        "and        %[ftmp5],   %[ftmp5],       %[ff_pb_1]              \n\t"
+        "pavgb      %[ftmp3],   %[ftmp3],       %[ftmp1]                \n\t"
+        "psubusb    %[ftmp3],   %[ftmp3],       %[ftmp5]                \n\t"
+        "pavgb      %[ftmp3],   %[ftmp3],       %[ftmp4]                \n\t"
+        "psubb      %[ftmp2],   %[ftmp2],       %[ftmp6]                \n\t"
+        "psubb      %[ftmp3],   %[ftmp3],       %[ftmp7]                \n\t"
+        "and        %[ftmp2],   %[ftmp2],       %[ftmp8]                \n\t"
+        "and        %[ftmp3],   %[ftmp3],       %[ftmp8]                \n\t"
+        "paddb      %[ftmp2],   %[ftmp2],       %[ftmp6]                \n\t"
+        "paddb      %[ftmp3],   %[ftmp3],       %[ftmp7]                \n\t"
+
+        "gssdxc1    %[ftmp2],   0x00(%[addr0],  %[stride])              \n\t"
+        "sdc1       %[ftmp3],   0x00(%[pix])                            \n\t"
+        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
+          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
+          [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
+          [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
+          [ftmp8]"=&f"(ftmp[8]),
+          [addr0]"=&r"(addr[0])
+        : [pix]"r"(pix),                    [stride]"r"((mips_reg)stride),
+          [alpha]"r"(alpha),                [beta]"r"(beta),
+          [ff_pb_1]"f"(ff_pb_1)
+        : "memory"
     );
 }
 
 void ff_deblock_h_chroma_8_mmi(uint8_t *pix, int stride, int alpha, int beta,
         int8_t *tc0)
 {
-    __asm__ volatile (
-        "daddiu %[alpha], %[alpha], -0x1                \r\n"
-        "daddiu %[beta], %[beta], -0x1                  \r\n"
-        "daddu $16, %[stride], %[stride]                \r\n"
-        "daddiu %[pix], %[pix], -0x2                    \r\n"
-        "daddu $17, $16, %[stride]                      \r\n"
-        "daddu $19, $16, $16                            \r\n"
-        "or $18, $0, %[pix]                             \r\n"
-        "daddu %[pix], %[pix], $17                      \r\n"
-        "gslwlc1 $f0, 0x3($18)                          \r\n"
-        "daddu $12, $18, %[stride]                      \r\n"
-        "gslwrc1 $f0, 0x0($18)                          \r\n"
-        "gslwlc1 $f4, 0x3($12)                          \r\n"
-        "daddu $13, $18, $16                            \r\n"
-        "gslwrc1 $f4, 0x0($12)                          \r\n"
-        "gslwlc1 $f2, 0x3($13)                          \r\n"
-        "gslwrc1 $f2, 0x0($13)                          \r\n"
-        "gslwlc1 $f6, 0x3(%[pix])                       \r\n"
-        "gslwrc1 $f6, 0x0(%[pix])                       \r\n"
-        "punpcklbh $f0, $f0, $f4                        \r\n"
-        "punpcklbh $f2, $f2, $f6                        \r\n"
-        "daddu $12, %[pix], %[stride]                   \r\n"
-        "punpckhhw $f4, $f0, $f2                        \r\n"
-        "punpcklhw $f0, $f0, $f2                        \r\n"
-        "gslwlc1 $f8, 0x3($12)                          \r\n"
-        "daddu $13, %[pix], $16                         \r\n"
-        "gslwrc1 $f8, 0x0($12)                          \r\n"
-        "gslwlc1 $f12, 0x3($13)                         \r\n"
-        "daddu $12, %[pix], $17                         \r\n"
-        "gslwrc1 $f12, 0x0($13)                         \r\n"
-        "gslwlc1 $f10, 0x3($12)                         \r\n"
-        "daddu $13, %[pix], $19                         \r\n"
-        "gslwrc1 $f10, 0x0($12)                         \r\n"
-        "gslwlc1 $f14, 0x3($13)                         \r\n"
-        "gslwrc1 $f14, 0x0($13)                         \r\n"
-        "punpcklbh $f8, $f8, $f12                       \r\n"
-        "punpcklbh $f10, $f10, $f14                     \r\n"
-        "mov.d $f12, $f8                                \r\n"
-        "punpcklhw $f8, $f8, $f10                       \r\n"
-        "punpckhhw $f12, $f12, $f10                     \r\n"
-        "punpckhwd $f2, $f0, $f8                        \r\n"
-        "punpckhwd $f6, $f4, $f12                       \r\n"
-        "punpcklwd $f0, $f0, $f8                        \r\n"
-        "punpcklwd $f4, $f4, $f12                       \r\n"
-        "mov.d $f20, $f0                                \r\n"
-        "mov.d $f22, $f6                                \r\n"
-        : [pix]"+r"(pix),[stride]"+r"(stride),[alpha]"+r"(alpha),
-          [beta]"+r"(beta)
-        ::"$12","$13","$16","$17","$18","$19","$f0","$f2","$f4","$f6","$f8",
-          "$f10","$f12","$f14","$f20","$f22"
-    );
-
-    chroma_inter_body_mmi(pix, stride, alpha, beta, tc0);
+    double ftmp[11];
+    mips_reg addr[6];
+    uint64_t low32;
 
     __asm__ volatile (
-        "punpckhwd $f8, $f20, $f20                      \r\n"
-        "punpckhwd $f10, $f2, $f2                       \r\n"
-        "punpckhwd $f12, $f4, $f4                       \r\n"
-        "punpcklbh $f0, $f20, $f2                       \r\n"
-        "punpcklbh $f4, $f4, $f22                       \r\n"
-        "punpcklhw $f2, $f0, $f4                        \r\n"
-        "punpckhhw $f0, $f0, $f4                        \r\n"
-        "gsswlc1 $f2, 0x3($18)                          \r\n"
-        "gsswrc1 $f2, 0x0($18)                          \r\n"
-        "daddu $12, $18, %[stride]                      \r\n"
-        "punpckhwd $f2, $f2, $f2                        \r\n"
-        "gsswlc1 $f2, 0x3($12)                          \r\n"
-        "daddu $13, $18, $16                            \r\n"
-        "gsswrc1 $f2, 0x0($12)                          \r\n"
-        "gsswlc1 $f0, 0x3($13)                          \r\n"
-        "gsswrc1 $f0, 0x0($13)                          \r\n"
-        "punpckhwd $f0, $f0, $f0                        \r\n"
-        "punpckhwd $f6, $f22, $f22                      \r\n"
-        "gsswlc1 $f0, 0x3(%[pix])                       \r\n"
-        "gsswrc1 $f0, 0x0(%[pix])                       \r\n"
-        "punpcklbh $f8, $f8, $f10                       \r\n"
-        "punpcklbh $f12, $f12, $f6                      \r\n"
-        "daddu $12, %[pix], %[stride]                   \r\n"
-        "punpcklhw $f10, $f8, $f12                      \r\n"
-        "punpckhhw $f8, $f8, $f12                       \r\n"
-        "gsswlc1 $f10, 0x3($12)                         \r\n"
-        "gsswrc1 $f10, 0x0($12)                         \r\n"
-        "punpckhwd $f10, $f10, $f10                     \r\n"
-        "daddu $12, %[pix], $16                         \r\n"
-        "daddu $13, %[pix], $17                         \r\n"
-        "gsswlc1 $f10, 0x3($12)                         \r\n"
-        "gsswrc1 $f10, 0x0($12)                         \r\n"
-        "gsswlc1 $f8, 0x3($13)                          \r\n"
-        "daddu $12, %[pix], $19                         \r\n"
-        "punpckhwd $f20, $f8, $f8                       \r\n"
-        "gsswrc1 $f8, 0x0($13)                          \r\n"
-        "gsswlc1 $f20, 0x3($12)                         \r\n"
-        "gsswrc1 $f20, 0x0($12)                         \r\n"
-        ::[pix]"r"(pix),[stride]"r"((int64_t)stride)
-        : "$12","$13","$16","$17","$18","$19","$f0","$f2","$f4","$f6","$f8",
-          "$f10","$f12","$f20"
+        "addi       %[alpha],   %[alpha],       -0x01                   \n\t"
+        "addi       %[beta],    %[beta],        -0x01                   \n\t"
+        PTR_ADDU   "%[addr0],   %[stride],      %[stride]               \n\t"
+        PTR_ADDI   "%[pix],     %[pix],         -0x02                   \n\t"
+        PTR_ADDU   "%[addr1],   %[addr0],       %[stride]               \n\t"
+        PTR_ADDU   "%[addr2],   %[addr0],       %[addr0]                \n\t"
+        "or         %[addr5],   $0,             %[pix]                  \n\t"
+        PTR_ADDU   "%[pix],     %[pix],         %[addr1]                \n\t"
+        "uld        %[low32],   0x00(%[addr5])                          \n\t"
+        "mtc1       %[low32],   %[ftmp0]                                \n\t"
+        PTR_ADDU   "%[addr3],   %[addr5],       %[stride]               \n\t"
+        "uld        %[low32],   0x00(%[addr3])                          \n\t"
+        "mtc1       %[low32],   %[ftmp2]                                \n\t"
+        PTR_ADDU   "%[addr4],   %[addr5],       %[addr0]                \n\t"
+        "uld        %[low32],   0x00(%[addr4])                          \n\t"
+        "mtc1       %[low32],   %[ftmp1]                                \n\t"
+        "uld        %[low32],   0x00(%[pix])                            \n\t"
+        "mtc1       %[low32],   %[ftmp3]                                \n\t"
+        "punpcklbh  %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
+        "punpcklbh  %[ftmp1],   %[ftmp1],       %[ftmp3]                \n\t"
+        PTR_ADDU   "%[addr3],   %[pix],         %[stride]               \n\t"
+        "punpckhhw  %[ftmp2],   %[ftmp0],       %[ftmp1]                \n\t"
+        "punpcklhw  %[ftmp0],   %[ftmp0],       %[ftmp1]                \n\t"
+        "uld        %[low32],   0x00(%[addr3])                          \n\t"
+        "mtc1       %[low32],   %[ftmp4]                                \n\t"
+        PTR_ADDU   "%[addr4],   %[pix],         %[addr0]                \n\t"
+        "uld        %[low32],   0x00(%[addr4])                          \n\t"
+        "mtc1       %[low32],   %[ftmp6]                                \n\t"
+        PTR_ADDU   "%[addr3],   %[pix],         %[addr1]                \n\t"
+        "uld        %[low32],   0x00(%[addr3])                          \n\t"
+        "mtc1       %[low32],   %[ftmp5]                                \n\t"
+        PTR_ADDU   "%[addr4],   %[pix],         %[addr2]                \n\t"
+        "uld        %[low32],   0x00(%[addr4])                          \n\t"
+        "mtc1       %[low32],   %[ftmp7]                                \n\t"
+        "punpcklbh  %[ftmp4],   %[ftmp4],       %[ftmp6]                \n\t"
+        "punpcklbh  %[ftmp5],   %[ftmp5],       %[ftmp7]                \n\t"
+        "mov.d      %[ftmp6],   %[ftmp4]                                \n\t"
+        "punpcklhw  %[ftmp4],   %[ftmp4],       %[ftmp5]                \n\t"
+        "punpckhhw  %[ftmp6],   %[ftmp6],       %[ftmp5]                \n\t"
+        "punpckhwd  %[ftmp1],   %[ftmp0],       %[ftmp4]                \n\t"
+        "punpckhwd  %[ftmp3],   %[ftmp2],       %[ftmp6]                \n\t"
+        "punpcklwd  %[ftmp0],   %[ftmp0],       %[ftmp4]                \n\t"
+        "punpcklwd  %[ftmp2],   %[ftmp2],       %[ftmp6]                \n\t"
+        "mov.d      %[ftmp9],   %[ftmp0]                                \n\t"
+        "mov.d      %[ftmp10],  %[ftmp3]                                \n\t"
+
+        "xor        %[ftmp8],   %[ftmp8],       %[ftmp8]                \n\t"
+        "mtc1       %[alpha],   %[ftmp4]                                \n\t"
+        "mtc1       %[beta],    %[ftmp5]                                \n\t"
+        "pshufh     %[ftmp4],   %[ftmp4],       %[ftmp8]                \n\t"
+        "pshufh     %[ftmp5],   %[ftmp5],       %[ftmp8]                \n\t"
+        "packushb   %[ftmp4],   %[ftmp4],       %[ftmp4]                \n\t"
+        "packushb   %[ftmp5],   %[ftmp5],       %[ftmp5]                \n\t"
+        "psubusb    %[ftmp6],   %[ftmp2],       %[ftmp1]                \n\t"
+        "psubusb    %[ftmp7],   %[ftmp1],       %[ftmp2]                \n\t"
+        "or         %[ftmp7],   %[ftmp7],       %[ftmp6]                \n\t"
+        "psubusb    %[ftmp7],   %[ftmp7],       %[ftmp4]                \n\t"
+        "psubusb    %[ftmp6],   %[ftmp1],       %[ftmp0]                \n\t"
+        "psubusb    %[ftmp4],   %[ftmp0],       %[ftmp1]                \n\t"
+        "or         %[ftmp4],   %[ftmp4],       %[ftmp6]                \n\t"
+        "psubusb    %[ftmp4],   %[ftmp4],       %[ftmp5]                \n\t"
+        "or         %[ftmp7],   %[ftmp7],       %[ftmp4]                \n\t"
+        "psubusb    %[ftmp6],   %[ftmp2],       %[ftmp3]                \n\t"
+        "psubusb    %[ftmp4],   %[ftmp3],       %[ftmp2]                \n\t"
+        "or         %[ftmp4],   %[ftmp4],       %[ftmp6]                \n\t"
+        "psubusb    %[ftmp4],   %[ftmp4],       %[ftmp5]                \n\t"
+        "or         %[ftmp7],   %[ftmp7],       %[ftmp4]                \n\t"
+        "xor        %[ftmp6],   %[ftmp6],       %[ftmp6]                \n\t"
+        "pcmpeqb    %[ftmp7],   %[ftmp7],       %[ftmp6]                \n\t"
+        "uld        %[low32],   0x00(%[tc0])                            \n\t"
+        "mtc1       %[low32],   %[ftmp6]                                \n\t"
+        "punpcklbh  %[ftmp6],   %[ftmp6],       %[ftmp6]                \n\t"
+        "and        %[ftmp7],   %[ftmp7],       %[ftmp6]                \n\t"
+        "pcmpeqb    %[ftmp4],   %[ftmp4],       %[ftmp4]                \n\t"
+        "xor        %[ftmp5],   %[ftmp1],       %[ftmp2]                \n\t"
+        "xor        %[ftmp3],   %[ftmp3],       %[ftmp4]                \n\t"
+        "and        %[ftmp5],   %[ftmp5],       %[ff_pb_1]              \n\t"
+        "pavgb      %[ftmp3],   %[ftmp3],       %[ftmp0]                \n\t"
+        "xor        %[ftmp4],   %[ftmp4],       %[ftmp1]                \n\t"
+        "pavgb      %[ftmp3],   %[ftmp3],       %[ff_pb_3]              \n\t"
+        "pavgb      %[ftmp4],   %[ftmp4],       %[ftmp2]                \n\t"
+        "pavgb      %[ftmp3],   %[ftmp3],       %[ftmp5]                \n\t"
+        "paddusb    %[ftmp3],   %[ftmp3],       %[ftmp4]                \n\t"
+        "psubusb    %[ftmp6],   %[ff_pb_A1],    %[ftmp3]                \n\t"
+        "psubusb    %[ftmp3],   %[ftmp3],       %[ff_pb_A1]             \n\t"
+        "pminub     %[ftmp6],   %[ftmp6],       %[ftmp7]                \n\t"
+        "pminub     %[ftmp3],   %[ftmp3],       %[ftmp7]                \n\t"
+        "psubusb    %[ftmp1],   %[ftmp1],       %[ftmp6]                \n\t"
+        "psubusb    %[ftmp2],   %[ftmp2],       %[ftmp3]                \n\t"
+        "paddusb    %[ftmp1],   %[ftmp1],       %[ftmp3]                \n\t"
+        "paddusb    %[ftmp2],   %[ftmp2],       %[ftmp6]                \n\t"
+
+        "punpckhwd  %[ftmp4],   %[ftmp9],       %[ftmp9]                \n\t"
+        "punpckhwd  %[ftmp5],   %[ftmp1],       %[ftmp1]                \n\t"
+        "punpckhwd  %[ftmp6],   %[ftmp2],       %[ftmp2]                \n\t"
+        "punpcklbh  %[ftmp0],   %[ftmp9],       %[ftmp1]                \n\t"
+        "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp10]               \n\t"
+        "punpcklhw  %[ftmp1],   %[ftmp0],       %[ftmp2]                \n\t"
+        "punpckhhw  %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
+        "gsswlc1    %[ftmp1],   0x03(%[addr5])                          \n\t"
+        "gsswrc1    %[ftmp1],   0x00(%[addr5])                          \n\t"
+        PTR_ADDU   "%[addr3],   %[addr5],       %[stride]               \n\t"
+        "punpckhwd  %[ftmp1],   %[ftmp1],       %[ftmp1]                \n\t"
+        "gsswlc1    %[ftmp1],   0x03(%[addr3])                          \n\t"
+        PTR_ADDU   "%[addr4],   %[addr5],       %[addr0]                \n\t"
+        "gsswrc1    %[ftmp1],   0x00(%[addr3])                          \n\t"
+        "gsswlc1    %[ftmp0],   0x03(%[addr4])                          \n\t"
+        "gsswrc1    %[ftmp0],   0x00(%[addr4])                          \n\t"
+        "punpckhwd  %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
+        "punpckhwd  %[ftmp3],   %[ftmp10],      %[ftmp10]               \n\t"
+        "gsswlc1    %[ftmp0],   0x03(%[pix])                            \n\t"
+        "gsswrc1    %[ftmp0],   0x00(%[pix])                            \n\t"
+        "punpcklbh  %[ftmp4],   %[ftmp4],       %[ftmp5]                \n\t"
+        "punpcklbh  %[ftmp6],   %[ftmp6],       %[ftmp3]                \n\t"
+        PTR_ADDU   "%[addr3],   %[pix],         %[stride]               \n\t"
+        "punpcklhw  %[ftmp5],   %[ftmp4],       %[ftmp6]                \n\t"
+        "punpckhhw  %[ftmp4],   %[ftmp4],       %[ftmp6]                \n\t"
+        "gsswlc1    %[ftmp5],   0x03(%[addr3])                          \n\t"
+        "gsswrc1    %[ftmp5],   0x00(%[addr3])                          \n\t"
+        "punpckhwd  %[ftmp5],   %[ftmp5],       %[ftmp5]                \n\t"
+        PTR_ADDU   "%[addr3],   %[pix],         %[addr0]                \n\t"
+        PTR_ADDU   "%[addr4],   %[pix],         %[addr1]                \n\t"
+        "gsswlc1    %[ftmp5],   0x03(%[addr3])                          \n\t"
+        "gsswrc1    %[ftmp5],   0x00(%[addr3])                          \n\t"
+        "gsswlc1    %[ftmp4],   0x03(%[addr4])                          \n\t"
+        PTR_ADDU   "%[addr3],   %[pix],         %[addr2]                \n\t"
+        "punpckhwd  %[ftmp9],   %[ftmp4],       %[ftmp4]                \n\t"
+        "gsswrc1    %[ftmp4],   0x00(%[addr4])                          \n\t"
+        "gsswlc1    %[ftmp9],   0x03(%[addr3])                          \n\t"
+        "gsswrc1    %[ftmp9],   0x00(%[addr3])                          \n\t"
+        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
+          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
+          [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
+          [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
+          [ftmp8]"=&f"(ftmp[8]),            [ftmp9]"=&f"(ftmp[9]),
+          [ftmp10]"=&f"(ftmp[10]),
+          [addr0]"=&r"(addr[0]),            [addr1]"=&r"(addr[1]),
+          [addr2]"=&r"(addr[2]),            [addr3]"=&r"(addr[3]),
+          [addr4]"=&r"(addr[4]),            [addr5]"=&r"(addr[5]),
+          [pix]"+&r"(pix),
+          [low32]"=&r"(low32)
+        : [alpha]"r"(alpha),                [beta]"r"(beta),
+          [stride]"r"((mips_reg)stride),    [tc0]"r"(tc0),
+          [ff_pb_1]"f"(ff_pb_1),            [ff_pb_3]"f"(ff_pb_3),
+          [ff_pb_A1]"f"(ff_pb_A1)
+        : "memory"
     );
 }
 
 void ff_deblock_h_chroma_intra_8_mmi(uint8_t *pix, int stride, int alpha,
         int beta)
 {
-    __asm__ volatile (
-        "daddiu %[alpha], %[alpha], -0x1                \r\n"
-        "daddiu %[beta], %[beta], -0x1                  \r\n"
-        "daddu $16, %[stride], %[stride]                \r\n"
-        "daddiu %[pix], %[pix], -0x2                    \r\n"
-        "daddu $17, $16, %[stride]                      \r\n"
-        "daddu $19, $16, $16                            \r\n"
-        "or $18, $0, %[pix]                             \r\n"
-        "daddu %[pix], %[pix], $17                      \r\n"
-        "gslwlc1 $f0, 0x3($18)                          \r\n"
-        "daddu $12, $18, %[stride]                      \r\n"
-        "gslwrc1 $f0, 0x0($18)                          \r\n"
-        "gslwlc1 $f4, 0x3($12)                          \r\n"
-        "daddu $13, $18, $16                            \r\n"
-        "gslwrc1 $f4, 0x0($12)                          \r\n"
-        "gslwlc1 $f2, 0x3($13)                          \r\n"
-        "gslwrc1 $f2, 0x0($13)                          \r\n"
-        "gslwlc1 $f6, 0x3(%[pix])                       \r\n"
-        "gslwrc1 $f6, 0x0(%[pix])                       \r\n"
-        "punpcklbh $f0, $f0, $f4                        \r\n"
-        "punpcklbh $f2, $f2, $f6                        \r\n"
-        "daddu $12, %[pix], %[stride]                   \r\n"
-        "punpckhhw $f4, $f0, $f2                        \r\n"
-        "punpcklhw $f0, $f0, $f2                        \r\n"
-        "gslwlc1 $f8, 0x3($12)                          \r\n"
-        "daddu $13, %[pix], $16                         \r\n"
-        "gslwrc1 $f8, 0x0($12)                          \r\n"
-        "gslwlc1 $f12, 0x3($13)                         \r\n"
-        "daddu $12, %[pix], $17                         \r\n"
-        "gslwrc1 $f12, 0x0($13)                         \r\n"
-        "gslwlc1 $f10, 0x3($12)                         \r\n"
-        "daddu $13, %[pix], $19                         \r\n"
-        "gslwrc1 $f10, 0x0($12)                         \r\n"
-        "gslwlc1 $f14, 0x3($13)                         \r\n"
-        "gslwrc1 $f14, 0x0($13)                         \r\n"
-        "punpcklbh $f8, $f8, $f12                       \r\n"
-        "punpcklbh $f10, $f10, $f14                     \r\n"
-        "mov.d $f12, $f8                                \r\n"
-        "punpcklhw $f8, $f8, $f10                       \r\n"
-        "punpckhhw $f12, $f12, $f10                     \r\n"
-        "punpckhwd $f2, $f0, $f8                        \r\n"
-        "punpckhwd $f6, $f4, $f12                       \r\n"
-        "punpcklwd $f0, $f0, $f8                        \r\n"
-        "punpcklwd $f4, $f4, $f12                       \r\n"
-        : [pix]"+r"(pix),[stride]"+r"(stride),[alpha]"+r"(alpha),
-          [beta]"+r"(beta)
-        ::"$12","$13","$16","$17","$18","$19","$f0","$f2","$f4","$f6","$f8",
-          "$f10","$f12","$f14","$f20","$f22"
-    );
-
-    chroma_intra_body_mmi(pix, stride, alpha, beta);
+    double ftmp[11];
+    mips_reg addr[6];
+    uint64_t low32;
 
     __asm__ volatile (
-        "punpckhwd $f8, $f0, $f0                        \r\n"
-        "punpckhwd $f10, $f2, $f2                       \r\n"
-        "punpckhwd $f12, $f4, $f4                       \r\n"
-        "punpcklbh $f0, $f0, $f2                        \r\n"
-        "punpcklbh $f4, $f4, $f6                        \r\n"
-        "punpcklhw $f2, $f0, $f4                        \r\n"
-        "punpckhhw $f0, $f0, $f4                        \r\n"
-        "gsswlc1 $f2, 0x3($18)                          \r\n"
-        "gsswrc1 $f2, 0x0($18)                          \r\n"
-        "daddu $12, $18, %[stride]                      \r\n"
-        "punpckhwd $f2, $f2, $f2                        \r\n"
-        "gsswlc1 $f2, 0x3($12)                          \r\n"
-        "daddu $13, $18, $16                            \r\n"
-        "gsswrc1 $f2, 0x0($12)                          \r\n"
-        "gsswlc1 $f0, 0x3($13)                          \r\n"
-        "gsswrc1 $f0, 0x0($13)                          \r\n"
-        "punpckhwd $f0, $f0, $f0                        \r\n"
-        "punpckhwd $f6, $f6, $f6                        \r\n"
-        "gsswlc1 $f0, 0x3(%[pix])                       \r\n"
-        "gsswrc1 $f0, 0x0(%[pix])                       \r\n"
-        "punpcklbh $f8, $f8, $f10                       \r\n"
-        "punpcklbh $f12, $f12, $f6                      \r\n"
-        "daddu $12, %[pix], %[stride]                   \r\n"
-        "punpcklhw $f10, $f8, $f12                      \r\n"
-        "punpckhhw $f8, $f8, $f12                       \r\n"
-        "gsswlc1 $f10, 0x3($12)                         \r\n"
-        "gsswrc1 $f10, 0x0($12)                         \r\n"
-        "punpckhwd $f10, $f10, $f10                     \r\n"
-        "daddu $12, %[pix], $16                         \r\n"
-        "daddu $13, %[pix], $17                         \r\n"
-        "gsswlc1 $f10, 0x3($12)                         \r\n"
-        "gsswrc1 $f10, 0x0($12)                         \r\n"
-        "gsswlc1 $f8, 0x3($13)                          \r\n"
-        "daddu $12, %[pix], $19                         \r\n"
-        "punpckhwd $f20, $f8, $f8                       \r\n"
-        "gsswrc1 $f8, 0x0($13)                          \r\n"
-        "gsswlc1 $f20, 0x3($12)                         \r\n"
-        "gsswrc1 $f20, 0x0($12)                         \r\n"
-        ::[pix]"r"(pix),[stride]"r"((int64_t)stride)
-        : "$12","$13","$16","$17","$18","$19","$f0","$f2","$f4","$f6","$f8",
-          "$f10","$f12","$f20"
+        "addi       %[alpha],   %[alpha],       -0x01                   \n\t"
+        "addi       %[beta],    %[beta],        -0x01                   \n\t"
+        PTR_ADDU   "%[addr0],   %[stride],      %[stride]               \n\t"
+        PTR_ADDI   "%[pix],     %[pix],         -0x02                   \n\t"
+        PTR_ADDU   "%[addr1],   %[addr0],       %[stride]               \n\t"
+        PTR_ADDU   "%[addr2],   %[addr0],       %[addr0]                \n\t"
+        "or         %[addr5],   $0,             %[pix]                  \n\t"
+        PTR_ADDU   "%[pix],     %[pix],         %[addr1]                \n\t"
+        "uld        %[low32],   0x00(%[addr5])                          \n\t"
+        "mtc1       %[low32],   %[ftmp0]                                \n\t"
+        PTR_ADDU   "%[addr3],   %[addr5],       %[stride]               \n\t"
+        "uld        %[low32],   0x00(%[addr3])                          \n\t"
+        "mtc1       %[low32],   %[ftmp2]                                \n\t"
+        PTR_ADDU   "%[addr4],   %[addr5],       %[addr0]                \n\t"
+        "uld        %[low32],   0x00(%[addr4])                          \n\t"
+        "mtc1       %[low32],   %[ftmp1]                                \n\t"
+        "uld        %[low32],   0x00(%[pix])                            \n\t"
+        "mtc1       %[low32],   %[ftmp3]                                \n\t"
+        "punpcklbh  %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
+        "punpcklbh  %[ftmp1],   %[ftmp1],       %[ftmp3]                \n\t"
+        PTR_ADDU   "%[addr3],   %[pix],         %[stride]               \n\t"
+        "punpckhhw  %[ftmp2],   %[ftmp0],       %[ftmp1]                \n\t"
+        "punpcklhw  %[ftmp0],   %[ftmp0],       %[ftmp1]                \n\t"
+        "uld        %[low32],   0x00(%[addr3])                          \n\t"
+        "mtc1       %[low32],   %[ftmp4]                                \n\t"
+        PTR_ADDU   "%[addr4],   %[pix],         %[addr0]                \n\t"
+        "uld        %[low32],   0x00(%[addr4])                          \n\t"
+        "mtc1       %[low32],   %[ftmp6]                                \n\t"
+        PTR_ADDU   "%[addr3],   %[pix],         %[addr1]                \n\t"
+        "uld        %[low32],   0x00(%[addr3])                          \n\t"
+        "mtc1       %[low32],   %[ftmp5]                                \n\t"
+        PTR_ADDU   "%[addr4],   %[pix],         %[addr2]                \n\t"
+        "uld        %[low32],   0x00(%[addr4])                          \n\t"
+        "mtc1       %[low32],   %[ftmp7]                                \n\t"
+        "punpcklbh  %[ftmp4],   %[ftmp4],       %[ftmp6]                \n\t"
+        "punpcklbh  %[ftmp5],   %[ftmp5],       %[ftmp7]                \n\t"
+        "mov.d      %[ftmp6],   %[ftmp4]                                \n\t"
+        "punpcklhw  %[ftmp4],   %[ftmp4],       %[ftmp5]                \n\t"
+        "punpckhhw  %[ftmp6],   %[ftmp6],       %[ftmp5]                \n\t"
+        "punpckhwd  %[ftmp1],   %[ftmp0],       %[ftmp4]                \n\t"
+        "punpckhwd  %[ftmp3],   %[ftmp2],       %[ftmp6]                \n\t"
+        "punpcklwd  %[ftmp0],   %[ftmp0],       %[ftmp4]                \n\t"
+        "punpcklwd  %[ftmp2],   %[ftmp2],       %[ftmp6]                \n\t"
+
+        "xor        %[ftmp8],   %[ftmp8],       %[ftmp8]                \n\t"
+        "mtc1       %[alpha],   %[ftmp4]                                \n\t"
+        "mtc1       %[beta],    %[ftmp5]                                \n\t"
+        "pshufh     %[ftmp4],   %[ftmp4],       %[ftmp8]                \n\t"
+        "pshufh     %[ftmp5],   %[ftmp5],       %[ftmp8]                \n\t"
+        "packushb   %[ftmp4],   %[ftmp4],       %[ftmp4]                \n\t"
+        "packushb   %[ftmp5],   %[ftmp5],       %[ftmp5]                \n\t"
+        "psubusb    %[ftmp6],   %[ftmp2],       %[ftmp1]                \n\t"
+        "psubusb    %[ftmp7],   %[ftmp1],       %[ftmp2]                \n\t"
+        "or         %[ftmp7],   %[ftmp7],       %[ftmp6]                \n\t"
+        "psubusb    %[ftmp7],   %[ftmp7],       %[ftmp4]                \n\t"
+        "psubusb    %[ftmp6],   %[ftmp1],       %[ftmp0]                \n\t"
+        "psubusb    %[ftmp4],   %[ftmp0],       %[ftmp1]                \n\t"
+        "or         %[ftmp4],   %[ftmp4],       %[ftmp6]                \n\t"
+        "psubusb    %[ftmp4],   %[ftmp4],       %[ftmp5]                \n\t"
+        "or         %[ftmp7],   %[ftmp7],       %[ftmp4]                \n\t"
+        "psubusb    %[ftmp6],   %[ftmp2],       %[ftmp3]                \n\t"
+        "psubusb    %[ftmp4],   %[ftmp3],       %[ftmp2]                \n\t"
+        "or         %[ftmp4],   %[ftmp4],       %[ftmp6]                \n\t"
+        "psubusb    %[ftmp4],   %[ftmp4],       %[ftmp5]                \n\t"
+        "or         %[ftmp7],   %[ftmp7],       %[ftmp4]                \n\t"
+        "xor        %[ftmp6],   %[ftmp6],       %[ftmp6]                \n\t"
+        "pcmpeqb    %[ftmp7],   %[ftmp7],       %[ftmp6]                \n\t"
+        "mov.d      %[ftmp5],   %[ftmp1]                                \n\t"
+        "mov.d      %[ftmp6],   %[ftmp2]                                \n\t"
+        "xor        %[ftmp4],   %[ftmp1],       %[ftmp3]                \n\t"
+        "and        %[ftmp4],   %[ftmp4],       %[ff_pb_1]              \n\t"
+        "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp3]                \n\t"
+        "psubusb    %[ftmp1],   %[ftmp1],       %[ftmp4]                \n\t"
+        "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp0]                \n\t"
+        "xor        %[ftmp4],   %[ftmp2],       %[ftmp0]                \n\t"
+        "and        %[ftmp4],   %[ftmp4],       %[ff_pb_1]              \n\t"
+        "pavgb      %[ftmp2],   %[ftmp2],       %[ftmp0]                \n\t"
+        "psubusb    %[ftmp2],   %[ftmp2],       %[ftmp4]                \n\t"
+        "pavgb      %[ftmp2],   %[ftmp2],       %[ftmp3]                \n\t"
+        "psubb      %[ftmp1],   %[ftmp1],       %[ftmp5]                \n\t"
+        "psubb      %[ftmp2],   %[ftmp2],       %[ftmp6]                \n\t"
+        "and        %[ftmp1],   %[ftmp1],       %[ftmp7]                \n\t"
+        "and        %[ftmp2],   %[ftmp2],       %[ftmp7]                \n\t"
+        "paddb      %[ftmp1],   %[ftmp1],       %[ftmp5]                \n\t"
+        "paddb      %[ftmp2],   %[ftmp2],       %[ftmp6]                \n\t"
+
+        "punpckhwd  %[ftmp4],   %[ftmp0],       %[ftmp0]                \n\t"
+        "punpckhwd  %[ftmp5],   %[ftmp1],       %[ftmp1]                \n\t"
+        "punpckhwd  %[ftmp6],   %[ftmp2],       %[ftmp2]                \n\t"
+        "punpcklbh  %[ftmp0],   %[ftmp0],       %[ftmp1]                \n\t"
+        "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp3]                \n\t"
+        "punpcklhw  %[ftmp1],   %[ftmp0],       %[ftmp2]                \n\t"
+        "punpckhhw  %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
+        "gsswlc1    %[ftmp1],   0x03(%[addr5])                          \n\t"
+        "gsswrc1    %[ftmp1],   0x00(%[addr5])                          \n\t"
+        PTR_ADDU   "%[addr3],   %[addr5],       %[stride]               \n\t"
+        "punpckhwd  %[ftmp1],   %[ftmp1],       %[ftmp1]                \n\t"
+        "gsswlc1    %[ftmp1],   0x03(%[addr3])                          \n\t"
+        PTR_ADDU   "%[addr4],   %[addr5],       %[addr0]                \n\t"
+        "gsswrc1    %[ftmp1],   0x00(%[addr3])                          \n\t"
+        "gsswlc1    %[ftmp0],   0x03(%[addr4])                          \n\t"
+        "gsswrc1    %[ftmp0],   0x00(%[addr4])                          \n\t"
+        "punpckhwd  %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
+        "punpckhwd  %[ftmp3],   %[ftmp3],       %[ftmp3]                \n\t"
+        "gsswlc1    %[ftmp0],   0x03(%[pix])                            \n\t"
+        "gsswrc1    %[ftmp0],   0x00(%[pix])                            \n\t"
+        "punpcklbh  %[ftmp4],   %[ftmp4],       %[ftmp5]                \n\t"
+        "punpcklbh  %[ftmp6],   %[ftmp6],       %[ftmp3]                \n\t"
+        PTR_ADDU   "%[addr3],   %[pix],         %[stride]               \n\t"
+        "punpcklhw  %[ftmp5],   %[ftmp4],       %[ftmp6]                \n\t"
+        "punpckhhw  %[ftmp4],   %[ftmp4],       %[ftmp6]                \n\t"
+        "gsswlc1    %[ftmp5],   0x03(%[addr3])                          \n\t"
+        "gsswrc1    %[ftmp5],   0x00(%[addr3])                          \n\t"
+        "punpckhwd  %[ftmp5],   %[ftmp5],       %[ftmp5]                \n\t"
+        PTR_ADDU   "%[addr3],   %[pix],         %[addr0]                \n\t"
+        PTR_ADDU   "%[addr4],   %[pix],         %[addr1]                \n\t"
+        "gsswlc1    %[ftmp5],   0x03(%[addr3])                          \n\t"
+        "gsswrc1    %[ftmp5],   0x00(%[addr3])                          \n\t"
+        "gsswlc1    %[ftmp4],   0x03(%[addr4])                          \n\t"
+        PTR_ADDU   "%[addr3],   %[pix],         %[addr2]                \n\t"
+        "punpckhwd  %[ftmp9],   %[ftmp4],       %[ftmp4]                \n\t"
+        "gsswrc1    %[ftmp4],   0x00(%[addr4])                          \n\t"
+        "gsswlc1    %[ftmp9],   0x03(%[addr3])                          \n\t"
+        "gsswrc1    %[ftmp9],   0x00(%[addr3])                          \n\t"
+        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
+          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
+          [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
+          [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
+          [ftmp8]"=&f"(ftmp[8]),            [ftmp9]"=&f"(ftmp[9]),
+          [ftmp10]"=&f"(ftmp[10]),
+          [addr0]"=&r"(addr[0]),            [addr1]"=&r"(addr[1]),
+          [addr2]"=&r"(addr[2]),            [addr3]"=&r"(addr[3]),
+          [addr4]"=&r"(addr[4]),            [addr5]"=&r"(addr[5]),
+          [pix]"+&r"(pix),
+          [low32]"=&r"(low32)
+        : [alpha]"r"(alpha),                [beta]"r"(beta),
+          [stride]"r"((mips_reg)stride),    [ff_pb_1]"f"(ff_pb_1)
+        : "memory"
     );
 }
 
@@ -1982,233 +2268,253 @@ void ff_deblock_v_luma_8_mmi(uint8_t *pix, int stride, int alpha, int beta,
 void ff_deblock_v_luma_intra_8_mmi(uint8_t *pix, int stride, int alpha,
         int beta)
 {
-    ff_deblock_v8_luma_intra_8_mmi(pix + 0, stride, alpha, beta);
-    ff_deblock_v8_luma_intra_8_mmi(pix + 8, stride, alpha, beta);
+    deblock_v8_luma_intra_8_mmi(pix + 0, stride, alpha, beta); /* first run: starts at pix */
+    deblock_v8_luma_intra_8_mmi(pix + 8, stride, alpha, beta); /* second run: 8 bytes on (presumably the next 8 columns) */
 }
 
 void ff_deblock_h_luma_8_mmi(uint8_t *pix, int stride, int alpha, int beta,
         int8_t *tc0)
 {
     uint64_t stack[0xd];
+    double ftmp[9];
+    mips_reg addr[8];
 
     __asm__ volatile (
-        "daddu $15, %[stride], %[stride]                \r\n"
-        "daddiu $8, %[pix], -0x4                        \r\n"
-        "daddu $9, %[stride], $15                       \r\n"
-        "gsldlc1 $f0, 0x7($8)                           \r\n"
-        "gsldrc1 $f0, 0x0($8)                           \r\n"
-        "daddu $12, $8, %[stride]                       \r\n"
-        "daddu $10, $8, $9                              \r\n"
-        "gsldlc1 $f2, 0x7($12)                          \r\n"
-        "daddu $11, $8, $15                             \r\n"
-        "gsldrc1 $f2, 0x0($12)                          \r\n"
-        "gsldlc1 $f4, 0x7($11)                          \r\n"
-        "gsldrc1 $f4, 0x0($11)                          \r\n"
-        "gsldlc1 $f6, 0x7($10)                          \r\n"
-        "daddu $12, $10, %[stride]                      \r\n"
-        "gsldrc1 $f6, 0x0($10)                          \r\n"
-        "gsldlc1 $f8, 0x7($12)                          \r\n"
-        "daddu $11, $10, $15                            \r\n"
-        "gsldrc1 $f8, 0x0($12)                          \r\n"
-        "gsldlc1 $f10, 0x7($11)                         \r\n"
-        "daddu $12, $10, $9                             \r\n"
-        "gsldrc1 $f10, 0x0($11)                         \r\n"
-        "gsldlc1 $f12, 0x7($12)                         \r\n"
-        "gsldrc1 $f12, 0x0($12)                         \r\n"
-        "daddu $14, $15, $15                            \r\n"
-        "punpckhbh $f14, $f0, $f2                       \r\n"
-        "punpcklbh $f0, $f0, $f2                        \r\n"
-        "punpckhbh $f2, $f4, $f6                        \r\n"
-        "punpcklbh $f4, $f4, $f6                        \r\n"
-        "punpckhbh $f6, $f8, $f10                       \r\n"
-        "punpcklbh $f8, $f8, $f10                       \r\n"
-        "daddu $12, $10, $14                            \r\n"
-        "sdc1 $f2, 0x10+%[stack]                        \r\n"
-        "gsldlc1 $f16, 0x7($12)                         \r\n"
-        "gsldrc1 $f16, 0x0($12)                         \r\n"
-        "daddu $13, $14, $14                            \r\n"
-        "punpckhbh $f10, $f12, $f16                     \r\n"
-        "punpcklbh $f12, $f12, $f16                     \r\n"
-        "punpckhhw $f2, $f0, $f4                        \r\n"
-        "punpcklhw $f0, $f0, $f4                        \r\n"
-        "punpckhhw $f4, $f8, $f12                       \r\n"
-        "punpcklhw $f8, $f8, $f12                       \r\n"
-        "ldc1 $f16, 0x10+%[stack]                       \r\n"
-        "punpckhwd $f0, $f0, $f8                        \r\n"
-        "sdc1 $f0, 0x0+%[stack]                         \r\n"
-        "punpckhhw $f12, $f14, $f16                     \r\n"
-        "punpcklhw $f14, $f14, $f16                     \r\n"
-        "punpckhhw $f0, $f6, $f10                       \r\n"
-        "punpcklhw $f6, $f6, $f10                       \r\n"
-        "punpcklwd $f12, $f12, $f0                      \r\n"
-        "punpckhwd $f10, $f14, $f6                      \r\n"
-        "punpcklwd $f14, $f14, $f6                      \r\n"
-        "punpckhwd $f6, $f2, $f4                        \r\n"
-        "punpcklwd $f2, $f2, $f4                        \r\n"
-        "sdc1 $f2, 0x10+%[stack]                        \r\n"
-        "sdc1 $f6, 0x20+%[stack]                        \r\n"
-        "sdc1 $f14, 0x30+%[stack]                       \r\n"
-        "sdc1 $f10, 0x40+%[stack]                       \r\n"
-        "sdc1 $f12, 0x50+%[stack]                       \r\n"
-        "daddu $8, $8, $13                              \r\n"
-        "daddu $10, $10, $13                            \r\n"
-        "gsldlc1 $f0, 0x7($8)                           \r\n"
-        "daddu $12, $8, %[stride]                       \r\n"
-        "gsldrc1 $f0, 0x0($8)                           \r\n"
-        "gsldlc1 $f2, 0x7($12)                          \r\n"
-        "daddu $11, $8, $15                             \r\n"
-        "gsldrc1 $f2, 0x0($12)                          \r\n"
-        "gsldlc1 $f4, 0x7($11)                          \r\n"
-        "gsldrc1 $f4, 0x0($11)                          \r\n"
-        "gsldlc1 $f6, 0x7($10)                          \r\n"
-        "daddu $12, $10, %[stride]                      \r\n"
-        "gsldrc1 $f6, 0x0($10)                          \r\n"
-        "gsldlc1 $f8, 0x7($12)                          \r\n"
-        "daddu $11, $10, $15                            \r\n"
-        "gsldrc1 $f8, 0x0($12)                          \r\n"
-        "gsldlc1 $f10, 0x7($11)                         \r\n"
-        "daddu $12, $10, $9                             \r\n"
-        "gsldrc1 $f10, 0x0($11)                         \r\n"
-        "gsldlc1 $f12, 0x7($12)                         \r\n"
-        "gsldrc1 $f12, 0x0($12)                         \r\n"
-        "punpckhbh $f14, $f0, $f2                       \r\n"
-        "punpcklbh $f0, $f0, $f2                        \r\n"
-        "punpckhbh $f2, $f4, $f6                        \r\n"
-        "punpcklbh $f4, $f4, $f6                        \r\n"
-        "punpckhbh $f6, $f8, $f10                       \r\n"
-        "punpcklbh $f8, $f8, $f10                       \r\n"
-        "daddu $12, $10, $14                            \r\n"
-        "sdc1 $f2, 0x18+%[stack]                        \r\n"
-        "gsldlc1 $f16, 0x7($12)                         \r\n"
-        "gsldrc1 $f16, 0x0($12)                         \r\n"
-        "punpckhhw $f2, $f0, $f4                        \r\n"
-        "punpckhbh $f10, $f12, $f16                     \r\n"
-        "punpcklbh $f12, $f12, $f16                     \r\n"
-        "punpcklhw $f0, $f0, $f4                        \r\n"
-        "punpckhhw $f4, $f8, $f12                       \r\n"
-        "punpcklhw $f8, $f8, $f12                       \r\n"
-        "punpckhwd $f0, $f0, $f8                        \r\n"
-        "ldc1 $f16, 0x18+%[stack]                       \r\n"
-        "sdc1 $f0, 0x8+%[stack]                         \r\n"
-        "punpckhhw $f12, $f14, $f16                     \r\n"
-        "punpcklhw $f14, $f14, $f16                     \r\n"
-        "punpckhhw $f0, $f6, $f10                       \r\n"
-        "punpcklhw $f6, $f6, $f10                       \r\n"
-        "punpckhwd $f10, $f14, $f6                      \r\n"
-        "punpcklwd $f14, $f14, $f6                      \r\n"
-        "punpckhwd $f6, $f2, $f4                        \r\n"
-        "punpcklwd $f2, $f2, $f4                        \r\n"
-        "punpcklwd $f12, $f12, $f0                      \r\n"
-        "sdc1 $f2, 0x18+%[stack]                        \r\n"
-        "sdc1 $f6, 0x28+%[stack]                        \r\n"
-        "sdc1 $f14, 0x38+%[stack]                       \r\n"
-        "sdc1 $f10, 0x48+%[stack]                       \r\n"
-        "sdc1 $f12, 0x58+%[stack]                       \r\n"
-        ::[pix]"r"(pix),[stride]"r"((int64_t)stride),[stack]"m"(stack[0])
-        : "$8","$9","$10","$11","$12","$13","$14","$15","$f0","$f2","$f4",
-          "$f6","$f8","$f10","$f12","$f14","$f16"
+        PTR_ADDU   "%[addr0],   %[stride],      %[stride]               \n\t"
+        PTR_ADDI   "%[addr1],   %[pix],         -0x4                    \n\t"
+        PTR_ADDU   "%[addr2],   %[stride],      %[addr0]                \n\t"
+        "gsldlc1    %[ftmp0],   0x07(%[addr1])                          \n\t"
+        "gsldrc1    %[ftmp0],   0x00(%[addr1])                          \n\t"
+        PTR_ADDU   "%[addr3],   %[addr1],       %[stride]               \n\t"
+        PTR_ADDU   "%[addr4],   %[addr1],       %[addr2]                \n\t"
+        "gsldlc1    %[ftmp1],   0x07(%[addr3])                          \n\t"
+        PTR_ADDU   "%[addr5],   %[addr1],       %[addr0]                \n\t"
+        "gsldrc1    %[ftmp1],   0x00(%[addr3])                          \n\t"
+        "gsldlc1    %[ftmp2],   0x07(%[addr5])                          \n\t"
+        "gsldrc1    %[ftmp2],   0x00(%[addr5])                          \n\t"
+        "gsldlc1    %[ftmp3],   0x07(%[addr4])                          \n\t"
+        PTR_ADDU   "%[addr3],   %[addr4],       %[stride]               \n\t"
+        "gsldrc1    %[ftmp3],   0x00(%[addr4])                          \n\t"
+        "gsldlc1    %[ftmp4],   0x07(%[addr3])                          \n\t"
+        PTR_ADDU   "%[addr5],   %[addr4],       %[addr0]                \n\t"
+        "gsldrc1    %[ftmp4],   0x00(%[addr3])                          \n\t"
+        "gsldlc1    %[ftmp5],   0x07(%[addr5])   &n