avcodec/mips: loongson optimize mmi load and store operators
author Zhou Xiaoyong <zhouxiaoyong@loongson.cn>
Mon, 10 Oct 2016 08:09:12 +0000 (16:09 +0800)
committer Michael Niedermayer <michael@niedermayer.cc>
Sun, 23 Oct 2016 01:23:09 +0000 (03:23 +0200)
1. The MMI_ load/store macros are defined in libavutil/mips/mmiutils.h
2. Replace some unnecessary unaligned accesses with aligned operators
3. The MMI_ load/store macros are compatible with Loongson 2E/2F CPUs, which do not support instructions starting with "gs"

Signed-off-by: Michael Niedermayer <michael@niedermayer.cc>
libavcodec/mips/blockdsp_mmi.c
libavcodec/mips/h264chroma_mmi.c
libavcodec/mips/h264dsp_mmi.c
libavcodec/mips/h264pred_mmi.c
libavcodec/mips/h264qpel_mmi.c
libavcodec/mips/hpeldsp_mmi.c
libavcodec/mips/idctdsp_mmi.c
libavcodec/mips/mpegvideo_mmi.c
libavcodec/mips/pixblockdsp_mmi.c

index 6eb2bd7..1035dbb 100644 (file)
  */
 
 #include "blockdsp_mips.h"
-#include "libavutil/mips/asmdefs.h"
+#include "libavutil/mips/mmiutils.h"
 
 void ff_fill_block16_mmi(uint8_t *block, uint8_t value, int line_size, int h)
 {
     double ftmp[1];
+    DECLARE_VAR_ALL64;
 
     __asm__ volatile (
         "mtc1       %[value],   %[ftmp0]                                \n\t"
@@ -34,15 +35,14 @@ void ff_fill_block16_mmi(uint8_t *block, uint8_t value, int line_size, int h)
         "punpcklbh  %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
         "punpcklbh  %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
         "1:                                                             \n\t"
-        "gssdlc1    %[ftmp0],   0x07(%[block])                          \n\t"
-        "gssdrc1    %[ftmp0],   0x00(%[block])                          \n\t"
-        PTR_ADDI    "%[h],      %[h],           -0x01                   \n\t"
-        "gssdlc1    %[ftmp0],   0x0f(%[block])                          \n\t"
-        "gssdrc1    %[ftmp0],   0x08(%[block])                          \n\t"
+        MMI_SDC1(%[ftmp0], %[block], 0x00)
+        PTR_ADDI   "%[h],       %[h],           -0x01                   \n\t"
+        MMI_SDC1(%[ftmp0], %[block], 0x08)
         PTR_ADDU   "%[block],   %[block],       %[line_size]            \n\t"
         "bnez       %[h],       1b                                      \n\t"
-        : [block]"+&r"(block),              [h]"+&r"(h),
-          [ftmp0]"=&f"(ftmp[0])
+        : [ftmp0]"=&f"(ftmp[0]),
+          RESTRICT_ASM_ALL64
+          [block]"+&r"(block),              [h]"+&r"(h)
         : [value]"r"(value),                [line_size]"r"((mips_reg)line_size)
         : "memory"
     );
@@ -51,6 +51,7 @@ void ff_fill_block16_mmi(uint8_t *block, uint8_t value, int line_size, int h)
 void ff_fill_block8_mmi(uint8_t *block, uint8_t value, int line_size, int h)
 {
     double ftmp0;
+    DECLARE_VAR_ALL64;
 
     __asm__ volatile (
         "mtc1       %[value],   %[ftmp0]                                \n\t"
@@ -58,13 +59,13 @@ void ff_fill_block8_mmi(uint8_t *block, uint8_t value, int line_size, int h)
         "punpcklbh  %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
         "punpcklbh  %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
         "1:                                                             \n\t"
-        "gssdlc1    %[ftmp0],   0x07(%[block])                          \n\t"
-        "gssdrc1    %[ftmp0],   0x00(%[block])                          \n\t"
+        MMI_SDC1(%[ftmp0], %[block], 0x00)
         PTR_ADDI   "%[h],       %[h],           -0x01                   \n\t"
         PTR_ADDU   "%[block],   %[block],       %[line_size]            \n\t"
         "bnez       %[h],       1b                                      \n\t"
-        : [block]"+&r"(block),              [h]"+&r"(h),
-          [ftmp0]"=&f"(ftmp0)
+        : [ftmp0]"=&f"(ftmp0),
+          RESTRICT_ASM_ALL64
+          [block]"+&r"(block),              [h]"+&r"(h)
         : [value]"r"(value),                [line_size]"r"((mips_reg)line_size)
         : "memory"
     );
@@ -77,14 +78,14 @@ void ff_clear_block_mmi(int16_t *block)
     __asm__ volatile (
         "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
         "xor        %[ftmp1],   %[ftmp1],       %[ftmp1]                \n\t"
-        "gssqc1     %[ftmp0],   %[ftmp1],       0x00(%[block])          \n\t"
-        "gssqc1     %[ftmp0],   %[ftmp1],       0x10(%[block])          \n\t"
-        "gssqc1     %[ftmp0],   %[ftmp1],       0x20(%[block])          \n\t"
-        "gssqc1     %[ftmp0],   %[ftmp1],       0x30(%[block])          \n\t"
-        "gssqc1     %[ftmp0],   %[ftmp1],       0x40(%[block])          \n\t"
-        "gssqc1     %[ftmp0],   %[ftmp1],       0x50(%[block])          \n\t"
-        "gssqc1     %[ftmp0],   %[ftmp1],       0x60(%[block])          \n\t"
-        "gssqc1     %[ftmp0],   %[ftmp1],       0x70(%[block])          \n\t"
+        MMI_SQC1(%[ftmp0], %[ftmp1], %[block], 0x00)
+        MMI_SQC1(%[ftmp0], %[ftmp1], %[block], 0x10)
+        MMI_SQC1(%[ftmp0], %[ftmp1], %[block], 0x20)
+        MMI_SQC1(%[ftmp0], %[ftmp1], %[block], 0x30)
+        MMI_SQC1(%[ftmp0], %[ftmp1], %[block], 0x40)
+        MMI_SQC1(%[ftmp0], %[ftmp1], %[block], 0x50)
+        MMI_SQC1(%[ftmp0], %[ftmp1], %[block], 0x60)
+        MMI_SQC1(%[ftmp0], %[ftmp1], %[block], 0x70)
         : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1])
         : [block]"r"(block)
         : "memory"
@@ -98,61 +99,61 @@ void ff_clear_blocks_mmi(int16_t *block)
     __asm__ volatile (
         "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
         "xor        %[ftmp1],   %[ftmp1],       %[ftmp1]                \n\t"
-        "gssqc1     %[ftmp0],   %[ftmp1],       0x00(%[block])          \n\t"
-        "gssqc1     %[ftmp0],   %[ftmp1],       0x10(%[block])          \n\t"
-        "gssqc1     %[ftmp0],   %[ftmp1],       0x20(%[block])          \n\t"
-        "gssqc1     %[ftmp0],   %[ftmp1],       0x30(%[block])          \n\t"
-        "gssqc1     %[ftmp0],   %[ftmp1],       0x40(%[block])          \n\t"
-        "gssqc1     %[ftmp0],   %[ftmp1],       0x50(%[block])          \n\t"
-        "gssqc1     %[ftmp0],   %[ftmp1],       0x60(%[block])          \n\t"
-        "gssqc1     %[ftmp0],   %[ftmp1],       0x70(%[block])          \n\t"
+        MMI_SQC1(%[ftmp0], %[ftmp1], %[block], 0x00)
+        MMI_SQC1(%[ftmp0], %[ftmp1], %[block], 0x10)
+        MMI_SQC1(%[ftmp0], %[ftmp1], %[block], 0x20)
+        MMI_SQC1(%[ftmp0], %[ftmp1], %[block], 0x30)
+        MMI_SQC1(%[ftmp0], %[ftmp1], %[block], 0x40)
+        MMI_SQC1(%[ftmp0], %[ftmp1], %[block], 0x50)
+        MMI_SQC1(%[ftmp0], %[ftmp1], %[block], 0x60)
+        MMI_SQC1(%[ftmp0], %[ftmp1], %[block], 0x70)
 
-        "gssqc1     %[ftmp0],   %[ftmp1],       0x80(%[block])          \n\t"
-        "gssqc1     %[ftmp0],   %[ftmp1],       0x90(%[block])          \n\t"
-        "gssqc1     %[ftmp0],   %[ftmp1],       0xa0(%[block])          \n\t"
-        "gssqc1     %[ftmp0],   %[ftmp1],       0xb0(%[block])          \n\t"
-        "gssqc1     %[ftmp0],   %[ftmp1],       0xc0(%[block])          \n\t"
-        "gssqc1     %[ftmp0],   %[ftmp1],       0xd0(%[block])          \n\t"
-        "gssqc1     %[ftmp0],   %[ftmp1],       0xe0(%[block])          \n\t"
-        "gssqc1     %[ftmp0],   %[ftmp1],       0xf0(%[block])          \n\t"
+        MMI_SQC1(%[ftmp0], %[ftmp1], %[block], 0x80)
+        MMI_SQC1(%[ftmp0], %[ftmp1], %[block], 0x90)
+        MMI_SQC1(%[ftmp0], %[ftmp1], %[block], 0xa0)
+        MMI_SQC1(%[ftmp0], %[ftmp1], %[block], 0xb0)
+        MMI_SQC1(%[ftmp0], %[ftmp1], %[block], 0xc0)
+        MMI_SQC1(%[ftmp0], %[ftmp1], %[block], 0xd0)
+        MMI_SQC1(%[ftmp0], %[ftmp1], %[block], 0xe0)
+        MMI_SQC1(%[ftmp0], %[ftmp1], %[block], 0xf0)
 
-        "gssqc1     %[ftmp0],   %[ftmp1],       0x100(%[block])         \n\t"
-        "gssqc1     %[ftmp0],   %[ftmp1],       0x110(%[block])         \n\t"
-        "gssqc1     %[ftmp0],   %[ftmp1],       0x120(%[block])         \n\t"
-        "gssqc1     %[ftmp0],   %[ftmp1],       0x130(%[block])         \n\t"
-        "gssqc1     %[ftmp0],   %[ftmp1],       0x140(%[block])         \n\t"
-        "gssqc1     %[ftmp0],   %[ftmp1],       0x150(%[block])         \n\t"
-        "gssqc1     %[ftmp0],   %[ftmp1],       0x160(%[block])         \n\t"
-        "gssqc1     %[ftmp0],   %[ftmp1],       0x170(%[block])         \n\t"
+        MMI_SQC1(%[ftmp0], %[ftmp1], %[block], 0x100)
+        MMI_SQC1(%[ftmp0], %[ftmp1], %[block], 0x110)
+        MMI_SQC1(%[ftmp0], %[ftmp1], %[block], 0x120)
+        MMI_SQC1(%[ftmp0], %[ftmp1], %[block], 0x130)
+        MMI_SQC1(%[ftmp0], %[ftmp1], %[block], 0x140)
+        MMI_SQC1(%[ftmp0], %[ftmp1], %[block], 0x150)
+        MMI_SQC1(%[ftmp0], %[ftmp1], %[block], 0x160)
+        MMI_SQC1(%[ftmp0], %[ftmp1], %[block], 0x170)
 
-        "gssqc1     %[ftmp0],   %[ftmp1],       0x180(%[block])         \n\t"
-        "gssqc1     %[ftmp0],   %[ftmp1],       0x190(%[block])         \n\t"
-        "gssqc1     %[ftmp0],   %[ftmp1],       0x1a0(%[block])         \n\t"
-        "gssqc1     %[ftmp0],   %[ftmp1],       0x1b0(%[block])         \n\t"
-        "gssqc1     %[ftmp0],   %[ftmp1],       0x1c0(%[block])         \n\t"
-        "gssqc1     %[ftmp0],   %[ftmp1],       0x1d0(%[block])         \n\t"
-        "gssqc1     %[ftmp0],   %[ftmp1],       0x1e0(%[block])         \n\t"
-        "gssqc1     %[ftmp0],   %[ftmp1],       0x1f0(%[block])         \n\t"
+        MMI_SQC1(%[ftmp0], %[ftmp1], %[block], 0x180)
+        MMI_SQC1(%[ftmp0], %[ftmp1], %[block], 0x190)
+        MMI_SQC1(%[ftmp0], %[ftmp1], %[block], 0x1a0)
+        MMI_SQC1(%[ftmp0], %[ftmp1], %[block], 0x1b0)
+        MMI_SQC1(%[ftmp0], %[ftmp1], %[block], 0x1c0)
+        MMI_SQC1(%[ftmp0], %[ftmp1], %[block], 0x1d0)
+        MMI_SQC1(%[ftmp0], %[ftmp1], %[block], 0x1e0)
+        MMI_SQC1(%[ftmp0], %[ftmp1], %[block], 0x1f0)
 
-        "gssqc1     %[ftmp0],   %[ftmp1],       0x200(%[block])         \n\t"
-        "gssqc1     %[ftmp0],   %[ftmp1],       0x210(%[block])         \n\t"
-        "gssqc1     %[ftmp0],   %[ftmp1],       0x220(%[block])         \n\t"
-        "gssqc1     %[ftmp0],   %[ftmp1],       0x230(%[block])         \n\t"
-        "gssqc1     %[ftmp0],   %[ftmp1],       0x240(%[block])         \n\t"
-        "gssqc1     %[ftmp0],   %[ftmp1],       0x250(%[block])         \n\t"
-        "gssqc1     %[ftmp0],   %[ftmp1],       0x260(%[block])         \n\t"
-        "gssqc1     %[ftmp0],   %[ftmp1],       0x270(%[block])         \n\t"
+        MMI_SQC1(%[ftmp0], %[ftmp1], %[block], 0x200)
+        MMI_SQC1(%[ftmp0], %[ftmp1], %[block], 0x210)
+        MMI_SQC1(%[ftmp0], %[ftmp1], %[block], 0x220)
+        MMI_SQC1(%[ftmp0], %[ftmp1], %[block], 0x230)
+        MMI_SQC1(%[ftmp0], %[ftmp1], %[block], 0x240)
+        MMI_SQC1(%[ftmp0], %[ftmp1], %[block], 0x250)
+        MMI_SQC1(%[ftmp0], %[ftmp1], %[block], 0x260)
+        MMI_SQC1(%[ftmp0], %[ftmp1], %[block], 0x270)
 
-        "gssqc1     %[ftmp0],   %[ftmp1],       0x280(%[block])         \n\t"
-        "gssqc1     %[ftmp0],   %[ftmp1],       0x290(%[block])         \n\t"
-        "gssqc1     %[ftmp0],   %[ftmp1],       0x2a0(%[block])         \n\t"
-        "gssqc1     %[ftmp0],   %[ftmp1],       0x2b0(%[block])         \n\t"
-        "gssqc1     %[ftmp0],   %[ftmp1],       0x2c0(%[block])         \n\t"
-        "gssqc1     %[ftmp0],   %[ftmp1],       0x2d0(%[block])         \n\t"
-        "gssqc1     %[ftmp0],   %[ftmp1],       0x2e0(%[block])         \n\t"
-        "gssqc1     %[ftmp0],   %[ftmp1],       0x2f0(%[block])         \n\t"
+        MMI_SQC1(%[ftmp0], %[ftmp1], %[block], 0x280)
+        MMI_SQC1(%[ftmp0], %[ftmp1], %[block], 0x290)
+        MMI_SQC1(%[ftmp0], %[ftmp1], %[block], 0x2a0)
+        MMI_SQC1(%[ftmp0], %[ftmp1], %[block], 0x2b0)
+        MMI_SQC1(%[ftmp0], %[ftmp1], %[block], 0x2c0)
+        MMI_SQC1(%[ftmp0], %[ftmp1], %[block], 0x2d0)
+        MMI_SQC1(%[ftmp0], %[ftmp1], %[block], 0x2e0)
+        MMI_SQC1(%[ftmp0], %[ftmp1], %[block], 0x2f0)
         : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1])
-        : [block]"r"((mips_reg)block)
+        : [block]"r"((uint64_t *)block)
         : "memory"
     );
 }
index 3dd123d..417b4a2 100644 (file)
@@ -24,7 +24,7 @@
 
 #include "h264chroma_mips.h"
 #include "constants.h"
-#include "libavutil/mips/asmdefs.h"
+#include "libavutil/mips/mmiutils.h"
 
 void ff_put_h264_chroma_mc8_mmi(uint8_t *dst, uint8_t *src, int stride,
         int h, int x, int y)
@@ -37,6 +37,7 @@ void ff_put_h264_chroma_mc8_mmi(uint8_t *dst, uint8_t *src, int stride,
     double ftmp[10];
     uint64_t tmp[1];
     mips_reg addr[1];
+    DECLARE_VAR_ALL64;
 
     if (D) {
         __asm__ volatile (
@@ -47,16 +48,13 @@ void ff_put_h264_chroma_mc8_mmi(uint8_t *dst, uint8_t *src, int stride,
             "mtc1       %[tmp0],    %[ftmp9]                            \n\t"
             "pshufh     %[C],       %[C],           %[ftmp0]            \n\t"
             "pshufh     %[D],       %[D],           %[ftmp0]            \n\t"
+
             "1:                                                         \n\t"
             PTR_ADDU   "%[addr0],   %[src],         %[stride]           \n\t"
-            "gsldlc1    %[ftmp1],   0x07(%[src])                        \n\t"
-            "gsldrc1    %[ftmp1],   0x00(%[src])                        \n\t"
-            "gsldlc1    %[ftmp2],   0x08(%[src])                        \n\t"
-            "gsldrc1    %[ftmp2],   0x01(%[src])                        \n\t"
-            "gsldlc1    %[ftmp3],   0x07(%[addr0])                      \n\t"
-            "gsldrc1    %[ftmp3],   0x00(%[addr0])                      \n\t"
-            "gsldlc1    %[ftmp4],   0x08(%[addr0])                      \n\t"
-            "gsldrc1    %[ftmp4],   0x01(%[addr0])                      \n\t"
+            MMI_ULDC1(%[ftmp1], %[src], 0x00)
+            MMI_ULDC1(%[ftmp2], %[src], 0x01)
+            MMI_ULDC1(%[ftmp3], %[addr0], 0x00)
+            MMI_ULDC1(%[ftmp4], %[addr0], 0x01)
 
             "punpcklbh  %[ftmp5],   %[ftmp1],       %[ftmp0]            \n\t"
             "punpckhbh  %[ftmp6],   %[ftmp1],       %[ftmp0]            \n\t"
@@ -88,7 +86,7 @@ void ff_put_h264_chroma_mc8_mmi(uint8_t *dst, uint8_t *src, int stride,
             "psrlh      %[ftmp2],   %[ftmp2],       %[ftmp9]            \n\t"
             "packushb   %[ftmp1],   %[ftmp1],       %[ftmp2]            \n\t"
             "addi       %[h],       %[h],           -0x01               \n\t"
-            "sdc1       %[ftmp1],   0x00(%[dst])                        \n\t"
+            MMI_SDC1(%[ftmp1], %[dst], 0x00)
             PTR_ADDU   "%[src],     %[src],         %[stride]           \n\t"
             PTR_ADDU   "%[dst],     %[dst],         %[stride]           \n\t"
             "bnez       %[h],       1b                                  \n\t"
@@ -98,6 +96,7 @@ void ff_put_h264_chroma_mc8_mmi(uint8_t *dst, uint8_t *src, int stride,
               [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
               [ftmp8]"=&f"(ftmp[8]),        [ftmp9]"=&f"(ftmp[9]),
               [tmp0]"=&r"(tmp[0]),
+              RESTRICT_ASM_ALL64
               [addr0]"=&r"(addr[0]),
               [dst]"+&r"(dst),              [src]"+&r"(src),
               [h]"+&r"(h)
@@ -115,12 +114,11 @@ void ff_put_h264_chroma_mc8_mmi(uint8_t *dst, uint8_t *src, int stride,
             "pshufh     %[A],       %[A],           %[ftmp0]            \n\t"
             "pshufh     %[E],       %[E],           %[ftmp0]            \n\t"
             "mtc1       %[tmp0],    %[ftmp7]                            \n\t"
+
             "1:                                                         \n\t"
             PTR_ADDU   "%[addr0],   %[src],         %[step]             \n\t"
-            "gsldlc1    %[ftmp1],   0x07(%[src])                        \n\t"
-            "gsldrc1    %[ftmp1],   0x00(%[src])                        \n\t"
-            "gsldlc1    %[ftmp2],   0x07(%[addr0])                      \n\t"
-            "gsldrc1    %[ftmp2],   0x00(%[addr0])                      \n\t"
+            MMI_ULDC1(%[ftmp1], %[src], 0x00)
+            MMI_ULDC1(%[ftmp2], %[addr0], 0x00)
 
             "punpcklbh  %[ftmp3],   %[ftmp1],       %[ftmp0]            \n\t"
             "punpckhbh  %[ftmp4],   %[ftmp1],       %[ftmp0]            \n\t"
@@ -139,7 +137,7 @@ void ff_put_h264_chroma_mc8_mmi(uint8_t *dst, uint8_t *src, int stride,
             "psrlh      %[ftmp2],   %[ftmp2],       %[ftmp7]            \n\t"
             "packushb   %[ftmp1],   %[ftmp1],       %[ftmp2]            \n\t"
             "addi       %[h],       %[h],           -0x01               \n\t"
-            "sdc1       %[ftmp1],   0x00(%[dst])                        \n\t"
+            MMI_SDC1(%[ftmp1], %[dst], 0x00)
             PTR_ADDU   "%[src],     %[src],         %[stride]           \n\t"
             PTR_ADDU   "%[dst],     %[dst],         %[stride]           \n\t"
             "bnez       %[h],       1b                                  \n\t"
@@ -148,6 +146,7 @@ void ff_put_h264_chroma_mc8_mmi(uint8_t *dst, uint8_t *src, int stride,
               [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
               [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
               [tmp0]"=&r"(tmp[0]),
+              RESTRICT_ASM_ALL64
               [addr0]"=&r"(addr[0]),
               [dst]"+&r"(dst),              [src]"+&r"(src),
               [h]"+&r"(h)
@@ -162,9 +161,9 @@ void ff_put_h264_chroma_mc8_mmi(uint8_t *dst, uint8_t *src, int stride,
             "dli        %[tmp0],    0x06                                \n\t"
             "pshufh     %[A],       %[A],           %[ftmp0]            \n\t"
             "mtc1       %[tmp0],    %[ftmp4]                            \n\t"
+
             "1:                                                         \n\t"
-            "gsldlc1    %[ftmp1],   0x07(%[src])                        \n\t"
-            "gsldrc1    %[ftmp1],   0x00(%[src])                        \n\t"
+            MMI_ULDC1(%[ftmp1], %[src], 0x00)
             "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"
             "punpckhbh  %[ftmp3],   %[ftmp1],       %[ftmp0]            \n\t"
             "pmullh     %[ftmp1],   %[ftmp2],       %[A]                \n\t"
@@ -175,11 +174,10 @@ void ff_put_h264_chroma_mc8_mmi(uint8_t *dst, uint8_t *src, int stride,
             "psrlh      %[ftmp2],   %[ftmp2],       %[ftmp4]            \n\t"
             "packushb   %[ftmp1],   %[ftmp1],       %[ftmp2]            \n\t"
             PTR_ADDU   "%[src],     %[src],         %[stride]           \n\t"
-            "sdc1       %[ftmp1],   0x00(%[dst])                        \n\t"
+            MMI_SDC1(%[ftmp1], %[dst], 0x00)
 
             PTR_ADDU   "%[dst],     %[dst],         %[stride]           \n\t"
-            "gsldlc1    %[ftmp1],   0x07(%[src])                        \n\t"
-            "gsldrc1    %[ftmp1],   0x00(%[src])                        \n\t"
+            MMI_ULDC1(%[ftmp1], %[src], 0x00)
             "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"
             "punpckhbh  %[ftmp3],   %[ftmp1],       %[ftmp0]            \n\t"
             "pmullh     %[ftmp1],   %[ftmp2],       %[A]                \n\t"
@@ -190,7 +188,7 @@ void ff_put_h264_chroma_mc8_mmi(uint8_t *dst, uint8_t *src, int stride,
             "psrlh      %[ftmp2],   %[ftmp2],       %[ftmp4]            \n\t"
             "packushb   %[ftmp1],   %[ftmp1],       %[ftmp2]            \n\t"
             "addi       %[h],       %[h],           -0x02               \n\t"
-            "sdc1       %[ftmp1],   0x00(%[dst])                        \n\t"
+            MMI_SDC1(%[ftmp1], %[dst], 0x00)
 
             PTR_ADDU   "%[src],     %[src],         %[stride]           \n\t"
             PTR_ADDU   "%[dst],     %[dst],         %[stride]           \n\t"
@@ -199,6 +197,7 @@ void ff_put_h264_chroma_mc8_mmi(uint8_t *dst, uint8_t *src, int stride,
               [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
               [ftmp4]"=&f"(ftmp[4]),
               [tmp0]"=&r"(tmp[0]),
+              RESTRICT_ASM_ALL64
               [dst]"+&r"(dst),              [src]"+&r"(src),
               [h]"+&r"(h)
             : [stride]"r"((mips_reg)stride),[ff_pw_32]"f"(ff_pw_32),
@@ -219,6 +218,7 @@ void ff_avg_h264_chroma_mc8_mmi(uint8_t *dst, uint8_t *src, int stride,
     double ftmp[10];
     uint64_t tmp[1];
     mips_reg addr[1];
+    DECLARE_VAR_ALL64;
 
     if (D) {
         __asm__ volatile (
@@ -229,16 +229,13 @@ void ff_avg_h264_chroma_mc8_mmi(uint8_t *dst, uint8_t *src, int stride,
             "mtc1       %[tmp0],    %[ftmp9]                            \n\t"
             "pshufh     %[C],       %[C],           %[ftmp0]            \n\t"
             "pshufh     %[D],       %[D],           %[ftmp0]            \n\t"
+
             "1:                                                         \n\t"
             PTR_ADDU   "%[addr0],   %[src],         %[stride]           \n\t"
-            "gsldlc1    %[ftmp1],   0x07(%[src])                        \n\t"
-            "gsldrc1    %[ftmp1],   0x00(%[src])                        \n\t"
-            "gsldlc1    %[ftmp2],   0x08(%[src])                        \n\t"
-            "gsldrc1    %[ftmp2],   0x01(%[src])                        \n\t"
-            "gsldlc1    %[ftmp3],   0x07(%[addr0])                      \n\t"
-            "gsldrc1    %[ftmp3],   0x00(%[addr0])                      \n\t"
-            "gsldlc1    %[ftmp4],   0x08(%[addr0])                      \n\t"
-            "gsldrc1    %[ftmp4],   0x01(%[addr0])                      \n\t"
+            MMI_ULDC1(%[ftmp1], %[src], 0x00)
+            MMI_ULDC1(%[ftmp2], %[src], 0x01)
+            MMI_ULDC1(%[ftmp3], %[addr0], 0x00)
+            MMI_ULDC1(%[ftmp4], %[addr0], 0x01)
 
             "punpcklbh  %[ftmp5],   %[ftmp1],       %[ftmp0]            \n\t"
             "punpckhbh  %[ftmp6],   %[ftmp1],       %[ftmp0]            \n\t"
@@ -269,10 +266,10 @@ void ff_avg_h264_chroma_mc8_mmi(uint8_t *dst, uint8_t *src, int stride,
             "psrlh      %[ftmp1],   %[ftmp1],       %[ftmp9]            \n\t"
             "psrlh      %[ftmp2],   %[ftmp2],       %[ftmp9]            \n\t"
             "packushb   %[ftmp1],   %[ftmp1],       %[ftmp2]            \n\t"
-            "ldc1       %[ftmp2],   0x00(%[dst])                        \n\t"
+            MMI_LDC1(%[ftmp2], %[dst], 0x00)
             "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp2]            \n\t"
             "addi       %[h],       %[h],           -0x01               \n\t"
-            "sdc1       %[ftmp1],   0x00(%[dst])                        \n\t"
+            MMI_SDC1(%[ftmp1], %[dst], 0x00)
             PTR_ADDU   "%[dst],     %[dst],         %[stride]           \n\t"
             PTR_ADDU   "%[src],     %[src],         %[stride]           \n\t"
             "bnez       %[h],       1b                                  \n\t"
@@ -282,6 +279,7 @@ void ff_avg_h264_chroma_mc8_mmi(uint8_t *dst, uint8_t *src, int stride,
               [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
               [ftmp8]"=&f"(ftmp[8]),        [ftmp9]"=&f"(ftmp[9]),
               [tmp0]"=&r"(tmp[0]),
+              RESTRICT_ASM_ALL64
               [addr0]"=&r"(addr[0]),
               [dst]"+&r"(dst),              [src]"+&r"(src),
               [h]"+&r"(h)
@@ -299,12 +297,11 @@ void ff_avg_h264_chroma_mc8_mmi(uint8_t *dst, uint8_t *src, int stride,
             "pshufh     %[A],       %[A],           %[ftmp0]            \n\t"
             "pshufh     %[E],       %[E],           %[ftmp0]            \n\t"
             "mtc1       %[tmp0],    %[ftmp7]                            \n\t"
+
             "1:                                                         \n\t"
             PTR_ADDU   "%[addr0],   %[src],         %[step]             \n\t"
-            "gsldlc1    %[ftmp1],   0x07(%[src])                        \n\t"
-            "gsldrc1    %[ftmp1],   0x00(%[src])                        \n\t"
-            "gsldlc1    %[ftmp2],   0x07(%[addr0])                      \n\t"
-            "gsldrc1    %[ftmp2],   0x00(%[addr0])                      \n\t"
+            MMI_ULDC1(%[ftmp1], %[src], 0x00)
+            MMI_ULDC1(%[ftmp2], %[addr0], 0x00)
 
             "punpcklbh  %[ftmp3],   %[ftmp1],       %[ftmp0]            \n\t"
             "punpckhbh  %[ftmp4],   %[ftmp1],       %[ftmp0]            \n\t"
@@ -322,10 +319,10 @@ void ff_avg_h264_chroma_mc8_mmi(uint8_t *dst, uint8_t *src, int stride,
             "psrlh      %[ftmp1],   %[ftmp1],       %[ftmp7]            \n\t"
             "psrlh      %[ftmp2],   %[ftmp2],       %[ftmp7]            \n\t"
             "packushb   %[ftmp1],   %[ftmp1],       %[ftmp2]            \n\t"
-            "ldc1       %[ftmp2],   0x00(%[dst])                        \n\t"
+            MMI_LDC1(%[ftmp2], %[dst], 0x00)
             "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp2]            \n\t"
             "addi       %[h],       %[h],           -0x01               \n\t"
-            "sdc1       %[ftmp1],   0x00(%[dst])                        \n\t"
+            MMI_SDC1(%[ftmp1], %[dst], 0x00)
             PTR_ADDU   "%[src],     %[src],         %[stride]           \n\t"
             PTR_ADDU   "%[dst],     %[dst],         %[stride]           \n\t"
             "bnez       %[h],       1b                                  \n\t"
@@ -334,6 +331,7 @@ void ff_avg_h264_chroma_mc8_mmi(uint8_t *dst, uint8_t *src, int stride,
               [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
               [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
               [tmp0]"=&r"(tmp[0]),
+              RESTRICT_ASM_ALL64
               [addr0]"=&r"(addr[0]),
               [dst]"+&r"(dst),              [src]"+&r"(src),
               [h]"+&r"(h)
@@ -348,9 +346,9 @@ void ff_avg_h264_chroma_mc8_mmi(uint8_t *dst, uint8_t *src, int stride,
             "dli        %[tmp0],    0x06                                \n\t"
             "pshufh     %[A],       %[A],           %[ftmp0]            \n\t"
             "mtc1       %[tmp0],    %[ftmp4]                            \n\t"
+
             "1:                                                         \n\t"
-            "gsldlc1    %[ftmp1],   0x07(%[src])                        \n\t"
-            "gsldrc1    %[ftmp1],   0x00(%[src])                        \n\t"
+            MMI_ULDC1(%[ftmp1], %[src], 0x00)
             "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"
             "punpckhbh  %[ftmp3],   %[ftmp1],       %[ftmp0]            \n\t"
             "pmullh     %[ftmp1],   %[ftmp2],       %[A]                \n\t"
@@ -360,14 +358,13 @@ void ff_avg_h264_chroma_mc8_mmi(uint8_t *dst, uint8_t *src, int stride,
             "psrlh      %[ftmp1],   %[ftmp1],       %[ftmp4]            \n\t"
             "psrlh      %[ftmp2],   %[ftmp2],       %[ftmp4]            \n\t"
             "packushb   %[ftmp1],   %[ftmp1],       %[ftmp2]            \n\t"
-            "ldc1       %[ftmp2],   0x00(%[dst])                        \n\t"
+            MMI_LDC1(%[ftmp2], %[dst], 0x00)
             "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp2]            \n\t"
             PTR_ADDU   "%[src],     %[src],         %[stride]           \n\t"
-            "sdc1       %[ftmp1],   0x00(%[dst])                        \n\t"
+            MMI_SDC1(%[ftmp1], %[dst], 0x00)
             PTR_ADDU   "%[dst],     %[dst],         %[stride]           \n\t"
 
-            "gsldlc1    %[ftmp1],   0x07(%[src])                        \n\t"
-            "gsldrc1    %[ftmp1],   0x00(%[src])                        \n\t"
+            MMI_ULDC1(%[ftmp1], %[src], 0x00)
             "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"
             "punpckhbh  %[ftmp3],   %[ftmp1],       %[ftmp0]            \n\t"
             "pmullh     %[ftmp1],   %[ftmp2],       %[A]                \n\t"
@@ -377,10 +374,10 @@ void ff_avg_h264_chroma_mc8_mmi(uint8_t *dst, uint8_t *src, int stride,
             "psrlh      %[ftmp1],   %[ftmp1],       %[ftmp4]            \n\t"
             "psrlh      %[ftmp2],   %[ftmp2],       %[ftmp4]            \n\t"
             "packushb   %[ftmp1],   %[ftmp1],       %[ftmp2]            \n\t"
-            "ldc1       %[ftmp2],   0x00(%[dst])                        \n\t"
+            MMI_LDC1(%[ftmp2], %[dst], 0x00)
             "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp2]            \n\t"
             "addi       %[h],       %[h],           -0x02               \n\t"
-            "sdc1       %[ftmp1],   0x00(%[dst])                        \n\t"
+            MMI_SDC1(%[ftmp1], %[dst], 0x00)
 
             PTR_ADDU   "%[src],     %[src],         %[stride]           \n\t"
             PTR_ADDU   "%[dst],     %[dst],         %[stride]           \n\t"
@@ -389,6 +386,7 @@ void ff_avg_h264_chroma_mc8_mmi(uint8_t *dst, uint8_t *src, int stride,
               [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
               [ftmp4]"=&f"(ftmp[4]),
               [tmp0]"=&r"(tmp[0]),
+              RESTRICT_ASM_ALL64
               [dst]"+&r"(dst),              [src]"+&r"(src),
               [h]"+&r"(h)
             : [stride]"r"((mips_reg)stride),[ff_pw_32]"f"(ff_pw_32),
@@ -409,7 +407,7 @@ void ff_put_h264_chroma_mc4_mmi(uint8_t *dst, uint8_t *src, int stride,
     double ftmp[8];
     uint64_t tmp[1];
     mips_reg addr[1];
-    uint64_t low32;
+    DECLARE_VAR_LOW32;
 
     if (D) {
         __asm__ volatile (
@@ -420,16 +418,13 @@ void ff_put_h264_chroma_mc4_mmi(uint8_t *dst, uint8_t *src, int stride,
             "mtc1       %[tmp0],    %[ftmp7]                            \n\t"
             "pshufh     %[C],       %[C],           %[ftmp0]            \n\t"
             "pshufh     %[D],       %[D],           %[ftmp0]            \n\t"
+
             "1:                                                         \n\t"
             PTR_ADDU   "%[addr0],   %[src],         %[stride]           \n\t"
-            "uld        %[low32],   0x00(%[src])                        \n\t"
-            "mtc1       %[low32],   %[ftmp1]                            \n\t"
-            "uld        %[low32],   0x01(%[src])                        \n\t"
-            "mtc1       %[low32],   %[ftmp2]                            \n\t"
-            "uld        %[low32],   0x00(%[addr0])                      \n\t"
-            "mtc1       %[low32],   %[ftmp3]                            \n\t"
-            "uld        %[low32],   0x01(%[addr0])                      \n\t"
-            "mtc1       %[low32],   %[ftmp4]                            \n\t"
+            MMI_ULWC1(%[ftmp1], %[src], 0x00)
+            MMI_ULWC1(%[ftmp2], %[src], 0x01)
+            MMI_ULWC1(%[ftmp3], %[addr0], 0x00)
+            MMI_ULWC1(%[ftmp4], %[addr0], 0x01)
 
             "punpcklbh  %[ftmp5],   %[ftmp1],       %[ftmp0]            \n\t"
             "punpcklbh  %[ftmp6],   %[ftmp2],       %[ftmp0]            \n\t"
@@ -448,7 +443,7 @@ void ff_put_h264_chroma_mc4_mmi(uint8_t *dst, uint8_t *src, int stride,
             "psrlh      %[ftmp1],   %[ftmp1],       %[ftmp7]            \n\t"
             "packushb   %[ftmp1],   %[ftmp1],       %[ftmp0]            \n\t"
             "addi       %[h],       %[h],           -0x01               \n\t"
-            "swc1       %[ftmp1],   0x00(%[dst])                        \n\t"
+            MMI_SWC1(%[ftmp1], %[dst], 0x00)
             PTR_ADDU   "%[src],     %[src],         %[stride]           \n\t"
             PTR_ADDU   "%[dst],     %[dst],         %[stride]           \n\t"
             "bnez       %[h],       1b                                  \n\t"
@@ -457,10 +452,10 @@ void ff_put_h264_chroma_mc4_mmi(uint8_t *dst, uint8_t *src, int stride,
               [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
               [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
               [tmp0]"=&r"(tmp[0]),
+              RESTRICT_ASM_LOW32
               [addr0]"=&r"(addr[0]),
               [dst]"+&r"(dst),              [src]"+&r"(src),
-              [h]"+&r"(h),
-              [low32]"=&r"(low32)
+              [h]"+&r"(h)
             : [stride]"r"((mips_reg)stride),[ff_pw_32]"f"(ff_pw_32),
               [A]"f"(A),                    [B]"f"(B),
               [C]"f"(C),                    [D]"f"(D)
@@ -475,12 +470,11 @@ void ff_put_h264_chroma_mc4_mmi(uint8_t *dst, uint8_t *src, int stride,
             "pshufh     %[A],       %[A],           %[ftmp0]            \n\t"
             "pshufh     %[E],       %[E],           %[ftmp0]            \n\t"
             "mtc1       %[tmp0],    %[ftmp5]                            \n\t"
+
             "1:                                                         \n\t"
             PTR_ADDU   "%[addr0],   %[src],         %[step]             \n\t"
-            "uld        %[low32],   0x00(%[src])                        \n\t"
-            "mtc1       %[low32],   %[ftmp1]                            \n\t"
-            "uld        %[low32],   0x00(%[addr0])                      \n\t"
-            "mtc1       %[low32],   %[ftmp2]                            \n\t"
+            MMI_ULWC1(%[ftmp1], %[src], 0x00)
+            MMI_ULWC1(%[ftmp2], %[addr0], 0x00)
 
             "punpcklbh  %[ftmp3],   %[ftmp1],       %[ftmp0]            \n\t"
             "punpcklbh  %[ftmp4],   %[ftmp2],       %[ftmp0]            \n\t"
@@ -492,7 +486,7 @@ void ff_put_h264_chroma_mc4_mmi(uint8_t *dst, uint8_t *src, int stride,
             "psrlh      %[ftmp1],   %[ftmp1],       %[ftmp5]            \n\t"
             "packushb   %[ftmp1],   %[ftmp1],       %[ftmp0]            \n\t"
             "addi       %[h],       %[h],           -0x01               \n\t"
-            "swc1       %[ftmp1],   0x00(%[dst])                        \n\t"
+            MMI_SWC1(%[ftmp1], %[dst], 0x00)
             PTR_ADDU   "%[src],     %[src],         %[stride]           \n\t"
             PTR_ADDU   "%[dst],     %[dst],         %[stride]           \n\t"
             "bnez       %[h],       1b                                  \n\t"
@@ -500,10 +494,10 @@ void ff_put_h264_chroma_mc4_mmi(uint8_t *dst, uint8_t *src, int stride,
               [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
               [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
               [tmp0]"=&r"(tmp[0]),
+              RESTRICT_ASM_LOW32
               [addr0]"=&r"(addr[0]),
               [dst]"+&r"(dst),              [src]"+&r"(src),
-              [h]"+&r"(h),
-              [low32]"=&r"(low32)
+              [h]"+&r"(h)
             : [stride]"r"((mips_reg)stride),[step]"r"((mips_reg)step),
               [ff_pw_32]"f"(ff_pw_32),
               [A]"f"(A),                    [E]"f"(E)
@@ -515,27 +509,26 @@ void ff_put_h264_chroma_mc4_mmi(uint8_t *dst, uint8_t *src, int stride,
             "dli        %[tmp0],    0x06                                \n\t"
             "pshufh     %[A],       %[A],           %[ftmp0]            \n\t"
             "mtc1       %[tmp0],    %[ftmp3]                            \n\t"
+
             "1:                                                         \n\t"
-            "uld        %[low32],   0x00(%[src])                        \n\t"
-            "mtc1       %[low32],   %[ftmp1]                            \n\t"
+            MMI_ULWC1(%[ftmp1], %[src], 0x00)
             "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"
             "pmullh     %[ftmp1],   %[ftmp2],       %[A]                \n\t"
             "paddh      %[ftmp1],   %[ftmp1],       %[ff_pw_32]         \n\t"
             "psrlh      %[ftmp1],   %[ftmp1],       %[ftmp3]            \n\t"
             "packushb   %[ftmp1],   %[ftmp1],       %[ftmp0]            \n\t"
             PTR_ADDU   "%[src],     %[src],         %[stride]           \n\t"
-            "swc1       %[ftmp1],   0x00(%[dst])                        \n\t"
+            MMI_SWC1(%[ftmp1], %[dst], 0x00)
             PTR_ADDU   "%[dst],     %[dst],         %[stride]           \n\t"
 
-            "uld        %[low32],   0x00(%[src])                        \n\t"
-            "mtc1       %[low32],   %[ftmp1]                            \n\t"
+            MMI_ULWC1(%[ftmp1], %[src], 0x00)
             "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"
             "pmullh     %[ftmp1],   %[ftmp2],       %[A]                \n\t"
             "paddh      %[ftmp1],   %[ftmp1],       %[ff_pw_32]         \n\t"
             "psrlh      %[ftmp1],   %[ftmp1],       %[ftmp3]            \n\t"
             "packushb   %[ftmp1],   %[ftmp1],       %[ftmp0]            \n\t"
             "addi       %[h],       %[h],           -0x02               \n\t"
-            "swc1       %[ftmp1],   0x00(%[dst])                        \n\t"
+            MMI_SWC1(%[ftmp1], %[dst], 0x00)
 
             PTR_ADDU   "%[src],     %[src],         %[stride]           \n\t"
             PTR_ADDU   "%[dst],     %[dst],         %[stride]           \n\t"
@@ -543,9 +536,9 @@ void ff_put_h264_chroma_mc4_mmi(uint8_t *dst, uint8_t *src, int stride,
             : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
               [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
               [tmp0]"=&r"(tmp[0]),
+              RESTRICT_ASM_LOW32
               [dst]"+&r"(dst),              [src]"+&r"(src),
-              [h]"+&r"(h),
-              [low32]"=&r"(low32)
+              [h]"+&r"(h)
             : [stride]"r"((mips_reg)stride),[ff_pw_32]"f"(ff_pw_32),
               [A]"f"(A)
             : "memory"
@@ -564,7 +557,7 @@ void ff_avg_h264_chroma_mc4_mmi(uint8_t *dst, uint8_t *src, int stride,
     double ftmp[8];
     uint64_t tmp[1];
     mips_reg addr[1];
-    uint64_t low32;
+    DECLARE_VAR_LOW32;
 
     if (D) {
         __asm__ volatile (
@@ -575,16 +568,13 @@ void ff_avg_h264_chroma_mc4_mmi(uint8_t *dst, uint8_t *src, int stride,
             "mtc1       %[tmp0],    %[ftmp7]                            \n\t"
             "pshufh     %[C],       %[C],           %[ftmp0]            \n\t"
             "pshufh     %[D],       %[D],           %[ftmp0]            \n\t"
+
             "1:                                                         \n\t"
             PTR_ADDU   "%[addr0],   %[src],         %[stride]           \n\t"
-            "uld        %[low32],   0x00(%[src])                        \n\t"
-            "mtc1       %[low32],   %[ftmp1]                            \n\t"
-            "uld        %[low32],   0x01(%[src])                        \n\t"
-            "mtc1       %[low32],   %[ftmp2]                            \n\t"
-            "uld        %[low32],   0x00(%[addr0])                      \n\t"
-            "mtc1       %[low32],   %[ftmp3]                            \n\t"
-            "uld        %[low32],   0x01(%[addr0])                      \n\t"
-            "mtc1       %[low32],   %[ftmp4]                            \n\t"
+            MMI_ULWC1(%[ftmp1], %[src], 0x00)
+            MMI_ULWC1(%[ftmp2], %[src], 0x01)
+            MMI_ULWC1(%[ftmp3], %[addr0], 0x00)
+            MMI_ULWC1(%[ftmp4], %[addr0], 0x01)
 
             "punpcklbh  %[ftmp5],   %[ftmp1],       %[ftmp0]            \n\t"
             "punpcklbh  %[ftmp6],   %[ftmp2],       %[ftmp0]            \n\t"
@@ -602,10 +592,10 @@ void ff_avg_h264_chroma_mc4_mmi(uint8_t *dst, uint8_t *src, int stride,
             "paddh      %[ftmp1],   %[ftmp1],       %[ff_pw_32]         \n\t"
             "psrlh      %[ftmp1],   %[ftmp1],       %[ftmp7]            \n\t"
             "packushb   %[ftmp1],   %[ftmp1],       %[ftmp0]            \n\t"
-            "lwc1       %[ftmp2],   0x00(%[dst])                        \n\t"
+            MMI_LWC1(%[ftmp2], %[dst], 0x00)
             "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp2]            \n\t"
             "addi       %[h],       %[h],           -0x01               \n\t"
-            "swc1       %[ftmp1],   0x00(%[dst])                        \n\t"
+            MMI_SWC1(%[ftmp1], %[dst], 0x00)
             PTR_ADDU   "%[src],     %[src],         %[stride]           \n\t"
             PTR_ADDU   "%[dst],     %[dst],         %[stride]           \n\t"
             "bnez       %[h],       1b                                  \n\t"
@@ -614,10 +604,10 @@ void ff_avg_h264_chroma_mc4_mmi(uint8_t *dst, uint8_t *src, int stride,
               [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
               [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
               [tmp0]"=&r"(tmp[0]),
+              RESTRICT_ASM_LOW32
               [addr0]"=&r"(addr[0]),
               [dst]"+&r"(dst),              [src]"+&r"(src),
-              [h]"+&r"(h),
-              [low32]"=&r"(low32)
+              [h]"+&r"(h)
             : [stride]"r"((mips_reg)stride),[ff_pw_32]"f"(ff_pw_32),
               [A]"f"(A),                    [B]"f"(B),
               [C]"f"(C),                    [D]"f"(D)
@@ -634,10 +624,8 @@ void ff_avg_h264_chroma_mc4_mmi(uint8_t *dst, uint8_t *src, int stride,
             "mtc1       %[tmp0],    %[ftmp5]                            \n\t"
             "1:                                                         \n\t"
             PTR_ADDU   "%[addr0],   %[src],         %[step]             \n\t"
-            "uld        %[low32],   0x00(%[src])                        \n\t"
-            "mtc1       %[low32],   %[ftmp1]                            \n\t"
-            "uld        %[low32],   0x00(%[addr0])                      \n\t"
-            "mtc1       %[low32],   %[ftmp2]                            \n\t"
+            MMI_ULWC1(%[ftmp1], %[src], 0x00)
+            MMI_ULWC1(%[ftmp2], %[addr0], 0x00)
 
             "punpcklbh  %[ftmp3],   %[ftmp1],       %[ftmp0]            \n\t"
             "punpcklbh  %[ftmp4],   %[ftmp2],       %[ftmp0]            \n\t"
@@ -648,10 +636,10 @@ void ff_avg_h264_chroma_mc4_mmi(uint8_t *dst, uint8_t *src, int stride,
             "paddh      %[ftmp1],   %[ftmp1],       %[ff_pw_32]         \n\t"
             "psrlh      %[ftmp1],   %[ftmp1],       %[ftmp5]            \n\t"
             "packushb   %[ftmp1],   %[ftmp1],       %[ftmp0]            \n\t"
-            "lwc1       %[ftmp2],   0x00(%[dst])                        \n\t"
+            MMI_LWC1(%[ftmp2], %[dst], 0x00)
             "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp2]            \n\t"
             "addi       %[h],       %[h],           -0x01               \n\t"
-            "swc1       %[ftmp1],   0x00(%[dst])                        \n\t"
+            MMI_SWC1(%[ftmp1], %[dst], 0x00)
             PTR_ADDU   "%[src],     %[src],         %[stride]           \n\t"
             PTR_ADDU   "%[dst],     %[dst],         %[stride]           \n\t"
             "bnez       %[h],       1b                                  \n\t"
@@ -659,10 +647,10 @@ void ff_avg_h264_chroma_mc4_mmi(uint8_t *dst, uint8_t *src, int stride,
               [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
               [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
               [tmp0]"=&r"(tmp[0]),
+              RESTRICT_ASM_LOW32
               [addr0]"=&r"(addr[0]),
               [dst]"+&r"(dst),              [src]"+&r"(src),
-              [h]"+&r"(h),
-              [low32]"=&r"(low32)
+              [h]"+&r"(h)
             : [stride]"r"((mips_reg)stride),[step]"r"((mips_reg)step),
               [ff_pw_32]"f"(ff_pw_32),
               [A]"f"(A),                    [E]"f"(E)
@@ -674,31 +662,30 @@ void ff_avg_h264_chroma_mc4_mmi(uint8_t *dst, uint8_t *src, int stride,
             "dli        %[tmp0],    0x06                                \n\t"
             "pshufh     %[A],       %[A],           %[ftmp0]            \n\t"
             "mtc1       %[tmp0],    %[ftmp3]                            \n\t"
+
             "1:                                                         \n\t"
-            "uld        %[low32],   0x00(%[src])                        \n\t"
-            "mtc1       %[low32],   %[ftmp1]                            \n\t"
+            MMI_ULWC1(%[ftmp1], %[src], 0x00)
             "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"
             "pmullh     %[ftmp1],   %[ftmp2],       %[A]                \n\t"
             "paddh      %[ftmp1],   %[ftmp1],       %[ff_pw_32]         \n\t"
             "psrlh      %[ftmp1],   %[ftmp1],       %[ftmp3]            \n\t"
             "packushb   %[ftmp1],   %[ftmp1],       %[ftmp0]            \n\t"
-            "lwc1       %[ftmp2],   0x00(%[dst])                        \n\t"
+            MMI_LWC1(%[ftmp2], %[dst], 0x00)
             "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp2]            \n\t"
             PTR_ADDU   "%[src],     %[src],         %[stride]           \n\t"
-            "swc1       %[ftmp1],   0x00(%[dst])                        \n\t"
+            MMI_SWC1(%[ftmp1], %[dst], 0x00)
             PTR_ADDU   "%[dst],     %[dst],         %[stride]           \n\t"
 
-            "uld        %[low32],   0x00(%[src])                        \n\t"
-            "mtc1       %[low32],   %[ftmp1]                            \n\t"
+            MMI_ULWC1(%[ftmp1], %[src], 0x00)
             "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"
             "pmullh     %[ftmp1],   %[ftmp2],       %[A]                \n\t"
             "paddh      %[ftmp1],   %[ftmp1],       %[ff_pw_32]         \n\t"
             "psrlh      %[ftmp1],   %[ftmp1],       %[ftmp3]            \n\t"
             "packushb   %[ftmp1],   %[ftmp1],       %[ftmp0]            \n\t"
-            "lwc1       %[ftmp2],   0x00(%[dst])                        \n\t"
+            MMI_LWC1(%[ftmp2], %[dst], 0x00)
             "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp2]            \n\t"
             "addi       %[h],       %[h],           -0x02               \n\t"
-            "swc1       %[ftmp1],   0x00(%[dst])                        \n\t"
+            MMI_SWC1(%[ftmp1], %[dst], 0x00)
 
             PTR_ADDU   "%[src],     %[src],         %[stride]           \n\t"
             PTR_ADDU   "%[dst],     %[dst],         %[stride]           \n\t"
@@ -706,9 +693,9 @@ void ff_avg_h264_chroma_mc4_mmi(uint8_t *dst, uint8_t *src, int stride,
             : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
               [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
               [tmp0]"=&r"(tmp[0]),
+              RESTRICT_ASM_LOW32
               [dst]"+&r"(dst),              [src]"+&r"(src),
-              [h]"+&r"(h),
-              [low32]"=&r"(low32)
+              [h]"+&r"(h)
             : [stride]"r"((mips_reg)stride),[ff_pw_32]"f"(ff_pw_32),
               [A]"f"(A)
             : "memory"
index a550eee..ac6fa99 100644 (file)
 
 #include "libavcodec/bit_depth_template.c"
 #include "h264dsp_mips.h"
-#include "libavutil/mips/asmdefs.h"
+#include "libavutil/mips/mmiutils.h"
 
 void ff_h264_add_pixels4_8_mmi(uint8_t *dst, int16_t *src, int stride)
 {
     double ftmp[9];
-    uint64_t low32;
+    DECLARE_VAR_LOW32;
+    DECLARE_VAR_ALL64;
 
     __asm__ volatile (
         "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
-        "ldc1       %[ftmp1],   0x00(%[src])                            \n\t"
-        "ldc1       %[ftmp2],   0x08(%[src])                            \n\t"
-        "ldc1       %[ftmp3],   0x10(%[src])                            \n\t"
-        "ldc1       %[ftmp4],   0x18(%[src])                            \n\t"
-        "uld        %[low32],   0x00(%[dst0])                           \n\t"
-        "mtc1       %[low32],   %[ftmp5]                                \n\t"
-        "uld        %[low32],   0x00(%[dst1])                           \n\t"
-        "mtc1       %[low32],   %[ftmp6]                                \n\t"
-        "uld        %[low32],   0x00(%[dst2])                           \n\t"
-        "mtc1       %[low32],   %[ftmp7]                                \n\t"
-        "uld        %[low32],   0x00(%[dst3])                           \n\t"
-        "mtc1       %[low32],   %[ftmp8]                                \n\t"
+        MMI_LDC1(%[ftmp1], %[src], 0x00)
+        MMI_LDC1(%[ftmp2], %[src], 0x08)
+        MMI_LDC1(%[ftmp3], %[src], 0x10)
+        MMI_LDC1(%[ftmp4], %[src], 0x18)
+        MMI_ULWC1(%[ftmp5], %[dst0], 0x00)
+        MMI_ULWC1(%[ftmp6], %[dst1], 0x00)
+        MMI_ULWC1(%[ftmp7], %[dst2], 0x00)
+        MMI_ULWC1(%[ftmp8], %[dst3], 0x00)
         "punpcklbh  %[ftmp5],   %[ftmp5],       %[ftmp0]                \n\t"
         "punpcklbh  %[ftmp6],   %[ftmp6],       %[ftmp0]                \n\t"
         "punpcklbh  %[ftmp7],   %[ftmp7],       %[ftmp0]                \n\t"
@@ -58,20 +55,17 @@ void ff_h264_add_pixels4_8_mmi(uint8_t *dst, int16_t *src, int stride)
         "packushb   %[ftmp2],   %[ftmp2],       %[ftmp0]                \n\t"
         "packushb   %[ftmp3],   %[ftmp3],       %[ftmp0]                \n\t"
         "packushb   %[ftmp4],   %[ftmp4],       %[ftmp0]                \n\t"
-        "gsswlc1    %[ftmp1],   0x03(%[dst0])                           \n\t"
-        "gsswrc1    %[ftmp1],   0x00(%[dst0])                           \n\t"
-        "gsswlc1    %[ftmp2],   0x03(%[dst1])                           \n\t"
-        "gsswrc1    %[ftmp2],   0x00(%[dst1])                           \n\t"
-        "gsswlc1    %[ftmp3],   0x03(%[dst2])                           \n\t"
-        "gsswrc1    %[ftmp3],   0x00(%[dst2])                           \n\t"
-        "gsswlc1    %[ftmp4],   0x03(%[dst3])                           \n\t"
-        "gsswrc1    %[ftmp4],   0x00(%[dst3])                           \n\t"
+        MMI_SWC1(%[ftmp1], %[dst0], 0x00)
+        MMI_SWC1(%[ftmp2], %[dst1], 0x00)
+        MMI_SWC1(%[ftmp3], %[dst2], 0x00)
+        MMI_SWC1(%[ftmp4], %[dst3], 0x00)
         : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
           [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
           [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
           [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
-          [ftmp8]"=&f"(ftmp[8]),
-          [low32]"=&r"(low32)
+          RESTRICT_ASM_LOW32
+          RESTRICT_ASM_ALL64
+          [ftmp8]"=&f"(ftmp[8])
         : [dst0]"r"(dst),                   [dst1]"r"(dst+stride),
           [dst2]"r"(dst+2*stride),          [dst3]"r"(dst+3*stride),
           [src]"r"(src)
@@ -85,18 +79,20 @@ void ff_h264_idct_add_8_mmi(uint8_t *dst, int16_t *block, int stride)
 {
     double ftmp[12];
     uint64_t tmp[1];
-    uint64_t low32;
+    DECLARE_VAR_LOW32;
+    DECLARE_VAR_ALL64;
+    DECLARE_VAR_ADDRT;
 
     __asm__ volatile (
         "dli        %[tmp0],    0x01                                    \n\t"
-        "ldc1       %[ftmp0],   0x00(%[block])                          \n\t"
+        MMI_LDC1(%[ftmp0], %[block], 0x00)
         "mtc1       %[tmp0],    %[ftmp8]                                \n\t"
-        "ldc1       %[ftmp1],   0x08(%[block])                          \n\t"
+        MMI_LDC1(%[ftmp1], %[block], 0x08)
         "dli        %[tmp0],    0x06                                    \n\t"
-        "ldc1       %[ftmp2],   0x10(%[block])                          \n\t"
+        MMI_LDC1(%[ftmp2], %[block], 0x10)
         "mtc1       %[tmp0],    %[ftmp9]                                \n\t"
         "psrah      %[ftmp4],   %[ftmp1],       %[ftmp8]                \n\t"
-        "ldc1       %[ftmp3],   0x18(%[block])                          \n\t"
+        MMI_LDC1(%[ftmp3], %[block], 0x18)
         "psrah      %[ftmp5],   %[ftmp3],       %[ftmp8]                \n\t"
         "psubh      %[ftmp4],   %[ftmp4],       %[ftmp3]                \n\t"
         "paddh      %[ftmp5],   %[ftmp5],       %[ftmp1]                \n\t"
@@ -126,14 +122,13 @@ void ff_h264_idct_add_8_mmi(uint8_t *dst, int16_t *block, int stride)
         "paddh      %[ftmp11],  %[ftmp4],       %[ftmp5]                \n\t"
         "xor        %[ftmp7],   %[ftmp7],       %[ftmp7]                \n\t"
         "psubh      %[ftmp5],   %[ftmp5],       %[ftmp4]                \n\t"
-        "sdc1       %[ftmp7],   0x00(%[block])                          \n\t"
-        "sdc1       %[ftmp7],   0x08(%[block])                          \n\t"
-        "sdc1       %[ftmp7],   0x10(%[block])                          \n\t"
-        "sdc1       %[ftmp7],   0x18(%[block])                          \n\t"
-        "uld        %[low32],   0x00(%[dst])                            \n\t"
-        "mtc1       %[low32],   %[ftmp2]                                \n\t"
+        MMI_SDC1(%[ftmp7], %[block], 0x00)
+        MMI_SDC1(%[ftmp7], %[block], 0x08)
+        MMI_SDC1(%[ftmp7], %[block], 0x10)
+        MMI_SDC1(%[ftmp7], %[block], 0x18)
+        MMI_ULWC1(%[ftmp2], %[dst], 0x00)
         "psrah      %[ftmp3],   %[ftmp10],      %[ftmp9]                \n\t"
-        "gslwxc1    %[ftmp0],   0x00(%[dst],    %[stride])              \n\t"
+        MMI_LWXC1(%[ftmp0], %[dst], %[stride], 0x00)
         "psrah      %[ftmp4],   %[ftmp11],      %[ftmp9]                \n\t"
         "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp7]                \n\t"
         "punpcklbh  %[ftmp0],   %[ftmp0],       %[ftmp7]                \n\t"
@@ -141,33 +136,32 @@ void ff_h264_idct_add_8_mmi(uint8_t *dst, int16_t *block, int stride)
         "paddh      %[ftmp0],   %[ftmp0],       %[ftmp4]                \n\t"
         "packushb   %[ftmp2],   %[ftmp2],       %[ftmp7]                \n\t"
         "packushb   %[ftmp0],   %[ftmp0],       %[ftmp7]                \n\t"
-        "gsswlc1    %[ftmp2],   0x03(%[dst])                            \n\t"
-        "gsswrc1    %[ftmp2],   0x00(%[dst])                            \n\t"
-        "gsswxc1    %[ftmp0],   0x00(%[dst],    %[stride])              \n\t"
+        MMI_SWC1(%[ftmp2], %[dst], 0x00)
+        MMI_SWXC1(%[ftmp0], %[dst], %[stride], 0x00)
         PTR_ADDU   "%[dst],     %[dst],         %[stride]               \n\t"
         PTR_ADDU   "%[dst],     %[dst],         %[stride]               \n\t"
-        "uld        %[low32],   0x00(%[dst])                            \n\t"
-        "mtc1       %[low32],   %[ftmp2]                                \n\t"
+        MMI_ULWC1(%[ftmp2], %[dst], 0x00)
         "psrah      %[ftmp5],   %[ftmp5],       %[ftmp9]                \n\t"
-        "gslwxc1    %[ftmp0],   0x00(%[dst],    %[stride])              \n\t"
+        MMI_LWXC1(%[ftmp0], %[dst], %[stride], 0x00)
         "psrah      %[ftmp1],   %[ftmp1],       %[ftmp9]                \n\t"
         "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp7]                \n\t"
         "punpcklbh  %[ftmp0],   %[ftmp0],       %[ftmp7]                \n\t"
         "paddh      %[ftmp2],   %[ftmp2],       %[ftmp5]                \n\t"
         "paddh      %[ftmp0],   %[ftmp0],       %[ftmp1]                \n\t"
         "packushb   %[ftmp2],   %[ftmp2],       %[ftmp7]                \n\t"
-        "gsswlc1    %[ftmp2],   0x03(%[dst])                            \n\t"
-        "gsswrc1    %[ftmp2],   0x00(%[dst])                            \n\t"
+        MMI_SWC1(%[ftmp2], %[dst], 0x00)
         "packushb   %[ftmp0],   %[ftmp0],       %[ftmp7]                \n\t"
-        "gsswxc1    %[ftmp0],   0x00(%[dst],    %[stride])              \n\t"
+        MMI_SWXC1(%[ftmp0], %[dst], %[stride], 0x00)
         : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
           [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
           [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
           [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
           [ftmp8]"=&f"(ftmp[8]),            [ftmp9]"=&f"(ftmp[9]),
           [ftmp10]"=&f"(ftmp[10]),          [ftmp11]"=&f"(ftmp[11]),
-          [tmp0]"=&r"(tmp[0]),
-          [low32]"=&r"(low32)
+          RESTRICT_ASM_LOW32
+          RESTRICT_ASM_ALL64
+          RESTRICT_ASM_ADDRT
+          [tmp0]"=&r"(tmp[0])
         : [dst]"r"(dst),                    [block]"r"(block),
           [stride]"r"((mips_reg)stride),    [ff_pw_32]"f"(ff_pw_32)
         : "memory"
@@ -179,464 +173,450 @@ void ff_h264_idct_add_8_mmi(uint8_t *dst, int16_t *block, int stride)
 void ff_h264_idct8_add_8_mmi(uint8_t *dst, int16_t *block, int stride)
 {
     double ftmp[16];
-    uint64_t tmp[8];
+    uint64_t tmp[7];
     mips_reg addr[1];
-    uint64_t low32;
+    DECLARE_VAR_LOW32;
+    DECLARE_VAR_ALL64;
+    DECLARE_VAR_ADDRT;
 
     __asm__ volatile (
-        "lhu       %[tmp0],     0x00(%[block])                          \n\t"
-        PTR_ADDI  "$29,         $29,            -0x20                   \n\t"
-        PTR_ADDIU "%[tmp0],     %[tmp0],        0x20                    \n\t"
-        "ldc1      %[ftmp1],    0x10(%[block])                          \n\t"
-        "sh        %[tmp0],     0x00(%[block])                          \n\t"
-        "ldc1      %[ftmp2],    0x20(%[block])                          \n\t"
-        "dli       %[tmp0],     0x01                                    \n\t"
-        "ldc1      %[ftmp3],    0x30(%[block])                          \n\t"
-        "mtc1      %[tmp0],     %[ftmp8]                                \n\t"
-        "ldc1      %[ftmp5],    0x50(%[block])                          \n\t"
-        "ldc1      %[ftmp6],    0x60(%[block])                          \n\t"
-        "ldc1      %[ftmp7],    0x70(%[block])                          \n\t"
-        "mov.d     %[ftmp0],    %[ftmp1]                                \n\t"
-        "psrah     %[ftmp1],    %[ftmp1],       %[ftmp8]                \n\t"
-        "psrah     %[ftmp4],    %[ftmp5],       %[ftmp8]                \n\t"
-        "paddh     %[ftmp1],    %[ftmp1],       %[ftmp0]                \n\t"
-        "paddh     %[ftmp4],    %[ftmp4],       %[ftmp5]                \n\t"
-        "paddh     %[ftmp1],    %[ftmp1],       %[ftmp5]                \n\t"
-        "paddh     %[ftmp4],    %[ftmp4],       %[ftmp7]                \n\t"
-        "paddh     %[ftmp1],    %[ftmp1],       %[ftmp3]                \n\t"
-        "psubh     %[ftmp4],    %[ftmp4],       %[ftmp0]                \n\t"
-        "psubh     %[ftmp0],    %[ftmp0],       %[ftmp3]                \n\t"
-        "psubh     %[ftmp5],    %[ftmp5],       %[ftmp3]                \n\t"
-        "psrah     %[ftmp3],    %[ftmp3],       %[ftmp8]                \n\t"
-        "paddh     %[ftmp0],    %[ftmp0],       %[ftmp7]                \n\t"
-        "psubh     %[ftmp5],    %[ftmp5],       %[ftmp7]                \n\t"
-        "psrah     %[ftmp7],    %[ftmp7],       %[ftmp8]                \n\t"
-        "psubh     %[ftmp0],    %[ftmp0],       %[ftmp3]                \n\t"
-        "dli       %[tmp0],     0x02                                    \n\t"
-        "psubh     %[ftmp5],    %[ftmp5],       %[ftmp7]                \n\t"
-        "mtc1      %[tmp0],     %[ftmp9]                                \n\t"
-        "mov.d     %[ftmp7],    %[ftmp1]                                \n\t"
-        "psrah     %[ftmp1],    %[ftmp1],       %[ftmp9]                \n\t"
-        "psrah     %[ftmp3],    %[ftmp4],       %[ftmp9]                \n\t"
-        "paddh     %[ftmp3],    %[ftmp3],       %[ftmp0]                \n\t"
-        "psrah     %[ftmp0],    %[ftmp0],       %[ftmp9]                \n\t"
-        "paddh     %[ftmp1],    %[ftmp1],       %[ftmp5]                \n\t"
-        "psrah     %[ftmp5],    %[ftmp5],       %[ftmp9]                \n\t"
-        "psubh     %[ftmp0],    %[ftmp0],       %[ftmp4]                \n\t"
-        "psubh     %[ftmp7],    %[ftmp7],       %[ftmp5]                \n\t"
-        "mov.d     %[ftmp5],    %[ftmp6]                                \n\t"
-        "psrah     %[ftmp6],    %[ftmp6],       %[ftmp8]                \n\t"
-        "psrah     %[ftmp4],    %[ftmp2],       %[ftmp8]                \n\t"
-        "paddh     %[ftmp6],    %[ftmp6],       %[ftmp2]                \n\t"
-        "psubh     %[ftmp4],    %[ftmp4],       %[ftmp5]                \n\t"
-        "ldc1      %[ftmp2],    0x00(%[block])                          \n\t"
-        "ldc1      %[ftmp5],    0x40(%[block])                          \n\t"
-        "paddh     %[ftmp5],    %[ftmp5],       %[ftmp2]                \n\t"
-        "paddh     %[ftmp2],    %[ftmp2],       %[ftmp2]                \n\t"
-        "paddh     %[ftmp6],    %[ftmp6],       %[ftmp5]                \n\t"
-        "psubh     %[ftmp2],    %[ftmp2],       %[ftmp5]                \n\t"
-        "paddh     %[ftmp5],    %[ftmp5],       %[ftmp5]                \n\t"
-        "paddh     %[ftmp4],    %[ftmp4],       %[ftmp2]                \n\t"
-        "psubh     %[ftmp5],    %[ftmp5],       %[ftmp6]                \n\t"
-        "paddh     %[ftmp2],    %[ftmp2],       %[ftmp2]                \n\t"
-        "paddh     %[ftmp7],    %[ftmp7],       %[ftmp6]                \n\t"
-        "psubh     %[ftmp2],    %[ftmp2],       %[ftmp4]                \n\t"
-        "paddh     %[ftmp6],    %[ftmp6],       %[ftmp6]                \n\t"
-        "paddh     %[ftmp0],    %[ftmp0],       %[ftmp4]                \n\t"
-        "psubh     %[ftmp6],    %[ftmp6],       %[ftmp7]                \n\t"
-        "paddh     %[ftmp4],    %[ftmp4],       %[ftmp4]                \n\t"
-        "paddh     %[ftmp3],    %[ftmp3],       %[ftmp2]                \n\t"
-        "psubh     %[ftmp4],    %[ftmp4],       %[ftmp0]                \n\t"
-        "paddh     %[ftmp2],    %[ftmp2],       %[ftmp2]                \n\t"
-        "paddh     %[ftmp1],    %[ftmp1],       %[ftmp5]                \n\t"
-        "psubh     %[ftmp2],    %[ftmp2],       %[ftmp3]                \n\t"
-        "paddh     %[ftmp5],    %[ftmp5],       %[ftmp5]                \n\t"
-        "sdc1      %[ftmp6],    0x00(%[block])                          \n\t"
-        "psubh     %[ftmp5],    %[ftmp5],       %[ftmp1]                \n\t"
-        "punpckhhw %[ftmp6],    %[ftmp7],       %[ftmp0]                \n\t"
-        "punpcklhw %[ftmp7],    %[ftmp7],       %[ftmp0]                \n\t"
-        "punpckhhw %[ftmp0],    %[ftmp3],       %[ftmp1]                \n\t"
-        "punpcklhw %[ftmp3],    %[ftmp3],       %[ftmp1]                \n\t"
-        "punpckhwd %[ftmp1],    %[ftmp7],       %[ftmp3]                \n\t"
-        "punpcklwd %[ftmp7],    %[ftmp7],       %[ftmp3]                \n\t"
-        "punpckhwd %[ftmp3],    %[ftmp6],       %[ftmp0]                \n\t"
-        "punpcklwd %[ftmp6],    %[ftmp6],       %[ftmp0]                \n\t"
-        "ldc1      %[ftmp0],    0x00(%[block])                          \n\t"
-        "sdc1      %[ftmp7],    0x00($29)                               \n\t"
-        "sdc1      %[ftmp1],    0x10($29)                               \n\t"
-        "dmfc1     %[tmp1],     %[ftmp6]                                \n\t"
-        "dmfc1     %[tmp3],     %[ftmp3]                                \n\t"
-        "punpckhhw %[ftmp3],    %[ftmp5],       %[ftmp2]                \n\t"
-        "punpcklhw %[ftmp5],    %[ftmp5],       %[ftmp2]                \n\t"
-        "punpckhhw %[ftmp2],    %[ftmp4],       %[ftmp0]                \n\t"
-        "punpcklhw %[ftmp4],    %[ftmp4],       %[ftmp0]                \n\t"
-        "punpckhwd %[ftmp0],    %[ftmp5],       %[ftmp4]                \n\t"
-        "punpcklwd %[ftmp5],    %[ftmp5],       %[ftmp4]                \n\t"
-        "punpckhwd %[ftmp4],    %[ftmp3],       %[ftmp2]                \n\t"
-        "punpcklwd %[ftmp3],    %[ftmp3],       %[ftmp2]                \n\t"
-        "sdc1      %[ftmp5],    0x08($29)                               \n\t"
-        "sdc1      %[ftmp0],    0x18($29)                               \n\t"
-        "dmfc1     %[tmp2],     %[ftmp3]                                \n\t"
-        "dmfc1     %[tmp4],     %[ftmp4]                                \n\t"
-        "ldc1      %[ftmp1],    0x18(%[block])                          \n\t"
-        "ldc1      %[ftmp6],    0x28(%[block])                          \n\t"
-        "ldc1      %[ftmp2],    0x38(%[block])                          \n\t"
-        "ldc1      %[ftmp0],    0x58(%[block])                          \n\t"
-        "ldc1      %[ftmp3],    0x68(%[block])                          \n\t"
-        "ldc1      %[ftmp4],    0x78(%[block])                          \n\t"
-        "mov.d     %[ftmp7],    %[ftmp1]                                \n\t"
-        "psrah     %[ftmp5],    %[ftmp0],       %[ftmp8]                \n\t"
-        "psrah     %[ftmp1],    %[ftmp1],       %[ftmp8]                \n\t"
-        "paddh     %[ftmp5],    %[ftmp5],       %[ftmp0]                \n\t"
-        "paddh     %[ftmp1],    %[ftmp1],       %[ftmp7]                \n\t"
-        "paddh     %[ftmp5],    %[ftmp5],       %[ftmp4]                \n\t"
-        "paddh     %[ftmp1],    %[ftmp1],       %[ftmp0]                \n\t"
-        "psubh     %[ftmp5],    %[ftmp5],       %[ftmp7]                \n\t"
-        "paddh     %[ftmp1],    %[ftmp1],       %[ftmp2]                \n\t"
-        "psubh     %[ftmp7],    %[ftmp7],       %[ftmp2]                \n\t"
-        "psubh     %[ftmp0],    %[ftmp0],       %[ftmp2]                \n\t"
-        "psrah     %[ftmp2],    %[ftmp2],       %[ftmp8]                \n\t"
-        "paddh     %[ftmp7],    %[ftmp7],       %[ftmp4]                \n\t"
-        "psubh     %[ftmp0],    %[ftmp0],       %[ftmp4]                \n\t"
-        "psrah     %[ftmp4],    %[ftmp4],       %[ftmp8]                \n\t"
-        "psubh     %[ftmp7],    %[ftmp7],       %[ftmp2]                \n\t"
-        "psubh     %[ftmp0],    %[ftmp0],       %[ftmp4]                \n\t"
-        "mov.d     %[ftmp4],    %[ftmp1]                                \n\t"
-        "psrah     %[ftmp2],    %[ftmp5],       %[ftmp9]                \n\t"
-        "psrah     %[ftmp1],    %[ftmp1],       %[ftmp9]                \n\t"
-        "paddh     %[ftmp2],    %[ftmp2],       %[ftmp7]                \n\t"
-        "psrah     %[ftmp7],    %[ftmp7],       %[ftmp9]                \n\t"
-        "paddh     %[ftmp1],    %[ftmp1],       %[ftmp0]                \n\t"
-        "psrah     %[ftmp0],    %[ftmp0],       %[ftmp9]                \n\t"
-        "psubh     %[ftmp7],    %[ftmp7],       %[ftmp5]                \n\t"
-        "psubh     %[ftmp4],    %[ftmp4],       %[ftmp0]                \n\t"
-        "mov.d     %[ftmp0],    %[ftmp3]                                \n\t"
-        "psrah     %[ftmp3],    %[ftmp3],       %[ftmp8]                \n\t"
-        "psrah     %[ftmp5],    %[ftmp6],       %[ftmp8]                \n\t"
-        "paddh     %[ftmp3],    %[ftmp3],       %[ftmp6]                \n\t"
-        "psubh     %[ftmp5],    %[ftmp5],       %[ftmp0]                \n\t"
-        "ldc1      %[ftmp6],    0x08(%[block])                          \n\t"
-        "ldc1      %[ftmp0],    0x48(%[block])                          \n\t"
-        "paddh     %[ftmp0],    %[ftmp0],       %[ftmp6]                \n\t"
-        "paddh     %[ftmp6],    %[ftmp6],       %[ftmp6]                \n\t"
-        "paddh     %[ftmp3],    %[ftmp3],       %[ftmp0]                \n\t"
-        "psubh     %[ftmp6],    %[ftmp6],       %[ftmp0]                \n\t"
-        "paddh     %[ftmp0],    %[ftmp0],       %[ftmp0]                \n\t"
-        "paddh     %[ftmp5],    %[ftmp5],       %[ftmp6]                \n\t"
-        "psubh     %[ftmp0],    %[ftmp0],       %[ftmp3]                \n\t"
-        "paddh     %[ftmp6],    %[ftmp6],       %[ftmp6]                \n\t"
-        "paddh     %[ftmp4],    %[ftmp4],       %[ftmp3]                \n\t"
-        "psubh     %[ftmp6],    %[ftmp6],       %[ftmp5]                \n\t"
-        "paddh     %[ftmp3],    %[ftmp3],       %[ftmp3]                \n\t"
-        "paddh     %[ftmp7],    %[ftmp7],       %[ftmp5]                \n\t"
-        "psubh     %[ftmp3],    %[ftmp3],       %[ftmp4]                \n\t"
-        "paddh     %[ftmp5],    %[ftmp5],       %[ftmp5]                \n\t"
-        "paddh     %[ftmp2],    %[ftmp2],       %[ftmp6]                \n\t"
-        "psubh     %[ftmp5],    %[ftmp5],       %[ftmp7]                \n\t"
-        "paddh     %[ftmp6],    %[ftmp6],       %[ftmp6]                \n\t"
-        "paddh     %[ftmp1],    %[ftmp1],       %[ftmp0]                \n\t"
-        "psubh     %[ftmp6],    %[ftmp6],       %[ftmp2]                \n\t"
-        "paddh     %[ftmp0],    %[ftmp0],       %[ftmp0]                \n\t"
-        "sdc1      %[ftmp3],    0x08(%[block])                          \n\t"
-        "psubh     %[ftmp0],    %[ftmp0],       %[ftmp1]                \n\t"
-        "punpckhhw %[ftmp3],    %[ftmp4],       %[ftmp7]                \n\t"
-        "punpcklhw %[ftmp4],    %[ftmp4],       %[ftmp7]                \n\t"
-        "punpckhhw %[ftmp7],    %[ftmp2],       %[ftmp1]                \n\t"
-        "punpcklhw %[ftmp2],    %[ftmp2],       %[ftmp1]                \n\t"
-        "punpckhwd %[ftmp1],    %[ftmp4],       %[ftmp2]                \n\t"
-        "punpcklwd %[ftmp4],    %[ftmp4],       %[ftmp2]                \n\t"
-        "punpckhwd %[ftmp2],    %[ftmp3],       %[ftmp7]                \n\t"
-        "punpcklwd %[ftmp3],    %[ftmp3],       %[ftmp7]                \n\t"
-        "ldc1      %[ftmp7],    0x08(%[block])                          \n\t"
-        "dmfc1     %[tmp5],     %[ftmp4]                                \n\t"
-        "dmfc1     %[tmp7],     %[ftmp1]                                \n\t"
-        "mov.d     %[ftmp12],   %[ftmp3]                                \n\t"
-        "mov.d     %[ftmp14],   %[ftmp2]                                \n\t"
-        "punpckhhw %[ftmp2],    %[ftmp0],       %[ftmp6]                \n\t"
-        "punpcklhw %[ftmp0],    %[ftmp0],       %[ftmp6]                \n\t"
-        "punpckhhw %[ftmp6],    %[ftmp5],       %[ftmp7]                \n\t"
-        "punpcklhw %[ftmp5],    %[ftmp5],       %[ftmp7]                \n\t"
-        "punpckhwd %[ftmp7],    %[ftmp0],       %[ftmp5]                \n\t"
-        "punpcklwd %[ftmp0],    %[ftmp0],       %[ftmp5]                \n\t"
-        "punpckhwd %[ftmp5],    %[ftmp2],       %[ftmp6]                \n\t"
-        "punpcklwd %[ftmp2],    %[ftmp2],       %[ftmp6]                \n\t"
-        "dmfc1     %[tmp6],     %[ftmp0]                                \n\t"
-        "mov.d     %[ftmp11],   %[ftmp7]                                \n\t"
-        "mov.d     %[ftmp13],   %[ftmp2]                                \n\t"
-        "mov.d     %[ftmp15],   %[ftmp5]                                \n\t"
-        PTR_ADDIU "%[addr0],    %[dst],         0x04                    \n\t"
-        "dmtc1     %[tmp7],     %[ftmp7]                                \n\t"
-        "dmtc1     %[tmp3],     %[ftmp6]                                \n\t"
-        "ldc1      %[ftmp1],    0x10($29)                               \n\t"
-        "dmtc1     %[tmp1],     %[ftmp3]                                \n\t"
-        "mov.d     %[ftmp4],    %[ftmp1]                                \n\t"
-        "psrah     %[ftmp1],    %[ftmp1],       %[ftmp8]                \n\t"
-        "psrah     %[ftmp0],    %[ftmp7],       %[ftmp8]                \n\t"
-        "paddh     %[ftmp1],    %[ftmp1],       %[ftmp4]                \n\t"
-        "paddh     %[ftmp0],    %[ftmp0],       %[ftmp7]                \n\t"
-        "paddh     %[ftmp1],    %[ftmp1],       %[ftmp7]                \n\t"
-        "paddh     %[ftmp0],    %[ftmp0],       %[ftmp14]               \n\t"
-        "paddh     %[ftmp1],    %[ftmp1],       %[ftmp6]                \n\t"
-        "psubh     %[ftmp0],    %[ftmp0],       %[ftmp4]                \n\t"
-        "psubh     %[ftmp4],    %[ftmp4],       %[ftmp6]                \n\t"
-        "psubh     %[ftmp7],    %[ftmp7],       %[ftmp6]                \n\t"
-        "psrah     %[ftmp6],    %[ftmp6],       %[ftmp8]                \n\t"
-        "paddh     %[ftmp4],    %[ftmp4],       %[ftmp14]               \n\t"
-        "psubh     %[ftmp7],    %[ftmp7],       %[ftmp14]               \n\t"
-        "psrah     %[ftmp5],    %[ftmp14],      %[ftmp8]                \n\t"
-        "psubh     %[ftmp4],    %[ftmp4],       %[ftmp6]                \n\t"
-        "psubh     %[ftmp7],    %[ftmp7],       %[ftmp5]                \n\t"
-        "mov.d     %[ftmp5],    %[ftmp1]                                \n\t"
-        "psrah     %[ftmp1],    %[ftmp1],       %[ftmp9]                \n\t"
-        "psrah     %[ftmp6],    %[ftmp0],       %[ftmp9]                \n\t"
-        "paddh     %[ftmp1],    %[ftmp1],       %[ftmp7]                \n\t"
-        "paddh     %[ftmp6],    %[ftmp6],       %[ftmp4]                \n\t"
-        "psrah     %[ftmp4],    %[ftmp4],       %[ftmp9]                \n\t"
-        "psrah     %[ftmp7],    %[ftmp7],       %[ftmp9]                \n\t"
-        "psubh     %[ftmp4],    %[ftmp4],       %[ftmp0]                \n\t"
-        "psubh     %[ftmp5],    %[ftmp5],       %[ftmp7]                \n\t"
-        "mov.d     %[ftmp7],    %[ftmp12]                               \n\t"
-        "psrah     %[ftmp2],    %[ftmp12],      %[ftmp8]                \n\t"
-        "psrah     %[ftmp0],    %[ftmp3],       %[ftmp8]                \n\t"
-        "paddh     %[ftmp2],    %[ftmp2],       %[ftmp3]                \n\t"
-        "psubh     %[ftmp0],    %[ftmp0],       %[ftmp7]                \n\t"
-        "ldc1      %[ftmp3],    0x00($29)                               \n\t"
-        "dmtc1     %[tmp5],     %[ftmp7]                                \n\t"
-        "paddh     %[ftmp7],    %[ftmp7],       %[ftmp3]                \n\t"
-        "paddh     %[ftmp3],    %[ftmp3],       %[ftmp3]                \n\t"
-        "paddh     %[ftmp2],    %[ftmp2],       %[ftmp7]                \n\t"
-        "psubh     %[ftmp3],    %[ftmp3],       %[ftmp7]                \n\t"
-        "paddh     %[ftmp7],    %[ftmp7],       %[ftmp7]                \n\t"
-        "paddh     %[ftmp0],    %[ftmp0],       %[ftmp3]                \n\t"
-        "psubh     %[ftmp7],    %[ftmp7],       %[ftmp2]                \n\t"
-        "paddh     %[ftmp3],    %[ftmp3],       %[ftmp3]                \n\t"
-        "paddh     %[ftmp5],    %[ftmp5],       %[ftmp2]                \n\t"
-        "psubh     %[ftmp3],    %[ftmp3],       %[ftmp0]                \n\t"
-        "paddh     %[ftmp2],    %[ftmp2],       %[ftmp2]                \n\t"
-        "paddh     %[ftmp4],    %[ftmp4],       %[ftmp0]                \n\t"
-        "psubh     %[ftmp2],    %[ftmp2],       %[ftmp5]                \n\t"
-        "paddh     %[ftmp0],    %[ftmp0],       %[ftmp0]                \n\t"
-        "paddh     %[ftmp6],    %[ftmp6],       %[ftmp3]                \n\t"
-        "psubh     %[ftmp0],    %[ftmp0],       %[ftmp4]                \n\t"
-        "paddh     %[ftmp3],    %[ftmp3],       %[ftmp3]                \n\t"
-        "paddh     %[ftmp1],    %[ftmp1],       %[ftmp7]                \n\t"
-        "psubh     %[ftmp3],    %[ftmp3],       %[ftmp6]                \n\t"
-        "paddh     %[ftmp7],    %[ftmp7],       %[ftmp7]                \n\t"
-        "sdc1      %[ftmp3],    0x00($29)                               \n\t"
-        "psubh     %[ftmp7],    %[ftmp7],       %[ftmp1]                \n\t"
-        "sdc1      %[ftmp0],    0x10($29)                               \n\t"
-        "dmfc1     %[tmp1],     %[ftmp2]                                \n\t"
-        "xor       %[ftmp2],    %[ftmp2],       %[ftmp2]                \n\t"
-        "sdc1      %[ftmp2],    0x00(%[block])                          \n\t"
-        "sdc1      %[ftmp2],    0x08(%[block])                          \n\t"
-        "sdc1      %[ftmp2],    0x10(%[block])                          \n\t"
-        "sdc1      %[ftmp2],    0x18(%[block])                          \n\t"
-        "sdc1      %[ftmp2],    0x20(%[block])                          \n\t"
-        "sdc1      %[ftmp2],    0x28(%[block])                          \n\t"
-        "sdc1      %[ftmp2],    0x30(%[block])                          \n\t"
-        "sdc1      %[ftmp2],    0x38(%[block])                          \n\t"
-        "sdc1      %[ftmp2],    0x40(%[block])                          \n\t"
-        "sdc1      %[ftmp2],    0x48(%[block])                          \n\t"
-        "sdc1      %[ftmp2],    0x50(%[block])                          \n\t"
-        "sdc1      %[ftmp2],    0x58(%[block])                          \n\t"
-        "sdc1      %[ftmp2],    0x60(%[block])                          \n\t"
-        "sdc1      %[ftmp2],    0x68(%[block])                          \n\t"
-        "sdc1      %[ftmp2],    0x70(%[block])                          \n\t"
-        "sdc1      %[ftmp2],    0x78(%[block])                          \n\t"
-        "dli       %[tmp3],     0x06                                    \n\t"
-        "uld       %[low32],    0x00(%[dst])                            \n\t"
-        "mtc1      %[low32],    %[ftmp3]                                \n\t"
-        "mtc1      %[tmp3],     %[ftmp10]                               \n\t"
-        "gslwxc1   %[ftmp0],    0x00(%[dst],    %[stride])              \n\t"
-        "psrah     %[ftmp5],    %[ftmp5],       %[ftmp10]               \n\t"
-        "psrah     %[ftmp4],    %[ftmp4],       %[ftmp10]               \n\t"
-        "punpcklbh %[ftmp3],    %[ftmp3],       %[ftmp2]                \n\t"
-        "punpcklbh %[ftmp0],    %[ftmp0],       %[ftmp2]                \n\t"
-        "paddh     %[ftmp3],    %[ftmp3],       %[ftmp5]                \n\t"
-        "paddh     %[ftmp0],    %[ftmp0],       %[ftmp4]                \n\t"
-        "packushb  %[ftmp3],    %[ftmp3],       %[ftmp2]                \n\t"
-        "packushb  %[ftmp0],    %[ftmp0],       %[ftmp2]                \n\t"
-        "gsswlc1   %[ftmp3],    0x03(%[dst])                            \n\t"
-        "gsswrc1   %[ftmp3],    0x00(%[dst])                            \n\t"
-        "gsswxc1   %[ftmp0],    0x00(%[dst],    %[stride])              \n\t"
-        PTR_ADDU  "%[dst],      %[dst],         %[stride]               \n\t"
-        PTR_ADDU  "%[dst],      %[dst],         %[stride]               \n\t"
-        "uld       %[low32],    0x00(%[dst])                            \n\t"
-        "mtc1      %[low32],    %[ftmp3]                                \n\t"
-        "gslwxc1   %[ftmp0],    0x00(%[dst],    %[stride])              \n\t"
-        "psrah     %[ftmp6],    %[ftmp6],       %[ftmp10]               \n\t"
-        "psrah     %[ftmp1],    %[ftmp1],       %[ftmp10]               \n\t"
-        "punpcklbh %[ftmp3],    %[ftmp3],       %[ftmp2]                \n\t"
-        "punpcklbh %[ftmp0],    %[ftmp0],       %[ftmp2]                \n\t"
-        "paddh     %[ftmp3],    %[ftmp3],       %[ftmp6]                \n\t"
-        "paddh     %[ftmp0],    %[ftmp0],       %[ftmp1]                \n\t"
-        "packushb  %[ftmp3],    %[ftmp3],       %[ftmp2]                \n\t"
-        "packushb  %[ftmp0],    %[ftmp0],       %[ftmp2]                \n\t"
-        "gsswlc1   %[ftmp3],    0x03(%[dst])                            \n\t"
-        "gsswrc1   %[ftmp3],    0x00(%[dst])                            \n\t"
-        "gsswxc1   %[ftmp0],    0x00(%[dst],    %[stride])              \n\t"
-        "ldc1      %[ftmp5],    0x00($29)                               \n\t"
-        "ldc1      %[ftmp4],    0x10($29)                               \n\t"
-        "dmtc1     %[tmp1],     %[ftmp6]                                \n\t"
-        PTR_ADDU  "%[dst],      %[dst],         %[stride]               \n\t"
-        PTR_ADDU  "%[dst],      %[dst],         %[stride]               \n\t"
-        "uld       %[low32],    0x00(%[dst])                            \n\t"
-        "mtc1      %[low32],    %[ftmp3]                                \n\t"
-        "gslwxc1   %[ftmp0],    0x00(%[dst],    %[stride])              \n\t"
-        "psrah     %[ftmp7],    %[ftmp7],       %[ftmp10]               \n\t"
-        "psrah     %[ftmp5],    %[ftmp5],       %[ftmp10]               \n\t"
-        "punpcklbh %[ftmp3],    %[ftmp3],       %[ftmp2]                \n\t"
-        "punpcklbh %[ftmp0],    %[ftmp0],       %[ftmp2]                \n\t"
-        "paddh     %[ftmp3],    %[ftmp3],       %[ftmp7]                \n\t"
-        "paddh     %[ftmp0],    %[ftmp0],       %[ftmp5]                \n\t"
-        "packushb  %[ftmp3],    %[ftmp3],       %[ftmp2]                \n\t"
-        "packushb  %[ftmp0],    %[ftmp0],       %[ftmp2]                \n\t"
-        "gsswlc1   %[ftmp3],    0x03(%[dst])                            \n\t"
-        "gsswrc1   %[ftmp3],    0x00(%[dst])                            \n\t"
-        "gsswxc1   %[ftmp0],    0x00(%[dst],    %[stride])              \n\t"
-        PTR_ADDU  "%[dst],      %[dst],         %[stride]               \n\t"
-        PTR_ADDU  "%[dst],      %[dst],         %[stride]               \n\t"
-        "uld       %[low32],    0x00(%[dst])                            \n\t"
-        "mtc1      %[low32],    %[ftmp3]                                \n\t"
-        "gslwxc1   %[ftmp0],    0x00(%[dst],    %[stride])              \n\t"
-        "psrah     %[ftmp4],    %[ftmp4],       %[ftmp10]               \n\t"
-        "psrah     %[ftmp6],    %[ftmp6],       %[ftmp10]               \n\t"
-        "punpcklbh %[ftmp3],    %[ftmp3],       %[ftmp2]                \n\t"
-        "punpcklbh %[ftmp0],    %[ftmp0],       %[ftmp2]                \n\t"
-        "paddh     %[ftmp3],    %[ftmp3],       %[ftmp4]                \n\t"
-        "paddh     %[ftmp0],    %[ftmp0],       %[ftmp6]                \n\t"
-        "packushb  %[ftmp3],    %[ftmp3],       %[ftmp2]                \n\t"
-        "packushb  %[ftmp0],    %[ftmp0],       %[ftmp2]                \n\t"
-        "gsswlc1   %[ftmp3],    0x03(%[dst])                            \n\t"
-        "gsswrc1   %[ftmp3],    0x00(%[dst])                            \n\t"
-        "gsswxc1   %[ftmp0],    0x00(%[dst],    %[stride])              \n\t"
-        "dmtc1     %[tmp4],     %[ftmp1]                                \n\t"
-        "dmtc1     %[tmp2],     %[ftmp6]                                \n\t"
-        "ldc1      %[ftmp4],    0x18($29)                               \n\t"
-        "mov.d     %[ftmp5],    %[ftmp4]                                \n\t"
-        "psrah     %[ftmp4],    %[ftmp4],       %[ftmp8]                \n\t"
-        "psrah     %[ftmp7],    %[ftmp11],      %[ftmp8]                \n\t"
-        "paddh     %[ftmp7],    %[ftmp7],       %[ftmp11]               \n\t"
-        "paddh     %[ftmp4],    %[ftmp4],       %[ftmp5]                \n\t"
-        "paddh     %[ftmp7],    %[ftmp7],       %[ftmp15]               \n\t"
-        "paddh     %[ftmp4],    %[ftmp4],       %[ftmp11]               \n\t"
-        "psubh     %[ftmp7],    %[ftmp7],       %[ftmp5]                \n\t"
-        "paddh     %[ftmp4],    %[ftmp4],       %[ftmp1]                \n\t"
-        "psubh     %[ftmp5],    %[ftmp5],       %[ftmp1]                \n\t"
-        "psubh     %[ftmp3],    %[ftmp11],      %[ftmp1]                \n\t"
-        "psrah     %[ftmp1],    %[ftmp1],       %[ftmp8]                \n\t"
-        "paddh     %[ftmp5],    %[ftmp5],       %[ftmp15]               \n\t"
-        "psubh     %[ftmp3],    %[ftmp3],       %[ftmp15]               \n\t"
-        "psrah     %[ftmp2],    %[ftmp15],      %[ftmp8]                \n\t"
-        "psubh     %[ftmp5],    %[ftmp5],       %[ftmp1]                \n\t"
-        "psubh     %[ftmp3],    %[ftmp3],       %[ftmp2]                \n\t"
-        "mov.d     %[ftmp2],    %[ftmp4]                                \n\t"
-        "psrah     %[ftmp4],    %[ftmp4],       %[ftmp9]                \n\t"
-        "psrah     %[ftmp1],    %[ftmp7],       %[ftmp9]                \n\t"
-        "paddh     %[ftmp4],    %[ftmp4],       %[ftmp3]                \n\t"
-        "paddh     %[ftmp1],    %[ftmp1],       %[ftmp5]                \n\t"
-        "psrah     %[ftmp5],    %[ftmp5],       %[ftmp9]                \n\t"
-        "psrah     %[ftmp3],    %[ftmp3],       %[ftmp9]                \n\t"
-        "psubh     %[ftmp5],    %[ftmp5],       %[ftmp7]                \n\t"
-        "psubh     %[ftmp2],    %[ftmp2],       %[ftmp3]                \n\t"
-        "mov.d     %[ftmp3],    %[ftmp13]                               \n\t"
-        "psrah     %[ftmp0],    %[ftmp13],      %[ftmp8]                \n\t"
-        "psrah     %[ftmp7],    %[ftmp6],       %[ftmp8]                \n\t"
-        "paddh     %[ftmp0],    %[ftmp0],       %[ftmp6]                \n\t"
-        "psubh     %[ftmp7],    %[ftmp7],       %[ftmp3]                \n\t"
-        "ldc1      %[ftmp6],    0x08($29)                               \n\t"
-        "dmtc1     %[tmp6],     %[ftmp3]                                \n\t"
-        "paddh     %[ftmp3],    %[ftmp3],       %[ftmp6]                \n\t"
-        "paddh     %[ftmp6],    %[ftmp6],       %[ftmp6]                \n\t"
-        "paddh     %[ftmp0],    %[ftmp0],       %[ftmp3]                \n\t"
-        "psubh     %[ftmp6],    %[ftmp6],       %[ftmp3]                \n\t"
-        "paddh     %[ftmp3],    %[ftmp3],       %[ftmp3]                \n\t"
-        "paddh     %[ftmp7],    %[ftmp7],       %[ftmp6]                \n\t"
-        "psubh     %[ftmp3],    %[ftmp3],       %[ftmp0]                \n\t"
-        "paddh     %[ftmp6],    %[ftmp6],       %[ftmp6]                \n\t"
-        "paddh     %[ftmp2],    %[ftmp2],       %[ftmp0]                \n\t"
-        "psubh     %[ftmp6],    %[ftmp6],       %[ftmp7]                \n\t"
-        "paddh     %[ftmp0],    %[ftmp0],       %[ftmp0]                \n\t"
-        "paddh     %[ftmp5],    %[ftmp5],       %[ftmp7]                \n\t"
-        "psubh     %[ftmp0],    %[ftmp0],       %[ftmp2]                \n\t"
-        "paddh     %[ftmp7],    %[ftmp7],       %[ftmp7]                \n\t"
-        "paddh     %[ftmp1],    %[ftmp1],       %[ftmp6]                \n\t"
-        "psubh     %[ftmp7],    %[ftmp7],       %[ftmp5]                \n\t"
-        "paddh     %[ftmp6],    %[ftmp6],       %[ftmp6]                \n\t"
-        "paddh     %[ftmp4],    %[ftmp4],       %[ftmp3]                \n\t"
-        "psubh     %[ftmp6],    %[ftmp6],       %[ftmp1]                \n\t"
-        "paddh     %[ftmp3],    %[ftmp3],       %[ftmp3]                \n\t"
-        "sdc1      %[ftmp6],    0x08($29)                               \n\t"
-        "psubh     %[ftmp3],    %[ftmp3],       %[ftmp4]                \n\t"
-        "sdc1      %[ftmp7],    0x18($29)                               \n\t"
-        "dmfc1     %[tmp2],     %[ftmp0]                                \n\t"
-        "xor       %[ftmp0],    %[ftmp0],       %[ftmp0]                \n\t"
-        "uld       %[low32],    0x00(%[addr0])                          \n\t"
-        "mtc1      %[low32],    %[ftmp6]                                \n\t"
-        "gslwxc1   %[ftmp7],    0x00(%[addr0],  %[stride])              \n\t"
-        "psrah     %[ftmp2],    %[ftmp2],       %[ftmp10]               \n\t"
-        "psrah     %[ftmp5],    %[ftmp5],       %[ftmp10]               \n\t"
-        "punpcklbh %[ftmp6],    %[ftmp6],       %[ftmp0]                \n\t"
-        "punpcklbh %[ftmp7],    %[ftmp7],       %[ftmp0]                \n\t"
-        "paddh     %[ftmp6],    %[ftmp6],       %[ftmp2]                \n\t"
-        "paddh     %[ftmp7],    %[ftmp7],       %[ftmp5]                \n\t"
-        "packushb  %[ftmp6],    %[ftmp6],       %[ftmp0]                \n\t"
-        "packushb  %[ftmp7],    %[ftmp7],       %[ftmp0]                \n\t"
-        "gsswlc1   %[ftmp6],    0x03(%[addr0])                          \n\t"
-        "gsswrc1   %[ftmp6],    0x00(%[addr0])                          \n\t"
-        "gsswxc1   %[ftmp7],    0x00(%[addr0],  %[stride])              \n\t"
-        PTR_ADDU  "%[addr0],    %[addr0],       %[stride]               \n\t"
-        PTR_ADDU  "%[addr0],    %[addr0],       %[stride]               \n\t"
-        "uld       %[low32],    0x00(%[addr0])                          \n\t"
-        "mtc1      %[low32],    %[ftmp6]                                \n\t"
-        "gslwxc1   %[ftmp7],    0x00(%[addr0],  %[stride])              \n\t"
-        "psrah     %[ftmp1],    %[ftmp1],       %[ftmp10]               \n\t"
-        "psrah     %[ftmp4],    %[ftmp4],       %[ftmp10]               \n\t"
-        "punpcklbh %[ftmp6],    %[ftmp6],       %[ftmp0]                \n\t"
-        "punpcklbh %[ftmp7],    %[ftmp7],       %[ftmp0]                \n\t"
-        "paddh     %[ftmp6],    %[ftmp6],       %[ftmp1]                \n\t"
-        "paddh     %[ftmp7],    %[ftmp7],       %[ftmp4]                \n\t"
-        "packushb  %[ftmp6],    %[ftmp6],       %[ftmp0]                \n\t"
-        "packushb  %[ftmp7],    %[ftmp7],       %[ftmp0]                \n\t"
-        "gsswlc1   %[ftmp6],    0x03(%[addr0])                          \n\t"
-        "gsswrc1   %[ftmp6],    0x00(%[addr0])                          \n\t"
-        "gsswxc1   %[ftmp7],    0x00(%[addr0],  %[stride])              \n\t"
-        "ldc1      %[ftmp2],    0x08($29)                               \n\t"
-        "ldc1      %[ftmp5],    0x18($29)                               \n\t"
-        PTR_ADDU  "%[addr0],    %[addr0],       %[stride]               \n\t"
-        "dmtc1     %[tmp2],     %[ftmp1]                                \n\t"
-        PTR_ADDU  "%[addr0],    %[addr0],       %[stride]               \n\t"
-        "uld       %[low32],    0x00(%[addr0])                          \n\t"
-        "mtc1      %[low32],    %[ftmp6]                                \n\t"
-        "gslwxc1   %[ftmp7],    0x00(%[addr0],  %[stride])              \n\t"
-        "psrah     %[ftmp3],    %[ftmp3],       %[ftmp10]               \n\t"
-        "psrah     %[ftmp2],    %[ftmp2],       %[ftmp10]               \n\t"
-        "punpcklbh %[ftmp6],    %[ftmp6],       %[ftmp0]                \n\t"
-        "punpcklbh %[ftmp7],    %[ftmp7],       %[ftmp0]                \n\t"
-        "paddh     %[ftmp6],    %[ftmp6],       %[ftmp3]                \n\t"
-        "paddh     %[ftmp7],    %[ftmp7],       %[ftmp2]                \n\t"
-        "packushb  %[ftmp6],    %[ftmp6],       %[ftmp0]                \n\t"
-        "packushb  %[ftmp7],    %[ftmp7],       %[ftmp0]                \n\t"
-        "gsswlc1   %[ftmp6],    0x03(%[addr0])                          \n\t"
-        "gsswrc1   %[ftmp6],    0x00(%[addr0])                          \n\t"
-        "gsswxc1   %[ftmp7],    0x00(%[addr0],  %[stride])              \n\t"
-        PTR_ADDU  "%[addr0],    %[addr0],       %[stride]               \n\t"
-        PTR_ADDU  "%[addr0],    %[addr0],       %[stride]               \n\t"
-        "uld       %[low32],    0x00(%[addr0])                          \n\t"
-        "mtc1      %[low32],    %[ftmp6]                                \n\t"
-        "gslwxc1   %[ftmp7],    0x00(%[addr0],  %[stride])              \n\t"
-        "psrah     %[ftmp5],    %[ftmp5],       %[ftmp10]               \n\t"
-        "psrah     %[ftmp1],    %[ftmp1],       %[ftmp10]               \n\t"
-        "punpcklbh %[ftmp6],    %[ftmp6],       %[ftmp0]                \n\t"
-        "punpcklbh %[ftmp7],    %[ftmp7],       %[ftmp0]                \n\t"
-        "paddh     %[ftmp6],    %[ftmp6],       %[ftmp5]                \n\t"
-        "paddh     %[ftmp7],    %[ftmp7],       %[ftmp1]                \n\t"
-        "packushb  %[ftmp6],    %[ftmp6],       %[ftmp0]                \n\t"
-        "packushb  %[ftmp7],    %[ftmp7],       %[ftmp0]                \n\t"
-        "gsswlc1   %[ftmp6],    0x03(%[addr0])                          \n\t"
-        "gsswrc1   %[ftmp6],    0x00(%[addr0])                          \n\t"
-        "gsswxc1   %[ftmp7],    0x00(%[addr0],  %[stride])              \n\t"
-        PTR_ADDIU "$29,         $29,            0x20                    \n\t"
+        "lhu        %[tmp0],    0x00(%[block])                          \n\t"
+        PTR_ADDI   "$29,        $29,            -0x20                   \n\t"
+        PTR_ADDIU  "%[tmp0],    %[tmp0],        0x20                    \n\t"
+        MMI_LDC1(%[ftmp1], %[block], 0x10)
+        "sh         %[tmp0],    0x00(%[block])                          \n\t"
+        MMI_LDC1(%[ftmp2], %[block], 0x20)
+        "dli        %[tmp0],    0x01                                    \n\t"
+        MMI_LDC1(%[ftmp3], %[block], 0x30)
+        "mtc1       %[tmp0],    %[ftmp8]                                \n\t"
+        MMI_LDC1(%[ftmp5], %[block], 0x50)
+        MMI_LDC1(%[ftmp6], %[block], 0x60)
+        MMI_LDC1(%[ftmp7], %[block], 0x70)
+        "mov.d      %[ftmp0],   %[ftmp1]                                \n\t"
+        "psrah      %[ftmp1],   %[ftmp1],       %[ftmp8]                \n\t"
+        "psrah      %[ftmp4],   %[ftmp5],       %[ftmp8]                \n\t"
+        "paddh      %[ftmp1],   %[ftmp1],       %[ftmp0]                \n\t"
+        "paddh      %[ftmp4],   %[ftmp4],       %[ftmp5]                \n\t"
+        "paddh      %[ftmp1],   %[ftmp1],       %[ftmp5]                \n\t"
+        "paddh      %[ftmp4],   %[ftmp4],       %[ftmp7]                \n\t"
+        "paddh      %[ftmp1],   %[ftmp1],       %[ftmp3]                \n\t"
+        "psubh      %[ftmp4],   %[ftmp4],       %[ftmp0]                \n\t"
+        "psubh      %[ftmp0],   %[ftmp0],       %[ftmp3]                \n\t"
+        "psubh      %[ftmp5],   %[ftmp5],       %[ftmp3]                \n\t"
+        "psrah      %[ftmp3],   %[ftmp3],       %[ftmp8]                \n\t"
+        "paddh      %[ftmp0],   %[ftmp0],       %[ftmp7]                \n\t"
+        "psubh      %[ftmp5],   %[ftmp5],       %[ftmp7]                \n\t"
+        "psrah      %[ftmp7],   %[ftmp7],       %[ftmp8]                \n\t"
+        "psubh      %[ftmp0],   %[ftmp0],       %[ftmp3]                \n\t"
+        "dli        %[tmp0],    0x02                                    \n\t"
+        "psubh      %[ftmp5],   %[ftmp5],       %[ftmp7]                \n\t"
+        "mtc1       %[tmp0],    %[ftmp9]                                \n\t"
+        "mov.d      %[ftmp7],   %[ftmp1]                                \n\t"
+        "psrah      %[ftmp1],   %[ftmp1],       %[ftmp9]                \n\t"
+        "psrah      %[ftmp3],   %[ftmp4],       %[ftmp9]                \n\t"
+        "paddh      %[ftmp3],   %[ftmp3],       %[ftmp0]                \n\t"
+        "psrah      %[ftmp0],   %[ftmp0],       %[ftmp9]                \n\t"
+        "paddh      %[ftmp1],   %[ftmp1],       %[ftmp5]                \n\t"
+        "psrah      %[ftmp5],   %[ftmp5],       %[ftmp9]                \n\t"
+        "psubh      %[ftmp0],   %[ftmp0],       %[ftmp4]                \n\t"
+        "psubh      %[ftmp7],   %[ftmp7],       %[ftmp5]                \n\t"
+        "mov.d      %[ftmp5],   %[ftmp6]                                \n\t"
+        "psrah      %[ftmp6],   %[ftmp6],       %[ftmp8]                \n\t"
+        "psrah      %[ftmp4],   %[ftmp2],       %[ftmp8]                \n\t"
+        "paddh      %[ftmp6],   %[ftmp6],       %[ftmp2]                \n\t"
+        "psubh      %[ftmp4],   %[ftmp4],       %[ftmp5]                \n\t"
+        MMI_LDC1(%[ftmp2], %[block], 0x00)
+        MMI_LDC1(%[ftmp5], %[block], 0x40)
+        "paddh      %[ftmp5],   %[ftmp5],       %[ftmp2]                \n\t"
+        "paddh      %[ftmp2],   %[ftmp2],       %[ftmp2]                \n\t"
+        "paddh      %[ftmp6],   %[ftmp6],       %[ftmp5]                \n\t"
+        "psubh      %[ftmp2],   %[ftmp2],       %[ftmp5]                \n\t"
+        "paddh      %[ftmp5],   %[ftmp5],       %[ftmp5]                \n\t"
+        "paddh      %[ftmp4],   %[ftmp4],       %[ftmp2]                \n\t"
+        "psubh      %[ftmp5],   %[ftmp5],       %[ftmp6]                \n\t"
+        "paddh      %[ftmp2],   %[ftmp2],       %[ftmp2]                \n\t"
+        "paddh      %[ftmp7],   %[ftmp7],       %[ftmp6]                \n\t"
+        "psubh      %[ftmp2],   %[ftmp2],       %[ftmp4]                \n\t"
+        "paddh      %[ftmp6],   %[ftmp6],       %[ftmp6]                \n\t"
+        "paddh      %[ftmp0],   %[ftmp0],       %[ftmp4]                \n\t"
+        "psubh      %[ftmp6],   %[ftmp6],       %[ftmp7]                \n\t"
+        "paddh      %[ftmp4],   %[ftmp4],       %[ftmp4]                \n\t"
+        "paddh      %[ftmp3],   %[ftmp3],       %[ftmp2]                \n\t"
+        "psubh      %[ftmp4],   %[ftmp4],       %[ftmp0]                \n\t"
+        "paddh      %[ftmp2],   %[ftmp2],       %[ftmp2]                \n\t"
+        "paddh      %[ftmp1],   %[ftmp1],       %[ftmp5]                \n\t"
+        "psubh      %[ftmp2],   %[ftmp2],       %[ftmp3]                \n\t"
+        "paddh      %[ftmp5],   %[ftmp5],       %[ftmp5]                \n\t"
+        MMI_SDC1(%[ftmp6], %[block], 0x00)
+        "psubh      %[ftmp5],   %[ftmp5],       %[ftmp1]                \n\t"
+        "punpckhhw  %[ftmp6],   %[ftmp7],       %[ftmp0]                \n\t"
+        "punpcklhw  %[ftmp7],   %[ftmp7],       %[ftmp0]                \n\t"
+        "punpckhhw  %[ftmp0],   %[ftmp3],       %[ftmp1]                \n\t"
+        "punpcklhw  %[ftmp3],   %[ftmp3],       %[ftmp1]                \n\t"
+        "punpckhwd  %[ftmp1],   %[ftmp7],       %[ftmp3]                \n\t"
+        "punpcklwd  %[ftmp7],   %[ftmp7],       %[ftmp3]                \n\t"
+        "punpckhwd  %[ftmp3],   %[ftmp6],       %[ftmp0]                \n\t"
+        "punpcklwd  %[ftmp6],   %[ftmp6],       %[ftmp0]                \n\t"
+        MMI_LDC1(%[ftmp0], %[block], 0x00)
+        MMI_SDC1(%[ftmp7], $29, 0x00)
+        MMI_SDC1(%[ftmp1], $29, 0x10)
+        "dmfc1      %[tmp1],    %[ftmp6]                                \n\t"
+        "dmfc1      %[tmp3],    %[ftmp3]                                \n\t"
+        "punpckhhw  %[ftmp3],   %[ftmp5],       %[ftmp2]                \n\t"
+        "punpcklhw  %[ftmp5],   %[ftmp5],       %[ftmp2]                \n\t"
+        "punpckhhw  %[ftmp2],   %[ftmp4],       %[ftmp0]                \n\t"
+        "punpcklhw  %[ftmp4],   %[ftmp4],       %[ftmp0]                \n\t"
+        "punpckhwd  %[ftmp0],   %[ftmp5],       %[ftmp4]                \n\t"
+        "punpcklwd  %[ftmp5],   %[ftmp5],       %[ftmp4]                \n\t"
+        "punpckhwd  %[ftmp4],   %[ftmp3],       %[ftmp2]                \n\t"
+        "punpcklwd  %[ftmp3],   %[ftmp3],       %[ftmp2]                \n\t"
+        MMI_SDC1(%[ftmp5], $29, 0x08)
+        MMI_SDC1(%[ftmp0], $29, 0x18)
+        "dmfc1      %[tmp2],    %[ftmp3]                                \n\t"
+        "dmfc1      %[tmp4],    %[ftmp4]                                \n\t"
+        MMI_LDC1(%[ftmp1], %[block], 0x18)
+        MMI_LDC1(%[ftmp6], %[block], 0x28)
+        MMI_LDC1(%[ftmp2], %[block], 0x38)
+        MMI_LDC1(%[ftmp0], %[block], 0x58)
+        MMI_LDC1(%[ftmp3], %[block], 0x68)
+        MMI_LDC1(%[ftmp4], %[block], 0x78)
+        "mov.d      %[ftmp7],   %[ftmp1]                                \n\t"
+        "psrah      %[ftmp5],   %[ftmp0],       %[ftmp8]                \n\t"
+        "psrah      %[ftmp1],   %[ftmp1],       %[ftmp8]                \n\t"
+        "paddh      %[ftmp5],   %[ftmp5],       %[ftmp0]                \n\t"
+        "paddh      %[ftmp1],   %[ftmp1],       %[ftmp7]                \n\t"
+        "paddh      %[ftmp5],   %[ftmp5],       %[ftmp4]                \n\t"
+        "paddh      %[ftmp1],   %[ftmp1],       %[ftmp0]                \n\t"
+        "psubh      %[ftmp5],   %[ftmp5],       %[ftmp7]                \n\t"
+        "paddh      %[ftmp1],   %[ftmp1],       %[ftmp2]                \n\t"
+        "psubh      %[ftmp7],   %[ftmp7],       %[ftmp2]                \n\t"
+        "psubh      %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
+        "psrah      %[ftmp2],   %[ftmp2],       %[ftmp8]                \n\t"
+        "paddh      %[ftmp7],   %[ftmp7],       %[ftmp4]                \n\t"
+        "psubh      %[ftmp0],   %[ftmp0],       %[ftmp4]                \n\t"
+        "psrah      %[ftmp4],   %[ftmp4],       %[ftmp8]                \n\t"
+        "psubh      %[ftmp7],   %[ftmp7],       %[ftmp2]                \n\t"
+        "psubh      %[ftmp0],   %[ftmp0],       %[ftmp4]                \n\t"
+        "mov.d      %[ftmp4],   %[ftmp1]                                \n\t"
+        "psrah      %[ftmp2],   %[ftmp5],       %[ftmp9]                \n\t"
+        "psrah      %[ftmp1],   %[ftmp1],       %[ftmp9]                \n\t"
+        "paddh      %[ftmp2],   %[ftmp2],       %[ftmp7]                \n\t"
+        "psrah      %[ftmp7],   %[ftmp7],       %[ftmp9]                \n\t"
+        "paddh      %[ftmp1],   %[ftmp1],       %[ftmp0]                \n\t"
+        "psrah      %[ftmp0],   %[ftmp0],       %[ftmp9]                \n\t"
+        "psubh      %[ftmp7],   %[ftmp7],       %[ftmp5]                \n\t"
+        "psubh      %[ftmp4],   %[ftmp4],       %[ftmp0]                \n\t"
+        "mov.d      %[ftmp0],   %[ftmp3]                                \n\t"
+        "psrah      %[ftmp3],   %[ftmp3],       %[ftmp8]                \n\t"
+        "psrah      %[ftmp5],   %[ftmp6],       %[ftmp8]                \n\t"
+        "paddh      %[ftmp3],   %[ftmp3],       %[ftmp6]                \n\t"
+        "psubh      %[ftmp5],   %[ftmp5],       %[ftmp0]                \n\t"
+        MMI_LDC1(%[ftmp6], %[block], 0x08)
+        MMI_LDC1(%[ftmp0], %[block], 0x48)
+        "paddh      %[ftmp0],   %[ftmp0],       %[ftmp6]                \n\t"
+        "paddh      %[ftmp6],   %[ftmp6],       %[ftmp6]                \n\t"
+        "paddh      %[ftmp3],   %[ftmp3],       %[ftmp0]                \n\t"
+        "psubh      %[ftmp6],   %[ftmp6],       %[ftmp0]                \n\t"
+        "paddh      %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
+        "paddh      %[ftmp5],   %[ftmp5],       %[ftmp6]                \n\t"
+        "psubh      %[ftmp0],   %[ftmp0],       %[ftmp3]                \n\t"
+        "paddh      %[ftmp6],   %[ftmp6],       %[ftmp6]                \n\t"
+        "paddh      %[ftmp4],   %[ftmp4],       %[ftmp3]                \n\t"
+        "psubh      %[ftmp6],   %[ftmp6],       %[ftmp5]                \n\t"
+        "paddh      %[ftmp3],   %[ftmp3],       %[ftmp3]                \n\t"
+        "paddh      %[ftmp7],   %[ftmp7],       %[ftmp5]                \n\t"
+        "psubh      %[ftmp3],   %[ftmp3],       %[ftmp4]                \n\t"
+        "paddh      %[ftmp5],   %[ftmp5],       %[ftmp5]                \n\t"
+        "paddh      %[ftmp2],   %[ftmp2],       %[ftmp6]                \n\t"
+        "psubh      %[ftmp5],   %[ftmp5],       %[ftmp7]                \n\t"
+        "paddh      %[ftmp6],   %[ftmp6],       %[ftmp6]                \n\t"
+        "paddh      %[ftmp1],   %[ftmp1],       %[ftmp0]                \n\t"
+        "psubh      %[ftmp6],   %[ftmp6],       %[ftmp2]                \n\t"
+        "paddh      %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
+        MMI_SDC1(%[ftmp3], %[block], 0x08)
+        "psubh      %[ftmp0],   %[ftmp0],       %[ftmp1]                \n\t"
+        "punpckhhw  %[ftmp3],   %[ftmp4],       %[ftmp7]                \n\t"
+        "punpcklhw  %[ftmp4],   %[ftmp4],       %[ftmp7]                \n\t"
+        "punpckhhw  %[ftmp7],   %[ftmp2],       %[ftmp1]                \n\t"
+        "punpcklhw  %[ftmp2],   %[ftmp2],       %[ftmp1]                \n\t"
+        "punpckhwd  %[ftmp1],   %[ftmp4],       %[ftmp2]                \n\t"
+        "punpcklwd  %[ftmp4],   %[ftmp4],       %[ftmp2]                \n\t"
+        "punpckhwd  %[ftmp2],   %[ftmp3],       %[ftmp7]                \n\t"
+        "punpcklwd  %[ftmp3],   %[ftmp3],       %[ftmp7]                \n\t"
+        MMI_LDC1(%[ftmp7], %[block], 0x08)
+        "dmfc1      %[tmp5],    %[ftmp4]                                \n\t"
+        "mov.d      %[ftmp10],  %[ftmp1]                                \n\t"
+        "mov.d      %[ftmp12],  %[ftmp3]                                \n\t"
+        "mov.d      %[ftmp14],  %[ftmp2]                                \n\t"
+        "punpckhhw  %[ftmp2],   %[ftmp0],       %[ftmp6]                \n\t"
+        "punpcklhw  %[ftmp0],   %[ftmp0],       %[ftmp6]                \n\t"
+        "punpckhhw  %[ftmp6],   %[ftmp5],       %[ftmp7]                \n\t"
+        "punpcklhw  %[ftmp5],   %[ftmp5],       %[ftmp7]                \n\t"
+        "punpckhwd  %[ftmp7],   %[ftmp0],       %[ftmp5]                \n\t"
+        "punpcklwd  %[ftmp0],   %[ftmp0],       %[ftmp5]                \n\t"
+        "punpckhwd  %[ftmp5],   %[ftmp2],       %[ftmp6]                \n\t"
+        "punpcklwd  %[ftmp2],   %[ftmp2],       %[ftmp6]                \n\t"
+        "dmfc1      %[tmp6],    %[ftmp0]                                \n\t"
+        "mov.d      %[ftmp11],  %[ftmp7]                                \n\t"
+        "mov.d      %[ftmp13],  %[ftmp2]                                \n\t"
+        "mov.d      %[ftmp15],  %[ftmp5]                                \n\t"
+        PTR_ADDIU  "%[addr0],   %[dst],         0x04                    \n\t"
+        "mov.d      %[ftmp7],   %[ftmp10]                               \n\t"
+        "dmtc1      %[tmp3],    %[ftmp6]                                \n\t"
+        MMI_LDC1(%[ftmp1], $29, 0x10)
+        "dmtc1      %[tmp1],    %[ftmp3]                                \n\t"
+        "mov.d      %[ftmp4],   %[ftmp1]                                \n\t"
+        "psrah      %[ftmp1],   %[ftmp1],       %[ftmp8]                \n\t"
+        "psrah      %[ftmp0],   %[ftmp7],       %[ftmp8]                \n\t"
+        "paddh      %[ftmp1],   %[ftmp1],       %[ftmp4]                \n\t"
+        "paddh      %[ftmp0],   %[ftmp0],       %[ftmp7]                \n\t"
+        "paddh      %[ftmp1],   %[ftmp1],       %[ftmp7]                \n\t"
+        "paddh      %[ftmp0],   %[ftmp0],       %[ftmp14]               \n\t"
+        "paddh      %[ftmp1],   %[ftmp1],       %[ftmp6]                \n\t"
+        "psubh      %[ftmp0],   %[ftmp0],       %[ftmp4]                \n\t"
+        "psubh      %[ftmp4],   %[ftmp4],       %[ftmp6]                \n\t"
+        "psubh      %[ftmp7],   %[ftmp7],       %[ftmp6]                \n\t"
+        "psrah      %[ftmp6],   %[ftmp6],       %[ftmp8]                \n\t"
+        "paddh      %[ftmp4],   %[ftmp4],       %[ftmp14]               \n\t"
+        "psubh      %[ftmp7],   %[ftmp7],       %[ftmp14]               \n\t"
+        "psrah      %[ftmp5],   %[ftmp14],      %[ftmp8]                \n\t"
+        "psubh      %[ftmp4],   %[ftmp4],       %[ftmp6]                \n\t"
+        "psubh      %[ftmp7],   %[ftmp7],       %[ftmp5]                \n\t"
+        "mov.d      %[ftmp5],   %[ftmp1]                                \n\t"
+        "psrah      %[ftmp1],   %[ftmp1],       %[ftmp9]                \n\t"
+        "psrah      %[ftmp6],   %[ftmp0],       %[ftmp9]                \n\t"
+        "paddh      %[ftmp1],   %[ftmp1],       %[ftmp7]                \n\t"
+        "paddh      %[ftmp6],   %[ftmp6],       %[ftmp4]                \n\t"
+        "psrah      %[ftmp4],   %[ftmp4],       %[ftmp9]                \n\t"
+        "psrah      %[ftmp7],   %[ftmp7],       %[ftmp9]                \n\t"
+        "psubh      %[ftmp4],   %[ftmp4],       %[ftmp0]                \n\t"
+        "psubh      %[ftmp5],   %[ftmp5],       %[ftmp7]                \n\t"
+        "mov.d      %[ftmp7],   %[ftmp12]                               \n\t"
+        "psrah      %[ftmp2],   %[ftmp12],      %[ftmp8]                \n\t"
+        "psrah      %[ftmp0],   %[ftmp3],       %[ftmp8]                \n\t"
+        "paddh      %[ftmp2],   %[ftmp2],       %[ftmp3]                \n\t"
+        "psubh      %[ftmp0],   %[ftmp0],       %[ftmp7]                \n\t"
+        MMI_LDC1(%[ftmp3], $29, 0x00)
+        "dmtc1      %[tmp5],    %[ftmp7]                                \n\t"
+        "paddh      %[ftmp7],   %[ftmp7],       %[ftmp3]                \n\t"
+        "paddh      %[ftmp3],   %[ftmp3],       %[ftmp3]                \n\t"
+        "paddh      %[ftmp2],   %[ftmp2],       %[ftmp7]                \n\t"
+        "psubh      %[ftmp3],   %[ftmp3],       %[ftmp7]                \n\t"
+        "paddh      %[ftmp7],   %[ftmp7],       %[ftmp7]                \n\t"
+        "paddh      %[ftmp0],   %[ftmp0],       %[ftmp3]                \n\t"
+        "psubh      %[ftmp7],   %[ftmp7],       %[ftmp2]                \n\t"
+        "paddh      %[ftmp3],   %[ftmp3],       %[ftmp3]                \n\t"
+        "paddh      %[ftmp5],   %[ftmp5],       %[ftmp2]                \n\t"
+        "psubh      %[ftmp3],   %[ftmp3],       %[ftmp0]                \n\t"
+        "paddh      %[ftmp2],   %[ftmp2],       %[ftmp2]                \n\t"
+        "paddh      %[ftmp4],   %[ftmp4],       %[ftmp0]                \n\t"
+        "psubh      %[ftmp2],   %[ftmp2],       %[ftmp5]                \n\t"
+        "paddh      %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
+        "paddh      %[ftmp6],   %[ftmp6],       %[ftmp3]                \n\t"
+        "psubh      %[ftmp0],   %[ftmp0],       %[ftmp4]                \n\t"
+        "paddh      %[ftmp3],   %[ftmp3],       %[ftmp3]                \n\t"
+        "paddh      %[ftmp1],   %[ftmp1],       %[ftmp7]                \n\t"
+        "psubh      %[ftmp3],   %[ftmp3],       %[ftmp6]                \n\t"
+        "paddh      %[ftmp7],   %[ftmp7],       %[ftmp7]                \n\t"
+        MMI_SDC1(%[ftmp3], $29, 0x00)
+        "psubh      %[ftmp7],   %[ftmp7],       %[ftmp1]                \n\t"
+        MMI_SDC1(%[ftmp0], $29, 0x10)
+        "dmfc1      %[tmp1],    %[ftmp2]                                \n\t"
+        "xor        %[ftmp2],   %[ftmp2],       %[ftmp2]                \n\t"
+        MMI_SDC1(%[ftmp2], %[block], 0x00)
+        MMI_SDC1(%[ftmp2], %[block], 0x08)
+        MMI_SDC1(%[ftmp2], %[block], 0x10)
+        MMI_SDC1(%[ftmp2], %[block], 0x18)
+        MMI_SDC1(%[ftmp2], %[block], 0x20)
+        MMI_SDC1(%[ftmp2], %[block], 0x28)
+        MMI_SDC1(%[ftmp2], %[block], 0x30)
+        MMI_SDC1(%[ftmp2], %[block], 0x38)
+        MMI_SDC1(%[ftmp2], %[block], 0x40)
+        MMI_SDC1(%[ftmp2], %[block], 0x48)
+        MMI_SDC1(%[ftmp2], %[block], 0x50)
+        MMI_SDC1(%[ftmp2], %[block], 0x58)
+        MMI_SDC1(%[ftmp2], %[block], 0x60)
+        MMI_SDC1(%[ftmp2], %[block], 0x68)
+        MMI_SDC1(%[ftmp2], %[block], 0x70)
+        MMI_SDC1(%[ftmp2], %[block], 0x78)
+        "dli        %[tmp3],    0x06                                    \n\t"
+        "mtc1       %[tmp3],    %[ftmp10]                               \n\t"
+        MMI_ULWC1(%[ftmp3], %[dst], 0x00)
+        MMI_LWXC1(%[ftmp0], %[dst], %[stride], 0x00)
+        "psrah      %[ftmp5],   %[ftmp5],       %[ftmp10]               \n\t"
+        "psrah      %[ftmp4],   %[ftmp4],       %[ftmp10]               \n\t"
+        "punpcklbh  %[ftmp3],   %[ftmp3],       %[ftmp2]                \n\t"
+        "punpcklbh  %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
+        "paddh      %[ftmp3],   %[ftmp3],       %[ftmp5]                \n\t"
+        "paddh      %[ftmp0],   %[ftmp0],       %[ftmp4]                \n\t"
+        "packushb   %[ftmp3],   %[ftmp3],       %[ftmp2]                \n\t"
+        "packushb   %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
+        MMI_SWC1(%[ftmp3], %[dst], 0x00)
+        MMI_SWXC1(%[ftmp0], %[dst], %[stride], 0x00)
+        PTR_ADDU   "%[dst],     %[dst],         %[stride]               \n\t"
+        PTR_ADDU   "%[dst],     %[dst],         %[stride]               \n\t"
+        MMI_ULWC1(%[ftmp3], %[dst], 0x00)
+        MMI_LWXC1(%[ftmp0], %[dst], %[stride], 0x00)
+        "psrah      %[ftmp6],   %[ftmp6],       %[ftmp10]               \n\t"
+        "psrah      %[ftmp1],   %[ftmp1],       %[ftmp10]               \n\t"
+        "punpcklbh  %[ftmp3],   %[ftmp3],       %[ftmp2]                \n\t"
+        "punpcklbh  %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
+        "paddh      %[ftmp3],   %[ftmp3],       %[ftmp6]                \n\t"
+        "paddh      %[ftmp0],   %[ftmp0],       %[ftmp1]                \n\t"
+        "packushb   %[ftmp3],   %[ftmp3],       %[ftmp2]                \n\t"
+        "packushb   %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
+        MMI_SWC1(%[ftmp3], %[dst], 0x00)
+        MMI_SWXC1(%[ftmp0], %[dst], %[stride], 0x00)
+        MMI_LDC1(%[ftmp5], $29, 0x00)
+        MMI_LDC1(%[ftmp4], $29, 0x10)
+        "dmtc1      %[tmp1],    %[ftmp6]                                \n\t"
+        PTR_ADDU   "%[dst],     %[dst],         %[stride]               \n\t"
+        PTR_ADDU   "%[dst],     %[dst],         %[stride]               \n\t"
+        MMI_ULWC1(%[ftmp3], %[dst], 0x00)
+        MMI_LWXC1(%[ftmp0], %[dst], %[stride], 0x00)
+        "psrah      %[ftmp7],   %[ftmp7],       %[ftmp10]               \n\t"
+        "psrah      %[ftmp5],   %[ftmp5],       %[ftmp10]               \n\t"
+        "punpcklbh  %[ftmp3],   %[ftmp3],       %[ftmp2]                \n\t"
+        "punpcklbh  %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
+        "paddh      %[ftmp3],   %[ftmp3],       %[ftmp7]                \n\t"
+        "paddh      %[ftmp0],   %[ftmp0],       %[ftmp5]                \n\t"
+        "packushb   %[ftmp3],   %[ftmp3],       %[ftmp2]                \n\t"
+        "packushb   %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
+        MMI_SWC1(%[ftmp3], %[dst], 0x00)
+        MMI_SWXC1(%[ftmp0], %[dst], %[stride], 0x00)
+        PTR_ADDU   "%[dst],     %[dst],         %[stride]               \n\t"
+        PTR_ADDU   "%[dst],     %[dst],         %[stride]               \n\t"
+        MMI_ULWC1(%[ftmp3], %[dst], 0x00)
+        MMI_LWXC1(%[ftmp0], %[dst], %[stride], 0x00)
+        "psrah      %[ftmp4],   %[ftmp4],       %[ftmp10]               \n\t"
+        "psrah      %[ftmp6],   %[ftmp6],       %[ftmp10]               \n\t"
+        "punpcklbh  %[ftmp3],   %[ftmp3],       %[ftmp2]                \n\t"
+        "punpcklbh  %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
+        "paddh      %[ftmp3],   %[ftmp3],       %[ftmp4]                \n\t"
+        "paddh      %[ftmp0],   %[ftmp0],       %[ftmp6]                \n\t"
+        "packushb   %[ftmp3],   %[ftmp3],       %[ftmp2]                \n\t"
+        "packushb   %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
+        MMI_SWC1(%[ftmp3], %[dst], 0x00)
+        MMI_SWXC1(%[ftmp0], %[dst], %[stride], 0x00)
+        "dmtc1      %[tmp4],    %[ftmp1]                                \n\t"
+        "dmtc1      %[tmp2],    %[ftmp6]                                \n\t"
+        MMI_LDC1(%[ftmp4], $29, 0x18)
+        "mov.d      %[ftmp5],   %[ftmp4]                                \n\t"
+        "psrah      %[ftmp4],   %[ftmp4],       %[ftmp8]                \n\t"
+        "psrah      %[ftmp7],   %[ftmp11],      %[ftmp8]                \n\t"
+        "paddh      %[ftmp7],   %[ftmp7],       %[ftmp11]               \n\t"
+        "paddh      %[ftmp4],   %[ftmp4],       %[ftmp5]                \n\t"
+        "paddh      %[ftmp7],   %[ftmp7],       %[ftmp15]               \n\t"
+        "paddh      %[ftmp4],   %[ftmp4],       %[ftmp11]               \n\t"
+        "psubh      %[ftmp7],   %[ftmp7],       %[ftmp5]                \n\t"
+        "paddh      %[ftmp4],   %[ftmp4],       %[ftmp1]                \n\t"
+        "psubh      %[ftmp5],   %[ftmp5],       %[ftmp1]                \n\t"
+        "psubh      %[ftmp3],   %[ftmp11],      %[ftmp1]                \n\t"
+        "psrah      %[ftmp1],   %[ftmp1],       %[ftmp8]                \n\t"
+        "paddh      %[ftmp5],   %[ftmp5],       %[ftmp15]               \n\t"
+        "psubh      %[ftmp3],   %[ftmp3],       %[ftmp15]               \n\t"
+        "psrah      %[ftmp2],   %[ftmp15],      %[ftmp8]                \n\t"
+        "psubh      %[ftmp5],   %[ftmp5],       %[ftmp1]                \n\t"
+        "psubh      %[ftmp3],   %[ftmp3],       %[ftmp2]                \n\t"
+        "mov.d      %[ftmp2],   %[ftmp4]                                \n\t"
+        "psrah      %[ftmp4],   %[ftmp4],       %[ftmp9]                \n\t"
+        "psrah      %[ftmp1],   %[ftmp7],       %[ftmp9]                \n\t"
+        "paddh      %[ftmp4],   %[ftmp4],       %[ftmp3]                \n\t"
+        "paddh      %[ftmp1],   %[ftmp1],       %[ftmp5]                \n\t"
+        "psrah      %[ftmp5],   %[ftmp5],       %[ftmp9]                \n\t"
+        "psrah      %[ftmp3],   %[ftmp3],       %[ftmp9]                \n\t"
+        "psubh      %[ftmp5],   %[ftmp5],       %[ftmp7]                \n\t"
+        "psubh      %[ftmp2],   %[ftmp2],       %[ftmp3]                \n\t"
+        "mov.d      %[ftmp3],   %[ftmp13]                               \n\t"
+        "psrah      %[ftmp0],   %[ftmp13],      %[ftmp8]                \n\t"
+        "psrah      %[ftmp7],   %[ftmp6],       %[ftmp8]                \n\t"
+        "paddh      %[ftmp0],   %[ftmp0],       %[ftmp6]                \n\t"
+        "psubh      %[ftmp7],   %[ftmp7],       %[ftmp3]                \n\t"
+        MMI_LDC1(%[ftmp6], $29, 0x08)
+        "dmtc1      %[tmp6],    %[ftmp3]                                \n\t"
+        "paddh      %[ftmp3],   %[ftmp3],       %[ftmp6]                \n\t"
+        "paddh      %[ftmp6],   %[ftmp6],       %[ftmp6]                \n\t"
+        "paddh      %[ftmp0],   %[ftmp0],       %[ftmp3]                \n\t"
+        "psubh      %[ftmp6],   %[ftmp6],       %[ftmp3]                \n\t"
+        "paddh      %[ftmp3],   %[ftmp3],       %[ftmp3]                \n\t"
+        "paddh      %[ftmp7],   %[ftmp7],       %[ftmp6]                \n\t"
+        "psubh      %[ftmp3],   %[ftmp3],       %[ftmp0]                \n\t"
+        "paddh      %[ftmp6],   %[ftmp6],       %[ftmp6]                \n\t"
+        "paddh      %[ftmp2],   %[ftmp2],       %[ftmp0]                \n\t"
+        "psubh      %[ftmp6],   %[ftmp6],       %[ftmp7]                \n\t"
+        "paddh      %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
+        "paddh      %[ftmp5],   %[ftmp5],       %[ftmp7]                \n\t"
+        "psubh      %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
+        "paddh      %[ftmp7],   %[ftmp7],       %[ftmp7]                \n\t"
+        "paddh      %[ftmp1],   %[ftmp1],       %[ftmp6]                \n\t"
+        "psubh      %[ftmp7],   %[ftmp7],       %[ftmp5]                \n\t"
+        "paddh      %[ftmp6],   %[ftmp6],       %[ftmp6]                \n\t"
+        "paddh      %[ftmp4],   %[ftmp4],       %[ftmp3]                \n\t"
+        "psubh      %[ftmp6],   %[ftmp6],       %[ftmp1]                \n\t"
+        "paddh      %[ftmp3],   %[ftmp3],       %[ftmp3]                \n\t"
+        MMI_SDC1(%[ftmp6], $29, 0x08)
+        "psubh      %[ftmp3],   %[ftmp3],       %[ftmp4]                \n\t"
+        MMI_SDC1(%[ftmp7], $29, 0x18)
+        "dmfc1      %[tmp2],    %[ftmp0]                                \n\t"
+        "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
+        MMI_ULWC1(%[ftmp6], %[addr0], 0x00)
+        MMI_LWXC1(%[ftmp7], %[addr0], %[stride], 0x00)
+        "psrah      %[ftmp2],   %[ftmp2],       %[ftmp10]               \n\t"
+        "psrah      %[ftmp5],   %[ftmp5],       %[ftmp10]               \n\t"
+        "punpcklbh  %[ftmp6],   %[ftmp6],       %[ftmp0]                \n\t"
+        "punpcklbh  %[ftmp7],   %[ftmp7],       %[ftmp0]                \n\t"
+        "paddh      %[ftmp6],   %[ftmp6],       %[ftmp2]                \n\t"
+        "paddh      %[ftmp7],   %[ftmp7],       %[ftmp5]                \n\t"
+        "packushb   %[ftmp6],   %[ftmp6],       %[ftmp0]                \n\t"
+        "packushb   %[ftmp7],   %[ftmp7],       %[ftmp0]                \n\t"
+        MMI_SWC1(%[ftmp6], %[addr0], 0x00)
+        MMI_SWXC1(%[ftmp7], %[addr0], %[stride], 0x00)
+        PTR_ADDU   "%[addr0],   %[addr0],       %[stride]               \n\t"
+        PTR_ADDU   "%[addr0],   %[addr0],       %[stride]               \n\t"
+        MMI_ULWC1(%[ftmp6], %[addr0], 0x00)
+        MMI_LWXC1(%[ftmp7], %[addr0], %[stride], 0x00)
+        "psrah      %[ftmp1],   %[ftmp1],       %[ftmp10]               \n\t"
+        "psrah      %[ftmp4],   %[ftmp4],       %[ftmp10]               \n\t"
+        "punpcklbh  %[ftmp6],   %[ftmp6],       %[ftmp0]                \n\t"
+        "punpcklbh  %[ftmp7],   %[ftmp7],       %[ftmp0]                \n\t"
+        "paddh      %[ftmp6],   %[ftmp6],       %[ftmp1]                \n\t"
+        "paddh      %[ftmp7],   %[ftmp7],       %[ftmp4]                \n\t"
+        "packushb   %[ftmp6],   %[ftmp6],       %[ftmp0]                \n\t"
+        "packushb   %[ftmp7],   %[ftmp7],       %[ftmp0]                \n\t"
+        MMI_SWC1(%[ftmp6], %[addr0], 0x00)
+        MMI_SWXC1(%[ftmp7], %[addr0], %[stride], 0x00)
+        MMI_LDC1(%[ftmp2], $29, 0x08)
+        MMI_LDC1(%[ftmp5], $29, 0x18)
+        PTR_ADDU   "%[addr0],   %[addr0],       %[stride]               \n\t"
+        "dmtc1      %[tmp2],    %[ftmp1]                                \n\t"
+        PTR_ADDU   "%[addr0],   %[addr0],       %[stride]               \n\t"
+        MMI_ULWC1(%[ftmp6], %[addr0], 0x00)
+        MMI_LWXC1(%[ftmp7], %[addr0], %[stride], 0x00)
+        "psrah      %[ftmp3],   %[ftmp3],       %[ftmp10]               \n\t"
+        "psrah      %[ftmp2],   %[ftmp2],       %[ftmp10]               \n\t"
+        "punpcklbh  %[ftmp6],   %[ftmp6],       %[ftmp0]                \n\t"
+        "punpcklbh  %[ftmp7],   %[ftmp7],       %[ftmp0]                \n\t"
+        "paddh      %[ftmp6],   %[ftmp6],       %[ftmp3]                \n\t"
+        "paddh      %[ftmp7],   %[ftmp7],       %[ftmp2]                \n\t"
+        "packushb   %[ftmp6],   %[ftmp6],       %[ftmp0]                \n\t"
+        "packushb   %[ftmp7],   %[ftmp7],       %[ftmp0]                \n\t"
+        MMI_SWC1(%[ftmp6], %[addr0], 0x00)
+        MMI_SWXC1(%[ftmp7], %[addr0], %[stride], 0x00)
+        PTR_ADDU   "%[addr0],   %[addr0],       %[stride]               \n\t"
+        PTR_ADDU   "%[addr0],   %[addr0],       %[stride]               \n\t"
+        MMI_ULWC1(%[ftmp6], %[addr0], 0x00)
+        MMI_LWXC1(%[ftmp7], %[addr0], %[stride], 0x00)
+        "psrah      %[ftmp5],   %[ftmp5],       %[ftmp10]               \n\t"
+        "psrah      %[ftmp1],   %[ftmp1],       %[ftmp10]               \n\t"
+        "punpcklbh  %[ftmp6],   %[ftmp6],       %[ftmp0]                \n\t"
+        "punpcklbh  %[ftmp7],   %[ftmp7],       %[ftmp0]                \n\t"
+        "paddh      %[ftmp6],   %[ftmp6],       %[ftmp5]                \n\t"
+        "paddh      %[ftmp7],   %[ftmp7],       %[ftmp1]                \n\t"
+        "packushb   %[ftmp6],   %[ftmp6],       %[ftmp0]                \n\t"
+        "packushb   %[ftmp7],   %[ftmp7],       %[ftmp0]                \n\t"
+        MMI_SWC1(%[ftmp6], %[addr0], 0x00)
+        MMI_SWXC1(%[ftmp7], %[addr0], %[stride], 0x00)
+        PTR_ADDIU  "$29,        $29,            0x20                    \n\t"
         : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
           [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
           [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
@@ -648,9 +628,11 @@ void ff_h264_idct8_add_8_mmi(uint8_t *dst, int16_t *block, int stride)
           [tmp0]"=&r"(tmp[0]),              [tmp1]"=&r"(tmp[1]),
           [tmp2]"=&r"(tmp[2]),              [tmp3]"=&r"(tmp[3]),
           [tmp4]"=&r"(tmp[4]),              [tmp5]"=&r"(tmp[5]),
-          [tmp6]"=&r"(tmp[6]),              [tmp7]"=&r"(tmp[7]),
-          [addr0]"=&r"(addr[0]),
-          [low32]"=&r"(low32)
+          [tmp6]"=&r"(tmp[6]),
+          RESTRICT_ASM_LOW32
+          RESTRICT_ASM_ALL64
+          RESTRICT_ASM_ADDRT
+          [addr0]"=&r"(addr[0])
         : [dst]"r"(dst),                    [block]"r"(block),
           [stride]"r"((mips_reg)stride)
         : "$29","memory"
@@ -663,7 +645,7 @@ void ff_h264_idct_dc_add_8_mmi(uint8_t *dst, int16_t *block, int stride)
 {
     int dc = (block[0] + 32) >> 6;
     double ftmp[6];
-    uint64_t low32;
+    DECLARE_VAR_LOW32;
 
     block[0] = 0;
 
@@ -671,14 +653,10 @@ void ff_h264_idct_dc_add_8_mmi(uint8_t *dst, int16_t *block, int stride)
         "mtc1       %[dc],      %[ftmp5]                                \n\t"
         "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
         "pshufh     %[ftmp5],   %[ftmp5],       %[ftmp0]                \n\t"
-        "uld        %[low32],   0x00(%[dst0])                           \n\t"
-        "mtc1       %[low32],   %[ftmp1]                                \n\t"
-        "uld        %[low32],   0x00(%[dst1])                           \n\t"
-        "mtc1       %[low32],   %[ftmp2]                                \n\t"
-        "uld        %[low32],   0x00(%[dst2])                           \n\t"
-        "mtc1       %[low32],   %[ftmp3]                                \n\t"
-        "uld        %[low32],   0x00(%[dst3])                           \n\t"
-        "mtc1       %[low32],   %[ftmp4]                                \n\t"
+        MMI_ULWC1(%[ftmp1], %[dst0], 0x00)
+        MMI_ULWC1(%[ftmp2], %[dst1], 0x00)
+        MMI_ULWC1(%[ftmp3], %[dst2], 0x00)
+        MMI_ULWC1(%[ftmp4], %[dst3], 0x00)
         "punpcklbh  %[ftmp1],   %[ftmp1],       %[ftmp0]                \n\t"
         "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp0]                \n\t"
         "punpcklbh  %[ftmp3],   %[ftmp3],       %[ftmp0]                \n\t"
@@ -691,18 +669,15 @@ void ff_h264_idct_dc_add_8_mmi(uint8_t *dst, int16_t *block, int stride)
         "packushb   %[ftmp2],   %[ftmp2],       %[ftmp0]                \n\t"
         "packushb   %[ftmp3],   %[ftmp3],       %[ftmp0]                \n\t"
         "packushb   %[ftmp4],   %[ftmp4],       %[ftmp0]                \n\t"
-        "gsswlc1    %[ftmp1],   0x03(%[dst0])                           \n\t"
-        "gsswrc1    %[ftmp1],   0x00(%[dst0])                           \n\t"
-        "gsswlc1    %[ftmp2],   0x03(%[dst1])                           \n\t"
-        "gsswrc1    %[ftmp2],   0x00(%[dst1])                           \n\t"
-        "gsswlc1    %[ftmp3],   0x03(%[dst2])                           \n\t"
-        "gsswrc1    %[ftmp3],   0x00(%[dst2])                           \n\t"
-        "gsswlc1    %[ftmp4],   0x03(%[dst3])                           \n\t"
-        "gsswrc1    %[ftmp4],   0x00(%[dst3])                           \n\t"
+        MMI_SWC1(%[ftmp1], %[dst0], 0x00)
+        MMI_SWC1(%[ftmp2], %[dst1], 0x00)
+        MMI_SWC1(%[ftmp3], %[dst2], 0x00)
+        MMI_SWC1(%[ftmp4], %[dst3], 0x00)
         : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
           [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
-          [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
-          [low32]"=&r"(low32)
+          [ftmp4]"=&f"(ftmp[4]),
+          RESTRICT_ASM_LOW32
+          [ftmp5]"=&f"(ftmp[5])
         : [dst0]"r"(dst),                   [dst1]"r"(dst+stride),
           [dst2]"r"(dst+2*stride),          [dst3]"r"(dst+3*stride),
           [dc]"r"(dc)
@@ -714,6 +689,7 @@ void ff_h264_idct8_dc_add_8_mmi(uint8_t *dst, int16_t *block, int stride)
 {
     int dc = (block[0] + 32) >> 6;
     double ftmp[10];
+    DECLARE_VAR_ALL64;
 
     block[0] = 0;
 
@@ -721,10 +697,10 @@ void ff_h264_idct8_dc_add_8_mmi(uint8_t *dst, int16_t *block, int stride)
         "mtc1       %[dc],      %[ftmp5]                                \n\t"
         "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
         "pshufh     %[ftmp5],   %[ftmp5],       %[ftmp0]                \n\t"
-        "ldc1       %[ftmp1],   0x00(%[dst0])                           \n\t"
-        "ldc1       %[ftmp2],   0x00(%[dst1])                           \n\t"
-        "ldc1       %[ftmp3],   0x00(%[dst2])                           \n\t"
-        "ldc1       %[ftmp4],   0x00(%[dst3])                           \n\t"
+        MMI_LDC1(%[ftmp1], %[dst0], 0x00)
+        MMI_LDC1(%[ftmp2], %[dst1], 0x00)
+        MMI_LDC1(%[ftmp3], %[dst2], 0x00)
+        MMI_LDC1(%[ftmp4], %[dst3], 0x00)
         "punpckhbh  %[ftmp6],   %[ftmp1],       %[ftmp0]                \n\t"
         "punpcklbh  %[ftmp1],   %[ftmp1],       %[ftmp0]                \n\t"
         "punpckhbh  %[ftmp7],   %[ftmp2],       %[ftmp0]                \n\t"
@@ -745,15 +721,15 @@ void ff_h264_idct8_dc_add_8_mmi(uint8_t *dst, int16_t *block, int stride)
         "packushb   %[ftmp2],   %[ftmp2],       %[ftmp7]                \n\t"
         "packushb   %[ftmp3],   %[ftmp3],       %[ftmp8]                \n\t"
         "packushb   %[ftmp4],   %[ftmp4],       %[ftmp9]                \n\t"
-        "sdc1       %[ftmp1],   0x00(%[dst0])                           \n\t"
-        "sdc1       %[ftmp2],   0x00(%[dst1])                           \n\t"
-        "sdc1       %[ftmp3],   0x00(%[dst2])                           \n\t"
-        "sdc1       %[ftmp4],   0x00(%[dst3])                           \n\t"
-
-        "ldc1       %[ftmp1],   0x00(%[dst4])                           \n\t"
-        "ldc1       %[ftmp2],   0x00(%[dst5])                           \n\t"
-        "ldc1       %[ftmp3],   0x00(%[dst6])                           \n\t"
-        "ldc1       %[ftmp4],   0x00(%[dst7])                           \n\t"
+        MMI_SDC1(%[ftmp1], %[dst0], 0x00)
+        MMI_SDC1(%[ftmp2], %[dst1], 0x00)
+        MMI_SDC1(%[ftmp3], %[dst2], 0x00)
+        MMI_SDC1(%[ftmp4], %[dst3], 0x00)
+
+        MMI_LDC1(%[ftmp1], %[dst4], 0x00)
+        MMI_LDC1(%[ftmp2], %[dst5], 0x00)
+        MMI_LDC1(%[ftmp3], %[dst6], 0x00)
+        MMI_LDC1(%[ftmp4], %[dst7], 0x00)
         "punpckhbh  %[ftmp6],   %[ftmp1],       %[ftmp0]                \n\t"
         "punpcklbh  %[ftmp1],   %[ftmp1],       %[ftmp0]                \n\t"
         "punpckhbh  %[ftmp7],   %[ftmp2],       %[ftmp0]                \n\t"
@@ -774,15 +750,17 @@ void ff_h264_idct8_dc_add_8_mmi(uint8_t *dst, int16_t *block, int stride)
         "packushb   %[ftmp2],   %[ftmp2],       %[ftmp7]                \n\t"
         "packushb   %[ftmp3],   %[ftmp3],       %[ftmp8]                \n\t"
         "packushb   %[ftmp4],   %[ftmp4],       %[ftmp9]                \n\t"
-        "sdc1       %[ftmp1],   0x00(%[dst4])                           \n\t"
-        "sdc1       %[ftmp2],   0x00(%[dst5])                           \n\t"
-        "sdc1       %[ftmp3],   0x00(%[dst6])                           \n\t"
-        "sdc1       %[ftmp4],   0x00(%[dst7])                           \n\t"
+        MMI_SDC1(%[ftmp1], %[dst4], 0x00)
+        MMI_SDC1(%[ftmp2], %[dst5], 0x00)
+        MMI_SDC1(%[ftmp3], %[dst6], 0x00)
+        MMI_SDC1(%[ftmp4], %[dst7], 0x00)
         : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
           [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
           [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
           [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
-          [ftmp8]"=&f"(ftmp[8]),            [ftmp9]"=&f"(ftmp[9])
+          [ftmp8]"=&f"(ftmp[8]),
+          RESTRICT_ASM_ALL64
+          [ftmp9]"=&f"(ftmp[9])
         : [dst0]"r"(dst),                   [dst1]"r"(dst+stride),
           [dst2]"r"(dst+2*stride),          [dst3]"r"(dst+3*stride),
           [dst4]"r"(dst+4*stride),          [dst5]"r"(dst+5*stride),
@@ -888,17 +866,18 @@ void ff_h264_luma_dc_dequant_idct_8_mmi(int16_t *output, int16_t *input,
 {
     double ftmp[10];
     uint64_t tmp[2];
+    DECLARE_VAR_ALL64;
 
     __asm__ volatile (
         ".set       noreorder                                           \n\t"
         "dli        %[tmp0],    0x08                                    \n\t"
-        "ldc1       %[ftmp3],   0x18(%[input])                          \n\t"
+        MMI_LDC1(%[ftmp3], %[input], 0x18)
         "mtc1       %[tmp0],    %[ftmp8]                                \n\t"
-        "ldc1       %[ftmp2],   0x10(%[input])                          \n\t"
+        MMI_LDC1(%[ftmp2], %[input], 0x10)
         "dli        %[tmp0],    0x20                                    \n\t"
-        "ldc1       %[ftmp1],   0x08(%[input])                          \n\t"
+        MMI_LDC1(%[ftmp1], %[input], 0x08)
         "mtc1       %[tmp0],    %[ftmp9]                                \n\t"
-        "ldc1       %[ftmp0],   0x00(%[input])                          \n\t"
+        MMI_LDC1(%[ftmp0], %[input], 0x00)
         "mov.d      %[ftmp4],   %[ftmp3]                                \n\t"
         "paddh      %[ftmp3],   %[ftmp3],       %[ftmp2]                \n\t"
         "psubh      %[ftmp2],   %[ftmp2],       %[ftmp4]                \n\t"
@@ -1009,7 +988,10 @@ void ff_h264_luma_dc_dequant_idct_8_mmi(int16_t *output, int16_t *input,
         "sh         %[input],   0x1e0(%[output])                        \n\t"
         "1:                                                             \n\t"
         "ori        %[tmp0],    $0,             0x1f                    \n\t"
+#if HAVE_LOONGSON3
         "clz        %[tmp1],    %[qmul]                                 \n\t"
+#elif HAVE_LOONGSON2
+#endif
         "ori        %[input],   $0,             0x07                    \n\t"
         "dsubu      %[tmp1],    %[tmp0],        %[tmp1]                 \n\t"
         "ori        %[tmp0],    $0,             0x80                    \n\t"
@@ -1098,6 +1080,7 @@ void ff_h264_luma_dc_dequant_idct_8_mmi(int16_t *output, int16_t *input,
           [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
           [ftmp8]"=&f"(ftmp[8]),            [ftmp9]"=&f"(ftmp[9]),
           [tmp0]"=&r"(tmp[0]),              [tmp1]"=&r"(tmp[1]),
+          RESTRICT_ASM_ALL64
           [output]"+&r"(output),            [input]"+&r"(input),
           [qmul]"+&r"(qmul)
         : [ff_pw_1]"f"(ff_pw_1)
@@ -1157,6 +1140,7 @@ void ff_h264_weight_pixels16_8_mmi(uint8_t *block, ptrdiff_t stride, int height,
 {
     int y;
     double ftmp[8];
+    DECLARE_VAR_ALL64;
 
     offset <<= log2_denom;
 
@@ -1166,8 +1150,8 @@ void ff_h264_weight_pixels16_8_mmi(uint8_t *block, ptrdiff_t stride, int height,
     for (y=0; y<height; y++, block+=stride) {
         __asm__ volatile (
             "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
-            "ldc1       %[ftmp1],   0x00(%[block0])                     \n\t"
-            "ldc1       %[ftmp2],   0x00(%[block1])                     \n\t"
+            MMI_LDC1(%[ftmp1], %[block0], 0x00)
+            MMI_LDC1(%[ftmp2], %[block1], 0x00)
             "mtc1       %[weight],  %[ftmp3]                            \n\t"
             "mtc1       %[offset],  %[ftmp4]                            \n\t"
             "mtc1       %[log2_denom],              %[ftmp5]            \n\t"
@@ -1191,12 +1175,14 @@ void ff_h264_weight_pixels16_8_mmi(uint8_t *block, ptrdiff_t stride, int height,
             "psrah      %[ftmp2],   %[ftmp2],       %[ftmp5]            \n\t"
             "packushb   %[ftmp1],   %[ftmp1],       %[ftmp6]            \n\t"
             "packushb   %[ftmp2],   %[ftmp2],       %[ftmp7]            \n\t"
-            "sdc1       %[ftmp1],   0x00(%[block0])                     \n\t"
-            "sdc1       %[ftmp2],   0x00(%[block1])                     \n\t"
+            MMI_SDC1(%[ftmp1], %[block0], 0x00)
+            MMI_SDC1(%[ftmp2], %[block1], 0x00)
             : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
               [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
               [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
-              [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7])
+              [ftmp6]"=&f"(ftmp[6]),
+              RESTRICT_ASM_ALL64
+              [ftmp7]"=&f"(ftmp[7])
             : [block0]"r"(block),           [block1]"r"(block+8),
               [weight]"r"(weight),          [offset]"r"(offset),
               [log2_denom]"r"(log2_denom)
@@ -1205,19 +1191,21 @@ void ff_h264_weight_pixels16_8_mmi(uint8_t *block, ptrdiff_t stride, int height,
     }
 }
 
-void ff_h264_biweight_pixels16_8_mmi(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
-        int height, int log2_denom, int weightd, int weights, int offset)
+void ff_h264_biweight_pixels16_8_mmi(uint8_t *dst, uint8_t *src,
+        ptrdiff_t stride, int height, int log2_denom, int weightd, int weights,
+        int offset)
 {
     int y;
     double ftmp[9];
+    DECLARE_VAR_ALL64;
 
     offset = ((offset + 1) | 1) << log2_denom;
 
     for (y=0; y<height; y++, dst+=stride, src+=stride) {
         __asm__ volatile (
             "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
-            "ldc1       %[ftmp1],   0x00(%[src0])                       \n\t"
-            "ldc1       %[ftmp2],   0x00(%[dst0])                       \n\t"
+            MMI_LDC1(%[ftmp1], %[src0], 0x00)
+            MMI_LDC1(%[ftmp2], %[dst0], 0x00)
             "mtc1       %[weights], %[ftmp3]                            \n\t"
             "mtc1       %[weightd], %[ftmp4]                            \n\t"
             "mtc1       %[offset],  %[ftmp5]                            \n\t"
@@ -1240,9 +1228,9 @@ void ff_h264_biweight_pixels16_8_mmi(uint8_t *dst, uint8_t *src, ptrdiff_t strid
             "psrah      %[ftmp7],   %[ftmp7],       %[ftmp6]            \n\t"
             "psrah      %[ftmp1],   %[ftmp1],       %[ftmp6]            \n\t"
             "packushb   %[ftmp1],   %[ftmp1],       %[ftmp7]            \n\t"
-            "sdc1       %[ftmp1],   0x00(%[dst0])                       \n\t"
-            "ldc1       %[ftmp1],   0x00(%[src1])                       \n\t"
-            "ldc1       %[ftmp2],   0x00(%[dst1])                       \n\t"
+            MMI_SDC1(%[ftmp1], %[dst0], 0x00)
+            MMI_LDC1(%[ftmp1], %[src1], 0x00)
+            MMI_LDC1(%[ftmp2], %[dst1], 0x00)
             "punpckhbh  %[ftmp7],   %[ftmp1],       %[ftmp0]            \n\t"
             "punpckhbh  %[ftmp8],   %[ftmp2],       %[ftmp0]            \n\t"
             "punpcklbh  %[ftmp1],   %[ftmp1],       %[ftmp0]            \n\t"
@@ -1258,11 +1246,12 @@ void ff_h264_biweight_pixels16_8_mmi(uint8_t *dst, uint8_t *src, ptrdiff_t strid
             "psrah      %[ftmp7],   %[ftmp7],       %[ftmp6]            \n\t"
             "psrah      %[ftmp1],   %[ftmp1],       %[ftmp6]            \n\t"
             "packushb   %[ftmp1],   %[ftmp1],       %[ftmp7]            \n\t"
-            "sdc1       %[ftmp1],   0x00(%[dst1])                       \n\t"
+            MMI_SDC1(%[ftmp1], %[dst1], 0x00)
             : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
               [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
               [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
               [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
+              RESTRICT_ASM_ALL64
               [ftmp8]"=&f"(ftmp[8])
             : [dst0]"r"(dst),               [dst1]"r"(dst+8),
               [src0]"r"(src),               [src1]"r"(src+8),
@@ -1278,6 +1267,7 @@ void ff_h264_weight_pixels8_8_mmi(uint8_t *block, ptrdiff_t stride, int height,
 {
     int y;
     double ftmp[6];
+    DECLARE_VAR_ALL64;
 
     offset <<= log2_denom;
 
@@ -1287,7 +1277,7 @@ void ff_h264_weight_pixels8_8_mmi(uint8_t *block, ptrdiff_t stride, int height,
     for (y=0; y<height; y++, block+=stride) {
         __asm__ volatile (
             "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
-            "ldc1       %[ftmp1],   0x00(%[block])                      \n\t"
+            MMI_LDC1(%[ftmp1], %[block], 0x00)
             "mtc1       %[weight],  %[ftmp2]                            \n\t"
             "mtc1       %[offset],  %[ftmp3]                            \n\t"
             "mtc1       %[log2_denom],              %[ftmp5]            \n\t"
@@ -1302,10 +1292,12 @@ void ff_h264_weight_pixels8_8_mmi(uint8_t *block, ptrdiff_t stride, int height,
             "psrah      %[ftmp4],   %[ftmp4],       %[ftmp5]            \n\t"
             "psrah      %[ftmp1],   %[ftmp1],       %[ftmp5]            \n\t"
             "packushb   %[ftmp1],   %[ftmp1],       %[ftmp4]            \n\t"
-            "sdc1       %[ftmp1],   0x00(%[block])                      \n\t"
+            MMI_SDC1(%[ftmp1], %[block], 0x00)
             : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
               [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
-              [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5])
+              [ftmp4]"=&f"(ftmp[4]),
+              RESTRICT_ASM_ALL64
+              [ftmp5]"=&f"(ftmp[5])
             : [block]"r"(block),            [weight]"r"(weight),
               [offset]"r"(offset),          [log2_denom]"r"(log2_denom)
             : "memory"
@@ -1313,19 +1305,21 @@ void ff_h264_weight_pixels8_8_mmi(uint8_t *block, ptrdiff_t stride, int height,
     }
 }
 
-void ff_h264_biweight_pixels8_8_mmi(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
-        int height, int log2_denom, int weightd, int weights, int offset)
+void ff_h264_biweight_pixels8_8_mmi(uint8_t *dst, uint8_t *src,
+        ptrdiff_t stride, int height, int log2_denom, int weightd, int weights,
+        int offset)
 {
     int y;
     double ftmp[9];
+    DECLARE_VAR_ALL64;
 
     offset = ((offset + 1) | 1) << log2_denom;
 
     for (y=0; y<height; y++, dst+=stride, src+=stride) {
         __asm__ volatile (
             "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
-            "ldc1       %[ftmp1],   0x00(%[src])                        \n\t"
-            "ldc1       %[ftmp2],   0x00(%[dst])                        \n\t"
+            MMI_LDC1(%[ftmp1], %[src], 0x00)
+            MMI_LDC1(%[ftmp2], %[dst], 0x00)
             "mtc1       %[weights], %[ftmp3]                            \n\t"
             "mtc1       %[weightd], %[ftmp4]                            \n\t"
             "mtc1       %[offset],  %[ftmp5]                            \n\t"
@@ -1348,11 +1342,12 @@ void ff_h264_biweight_pixels8_8_mmi(uint8_t *dst, uint8_t *src, ptrdiff_t stride
             "psrah      %[ftmp7],   %[ftmp7],       %[ftmp6]            \n\t"
             "psrah      %[ftmp1],   %[ftmp1],       %[ftmp6]            \n\t"
             "packushb   %[ftmp1],   %[ftmp1],       %[ftmp7]            \n\t"
-            "sdc1       %[ftmp1],   0x00(%[dst])                        \n\t"
+            MMI_SDC1(%[ftmp1], %[dst], 0x00)
             : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
               [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
               [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
               [ftmp6]"=&f"(ftmp[6]),        [ftmp7]"=&f"(ftmp[7]),
+              RESTRICT_ASM_ALL64
               [ftmp8]"=&f"(ftmp[8])
             : [dst]"r"(dst),                [src]"r"(src),
               [weights]"r"(weights),        [weightd]"r"(weightd),
@@ -1367,7 +1362,7 @@ void ff_h264_weight_pixels4_8_mmi(uint8_t *block, ptrdiff_t stride, int height,
 {
     int y;
     double ftmp[5];
-    uint64_t low32;
+    DECLARE_VAR_LOW32;
 
     offset <<= log2_denom;
 
@@ -1377,8 +1372,7 @@ void ff_h264_weight_pixels4_8_mmi(uint8_t *block, ptrdiff_t stride, int height,
     for (y=0; y<height; y++, block+=stride) {
         __asm__ volatile (
             "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
-            "uld        %[low32],   0x00(%[block])                      \n\t"
-            "mtc1       %[low32],   %[ftmp1]                            \n\t"
+            MMI_ULWC1(%[ftmp1], %[block], 0x00)
             "mtc1       %[weight],  %[ftmp2]                            \n\t"
             "mtc1       %[offset],  %[ftmp3]                            \n\t"
             "mtc1       %[log2_denom],              %[ftmp4]            \n\t"
@@ -1389,12 +1383,11 @@ void ff_h264_weight_pixels4_8_mmi(uint8_t *block, ptrdiff_t stride, int height,
             "paddsh     %[ftmp1],   %[ftmp1],       %[ftmp3]            \n\t"
             "psrah      %[ftmp1],   %[ftmp1],       %[ftmp4]            \n\t"
             "packushb   %[ftmp1],   %[ftmp1],       %[ftmp0]            \n\t"
-            "gsswlc1    %[ftmp1],   0x03(%[block])                      \n\t"
-            "gsswrc1    %[ftmp1],   0x00(%[block])                      \n\t"
+            MMI_SWC1(%[ftmp1], %[block], 0x00)
             : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
               [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
-              [ftmp4]"=&f"(ftmp[4]),
-              [low32]"=&r"(low32)
+              RESTRICT_ASM_LOW32
+              [ftmp4]"=&f"(ftmp[4])
             : [block]"r"(block),            [weight]"r"(weight),
               [offset]"r"(offset),          [log2_denom]"r"(log2_denom)
             : "memory"
@@ -1402,22 +1395,21 @@ void ff_h264_weight_pixels4_8_mmi(uint8_t *block, ptrdiff_t stride, int height,
     }
 }
 
-void ff_h264_biweight_pixels4_8_mmi(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
-        int height, int log2_denom, int weightd, int weights, int offset)
+void ff_h264_biweight_pixels4_8_mmi(uint8_t *dst, uint8_t *src,
+        ptrdiff_t stride, int height, int log2_denom, int weightd, int weights,
+        int offset)
 {
     int y;
     double ftmp[7];
-    uint64_t low32;
+    DECLARE_VAR_LOW32;
 
     offset = ((offset + 1) | 1) << log2_denom;
 
     for (y=0; y<height; y++, dst+=stride, src+=stride) {
         __asm__ volatile (
             "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
-            "uld        %[low32],   0x00(%[src])                        \n\t"
-            "mtc1       %[low32],   %[ftmp1]                            \n\t"
-            "uld        %[low32],   0x00(%[dst])                        \n\t"
-            "mtc1       %[low32],   %[ftmp2]                            \n\t"
+            MMI_ULWC1(%[ftmp1], %[src], 0x00)
+            MMI_ULWC1(%[ftmp2], %[dst], 0x00)
             "mtc1       %[weight],  %[ftmp3]                            \n\t"
             "mtc1       %[weightd], %[ftmp4]                            \n\t"
             "mtc1       %[offset],  %[ftmp5]                            \n\t"
@@ -1433,13 +1425,12 @@ void ff_h264_biweight_pixels4_8_mmi(uint8_t *dst, uint8_t *src, ptrdiff_t stride
             "paddsh     %[ftmp1],   %[ftmp1],       %[ftmp2]            \n\t"
             "psrah      %[ftmp1],   %[ftmp1],       %[ftmp6]            \n\t"
             "packushb   %[ftmp1],   %[ftmp1],       %[ftmp0]            \n\t"
-            "gsswlc1    %[ftmp1],   0x03(%[dst])                        \n\t"
-            "gsswrc1    %[ftmp1],   0x00(%[dst])                        \n\t"
+            MMI_SWC1(%[ftmp1], %[dst], 0x00)
             : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
               [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
               [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
-              [ftmp6]"=&f"(ftmp[6]),
-              [low32]"=&r"(low32)
+              RESTRICT_ASM_LOW32
+              [ftmp6]"=&f"(ftmp[6])
             : [dst]"r"(dst),                [src]"r"(src),
               [weight]"r"(weights),         [weightd]"r"(weightd),
               [offset]"r"(offset),          [log2_denom]"r"(log2_denom+1)
@@ -1453,7 +1444,9 @@ void ff_deblock_v8_luma_8_mmi(uint8_t *pix, int stride, int alpha, int beta,
 {
     double ftmp[12];
     mips_reg addr[2];
-    uint64_t low32;
+    DECLARE_VAR_LOW32;
+    DECLARE_VAR_ALL64;
+    DECLARE_VAR_ADDRT;
 
     __asm__ volatile (
         PTR_ADDU   "%[addr0],   %[stride],      %[stride]               \n\t"
@@ -1463,10 +1456,10 @@ void ff_deblock_v8_luma_8_mmi(uint8_t *pix, int stride, int alpha, int beta,
         PTR_SUBU   "%[addr1],   $0,             %[addr1]                \n\t"
         "addi       %[beta],    %[beta],        -0x01                   \n\t"
         PTR_ADDU   "%[addr1],   %[addr1],       %[pix]                  \n\t"
-        "ldc1       %[ftmp3],   0x00(%[pix])                            \n\t"
-        "gsldxc1    %[ftmp1],   0x00(%[addr1],  %[stride])              \n\t"
-        "gsldxc1    %[ftmp2],   0x00(%[addr1],  %[addr0])               \n\t"
-        "gsldxc1    %[ftmp4],   0x00(%[pix],    %[stride])              \n\t"
+        MMI_LDC1(%[ftmp3], %[pix], 0x00)
+        MMI_LDXC1(%[ftmp1], %[addr1], %[stride], 0x00)
+        MMI_LDXC1(%[ftmp2], %[addr1], %[addr0], 0x00)
+        MMI_LDXC1(%[ftmp4], %[pix], %[stride], 0x00)
         "mtc1       %[alpha],   %[ftmp5]                                \n\t"
         "mtc1       %[beta],    %[ftmp6]                                \n\t"
         "pshufh     %[ftmp5],   %[ftmp5],       %[ftmp0]                \n\t"
@@ -1489,12 +1482,11 @@ void ff_deblock_v8_luma_8_mmi(uint8_t *pix, int stride, int alpha, int beta,
         "or         %[ftmp8],   %[ftmp8],       %[ftmp5]                \n\t"
         "pcmpeqb    %[ftmp8],   %[ftmp8],       %[ftmp0]                \n\t"
         "pcmpeqb    %[ftmp4],   %[ftmp4],       %[ftmp4]                \n\t"
-        "uld        %[low32],   0x00(%[tc0])                            \n\t"
-        "mtc1       %[low32],   %[ftmp5]                                \n\t"
+        MMI_ULWC1(%[ftmp5], %[tc0], 0x00)
         "punpcklbh  %[ftmp5],   %[ftmp5],       %[ftmp5]                \n\t"
         "punpcklbh  %[ftmp9],   %[ftmp5],       %[ftmp5]                \n\t"
         "pcmpgtb    %[ftmp5],   %[ftmp9],       %[ftmp4]                \n\t"
-        "ldc1       %[ftmp4],   0x00(%[addr1])                          \n\t"
+        MMI_LDC1(%[ftmp4], %[addr1], 0x00)
         "and        %[ftmp10],  %[ftmp5],       %[ftmp8]                \n\t"
         "psubusb    %[ftmp8],   %[ftmp4],       %[ftmp2]                \n\t"
         "psubusb    %[ftmp7],   %[ftmp2],       %[ftmp4]                \n\t"
@@ -1506,7 +1498,7 @@ void ff_deblock_v8_luma_8_mmi(uint8_t *pix, int stride, int alpha, int beta,
         "psubb      %[ftmp8],   %[ftmp5],       %[ftmp7]                \n\t"
         "and        %[ftmp7],   %[ftmp7],       %[ftmp5]                \n\t"
         "pavgb      %[ftmp5],   %[ftmp2],       %[ftmp3]                \n\t"
-        "ldc1       %[ftmp11],  0x00(%[addr1])                          \n\t"
+        MMI_LDC1(%[ftmp11], %[addr1], 0x00)
         "pavgb      %[ftmp4],   %[ftmp4],       %[ftmp5]                \n\t"
         "xor        %[ftmp5],   %[ftmp5],       %[ftmp11]               \n\t"
         "and        %[ftmp5],   %[ftmp5],       %[ff_pb_1]              \n\t"
@@ -1515,8 +1507,8 @@ void ff_deblock_v8_luma_8_mmi(uint8_t *pix, int stride, int alpha, int beta,
         "paddusb    %[ftmp7],   %[ftmp7],       %[ftmp1]                \n\t"
         "pmaxub     %[ftmp4],   %[ftmp4],       %[ftmp5]                \n\t"
         "pminub     %[ftmp4],   %[ftmp4],       %[ftmp7]                \n\t"
-        "gssdxc1    %[ftmp4],   0x00(%[addr1],  %[stride])              \n\t"
-        "gsldxc1    %[ftmp5],   0x00(%[pix],    %[addr0])               \n\t"
+        MMI_SDXC1(%[ftmp4], %[addr1], %[stride], 0x00)
+        MMI_LDXC1(%[ftmp5], %[pix], %[addr0], 0x00)
         "psubusb    %[ftmp4],   %[ftmp5],       %[ftmp3]                \n\t"
         "psubusb    %[ftmp7],   %[ftmp3],       %[ftmp5]                \n\t"
         "psubusb    %[ftmp4],   %[ftmp4],       %[ftmp6]                \n\t"
@@ -1525,9 +1517,9 @@ void ff_deblock_v8_luma_8_mmi(uint8_t *pix, int stride, int alpha, int beta,
         "and        %[ftmp7],   %[ftmp7],       %[ftmp10]               \n\t"
         "psubb      %[ftmp8],   %[ftmp8],       %[ftmp7]                \n\t"
         "and        %[ftmp6],   %[ftmp9],       %[ftmp7]                \n\t"
-        "gsldxc1    %[ftmp4],   0x00(%[pix],    %[stride])              \n\t"
+        MMI_LDXC1(%[ftmp4], %[pix], %[stride], 0x00)
         "pavgb      %[ftmp7],   %[ftmp2],       %[ftmp3]                \n\t"
-        "gsldxc1    %[ftmp11],  0x00(%[pix],    %[addr0])               \n\t"
+        MMI_LDXC1(%[ftmp11], %[pix], %[addr0], 0x00)
         "pavgb      %[ftmp5],   %[ftmp5],       %[ftmp7]                \n\t"
         "xor        %[ftmp7],   %[ftmp7],       %[ftmp11]               \n\t"
         "and        %[ftmp7],   %[ftmp7],       %[ff_pb_1]              \n\t"
@@ -1536,7 +1528,7 @@ void ff_deblock_v8_luma_8_mmi(uint8_t *pix, int stride, int alpha, int beta,
         "paddusb    %[ftmp6],   %[ftmp6],       %[ftmp4]                \n\t"
         "pmaxub     %[ftmp5],   %[ftmp5],       %[ftmp7]                \n\t"
         "pminub     %[ftmp5],   %[ftmp5],       %[ftmp6]                \n\t"
-        "gssdxc1    %[ftmp5],   0x00(%[pix],    %[stride])              \n\t"
+        MMI_SDXC1(%[ftmp5], %[pix], %[stride], 0x00)
         "xor        %[ftmp6],   %[ftmp2],       %[ftmp3]                \n\t"
         "pcmpeqb    %[ftmp5],   %[ftmp5],       %[ftmp5]                \n\t"
         "and        %[ftmp6],   %[ftmp6],       %[ff_pb_1]              \n\t"
@@ -1555,16 +1547,18 @@ void ff_deblock_v8_luma_8_mmi(uint8_t *pix, int stride, int alpha, int beta,
         "psubusb    %[ftmp3],   %[ftmp3],       %[ftmp4]                \n\t"
         "paddusb    %[ftmp2],   %[ftmp2],       %[ftmp4]                \n\t"
         "paddusb    %[ftmp3],   %[ftmp3],       %[ftmp7]                \n\t"
-        "gssdxc1    %[ftmp2],   0x00(%[addr1],  %[addr0])               \n\t"
-        "sdc1       %[ftmp3],   0x00(%[pix])                            \n\t"
+        MMI_SDXC1(%[ftmp2], %[addr1], %[addr0], 0x00)
+        MMI_SDC1(%[ftmp3], %[pix], 0x00)
         : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
           [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
           [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
           [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
           [ftmp8]"=&f"(ftmp[8]),            [ftmp9]"=&f"(ftmp[9]),
           [ftmp10]"=&f"(ftmp[10]),          [ftmp11]"=&f"(ftmp[11]),
-          [addr0]"=&r"(addr[0]),            [addr1]"=&r"(addr[1]),
-          [low32]"=&r"(low32)
+          RESTRICT_ASM_LOW32
+          RESTRICT_ASM_ALL64
+          RESTRICT_ASM_ADDRT
+          [addr0]"=&r"(addr[0]),            [addr1]"=&r"(addr[1])
         : [pix]"r"(pix),                    [stride]"r"((mips_reg)stride),
           [alpha]"r"((mips_reg)alpha),      [beta]"r"((mips_reg)beta),
           [tc0]"r"(tc0),                    [ff_pb_1]"f"(ff_pb_1),
@@ -1580,203 +1574,205 @@ static void deblock_v8_luma_intra_8_mmi(uint8_t *pix, int stride, int alpha,
     double ftmp[16];
     uint64_t tmp[1];
     mips_reg addr[3];
+    DECLARE_VAR_ALL64;
+    DECLARE_VAR_ADDRT;
 
-__asm__ volatile (
-"ori        %[tmp0],    $0,             0x01                    \n\t"
-"xor        %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
-"mtc1       %[tmp0],    %[ftmp9]                                \n\t"
-PTR_SLL    "%[addr0],   %[stride],      0x02                    \n\t"
-PTR_ADDU   "%[addr2],   %[stride],      %[stride]               \n\t"
-PTR_ADDIU  "%[alpha],   %[alpha],       -0x01                   \n\t"
-PTR_SLL    "%[ftmp11],  %[ftmp9],       %[ftmp9]                \n\t"
-"bltz       %[alpha],   1f                                      \n\t"
-PTR_ADDU   "%[addr1],   %[addr2],       %[stride]               \n\t"
-PTR_ADDIU  "%[beta],    %[beta],        -0x01                   \n\t"
-"bltz       %[beta],    1f                                      \n\t"
-PTR_SUBU   "%[addr0],   $0,             %[addr0]                \n\t"
-PTR_ADDU   "%[addr0],   %[addr0],       %[pix]                  \n\t"
-"ldc1       %[ftmp3],   0x00(%[pix])                            \n\t"
-"gsldxc1    %[ftmp1],   0x00(%[addr0],  %[addr2])               \n\t"
-"gsldxc1    %[ftmp2],   0x00(%[addr0],  %[addr1])               \n\t"
-"gsldxc1    %[ftmp4],   0x00(%[pix],    %[stride])              \n\t"
-"mtc1       %[alpha],   %[ftmp5]                                \n\t"
-"mtc1       %[beta],    %[ftmp6]                                \n\t"
-"pshufh     %[ftmp5],   %[ftmp5],       %[ftmp0]                \n\t"
-"pshufh     %[ftmp6],   %[ftmp6],       %[ftmp0]                \n\t"
-"packushb   %[ftmp5],   %[ftmp5],       %[ftmp5]                \n\t"
-"psubusb    %[ftmp7],   %[ftmp3],       %[ftmp2]                \n\t"
-"psubusb    %[ftmp8],   %[ftmp2],       %[ftmp3]                \n\t"
-"packushb   %[ftmp6],   %[ftmp6],       %[ftmp6]                \n\t"
-"or         %[ftmp8],   %[ftmp8],       %[ftmp7]                \n\t"
-"sdc1       %[ftmp5],   0x10+%[stack]                           \n\t"
-"psubusb    %[ftmp8],   %[ftmp8],       %[ftmp5]                \n\t"
-"psubusb    %[ftmp7],   %[ftmp2],       %[ftmp1]                \n\t"
-"psubusb    %[ftmp5],   %[ftmp1],       %[ftmp2]                \n\t"
-"or         %[ftmp5],   %[ftmp5],       %[ftmp7]                \n\t"
-"psubusb    %[ftmp5],   %[ftmp5],       %[ftmp6]                \n\t"
-"or         %[ftmp8],   %[ftmp8],       %[ftmp5]                \n\t"
-"psubusb    %[ftmp7],   %[ftmp3],       %[ftmp4]                \n\t"
-"psubusb    %[ftmp5],   %[ftmp4],       %[ftmp3]                \n\t"
-"or         %[ftmp5],   %[ftmp5],       %[ftmp7]                \n\t"
-"psubusb    %[ftmp5],   %[ftmp5],       %[ftmp6]                \n\t"
-"or         %[ftmp8],   %[ftmp8],       %[ftmp5]                \n\t"
-"xor        %[ftmp7],   %[ftmp7],       %[ftmp7]                \n\t"
-"ldc1       %[ftmp5],   0x10+%[stack]                           \n\t"
-"pcmpeqb    %[ftmp8],   %[ftmp8],       %[ftmp7]                \n\t"
-"ldc1       %[ftmp10],  %[ff_pb_1]                              \n\t"
-"sdc1       %[ftmp8],   0x20+%[stack]                           \n\t"
-"pavgb      %[ftmp5],   %[ftmp5],       %[ftmp0]                \n\t"
-"psubusb    %[ftmp8],   %[ftmp3],       %[ftmp2]                \n\t"
-"pavgb      %[ftmp5],   %[ftmp5],       %[ftmp10]               \n\t"
-"psubusb    %[ftmp7],   %[ftmp2],       %[ftmp3]                \n\t"
-"psubusb    %[ftmp8],   %[ftmp8],       %[ftmp5]                \n\t"
-"psubusb    %[ftmp7],   %[ftmp7],       %[ftmp5]                \n\t"
-"ldc1       %[ftmp15],  0x20+%[stack]                           \n\t"
-"pcmpeqb    %[ftmp7],   %[ftmp7],       %[ftmp8]                \n\t"
-"and        %[ftmp7],   %[ftmp7],       %[ftmp15]               \n\t"
-"gsldxc1    %[ftmp15],  0x00(%[addr0],  %[stride])              \n\t"
-"psubusb    %[ftmp8],   %[ftmp15],      %[ftmp2]                \n\t"
-"psubusb    %[ftmp5],   %[ftmp2],       %[ftmp15]               \n\t"
-"psubusb    %[ftmp8],   %[ftmp8],       %[ftmp6]                \n\t"
-"psubusb    %[ftmp5],   %[ftmp5],       %[ftmp6]                \n\t"
-"pcmpeqb    %[ftmp5],   %[ftmp5],       %[ftmp8]                \n\t"
-"and        %[ftmp5],   %[ftmp5],       %[ftmp7]                \n\t"
-"gsldxc1    %[ftmp14],  0x00(%[pix],    %[addr2])               \n\t"
-"sdc1       %[ftmp5],   0x30+%[stack]                           \n\t"
-"psubusb    %[ftmp8],   %[ftmp14],      %[ftmp3]                \n\t"
-"psubusb    %[ftmp5],   %[ftmp3],       %[ftmp14]               \n\t"
-"psubusb    %[ftmp8],   %[ftmp8],       %[ftmp6]                \n\t"
-"psubusb    %[ftmp5],   %[ftmp5],       %[ftmp6]                \n\t"
-"pcmpeqb    %[ftmp5],   %[ftmp5],       %[ftmp8]                \n\t"
-"and        %[ftmp5],   %[ftmp5],       %[ftmp7]                \n\t"
-"sdc1       %[ftmp5],   0x40+%[stack]                           \n\t"
-"pavgb      %[ftmp5],   %[ftmp15],      %[ftmp1]                \n\t"
-"pavgb      %[ftmp6],   %[ftmp2],       %[ftmp3]                \n\t"
-"pavgb      %[ftmp5],   %[ftmp5],       %[ftmp6]                \n\t"
-"sdc1       %[ftmp6],   0x10+%[stack]                           \n\t"
-"paddb      %[ftmp7],   %[ftmp15],      %[ftmp1]                \n\t"
-"paddb      %[ftmp8],   %[ftmp2],       %[ftmp3]                \n\t"
-"paddb      %[ftmp7],   %[ftmp7],       %[ftmp8]                \n\t"
-"mov.d      %[ftmp8],   %[ftmp7]                                \n\t"
-"sdc1       %[ftmp7],   0x00+%[stack]                           \n\t"
-"psrlh      %[ftmp7],   %[ftmp7],       %[ftmp9]                \n\t"
-"pavgb      %[ftmp7],   %[ftmp7],       %[ftmp0]                \n\t"
-"xor        %[ftmp7],   %[ftmp7],       %[ftmp5]                \n\t"
-"and        %[ftmp7],   %[ftmp7],       %[ftmp10]               \n\t"
-"psubb      %[ftmp5],   %[ftmp5],       %[ftmp7]                \n\t"
-"pavgb      %[ftmp6],   %[ftmp15],      %[ftmp4]                \n\t"
-"psubb      %[ftmp7],   %[ftmp15],      %[ftmp4]                \n\t"
-"paddb      %[ftmp8],   %[ftmp8],       %[ftmp8]                \n\t"
-"psubb      %[ftmp8],   %[ftmp8],       %[ftmp7]                \n\t"
-"and        %[ftmp7],   %[ftmp7],       %[ftmp10]               \n\t"
-"psubb      %[ftmp6],   %[ftmp6],       %[ftmp7]                \n\t"
-"ldc1       %[ftmp13],  0x10+%[stack]                           \n\t"
-"pavgb      %[ftmp6],   %[ftmp6],       %[ftmp1]                \n\t"
-"psrlh      %[ftmp8],   %[ftmp8],       %[ftmp11]               \n\t"
-"pavgb      %[ftmp6],   %[ftmp6],       %[ftmp13]               \n\t"
-"pavgb      %[ftmp8],   %[ftmp8],       %[ftmp0]                \n\t"
-"xor        %[ftmp8],   %[ftmp8],       %[ftmp6]                \n\t"
-"and        %[ftmp8],   %[ftmp8],       %[ftmp10]               \n\t"
-"psubb      %[ftmp6],   %[ftmp6],       %[ftmp8]                \n\t"
-"xor        %[ftmp8],   %[ftmp2],       %[ftmp4]                \n\t"
-"pavgb      %[ftmp7],   %[ftmp2],       %[ftmp4]                \n\t"
-"and        %[ftmp8],   %[ftmp8],       %[ftmp10]               \n\t"
-"psubb      %[ftmp7],   %[ftmp7],       %[ftmp8]                \n\t"
-"ldc1       %[ftmp13],  0x30+%[stack]                           \n\t"
-"pavgb      %[ftmp7],   %[ftmp7],       %[ftmp1]                \n\t"
-"ldc1       %[ftmp12],  0x20+%[stack]                           \n\t"
-"xor        %[ftmp6],   %[ftmp6],       %[ftmp7]                \n\t"
-"xor        %[ftmp7],   %[ftmp7],       %[ftmp2]                \n\t"
-"and        %[ftmp6],   %[ftmp6],       %[ftmp13]               \n\t"
-"and        %[ftmp7],   %[ftmp7],       %[ftmp12]               \n\t"
-"xor        %[ftmp6],   %[ftmp6],       %[ftmp7]                \n\t"
-"xor        %[ftmp6],   %[ftmp6],       %[ftmp2]                \n\t"
-"gssdxc1    %[ftmp6],   0x00(%[addr0],  %[addr1])               \n\t"
-"ldc1       %[ftmp6],   0x00(%[addr0])                          \n\t"
-"paddb      %[ftmp7],   %[ftmp15],      %[ftmp6]                \n\t"
-"pavgb      %[ftmp6],   %[ftmp6],       %[ftmp15]               \n\t"
-"ldc1       %[ftmp12],  0x00+%[stack]                           \n\t"
-"pavgb      %[ftmp6],   %[ftmp6],       %[ftmp5]                \n\t"
-"paddb      %[ftmp7],   %[ftmp7],       %[ftmp7]                \n\t"
-"paddb      %[ftmp7],   %[ftmp7],       %[ftmp12]               \n\t"
-"psrlh      %[ftmp7],   %[ftmp7],       %[ftmp11]               \n\t"
-"pavgb      %[ftmp7],   %[ftmp7],       %[ftmp0]                \n\t"
-"xor        %[ftmp7],   %[ftmp7],       %[ftmp6]                \n\t"
-"and        %[ftmp7],   %[ftmp7],       %[ftmp10]               \n\t"
-"ldc1       %[ftmp12],  0x30+%[stack]                           \n\t"
-"psubb      %[ftmp6],   %[ftmp6],       %[ftmp7]                \n\t"
-"xor        %[ftmp5],   %[ftmp5],       %[ftmp1]                \n\t"
-"xor        %[ftmp6],   %[ftmp6],       %[ftmp15]               \n\t"
-"and        %[ftmp5],   %[ftmp5],       %[ftmp12]               \n\t"
-"and        %[ftmp6],   %[ftmp6],       %[ftmp12]               \n\t"
-"xor        %[ftmp5],   %[ftmp5],       %[ftmp1]                \n\t"
-"xor        %[ftmp6],   %[ftmp6],       %[ftmp15]               \n\t"
-"gssdxc1    %[ftmp5],   0x00(%[addr0],  %[addr2])               \n\t"
-"gssdxc1    %[ftmp6],   0x00(%[addr0],  %[stride])              \n\t"
-"pavgb      %[ftmp5],   %[ftmp14],      %[ftmp4]                \n\t"
-"pavgb      %[ftmp6],   %[ftmp3],       %[ftmp2]                \n\t"
-"pavgb      %[ftmp5],   %[ftmp5],       %[ftmp6]                \n\t"
-"sdc1       %[ftmp6],   0x10+%[stack]                           \n\t"
-"paddb      %[ftmp7],   %[ftmp14],      %[ftmp4]                \n\t"
-"paddb      %[ftmp8],   %[ftmp3],       %[ftmp2]                \n\t"
-"paddb      %[ftmp7],   %[ftmp7],       %[ftmp8]                \n\t"
-"mov.d      %[ftmp8],   %[ftmp7]                                \n\t"
-"sdc1       %[ftmp7],   0x00+%[stack]                           \n\t"
-"psrlh      %[ftmp7],   %[ftmp7],       %[ftmp9]                \n\t"
-"pavgb      %[ftmp7],   %[ftmp7],       %[ftmp0]                \n\t"
-"xor        %[ftmp7],   %[ftmp7],       %[ftmp5]                \n\t"
-"and        %[ftmp7],   %[ftmp7],       %[ftmp10]               \n\t"
-"psubb      %[ftmp5],   %[ftmp5],       %[ftmp7]                \n\t"
-"pavgb      %[ftmp6],   %[ftmp14],      %[ftmp1]                \n\t"
-"paddb      %[ftmp8],   %[ftmp8],       %[ftmp8]                \n\t"
-"psubb      %[ftmp7],   %[ftmp14],      %[ftmp1]                \n\t"
-"psubb      %[ftmp8],   %[ftmp8],       %[ftmp7]                \n\t"
-"and        %[ftmp7],   %[ftmp7],       %[ftmp10]               \n\t"
-"psubb      %[ftmp6],   %[ftmp6],       %[ftmp7]                \n\t"
-"ldc1       %[ftmp12],  0x10+%[stack]                           \n\t"
-"pavgb      %[ftmp6],   %[ftmp6],       %[ftmp4]                \n\t"
-"pavgb      %[ftmp6],   %[ftmp6],       %[ftmp12]               \n\t"
-"psrlh      %[ftmp8],   %[ftmp8],       %[ftmp11]               \n\t"
-"pavgb      %[ftmp8],   %[ftmp8],       %[ftmp0]                \n\t"
-"xor        %[ftmp8],   %[ftmp8],       %[ftmp6]                \n\t"
-"and        %[ftmp8],   %[ftmp8],       %[ftmp10]               \n\t"
-"psubb      %[ftmp6],   %[ftmp6],       %[ftmp8]                \n\t"
-"xor        %[ftmp8],   %[ftmp3],       %[ftmp1]                \n\t"
-"pavgb      %[ftmp7],   %[ftmp3],       %[ftmp1]                \n\t"
-"and        %[ftmp8],   %[ftmp8],       %[ftmp10]               \n\t"
-"ldc1       %[ftmp12],  0x40+%[stack]                           \n\t"
-"psubb      %[ftmp7],   %[ftmp7],       %[ftmp8]                \n\t"
-"ldc1       %[ftmp13],  0x20+%[stack]                           \n\t"
-"pavgb      %[ftmp7],   %[ftmp7],       %[ftmp4]                \n\t"
-"xor        %[ftmp6],   %[ftmp6],       %[ftmp7]                \n\t"
-"xor        %[ftmp7],   %[ftmp7],       %[ftmp3]                \n\t"
-"and        %[ftmp6],   %[ftmp6],       %[ftmp12]               \n\t"
-"and        %[ftmp7],   %[ftmp7],       %[ftmp13]               \n\t"
-"xor        %[ftmp6],   %[ftmp6],       %[ftmp7]                \n\t"
-"xor        %[ftmp6],   %[ftmp6],       %[ftmp3]                \n\t"
-"sdc1       %[ftmp6],   0x00(%[pix])                            \n\t"
-"gsldxc1    %[ftmp6],   0x00(%[pix],    %[addr1])               \n\t"
-"paddb      %[ftmp7],   %[ftmp14],      %[ftmp6]                \n\t"
-"pavgb      %[ftmp6],   %[ftmp6],       %[ftmp14]               \n\t"
-"ldc1       %[ftmp12],  0x00+%[stack]                           \n\t"
-"pavgb      %[ftmp6],   %[ftmp6],       %[ftmp5]                \n\t"
-"paddb      %[ftmp7],   %[ftmp7],       %[ftmp7]                \n\t"
-"paddb      %[ftmp7],   %[ftmp7],       %[ftmp12]               \n\t"
-"psrlh      %[ftmp7],   %[ftmp7],       %[ftmp11]               \n\t"
-"pavgb      %[ftmp7],   %[ftmp7],       %[ftmp0]                \n\t"
-"xor        %[ftmp7],   %[ftmp7],       %[ftmp6]                \n\t"
-"and        %[ftmp7],   %[ftmp7],       %[ftmp10]               \n\t"
-"ldc1       %[ftmp12],  0x40+%[stack]                           \n\t"
-"psubb      %[ftmp6],   %[ftmp6],       %[ftmp7]                \n\t"
-"xor        %[ftmp5],   %[ftmp5],       %[ftmp4]                \n\t"
-"xor        %[ftmp6],   %[ftmp6],       %[ftmp14]               \n\t"
-"and        %[ftmp5],   %[ftmp5],       %[ftmp12]               \n\t"
-"and        %[ftmp6],   %[ftmp6],       %[ftmp12]               \n\t"
-"xor        %[ftmp5],   %[ftmp5],       %[ftmp4]                \n\t"
-"xor        %[ftmp6],   %[ftmp6],       %[ftmp14]               \n\t"
-"gssdxc1    %[ftmp5],   0x00(%[pix],    %[stride])              \n\t"
-"gssdxc1    %[ftmp6],   0x00(%[pix],    %[addr2])               \n\t"
-"1:                                                             \n\t"
+    __asm__ volatile (
+        "ori        %[tmp0],    $0,             0x01                    \n\t"
+        "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
+        "mtc1       %[tmp0],    %[ftmp9]                                \n\t"
+        PTR_SLL    "%[addr0],   %[stride],      0x02                    \n\t"
+        PTR_ADDU   "%[addr2],   %[stride],      %[stride]               \n\t"
+        PTR_ADDIU  "%[alpha],   %[alpha],       -0x01                   \n\t"
+        PTR_SLL    "%[ftmp11],  %[ftmp9],       %[ftmp9]                \n\t"
+        "bltz       %[alpha],   1f                                      \n\t"
+        PTR_ADDU   "%[addr1],   %[addr2],       %[stride]               \n\t"
+        PTR_ADDIU  "%[beta],    %[beta],        -0x01                   \n\t"
+        "bltz       %[beta],    1f                                      \n\t"
+        PTR_SUBU   "%[addr0],   $0,             %[addr0]                \n\t"
+        PTR_ADDU   "%[addr0],   %[addr0],       %[pix]                  \n\t"
+        MMI_LDC1(%[ftmp3], %[pix], 0x00)
+        MMI_LDXC1(%[ftmp1], %[addr0], %[addr2], 0x00)
+        MMI_LDXC1(%[ftmp2], %[addr0], %[addr1], 0x00)
+        MMI_LDXC1(%[ftmp4], %[pix], %[stride], 0x00)
+        "mtc1       %[alpha],   %[ftmp5]                                \n\t"
+        "mtc1       %[beta],    %[ftmp6]                                \n\t"
+        "pshufh     %[ftmp5],   %[ftmp5],       %[ftmp0]                \n\t"
+        "pshufh     %[ftmp6],   %[ftmp6],       %[ftmp0]                \n\t"
+        "packushb   %[ftmp5],   %[ftmp5],       %[ftmp5]                \n\t"
+        "psubusb    %[ftmp7],   %[ftmp3],       %[ftmp2]                \n\t"
+        "psubusb    %[ftmp8],   %[ftmp2],       %[ftmp3]                \n\t"
+        "packushb   %[ftmp6],   %[ftmp6],       %[ftmp6]                \n\t"
+        "or         %[ftmp8],   %[ftmp8],       %[ftmp7]                \n\t"
+        MMI_SDC1(%[ftmp5], %[stack], 0x10)
+        "psubusb    %[ftmp8],   %[ftmp8],       %[ftmp5]                \n\t"
+        "psubusb    %[ftmp7],   %[ftmp2],       %[ftmp1]                \n\t"
+        "psubusb    %[ftmp5],   %[ftmp1],       %[ftmp2]                \n\t"
+        "or         %[ftmp5],   %[ftmp5],       %[ftmp7]                \n\t"
+        "psubusb    %[ftmp5],   %[ftmp5],       %[ftmp6]                \n\t"
+        "or         %[ftmp8],   %[ftmp8],       %[ftmp5]                \n\t"
+        "psubusb    %[ftmp7],   %[ftmp3],       %[ftmp4]                \n\t"
+        "psubusb    %[ftmp5],   %[ftmp4],       %[ftmp3]                \n\t"
+        "or         %[ftmp5],   %[ftmp5],       %[ftmp7]                \n\t"
+        "psubusb    %[ftmp5],   %[ftmp5],       %[ftmp6]                \n\t"
+        "or         %[ftmp8],   %[ftmp8],       %[ftmp5]                \n\t"
+        "xor        %[ftmp7],   %[ftmp7],       %[ftmp7]                \n\t"
+        MMI_LDC1(%[ftmp5], %[stack], 0x10)
+        "pcmpeqb    %[ftmp8],   %[ftmp8],       %[ftmp7]                \n\t"
+        "ldc1       %[ftmp10],  %[ff_pb_1]                              \n\t"
+        MMI_SDC1(%[ftmp8], %[stack], 0x20)
+        "pavgb      %[ftmp5],   %[ftmp5],       %[ftmp0]                \n\t"
+        "psubusb    %[ftmp8],   %[ftmp3],       %[ftmp2]                \n\t"
+        "pavgb      %[ftmp5],   %[ftmp5],       %[ftmp10]               \n\t"
+        "psubusb    %[ftmp7],   %[ftmp2],       %[ftmp3]                \n\t"
+        "psubusb    %[ftmp8],   %[ftmp8],       %[ftmp5]                \n\t"
+        "psubusb    %[ftmp7],   %[ftmp7],       %[ftmp5]                \n\t"
+        MMI_LDC1(%[ftmp15], %[stack], 0x20)
+        "pcmpeqb    %[ftmp7],   %[ftmp7],       %[ftmp8]                \n\t"
+        "and        %[ftmp7],   %[ftmp7],       %[ftmp15]               \n\t"
+        MMI_LDXC1(%[ftmp15], %[addr0], %[stride], 0x00)
+        "psubusb    %[ftmp8],   %[ftmp15],      %[ftmp2]                \n\t"
+        "psubusb    %[ftmp5],   %[ftmp2],       %[ftmp15]               \n\t"
+        "psubusb    %[ftmp8],   %[ftmp8],       %[ftmp6]                \n\t"
+        "psubusb    %[ftmp5],   %[ftmp5],       %[ftmp6]                \n\t"
+        "pcmpeqb    %[ftmp5],   %[ftmp5],       %[ftmp8]                \n\t"
+        "and        %[ftmp5],   %[ftmp5],       %[ftmp7]                \n\t"
+        MMI_LDXC1(%[ftmp14], %[pix], %[addr2], 0x00)
+        MMI_SDC1(%[ftmp5], %[stack], 0x30)
+        "psubusb    %[ftmp8],   %[ftmp14],      %[ftmp3]                \n\t"
+        "psubusb    %[ftmp5],   %[ftmp3],       %[ftmp14]               \n\t"
+        "psubusb    %[ftmp8],   %[ftmp8],       %[ftmp6]                \n\t"
+        "psubusb    %[ftmp5],   %[ftmp5],       %[ftmp6]                \n\t"
+        "pcmpeqb    %[ftmp5],   %[ftmp5],       %[ftmp8]                \n\t"
+        "and        %[ftmp5],   %[ftmp5],       %[ftmp7]                \n\t"
+        MMI_SDC1(%[ftmp5], %[stack], 0x40)
+        "pavgb      %[ftmp5],   %[ftmp15],      %[ftmp1]                \n\t"
+        "pavgb      %[ftmp6],   %[ftmp2],       %[ftmp3]                \n\t"
+        "pavgb      %[ftmp5],   %[ftmp5],       %[ftmp6]                \n\t"
+        MMI_SDC1(%[ftmp6], %[stack], 0x10)
+        "paddb      %[ftmp7],   %[ftmp15],      %[ftmp1]                \n\t"
+        "paddb      %[ftmp8],   %[ftmp2],       %[ftmp3]                \n\t"
+        "paddb      %[ftmp7],   %[ftmp7],       %[ftmp8]                \n\t"
+        "mov.d      %[ftmp8],   %[ftmp7]                                \n\t"
+        MMI_SDC1(%[ftmp7], %[stack], 0x00)
+        "psrlh      %[ftmp7],   %[ftmp7],       %[ftmp9]                \n\t"
+        "pavgb      %[ftmp7],   %[ftmp7],       %[ftmp0]                \n\t"
+        "xor        %[ftmp7],   %[ftmp7],       %[ftmp5]                \n\t"
+        "and        %[ftmp7],   %[ftmp7],       %[ftmp10]               \n\t"
+        "psubb      %[ftmp5],   %[ftmp5],       %[ftmp7]                \n\t"
+        "pavgb      %[ftmp6],   %[ftmp15],      %[ftmp4]                \n\t"
+        "psubb      %[ftmp7],   %[ftmp15],      %[ftmp4]                \n\t"
+        "paddb      %[ftmp8],   %[ftmp8],       %[ftmp8]                \n\t"
+        "psubb      %[ftmp8],   %[ftmp8],       %[ftmp7]                \n\t"
+        "and        %[ftmp7],   %[ftmp7],       %[ftmp10]               \n\t"
+        "psubb      %[ftmp6],   %[ftmp6],       %[ftmp7]                \n\t"
+        MMI_LDC1(%[ftmp13], %[stack], 0x10)
+        "pavgb      %[ftmp6],   %[ftmp6],       %[ftmp1]                \n\t"
+        "psrlh      %[ftmp8],   %[ftmp8],       %[ftmp11]               \n\t"
+        "pavgb      %[ftmp6],   %[ftmp6],       %[ftmp13]               \n\t"
+        "pavgb      %[ftmp8],   %[ftmp8],       %[ftmp0]                \n\t"
+        "xor        %[ftmp8],   %[ftmp8],       %[ftmp6]                \n\t"
+        "and        %[ftmp8],   %[ftmp8],       %[ftmp10]               \n\t"
+        "psubb      %[ftmp6],   %[ftmp6],       %[ftmp8]                \n\t"
+        "xor        %[ftmp8],   %[ftmp2],       %[ftmp4]                \n\t"
+        "pavgb      %[ftmp7],   %[ftmp2],       %[ftmp4]                \n\t"
+        "and        %[ftmp8],   %[ftmp8],       %[ftmp10]               \n\t"
+        "psubb      %[ftmp7],   %[ftmp7],       %[ftmp8]                \n\t"
+        MMI_LDC1(%[ftmp13], %[stack], 0x30)
+        "pavgb      %[ftmp7],   %[ftmp7],       %[ftmp1]                \n\t"
+        MMI_LDC1(%[ftmp12], %[stack], 0x20)
+        "xor        %[ftmp6],   %[ftmp6],       %[ftmp7]                \n\t"
+        "xor        %[ftmp7],   %[ftmp7],       %[ftmp2]                \n\t"
+        "and        %[ftmp6],   %[ftmp6],       %[ftmp13]               \n\t"
+        "and        %[ftmp7],   %[ftmp7],       %[ftmp12]               \n\t"
+        "xor        %[ftmp6],   %[ftmp6],       %[ftmp7]                \n\t"
+        "xor        %[ftmp6],   %[ftmp6],       %[ftmp2]                \n\t"
+        MMI_SDXC1(%[ftmp6], %[addr0], %[addr1], 0x00)
+        MMI_LDC1(%[ftmp6], %[addr0], 0x00)
+        "paddb      %[ftmp7],   %[ftmp15],      %[ftmp6]                \n\t"
+        "pavgb      %[ftmp6],   %[ftmp6],       %[ftmp15]               \n\t"
+        MMI_LDC1(%[ftmp12], %[stack], 0x00)
+        "pavgb      %[ftmp6],   %[ftmp6],       %[ftmp5]                \n\t"
+        "paddb      %[ftmp7],   %[ftmp7],       %[ftmp7]                \n\t"
+        "paddb      %[ftmp7],   %[ftmp7],       %[ftmp12]               \n\t"
+        "psrlh      %[ftmp7],   %[ftmp7],       %[ftmp11]               \n\t"
+        "pavgb      %[ftmp7],   %[ftmp7],       %[ftmp0]                \n\t"
+        "xor        %[ftmp7],   %[ftmp7],       %[ftmp6]                \n\t"
+        "and        %[ftmp7],   %[ftmp7],       %[ftmp10]               \n\t"
+        MMI_LDC1(%[ftmp12], %[stack], 0x30)
+        "psubb      %[ftmp6],   %[ftmp6],       %[ftmp7]                \n\t"
+        "xor        %[ftmp5],   %[ftmp5],       %[ftmp1]                \n\t"
+        "xor        %[ftmp6],   %[ftmp6],       %[ftmp15]               \n\t"
+        "and        %[ftmp5],   %[ftmp5],       %[ftmp12]               \n\t"
+        "and        %[ftmp6],   %[ftmp6],       %[ftmp12]               \n\t"
+        "xor        %[ftmp5],   %[ftmp5],       %[ftmp1]                \n\t"
+        "xor        %[ftmp6],   %[ftmp6],       %[ftmp15]               \n\t"
+        MMI_SDXC1(%[ftmp5], %[addr0], %[addr2], 0x00)
+        MMI_SDXC1(%[ftmp6], %[addr0], %[stride], 0x00)
+        "pavgb      %[ftmp5],   %[ftmp14],      %[ftmp4]                \n\t"
+        "pavgb      %[ftmp6],   %[ftmp3],       %[ftmp2]                \n\t"
+        "pavgb      %[ftmp5],   %[ftmp5],       %[ftmp6]                \n\t"
+        MMI_SDC1(%[ftmp6], %[stack], 0x10)
+        "paddb      %[ftmp7],   %[ftmp14],      %[ftmp4]                \n\t"
+        "paddb      %[ftmp8],   %[ftmp3],       %[ftmp2]                \n\t"
+        "paddb      %[ftmp7],   %[ftmp7],       %[ftmp8]                \n\t"
+        "mov.d      %[ftmp8],   %[ftmp7]                                \n\t"
+        MMI_SDC1(%[ftmp7], %[stack], 0x00)
+        "psrlh      %[ftmp7],   %[ftmp7],       %[ftmp9]                \n\t"
+        "pavgb      %[ftmp7],   %[ftmp7],       %[ftmp0]                \n\t"
+        "xor        %[ftmp7],   %[ftmp7],       %[ftmp5]                \n\t"
+        "and        %[ftmp7],   %[ftmp7],       %[ftmp10]               \n\t"
+        "psubb      %[ftmp5],   %[ftmp5],       %[ftmp7]                \n\t"
+        "pavgb      %[ftmp6],   %[ftmp14],      %[ftmp1]                \n\t"
+        "paddb      %[ftmp8],   %[ftmp8],       %[ftmp8]                \n\t"
+        "psubb      %[ftmp7],   %[ftmp14],      %[ftmp1]                \n\t"
+        "psubb      %[ftmp8],   %[ftmp8],       %[ftmp7]                \n\t"
+        "and        %[ftmp7],   %[ftmp7],       %[ftmp10]               \n\t"
+        "psubb      %[ftmp6],   %[ftmp6],       %[ftmp7]                \n\t"
+        MMI_LDC1(%[ftmp12], %[stack], 0x10)
+        "pavgb      %[ftmp6],   %[ftmp6],       %[ftmp4]                \n\t"
+        "pavgb      %[ftmp6],   %[ftmp6],       %[ftmp12]               \n\t"
+        "psrlh      %[ftmp8],   %[ftmp8],       %[ftmp11]               \n\t"
+        "pavgb      %[ftmp8],   %[ftmp8],       %[ftmp0]                \n\t"
+        "xor        %[ftmp8],   %[ftmp8],       %[ftmp6]                \n\t"
+        "and        %[ftmp8],   %[ftmp8],       %[ftmp10]               \n\t"
+        "psubb      %[ftmp6],   %[ftmp6],       %[ftmp8]                \n\t"
+        "xor        %[ftmp8],   %[ftmp3],       %[ftmp1]                \n\t"
+        "pavgb      %[ftmp7],   %[ftmp3],       %[ftmp1]                \n\t"
+        "and        %[ftmp8],   %[ftmp8],       %[ftmp10]               \n\t"
+        MMI_LDC1(%[ftmp12], %[stack], 0x40)
+        "psubb      %[ftmp7],   %[ftmp7],       %[ftmp8]                \n\t"
+        MMI_LDC1(%[ftmp13], %[stack], 0x20)
+        "pavgb      %[ftmp7],   %[ftmp7],       %[ftmp4]                \n\t"
+        "xor        %[ftmp6],   %[ftmp6],       %[ftmp7]                \n\t"
+        "xor        %[ftmp7],   %[ftmp7],       %[ftmp3]                \n\t"
+        "and        %[ftmp6],   %[ftmp6],       %[ftmp12]               \n\t"
+        "and        %[ftmp7],   %[ftmp7],       %[ftmp13]               \n\t"
+        "xor        %[ftmp6],   %[ftmp6],       %[ftmp7]                \n\t"
+        "xor        %[ftmp6],   %[ftmp6],       %[ftmp3]                \n\t"
+        MMI_SDC1(%[ftmp6], %[pix], 0x00)
+        MMI_LDXC1(%[ftmp6], %[pix], %[addr1], 0x00)
+        "paddb      %[ftmp7],   %[ftmp14],      %[ftmp6]                \n\t"
+        "pavgb      %[ftmp6],   %[ftmp6],       %[ftmp14]               \n\t"
+        MMI_LDC1(%[ftmp12], %[stack], 0x00)
+        "pavgb      %[ftmp6],   %[ftmp6],       %[ftmp5]                \n\t"
+        "paddb      %[ftmp7],   %[ftmp7],       %[ftmp7]                \n\t"
+        "paddb      %[ftmp7],   %[ftmp7],       %[ftmp12]               \n\t"
+        "psrlh      %[ftmp7],   %[ftmp7],       %[ftmp11]               \n\t"
+        "pavgb      %[ftmp7],   %[ftmp7],       %[ftmp0]                \n\t"
+        "xor        %[ftmp7],   %[ftmp7],       %[ftmp6]                \n\t"
+        "and        %[ftmp7],   %[ftmp7],       %[ftmp10]               \n\t"
+        MMI_LDC1(%[ftmp12], %[stack], 0x40)
+        "psubb      %[ftmp6],   %[ftmp6],       %[ftmp7]                \n\t"
+        "xor        %[ftmp5],   %[ftmp5],       %[ftmp4]                \n\t"
+        "xor        %[ftmp6],   %[ftmp6],       %[ftmp14]               \n\t"
+        "and        %[ftmp5],   %[ftmp5],       %[ftmp12]               \n\t"
+        "and        %[ftmp6],   %[ftmp6],       %[ftmp12]               \n\t"
+        "xor        %[ftmp5],   %[ftmp5],       %[ftmp4]                \n\t"
+        "xor        %[ftmp6],   %[ftmp6],       %[ftmp14]               \n\t"
+        MMI_SDXC1(%[ftmp5], %[pix], %[stride], 0x00)
+        MMI_SDXC1(%[ftmp6], %[pix], %[addr2], 0x00)
+        "1:                                                             \n\t"
         : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
           [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
           [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
@@ -1785,22 +1781,26 @@ PTR_ADDU   "%[addr0],   %[addr0],       %[pix]                  \n\t"
           [ftmp10]"=&f"(ftmp[10]),          [ftmp11]"=&f"(ftmp[11]),
           [ftmp12]"=&f"(ftmp[12]),          [ftmp13]"=&f"(ftmp[13]),
           [ftmp14]"=&f"(ftmp[14]),          [ftmp15]"=&f"(ftmp[15]),
-  [tmp0]"=&r"(tmp[0]),
-  [addr0]"=&r"(addr[0]),            [addr1]"=&r"(addr[1]),
-  [addr2]"=&r"(addr[2]),
-  [alpha]"+&r"(alpha),              [beta]"+&r"(beta)
+          [tmp0]"=&r"(tmp[0]),
+          RESTRICT_ASM_ALL64
+          RESTRICT_ASM_ADDRT
+          [addr0]"=&r"(addr[0]),            [addr1]"=&r"(addr[1]),
+          [addr2]"=&r"(addr[2]),
+          [alpha]"+&r"(alpha),              [beta]"+&r"(beta)
         : [pix]"r"(pix),                    [stride]"r"((mips_reg)stride),
-  [stack]"m"(stack[0]),             [ff_pb_1]"m"(ff_pb_1)
-: "memory"
-);
+          [stack]"r"(stack),                [ff_pb_1]"m"(ff_pb_1)
+        : "memory"
+    );
 }
 
-void ff_deblock_v_chroma_8_mmi(uint8_t *pix, int stride, int alpha, int beta,
-        int8_t *tc0)
+void ff_deblock_v_chroma_8_mmi(uint8_t *pix, ptrdiff_t stride, int alpha,
+        int beta, int8_t *tc0)
 {
     double ftmp[9];
     mips_reg addr[1];
-    uint64_t low32;
+    DECLARE_VAR_LOW32;
+    DECLARE_VAR_ALL64;
+    DECLARE_VAR_ADDRT;
 
     __asm__ volatile (
         "addi       %[alpha],   %[alpha],       -0x01                   \n\t"
@@ -1808,10 +1808,10 @@ void ff_deblock_v_chroma_8_mmi(uint8_t *pix, int stride, int alpha, int beta,
         "or         %[addr0],   $0,             %[pix]                  \n\t"
         PTR_SUBU   "%[addr0],   %[addr0],       %[stride]               \n\t"
         PTR_SUBU   "%[addr0],   %[addr0],       %[stride]               \n\t"
-        "ldc1       %[ftmp1],   0x00(%[addr0])                          \n\t"
-        "gsldxc1    %[ftmp2],   0x00(%[addr0],  %[stride])              \n\t"
-        "ldc1       %[ftmp3],   0x00(%[pix])                            \n\t"
-        "gsldxc1    %[ftmp4],   0x00(%[pix],    %[stride])              \n\t"
+        MMI_LDC1(%[ftmp1], %[addr0], 0x00)
+        MMI_LDXC1(%[ftmp2], %[addr0], %[stride], 0x00)
+        MMI_LDC1(%[ftmp3], %[pix], 0x00)
+        MMI_LDXC1(%[ftmp4], %[pix], %[stride], 0x00)
 
         "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
         "mtc1       %[alpha],   %[ftmp5]                                \n\t"
@@ -1836,8 +1836,7 @@ void ff_deblock_v_chroma_8_mmi(uint8_t *pix, int stride, int alpha, int beta,
         "or         %[ftmp8],   %[ftmp8],       %[ftmp5]                \n\t"
         "xor        %[ftmp7],   %[ftmp7],       %[ftmp7]                \n\t"
         "pcmpeqb    %[ftmp8],   %[ftmp8],       %[ftmp7]                \n\t"
-        "uld        %[low32],   0x00(%[tc0])                            \n\t"
-        "mtc1       %[low32],   %[ftmp7]                                \n\t"
+        MMI_ULWC1(%[ftmp7], %[tc0], 0x00)
         "punpcklbh  %[ftmp7],   %[ftmp7],       %[ftmp7]                \n\t"
         "and        %[ftmp8],   %[ftmp8],       %[ftmp7]                \n\t"
         "pcmpeqb    %[ftmp5],   %[ftmp5],       %[ftmp5]                \n\t"
@@ -1859,15 +1858,17 @@ void ff_deblock_v_chroma_8_mmi(uint8_t *pix, int stride, int alpha, int beta,
         "paddusb    %[ftmp2],   %[ftmp2],       %[ftmp4]                \n\t"
         "paddusb    %[ftmp3],   %[ftmp3],       %[ftmp7]                \n\t"
 
-        "gssdxc1    %[ftmp2],   0x00(%[addr0],  %[stride])              \n\t"
-        "sdc1       %[ftmp3],   0x00(%[pix])                            \n\t"
+        MMI_SDXC1(%[ftmp2], %[addr0], %[stride], 0x00)
+        MMI_SDC1(%[ftmp3], %[pix], 0x00)
         : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
           [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
           [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
           [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
           [ftmp8]"=&f"(ftmp[8]),
-          [addr0]"=&r"(addr[0]),
-          [low32]"=&r"(low32)
+          RESTRICT_ASM_LOW32
+          RESTRICT_ASM_ALL64
+          RESTRICT_ASM_ADDRT
+          [addr0]"=&r"(addr[0])
         : [pix]"r"(pix),                    [stride]"r"((mips_reg)stride),
           [alpha]"r"(alpha),                [beta]"r"(beta),
           [tc0]"r"(tc0),                    [ff_pb_1]"f"(ff_pb_1),
@@ -1881,6 +1882,8 @@ void ff_deblock_v_chroma_intra_8_mmi(uint8_t *pix, int stride, int alpha,
 {
     double ftmp[9];
     mips_reg addr[1];
+    DECLARE_VAR_ALL64;
+    DECLARE_VAR_ADDRT;
 
     __asm__ volatile (
         "addi       %[alpha],   %[alpha],       -0x01                   \n\t"
@@ -1888,10 +1891,10 @@ void ff_deblock_v_chroma_intra_8_mmi(uint8_t *pix, int stride, int alpha,
         "or         %[addr0],   $0,             %[pix]                  \n\t"
         PTR_SUBU   "%[addr0],   %[addr0],       %[stride]               \n\t"
         PTR_SUBU   "%[addr0],   %[addr0],       %[stride]               \n\t"
-        "ldc1       %[ftmp1],   0x00(%[addr0])                          \n\t"
-        "gsldxc1    %[ftmp2],   0x00(%[addr0],  %[stride])              \n\t"
-        "ldc1       %[ftmp3],   0x00(%[pix])                            \n\t"
-        "gsldxc1    %[ftmp4],   0x00(%[pix],    %[stride])              \n\t"
+        MMI_LDC1(%[ftmp1], %[addr0], 0x00)
+        MMI_LDXC1(%[ftmp2], %[addr0], %[stride], 0x00)
+        MMI_LDC1(%[ftmp3], %[pix], 0x00)
+        MMI_LDXC1(%[ftmp4], %[pix], %[stride], 0x00)
 
         "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
         "mtc1       %[alpha],   %[ftmp5]                                \n\t"
@@ -1935,13 +1938,15 @@ void ff_deblock_v_chroma_intra_8_mmi(uint8_t *pix, int stride, int alpha,
         "paddb      %[ftmp2],   %[ftmp2],       %[ftmp6]                \n\t"
         "paddb      %[ftmp3],   %[ftmp3],       %[ftmp7]                \n\t"
 
-        "gssdxc1    %[ftmp2],   0x00(%[addr0],  %[stride])              \n\t"
-        "sdc1       %[ftmp3],   0x00(%[pix])                            \n\t"
+        MMI_SDXC1(%[ftmp2], %[addr0], %[stride], 0x00)
+        MMI_SDC1(%[ftmp3], %[pix], 0x00)
         : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
           [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
           [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
           [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
           [ftmp8]"=&f"(ftmp[8]),
+          RESTRICT_ASM_ALL64
+          RESTRICT_ASM_ADDRT
           [addr0]"=&r"(addr[0])
         : [pix]"r"(pix),                    [stride]"r"((mips_reg)stride),
           [alpha]"r"(alpha),                [beta]"r"(beta),
@@ -1955,7 +1960,7 @@ void ff_deblock_h_chroma_8_mmi(uint8_t *pix, int stride, int alpha, int beta,
 {
     double ftmp[11];
     mips_reg addr[6];
-    uint64_t low32;
+    DECLARE_VAR_LOW32;
 
     __asm__ volatile (
         "addi       %[alpha],   %[alpha],       -0x01                   \n\t"
@@ -1966,32 +1971,24 @@ void ff_deblock_h_chroma_8_mmi(uint8_t *pix, int stride, int alpha, int beta,
         PTR_ADDU   "%[addr2],   %[addr0],       %[addr0]                \n\t"
         "or         %[addr5],   $0,             %[pix]                  \n\t"
         PTR_ADDU   "%[pix],     %[pix],         %[addr1]                \n\t"
-        "uld        %[low32],   0x00(%[addr5])                          \n\t"
-        "mtc1       %[low32],   %[ftmp0]                                \n\t"
+        MMI_ULWC1(%[ftmp0], %[addr5], 0x00)
         PTR_ADDU   "%[addr3],   %[addr5],       %[stride]               \n\t"
-        "uld        %[low32],   0x00(%[addr3])                          \n\t"
-        "mtc1       %[low32],   %[ftmp2]                                \n\t"
+        MMI_ULWC1(%[ftmp2], %[addr3], 0x00)
         PTR_ADDU   "%[addr4],   %[addr5],       %[addr0]                \n\t"
-        "uld        %[low32],   0x00(%[addr4])                          \n\t"
-        "mtc1       %[low32],   %[ftmp1]                                \n\t"
-        "uld        %[low32],   0x00(%[pix])                            \n\t"
-        "mtc1       %[low32],   %[ftmp3]                                \n\t"
+        MMI_ULWC1(%[ftmp1], %[addr4], 0x00)
+        MMI_ULWC1(%[ftmp3], %[pix], 0x00)
         "punpcklbh  %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
         "punpcklbh  %[ftmp1],   %[ftmp1],       %[ftmp3]                \n\t"
         PTR_ADDU   "%[addr3],   %[pix],         %[stride]               \n\t"
         "punpckhhw  %[ftmp2],   %[ftmp0],       %[ftmp1]                \n\t"
         "punpcklhw  %[ftmp0],   %[ftmp0],       %[ftmp1]                \n\t"
-        "uld        %[low32],   0x00(%[addr3])                          \n\t"
-        "mtc1       %[low32],   %[ftmp4]                                \n\t"
+        MMI_ULWC1(%[ftmp4], %[addr3], 0x00)
         PTR_ADDU   "%[addr4],   %[pix],         %[addr0]                \n\t"
-        "uld        %[low32],   0x00(%[addr4])                          \n\t"
-        "mtc1       %[low32],   %[ftmp6]                                \n\t"
+        MMI_ULWC1(%[ftmp6], %[addr4], 0x00)
         PTR_ADDU   "%[addr3],   %[pix],         %[addr1]                \n\t"
-        "uld        %[low32],   0x00(%[addr3])                          \n\t"
-        "mtc1       %[low32],   %[ftmp5]                                \n\t"
+        MMI_ULWC1(%[ftmp5], %[addr3], 0x00)
         PTR_ADDU   "%[addr4],   %[pix],         %[addr2]                \n\t"
-        "uld        %[low32],   0x00(%[addr4])                          \n\t"
-        "mtc1       %[low32],   %[ftmp7]                                \n\t"
+        MMI_ULWC1(%[ftmp7], %[addr4], 0x00)
         "punpcklbh  %[ftmp4],   %[ftmp4],       %[ftmp6]                \n\t"
         "punpcklbh  %[ftmp5],   %[ftmp5],       %[ftmp7]                \n\t"
         "mov.d      %[ftmp6],   %[ftmp4]                                \n\t"
@@ -2027,8 +2024,7 @@ void ff_deblock_h_chroma_8_mmi(uint8_t *pix, int stride, int alpha, int beta,
         "or         %[ftmp7],   %[ftmp7],       %[ftmp4]                \n\t"
         "xor        %[ftmp6],   %[ftmp6],       %[ftmp6]                \n\t"
         "pcmpeqb    %[ftmp7],   %[ftmp7],       %[ftmp6]                \n\t"
-        "uld        %[low32],   0x00(%[tc0])                            \n\t"
-        "mtc1       %[low32],   %[ftmp6]                                \n\t"
+        MMI_ULWC1(%[ftmp6], %[tc0], 0x00)
         "punpcklbh  %[ftmp6],   %[ftmp6],       %[ftmp6]                \n\t"
         "and        %[ftmp7],   %[ftmp7],       %[ftmp6]                \n\t"
         "pcmpeqb    %[ftmp4],   %[ftmp4],       %[ftmp4]                \n\t"
@@ -2057,48 +2053,40 @@ void ff_deblock_h_chroma_8_mmi(uint8_t *pix, int stride, int alpha, int beta,
         "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp10]               \n\t"
         "punpcklhw  %[ftmp1],   %[ftmp0],       %[ftmp2]                \n\t"
         "punpckhhw  %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
-        "gsswlc1    %[ftmp1],   0x03(%[addr5])                          \n\t"
-        "gsswrc1    %[ftmp1],   0x00(%[addr5])                          \n\t"
+        MMI_USWC1(%[ftmp1], %[addr5], 0x00)
         PTR_ADDU   "%[addr3],   %[addr5],       %[stride]               \n\t"
         "punpckhwd  %[ftmp1],   %[ftmp1],       %[ftmp1]                \n\t"
-        "gsswlc1    %[ftmp1],   0x03(%[addr3])                          \n\t"
+        MMI_USWC1(%[ftmp1], %[addr3], 0x00)
         PTR_ADDU   "%[addr4],   %[addr5],       %[addr0]                \n\t"
-        "gsswrc1    %[ftmp1],   0x00(%[addr3])                          \n\t"
-        "gsswlc1    %[ftmp0],   0x03(%[addr4])                          \n\t"
-        "gsswrc1    %[ftmp0],   0x00(%[addr4])                          \n\t"
+        MMI_USWC1(%[ftmp0], %[addr4], 0x00)
         "punpckhwd  %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
         "punpckhwd  %[ftmp3],   %[ftmp10],      %[ftmp10]               \n\t"
-        "gsswlc1    %[ftmp0],   0x03(%[pix])                            \n\t"
-        "gsswrc1    %[ftmp0],   0x00(%[pix])                            \n\t"
+        MMI_USWC1(%[ftmp0], %[pix], 0x00)
         "punpcklbh  %[ftmp4],   %[ftmp4],       %[ftmp5]                \n\t"
         "punpcklbh  %[ftmp6],   %[ftmp6],       %[ftmp3]                \n\t"
         PTR_ADDU   "%[addr3],   %[pix],         %[stride]               \n\t"
         "punpcklhw  %[ftmp5],   %[ftmp4],       %[ftmp6]                \n\t"
         "punpckhhw  %[ftmp4],   %[ftmp4],       %[ftmp6]                \n\t"
-        "gsswlc1    %[ftmp5],   0x03(%[addr3])                          \n\t"
-        "gsswrc1    %[ftmp5],   0x00(%[addr3])                          \n\t"
+        MMI_USWC1(%[ftmp5], %[addr3], 0x00)
         "punpckhwd  %[ftmp5],   %[ftmp5],       %[ftmp5]                \n\t"
         PTR_ADDU   "%[addr3],   %[pix],         %[addr0]                \n\t"
         PTR_ADDU   "%[addr4],   %[pix],         %[addr1]                \n\t"
-        "gsswlc1    %[ftmp5],   0x03(%[addr3])                          \n\t"
-        "gsswrc1    %[ftmp5],   0x00(%[addr3])                          \n\t"
-        "gsswlc1    %[ftmp4],   0x03(%[addr4])                          \n\t"
+        MMI_USWC1(%[ftmp5], %[addr3], 0x00)
+        MMI_USWC1(%[ftmp4], %[addr4], 0x00)
         PTR_ADDU   "%[addr3],   %[pix],         %[addr2]                \n\t"
         "punpckhwd  %[ftmp9],   %[ftmp4],       %[ftmp4]                \n\t"
-        "gsswrc1    %[ftmp4],   0x00(%[addr4])                          \n\t"
-        "gsswlc1    %[ftmp9],   0x03(%[addr3])                          \n\t"
-        "gsswrc1    %[ftmp9],   0x00(%[addr3])                          \n\t"
+        MMI_USWC1(%[ftmp9], %[addr3], 0x00)
         : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
           [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
           [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
           [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
           [ftmp8]"=&f"(ftmp[8]),            [ftmp9]"=&f"(ftmp[9]),
           [ftmp10]"=&f"(ftmp[10]),
+          RESTRICT_ASM_LOW32
           [addr0]"=&r"(addr[0]),            [addr1]"=&r"(addr[1]),
           [addr2]"=&r"(addr[2]),            [addr3]"=&r"(addr[3]),
           [addr4]"=&r"(addr[4]),            [addr5]"=&r"(addr[5]),
-          [pix]"+&r"(pix),
-          [low32]"=&r"(low32)
+          [pix]"+&r"(pix)
         : [alpha]"r"(alpha),                [beta]"r"(beta),
           [stride]"r"((mips_reg)stride),    [tc0]"r"(tc0),
           [ff_pb_1]"f"(ff_pb_1),            [ff_pb_3]"f"(ff_pb_3),
@@ -2112,7 +2100,7 @@ void ff_deblock_h_chroma_intra_8_mmi(uint8_t *pix, int stride, int alpha,
 {
     double ftmp[11];
     mips_reg addr[6];
-    uint64_t low32;
+    DECLARE_VAR_LOW32;
 
     __asm__ volatile (
         "addi       %[alpha],   %[alpha],       -0x01                   \n\t"
@@ -2123,32 +2111,24 @@ void ff_deblock_h_chroma_intra_8_mmi(uint8_t *pix, int stride, int alpha,
         PTR_ADDU   "%[addr2],   %[addr0],       %[addr0]                \n\t"
         "or         %[addr5],   $0,             %[pix]                  \n\t"
         PTR_ADDU   "%[pix],     %[pix],         %[addr1]                \n\t"
-        "uld        %[low32],   0x00(%[addr5])                          \n\t"
-        "mtc1       %[low32],   %[ftmp0]                                \n\t"
+        MMI_ULWC1(%[ftmp0], %[addr5], 0x00)
         PTR_ADDU   "%[addr3],   %[addr5],       %[stride]               \n\t"
-        "uld        %[low32],   0x00(%[addr3])                          \n\t"
-        "mtc1       %[low32],   %[ftmp2]                                \n\t"
+        MMI_ULWC1(%[ftmp2], %[addr3], 0x00)
         PTR_ADDU   "%[addr4],   %[addr5],       %[addr0]                \n\t"
-        "uld        %[low32],   0x00(%[addr4])                          \n\t"
-        "mtc1       %[low32],   %[ftmp1]                                \n\t"
-        "uld        %[low32],   0x00(%[pix])                            \n\t"
-        "mtc1       %[low32],   %[ftmp3]                                \n\t"
+        MMI_ULWC1(%[ftmp1], %[addr4], 0x00)
+        MMI_ULWC1(%[ftmp3], %[pix], 0x00)
         "punpcklbh  %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
         "punpcklbh  %[ftmp1],   %[ftmp1],       %[ftmp3]                \n\t"
         PTR_ADDU   "%[addr3],   %[pix],         %[stride]               \n\t"
         "punpckhhw  %[ftmp2],   %[ftmp0],       %[ftmp1]                \n\t"
         "punpcklhw  %[ftmp0],   %[ftmp0],       %[ftmp1]                \n\t"
-        "uld        %[low32],   0x00(%[addr3])                          \n\t"
-        "mtc1       %[low32],   %[ftmp4]                                \n\t"
+        MMI_ULWC1(%[ftmp4], %[addr3], 0x00)
         PTR_ADDU   "%[addr4],   %[pix],         %[addr0]                \n\t"
-        "uld        %[low32],   0x00(%[addr4])                          \n\t"
-        "mtc1       %[low32],   %[ftmp6]                                \n\t"
+        MMI_ULWC1(%[ftmp6], %[addr4], 0x00)
         PTR_ADDU   "%[addr3],   %[pix],         %[addr1]                \n\t"
-        "uld        %[low32],   0x00(%[addr3])                          \n\t"
-        "mtc1       %[low32],   %[ftmp5]                                \n\t"
+        MMI_ULWC1(%[ftmp5], %[addr3], 0x00)
         PTR_ADDU   "%[addr4],   %[pix],         %[addr2]                \n\t"
-        "uld        %[low32],   0x00(%[addr4])                          \n\t"
-        "mtc1       %[low32],   %[ftmp7]                                \n\t"
+        MMI_ULWC1(%[ftmp7], %[addr4], 0x00)
         "punpcklbh  %[ftmp4],   %[ftmp4],       %[ftmp6]                \n\t"
         "punpcklbh  %[ftmp5],   %[ftmp5],       %[ftmp7]                \n\t"
         "mov.d      %[ftmp6],   %[ftmp4]                                \n\t"
@@ -2208,48 +2188,40 @@ void ff_deblock_h_chroma_intra_8_mmi(uint8_t *pix, int stride, int alpha,
         "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp3]                \n\t"
         "punpcklhw  %[ftmp1],   %[ftmp0],       %[ftmp2]                \n\t"
         "punpckhhw  %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
-        "gsswlc1    %[ftmp1],   0x03(%[addr5])                          \n\t"
-        "gsswrc1    %[ftmp1],   0x00(%[addr5])                          \n\t"
+        MMI_USWC1(%[ftmp1], %[addr5], 0x00)
         PTR_ADDU   "%[addr3],   %[addr5],       %[stride]               \n\t"
         "punpckhwd  %[ftmp1],   %[ftmp1],       %[ftmp1]                \n\t"
-        "gsswlc1    %[ftmp1],   0x03(%[addr3])                          \n\t"
         PTR_ADDU   "%[addr4],   %[addr5],       %[addr0]                \n\t"
-        "gsswrc1    %[ftmp1],   0x00(%[addr3])                          \n\t"
-        "gsswlc1    %[ftmp0],   0x03(%[addr4])                          \n\t"
-        "gsswrc1    %[ftmp0],   0x00(%[addr4])                          \n\t"
+        MMI_USWC1(%[ftmp1], %[addr3], 0x00)
+        MMI_USWC1(%[ftmp0], %[addr4], 0x00)
         "punpckhwd  %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
         "punpckhwd  %[ftmp3],   %[ftmp3],       %[ftmp3]                \n\t"
-        "gsswlc1    %[ftmp0],   0x03(%[pix])                            \n\t"
-        "gsswrc1    %[ftmp0],   0x00(%[pix])                            \n\t"
+        MMI_USWC1(%[ftmp0], %[pix], 0x00)
         "punpcklbh  %[ftmp4],   %[ftmp4],       %[ftmp5]                \n\t"
         "punpcklbh  %[ftmp6],   %[ftmp6],       %[ftmp3]                \n\t"
         PTR_ADDU   "%[addr3],   %[pix],         %[stride]               \n\t"
         "punpcklhw  %[ftmp5],   %[ftmp4],       %[ftmp6]                \n\t"
         "punpckhhw  %[ftmp4],   %[ftmp4],       %[ftmp6]                \n\t"
-        "gsswlc1    %[ftmp5],   0x03(%[addr3])                          \n\t"
-        "gsswrc1    %[ftmp5],   0x00(%[addr3])                          \n\t"
+        MMI_USWC1(%[ftmp5], %[addr3], 0x00)
         "punpckhwd  %[ftmp5],   %[ftmp5],       %[ftmp5]                \n\t"
         PTR_ADDU   "%[addr3],   %[pix],         %[addr0]                \n\t"
         PTR_ADDU   "%[addr4],   %[pix],         %[addr1]                \n\t"
-        "gsswlc1    %[ftmp5],   0x03(%[addr3])                          \n\t"
-        "gsswrc1    %[ftmp5],   0x00(%[addr3])                          \n\t"
-        "gsswlc1    %[ftmp4],   0x03(%[addr4])                          \n\t"
+        MMI_USWC1(%[ftmp5], %[addr3], 0x00)
         PTR_ADDU   "%[addr3],   %[pix],         %[addr2]                \n\t"
+        MMI_USWC1(%[ftmp4], %[addr4], 0x00)
         "punpckhwd  %[ftmp9],   %[ftmp4],       %[ftmp4]                \n\t"
-        "gsswrc1    %[ftmp4],   0x00(%[addr4])                          \n\t"
-        "gsswlc1    %[ftmp9],   0x03(%[addr3])                          \n\t"
-        "gsswrc1    %[ftmp9],   0x00(%[addr3])                          \n\t"
+        MMI_USWC1(%[ftmp9], %[addr3], 0x00)
         : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
           [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
           [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
           [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
           [ftmp8]"=&f"(ftmp[8]),            [ftmp9]"=&f"(ftmp[9]),
           [ftmp10]"=&f"(ftmp[10]),
+          RESTRICT_ASM_LOW32
           [addr0]"=&r"(addr[0]),            [addr1]"=&r"(addr[1]),
           [addr2]"=&r"(addr[2]),            [addr3]"=&r"(addr[3]),
           [addr4]"=&r"(addr[4]),            [addr5]"=&r"(addr[5]),
-          [pix]"+&r"(pix),
-          [low32]"=&r"(low32)
+          [pix]"+&r"(pix)
         : [alpha]"r"(alpha),                [beta]"r"(beta),
           [stride]"r"((mips_reg)stride),    [ff_pb_1]"f"(ff_pb_1)
         : "memory"
@@ -2275,34 +2247,29 @@ void ff_deblock_v_luma_intra_8_mmi(uint8_t *pix, int stride, int alpha,
 void ff_deblock_h_luma_8_mmi(uint8_t *pix, int stride, int alpha, int beta,
         int8_t *tc0)
 {
-    uint64_t stack[0xd];
+    DECLARE_ALIGNED(8, const uint64_t, stack[0x0d]);
     double ftmp[9];
     mips_reg addr[8];
+    DECLARE_VAR_LOW32;
+    DECLARE_VAR_ALL64;
 
     __asm__ volatile (
         PTR_ADDU   "%[addr0],   %[stride],      %[stride]               \n\t"
         PTR_ADDI   "%[addr1],   %[pix],         -0x4                    \n\t"
         PTR_ADDU   "%[addr2],   %[stride],      %[addr0]                \n\t"
-        "gsldlc1    %[ftmp0],   0x07(%[addr1])                          \n\t"
-        "gsldrc1    %[ftmp0],   0x00(%[addr1])                          \n\t"
+        MMI_ULDC1(%[ftmp0], %[addr1], 0x00)
         PTR_ADDU   "%[addr3],   %[addr1],       %[stride]               \n\t"
         PTR_ADDU   "%[addr4],   %[addr1],       %[addr2]                \n\t"
-        "gsldlc1    %[ftmp1],   0x07(%[addr3])                          \n\t"
+        MMI_ULDC1(%[ftmp1], %[addr3], 0x00)
         PTR_ADDU   "%[addr5],   %[addr1],       %[addr0]                \n\t"
-        "gsldrc1    %[ftmp1],   0x00(%[addr3])                          \n\t"
-        "gsldlc1    %[ftmp2],   0x07(%[addr5])                          \n\t"
-        "gsldrc1    %[ftmp2],   0x00(%[addr5])                          \n\t"
-        "gsldlc1    %[ftmp3],   0x07(%[addr4])                          \n\t"
+        MMI_ULDC1(%[ftmp2], %[addr5], 0x00)
+        MMI_ULDC1(%[ftmp3], %[addr4], 0x00)
         PTR_ADDU   "%[addr3],   %[addr4],       %[stride]               \n\t"
-        "gsldrc1    %[ftmp3],   0x00(%[addr4])                          \n\t"
-        "gsldlc1    %[ftmp4],   0x07(%[addr3])                          \n\t"
+        MMI_ULDC1(%[ftmp4], %[addr3], 0x00)
         PTR_ADDU   "%[addr5],   %[addr4],       %[addr0]                \n\t"
-        "gsldrc1    %[ftmp4],   0x00(%[addr3])                          \n\t"
-        "gsldlc1    %[ftmp5],   0x07(%[addr5])                          \n\t"
+        MMI_ULDC1(%[ftmp5], %[addr5], 0x00)
         PTR_ADDU   "%[addr3],   %[addr4],       %[addr2]                \n\t"
-        "gsldrc1    %[ftmp5],   0x00(%[addr5])                          \n\t"
-        "gsldlc1    %[ftmp6],   0x07(%[addr3])                          \n\t"
-        "gsldrc1    %[ftmp6],   0x00(%[addr3])                          \n\t"
+        MMI_ULDC1(%[ftmp6], %[addr3], 0x00)
         PTR_ADDU   "%[addr6],   %[addr0],       %[addr0]                \n\t"
         "punpckhbh  %[ftmp7],   %[ftmp0],       %[ftmp1]                \n\t"
         "punpcklbh  %[ftmp0],   %[ftmp0],       %[ftmp1]                \n\t"
@@ -2311,9 +2278,8 @@ void ff_deblock_h_luma_8_mmi(uint8_t *pix, int stride, int alpha, int beta,
         "punpckhbh  %[ftmp3],   %[ftmp4],       %[ftmp5]                \n\t"
         "punpcklbh  %[ftmp4],   %[ftmp4],       %[ftmp5]                \n\t"
         PTR_ADDU   "%[addr3],   %[addr4],       %[addr6]                \n\t"
-        "sdc1       %[ftmp1],   0x10(%[stack])                          \n\t"
-        "gsldlc1    %[ftmp8],   0x07(%[addr3])                          \n\t"
-        "gsldrc1    %[ftmp8],   0x00(%[addr3])                          \n\t"
+        MMI_SDC1(%[ftmp1], %[stack], 0x10)
+        MMI_ULDC1(%[ftmp8], %[addr3], 0x00)
         PTR_ADDU   "%[addr7],   %[addr6],       %[addr6]                \n\t"
         "punpckhbh  %[ftmp5],   %[ftmp6],       %[ftmp8]                \n\t"
         "punpcklbh  %[ftmp6],   %[ftmp6],       %[ftmp8]                \n\t"
@@ -2321,9 +2287,9 @@ void ff_deblock_h_luma_8_mmi(uint8_t *pix, int stride, int alpha, int beta,
         "punpcklhw  %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
         "punpckhhw  %[ftmp2],   %[ftmp4],       %[ftmp6]                \n\t"
         "punpcklhw  %[ftmp4],   %[ftmp4],       %[ftmp6]                \n\t"
-        "ldc1       %[ftmp8],   0x10(%[stack])                          \n\t"
+        MMI_LDC1(%[ftmp8], %[stack], 0x10)
         "punpckhwd  %[ftmp0],   %[ftmp0],       %[ftmp4]                \n\t"
-        "sdc1       %[ftmp0],   0x00(%[stack])                          \n\t"
+        MMI_SDC1(%[ftmp0], %[stack], 0x00)
         "punpckhhw  %[ftmp6],   %[ftmp7],       %[ftmp8]                \n\t"
         "punpcklhw  %[ftmp7],   %[ftmp7],       %[ftmp8]                \n\t"
         "punpckhhw  %[ftmp0],   %[ftmp3],       %[ftmp5]                \n\t"
@@ -2333,32 +2299,25 @@ void ff_deblock_h_luma_8_mmi(uint8_t *pix, int stride, int alpha, int beta,
         "punpcklwd  %[ftmp7],   %[ftmp7],       %[ftmp3]                \n\t"
         "punpckhwd  %[ftmp3],   %[ftmp1],       %[ftmp2]                \n\t"
         "punpcklwd  %[ftmp1],   %[ftmp1],       %[ftmp2]                \n\t"
-        "sdc1       %[ftmp1],   0x10(%[stack])                          \n\t"
-        "sdc1       %[ftmp3],   0x20(%[stack])                          \n\t"
-        "sdc1       %[ftmp7],   0x30(%[stack])                          \n\t"
-        "sdc1       %[ftmp5],   0x40(%[stack])                          \n\t"
-        "sdc1       %[ftmp6],   0x50(%[stack])                          \n\t"
+        MMI_SDC1(%[ftmp1], %[stack], 0x10)
+        MMI_SDC1(%[ftmp3], %[stack], 0x20)
+        MMI_SDC1(%[ftmp7], %[stack], 0x30)
+        MMI_SDC1(%[ftmp5], %[stack], 0x40)
+        MMI_SDC1(%[ftmp6], %[stack], 0x50)
         PTR_ADDU   "%[addr1],   %[addr1],       %[addr7]                \n\t"
         PTR_ADDU   "%[addr4],   %[addr4],       %[addr7]                \n\t"
-        "gsldlc1    %[ftmp0],   0x07(%[addr1])                          \n\t"
+        MMI_ULDC1(%[ftmp0], %[addr1], 0x00)
         PTR_ADDU   "%[addr3],   %[addr1],       %[stride]               \n\t"
-        "gsldrc1    %[ftmp0],   0x00(%[addr1])                          \n\t"
-        "gsldlc1    %[ftmp1],   0x07(%[addr3])                          \n\t"
+        MMI_ULDC1(%[ftmp1], %[addr3], 0x00)
         PTR_ADDU   "%[addr5],   %[addr1],       %[addr0]                \n\t"
-        "gsldrc1    %[ftmp1],   0x00(%[addr3])                          \n\t"
-        "gsldlc1    %[ftmp2],   0x07(%[addr5])                          \n\t"
-        "gsldrc1    %[ftmp2],   0x00(%[addr5])                          \n\t"
-        "gsldlc1    %[ftmp3],   0x07(%[addr4])                          \n\t"
+        MMI_ULDC1(%[ftmp2], %[addr5], 0x00)
+        MMI_ULDC1(%[ftmp3], %[addr4], 0x00)
         PTR_ADDU   "%[addr3],   %[addr4],       %[stride]               \n\t"
-        "gsldrc1    %[ftmp3],   0x00(%[addr4])                          \n\t"
-        "gsldlc1    %[ftmp4],   0x07(%[addr3])                          \n\t"
+        MMI_ULDC1(%[ftmp4], %[addr3], 0x00)
         PTR_ADDU   "%[addr5],   %[addr4],       %[addr0]                \n\t"
-        "gsldrc1    %[ftmp4],   0x00(%[addr3])                          \n\t"
-        "gsldlc1    %[ftmp5],   0x07(%[addr5])                          \n\t"
+        MMI_ULDC1(%[ftmp5], %[addr5], 0x00)
         PTR_ADDU   "%[addr3],   %[addr4],       %[addr2]                \n\t"
-        "gsldrc1    %[ftmp5],   0x00(%[addr5])                          \n\t"
-        "gsldlc1    %[ftmp6],   0x07(%[addr3])                          \n\t"
-        "gsldrc1    %[ftmp6],   0x00(%[addr3])                          \n\t"
+        MMI_ULDC1(%[ftmp6], %[addr3], 0x00)
         "punpckhbh  %[ftmp7],   %[ftmp0],       %[ftmp1]                \n\t"
         "punpcklbh  %[ftmp0],   %[ftmp0],       %[ftmp1]                \n\t"
         "punpckhbh  %[ftmp1],   %[ftmp2],       %[ftmp3]                \n\t"
@@ -2366,9 +2325,8 @@ void ff_deblock_h_luma_8_mmi(uint8_t *pix, int stride, int alpha, int beta,
         "punpckhbh  %[ftmp3],   %[ftmp4],       %[ftmp5]                \n\t"
         "punpcklbh  %[ftmp4],   %[ftmp4],       %[ftmp5]                \n\t"
         PTR_ADDU   "%[addr3],   %[addr4],       %[addr6]                \n\t"
-        "sdc1       %[ftmp1],   0x18(%[stack])                          \n\t"
-        "gsldlc1    %[ftmp8],   0x07(%[addr3])                          \n\t"
-        "gsldrc1    %[ftmp8],   0x00(%[addr3])                          \n\t"
+        MMI_SDC1(%[ftmp1], %[stack], 0x18)
+        MMI_ULDC1(%[ftmp8], %[addr3], 0x00)
         "punpckhhw  %[ftmp1],   %[ftmp0],       %[ftmp2]                \n\t"
         "punpckhbh  %[ftmp5],   %[ftmp6],       %[ftmp8]                \n\t"
         "punpcklbh  %[ftmp6],   %[ftmp6],       %[ftmp8]                \n\t"
@@ -2376,8 +2334,8 @@ void ff_deblock_h_luma_8_mmi(uint8_t *pix, int stride, int alpha, int beta,
         "punpckhhw  %[ftmp2],   %[ftmp4],       %[ftmp6]                \n\t"
         "punpcklhw  %[ftmp4],   %[ftmp4],       %[ftmp6]                \n\t"
         "punpckhwd  %[ftmp0],   %[ftmp0],       %[ftmp4]                \n\t"
-        "ldc1       %[ftmp8],   0x18(%[stack])                          \n\t"
-        "sdc1       %[ftmp0],   0x08(%[stack])                          \n\t"
+        MMI_LDC1(%[ftmp8], %[stack], 0x18)
+        MMI_SDC1(%[ftmp0], %[stack], 0x08)
         "punpckhhw  %[ftmp6],   %[ftmp7],       %[ftmp8]                \n\t"
         "punpcklhw  %[ftmp7],   %[ftmp7],       %[ftmp8]                \n\t"
         "punpckhhw  %[ftmp0],   %[ftmp3],       %[ftmp5]                \n\t"
@@ -2387,16 +2345,17 @@ void ff_deblock_h_luma_8_mmi(uint8_t *pix, int stride, int alpha, int beta,
         "punpckhwd  %[ftmp3],   %[ftmp1],       %[ftmp2]                \n\t"
         "punpcklwd  %[ftmp1],   %[ftmp1],       %[ftmp2]                \n\t"
         "punpcklwd  %[ftmp6],   %[ftmp6],       %[ftmp0]                \n\t"
-        "sdc1       %[ftmp1],   0x18(%[stack])                          \n\t"
-        "sdc1       %[ftmp3],   0x28(%[stack])                          \n\t"
-        "sdc1       %[ftmp7],   0x38(%[stack])                          \n\t"
-        "sdc1       %[ftmp5],   0x48(%[stack])                          \n\t"
-        "sdc1       %[ftmp6],   0x58(%[stack])                          \n\t"
+        MMI_SDC1(%[ftmp1], %[stack], 0x18)
+        MMI_SDC1(%[ftmp3], %[stack], 0x28)
+        MMI_SDC1(%[ftmp7], %[stack], 0x38)
+        MMI_SDC1(%[ftmp5], %[stack], 0x48)
+        MMI_SDC1(%[ftmp6], %[stack], 0x58)
         : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
           [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
           [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
           [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
           [ftmp8]"=&f"(ftmp[8]),
+          RESTRICT_ASM_ALL64
           [addr0]"=&r"(addr[0]),            [addr1]"=&r"(addr[1]),
           [addr2]"=&r"(addr[2]),            [addr3]"=&r"(addr[3]),
           [addr4]"=&r"(addr[4]),            [addr5]"=&r"(addr[5]),
@@ -2410,15 +2369,15 @@ void ff_deblock_h_luma_8_mmi(uint8_t *pix, int stride, int alpha, int beta,
 
     __asm__ volatile (
         PTR_ADDU   "%[addr0],   %[stride],      %[stride]               \n\t"
-        PTR_ADDI   "%[addr1],   %[pix],          -0x02                  \n\t"
+        PTR_ADDI   "%[addr1],   %[pix],         -0x02                   \n\t"
         PTR_ADDU   "%[addr6],   %[addr0],       %[addr0]                \n\t"
         PTR_ADDU   "%[addr2],   %[addr0],       %[stride]               \n\t"
         PTR_ADDU   "%[addr7],   %[addr6],       %[addr6]                \n\t"
         PTR_ADDU   "%[addr4],   %[addr1],       %[addr2]                \n\t"
-        "ldc1       %[ftmp0],   0x10(%[stack])                          \n\t"
-        "ldc1       %[ftmp1],   0x20(%[stack])                          \n\t"
-        "ldc1       %[ftmp2],   0x30(%[stack])                          \n\t"
-        "ldc1       %[ftmp3],   0x40(%[stack])                          \n\t"
+        MMI_LDC1(%[ftmp0], %[stack], 0x10)
+        MMI_LDC1(%[ftmp1], %[stack], 0x20)
+        MMI_LDC1(%[ftmp2], %[stack], 0x30)
+        MMI_LDC1(%[ftmp3], %[stack], 0x40)
         "punpckhwd  %[ftmp4],   %[ftmp0],       %[ftmp0]                \n\t"
         "punpckhwd  %[ftmp5],   %[ftmp1],       %[ftmp1]                \n\t"
         "punpckhwd  %[ftmp6],   %[ftmp2],       %[ftmp2]                \n\t"
@@ -2426,43 +2385,35 @@ void ff_deblock_h_luma_8_mmi(uint8_t *pix, int stride, int alpha, int beta,
         "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp3]                \n\t"
         "punpcklhw  %[ftmp1],   %[ftmp0],       %[ftmp2]                \n\t"
         "punpckhhw  %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
-        "gsswlc1    %[ftmp1],   0x03(%[addr1])                          \n\t"
-        "gsswrc1    %[ftmp1],   0x00(%[addr1])                          \n\t"
+        MMI_USWC1(%[ftmp1], %[addr1], 0x00)
         PTR_ADDU   "%[addr3],   %[addr1],       %[stride]               \n\t"
         "punpckhwd  %[ftmp1],   %[ftmp1],       %[ftmp1]                \n\t"
         PTR_ADDU   "%[addr5],   %[addr1],       %[addr0]                \n\t"
-        "gsswlc1    %[ftmp1],   0x03(%[addr3])                          \n\t"
-        "gsswrc1    %[ftmp1],   0x00(%[addr3])                          \n\t"
-        "gsswlc1    %[ftmp0],   0x03(%[addr5])                          \n\t"
-        "gsswrc1    %[ftmp0],   0x00(%[addr5])                          \n\t"
+        MMI_USWC1(%[ftmp1], %[addr3], 0x00)
+        MMI_USWC1(%[ftmp0], %[addr5], 0x00)
         "punpckhwd  %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
         "punpckhwd  %[ftmp3],   %[ftmp3],       %[ftmp3]                \n\t"
-        "gsswlc1    %[ftmp0],   0x03(%[addr4])                          \n\t"
-        "gsswrc1    %[ftmp0],   0x00(%[addr4])                          \n\t"
+        MMI_USWC1(%[ftmp0], %[addr4], 0x00)
         "punpcklbh  %[ftmp4],   %[ftmp4],       %[ftmp5]                \n\t"
         "punpcklbh  %[ftmp6],   %[ftmp6],       %[ftmp3]                \n\t"
         "punpcklhw  %[ftmp5],   %[ftmp4],       %[ftmp6]                \n\t"
         PTR_ADDU   "%[addr3],   %[addr4],       %[stride]               \n\t"
         "punpckhhw  %[ftmp4],   %[ftmp4],       %[ftmp6]                \n\t"
-        "gsswlc1    %[ftmp5],   0x03(%[addr3])                          \n\t"
-        "gsswrc1    %[ftmp5],   0x00(%[addr3])                          \n\t"
+        MMI_USWC1(%[ftmp5], %[addr3], 0x00)
         PTR_ADDU   "%[addr3],   %[addr4],       %[addr0]                \n\t"
         "punpckhwd  %[ftmp5],   %[ftmp5],       %[ftmp5]                \n\t"
         PTR_ADDU   "%[addr5],   %[addr4],       %[addr2]                \n\t"
-        "gsswlc1    %[ftmp5],   0x03(%[addr3])                          \n\t"
-        "gsswrc1    %[ftmp5],   0x00(%[addr3])                          \n\t"
-        "gsswlc1    %[ftmp4],   0x03(%[addr5])                          \n\t"
-        "gsswrc1    %[ftmp4],   0x00(%[addr5])                          \n\t"
+        MMI_USWC1(%[ftmp5], %[addr3], 0x00)
+        MMI_USWC1(%[ftmp4], %[addr5], 0x00)
         PTR_ADDU   "%[addr3],   %[addr4],       %[addr6]                \n\t"
         "punpckhwd  %[ftmp4],   %[ftmp4],       %[ftmp4]                \n\t"
         PTR_ADDU   "%[addr1],   %[addr1],       %[addr7]                \n\t"
-        "gsswlc1    %[ftmp4],   0x03(%[addr3])                          \n\t"
-        "gsswrc1    %[ftmp4],   0x00(%[addr3])                          \n\t"
+        MMI_USWC1(%[ftmp4], %[addr3], 0x00)
         PTR_ADDU   "%[addr4],   %[addr4],       %[addr7]                \n\t"
-        "ldc1       %[ftmp0],   0x18(%[stack])                          \n\t"
-        "ldc1       %[ftmp1],   0x28(%[stack])                          \n\t"
-        "ldc1       %[ftmp2],   0x38(%[stack])                          \n\t"
-        "ldc1       %[ftmp3],   0x48(%[stack])                          \n\t"
+        MMI_LDC1(%[ftmp0], %[stack], 0x18)
+        MMI_LDC1(%[ftmp1], %[stack], 0x28)
+        MMI_LDC1(%[ftmp2], %[stack], 0x38)
+        MMI_LDC1(%[ftmp3], %[stack], 0x48)
         PTR_ADDU   "%[addr0],   %[stride],      %[stride]               \n\t"
         "punpckhwd  %[ftmp4],   %[ftmp0],       %[ftmp0]                \n\t"
         PTR_ADDU   "%[addr6],   %[addr0],       %[addr0]                \n\t"
@@ -2473,41 +2424,35 @@ void ff_deblock_h_luma_8_mmi(uint8_t *pix, int stride, int alpha, int beta,
         PTR_ADDU   "%[addr3],   %[addr1],       %[stride]               \n\t"
         "punpcklhw  %[ftmp1],   %[ftmp0],       %[ftmp2]                \n\t"
         "punpckhhw  %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
-        "gsswlc1    %[ftmp1],   0x03(%[addr1])                          \n\t"
-        "gsswrc1    %[ftmp1],   0x00(%[addr1])                          \n\t"
+        MMI_USWC1(%[ftmp1], %[addr1], 0x00)
         "punpckhwd  %[ftmp1],   %[ftmp1],       %[ftmp1]                \n\t"
         PTR_ADDU   "%[addr5],   %[addr1],       %[addr0]                \n\t"
-        "gsswlc1    %[ftmp1],   0x03(%[addr3])                          \n\t"
-        "gsswrc1    %[ftmp1],   0x00(%[addr3])                          \n\t"
-        "gsswlc1    %[ftmp0],   0x03(%[addr5])                          \n\t"
-        "gsswrc1    %[ftmp0],   0x00(%[addr5])                          \n\t"
+        MMI_USWC1(%[ftmp1], %[addr3], 0x00)
+        MMI_USWC1(%[ftmp0], %[addr5], 0x00)
         "punpckhwd  %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
         "punpckhwd  %[ftmp3],   %[ftmp3],       %[ftmp3]                \n\t"
-        "gsswlc1    %[ftmp0],   0x03(%[addr4])                          \n\t"
-        "gsswrc1    %[ftmp0],   0x00(%[addr4])                          \n\t"
+        MMI_USWC1(%[ftmp0], %[addr4], 0x00)
         "punpcklbh  %[ftmp4],   %[ftmp4],       %[ftmp5]                \n\t"
         "punpcklbh  %[ftmp6],   %[ftmp6],       %[ftmp3]                \n\t"
         PTR_ADDU   "%[addr3],   %[addr4],       %[stride]               \n\t"
         "punpcklhw  %[ftmp5],   %[ftmp4],       %[ftmp6]                \n\t"
         "punpckhhw  %[ftmp4],   %[ftmp4],       %[ftmp6]                \n\t"
-        "gsswlc1    %[ftmp5],   0x03(%[addr3])                          \n\t"
-        "gsswrc1    %[ftmp5],   0x00(%[addr3])                          \n\t"
+        MMI_USWC1(%[ftmp5], %[addr3], 0x00)
         PTR_ADDU   "%[addr3],   %[addr4],       %[addr0]                \n\t"
         "punpckhwd  %[ftmp5],   %[ftmp5],       %[ftmp5]                \n\t"
         PTR_ADDU   "%[addr5],   %[addr4],       %[addr2]                \n\t"
-        "gsswlc1    %[ftmp5],   0x03(%[addr3])                          \n\t"
-        "gsswrc1    %[ftmp5],   0x00(%[addr3])                          \n\t"
-        "gsswlc1    %[ftmp4],   0x03(%[addr5])                          \n\t"
-        "gsswrc1    %[ftmp4],   0x00(%[addr5])                          \n\t"
+        MMI_USWC1(%[ftmp5], %[addr3], 0x00)
+        MMI_USWC1(%[ftmp4], %[addr5], 0x00)
         PTR_ADDU   "%[addr3],   %[addr4],       %[addr6]                \n\t"
         "punpckhwd  %[ftmp4],   %[ftmp4],       %[ftmp4]                \n\t"
-        "gsswlc1    %[ftmp4],   0x03(%[addr3])                          \n\t"
-        "gsswrc1    %[ftmp4],   0x00(%[addr3])                          \n\t"
+        MMI_USWC1(%[ftmp4], %[addr3], 0x00)
         : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
           [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
           [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
           [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
           [ftmp8]"=&f"(ftmp[8]),
+          RESTRICT_ASM_LOW32
+          RESTRICT_ASM_ALL64
           [addr0]"=&r"(addr[0]),            [addr1]"=&r"(addr[1]),
           [addr2]"=&r"(addr[2]),            [addr3]"=&r"(addr[3]),
           [addr4]"=&r"(addr[4]),            [addr5]"=&r"(addr[5]),
@@ -2521,10 +2466,11 @@ void ff_deblock_h_luma_8_mmi(uint8_t *pix, int stride, int alpha, int beta,
 void ff_deblock_h_luma_intra_8_mmi(uint8_t *pix, int stride, int alpha,
         int beta)
 {
-    uint64_t ptmp[0x11];
-    uint64_t pdat[4];
+    DECLARE_ALIGNED(8, const uint64_t, ptmp[0x11]);
+    DECLARE_ALIGNED(8, const uint64_t, pdat[0x04]);
     double ftmp[9];
     mips_reg addr[7];
+    DECLARE_VAR_ALL64;
 
     __asm__ volatile (
         PTR_ADDU   "%[addr0],   %[stride],      %[stride]               \n\t"
@@ -2533,24 +2479,17 @@ void ff_deblock_h_luma_intra_8_mmi(uint8_t *pix, int stride, int alpha,
         PTR_ADDU   "%[addr3],   %[addr0],       %[addr0]                \n\t"
         PTR_ADDU   "%[addr4],   %[addr1],       %[addr2]                \n\t"
         PTR_ADDU   "%[addr5],   %[addr1],       %[stride]               \n\t"
-        "gsldlc1    %[ftmp0],   0x07(%[addr1])                          \n\t"
-        "gsldrc1    %[ftmp0],   0x00(%[addr1])                          \n\t"
+        MMI_ULDC1(%[ftmp0], %[addr1], 0x00)
         PTR_ADDU   "%[addr6],   %[addr1],       %[addr0]                \n\t"
-        "gsldlc1    %[ftmp1],   0x07(%[addr5])                          \n\t"
-        "gsldrc1    %[ftmp1],   0x00(%[addr5])                          \n\t"
-        "gsldlc1    %[ftmp2],   0x07(%[addr6])                          \n\t"
-        "gsldrc1    %[ftmp2],   0x00(%[addr6])                          \n\t"
+        MMI_ULDC1(%[ftmp1], %[addr5], 0x00)
+        MMI_ULDC1(%[ftmp2], %[addr6], 0x00)
         PTR_ADDU   "%[addr5],   %[addr4],       %[stride]               \n\t"
-        "gsldlc1    %[ftmp3],   0x07(%[addr4])                          \n\t"
-        "gsldrc1    %[ftmp3],   0x00(%[addr4])                          \n\t"
+        MMI_ULDC1(%[ftmp3], %[addr4], 0x00)
         PTR_ADDU   "%[addr6],   %[addr4],       %[addr0]                \n\t"
-        "gsldlc1    %[ftmp4],   0x07(%[addr5])                          \n\t"
-        "gsldrc1    %[ftmp4],   0x00(%[addr5])                          \n\t"
+        MMI_ULDC1(%[ftmp4], %[addr5], 0x00)
         PTR_ADDU   "%[addr5],   %[addr4],       %[addr2]                \n\t"
-        "gsldlc1    %[ftmp5],   0x07(%[addr6])                          \n\t"
-        "gsldrc1    %[ftmp5],   0x00(%[addr6])                          \n\t"
-        "gsldlc1    %[ftmp6],   0x07(%[addr5])                          \n\t"
-        "gsldrc1    %[ftmp6],   0x00(%[addr5])                          \n\t"
+        MMI_ULDC1(%[ftmp5], %[addr6], 0x00)
+        MMI_ULDC1(%[ftmp6], %[addr5], 0x00)
         PTR_ADDU   "%[addr5],   %[addr4],       %[addr3]                \n\t"
         "punpckhbh  %[ftmp7],   %[ftmp0],       %[ftmp1]                \n\t"
         "punpcklbh  %[ftmp0],   %[ftmp0],       %[ftmp1]                \n\t"
@@ -2558,60 +2497,52 @@ void ff_deblock_h_luma_intra_8_mmi(uint8_t *pix, int stride, int alpha,
         "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp3]                \n\t"
         "punpckhbh  %[ftmp3],   %[ftmp4],       %[ftmp5]                \n\t"
         "punpcklbh  %[ftmp4],   %[ftmp4],       %[ftmp5]                \n\t"
-        "gsldlc1    %[ftmp8],   0x07(%[addr5])                          \n\t"
-        "gsldrc1    %[ftmp8],   0x00(%[addr5])                          \n\t"
+        MMI_ULDC1(%[ftmp8], %[addr5], 0x00)
         "punpckhbh  %[ftmp5],   %[ftmp6],       %[ftmp8]                \n\t"
         "punpcklbh  %[ftmp6],   %[ftmp6],       %[ftmp8]                \n\t"
-        "sdc1       %[ftmp3],   0x00(%[ptmp])                           \n\t"
+        MMI_SDC1(%[ftmp3], %[ptmp], 0x00)
         "punpckhhw  %[ftmp3],   %[ftmp0],       %[ftmp2]                \n\t"
         "punpcklhw  %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
         "punpckhhw  %[ftmp2],   %[ftmp4],       %[ftmp6]                \n\t"
         "punpcklhw  %[ftmp4],   %[ftmp4],       %[ftmp6]                \n\t"
         "punpckhhw  %[ftmp6],   %[ftmp7],       %[ftmp1]                \n\t"
         "punpcklhw  %[ftmp7],   %[ftmp7],       %[ftmp1]                \n\t"
-        "sdc1       %[ftmp2],   0x20(%[ptmp])                           \n\t"
-        "ldc1       %[ftmp2],   0x00(%[ptmp])                           \n\t"
+        MMI_SDC1(%[ftmp2], %[ptmp], 0x20)
+        MMI_LDC1(%[ftmp2], %[ptmp], 0x00)
         "punpckhhw  %[ftmp1],   %[ftmp2],       %[ftmp5]                \n\t"
         "punpcklhw  %[ftmp2],   %[ftmp2],       %[ftmp5]                \n\t"
         "punpckhwd  %[ftmp5],   %[ftmp0],       %[ftmp4]                \n\t"
         "punpcklwd  %[ftmp0],   %[ftmp0],       %[ftmp4]                \n\t"
         "punpckhwd  %[ftmp4],   %[ftmp7],       %[ftmp2]                \n\t"
         "punpcklwd  %[ftmp7],   %[ftmp7],       %[ftmp2]                \n\t"
-        "sdc1       %[ftmp0],   0x00(%[ptmp])                           \n\t"
-        "sdc1       %[ftmp5],   0x10(%[ptmp])                           \n\t"
-        "sdc1       %[ftmp7],   0x40(%[ptmp])                           \n\t"
-        "sdc1       %[ftmp4],   0x50(%[ptmp])                           \n\t"
-        "ldc1       %[ftmp8],   0x20(%[ptmp])                           \n\t"
+        MMI_SDC1(%[ftmp0], %[ptmp], 0x00)
+        MMI_SDC1(%[ftmp5], %[ptmp], 0x10)
+        MMI_SDC1(%[ftmp7], %[ptmp], 0x40)
+        MMI_SDC1(%[ftmp4], %[ptmp], 0x50)
+        MMI_LDC1(%[ftmp8], %[ptmp], 0x20)
         "punpckhwd  %[ftmp0],   %[ftmp3],       %[ftmp8]                \n\t"
         "punpcklwd  %[ftmp3],   %[ftmp3],       %[ftmp8]                \n\t"
         "punpckhwd  %[ftmp5],   %[ftmp6],       %[ftmp1]                \n\t"
         "punpcklwd  %[ftmp6],   %[ftmp6],       %[ftmp1]                \n\t"
         PTR_ADDU   "%[addr5],   %[addr3],       %[addr3]                \n\t"
-        "sdc1       %[ftmp3],   0x20(%[ptmp])                           \n\t"
-        "sdc1       %[ftmp0],   0x30(%[ptmp])                           \n\t"
-        "sdc1       %[ftmp6],   0x60(%[ptmp])                           \n\t"
-        "sdc1       %[ftmp5],   0x70(%[ptmp])                           \n\t"
+        MMI_SDC1(%[ftmp3], %[ptmp], 0x20)
+        MMI_SDC1(%[ftmp0], %[ptmp], 0x30)
+        MMI_SDC1(%[ftmp6], %[ptmp], 0x60)
+        MMI_SDC1(%[ftmp5], %[ptmp], 0x70)
         PTR_ADDU   "%[addr1],   %[addr1],       %[addr5]                \n\t"
         PTR_ADDU   "%[addr4],   %[addr4],       %[addr5]                \n\t"
         PTR_ADDU   "%[addr5],   %[addr1],       %[stride]               \n\t"
-        "gsldlc1    %[ftmp0],   0x07(%[addr1])                          \n\t"
-        "gsldrc1    %[ftmp0],   0x00(%[addr1])                          \n\t"
+        MMI_ULDC1(%[ftmp0], %[addr1], 0x00)
         PTR_ADDU   "%[addr6],   %[addr1],       %[addr0]                \n\t"
-        "gsldlc1    %[ftmp1],   0x07(%[addr5])                          \n\t"
-        "gsldrc1    %[ftmp1],   0x00(%[addr5])                          \n\t"
-        "gsldlc1    %[ftmp2],   0x07(%[addr6])                          \n\t"
-        "gsldrc1    %[ftmp2],   0x00(%[addr6])                          \n\t"
+        MMI_ULDC1(%[ftmp1], %[addr5], 0x00)
+        MMI_ULDC1(%[ftmp2], %[addr6], 0x00)
         PTR_ADDU   "%[addr5],   %[addr4],       %[stride]               \n\t"
-        "gsldlc1    %[ftmp3],   0x07(%[addr4])                          \n\t"
-        "gsldrc1    %[ftmp3],   0x00(%[addr4])                          \n\t"
+        MMI_ULDC1(%[ftmp3], %[addr4], 0x00)
         PTR_ADDU   "%[addr6],   %[addr4],       %[addr0]                \n\t"
-        "gsldlc1    %[ftmp4],   0x07(%[addr5])                          \n\t"
-        "gsldrc1    %[ftmp4],   0x00(%[addr5])                          \n\t"
+        MMI_ULDC1(%[ftmp4], %[addr5], 0x00)
         PTR_ADDU   "%[addr5],   %[addr4],       %[addr2]                \n\t"
-        "gsldlc1    %[ftmp5],   0x07(%[addr6])                          \n\t"
-        "gsldrc1    %[ftmp5],   0x00(%[addr6])                          \n\t"
-        "gsldlc1    %[ftmp6],   0x07(%[addr5])                          \n\t"
-        "gsldrc1    %[ftmp6],   0x00(%[addr5])                          \n\t"
+        MMI_ULDC1(%[ftmp5], %[addr6], 0x00)
+        MMI_ULDC1(%[ftmp6], %[addr5], 0x00)
         PTR_ADDU   "%[addr5],   %[addr4],       %[addr3]                \n\t"
         "punpckhbh  %[ftmp7],   %[ftmp0],       %[ftmp1]                \n\t"
         "punpcklbh  %[ftmp0],   %[ftmp0],       %[ftmp1]                \n\t"
@@ -2619,38 +2550,37 @@ void ff_deblock_h_luma_intra_8_mmi(uint8_t *pix, int stride, int alpha,
         "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp3]                \n\t"
         "punpckhbh  %[ftmp3],   %[ftmp4],       %[ftmp5]                \n\t"
         "punpcklbh  %[ftmp4],   %[ftmp4],       %[ftmp5]                \n\t"
-        "gsldlc1    %[ftmp8],   0x07(%[addr5])                          \n\t"
-        "gsldrc1    %[ftmp8],   0x00(%[addr5])                          \n\t"
+        MMI_ULDC1(%[ftmp8], %[addr5], 0x00)
         "punpckhbh  %[ftmp5],   %[ftmp6],       %[ftmp8]                \n\t"
         "punpcklbh  %[ftmp6],   %[ftmp6],       %[ftmp8]                \n\t"
-        "sdc1       %[ftmp3],   0x08(%[ptmp])                           \n\t"
+        MMI_SDC1(%[ftmp3], %[ptmp], 0x08)
         "punpckhhw  %[ftmp3],   %[ftmp0],       %[ftmp2]                \n\t"
         "punpcklhw  %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
         "punpckhhw  %[ftmp2],   %[ftmp4],       %[ftmp6]                \n\t"
         "punpcklhw  %[ftmp4],   %[ftmp4],       %[ftmp6]                \n\t"
         "punpckhhw  %[ftmp6],   %[ftmp7],       %[ftmp1]                \n\t"
         "punpcklhw  %[ftmp7],   %[ftmp7],       %[ftmp1]                \n\t"
-        "sdc1       %[ftmp2],   0x28(%[ptmp])                           \n\t"
-        "ldc1       %[ftmp2],   0x08(%[ptmp])                           \n\t"
+        MMI_SDC1(%[ftmp2], %[ptmp], 0x28)
+        MMI_LDC1(%[ftmp2], %[ptmp], 0x08)
         "punpckhhw  %[ftmp1],   %[ftmp2],       %[ftmp5]                \n\t"
         "punpcklhw  %[ftmp2],   %[ftmp2],       %[ftmp5]                \n\t"
         "punpckhwd  %[ftmp5],   %[ftmp0],       %[ftmp4]                \n\t"
         "punpcklwd  %[ftmp0],   %[ftmp0],       %[ftmp4]                \n\t"
         "punpckhwd  %[ftmp4],   %[ftmp7],       %[ftmp2]                \n\t"
         "punpcklwd  %[ftmp7],   %[ftmp7],       %[ftmp2]                \n\t"
-        "sdc1       %[ftmp0],   0x08(%[ptmp])                           \n\t"
-        "sdc1       %[ftmp5],   0x18(%[ptmp])                           \n\t"
-        "sdc1       %[ftmp7],   0x48(%[ptmp])                           \n\t"
-        "sdc1       %[ftmp4],   0x58(%[ptmp])                           \n\t"
-        "ldc1       %[ftmp8],   0x28(%[ptmp])                           \n\t"
+        MMI_SDC1(%[ftmp0], %[ptmp], 0x08)
+        MMI_SDC1(%[ftmp5], %[ptmp], 0x18)
+        MMI_SDC1(%[ftmp7], %[ptmp], 0x48)
+        MMI_SDC1(%[ftmp4], %[ptmp], 0x58)
+        MMI_LDC1(%[ftmp8], %[ptmp], 0x28)
         "punpckhwd  %[ftmp0],   %[ftmp3],       %[ftmp8]                \n\t"
         "punpcklwd  %[ftmp3],   %[ftmp3],       %[ftmp8]                \n\t"
         "punpckhwd  %[ftmp5],   %[ftmp6],       %[ftmp1]                \n\t"
         "punpcklwd  %[ftmp6],   %[ftmp6],       %[ftmp1]                \n\t"
-        "sdc1       %[ftmp3],   0x28(%[ptmp])                           \n\t"
-        "sdc1       %[ftmp0],   0x38(%[ptmp])                           \n\t"
-        "sdc1       %[ftmp6],   0x68(%[ptmp])                           \n\t"
-        "sdc1       %[ftmp5],   0x78(%[ptmp])                           \n\t"
+        MMI_SDC1(%[ftmp3], %[ptmp], 0x28)
+        MMI_SDC1(%[ftmp0], %[ptmp], 0x38)
+        MMI_SDC1(%[ftmp6], %[ptmp], 0x68)
+        MMI_SDC1(%[ftmp5], %[ptmp], 0x78)
         PTR_S      "%[addr1],   0x00(%[pdat])                           \n\t"
         PTR_S      "%[addr2],   0x08(%[pdat])                           \n\t"
         PTR_S      "%[addr0],   0x10(%[pdat])                           \n\t"
@@ -2660,6 +2590,7 @@ void ff_deblock_h_luma_intra_8_mmi(uint8_t *pix, int stride, int alpha,
           [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
           [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
           [ftmp8]"=&f"(ftmp[8]),
+          RESTRICT_ASM_ALL64
           [addr0]"=&r"(addr[0]),            [addr1]"=&r"(addr[1]),
           [addr2]"=&r"(addr[2]),            [addr3]"=&r"(addr[3]),
           [addr4]"=&r"(addr[4]),            [addr5]"=&r"(addr[5]),
@@ -2677,24 +2608,23 @@ void ff_deblock_h_luma_intra_8_mmi(uint8_t *pix, int stride, int alpha,
         PTR_L      "%[addr0],   0x10(%[pdat])                           \n\t"
         PTR_L      "%[addr3],   0x18(%[pdat])                           \n\t"
         PTR_ADDU   "%[addr4],   %[addr1],       %[addr2]                \n\t"
-        "ldc1       %[ftmp0],   0x08(%[ptmp])                           \n\t"
-        "ldc1       %[ftmp1],   0x18(%[ptmp])                           \n\t"
-        "ldc1       %[ftmp2],   0x28(%[ptmp])                           \n\t"
-        "ldc1       %[ftmp3],   0x38(%[ptmp])                           \n\t"
-        "ldc1       %[ftmp4],   0x48(%[ptmp])                           \n\t"
-        "ldc1       %[ftmp5],   0x58(%[ptmp])                           \n\t"
-        "ldc1       %[ftmp6],   0x68(%[ptmp])                           \n\t"
+        MMI_LDC1(%[ftmp0], %[ptmp], 0x08)
+        MMI_LDC1(%[ftmp1], %[ptmp], 0x18)
+        MMI_LDC1(%[ftmp2], %[ptmp], 0x28)
+        MMI_LDC1(%[ftmp3], %[ptmp], 0x38)
+        MMI_LDC1(%[ftmp4], %[ptmp], 0x48)
+        MMI_LDC1(%[ftmp5], %[ptmp], 0x58)
+        MMI_LDC1(%[ftmp6], %[ptmp], 0x68)
         "punpckhbh  %[ftmp7],   %[ftmp0],       %[ftmp1]                \n\t"
         "punpcklbh  %[ftmp0],   %[ftmp0],       %[ftmp1]                \n\t"
         "punpckhbh  %[ftmp1],   %[ftmp2],       %[ftmp3]                \n\t"
         "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp3]                \n\t"
         "punpckhbh  %[ftmp3],   %[ftmp4],       %[ftmp5]                \n\t"
         "punpcklbh  %[ftmp4],   %[ftmp4],       %[ftmp5]                \n\t"
-        "ldc1       %[ftmp8],   0x78(%[ptmp])                           \n\t"
+        MMI_LDC1(%[ftmp8], %[ptmp], 0x78)
         "punpckhbh  %[ftmp5],   %[ftmp6],       %[ftmp8]                \n\t"
         "punpcklbh  %[ftmp6],   %[ftmp6],       %[ftmp8]                \n\t"
-        "gssdlc1    %[ftmp3],   0x07(%[addr1])                          \n\t"
-        "gssdrc1    %[ftmp3],   0x00(%[addr1])                          \n\t"
+        MMI_USDC1(%[ftmp3], %[addr1], 0x00)
         PTR_ADDU   "%[addr5],   %[addr1],       %[addr0]                \n\t"
         "punpckhhw  %[ftmp3],   %[ftmp0],       %[ftmp2]                \n\t"
         "punpcklhw  %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
@@ -2702,10 +2632,8 @@ void ff_deblock_h_luma_intra_8_mmi(uint8_t *pix, int stride, int alpha,
         "punpcklhw  %[ftmp4],   %[ftmp4],       %[ftmp6]                \n\t"
         "punpckhhw  %[ftmp6],   %[ftmp7],       %[ftmp1]                \n\t"
         "punpcklhw  %[ftmp7],   %[ftmp7],       %[ftmp1]                \n\t"
-        "gssdlc1    %[ftmp2],   0x07(%[addr5])                          \n\t"
-        "gssdrc1    %[ftmp2],   0x00(%[addr5])                          \n\t"
-        "gsldlc1    %[ftmp2],   0x07(%[addr1])                          \n\t"
-        "gsldrc1    %[ftmp2],   0x00(%[addr1])                          \n\t"
+        MMI_USDC1(%[ftmp2], %[addr5], 0x00)
+        MMI_ULDC1(%[ftmp2], %[addr1], 0x00)
         "punpckhhw  %[ftmp1],   %[ftmp2],       %[ftmp5]                \n\t"
         "punpcklhw  %[ftmp2],   %[ftmp2],       %[ftmp5]                \n\t"
         "punpckhwd  %[ftmp5],   %[ftmp0],       %[ftmp4]                \n\t"
@@ -2713,55 +2641,45 @@ void ff_deblock_h_luma_intra_8_mmi(uint8_t *pix, int stride, int alpha,
         "punpckhwd  %[ftmp4],   %[ftmp7],       %[ftmp2]                \n\t"
         "punpcklwd  %[ftmp7],   %[ftmp7],       %[ftmp2]                \n\t"
         PTR_ADDU   "%[addr5],   %[addr1],       %[stride]               \n\t"
-        "gssdlc1    %[ftmp0],   0x07(%[addr1])                          \n\t"
-        "gssdrc1    %[ftmp0],   0x00(%[addr1])                          \n\t"
+        MMI_USDC1(%[ftmp0], %[addr1], 0x00)
         PTR_ADDU   "%[addr6],   %[addr4],       %[stride]               \n\t"
-        "gssdlc1    %[ftmp5],   0x07(%[addr5])                          \n\t"
-        "gssdrc1    %[ftmp5],   0x00(%[addr5])                          \n\t"
+        MMI_USDC1(%[ftmp5], %[addr5], 0x00)
         PTR_ADDU   "%[addr5],   %[addr4],       %[addr0]                \n\t"
-        "gssdlc1    %[ftmp7],   0x07(%[addr6])                          \n\t"
-        "gssdrc1    %[ftmp7],   0x00(%[addr6])                          \n\t"
+        MMI_USDC1(%[ftmp7], %[addr6], 0x00)
         PTR_ADDU   "%[addr6],   %[addr1],       %[addr0]                \n\t"
-        "gssdlc1    %[ftmp4],   0x07(%[addr5])                          \n\t"
-        "gssdrc1    %[ftmp4],   0x00(%[addr5])                          \n\t"
-        "gsldlc1    %[ftmp8],   0x07(%[addr6])                          \n\t"
-        "gsldrc1    %[ftmp8],   0x00(%[addr6])                          \n\t"
+        MMI_USDC1(%[ftmp4], %[addr5], 0x00)
+        MMI_ULDC1(%[ftmp8], %[addr6], 0x00)
         PTR_ADDU   "%[addr5],   %[addr1],       %[addr0]                \n\t"
         "punpckhwd  %[ftmp0],   %[ftmp3],       %[ftmp8]                \n\t"
         "punpcklwd  %[ftmp3],   %[ftmp3],       %[ftmp8]                \n\t"
         "punpckhwd  %[ftmp5],   %[ftmp6],       %[ftmp1]                \n\t"
         "punpcklwd  %[ftmp6],   %[ftmp6],       %[ftmp1]                \n\t"
-        "gssdlc1    %[ftmp3],   0x07(%[addr5])                          \n\t"
-        "gssdrc1    %[ftmp3],   0x00(%[addr5])                          \n\t"
+        MMI_USDC1(%[ftmp3], %[addr5], 0x00)
         PTR_ADDU   "%[addr5],   %[addr4],       %[addr2]                \n\t"
-        "gssdlc1    %[ftmp0],   0x07(%[addr4])                          \n\t"
-        "gssdrc1    %[ftmp0],   0x00(%[addr4])                          \n\t"
+        MMI_USDC1(%[ftmp0], %[addr4], 0x00)
         PTR_ADDU   "%[addr6],   %[addr4],       %[addr3]                \n\t"
-        "gssdlc1    %[ftmp6],   0x07(%[addr5])                          \n\t"
-        "gssdrc1    %[ftmp6],   0x00(%[addr5])                          \n\t"
+        MMI_USDC1(%[ftmp6], %[addr5], 0x00)
         PTR_ADDU   "%[addr5],   %[addr3],       %[addr3]                \n\t"
-        "gssdlc1    %[ftmp5],   0x07(%[addr6])                          \n\t"
-        "gssdrc1    %[ftmp5],   0x00(%[addr6])                          \n\t"
+        MMI_USDC1(%[ftmp5], %[addr6], 0x00)
         PTR_SUBU   "%[addr1],   %[addr1],       %[addr5]                \n\t"
         PTR_SUBU   "%[addr4],   %[addr4],       %[addr5]                \n\t"
-        "ldc1       %[ftmp0],   0x00(%[ptmp])                           \n\t"
-        "ldc1       %[ftmp1],   0x10(%[ptmp])                           \n\t"
-        "ldc1       %[ftmp2],   0x20(%[ptmp])                           \n\t"
-        "ldc1       %[ftmp3],   0x30(%[ptmp])                           \n\t"
-        "ldc1       %[ftmp4],   0x40(%[ptmp])                           \n\t"
-        "ldc1       %[ftmp5],   0x50(%[ptmp])                           \n\t"
-        "ldc1       %[ftmp6],   0x60(%[ptmp])                           \n\t"
+        MMI_LDC1(%[ftmp0], %[ptmp], 0x00)
+        MMI_LDC1(%[ftmp1], %[ptmp], 0x10)
+        MMI_LDC1(%[ftmp2], %[ptmp], 0x20)
+        MMI_LDC1(%[ftmp3], %[ptmp], 0x30)
+        MMI_LDC1(%[ftmp4], %[ptmp], 0x40)
+        MMI_LDC1(%[ftmp5], %[ptmp], 0x50)
+        MMI_LDC1(%[ftmp6], %[ptmp], 0x60)
         "punpckhbh  %[ftmp7],   %[ftmp0],       %[ftmp1]                \n\t"
         "punpcklbh  %[ftmp0],   %[ftmp0],       %[ftmp1]                \n\t"
         "punpckhbh  %[ftmp1],   %[ftmp2],       %[ftmp3]                \n\t"
         "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp3]                \n\t"
         "punpckhbh  %[ftmp3],   %[ftmp4],       %[ftmp5]                \n\t"
         "punpcklbh  %[ftmp4],   %[ftmp4],       %[ftmp5]                \n\t"
-        "ldc1       %[ftmp8],   0x70(%[ptmp])                           \n\t"
+        MMI_LDC1(%[ftmp8], %[ptmp], 0x70)
         "punpckhbh  %[ftmp5],   %[ftmp6],       %[ftmp8]                \n\t"
         "punpcklbh  %[ftmp6],   %[ftmp6],       %[ftmp8]                \n\t"
-        "gssdlc1    %[ftmp3],   0x07(%[addr1])                          \n\t"
-        "gssdrc1    %[ftmp3],   0x00(%[addr1])                          \n\t"
+        MMI_USDC1(%[ftmp3], %[addr1], 0x00)
         PTR_ADDU   "%[addr5],   %[addr1],       %[addr0]                \n\t"
         "punpckhhw  %[ftmp3],   %[ftmp0],       %[ftmp2]                \n\t"
         "punpcklhw  %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
@@ -2769,10 +2687,8 @@ void ff_deblock_h_luma_intra_8_mmi(uint8_t *pix, int stride, int alpha,
         "punpcklhw  %[ftmp4],   %[ftmp4],       %[ftmp6]                \n\t"
         "punpckhhw  %[ftmp6],   %[ftmp7],       %[ftmp1]                \n\t"
         "punpcklhw  %[ftmp7],   %[ftmp7],       %[ftmp1]                \n\t"
-        "gssdlc1    %[ftmp2],   0x07(%[addr5])                          \n\t"
-        "gssdrc1    %[ftmp2],   0x00(%[addr5])                          \n\t"
-        "gsldlc1    %[ftmp2],   0x07(%[addr1])                          \n\t"
-        "gsldrc1    %[ftmp2],   0x00(%[addr1])                          \n\t"
+        MMI_USDC1(%[ftmp2], %[addr5], 0x00)
+        MMI_ULDC1(%[ftmp2], %[addr1], 0x00)
         "punpckhhw  %[ftmp1],   %[ftmp2],       %[ftmp5]                \n\t"
         "punpcklhw  %[ftmp2],   %[ftmp2],       %[ftmp5]                \n\t"
         "punpckhwd  %[ftmp5],   %[ftmp0],       %[ftmp4]                \n\t"
@@ -2780,39 +2696,31 @@ void ff_deblock_h_luma_intra_8_mmi(uint8_t *pix, int stride, int alpha,
         "punpckhwd  %[ftmp4],   %[ftmp7],       %[ftmp2]                \n\t"
         "punpcklwd  %[ftmp7],   %[ftmp7],       %[ftmp2]                \n\t"
         PTR_ADDU   "%[addr5],   %[addr1],       %[stride]               \n\t"
-        "gssdlc1    %[ftmp0],   0x07(%[addr1])                          \n\t"
-        "gssdrc1    %[ftmp0],   0x00(%[addr1])                          \n\t"
+        MMI_USDC1(%[ftmp0], %[addr1], 0x00)
         PTR_ADDU   "%[addr6],   %[addr4],       %[stride]               \n\t"
-        "gssdlc1    %[ftmp5],   0x07(%[addr5])                          \n\t"
-        "gssdrc1    %[ftmp5],   0x00(%[addr5])                          \n\t"
+        MMI_USDC1(%[ftmp5], %[addr5], 0x00)
         PTR_ADDU   "%[addr5],   %[addr4],       %[addr0]                \n\t"
-        "gssdlc1    %[ftmp7],   0x07(%[addr6])                          \n\t"
-        "gssdrc1    %[ftmp7],   0x00(%[addr6])                          \n\t"
+        MMI_USDC1(%[ftmp7], %[addr6], 0x00)
         PTR_ADDU   "%[addr6],   %[addr1],       %[addr0]                \n\t"
-        "gssdlc1    %[ftmp4],   0x07(%[addr5])                          \n\t"
-        "gssdrc1    %[ftmp4],   0x00(%[addr5])                          \n\t"
-        "gsldlc1    %[ftmp8],   0x07(%[addr6])                          \n\t"
-        "gsldrc1    %[ftmp8],   0x00(%[addr6])                          \n\t"
+        MMI_USDC1(%[ftmp4], %[addr5], 0x00)
+        MMI_ULDC1(%[ftmp8], %[addr6], 0x00)
         PTR_ADDU   "%[addr5],   %[addr1],       %[addr0]                \n\t"
         "punpckhwd  %[ftmp0],   %[ftmp3],       %[ftmp8]                \n\t"
         "punpcklwd  %[ftmp3],   %[ftmp3],       %[ftmp8]                \n\t"
         "punpckhwd  %[ftmp5],   %[ftmp6],       %[ftmp1]                \n\t"
         "punpcklwd  %[ftmp6],   %[ftmp6],       %[ftmp1]                \n\t"
-        "gssdlc1    %[ftmp3],   0x07(%[addr5])                          \n\t"
-        "gssdrc1    %[ftmp3],   0x00(%[addr5])                          \n\t"
+        MMI_USDC1(%[ftmp3], %[addr5], 0x00)
         PTR_ADDU   "%[addr5],   %[addr4],       %[addr2]                \n\t"
-        "gssdlc1    %[ftmp0],   0x07(%[addr4])                          \n\t"
-        "gssdrc1    %[ftmp0],   0x00(%[addr4])                          \n\t"
+        MMI_USDC1(%[ftmp0], %[addr4], 0x00)
         PTR_ADDU   "%[addr6],   %[addr4],       %[addr3]                \n\t"
-        "gssdlc1    %[ftmp6],   0x07(%[addr5])                          \n\t"
-        "gssdrc1    %[ftmp6],   0x00(%[addr5])                          \n\t"
-        "gssdlc1    %[ftmp5],   0x07(%[addr6])                          \n\t"
-        "gssdrc1    %[ftmp5],   0x00(%[addr6])                          \n\t"
+        MMI_USDC1(%[ftmp6], %[addr5], 0x00)
+        MMI_USDC1(%[ftmp5], %[addr6], 0x00)
         : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
           [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
           [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
           [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
           [ftmp8]"=&f"(ftmp[8]),
+          RESTRICT_ASM_ALL64
           [addr0]"=&r"(addr[0]),            [addr1]"=&r"(addr[1]),
           [addr2]"=&r"(addr[2]),            [addr3]"=&r"(addr[3]),
           [addr4]"=&r"(addr[4]),            [addr5]"=&r"(addr[5]),
index bb795a1..f4fe091 100644 (file)
 
 #include "h264pred_mips.h"
 #include "libavcodec/bit_depth_template.c"
-#include "libavutil/mips/asmdefs.h"
+#include "libavutil/mips/mmiutils.h"
 #include "constants.h"
 
 void ff_pred16x16_vertical_8_mmi(uint8_t *src, ptrdiff_t stride)
 {
     double ftmp[2];
     uint64_t tmp[1];
+    DECLARE_VAR_ALL64;
 
     __asm__ volatile (
         "dli        %[tmp0],    0x08                                    \n\t"
-        "gsldlc1    %[ftmp0],   0x07(%[srcA])                           \n\t"
-        "gsldrc1    %[ftmp0],   0x00(%[srcA])                           \n\t"
-        "gsldlc1    %[ftmp1],   0x0f(%[srcA])                           \n\t"
-        "gsldrc1    %[ftmp1],   0x08(%[srcA])                           \n\t"
+        MMI_LDC1(%[ftmp0], %[srcA], 0x00)
+        MMI_LDC1(%[ftmp1], %[srcA], 0x08)
+
         "1:                                                             \n\t"
-        "gssdlc1    %[ftmp0],   0x07(%[src])                            \n\t"
-        "gssdrc1    %[ftmp0],   0x00(%[src])                            \n\t"
-        "gssdlc1    %[ftmp1],   0x0f(%[src])                            \n\t"
-        "gssdrc1    %[ftmp1],   0x08(%[src])                            \n\t"
+        MMI_SDC1(%[ftmp0], %[src], 0x00)
+        MMI_SDC1(%[ftmp1], %[src], 0x08)
         PTR_ADDU   "%[src],     %[src],         %[stride]               \n\t"
-        "gssdlc1    %[ftmp0],   0x07(%[src])                            \n\t"
-        "gssdrc1    %[ftmp0],   0x00(%[src])                            \n\t"
-        "gssdlc1    %[ftmp1],   0x0f(%[src])                            \n\t"
-        "gssdrc1    %[ftmp1],   0x08(%[src])                            \n\t"
+        MMI_SDC1(%[ftmp0], %[src], 0x00)
+        MMI_SDC1(%[ftmp1], %[src], 0x08)
+
         "daddi      %[tmp0],    %[tmp0],        -0x01                   \n\t"
         PTR_ADDU   "%[src],     %[src],         %[stride]               \n\t"
         "bnez       %[tmp0],    1b                                      \n\t"
         : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
           [tmp0]"=&r"(tmp[0]),
+          RESTRICT_ASM_ALL64
           [src]"+&r"(src)
         : [stride]"r"((mips_reg)stride),    [srcA]"r"((mips_reg)(src-stride))
         : "memory"
@@ -160,15 +158,14 @@ void ff_pred8x8l_top_dc_8_mmi(uint8_t *src, int has_topleft,
     uint32_t dc;
     double ftmp[11];
     mips_reg tmp[3];
+    DECLARE_VAR_ALL64;
+    DECLARE_VAR_ADDRT;
 
     __asm__ volatile (
         "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
-        "gsldlc1    %[ftmp10],  0x07(%[srcA])                           \n\t"
-        "gsldrc1    %[ftmp10],  0x00(%[srcA])                           \n\t"
-        "gsldlc1    %[ftmp9],   0x07(%[src0])                           \n\t"
-        "gsldrc1    %[ftmp9],   0x00(%[src0])                           \n\t"
-        "gsldlc1    %[ftmp8],   0x07(%[src1])                           \n\t"
-        "gsldrc1    %[ftmp8],   0x00(%[src1])                           \n\t"
+        MMI_ULDC1(%[ftmp10], %[srcA], 0x00)
+        MMI_ULDC1(%[ftmp9], %[src0], 0x00)
+        MMI_ULDC1(%[ftmp8], %[src1], 0x00)
 
         "punpcklbh  %[ftmp7],   %[ftmp10],      %[ftmp0]                \n\t"
         "punpckhbh  %[ftmp6],   %[ftmp10],      %[ftmp0]                \n\t"
@@ -209,6 +206,7 @@ void ff_pred8x8l_top_dc_8_mmi(uint8_t *src, int has_topleft,
           [ftmp8]"=&f"(ftmp[8]),            [ftmp9]"=&f"(ftmp[9]),
           [ftmp10]"=&f"(ftmp[10]),
           [tmp0]"=&r"(tmp[0]),              [tmp1]"=&r"(tmp[1]),
+          RESTRICT_ASM_ALL64
           [dc]"=r"(dc)
         : [srcA]"r"((mips_reg)(src-stride-1)),
           [src0]"r"((mips_reg)(src-stride)),
@@ -221,20 +219,22 @@ void ff_pred8x8l_top_dc_8_mmi(uint8_t *src, int has_topleft,
     __asm__ volatile (
         "dli        %[tmp0],    0x02                                    \n\t"
         "punpcklwd  %[ftmp0],   %[dc],          %[dc]                   \n\t"
+
         "1:                                                             \n\t"
-        "gssdlc1    %[ftmp0],   0x07(%[src])                            \n\t"
-        "gssdrc1    %[ftmp0],   0x00(%[src])                            \n\t"
-        "gssdxc1    %[ftmp0],   0x00(%[src],    %[stride])              \n\t"
+        MMI_SDC1(%[ftmp0], %[src], 0x00)
+        MMI_SDXC1(%[ftmp0], %[src], %[stride], 0x00)
         PTR_ADDU   "%[src],     %[src],         %[stride]               \n\t"
         PTR_ADDU   "%[src],     %[src],         %[stride]               \n\t"
-        "gssdlc1    %[ftmp0],   0x07(%[src])                            \n\t"
-        "gssdrc1    %[ftmp0],   0x00(%[src])                            \n\t"
-        "gssdxc1    %[ftmp0],   0x00(%[src],    %[stride])              \n\t"
+        MMI_SDC1(%[ftmp0], %[src], 0x00)
+        MMI_SDXC1(%[ftmp0], %[src], %[stride], 0x00)
+
         "daddi      %[tmp0],    %[tmp0],        -0x01                   \n\t"
         PTR_ADDU   "%[src],     %[src],         %[stride]               \n\t"
         PTR_ADDU   "%[src],     %[src],         %[stride]               \n\t"
         "bnez       %[tmp0],    1b                                      \n\t"
         : [ftmp0]"=&f"(ftmp[0]),            [tmp0]"=&r"(tmp[0]),
+          RESTRICT_ASM_ALL64
+          RESTRICT_ASM_ADDRT
           [src]"+&r"(src)
         : [dc]"f"(dc),                      [stride]"r"((mips_reg)stride)
         : "memory"
@@ -257,13 +257,13 @@ void ff_pred8x8l_dc_8_mmi(uint8_t *src, int has_topleft, int has_topright,
     const int l6 = (src[-1+5*stride] + 2*src[-1+6*stride] + src[-1+7*stride] + 2) >> 2;
     const int l7 = (src[-1+6*stride] + 2*src[-1+7*stride] + src[-1+7*stride] + 2) >> 2;
 
+    DECLARE_VAR_ALL64;
+    DECLARE_VAR_ADDRT;
+
     __asm__ volatile (
-        "gsldlc1    %[ftmp4],   0x07(%[srcA])                           \n\t"
-        "gsldrc1    %[ftmp4],   0x00(%[srcA])                           \n\t"
-        "gsldlc1    %[ftmp5],   0x07(%[src0])                           \n\t"
-        "gsldrc1    %[ftmp5],   0x00(%[src0])                           \n\t"
-        "gsldlc1    %[ftmp6],   0x07(%[src1])                           \n\t"
-        "gsldrc1    %[ftmp6],   0x00(%[src1])                           \n\t"
+        MMI_ULDC1(%[ftmp4], %[srcA], 0x00)
+        MMI_ULDC1(%[ftmp5], %[src0], 0x00)
+        MMI_ULDC1(%[ftmp6], %[src1], 0x00)
         "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
         "dli        %[tmp0],    0x03                                    \n\t"
         "punpcklbh  %[ftmp7],   %[ftmp4],       %[ftmp0]                \n\t"
@@ -309,7 +309,9 @@ void ff_pred8x8l_dc_8_mmi(uint8_t *src, int has_topleft, int has_topright,
           [ftmp8]"=&f"(ftmp[8]),            [ftmp9]"=&f"(ftmp[9]),
           [ftmp10]"=&f"(ftmp[10]),          [ftmp11]"=&f"(ftmp[11]),
           [ftmp12]"=&f"(ftmp[12]),          [ftmp13]"=&f"(ftmp[13]),
-          [tmp0]"=&r"(tmp[0]),              [dc2]"=r"(dc2)
+          [tmp0]"=&r"(tmp[0]),
+          RESTRICT_ASM_ALL64
+          [dc2]"=r"(dc2)
         : [srcA]"r"((mips_reg)(src-stride-1)),
           [src0]"r"((mips_reg)(src-stride)),
           [src1]"r"((mips_reg)(src-stride+1)),
@@ -323,20 +325,22 @@ void ff_pred8x8l_dc_8_mmi(uint8_t *src, int has_topleft, int has_topright,
     __asm__ volatile (
         "dli        %[tmp0],    0x02                                    \n\t"
         "punpcklwd  %[ftmp0],   %[dc],          %[dc]                   \n\t"
+
         "1:                                                             \n\t"
-        "gssdlc1    %[ftmp0],   0x07(%[src])                            \n\t"
-        "gssdrc1    %[ftmp0],   0x00(%[src])                            \n\t"
-        "gssdxc1    %[ftmp0],   0x00(%[src],