x86/exrdsp: optimize ff_reorder_pixels_avx2()
author Henrik Gramner <henrik@gramner.com>
Mon, 18 Sep 2017 01:52:13 +0000 (22:52 -0300)
committer James Almer <jamrial@gmail.com>
Tue, 19 Sep 2017 02:24:55 +0000 (23:24 -0300)
Tested with "checkasm --test=exrdsp --bench"

Before:
reorder_pixels_c: 5187.8
reorder_pixels_sse2: 377.0
reorder_pixels_avx2: 331.3

After:
reorder_pixels_c: 5181.5
reorder_pixels_sse2: 377.0
reorder_pixels_avx2: 313.8

Signed-off-by: James Almer <jamrial@gmail.com>
libavcodec/x86/exrdsp.asm

index b91a7be..06c629e 100644 (file)
@@ -39,16 +39,15 @@ cglobal reorder_pixels, 3,4,3, dst, src1, size, src2
     neg                              sizeq                ; size = offset for dst, src1, src2
 .loop:
 
-%if cpuflag(avx2)
-    vpermq                              m0, [src1q + sizeq], 0xd8; load first part
-    vpermq                              m1, [src2q + sizeq], 0xd8; load second part
-%else
     mova                                m0, [src1q+sizeq]        ; load first part
     movu                                m1, [src2q+sizeq]        ; load second part
-%endif
     SBUTTERFLY bw, 0, 1, 2                                       ; interleaved
-    mova                 [dstq+2*sizeq   ], m0                   ; copy to dst
-    mova             [dstq+2*sizeq+mmsize], m1
+    mova                 [dstq+2*sizeq   ], xm0                  ; copy to dst
+    mova                 [dstq+2*sizeq+16], xm1
+%if cpuflag(avx2)
+    vperm2i128                          m0, m0, m1, q0301
+    mova                 [dstq+2*sizeq+32], m0
+%endif
     add     sizeq, mmsize
     jl .loop
     RET