avcodec/x86/hevc: add avx2 dc idct
author: plepere <pierre-edouard.lepere@insa-rennes.fr>
Mon, 16 Jun 2014 12:47:21 +0000 (14:47 +0200)
committer: Michael Niedermayer <michaelni@gmx.at>
Wed, 25 Jun 2014 12:49:44 +0000 (14:49 +0200)
Signed-off-by: Michael Niedermayer <michaelni@gmx.at>
libavcodec/x86/hevc_idct.asm
libavcodec/x86/hevcdsp.h
libavcodec/x86/hevcdsp_init.c

index 6963dc7..31532ae 100644 (file)
 ; */
 %include "libavutil/x86/x86util.asm"
 
-SECTION_RODATA
-max_pixels_10:          times 8  dw ((1 << 10)-1)
+SECTION_RODATA 32                                   ; 32-byte alignment: the AVX2 code reads max_pixels_10 with an aligned ymm load (mova)
+max_pixels_10:          times 16  dw ((1 << 10)-1)  ; 16 words of 1023, i.e. one full 32-byte ymm register
 dc_add_10:              times 4 dd ((1 << 14-10) + 1)
 
 
-SECTION .text
+SECTION_TEXT 32
 
 ;the idct_dc_add macros and functions were largely inspired by x264 project's code in the h264_idct.asm file
 
@@ -41,6 +41,18 @@ SECTION .text
     packuswb          m1, m1
 %endmacro
 
+%macro DC_ADD_INIT_AVX2 2 ; %1 = GPR holding the DC coefficient (overwritten), %2 = stride register
+    add              %1w, ((1 << 14-8) + 1) ; bias added ahead of the shift below
+    sar              %1w, (15-8)            ; arithmetic shift right by 7, sign preserved
+    movd             xm0, %1d               ; low dword of %1 -> xm0
+    vpbroadcastw      m0, xm0    ;SPLATW: replicate the low word into every word lane of m0
+    lea               %1, [%2*3]            ; DC now lives in m0, so %1 is reused as 3*%2
+    pxor              m1, m1
+    psubw             m1, m0                ; m1 = 0 - m0: the negated DC in every word
+    packuswb          m0, m0                ; unsigned-saturating pack: bytes of DC clamped to 0..255
+    packuswb          m1, m1                ; same for -DC, so a negative DC ends up as a magnitude in m1
+%endmacro
+
 %macro DC_ADD_OP 4
     %1                m2, [%2     ]
     %1                m3, [%2+%3  ]
@@ -112,6 +124,19 @@ cglobal hevc_idct16_dc_add_8, 3, 4, 0
     DC_ADD_OP       mova, r0, r2, r3
     RET
 
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
+; void ff_hevc_idct32_dc_add_8_avx2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
+cglobal hevc_idct32_dc_add_8, 3, 4, 6
+    movsx             r3, word [r1]         ; r3 = sign-extended coeffs[0]
+    DC_ADD_INIT_AVX2  r3, r2                ; m0/m1 = packed +DC/-DC bytes, r3 = 3*stride
+    DC_ADD_OP       mova, r0, r2, r3        ; exactly four arguments: DC_ADD_OP is declared with 4 parameters
+%rep 7
+    lea               r0, [r0+r2*4]         ; dst += 4*stride before each further DC_ADD_OP step
+    DC_ADD_OP       mova, r0, r2, r3
+%endrep
+    RET
+%endif ;HAVE_AVX2_EXTERNAL
 ;-----------------------------------------------------------------------------
 ; void ff_hevc_idct_dc_add_10(pixel *dst, int16_t *block, int stride)
 ;-----------------------------------------------------------------------------
@@ -178,3 +203,23 @@ IDCT8_DC_ADD
 INIT_XMM avx
 IDCT8_DC_ADD
 %endif
+
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
+cglobal hevc_idct16_dc_add_10,3,4,7         ; r0 = dst, r1 = coeffs, r2 = stride (argument order of the C callers)
+    mov              r1w, [r1]              ; r1w = coeffs[0], the only coefficient read here
+    add              r1w, ((1 << 4) + 1)    ; +17 bias ahead of the shift below
+    sar              r1w, 5                 ; arithmetic shift right by 5, sign preserved
+    movd             xm0, r1d               ; only the low word is consumed by the broadcast below
+    lea               r1, [r2*3]            ; DC now lives in xm0, so r1 is reused as 3*stride
+    vpbroadcastw      m0, xm0    ;SPLATW: DC value in all 16 word lanes of m0
+    mova              m6, [max_pixels_10]   ; 16 words of (1 << 10)-1; presumably the clamp ceiling for IDCT_DC_ADD_OP_10 (macro not shown here)
+    IDCT_DC_ADD_OP_10 r0, r2, r1            ; NOTE(review): presumably one 4-row step per invocation -- confirm in the macro
+    lea               r0, [r0+r2*4]         ; dst += 4*stride
+    IDCT_DC_ADD_OP_10 r0, r2, r1
+    lea               r0, [r0+r2*4]         ; dst += 4*stride
+    IDCT_DC_ADD_OP_10 r0, r2, r1
+    lea               r0, [r0+r2*4]         ; dst += 4*stride
+    IDCT_DC_ADD_OP_10 r0, r2, r1
+    RET
+%endif ;HAVE_AVX2_EXTERNAL
index 029492e..661a860 100644 (file)
@@ -133,6 +133,8 @@ idct_dc_proto(8, 8,mmxext);
 idct_dc_proto(16,8,  sse2);
 idct_dc_proto(32,8,  sse2);
 
+idct_dc_proto(32,8,  avx2);
+
 
 idct_dc_proto(4, 10,mmxext);
 idct_dc_proto(8, 10,  sse2);
@@ -142,6 +144,10 @@ idct_dc_proto(8, 10,   avx);
 idct_dc_proto(16,10,   avx);
 idct_dc_proto(32,10,   avx);
 
+idct_dc_proto(16,10,  avx2);
+idct_dc_proto(32,10,  avx2);
+
+
 
 
 
index 58a0891..cad236d 100644 (file)
@@ -92,6 +92,17 @@ void ff_hevc_idct32_dc_add_10_avx(uint8_t *dst, int16_t *coeffs, ptrdiff_t strid
 }
 #endif //HAVE_AVX_EXTERNAL
 
+#if HAVE_AVX2_EXTERNAL
+// 32x32 DC add for 10-bit content, composed of four calls to the 16x16 AVX2 routine; every call gets the same coeffs pointer.
+void ff_hevc_idct32_dc_add_10_avx2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
+{
+    ff_hevc_idct16_dc_add_10_avx2(dst, coeffs, stride);               // top-left quadrant
+    ff_hevc_idct16_dc_add_10_avx2(dst+32, coeffs, stride);            // top-right: 32 bytes over (presumably 16 samples of 2 bytes each)
+    ff_hevc_idct16_dc_add_10_avx2(dst+16*stride, coeffs, stride);     // bottom-left: 16 rows down
+    ff_hevc_idct16_dc_add_10_avx2(dst+16*stride+32, coeffs, stride);  // bottom-right
+}
+#endif //HAVE_AVX2_EXTERNAL
+
 #define mc_rep_func(name, bitd, step, W, opt) \
 void ff_hevc_put_hevc_##name##W##_##bitd##_##opt(int16_t *_dst, ptrdiff_t dststride,                            \
                                                 uint8_t *_src, ptrdiff_t _srcstride, int height,                \
@@ -438,6 +449,9 @@ void ff_hevcdsp_init_x86(HEVCDSPContext *c, const int bit_depth)
             QPEL_LINKS(c->put_hevc_qpel, 1, 0, qpel_v,     8, sse4);
             QPEL_LINKS(c->put_hevc_qpel, 1, 1, qpel_hv,    8, sse4);
         }
+        if (EXTERNAL_AVX2(mm_flags)) {
+            c->transform_dc_add[3]    =  ff_hevc_idct32_dc_add_8_avx2;
+        }
     } else if (bit_depth == 10) {
         if (EXTERNAL_MMXEXT(mm_flags)) {
                 c->transform_dc_add[0]    =  ff_hevc_idct4_dc_add_10_mmxext;
@@ -473,6 +487,10 @@ void ff_hevcdsp_init_x86(HEVCDSPContext *c, const int bit_depth)
             c->transform_dc_add[2]    =  ff_hevc_idct16_dc_add_10_avx;
             c->transform_dc_add[3]    =  ff_hevc_idct32_dc_add_10_avx;
         }
+        if (EXTERNAL_AVX2(mm_flags)) {
+            c->transform_dc_add[2]    =  ff_hevc_idct16_dc_add_10_avx2;
+            c->transform_dc_add[3]    =  ff_hevc_idct32_dc_add_10_avx2;
 
+        }
     }
 }