;-----------------------------------------------------------------------
; VP9_IDCT_IDCT_32x32_ADD_XMM <cpu>
;
; Emits the function vp9_idct_idct_32x32_add for the instruction set
; named by %1 (forwarded to INIT_XMM). Named arguments, in order:
; dst, stride, block, eob. 2048 bytes of stack are requested via cglobal
; and used through tmpq (tmpq = rsp, advanced in 512-byte steps).
;
; NOTE(review): this text carries unified-diff markers ('-' = removed
; line, '+' = added line, in column 0) and skips unchanged hunks. The
; dc-only body, the dispatch to .idct8x8/.idct16x16 and the loop bodies
; are therefore not visible, and the second "%if cpuflag(ssse3)" below
; has no matching %endif in this excerpt.
;
; Visible intent of the change: stop auto-loading arguments in cglobal
; (arg count 4 -> 0) and load them by hand where needed, so that x86-32
; can get by with 6 GPRs by re-aliasing registers with DEFINE_ARGS and
; reloading block/stride from their stack slots (r2mp / r1mp).
;-----------------------------------------------------------------------
%macro VP9_IDCT_IDCT_32x32_ADD_XMM 1
INIT_XMM %1
; cglobal fields (x86inc order): name, args to auto-load, GPR count,
; XMM count, stack size, argument names.
; old: 4 args loaded, 7 GPRs on x86-32 / 9 on x86-64
; new: 0 args loaded, 6 GPRs on x86-32 / 9 on x86-64
-cglobal vp9_idct_idct_32x32_add, 4, 7 + ARCH_X86_64 * 2, 16, 2048, dst, stride, block, eob
+cglobal vp9_idct_idct_32x32_add, 0, 6 + ARCH_X86_64 * 3, 16, 2048, dst, stride, block, eob
; eob is needed first (for the dispatch below); load it only if it is
; not already in its register.
+ movifnidn eobd, dword eobm
%if cpuflag(ssse3)
; eob > 135 (signed compare) goes straight to the full 32x32 transform.
cmp eobd, 135
jg .idctfull
%endif
; dc-only case
; With auto-loading disabled, fetch the remaining arguments here.
+ movifnidn blockq, blockmp
+ movifnidn dstq, dstmp
+ movifnidn strideq, stridemp
%if cpuflag(ssse3)
movd m0, [blockq]
mova m1, [pw_11585x2]
; (rest of the dc-only path is not part of this excerpt)
%if ARCH_X86_64
; x86-64 has enough registers to name every value used by the loops.
DEFINE_ARGS dst_bak, stride, block, cnt, dst, stride30, dst_end, stride2, tmp
%else
; The removed lines were the single x86-32 aliasing used for the whole
; function; the added per-section DEFINE_ARGS blocks below replace it.
- DEFINE_ARGS dst, stride, block, stride30, dst_end, stride2, tmp
-%define cntd dword r4m
; On x86-32 the saved dst lives in the stack slot of argument 0.
%define dst_bakq r0mp
%endif
%if cpuflag(ssse3)
.idct8x8:
+%if ARCH_X86_32
; First pass only needs block and tmp; u1-u4 are placeholder names so
; that tmp lands on the 6th register.
+ DEFINE_ARGS block, u1, u2, u3, u4, tmp
+ mov blockq, r2mp
+%endif
; tmpq points at the start of the stack scratch area.
mov tmpq, rsp
VP9_IDCT32_1D blockq, 1, 8
+%if ARCH_X86_32
; Second pass: re-alias the registers for dst/stride work and reload
; stride from its stack slot; the counter lives in memory (r3m).
+ DEFINE_ARGS dst, stride, stride30, dst_end, stride2, tmp
+ mov strideq, r1mp
+%define cntd dword r3m
+%endif
mov stride30q, strideq ; stride
lea stride2q, [strideq*2] ; stride*2
shl stride30q, 5 ; stride*32
; (second-pass loop is not part of this excerpt)
; at the end of the loop, m7 should still be zero
; use that to zero out block coefficients
; NOTE(review): the comment above names m7, but ZERO_BLOCK below is
; passed m1 - confirm which register is expected to hold zero.
+%if ARCH_X86_32
; block was aliased away for the second pass; reload it for zeroing.
+ DEFINE_ARGS block
+ mov blockq, r2mp
+%endif
ZERO_BLOCK blockq, 64, 8, m1
RET
.idct16x16:
+%if ARCH_X86_32
; First pass: block, tmp and an in-register counter.
+ DEFINE_ARGS block, tmp, cnt
+ mov blockq, r2mp
+%endif
; Two iterations, each advancing tmpq by 512 bytes.
mov cntd, 2
mov tmpq, rsp
.loop1_16x16:
; (first-pass transform call is not part of this excerpt)
add tmpq, 512
dec cntd
jg .loop1_16x16
+
+%if ARCH_X86_64
; presumably undoes the blockq advance done inside the omitted loop
; body (2 iterations) - TODO confirm against the full source.
sub blockq, 32
+%else
; x86-32 has no live blockq to rewind here: registers are re-aliased
; for the second pass and block is reloaded from r2mp before zeroing.
+ DEFINE_ARGS dst, stride, stride30, dst_end, stride2, tmp
+ mov strideq, r1mp
+%define cntd dword r3m
+%endif
mov stride30q, strideq ; stride
lea stride2q, [strideq*2] ; stride*2
; (second-pass loop is not part of this excerpt)
; at the end of the loop, m7 should still be zero
; use that to zero out block coefficients
; NOTE(review): comment names m7, ZERO_BLOCK is passed m1 - confirm.
+%if ARCH_X86_32
+ DEFINE_ARGS block
+ mov blockq, r2mp
+%endif
ZERO_BLOCK blockq, 64, 16, m1
RET
%endif
.idctfull:
+%if ARCH_X86_32
; First pass: block, tmp and an in-register counter.
+ DEFINE_ARGS block, tmp, cnt
+ mov blockq, r2mp
+%endif
; Four iterations, each advancing tmpq by 512 bytes (4 * 512 = the
; 2048-byte stack area requested in cglobal).
mov cntd, 4
mov tmpq, rsp
.loop1_full:
; (first-pass transform call is not part of this excerpt)
add tmpq, 512
dec cntd
jg .loop1_full
+
+%if ARCH_X86_64
; presumably undoes the blockq advance done inside the omitted loop
; body (4 iterations) - TODO confirm against the full source.
sub blockq, 64
+%else
; x86-32: re-alias for the second pass; block is reloaded from r2mp
; before zeroing instead of being rewound.
+ DEFINE_ARGS dst, stride, stride30, dst_end, stride2, tmp
+ mov strideq, r1mp
+%define cntd dword r3m
+%endif
mov stride30q, strideq ; stride
lea stride2q, [strideq*2] ; stride*2
; (second-pass loop is not part of this excerpt)
; at the end of the loop, m7 should still be zero
; use that to zero out block coefficients
; NOTE(review): comment names m7, ZERO_BLOCK is passed m1 - confirm.
+%if ARCH_X86_32
+ DEFINE_ARGS block
+ mov blockq, r2mp
+%endif
ZERO_BLOCK blockq, 64, 32, m1
RET
%endmacro