jle .loop2x1
.ret:
REP_RET
-
+%endif
+
+ INIT_XMM sse2
+ cglobal evaluate_lls, 3,4,2, ctx, var, order, i ; returns dot(var[0..order], coefficient row `order`)
+ ; This function is often called on the same buffer as update_lls, but with
+ ; an offset. They can't both be aligned.
+ ; Load halves rather than movu to avoid store-forwarding stalls, since the
+ ; input was initialized immediately prior to this function using scalar math.
+ %define coefsq ctxq ; ctx is only used to form the coefficient pointer, so its register is reused
+ mov id, orderd ; i = order (read from a register, so cglobal must load all 3 args on x86_32)
+ imul orderd, MAX_VARS ; element offset of row `order`
+ lea coefsq, [ctxq + LLSModel.coeff + orderq*8] ; coefs = start of that row (8-byte elements)
+ movsd m0, [varq] ; m0 = { var[0], var[1] }, loaded as two halves (see above)
+ movhpd m0, [varq + 8]
+ mulpd m0, [coefsq] ; memory operand of mulpd: coefs must be 16-byte aligned
+ lea coefsq, [coefsq + iq*8] ; bias both pointers by +order elements ...
+ lea varq, [varq + iq*8]
+ neg iq
+ add iq, 2 ; ... so i = 2 - order counts up towards 0, starting at element 2
+ .loop: ; NOTE(review): first pass is unconditional, so order >= 3 appears to be assumed - confirm with callers
+ movsd m1, [varq + iq*8]
+ movhpd m1, [varq + iq*8 + 8]
+ mulpd m1, [coefsq + iq*8]
+ addpd m0, m1 ; accumulate two products per iteration
+ add iq, 2
+ jl .loop ; i < 0: at least two more elements remain
+ jg .skip1 ; i > 0: the pairs already covered element `order`
+ movsd m1, [varq + iq*8] ; i == 0: exactly one element (index `order`) is left
+ mulsd m1, [coefsq + iq*8]
+ addpd m0, m1 ; high half of m1 is zero after the movsd load
+ .skip1:
+ movhlps m1, m0 ; horizontal add of the two partial sums
+ addsd m0, m1
+ %if ARCH_X86_32
+ movsd r0m, m0 ; x86_32 returns doubles in st0: go through the argument stack area
+ fld qword r0m
+ %endif
+ RET
int cpu_flags = av_get_cpu_flags();
if (EXTERNAL_SSE2(cpu_flags)) {
m->update_lls = ff_update_lls_sse2;
+ if (m->indep_count >= 4)
+ m->evaluate_lls = ff_evaluate_lls_sse2;
}
- if (EXTERNAL_AVX(cpu_flags)) {
+ if (EXTERNAL_AVX(cpu_flags) && 0) {
m->update_lls = ff_update_lls_avx;
}
}