The vertically interpolating variants of these functions read
ahead one line to optimise the loop. On the last line processed,
this read-ahead may fall outside the source buffer. Fix these
invalid reads by processing the last line outside the loop.
Signed-off-by: Mans Rullgard <mans@mansr.com>
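
The fix is the same loop-peeling pattern in all four macros: the
software-pipelined loop is shortened by one pass and the last two lines
are produced from data already in registers plus a single in-bounds
load. As a rough illustration only (plain C, hypothetical helper name,
not FFmpeg's C code), the rounding y2 case looks like this:

/*
 * Sketch of the peeled y2 loop.  A y2 (vertical half-pel) filter of
 * height h averages source lines i and i+1, so it needs h + 1 source
 * lines.  A pipelined loop run for the full height pre-loads line
 * h + 1 on its last pass, which may be outside the buffer; stopping
 * the loop two lines early and handling them afterwards keeps every
 * load within the h + 1 available lines.
 */
#include <stddef.h>
#include <stdint.h>
#include <string.h>

static void put_pixels_y2_sketch(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride, int w, int h)
{
    uint8_t a[16], b[16];                  /* stand-ins for q0/q1; w <= 16, h even */

    memcpy(a, src, w); src += stride;      /* line 0 */
    memcpy(b, src, w); src += stride;      /* line 1 */

    for (int y = 0; y < h - 2; y += 2) {   /* loop shortened by one pass */
        for (int x = 0; x < w; x++)
            dst[x] = (a[x] + b[x] + 1) >> 1;
        memcpy(a, src, w); src += stride;  /* read-ahead load, still in bounds */
        dst += stride;
        for (int x = 0; x < w; x++)
            dst[x] = (b[x] + a[x] + 1) >> 1;
        memcpy(b, src, w); src += stride;  /* read-ahead load, still in bounds */
        dst += stride;
    }

    /* Last two lines, peeled out of the loop: one in-bounds load of
     * line h and no read-ahead for a pass that does not exist. */
    for (int x = 0; x < w; x++)
        dst[x] = (a[x] + b[x] + 1) >> 1;
    memcpy(a, src, w);
    dst += stride;
    for (int x = 0; x < w; x++)
        dst[x] = (b[x] + a[x] + 1) >> 1;
}

The xy2 variants peel the loop the same way, averaging a 2x2
neighbourhood per output pixel instead of two vertically adjacent
pixels.
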
.endm
.macro pixels16_y2 rnd=1, avg=0
+ sub r3, r3, #2
vld1.64 {q0}, [r1], r2
vld1.64 {q1}, [r1], r2
1: subs r3, r3, #2
vst1.64 {q2}, [r0,:128], r2
vst1.64 {q3}, [r0,:128], r2
bne 1b
+
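+ @ final two lines: only one more source line is loaded into q0, so no read-ahead past the end of the buffer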
+ avg q2, q0, q1
+ vld1.64 {q0}, [r1], r2
+ avg q3, q0, q1
+ .if \avg
+ vld1.8 {q8}, [r0,:128], r2
+ vld1.8 {q9}, [r0,:128]
+ vrhadd.u8 q2, q2, q8
+ vrhadd.u8 q3, q3, q9
+ sub r0, r0, r2
+ .endif
+ vst1.64 {q2}, [r0,:128], r2
+ vst1.64 {q3}, [r0,:128], r2
+
bx lr
.endm
.macro pixels16_xy2 rnd=1, avg=0
+ sub r3, r3, #2
vld1.64 {d0-d2}, [r1], r2
vld1.64 {d4-d6}, [r1], r2
.ifeq \rnd
vaddl.u8 q11, d3, d5
vst1.64 {q15}, [r0,:128], r2
bgt 1b
+
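+ @ final two lines: load only the last source line needed; the loop no longer reads one line ahead here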
+ vld1.64 {d0-d2}, [r1], r2
+ vadd.u16 q12, q8, q9
+ .ifeq \rnd
+ vadd.u16 q12, q12, q13
+ .endif
+ vext.8 q15, q0, q1, #1
+ vadd.u16 q1 , q10, q11
+ shrn d28, q12, #2
+ .ifeq \rnd
+ vadd.u16 q1, q1, q13
+ .endif
+ shrn d29, q1, #2
+ .if \avg
+ vld1.8 {q8}, [r0,:128]
+ vrhadd.u8 q14, q14, q8
+ .endif
+ vaddl.u8 q8, d0, d30
+ vaddl.u8 q10, d1, d31
+ vst1.64 {q14}, [r0,:128], r2
+ vadd.u16 q12, q8, q9
+ .ifeq \rnd
+ vadd.u16 q12, q12, q13
+ .endif
+ vadd.u16 q0, q10, q11
+ shrn d30, q12, #2
+ .ifeq \rnd
+ vadd.u16 q0, q0, q13
+ .endif
+ shrn d31, q0, #2
+ .if \avg
+ vld1.8 {q9}, [r0,:128]
+ vrhadd.u8 q15, q15, q9
+ .endif
+ vst1.64 {q15}, [r0,:128], r2
+
.endm
.macro pixels8_y2 rnd=1, avg=0
+ sub r3, r3, #2
vld1.64 {d0}, [r1], r2
vld1.64 {d1}, [r1], r2
1: subs r3, r3, #2
vst1.64 {d4}, [r0,:64], r2
vst1.64 {d5}, [r0,:64], r2
bne 1b
+
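+ @ final two lines, as in pixels16_y2: single in-bounds load, no read-ahead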
+ avg d4, d0, d1
+ vld1.64 {d0}, [r1], r2
+ avg d5, d0, d1
+ .if \avg
+ vld1.8 {d2}, [r0,:64], r2
+ vld1.8 {d3}, [r0,:64]
+ vrhadd.u8 q2, q2, q1
+ sub r0, r0, r2
+ .endif
+ vst1.64 {d4}, [r0,:64], r2
+ vst1.64 {d5}, [r0,:64], r2
+
bx lr
.endm
.macro pixels8_xy2 rnd=1, avg=0
+ sub r3, r3, #2
vld1.64 {q0}, [r1], r2
vld1.64 {q1}, [r1], r2
.ifeq \rnd
vaddl.u8 q9, d2, d6
vst1.64 {d7}, [r0,:64], r2
bgt 1b
+
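+ @ final two lines, as in pixels16_xy2: single in-bounds load, no read-ahead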
+ vld1.64 {q0}, [r1], r2
+ vadd.u16 q10, q8, q9
+ vext.8 d4, d0, d1, #1
+ .ifeq \rnd
+ vadd.u16 q10, q10, q11
+ .endif
+ vaddl.u8 q8, d0, d4
+ shrn d5, q10, #2
+ vadd.u16 q10, q8, q9
+ .if \avg
+ vld1.8 {d7}, [r0,:64]
+ vrhadd.u8 d5, d5, d7
+ .endif
+ .ifeq \rnd
+ vadd.u16 q10, q10, q11
+ .endif
+ vst1.64 {d5}, [r0,:64], r2
+ shrn d7, q10, #2
+ .if \avg
+ vld1.8 {d5}, [r0,:64]
+ vrhadd.u8 d7, d7, d5
+ .endif
+ vst1.64 {d7}, [r0,:64], r2
+