c->avg_chroma_pixels_tab[0] = ff_avg_rv40_chroma_mc8_neon;
c->avg_chroma_pixels_tab[1] = ff_avg_rv40_chroma_mc4_neon;
- c->rv40_weight_pixels_tab[0] = ff_rv40_weight_func_16_neon;
- c->rv40_weight_pixels_tab[1] = ff_rv40_weight_func_8_neon;
+ c->rv40_weight_pixels_tab[0][0] = ff_rv40_weight_func_16_neon;
+ c->rv40_weight_pixels_tab[0][1] = ff_rv40_weight_func_8_neon;
c->rv40_loop_filter_strength[0] = ff_rv40_h_loop_filter_strength_neon;
c->rv40_loop_filter_strength[1] = ff_rv40_v_loop_filter_strength_neon;
*/
static int calc_add_mv(RV34DecContext *r, int dir, int val)
{
- int mul = dir ? -r->weight2 : r->weight1;
+ int mul = dir ? -r->mv_weight2 : r->mv_weight1;
return (val * mul + 0x2000) >> 14;
}
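A standalone sketch of the rounding calc_add_mv() performs (illustrative only, not decoder code; the helper name and the numbers are made up): the weights are FP0.14 fractions, so 0x2000 is the 0.5 bias and the final >> 14 drops the fractional bits.

#include <stdio.h>

/* Round-to-nearest scaling of a motion vector component by an FP0.14
 * fraction, mirroring the expression in calc_add_mv() above. */
static int scale_mv_component(int val, int mul)
{
    return (val * mul + 0x2000) >> 14; /* 0x2000 = 0.5 in 0.14 fixed point */
}

int main(void)
{
    /* mv_weight1 = (dist0 << 14) / refdist, e.g. dist0 = 1, refdist = 2 -> 8192 */
    printf("%d\n", scale_mv_component( 7, 8192)); /*  7 * 0.5 =  3.5 ->  4 */
    printf("%d\n", scale_mv_component(-7, 8192)); /* -7 * 0.5 = -3.5 -> -3 */
    return 0;
}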
static void rv4_weight(RV34DecContext *r)
{
- r->rdsp.rv40_weight_pixels_tab[0](r->s.dest[0],
- r->tmp_b_block_y[0],
- r->tmp_b_block_y[1],
- r->weight1,
- r->weight2,
- r->s.linesize);
- r->rdsp.rv40_weight_pixels_tab[1](r->s.dest[1],
- r->tmp_b_block_uv[0],
- r->tmp_b_block_uv[2],
- r->weight1,
- r->weight2,
- r->s.uvlinesize);
- r->rdsp.rv40_weight_pixels_tab[1](r->s.dest[2],
- r->tmp_b_block_uv[1],
- r->tmp_b_block_uv[3],
- r->weight1,
- r->weight2,
- r->s.uvlinesize);
+ r->rdsp.rv40_weight_pixels_tab[r->scaled_weight][0](r->s.dest[0],
+ r->tmp_b_block_y[0],
+ r->tmp_b_block_y[1],
+ r->weight1,
+ r->weight2,
+ r->s.linesize);
+ r->rdsp.rv40_weight_pixels_tab[r->scaled_weight][1](r->s.dest[1],
+ r->tmp_b_block_uv[0],
+ r->tmp_b_block_uv[2],
+ r->weight1,
+ r->weight2,
+ r->s.uvlinesize);
+ r->rdsp.rv40_weight_pixels_tab[r->scaled_weight][1](r->s.dest[2],
+ r->tmp_b_block_uv[1],
+ r->tmp_b_block_uv[3],
+ r->weight1,
+ r->weight2,
+ r->s.uvlinesize);
}
static void rv34_mc_2mv(RV34DecContext *r, const int block_type)
int dist0 = GET_PTS_DIFF(r->cur_pts, r->last_pts);
int dist1 = GET_PTS_DIFF(r->next_pts, r->cur_pts);
- if (!refdist) {
- r->weight1 = r->weight2 = 8192;
- } else {
- r->weight1 = (dist0 << 14) / refdist;
- r->weight2 = (dist1 << 14) / refdist;
+ if(!refdist){
+ r->mv_weight1 = r->mv_weight2 = r->weight1 = r->weight2 = 8192;
+ r->scaled_weight = 0;
+ }else{
+ r->mv_weight1 = (dist0 << 14) / refdist;
+ r->mv_weight2 = (dist1 << 14) / refdist;
+ if((r->mv_weight1|r->mv_weight2) & 511){
+ r->weight1 = r->mv_weight1;
+ r->weight2 = r->mv_weight2;
+ r->scaled_weight = 0;
+ }else{
+ r->weight1 = r->mv_weight1 >> 9;
+ r->weight2 = r->mv_weight2 >> 9;
+ r->scaled_weight = 1;
+ }
}
}
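Why the & 511 test and the >> 9 above: the rounding (rnd) biweight shifts each 0.14-weighted product right by 9 before the final "+ 16) >> 5" rounding, so when both weights are multiples of 2^9 that intermediate shift loses nothing and the weights can be divided by 512 once up front, which is what the prescaled (nornd) functions assume. A minimal standalone self-check of that equivalence (illustrative only, not decoder code; the helper names are made up):

#include <assert.h>
#include <stdint.h>

/* Reference form: full 0.14 weights, per-pixel intermediate >> 9. */
static uint8_t weight_rnd(int w1, int w2, int s1, int s2)
{
    return (((w2 * s1) >> 9) + ((w1 * s2) >> 9) + 0x10) >> 5;
}

/* Prescaled form: weights already divided by 512, no per-pixel shift. */
static uint8_t weight_nornd(int w1, int w2, int s1, int s2)
{
    return (w2 * s1 + w1 * s2 + 0x10) >> 5;
}

int main(void)
{
    /* dist0/refdist = 1/4 and 3/4 give weights 4096 and 12288, both multiples of 512 */
    int w1 = 4096, w2 = 12288;
    for (int s1 = 0; s1 < 256; s1++)
        for (int s2 = 0; s2 < 256; s2++)
            assert(weight_rnd(w1, w2, s1, s2) ==
                   weight_nornd(w1 >> 9, w2 >> 9, s1, s2));
    return 0;
}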
s->mb_x = s->mb_y = 0;
int rpr; ///< one field size in RV30 slice header
int cur_pts, last_pts, next_pts;
+ int scaled_weight; ///< 1 if weight1/weight2 have been prescaled by 1/512 (selects the nornd biweight functions)
int weight1, weight2; ///< B frame distance fractions (0.14) used in motion compensation
+ int mv_weight1, mv_weight2; ///< B frame distance fractions (0.14) kept at full precision for motion vector scaling
uint16_t *cbp_luma; ///< CBP values for luma subblocks
uint8_t *cbp_chroma; ///< CBP values for chroma subblocks
qpel_mc_func avg_pixels_tab[4][16];
h264_chroma_mc_func put_chroma_pixels_tab[3];
h264_chroma_mc_func avg_chroma_pixels_tab[3];
- rv40_weight_func rv40_weight_pixels_tab[2];
+ /**
+ * Biweight functions: the first dimension is whether the weights are
+ * prescaled by 1/512 to skip the intermediate shifting, the second
+ * is the block size (0 = 16, 1 = 8).
+ */
+ rv40_weight_func rv40_weight_pixels_tab[2][2];
rv34_inv_transform_func rv34_inv_transform;
rv34_inv_transform_func rv34_inv_transform_dc;
rv34_idct_add_func rv34_idct_add;
RV40_CHROMA_MC(avg_, op_avg)
#define RV40_WEIGHT_FUNC(size) \
-static void rv40_weight_func_ ## size (uint8_t *dst, uint8_t *src1, uint8_t *src2, int w1, int w2, ptrdiff_t stride)\
+static void rv40_weight_func_rnd_ ## size (uint8_t *dst, uint8_t *src1, uint8_t *src2, int w1, int w2, ptrdiff_t stride)\
{\
int i, j;\
\
for (j = 0; j < size; j++) {\
for (i = 0; i < size; i++)\
dst[i] = (((w2 * src1[i]) >> 9) + ((w1 * src2[i]) >> 9) + 0x10) >> 5;\
src1 += stride;\
src2 += stride;\
dst += stride;\
}\
+}\
+static void rv40_weight_func_nornd_ ## size (uint8_t *dst, uint8_t *src1, uint8_t *src2, int w1, int w2, ptrdiff_t stride)\
+{\
+ int i, j;\
+\
+ for (j = 0; j < size; j++) {\
+ for (i = 0; i < size; i++)\
+ dst[i] = (w2 * src1[i] + w1 * src2[i] + 0x10) >> 5;\
+ src1 += stride;\
+ src2 += stride;\
+ dst += stride;\
+ }\
}
RV40_WEIGHT_FUNC(16)
c->avg_chroma_pixels_tab[0] = avg_rv40_chroma_mc8_c;
c->avg_chroma_pixels_tab[1] = avg_rv40_chroma_mc4_c;
- c->rv40_weight_pixels_tab[0] = rv40_weight_func_16;
- c->rv40_weight_pixels_tab[1] = rv40_weight_func_8;
+ c->rv40_weight_pixels_tab[0][0] = rv40_weight_func_rnd_16;
+ c->rv40_weight_pixels_tab[0][1] = rv40_weight_func_rnd_8;
+ c->rv40_weight_pixels_tab[1][0] = rv40_weight_func_nornd_16;
+ c->rv40_weight_pixels_tab[1][1] = rv40_weight_func_nornd_8;
c->rv40_weak_loop_filter[0] = rv40_h_weak_loop_filter;
c->rv40_weak_loop_filter[1] = rv40_v_weak_loop_filter;
-; rv40_weight_func_%1(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w1, int w2, int stride)
-; %1=size %2=num of xmm regs
+; rv40_weight_func_%1_%2(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w1, int w2, int stride)
+; %1=rnd/nornd %2=size %3=num of xmm regs
-%macro RV40_WEIGHT 2
-cglobal rv40_weight_func_%1, 6, 7, %2
+; The weights are FP0.14 notation of fractions depending on pts.
+; For timebases without rounding error (i.e. PAL), the fractions
+; can be simplified, and several operations can be avoided.
+; Therefore, the C code checks whether they are multiples of 2^9
+; and picks the prescaled (nornd) variants generated below.
+%macro RV40_WEIGHT 3
+cglobal rv40_weight_func_%1_%2, 6, 7, %3
%if cpuflag(ssse3)
mova m1, [shift_round]
%else
mova m1, [pw_16]
%endif
pxor m0, m0
- mov r6, r3
- or r6, r4
- ; The weights are FP0.14 notation of fractions depending on pts.
- ; For timebases without rounding error (i.e. PAL), the fractions
- ; can be simplified, and several operations can be avoided.
- ; Therefore, we check here whether they are multiples of 2^9 for
- ; those simplifications to occur.
- and r6, 0x1FF
; Set loop counter and increments
%if mmsize == 8
- mov r6, %1
+ mov r6, %2
%else
- mov r6, (%1 * %1) / mmsize
+ mov r6, (%2 * %2) / mmsize
%endif
- ; Use result of test now
- jz .loop_512
movd m2, r3
movd m3, r4
+%ifidn %1,rnd
+%define RND 0
SPLATW m2, m2
- SPLATW m3, m3
-
-.loop:
- MAIN_LOOP %1, 0
- jnz .loop
- REP_RET
-
- ; Weights are multiple of 512, which allows some shortcuts
-.loop_512:
- sar r3, 9
- sar r4, 9
- movd m2, r3
- movd m3, r4
+%else
+%define RND 1
%if cpuflag(ssse3)
punpcklbw m3, m2
- SPLATW m3, m3
%else
SPLATW m2, m2
- SPLATW m3, m3
%endif
-.loop2:
- MAIN_LOOP %1, 1
- jnz .loop2
- REP_RET
+%endif
+ SPLATW m3, m3
+.loop:
+ MAIN_LOOP %2, RND
+ jnz .loop
+ REP_RET
%endmacro
INIT_MMX mmx
-RV40_WEIGHT 8, 0
-RV40_WEIGHT 16, 0
+RV40_WEIGHT rnd, 8, 3
+RV40_WEIGHT rnd, 16, 4
+RV40_WEIGHT nornd, 8, 3
+RV40_WEIGHT nornd, 16, 4
INIT_XMM sse2
-RV40_WEIGHT 8, 8
-RV40_WEIGHT 16, 8
+RV40_WEIGHT rnd, 8, 3
+RV40_WEIGHT rnd, 16, 4
+RV40_WEIGHT nornd, 8, 3
+RV40_WEIGHT nornd, 16, 4
INIT_XMM ssse3
-RV40_WEIGHT 8, 8
-RV40_WEIGHT 16, 8
+RV40_WEIGHT rnd, 8, 3
+RV40_WEIGHT rnd, 16, 4
+RV40_WEIGHT nornd, 8, 3
+RV40_WEIGHT nornd, 16, 4
int stride, int h, int x, int y);
#define DECLARE_WEIGHT(opt) \
-void ff_rv40_weight_func_16_##opt(uint8_t *dst, uint8_t *src1, uint8_t *src2, \
- int w1, int w2, ptrdiff_t stride); \
-void ff_rv40_weight_func_8_##opt (uint8_t *dst, uint8_t *src1, uint8_t *src2, \
- int w1, int w2, ptrdiff_t stride);
+void ff_rv40_weight_func_rnd_16_##opt(uint8_t *dst, uint8_t *src1, uint8_t *src2, \
+ int w1, int w2, ptrdiff_t stride); \
+void ff_rv40_weight_func_rnd_8_##opt (uint8_t *dst, uint8_t *src1, uint8_t *src2, \
+ int w1, int w2, ptrdiff_t stride); \
+void ff_rv40_weight_func_nornd_16_##opt(uint8_t *dst, uint8_t *src1, uint8_t *src2, \
+ int w1, int w2, ptrdiff_t stride); \
+void ff_rv40_weight_func_nornd_8_##opt (uint8_t *dst, uint8_t *src1, uint8_t *src2, \
+ int w1, int w2, ptrdiff_t stride);
DECLARE_WEIGHT(mmx)
DECLARE_WEIGHT(sse2)
DECLARE_WEIGHT(ssse3)
if (mm_flags & AV_CPU_FLAG_MMX) {
c->put_chroma_pixels_tab[0] = ff_put_rv40_chroma_mc8_mmx;
c->put_chroma_pixels_tab[1] = ff_put_rv40_chroma_mc4_mmx;
- c->rv40_weight_pixels_tab[0] = ff_rv40_weight_func_16_mmx;
- c->rv40_weight_pixels_tab[1] = ff_rv40_weight_func_8_mmx;
+ c->rv40_weight_pixels_tab[0][0] = ff_rv40_weight_func_rnd_16_mmx;
+ c->rv40_weight_pixels_tab[0][1] = ff_rv40_weight_func_rnd_8_mmx;
+ c->rv40_weight_pixels_tab[1][0] = ff_rv40_weight_func_nornd_16_mmx;
+ c->rv40_weight_pixels_tab[1][1] = ff_rv40_weight_func_nornd_8_mmx;
}
if (mm_flags & AV_CPU_FLAG_MMX2) {
c->avg_chroma_pixels_tab[0] = ff_avg_rv40_chroma_mc8_mmx2;
c->avg_chroma_pixels_tab[1] = ff_avg_rv40_chroma_mc4_mmx2;
}
if (mm_flags & AV_CPU_FLAG_SSE2) {
- c->rv40_weight_pixels_tab[0] = ff_rv40_weight_func_16_sse2;
- c->rv40_weight_pixels_tab[1] = ff_rv40_weight_func_8_sse2;
+ c->rv40_weight_pixels_tab[0][0] = ff_rv40_weight_func_rnd_16_sse2;
+ c->rv40_weight_pixels_tab[0][1] = ff_rv40_weight_func_rnd_8_sse2;
+ c->rv40_weight_pixels_tab[1][0] = ff_rv40_weight_func_nornd_16_sse2;
+ c->rv40_weight_pixels_tab[1][1] = ff_rv40_weight_func_nornd_8_sse2;
}
if (mm_flags & AV_CPU_FLAG_SSSE3) {
- c->rv40_weight_pixels_tab[0] = ff_rv40_weight_func_16_ssse3;
- c->rv40_weight_pixels_tab[1] = ff_rv40_weight_func_8_ssse3;
+ c->rv40_weight_pixels_tab[0][0] = ff_rv40_weight_func_rnd_16_ssse3;
+ c->rv40_weight_pixels_tab[0][1] = ff_rv40_weight_func_rnd_8_ssse3;
+ c->rv40_weight_pixels_tab[1][0] = ff_rv40_weight_func_nornd_16_ssse3;
+ c->rv40_weight_pixels_tab[1][1] = ff_rv40_weight_func_nornd_8_ssse3;
}
#endif
}