c->vsse[5] = vsse_intra8_c;
c->nsse[0] = nsse16_c;
c->nsse[1] = nsse8_c;
+#if CONFIG_SNOW_DECODER || CONFIG_SNOW_ENCODER
+ ff_dsputil_init_dwt(c);
+#endif
- c->ssd_int8_vs_int16 = ssd_int8_vs_int16_c;
-
c->bswap_buf = bswap_buf;
c->bswap16_buf = bswap16_buf;
--- /dev/null
+++ b/libavcodec/ppc/svq1enc_altivec.c
+ /*
+ * Copyright (c) 2007 Luca Barbato <lu_zero@gentoo.org>
+ *
- * This file is part of Libav.
++ * This file is part of FFmpeg.
+ *
- * Libav is free software; you can redistribute it and/or
++ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
- * Libav is distributed in the hope that it will be useful,
++ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
++ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+ #include <stdint.h>
+
+ #include "config.h"
+ #if HAVE_ALTIVEC_H
+ #include <altivec.h>
+ #endif
+
+ #include "libavutil/attributes.h"
+ #include "libavutil/ppc/types_altivec.h"
+ #include "libavutil/ppc/util_altivec.h"
+ #include "libavcodec/svq1enc.h"
+
+ #if HAVE_ALTIVEC
+ static int ssd_int8_vs_int16_altivec(const int8_t *pix1, const int16_t *pix2,
+ int size)
+ {
+ int i, size16 = size >> 4;
+ vector signed char vpix1;
+ vector signed short vpix2, vdiff, vpix1l, vpix1h;
+ union {
+ vector signed int vscore;
+ int32_t score[4];
+ } u = { .vscore = vec_splat_s32(0) };
+
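+ /* Main loop: consume 16 int8 samples from pix1 per iteration,
+ * sign-extend them into two vectors of 8 int16, subtract the
+ * matching samples from pix2, and accumulate the squared
+ * differences into four partial int32 sums with vec_msum. */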
+ while (size16) {
+ // score += (pix1[i] - pix2[i]) * (pix1[i] - pix2[i]);
+ // load pix1 and the first batch of pix2
+
+ vpix1 = vec_unaligned_load(pix1);
+ vpix2 = vec_unaligned_load(pix2);
+ pix2 += 8;
+ // unpack
+ vpix1h = vec_unpackh(vpix1);
+ vdiff = vec_sub(vpix1h, vpix2);
+ vpix1l = vec_unpackl(vpix1);
+ // load another batch from pix2
+ vpix2 = vec_unaligned_load(pix2);
+ u.vscore = vec_msum(vdiff, vdiff, u.vscore);
+ vdiff = vec_sub(vpix1l, vpix2);
+ u.vscore = vec_msum(vdiff, vdiff, u.vscore);
+ pix1 += 16;
+ pix2 += 8;
+ size16--;
+ }
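+ // horizontal add: vec_sums leaves the total in the last element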
+ u.vscore = vec_sums(u.vscore, vec_splat_s32(0));
+
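+ // finish the remaining (size % 16) samples in scalar code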
+ size %= 16;
+ for (i = 0; i < size; i++)
+ u.score[3] += (pix1[i] - pix2[i]) * (pix1[i] - pix2[i]);
+
+ return u.score[3];
+ }
+ #endif /* HAVE_ALTIVEC */
+
+ av_cold void ff_svq1enc_init_ppc(SVQ1EncContext *c)
+ {
+ #if HAVE_ALTIVEC
+ c->ssd_int8_vs_int16 = ssd_int8_vs_int16_altivec;
+ #endif /* HAVE_ALTIVEC */
+ }
#include "internal.h"
#include "mpegutils.h"
#include "svq1.h"
+ #include "svq1enc.h"
#include "svq1enc_cb.h"
+#include "libavutil/avassert.h"
-#undef NDEBUG
-#include <assert.h>
- typedef struct SVQ1EncContext {
- /* FIXME: Needed for motion estimation, should not be used for anything
- * else, the idea is to make the motion estimation eventually independent
- * of MpegEncContext, so this will be removed then. */
- MpegEncContext m;
- AVCodecContext *avctx;
- DSPContext dsp;
- HpelDSPContext hdsp;
- AVFrame *current_picture;
- AVFrame *last_picture;
- PutBitContext pb;
- GetBitContext gb;
-
- /* why ooh why this sick breadth first order,
- * everything is slower and more complex */
- PutBitContext reorder_pb[6];
-
- int frame_width;
- int frame_height;
-
- /* Y plane block dimensions */
- int y_block_width;
- int y_block_height;
-
- /* U & V plane (C planes) block dimensions */
- int c_block_width;
- int c_block_height;
-
- uint16_t *mb_type;
- uint32_t *dummy;
- int16_t (*motion_val8[3])[2];
- int16_t (*motion_val16[3])[2];
-
- int64_t rd_total;
-
- uint8_t *scratchbuf;
- } SVQ1EncContext;
-
static void svq1_write_header(SVQ1EncContext *s, int frame_type)
{
int i;
--- /dev/null
+++ b/libavcodec/svq1enc.h
+ /*
+ * SVQ1 encoder
+ *
- * This file is part of Libav.
++ * This file is part of FFmpeg.
+ *
- * Libav is free software; you can redistribute it and/or
++ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
- * Libav is distributed in the hope that it will be useful,
++ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
++ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+ #ifndef AVCODEC_SVQ1ENC_H
+ #define AVCODEC_SVQ1ENC_H
+
+ #include <stdint.h>
+
+ #include "libavutil/frame.h"
+ #include "avcodec.h"
+ #include "dsputil.h"
+ #include "get_bits.h"
+ #include "hpeldsp.h"
+ #include "mpegvideo.h"
+ #include "put_bits.h"
+
+ typedef struct SVQ1EncContext {
+ /* FIXME: Needed for motion estimation, should not be used for anything
+ * else, the idea is to make the motion estimation eventually independent
+ * of MpegEncContext, so this will be removed then. */
+ MpegEncContext m;
+ AVCodecContext *avctx;
+ DSPContext dsp;
+ HpelDSPContext hdsp;
+ AVFrame *current_picture;
+ AVFrame *last_picture;
+ PutBitContext pb;
+ GetBitContext gb;
+
+ /* why ooh why this sick breadth first order,
+ * everything is slower and more complex */
+ PutBitContext reorder_pb[6];
+
+ int frame_width;
+ int frame_height;
+
+ /* Y plane block dimensions */
+ int y_block_width;
+ int y_block_height;
+
+ /* U & V plane (C planes) block dimensions */
+ int c_block_width;
+ int c_block_height;
+
+ uint16_t *mb_type;
+ uint32_t *dummy;
+ int16_t (*motion_val8[3])[2];
+ int16_t (*motion_val16[3])[2];
+
+ int64_t rd_total;
+
+ uint8_t *scratchbuf;
+
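+ /* Sum of squared differences between an int8 codebook vector and a
+ * block of int16 samples, moved here from DSPContext so that only
+ * the SVQ1 encoder carries it. */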
+ int (*ssd_int8_vs_int16)(const int8_t *pix1, const int16_t *pix2,
+ int size);
+ } SVQ1EncContext;
+
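+ /* Install arch-specific implementations of the function pointers
+ * above; expected to be called from the encoder's init code. */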
+ void ff_svq1enc_init_ppc(SVQ1EncContext *c);
+ void ff_svq1enc_init_x86(SVQ1EncContext *c);
+
+ #endif /* AVCODEC_SVQ1ENC_H */
x86/idct_mmx_xvid.o \
x86/idct_sse2_xvid.o \
x86/simple_idct.o
-MMX-OBJS-$(CONFIG_HPELDSP) += x86/fpel_mmx.o \
- x86/hpeldsp_mmx.o
+MMX-OBJS-$(CONFIG_DIRAC_DECODER) += x86/dirac_dwt.o
MMX-OBJS-$(CONFIG_HUFFYUVDSP) += x86/huffyuvdsp_mmx.o
+MMX-OBJS-$(CONFIG_SNOW_DECODER) += x86/snowdsp.o
+MMX-OBJS-$(CONFIG_SNOW_ENCODER) += x86/snowdsp.o
+ MMX-OBJS-$(CONFIG_SVQ1_ENCODER) += x86/svq1enc_mmx.o
MMX-OBJS-$(CONFIG_VC1_DECODER) += x86/vc1dsp_mmx.o
YASM-OBJS += x86/deinterlace.o \
}
#undef SUM
-#define MMABS_MMX(a,z) \
- "pxor " #z ", " #z " \n\t" \
- "pcmpgtw " #a ", " #z " \n\t" \
- "pxor " #z ", " #a " \n\t" \
- "psubw " #z ", " #a " \n\t"
-
-#define MMABS_MMXEXT(a, z) \
- "pxor " #z ", " #z " \n\t" \
- "psubw " #a ", " #z " \n\t" \
- "pmaxsw " #z ", " #a " \n\t"
-
-#define MMABS_SSSE3(a,z) \
- "pabsw " #a ", " #a " \n\t"
-
-#define MMABS_SUM(a,z, sum) \
- MMABS(a,z) \
- "paddusw " #a ", " #sum " \n\t"
-
-/* FIXME: HSUM_* saturates at 64k, while an 8x8 hadamard or dct block can get
- * up to about 100k on extreme inputs. But that's very unlikely to occur in
- * natural video, and it's even more unlikely to not have any alternative
- * mvs/modes with lower cost. */
-#define HSUM_MMX(a, t, dst) \
- "movq " #a ", " #t " \n\t" \
- "psrlq $32, " #a " \n\t" \
- "paddusw " #t ", " #a " \n\t" \
- "movq " #a ", " #t " \n\t" \
- "psrlq $16, " #a " \n\t" \
- "paddusw " #t ", " #a " \n\t" \
- "movd " #a ", " #dst " \n\t" \
-
-#define HSUM_MMXEXT(a, t, dst) \
- "pshufw $0x0E, " #a ", " #t " \n\t" \
- "paddusw " #t ", " #a " \n\t" \
- "pshufw $0x01, " #a ", " #t " \n\t" \
- "paddusw " #t ", " #a " \n\t" \
- "movd " #a ", " #dst " \n\t" \
-
-#define HSUM_SSE2(a, t, dst) \
- "movhlps " #a ", " #t " \n\t" \
- "paddusw " #t ", " #a " \n\t" \
- "pshuflw $0x0E, " #a ", " #t " \n\t" \
- "paddusw " #t ", " #a " \n\t" \
- "pshuflw $0x01, " #a ", " #t " \n\t" \
- "paddusw " #t ", " #a " \n\t" \
- "movd " #a ", " #dst " \n\t" \
-
-#define DCT_SAD4(m, mm, o) \
- "mov"#m" "#o" + 0(%1), " #mm "2 \n\t" \
- "mov"#m" "#o" + 16(%1), " #mm "3 \n\t" \
- "mov"#m" "#o" + 32(%1), " #mm "4 \n\t" \
- "mov"#m" "#o" + 48(%1), " #mm "5 \n\t" \
- MMABS_SUM(mm ## 2, mm ## 6, mm ## 0) \
- MMABS_SUM(mm ## 3, mm ## 7, mm ## 1) \
- MMABS_SUM(mm ## 4, mm ## 6, mm ## 0) \
- MMABS_SUM(mm ## 5, mm ## 7, mm ## 1) \
-
-#define DCT_SAD_MMX \
- "pxor %%mm0, %%mm0 \n\t" \
- "pxor %%mm1, %%mm1 \n\t" \
- DCT_SAD4(q, %%mm, 0) \
- DCT_SAD4(q, %%mm, 8) \
- DCT_SAD4(q, %%mm, 64) \
- DCT_SAD4(q, %%mm, 72) \
- "paddusw %%mm1, %%mm0 \n\t" \
- HSUM(%%mm0, %%mm1, %0)
-
-#define DCT_SAD_SSE2 \
- "pxor %%xmm0, %%xmm0 \n\t" \
- "pxor %%xmm1, %%xmm1 \n\t" \
- DCT_SAD4(dqa, %%xmm, 0) \
- DCT_SAD4(dqa, %%xmm, 64) \
- "paddusw %%xmm1, %%xmm0 \n\t" \
- HSUM(%%xmm0, %%xmm1, %0)
-
-#define DCT_SAD_FUNC(cpu) \
-static int sum_abs_dctelem_ ## cpu(int16_t *block) \
-{ \
- int sum; \
- __asm__ volatile ( \
- DCT_SAD \
- :"=r"(sum) \
- :"r"(block)); \
- return sum & 0xFFFF; \
-}
-
-#define DCT_SAD DCT_SAD_MMX
-#define HSUM(a, t, dst) HSUM_MMX(a, t, dst)
-#define MMABS(a, z) MMABS_MMX(a, z)
-DCT_SAD_FUNC(mmx)
-#undef MMABS
-#undef HSUM
-
-#define HSUM(a, t, dst) HSUM_MMXEXT(a, t, dst)
-#define MMABS(a, z) MMABS_MMXEXT(a, z)
-DCT_SAD_FUNC(mmxext)
-#undef HSUM
-#undef DCT_SAD
-
-#define DCT_SAD DCT_SAD_SSE2
-#define HSUM(a, t, dst) HSUM_SSE2(a, t, dst)
-DCT_SAD_FUNC(sse2)
-#undef MMABS
-
-#if HAVE_SSSE3_INLINE
-#define MMABS(a, z) MMABS_SSSE3(a, z)
-DCT_SAD_FUNC(ssse3)
-#undef MMABS
-#endif
-#undef HSUM
-#undef DCT_SAD
- static int ssd_int8_vs_int16_mmx(const int8_t *pix1, const int16_t *pix2,
- int size)
- {
- int sum;
- x86_reg i = size;
-
- __asm__ volatile (
- "pxor %%mm4, %%mm4 \n"
- "1: \n"
- "sub $8, %0 \n"
- "movq (%2, %0), %%mm2 \n"
- "movq (%3, %0, 2), %%mm0 \n"
- "movq 8(%3, %0, 2), %%mm1 \n"
- "punpckhbw %%mm2, %%mm3 \n"
- "punpcklbw %%mm2, %%mm2 \n"
- "psraw $8, %%mm3 \n"
- "psraw $8, %%mm2 \n"
- "psubw %%mm3, %%mm1 \n"
- "psubw %%mm2, %%mm0 \n"
- "pmaddwd %%mm1, %%mm1 \n"
- "pmaddwd %%mm0, %%mm0 \n"
- "paddd %%mm1, %%mm4 \n"
- "paddd %%mm0, %%mm4 \n"
- "jg 1b \n"
- "movq %%mm4, %%mm3 \n"
- "psrlq $32, %%mm3 \n"
- "paddd %%mm3, %%mm4 \n"
- "movd %%mm4, %1 \n"
- : "+r" (i), "=r" (sum)
- : "r" (pix1), "r" (pix2));
-
- return sum;
- }
-
#define PHADDD(a, t) \
"movq " #a ", " #t " \n\t" \
"psrlq $32, " #a " \n\t" \
--- /dev/null
+++ b/libavcodec/x86/svq1enc_mmx.c
+ /*
- * This file is part of Libav.
++ * This file is part of FFmpeg.
+ *
- * Libav is free software; you can redistribute it and/or
++ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
- * Libav is distributed in the hope that it will be useful,
++ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
++ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+ #include "config.h"
+ #include "libavutil/attributes.h"
+ #include "libavutil/cpu.h"
+ #include "libavutil/x86/asm.h"
+ #include "libavutil/x86/cpu.h"
+ #include "libavcodec/svq1enc.h"
+
+ #if HAVE_INLINE_ASM
+
+ static int ssd_int8_vs_int16_mmx(const int8_t *pix1, const int16_t *pix2,
+ int size)
+ {
+ int sum;
+ x86_reg i = size;
+
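+ /* Walk both buffers from the end towards the start: each iteration
+ * loads 8 int8 samples, sign-extends them to int16 (punpck + psraw),
+ * subtracts the matching int16 samples and accumulates the squared
+ * differences into two running dword sums (pmaddwd + paddd). */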
+ __asm__ volatile (
+ "pxor %%mm4, %%mm4 \n"
+ "1: \n"
+ "sub $8, %0 \n"
+ "movq (%2, %0), %%mm2 \n"
+ "movq (%3, %0, 2), %%mm0 \n"
+ "movq 8(%3, %0, 2), %%mm1 \n"
+ "punpckhbw %%mm2, %%mm3 \n"
+ "punpcklbw %%mm2, %%mm2 \n"
+ "psraw $8, %%mm3 \n"
+ "psraw $8, %%mm2 \n"
+ "psubw %%mm3, %%mm1 \n"
+ "psubw %%mm2, %%mm0 \n"
+ "pmaddwd %%mm1, %%mm1 \n"
+ "pmaddwd %%mm0, %%mm0 \n"
+ "paddd %%mm1, %%mm4 \n"
+ "paddd %%mm0, %%mm4 \n"
+ "jg 1b \n"
+ "movq %%mm4, %%mm3 \n"
+ "psrlq $32, %%mm3 \n"
+ "paddd %%mm3, %%mm4 \n"
+ "movd %%mm4, %1 \n"
+ : "+r" (i), "=r" (sum)
+ : "r" (pix1), "r" (pix2));
+
+ return sum;
+ }
+
+ #endif /* HAVE_INLINE_ASM */
+
+ av_cold void ff_svq1enc_init_x86(SVQ1EncContext *c)
+ {
+ #if HAVE_INLINE_ASM
+ int cpu_flags = av_get_cpu_flags();
+
+ if (INLINE_MMX(cpu_flags)) {
+ c->ssd_int8_vs_int16 = ssd_int8_vs_int16_mmx;
+ }
+ #endif /* HAVE_INLINE_ASM */
+ }