Merge remote-tracking branch 'qatar/master'
author Michael Niedermayer <michaelni@gmx.at>
Tue, 13 Sep 2011 21:31:17 +0000 (23:31 +0200)
committer Michael Niedermayer <michaelni@gmx.at>
Tue, 13 Sep 2011 21:44:12 +0000 (23:44 +0200)
* qatar/master:
  sws: implement MMX/SSE2/SSSE3/SSE4 versions for horizontal scaling.
  include stdint.h in adpcm_data.h
  mpeg12: reorder functions to avoid ugly forward declarations
  Fixed off by one packet size allocation in the smacker demuxer.
  Check for invalid packet size in the smacker demuxer.
  ape demuxer: fix segfault on memory allocation failure.
  xan: Add some buffer checks
  xan: Remove extra trailing newline
  Fixed size given to init_get_bits() in xan decoder.

Conflicts:
libavcodec/mpeg12.c
libswscale/x86/swscale_template.c

Merged-by: Michael Niedermayer <michaelni@gmx.at>
1  2 
libavcodec/adpcm_data.h
libavcodec/mpeg12.c
libavcodec/xan.c
libswscale/Makefile
libswscale/x86/swscale_mmx.c
libswscale/x86/swscale_template.c

Simple merge
@@@ -301,821 -433,678 +434,686 @@@ static inline int mpeg2_fast_decode_blo
              }
          }
  
-         if (s->codec_id == CODEC_ID_MPEG2VIDEO) {
-             if (s->flags2 & CODEC_FLAG2_FAST) {
-                 for (i = 0; i < 6; i++) {
-                     mpeg2_fast_decode_block_intra(s, *s->pblocks[i], i);
-                 }
+         block[j] = level;
+         if (((int32_t)GET_CACHE(re, &s->gb)) <= (int32_t)0xBFFFFFFF)
+             break;
+         UPDATE_CACHE(re, &s->gb);
+     }
+ end:
+     LAST_SKIP_BITS(re, &s->gb, 2);
+     CLOSE_READER(re, &s->gb);
+     s->block_last_index[n] = i;
+     return 0;
+ }
+ static inline int mpeg2_decode_block_intra(MpegEncContext *s, DCTELEM *block, int n)
+ {
+     int level, dc, diff, i, j, run;
+     int component;
+     RLTable *rl;
+     uint8_t * const scantable = s->intra_scantable.permutated;
+     const uint16_t *quant_matrix;
+     const int qscale = s->qscale;
+     int mismatch;
+     /* DC coefficient */
+     if (n < 4) {
+         quant_matrix = s->intra_matrix;
+         component = 0;
+     } else {
+         quant_matrix = s->chroma_intra_matrix;
+         component = (n & 1) + 1;
+     }
+     diff = decode_dc(&s->gb, component);
+     if (diff >= 0xffff)
+         return -1;
+     dc  = s->last_dc[component];
+     dc += diff;
+     s->last_dc[component] = dc;
+     block[0] = dc << (3 - s->intra_dc_precision);
+     av_dlog(s->avctx, "dc=%d\n", block[0]);
+     mismatch = block[0] ^ 1;
+     i = 0;
+     if (s->intra_vlc_format)
+         rl = &ff_rl_mpeg2;
+     else
+         rl = &ff_rl_mpeg1;
+     {
+         OPEN_READER(re, &s->gb);
+         /* now quantify & encode AC coefficients */
+         for (;;) {
+             UPDATE_CACHE(re, &s->gb);
+             GET_RL_VLC(level, run, re, &s->gb, rl->rl_vlc[0], TEX_VLC_BITS, 2, 0);
+             if (level == 127) {
+                 break;
+             } else if (level != 0) {
+                 i += run;
+                 j  = scantable[i];
+                 level = (level * qscale * quant_matrix[j]) >> 4;
+                 level = (level ^ SHOW_SBITS(re, &s->gb, 1)) - SHOW_SBITS(re, &s->gb, 1);
+                 LAST_SKIP_BITS(re, &s->gb, 1);
              } else {
-                 for (i = 0; i < mb_block_count; i++) {
-                     if (mpeg2_decode_block_intra(s, *s->pblocks[i], i) < 0)
-                         return -1;
+                 /* escape */
+                 run = SHOW_UBITS(re, &s->gb, 6) + 1; LAST_SKIP_BITS(re, &s->gb, 6);
+                 UPDATE_CACHE(re, &s->gb);
+                 level = SHOW_SBITS(re, &s->gb, 12); SKIP_BITS(re, &s->gb, 12);
+                 i += run;
+                 j  = scantable[i];
+                 if (level < 0) {
+                     level = (-level * qscale * quant_matrix[j]) >> 4;
+                     level = -level;
+                 } else {
+                     level = (level * qscale * quant_matrix[j]) >> 4;
                  }
              }
-         } else {
-             for (i = 0; i < 6; i++) {
-                 if (mpeg1_decode_block_intra(s, *s->pblocks[i], i) < 0)
-                     return -1;
+             if (i > 63) {
+                 av_log(s->avctx, AV_LOG_ERROR, "ac-tex damaged at %d %d\n", s->mb_x, s->mb_y);
+                 return -1;
              }
+             mismatch ^= level;
+             block[j]  = level;
          }
+         CLOSE_READER(re, &s->gb);
+     }
+     block[63] ^= mismatch & 1;
+     s->block_last_index[n] = i;
+     return 0;
+ }
+ static inline int mpeg2_fast_decode_block_intra(MpegEncContext *s, DCTELEM *block, int n)
+ {
+     int level, dc, diff, j, run;
+     int component;
+     RLTable *rl;
+     uint8_t * scantable = s->intra_scantable.permutated;
+     const uint16_t *quant_matrix;
+     const int qscale = s->qscale;
+     /* DC coefficient */
+     if (n < 4) {
+         quant_matrix = s->intra_matrix;
+         component = 0;
      } else {
-         if (mb_type & MB_TYPE_ZERO_MV) {
-             assert(mb_type & MB_TYPE_CBP);
+         quant_matrix = s->chroma_intra_matrix;
+         component = (n & 1) + 1;
+     }
+     diff = decode_dc(&s->gb, component);
+     if (diff >= 0xffff)
+         return -1;
+     dc = s->last_dc[component];
+     dc += diff;
+     s->last_dc[component] = dc;
+     block[0] = dc << (3 - s->intra_dc_precision);
+     if (s->intra_vlc_format)
+         rl = &ff_rl_mpeg2;
+     else
+         rl = &ff_rl_mpeg1;
  
-             s->mv_dir = MV_DIR_FORWARD;
-             if (s->picture_structure == PICT_FRAME) {
-                 if (!s->frame_pred_frame_dct)
-                     s->interlaced_dct = get_bits1(&s->gb);
-                 s->mv_type = MV_TYPE_16X16;
+     {
+         OPEN_READER(re, &s->gb);
+         /* now quantify & encode AC coefficients */
+         for (;;) {
+             UPDATE_CACHE(re, &s->gb);
+             GET_RL_VLC(level, run, re, &s->gb, rl->rl_vlc[0], TEX_VLC_BITS, 2, 0);
+             if (level == 127) {
+                 break;
+             } else if (level != 0) {
+                 scantable += run;
+                 j = *scantable;
+                 level = (level * qscale * quant_matrix[j]) >> 4;
+                 level = (level ^ SHOW_SBITS(re, &s->gb, 1)) - SHOW_SBITS(re, &s->gb, 1);
+                 LAST_SKIP_BITS(re, &s->gb, 1);
              } else {
-                 s->mv_type = MV_TYPE_FIELD;
-                 mb_type |= MB_TYPE_INTERLACED;
-                 s->field_select[0][0] = s->picture_structure - 1;
+                 /* escape */
+                 run = SHOW_UBITS(re, &s->gb, 6) + 1; LAST_SKIP_BITS(re, &s->gb, 6);
+                 UPDATE_CACHE(re, &s->gb);
+                 level = SHOW_SBITS(re, &s->gb, 12); SKIP_BITS(re, &s->gb, 12);
+                 scantable += run;
+                 j = *scantable;
+                 if (level < 0) {
+                     level = (-level * qscale * quant_matrix[j]) >> 4;
+                     level = -level;
+                 } else {
+                     level = (level * qscale * quant_matrix[j]) >> 4;
+                 }
              }
  
-             if (IS_QUANT(mb_type))
-                 s->qscale = get_qscale(s);
+             block[j] = level;
+         }
+         CLOSE_READER(re, &s->gb);
+     }
  
-             s->last_mv[0][0][0] = 0;
-             s->last_mv[0][0][1] = 0;
-             s->last_mv[0][1][0] = 0;
-             s->last_mv[0][1][1] = 0;
-             s->mv[0][0][0] = 0;
-             s->mv[0][0][1] = 0;
+     s->block_last_index[n] = scantable - s->intra_scantable.permutated;
+     return 0;
+ }
+ uint8_t ff_mpeg12_static_rl_table_store[2][2][2*MAX_RUN + MAX_LEVEL + 3];
+ #define INIT_2D_VLC_RL(rl, static_size)\
+ {\
+     static RL_VLC_ELEM rl_vlc_table[static_size];\
+     INIT_VLC_STATIC(&rl.vlc, TEX_VLC_BITS, rl.n + 2,\
+                     &rl.table_vlc[0][1], 4, 2,\
+                     &rl.table_vlc[0][0], 4, 2, static_size);\
+ \
+     rl.rl_vlc[0] = rl_vlc_table;\
+     init_2d_vlc_rl(&rl);\
+ }
+ static void init_2d_vlc_rl(RLTable *rl)
+ {
+     int i;
+     for (i = 0; i < rl->vlc.table_size; i++) {
+         int code = rl->vlc.table[i][0];
+         int len  = rl->vlc.table[i][1];
+         int level, run;
+         if (len == 0) { // illegal code
+             run   = 65;
+             level = MAX_LEVEL;
+         } else if (len<0) { //more bits needed
+             run   = 0;
+             level = code;
          } else {
-             assert(mb_type & MB_TYPE_L0L1);
-             // FIXME decide if MBs in field pictures are MB_TYPE_INTERLACED
-             /* get additional motion vector type */
-             if (s->frame_pred_frame_dct)
-                 motion_type = MT_FRAME;
-             else {
-                 motion_type = get_bits(&s->gb, 2);
-                 if (s->picture_structure == PICT_FRAME && HAS_CBP(mb_type))
-                     s->interlaced_dct = get_bits1(&s->gb);
+             if (code == rl->n) { //esc
+                 run   = 65;
+                 level = 0;
+             } else if (code == rl->n+1) { //eob
+                 run   = 0;
+                 level = 127;
+             } else {
+                 run   = rl->table_run  [code] + 1;
+                 level = rl->table_level[code];
              }
+         }
+         rl->rl_vlc[0][i].len   = len;
+         rl->rl_vlc[0][i].level = level;
+         rl->rl_vlc[0][i].run   = run;
+     }
+ }
  
-             if (IS_QUANT(mb_type))
-                 s->qscale = get_qscale(s);
-             /* motion vectors */
-             s->mv_dir = (mb_type >> 13) & 3;
-             av_dlog(s->avctx, "motion_type=%d\n", motion_type);
-             switch (motion_type) {
-             case MT_FRAME: /* or MT_16X8 */
-                 if (s->picture_structure == PICT_FRAME) {
-                     mb_type |= MB_TYPE_16x16;
-                     s->mv_type = MV_TYPE_16X16;
-                     for (i = 0; i < 2; i++) {
-                         if (USES_LIST(mb_type, i)) {
-                             /* MT_FRAME */
-                             s->mv[i][0][0]= s->last_mv[i][0][0]= s->last_mv[i][1][0] =
-                                 mpeg_decode_motion(s, s->mpeg_f_code[i][0], s->last_mv[i][0][0]);
-                             s->mv[i][0][1]= s->last_mv[i][0][1]= s->last_mv[i][1][1] =
-                                 mpeg_decode_motion(s, s->mpeg_f_code[i][1], s->last_mv[i][0][1]);
-                             /* full_pel: only for MPEG-1 */
-                             if (s->full_pel[i]) {
-                                 s->mv[i][0][0] <<= 1;
-                                 s->mv[i][0][1] <<= 1;
-                             }
-                         }
-                     }
-                 } else {
-                     mb_type |= MB_TYPE_16x8 | MB_TYPE_INTERLACED;
-                     s->mv_type = MV_TYPE_16X8;
-                     for (i = 0; i < 2; i++) {
-                         if (USES_LIST(mb_type, i)) {
-                             /* MT_16X8 */
-                             for (j = 0; j < 2; j++) {
-                                 s->field_select[i][j] = get_bits1(&s->gb);
-                                 for (k = 0; k < 2; k++) {
-                                     val = mpeg_decode_motion(s, s->mpeg_f_code[i][k],
-                                                              s->last_mv[i][j][k]);
-                                     s->last_mv[i][j][k] = val;
-                                     s->mv[i][j][k]      = val;
-                                 }
-                             }
-                         }
-                     }
-                 }
-                 break;
-             case MT_FIELD:
-                 if(s->progressive_sequence){
-                     av_log(s->avctx, AV_LOG_ERROR, "MT_FIELD in progressive_sequence\n");
-                     return -1;
-                 }
-                 s->mv_type = MV_TYPE_FIELD;
-                 if (s->picture_structure == PICT_FRAME) {
-                     mb_type |= MB_TYPE_16x8 | MB_TYPE_INTERLACED;
-                     for (i = 0; i < 2; i++) {
-                         if (USES_LIST(mb_type, i)) {
-                             for (j = 0; j < 2; j++) {
-                                 s->field_select[i][j] = get_bits1(&s->gb);
-                                 val = mpeg_decode_motion(s, s->mpeg_f_code[i][0],
-                                                          s->last_mv[i][j][0]);
-                                 s->last_mv[i][j][0] = val;
-                                 s->mv[i][j][0]      = val;
-                                 av_dlog(s->avctx, "fmx=%d\n", val);
-                                 val = mpeg_decode_motion(s, s->mpeg_f_code[i][1],
-                                                          s->last_mv[i][j][1] >> 1);
-                                 s->last_mv[i][j][1] = val << 1;
-                                 s->mv[i][j][1]      = val;
-                                 av_dlog(s->avctx, "fmy=%d\n", val);
-                             }
-                         }
-                     }
-                 } else {
-                     mb_type |= MB_TYPE_16x16 | MB_TYPE_INTERLACED;
-                     for (i = 0; i < 2; i++) {
-                         if (USES_LIST(mb_type, i)) {
-                             s->field_select[i][0] = get_bits1(&s->gb);
-                             for (k = 0; k < 2; k++) {
-                                 val = mpeg_decode_motion(s, s->mpeg_f_code[i][k],
-                                                          s->last_mv[i][0][k]);
-                                 s->last_mv[i][0][k] = val;
-                                 s->last_mv[i][1][k] = val;
-                                 s->mv[i][0][k]      = val;
-                             }
-                         }
-                     }
-                 }
-                 break;
-             case MT_DMV:
-                 if(s->progressive_sequence){
-                     av_log(s->avctx, AV_LOG_ERROR, "MT_DMV in progressive_sequence\n");
-                     return -1;
-                 }
-                 s->mv_type = MV_TYPE_DMV;
-                 for (i = 0; i < 2; i++) {
-                     if (USES_LIST(mb_type, i)) {
-                         int dmx, dmy, mx, my, m;
-                         const int my_shift = s->picture_structure == PICT_FRAME;
-                         mx = mpeg_decode_motion(s, s->mpeg_f_code[i][0],
-                                                 s->last_mv[i][0][0]);
-                         s->last_mv[i][0][0] = mx;
-                         s->last_mv[i][1][0] = mx;
-                         dmx = get_dmv(s);
-                         my  = mpeg_decode_motion(s, s->mpeg_f_code[i][1],
-                                                  s->last_mv[i][0][1] >> my_shift);
-                         dmy = get_dmv(s);
+ void ff_mpeg12_common_init(MpegEncContext *s)
+ {
  
+     s->y_dc_scale_table =
+     s->c_dc_scale_table = ff_mpeg2_dc_scale_table[s->intra_dc_precision];
  
-                         s->last_mv[i][0][1] = my << my_shift;
-                         s->last_mv[i][1][1] = my << my_shift;
+ }
  
-                         s->mv[i][0][0] = mx;
-                         s->mv[i][0][1] = my;
-                         s->mv[i][1][0] = mx; // not used
-                         s->mv[i][1][1] = my; // not used
+ void ff_mpeg1_clean_buffers(MpegEncContext *s)
+ {
+     s->last_dc[0] = 1 << (7 + s->intra_dc_precision);
+     s->last_dc[1] = s->last_dc[0];
+     s->last_dc[2] = s->last_dc[0];
+     memset(s->last_mv, 0, sizeof(s->last_mv));
+ }
  
-                         if (s->picture_structure == PICT_FRAME) {
-                             mb_type |= MB_TYPE_16x16 | MB_TYPE_INTERLACED;
  
-                             // m = 1 + 2 * s->top_field_first;
-                             m = s->top_field_first ? 1 : 3;
+ /******************************************/
+ /* decoding */
  
-                             /* top -> top pred */
-                             s->mv[i][2][0] = ((mx * m + (mx > 0)) >> 1) + dmx;
-                             s->mv[i][2][1] = ((my * m + (my > 0)) >> 1) + dmy - 1;
-                             m = 4 - m;
-                             s->mv[i][3][0] = ((mx * m + (mx > 0)) >> 1) + dmx;
-                             s->mv[i][3][1] = ((my * m + (my > 0)) >> 1) + dmy + 1;
-                         } else {
-                             mb_type |= MB_TYPE_16x16;
+ VLC ff_dc_lum_vlc;
+ VLC ff_dc_chroma_vlc;
  
-                             s->mv[i][2][0] = ((mx + (mx > 0)) >> 1) + dmx;
-                             s->mv[i][2][1] = ((my + (my > 0)) >> 1) + dmy;
-                             if (s->picture_structure == PICT_TOP_FIELD)
-                                 s->mv[i][2][1]--;
-                             else
-                                 s->mv[i][2][1]++;
-                         }
-                     }
-                 }
-                 break;
-             default:
-                 av_log(s->avctx, AV_LOG_ERROR, "00 motion_type at %d %d\n", s->mb_x, s->mb_y);
-                 return -1;
-             }
-         }
+ static VLC mbincr_vlc;
+ static VLC mb_ptype_vlc;
+ static VLC mb_btype_vlc;
+ static VLC mb_pat_vlc;
  
-         s->mb_intra = 0;
        if (HAS_CBP(mb_type)) {
-             s->dsp.clear_blocks(s->block[0]);
+ av_cold void ff_mpeg12_init_vlcs(void)
+ {
+     static int done = 0;
  
-             cbp = get_vlc2(&s->gb, mb_pat_vlc.table, MB_PAT_VLC_BITS, 1);
-             if (mb_block_count > 6) {
-                  cbp <<= mb_block_count - 6;
-                  cbp  |= get_bits(&s->gb, mb_block_count - 6);
-                  s->dsp.clear_blocks(s->block[6]);
-             }
-             if (cbp <= 0) {
-                 av_log(s->avctx, AV_LOG_ERROR, "invalid cbp at %d %d\n", s->mb_x, s->mb_y);
-                 return -1;
-             }
+     if (!done) {
+         done = 1;
  
-             //if 1, we memcpy blocks in xvmcvideo
-             if (CONFIG_MPEG_XVMC_DECODER && s->avctx->xvmc_acceleration > 1) {
-                 ff_xvmc_pack_pblocks(s, cbp);
-                 if (s->swap_uv) {
-                     exchange_uv(s);
-                 }
-             }
+         INIT_VLC_STATIC(&ff_dc_lum_vlc, DC_VLC_BITS, 12,
+                         ff_mpeg12_vlc_dc_lum_bits, 1, 1,
+                         ff_mpeg12_vlc_dc_lum_code, 2, 2, 512);
+         INIT_VLC_STATIC(&ff_dc_chroma_vlc,  DC_VLC_BITS, 12,
+                         ff_mpeg12_vlc_dc_chroma_bits, 1, 1,
+                         ff_mpeg12_vlc_dc_chroma_code, 2, 2, 514);
+         INIT_VLC_STATIC(&mv_vlc, MV_VLC_BITS, 17,
+                         &ff_mpeg12_mbMotionVectorTable[0][1], 2, 1,
+                         &ff_mpeg12_mbMotionVectorTable[0][0], 2, 1, 518);
+         INIT_VLC_STATIC(&mbincr_vlc, MBINCR_VLC_BITS, 36,
+                         &ff_mpeg12_mbAddrIncrTable[0][1], 2, 1,
+                         &ff_mpeg12_mbAddrIncrTable[0][0], 2, 1, 538);
+         INIT_VLC_STATIC(&mb_pat_vlc, MB_PAT_VLC_BITS, 64,
+                         &ff_mpeg12_mbPatTable[0][1], 2, 1,
+                         &ff_mpeg12_mbPatTable[0][0], 2, 1, 512);
  
-             if (s->codec_id == CODEC_ID_MPEG2VIDEO) {
-                 if (s->flags2 & CODEC_FLAG2_FAST) {
-                     for (i = 0; i < 6; i++) {
-                         if (cbp & 32) {
-                             mpeg2_fast_decode_block_non_intra(s, *s->pblocks[i], i);
-                         } else {
-                             s->block_last_index[i] = -1;
-                         }
-                         cbp += cbp;
-                     }
-                 } else {
-                     cbp <<= 12-mb_block_count;
+         INIT_VLC_STATIC(&mb_ptype_vlc, MB_PTYPE_VLC_BITS, 7,
+                         &table_mb_ptype[0][1], 2, 1,
+                         &table_mb_ptype[0][0], 2, 1, 64);
+         INIT_VLC_STATIC(&mb_btype_vlc, MB_BTYPE_VLC_BITS, 11,
+                         &table_mb_btype[0][1], 2, 1,
+                         &table_mb_btype[0][0], 2, 1, 64);
+         init_rl(&ff_rl_mpeg1, ff_mpeg12_static_rl_table_store[0]);
+         init_rl(&ff_rl_mpeg2, ff_mpeg12_static_rl_table_store[1]);
  
-                     for (i = 0; i < mb_block_count; i++) {
-                         if (cbp & (1 << 11)) {
-                             if (mpeg2_decode_block_non_intra(s, *s->pblocks[i], i) < 0)
-                                 return -1;
-                         } else {
-                             s->block_last_index[i] = -1;
-                         }
-                         cbp += cbp;
-                     }
-                 }
-             } else {
-                 if (s->flags2 & CODEC_FLAG2_FAST) {
-                     for (i = 0; i < 6; i++) {
-                         if (cbp & 32) {
-                             mpeg1_fast_decode_block_inter(s, *s->pblocks[i], i);
-                         } else {
-                             s->block_last_index[i] = -1;
-                         }
-                         cbp += cbp;
-                     }
-                 } else {
-                     for (i = 0; i < 6; i++) {
-                         if (cbp & 32) {
-                             if (mpeg1_decode_block_inter(s, *s->pblocks[i], i) < 0)
-                                 return -1;
-                         } else {
-                             s->block_last_index[i] = -1;
-                         }
-                         cbp += cbp;
-                     }
-                 }
-             }
-         } else {
-             for (i = 0; i < 12; i++)
-                 s->block_last_index[i] = -1;
-         }
+         INIT_2D_VLC_RL(ff_rl_mpeg1, 680);
+         INIT_2D_VLC_RL(ff_rl_mpeg2, 674);
      }
-     s->current_picture.f.mb_type[s->mb_x + s->mb_y * s->mb_stride] = mb_type;
-     return 0;
  }
  
- /* as H.263, but only 17 codes */
- static int mpeg_decode_motion(MpegEncContext *s, int fcode, int pred)
+ static inline int get_dmv(MpegEncContext *s)
  {
-     int code, sign, val, l, shift;
+     if (get_bits1(&s->gb))
+         return 1 - (get_bits1(&s->gb) << 1);
+     else
+         return 0;
+ }
  
-     code = get_vlc2(&s->gb, mv_vlc.table, MV_VLC_BITS, 2);
-     if (code == 0) {
-         return pred;
-     }
-     if (code < 0) {
-         return 0xffff;
+ static inline int get_qscale(MpegEncContext *s)
+ {
+     int qscale = get_bits(&s->gb, 5);
+     if (s->q_scale_type) {
+         return non_linear_qscale[qscale];
+     } else {
+         return qscale << 1;
      }
+ }
  
-     sign  = get_bits1(&s->gb);
-     shift = fcode - 1;
-     val   = code;
-     if (shift) {
-         val  = (val - 1) << shift;
-         val |= get_bits(&s->gb, shift);
-         val++;
-     }
-     if (sign)
-         val = -val;
-     val += pred;
+ static void exchange_uv(MpegEncContext *s)
+ {
+     DCTELEM (*tmp)[64];
  
-     /* modulo decoding */
-     l   = INT_BIT - 5 - shift;
-     val = (val << l) >> l;
-     return val;
+     tmp           = s->pblocks[4];
+     s->pblocks[4] = s->pblocks[5];
+     s->pblocks[5] = tmp;
  }
  
- static inline int mpeg1_decode_block_intra(MpegEncContext *s, DCTELEM *block, int n)
- {
-     int level, dc, diff, i, j, run;
-     int component;
-     RLTable *rl = &ff_rl_mpeg1;
-     uint8_t * const scantable    = s->intra_scantable.permutated;
-     const uint16_t *quant_matrix = s->intra_matrix;
-     const int qscale             = s->qscale;
+ /* motion type (for MPEG-2) */
+ #define MT_FIELD 1
+ #define MT_FRAME 2
+ #define MT_16X8  2
+ #define MT_DMV   3
  
-     /* DC coefficient */
-     component = (n <= 3 ? 0 : n - 4 + 1);
-     diff = decode_dc(&s->gb, component);
-     if (diff >= 0xffff)
-         return -1;
-     dc  = s->last_dc[component];
-     dc += diff;
-     s->last_dc[component] = dc;
-     block[0] = dc * quant_matrix[0];
-     av_dlog(s->avctx, "dc=%d diff=%d\n", dc, diff);
-     i = 0;
-     {
-         OPEN_READER(re, &s->gb);
-         /* now quantify & encode AC coefficients */
-         for (;;) {
-             UPDATE_CACHE(re, &s->gb);
-             GET_RL_VLC(level, run, re, &s->gb, rl->rl_vlc[0], TEX_VLC_BITS, 2, 0);
+ static int mpeg_decode_mb(MpegEncContext *s, DCTELEM block[12][64])
+ {
+     int i, j, k, cbp, val, mb_type, motion_type;
+     const int mb_block_count = 4 + (1 << s->chroma_format);
  
-             if (level == 127) {
-                 break;
-             } else if (level != 0) {
-                 i += run;
-                 j = scantable[i];
-                 level = (level * qscale * quant_matrix[j]) >> 4;
-                 level = (level - 1) | 1;
-                 level = (level ^ SHOW_SBITS(re, &s->gb, 1)) - SHOW_SBITS(re, &s->gb, 1);
-                 LAST_SKIP_BITS(re, &s->gb, 1);
-             } else {
-                 /* escape */
-                 run = SHOW_UBITS(re, &s->gb, 6) + 1; LAST_SKIP_BITS(re, &s->gb, 6);
-                 UPDATE_CACHE(re, &s->gb);
-                 level = SHOW_SBITS(re, &s->gb, 8); SKIP_BITS(re, &s->gb, 8);
-                 if (level == -128) {
-                     level = SHOW_UBITS(re, &s->gb, 8) - 256; LAST_SKIP_BITS(re, &s->gb, 8);
-                 } else if (level == 0) {
-                     level = SHOW_UBITS(re, &s->gb, 8)      ; LAST_SKIP_BITS(re, &s->gb, 8);
-                 }
-                 i += run;
-                 j = scantable[i];
-                 if (level < 0) {
-                     level = -level;
-                     level = (level * qscale * quant_matrix[j]) >> 4;
-                     level = (level - 1) | 1;
-                     level = -level;
-                 } else {
-                     level = (level * qscale * quant_matrix[j]) >> 4;
-                     level = (level - 1) | 1;
-                 }
-             }
-             if (i > 63) {
-                 av_log(s->avctx, AV_LOG_ERROR, "ac-tex damaged at %d %d\n", s->mb_x, s->mb_y);
-                 return -1;
-             }
+     av_dlog(s->avctx, "decode_mb: x=%d y=%d\n", s->mb_x, s->mb_y);
  
-             block[j] = level;
-         }
-         CLOSE_READER(re, &s->gb);
-     }
-     s->block_last_index[n] = i;
-    return 0;
- }
+     assert(s->mb_skipped == 0);
  
- int ff_mpeg1_decode_block_intra(MpegEncContext *s, DCTELEM *block, int n)
- {
-     return mpeg1_decode_block_intra(s, block, n);
- }
+     if (s->mb_skip_run-- != 0) {
+         if (s->pict_type == AV_PICTURE_TYPE_P) {
+             s->mb_skipped = 1;
+             s->current_picture.f.mb_type[s->mb_x + s->mb_y * s->mb_stride] = MB_TYPE_SKIP | MB_TYPE_L0 | MB_TYPE_16x16;
+         } else {
+             int mb_type;
  
- static inline int mpeg1_decode_block_inter(MpegEncContext *s, DCTELEM *block, int n)
- {
-     int level, i, j, run;
-     RLTable *rl = &ff_rl_mpeg1;
-     uint8_t * const scantable    = s->intra_scantable.permutated;
-     const uint16_t *quant_matrix = s->inter_matrix;
-     const int qscale             = s->qscale;
+             if (s->mb_x)
+                 mb_type = s->current_picture.f.mb_type[s->mb_x + s->mb_y * s->mb_stride - 1];
+             else
+                 mb_type = s->current_picture.f.mb_type[s->mb_width + (s->mb_y - 1) * s->mb_stride - 1]; // FIXME not sure if this is allowed in MPEG at all
+             if (IS_INTRA(mb_type))
+                 return -1;
+             s->current_picture.f.mb_type[s->mb_x + s->mb_y*s->mb_stride] =
+                 mb_type | MB_TYPE_SKIP;
+ //            assert(s->current_picture.f.mb_type[s->mb_x + s->mb_y * s->mb_stride - 1] & (MB_TYPE_16x16 | MB_TYPE_16x8));
  
-     {
-         OPEN_READER(re, &s->gb);
-         i = -1;
-         // special case for first coefficient, no need to add second VLC table
-         UPDATE_CACHE(re, &s->gb);
-         if (((int32_t)GET_CACHE(re, &s->gb)) < 0) {
-             level = (3 * qscale * quant_matrix[0]) >> 5;
-             level = (level - 1) | 1;
-             if (GET_CACHE(re, &s->gb) & 0x40000000)
-                 level = -level;
-             block[0] = level;
-             i++;
-             SKIP_BITS(re, &s->gb, 2);
-             if (((int32_t)GET_CACHE(re, &s->gb)) <= (int32_t)0xBFFFFFFF)
-                 goto end;
+             if ((s->mv[0][0][0] | s->mv[0][0][1] | s->mv[1][0][0] | s->mv[1][0][1]) == 0)
+                 s->mb_skipped = 1;
          }
-         /* now quantify & encode AC coefficients */
-         for (;;) {
-             GET_RL_VLC(level, run, re, &s->gb, rl->rl_vlc[0], TEX_VLC_BITS, 2, 0);
  
-             if (level != 0) {
-                 i += run;
-                 j = scantable[i];
-                 level = ((level * 2 + 1) * qscale * quant_matrix[j]) >> 5;
-                 level = (level - 1) | 1;
-                 level = (level ^ SHOW_SBITS(re, &s->gb, 1)) - SHOW_SBITS(re, &s->gb, 1);
-                 SKIP_BITS(re, &s->gb, 1);
-             } else {
-                 /* escape */
-                 run = SHOW_UBITS(re, &s->gb, 6) + 1; LAST_SKIP_BITS(re, &s->gb, 6);
-                 UPDATE_CACHE(re, &s->gb);
-                 level = SHOW_SBITS(re, &s->gb, 8); SKIP_BITS(re, &s->gb, 8);
-                 if (level == -128) {
-                     level = SHOW_UBITS(re, &s->gb, 8) - 256; SKIP_BITS(re, &s->gb, 8);
-                 } else if (level == 0) {
-                     level = SHOW_UBITS(re, &s->gb, 8)      ; SKIP_BITS(re, &s->gb, 8);
-                 }
-                 i += run;
-                 j = scantable[i];
-                 if (level < 0) {
-                     level = -level;
-                     level = ((level * 2 + 1) * qscale * quant_matrix[j]) >> 5;
-                     level = (level - 1) | 1;
-                     level = -level;
-                 } else {
-                     level = ((level * 2 + 1) * qscale * quant_matrix[j]) >> 5;
-                     level = (level - 1) | 1;
-                 }
-             }
-             if (i > 63) {
-                 av_log(s->avctx, AV_LOG_ERROR, "ac-tex damaged at %d %d\n", s->mb_x, s->mb_y);
+         return 0;
+     }
+     switch (s->pict_type) {
+     default:
+     case AV_PICTURE_TYPE_I:
+         if (get_bits1(&s->gb) == 0) {
+             if (get_bits1(&s->gb) == 0) {
+                 av_log(s->avctx, AV_LOG_ERROR, "invalid mb type in I Frame at %d %d\n", s->mb_x, s->mb_y);
                  return -1;
              }
-             block[j] = level;
-             if (((int32_t)GET_CACHE(re, &s->gb)) <= (int32_t)0xBFFFFFFF)
-                 break;
-             UPDATE_CACHE(re, &s->gb);
+             mb_type = MB_TYPE_QUANT | MB_TYPE_INTRA;
+         } else {
+             mb_type = MB_TYPE_INTRA;
          }
- end:
-         LAST_SKIP_BITS(re, &s->gb, 2);
-         CLOSE_READER(re, &s->gb);
+         break;
+     case AV_PICTURE_TYPE_P:
+         mb_type = get_vlc2(&s->gb, mb_ptype_vlc.table, MB_PTYPE_VLC_BITS, 1);
+         if (mb_type < 0) {
+             av_log(s->avctx, AV_LOG_ERROR, "invalid mb type in P Frame at %d %d\n", s->mb_x, s->mb_y);
+             return -1;
+         }
+         mb_type = ptype2mb_type[mb_type];
+         break;
+     case AV_PICTURE_TYPE_B:
+         mb_type = get_vlc2(&s->gb, mb_btype_vlc.table, MB_BTYPE_VLC_BITS, 1);
+         if (mb_type < 0) {
+             av_log(s->avctx, AV_LOG_ERROR, "invalid mb type in B Frame at %d %d\n", s->mb_x, s->mb_y);
+             return -1;
+         }
+         mb_type = btype2mb_type[mb_type];
+         break;
      }
-     s->block_last_index[n] = i;
-     return 0;
- }
+     av_dlog(s->avctx, "mb_type=%x\n", mb_type);
+ //    motion_type = 0; /* avoid warning */
+     if (IS_INTRA(mb_type)) {
+         s->dsp.clear_blocks(s->block[0]);
  
- static inline int mpeg1_fast_decode_block_inter(MpegEncContext *s, DCTELEM *block, int n)
- {
-     int level, i, j, run;
-     RLTable *rl = &ff_rl_mpeg1;
-     uint8_t * const scantable = s->intra_scantable.permutated;
-     const int qscale          = s->qscale;
+         if (!s->chroma_y_shift) {
+             s->dsp.clear_blocks(s->block[6]);
+         }
  
-     {
-         OPEN_READER(re, &s->gb);
-         i = -1;
-         // special case for first coefficient, no need to add second VLC table
-         UPDATE_CACHE(re, &s->gb);
-         if (((int32_t)GET_CACHE(re, &s->gb)) < 0) {
-             level = (3 * qscale) >> 1;
-             level = (level - 1) | 1;
-             if (GET_CACHE(re, &s->gb) & 0x40000000)
-                 level = -level;
-             block[0] = level;
-             i++;
-             SKIP_BITS(re, &s->gb, 2);
-             if (((int32_t)GET_CACHE(re, &s->gb)) <= (int32_t)0xBFFFFFFF)
-                 goto end;
-         }
-         /* now quantify & encode AC coefficients */
-         for (;;) {
-             GET_RL_VLC(level, run, re, &s->gb, rl->rl_vlc[0], TEX_VLC_BITS, 2, 0);
-             if (level != 0) {
-                 i += run;
-                 j = scantable[i];
-                 level = ((level * 2 + 1) * qscale) >> 1;
-                 level = (level - 1) | 1;
-                 level = (level ^ SHOW_SBITS(re, &s->gb, 1)) - SHOW_SBITS(re, &s->gb, 1);
-                 SKIP_BITS(re, &s->gb, 1);
-             } else {
-                 /* escape */
-                 run = SHOW_UBITS(re, &s->gb, 6)+1; LAST_SKIP_BITS(re, &s->gb, 6);
-                 UPDATE_CACHE(re, &s->gb);
-                 level = SHOW_SBITS(re, &s->gb, 8); SKIP_BITS(re, &s->gb, 8);
-                 if (level == -128) {
-                     level = SHOW_UBITS(re, &s->gb, 8) - 256; SKIP_BITS(re, &s->gb, 8);
-                 } else if (level == 0) {
-                     level = SHOW_UBITS(re, &s->gb, 8)      ; SKIP_BITS(re, &s->gb, 8);
-                 }
-                 i += run;
-                 j = scantable[i];
-                 if (level < 0) {
-                     level = -level;
-                     level = ((level * 2 + 1) * qscale) >> 1;
-                     level = (level - 1) | 1;
-                     level = -level;
-                 } else {
-                     level = ((level * 2 + 1) * qscale) >> 1;
-                     level = (level - 1) | 1;
-                 }
-             }
-             block[j] = level;
-             if (((int32_t)GET_CACHE(re, &s->gb)) <= (int32_t)0xBFFFFFFF)
-                 break;
-             UPDATE_CACHE(re, &s->gb);
+         /* compute DCT type */
+         if (s->picture_structure == PICT_FRAME && // FIXME add an interlaced_dct coded var?
+             !s->frame_pred_frame_dct) {
+             s->interlaced_dct = get_bits1(&s->gb);
          }
- end:
-         LAST_SKIP_BITS(re, &s->gb, 2);
-         CLOSE_READER(re, &s->gb);
-     }
-     s->block_last_index[n] = i;
-     return 0;
- }
  
+         if (IS_QUANT(mb_type))
+             s->qscale = get_qscale(s);
  
- static inline int mpeg2_decode_block_non_intra(MpegEncContext *s, DCTELEM *block, int n)
- {
-     int level, i, j, run;
-     RLTable *rl = &ff_rl_mpeg1;
-     uint8_t * const scantable = s->intra_scantable.permutated;
-     const uint16_t *quant_matrix;
-     const int qscale = s->qscale;
-     int mismatch;
-     mismatch = 1;
+         if (s->concealment_motion_vectors) {
+             /* just parse them */
+             if (s->picture_structure != PICT_FRAME)
+                 skip_bits1(&s->gb); /* field select */
  
-     {
-         OPEN_READER(re, &s->gb);
-         i = -1;
-         if (n < 4)
-             quant_matrix = s->inter_matrix;
-         else
-             quant_matrix = s->chroma_inter_matrix;
+             s->mv[0][0][0]= s->last_mv[0][0][0]= s->last_mv[0][1][0] =
+                 mpeg_decode_motion(s, s->mpeg_f_code[0][0], s->last_mv[0][0][0]);
+             s->mv[0][0][1]= s->last_mv[0][0][1]= s->last_mv[0][1][1] =
+                 mpeg_decode_motion(s, s->mpeg_f_code[0][1], s->last_mv[0][0][1]);
  
-         // special case for first coefficient, no need to add second VLC table
-         UPDATE_CACHE(re, &s->gb);
-         if (((int32_t)GET_CACHE(re, &s->gb)) < 0) {
-             level= (3 * qscale * quant_matrix[0]) >> 5;
-             if (GET_CACHE(re, &s->gb) & 0x40000000)
-                 level = -level;
-             block[0]  = level;
-             mismatch ^= level;
-             i++;
-             SKIP_BITS(re, &s->gb, 2);
-             if (((int32_t)GET_CACHE(re, &s->gb)) <= (int32_t)0xBFFFFFFF)
-                 goto end;
+             skip_bits1(&s->gb); /* marker */
+         } else
+             memset(s->last_mv, 0, sizeof(s->last_mv)); /* reset mv prediction */
+         s->mb_intra = 1;
+         // if 1, we memcpy blocks in xvmcvideo
+         if (CONFIG_MPEG_XVMC_DECODER && s->avctx->xvmc_acceleration > 1) {
+             ff_xvmc_pack_pblocks(s, -1); // inter are always full blocks
+             if (s->swap_uv) {
+                 exchange_uv(s);
+             }
          }
  
-         /* now quantify & encode AC coefficients */
-         for (;;) {
-             GET_RL_VLC(level, run, re, &s->gb, rl->rl_vlc[0], TEX_VLC_BITS, 2, 0);
-             if (level != 0) {
-                 i += run;
-                 j = scantable[i];
-                 level = ((level * 2 + 1) * qscale * quant_matrix[j]) >> 5;
-                 level = (level ^ SHOW_SBITS(re, &s->gb, 1)) - SHOW_SBITS(re, &s->gb, 1);
-                 SKIP_BITS(re, &s->gb, 1);
+         if (s->codec_id == CODEC_ID_MPEG2VIDEO) {
+             if (s->flags2 & CODEC_FLAG2_FAST) {
+                 for (i = 0; i < 6; i++) {
+                     mpeg2_fast_decode_block_intra(s, *s->pblocks[i], i);
+                 }
              } else {
-                 /* escape */
-                 run = SHOW_UBITS(re, &s->gb, 6) + 1; LAST_SKIP_BITS(re, &s->gb, 6);
-                 UPDATE_CACHE(re, &s->gb);
-                 level = SHOW_SBITS(re, &s->gb, 12); SKIP_BITS(re, &s->gb, 12);
-                 i += run;
-                 j = scantable[i];
-                 if (level < 0) {
-                     level = ((-level * 2 + 1) * qscale * quant_matrix[j]) >> 5;
-                     level = -level;
-                 } else {
-                     level = ((level * 2 + 1) * qscale * quant_matrix[j]) >> 5;
+                 for (i = 0; i < mb_block_count; i++) {
+                     if (mpeg2_decode_block_intra(s, *s->pblocks[i], i) < 0)
+                         return -1;
                  }
              }
-             if (i > 63) {
-                 av_log(s->avctx, AV_LOG_ERROR, "ac-tex damaged at %d %d\n", s->mb_x, s->mb_y);
-                 return -1;
+         } else {
+             for (i = 0; i < 6; i++) {
+                 if (mpeg1_decode_block_intra(s, *s->pblocks[i], i) < 0)
+                     return -1;
              }
-             mismatch ^= level;
-             block[j]  = level;
-             if (((int32_t)GET_CACHE(re, &s->gb)) <= (int32_t)0xBFFFFFFF)
-                 break;
-             UPDATE_CACHE(re, &s->gb);
          }
- end:
-         LAST_SKIP_BITS(re, &s->gb, 2);
-         CLOSE_READER(re, &s->gb);
-     }
-     block[63] ^= (mismatch & 1);
+     } else {
+         if (mb_type & MB_TYPE_ZERO_MV) {
+             assert(mb_type & MB_TYPE_CBP);
  
-     s->block_last_index[n] = i;
-     return 0;
- }
+             s->mv_dir = MV_DIR_FORWARD;
+             if (s->picture_structure == PICT_FRAME) {
+                 if (!s->frame_pred_frame_dct)
+                     s->interlaced_dct = get_bits1(&s->gb);
+                 s->mv_type = MV_TYPE_16X16;
+             } else {
+                 s->mv_type = MV_TYPE_FIELD;
+                 mb_type |= MB_TYPE_INTERLACED;
+                 s->field_select[0][0] = s->picture_structure - 1;
+             }
  
- static inline int mpeg2_fast_decode_block_non_intra(MpegEncContext *s,
-                                                     DCTELEM *block, int n)
- {
-     int level, i, j, run;
-     RLTable *rl = &ff_rl_mpeg1;
-     uint8_t * const scantable = s->intra_scantable.permutated;
-     const int qscale          = s->qscale;
-     OPEN_READER(re, &s->gb);
-     i = -1;
+             if (IS_QUANT(mb_type))
+                 s->qscale = get_qscale(s);
  
-     // special case for first coefficient, no need to add second VLC table
-     UPDATE_CACHE(re, &s->gb);
-     if (((int32_t)GET_CACHE(re, &s->gb)) < 0) {
-         level = (3 * qscale) >> 1;
-         if (GET_CACHE(re, &s->gb) & 0x40000000)
-             level = -level;
-         block[0] = level;
-         i++;
-         SKIP_BITS(re, &s->gb, 2);
-         if (((int32_t)GET_CACHE(re, &s->gb)) <= (int32_t)0xBFFFFFFF)
-             goto end;
-     }
+             s->last_mv[0][0][0] = 0;
+             s->last_mv[0][0][1] = 0;
+             s->last_mv[0][1][0] = 0;
+             s->last_mv[0][1][1] = 0;
+             s->mv[0][0][0] = 0;
+             s->mv[0][0][1] = 0;
+         } else {
+             assert(mb_type & MB_TYPE_L0L1);
+             // FIXME decide if MBs in field pictures are MB_TYPE_INTERLACED
+             /* get additional motion vector type */
+             if (s->frame_pred_frame_dct)
+                 motion_type = MT_FRAME;
+             else {
+                 motion_type = get_bits(&s->gb, 2);
+                 if (s->picture_structure == PICT_FRAME && HAS_CBP(mb_type))
+                     s->interlaced_dct = get_bits1(&s->gb);
+             }
  
-     /* now quantify & encode AC coefficients */
-     for (;;) {
-         GET_RL_VLC(level, run, re, &s->gb, rl->rl_vlc[0], TEX_VLC_BITS, 2, 0);
+             if (IS_QUANT(mb_type))
+                 s->qscale = get_qscale(s);
  
-         if (level != 0) {
-             i += run;
-             j  = scantable[i];
-             level = ((level * 2 + 1) * qscale) >> 1;
-             level = (level ^ SHOW_SBITS(re, &s->gb, 1)) - SHOW_SBITS(re, &s->gb, 1);
-             SKIP_BITS(re, &s->gb, 1);
-         } else {
-             /* escape */
-             run = SHOW_UBITS(re, &s->gb, 6) + 1; LAST_SKIP_BITS(re, &s->gb, 6);
-             UPDATE_CACHE(re, &s->gb);
-             level = SHOW_SBITS(re, &s->gb, 12); SKIP_BITS(re, &s->gb, 12);
+             /* motion vectors */
+             s->mv_dir = (mb_type >> 13) & 3;
+             av_dlog(s->avctx, "motion_type=%d\n", motion_type);
+             switch (motion_type) {
+             case MT_FRAME: /* or MT_16X8 */
+                 if (s->picture_structure == PICT_FRAME) {
+                     mb_type |= MB_TYPE_16x16;
+                     s->mv_type = MV_TYPE_16X16;
+                     for (i = 0; i < 2; i++) {
+                         if (USES_LIST(mb_type, i)) {
+                             /* MT_FRAME */
+                             s->mv[i][0][0]= s->last_mv[i][0][0]= s->last_mv[i][1][0] =
+                                 mpeg_decode_motion(s, s->mpeg_f_code[i][0], s->last_mv[i][0][0]);
+                             s->mv[i][0][1]= s->last_mv[i][0][1]= s->last_mv[i][1][1] =
+                                 mpeg_decode_motion(s, s->mpeg_f_code[i][1], s->last_mv[i][0][1]);
+                             /* full_pel: only for MPEG-1 */
+                             if (s->full_pel[i]) {
+                                 s->mv[i][0][0] <<= 1;
+                                 s->mv[i][0][1] <<= 1;
+                             }
+                         }
+                     }
+                 } else {
+                     mb_type |= MB_TYPE_16x8 | MB_TYPE_INTERLACED;
+                     s->mv_type = MV_TYPE_16X8;
+                     for (i = 0; i < 2; i++) {
+                         if (USES_LIST(mb_type, i)) {
+                             /* MT_16X8 */
+                             for (j = 0; j < 2; j++) {
+                                 s->field_select[i][j] = get_bits1(&s->gb);
+                                 for (k = 0; k < 2; k++) {
+                                     val = mpeg_decode_motion(s, s->mpeg_f_code[i][k],
+                                                              s->last_mv[i][j][k]);
+                                     s->last_mv[i][j][k] = val;
+                                     s->mv[i][j][k]      = val;
+                                 }
+                             }
+                         }
+                     }
+                 }
+                 break;
+             case MT_FIELD:
++                if(s->progressive_sequence){
++                    av_log(s->avctx, AV_LOG_ERROR, "MT_FIELD in progressive_sequence\n");
++                    return -1;
++                }
+                 s->mv_type = MV_TYPE_FIELD;
+                 if (s->picture_structure == PICT_FRAME) {
+                     mb_type |= MB_TYPE_16x8 | MB_TYPE_INTERLACED;
+                     for (i = 0; i < 2; i++) {
+                         if (USES_LIST(mb_type, i)) {
+                             for (j = 0; j < 2; j++) {
+                                 s->field_select[i][j] = get_bits1(&s->gb);
+                                 val = mpeg_decode_motion(s, s->mpeg_f_code[i][0],
+                                                          s->last_mv[i][j][0]);
+                                 s->last_mv[i][j][0] = val;
+                                 s->mv[i][j][0]      = val;
+                                 av_dlog(s->avctx, "fmx=%d\n", val);
+                                 val = mpeg_decode_motion(s, s->mpeg_f_code[i][1],
+                                                          s->last_mv[i][j][1] >> 1);
+                                 s->last_mv[i][j][1] = val << 1;
+                                 s->mv[i][j][1]      = val;
+                                 av_dlog(s->avctx, "fmy=%d\n", val);
+                             }
+                         }
+                     }
+                 } else {
+                     mb_type |= MB_TYPE_16x16 | MB_TYPE_INTERLACED;
+                     for (i = 0; i < 2; i++) {
+                         if (USES_LIST(mb_type, i)) {
+                             s->field_select[i][0] = get_bits1(&s->gb);
+                             for (k = 0; k < 2; k++) {
+                                 val = mpeg_decode_motion(s, s->mpeg_f_code[i][k],
+                                                          s->last_mv[i][0][k]);
+                                 s->last_mv[i][0][k] = val;
+                                 s->last_mv[i][1][k] = val;
+                                 s->mv[i][0][k]      = val;
+                             }
+                         }
+                     }
+                 }
+                 break;
+             case MT_DMV:
++                if(s->progressive_sequence){
++                    av_log(s->avctx, AV_LOG_ERROR, "MT_DMV in progressive_sequence\n");
++                    return -1;
++                }
+                 s->mv_type = MV_TYPE_DMV;
+                 for (i = 0; i < 2; i++) {
+                     if (USES_LIST(mb_type, i)) {
+                         int dmx, dmy, mx, my, m;
+                         const int my_shift = s->picture_structure == PICT_FRAME;
  
-             i += run;
-             j  = scantable[i];
-             if (level < 0) {
-                 level = ((-level * 2 + 1) * qscale) >> 1;
-                 level = -level;
-             } else {
-                 level = ((level * 2 + 1) * qscale) >> 1;
-             }
-         }
+                         mx = mpeg_decode_motion(s, s->mpeg_f_code[i][0],
+                                                 s->last_mv[i][0][0]);
+                         s->last_mv[i][0][0] = mx;
+                         s->last_mv[i][1][0] = mx;
+                         dmx = get_dmv(s);
+                         my  = mpeg_decode_motion(s, s->mpeg_f_code[i][1],
+                                                  s->last_mv[i][0][1] >> my_shift);
+                         dmy = get_dmv(s);
  
-         block[j] = level;
-         if (((int32_t)GET_CACHE(re, &s->gb)) <= (int32_t)0xBFFFFFFF)
-             break;
-         UPDATE_CACHE(re, &s->gb);
-     }
- end:
-     LAST_SKIP_BITS(re, &s->gb, 2);
-     CLOSE_READER(re, &s->gb);
-     s->block_last_index[n] = i;
-     return 0;
- }
  
+                         s->last_mv[i][0][1] = my << my_shift;
+                         s->last_mv[i][1][1] = my << my_shift;
  
- static inline int mpeg2_decode_block_intra(MpegEncContext *s, DCTELEM *block, int n)
- {
-     int level, dc, diff, i, j, run;
-     int component;
-     RLTable *rl;
-     uint8_t * const scantable = s->intra_scantable.permutated;
-     const uint16_t *quant_matrix;
-     const int qscale = s->qscale;
-     int mismatch;
+                         s->mv[i][0][0] = mx;
+                         s->mv[i][0][1] = my;
+                         s->mv[i][1][0] = mx; // not used
+                         s->mv[i][1][1] = my; // not used
  
-     /* DC coefficient */
-     if (n < 4) {
-         quant_matrix = s->intra_matrix;
-         component = 0;
-     } else {
-         quant_matrix = s->chroma_intra_matrix;
-         component = (n & 1) + 1;
-     }
-     diff = decode_dc(&s->gb, component);
-     if (diff >= 0xffff)
-         return -1;
-     dc  = s->last_dc[component];
-     dc += diff;
-     s->last_dc[component] = dc;
-     block[0] = dc << (3 - s->intra_dc_precision);
-     av_dlog(s->avctx, "dc=%d\n", block[0]);
-     mismatch = block[0] ^ 1;
-     i = 0;
-     if (s->intra_vlc_format)
-         rl = &ff_rl_mpeg2;
-     else
-         rl = &ff_rl_mpeg1;
+                         if (s->picture_structure == PICT_FRAME) {
+                             mb_type |= MB_TYPE_16x16 | MB_TYPE_INTERLACED;
  
-     {
-         OPEN_READER(re, &s->gb);
-         /* now quantify & encode AC coefficients */
-         for (;;) {
-             UPDATE_CACHE(re, &s->gb);
-             GET_RL_VLC(level, run, re, &s->gb, rl->rl_vlc[0], TEX_VLC_BITS, 2, 0);
+                             // m = 1 + 2 * s->top_field_first;
+                             m = s->top_field_first ? 1 : 3;
  
-             if (level == 127) {
-                 break;
-             } else if (level != 0) {
-                 i += run;
-                 j  = scantable[i];
-                 level = (level * qscale * quant_matrix[j]) >> 4;
-                 level = (level ^ SHOW_SBITS(re, &s->gb, 1)) - SHOW_SBITS(re, &s->gb, 1);
-                 LAST_SKIP_BITS(re, &s->gb, 1);
-             } else {
-                 /* escape */
-                 run = SHOW_UBITS(re, &s->gb, 6) + 1; LAST_SKIP_BITS(re, &s->gb, 6);
-                 UPDATE_CACHE(re, &s->gb);
-                 level = SHOW_SBITS(re, &s->gb, 12); SKIP_BITS(re, &s->gb, 12);
-                 i += run;
-                 j  = scantable[i];
-                 if (level < 0) {
-                     level = (-level * qscale * quant_matrix[j]) >> 4;
-                     level = -level;
-                 } else {
-                     level = (level * qscale * quant_matrix[j]) >> 4;
+                             /* top -> top pred */
+                             s->mv[i][2][0] = ((mx * m + (mx > 0)) >> 1) + dmx;
+                             s->mv[i][2][1] = ((my * m + (my > 0)) >> 1) + dmy - 1;
+                             m = 4 - m;
+                             s->mv[i][3][0] = ((mx * m + (mx > 0)) >> 1) + dmx;
+                             s->mv[i][3][1] = ((my * m + (my > 0)) >> 1) + dmy + 1;
+                         } else {
+                             mb_type |= MB_TYPE_16x16;
+                             s->mv[i][2][0] = ((mx + (mx > 0)) >> 1) + dmx;
+                             s->mv[i][2][1] = ((my + (my > 0)) >> 1) + dmy;
+                             if (s->picture_structure == PICT_TOP_FIELD)
+                                 s->mv[i][2][1]--;
+                             else
+                                 s->mv[i][2][1]++;
+                         }
+                     }
                  }
-             }
-             if (i > 63) {
-                 av_log(s->avctx, AV_LOG_ERROR, "ac-tex damaged at %d %d\n", s->mb_x, s->mb_y);
+                 break;
+             default:
+                 av_log(s->avctx, AV_LOG_ERROR, "00 motion_type at %d %d\n", s->mb_x, s->mb_y);
                  return -1;
              }
-             mismatch ^= level;
-             block[j]  = level;
          }
-         CLOSE_READER(re, &s->gb);
-     }
-     block[63] ^= mismatch & 1;
  
-     s->block_last_index[n] = i;
-     return 0;
- }
+         s->mb_intra = 0;
+         if (HAS_CBP(mb_type)) {
+             s->dsp.clear_blocks(s->block[0]);
  
- static inline int mpeg2_fast_decode_block_intra(MpegEncContext *s, DCTELEM *block, int n)
- {
-     int level, dc, diff, j, run;
-     int component;
-     RLTable *rl;
-     uint8_t * scantable = s->intra_scantable.permutated;
-     const uint16_t *quant_matrix;
-     const int qscale = s->qscale;
+             cbp = get_vlc2(&s->gb, mb_pat_vlc.table, MB_PAT_VLC_BITS, 1);
+             if (mb_block_count > 6) {
+                  cbp <<= mb_block_count - 6;
+                  cbp  |= get_bits(&s->gb, mb_block_count - 6);
+                  s->dsp.clear_blocks(s->block[6]);
+             }
+             if (cbp <= 0) {
+                 av_log(s->avctx, AV_LOG_ERROR, "invalid cbp at %d %d\n", s->mb_x, s->mb_y);
+                 return -1;
+             }
  
-     /* DC coefficient */
-     if (n < 4) {
-         quant_matrix = s->intra_matrix;
-         component = 0;
-     } else {
-         quant_matrix = s->chroma_intra_matrix;
-         component = (n & 1) + 1;
-     }
-     diff = decode_dc(&s->gb, component);
-     if (diff >= 0xffff)
-         return -1;
-     dc = s->last_dc[component];
-     dc += diff;
-     s->last_dc[component] = dc;
-     block[0] = dc << (3 - s->intra_dc_precision);
-     if (s->intra_vlc_format)
-         rl = &ff_rl_mpeg2;
-     else
-         rl = &ff_rl_mpeg1;
+             //if 1, we memcpy blocks in xvmcvideo
+             if (CONFIG_MPEG_XVMC_DECODER && s->avctx->xvmc_acceleration > 1) {
+                 ff_xvmc_pack_pblocks(s, cbp);
+                 if (s->swap_uv) {
+                     exchange_uv(s);
+                 }
+             }
  
-     {
-         OPEN_READER(re, &s->gb);
-         /* now quantify & encode AC coefficients */
-         for (;;) {
-             UPDATE_CACHE(re, &s->gb);
-             GET_RL_VLC(level, run, re, &s->gb, rl->rl_vlc[0], TEX_VLC_BITS, 2, 0);
+             if (s->codec_id == CODEC_ID_MPEG2VIDEO) {
+                 if (s->flags2 & CODEC_FLAG2_FAST) {
+                     for (i = 0; i < 6; i++) {
+                         if (cbp & 32) {
+                             mpeg2_fast_decode_block_non_intra(s, *s->pblocks[i], i);
+                         } else {
+                             s->block_last_index[i] = -1;
+                         }
+                         cbp += cbp;
+                     }
+                 } else {
+                     cbp <<= 12-mb_block_count;
  
-             if (level == 127) {
-                 break;
-             } else if (level != 0) {
-                 scantable += run;
-                 j = *scantable;
-                 level = (level * qscale * quant_matrix[j]) >> 4;
-                 level = (level ^ SHOW_SBITS(re, &s->gb, 1)) - SHOW_SBITS(re, &s->gb, 1);
-                 LAST_SKIP_BITS(re, &s->gb, 1);
+                     for (i = 0; i < mb_block_count; i++) {
+                         if (cbp & (1 << 11)) {
+                             if (mpeg2_decode_block_non_intra(s, *s->pblocks[i], i) < 0)
+                                 return -1;
+                         } else {
+                             s->block_last_index[i] = -1;
+                         }
+                         cbp += cbp;
+                     }
+                 }
              } else {
-                 /* escape */
-                 run = SHOW_UBITS(re, &s->gb, 6) + 1; LAST_SKIP_BITS(re, &s->gb, 6);
-                 UPDATE_CACHE(re, &s->gb);
-                 level = SHOW_SBITS(re, &s->gb, 12); SKIP_BITS(re, &s->gb, 12);
-                 scantable += run;
-                 j = *scantable;
-                 if (level < 0) {
-                     level = (-level * qscale * quant_matrix[j]) >> 4;
-                     level = -level;
+                 if (s->flags2 & CODEC_FLAG2_FAST) {
+                     for (i = 0; i < 6; i++) {
+                         if (cbp & 32) {
+                             mpeg1_fast_decode_block_inter(s, *s->pblocks[i], i);
+                         } else {
+                             s->block_last_index[i] = -1;
+                         }
+                         cbp += cbp;
+                     }
                  } else {
-                     level = (level * qscale * quant_matrix[j]) >> 4;
+                     for (i = 0; i < 6; i++) {
+                         if (cbp & 32) {
+                             if (mpeg1_decode_block_inter(s, *s->pblocks[i], i) < 0)
+                                 return -1;
+                         } else {
+                             s->block_last_index[i] = -1;
+                         }
+                         cbp += cbp;
+                     }
                  }
              }
-             block[j] = level;
+         } else {
+             for (i = 0; i < 12; i++)
+                 s->block_last_index[i] = -1;
          }
-         CLOSE_READER(re, &s->gb);
      }
  
-     s->block_last_index[n] = scantable - s->intra_scantable.permutated;
+     s->current_picture.f.mb_type[s->mb_x + s->mb_y * s->mb_stride] = mb_type;
      return 0;
  }
  
Simple merge
Simple merge
@@@ -186,4 -221,55 +221,55 @@@ void ff_sws_init_swScale_mmx(SwsContex
      if (cpu_flags & AV_CPU_FLAG_MMX2)
          sws_init_swScale_MMX2(c);
  #endif
 -    } else /* c->srcBpc == 16 */ { \
+ #if HAVE_YASM
+ #define ASSIGN_SCALE_FUNC2(hscalefn, filtersize, opt1, opt2) do { \
+     if (c->srcBpc == 8) { \
+         hscalefn = c->dstBpc <= 10 ? ff_hscale8to15_ ## filtersize ## _ ## opt2 : \
+                                      ff_hscale8to19_ ## filtersize ## _ ## opt1; \
+     } else if (c->srcBpc == 9) { \
+         hscalefn = c->dstBpc <= 10 ? ff_hscale9to15_ ## filtersize ## _ ## opt2 : \
+                                      ff_hscale9to19_ ## filtersize ## _ ## opt1; \
+     } else if (c->srcBpc == 10) { \
+         hscalefn = c->dstBpc <= 10 ? ff_hscale10to15_ ## filtersize ## _ ## opt2 : \
+                                      ff_hscale10to19_ ## filtersize ## _ ## opt1; \
++    } else if(c->srcBpc == 16 && !((c->srcFormat==PIX_FMT_PAL8||isAnyRGB(c->srcFormat)) && av_pix_fmt_descriptors[c->srcFormat].comp[0].depth_minus1<15)) { \
+         hscalefn = c->dstBpc <= 10 ? ff_hscale16to15_ ## filtersize ## _ ## opt2 : \
+                                      ff_hscale16to19_ ## filtersize ## _ ## opt1; \
+     } \
+ } while (0)
+ #define ASSIGN_MMX_SCALE_FUNC(hscalefn, filtersize, opt1, opt2) \
+     switch (filtersize) { \
+     case 4:  ASSIGN_SCALE_FUNC2(hscalefn, 4, opt1, opt2); break; \
+     case 8:  ASSIGN_SCALE_FUNC2(hscalefn, 8, opt1, opt2); break; \
+     default: ASSIGN_SCALE_FUNC2(hscalefn, X, opt1, opt2); break; \
+     }
+ #if ARCH_X86_32
+     if (cpu_flags & AV_CPU_FLAG_MMX) {
+         ASSIGN_MMX_SCALE_FUNC(c->hyScale, c->hLumFilterSize, mmx, mmx);
+         ASSIGN_MMX_SCALE_FUNC(c->hcScale, c->hChrFilterSize, mmx, mmx);
+     }
+ #endif
+ #define ASSIGN_SSE_SCALE_FUNC(hscalefn, filtersize, opt1, opt2) \
+     switch (filtersize) { \
+     case 4:  ASSIGN_SCALE_FUNC2(hscalefn, 4, opt1, opt2); break; \
+     case 8:  ASSIGN_SCALE_FUNC2(hscalefn, 8, opt1, opt2); break; \
+     default: if (filtersize & 4) ASSIGN_SCALE_FUNC2(hscalefn, X4, opt1, opt2); \
+              else                ASSIGN_SCALE_FUNC2(hscalefn, X8, opt1, opt2); \
+              break; \
+     }
+     if (cpu_flags & AV_CPU_FLAG_SSE2) {
+         ASSIGN_SSE_SCALE_FUNC(c->hyScale, c->hLumFilterSize, sse2, sse2);
+         ASSIGN_SSE_SCALE_FUNC(c->hcScale, c->hChrFilterSize, sse2, sse2);
+     }
+     if (cpu_flags & AV_CPU_FLAG_SSSE3) {
+         ASSIGN_SSE_SCALE_FUNC(c->hyScale, c->hLumFilterSize, ssse3, ssse3);
+         ASSIGN_SSE_SCALE_FUNC(c->hcScale, c->hChrFilterSize, ssse3, ssse3);
+     }
+     if (cpu_flags & AV_CPU_FLAG_SSE4) {
+         /* Xto15 don't need special sse4 functions */
+         ASSIGN_SSE_SCALE_FUNC(c->hyScale, c->hLumFilterSize, sse4, ssse3);
+         ASSIGN_SSE_SCALE_FUNC(c->hcScale, c->hChrFilterSize, sse4, ssse3);
+     }
+ #endif
  }
@@@ -1951,321 -1958,6 +1951,162 @@@ static void RENAME(rgb24ToUV)(int16_t *
      RENAME(bgr24ToUV_mmx)(dstU, dstV, src1, width, PIX_FMT_RGB24);
  }
  
- #if !COMPILE_TEMPLATE_MMX2
- // bilinear / bicubic scaling
- static void RENAME(hScale)(SwsContext *c, int16_t *dst, int dstW,
-                            const uint8_t *src, const int16_t *filter,
-                            const int16_t *filterPos, int filterSize)
- { /* Horizontal FIR filter over 8-bit source samples, written in MMX inline asm.
-    * Each output is the sum of filterSize source bytes starting at
-    * src[filterPos[i]], weighted by that output's filterSize coefficients,
-    * shifted right by 7 and narrowed with packssdw (signed 16-bit saturation).
-    * Three loops exist: filterSize 4, filterSize 8, and any other multiple
-    * of 4. Every loop produces two outputs per iteration.
-    * The c parameter is not read anywhere in this function. */
-     assert(filterSize % 4 == 0 && filterSize>0); // taps are consumed four at a time below
-     if (filterSize==4) { // Always true for upscaling, sometimes for down, too.
-         x86_reg counter= -2*dstW; // negative byte offset into dst (2 bytes per output)
-         filter-= counter*2; // i.e. filter += 4*dstW coefficients, one past the last row
-         filterPos-= counter/2; // i.e. filterPos += dstW
-         dst-= counter/2; // i.e. dst += dstW, so negative offsets address the real data
-         __asm__ volatile(
- #if defined(PIC)
-             "push            %%"REG_b"              \n\t"
- #endif
-             "pxor                %%mm7, %%mm7       \n\t" // mm7 = 0, used to widen bytes to words
-             "push           %%"REG_BP"              \n\t" // we use 7 regs here ...
-             "mov             %%"REG_a", %%"REG_BP"  \n\t" // BP becomes the loop offset, a and b are reused
-             ".p2align                4              \n\t"
-             "1:                                     \n\t"
-             "movzwl   (%2, %%"REG_BP"), %%eax       \n\t" // filterPos of first output
-             "movzwl  2(%2, %%"REG_BP"), %%ebx       \n\t" // filterPos of second output
-             "movq  (%1, %%"REG_BP", 4), %%mm1       \n\t" // 4 coefficients, first output
-             "movq 8(%1, %%"REG_BP", 4), %%mm3       \n\t" // 4 coefficients, second output
-             "movd      (%3, %%"REG_a"), %%mm0       \n\t" // 4 source bytes, first output
-             "movd      (%3, %%"REG_b"), %%mm2       \n\t" // 4 source bytes, second output
-             "punpcklbw           %%mm7, %%mm0       \n\t"
-             "punpcklbw           %%mm7, %%mm2       \n\t"
-             "pmaddwd             %%mm1, %%mm0       \n\t"
-             "pmaddwd             %%mm2, %%mm3       \n\t"
-             "movq                %%mm0, %%mm4       \n\t"
-             "punpckldq           %%mm3, %%mm0       \n\t"
-             "punpckhdq           %%mm3, %%mm4       \n\t"
-             "paddd               %%mm4, %%mm0       \n\t" // mm0 = the two 32-bit sums
-             "psrad                  $7, %%mm0       \n\t"
-             "packssdw            %%mm0, %%mm0       \n\t"
-             "movd                %%mm0, (%4, %%"REG_BP")    \n\t" // store two int16 outputs
-             "add                    $4, %%"REG_BP"  \n\t"
-             " jnc                   1b              \n\t" // exit once the add carries (offset reached or passed zero)
-             "pop            %%"REG_BP"              \n\t"
- #if defined(PIC)
-             "pop             %%"REG_b"              \n\t"
- #endif
-             : "+a" (counter)
-             : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
- #if !defined(PIC)
-             : "%"REG_b
- #endif
-         );
-     } else if (filterSize==8) {
-         x86_reg counter= -2*dstW;
-         filter-= counter*4; // i.e. filter += 8*dstW coefficients
-         filterPos-= counter/2;
-         dst-= counter/2;
-         __asm__ volatile(
- #if defined(PIC)
-             "push             %%"REG_b"             \n\t"
- #endif
-             "pxor                 %%mm7, %%mm7      \n\t"
-             "push            %%"REG_BP"             \n\t" // we use 7 regs here ...
-             "mov              %%"REG_a", %%"REG_BP" \n\t"
-             ".p2align                 4             \n\t"
-             "1:                                     \n\t"
-             "movzwl    (%2, %%"REG_BP"), %%eax      \n\t"
-             "movzwl   2(%2, %%"REG_BP"), %%ebx      \n\t"
-             "movq   (%1, %%"REG_BP", 8), %%mm1      \n\t" // taps 0-3, first output
-             "movq 16(%1, %%"REG_BP", 8), %%mm3      \n\t" // taps 0-3, second output
-             "movd       (%3, %%"REG_a"), %%mm0      \n\t"
-             "movd       (%3, %%"REG_b"), %%mm2      \n\t"
-             "punpcklbw            %%mm7, %%mm0      \n\t"
-             "punpcklbw            %%mm7, %%mm2      \n\t"
-             "pmaddwd              %%mm1, %%mm0      \n\t"
-             "pmaddwd              %%mm2, %%mm3      \n\t"
-             "movq  8(%1, %%"REG_BP", 8), %%mm1      \n\t" // taps 4-7, first output
-             "movq 24(%1, %%"REG_BP", 8), %%mm5      \n\t" // taps 4-7, second output
-             "movd      4(%3, %%"REG_a"), %%mm4      \n\t"
-             "movd      4(%3, %%"REG_b"), %%mm2      \n\t"
-             "punpcklbw            %%mm7, %%mm4      \n\t"
-             "punpcklbw            %%mm7, %%mm2      \n\t"
-             "pmaddwd              %%mm1, %%mm4      \n\t"
-             "pmaddwd              %%mm2, %%mm5      \n\t"
-             "paddd                %%mm4, %%mm0      \n\t"
-             "paddd                %%mm5, %%mm3      \n\t"
-             "movq                 %%mm0, %%mm4      \n\t"
-             "punpckldq            %%mm3, %%mm0      \n\t"
-             "punpckhdq            %%mm3, %%mm4      \n\t"
-             "paddd                %%mm4, %%mm0      \n\t"
-             "psrad                   $7, %%mm0      \n\t"
-             "packssdw             %%mm0, %%mm0      \n\t"
-             "movd                 %%mm0, (%4, %%"REG_BP")   \n\t"
-             "add                     $4, %%"REG_BP" \n\t"
-             " jnc                    1b             \n\t"
-             "pop             %%"REG_BP"             \n\t"
- #if defined(PIC)
-             "pop              %%"REG_b"             \n\t"
- #endif
-             : "+a" (counter)
-             : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
- #if !defined(PIC)
-             : "%"REG_b
- #endif
-         );
-     } else {
-         const uint8_t *offset = src+filterSize; // end marker for the inner tap loop
-         x86_reg counter= -2*dstW;
-         //filter-= counter*filterSize/2;
-         filterPos-= counter/2;
-         dst-= counter/2;
-         __asm__ volatile(
-             "pxor                  %%mm7, %%mm7     \n\t"
-             ".p2align                  4            \n\t"
-             "1:                                     \n\t"
-             "mov                      %2, %%"REG_c" \n\t"
-             "movzwl      (%%"REG_c", %0), %%eax     \n\t"
-             "movzwl     2(%%"REG_c", %0), %%edx     \n\t"
-             "mov                      %5, %%"REG_c" \n\t" // c walks from src up to offset
-             "pxor                  %%mm4, %%mm4     \n\t" // accumulator, first output
-             "pxor                  %%mm5, %%mm5     \n\t" // accumulator, second output
-             "2:                                     \n\t"
-             "movq                   (%1), %%mm1     \n\t"
-             "movq               (%1, %6), %%mm3     \n\t" // same taps, next coefficient row (%6 = row size in bytes)
-             "movd (%%"REG_c", %%"REG_a"), %%mm0     \n\t"
-             "movd (%%"REG_c", %%"REG_d"), %%mm2     \n\t"
-             "punpcklbw             %%mm7, %%mm0     \n\t"
-             "punpcklbw             %%mm7, %%mm2     \n\t"
-             "pmaddwd               %%mm1, %%mm0     \n\t"
-             "pmaddwd               %%mm2, %%mm3     \n\t"
-             "paddd                 %%mm3, %%mm5     \n\t"
-             "paddd                 %%mm0, %%mm4     \n\t"
-             "add                      $8, %1        \n\t" // 4 coefficients consumed
-             "add                      $4, %%"REG_c" \n\t" // 4 source bytes consumed
-             "cmp                      %4, %%"REG_c" \n\t"
-             " jb                      2b            \n\t"
-             "add                      %6, %1        \n\t" // skip the second row, already used above
-             "movq                  %%mm4, %%mm0     \n\t"
-             "punpckldq             %%mm5, %%mm4     \n\t"
-             "punpckhdq             %%mm5, %%mm0     \n\t"
-             "paddd                 %%mm0, %%mm4     \n\t"
-             "psrad                    $7, %%mm4     \n\t"
-             "packssdw              %%mm4, %%mm4     \n\t"
-             "mov                      %3, %%"REG_a" \n\t"
-             "movd                  %%mm4, (%%"REG_a", %0)   \n\t"
-             "add                      $4, %0        \n\t"
-             " jnc                     1b            \n\t"
-             : "+r" (counter), "+r" (filter)
-             : "m" (filterPos), "m" (dst), "m"(offset),
-             "m" (src), "r" ((x86_reg)filterSize*2)
-             : "%"REG_a, "%"REG_c, "%"REG_d
-         );
-     }
- }
- #endif /* !COMPILE_TEMPLATE_MMX2 */
 +static inline void RENAME(hScale16)(int16_t *dst, int dstW, const uint16_t *src, int srcW, int xInc,
 +                                    const int16_t *filter, const int16_t *filterPos, long filterSize, int shift)
 +{ /* Horizontal FIR filter over 16-bit source samples.
 +   * Each of the dstW outputs is the sum of filterSize source samples starting
 +   * at src[filterPos[i]], weighted by that output's filterSize coefficients,
 +   * then shifted right by shift.
 +   * For shift < 15 one of three MMX inline-asm loops runs (filterSize 4, 8,
 +   * or any other multiple of 4); each produces two outputs per iteration and
 +   * narrows with packssdw (signed 16-bit saturation). Otherwise the C loop at
 +   * the bottom runs and clamps only the upper bound to (1<<15)-1.
 +   * srcW and xInc are not read anywhere in this function.
 +   * NOTE(review): pmaddwd multiplies signed words, so the asm paths presumably
 +   * rely on samples staying below 0x8000 - confirm against the callers. */
 +    int i, j;
 +
 +    assert(filterSize % 4 == 0 && filterSize>0); // the asm paths consume taps four at a time
 +    if (filterSize==4 && shift<15) { // Always true for upscaling, sometimes for down, too.
 +        x86_reg counter= -2*dstW; // negative byte offset into dst (2 bytes per output)
 +        filter-= counter*2; // i.e. filter += 4*dstW coefficients, one past the last row
 +        filterPos-= counter/2; // i.e. filterPos += dstW
 +        dst-= counter/2; // i.e. dst += dstW, so negative offsets address the real data
 +        __asm__ volatile(
 +            "movd                   %5, %%mm7       \n\t" // mm7 = shift count for psrad
 +#if defined(PIC)
 +            "push            %%"REG_b"              \n\t"
 +#endif
 +            "push           %%"REG_BP"              \n\t" // we use 7 regs here ...
 +            "mov             %%"REG_a", %%"REG_BP"  \n\t" // BP becomes the loop offset, a and b are reused
 +            ".p2align                4              \n\t"
 +            "1:                                     \n\t"
 +            "movzwl   (%2, %%"REG_BP"), %%eax       \n\t" // filterPos of first output
 +            "movzwl  2(%2, %%"REG_BP"), %%ebx       \n\t" // filterPos of second output
 +            "movq  (%1, %%"REG_BP", 4), %%mm1       \n\t" // 4 coefficients, first output
 +            "movq 8(%1, %%"REG_BP", 4), %%mm3       \n\t" // 4 coefficients, second output
 +            "movq      (%3, %%"REG_a", 2), %%mm0    \n\t" // 4 source words, first output
 +            "movq      (%3, %%"REG_b", 2), %%mm2    \n\t" // 4 source words, second output
 +            "pmaddwd             %%mm1, %%mm0       \n\t"
 +            "pmaddwd             %%mm2, %%mm3       \n\t"
 +            "movq                %%mm0, %%mm4       \n\t"
 +            "punpckldq           %%mm3, %%mm0       \n\t"
 +            "punpckhdq           %%mm3, %%mm4       \n\t"
 +            "paddd               %%mm4, %%mm0       \n\t" // mm0 = the two 32-bit sums
 +            "psrad               %%mm7, %%mm0       \n\t"
 +            "packssdw            %%mm0, %%mm0       \n\t"
 +            "movd                %%mm0, (%4, %%"REG_BP")    \n\t" // store two int16 outputs
 +            "add                    $4, %%"REG_BP"  \n\t"
 +            " jnc                   1b              \n\t" // exit once the add carries (offset reached or passed zero)
 +
 +            "pop            %%"REG_BP"              \n\t"
 +#if defined(PIC)
 +            "pop             %%"REG_b"              \n\t"
 +#endif
 +            : "+a" (counter)
 +            : "c" (filter), "d" (filterPos), "S" (src), "D" (dst), "m"(shift)
 +#if !defined(PIC)
 +            : "%"REG_b
 +#endif
 +        );
 +    } else if (filterSize==8 && shift<15) {
 +        x86_reg counter= -2*dstW;
 +        filter-= counter*4; // i.e. filter += 8*dstW coefficients
 +        filterPos-= counter/2;
 +        dst-= counter/2;
 +        __asm__ volatile(
 +            "movd                   %5, %%mm7       \n\t"
 +#if defined(PIC)
 +            "push            %%"REG_b"              \n\t"
 +#endif
 +            "push            %%"REG_BP"             \n\t" // we use 7 regs here ...
 +            "mov              %%"REG_a", %%"REG_BP" \n\t"
 +            ".p2align                 4             \n\t"
 +            "1:                                     \n\t"
 +            "movzwl    (%2, %%"REG_BP"), %%eax      \n\t"
 +            "movzwl   2(%2, %%"REG_BP"), %%ebx      \n\t"
 +            "movq   (%1, %%"REG_BP", 8), %%mm1      \n\t" // taps 0-3, first output
 +            "movq 16(%1, %%"REG_BP", 8), %%mm3      \n\t" // taps 0-3, second output
 +            "movq       (%3, %%"REG_a", 2), %%mm0   \n\t"
 +            "movq       (%3, %%"REG_b", 2), %%mm2   \n\t"
 +            "pmaddwd              %%mm1, %%mm0      \n\t"
 +            "pmaddwd              %%mm2, %%mm3      \n\t"
 +
 +            "movq  8(%1, %%"REG_BP", 8), %%mm1      \n\t" // taps 4-7, first output
 +            "movq 24(%1, %%"REG_BP", 8), %%mm5      \n\t" // taps 4-7, second output
 +            "movq      8(%3, %%"REG_a", 2), %%mm4   \n\t" // source words 4-7 (8 bytes further on)
 +            "movq      8(%3, %%"REG_b", 2), %%mm2   \n\t"
 +            "pmaddwd              %%mm1, %%mm4      \n\t"
 +            "pmaddwd              %%mm2, %%mm5      \n\t"
 +            "paddd                %%mm4, %%mm0      \n\t"
 +            "paddd                %%mm5, %%mm3      \n\t"
 +            "movq                 %%mm0, %%mm4      \n\t"
 +            "punpckldq            %%mm3, %%mm0      \n\t"
 +            "punpckhdq            %%mm3, %%mm4      \n\t"
 +            "paddd                %%mm4, %%mm0      \n\t"
 +            "psrad                %%mm7, %%mm0      \n\t"
 +            "packssdw             %%mm0, %%mm0      \n\t"
 +            "movd                 %%mm0, (%4, %%"REG_BP")   \n\t"
 +            "add                     $4, %%"REG_BP" \n\t"
 +            " jnc                    1b             \n\t"
 +
 +            "pop             %%"REG_BP"             \n\t"
 +#if defined(PIC)
 +            "pop             %%"REG_b"              \n\t"
 +#endif
 +            : "+a" (counter)
 +            : "c" (filter), "d" (filterPos), "S" (src), "D" (dst), "m"(shift)
 +#if !defined(PIC)
 +            : "%"REG_b
 +#endif
 +        );
 +    } else if (shift<15){
 +        const uint16_t *offset = src+filterSize; // end marker for the inner tap loop
 +        x86_reg counter= -2*dstW;
 +        //filter-= counter*filterSize/2;
 +        filterPos-= counter/2;
 +        dst-= counter/2;
 +        __asm__ volatile(
 +            "movd                   %7, %%mm7       \n\t" // mm7 = shift count for psrad
 +            ".p2align                  4            \n\t"
 +            "1:                                     \n\t"
 +            "mov                      %2, %%"REG_c" \n\t"
 +            "movzwl      (%%"REG_c", %0), %%eax     \n\t"
 +            "movzwl     2(%%"REG_c", %0), %%edx     \n\t"
 +            "mov                      %5, %%"REG_c" \n\t" // c walks from src up to offset
 +            "pxor                  %%mm4, %%mm4     \n\t" // accumulator, first output
 +            "pxor                  %%mm5, %%mm5     \n\t" // accumulator, second output
 +            "2:                                     \n\t"
 +            "movq                   (%1), %%mm1     \n\t"
 +            "movq               (%1, %6), %%mm3     \n\t" // same taps, next coefficient row (%6 = row size in bytes)
 +            "movq (%%"REG_c", %%"REG_a", 2), %%mm0     \n\t"
 +            "movq (%%"REG_c", %%"REG_d", 2), %%mm2     \n\t"
 +            "pmaddwd               %%mm1, %%mm0     \n\t"
 +            "pmaddwd               %%mm2, %%mm3     \n\t"
 +            "paddd                 %%mm3, %%mm5     \n\t"
 +            "paddd                 %%mm0, %%mm4     \n\t"
 +            "add                      $8, %1        \n\t" // 4 coefficients consumed
 +            "add                      $8, %%"REG_c" \n\t" // 4 source words consumed
 +            "cmp                      %4, %%"REG_c" \n\t"
 +            " jb                      2b            \n\t"
 +            "add                      %6, %1        \n\t" // skip the second row, already used above
 +            "movq                  %%mm4, %%mm0     \n\t"
 +            "punpckldq             %%mm5, %%mm4     \n\t"
 +            "punpckhdq             %%mm5, %%mm0     \n\t"
 +            "paddd                 %%mm0, %%mm4     \n\t"
 +            "psrad                 %%mm7, %%mm4     \n\t"
 +            "packssdw              %%mm4, %%mm4     \n\t"
 +            "mov                      %3, %%"REG_a" \n\t"
 +            "movd                  %%mm4, (%%"REG_a", %0)   \n\t"
 +            "add                      $4, %0        \n\t"
 +            " jnc                     1b            \n\t"
 +
 +            : "+r" (counter), "+r" (filter)
 +            : "m" (filterPos), "m" (dst), "m"(offset),
 +            "m" (src), "r" ((x86_reg)filterSize*2), "m"(shift)
 +            : "%"REG_a, "%"REG_c, "%"REG_d
 +        );
 +    } else // shift >= 15: plain C loop, one output per iteration
 +    for (i=0; i<dstW; i++) {
 +        int srcPos= filterPos[i];
 +        int val=0;
 +        for (j=0; j<filterSize; j++) {
 +            val += ((int)src[srcPos + j])*filter[filterSize*i + j];
 +        }
 +        dst[i] = FFMIN(val>>shift, (1<<15)-1); // the cubic equation does overflow ...
 +    }
 +}
 +
  #if COMPILE_TEMPLATE_MMX2
  static void RENAME(hyscale_fast)(SwsContext *c, int16_t *dst,
                                   int dstWidth, const uint8_t *src,