X-Git-Url: http://git.ffmpeg.org/gitweb/ffmpeg.git/blobdiff_plain/5369d74c30c89e2f460960cd99ea6f483e383c1e..8a03404d4f15e2b8aefdeca9294f5c31d429ca75:/libpostproc/postprocess_altivec_template.c

diff --git a/libpostproc/postprocess_altivec_template.c b/libpostproc/postprocess_altivec_template.c
index c7ef78e..47135e8 100644
--- a/libpostproc/postprocess_altivec_template.c
+++ b/libpostproc/postprocess_altivec_template.c
@@ -84,6 +84,17 @@ static inline int vertClassify_altivec(uint8_t src[], int stride, PPContext *c)
     vector signed short v_data = vec_ld(0, data);
     vector signed short v_srcAss0, v_srcAss1, v_srcAss2, v_srcAss3,
                         v_srcAss4, v_srcAss5, v_srcAss6, v_srcAss7;
+//FIXME avoid this mess if possible
+    register int j0 = 0,
+                 j1 = stride,
+                 j2 = 2 * stride,
+                 j3 = 3 * stride,
+                 j4 = 4 * stride,
+                 j5 = 5 * stride,
+                 j6 = 6 * stride,
+                 j7 = 7 * stride;
+    vector unsigned char v_srcA0, v_srcA1, v_srcA2, v_srcA3,
+                         v_srcA4, v_srcA5, v_srcA6, v_srcA7;
 
     v_dcOffset = vec_splat(v_data, 0);
     v_dcThreshold = (vector unsigned short)vec_splat(v_data, 1);
@@ -92,23 +103,21 @@ static inline int vertClassify_altivec(uint8_t src[], int stride, PPContext *c)
 
     src2 += stride * 4;
 
-
 #define LOAD_LINE(i) \
-    register int j##i = i * stride; \
+    { \
     vector unsigned char perm##i = vec_lvsl(j##i, src2); \
-    const vector unsigned char v_srcA1##i = vec_ld(j##i, src2); \
     vector unsigned char v_srcA2##i; \
+    vector unsigned char v_srcA1##i = vec_ld(j##i, src2); \
     if (two_vectors) \
         v_srcA2##i = vec_ld(j##i + 16, src2); \
-    const vector unsigned char v_srcA##i = \
+    v_srcA##i = \
        vec_perm(v_srcA1##i, v_srcA2##i, perm##i); \
     v_srcAss##i = \
        (vector signed short)vec_mergeh((vector signed char)zero, \
-                                       (vector signed char)v_srcA##i)
+                                       (vector signed char)v_srcA##i); }
 
 #define LOAD_LINE_ALIGNED(i) \
-    register int j##i = i * stride; \
-    const vector unsigned char v_srcA##i = vec_ld(j##i, src2); \
+    v_srcA##i = vec_ld(j##i, src2); \
     v_srcAss##i = \
        (vector signed short)vec_mergeh((vector signed char)zero, \
                                        (vector signed char)v_srcA##i)
@@ -146,16 +155,26 @@ static inline int vertClassify_altivec(uint8_t src[], int stride, PPContext *c)
     const vector signed short v_comp##i = \
         (vector signed short)vec_cmplt((vector unsigned short)v_sum##i, \
                                        v_dcThreshold); \
-    const vector signed short v_part##i = vec_and(mask, v_comp##i); \
-    v_numEq = vec_sum4s(v_part##i, v_numEq);
-
-    ITER(0, 1);
-    ITER(1, 2);
-    ITER(2, 3);
-    ITER(3, 4);
-    ITER(4, 5);
-    ITER(5, 6);
-    ITER(6, 7);
+    const vector signed short v_part##i = vec_and(mask, v_comp##i);
+
+    {
+    ITER(0, 1)
+    ITER(1, 2)
+    ITER(2, 3)
+    ITER(3, 4)
+    ITER(4, 5)
+    ITER(5, 6)
+    ITER(6, 7)
+
+    v_numEq = vec_sum4s(v_part0, v_numEq);
+    v_numEq = vec_sum4s(v_part1, v_numEq);
+    v_numEq = vec_sum4s(v_part2, v_numEq);
+    v_numEq = vec_sum4s(v_part3, v_numEq);
+    v_numEq = vec_sum4s(v_part4, v_numEq);
+    v_numEq = vec_sum4s(v_part5, v_numEq);
+    v_numEq = vec_sum4s(v_part6, v_numEq);
+    }
+
 #undef ITER
 
     v_numEq = vec_sums(v_numEq, zero);
@@ -203,21 +222,31 @@ static inline void doVertLowPass_altivec(uint8_t *src, int stride, PPContext *c)
     const vector signed int zero = vec_splat_s32(0);
     const int properStride = (stride % 16);
     const int srcAlign = ((unsigned long)src2 % 16);
-    DECLARE_ALIGNED(16, short, qp[8]);
-    qp[0] = c->QP;
+    DECLARE_ALIGNED(16, short, qp[8]) = {c->QP};
     vector signed short vqp = vec_ld(0, qp);
-    vqp = vec_splat(vqp, 0);
-
-    src2 += stride*3;
-
     vector signed short vb0, vb1, vb2, vb3, vb4, vb5, vb6, vb7, vb8, vb9;
     vector unsigned char vbA0, vbA1, vbA2, vbA3, vbA4, vbA5, vbA6, vbA7, vbA8, vbA9;
     vector unsigned char vbB0, vbB1, vbB2, vbB3, vbB4, vbB5, vbB6, vbB7, vbB8, vbB9;
     vector unsigned char vbT0, vbT1, vbT2, vbT3, vbT4, vbT5, vbT6, vbT7, vbT8, vbT9;
+    vector unsigned char perml0, perml1, perml2, perml3, perml4,
+                         perml5, perml6, perml7, perml8, perml9;
+    register int j0 = 0,
+                 j1 = stride,
+                 j2 = 2 * stride,
+                 j3 = 3 * stride,
+                 j4 = 4 * stride,
+                 j5 = 5 * stride,
+                 j6 = 6 * stride,
+                 j7 = 7 * stride,
+                 j8 = 8 * stride,
+                 j9 = 9 * stride;
+
+    vqp = vec_splat(vqp, 0);
+
+    src2 += stride*3;
 
 #define LOAD_LINE(i) \
-    const vector unsigned char perml##i = \
-        vec_lvsl(i * stride, src2); \
+    perml##i = vec_lvsl(i * stride, src2); \
     vbA##i = vec_ld(i * stride, src2); \
     vbB##i = vec_ld(i * stride + 16, src2); \
     vbT##i = vec_perm(vbA##i, vbB##i, perml##i); \
@@ -226,7 +255,6 @@ static inline void doVertLowPass_altivec(uint8_t *src, int stride, PPContext *c)
                                         (vector unsigned char)vbT##i)
 
 #define LOAD_LINE_ALIGNED(i) \
-    register int j##i = i * stride; \
     vbT##i = vec_ld(j##i, src2); \
     vb##i = \
         (vector signed short)vec_mergeh((vector signed char)zero, \
@@ -235,7 +263,7 @@ static inline void doVertLowPass_altivec(uint8_t *src, int stride, PPContext *c)
     /* Special-casing the aligned case is worthwhile, as all calls from
      * the (transposed) horizontable deblocks will be aligned, in addition
      * to the naturally aligned vertical deblocks. */
-    if (properStride && srcAlign) {
+    if (properStride && srcAlign) {
         LOAD_LINE_ALIGNED(0);
         LOAD_LINE_ALIGNED(1);
         LOAD_LINE_ALIGNED(2);
@@ -246,7 +274,7 @@ static inline void doVertLowPass_altivec(uint8_t *src, int stride, PPContext *c)
         LOAD_LINE_ALIGNED(7);
         LOAD_LINE_ALIGNED(8);
         LOAD_LINE_ALIGNED(9);
-    } else {
+    } else {
         LOAD_LINE(0);
         LOAD_LINE(1);
         LOAD_LINE(2);
@@ -257,76 +285,76 @@ static inline void doVertLowPass_altivec(uint8_t *src, int stride, PPContext *c)
         LOAD_LINE(7);
         LOAD_LINE(8);
         LOAD_LINE(9);
-    }
+    }
 #undef LOAD_LINE
 #undef LOAD_LINE_ALIGNED
-
-    const vector unsigned short v_2 = vec_splat_u16(2);
-    const vector unsigned short v_4 = vec_splat_u16(4);
-
-    const vector signed short v_diff01 = vec_sub(vb0, vb1);
-    const vector unsigned short v_cmp01 =
-        (const vector unsigned short) vec_cmplt(vec_abs(v_diff01), vqp);
-    const vector signed short v_first = vec_sel(vb1, vb0, v_cmp01);
-    const vector signed short v_diff89 = vec_sub(vb8, vb9);
-    const vector unsigned short v_cmp89 =
-        (const vector unsigned short) vec_cmplt(vec_abs(v_diff89), vqp);
-    const vector signed short v_last = vec_sel(vb8, vb9, v_cmp89);
-
-    const vector signed short temp01 = vec_mladd(v_first, (vector signed short)v_4, vb1);
-    const vector signed short temp02 = vec_add(vb2, vb3);
-    const vector signed short temp03 = vec_add(temp01, (vector signed short)v_4);
-    const vector signed short v_sumsB0 = vec_add(temp02, temp03);
-
-    const vector signed short temp11 = vec_sub(v_sumsB0, v_first);
-    const vector signed short v_sumsB1 = vec_add(temp11, vb4);
-
-    const vector signed short temp21 = vec_sub(v_sumsB1, v_first);
-    const vector signed short v_sumsB2 = vec_add(temp21, vb5);
-
-    const vector signed short temp31 = vec_sub(v_sumsB2, v_first);
-    const vector signed short v_sumsB3 = vec_add(temp31, vb6);
-
-    const vector signed short temp41 = vec_sub(v_sumsB3, v_first);
-    const vector signed short v_sumsB4 = vec_add(temp41, vb7);
-
-    const vector signed short temp51 = vec_sub(v_sumsB4, vb1);
-    const vector signed short v_sumsB5 = vec_add(temp51, vb8);
-
-    const vector signed short temp61 = vec_sub(v_sumsB5, vb2);
-    const vector signed short v_sumsB6 = vec_add(temp61, v_last);
-
-    const vector signed short temp71 = vec_sub(v_sumsB6, vb3);
-    const vector signed short v_sumsB7 = vec_add(temp71, v_last);
-
-    const vector signed short temp81 = vec_sub(v_sumsB7, vb4);
-    const vector signed short v_sumsB8 = vec_add(temp81, v_last);
-
-    const vector signed short temp91 = vec_sub(v_sumsB8, vb5);
-    const vector signed short v_sumsB9 = vec_add(temp91, v_last);
-
-#define COMPUTE_VR(i, j, k) \
-    const vector signed short temps1##i = \
-        vec_add(v_sumsB##i, v_sumsB##k); \
-    const vector signed short temps2##i = \
-        vec_mladd(vb##j, (vector signed short)v_2, temps1##i); \
-    const vector signed short vr##j = vec_sra(temps2##i, v_4)
-
-    COMPUTE_VR(0, 1, 2);
-    COMPUTE_VR(1, 2, 3);
-    COMPUTE_VR(2, 3, 4);
-    COMPUTE_VR(3, 4, 5);
-    COMPUTE_VR(4, 5, 6);
-    COMPUTE_VR(5, 6, 7);
-    COMPUTE_VR(6, 7, 8);
-    COMPUTE_VR(7, 8, 9);
-
-    const vector signed char neg1 = vec_splat_s8(-1);
-    const vector unsigned char permHH = (const vector unsigned char)AVV(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
-                                                                        0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F);
+    {
+        const vector unsigned short v_2 = vec_splat_u16(2);
+        const vector unsigned short v_4 = vec_splat_u16(4);
+
+        const vector signed short v_diff01 = vec_sub(vb0, vb1);
+        const vector unsigned short v_cmp01 =
+            (const vector unsigned short) vec_cmplt(vec_abs(v_diff01), vqp);
+        const vector signed short v_first = vec_sel(vb1, vb0, v_cmp01);
+        const vector signed short v_diff89 = vec_sub(vb8, vb9);
+        const vector unsigned short v_cmp89 =
+            (const vector unsigned short) vec_cmplt(vec_abs(v_diff89), vqp);
+        const vector signed short v_last = vec_sel(vb8, vb9, v_cmp89);
+
+        const vector signed short temp01 = vec_mladd(v_first, (vector signed short)v_4, vb1);
+        const vector signed short temp02 = vec_add(vb2, vb3);
+        const vector signed short temp03 = vec_add(temp01, (vector signed short)v_4);
+        const vector signed short v_sumsB0 = vec_add(temp02, temp03);
+
+        const vector signed short temp11 = vec_sub(v_sumsB0, v_first);
+        const vector signed short v_sumsB1 = vec_add(temp11, vb4);
+
+        const vector signed short temp21 = vec_sub(v_sumsB1, v_first);
+        const vector signed short v_sumsB2 = vec_add(temp21, vb5);
+
+        const vector signed short temp31 = vec_sub(v_sumsB2, v_first);
+        const vector signed short v_sumsB3 = vec_add(temp31, vb6);
+
+        const vector signed short temp41 = vec_sub(v_sumsB3, v_first);
+        const vector signed short v_sumsB4 = vec_add(temp41, vb7);
+
+        const vector signed short temp51 = vec_sub(v_sumsB4, vb1);
+        const vector signed short v_sumsB5 = vec_add(temp51, vb8);
+
+        const vector signed short temp61 = vec_sub(v_sumsB5, vb2);
+        const vector signed short v_sumsB6 = vec_add(temp61, v_last);
+
+        const vector signed short temp71 = vec_sub(v_sumsB6, vb3);
+        const vector signed short v_sumsB7 = vec_add(temp71, v_last);
+
+        const vector signed short temp81 = vec_sub(v_sumsB7, vb4);
+        const vector signed short v_sumsB8 = vec_add(temp81, v_last);
+
+        const vector signed short temp91 = vec_sub(v_sumsB8, vb5);
+        const vector signed short v_sumsB9 = vec_add(temp91, v_last);
+
+    #define COMPUTE_VR(i, j, k) \
+        const vector signed short temps1##i = \
+            vec_add(v_sumsB##i, v_sumsB##k); \
+        const vector signed short temps2##i = \
+            vec_mladd(vb##j, (vector signed short)v_2, temps1##i); \
+        const vector signed short vr##j = vec_sra(temps2##i, v_4)
+
+        COMPUTE_VR(0, 1, 2);
+        COMPUTE_VR(1, 2, 3);
+        COMPUTE_VR(2, 3, 4);
+        COMPUTE_VR(3, 4, 5);
+        COMPUTE_VR(4, 5, 6);
+        COMPUTE_VR(5, 6, 7);
+        COMPUTE_VR(6, 7, 8);
+        COMPUTE_VR(7, 8, 9);
+
+        const vector signed char neg1 = vec_splat_s8(-1);
+        const vector unsigned char permHH = (const vector unsigned char)AVV(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+                                                                            0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F);
 
 #define PACK_AND_STORE(i) \
-    const vector unsigned char perms##i = \
+{   const vector unsigned char perms##i = \
         vec_lvsr(i * stride, src2); \
     const vector unsigned char vf##i = \
         vec_packsu(vr##i, (vector signed short)zero); \
@@ -341,39 +369,40 @@ static inline void doVertLowPass_altivec(uint8_t *src, int stride, PPContext *c)
     const vector unsigned char svB##i = \
         vec_sel(vg2##i, vbB##i, mask##i); \
     vec_st(svA##i, i * stride, src2); \
-    vec_st(svB##i, i * stride + 16, src2)
+    vec_st(svB##i, i * stride + 16, src2);}
 
 #define PACK_AND_STORE_ALIGNED(i) \
-    const vector unsigned char vf##i = \
+{   const vector unsigned char vf##i = \
         vec_packsu(vr##i, (vector signed short)zero); \
     const vector unsigned char vg##i = \
         vec_perm(vf##i, vbT##i, permHH); \
-    vec_st(vg##i, i * stride, src2)
-
-    /* Special-casing the aligned case is worthwhile, as all calls from
-     * the (transposed) horizontable deblocks will be aligned, in addition
-     * to the naturally aligned vertical deblocks. */
-    if (properStride && srcAlign) {
-        PACK_AND_STORE_ALIGNED(1);
-        PACK_AND_STORE_ALIGNED(2);
-        PACK_AND_STORE_ALIGNED(3);
-        PACK_AND_STORE_ALIGNED(4);
-        PACK_AND_STORE_ALIGNED(5);
-        PACK_AND_STORE_ALIGNED(6);
-        PACK_AND_STORE_ALIGNED(7);
-        PACK_AND_STORE_ALIGNED(8);
-    } else {
-        PACK_AND_STORE(1);
-        PACK_AND_STORE(2);
-        PACK_AND_STORE(3);
-        PACK_AND_STORE(4);
-        PACK_AND_STORE(5);
-        PACK_AND_STORE(6);
-        PACK_AND_STORE(7);
-        PACK_AND_STORE(8);
+    vec_st(vg##i, i * stride, src2);}
+
+        /* Special-casing the aligned case is worthwhile, as all calls from
+         * the (transposed) horizontable deblocks will be aligned, in addition
+         * to the naturally aligned vertical deblocks. */
+        if (properStride && srcAlign) {
+            PACK_AND_STORE_ALIGNED(1)
+            PACK_AND_STORE_ALIGNED(2)
+            PACK_AND_STORE_ALIGNED(3)
+            PACK_AND_STORE_ALIGNED(4)
+            PACK_AND_STORE_ALIGNED(5)
+            PACK_AND_STORE_ALIGNED(6)
+            PACK_AND_STORE_ALIGNED(7)
+            PACK_AND_STORE_ALIGNED(8)
+        } else {
+            PACK_AND_STORE(1)
+            PACK_AND_STORE(2)
+            PACK_AND_STORE(3)
+            PACK_AND_STORE(4)
+            PACK_AND_STORE(5)
+            PACK_AND_STORE(6)
+            PACK_AND_STORE(7)
+            PACK_AND_STORE(8)
+        }
+    #undef PACK_AND_STORE
+    #undef PACK_AND_STORE_ALIGNED
     }
-#undef PACK_AND_STORE
-#undef PACK_AND_STORE_ALIGNED
 }
 
 
@@ -387,12 +416,11 @@ static inline void doVertDefFilter_altivec(uint8_t src[], int stride, PPContext
     can be removed by assuming proper alignment of
     src & stride :-(
     */
-    uint8_t *src2 = src;
+    uint8_t *src2 = src + stride*3;
     const vector signed int zero = vec_splat_s32(0);
-    DECLARE_ALIGNED(16, short, qp[8]);
-    qp[0] = 8*c->QP;
-    vector signed short vqp = vec_ld(0, qp);
-    vqp = vec_splat(vqp, 0);
+    DECLARE_ALIGNED(16, short, qp[8]) = {8*c->QP};
+    vector signed short vqp = vec_splat(
+        (vector signed short)vec_ld(0, qp), 0);
 
 #define LOAD_LINE(i) \
     const vector unsigned char perm##i = \
@@ -407,8 +435,6 @@ static inline void doVertDefFilter_altivec(uint8_t src[], int stride, PPContext
         (vector signed short)vec_mergeh((vector unsigned char)zero, \
                                         (vector unsigned char)vbT##i)
 
-    src2 += stride*3;
-
     LOAD_LINE(1);
     LOAD_LINE(2);
     LOAD_LINE(3);
@@ -483,7 +509,7 @@ static inline void doVertDefFilter_altivec(uint8_t src[], int stride, PPContext
                                         0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F);
 
 #define STORE(i) \
-    const vector unsigned char perms##i = \
+{   const vector unsigned char perms##i = \
         vec_lvsr(i * stride, src2); \
     const vector unsigned char vg##i = \
         vec_perm(st##i, vbT##i, permHH); \
@@ -496,10 +522,10 @@ static inline void doVertDefFilter_altivec(uint8_t src[], int stride, PPContext
     const vector unsigned char svB##i = \
         vec_sel(vg2##i, vbB##i, mask##i); \
     vec_st(svA##i, i * stride, src2); \
-    vec_st(svB##i, i * stride + 16, src2)
+    vec_st(svB##i, i * stride + 16, src2);}
 
-    STORE(4);
-    STORE(5);
+    STORE(4)
+    STORE(5)
 }
 
 static inline void dering_altivec(uint8_t src[], int stride, PPContext *c) {
@@ -518,11 +544,11 @@ static inline void dering_altivec(uint8_t src[], int stride, PPContext *c) {
     dt[0] = deringThreshold;
     v_dt = vec_splat(vec_ld(0, dt), 0);
 
-#define LOAD_LINE(i) \
-    const vector unsigned char perm##i = \
-        vec_lvsl(i * stride, srcCopy); \
-    vector unsigned char sA##i = vec_ld(i * stride, srcCopy); \
-    vector unsigned char sB##i = vec_ld(i * stride + 16, srcCopy); \
+#define LOAD_LINE(i) \
+    const vector unsigned char perm##i = \
+        vec_lvsl(i * stride, srcCopy); \
+    vector unsigned char sA##i = vec_ld(i * stride, srcCopy); \
+    vector unsigned char sB##i = vec_ld(i * stride + 16, srcCopy); \
     vector unsigned char src##i = vec_perm(sA##i, sB##i, perm##i)
 
     LOAD_LINE(0);
@@ -846,7 +872,7 @@ static inline void RENAME(tempNoiseReducer)(uint8_t *src, int stride,
 #undef LOAD_LINE
 
 #define ACCUMULATE_DIFFS(i) \
-    vector signed short v_d##i = vec_sub(v_tempBlurredAss##i, \
+    vector signed short v_d##i = vec_sub(v_tempBlurredAss##i, \
                                          v_srcAss##i); \
     v_dp = vec_msums(v_d##i, v_d##i, v_dp); \
     v_sysdp = vec_msums(v_d##i, vsint16_1, v_sysdp)
@@ -913,7 +939,7 @@ static inline void RENAME(tempNoiseReducer)(uint8_t *src, int stride,
 
 #define OP(i) \
     const vector signed short v_temp##i = \
-        vec_mladd(v_tempBlurredAss##i, \
+        vec_mladd(v_tempBlurredAss##i, \
                   vsint16_7, v_srcAss##i); \
     const vector signed short v_temp2##i = \
         vec_add(v_temp##i, vsint16_4); \
@@ -934,7 +960,7 @@ static inline void RENAME(tempNoiseReducer)(uint8_t *src, int stride,
 
 #define OP(i) \
     const vector signed short v_temp##i = \
-        vec_mladd(v_tempBlurredAss##i, \
+        vec_mladd(v_tempBlurredAss##i, \
                   vsint16_3, v_srcAss##i); \
     const vector signed short v_temp2##i = \
         vec_add(v_temp##i, vsint16_2); \
@@ -1029,16 +1055,16 @@ static inline void transpose_16x8_char_toPackedAlign_altivec(unsigned char* dst,
     vector unsigned char tempO = vec_mergeh(src7, zero);
     vector unsigned char tempP = vec_mergel(src7, zero);
 
-    vector unsigned char temp0 = vec_mergeh(tempA, tempI);
-    vector unsigned char temp1 = vec_mergel(tempA, tempI);
-    vector unsigned char temp2 = vec_mergeh(tempB, tempJ);
-    vector unsigned char temp3 = vec_mergel(tempB, tempJ);
-    vector unsigned char temp4 = vec_mergeh(tempC, tempK);
-    vector unsigned char temp5 = vec_mergel(tempC, tempK);
-    vector unsigned char temp6 = vec_mergeh(tempD, tempL);
-    vector unsigned char temp7 = vec_mergel(tempD, tempL);
-    vector unsigned char temp8 = vec_mergeh(tempE, tempM);
-    vector unsigned char temp9 = vec_mergel(tempE, tempM);
+    vector unsigned char temp0  = vec_mergeh(tempA, tempI);
+    vector unsigned char temp1  = vec_mergel(tempA, tempI);
+    vector unsigned char temp2  = vec_mergeh(tempB, tempJ);
+    vector unsigned char temp3  = vec_mergel(tempB, tempJ);
+    vector unsigned char temp4  = vec_mergeh(tempC, tempK);
+    vector unsigned char temp5  = vec_mergel(tempC, tempK);
+    vector unsigned char temp6  = vec_mergeh(tempD, tempL);
+    vector unsigned char temp7  = vec_mergel(tempD, tempL);
+    vector unsigned char temp8  = vec_mergeh(tempE, tempM);
+    vector unsigned char temp9  = vec_mergel(tempE, tempM);
     vector unsigned char temp10 = vec_mergeh(tempF, tempN);
     vector unsigned char temp11 = vec_mergel(tempF, tempN);
     vector unsigned char temp12 = vec_mergeh(tempG, tempO);
@@ -1063,16 +1089,16 @@ static inline void transpose_16x8_char_toPackedAlign_altivec(unsigned char* dst,
     tempO = vec_mergeh(temp7, temp15);
     tempP = vec_mergel(temp7, temp15);
 
-    temp0 = vec_mergeh(tempA, tempI);
-    temp1 = vec_mergel(tempA, tempI);
-    temp2 = vec_mergeh(tempB, tempJ);
-    temp3 = vec_mergel(tempB, tempJ);
-    temp4 = vec_mergeh(tempC, tempK);
-    temp5 = vec_mergel(tempC, tempK);
-    temp6 = vec_mergeh(tempD, tempL);
-    temp7 = vec_mergel(tempD, tempL);
-    temp8 = vec_mergeh(tempE, tempM);
-    temp9 = vec_mergel(tempE, tempM);
+    temp0  = vec_mergeh(tempA, tempI);
+    temp1  = vec_mergel(tempA, tempI);
+    temp2  = vec_mergeh(tempB, tempJ);
+    temp3  = vec_mergel(tempB, tempJ);
+    temp4  = vec_mergeh(tempC, tempK);
+    temp5  = vec_mergel(tempC, tempK);
+    temp6  = vec_mergeh(tempD, tempL);
+    temp7  = vec_mergel(tempD, tempL);
+    temp8  = vec_mergeh(tempE, tempM);
+    temp9  = vec_mergel(tempE, tempM);
     temp10 = vec_mergeh(tempF, tempN);
     temp11 = vec_mergel(tempF, tempN);
     temp12 = vec_mergeh(tempG, tempO);
@@ -1080,16 +1106,16 @@ static inline void transpose_16x8_char_toPackedAlign_altivec(unsigned char* dst,
     temp14 = vec_mergeh(tempH, tempP);
     temp15 = vec_mergel(tempH, tempP);
 
-    vec_st(temp0, 0, dst);
-    vec_st(temp1, 16, dst);
-    vec_st(temp2, 32, dst);
-    vec_st(temp3, 48, dst);
-    vec_st(temp4, 64, dst);
-    vec_st(temp5, 80, dst);
-    vec_st(temp6, 96, dst);
-    vec_st(temp7, 112, dst);
-    vec_st(temp8, 128, dst);
-    vec_st(temp9, 144, dst);
+    vec_st(temp0,    0, dst);
+    vec_st(temp1,   16, dst);
+    vec_st(temp2,   32, dst);
+    vec_st(temp3,   48, dst);
+    vec_st(temp4,   64, dst);
+    vec_st(temp5,   80, dst);
+    vec_st(temp6,   96, dst);
+    vec_st(temp7,  112, dst);
+    vec_st(temp8,  128, dst);
+    vec_st(temp9,  144, dst);
     vec_st(temp10, 160, dst);
    vec_st(temp11, 176, dst);
    vec_st(temp12, 192, dst);
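
The following is not part of the patch above: a minimal, compilable C sketch of the pattern the diff applies throughout the file, namely wrapping statement-like macros (LOAD_LINE, ITER, PACK_AND_STORE, ...) in their own { ... } block and hoisting index declarations (j0..j9, perml0..perml9, v_srcA0..v_srcA7) to the top of the enclosing scope, so the expanded code contains no declarations after statements. The names SUM_LINE, acc and j0..j3 are invented for this sketch.

/* c89_macro_block_sketch.c -- illustrative only, not from the patch. */
#include <stdio.h>

/* The macro keeps its temporary inside an explicit block, so repeated
 * expansions never introduce a declaration after executable code. */
#define SUM_LINE(i)                                 \
    {                                               \
        int tmp##i = src[j##i] + src[j##i + 1];     \
        acc += tmp##i;                              \
    }

int main(void)
{
    int src[8] = {1, 2, 3, 4, 5, 6, 7, 8};
    int acc = 0;
    /* All declarations first, mirroring the hoisted j0..j9 registers in
     * vertClassify_altivec() and doVertLowPass_altivec(). */
    int j0 = 0, j1 = 2, j2 = 4, j3 = 6;

    SUM_LINE(0)
    SUM_LINE(1)
    SUM_LINE(2)
    SUM_LINE(3)

    printf("%d\n", acc); /* prints 36 */
    return 0;
}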