; * Provide SSE luma and chroma mc functions for HEVC decoding
; * Copyright (c) 2013 Pierre-Edouard LEPERE
; *
; * This file is part of FFmpeg.
; *
; * FFmpeg is free software; you can redistribute it and/or
; * modify it under the terms of the GNU Lesser General Public
; * License as published by the Free Software Foundation; either
; * version 2.1 of the License, or (at your option) any later version.
; *
; * FFmpeg is distributed in the hope that it will be useful,
; * but WITHOUT ANY WARRANTY; without even the implied warranty of
; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
; * Lesser General Public License for more details.
; *
; * You should have received a copy of the GNU Lesser General Public
; * License along with FFmpeg; if not, write to the Free Software
; * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA

%include "libavutil/x86/x86util.asm"

%define pw_bi_10      pw_1024
%define pw_bi_12      pw_4096
%define max_pixels_8  pw_255
%define max_pixels_10 pw_1023
pw_bi_8:       times 16 dw (1 << 8)
max_pixels_12: times 16 dw ((1 << 12)-1)
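; The pw_bi_* constants are pmulhrsw multipliers: pmulhrsw(x, 1 << n) computes
; (x + (1 << (14 - n))) >> (15 - n), i.e. the rounded right shift by
; (14 - bitd + 1) used when averaging the two bi-prediction sources.
; The max_pixels_* constants are the upper clip bounds for each bit depth.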
hevc_epel_filters_%4_%1 times %2 d%3 -2, 58

EPEL_TABLE  8,16, b, avx2
EPEL_TABLE 10, 8, w, avx2

EPEL_TABLE  8, 8, b, sse4
EPEL_TABLE 10, 4, w, sse4
EPEL_TABLE 12, 4, w, sse4

hevc_qpel_filters_%4_%1 times %2 d%3 -1, 4

QPEL_TABLE  8, 8, b, sse4
QPEL_TABLE 10, 4, w, sse4
QPEL_TABLE 12, 4, w, sse4

QPEL_TABLE  8,16, b, avx2
QPEL_TABLE 10, 8, w, avx2

%define MAX_PB_SIZE 64

%define hevc_qpel_filters_sse4_14 hevc_qpel_filters_sse4_10

%define hevc_qpel_filters_avx2_14 hevc_qpel_filters_avx2_10
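; Bit depth 14 denotes the 16-bit intermediate plane produced by the first
; pass of the hv filters; it uses the same word-sized coefficients as the
; 10-bit case, hence the aliases above. MAX_PB_SIZE is the widest HEVC
; prediction block, so intermediate rows are spaced 2*MAX_PB_SIZE bytes apart.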
%macro SIMPLE_BILOAD 4   ;width, tab, r1, r2
    movq             %3, [%2]         ; load data from source2
    movdqa           %3, [%2]         ; load data from source2
    movdqa           %3, [%2]         ; load data from source2
    movq             %4, [%2+16]      ; load data from source2
    movdqa           %3, [%2]         ; load data from source2
    movdqa           %4, [%2+16]      ; load data from source2

%macro SIMPLE_LOAD 4    ;width, bitd, tab, r1
%if %1 == 2 || (%2 == 8 && %1 <= 4)
    movd             %4, [%3]         ; load data from source
%elif %1 == 4 || (%2 == 8 && %1 <= 8)
    movq             %4, [%3]         ; load data from source
%elif notcpuflag(avx)
    movu             %4, [%3]         ; load data from source
%elif %1 <= 8 || (%2 == 8 && %1 <= 16)
%macro EPEL_FILTER 2-4 ; bit depth, filter index[, tap pair reg1, tap pair reg2]
    lea              rfilterq, [hevc_epel_filters_avx2_%1]
    %define rfilterq hevc_epel_filters_avx2_%1
    lea              rfilterq, [hevc_epel_filters_sse4_%1]
    %define rfilterq hevc_epel_filters_sse4_%1
%endif ;cpuflag(avx2)
    shl              %2q, 6                         ; multiply by 64
    shl              %2q, 5                         ; multiply by 32
    mova             m14, [rfilterq + %2q]          ; get first two filter taps
    mova             m15, [rfilterq + %2q+%%offset] ; get last two filter taps
    mova             %3, [rfilterq + %2q]           ; get first two filter taps
    mova             %4, [rfilterq + %2q+%%offset]  ; get last two filter taps
%macro EPEL_HV_FILTER 1
    %define %%table hevc_epel_filters_avx2_%1
    %define %%table hevc_epel_filters_sse4_%1
    lea              rfilterq, [%%table]
    %define rfilterq %%table
    shl              mxq, %%shift                   ; multiply by (1 << %%shift)
    shl              myq, %%shift                   ; multiply by (1 << %%shift)
    mova             m14, [rfilterq + mxq]          ; get first two horizontal taps
    mova             m15, [rfilterq + mxq+%%offset] ; get last two horizontal taps
    lea              r3srcq, [srcstrideq*3]
    %define %%table hevc_epel_filters_avx2_10
    %define %%table hevc_epel_filters_sse4_10
    lea              rfilterq, [%%table]
    %define rfilterq %%table
    mova             m12, [rfilterq + myq]          ; get first two vertical taps
    mova             m13, [rfilterq + myq+%%offset] ; get last two vertical taps
    %define %%table hevc_qpel_filters_avx2_%1
    %define %%table hevc_qpel_filters_sse4_%1
    lea              rfilterq, [%%table]
    %define rfilterq %%table
    shl              %2q, %%shift                     ; multiply by (1 << %%shift)
    mova             m12, [rfilterq + %2q]            ; get filter taps 1-2
    mova             m13, [rfilterq + %2q + %%offset] ; get filter taps 3-4
    mova             m14, [rfilterq + %2q + 2*%%offset] ; get filter taps 5-6
    mova             m15, [rfilterq + %2q + 3*%%offset] ; get filter taps 7-8
%if (%1 == 8 && %4 <= 4)
%elif (%1 == 8 && %4 <= 8) || (%1 > 8 && %4 <= 4)
    %define %%load movdqu

    %%load           m0, [rfilterq ]
    %%load           m1, [rfilterq+  %3]
    %%load           m2, [rfilterq+2*%3]
    %%load           m3, [rfilterq+3*%3]
    %%load           m1, [rfilterq+  %3q]
    %%load           m2, [rfilterq+2*%3q]
    %%load           m3, [rfilterq+r3srcq]

    SBUTTERFLY       bw, 0, 1, 7
    SBUTTERFLY       bw, 2, 3, 7
    SBUTTERFLY       wd, 0, 1, 7
    SBUTTERFLY       wd, 2, 3, 7
%assign %%stride (%1+7)/8
    %%load           m0, [%2-3*%%stride]   ; load data from source
    %%load           m1, [%2-2*%%stride]
    %%load           m2, [%2-%%stride  ]
    %%load           m4, [%2+%%stride  ]
    %%load           m5, [%2+2*%%stride]
    %%load           m6, [%2+3*%%stride]
    %%load           m7, [%2+4*%%stride]
    SBUTTERFLY       wd, 0, 1, %4
    SBUTTERFLY       wd, 2, 3, %4
    SBUTTERFLY       wd, 4, 5, %4
    SBUTTERFLY       wd, 6, 7, %4
    SBUTTERFLY       dq, 0, 1, %4
    SBUTTERFLY       dq, 2, 3, %4
    SBUTTERFLY       dq, 4, 5, %4
    SBUTTERFLY       dq, 6, 7, %4
    movu             m0, [%5q ]        ; load x - 3*srcstride
    movu             m1, [%5q+   %3q ] ; load x - 2*srcstride
    movu             m2, [%5q+ 2*%3q ] ; load x - srcstride
    movu             m3, [%2       ]   ; load x
    movu             m4, [%2+   %3q]   ; load x + srcstride
    movu             m5, [%2+ 2*%3q]   ; load x + 2*srcstride
    movu             m6, [%2+r3srcq]   ; load x + 3*srcstride
    movu             m7, [%2+ 4*%3q]   ; load x + 4*srcstride
    SBUTTERFLY       bw, 0, 1, 8
    SBUTTERFLY       bw, 2, 3, 8
    SBUTTERFLY       bw, 4, 5, 8
    SBUTTERFLY       bw, 6, 7, 8
    SBUTTERFLY       wd, 0, 1, 8
    SBUTTERFLY       wd, 2, 3, 8
    SBUTTERFLY       wd, 4, 5, 8
    SBUTTERFLY       wd, 6, 7, 8
%macro PEL_12STORE2 3
%macro PEL_12STORE4 3
%macro PEL_12STORE6 3
%macro PEL_12STORE8 3
%macro PEL_12STORE12 3
%macro PEL_12STORE16 3
    PEL_12STORE8     %1, %2, %3
%macro PEL_10STORE2 3
%macro PEL_10STORE4 3
%macro PEL_10STORE6 3
%macro PEL_10STORE8 3
%macro PEL_10STORE12 3
%macro PEL_10STORE16 3
    PEL_10STORE8     %1, %2, %3
%macro PEL_10STORE32 3
    PEL_10STORE16    %1, %2, %3
%macro PEL_8STORE12 3
%macro PEL_8STORE16 3
%macro PEL_8STORE32 3
    add              %1q, 2*MAX_PB_SIZE ; dst += dststride (2*MAX_PB_SIZE bytes)
    add              %2q, %3q           ; src += srcstride
    dec              heightd            ; cmp height
    jnz              .loop              ; height loop
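; LOOP_END advances the int16_t destination by one row of the fixed-stride
; intermediate buffer and the source by srcstride, looping until height is 0.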
%macro MC_PIXEL_COMPUTE 2-3 ;width, bitdepth
%if cpuflag(avx2) && %0 == 3
    vextracti128     xm1, m0, 1

%macro EPEL_COMPUTE 4-8 ; bitdepth, width, filter1, filter2[, HV flag or regs replacing m0, m2, m1, m3]
%if cpuflag(avx2) && (%0 == 5)
    vextracti128     xm10, m0, 1
    vinserti128      m10, m1, xm10, 0
    vinserti128      m0, m0, xm1, 1
    vextracti128     xm10, m2, 1
    vinserti128      m10, m3, xm10, 0
    vinserti128      m2, m2, xm3, 1
    pmaddubsw        %%reg0, %3   ; x1*c1+x2*c2
    pmaddubsw        %%reg2, %4   ; x3*c3+x4*c4
    packssdw         %%reg0, %%reg1
%macro QPEL_HV_COMPUTE 4 ; width, bitdepth, filter idx, pack op suffix (used as p%4)
    %define %%table hevc_qpel_filters_avx2_%2
    %define %%table hevc_qpel_filters_sse4_%2
    lea              rfilterq, [%%table]
    %define rfilterq %%table
    pmaddubsw        m0, [rfilterq + %3q*8           ] ; x1*c1+x2*c2
    pmaddubsw        m2, [rfilterq + %3q*8+  %%offset] ; x3*c3+x4*c4
    pmaddubsw        m4, [rfilterq + %3q*8+2*%%offset] ; x5*c5+x6*c6
    pmaddubsw        m6, [rfilterq + %3q*8+3*%%offset] ; x7*c7+x8*c8
    pmaddwd          m0, [rfilterq + %3q*8           ]
    pmaddwd          m2, [rfilterq + %3q*8+  %%offset]
    pmaddwd          m4, [rfilterq + %3q*8+2*%%offset]
    pmaddwd          m6, [rfilterq + %3q*8+3*%%offset]
    pmaddwd          m1, [rfilterq + %3q*8           ]
    pmaddwd          m3, [rfilterq + %3q*8+  %%offset]
    pmaddwd          m5, [rfilterq + %3q*8+2*%%offset]
    pmaddwd          m7, [rfilterq + %3q*8+3*%%offset]
%macro QPEL_COMPUTE 2-3 ; width, bitdepth
%if cpuflag(avx2) && (%0 == 3)
    vextracti128     xm10, m0, 1
    vinserti128      m10, m1, xm10, 0
    vinserti128      m0, m0, xm1, 1
    vextracti128     xm10, m2, 1
    vinserti128      m10, m3, xm10, 0
    vinserti128      m2, m2, xm3, 1
    vextracti128     xm10, m4, 1
    vinserti128      m10, m5, xm10, 0
    vinserti128      m4, m4, xm5, 1
    vextracti128     xm10, m6, 1
    vinserti128      m10, m7, xm10, 0
    vinserti128      m6, m6, xm7, 1
    pmaddubsw        m0, m12   ; x1*c1+x2*c2
    pmaddubsw        m2, m13   ; x3*c3+x4*c4
    pmaddubsw        m4, m14   ; x5*c5+x6*c6
    pmaddubsw        m6, m15   ; x7*c7+x8*c8
%macro BI_COMPUTE 7-8 ; width, bitd, src1l, src1h, src2l, src2h, pw
    UNI_COMPUTE      %1, %2, %3, %4, %7
%if %0 == 8 && cpuflag(avx2) && (%2 == 8)
%if %1 > 8 || (%2 > 8 && %1 > 4)
    CLIPW            %3, [pb_0], [max_pixels_%2]
%if (%1 > 8 && notcpuflag(avx)) || %1 > 16
    CLIPW            %4, [pb_0], [max_pixels_%2]
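; BI_COMPUTE adds the two 14-bit predictions, then UNI_COMPUTE applies the
; rounded downshift (pmulhrsw with the pw_bi_* constant above) and CLIPW
; clamps the result to [0, max_pixels_bitd].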
; ******************************
; void put_hevc_mc_pixels(int16_t *dst, ptrdiff_t dststride,
;                         uint8_t *_src, ptrdiff_t _srcstride,
;                         int height, int mx, int my)
; ******************************
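; pel_pixels does no interpolation: each source pixel is shifted left by
; (14 - bit depth) into the 16-bit intermediate plane, i.e.
; dst[x] = src[x] << (14 - bitd).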
%macro HEVC_PUT_HEVC_PEL_PIXELS 2
    HEVC_PEL_PIXELS     %1, %2
    HEVC_UNI_PEL_PIXELS %1, %2
    HEVC_BI_PEL_PIXELS  %1, %2

%macro HEVC_PEL_PIXELS 2
cglobal hevc_put_hevc_pel_pixels%1_%2, 4, 4, 3, dst, src, srcstride, height
    SIMPLE_LOAD      %1, %2, srcq, m0
    MC_PIXEL_COMPUTE %1, %2, 1
    PEL_10STORE%1    dstq, m0, m1
    LOOP_END         dst, src, srcstride
%macro HEVC_UNI_PEL_PIXELS 2
cglobal hevc_put_hevc_uni_pel_pixels%1_%2, 5, 5, 2, dst, dststride, src, srcstride, height
    SIMPLE_LOAD      %1, %2, srcq, m0
    PEL_%2STORE%1    dstq, m0, m1
    add              dstq, dststrideq   ; dst += dststride
    add              srcq, srcstrideq   ; src += srcstride
    dec              heightd            ; cmp height
    jnz              .loop              ; height loop
%macro HEVC_BI_PEL_PIXELS 2
cglobal hevc_put_hevc_bi_pel_pixels%1_%2, 6, 6, 6, dst, dststride, src, srcstride, src2, height
    movdqa           m5, [pw_bi_%2]
    SIMPLE_LOAD      %1, %2, srcq, m0
    SIMPLE_BILOAD    %1, src2q, m3, m4
    MC_PIXEL_COMPUTE %1, %2, 1
    BI_COMPUTE       %1, %2, m0, m1, m3, m4, m5, 1
    PEL_%2STORE%1    dstq, m0, m1
    add              dstq, dststrideq     ; dst += dststride
    add              srcq, srcstrideq     ; src += srcstride
    add              src2q, 2*MAX_PB_SIZE ; src2 += intermediate stride
    dec              heightd              ; cmp height
    jnz              .loop                ; height loop
; ******************************
; void put_hevc_epel_hX(int16_t *dst, ptrdiff_t dststride,
;                       uint8_t *_src, ptrdiff_t _srcstride,
;                       int width, int height, int mx, int my)
; ******************************
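; EPEL is the 4-tap HEVC chroma interpolation filter; mx selects one of the
; seven fractional-position filter sets, loaded as two packed tap pairs
; (m4/m5) and applied with pmaddubsw (8-bit) or pmaddwd (>8-bit).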
%macro HEVC_PUT_HEVC_EPEL 2
cglobal hevc_put_hevc_epel_h%1_%2, 5, 6, 11, dst, src, srcstride, height, mx, rfilter
%assign %%stride ((%2 + 7)/8)
    EPEL_FILTER      %2, mx, m4, m5
    EPEL_LOAD        %2, srcq-%%stride, %%stride, %1
    EPEL_COMPUTE     %2, %1, m4, m5, 1
    PEL_10STORE%1    dstq, m0, m1
    LOOP_END         dst, src, srcstride
cglobal hevc_put_hevc_uni_epel_h%1_%2, 6, 7, 11, dst, dststride, src, srcstride, height, mx, rfilter
%assign %%stride ((%2 + 7)/8)
    EPEL_FILTER      %2, mx, m4, m5
    EPEL_LOAD        %2, srcq-%%stride, %%stride, %1
    EPEL_COMPUTE     %2, %1, m4, m5
    UNI_COMPUTE      %1, %2, m0, m1, m6
    PEL_%2STORE%1    dstq, m0, m1
    add              dstq, dststrideq   ; dst += dststride
    add              srcq, srcstrideq   ; src += srcstride
    dec              heightd            ; cmp height
    jnz              .loop              ; height loop
cglobal hevc_put_hevc_bi_epel_h%1_%2, 7, 8, 11, dst, dststride, src, srcstride, src2, height, mx, rfilter
    movdqa           m6, [pw_bi_%2]
    EPEL_FILTER      %2, mx, m4, m5
    EPEL_LOAD        %2, srcq-%%stride, %%stride, %1
    EPEL_COMPUTE     %2, %1, m4, m5, 1
    SIMPLE_BILOAD    %1, src2q, m2, m3
    BI_COMPUTE       %1, %2, m0, m1, m2, m3, m6, 1
    PEL_%2STORE%1    dstq, m0, m1
    add              dstq, dststrideq     ; dst += dststride
    add              srcq, srcstrideq     ; src += srcstride
    add              src2q, 2*MAX_PB_SIZE ; src2 += intermediate stride
    dec              heightd              ; cmp height
    jnz              .loop                ; height loop
; ******************************
; void put_hevc_epel_v(int16_t *dst, ptrdiff_t dststride,
;                      uint8_t *_src, ptrdiff_t _srcstride,
;                      int width, int height, int mx, int my)
; ******************************
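; The vertical variant applies the same 4-tap filter across rows; r3src
; caches 3*srcstride so the fourth input row can be addressed without an
; extra add.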
cglobal hevc_put_hevc_epel_v%1_%2, 6, 7, 11, dst, src, srcstride, height, r3src, my, rfilter
    lea              r3srcq, [srcstrideq*3]
    EPEL_FILTER      %2, my, m4, m5
    EPEL_LOAD        %2, srcq, srcstride, %1
    EPEL_COMPUTE     %2, %1, m4, m5, 1
    PEL_10STORE%1    dstq, m0, m1
    LOOP_END         dst, src, srcstride
cglobal hevc_put_hevc_uni_epel_v%1_%2, 7, 8, 11, dst, dststride, src, srcstride, height, r3src, my, rfilter
    lea              r3srcq, [srcstrideq*3]
    EPEL_FILTER      %2, my, m4, m5
    EPEL_LOAD        %2, srcq, srcstride, %1
    EPEL_COMPUTE     %2, %1, m4, m5
    UNI_COMPUTE      %1, %2, m0, m1, m6
    PEL_%2STORE%1    dstq, m0, m1
    add              dstq, dststrideq   ; dst += dststride
    add              srcq, srcstrideq   ; src += srcstride
    dec              heightd            ; cmp height
    jnz              .loop              ; height loop
cglobal hevc_put_hevc_bi_epel_v%1_%2, 8, 9, 11, dst, dststride, src, srcstride, src2, height, r3src, my, rfilter
    lea              r3srcq, [srcstrideq*3]
    movdqa           m6, [pw_bi_%2]
    EPEL_FILTER      %2, my, m4, m5
    EPEL_LOAD        %2, srcq, srcstride, %1
    EPEL_COMPUTE     %2, %1, m4, m5, 1
    SIMPLE_BILOAD    %1, src2q, m2, m3
    BI_COMPUTE       %1, %2, m0, m1, m2, m3, m6, 1
    PEL_%2STORE%1    dstq, m0, m1
    add              dstq, dststrideq     ; dst += dststride
    add              srcq, srcstrideq     ; src += srcstride
    add              src2q, 2*MAX_PB_SIZE ; src2 += intermediate stride
    dec              heightd              ; cmp height
    jnz              .loop                ; height loop
; ******************************
; void put_hevc_epel_hv(int16_t *dst, ptrdiff_t dststride,
;                       uint8_t *_src, ptrdiff_t _srcstride,
;                       int width, int height, int mx, int my)
; ******************************
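; hv filtering runs in two passes: each input row is filtered horizontally at
; the source bit depth, and the resulting 14-bit rows are then filtered
; vertically (the second EPEL_COMPUTE below is invoked with bit depth 14).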
%macro HEVC_PUT_HEVC_EPEL_HV 2
cglobal hevc_put_hevc_epel_hv%1_%2, 6, 8, 16, dst, src, srcstride, height, mx, my, r3src, rfilter
%assign %%stride ((%2 + 7)/8)
    EPEL_LOAD        %2, srcq-%%stride, %%stride, %1
    EPEL_COMPUTE     %2, %1, m14, m15
%if (%1 > 8 && (%2 == 8))
    EPEL_LOAD        %2, srcq-%%stride, %%stride, %1
    EPEL_COMPUTE     %2, %1, m14, m15
%if (%1 > 8 && (%2 == 8))
    EPEL_LOAD        %2, srcq-%%stride, %%stride, %1
    EPEL_COMPUTE     %2, %1, m14, m15
%if (%1 > 8 && (%2 == 8))
    EPEL_LOAD        %2, srcq-%%stride, %%stride, %1
    EPEL_COMPUTE     %2, %1, m14, m15
%if (%1 > 8 && (%2 == 8))
    EPEL_COMPUTE     14, %1, m12, m13
%if (%1 > 8 && (%2 == 8))
    punpcklwd        m2, m10, m11
    punpckhwd        m3, m10, m11
    EPEL_COMPUTE     14, %1, m12, m13, m4, m2, m8, m3
    vinserti128      m2, m0, xm4, 1
    vextracti128     xm3, m0, 1
    vinserti128      m3, m4, xm3, 0
    PEL_10STORE%1    dstq, m2, m3
    PEL_10STORE%1    dstq, m0, m4
    PEL_10STORE%1    dstq, m0, m1
%if (%1 > 8 && (%2 == 8))
    LOOP_END         dst, src, srcstride
cglobal hevc_put_hevc_uni_epel_hv%1_%2, 7, 9, 16, dst, dststride, src, srcstride, height, mx, my, r3src, rfilter
%assign %%stride ((%2 + 7)/8)
    EPEL_LOAD        %2, srcq-%%stride, %%stride, %1
    EPEL_COMPUTE     %2, %1, m14, m15
%if (%1 > 8 && (%2 == 8))
    EPEL_LOAD        %2, srcq-%%stride, %%stride, %1
    EPEL_COMPUTE     %2, %1, m14, m15
%if (%1 > 8 && (%2 == 8))
    EPEL_LOAD        %2, srcq-%%stride, %%stride, %1
    EPEL_COMPUTE     %2, %1, m14, m15
%if (%1 > 8 && (%2 == 8))
    EPEL_LOAD        %2, srcq-%%stride, %%stride, %1
    EPEL_COMPUTE     %2, %1, m14, m15
%if (%1 > 8 && (%2 == 8))
    EPEL_COMPUTE     14, %1, m12, m13
%if (%1 > 8 && (%2 == 8))
    punpcklwd        m2, m10, m11
    punpckhwd        m3, m10, m11
    EPEL_COMPUTE     14, %1, m12, m13, m4, m2, m8, m3
    UNI_COMPUTE      %1, %2, m0, m4, [pw_%2]
    UNI_COMPUTE      %1, %2, m0, m1, [pw_%2]
    PEL_%2STORE%1    dstq, m0, m1
%if (%1 > 8 && (%2 == 8))
    add              dstq, dststrideq   ; dst += dststride
    add              srcq, srcstrideq   ; src += srcstride
    dec              heightd            ; cmp height
    jnz              .loop              ; height loop
cglobal hevc_put_hevc_bi_epel_hv%1_%2, 8, 10, 16, dst, dststride, src, srcstride, src2, height, mx, my, r3src, rfilter
%assign %%stride ((%2 + 7)/8)
    EPEL_LOAD        %2, srcq-%%stride, %%stride, %1
    EPEL_COMPUTE     %2, %1, m14, m15
%if (%1 > 8 && (%2 == 8))
    EPEL_LOAD        %2, srcq-%%stride, %%stride, %1
    EPEL_COMPUTE     %2, %1, m14, m15
%if (%1 > 8 && (%2 == 8))
    EPEL_LOAD        %2, srcq-%%stride, %%stride, %1
    EPEL_COMPUTE     %2, %1, m14, m15
%if (%1 > 8 && (%2 == 8))
    add              srcq, srcstrideq
    EPEL_LOAD        %2, srcq-%%stride, %%stride, %1
    EPEL_COMPUTE     %2, %1, m14, m15
%if (%1 > 8 && (%2 == 8))
    punpcklwd        m0, m4, m5
    punpcklwd        m2, m6, m7
    punpckhwd        m1, m4, m5
    punpckhwd        m3, m6, m7
    EPEL_COMPUTE     14, %1, m12, m13
%if (%1 > 8 && (%2 == 8))
    punpcklwd        m4, m8, m9
    punpcklwd        m2, m10, m11
    punpckhwd        m8, m8, m9
    punpckhwd        m3, m10, m11
    EPEL_COMPUTE     14, %1, m12, m13, m4, m2, m8, m3
    SIMPLE_BILOAD    %1, src2q, m8, m3
    vinserti128      m1, m8, xm3, 1
    vextracti128     xm8, m8, 1
    vinserti128      m2, m3, xm8, 0
    BI_COMPUTE       %1, %2, m0, m4, m1, m2, [pw_bi_%2]
    BI_COMPUTE       %1, %2, m0, m4, m8, m3, [pw_bi_%2]
    SIMPLE_BILOAD    %1, src2q, m8, m9
    BI_COMPUTE       %1, %2, m0, m1, m8, m9, [pw_bi_%2]
    PEL_%2STORE%1    dstq, m0, m4
%if (%1 > 8 && (%2 == 8))
    add              dstq, dststrideq     ; dst += dststride
    add              srcq, srcstrideq     ; src += srcstride
    add              src2q, 2*MAX_PB_SIZE ; src2 += intermediate stride
    dec              heightd              ; cmp height
    jnz              .loop                ; height loop
; ******************************
; void put_hevc_qpel_hX_X_X(int16_t *dst, ptrdiff_t dststride,
;                           uint8_t *_src, ptrdiff_t _srcstride,
;                           int width, int height, int mx, int my)
; ******************************
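; QPEL is the 8-tap HEVC luma interpolation filter; mx selects one of the
; three fractional-position filter sets. The eight taps are applied as four
; pmaddubsw/pmaddwd products against the tap pairs in m12-m15, then summed.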
%macro HEVC_PUT_HEVC_QPEL 2
cglobal hevc_put_hevc_qpel_h%1_%2, 5, 6, 16, dst, src, srcstride, height, mx, rfilter
    QPEL_H_LOAD      %2, srcq, %1, 10
    QPEL_COMPUTE     %1, %2, 1
    PEL_10STORE%1    dstq, m0, m1
    LOOP_END         dst, src, srcstride
cglobal hevc_put_hevc_uni_qpel_h%1_%2, 6, 7, 16, dst, dststride, src, srcstride, height, mx, rfilter
    QPEL_H_LOAD      %2, srcq, %1, 10
    UNI_COMPUTE      %1, %2, m0, m1, m9
    PEL_%2STORE%1    dstq, m0, m1
    add              dstq, dststrideq   ; dst += dststride
    add              srcq, srcstrideq   ; src += srcstride
    dec              heightd            ; cmp height
    jnz              .loop              ; height loop
cglobal hevc_put_hevc_bi_qpel_h%1_%2, 7, 8, 16, dst, dststride, src, srcstride, src2, height, mx, rfilter
    movdqa           m9, [pw_bi_%2]
    QPEL_H_LOAD      %2, srcq, %1, 10
    QPEL_COMPUTE     %1, %2, 1
    SIMPLE_BILOAD    %1, src2q, m10, m11
    BI_COMPUTE       %1, %2, m0, m1, m10, m11, m9, 1
    PEL_%2STORE%1    dstq, m0, m1
    add              dstq, dststrideq     ; dst += dststride
    add              srcq, srcstrideq     ; src += srcstride
    add              src2q, 2*MAX_PB_SIZE ; src2 += intermediate stride
    dec              heightd              ; cmp height
    jnz              .loop                ; height loop
; ******************************
; void put_hevc_qpel_vX_X_X(int16_t *dst, ptrdiff_t dststride,
;                           uint8_t *_src, ptrdiff_t _srcstride,
;                           int width, int height, int mx, int my)
; ******************************
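; The vertical pass reads eight rows (x - 3*srcstride .. x + 4*srcstride) per
; output row; QPEL_V_LOAD interleaves them with SBUTTERFLY before the
; multiply-accumulate.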
cglobal hevc_put_hevc_qpel_v%1_%2, 6, 8, 16, dst, src, srcstride, height, r3src, my, rfilter
    lea              r3srcq, [srcstrideq*3]
    QPEL_V_LOAD      %2, srcq, srcstride, %1, r7
    QPEL_COMPUTE     %1, %2, 1
    PEL_10STORE%1    dstq, m0, m1
    LOOP_END         dst, src, srcstride
cglobal hevc_put_hevc_uni_qpel_v%1_%2, 7, 9, 16, dst, dststride, src, srcstride, height, r3src, my, rfilter
    lea              r3srcq, [srcstrideq*3]
    QPEL_V_LOAD      %2, srcq, srcstride, %1, r8
    UNI_COMPUTE      %1, %2, m0, m1, m9
    PEL_%2STORE%1    dstq, m0, m1
    add              dstq, dststrideq   ; dst += dststride
    add              srcq, srcstrideq   ; src += srcstride
    dec              heightd            ; cmp height
    jnz              .loop              ; height loop
cglobal hevc_put_hevc_bi_qpel_v%1_%2, 8, 10, 16, dst, dststride, src, srcstride, src2, height, r3src, my, rfilter
    movdqa           m9, [pw_bi_%2]
    lea              r3srcq, [srcstrideq*3]
    QPEL_V_LOAD      %2, srcq, srcstride, %1, r9
    QPEL_COMPUTE     %1, %2, 1
    SIMPLE_BILOAD    %1, src2q, m10, m11
    BI_COMPUTE       %1, %2, m0, m1, m10, m11, m9, 1
    PEL_%2STORE%1    dstq, m0, m1
    add              dstq, dststrideq     ; dst += dststride
    add              srcq, srcstrideq     ; src += srcstride
    add              src2q, 2*MAX_PB_SIZE ; src2 += intermediate stride
    dec              heightd              ; cmp height
    jnz              .loop                ; height loop
; ******************************
; void put_hevc_qpel_hvX_X(int16_t *dst, ptrdiff_t dststride,
;                          uint8_t *_src, ptrdiff_t _srcstride,
;                          int height, int mx, int my)
; ******************************
%macro HEVC_PUT_HEVC_QPEL_HV 2
cglobal hevc_put_hevc_qpel_hv%1_%2, 6, 8, 16, dst, src, srcstride, height, mx, my, r3src, rfilter
    shl              mxq, %%shift       ; multiply by (1 << %%shift)
    shl              myq, %%shift       ; multiply by (1 << %%shift)
    lea              r3srcq, [srcstrideq*3]
    QPEL_H_LOAD      %2, srcq, %1, 15
    QPEL_HV_COMPUTE  %1, %2, mx, ackssdw
    add              srcq, srcstrideq
    QPEL_H_LOAD      %2, srcq, %1, 15
    QPEL_HV_COMPUTE  %1, %2, mx, ackssdw
    add              srcq, srcstrideq
    QPEL_H_LOAD      %2, srcq, %1, 15
    QPEL_HV_COMPUTE  %1, %2, mx, ackssdw
    add              srcq, srcstrideq
    QPEL_H_LOAD      %2, srcq, %1, 15
    QPEL_HV_COMPUTE  %1, %2, mx, ackssdw
    add              srcq, srcstrideq
    QPEL_H_LOAD      %2, srcq, %1, 15
    QPEL_HV_COMPUTE  %1, %2, mx, ackssdw
    add              srcq, srcstrideq
    QPEL_H_LOAD      %2, srcq, %1, 15
    QPEL_HV_COMPUTE  %1, %2, mx, ackssdw
    add              srcq, srcstrideq
    QPEL_H_LOAD      %2, srcq, %1, 15
    QPEL_HV_COMPUTE  %1, %2, mx, ackssdw
    add              srcq, srcstrideq
    QPEL_H_LOAD      %2, srcq, %1, 15
    QPEL_HV_COMPUTE  %1, %2, mx, ackssdw
    punpcklwd        m0, m8, m9
    punpcklwd        m2, m10, m11
    punpcklwd        m4, m12, m13
    punpcklwd        m6, m14, m15
    punpckhwd        m1, m8, m9
    punpckhwd        m3, m10, m11
    punpckhwd        m5, m12, m13
    punpckhwd        m7, m14, m15
    QPEL_HV_COMPUTE  %1, 14, my, ackssdw
    PEL_10STORE%1    dstq, m0, m1
    LOOP_END         dst, src, srcstride
cglobal hevc_put_hevc_uni_qpel_hv%1_%2, 7, 9, 16, dst, dststride, src, srcstride, height, mx, my, r3src, rfilter
    shl              mxq, %%shift       ; multiply by (1 << %%shift)
    shl              myq, %%shift       ; multiply by (1 << %%shift)
    lea              r3srcq, [srcstrideq*3]
    QPEL_H_LOAD      %2, srcq, %1, 15
    QPEL_HV_COMPUTE  %1, %2, mx, ackssdw
    add              srcq, srcstrideq
    QPEL_H_LOAD      %2, srcq, %1, 15
    QPEL_HV_COMPUTE  %1, %2, mx, ackssdw
    add              srcq, srcstrideq
    QPEL_H_LOAD      %2, srcq, %1, 15
    QPEL_HV_COMPUTE  %1, %2, mx, ackssdw
    add              srcq, srcstrideq
    QPEL_H_LOAD      %2, srcq, %1, 15
    QPEL_HV_COMPUTE  %1, %2, mx, ackssdw
    add              srcq, srcstrideq
    QPEL_H_LOAD      %2, srcq, %1, 15
    QPEL_HV_COMPUTE  %1, %2, mx, ackssdw
    add              srcq, srcstrideq
    QPEL_H_LOAD      %2, srcq, %1, 15
    QPEL_HV_COMPUTE  %1, %2, mx, ackssdw
    add              srcq, srcstrideq
    QPEL_H_LOAD      %2, srcq, %1, 15
    QPEL_HV_COMPUTE  %1, %2, mx, ackssdw
    add              srcq, srcstrideq
    QPEL_H_LOAD      %2, srcq, %1, 15
    QPEL_HV_COMPUTE  %1, %2, mx, ackssdw
    punpcklwd        m0, m8, m9
    punpcklwd        m2, m10, m11
    punpcklwd        m4, m12, m13
    punpcklwd        m6, m14, m15
    punpckhwd        m1, m8, m9
    punpckhwd        m3, m10, m11
    punpckhwd        m5, m12, m13
    punpckhwd        m7, m14, m15
    QPEL_HV_COMPUTE  %1, 14, my, ackusdw
    UNI_COMPUTE      %1, %2, m0, m1, [pw_%2]
    PEL_%2STORE%1    dstq, m0, m1
    add              dstq, dststrideq   ; dst += dststride
    add              srcq, srcstrideq   ; src += srcstride
    dec              heightd            ; cmp height
    jnz              .loop              ; height loop
cglobal hevc_put_hevc_bi_qpel_hv%1_%2, 8, 10, 16, dst, dststride, src, srcstride, src2, height, mx, my, r3src, rfilter
    shl              mxq, %%shift       ; multiply by (1 << %%shift)
    shl              myq, %%shift       ; multiply by (1 << %%shift)
    lea              r3srcq, [srcstrideq*3]
    QPEL_H_LOAD      %2, srcq, %1, 15
    QPEL_HV_COMPUTE  %1, %2, mx, ackssdw
    add              srcq, srcstrideq
    QPEL_H_LOAD      %2, srcq, %1, 15
    QPEL_HV_COMPUTE  %1, %2, mx, ackssdw
    add              srcq, srcstrideq
    QPEL_H_LOAD      %2, srcq, %1, 15
    QPEL_HV_COMPUTE  %1, %2, mx, ackssdw
    add              srcq, srcstrideq
    QPEL_H_LOAD      %2, srcq, %1, 15
    QPEL_HV_COMPUTE  %1, %2, mx, ackssdw
    add              srcq, srcstrideq
    QPEL_H_LOAD      %2, srcq, %1, 15
    QPEL_HV_COMPUTE  %1, %2, mx, ackssdw
    add              srcq, srcstrideq
    QPEL_H_LOAD      %2, srcq, %1, 15
    QPEL_HV_COMPUTE  %1, %2, mx, ackssdw
    add              srcq, srcstrideq
    QPEL_H_LOAD      %2, srcq, %1, 15
    QPEL_HV_COMPUTE  %1, %2, mx, ackssdw
    add              srcq, srcstrideq
    QPEL_H_LOAD      %2, srcq, %1, 15
    QPEL_HV_COMPUTE  %1, %2, mx, ackssdw
    punpcklwd        m0, m8, m9
    punpcklwd        m2, m10, m11
    punpcklwd        m4, m12, m13
    punpcklwd        m6, m14, m15
    punpckhwd        m1, m8, m9
    punpckhwd        m3, m10, m11
    punpckhwd        m5, m12, m13
    punpckhwd        m7, m14, m15
    QPEL_HV_COMPUTE  %1, 14, my, ackssdw
    SIMPLE_BILOAD    %1, src2q, m8, m9   ; m9 not used in this case
    BI_COMPUTE       %1, %2, m0, m1, m8, m9, [pw_bi_%2]
    PEL_%2STORE%1    dstq, m0, m1
    add              dstq, dststrideq     ; dst += dststride
    add              srcq, srcstrideq     ; src += srcstride
    add              src2q, 2*MAX_PB_SIZE ; src2 += intermediate stride
    dec              heightd              ; cmp height
    jnz              .loop                ; height loop
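; Weighted prediction per the HEVC spec, with shift = denom + 14 - bitd and
; the ox offsets pre-scaled by << (bitd - 8):
;   uni: dst = clip(((src * wx + (1 << (shift - 1))) >> shift) + ox)
;   bi:  dst = clip((src * wx0 + src2 * wx1 + ((ox0 + ox1 + 1) << shift)) >> (shift + 1))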
%macro WEIGHTING_FUNCS 2
%if WIN64 || ARCH_X86_32
cglobal hevc_put_hevc_uni_w%1_%2, 4, 5, 7, dst, dststride, src, srcstride, height, denom, wx, ox
cglobal hevc_put_hevc_uni_w%1_%2, 6, 6, 7, dst, dststride, src, srcstride, height, denom, wx, ox
%define SHIFT denomd
    lea              SHIFT, [SHIFT+14-%2]  ; shift = 14 - bitd + denom
    movd             m4, SHIFT             ; shift
    shl              SHIFT, %2-8           ; ox << (bitd - 8)
%if WIN64 || ARCH_X86_32
    SIMPLE_LOAD      %1, 10, srcq, m0
    punpckhwd        m1, m0, m6
    CLIPW            m0, [pb_0], [max_pixels_%2]
    PEL_%2STORE%1    dstq, m0, m1
    add              dstq, dststrideq      ; dst += dststride
    add              srcq, 2*MAX_PB_SIZE   ; src += intermediate stride
    dec              heightd               ; cmp height
    jnz              .loop                 ; height loop
cglobal hevc_put_hevc_bi_w%1_%2, 5, 7, 10, dst, dststride, src, srcstride, src2, height, denom, wx0, wx1, ox0, ox1
    lea              r6d, [r6d+14-%2]      ; shift = 14 - bitd + denom
    movd             m0, r6d               ; shift
    movd             m5, r6d               ; shift+1
    shl              r6d, %2-8             ; ox << (bitd - 8)
    movd             m4, r6d               ; offset
    SIMPLE_LOAD      %1, 10, srcq, m0
    SIMPLE_LOAD      %1, 10, src2q, m8
    punpckhwd        m1, m0, m6
    punpckhwd        m9, m8, m7
    CLIPW            m0, [pb_0], [max_pixels_%2]
    PEL_%2STORE%1    dstq, m0, m1
    add              dstq, dststrideq      ; dst += dststride
    add              srcq, 2*MAX_PB_SIZE   ; src += intermediate stride
    add              src2q, 2*MAX_PB_SIZE  ; src2 += intermediate stride
    dec              r6d                   ; cmp height
    jnz              .loop                 ; height loop
INIT_XMM sse4 ; adds ff_ and _sse4 to function name

WEIGHTING_FUNCS 2, 8
WEIGHTING_FUNCS 4, 8
WEIGHTING_FUNCS 6, 8
WEIGHTING_FUNCS 8, 8

WEIGHTING_FUNCS 2, 10
WEIGHTING_FUNCS 4, 10
WEIGHTING_FUNCS 6, 10
WEIGHTING_FUNCS 8, 10

WEIGHTING_FUNCS 2, 12
WEIGHTING_FUNCS 4, 12
WEIGHTING_FUNCS 6, 12
WEIGHTING_FUNCS 8, 12

HEVC_PUT_HEVC_PEL_PIXELS 2, 8
HEVC_PUT_HEVC_PEL_PIXELS 4, 8
HEVC_PUT_HEVC_PEL_PIXELS 6, 8
HEVC_PUT_HEVC_PEL_PIXELS 8, 8
HEVC_PUT_HEVC_PEL_PIXELS 12, 8
HEVC_PUT_HEVC_PEL_PIXELS 16, 8

HEVC_PUT_HEVC_PEL_PIXELS 2, 10
HEVC_PUT_HEVC_PEL_PIXELS 4, 10
HEVC_PUT_HEVC_PEL_PIXELS 6, 10
HEVC_PUT_HEVC_PEL_PIXELS 8, 10

HEVC_PUT_HEVC_PEL_PIXELS 2, 12
HEVC_PUT_HEVC_PEL_PIXELS 4, 12
HEVC_PUT_HEVC_PEL_PIXELS 6, 12
HEVC_PUT_HEVC_PEL_PIXELS 8, 12

HEVC_PUT_HEVC_EPEL 2, 8
HEVC_PUT_HEVC_EPEL 4, 8
HEVC_PUT_HEVC_EPEL 6, 8
HEVC_PUT_HEVC_EPEL 8, 8
HEVC_PUT_HEVC_EPEL 12, 8
HEVC_PUT_HEVC_EPEL 16, 8

HEVC_PUT_HEVC_EPEL 2, 10
HEVC_PUT_HEVC_EPEL 4, 10
HEVC_PUT_HEVC_EPEL 6, 10
HEVC_PUT_HEVC_EPEL 8, 10

HEVC_PUT_HEVC_EPEL 2, 12
HEVC_PUT_HEVC_EPEL 4, 12
HEVC_PUT_HEVC_EPEL 6, 12
HEVC_PUT_HEVC_EPEL 8, 12

HEVC_PUT_HEVC_EPEL_HV 2, 8
HEVC_PUT_HEVC_EPEL_HV 4, 8
HEVC_PUT_HEVC_EPEL_HV 6, 8
HEVC_PUT_HEVC_EPEL_HV 8, 8
HEVC_PUT_HEVC_EPEL_HV 16, 8

HEVC_PUT_HEVC_EPEL_HV 2, 10
HEVC_PUT_HEVC_EPEL_HV 4, 10
HEVC_PUT_HEVC_EPEL_HV 6, 10
HEVC_PUT_HEVC_EPEL_HV 8, 10

HEVC_PUT_HEVC_EPEL_HV 2, 12
HEVC_PUT_HEVC_EPEL_HV 4, 12
HEVC_PUT_HEVC_EPEL_HV 6, 12
HEVC_PUT_HEVC_EPEL_HV 8, 12

HEVC_PUT_HEVC_QPEL 4, 8
HEVC_PUT_HEVC_QPEL 8, 8
HEVC_PUT_HEVC_QPEL 12, 8
HEVC_PUT_HEVC_QPEL 16, 8

HEVC_PUT_HEVC_QPEL 4, 10
HEVC_PUT_HEVC_QPEL 8, 10

HEVC_PUT_HEVC_QPEL 4, 12
HEVC_PUT_HEVC_QPEL 8, 12

HEVC_PUT_HEVC_QPEL_HV 2, 8
HEVC_PUT_HEVC_QPEL_HV 4, 8
HEVC_PUT_HEVC_QPEL_HV 6, 8
HEVC_PUT_HEVC_QPEL_HV 8, 8

HEVC_PUT_HEVC_QPEL_HV 2, 10
HEVC_PUT_HEVC_QPEL_HV 4, 10
HEVC_PUT_HEVC_QPEL_HV 6, 10
HEVC_PUT_HEVC_QPEL_HV 8, 10

HEVC_PUT_HEVC_QPEL_HV 2, 12
HEVC_PUT_HEVC_QPEL_HV 4, 12
HEVC_PUT_HEVC_QPEL_HV 6, 12
HEVC_PUT_HEVC_QPEL_HV 8, 12
%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2 ; adds ff_ and _avx2 to function names and enables 256-bit registers: m0 is 256-bit, xm0 is 128-bit. cpuflag(avx2) = 1, notcpuflag(avx) = 0

HEVC_PUT_HEVC_PEL_PIXELS 32, 8
HEVC_PUT_HEVC_PEL_PIXELS 16, 10

HEVC_PUT_HEVC_EPEL 32, 8
HEVC_PUT_HEVC_EPEL 16, 10

HEVC_PUT_HEVC_EPEL_HV 16, 10
HEVC_PUT_HEVC_EPEL_HV 32, 8

HEVC_PUT_HEVC_QPEL 32, 8

HEVC_PUT_HEVC_QPEL 16, 10

HEVC_PUT_HEVC_QPEL_HV 16, 10

%endif ; HAVE_AVX2_EXTERNAL
%endif ; ARCH_X86_64