Merge commit 'd5dd8c7bf0f0d77c581db3236e0d938f06fd5591'
author Michael Niedermayer <michaelni@gmx.at>
Wed, 15 Jan 2014 14:13:12 +0000 (15:13 +0100)
committer Michael Niedermayer <michaelni@gmx.at>
Wed, 15 Jan 2014 14:13:41 +0000 (15:13 +0100)
* commit 'd5dd8c7bf0f0d77c581db3236e0d938f06fd5591':
  aarch64: h264 qpel NEON optimizations

Merged-by: Michael Niedermayer <michaelni@gmx.at>
libavcodec/aarch64/h264qpel_init_aarch64.c
libavcodec/aarch64/h264qpel_neon.S
libavcodec/aarch64/neon.S
libavcodec/h264qpel.c
libavcodec/h264qpel.h

diff --combined libavcodec/aarch64/h264qpel_init_aarch64.c
index 0000000,11611df..570dee1
mode 000000,100644..100644
--- /dev/null
+++ b/libavcodec/aarch64/h264qpel_init_aarch64.c
@@@ -1,0 -1,172 +1,172 @@@
 - * This file is part of Libav.
+ /*
+  * ARM NEON optimised DSP functions
+  * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
+  *
 - * Libav is free software; you can redistribute it and/or
++ * This file is part of FFmpeg.
+  *
 - * Libav is distributed in the hope that it will be useful,
++ * FFmpeg is free software; you can redistribute it and/or
+  * modify it under the terms of the GNU Lesser General Public
+  * License as published by the Free Software Foundation; either
+  * version 2.1 of the License, or (at your option) any later version.
+  *
 - * License along with Libav; if not, write to the Free Software
++ * FFmpeg is distributed in the hope that it will be useful,
+  * but WITHOUT ANY WARRANTY; without even the implied warranty of
+  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  * Lesser General Public License for more details.
+  *
+  * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
+  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+  */
+ #include <stdint.h>
+ #include "config.h"
+ #include "libavutil/attributes.h"
+ #include "libavutil/cpu.h"
+ #include "libavutil/aarch64/cpu.h"
+ #include "libavcodec/h264qpel.h"
+ void ff_put_h264_qpel16_mc00_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
+ void ff_put_h264_qpel16_mc10_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
+ void ff_put_h264_qpel16_mc20_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
+ void ff_put_h264_qpel16_mc30_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
+ void ff_put_h264_qpel16_mc01_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
+ void ff_put_h264_qpel16_mc11_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
+ void ff_put_h264_qpel16_mc21_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
+ void ff_put_h264_qpel16_mc31_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
+ void ff_put_h264_qpel16_mc02_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
+ void ff_put_h264_qpel16_mc12_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
+ void ff_put_h264_qpel16_mc22_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
+ void ff_put_h264_qpel16_mc32_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
+ void ff_put_h264_qpel16_mc03_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
+ void ff_put_h264_qpel16_mc13_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
+ void ff_put_h264_qpel16_mc23_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
+ void ff_put_h264_qpel16_mc33_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
+ void ff_put_h264_qpel8_mc00_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
+ void ff_put_h264_qpel8_mc10_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
+ void ff_put_h264_qpel8_mc20_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
+ void ff_put_h264_qpel8_mc30_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
+ void ff_put_h264_qpel8_mc01_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
+ void ff_put_h264_qpel8_mc11_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
+ void ff_put_h264_qpel8_mc21_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
+ void ff_put_h264_qpel8_mc31_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
+ void ff_put_h264_qpel8_mc02_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
+ void ff_put_h264_qpel8_mc12_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
+ void ff_put_h264_qpel8_mc22_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
+ void ff_put_h264_qpel8_mc32_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
+ void ff_put_h264_qpel8_mc03_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
+ void ff_put_h264_qpel8_mc13_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
+ void ff_put_h264_qpel8_mc23_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
+ void ff_put_h264_qpel8_mc33_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
+ void ff_avg_h264_qpel16_mc00_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
+ void ff_avg_h264_qpel16_mc10_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
+ void ff_avg_h264_qpel16_mc20_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
+ void ff_avg_h264_qpel16_mc30_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
+ void ff_avg_h264_qpel16_mc01_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
+ void ff_avg_h264_qpel16_mc11_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
+ void ff_avg_h264_qpel16_mc21_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
+ void ff_avg_h264_qpel16_mc31_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
+ void ff_avg_h264_qpel16_mc02_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
+ void ff_avg_h264_qpel16_mc12_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
+ void ff_avg_h264_qpel16_mc22_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
+ void ff_avg_h264_qpel16_mc32_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
+ void ff_avg_h264_qpel16_mc03_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
+ void ff_avg_h264_qpel16_mc13_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
+ void ff_avg_h264_qpel16_mc23_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
+ void ff_avg_h264_qpel16_mc33_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
+ void ff_avg_h264_qpel8_mc00_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
+ void ff_avg_h264_qpel8_mc10_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
+ void ff_avg_h264_qpel8_mc20_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
+ void ff_avg_h264_qpel8_mc30_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
+ void ff_avg_h264_qpel8_mc01_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
+ void ff_avg_h264_qpel8_mc11_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
+ void ff_avg_h264_qpel8_mc21_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
+ void ff_avg_h264_qpel8_mc31_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
+ void ff_avg_h264_qpel8_mc02_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
+ void ff_avg_h264_qpel8_mc12_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
+ void ff_avg_h264_qpel8_mc22_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
+ void ff_avg_h264_qpel8_mc32_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
+ void ff_avg_h264_qpel8_mc03_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
+ void ff_avg_h264_qpel8_mc13_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
+ void ff_avg_h264_qpel8_mc23_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
+ void ff_avg_h264_qpel8_mc33_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
+ av_cold void ff_h264qpel_init_aarch64(H264QpelContext *c, int bit_depth)
+ {
+     const int high_bit_depth = bit_depth > 8;
+     int cpu_flags = av_get_cpu_flags();
+     if (have_neon(cpu_flags) && !high_bit_depth) {
+         /* c->put_h264_qpel_pixels_tab[0][ 0] = ff_put_h264_qpel16_mc00_neon; */
+         c->put_h264_qpel_pixels_tab[0][ 1] = ff_put_h264_qpel16_mc10_neon;
+         c->put_h264_qpel_pixels_tab[0][ 2] = ff_put_h264_qpel16_mc20_neon;
+         c->put_h264_qpel_pixels_tab[0][ 3] = ff_put_h264_qpel16_mc30_neon;
+         c->put_h264_qpel_pixels_tab[0][ 4] = ff_put_h264_qpel16_mc01_neon;
+         c->put_h264_qpel_pixels_tab[0][ 5] = ff_put_h264_qpel16_mc11_neon;
+         c->put_h264_qpel_pixels_tab[0][ 6] = ff_put_h264_qpel16_mc21_neon;
+         c->put_h264_qpel_pixels_tab[0][ 7] = ff_put_h264_qpel16_mc31_neon;
+         c->put_h264_qpel_pixels_tab[0][ 8] = ff_put_h264_qpel16_mc02_neon;
+         c->put_h264_qpel_pixels_tab[0][ 9] = ff_put_h264_qpel16_mc12_neon;
+         c->put_h264_qpel_pixels_tab[0][10] = ff_put_h264_qpel16_mc22_neon;
+         c->put_h264_qpel_pixels_tab[0][11] = ff_put_h264_qpel16_mc32_neon;
+         c->put_h264_qpel_pixels_tab[0][12] = ff_put_h264_qpel16_mc03_neon;
+         c->put_h264_qpel_pixels_tab[0][13] = ff_put_h264_qpel16_mc13_neon;
+         c->put_h264_qpel_pixels_tab[0][14] = ff_put_h264_qpel16_mc23_neon;
+         c->put_h264_qpel_pixels_tab[0][15] = ff_put_h264_qpel16_mc33_neon;
+         /* c->put_h264_qpel_pixels_tab[1][ 0] = ff_put_h264_qpel8_mc00_neon; */
+         c->put_h264_qpel_pixels_tab[1][ 1] = ff_put_h264_qpel8_mc10_neon;
+         c->put_h264_qpel_pixels_tab[1][ 2] = ff_put_h264_qpel8_mc20_neon;
+         c->put_h264_qpel_pixels_tab[1][ 3] = ff_put_h264_qpel8_mc30_neon;
+         c->put_h264_qpel_pixels_tab[1][ 4] = ff_put_h264_qpel8_mc01_neon;
+         c->put_h264_qpel_pixels_tab[1][ 5] = ff_put_h264_qpel8_mc11_neon;
+         c->put_h264_qpel_pixels_tab[1][ 6] = ff_put_h264_qpel8_mc21_neon;
+         c->put_h264_qpel_pixels_tab[1][ 7] = ff_put_h264_qpel8_mc31_neon;
+         c->put_h264_qpel_pixels_tab[1][ 8] = ff_put_h264_qpel8_mc02_neon;
+         c->put_h264_qpel_pixels_tab[1][ 9] = ff_put_h264_qpel8_mc12_neon;
+         c->put_h264_qpel_pixels_tab[1][10] = ff_put_h264_qpel8_mc22_neon;
+         c->put_h264_qpel_pixels_tab[1][11] = ff_put_h264_qpel8_mc32_neon;
+         c->put_h264_qpel_pixels_tab[1][12] = ff_put_h264_qpel8_mc03_neon;
+         c->put_h264_qpel_pixels_tab[1][13] = ff_put_h264_qpel8_mc13_neon;
+         c->put_h264_qpel_pixels_tab[1][14] = ff_put_h264_qpel8_mc23_neon;
+         c->put_h264_qpel_pixels_tab[1][15] = ff_put_h264_qpel8_mc33_neon;
+         /* c->avg_h264_qpel_pixels_tab[0][ 0] = ff_avg_h264_qpel16_mc00_neon; */
+         c->avg_h264_qpel_pixels_tab[0][ 1] = ff_avg_h264_qpel16_mc10_neon;
+         c->avg_h264_qpel_pixels_tab[0][ 2] = ff_avg_h264_qpel16_mc20_neon;
+         c->avg_h264_qpel_pixels_tab[0][ 3] = ff_avg_h264_qpel16_mc30_neon;
+         c->avg_h264_qpel_pixels_tab[0][ 4] = ff_avg_h264_qpel16_mc01_neon;
+         c->avg_h264_qpel_pixels_tab[0][ 5] = ff_avg_h264_qpel16_mc11_neon;
+         c->avg_h264_qpel_pixels_tab[0][ 6] = ff_avg_h264_qpel16_mc21_neon;
+         c->avg_h264_qpel_pixels_tab[0][ 7] = ff_avg_h264_qpel16_mc31_neon;
+         c->avg_h264_qpel_pixels_tab[0][ 8] = ff_avg_h264_qpel16_mc02_neon;
+         c->avg_h264_qpel_pixels_tab[0][ 9] = ff_avg_h264_qpel16_mc12_neon;
+         c->avg_h264_qpel_pixels_tab[0][10] = ff_avg_h264_qpel16_mc22_neon;
+         c->avg_h264_qpel_pixels_tab[0][11] = ff_avg_h264_qpel16_mc32_neon;
+         c->avg_h264_qpel_pixels_tab[0][12] = ff_avg_h264_qpel16_mc03_neon;
+         c->avg_h264_qpel_pixels_tab[0][13] = ff_avg_h264_qpel16_mc13_neon;
+         c->avg_h264_qpel_pixels_tab[0][14] = ff_avg_h264_qpel16_mc23_neon;
+         c->avg_h264_qpel_pixels_tab[0][15] = ff_avg_h264_qpel16_mc33_neon;
+         /* c->avg_h264_qpel_pixels_tab[1][ 0] = ff_avg_h264_qpel8_mc00_neon; */
+         c->avg_h264_qpel_pixels_tab[1][ 1] = ff_avg_h264_qpel8_mc10_neon;
+         c->avg_h264_qpel_pixels_tab[1][ 2] = ff_avg_h264_qpel8_mc20_neon;
+         c->avg_h264_qpel_pixels_tab[1][ 3] = ff_avg_h264_qpel8_mc30_neon;
+         c->avg_h264_qpel_pixels_tab[1][ 4] = ff_avg_h264_qpel8_mc01_neon;
+         c->avg_h264_qpel_pixels_tab[1][ 5] = ff_avg_h264_qpel8_mc11_neon;
+         c->avg_h264_qpel_pixels_tab[1][ 6] = ff_avg_h264_qpel8_mc21_neon;
+         c->avg_h264_qpel_pixels_tab[1][ 7] = ff_avg_h264_qpel8_mc31_neon;
+         c->avg_h264_qpel_pixels_tab[1][ 8] = ff_avg_h264_qpel8_mc02_neon;
+         c->avg_h264_qpel_pixels_tab[1][ 9] = ff_avg_h264_qpel8_mc12_neon;
+         c->avg_h264_qpel_pixels_tab[1][10] = ff_avg_h264_qpel8_mc22_neon;
+         c->avg_h264_qpel_pixels_tab[1][11] = ff_avg_h264_qpel8_mc32_neon;
+         c->avg_h264_qpel_pixels_tab[1][12] = ff_avg_h264_qpel8_mc03_neon;
+         c->avg_h264_qpel_pixels_tab[1][13] = ff_avg_h264_qpel8_mc13_neon;
+         c->avg_h264_qpel_pixels_tab[1][14] = ff_avg_h264_qpel8_mc23_neon;
+         c->avg_h264_qpel_pixels_tab[1][15] = ff_avg_h264_qpel8_mc33_neon;
+     }
+ }
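
For readers wiring this up: the tables that ff_h264qpel_init_aarch64() fills are
indexed by quarter-pel fraction, [size][x_frac + 4*y_frac], which is what the
mcXY suffixes encode (mc10 = x = 1/4, y = 0; mc22 = half-pel in both axes).
The mc00 (full-pel copy) entries are deliberately left commented out, so those
slots keep whatever the generic init installed. A minimal caller sketch, with
hypothetical names, could look like this:

    /* Hypothetical helper: pick the qpel function for a motion vector given
     * in quarter-pel units.  Outer index 0 = 16x16, 1 = 8x8; inner index is
     * (mx & 3) + 4 * (my & 3), matching the assignments above. */
    static void mc_luma(H264QpelContext *c, uint8_t *dst, uint8_t *src,
                        ptrdiff_t stride, int mx, int my, int is_16x16)
    {
        int idx  = (mx & 3) + 4 * (my & 3);
        int size = is_16x16 ? 0 : 1;

        src += (mx >> 2) + (my >> 2) * stride;   /* full-pel part of the MV */
        c->put_h264_qpel_pixels_tab[size][idx](dst, src, stride);
    }
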
diff --combined libavcodec/aarch64/h264qpel_neon.S
index 0000000,731dc06..d27cfac
mode 000000,100644..100644
--- /dev/null
+++ b/libavcodec/aarch64/h264qpel_neon.S
@@@ -1,0 -1,934 +1,934 @@@
 - * This file is part of Libav.
+ /*
+  * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
+  * Copyright (c) 2013 Janne Grunau <janne-libav@jannau.net>
+  *
 - * Libav is free software; you can redistribute it and/or
++ * This file is part of FFmpeg.
+  *
 - * Libav is distributed in the hope that it will be useful,
++ * FFmpeg is free software; you can redistribute it and/or
+  * modify it under the terms of the GNU Lesser General Public
+  * License as published by the Free Software Foundation; either
+  * version 2.1 of the License, or (at your option) any later version.
+  *
 - * License along with Libav; if not, write to the Free Software
++ * FFmpeg is distributed in the hope that it will be useful,
+  * but WITHOUT ANY WARRANTY; without even the implied warranty of
+  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  * Lesser General Public License for more details.
+  *
+  * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
+  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+  */
+ #include "libavutil/aarch64/asm.S"
+ #include "neon.S"
+         /* H.264 qpel MC */
+ .macro  lowpass_const   r
+         movz            \r, #20, lsl #16
+         movk            \r, #5
+         mov             v6.S[0], \r
+ .endm
+ //trashes v0-v5
+ .macro  lowpass_8       r0,  r1,  r2,  r3,  d0,  d1,  narrow=1
+         ext             v2.8B,      \r0\().8B, \r1\().8B, #2
+         ext             v3.8B,      \r0\().8B, \r1\().8B, #3
+         uaddl           v2.8H,      v2.8B,     v3.8B
+         ext             v4.8B,      \r0\().8B, \r1\().8B, #1
+         ext             v5.8B,      \r0\().8B, \r1\().8B, #4
+         uaddl           v4.8H,      v4.8B,     v5.8B
+         ext             v1.8B,      \r0\().8B, \r1\().8B, #5
+         uaddl           \d0\().8H,  \r0\().8B, v1.8B
+         ext             v0.8B,      \r2\().8B, \r3\().8B, #2
+         mla             \d0\().8H,  v2.8H,     v6.H[1]
+         ext             v1.8B,      \r2\().8B, \r3\().8B, #3
+         uaddl           v0.8H,      v0.8B,     v1.8B
+         ext             v1.8B,      \r2\().8B, \r3\().8B, #1
+         mls             \d0\().8H,  v4.8H,     v6.H[0]
+         ext             v3.8B,      \r2\().8B, \r3\().8B, #4
+         uaddl           v1.8H,      v1.8B,     v3.8B
+         ext             v2.8B,      \r2\().8B, \r3\().8B, #5
+         uaddl           \d1\().8H,  \r2\().8B, v2.8B
+         mla             \d1\().8H,  v0.8H,     v6.H[1]
+         mls             \d1\().8H,  v1.8H,     v6.H[0]
+   .if \narrow
+         sqrshrun        \d0\().8B,  \d0\().8H, #5
+         sqrshrun        \d1\().8B,  \d1\().8H, #5
+   .endif
+ .endm
+ //trashes v0-v5, v7, v30-v31
+ .macro  lowpass_8H      r0,  r1
+         ext             v0.16B,     \r0\().16B, \r0\().16B, #2
+         ext             v1.16B,     \r0\().16B, \r0\().16B, #3
+         uaddl           v0.8H,      v0.8B,      v1.8B
+         ext             v2.16B,     \r0\().16B, \r0\().16B, #1
+         ext             v3.16B,     \r0\().16B, \r0\().16B, #4
+         uaddl           v2.8H,      v2.8B,      v3.8B
+         ext             v30.16B,    \r0\().16B, \r0\().16B, #5
+         uaddl           \r0\().8H,  \r0\().8B,  v30.8B
+         ext             v4.16B,     \r1\().16B, \r1\().16B, #2
+         mla             \r0\().8H,  v0.8H,      v6.H[1]
+         ext             v5.16B,     \r1\().16B, \r1\().16B, #3
+         uaddl           v4.8H,      v4.8B,      v5.8B
+         ext             v7.16B,     \r1\().16B, \r1\().16B, #1
+         mls             \r0\().8H,  v2.8H,      v6.H[0]
+         ext             v0.16B,     \r1\().16B, \r1\().16B, #4
+         uaddl           v7.8H,      v7.8B,      v0.8B
+         ext             v31.16B,    \r1\().16B, \r1\().16B, #5
+         uaddl           \r1\().8H,  \r1\().8B,  v31.8B
+         mla             \r1\().8H,  v4.8H,      v6.H[1]
+         mls             \r1\().8H,  v7.8H,      v6.H[0]
+ .endm
+ // trashes v2-v5, v30
+ .macro  lowpass_8_1     r0,  r1,  d0,  narrow=1
+         ext             v2.8B,     \r0\().8B, \r1\().8B, #2
+         ext             v3.8B,     \r0\().8B, \r1\().8B, #3
+         uaddl           v2.8H,     v2.8B,     v3.8B
+         ext             v4.8B,     \r0\().8B, \r1\().8B, #1
+         ext             v5.8B,     \r0\().8B, \r1\().8B, #4
+         uaddl           v4.8H,     v4.8B,     v5.8B
+         ext             v30.8B,    \r0\().8B, \r1\().8B, #5
+         uaddl           \d0\().8H, \r0\().8B, v30.8B
+         mla             \d0\().8H, v2.8H,     v6.H[1]
+         mls             \d0\().8H, v4.8H,     v6.H[0]
+   .if \narrow
+         sqrshrun        \d0\().8B, \d0\().8H, #5
+   .endif
+ .endm
+ // trashed v0-v7
+ .macro  lowpass_8.16    r0,  r1,  r2
+         ext             v1.16B,     \r0\().16B, \r1\().16B, #4
+         ext             v0.16B,     \r0\().16B, \r1\().16B, #6
+         saddl           v5.4S,      v1.4H,      v0.4H
+         ext             v2.16B,     \r0\().16B, \r1\().16B, #2
+         saddl2          v1.4S,      v1.8H,      v0.8H
+         ext             v3.16B,     \r0\().16B, \r1\().16B, #8
+         saddl           v6.4S,      v2.4H,      v3.4H
+         ext             \r1\().16B, \r0\().16B, \r1\().16B, #10
+         saddl2          v2.4S,      v2.8H,      v3.8H
+         saddl           v0.4S,      \r0\().4H,  \r1\().4H
+         saddl2          v4.4S,      \r0\().8H,  \r1\().8H
+         shl             v3.4S,  v5.4S,  #4
+         shl             v5.4S,  v5.4S,  #2
+         shl             v7.4S,  v6.4S,  #2
+         add             v5.4S,  v5.4S,  v3.4S
+         add             v6.4S,  v6.4S,  v7.4S
+         shl             v3.4S,  v1.4S,  #4
+         shl             v1.4S,  v1.4S,  #2
+         shl             v7.4S,  v2.4S,  #2
+         add             v1.4S,  v1.4S,  v3.4S
+         add             v2.4S,  v2.4S,  v7.4S
+         add             v5.4S,  v5.4S,  v0.4S
+         sub             v5.4S,  v5.4S,  v6.4S
+         add             v1.4S,  v1.4S,  v4.4S
+         sub             v1.4S,  v1.4S,  v2.4S
+         rshrn           v5.4H,  v5.4S,  #10
+         rshrn2          v5.8H,  v1.4S,  #10
+         sqxtun          \r2\().8B,  v5.8H
+ .endm
+ function put_h264_qpel16_h_lowpass_neon_packed
+         mov             x4,  x30
+         mov             x12, #16
+         mov             x3,  #8
+         bl              put_h264_qpel8_h_lowpass_neon
+         sub             x1,  x1,  x2, lsl #4
+         add             x1,  x1,  #8
+         mov             x12, #16
+         mov             x30, x4
+         b               put_h264_qpel8_h_lowpass_neon
+ endfunc
+ .macro  h264_qpel_h_lowpass type
+ function \type\()_h264_qpel16_h_lowpass_neon
+         mov             x13, x30
+         mov             x12, #16
+         bl              \type\()_h264_qpel8_h_lowpass_neon
+         sub             x0,  x0,  x3, lsl #4
+         sub             x1,  x1,  x2, lsl #4
+         add             x0,  x0,  #8
+         add             x1,  x1,  #8
+         mov             x12, #16
+         mov             x30, x13
+ endfunc
+ function \type\()_h264_qpel8_h_lowpass_neon
+ 1:      ld1             {v28.8B, v29.8B}, [x1], x2
+         ld1             {v16.8B, v17.8B}, [x1], x2
+         subs            x12, x12, #2
+         lowpass_8       v28, v29, v16, v17, v28, v16
+   .ifc \type,avg
+         ld1             {v2.8B},    [x0], x3
+         urhadd          v28.8B, v28.8B,  v2.8B
+         ld1             {v3.8B},    [x0]
+         urhadd          v16.8B, v16.8B, v3.8B
+         sub             x0,  x0,  x3
+   .endif
+         st1             {v28.8B},    [x0], x3
+         st1             {v16.8B},    [x0], x3
+         b.ne            1b
+         ret
+ endfunc
+ .endm
+         h264_qpel_h_lowpass put
+         h264_qpel_h_lowpass avg
+ .macro  h264_qpel_h_lowpass_l2 type
+ function \type\()_h264_qpel16_h_lowpass_l2_neon
+         mov             x13, x30
+         mov             x12, #16
+         bl              \type\()_h264_qpel8_h_lowpass_l2_neon
+         sub             x0,  x0,  x2, lsl #4
+         sub             x1,  x1,  x2, lsl #4
+         sub             x3,  x3,  x2, lsl #4
+         add             x0,  x0,  #8
+         add             x1,  x1,  #8
+         add             x3,  x3,  #8
+         mov             x12, #16
+         mov             x30, x13
+ endfunc
+ function \type\()_h264_qpel8_h_lowpass_l2_neon
+ 1:      ld1             {v26.8B, v27.8B}, [x1], x2
+         ld1             {v16.8B, v17.8B}, [x1], x2
+         ld1             {v28.8B},     [x3], x2
+         ld1             {v29.8B},     [x3], x2
+         subs            x12, x12, #2
+         lowpass_8       v26, v27, v16, v17, v26, v27
+         urhadd          v26.8B, v26.8B, v28.8B
+         urhadd          v27.8B, v27.8B, v29.8B
+   .ifc \type,avg
+         ld1             {v2.8B},      [x0], x2
+         urhadd          v26.8B, v26.8B, v2.8B
+         ld1             {v3.8B},      [x0]
+         urhadd          v27.8B, v27.8B, v3.8B
+         sub             x0,  x0,  x2
+   .endif
+         st1             {v26.8B},     [x0], x2
+         st1             {v27.8B},     [x0], x2
+         b.ne            1b
+         ret
+ endfunc
+ .endm
+         h264_qpel_h_lowpass_l2 put
+         h264_qpel_h_lowpass_l2 avg
+ function put_h264_qpel16_v_lowpass_neon_packed
+         mov             x4,  x30
+         mov             x2,  #8
+         bl              put_h264_qpel8_v_lowpass_neon
+         sub             x1,  x1,  x3, lsl #2
+         bl              put_h264_qpel8_v_lowpass_neon
+         sub             x1,  x1,  x3, lsl #4
+         sub             x1,  x1,  x3, lsl #2
+         add             x1,  x1,  #8
+         bl              put_h264_qpel8_v_lowpass_neon
+         sub             x1,  x1,  x3, lsl #2
+         mov             x30, x4
+         b               put_h264_qpel8_v_lowpass_neon
+ endfunc
+ .macro  h264_qpel_v_lowpass type
+ function \type\()_h264_qpel16_v_lowpass_neon
+         mov             x4,  x30
+         bl              \type\()_h264_qpel8_v_lowpass_neon
+         sub             x1,  x1,  x3, lsl #2
+         bl              \type\()_h264_qpel8_v_lowpass_neon
+         sub             x0,  x0,  x2, lsl #4
+         add             x0,  x0,  #8
+         sub             x1,  x1,  x3, lsl #4
+         sub             x1,  x1,  x3, lsl #2
+         add             x1,  x1,  #8
+         bl              \type\()_h264_qpel8_v_lowpass_neon
+         sub             x1,  x1,  x3, lsl #2
+         mov             x30, x4
+ endfunc
+ function \type\()_h264_qpel8_v_lowpass_neon
+         ld1             {v16.8B}, [x1], x3
+         ld1             {v18.8B}, [x1], x3
+         ld1             {v20.8B}, [x1], x3
+         ld1             {v22.8B}, [x1], x3
+         ld1             {v24.8B}, [x1], x3
+         ld1             {v26.8B}, [x1], x3
+         ld1             {v28.8B}, [x1], x3
+         ld1             {v30.8B}, [x1], x3
+         ld1             {v17.8B}, [x1], x3
+         ld1             {v19.8B}, [x1], x3
+         ld1             {v21.8B}, [x1], x3
+         ld1             {v23.8B}, [x1], x3
+         ld1             {v25.8B}, [x1]
+         transpose_8x8B  v16, v18, v20, v22, v24, v26, v28, v30, v0,  v1
+         transpose_8x8B  v17, v19, v21, v23, v25, v27, v29, v31, v0,  v1
+         lowpass_8       v16, v17, v18, v19, v16, v17
+         lowpass_8       v20, v21, v22, v23, v18, v19
+         lowpass_8       v24, v25, v26, v27, v20, v21
+         lowpass_8       v28, v29, v30, v31, v22, v23
+         transpose_8x8B  v16, v17, v18, v19, v20, v21, v22, v23, v0,  v1
+   .ifc \type,avg
+         ld1             {v24.8B},  [x0], x2
+         urhadd          v16.8B, v16.8B, v24.8B
+         ld1             {v25.8B}, [x0], x2
+         urhadd          v17.8B, v17.8B, v25.8B
+         ld1             {v26.8B}, [x0], x2
+         urhadd          v18.8B, v18.8B, v26.8B
+         ld1             {v27.8B}, [x0], x2
+         urhadd          v19.8B, v19.8B, v27.8B
+         ld1             {v28.8B}, [x0], x2
+         urhadd          v20.8B, v20.8B, v28.8B
+         ld1             {v29.8B}, [x0], x2
+         urhadd          v21.8B, v21.8B, v29.8B
+         ld1             {v30.8B}, [x0], x2
+         urhadd          v22.8B, v22.8B, v30.8B
+         ld1             {v31.8B}, [x0], x2
+         urhadd          v23.8B, v23.8B, v31.8B
+         sub             x0,  x0,  x2,  lsl #3
+   .endif
+         st1             {v16.8B}, [x0], x2
+         st1             {v17.8B}, [x0], x2
+         st1             {v18.8B}, [x0], x2
+         st1             {v19.8B}, [x0], x2
+         st1             {v20.8B}, [x0], x2
+         st1             {v21.8B}, [x0], x2
+         st1             {v22.8B}, [x0], x2
+         st1             {v23.8B}, [x0], x2
+         ret
+ endfunc
+ .endm
+         h264_qpel_v_lowpass put
+         h264_qpel_v_lowpass avg
+ .macro  h264_qpel_v_lowpass_l2 type
+ function \type\()_h264_qpel16_v_lowpass_l2_neon
+         mov             x4,  x30
+         bl              \type\()_h264_qpel8_v_lowpass_l2_neon
+         sub             x1,  x1,  x3, lsl #2
+         bl              \type\()_h264_qpel8_v_lowpass_l2_neon
+         sub             x0,  x0,  x3, lsl #4
+         sub             x12, x12, x2, lsl #4
+         add             x0,  x0,  #8
+         add             x12, x12, #8
+         sub             x1,  x1,  x3, lsl #4
+         sub             x1,  x1,  x3, lsl #2
+         add             x1,  x1,  #8
+         bl              \type\()_h264_qpel8_v_lowpass_l2_neon
+         sub             x1,  x1,  x3, lsl #2
+         mov             x30, x4
+ endfunc
+ function \type\()_h264_qpel8_v_lowpass_l2_neon
+         ld1             {v16.8B}, [x1], x3
+         ld1             {v18.8B}, [x1], x3
+         ld1             {v20.8B}, [x1], x3
+         ld1             {v22.8B}, [x1], x3
+         ld1             {v24.8B}, [x1], x3
+         ld1             {v26.8B}, [x1], x3
+         ld1             {v28.8B}, [x1], x3
+         ld1             {v30.8B}, [x1], x3
+         ld1             {v17.8B}, [x1], x3
+         ld1             {v19.8B}, [x1], x3
+         ld1             {v21.8B}, [x1], x3
+         ld1             {v23.8B}, [x1], x3
+         ld1             {v25.8B}, [x1]
+         transpose_8x8B  v16, v18, v20, v22, v24, v26, v28, v30, v0,  v1
+         transpose_8x8B  v17, v19, v21, v23, v25, v27, v29, v31, v0,  v1
+         lowpass_8       v16, v17, v18, v19, v16, v17
+         lowpass_8       v20, v21, v22, v23, v18, v19
+         lowpass_8       v24, v25, v26, v27, v20, v21
+         lowpass_8       v28, v29, v30, v31, v22, v23
+         transpose_8x8B  v16, v17, v18, v19, v20, v21, v22, v23, v0,  v1
+         ld1             {v24.8B},  [x12], x2
+         ld1             {v25.8B},  [x12], x2
+         ld1             {v26.8B},  [x12], x2
+         ld1             {v27.8B},  [x12], x2
+         ld1             {v28.8B},  [x12], x2
+         urhadd          v16.8B, v24.8B, v16.8B
+         urhadd          v17.8B, v25.8B, v17.8B
+         ld1             {v29.8B},  [x12], x2
+         urhadd          v18.8B, v26.8B, v18.8B
+         urhadd          v19.8B, v27.8B, v19.8B
+         ld1             {v30.8B}, [x12], x2
+         urhadd          v20.8B, v28.8B, v20.8B
+         urhadd          v21.8B, v29.8B, v21.8B
+         ld1             {v31.8B}, [x12], x2
+         urhadd          v22.8B, v30.8B, v22.8B
+         urhadd          v23.8B, v31.8B, v23.8B
+   .ifc \type,avg
+         ld1             {v24.8B}, [x0], x3
+         urhadd          v16.8B, v16.8B, v24.8B
+         ld1             {v25.8B}, [x0], x3
+         urhadd          v17.8B, v17.8B, v25.8B
+         ld1             {v26.8B}, [x0], x3
+         urhadd          v18.8B, v18.8B, v26.8B
+         ld1             {v27.8B}, [x0], x3
+         urhadd          v19.8B, v19.8B, v27.8B
+         ld1             {v28.8B}, [x0], x3
+         urhadd          v20.8B, v20.8B, v28.8B
+         ld1             {v29.8B}, [x0], x3
+         urhadd          v21.8B, v21.8B, v29.8B
+         ld1             {v30.8B}, [x0], x3
+         urhadd          v22.8B, v22.8B, v30.8B
+         ld1             {v31.8B}, [x0], x3
+         urhadd          v23.8B, v23.8B, v31.8B
+         sub             x0,  x0,  x3,  lsl #3
+   .endif
+         st1             {v16.8B}, [x0], x3
+         st1             {v17.8B}, [x0], x3
+         st1             {v18.8B}, [x0], x3
+         st1             {v19.8B}, [x0], x3
+         st1             {v20.8B}, [x0], x3
+         st1             {v21.8B}, [x0], x3
+         st1             {v22.8B}, [x0], x3
+         st1             {v23.8B}, [x0], x3
+         ret
+ endfunc
+ .endm
+         h264_qpel_v_lowpass_l2 put
+         h264_qpel_v_lowpass_l2 avg
+ function put_h264_qpel8_hv_lowpass_neon_top
+         lowpass_const   w12
+         ld1             {v16.8H}, [x1], x3
+         ld1             {v17.8H}, [x1], x3
+         ld1             {v18.8H}, [x1], x3
+         ld1             {v19.8H}, [x1], x3
+         ld1             {v20.8H}, [x1], x3
+         ld1             {v21.8H}, [x1], x3
+         ld1             {v22.8H}, [x1], x3
+         ld1             {v23.8H}, [x1], x3
+         ld1             {v24.8H}, [x1], x3
+         ld1             {v25.8H}, [x1], x3
+         ld1             {v26.8H}, [x1], x3
+         ld1             {v27.8H}, [x1], x3
+         ld1             {v28.8H}, [x1]
+         lowpass_8H      v16, v17
+         lowpass_8H      v18, v19
+         lowpass_8H      v20, v21
+         lowpass_8H      v22, v23
+         lowpass_8H      v24, v25
+         lowpass_8H      v26, v27
+         lowpass_8H      v28, v29
+         transpose_8x8H  v16, v17, v18, v19, v20, v21, v22, v23, v0,  v1
+         transpose_8x8H  v24, v25, v26, v27, v28, v29, v30, v31, v0,  v1
+         lowpass_8.16    v16, v24, v16
+         lowpass_8.16    v17, v25, v17
+         lowpass_8.16    v18, v26, v18
+         lowpass_8.16    v19, v27, v19
+         lowpass_8.16    v20, v28, v20
+         lowpass_8.16    v21, v29, v21
+         lowpass_8.16    v22, v30, v22
+         lowpass_8.16    v23, v31, v23
+         transpose_8x8B v16, v17, v18, v19, v20, v21, v22, v23, v0,  v1
+         ret
+ endfunc
+ .macro  h264_qpel8_hv_lowpass type
+ function \type\()_h264_qpel8_hv_lowpass_neon
+         mov             x10, x30
+         bl              put_h264_qpel8_hv_lowpass_neon_top
+   .ifc \type,avg
+         ld1             {v0.8B},      [x0], x2
+         urhadd          v16.8B, v16.8B, v0.8B
+         ld1             {v1.8B},      [x0], x2
+         urhadd          v17.8B, v17.8B, v1.8B
+         ld1             {v2.8B},      [x0], x2
+         urhadd          v18.8B, v18.8B, v2.8B
+         ld1             {v3.8B},      [x0], x2
+         urhadd          v19.8B, v19.8B, v3.8B
+         ld1             {v4.8B},      [x0], x2
+         urhadd          v20.8B, v20.8B, v4.8B
+         ld1             {v5.8B},      [x0], x2
+         urhadd          v21.8B, v21.8B, v5.8B
+         ld1             {v6.8B},      [x0], x2
+         urhadd          v22.8B, v22.8B, v6.8B
+         ld1             {v7.8B},      [x0], x2
+         urhadd          v23.8B, v23.8B, v7.8B
+         sub             x0,  x0,  x2,  lsl #3
+   .endif
+         st1             {v16.8B},     [x0], x2
+         st1             {v17.8B},     [x0], x2
+         st1             {v18.8B},     [x0], x2
+         st1             {v19.8B},     [x0], x2
+         st1             {v20.8B},     [x0], x2
+         st1             {v21.8B},     [x0], x2
+         st1             {v22.8B},     [x0], x2
+         st1             {v23.8B},     [x0], x2
+         ret             x10
+ endfunc
+ .endm
+         h264_qpel8_hv_lowpass put
+         h264_qpel8_hv_lowpass avg
+ .macro  h264_qpel8_hv_lowpass_l2 type
+ function \type\()_h264_qpel8_hv_lowpass_l2_neon
+         mov             x10, x30
+         bl              put_h264_qpel8_hv_lowpass_neon_top
+         ld1             {v0.8B, v1.8B},  [x2], #16
+         ld1             {v2.8B, v3.8B},  [x2], #16
+         urhadd          v0.8B,  v0.8B,  v16.8B
+         urhadd          v1.8B,  v1.8B,  v17.8B
+         ld1             {v4.8B, v5.8B},  [x2], #16
+         urhadd          v2.8B,  v2.8B,  v18.8B
+         urhadd          v3.8B,  v3.8B,  v19.8B
+         ld1             {v6.8B, v7.8B},  [x2], #16
+         urhadd          v4.8B,  v4.8B,  v20.8B
+         urhadd          v5.8B,  v5.8B,  v21.8B
+         urhadd          v6.8B,  v6.8B,  v22.8B
+         urhadd          v7.8B,  v7.8B,  v23.8B
+   .ifc \type,avg
+         ld1             {v16.8B},     [x0], x3
+         urhadd          v0.8B,  v0.8B,  v16.8B
+         ld1             {v17.8B},     [x0], x3
+         urhadd          v1.8B,  v1.8B,  v17.8B
+         ld1             {v18.8B},     [x0], x3
+         urhadd          v2.8B,  v2.8B,  v18.8B
+         ld1             {v19.8B},     [x0], x3
+         urhadd          v3.8B,  v3.8B,  v19.8B
+         ld1             {v20.8B},     [x0], x3
+         urhadd          v4.8B,  v4.8B,  v20.8B
+         ld1             {v21.8B},     [x0], x3
+         urhadd          v5.8B,  v5.8B,  v21.8B
+         ld1             {v22.8B},     [x0], x3
+         urhadd          v6.8B,  v6.8B,  v22.8B
+         ld1             {v23.8B},     [x0], x3
+         urhadd          v7.8B,  v7.8B,  v23.8B
+         sub             x0,  x0,  x3,  lsl #3
+   .endif
+         st1             {v0.8B},      [x0], x3
+         st1             {v1.8B},      [x0], x3
+         st1             {v2.8B},      [x0], x3
+         st1             {v3.8B},      [x0], x3
+         st1             {v4.8B},      [x0], x3
+         st1             {v5.8B},      [x0], x3
+         st1             {v6.8B},      [x0], x3
+         st1             {v7.8B},      [x0], x3
+         ret             x10
+ endfunc
+ .endm
+         h264_qpel8_hv_lowpass_l2 put
+         h264_qpel8_hv_lowpass_l2 avg
+ .macro  h264_qpel16_hv  type
+ function \type\()_h264_qpel16_hv_lowpass_neon
+         mov             x13, x30
+         bl              \type\()_h264_qpel8_hv_lowpass_neon
+         sub             x1,  x1,  x3, lsl #2
+         bl              \type\()_h264_qpel8_hv_lowpass_neon
+         sub             x1,  x1,  x3, lsl #4
+         sub             x1,  x1,  x3, lsl #2
+         add             x1,  x1,  #8
+         sub             x0,  x0,  x2, lsl #4
+         add             x0,  x0,  #8
+         bl              \type\()_h264_qpel8_hv_lowpass_neon
+         sub             x1,  x1,  x3, lsl #2
+         mov             x30, x13
+         b               \type\()_h264_qpel8_hv_lowpass_neon
+ endfunc
+ function \type\()_h264_qpel16_hv_lowpass_l2_neon
+         mov             x13, x30
+         sub             x2,  x4,  #256
+         bl              \type\()_h264_qpel8_hv_lowpass_l2_neon
+         sub             x1,  x1,  x3, lsl #2
+         bl              \type\()_h264_qpel8_hv_lowpass_l2_neon
+         sub             x1,  x1,  x3, lsl #4
+         sub             x1,  x1,  x3, lsl #2
+         add             x1,  x1,  #8
+         sub             x0,  x0,  x3, lsl #4
+         add             x0,  x0,  #8
+         bl              \type\()_h264_qpel8_hv_lowpass_l2_neon
+         sub             x1,  x1,  x3, lsl #2
+         mov             x30, x13
+         b               \type\()_h264_qpel8_hv_lowpass_l2_neon
+ endfunc
+ .endm
+         h264_qpel16_hv put
+         h264_qpel16_hv avg
+ .macro  h264_qpel8      type
+ function ff_\type\()_h264_qpel8_mc10_neon, export=1
+         lowpass_const   w3
+         mov             x3,  x1
+         sub             x1,  x1,  #2
+         mov             x12, #8
+         b               \type\()_h264_qpel8_h_lowpass_l2_neon
+ endfunc
+ function ff_\type\()_h264_qpel8_mc20_neon, export=1
+         lowpass_const   w3
+         sub             x1,  x1,  #2
+         mov             x3,  x2
+         mov             x12, #8
+         b               \type\()_h264_qpel8_h_lowpass_neon
+ endfunc
+ function ff_\type\()_h264_qpel8_mc30_neon, export=1
+         lowpass_const   w3
+         add             x3,  x1,  #1
+         sub             x1,  x1,  #2
+         mov             x12, #8
+         b               \type\()_h264_qpel8_h_lowpass_l2_neon
+ endfunc
+ function ff_\type\()_h264_qpel8_mc01_neon, export=1
+         mov             x14, x30
+         mov             x12, x1
+ \type\()_h264_qpel8_mc01:
+         lowpass_const   w3
+         mov             x3,  x2
+         sub             x1,  x1,  x2, lsl #1
+         bl              \type\()_h264_qpel8_v_lowpass_l2_neon
+         ret             x14
+ endfunc
+ function ff_\type\()_h264_qpel8_mc11_neon, export=1
+         mov             x14, x30
+         mov             x8,  x0
+         mov             x9,  x1
+ \type\()_h264_qpel8_mc11:
+         lowpass_const   w3
+         mov             x11, sp
+         sub             sp,  sp,  #64
+         mov             x0,  sp
+         sub             x1,  x1,  #2
+         mov             x3,  #8
+         mov             x12, #8
+         bl              put_h264_qpel8_h_lowpass_neon
+         mov             x0,  x8
+         mov             x3,  x2
+         mov             x12, sp
+         sub             x1,  x9,  x2, lsl #1
+         mov             x2,  #8
+         bl              \type\()_h264_qpel8_v_lowpass_l2_neon
+         mov             sp,  x11
+         ret             x14
+ endfunc
+ function ff_\type\()_h264_qpel8_mc21_neon, export=1
+         mov             x14, x30
+         mov             x8,  x0
+         mov             x9,  x1
+ \type\()_h264_qpel8_mc21:
+         lowpass_const   w3
+         mov             x11, sp
+         sub             sp,  sp,  #(8*8+16*12)
+         sub             x1,  x1,  #2
+         mov             x3,  #8
+         mov             x0,  sp
+         mov             x12, #8
+         bl              put_h264_qpel8_h_lowpass_neon
+         mov             x4,  x0
+         mov             x0,  x8
+         sub             x1,  x9,  x2, lsl #1
+         sub             x1,  x1,  #2
+         mov             x3,  x2
+         sub             x2,  x4,  #64
+         bl              \type\()_h264_qpel8_hv_lowpass_l2_neon
+         mov             sp,  x11
+         ret             x14
+ endfunc
+ function ff_\type\()_h264_qpel8_mc31_neon, export=1
+         add             x1,  x1,  #1
+         mov             x14, x30
+         mov             x8,  x0
+         mov             x9,  x1
+         sub             x1,  x1,  #1
+         b               \type\()_h264_qpel8_mc11
+ endfunc
+ function ff_\type\()_h264_qpel8_mc02_neon, export=1
+         mov             x14, x30
+         lowpass_const   w3
+         sub             x1,  x1,  x2, lsl #1
+         mov             x3,  x2
+         bl              \type\()_h264_qpel8_v_lowpass_neon
+         ret             x14
+ endfunc
+ function ff_\type\()_h264_qpel8_mc12_neon, export=1
+         mov             x14, x30
+         mov             x8,  x0
+         mov             x9,  x1
+ \type\()_h264_qpel8_mc12:
+         lowpass_const   w3
+         mov             x11, sp
+         sub             sp,  sp,  #(8*8+16*12)
+         sub             x1,  x1,  x2, lsl #1
+         mov             x3,  x2
+         mov             x2,  #8
+         mov             x0,  sp
+         bl              put_h264_qpel8_v_lowpass_neon
+         mov             x4,  x0
+         mov             x0,  x8
+         sub             x1,  x9,  x3, lsl #1
+         sub             x1,  x1,  #2
+         sub             x2,  x4,  #64
+         bl              \type\()_h264_qpel8_hv_lowpass_l2_neon
+         mov             sp,  x11
+         ret             x14
+ endfunc
+ function ff_\type\()_h264_qpel8_mc22_neon, export=1
+         mov             x14, x30
+         mov             x11, sp
+         sub             x1,  x1,  x2, lsl #1
+         sub             x1,  x1,  #2
+         mov             x3,  x2
+         bl              \type\()_h264_qpel8_hv_lowpass_neon
+         mov             sp,  x11
+         ret             x14
+ endfunc
+ function ff_\type\()_h264_qpel8_mc32_neon, export=1
+         mov             x14, x30
+         mov             x8,  x0
+         mov             x9,  x1
+         add             x1,  x1,  #1
+         b               \type\()_h264_qpel8_mc12
+ endfunc
+ function ff_\type\()_h264_qpel8_mc03_neon, export=1
+         mov             x14, x30
+         add             x12, x1,  x2
+         b               \type\()_h264_qpel8_mc01
+ endfunc
+ function ff_\type\()_h264_qpel8_mc13_neon, export=1
+         mov             x14, x30
+         mov             x8,  x0
+         mov             x9,  x1
+         add             x1,  x1,  x2
+         b               \type\()_h264_qpel8_mc11
+ endfunc
+ function ff_\type\()_h264_qpel8_mc23_neon, export=1
+         mov             x14, x30
+         mov             x8,  x0
+         mov             x9,  x1
+         add             x1,  x1,  x2
+         b               \type\()_h264_qpel8_mc21
+ endfunc
+ function ff_\type\()_h264_qpel8_mc33_neon, export=1
+         add             x1,  x1,  #1
+         mov             x14, x30
+         mov             x8,  x0
+         mov             x9,  x1
+         add             x1,  x1,  x2
+         sub             x1,  x1,  #1
+         b               \type\()_h264_qpel8_mc11
+ endfunc
+ .endm
+         h264_qpel8 put
+         h264_qpel8 avg
+ .macro  h264_qpel16     type
+ function ff_\type\()_h264_qpel16_mc10_neon, export=1
+         lowpass_const   w3
+         mov             x3,  x1
+         sub             x1,  x1,  #2
+         b               \type\()_h264_qpel16_h_lowpass_l2_neon
+ endfunc
+ function ff_\type\()_h264_qpel16_mc20_neon, export=1
+         lowpass_const   w3
+         sub             x1,  x1,  #2
+         mov             x3,  x2
+         b               \type\()_h264_qpel16_h_lowpass_neon
+ endfunc
+ function ff_\type\()_h264_qpel16_mc30_neon, export=1
+         lowpass_const   w3
+         add             x3,  x1,  #1
+         sub             x1,  x1,  #2
+         b               \type\()_h264_qpel16_h_lowpass_l2_neon
+ endfunc
+ function ff_\type\()_h264_qpel16_mc01_neon, export=1
+         mov             x14, x30
+         mov             x12, x1
+ \type\()_h264_qpel16_mc01:
+         lowpass_const   w3
+         mov             x3,  x2
+         sub             x1,  x1,  x2, lsl #1
+         bl              \type\()_h264_qpel16_v_lowpass_l2_neon
+         ret             x14
+ endfunc
+ function ff_\type\()_h264_qpel16_mc11_neon, export=1
+         mov             x14, x30
+         mov             x8,  x0
+         mov             x9,  x1
+ \type\()_h264_qpel16_mc11:
+         lowpass_const   w3
+         mov             x11, sp
+         sub             sp,  sp,  #256
+         mov             x0,  sp
+         sub             x1,  x1,  #2
+         mov             x3,  #16
+         bl              put_h264_qpel16_h_lowpass_neon
+         mov             x0,  x8
+         mov             x3,  x2
+         mov             x12, sp
+         sub             x1,  x9,  x2, lsl #1
+         mov             x2,  #16
+         bl              \type\()_h264_qpel16_v_lowpass_l2_neon
+         mov             sp,  x11
+         ret             x14
+ endfunc
+ function ff_\type\()_h264_qpel16_mc21_neon, export=1
+         mov             x14, x30
+         mov             x8,  x0
+         mov             x9,  x1
+ \type\()_h264_qpel16_mc21:
+         lowpass_const   w3
+         mov             x11, sp
+         sub             sp,  sp,  #(16*16+16*12)
+         sub             x1,  x1,  #2
+         mov             x0,  sp
+         bl              put_h264_qpel16_h_lowpass_neon_packed
+         mov             x4,  x0
+         mov             x0,  x8
+         sub             x1,  x9,  x2, lsl #1
+         sub             x1,  x1,  #2
+         mov             x3,  x2
+         bl              \type\()_h264_qpel16_hv_lowpass_l2_neon
+         mov             sp,  x11
+         ret             x14
+ endfunc
+ function ff_\type\()_h264_qpel16_mc31_neon, export=1
+         add             x1,  x1,  #1
+         mov             x14, x30
+         mov             x8,  x0
+         mov             x9,  x1
+         sub             x1,  x1,  #1
+         b               \type\()_h264_qpel16_mc11
+ endfunc
+ function ff_\type\()_h264_qpel16_mc02_neon, export=1
+         mov             x14, x30
+         lowpass_const   w3
+         sub             x1,  x1,  x2, lsl #1
+         mov             x3,  x2
+         bl              \type\()_h264_qpel16_v_lowpass_neon
+         ret             x14
+ endfunc
+ function ff_\type\()_h264_qpel16_mc12_neon, export=1
+         mov             x14, x30
+         mov             x8,  x0
+         mov             x9,  x1
+ \type\()_h264_qpel16_mc12:
+         lowpass_const   w3
+         mov             x11, sp
+         sub             sp,  sp,  #(16*16+16*12)
+         sub             x1,  x1,  x2, lsl #1
+         mov             x0,  sp
+         mov             x3,  x2
+         bl              put_h264_qpel16_v_lowpass_neon_packed
+         mov             x4,  x0
+         mov             x0,  x8
+         sub             x1,  x9,  x3, lsl #1
+         sub             x1,  x1,  #2
+         mov             x2,  x3
+         bl              \type\()_h264_qpel16_hv_lowpass_l2_neon
+         mov             sp,  x11
+         ret             x14
+ endfunc
+ function ff_\type\()_h264_qpel16_mc22_neon, export=1
+         mov             x14, x30
+         lowpass_const   w3
+         mov             x11, sp
+         sub             x1,  x1,  x2, lsl #1
+         sub             x1,  x1,  #2
+         mov             x3,  x2
+         bl              \type\()_h264_qpel16_hv_lowpass_neon
+         mov             sp,  x11 // restore stack
+         ret             x14
+ endfunc
+ function ff_\type\()_h264_qpel16_mc32_neon, export=1
+         mov             x14, x30
+         mov             x8,  x0
+         mov             x9,  x1
+         add             x1,  x1,  #1
+         b               \type\()_h264_qpel16_mc12
+ endfunc
+ function ff_\type\()_h264_qpel16_mc03_neon, export=1
+         mov             x14, x30
+         add             x12, x1,  x2
+         b               \type\()_h264_qpel16_mc01
+ endfunc
+ function ff_\type\()_h264_qpel16_mc13_neon, export=1
+         mov             x14, x30
+         mov             x8,  x0
+         mov             x9,  x1
+         add             x1,  x1,  x2
+         b               \type\()_h264_qpel16_mc11
+ endfunc
+ function ff_\type\()_h264_qpel16_mc23_neon, export=1
+         mov             x14, x30
+         mov             x8,  x0
+         mov             x9,  x1
+         add             x1,  x1,  x2
+         b               \type\()_h264_qpel16_mc21
+ endfunc
+ function ff_\type\()_h264_qpel16_mc33_neon, export=1
+         add             x1,  x1,  #1
+         mov             x14, x30
+         mov             x8,  x0
+         mov             x9,  x1
+         add             x1,  x1,  x2
+         sub             x1,  x1,  #1
+         b               \type\()_h264_qpel16_mc11
+ endfunc
+ .endm
+         h264_qpel16 put
+         h264_qpel16 avg
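
As a rough scalar model (not part of the patch), the lowpass_8/lowpass_8_1
macros above evaluate the standard H.264 six-tap luma filter
(1, -5, 20, 20, -5, 1), followed by rounding, a shift by 5 and unsigned
saturation — exactly what the mla/mls against v6.H[1] = 20 and v6.H[0] = 5 plus
the final sqrshrun #5 compute. The helper names below are made up for
illustration:

    /* One output byte of the 6-tap half-pel filter; s points 2 bytes
     * left of the sample being interpolated. */
    static uint8_t h264_lowpass_tap(const uint8_t *s)
    {
        int v = (s[0] + s[5]) - 5 * (s[1] + s[4]) + 20 * (s[2] + s[3]);
        v = (v + 16) >> 5;                        /* round and narrow */
        return v < 0 ? 0 : v > 255 ? 255 : v;     /* unsigned saturation */
    }

    /* One 8-pixel row, as put_h264_qpel8_h_lowpass_neon produces for the
     * horizontal half-pel (mc20) case. */
    static void h264_qpel8_h_lowpass_row(uint8_t *dst, const uint8_t *src)
    {
        for (int i = 0; i < 8; i++)
            dst[i] = h264_lowpass_tap(src + i);
    }

The hv variants run this filter twice (horizontally at 16-bit precision, then
vertically with 32-bit accumulation before the final rounding), and the _l2
variants average the filtered result with a second reference, which is how the
remaining quarter-pel positions are built.
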
Simple merge
diff --combined libavcodec/h264qpel.c
@@@ -88,14 -76,10 +88,16 @@@ av_cold void ff_h264qpel_init(H264QpelC
      case 10:
          SET_QPEL(10);
          break;
 +    case 12:
 +        SET_QPEL(12);
 +        break;
 +    case 14:
 +        SET_QPEL(14);
 +        break;
      }
  
+     if (ARCH_AARCH64)
+         ff_h264qpel_init_aarch64(c, bit_depth);
      if (ARCH_ARM)
          ff_h264qpel_init_arm(c, bit_depth);
      if (ARCH_PPC)
Simple merge