Merge commit 'd3f5b94762fb803c0f3b29f9ad6c5eaa813998ba'
author    Michael Niedermayer <michaelni@gmx.at>
          Thu, 15 May 2014 19:13:31 +0000 (21:13 +0200)
committer Michael Niedermayer <michaelni@gmx.at>
          Thu, 15 May 2014 19:13:53 +0000 (21:13 +0200)
* commit 'd3f5b94762fb803c0f3b29f9ad6c5eaa813998ba':
  aarch64: opus NEON iMDCT and FFT

Merged-by: Michael Niedermayer <michaelni@gmx.at>
libavcodec/aarch64/asm-offsets.h
libavcodec/aarch64/opus_imdct_init.c
libavcodec/aarch64/opus_imdct_neon.S
libavcodec/opus.h
libavcodec/opus_celt.c
libavcodec/opus_imdct.c
libavcodec/opus_imdct.h

diff --cc libavcodec/aarch64/asm-offsets.h
index 0000000,45b5c40..8defd7c
mode 000000,100644..100644
--- /dev/null
+++ b/libavcodec/aarch64/asm-offsets.h
@@@ -1,0 -1,30 +1,30 @@@
+ /*
 - * This file is part of Libav.
++ * This file is part of FFmpeg.
+  *
 - * Libav is free software; you can redistribute it and/or
++ * FFmpeg is free software; you can redistribute it and/or
+  * modify it under the terms of the GNU Lesser General Public
+  * License as published by the Free Software Foundation; either
+  * version 2.1 of the License, or (at your option) any later version.
+  *
 - * Libav is distributed in the hope that it will be useful,
++ * FFmpeg is distributed in the hope that it will be useful,
+  * but WITHOUT ANY WARRANTY; without even the implied warranty of
+  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  * Lesser General Public License for more details.
+  *
+  * You should have received a copy of the GNU Lesser General Public
 - * License along with Libav; if not, write to the Free Software
++ * License along with FFmpeg; if not, write to the Free Software
+  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+  */
+ #ifndef AVCODEC_AARCH64_ASM_OFFSETS_H
+ #define AVCODEC_AARCH64_ASM_OFFSETS_H
+ /* CeltIMDCTContext */
+ #define CELT_EXPTAB                     0x20
+ #define CELT_FFT_N                      0x00
+ #define CELT_LEN2                       0x04
+ #define CELT_LEN4                       (CELT_LEN2 + 0x4)   // loaded as pair
+ #define CELT_TMP                        0x10
+ #define CELT_TWIDDLE                    (CELT_TMP + 0x8)    // loaded as pair
+ #endif /* AVCODEC_AARCH64_ASM_OFFSETS_H */
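The constants above mirror the layout of CeltIMDCTContext so the assembly can address struct fields directly; they would go stale silently if the struct changed, which is why the init file in the next diff pins each one with AV_CHECK_OFFSET. As a rough sketch of that idea only (assuming a C11 compiler; AV_CHECK_OFFSET itself comes from the FFmpeg tree, not from this snippet):

    #include <assert.h>   /* static_assert (C11) */
    #include <stddef.h>   /* offsetof */

    /* Hypothetical stand-in for AV_CHECK_OFFSET: the build fails as soon
     * as an assembly-side offset drifts from the real struct layout. */
    #define CHECK_OFFSET(type, member, value) \
        static_assert(offsetof(type, member) == (value), \
                      "asm offset for " #member " is out of sync")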
diff --cc libavcodec/aarch64/opus_imdct_init.c
index 0000000,1a776dc..3fa9a11
mode 000000,100644..100644
--- /dev/null
+++ b/libavcodec/aarch64/opus_imdct_init.c
@@@ -1,0 -1,45 +1,45 @@@
+ /*
 - * This file is part of Libav.
++ * This file is part of FFmpeg.
+  *
 - * Libav is free software; you can redistribute it and/or
++ * FFmpeg is free software; you can redistribute it and/or
+  * modify it under the terms of the GNU Lesser General Public
+  * License as published by the Free Software Foundation; either
+  * version 2.1 of the License, or (at your option) any later version.
+  *
 - * Libav is distributed in the hope that it will be useful,
++ * FFmpeg is distributed in the hope that it will be useful,
+  * but WITHOUT ANY WARRANTY; without even the implied warranty of
+  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  * Lesser General Public License for more details.
+  *
+  * You should have received a copy of the GNU Lesser General Public
 - * License along with Libav; if not, write to the Free Software
++ * License along with FFmpeg; if not, write to the Free Software
+  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+  */
+ #include <stddef.h>
+ #include "libavutil/cpu.h"
+ #include "libavutil/aarch64/cpu.h"
+ #include "libavutil/internal.h"
+ #include "libavcodec/opus_imdct.h"
+ #include "asm-offsets.h"
+ AV_CHECK_OFFSET(CeltIMDCTContext, exptab,         CELT_EXPTAB);
+ AV_CHECK_OFFSET(CeltIMDCTContext, fft_n,          CELT_FFT_N);
+ AV_CHECK_OFFSET(CeltIMDCTContext, len2,           CELT_LEN2);
+ AV_CHECK_OFFSET(CeltIMDCTContext, len4,           CELT_LEN4);
+ AV_CHECK_OFFSET(CeltIMDCTContext, tmp,            CELT_TMP);
+ AV_CHECK_OFFSET(CeltIMDCTContext, twiddle_exptab, CELT_TWIDDLE);
+ void ff_celt_imdct_half_neon(CeltIMDCTContext *s, float *dst, const float *src,
+                              ptrdiff_t stride, float scale);
+ void ff_celt_imdct_init_aarch64(CeltIMDCTContext *s)
+ {
+     int cpu_flags = av_get_cpu_flags();
+     if (have_neon(cpu_flags)) {
+         s->imdct_half = ff_celt_imdct_half_neon;
+     }
+ }
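The generic allocator lives in opus_imdct.c (shown only as "Simple merge" below), but the init above follows the usual FFmpeg dispatch pattern: the C init installs a portable fallback, then the per-arch init may overwrite the function pointer. A hedged sketch, where every name except ff_celt_imdct_init_aarch64 and the struct fields is an assumption of this sketch:

    int ff_celt_imdct_init(CeltIMDCTContext **s, int N)
    {
        CeltIMDCTContext *ctx = av_mallocz(sizeof(*ctx));
        if (!ctx)
            return AVERROR(ENOMEM);
        /* ... compute fft_n/len2/len4, fill tmp/exptab/twiddle_exptab ... */
        ctx->imdct_half = celt_imdct_half_c;   /* hypothetical C fallback */
        if (ARCH_AARCH64)
            ff_celt_imdct_init_aarch64(ctx);   /* may install the NEON version */
        *s = ctx;
        return 0;
    }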
diff --cc libavcodec/aarch64/opus_imdct_neon.S
index 0000000,6b06396..ff1f705
mode 000000,100644..100644
--- /dev/null
+++ b/libavcodec/aarch64/opus_imdct_neon.S
@@@ -1,0 -1,647 +1,647 @@@
+ /*
+  * Copyright (c) 2014 Janne Grunau <janne-libav@jannau.net>
+  *
 - * This file is part of Libav.
++ * This file is part of FFmpeg.
+  *
 - * Libav is free software; you can redistribute it and/or
++ * FFmpeg is free software; you can redistribute it and/or
+  * modify it under the terms of the GNU Lesser General Public
+  * License as published by the Free Software Foundation; either
+  * version 2.1 of the License, or (at your option) any later version.
+  *
 - * Libav is distributed in the hope that it will be useful,
++ * FFmpeg is distributed in the hope that it will be useful,
+  * but WITHOUT ANY WARRANTY; without even the implied warranty of
+  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  * Lesser General Public License for more details.
+  *
+  * You should have received a copy of the GNU Lesser General Public
 - * License along with Libav; if not, write to the Free Software
++ * License along with FFmpeg; if not, write to the Free Software
+  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+  */
+ #include "libavutil/aarch64/asm.S"
+ #include "asm-offsets.h"
+ .macro shuffle a, b, c, d
+ const shuffle_\a\b\c\d align=4
+         .byte (\a * 4), (\a * 4 + 1), (\a * 4 + 2), (\a * 4 + 3)
+         .byte (\b * 4), (\b * 4 + 1), (\b * 4 + 2), (\b * 4 + 3)
+         .byte (\c * 4), (\c * 4 + 1), (\c * 4 + 2), (\c * 4 + 3)
+         .byte (\d * 4), (\d * 4 + 1), (\d * 4 + 2), (\d * 4 + 3)
+ endconst
+ .endm
+ shuffle 0, 2, 1, 3
+ shuffle 1, 0, 3, 2
+ shuffle 2, 3, 0, 1
+ shuffle 3, 1, 2, 0
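Each shuffle_abcd table holds the byte indices that let the TBL instructions below permute four 32-bit lanes in one instruction. Written out in C for illustration (this mirrors the macro arithmetic; it is not code from the tree):

    /* shuffle 0, 2, 1, 3  ==>  lane order {0, 2, 1, 3} */
    static const unsigned char shuffle_0213[16] = {
         0,  1,  2,  3,    /* lane 0 */
         8,  9, 10, 11,    /* lane 2 */
         4,  5,  6,  7,    /* lane 1 */
        12, 13, 14, 15,    /* lane 3 */
    };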
+ function fft5_neon
+         lsl             x2,  x2,  #3
+         ld1             {v24.2s},         [x1],  x2
+         ld2             {v25.s,v26.s}[0], [x1],  x2
+         ld2             {v25.s,v26.s}[1], [x1],  x2
+         ld2             {v25.s,v26.s}[2], [x1],  x2
+         ld2             {v25.s,v26.s}[3], [x1]
+         dup             v6.4s,  v24.s[0]
+         dup             v7.4s,  v24.s[1]
+         faddp           v0.4s,  v25.4s, v26.4s
+         // z[][0], z[][3]
+         fmul            v16.4s, v25.4s, v15.s[0] // rr
+         fmul            v17.4s, v25.4s, v15.s[1] // ri
+         fmul            v18.4s, v26.4s, v15.s[0] // ir
+         fmul            v19.4s, v26.4s, v15.s[1] // ii
+         faddp           v0.4s,  v0.4s,  v0.4s
+         // z[][1], z[][2]
+         fmul            v20.4s, v25.4s, v15.s[2] // rr
+         fmul            v21.4s, v25.4s, v15.s[3] // ri
+         fmul            v22.4s, v26.4s, v15.s[2] // ir
+         fmul            v23.4s, v26.4s, v15.s[3] // ii
+         fadd            v0.2s,  v24.2s, v0.2s   // out[0]
+         // z[0123][0], z[0123][3]
+         fsub            v24.4s, v16.4s, v19.4s  //    (c).re =  rr - ii;
+         fadd            v27.4s, v16.4s, v19.4s  //    (d).re =  rr + ii;
+         ld1             {v16.16b},  [x11]
+         ld1             {v19.16b},  [x14]
+         fadd            v28.4s, v17.4s, v18.4s  //    (c).im =  ri + ir;
+         fsub            v31.4s, v18.4s, v17.4s  //    (d).im = -ri + ir;
+         ld1             {v17.16b},  [x12]
+         // z[0123][1], z[0123][2]
+         fsub            v25.4s, v20.4s, v23.4s  //    (c).re =  rr - ii;
+         fadd            v26.4s, v20.4s, v23.4s  //    (d).re =  rr + ii;
+         ld1             {v18.16b},  [x13]
+         fadd            v29.4s, v21.4s, v22.4s  //    (c).im =  ri + ir;
+         fsub            v30.4s, v22.4s, v21.4s  //    (d).im = -ri + ir;
+         //real
+         tbl             v20.16b, {v24.16b}, v16.16b
+         tbl             v21.16b, {v25.16b}, v17.16b
+         tbl             v22.16b, {v26.16b}, v18.16b
+         tbl             v23.16b, {v27.16b}, v19.16b
+         //imag
+         tbl             v16.16b, {v28.16b}, v16.16b
+         tbl             v17.16b, {v29.16b}, v17.16b
+         tbl             v18.16b, {v30.16b}, v18.16b
+         tbl             v19.16b, {v31.16b}, v19.16b
+         fadd            v6.4s,  v6.4s,  v20.4s
+         fadd            v22.4s, v22.4s, v23.4s
+         fadd            v7.4s,  v7.4s,  v16.4s
+         fadd            v18.4s, v18.4s, v19.4s
+         fadd            v21.4s, v21.4s, v22.4s
+         fadd            v17.4s, v17.4s, v18.4s
+         fadd            v6.4s,  v6.4s,  v21.4s
+         fadd            v7.4s,  v7.4s,  v17.4s
+         ret
+ endfunc
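fft5_neon needs only the two constants in fact5 (defined at the end of this file) because every 5-point twiddle e^{2πink/5} is either exp(2πi/5), exp(2πi·2/5), or a conjugate of one of them; conjugation flips the sign of the imaginary part, which is what produces the paired rr - ii / rr + ii and ri + ir / ir - ri combinations in the comments above. Under the textbook convention (the code's actual sign convention is not restated here), the transform being vectorized is

    X_k = \sum_{n=0}^{4} x_n e^{2\pi i n k / 5},   k = 0, ..., 4

with X_0 degenerating to the plain sum of the inputs, i.e. the faddp/fadd chain that yields out[0].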
+ function fft15_neon
+         mov             x8,  x1
+         mov             x9,  x30
+         add             x2,  x3,  x3,  lsl #1   // 3 * stride
+         add             x1,  x8,  x3,  lsl #3   // in + 1 * stride
+         bl              fft5_neon
+         mov             v1.8b,   v0.8b
+         mov             v2.16b,  v6.16b
+         mov             v3.16b,  v7.16b
+         add             x1,  x8,  x3,  lsl #4   // in + 2 * stride
+         add             x2,  x3,  x3,  lsl #1   // 3 * stride
+         bl              fft5_neon
+         zip1            v1.4s,   v1.4s,  v0.4s
+         mov             v4.16b,  v6.16b
+         mov             v5.16b,  v7.16b
+         mov             x1,  x8                 // in + 0 * stride
+         add             x2,  x3,  x3,  lsl #1   // 3 * stride
+         bl              fft5_neon
+         faddp           v20.4s, v1.4s,  v1.4s
+         ext             v18.16b, v8.16b,  v8.16b,  #4
+         ext             v19.16b, v9.16b,  v9.16b,  #4
+         mov             v16.16b, v6.16b
+         mov             v17.16b, v7.16b
+         fadd            v20.2s, v20.2s, v0.2s
+         uzp1            v18.4s, v18.4s, v10.4s  // exp[2,4,6,8].re
+         uzp1            v19.4s, v19.4s, v11.4s  // exp[2,4,6,8].im
+         st1             {v20.2s},  [x0], #8     // out[0]
+         fmla            v16.4s, v2.4s,  v8.4s
+         fmls            v16.4s, v3.4s,  v9.4s
+         fmla            v17.4s, v2.4s,  v9.4s
+         fmla            v17.4s, v3.4s,  v8.4s
+         fmla            v16.4s, v4.4s,  v18.4s
+         fmls            v16.4s, v5.4s,  v19.4s
+         fmla            v17.4s, v4.4s,  v19.4s
+         fmla            v17.4s, v5.4s,  v18.4s
+         zip1            v18.4s, v16.4s, v17.4s
+         zip2            v19.4s, v16.4s, v17.4s
+         rev64           v31.4s, v14.4s
+         trn1            v28.2d, v1.2d,  v1.2d
+         trn2            v29.2d, v1.2d,  v1.2d
+         zip1            v30.2d, v14.2d, v31.2d
+         zip2            v31.2d, v14.2d, v31.2d
+         st1             {v18.4s,v19.4s},  [x0], #32 // out[1-4]
+         fmul            v16.4s, v28.4s, v30.4s
+         fmul            v17.4s, v29.4s, v30.4s
+         fmls            v16.4s, v29.4s, v31.4s
+         fmla            v17.4s, v28.4s, v31.4s
+         faddp           v16.4s, v16.4s, v16.4s
+         faddp           v17.4s, v17.4s, v17.4s
+         zip1            v18.2s, v16.2s, v17.2s
+         zip2            v19.2s, v16.2s, v17.2s
+         fadd            v18.2s, v18.2s, v0.2s
+         fadd            v0.2s,  v19.2s, v0.2s
+         ext             v30.16b, v12.16b, v12.16b, #4
+         ext             v31.16b, v13.16b, v13.16b, #4
+         mov             v16.16b, v6.16b
+         mov             v17.16b, v7.16b
+         uzp1            v30.4s, v30.4s, v8.4s
+         uzp1            v31.4s, v31.4s, v9.4s
+         st1             {v18.2s},  [x0], #8     // out[5]
+         fmla            v16.4s, v2.4s,  v10.4s
+         fmls            v16.4s, v3.4s,  v11.4s
+         fmla            v17.4s, v2.4s,  v11.4s
+         fmla            v17.4s, v3.4s,  v10.4s
+         fmla            v16.4s, v4.4s,  v30.4s
+         fmls            v16.4s, v5.4s,  v31.4s
+         fmla            v17.4s, v4.4s,  v31.4s
+         fmla            v17.4s, v5.4s,  v30.4s
+         zip1            v18.4s, v16.4s, v17.4s
+         zip2            v19.4s, v16.4s, v17.4s
+         ext             v30.16b, v10.16b, v10.16b, #4
+         ext             v31.16b, v11.16b, v11.16b, #4
+         fmla            v6.4s,  v2.4s,  v12.4s
+         fmls            v6.4s,  v3.4s,  v13.4s
+         st1             {v18.4s,v19.4s},  [x0], #32 // out[6-9]
+         uzp1            v30.4s, v30.4s, v12.4s
+         uzp1            v31.4s, v31.4s, v13.4s
+         fmla            v7.4s,  v2.4s,  v13.4s
+         fmla            v7.4s,  v3.4s,  v12.4s
+         st1             {v0.2s},  [x0], #8     // out[10]
+         fmla            v6.4s,  v4.4s,  v30.4s
+         fmls            v6.4s,  v5.4s,  v31.4s
+         fmla            v7.4s,  v4.4s,  v31.4s
+         fmla            v7.4s,  v5.4s,  v30.4s
+         zip1            v18.4s, v6.4s,  v7.4s
+         zip2            v19.4s, v6.4s,  v7.4s
+         st1             {v18.4s,v19.4s},  [x0], #32 // out[11-14]
+         ret             x9
+ endfunc
+ // x0: out, x1: out+len2, x2: exptab, x3: len2
+ function fft15_pass
+         ands            x6,  x3,  #3
+         mov             x4,  x0
+         mov             x5,  x1
+         b.eq            9f
+         ld1             {v0.2s},  [x0], #8
+         ld1             {v1.2s},  [x1], #8
+         sub             x3,  x3,  x6
+         subs            x6,  x6,  #1
+         fadd            v2.2s,  v0.2s,  v1.2s
+         fsub            v3.2s,  v0.2s,  v1.2s
+         add             x2,  x2,  #8
+         st1             {v2.2s},  [x4], #8
+         st1             {v3.2s},  [x5], #8
+         b.eq            9f
+ 1:
+         subs            x6,  x6,  #1
+         ldp             s4,  s5,  [x2], #8
+         ldp             s2,  s3,  [x1], #8
+         ldp             s0,  s1,  [x0], #8
+         fmul            s6,  s2,  s4
+         fmul            s7,  s2,  s5
+         fmls            s6,  s3,  v5.s[0]
+         fmla            s7,  s3,  v4.s[0]
+         fsub            s2,  s0,  s6
+         fsub            s3,  s1,  s7
+         fadd            s0,  s0,  s6
+         fadd            s1,  s1,  s7
+         stp             s2,  s3,  [x5], #8
+         stp             s0,  s1,  [x4], #8
+         b.gt            1b
+ 9:
+         ld1             {v4.4s,v5.4s}, [x2],  #32
+         ld2             {v2.4s,v3.4s}, [x1],  #32
+         uzp1            v6.4s,  v4.4s,  v5.4s
+         uzp2            v7.4s,  v4.4s,  v5.4s
+         ld2             {v0.4s,v1.4s}, [x0],  #32
+ 8:
+         subs            x3,  x3,  #8
+         fmul            v4.4s,  v2.4s,  v6.4s
+         fmul            v5.4s,  v2.4s,  v7.4s
+         b.lt            4f
+         ld1             {v18.4s,v19.4s}, [x2],  #32
+         fmls            v4.4s,  v3.4s,  v7.4s
+         fmla            v5.4s,  v3.4s,  v6.4s
+         ld2             {v22.4s,v23.4s}, [x1],  #32
+         fsub            v2.4s,  v0.4s,  v4.4s
+         fadd            v0.4s,  v0.4s,  v4.4s
+         fsub            v3.4s,  v1.4s,  v5.4s
+         fadd            v1.4s,  v1.4s,  v5.4s
+         uzp1            v16.4s, v18.4s, v19.4s
+         uzp2            v17.4s, v18.4s, v19.4s
+         st2             {v2.4s,v3.4s}, [x5],  #32
+         st2             {v0.4s,v1.4s}, [x4],  #32
+         ld2             {v20.4s,v21.4s}, [x0],  #32
+         fmul            v18.4s, v22.4s, v16.4s
+         fmul            v19.4s, v22.4s, v17.4s
+         b.eq            0f
+         ld1             {v4.4s,v5.4s}, [x2],  #32
+         fmls            v18.4s, v23.4s, v17.4s
+         fmla            v19.4s, v23.4s, v16.4s
+         ld2             {v2.4s,v3.4s}, [x1],  #32
+         fsub            v22.4s, v20.4s, v18.4s
+         fadd            v20.4s, v20.4s, v18.4s
+         fsub            v23.4s, v21.4s, v19.4s
+         fadd            v21.4s, v21.4s, v19.4s
+         uzp1            v6.4s,  v4.4s,  v5.4s
+         uzp2            v7.4s,  v4.4s,  v5.4s
+         st2             {v22.4s,v23.4s}, [x5],  #32
+         st2             {v20.4s,v21.4s}, [x4],  #32
+         ld2             {v0.4s,v1.4s}, [x0],  #32
+         b               8b
+ 4:
+         fmls            v4.4s,  v3.4s,  v7.4s
+         fmla            v5.4s,  v3.4s,  v6.4s
+         fsub            v2.4s,  v0.4s,  v4.4s
+         fadd            v0.4s,  v0.4s,  v4.4s
+         fsub            v3.4s,  v1.4s,  v5.4s
+         fadd            v1.4s,  v1.4s,  v5.4s
+         st2             {v2.4s,v3.4s}, [x5],  #32
+         st2             {v0.4s,v1.4s}, [x4],  #32
+         ret
+ 0:
+         fmls            v18.4s, v23.4s, v17.4s
+         fmla            v19.4s, v23.4s, v16.4s
+         fsub            v22.4s, v20.4s, v18.4s
+         fadd            v20.4s, v20.4s, v18.4s
+         fsub            v23.4s, v21.4s, v19.4s
+         fadd            v21.4s, v21.4s, v19.4s
+         st2             {v22.4s,v23.4s}, [x5],  #32
+         st2             {v20.4s,v21.4s}, [x4],  #32
+         ret
+ endfunc
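Apart from the scalar prologue that handles a non-multiple-of-4 element count, fft15_pass is a radix-2 combine: each element of the upper half is multiplied by a twiddle, then added to and subtracted from its partner in the lower half. A C analogue of the scalar loop, for orientation only (types and names belong to this sketch, not the tree):

    typedef struct { float re, im; } cplx;

    /* Combine two half-size transforms; the assembly special-cases the
     * first element, whose twiddle is 1 + 0i. */
    static void fft_pass_c(cplx *z0, cplx *z1, const cplx *w, int len2)
    {
        for (int i = 0; i < len2; i++) {
            cplx t = {
                z1[i].re * w[i].re - z1[i].im * w[i].im,
                z1[i].re * w[i].im + z1[i].im * w[i].re,
            };
            z1[i].re = z0[i].re - t.re;   /* upper half of the output */
            z1[i].im = z0[i].im - t.im;
            z0[i].re = z0[i].re + t.re;   /* lower half of the output */
            z0[i].im = z0[i].im + t.im;
        }
    }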
+ function fft30_neon  align=6
+         sub             sp,  sp,  #0x20
+         stp             x20, x21, [sp]
+         stp             x22, x30, [sp, #0x10]
+         mov             x21, x1
+         mov             x22, x2
+         mov             x20, x4
+         mov             x0,  x21
+         mov             x1,  x22
+         lsl             x3,  x20, #1
+         bl              fft15_neon
+         add             x0,  x21, #15*8
+         add             x1,  x22, x20,  lsl #3
+         lsl             x3,  x20, #1
+         bl              fft15_neon
+         ldr             x2,  [x10, #(CELT_EXPTAB + 8)]  // s->exptab[1]
+         add             x0,  x21, #0
+         add             x1,  x21, #15*8
+         mov             x3,  #15
+         ldp             x20, x21, [sp]
+         ldp             x22, x30, [sp, #0x10]
+         add             sp,  sp,  #0x20
+         b               fft15_pass
+ endfunc
+ .macro  def_fft n, n2
+ function fft\n\()_neon  align=6
+         sub             sp,  sp,  #0x30
+         stp             x20, x21, [sp]
+         stp             x22, x30, [sp, #0x10]
+         stp             x23, x24, [sp, #0x20]
+         mov             x21, x1
+         mov             x22, x2
+         mov             x23, x3
+         mov             x20, x4
+         sub             x3,  x3,  #1
+         lsl             x4,  x4,  #1
+         bl              fft\n2\()_neon
+         add             x1,  x21, #(\n2 * 8)
+         add             x2,  x22, x20, lsl #3
+         sub             x3,  x23, #1
+         lsl             x4,  x20, #1
+         bl              fft\n2\()_neon
+         add             x5,  x10, #CELT_EXPTAB
+         mov             x0,  x21
+         ldr             x2,  [x5,  x23, lsl #3] // s->exptab[N]
+         add             x1,  x21, #(\n2 * 8)
+         mov             x3,  #\n2
+         ldp             x20, x21, [sp]
+         ldp             x22, x30, [sp, #0x10]
+         ldp             x23, x24, [sp, #0x20]
+         add             sp,  sp,  #0x30
+         b               fft15_pass
+ endfunc
+ .endm
+         def_fft    60,  30
+         def_fft   120,  60
+         def_fft   240, 120
+         def_fft   480, 240
+         def_fft   960, 480
+ function fft_b15_calc_neon
+         sub             sp,  sp,  #0x50
+         ldr             x8,  [x0,  #CELT_EXPTAB]    // s->exptab[0]
+         movrel          x6,  fact5
+         movrel          x11, shuffle_0213
+         movrel          x12, shuffle_1032
+         movrel          x13, shuffle_2301
+         movrel          x14, shuffle_3120
+         add             x8,  x8,  #8
+         movrel          x5,  fft_tab_neon
+         stp             x20, x30, [sp]
+         stp             d8,  d9,  [sp, #0x10]
+         stp             d10, d11, [sp, #0x20]
+         stp             d12, d13, [sp, #0x30]
+         stp             d14, d15, [sp, #0x40]
+         ld1             {v15.4s}, [x6]
+         ld1             {v0.4s,v1.4s},   [x8],  #32
+         ld1             {v6.2s},  [x8],  #8
+         ld1             {v2.4s,v3.4s},   [x8],  #32
+         ld1             {v7.2s},  [x8],  #8
+         ld1             {v4.4s,v5.4s},   [x8],  #32
+         uzp1            v8.4s,  v0.4s,  v1.4s   // exp[ 1 -  4].re
+         uzp2            v9.4s,  v0.4s,  v1.4s   // exp[ 1 -  4].im
+         uzp1            v10.4s, v2.4s,  v3.4s   // exp[ 6 -  9].re
+         uzp2            v11.4s, v2.4s,  v3.4s   // exp[ 6 -  9].im
+         uzp1            v12.4s, v4.4s,  v5.4s   // exp[11 - 14].re
+         uzp2            v13.4s, v4.4s,  v5.4s   // exp[11 - 14].im
+         zip1            v14.4s, v6.4s,  v7.4s   // exp[5,10].re/exp[5,10].im
+         add             x5,  x5,  x3,  lsl #3
+         ldr             x5,  [x5]
+         mov             x10, x0
+         blr             x5
+         ldp             x20, x30, [sp]
+         ldp             d8,  d9,  [sp, #0x10]
+         ldp             d10, d11, [sp, #0x20]
+         ldp             d12, d13, [sp, #0x30]
+         ldp             d14, d15, [sp, #0x40]
+         add             sp,  sp,  #0x50
+         ret
+ endfunc
+ const   fft_tab_neon
+         .quad fft15_neon
+         .quad fft30_neon
+         .quad fft60_neon
+         .quad fft120_neon
+         .quad fft240_neon
+         .quad fft480_neon
+         .quad fft960_neon
+ endconst
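fft_b15_calc_neon above indexes this table with s->fft_n (the add x5, x5, x3, lsl #3 / ldr / blr sequence), so entry N handles a transform of 15 * 2^N points. The C equivalent of that dispatch is just an array of function pointers; illustratively (the extern declarations are stand-ins, since the real operands live in registers):

    typedef void (*fft_fn)(void);
    extern void fft15(void), fft30(void), fft60(void), fft120(void),
                fft240(void), fft480(void), fft960(void);   /* stand-ins */

    static const fft_fn fft_tab[] = {
        fft15, fft30, fft60, fft120, fft240, fft480, fft960,
    };
    /* fft_tab[s->fft_n]();  entry N handles 15 * 2^N points */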
+ function ff_celt_imdct_half_neon, export=1
+         sub             sp,  sp,  #0x20
+         stp             x21, x30, [sp]
+         str             s0, [sp, #0x10]
+         ldp             w5,  w6,  [x0,  #CELT_LEN2] // CELT_LEN4
+         mov             x10, x0
+         mov             x21, x1
+         sub             w5,  w5,  #1
+         lsl             x7,  x3,  #3            //  2 * stride * sizeof(float)
+         sub             x8,  xzr, x3,  lsl #3   // -2 * stride * sizeof(float)
+         mul             x5,  x5,  x3
+         ldp             x9,  x10, [x0,  #CELT_TMP]  // CELT_TWIDDLE
+         ldr             w3,  [x0, #CELT_FFT_N]
+         add             x5,  x2,  x5,  lsl #2
+         mov             x11, x9
+         sub             w6,  w6,  #4
+         ld1             {v0.s}[0],  [x5], x8
+         ld1             {v1.s}[0],  [x2], x7
+         ld1             {v4.4s,v5.4s}, [x10], #32
+         ld1             {v0.s}[1],  [x5], x8
+         ld1             {v1.s}[1],  [x2], x7
+         uzp1            v2.4s,  v4.4s,  v5.4s
+         ld1             {v0.s}[2],  [x5], x8
+         ld1             {v1.s}[2],  [x2], x7
+         uzp2            v3.4s,  v4.4s,  v5.4s
+         ld1             {v0.s}[3],  [x5], x8
+         ld1             {v1.s}[3],  [x2], x7
+ 1:
+         subs            w6,  w6,  #4
+         ld1             {v20.s}[0], [x5], x8
+         ld1             {v21.s}[0], [x2], x7
+         ld1             {v4.4s,v5.4s}, [x10], #32
+         fmul            v6.4s,  v0.4s,  v2.4s
+         fmul            v7.4s,  v0.4s,  v3.4s
+         ld1             {v20.s}[1], [x5], x8
+         ld1             {v21.s}[1], [x2], x7
+         fmls            v6.4s,  v1.4s,  v3.4s
+         fmla            v7.4s,  v1.4s,  v2.4s
+         ld1             {v20.s}[2], [x5], x8
+         ld1             {v21.s}[2], [x2], x7
+         uzp1            v2.4s,  v4.4s,  v5.4s
+         uzp2            v3.4s,  v4.4s,  v5.4s
+         ld1             {v20.s}[3], [x5], x8
+         ld1             {v21.s}[3], [x2], x7
+         zip1            v4.4s,  v6.4s,  v7.4s
+         zip2            v5.4s,  v6.4s,  v7.4s
+         fmul            v6.4s,  v20.4s, v2.4s
+         fmul            v7.4s,  v20.4s, v3.4s
+         st1             {v4.4s,v5.4s}, [x9], #32
+         fmls            v6.4s,  v21.4s, v3.4s
+         fmla            v7.4s,  v21.4s, v2.4s
+         b.eq            3f
+         subs            w6,  w6,  #4
+         ld1             {v4.4s,v5.4s}, [x10], #32
+         ld1             {v0.s}[0],  [x5], x8
+         ld1             {v1.s}[0],  [x2], x7
+         uzp1            v2.4s,  v4.4s,  v5.4s
+         ld1             {v0.s}[1],  [x5], x8
+         ld1             {v1.s}[1],  [x2], x7
+         uzp2            v3.4s,  v4.4s,  v5.4s
+         ld1             {v0.s}[2],  [x5], x8
+         ld1             {v1.s}[2],  [x2], x7
+         zip1            v4.4s,  v6.4s,  v7.4s
+         zip2            v5.4s,  v6.4s,  v7.4s
+         ld1             {v0.s}[3],  [x5], x8
+         ld1             {v1.s}[3],  [x2], x7
+         st1             {v4.4s,v5.4s}, [x9], #32
+         b.gt            1b
+         fmul            v6.4s,  v0.4s,  v2.4s
+         fmul            v7.4s,  v0.4s,  v3.4s
+         fmls            v6.4s,  v1.4s,  v3.4s
+         fmla            v7.4s,  v1.4s,  v2.4s
+ 3:
+         zip1            v4.4s,  v6.4s,  v7.4s
+         zip2            v5.4s,  v6.4s,  v7.4s
+         st1             {v4.4s,v5.4s}, [x9], #32
+         mov             x2,  x11
+         mov             x4,  #1
+         bl              fft_b15_calc_neon
+         ldr             w5,  [x10, #CELT_LEN4]
+         ldr             x6,  [x10, #CELT_TWIDDLE]
+         ldr             s31, [sp, #0x10]
+         add             x1,  x21, x5,  lsl #2
+         add             x3,  x6,  x5,  lsl #2
+         sub             x0,  x1,  #16
+         sub             x2,  x3,  #16
+         mov             x8,  #-16
+         mov             x7,  #16
+         mov             x10, x0
+         mov             x11, x1
+         sub             w5,  w5,  #4
+         ld1             {v0.4s},  [x0], x8
+         ld1             {v1.4s},  [x1], x7
+         ld1             {v2.4s},  [x2], x8
+         ld1             {v3.4s},  [x3], x7
+         uzp1            v4.4s,  v0.4s,  v1.4s   // z[-i-2, -i-1, +i, i+1].re
+         uzp2            v6.4s,  v0.4s,  v1.4s   // z[-i-2, -i-1, +i, i+1].im
+         uzp1            v5.4s,  v2.4s,  v3.4s   // twiddle_exptab[-i-2, -i-1, +i, i+1].re
+         uzp2            v7.4s,  v2.4s,  v3.4s   // twiddle_exptab[-i-2, -i-1, +i, i+1].im
+         fmul            v1.4s,  v6.4s,  v5.4s
+         fmul            v0.4s,  v6.4s,  v7.4s
+ 2:
+         subs            w5,  w5,  #4
+         ld1             {v20.4s}, [x0], x8
+         fmla            v1.4s,  v4.4s,  v7.4s
+         fmls            v0.4s,  v4.4s,  v5.4s
+         ld1             {v21.4s}, [x1], x7
+         ext             v1.16b, v1.16b, v1.16b, #8
+         fmul            v0.4s,  v0.4s,  v31.s[0]
+         ld1             {v2.4s},  [x2], x8
+         rev64           v1.4s,  v1.4s
+         fmul            v1.4s,  v1.4s,  v31.s[0]
+         ld1             {v3.4s},  [x3], x7
+         zip1            v5.4s,  v0.4s,  v1.4s
+         zip2            v7.4s,  v0.4s,  v1.4s
+         uzp1            v4.4s,  v20.4s, v21.4s  // z[-i-2, -i-1, +i, i+1].re
+         uzp2            v6.4s,  v20.4s, v21.4s  // z[-i-2, -i-1, +i, i+1].im
+         st1             {v5.4s},  [x10], x8
+         st1             {v7.4s},  [x11], x7
+         uzp1            v5.4s,  v2.4s,  v3.4s   // twiddle_exptab[-i-2, -i-1, +i, i+1].re
+         uzp2            v7.4s,  v2.4s,  v3.4s   // twiddle_exptab[-i-2, -i-1, +i, i+1].im
+         fmul            v1.4s,  v6.4s,  v5.4s
+         fmul            v0.4s,  v6.4s,  v7.4s
+         b.gt            2b
+         fmla            v1.4s,  v4.4s,  v7.4s
+         fmls            v0.4s,  v4.4s,  v5.4s
+         ext             v1.16b, v1.16b, v1.16b, #8
+         fmul            v0.4s,  v0.4s,  v31.s[0]
+         rev64           v1.4s,  v1.4s
+         fmul            v1.4s,  v1.4s,  v31.s[0]
+         zip1            v5.4s,  v0.4s,  v1.4s
+         zip2            v7.4s,  v0.4s,  v1.4s
+         st1             {v5.4s},  [x10], x8
+         st1             {v7.4s},  [x11], x7
+         ldp             x21, x30, [sp]
+         add             sp,  sp,  #0x20
+         ret
+ endfunc
+ // [0] = exp(2 * i * pi / 5), [1] = exp(2 * i * pi * 2 / 5)
+ const   fact5           align=4
+         .float           0.30901699437494745, 0.95105651629515353
+         .float          -0.80901699437494734, 0.58778525229247325
+ endconst
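As a quick sanity check on fact5 (throwaway code, not part of the tree), the values agree with cos/sin of 2π/5 and 2π·2/5 up to the last digit or so of double rounding:

    #include <math.h>
    #include <stdio.h>

    int main(void)
    {
        const double pi = acos(-1.0);
        /* expect ~  0.30901699437494745  0.95105651629515353 */
        printf("%.17g %.17g\n", cos(2 * pi / 5), sin(2 * pi / 5));
        /* expect ~ -0.80901699437494734  0.58778525229247325 */
        printf("%.17g %.17g\n", cos(2 * pi * 2 / 5), sin(2 * pi * 2 / 5));
        return 0;
    }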
libavcodec/opus.h
Simple merge
libavcodec/opus_celt.c
Simple merge
libavcodec/opus_imdct.c
Simple merge
diff --cc libavcodec/opus_imdct.h
index 0000000,d4bff9a..0ca4d97
mode 000000,100644..100644
--- /dev/null
+++ b/libavcodec/opus_imdct.h
@@@ -1,0 -1,57 +1,57 @@@
+ /*
 - * This file is part of Libav.
++ * This file is part of FFmpeg.
+  *
 - * Libav is free software; you can redistribute it and/or
++ * FFmpeg is free software; you can redistribute it and/or
+  * modify it under the terms of the GNU Lesser General Public
+  * License as published by the Free Software Foundation; either
+  * version 2.1 of the License, or (at your option) any later version.
+  *
 - * Libav is distributed in the hope that it will be useful,
++ * FFmpeg is distributed in the hope that it will be useful,
+  * but WITHOUT ANY WARRANTY; without even the implied warranty of
+  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  * Lesser General Public License for more details.
+  *
+  * You should have received a copy of the GNU Lesser General Public
 - * License along with Libav; if not, write to the Free Software
++ * License along with FFmpeg; if not, write to the Free Software
+  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+  */
+ #ifndef AVCODEC_OPUS_IMDCT_H
+ #define AVCODEC_OPUS_IMDCT_H
+ #include <stddef.h>
+ #include "avfft.h"
+ typedef struct CeltIMDCTContext {
+     int fft_n;
+     int len2;
+     int len4;
+     FFTComplex *tmp;
+     FFTComplex *twiddle_exptab;
+     FFTComplex *exptab[6];
+     /**
+      * Calculate the middle half of the iMDCT
+      */
+     void (*imdct_half)(struct CeltIMDCTContext *s, float *dst, const float *src,
+                        ptrdiff_t src_stride, float scale);
+ } CeltIMDCTContext;
+ /**
+  * Init an iMDCT of length 2 * 15 * (2^N)
+  */
+ int ff_celt_imdct_init(CeltIMDCTContext **s, int N);
+ /**
+  * Free an iMDCT.
+  */
+ void ff_celt_imdct_uninit(CeltIMDCTContext **s);
+ void ff_celt_imdct_init_aarch64(CeltIMDCTContext *s);
+ #endif /* AVCODEC_OPUS_IMDCT_H */
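For orientation, a caller would drive the API declared above roughly like this (a hedged usage sketch; buffer sizing and error handling are illustrative, see opus_celt.c for the real call sites):

    float src[960], dst[960];               /* sizes illustrative */
    CeltIMDCTContext *imdct = NULL;
    int err;

    err = ff_celt_imdct_init(&imdct, 5);    /* length 2 * 15 * 2^5 = 960 */
    if (err < 0)
        return err;

    /* compute the half-iMDCT of src into dst */
    imdct->imdct_half(imdct, dst, src, /* src_stride */ 2, /* scale */ 1.0f);

    ff_celt_imdct_uninit(&imdct);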