Merge commit 'b63bb251ea6d6ba23295294e37a92625c0192206'
author    Michael Niedermayer <michaelni@gmx.at>
Mon, 22 Jul 2013 09:56:04 +0000 (11:56 +0200)
committer Michael Niedermayer <michaelni@gmx.at>
Mon, 22 Jul 2013 09:57:05 +0000 (11:57 +0200)
* commit 'b63bb251ea6d6ba23295294e37a92625c0192206':
  arm: Add VFP-accelerated version of imdct_half

Merged-by: Michael Niedermayer <michaelni@gmx.at>
1  2 
libavcodec/arm/fft_init_arm.c
libavcodec/arm/mdct_vfp.S
libavcodec/arm/synth_filter_vfp.S

@@@ -48,11 -50,16 +50,18 @@@ av_cold void ff_fft_init_arm(FFTContex
  {
      int cpu_flags = av_get_cpu_flags();
  
+     if (have_vfp(cpu_flags)) {
+ #if CONFIG_MDCT
+         if (!have_vfpv3(cpu_flags))
+             s->imdct_half   = ff_imdct_half_vfp;
+ #endif
+     }
      if (have_neon(cpu_flags)) {
 +#if CONFIG_FFT
          s->fft_permute  = ff_fft_permute_neon;
          s->fft_calc     = ff_fft_calc_neon;
 +#endif
  #if CONFIG_MDCT
          s->imdct_calc   = ff_imdct_calc_neon;
          s->imdct_half   = ff_imdct_half_neon;
index 0000000,7413a41..2666b00
mode 000000,100644..100644
--- /dev/null
@@@ -1,0 -1,206 +1,206 @@@
 - * This file is part of Libav.
+ /*
+  * Copyright (c) 2013 RISC OS Open Ltd
+  * Author: Ben Avison <bavison@riscosopen.org>
+  *
 - * Libav is free software; you can redistribute it and/or
++ * This file is part of FFmpeg.
+  *
 - * Libav is distributed in the hope that it will be useful,
++ * FFmpeg is free software; you can redistribute it and/or
+  * modify it under the terms of the GNU Lesser General Public
+  * License as published by the Free Software Foundation; either
+  * version 2.1 of the License, or (at your option) any later version.
+  *
 - * License along with Libav; if not, write to the Free Software
++ * FFmpeg is distributed in the hope that it will be useful,
+  * but WITHOUT ANY WARRANTY; without even the implied warranty of
+  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  * Lesser General Public License for more details.
+  *
+  * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
+  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+  */
+ #include "libavutil/arm/asm.S"
+ CONTEXT .req    a1                              @ argument 1: FFTContext *s
+ ORIGOUT .req    a2                              @ argument 2: output pointer as passed in
+ IN      .req    a3                              @ argument 3: input pointer
+ OUT     .req    v1                              @ copy of ORIGOUT held in a register the function saves and restores
+ REVTAB  .req    v2                              @ pointer loaded from CONTEXT + 2*4
+ TCOS    .req    v3                              @ pointer loaded from CONTEXT + 6*4
+ TSIN    .req    v4                              @ pointer loaded from CONTEXT + 7*4
+ OLDFPSCR .req   v5                              @ FPSCR value read on entry, written back before calling out and before returning
+ J0      .req    a2                              @ output address scratch; shares a2 with ORIGOUT, which is copied to OUT first
+ J1      .req    a4                              @ output address scratch
+ J2      .req    ip                              @ output address scratch
+ J3      .req    lr                              @ output address scratch; lr is pushed on entry
+ .macro prerotation_innerloop                    @ one unrolled pre-rotation step: produces 4 two-word entries in OUT, then advances k by 2
+  .set trig_lo, k                                @ lower pair of table entries: k and k+1
+  .set trig_hi, n4 - k - 2                       @ mirrored upper pair of table entries: n4-k-2 and n4-k-1
+  .set in_lo, trig_lo * 2                        @ word index into IN matching trig_lo
+  .set in_hi, trig_hi * 2                        @ word index into IN matching trig_hi
+         vldr    d8, [TCOS, #trig_lo*4]          @ s16,s17 = TCOS[trig_lo], TCOS[trig_lo+1]
+         vldr    d9, [TCOS, #trig_hi*4]          @ s18,s19 = TCOS[trig_hi], TCOS[trig_hi+1]
+         vldr    s0, [IN, #in_hi*4 + 12]         @ s0-s3 = IN[in_hi+3], IN[in_hi+1], IN[in_lo+3], IN[in_lo+1]
+         vldr    s1, [IN, #in_hi*4 + 4]
+         vldr    s2, [IN, #in_lo*4 + 12]
+         vldr    s3, [IN, #in_lo*4 + 4]
+         vmul.f  s8, s0, s16                     @ vector operation: s8-s11 = s0-s3 * s16-s19 (needs the length-4 FPSCR mode set by the caller)
+         vldr    d10, [TSIN, #trig_lo*4]         @ s20,s21 = TSIN[trig_lo], TSIN[trig_lo+1]
+         vldr    d11, [TSIN, #trig_hi*4]         @ s22,s23 = TSIN[trig_hi], TSIN[trig_hi+1]
+         vldr    s4, [IN, #in_lo*4]              @ s4-s7 = IN[in_lo], IN[in_lo+2], IN[in_hi], IN[in_hi+2]
+         vldr    s5, [IN, #in_lo*4 + 8]
+         vldr    s6, [IN, #in_hi*4]
+         vldr    s7, [IN, #in_hi*4 + 8]
+         ldr     J0, [REVTAB, #trig_lo*2]        @ one word = two 16-bit REVTAB entries for the lower pair
+         vmul.f  s12, s0, s20                    @ vector operation: s12-s15 = s0-s3 * s20-s23
+         ldr     J2, [REVTAB, #trig_hi*2]        @ one word = two 16-bit REVTAB entries for the upper pair
+         mov     J1, J0, lsr #16                 @ J1 = high halfword; NOTE(review): treating it as entry trig_lo+1 assumes little-endian loads, confirm
+         and     J0, J0, #255                    @ halfword value will be < n4, so the low byte is the whole low halfword
+         vmls.f  s8, s4, s20                     @ vector operation: s8-s11 -= s4-s7 * s20-s23
+         mov     J3, J2, lsr #16                 @ J3 = high halfword of the upper pair
+         and     J2, J2, #255                    @ halfword value will be < n4
+         add     J0, OUT, J0, lsl #3             @ J0-J3 = OUT + 8 * index: address of a two-word output entry
+         vmla.f  s12, s4, s16                    @ vector operation: s12-s15 += s4-s7 * s16-s19
+         add     J1, OUT, J1, lsl #3
+         add     J2, OUT, J2, lsl #3
+         add     J3, OUT, J3, lsl #3
+         vstr    s8, [J0]                        @ s8-s11 go to the first word of each of the four entries
+         vstr    s9, [J1]
+         vstr    s10, [J2]
+         vstr    s11, [J3]
+         vstr    s12, [J0, #4]                   @ s12-s15 go to the second word of each of the four entries
+         vstr    s13, [J1, #4]
+         vstr    s14, [J2, #4]
+         vstr    s15, [J3, #4]
+  .set k, k + 2                                  @ the next expansion handles the next lower/upper pairs
+ .endm
+ .macro postrotation_innerloop tail, head        @ pipelined in-place post-rotation of OUT: a non-blank head starts step k, a non-blank tail finishes step k-2
+  .set trig_lo_head, n8 - k - 2                  @ head works on entries n8-k-2, n8-k-1 (lower pair) ...
+  .set trig_hi_head, n8 + k                      @ ... and n8+k, n8+k+1 (upper pair)
+  .set out_lo_head, trig_lo_head * 2             @ word index into OUT: two words per entry
+  .set out_hi_head, trig_hi_head * 2
+  .set trig_lo_tail, n8 - (k - 2) - 2            @ tail indices are those of the head that ran with k-2
+  .set trig_hi_tail, n8 + (k - 2)
+  .set out_lo_tail, trig_lo_tail * 2
+  .set out_hi_tail, trig_hi_tail * 2
+  .if (k & 2) == 0                               @ alternate TCOS register sets so a head never overwrites values its preceding tail still reads
+   TCOS_D0_HEAD .req d10 @ s20,s21
+   TCOS_D1_HEAD .req d11 @ s22,s23
+   TCOS_S0_TAIL .req s24                         @ the previous head loaded its TCOS values into s24-s27
+  .else
+   TCOS_D0_HEAD .req d12 @ s24,s25
+   TCOS_D1_HEAD .req d13 @ s26,s27
+   TCOS_S0_TAIL .req s20                         @ the previous head loaded its TCOS values into s20-s23
+  .endif
+  .ifnc "\tail",""
+         vmls.f  s8, s0, TCOS_S0_TAIL        @ vector operation: s8-s11 -= s0-s3 * four TCOS values from the previous head
+  .endif
+  .ifnc "\head",""
+         vldr    d8, [TSIN, #trig_lo_head*4] @ s16,s17
+         vldr    d9, [TSIN, #trig_hi_head*4] @ s18,s19
+         vldr    TCOS_D0_HEAD, [TCOS, #trig_lo_head*4] @ TCOS values for the lower pair
+  .endif
+  .ifnc "\tail",""
+         vmla.f  s12, s4, TCOS_S0_TAIL       @ vector operation: s12-s15 += s4-s7 * the same TCOS values; last use of the old s4-s7
+  .endif
+  .ifnc "\head",""
+         vldr    s0, [OUT, #out_lo_head*4]   @ s0-s3 = first word of entries lo, lo+1, hi, hi+1
+         vldr    s1, [OUT, #out_lo_head*4 + 8]
+         vldr    s2, [OUT, #out_hi_head*4]
+         vldr    s3, [OUT, #out_hi_head*4 + 8]
+         vldr    s4, [OUT, #out_lo_head*4 + 4] @ s4-s7 = second word of entries lo, lo+1, hi, hi+1
+         vldr    s5, [OUT, #out_lo_head*4 + 12]
+         vldr    s6, [OUT, #out_hi_head*4 + 4]
+         vldr    s7, [OUT, #out_hi_head*4 + 12]
+  .endif
+  .ifnc "\tail",""
+         vstr    s8, [OUT, #out_lo_tail*4]   @ s8-s11 go to the first word of entries lo, lo+1, hi, hi+1
+         vstr    s9, [OUT, #out_lo_tail*4 + 8]
+         vstr    s10, [OUT, #out_hi_tail*4]
+         vstr    s11, [OUT, #out_hi_tail*4 + 8]
+  .endif
+  .ifnc "\head",""
+         vmul.f  s8, s4, s16                 @ vector operation: s8-s11 = s4-s7 * s16-s19 (second words * TSIN)
+  .endif
+  .ifnc "\tail",""
+         vstr    s12, [OUT, #out_hi_tail*4 + 12] @ s12-s15 go to the second word in mirrored order: hi+1, hi, lo+1, lo
+         vstr    s13, [OUT, #out_hi_tail*4 + 4]
+         vstr    s14, [OUT, #out_lo_tail*4 + 12]
+         vstr    s15, [OUT, #out_lo_tail*4 + 4]
+  .endif
+  .ifnc "\head",""
+         vmul.f  s12, s0, s16                @ vector operation: s12-s15 = s0-s3 * s16-s19 (first words * TSIN)
+         vldr    TCOS_D1_HEAD, [TCOS, #trig_hi_head*4] @ TCOS values for the upper pair, consumed by the next tail
+  .endif
+  .unreq TCOS_D0_HEAD
+  .unreq TCOS_D1_HEAD
+  .unreq TCOS_S0_TAIL
+  .ifnc "\head",""
+   .set k, k + 2                             @ only a head starts a new step; a tail-only expansion leaves k unchanged
+  .endif
+ .endm
+ /* void ff_imdct_half_vfp(FFTContext *s,
+  *                        FFTSample *output,
+  *                        const FFTSample *input)
+  *
+  * VFP-accelerated imdct_half.
+  *
+  * In:       a1 = s, a2 = output, a3 = input
+  * Handles:  mdct_bits == 6 only; any other size is handed to
+  *           ff_imdct_half_c with a1-a3 and lr untouched (tail call).
+  * Saves:    v1-v5, lr and s16-s27 on the stack; FPSCR is put back to its
+  *           entry value around the fft_calc call and before returning.
+  * Steps:    pre-rotation into output, s->fft_calc(s, output),
+  *           in-place post-rotation of output.
+  *
+  * The FFTContext field offsets (2*4, 5*4, 6*4, 7*4, 9*4) are hard-coded
+  * here. NOTE(review): they must match the C struct layout - confirm when
+  * FFTContext changes.
+  *
+  * The fallback goes through a local label with an unconditional branch:
+  * a conditional branch straight to an external symbol has a short range
+  * in Thumb, and the symbol must be wrapped in the X macro so that
+  * platforms which prefix C symbols still link. NOTE(review): the X macro
+  * is expected to come from libavutil/arm/asm.S - confirm.
+  */
+ function ff_imdct_half_vfp, export=1
+         ldr     ip, [CONTEXT, #5*4]         @ mdct_bits
+         teq     ip, #6
+         bne     10f                         @ only case currently accelerated is the one used by DCA
+  .set n, 1<<6
+  .set n2, n/2
+  .set n4, n/4
+  .set n8, n/8
+         push    {v1-v5,lr}
+         vpush   {s16-s27}
+         fmrx    OLDFPSCR, FPSCR
+         ldr     lr, =0x03030000             @ RunFast mode, short vectors of length 4, stride 1
+         fmxr    FPSCR, lr
+         mov     OUT, ORIGOUT                @ a2 is reused as J0 below, so keep output in v1
+         ldr     REVTAB, [CONTEXT, #2*4]
+         ldr     TCOS, [CONTEXT, #6*4]
+         ldr     TSIN, [CONTEXT, #7*4]
+  .set k, 0
+  .rept n8/2
+         prerotation_innerloop
+  .endr
+         fmxr    FPSCR, OLDFPSCR             @ back to the entry FPSCR before calling out
+         mov     ORIGOUT, OUT                @ a2 = output again; a1 still holds s
+         ldr     ip, [CONTEXT, #9*4]
+         blx     ip                          @ s->fft_calc(s, output)
+         ldr     lr, =0x03030000             @ RunFast mode, short vectors of length 4, stride 1
+         fmxr    FPSCR, lr
+  .set k, 0
+         postrotation_innerloop , head
+  .rept n8/2 - 1
+         postrotation_innerloop tail, head
+  .endr
+         postrotation_innerloop tail
+         fmxr    FPSCR, OLDFPSCR
+         vpop    {s16-s27}
+         pop     {v1-v5,pc}
+ 10:     b       X(ff_imdct_half_c)          @ unsupported size: tail call the C version
+ endfunc
+         .unreq  CONTEXT                     @ drop every register alias defined with .req earlier in this file
+         .unreq  ORIGOUT
+         .unreq  IN
+         .unreq  OUT
+         .unreq  REVTAB
+         .unreq  TCOS
+         .unreq  TSIN
+         .unreq  OLDFPSCR
+         .unreq  J0
+         .unreq  J1
+         .unreq  J2
+         .unreq  J3
Simple merge