e66abd682a8fe1efcd338693038d58c708997411
[ffmpeg.git] / libavcodec / arm / sbrdsp_neon.S
1 /*
2  * Copyright (c) 2012 Mans Rullgard
3  *
4  * This file is part of FFmpeg.
5  *
6  * FFmpeg is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU Lesser General Public
8  * License as published by the Free Software Foundation; either
9  * version 2.1 of the License, or (at your option) any later version.
10  *
11  * FFmpeg is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14  * Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with FFmpeg; if not, write to the Free Software
18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19  */
20
21 #include "libavutil/arm/asm.S"
22
/*
 * ff_sbr_sum64x5_neon
 *
 * Adds four 64-float rows onto the first row, in place:
 *   r0[i] = r0[i] + r0[i+64] + r0[i+128] + r0[i+192] + r0[i+256],  i = 0..63
 * (the additions are performed left to right in that order).
 *
 * In:       r0 = float buffer; every access carries a :128 hint, so the base
 *                has to be 16-byte aligned.
 * Clobbers: r0-r3, r12, q0-q3, q8, flags. lr is saved and restored.
 */
23 function ff_sbr_sum64x5_neon, export=1
24         push            {lr}                    /* lr is borrowed as a fifth data pointer */
25         add             r1,  r0,  # 64*4        /* r1 -> row 1 */
26         add             r2,  r0,  #128*4        /* r2 -> row 2 */
27         add             r3,  r0,  #192*4        /* r3 -> row 3 */
28         add             lr,  r0,  #256*4        /* lr -> row 4 */
29         mov             r12, #64                /* floats left to produce */
30 1:
31         vld1.32         {q0},     [r0,:128]     /* no writeback: r0 is advanced by the store */
32         vld1.32         {q1},     [r1,:128]!
33         vadd.f32        q0,  q0,  q1
34         vld1.32         {q2},     [r2,:128]!
35         vadd.f32        q0,  q0,  q2
36         vld1.32         {q3},     [r3,:128]!
37         vadd.f32        q0,  q0,  q3
38         vld1.32         {q8},     [lr,:128]!
39         vadd.f32        q0,  q0,  q8
40         vst1.32         {q0},     [r0,:128]!
41         subs            r12, #4                 /* four floats per pass, 16 passes */
42         bgt             1b
43         pop             {pc}                    /* return through the saved lr */
44 endfunc
45
/*
 * ff_sbr_sum_square_neon
 *
 * Returns the sum of the squares of the floats at r0.
 *
 * In:   r0 = float data, 16-byte aligned (:128 hint)
 *       r1 = count; each pass consumes four floats and subtracts 2, so one
 *            count unit covers two floats (presumably one complex sample --
 *            confirm against the C prototype).
 * Out:  total in both lanes of d0 (s0); the NOVFP line also copies it to r0.
 * Note: the loop body runs before the count is tested, so at least four
 *       floats are always read.
 * Clobbers: r0, r1, q0, q1, flags.
 */
46 function ff_sbr_sum_square_neon, export=1
47         vmov.f32        q0,  #0.0               /* four running partial sums */
48 1:
49         vld1.32         {q1},     [r0,:128]!
50         vmla.f32        q0,  q1,  q1            /* q0 += q1 * q1, lane by lane */
51         subs            r1,  r1,  #2
52         bgt             1b
53         vadd.f32        d0,  d0,  d1            /* fold four partial sums into two */
54         vpadd.f32       d0,  d0,  d0            /* fold two into one (both lanes hold it) */
/* NOVFP is a macro from asm.S (not visible here); presumably it keeps this
 * line only when floats are returned in core registers -- confirm. */
55 NOVFP   vmov.32         r0,  d0[0]
56         bx              lr
57 endfunc
58
/*
 * ff_sbr_neg_odd_64_neon
 *
 * Flips the sign bit of every odd-indexed float in a 64-float buffer, in
 * place. Even-indexed floats are written back unchanged.
 *
 * In:       r0 = float buffer, 16-byte aligned (:128 hint)
 * Clobbers: r0, r1, q0-q3, q8.
 *
 * vld2 de-interleaves eight floats per load: even indices land in the first
 * register of the pair, odd indices in the second. Eight loads and eight
 * stores cover the 64 floats; loads run two vectors ahead of the stores.
 */
59 function ff_sbr_neg_odd_64_neon, export=1
60         mov             r1,  r0                 /* r1 = store pointer, r0 = load pointer */
61         vmov.i32        q8,  #1<<31             /* sign-bit mask in every lane */
62         vld2.32         {q0,q1},  [r0,:128]!
63         veor            q1,  q1,  q8            /* negate the odd-indexed lanes */
64         vld2.32         {q2,q3},  [r0,:128]!
65     .rept 3
66         vst2.32         {q0,q1},  [r1,:128]!
67         veor            q3,  q3,  q8
68         vld2.32         {q0,q1},  [r0,:128]!
69         vst2.32         {q2,q3},  [r1,:128]!
70         veor            q1,  q1,  q8
71         vld2.32         {q2,q3},  [r0,:128]!
72     .endr
73         veor            q3,  q3,  q8
74         vst2.32         {q0,q1},  [r1,:128]!    /* drain the two vectors still in flight */
75         vst2.32         {q2,q3},  [r1,:128]!
76         bx              lr
77 endfunc
78
/*
 * ff_sbr_qmf_pre_shuffle_neon
 *
 * In/out: r0 = z, a float buffer read at z[0..63] and written at z[64..127]
 *              (the :128 hints require 16-byte alignment).
 *
 * Result:
 *   z[64] = z[0], z[65] = z[1]
 *   z[66 + 2*j] = -z[63 - j], z[67 + 2*j] = z[2 + j]   for j = 0..30
 *
 * Pointers: r0 reads upwards, r1 reads downwards from z[60] in steps of one
 *           q register (r3 = -16 bytes), r2 writes upwards from z[64].
 * Clobbers: r0-r3, r12, q0-q3, q8-q10, flags.
 */
79 function ff_sbr_qmf_pre_shuffle_neon, export=1
80         add             r1,  r0,  #60*4
81         add             r2,  r0,  #64*4
82         vld1.32         {d0},     [r0,:64]!
83         vst1.32         {d0},     [r2,:64]!     /* z[64..65] = z[0..1] */
84         mov             r3,  #-16
85         mov             r12, #24
86         vmov.i32        q8,  #1<<31             /* sign-bit mask */
87         vld1.32         {q0},     [r1,:128], r3
88         vld1.32         {d2},     [r0,:64]!     /* first half of q1: z[2..3] */
/* Each pass emits two interleaved stores of eight floats; three passes. */
89 1:
90         vld1.32         {d3,d4},  [r0,:128]!
91         vrev64.32       q0,  q0                 /* vrev64 + vswp together reverse all four lanes */
92         vld1.32         {q9},     [r1,:128], r3
93         veor            q0,  q0,  q8            /* negate the downward stream */
94         vld1.32         {d5,d6},  [r0,:128]!
95         vswp            d0,  d1
96         vrev64.32       q9,  q9
97         vst2.32         {q0,q1},  [r2,:64]!     /* interleave: -down, up, -down, up, ... */
98         vmov            q10, q2
99         veor            q9,  q9,  q8
100         vmov            d2,  d6                 /* carry the last upward pair into the next pass */
101         vswp            d18, d19
102         vld1.32         {q0},     [r1,:128], r3
103         vst2.32         {q9,q10}, [r2,:64]!
104         subs            r12, r12, #8
105         bgt             1b
/* Tail: the remaining 8 + 4 + 2 floats. */
106         vld1.32         {d3,d4},  [r0,:128]!
107         vrev64.32       q0,  q0
108         vld1.32         {q9},     [r1,:128], r3
109         veor            q0,  q0,  q8
110         vld1.32         {d5},     [r0,:64]!
111         vswp            d0,  d1
112         vrev64.32       q9,  q9
113         vst2.32         {q0,q1},  [r2,:64]!
114         vswp            d4,  d5
115         veor            q1,  q9,  q8
116         vst2.32         {d3,d5},  [r2,:64]!
117         vst2.32         {d2[0],d4[0]}, [r2,:64]! /* single final pair */
118         bx              lr
119 endfunc
120
/*
 * ff_sbr_qmf_post_shuffle_neon
 *
 * In:  r0 = destination, 32 pairs of floats (16-byte aligned, :128 hint)
 *      r1 = source z, 64 floats (16-byte aligned, :128 hint)
 *
 * Result: pair k of the destination = ( -z[63 - k], z[k] ),  k = 0..31
 *
 * Pointers: r1 reads upwards from z[0]; r2 reads downwards from z[60] in
 *           steps of one q register (r3 = -16 bytes).
 * Note:     the loads at the bottom of the last pass fetch one more vector
 *           from each stream (z[28..31] and z[32..35]); they are never used.
 * Clobbers: r0-r3, r12, q0-q3, q8, flags.
 */
121 function ff_sbr_qmf_post_shuffle_neon, export=1
122         add             r2,  r1,  #60*4
123         mov             r3,  #-16
124         mov             r12, #32                /* pairs left to write, eight per pass */
125         vmov.i32        q8,  #1<<31             /* sign-bit mask */
126         vld1.32         {q0},     [r2,:128], r3
127         vld1.32         {q1},     [r1,:128]!
128 1:
129         pld             [r2, #-32]
130         vrev64.32       q0,  q0                 /* swap floats inside each d half */
131         vswp            d2,  d3                 /* d3 = lowest-index pair of the upward stream */
132         veor            q0,  q0,  q8            /* negate the downward stream */
133         vld1.32         {q2},     [r2,:128], r3
134         vld1.32         {q3},     [r1,:128]!
135         vst2.32         {d1,d3},  [r0,:128]!    /* high half of q0 holds the highest indices: emit it first */
136         vst2.32         {d0,d2},  [r0,:128]!
137         pld             [r2, #-32]
138         vrev64.32       q2,  q2
139         vswp            d6,  d7
140         veor            q2,  q2,  q8
141         vld1.32         {q0},     [r2,:128], r3
142         vld1.32         {q1},     [r1,:128]!
143         vst2.32         {d5,d7},  [r0,:128]!
144         vst2.32         {d4,d6},  [r0,:128]!
145         subs            r12, r12, #8
146         bgt             1b
147         bx              lr
148 endfunc
149
/*
 * ff_sbr_qmf_deint_neg_neon
 *
 * In:  r0 = destination v, 64 floats (8-byte aligned, :64 hint)
 *      r1 = source, 64 floats (16-byte aligned, :128 hint)
 *
 * Result, for i = 0..31:
 *   v[i]      =  src[63 - 2*i]      (odd-indexed source floats, reversed)
 *   v[63 - i] = -src[62 - 2*i]      (even-indexed source floats, negated)
 *
 * Pointers: r1 walks the source downwards from src[60] (r3 = -16 bytes),
 *           r0 fills v upwards, r2 fills v downwards from v[62].
 * Clobbers: r0-r3, r12, d0-d2, flags.
 */
150 function ff_sbr_qmf_deint_neg_neon, export=1
151         add             r1,  r1,  #60*4
152         add             r2,  r0,  #62*4
153         mov             r3,  #-16
154         mov             r12, #32                /* values of i left, two per pass */
155         vmov.i32        d2,  #1<<31             /* sign-bit mask */
156 1:
157         vld2.32         {d0,d1},  [r1,:128], r3 /* d0 = even-indexed floats, d1 = odd-indexed */
158         veor            d0,  d0,  d2            /* negate the even-indexed floats */
159         vrev64.32       d1,  d1                 /* put the higher source index first */
160         vst1.32         {d0},     [r2,:64]
161         vst1.32         {d1},     [r0,:64]!
162         sub             r2,  r2,  #8            /* manual post-decrement of the descending pointer */
163         subs            r12, r12, #2
164         bgt             1b
165         bx              lr
166 endfunc
167
/*
 * ff_sbr_qmf_deint_bfly_neon
 *
 * In:  r0 = destination v, 128 floats (16-byte aligned, :128 hint)
 *      r1 = src0, 64 floats (16-byte aligned)
 *      r2 = src1, 64 floats (16-byte aligned)
 *
 * Result, for i = 0..63:
 *   v[i]       = src0[i] - src1[63 - i]
 *   v[127 - i] = src0[i] + src1[63 - i]
 *
 * Pointers: r1 and r0 move upwards; r2 (from src1[60]) and r3 (from v[124])
 *           move downwards one q register at a time (lr = -16 bytes).
 * Clobbers: r0-r3, r12, q0-q3, flags. lr is saved and restored.
 */
168 function ff_sbr_qmf_deint_bfly_neon, export=1
169         push            {lr}                    /* lr is borrowed as the negative stride */
170         add             r2,  r2,  #60*4
171         add             r3,  r0,  #124*4
172         mov             r12, #64                /* values of i left, four per pass */
173         mov             lr,  #-16
174 1:
175         vld1.32         {q0},     [r1,:128]!
176         vld1.32         {q1},     [r2,:128], lr
177         vrev64.32       q2,  q0                 /* pair-swapped copies; using the d halves crosswise */
178         vrev64.32       q3,  q1                 /* below completes the four-lane reversal */
179         vadd.f32        d3,  d4,  d3            /* sums, laid out for the descending store */
180         vadd.f32        d2,  d5,  d2
181         vsub.f32        d0,  d0,  d7            /* differences, laid out for the ascending store */
182         vsub.f32        d1,  d1,  d6
183         vst1.32         {q1},     [r3,:128], lr
184         vst1.32         {q0},     [r0,:128]!
185         subs            r12, r12, #4
186         bgt             1b
187         pop             {pc}
188 endfunc
189
/*
 * ff_sbr_hf_g_filt_neon
 *
 * Scales a strided column of float pairs by a per-element gain.
 *
 * In:  r0   = destination, one float pair (8 bytes) per element
 *      r1   = source base; element m is the float pair at
 *             r1 + 8 * [sp] + m * (40*2*4) bytes
 *      r2   = gains, one float per element (8-byte aligned, :64 hint)
 *      r3   = element count
 *      [sp] = index added to the source base, in 8-byte units
 *
 * Result: dst[m] = src[m] * gain[m], both floats of the pair scaled alike.
 *
 * Notes:  the paired loop runs before the count is tested, so two elements
 *         are always processed; it also preloads the next gain pair and the
 *         next source pair before the exit test.
 * Clobbers: r0-r3, r12, q0, q1, q3, flags.
 */
190 function ff_sbr_hf_g_filt_neon, export=1
191         ldr             r12, [sp]
192         add             r1,  r1,  r12, lsl #3
193         mov             r12, #40*2*4            /* source stride in bytes */
194         sub             r3,  r3,  #1            /* bias so that a zero result below means one element is left */
195         vld2.32         {d2[],d3[]},[r2,:64]!   /* d2 = gain[m] in both lanes, d3 = gain[m+1] in both lanes */
196         vld1.32         {d0},     [r1,:64], r12
197 1:
198         vld1.32         {d1},     [r1,:64], r12
199         vmul.f32        q3,  q0,  q1            /* two elements at once */
200         vld2.32         {d2[],d3[]},[r2,:64]!
201         vld1.32         {d0},     [r1,:64], r12
202         vst1.32         {q3},     [r0,:64]!
203         subs            r3,  r3,  #2
204         bgt             1b
205         it              lt
206         bxlt            lr                      /* count was even: done */
207         vmul.f32        d0,  d0,  d2            /* count was odd: one element is already loaded */
208         vst1.32         {d0},     [r0,:64]!
209         bx              lr
210 endfunc
211
/*
 * ff_sbr_hf_gen_neon
 *
 * Second-order complex filter over float pairs (re, im):
 *   out[i] = in[i] + c1 * in[i-2] + c0 * in[i-1]        (complex products)
 * with c1 = (pair at r3) * bw * bw and c0 = (pair at r2) * bw.
 *
 * In:  r0 = output pairs, r1 = input pairs (both indexed from element 0)
 *      r2 = coefficient pair scaled by bw, r3 = pair scaled by bw*bw
 *      bw = scalar float: read from [sp] on the NOVFP line, taken from
 *           d0[0] on the VFP line (both macros come from asm.S, not shown;
 *           presumably they select soft- vs hard-float argument passing)
 *      two stack words at [sp, #4*!HAVE_VFP_ARGS]: first index, end index
 *
 * Notes: two elements per pass and the loop body runs before the count is
 *        tested; r0 and r1 + (start-2)*8 are accessed with :128 hints.
 * Clobbers: r0-r3, q0-q3, q8, d18, q10, flags.
 */
212 function ff_sbr_hf_gen_neon, export=1
213 NOVFP   vld1.32         {d1[]},   [sp,:32]
214 VFP     vdup.32         d1,  d0[0]
215         vmul.f32        d0,  d1,  d1            /* q0 = (bw*bw, bw*bw, bw, bw) */
216         vld1.32         {d3},     [r2,:64]
217         vld1.32         {d2},     [r3,:64]
218         vmul.f32        q0,  q0,  q1            /* d0 = c1 (re, im), d1 = c0 (re, im) */
219         ldrd            r2,  r3,  [sp, #4*!HAVE_VFP_ARGS]
220         vtrn.32         d0,  d1                 /* d0 = (c1.re, c0.re), d1 = (c1.im, c0.im) */
221         vneg.f32        d18, d1
222         vtrn.32         d18, d1                 /* d18 = (-c1.im, +c1.im), d1 = (-c0.im, +c0.im) */
223         add             r0,  r0,  r2,  lsl #3
224         add             r1,  r1,  r2,  lsl #3
225         sub             r1,  r1,  #2*8          /* start two elements back: in[i-2] */
226         sub             r3,  r3,  r2            /* element count */
227         vld1.32         {q1},     [r1,:128]!    /* q1 = in[i-2], in[i-1] */
228 1:
229         vld1.32         {q3},     [r1,:128]!    /* q3 = in[i], in[i+1]; also the accumulator */
230         vrev64.32       q2,  q1                 /* (im, re) copies for the cross terms */
231         vmov            q8,  q3                 /* keep the unmodified inputs for later */
232         vrev64.32       d20, d3
233         vrev64.32       d21, d6
234         vmla.f32        q3,  q1,  d0[0]         /* + c1.re * in[i-2] / in[i-1] */
235         vmla.f32        d6,  d4,  d18           /* + c1.im cross term */
236         vmla.f32        d7,  d20, d18
237         vmla.f32        d6,  d3,  d0[1]         /* + c0.re * in[i-1] / in[i] */
238         vmla.f32        d7,  d16, d0[1]
239         vmla.f32        d6,  d5,  d1            /* + c0.im cross term */
240         vmla.f32        d7,  d21, d1
241         vmov            q1,  q8                 /* slide the input window by two elements */
242         vst1.32         {q3},     [r0,:128]!
243         subs            r3,  r3,  #2
244         bgt             1b
245         bx              lr
246 endfunc
247
/*
 * ff_sbr_autocorrelate_neon
 *
 * Accumulates products between neighbouring input vectors and their
 * pair-swapped copies, plus sums of squares, then stores selected results.
 * Presumably this is the SBR autocorrelation of 40 complex samples at lags
 * 0..2 -- confirm against the C reference.
 *
 * In:  r0 = input, 80 floats are read (1 + 18 + 1 loads of four floats),
 *           16-byte aligned (:128 hint)
 *      r1 = output, 16-byte aligned (:128 hint)
 * Out: floats 0..3, 4, 6..7 and 10 relative to r1 are written; floats 5,
 *      8, 9 and 11 are left untouched.
 * Clobbers: r0, r1, r12, q0-q3, q8-q12, flags.
 */
248 function ff_sbr_autocorrelate_neon, export=1
249         vld1.32         {q0},     [r0,:128]!
250         vmov.f32        q1,  #0.0               /* accumulators */
251         vmov.f32        q3,  #0.0
252         vmov.f32        d20, #0.0
253         vmul.f32        d21, d1,  d1            /* squares of the second input pair */
254         vmov            q8,  q0                 /* first vector is needed again after the loop */
255         vmov            q11, q0
256         mov             r12, #36
257 1:
258         vld1.32         {q2},     [r0,:128]!
259         vrev64.32       q12, q2                 /* pair-swapped copy for the cross terms */
260         vmla.f32        q10, q2,  q2            /* running sums of squares */
261         vmla.f32        d2,  d1,  d4
262         vmla.f32        d3,  d1,  d24
263         vmla.f32        d6,  d0,  d4
264         vmla.f32        d7,  d0,  d24
265         vmla.f32        d2,  d4,  d5
266         vmla.f32        d3,  d4,  d25
267         vmla.f32        d6,  d1,  d5
268         vmla.f32        d7,  d1,  d25
269         vmov            q0,  q2                 /* current vector becomes the previous one */
270         subs            r12, r12, #2            /* 18 passes */
271         bgt             1b
/* Final vector: same products, minus the sums that would run past it. */
272         vld1.32         {q2},     [r0,:128]!
273         vrev64.32       q12, q2
274         vmla.f32        d2,  d1,  d4
275         vmla.f32        d3,  d1,  d24
276         vmla.f32        d6,  d0,  d4
277         vmla.f32        d7,  d0,  d24
278         vadd.f32        d20, d20, d21
279         vrev64.32       d18, d17
280         vmla.f32        d6,  d1,  d5
281         vmla.f32        d7,  d1,  d25
282         vmov            q0,  q1
283         vmla.f32        d0,  d16, d17
284         vmla.f32        d1,  d16, d18
285         vmla.f32        d2,  d4,  d5
286         vmla.f32        d3,  d4,  d25
287         vneg.f32        s15, s15                /* s15 is the top lane of d7 */
288         vmov            d21, d20
289         vpadd.f32       d0,  d0,  d2
290         vpadd.f32       d7,  d6,  d7
291         vtrn.32         d1,  d3
292         vsub.f32        d6,  d1,  d3
293         vmla.f32        d20, d22, d22           /* d22 = first input pair (low half of q11) */
294         vmla.f32        d21, d4,  d4            /* d4 = first pair of the final vector */
295         vtrn.32         d0,  d6
296         vpadd.f32       d20, d20, d21
297         vst1.32         {q3},     [r1,:128]!    /* output floats 0..3 */
298         vst1.32         {d20[1]}, [r1,:32]      /* output float 4 */
299         add             r1,  r1,  #2*4
300         vst1.32         {d0},     [r1,:64]      /* output floats 6..7 */
301         add             r1,  r1,  #4*4
302         vst1.32         {d20[0]}, [r1,:32]      /* output float 10 */
303         bx              lr
304 endfunc
305
/*
 * ff_sbr_hf_apply_noise_0_neon
 *
 * Entry with an all-zero sign mask in d3. The shared body starting at
 * .Lhf_apply_noise_0 is also entered by ff_sbr_hf_apply_noise_2_neon, which
 * arrives with the sign bit set in both lanes of d3.
 *
 * In:  r0 = Y, float pairs updated in place (8-byte aligned, :64 hint)
 *      r1 = one float per element; a value that compares unequal to zero
 *           selects the first path below (8-byte aligned)
 *      r2 = one float per element, gain for the table path (8-byte aligned)
 *      r3 = table index; it is advanced by one before first use and kept
 *           to 9 bits
 *      [sp, #4] at entry = element count (read as [sp, #12] after the push)
 *
 * Per element m:
 *      idx = (idx + 1) & 0x1ff
 *      if r1[m] != 0:  Y[m][0] += r1[m] with its sign bit XORed by d3
 *      else:           Y[m][0..1] += ff_sbr_noise_table[idx][0..1] * r2[m]
 *
 * Notes: the paired loop runs before the count is tested, so two elements
 *        are always processed. The scalar tail runs only for an odd count
 *        of three or more.
 *        NOTE(review): the paired table load reads entries r3 and r3 + 1;
 *        when r3 is 0x1ff the second one lies past index 0x1ff instead of
 *        wrapping to 0 -- confirm this is acceptable for the table.
 * Clobbers: r0-r3, r12, q0-q3, d16, d18, flags. r4 and lr are saved.
 */
306 function ff_sbr_hf_apply_noise_0_neon, export=1
307         vmov.i32        d3,  #0                 /* sign mask: leave the sign alone */
308 .Lhf_apply_noise_0:
309         push            {r4,lr}
310         movrelx         r4,  X(ff_sbr_noise_table)
311         ldr             r12, [sp, #12]          /* element count, 8 bytes further up after the push */
312         add             r3,  r3,  #1
313         bfc             r3,  #9,  #23           /* clear bits 9..31: wrap to 0..0x1ff */
314         sub             r12, r12, #1            /* bias so that zero after the loop means one element left */
315 1:
316         add             lr,  r4,  r3,  lsl #3   /* table entries are 8 bytes */
317         vld2.32         {q0},     [r0,:64]      /* d0 = first floats of two elements, d1 = second floats */
318         vld2.32         {q3},     [lr,:64]      /* same split for two table entries */
319         vld1.32         {d2},     [r1,:64]!
320         vld1.32         {d18},    [r2,:64]!
321         vceq.f32        d16, d2,  #0            /* all-ones where the r1 value is zero */
322         veor            d2,  d2,  d3            /* apply the sign mask */
323         vmov            q2,  q0                 /* untouched copy for the other path */
324         vmla.f32        d0,  d6,  d18           /* table path, first floats */
325         vmla.f32        d1,  d7,  d18           /* table path, second floats */
326         vadd.f32        d4,  d4,  d2            /* other path: only the first float changes */
327         add             r3,  r3,  #2
328         bfc             r3,  #9,  #23
329         vbif            d0,  d4,  d16           /* where the mask is clear, take the other path */
330         vbif            d1,  d5,  d16
331         vst2.32         {q0},     [r0,:64]!
332         subs            r12, r12, #2
333         bgt             1b
334         blt             2f                      /* even count: nothing left */
/* Scalar tail for the last element of an odd count. */
335         add             lr,  r4,  r3,  lsl #3
336         vld1.32         {d0},     [r0,:64]      /* d0 = (first float, second float) of one element */
337         vld1.32         {d6},     [lr,:64]
338         vld1.32         {d2[]},   [r1,:32]!
/* The gain goes into d18, as in the loop: d3 still holds the sign mask that
 * the veor below needs, so it must not be overwritten here. */
339         vld1.32         {d18[]},  [r2,:32]!
340         vceq.f32        d4,  d2,  #0
341         veor            d2,  d2,  d3
342         vmov            d1,  d0
343         vmla.f32        d0,  d6,  d18
344         vadd.f32        s2,  s2,  s4            /* s2 = first float of the copy, s4 = masked r1 value */
345         vbif            d0,  d1,  d4
346         vst1.32         {d0},     [r0,:64]!
347 2:
348         pop             {r4,pc}
349 endfunc
350
/*
 * ff_sbr_hf_apply_noise_1_neon
 *
 * Builds an alternating sign mask from the first stack word and runs the
 * shared body at .Lhf_apply_noise_1. ff_sbr_hf_apply_noise_3_neon enters
 * the same label with the two mask lanes exchanged.
 *
 * In:  r0 = Y, float pairs updated in place (8-byte aligned, :64 hint)
 *      r1 = one float per element; a value that compares unequal to zero
 *           selects the first path below (8-byte aligned)
 *      r2 = one float per element, gain for the table path (8-byte aligned)
 *      r3 = table index; it is advanced by one before first use and kept
 *           to 9 bits
 *      [sp]     at entry = word whose bit 0 seeds the sign mask
 *      [sp, #4] at entry = element count (read as [sp, #12] after the push)
 *
 * Mask: d3[0] = word << 31, d3[1] = d3[0] ^ (1 << 31). Lane 0 applies to
 *       the first element of every pair, lane 1 to the second.
 *
 * Per element m:
 *      idx = (idx + 1) & 0x1ff
 *      if r1[m] != 0:  Y[m][1] += r1[m] with its sign bit XORed by the
 *                      mask lane for that position
 *      else:           Y[m][0..1] += ff_sbr_noise_table[idx][0..1] * r2[m]
 *
 * Notes: the paired loop runs before the count is tested, so two elements
 *        are always processed. The scalar tail runs only for an odd count
 *        of three or more.
 *        NOTE(review): the paired table load reads entries r3 and r3 + 1;
 *        when r3 is 0x1ff the second one lies past index 0x1ff instead of
 *        wrapping to 0 -- confirm this is acceptable for the table.
 * Clobbers: r0-r3, r12, q0-q3, d16, d18, flags. r4 and lr are saved.
 */
351 function ff_sbr_hf_apply_noise_1_neon, export=1
352         ldr             r12, [sp]               /* read before the push moves sp */
353         push            {r4,lr}
354         lsl             r12, r12, #31           /* bit 0 becomes the sign bit */
355         eor             lr,  r12, #1<<31        /* opposite sign for the second element of a pair */
356         vmov            d3,  r12, lr            /* d3[0] = r12, d3[1] = lr */
357 .Lhf_apply_noise_1:
358         movrelx         r4,  X(ff_sbr_noise_table)
359         ldr             r12, [sp, #12]          /* element count, 8 bytes further up after the push */
360         add             r3,  r3,  #1
361         bfc             r3,  #9,  #23           /* clear bits 9..31: wrap to 0..0x1ff */
362         sub             r12, r12, #1            /* bias so that zero after the loop means one element left */
363 1:
364         add             lr,  r4,  r3,  lsl #3   /* table entries are 8 bytes */
365         vld2.32         {q0},     [r0,:64]      /* d0 = first floats of two elements, d1 = second floats */
366         vld2.32         {q3},     [lr,:64]      /* same split for two table entries */
367         vld1.32         {d2},     [r1,:64]!
368         vld1.32         {d18},    [r2,:64]!
369         vceq.f32        d16, d2,  #0            /* all-ones where the r1 value is zero */
370         veor            d2,  d2,  d3            /* lane 0: first element, lane 1: second element */
371         vmov            q2,  q0                 /* untouched copy for the other path */
372         vmla.f32        d0,  d6,  d18           /* table path, first floats */
373         vmla.f32        d1,  d7,  d18           /* table path, second floats */
374         vadd.f32        d5,  d5,  d2            /* other path: only the second float changes */
375         add             r3,  r3,  #2
376         bfc             r3,  #9,  #23
377         vbif            d0,  d4,  d16           /* where the mask is clear, take the other path */
378         vbif            d1,  d5,  d16
379         vst2.32         {q0},     [r0,:64]!
380         subs            r12, r12, #2
381         bgt             1b
382         blt             2f                      /* even count: nothing left */
/* Scalar tail for the last element of an odd count. */
383         add             lr,  r4,  r3,  lsl #3
384         vld1.32         {d0},     [r0,:64]      /* d0 = (first float, second float) of one element */
385         vld1.32         {d6},     [lr,:64]
386         vld1.32         {d2[]},   [r1,:32]!
387         vld1.32         {d18[]},  [r2,:32]!
388         vceq.f32        d4,  d2,  #0
389         veor            d2,  d2,  d3            /* s4 = value ^ d3[0], s5 = value ^ d3[1] */
390         vmov            d1,  d0
391         vmla.f32        d0,  d6,  d18
/* The tail element follows whole pairs, so it sits in the first slot of a
 * pair and takes the lane-0 sign (s4), the same lane the loop above applies
 * to first elements. */
392         vadd.f32        s3,  s3,  s4            /* s3 = second float of the copy */
393         vbif            d0,  d1,  d4
394         vst1.32         {d0},     [r0,:64]!
395 2:
396         pop             {r4,pc}
397 endfunc
398
/*
 * ff_sbr_hf_apply_noise_2_neon
 *
 * Same arguments as ff_sbr_hf_apply_noise_0_neon. Sets the sign bit in both
 * lanes of d3, so the shared body negates the r1 values before adding them,
 * then branches to .Lhf_apply_noise_0. That label sits before the push, so
 * nothing is saved here.
 */
399 function ff_sbr_hf_apply_noise_2_neon, export=1
400         vmov.i32        d3,  #1<<31
401         b               .Lhf_apply_noise_0
402 endfunc
403
/*
 * ff_sbr_hf_apply_noise_3_neon
 *
 * Same arguments as ff_sbr_hf_apply_noise_1_neon, with the two sign-mask
 * lanes exchanged: d3[0] = (word << 31) ^ (1 << 31), d3[1] = word << 31,
 * where word is the first stack argument. The push is done here because
 * .Lhf_apply_noise_1 lies after the push in ff_sbr_hf_apply_noise_1_neon.
 */
404 function ff_sbr_hf_apply_noise_3_neon, export=1
405         ldr             r12, [sp]               /* read before the push moves sp */
406         push            {r4,lr}
407         lsl             r12, r12, #31
408         eor             lr,  r12, #1<<31
409         vmov            d3,  lr, r12            /* lanes reversed relative to the _1 variant */
410         b               .Lhf_apply_noise_1
411 endfunc
411 endfunc