libavresample: NEON optimized FIR audio resampling
libavresample/arm/resample_neon.S
/*
 * Copyright (c) 2014 Peter Meerwald <pmeerw@pmeerw.net>
 *
 * This file is part of Libav.
 *
 * Libav is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * Libav is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with Libav; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/arm/asm.S"

#include "asm-offsets.h"

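/*
 * resample_one: compute one output sample as the dot product of
 * c->filter_length coefficients with the source window starting at
 * src[index >> phase_shift], using the filter phase selected by
 * (index & phase_mask).
 *
 * Register usage on entry (assumed to correspond to the C resample_one()
 * callback arguments (c, dst, dst_index, src, index, frac)):
 *   r0 = ResampleContext *c, r1 = dst, r2 = dst_index, r3 = src;
 *   index is read from the stack, frac is unused here.
 *
 * The tap loop consumes 8 coefficients per iteration (two 4-element
 * LOAD4/MLA4 groups, software-pipelined), then handles remainders of
 * 4, 2 and 1 taps.  The per-format helper macros LOAD1/2/4, MLA1/2/4,
 * MUL4, INIT4 and STORE are defined below before each instantiation.
 */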
.macro resample_one     fmt, es=2
function ff_resample_one_\fmt\()_neon, export=1
        push            {r4, r5}
        add             r1, r1, r2, lsl #\es

        ldr             r2, [r0, #PHASE_SHIFT+4] /* phase_mask */
        ldr             ip, [sp, #8] /* index */
        ldr             r5, [r0, #FILTER_LENGTH]
        and             r2, ip, r2 /* (index & phase_mask) */
        ldr             r4, [r0, #PHASE_SHIFT]
        lsr             r4, ip, r4 /* compute sample_index */
        mul             r2, r2, r5

        ldr             ip, [r0, #FILTER_BANK]
        add             r3, r3, r4, lsl #\es /* &src[sample_index] */

        cmp             r5, #8
        add             r0, ip, r2, lsl #\es /* filter = &filter_bank[...] */

        blt             5f
8:
        subs            r5, r5, #8
        LOAD4
        MUL4
7:
        LOAD4
        beq             6f
        cmp             r5, #8
        MLA4
        blt             4f
        subs            r5, r5, #8
        LOAD4
        MLA4
        b               7b
6:
        MLA4
        STORE
        pop             {r4, r5}
        bx              lr
5:
        INIT4
4:      /* remaining filter_length 1 to 7 */
        cmp             r5, #4
        blt             2f
        subs            r5, r5, #4
        LOAD4
        MLA4
        beq             0f
2:      /* remaining filter_length 1 to 3 */
        cmp             r5, #2
        blt             1f
        subs            r5, r5, #2
        LOAD2
        MLA2
        beq             0f
1:      /* remaining filter_length 1 */
        LOAD1
        MLA1
0:
        STORE
        pop             {r4, r5}
        bx              lr
endfunc

.purgem LOAD1
.purgem LOAD2
.purgem LOAD4
.purgem MLA1
.purgem MLA2
.purgem MLA4
.purgem MUL4
.purgem INIT4
.purgem STORE
.endm

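/*
 * Per-format implementations: each block below defines the LOAD1/2/4,
 * MLA1/2/4, MUL4, INIT4 and STORE helper macros for one sample format
 * and then instantiates resample_one (or resample_linear) with them;
 * the instantiation purges the helpers afterwards so the next format
 * can redefine them.
 */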
/* float32 */
.macro  LOAD1
        veor.32         d0, d0
        vld1.32         {d0[0]}, [r0]! /* load filter */
        vld1.32         {d4[0]}, [r3]! /* load src */
.endm
.macro  LOAD2
        vld1.32         {d0}, [r0]! /* load filter */
        vld1.32         {d4}, [r3]! /* load src */
.endm
.macro  LOAD4
        vld1.32         {d0,d1}, [r0]! /* load filter */
        vld1.32         {d4,d5}, [r3]! /* load src */
.endm
.macro  MLA1
        vmla.f32        d16, d0, d4[0]
.endm
.macro  MLA2
        vmla.f32        d16, d0, d4
.endm
.macro  MLA4
        vmla.f32        d16, d0, d4
        vmla.f32        d17, d1, d5
.endm
.macro  MUL4
        vmul.f32        d16, d0, d4
        vmul.f32        d17, d1, d5
.endm
.macro  INIT4
        veor.f32        q8, q8
.endm
.macro  STORE
        vpadd.f32       d16, d16, d17
        vpadd.f32       d16, d16, d16
        vst1.32         d16[0], [r1]
.endm

resample_one flt, 2

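/*
 * s32: coefficients and samples are 32-bit integers; products are
 * accumulated in 64-bit lanes (q8/q9) and STORE narrows the final sum
 * back to 32 bits with a saturating rounding right shift by 30 (the
 * filter bank is assumed to hold Q30 fixed-point coefficients).
 */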
/* s32 */
.macro  LOAD1
        veor.32         d0, d0
        vld1.32         {d0[0]}, [r0]! /* load filter */
        vld1.32         {d4[0]}, [r3]! /* load src */
.endm
.macro  LOAD2
        vld1.32         {d0}, [r0]! /* load filter */
        vld1.32         {d4}, [r3]! /* load src */
.endm
.macro  LOAD4
        vld1.32         {d0,d1}, [r0]! /* load filter */
        vld1.32         {d4,d5}, [r3]! /* load src */
.endm
.macro  MLA1
        vmlal.s32       q8, d0, d4[0]
.endm
.macro  MLA2
        vmlal.s32       q8, d0, d4
.endm
.macro  MLA4
        vmlal.s32       q8, d0, d4
        vmlal.s32       q9, d1, d5
.endm
.macro  MUL4
        vmull.s32       q8, d0, d4
        vmull.s32       q9, d1, d5
.endm
.macro  INIT4
        veor.s64        q8, q8
        veor.s64        q9, q9
.endm
.macro  STORE
        vadd.s64        q8, q8, q9
        vadd.s64        d16, d16, d17
        vqrshrn.s64     d16, q8, #30
        vst1.32         d16[0], [r1]
.endm

resample_one s32, 2

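/*
 * s16: coefficients and samples are 16-bit integers (hence es=1 in the
 * instantiation below); products are widened into the 32-bit lanes of
 * q8 and STORE narrows the sum back to 16 bits with a saturating
 * rounding right shift by 15 (Q15 coefficients assumed).
 */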
/* s16 */
.macro  LOAD1
        veor.16         d0, d0
        vld1.16         {d0[0]}, [r0]! /* load filter */
        vld1.16         {d4[0]}, [r3]! /* load src */
.endm
.macro  LOAD2
        veor.16         d0, d0
        vld1.32         {d0[0]}, [r0]! /* load filter */
        veor.16         d4, d4
        vld1.32         {d4[0]}, [r3]! /* load src */
.endm
.macro  LOAD4
        vld1.16         {d0}, [r0]! /* load filter */
        vld1.16         {d4}, [r3]! /* load src */
.endm
.macro  MLA1
        vmlal.s16       q8, d0, d4[0]
.endm
.macro  MLA2
        vmlal.s16       q8, d0, d4
.endm
.macro  MLA4
        vmlal.s16       q8, d0, d4
.endm
.macro  MUL4
        vmull.s16       q8, d0, d4
.endm
.macro  INIT4
        veor.s32        q8, q8
.endm
.macro  STORE
        vpadd.s32       d16, d16, d17
        vpadd.s32       d16, d16, d16
        vqrshrn.s32     d16, q8, #15
        vst1.16         d16[0], [r1]
.endm

resample_one s16, 1

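/*
 * resample_linear: like resample_one, but evaluates two adjacent filter
 * phases, filter[i] and filter[i + c->filter_length], against the same
 * source window.  STORE then interpolates linearly between the two
 * results:  out = val + (v2 - val) * frac / c->src_incr,
 * with frac taken from the stack and src_incr from the context.
 */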
.macro resample_linear  fmt, es=2
function ff_resample_linear_\fmt\()_neon, export=1
        push            {r4, r5}
        add             r1, r1, r2, lsl #\es

        ldr             r2, [r0, #PHASE_SHIFT+4] /* phase_mask */
        ldr             ip, [sp, #8] /* index */
        ldr             r5, [r0, #FILTER_LENGTH]
        and             r2, ip, r2 /* (index & phase_mask) */
        ldr             r4, [r0, #PHASE_SHIFT]
        lsr             r4, ip, r4 /* compute sample_index */
        mul             r2, r2, r5

        ldr             ip, [r0, #FILTER_BANK]
        add             r3, r3, r4, lsl #\es /* &src[sample_index] */

        cmp             r5, #8
        ldr             r4, [r0, #SRC_INCR]
        add             r0, ip, r2, lsl #\es /* filter = &filter_bank[...] */
        add             r2, r0, r5, lsl #\es /* filter[... + c->filter_length] */

        blt             5f
8:
        subs            r5, r5, #8
        LOAD4
        MUL4
7:
        LOAD4
        beq             6f
        cmp             r5, #8
        MLA4
        blt             4f
        subs            r5, r5, #8
        LOAD4
        MLA4
        b               7b
6:
        MLA4
        STORE
        pop             {r4, r5}
        bx              lr
5:
        INIT4
4:      /* remaining filter_length 1 to 7 */
        cmp             r5, #4
        blt             2f
        subs            r5, r5, #4
        LOAD4
        MLA4
        beq             0f
2:      /* remaining filter_length 1 to 3 */
        cmp             r5, #2
        blt             1f
        subs            r5, r5, #2
        LOAD2
        MLA2
        beq             0f
1:      /* remaining filter_length 1 */
        LOAD1
        MLA1
0:
        STORE
        pop             {r4, r5}
        bx              lr
endfunc

.purgem LOAD1
.purgem LOAD2
.purgem LOAD4
.purgem MLA1
.purgem MLA2
.purgem MLA4
.purgem MUL4
.purgem INIT4
.purgem STORE
.endm

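/*
 * float32 linear: the current-phase products are accumulated in q9
 * ("val") and the next-phase products in q8 ("v2").  STORE converts
 * frac and src_incr to float, reduces both accumulators with pairwise
 * adds, and writes val + (v2 - val) * frac / src_incr to dst.
 */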
/* float32 linear */
.macro  LOAD1
        veor.32         d0, d0
        veor.32         d2, d2
        vld1.32         {d0[0]}, [r0]! /* load filter */
        vld1.32         {d2[0]}, [r2]! /* load filter */
        vld1.32         {d4[0]}, [r3]! /* load src */
.endm
.macro  LOAD2
        vld1.32         {d0}, [r0]! /* load filter */
        vld1.32         {d2}, [r2]! /* load filter */
        vld1.32         {d4}, [r3]! /* load src */
.endm
.macro  LOAD4
        vld1.32         {d0,d1}, [r0]! /* load filter */
        vld1.32         {d2,d3}, [r2]! /* load filter */
        vld1.32         {d4,d5}, [r3]! /* load src */
.endm
.macro  MLA1
        vmla.f32        d18, d0, d4[0]
        vmla.f32        d16, d2, d4[0]
.endm
.macro  MLA2
        vmla.f32        d18, d0, d4
        vmla.f32        d16, d2, d4
.endm
.macro  MLA4
        vmla.f32        q9, q0, q2
        vmla.f32        q8, q1, q2
.endm
.macro  MUL4
        vmul.f32        q9, q0, q2
        vmul.f32        q8, q1, q2
.endm
.macro  INIT4
        veor.f32        q9, q9
        veor.f32        q8, q8
.endm
.macro  STORE
        vldr            s0, [sp, #12] /* frac */
        vmov            s1, r4
        vcvt.f32.s32    d0, d0

        vsub.f32        q8, q8, q9 /* v2 - val */
        vpadd.f32       d18, d18, d19
        vpadd.f32       d16, d16, d17
        vpadd.f32       d2, d18, d18
        vpadd.f32       d1, d16, d16

        vmul.f32        s2, s2, s0 /* (v2 - val) * frac */
        vdiv.f32        s2, s2, s1 /* / c->src_incr */
        vadd.f32        s4, s4, s2

        vstr            s4, [r1]
.endm

resample_linear flt, 2