Merge commit 'ab05d3934de8e932dbd77979a687e6598e67535c'
[ffmpeg.git] / libswresample / aarch64 / audio_convert_neon.S
1 /*
2  * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
3  * Copyright (c) 2014 Janne Grunau <janne-libav@jannau.net>
4  *
5  * This file is part of FFmpeg.
6  *
7  * FFmpeg is free software; you can redistribute it and/or
8  * modify it under the terms of the GNU Lesser General Public
9  * License as published by the Free Software Foundation; either
10  * version 2.1 of the License, or (at your option) any later version.
11  *
12  * FFmpeg is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15  * Lesser General Public License for more details.
16  *
17  * You should have received a copy of the GNU Lesser General Public
18  * License along with FFmpeg; if not, write to the Free Software
19  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20  */
21
22 #include "config.h"
23 #include "libavutil/aarch64/asm.S"
24
// Convert a run of 32-bit floats to signed 16-bit integers.
//
// Registers as used by the code below:
//   x0  destination pointer; 8 halfwords (16 bytes) are stored per step
//   x1  source pointer; 4 floats (16 bytes) are loaded per step
//   w2  number of samples. Only the low 32 bits are read: the n-channel
//       dispatcher in this file treats the same argument as 32-bit, and the
//       upper half of a 32-bit argument register is unspecified by AAPCS64.
//       NOTE(review): assumes the C prototype declares this as int - confirm.
//
// Each float is converted with FCVTZS #31 (signed fixed-point with 31
// fractional bits, rounding toward zero) and then narrowed with
// SQRSHRN/SQRSHRN2 #16 (rounding, saturating shift right) to 16 bits.
//
// The routine is software pipelined: 8 samples are always loaded and
// converted up front, so the count is expected to be a positive multiple
// of 8. No check is made; other counts make the loop counters miss zero.
//
// Modifies x0, x1, x2, x12, v0-v7 and the condition flags.
function swri_oldapi_conv_flt_to_s16_neon, export=1
        subs            w2,  w2,  #8                    // w2 = samples left after the 8 preloaded here
        ld1             {v0.4s}, [x1],  #16
        fcvtzs          v4.4s,  v0.4s,  #31
        ld1             {v1.4s}, [x1],  #16
        fcvtzs          v5.4s,  v1.4s,  #31
        b.eq            3f                              // exactly 8 samples: just narrow and store
        ands            w12, w2,  #~15                  // w12 = samples covered by the 16-per-pass loop
        b.eq            2f                              // fewer than 16 left: go to the 16-sample tail
1:      subs            w12, w12, #16
        sqrshrn         v4.4h,  v4.4s,  #16             // narrow the 8 pending samples (v4/v5) ...
        ld1             {v2.4s}, [x1],  #16
        fcvtzs          v6.4s,  v2.4s,  #31
        sqrshrn2        v4.8h,  v5.4s,  #16
        ld1             {v3.4s}, [x1],  #16
        fcvtzs          v7.4s,  v3.4s,  #31
        sqrshrn         v6.4h,  v6.4s,  #16             // ... and the 8 loaded in this pass (v6/v7)
        st1             {v4.8h}, [x0],  #16
        sqrshrn2        v6.8h,  v7.4s,  #16
        ld1             {v0.4s}, [x1],  #16             // preload the next 8 samples into v4/v5
        fcvtzs          v4.4s,  v0.4s,  #31
        ld1             {v1.4s}, [x1],  #16
        fcvtzs          v5.4s,  v1.4s,  #31
        st1             {v6.8h}, [x0],  #16
        b.ne            1b                              // flags still come from the subs at 1:
        ands            w2,  w2,  #15
        b.eq            3f                              // nothing beyond the pending 8
2:      ld1             {v2.4s}, [x1],  #16             // tail: pending 8 plus 8 more = 16 samples
        sqrshrn         v4.4h,  v4.4s,  #16
        fcvtzs          v6.4s,  v2.4s,  #31
        ld1             {v3.4s}, [x1],  #16
        sqrshrn2        v4.8h,  v5.4s,  #16
        fcvtzs          v7.4s,  v3.4s,  #31
        sqrshrn         v6.4h,  v6.4s,  #16
        st1             {v4.8h}, [x0],  #16
        sqrshrn2        v6.8h,  v7.4s,  #16
        st1             {v6.8h}, [x0]
        ret
3:      sqrshrn         v4.4h,  v4.4s,  #16             // tail: only the pending 8 samples remain
        sqrshrn2        v4.8h,  v5.4s,  #16
        st1             {v4.8h}, [x0]
        ret
endfunc
68
// Convert two separate float streams into one interleaved signed 16-bit
// stream (first stream, second stream, first, second, ...).
//
// Registers as used by the code below:
//   x0  destination pointer; 4 bytes are written per frame
//   x1  pointer to two consecutive source pointers (loaded into x4 and x5)
//   w2  number of frames per source stream. Only the low 32 bits are read:
//       the n-channel dispatcher in this file treats the same argument as
//       32-bit, and the upper half of a 32-bit argument register is
//       unspecified by AAPCS64.
//       NOTE(review): assumes the C prototype declares this as int - confirm.
//
// Each float is converted with FCVTZS #31 (signed fixed-point with 31
// fractional bits, rounding toward zero). SRI #16 then moves the upper 16
// bits of the x4-stream value into the lower half of the x5-stream lane,
// so each 32-bit lane holds one interleaved frame. The low 16 bits are
// dropped without rounding.
//
// The routine is software pipelined: 8 frames are always loaded and
// converted up front, so the count is expected to be a positive multiple
// of 8. No check is made.
//
// Modifies x0, x2, x4, x5, x12, v0-v7, v16-v23 and the condition flags.
function swri_oldapi_conv_fltp_to_s16_2ch_neon, export=1
        ldp             x4,  x5,  [x1]                  // x4 = first stream, x5 = second stream
        subs            w2,  w2,  #8                    // w2 = frames left after the 8 preloaded here
        ld1             {v0.4s},  [x4], #16
        fcvtzs          v4.4s,  v0.4s,  #31
        ld1             {v1.4s},  [x4], #16
        fcvtzs          v5.4s,  v1.4s,  #31
        ld1             {v2.4s},  [x5], #16
        fcvtzs          v6.4s,  v2.4s,  #31
        ld1             {v3.4s},  [x5], #16
        fcvtzs          v7.4s,  v3.4s,  #31
        b.eq            3f                              // exactly 8 frames: interleave and store
        ands            w12, w2,  #~15                  // w12 = frames covered by the 16-per-pass loop
        b.eq            2f                              // fewer than 16 left: go to the 16-frame tail
1:      subs            w12, w12, #16
        ld1             {v16.4s}, [x4], #16
        fcvtzs          v20.4s, v16.4s, #31
        sri             v6.4s,  v4.4s,  #16             // pending frames 0-3: {x4 sample, x5 sample} per lane
        ld1             {v17.4s}, [x4], #16
        fcvtzs          v21.4s, v17.4s, #31
        ld1             {v18.4s}, [x5], #16
        fcvtzs          v22.4s, v18.4s, #31
        ld1             {v19.4s}, [x5], #16
        sri             v7.4s,  v5.4s,  #16             // pending frames 4-7
        st1             {v6.4s},  [x0], #16
        fcvtzs          v23.4s, v19.4s, #31
        st1             {v7.4s},  [x0], #16
        sri             v22.4s, v20.4s, #16             // the 8 frames loaded in this pass
        ld1             {v0.4s},  [x4], #16             // preload the next 8 frames into v4-v7
        sri             v23.4s, v21.4s, #16
        st1             {v22.4s}, [x0], #16
        fcvtzs          v4.4s,  v0.4s,  #31
        ld1             {v1.4s},  [x4], #16
        fcvtzs          v5.4s,  v1.4s,  #31
        ld1             {v2.4s},  [x5], #16
        fcvtzs          v6.4s,  v2.4s,  #31
        ld1             {v3.4s},  [x5], #16
        fcvtzs          v7.4s,  v3.4s,  #31
        st1             {v23.4s}, [x0], #16
        b.ne            1b                              // flags still come from the subs at 1:
        ands            w2,  w2,  #15
        b.eq            3f                              // nothing beyond the pending 8
2:      sri             v6.4s,  v4.4s,  #16             // tail: pending 8 plus 8 more = 16 frames
        ld1             {v0.4s},  [x4], #16
        fcvtzs          v0.4s,  v0.4s,  #31
        ld1             {v1.4s},  [x4], #16
        fcvtzs          v1.4s,  v1.4s,  #31
        ld1             {v2.4s},  [x5], #16
        fcvtzs          v2.4s,  v2.4s,  #31
        sri             v7.4s,  v5.4s,  #16
        ld1             {v3.4s},  [x5], #16
        fcvtzs          v3.4s,  v3.4s,  #31
        sri             v2.4s,  v0.4s,  #16
        st1             {v6.4s,v7.4s},  [x0], #32
        sri             v3.4s,  v1.4s,  #16
        st1             {v2.4s,v3.4s},  [x0], #32
        ret
3:      sri             v6.4s,  v4.4s,  #16             // tail: only the pending 8 frames remain
        sri             v7.4s,  v5.4s,  #16
        st1             {v6.4s,v7.4s},  [x0]
        ret
endfunc
131
// Convert an arbitrary number of separate float streams into one interleaved
// signed 16-bit stream.
//
// Registers as used by the code below:
//   x0  destination pointer (start of the interleaved output)
//   x1  pointer to an array of source pointers, one per channel
//   w2  number of frames per channel
//   w3  number of channels
//   NOTE(review): w2/w3 are assumed to be declared as int in the C
//   prototype - confirm. Only their low 32 bits are read here; AAPCS64
//   leaves the upper half of a 32-bit argument register unspecified.
//   The two tail calls below pass x2 through unchanged, so the callee is
//   responsible for ignoring its upper half.
//
// Dispatch:
//   w3 == 2  tail-call swri_oldapi_conv_fltp_to_s16_2ch_neon
//   w3 <  2  load the first source pointer and tail-call
//            swri_oldapi_conv_flt_to_s16_neon
//   w3 >  2  generic path below: channels are consumed four at a time while
//            at least four remain, then one pair, then one single channel.
//            After each group x0 moves to the next channel slot of the
//            first frame and the whole frame range is walked again.
//
// In the generic path x12 = 2 * channels is the byte distance between
// consecutive output frames and is used as the post-index of every store.
// Samples are converted with FCVTZS #31 and only the upper 16 bits of each
// 32-bit result are stored (via SRI or odd halfword lanes), i.e. truncated
// rather than rounded.
//
// Every loop handles frames in groups of 8 with data preloaded ahead of the
// stores, so the frame count is expected to be a positive multiple of 8.
// No check is made.
//
// Generic path modifies x0, x1, w3, x4-x9, x12, v0-v7, v16-v19 and flags.
function swri_oldapi_conv_fltp_to_s16_nch_neon, export=1
        cmp             w3,  #2
        b.eq            X(swri_oldapi_conv_fltp_to_s16_2ch_neon)
        b.gt            1f
        ldr             x1,  [x1]                       // fewer than 2 channels: pass the first source pointer
        b               X(swri_oldapi_conv_flt_to_s16_neon)
1:
        cmp             w3,  #4
        lsl             w12, w3,  #1                    // x12 = bytes per output frame; w-form ignores junk in x3[63:32]
        b.lt            4f                              // lsl leaves the flags from cmp intact

5:      // 4 channels
        ldp             x4, x5, [x1], #16               // take the next four source pointers
        ldp             x6, x7, [x1], #16
        mov             w9,  w2                         // w9 = frames left for this channel group
        mov             x8,  x0                         // x8 = output cursor for this channel group
        ld1             {v4.4s},        [x4], #16
        fcvtzs          v4.4s,  v4.4s,  #31
        ld1             {v5.4s},        [x5], #16
        fcvtzs          v5.4s,  v5.4s,  #31
        ld1             {v6.4s},        [x6], #16
        fcvtzs          v6.4s, v6.4s, #31
        ld1             {v7.4s},        [x7], #16
        fcvtzs          v7.4s, v7.4s, #31
6:
        subs            w9,  w9,  #8
        ld1             {v0.4s},        [x4], #16
        fcvtzs          v0.4s,  v0.4s,  #31
        sri             v5.4s,  v4.4s,  #16             // lanes = {x4 sample, x5 sample} for 4 frames
        ld1             {v1.4s},        [x5], #16
        fcvtzs          v1.4s,  v1.4s,  #31
        sri             v7.4s,  v6.4s,  #16             // lanes = {x6 sample, x7 sample} for 4 frames
        ld1             {v2.4s},        [x6], #16
        fcvtzs          v2.4s,  v2.4s,  #31
        zip1            v16.4s, v5.4s,  v7.4s           // each 64-bit half = four channels of one frame
        ld1             {v3.4s},        [x7], #16
        fcvtzs          v3.4s,  v3.4s,  #31
        zip2            v17.4s, v5.4s,  v7.4s
        st1             {v16.d}[0],     [x8], x12
        sri             v1.4s,  v0.4s,  #16
        st1             {v16.d}[1],     [x8], x12
        sri             v3.4s,  v2.4s,  #16
        st1             {v17.d}[0],     [x8], x12
        zip1            v18.4s, v1.4s,  v3.4s
        st1             {v17.d}[1],     [x8], x12
        zip2            v19.4s, v1.4s,  v3.4s
        b.eq            7f                              // last 8 frames: flush v18/v19 without preloading
        ld1             {v4.4s},        [x4], #16
        fcvtzs          v4.4s,  v4.4s,  #31
        st1             {v18.d}[0],     [x8], x12
        ld1             {v5.4s},        [x5], #16
        fcvtzs          v5.4s,  v5.4s,  #31
        st1             {v18.d}[1],     [x8], x12
        ld1             {v6.4s},    [x6], #16
        fcvtzs          v6.4s, v6.4s, #31
        st1             {v19.d}[0],     [x8], x12
        ld1             {v7.4s},    [x7], #16
        fcvtzs          v7.4s, v7.4s, #31
        st1             {v19.d}[1],     [x8], x12
        b               6b
7:
        st1             {v18.d}[0],     [x8], x12
        st1             {v18.d}[1],     [x8], x12
        st1             {v19.d}[0],     [x8], x12
        st1             {v19.d}[1],     [x8], x12
        subs            w3,  w3,  #4                    // four channels done
        b.eq            9f
        cmp             w3,  #4
        add             x0,  x0,  #8                    // skip the four 16-bit slots just filled
        b.ge            5b

4:      // 2 channels
        cmp             w3,  #2
        b.lt            4f
        ldp             x4,  x5,  [x1], #16             // take the next two source pointers
        mov             w9,  w2
        mov             x8,  x0
        tst             w9,  #8                         // is the frame count an odd multiple of 8?
        ld1             {v4.4s},        [x4], #16
        fcvtzs          v4.4s,  v4.4s,  #31
        ld1             {v5.4s},        [x5], #16
        fcvtzs          v5.4s,  v5.4s,  #31
        ld1             {v6.4s},        [x4], #16
        fcvtzs          v6.4s,  v6.4s,  #31
        ld1             {v7.4s},        [x5], #16
        fcvtzs          v7.4s,  v7.4s,  #31
        b.eq            6f                              // multiple of 16: straight to the 16-frame loop
        subs            w9,  w9,  #8
        b.eq            7f                              // exactly 8 frames
        sri             v5.4s,  v4.4s,  #16             // emit the odd group of 8 while preloading the next 8
        ld1             {v4.4s},        [x4], #16
        fcvtzs          v4.4s,  v4.4s,  #31
        st1             {v5.s}[0],      [x8], x12
        sri             v7.4s,  v6.4s,  #16
        st1             {v5.s}[1],      [x8], x12
        ld1             {v6.4s},        [x4], #16
        fcvtzs          v6.4s,  v6.4s, #31
        st1             {v5.s}[2],      [x8], x12
        st1             {v5.s}[3],      [x8], x12
        st1             {v7.s}[0],      [x8], x12
        st1             {v7.s}[1],      [x8], x12
        ld1             {v5.4s},        [x5], #16
        fcvtzs          v5.4s,  v5.4s,  #31
        st1             {v7.s}[2],      [x8], x12
        st1             {v7.s}[3],      [x8], x12
        ld1             {v7.4s},        [x5], #16
        fcvtzs          v7.4s,  v7.4s,  #31
6:
        subs            w9,  w9,  #16
        ld1             {v0.4s},        [x4], #16
        sri             v5.4s,  v4.4s,  #16
        fcvtzs          v0.4s,  v0.4s,  #31
        ld1             {v1.4s},        [x5], #16
        sri             v7.4s,  v6.4s,  #16
        st1             {v5.s}[0],      [x8], x12
        st1             {v5.s}[1],      [x8], x12
        fcvtzs          v1.4s,  v1.4s,  #31
        st1             {v5.s}[2],      [x8], x12
        st1             {v5.s}[3],      [x8], x12
        ld1             {v2.4s},        [x4], #16
        st1             {v7.s}[0],      [x8], x12
        fcvtzs          v2.4s,  v2.4s,  #31
        st1             {v7.s}[1],      [x8], x12
        ld1             {v3.4s},        [x5], #16
        st1             {v7.s}[2],      [x8], x12
        fcvtzs          v3.4s,  v3.4s,  #31
        st1             {v7.s}[3],      [x8], x12
        sri             v1.4s,  v0.4s,  #16
        sri             v3.4s,  v2.4s,  #16
        b.eq            6f                              // last 16 frames: flush v1/v3 without preloading
        ld1             {v4.4s},        [x4], #16
        st1             {v1.s}[0],      [x8], x12
        fcvtzs          v4.4s,  v4.4s,  #31
        st1             {v1.s}[1],      [x8], x12
        ld1             {v5.4s},        [x5], #16
        st1             {v1.s}[2],      [x8], x12
        fcvtzs          v5.4s,  v5.4s,  #31
        st1             {v1.s}[3],      [x8], x12
        ld1             {v6.4s},        [x4], #16
        st1             {v3.s}[0],      [x8], x12
        fcvtzs          v6.4s,  v6.4s,  #31
        st1             {v3.s}[1],      [x8], x12
        ld1             {v7.4s},        [x5], #16
        st1             {v3.s}[2],      [x8], x12
        fcvtzs          v7.4s,  v7.4s,  #31
        st1             {v3.s}[3],      [x8], x12
        b.gt            6b                              // flags still come from the subs at the loop head
6:
        st1             {v1.s}[0],      [x8], x12
        st1             {v1.s}[1],      [x8], x12
        st1             {v1.s}[2],      [x8], x12
        st1             {v1.s}[3],      [x8], x12
        st1             {v3.s}[0],      [x8], x12
        st1             {v3.s}[1],      [x8], x12
        st1             {v3.s}[2],      [x8], x12
        st1             {v3.s}[3],      [x8], x12
        b               8f
7:
        sri             v5.4s,  v4.4s,  #16
        sri             v7.4s,  v6.4s,  #16
        st1             {v5.s}[0],      [x8], x12
        st1             {v5.s}[1],      [x8], x12
        st1             {v5.s}[2],      [x8], x12
        st1             {v5.s}[3],      [x8], x12
        st1             {v7.s}[0],      [x8], x12
        st1             {v7.s}[1],      [x8], x12
        st1             {v7.s}[2],      [x8], x12
        st1             {v7.s}[3],      [x8], x12
8:
        subs            w3,  w3,  #2                    // two channels done
        add             x0,  x0,  #4                    // skip the two 16-bit slots just filled
        b.eq            9f

4:      // 1 channel
        ldr             x4,  [x1]
        tst             w2,  #8                         // is the frame count an odd multiple of 8?
        mov             w9,  w2
        mov             x5,  x0
        ld1             {v0.4s},        [x4], #16
        fcvtzs          v0.4s,  v0.4s,  #31
        ld1             {v1.4s},        [x4], #16
        fcvtzs          v1.4s,  v1.4s,  #31
        b.ne            8f                              // odd multiple of 8: emit one group of 8 first
6:
        subs            w9,  w9,  #16
        ld1             {v2.4s},        [x4], #16
        fcvtzs          v2.4s,  v2.4s,  #31
        ld1             {v3.4s},        [x4], #16
        fcvtzs          v3.4s,  v3.4s,  #31
        st1             {v0.h}[1],      [x5], x12       // odd halfword lanes = upper 16 bits of each result
        st1             {v0.h}[3],      [x5], x12
        st1             {v0.h}[5],      [x5], x12
        st1             {v0.h}[7],      [x5], x12
        st1             {v1.h}[1],      [x5], x12
        st1             {v1.h}[3],      [x5], x12
        st1             {v1.h}[5],      [x5], x12
        st1             {v1.h}[7],      [x5], x12
        b.eq            7f                              // last 16 frames: skip the preload
        ld1             {v0.4s},        [x4], #16
        fcvtzs          v0.4s,  v0.4s,  #31
        ld1             {v1.4s},        [x4], #16
        fcvtzs          v1.4s,  v1.4s,  #31
7:
        st1             {v2.h}[1],      [x5], x12
        st1             {v2.h}[3],      [x5], x12
        st1             {v2.h}[5],      [x5], x12
        st1             {v2.h}[7],      [x5], x12
        st1             {v3.h}[1],      [x5], x12
        st1             {v3.h}[3],      [x5], x12
        st1             {v3.h}[5],      [x5], x12
        st1             {v3.h}[7],      [x5], x12
        b.gt            6b                              // flags still come from the subs at 6:
        ret
8:
        subs            w9,  w9,  #8
        st1             {v0.h}[1],      [x5], x12
        st1             {v0.h}[3],      [x5], x12
        st1             {v0.h}[5],      [x5], x12
        st1             {v0.h}[7],      [x5], x12
        st1             {v1.h}[1],      [x5], x12
        st1             {v1.h}[3],      [x5], x12
        st1             {v1.h}[5],      [x5], x12
        st1             {v1.h}[7],      [x5], x12
        b.eq            9f                              // exactly 8 frames
        ld1             {v0.4s},        [x4], #16
        fcvtzs          v0.4s,  v0.4s,  #31
        ld1             {v1.4s},        [x4], #16
        fcvtzs          v1.4s,  v1.4s,  #31
        b               6b
9:      // common exit once every channel has been written
        ret
endfunc