Merge commit '1d25a86902946dbc80bb3a38e61755181ca3af7b'
[ffmpeg.git] / libavcodec / aarch64 / simple_idct_neon.S
1 /*
2  * ARM NEON IDCT
3  *
4  * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
5  * Copyright (c) 2017 Matthieu Bouron <matthieu.bouron@gmail.com>
6  *
7  * Based on Simple IDCT
8  * Copyright (c) 2001 Michael Niedermayer <michaelni@gmx.at>
9  *
10  * This file is part of FFmpeg.
11  *
12  * FFmpeg is free software; you can redistribute it and/or
13  * modify it under the terms of the GNU Lesser General Public
14  * License as published by the Free Software Foundation; either
15  * version 2.1 of the License, or (at your option) any later version.
16  *
17  * FFmpeg is distributed in the hope that it will be useful,
18  * but WITHOUT ANY WARRANTY; without even the implied warranty of
19  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
20  * Lesser General Public License for more details.
21  *
22  * You should have received a copy of the GNU Lesser General Public
23  * License along with FFmpeg; if not, write to the Free Software
24  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
25  */
26
27 #include "libavutil/aarch64/asm.S"
28
/*
 * Fixed-point IDCT coefficients.
 * Zi is cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 converted to an integer.
 * Z4 is the one exception: the formula yields 16384 for i = 4, the table
 * holds 16383.  NOTE(review): presumably chosen to stay bit-exact with the
 * C simple IDCT -- confirm against the reference implementation.
 */
#define ROW_SHIFT 11
#define COL_SHIFT 20

#define Z1  22725   // i = 1
#define Z2  21407   // i = 2
#define Z3  19266   // i = 3
#define Z4  16383   // i = 4 (see note above)
#define Z5  12873   // i = 5
#define Z6  8867    // i = 6
#define Z7  4520    // i = 7

// Rounding term of the column pass, pre-divided by Z4 because
// idct_col4_neon adds it to the first input before multiplying by Z4.
#define Z4c ((1<<(COL_SHIFT-1))/Z4)
39
// Lane aliases into v0.  idct_start fills v0 from idct_coeff_neon, whose
// entries are stored in the order Z1, Z2, Z3, Z4, Z5, Z6, Z7, Z4c.
#define z1  v0.h[0]
#define z2  v0.h[1]
#define z3  v0.h[2]
#define z4  v0.h[3]
#define z5  v0.h[4]
#define z6  v0.h[5]
#define z7  v0.h[6]
#define z4c v0.h[7]
48
// Coefficient table loaded into v0 by idct_start.  Entry n ends up in 16-bit
// lane n of v0, so the order here must match the z1 ... z4c lane aliases.
const   idct_coeff_neon, align=4
        .short          Z1, Z2, Z3, Z4
        .short          Z5, Z6, Z7, Z4c
endconst
52
// Common prologue of the exported entry points.
//   data: register holding the address of the coefficient block; it is only
//         used as a prefetch address here.
// Writes x3 (scratch), x10 (copy of the link register) and v0 (the 16-byte
// idct_coeff_neon table).
.macro idct_start data
        mov             x10, x30                // the entry points use bl, which overwrites x30
        prfm            pldl1keep, [\data]
        movrel          x3,  idct_coeff_neon
        ld1             {v0.2d}, [x3]
.endm
59
// Common epilogue of the exported entry points: return to the address that
// idct_start copied from x30 into x10.
// ret is used instead of br so that the branch is a proper function return:
// it takes part in return-address prediction and, unlike an indirect br
// through x10, does not require a landing pad at the return site when branch
// target identification is enabled.
.macro idct_end
        ret             x10
.endm
63
// Low-half counterpart of smull2.  Macros in this file append a half
// selector (1 or 2) to the mnemonic; selector 1 lands here and is forwarded
// to the plain smull instruction.
.macro smull1 a, b, c
        smull           \a, \b, \c
.endm
67
// Low-half counterpart of smlal2: half selector 1 appended to the mnemonic
// is forwarded to the plain smlal instruction.
.macro smlal1 a, b, c
        smlal           \a, \b, \c
.endm
71
// Low-half counterpart of smlsl2: half selector 1 appended to the mnemonic
// is forwarded to the plain smlsl instruction.
.macro smlsl1 a, b, c
        smlsl           \a, \b, \c
.endm
75
// First half of a 4-lane 1-D IDCT: contributions of inputs 1, 2 and 3.
//   y1..y4: vector register names of inputs 0..3 (y1 is not read here; the
//           caller has already folded input 0 into v23)
//   i:      half selector appended to the multiply mnemonics (1 = low four
//           lanes, 2 = high four lanes)
//   l:      arrangement matching i (4H for 1, 8H for 2)
// In:  v23.4s = bias plus the scaled input 0, z1..z7 in v0
// Out: v19 = v23 + y3*z2        v17 = y2*z1 + y4*z3
//      v20 = v23 + y3*z6        v18 = y2*z3 - y4*z7
//      v21 = v23 - y3*z6        v5  = y2*z5 - y4*z1
//      v22 = v23 - y3*z2        v6  = y2*z7 - y4*z5
// Scratch: v7, v16.
.macro idct_col4_top y1, y2, y3, y4, i, l
        smull\i         v7.4s,  \y3\().\l, z2
        smull\i         v16.4s, \y3\().\l, z6
        smull\i         v17.4s, \y2\().\l, z1
        add             v19.4s, v23.4s, v7.4s
        smull\i         v18.4s, \y2\().\l, z3
        add             v20.4s, v23.4s, v16.4s
        smull\i         v5.4s,  \y2\().\l, z5
        sub             v21.4s, v23.4s, v16.4s
        smull\i         v6.4s,  \y2\().\l, z7
        sub             v22.4s, v23.4s, v7.4s

        smlal\i         v17.4s, \y4\().\l, z3
        smlsl\i         v18.4s, \y4\().\l, z7
        smlsl\i         v5.4s,  \y4\().\l, z1
        smlsl\i         v6.4s,  \y4\().\l, z5
.endm
93
// First pass over four 8-coefficient vectors.
//   y1..y4: vector register names that receive the data and hold the result
//   pass:   number used as the local label of the shortcut target; it must
//           be unique among the expansions inside one function
// In:  x2 = source pointer; 64 bytes are read and x2 is advanced by 64
// Out: y1..y4, each holding the eight 16-bit outputs of one of the four
//      1-D transforms (results are shifted right by ROW_SHIFT and the two
//      4x4 halves are transposed at the end)
// Clobbers: x3, v5, v6, v7, v16-v23.
//
// The low 64 bits of y1..y4 carry inputs 0..3 and the high 64 bits inputs
// 4..7 of the four transforms.  When the high 64 bits of all four registers
// are zero, the block that adds the contributions of inputs 4..7 is skipped.
.macro idct_row4_neon y1, y2, y3, y4, pass
        ld1             {\y1\().2d, \y2\().2d}, [x2], #32
        movi            v23.4s, #1<<2, lsl #8   // rounding bias 1 << (ROW_SHIFT - 1)
        orr             v5.16b, \y1\().16b, \y2\().16b
        ld1             {\y3\().2d, \y4\().2d}, [x2], #32
        orr             v6.16b, \y3\().16b, \y4\().16b
        orr             v5.16b, v5.16b, v6.16b
        mov             x3, v5.d[1]             // OR of the high halves of all four inputs
        smlal           v23.4s, \y1\().4h, z4

        idct_col4_top   \y1, \y2, \y3, \y4, 1, 4h

        cbz             x3, \pass\()f           // inputs 4..7 all zero: nothing more to add

        smull2          v7.4s,  \y1\().8h, z4
        smlal2          v17.4s, \y2\().8h, z5
        smlsl2          v18.4s, \y2\().8h, z1
        smull2          v16.4s, \y3\().8h, z2
        smlal2          v5.4s,  \y2\().8h, z7
        add             v19.4s, v19.4s, v7.4s
        sub             v20.4s, v20.4s, v7.4s
        sub             v21.4s, v21.4s, v7.4s
        add             v22.4s, v22.4s, v7.4s
        smlal2          v6.4s,  \y2\().8h, z3
        smull2          v7.4s,  \y3\().8h, z6
        smlal2          v17.4s, \y4\().8h, z7
        smlsl2          v18.4s, \y4\().8h, z5
        smlal2          v5.4s,  \y4\().8h, z3
        smlsl2          v6.4s,  \y4\().8h, z1
        add             v19.4s, v19.4s, v7.4s
        sub             v20.4s, v20.4s, v16.4s
        add             v21.4s, v21.4s, v16.4s
        sub             v22.4s, v22.4s, v7.4s

        // Butterfly: outputs 0..3 are even + odd (low halves of y1..y4),
        // outputs 4..7 are even - odd in reverse order (high halves).
\pass:  add             \y3\().4s, v19.4s, v17.4s
        add             \y4\().4s, v20.4s, v18.4s
        shrn            \y1\().4h, \y3\().4s, #ROW_SHIFT
        shrn            \y2\().4h, \y4\().4s, #ROW_SHIFT
        add             v7.4s,  v21.4s, v5.4s
        add             v16.4s, v22.4s, v6.4s
        shrn            \y3\().4h, v7.4s,  #ROW_SHIFT
        shrn            \y4\().4h, v16.4s, #ROW_SHIFT
        sub             v22.4s, v22.4s, v6.4s
        sub             v19.4s, v19.4s, v17.4s
        sub             v21.4s, v21.4s, v5.4s
        shrn2           \y1\().8h, v22.4s, #ROW_SHIFT
        sub             v20.4s, v20.4s, v18.4s
        shrn2           \y2\().8h, v21.4s, #ROW_SHIFT
        shrn2           \y3\().8h, v20.4s, #ROW_SHIFT
        shrn2           \y4\().8h, v19.4s, #ROW_SHIFT

        // Transpose the low and the high 4x4 block of 16-bit values.
        trn1            v16.8h, \y1\().8h, \y2\().8h
        trn2            v17.8h, \y1\().8h, \y2\().8h
        trn1            v18.8h, \y3\().8h, \y4\().8h
        trn2            v19.8h, \y3\().8h, \y4\().8h
        trn1            \y1\().4s, v16.4s, v18.4s
        trn1            \y2\().4s, v17.4s, v19.4s
        trn2            \y3\().4s, v16.4s, v18.4s
        trn2            \y4\().4s, v17.4s, v19.4s
.endm
155
// Emits the local function idct_col4_neon<i>: second pass over one half of
// the registers v24..v31 (inputs 0..7 of four 1-D transforms).
//   i: 1 selects the low 64 bits of every input register, 2 the high 64 bits
//   l: arrangement matching i (4H for 1, 8H for 2)
// In:  v24..v31, coefficient lanes in v0
// Out: the upper 16 bits of every 32-bit sum, packed as
//        v7  = output 0 | output 1
//        v16 = output 2 | output 3
//        v17 = output 4 | output 5
//        v18 = output 6 | output 7
//      (four lanes each).  The callers apply a further right shift of
//      COL_SHIFT-16.
// Clobbers: x4, x5, v5, v6, v7, v16-v23.  The condition flags are not
// modified: the zero tests use cbz.
//
// Inputs 4..7 are each tested as a 64-bit scalar and their contribution is
// skipped when all four lanes are zero.  The scalar for the next test is
// fetched before the current branch.
.macro declare_idct_col4_neon i, l
function idct_col4_neon\i
        dup             v23.4h, z4c
.if \i == 1
        add             v23.4h, v23.4h, v24.4h
.else
        mov             v5.d[0], v24.d[1]
        add             v23.4h, v23.4h, v5.4h
.endif
        smull           v23.4s, v23.4h, z4      // (input 0 + Z4c) * Z4

        idct_col4_top   v24, v25, v26, v27, \i, \l

        mov             x4, v28.d[\i - 1]
        mov             x5, v29.d[\i - 1]
        cbz             x4, 1f                  // input 4 all zero

        smull\i         v7.4s,  v28.\l, z4
        add             v19.4s, v19.4s, v7.4s
        sub             v20.4s, v20.4s, v7.4s
        sub             v21.4s, v21.4s, v7.4s
        add             v22.4s, v22.4s, v7.4s

1:      mov             x4, v30.d[\i - 1]
        cbz             x5, 2f                  // input 5 all zero

        smlal\i         v17.4s, v29.\l, z5
        smlsl\i         v18.4s, v29.\l, z1
        smlal\i         v5.4s,  v29.\l, z7
        smlal\i         v6.4s,  v29.\l, z3

2:      mov             x5, v31.d[\i - 1]
        cbz             x4, 3f                  // input 6 all zero

        smull\i         v7.4s,  v30.\l, z6
        smull\i         v16.4s, v30.\l, z2
        add             v19.4s, v19.4s, v7.4s
        sub             v22.4s, v22.4s, v7.4s
        sub             v20.4s, v20.4s, v16.4s
        add             v21.4s, v21.4s, v16.4s

3:      cbz             x5, 4f                  // input 7 all zero

        smlal\i         v17.4s, v31.\l, z7
        smlsl\i         v18.4s, v31.\l, z5
        smlal\i         v5.4s,  v31.\l, z3
        smlsl\i         v6.4s,  v31.\l, z1

        // Butterfly, keeping only the high 16 bits of each 32-bit result.
4:      addhn           v7.4h,  v19.4s, v17.4s
        addhn2          v7.8h,  v20.4s, v18.4s
        subhn           v18.4h, v20.4s, v18.4s
        subhn2          v18.8h, v19.4s, v17.4s

        addhn           v16.4h, v21.4s, v5.4s
        addhn2          v16.8h, v22.4s, v6.4s
        subhn           v17.4h, v22.4s, v6.4s
        subhn2          v17.8h, v21.4s, v5.4s

        ret
endfunc
.endm
221
// Instantiate both column-pass helpers: idct_col4_neon1 works on the low
// 64 bits of v24..v31, idct_col4_neon2 on the high 64 bits.
declare_idct_col4_neon 1, 4h
declare_idct_col4_neon 2, 8h
224
// 8x8 inverse DCT whose result is saturated to unsigned bytes and stored.
//   x0: destination; one 8-byte row is stored per step and x0 is advanced
//       by x1 after every store (eight rows in total)
//   x1: distance in bytes between destination rows
//   x2: block of 64 16-bit coefficients (128 bytes are read, x2 advances)
// NOTE(review): presumably declared in C as
//   void ff_simple_idct_put_neon(uint8_t *dest, ptrdiff_t line_size, int16_t *data)
// -- confirm against the prototype.
// Uses x3-x5, x10, v0-v7 and v16-v31; x30 is restored through x10.
function ff_simple_idct_put_neon, export=1
        idct_start      x2

        idct_row4_neon  v24, v25, v26, v27, 1   // first 64 bytes of the block
        idct_row4_neon  v28, v29, v30, v31, 2   // second 64 bytes of the block
        bl              idct_col4_neon1         // column pass on the low halves

        // Final shift with unsigned saturation: v1 = rows 0-3, v3 = rows 4-7,
        // four bytes per row.
        sqshrun         v1.8b,  v7.8h,  #COL_SHIFT-16
        sqshrun2        v1.16b, v16.8h, #COL_SHIFT-16
        sqshrun         v3.8b,  v17.8h, #COL_SHIFT-16
        sqshrun2        v3.16b, v18.8h, #COL_SHIFT-16

        bl              idct_col4_neon2         // column pass on the high halves

        // Same layout for the remaining four bytes of every row.
        sqshrun         v2.8b,  v7.8h,  #COL_SHIFT-16
        sqshrun2        v2.16b, v16.8h, #COL_SHIFT-16
        sqshrun         v4.8b,  v17.8h, #COL_SHIFT-16
        sqshrun2        v4.16b, v18.8h, #COL_SHIFT-16

        // Interleave 4-byte groups so that each 64-bit lane is one full row.
        zip1            v16.4s, v1.4s, v2.4s    // rows 0, 1
        zip2            v17.4s, v1.4s, v2.4s    // rows 2, 3

        st1             {v16.d}[0], [x0], x1
        st1             {v16.d}[1], [x0], x1

        zip1            v18.4s, v3.4s, v4.4s    // rows 4, 5
        zip2            v19.4s, v3.4s, v4.4s    // rows 6, 7

        st1             {v17.d}[0], [x0], x1
        st1             {v17.d}[1], [x0], x1
        st1             {v18.d}[0], [x0], x1
        st1             {v18.d}[1], [x0], x1
        st1             {v19.d}[0], [x0], x1
        st1             {v19.d}[1], [x0], x1

        idct_end
endfunc
262
// 8x8 inverse DCT whose result is added to the existing destination bytes,
// saturated to unsigned bytes and written back.
//   x0: destination; used as the read pointer and advanced by x1 per row
//   x1: distance in bytes between destination rows
//   x2: block of 64 16-bit coefficients (128 bytes are read, x2 advances)
// NOTE(review): presumably declared in C as
//   void ff_simple_idct_add_neon(uint8_t *dest, ptrdiff_t line_size, int16_t *data)
// -- confirm against the prototype.
// Uses x3-x5, x9 (write pointer, starts as a copy of x0), x10, v0-v7 and
// v16-v31; x30 is restored through x10.
function ff_simple_idct_add_neon, export=1
        idct_start      x2

        idct_row4_neon  v24, v25, v26, v27, 1   // first 64 bytes of the block
        idct_row4_neon  v28, v29, v30, v31, 2   // second 64 bytes of the block
        bl              idct_col4_neon1         // column pass on the low halves

        // Park the first-half results in v1-v4; the second call reuses
        // v7 and v16-v18.
        sshr            v1.8h, v7.8h,  #COL_SHIFT-16
        sshr            v2.8h, v16.8h, #COL_SHIFT-16
        sshr            v3.8h, v17.8h, #COL_SHIFT-16
        sshr            v4.8h, v18.8h, #COL_SHIFT-16

        bl              idct_col4_neon2         // column pass on the high halves

        sshr            v7.8h,  v7.8h,  #COL_SHIFT-16
        sshr            v16.8h, v16.8h, #COL_SHIFT-16
        sshr            v17.8h, v17.8h, #COL_SHIFT-16
        sshr            v18.8h, v18.8h, #COL_SHIFT-16

        // Loads of the destination rows (through x0) are interleaved with
        // the arithmetic and with the stores (through x9).
        // zip1/zip2 join both halves into full rows v23..v30 = rows 0..7;
        // uaddw/uaddw2 add the widened destination bytes; sqxtun/sqxtun2
        // narrow back to bytes with unsigned saturation.
        mov             x9,  x0
        ld1             {v19.d}[0], [x0], x1
        zip1            v23.2d, v1.2d, v7.2d
        zip2            v24.2d, v1.2d, v7.2d
        ld1             {v19.d}[1], [x0], x1
        zip1            v25.2d, v2.2d, v16.2d
        zip2            v26.2d, v2.2d, v16.2d
        ld1             {v20.d}[0], [x0], x1
        zip1            v27.2d, v3.2d, v17.2d
        zip2            v28.2d, v3.2d, v17.2d
        ld1             {v20.d}[1], [x0], x1
        zip1            v29.2d, v4.2d, v18.2d
        zip2            v30.2d, v4.2d, v18.2d
        ld1             {v21.d}[0], [x0], x1
        uaddw           v23.8h, v23.8h, v19.8b
        uaddw2          v24.8h, v24.8h, v19.16b
        ld1             {v21.d}[1], [x0], x1
        sqxtun          v23.8b,  v23.8h
        sqxtun2         v23.16b, v24.8h
        ld1             {v22.d}[0], [x0], x1
        uaddw           v24.8h, v25.8h, v20.8b
        uaddw2          v25.8h, v26.8h, v20.16b
        ld1             {v22.d}[1], [x0], x1
        sqxtun          v24.8b,  v24.8h
        sqxtun2         v24.16b, v25.8h
        st1             {v23.d}[0], [x9], x1
        uaddw           v25.8h, v27.8h, v21.8b
        uaddw2          v26.8h, v28.8h, v21.16b
        st1             {v23.d}[1], [x9], x1
        sqxtun          v25.8b,  v25.8h
        sqxtun2         v25.16b, v26.8h
        st1             {v24.d}[0], [x9], x1
        uaddw           v26.8h, v29.8h, v22.8b
        uaddw2          v27.8h, v30.8h, v22.16b
        st1             {v24.d}[1], [x9], x1
        sqxtun          v26.8b,  v26.8h
        sqxtun2         v26.16b, v27.8h
        st1             {v25.d}[0], [x9], x1
        st1             {v25.d}[1], [x9], x1
        st1             {v26.d}[0], [x9], x1
        st1             {v26.d}[1], [x9], x1

        idct_end
endfunc
326
// In-place 8x8 inverse DCT on a block of 16-bit coefficients.
//   x0: block of 64 16-bit values (128 bytes); read, then overwritten with
//       the result
// NOTE(review): presumably declared in C as
//   void ff_simple_idct_neon(int16_t *data)
// -- confirm against the prototype.
// Uses x2 (working pointer), x3-x5, x10, v0-v7 and v16-v31; x30 is restored
// through x10.
function ff_simple_idct_neon, export=1
        idct_start      x0

        mov             x2,  x0
        idct_row4_neon  v24, v25, v26, v27, 1   // first 64 bytes, x2 advances by 64
        idct_row4_neon  v28, v29, v30, v31, 2   // second 64 bytes, x2 advances by 64
        sub             x2,  x2, #128           // rewind to the start of the block for the stores
        bl              idct_col4_neon1         // column pass on the low halves

        // Park the first-half results in v1-v4; the second call reuses
        // v7 and v16-v18.
        sshr            v1.8h, v7.8h,  #COL_SHIFT-16
        sshr            v2.8h, v16.8h, #COL_SHIFT-16
        sshr            v3.8h, v17.8h, #COL_SHIFT-16
        sshr            v4.8h, v18.8h, #COL_SHIFT-16

        bl              idct_col4_neon2         // column pass on the high halves

        sshr            v7.8h,  v7.8h,  #COL_SHIFT-16
        sshr            v16.8h, v16.8h, #COL_SHIFT-16
        sshr            v17.8h, v17.8h, #COL_SHIFT-16
        sshr            v18.8h, v18.8h, #COL_SHIFT-16

        // Join both halves into full rows and store two rows (32 bytes)
        // at a time.
        zip1            v23.2d, v1.2d, v7.2d
        zip2            v24.2d, v1.2d, v7.2d
        st1             {v23.2d, v24.2d}, [x2], #32
        zip1            v25.2d, v2.2d, v16.2d
        zip2            v26.2d, v2.2d, v16.2d
        st1             {v25.2d, v26.2d}, [x2], #32
        zip1            v27.2d, v3.2d, v17.2d
        zip2            v28.2d, v3.2d, v17.2d
        st1             {v27.2d, v28.2d}, [x2], #32
        zip1            v29.2d, v4.2d, v18.2d
        zip2            v30.2d, v4.2d, v18.2d
        st1             {v29.2d, v30.2d}, [x2], #32

        idct_end
endfunc