Merge commit 'bb515e3a735f526ccb1068031e289eb5aeb69e22'
[ffmpeg.git] / libavcodec / aarch64 / h264dsp_neon.S
1 /*
2  * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
3  * Copyright (c) 2013 Janne Grunau <janne-libav@jannau.net>
4  *
5  * This file is part of FFmpeg.
6  *
7  * FFmpeg is free software; you can redistribute it and/or
8  * modify it under the terms of the GNU Lesser General Public
9  * License as published by the Free Software Foundation; either
10  * version 2.1 of the License, or (at your option) any later version.
11  *
12  * FFmpeg is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15  * Lesser General Public License for more details.
16  *
17  * You should have received a copy of the GNU Lesser General Public
18  * License along with FFmpeg; if not, write to the Free Software
19  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20  */
21
22 #include "libavutil/aarch64/asm.S"
23 #include "neon.S"
24
// Shared prologue of the loop-filter entry points in this file.
//   In:  w2 = alpha, w3 = beta (names as used by the filter macros)
//        x4 = address of four signed bytes; they are copied to v24.s[0]
//             for the filter macros (presumably the tc0 clip values -
//             confirm against the caller)
//   Executes "ret", i.e. leaves the enclosing function, when
//        - alpha != 0 and beta == 0, or
//        - every one of the four bytes has its sign bit set.
//   NOTE(review): for alpha == 0 the ccmp forces NZCV to 0, so the b.eq
//   early-out is not taken on that path.
//   Clobbers: w6, v24.s[0], condition flags.
.macro  h264_loop_filter_start
        cmp             w2,  #0
        ldr             w6,  [x4]
        ccmp            w3,  #0, #0, ne                 // alpha != 0: compare beta with 0
        mov             v24.s[0], w6                    // keep the raw bytes for the filter
        and             w6,  w6,  w6,  lsl #16          // flags untouched (plain and)
        b.eq            1f                              // consumes the ccmp result
        ands            w6,  w6,  w6,  lsl #8           // bit 31 = AND of all four sign bits
        b.ge            2f                              // N clear: some byte is non-negative
1:
        ret
2:
.endm
38
// Luma edge filter for 16 pixel positions, one byte lane per position.
//   In:  v20 = p2, v18 = p1, v16 = p0, v0 = q0, v2 = q1, v4 = q2
//        w2 = alpha, w3 = beta
//        v24.s[0] = four signed clip bytes; each one is replicated
//                   over four consecutive lanes
//   Out: v17 = filtered p1, v16 = filtered p0,
//        v0  = filtered q0, v19 = filtered q1
//   Clobbers: v4, v20-v24, v28, v30
// Arithmetic and mask-building instructions are interleaved; the order
// is kept exactly as scheduled.
.macro  h264_loop_filter_luma
        dup             v22.16b, w2                     // alpha
        uxtl            v24.8h,  v24.8b                 // widen clip bytes: 8 -> 16 bit
        uabd            v21.16b, v16.16b, v0.16b        // abs(p0 - q0)
        uxtl            v24.4s,  v24.4h                 // 16 -> 32 bit, one byte per word
        uabd            v28.16b, v18.16b, v16.16b       // abs(p1 - p0)
        sli             v24.8h,  v24.8h,  #8            // copy byte into the upper byte
        uabd            v30.16b, v2.16b,  v0.16b        // abs(q1 - q0)
        sli             v24.4s,  v24.4s,  #16           // copy halfword into upper half
        cmhi            v21.16b, v22.16b, v21.16b       // < alpha
        dup             v22.16b, w3                     // beta
        cmlt            v23.16b, v24.16b, #0            // lanes with negative clip value
        cmhi            v28.16b, v22.16b, v28.16b       // < beta
        cmhi            v30.16b, v22.16b, v30.16b       // < beta
        bic             v21.16b, v21.16b, v23.16b       // drop negative-clip lanes
        uabd            v17.16b, v20.16b, v16.16b       // abs(p2 - p0)
        and             v21.16b, v21.16b, v28.16b
        uabd            v19.16b, v4.16b,  v0.16b        // abs(q2 - q0)
        cmhi            v17.16b, v22.16b, v17.16b       // < beta
        and             v21.16b, v21.16b, v30.16b       // v21 = filter mask
        cmhi            v19.16b, v22.16b, v19.16b       // < beta
        and             v17.16b, v17.16b, v21.16b       // mask for updating p1
        and             v19.16b, v19.16b, v21.16b       // mask for updating q1
        and             v24.16b, v24.16b, v21.16b       // clip value, 0 where unfiltered
        urhadd          v28.16b, v16.16b, v0.16b        // (p0 + q0 + 1) >> 1
        sub             v21.16b, v24.16b, v17.16b       // clip + 1 where p1 mask is set (mask is -1)
        uqadd           v23.16b, v18.16b, v24.16b       // p1 + clip, saturated
        uhadd           v20.16b, v20.16b, v28.16b       // (p2 + avg) >> 1
        sub             v21.16b, v21.16b, v19.16b       // ... + 1 where q1 mask is set
        uhadd           v28.16b, v4.16b,  v28.16b       // (q2 + avg) >> 1
        umin            v23.16b, v23.16b, v20.16b
        uqsub           v22.16b, v18.16b, v24.16b       // p1 - clip, saturated
        uqadd           v4.16b,  v2.16b,  v24.16b       // q1 + clip, saturated
        umax            v23.16b, v23.16b, v22.16b       // candidate p1, within p1 +/- clip
        uqsub           v22.16b, v2.16b,  v24.16b       // q1 - clip, saturated
        umin            v28.16b, v4.16b,  v28.16b
        uxtl            v4.8h,   v0.8b                  // start of delta: q0 (low half)
        umax            v28.16b, v28.16b, v22.16b       // candidate q1, within q1 +/- clip
        uxtl2           v20.8h,  v0.16b                 // q0 (high half)
        usubw           v4.8h,   v4.8h,   v16.8b        // q0 - p0
        usubw2          v20.8h,  v20.8h,  v16.16b
        shl             v4.8h,   v4.8h,   #2            // (q0 - p0) * 4
        shl             v20.8h,  v20.8h,  #2
        uaddw           v4.8h,   v4.8h,   v18.8b        // + p1
        uaddw2          v20.8h,  v20.8h,  v18.16b
        usubw           v4.8h,   v4.8h,   v2.8b         // - q1
        usubw2          v20.8h,  v20.8h,  v2.16b
        rshrn           v4.8b,   v4.8h,   #3            // rounding >> 3, back to bytes
        rshrn2          v4.16b,  v20.8h,  #3
        bsl             v17.16b, v23.16b, v18.16b       // p1 mask ? candidate : p1
        bsl             v19.16b, v28.16b, v2.16b        // q1 mask ? candidate : q1
        neg             v23.16b, v21.16b                // lower clamp bound
        uxtl            v28.8h,  v16.8b
        smin            v4.16b,  v4.16b,  v21.16b       // clamp delta from above
        uxtl2           v21.8h,  v16.16b                // v21 reused: p0 high half, widened
        smax            v4.16b,  v4.16b,  v23.16b       // clamp delta from below
        uxtl            v22.8h,  v0.8b
        uxtl2           v24.8h,  v0.16b
        saddw           v28.8h,  v28.8h,  v4.8b         // p0 + delta
        saddw2          v21.8h,  v21.8h,  v4.16b
        ssubw           v22.8h,  v22.8h,  v4.8b         // q0 - delta
        ssubw2          v24.8h,  v24.8h,  v4.16b
        sqxtun          v16.8b,  v28.8h                 // saturate to unsigned bytes
        sqxtun2         v16.16b, v21.8h
        sqxtun          v0.8b,   v22.8h
        sqxtun2         v0.16b,  v24.8h
.endm
106
// Luma loop filter across an edge between rows; 16 bytes per row.
//   x0 = address of the q0 row, w1 = row stride (sign-extended below)
//   w2, w3, x4 are consumed by h264_loop_filter_start and the filter macro.
// Three rows at and after x0 and three rows before it are loaded; the
// two rows on either side of the edge are written back.
function ff_h264_v_loop_filter_luma_neon, export=1
        h264_loop_filter_start                          // may return early
        sxtw            x1,  w1

        ld1             {v0.16b},  [x0], x1             // q0
        ld1             {v2.16b},  [x0], x1             // q1
        ld1             {v4.16b},  [x0], x1             // q2
        sub             x0,  x0,  x1, lsl #2            // 4 + 2 rows back:
        sub             x0,  x0,  x1, lsl #1            //   three rows before the q0 row
        ld1             {v20.16b}, [x0], x1             // p2
        ld1             {v18.16b}, [x0], x1             // p1
        ld1             {v16.16b}, [x0], x1             // p0

        h264_loop_filter_luma

        sub             x0,  x0,  x1, lsl #1            // back to the p1 row
        st1             {v17.16b}, [x0], x1             // p1
        st1             {v16.16b}, [x0], x1             // p0
        st1             {v0.16b},  [x0], x1             // q0
        st1             {v19.16b}, [x0]                 // q1

        ret
endfunc
130
// Luma loop filter across an edge between columns; 16 rows are processed.
//   x0 = address of the first byte right of the edge in the first row
//   w1 = row stride (sign-extended below)
//   w2, w3, x4 are consumed by h264_loop_filter_start and the filter macro.
// Eight bytes per row (four on each side of the edge) are loaded, rows
// 0-7 into the low halves and rows 8-15 into the high halves, then
// rearranged by transpose_8x16B (defined outside this file) before the
// filter macro runs. The four result registers are passed through
// transpose_4x16B and written back as four bytes per row.
function ff_h264_h_loop_filter_luma_neon, export=1
        h264_loop_filter_start                          // may return early
        sxtw            x1,  w1

        sub             x0,  x0,  #4                    // four bytes left of the edge
        ld1             {v6.8b},    [x0], x1
        ld1             {v20.8b},   [x0], x1
        ld1             {v18.8b},   [x0], x1
        ld1             {v16.8b},   [x0], x1
        ld1             {v0.8b},    [x0], x1
        ld1             {v2.8b},    [x0], x1
        ld1             {v4.8b},    [x0], x1
        ld1             {v26.8b},   [x0], x1
        ld1             {v6.d}[1],  [x0], x1
        ld1             {v20.d}[1], [x0], x1
        ld1             {v18.d}[1], [x0], x1
        ld1             {v16.d}[1], [x0], x1
        ld1             {v0.d}[1],  [x0], x1
        ld1             {v2.d}[1],  [x0], x1
        ld1             {v4.d}[1],  [x0], x1
        ld1             {v26.d}[1], [x0], x1

        transpose_8x16B v6, v20, v18, v16, v0, v2, v4, v26, v21, v23

        h264_loop_filter_luma

        transpose_4x16B v17, v16, v0, v19, v21, v23, v25, v27

        sub             x0,  x0,  x1, lsl #4            // rewind the 16 rows just read
        add             x0,  x0,  #2                    // two bytes left of the edge
        st1             {v17.s}[0], [x0], x1
        st1             {v16.s}[0], [x0], x1
        st1             {v0.s}[0],  [x0], x1
        st1             {v19.s}[0], [x0], x1
        st1             {v17.s}[1], [x0], x1
        st1             {v16.s}[1], [x0], x1
        st1             {v0.s}[1],  [x0], x1
        st1             {v19.s}[1], [x0], x1
        st1             {v17.s}[2], [x0], x1
        st1             {v16.s}[2], [x0], x1
        st1             {v0.s}[2],  [x0], x1
        st1             {v19.s}[2], [x0], x1
        st1             {v17.s}[3], [x0], x1
        st1             {v16.s}[3], [x0], x1
        st1             {v0.s}[3],  [x0], x1
        st1             {v19.s}[3], [x0], x1

        ret
endfunc
180
// Chroma edge filter for 8 pixel positions, one byte lane per position.
//   In:  v18 = p1, v16 = p0, v0 = q0, v2 = q1
//        w2 = alpha, w3 = beta
//        v24.s[0] = four signed clip bytes; each one is replicated
//                   over two consecutive lanes
//   Out: v16 = filtered p0, v0 = filtered q0 (v18 and v2 are not written)
//   Clobbers: v4, v22, v24, v25, v26, v28, v30
//   NOTE(review): unlike the luma macro there is no sign test on the
//   clip bytes here; presumably callers only pass non-negative values -
//   confirm.
.macro  h264_loop_filter_chroma
        dup             v22.8b, w2                      // alpha
        uxtl            v24.8h, v24.8b                  // widen clip bytes: 8 -> 16 bit
        uabd            v26.8b, v16.8b, v0.8b           // abs(p0 - q0)
        uxtl            v4.8h,  v0.8b                   // start of delta: q0
        uabd            v28.8b, v18.8b, v16.8b          // abs(p1 - p0)
        usubw           v4.8h,  v4.8h,  v16.8b          // q0 - p0
        sli             v24.8h, v24.8h, #8              // copy byte into the upper byte
        shl             v4.8h,  v4.8h,  #2              // (q0 - p0) * 4
        uabd            v30.8b, v2.8b,  v0.8b           // abs(q1 - q0)
        uaddw           v4.8h,  v4.8h,  v18.8b          // + p1
        cmhi            v26.8b, v22.8b, v26.8b          // < alpha
        usubw           v4.8h,  v4.8h,  v2.8b           // - q1
        dup             v22.8b, w3                      // beta
        rshrn           v4.8b,  v4.8h,  #3              // rounding >> 3, back to bytes
        cmhi            v28.8b, v22.8b, v28.8b          // < beta
        cmhi            v30.8b, v22.8b, v30.8b          // < beta
        smin            v4.8b,  v4.8b,  v24.8b          // clamp delta from above
        neg             v25.8b, v24.8b
        and             v26.8b, v26.8b, v28.8b
        smax            v4.8b,  v4.8b,  v25.8b          // clamp delta from below
        and             v26.8b, v26.8b, v30.8b          // v26 = filter mask
        uxtl            v22.8h, v0.8b
        and             v4.8b,  v4.8b,  v26.8b          // delta = 0 where unfiltered
        uxtl            v28.8h, v16.8b
        saddw           v28.8h, v28.8h, v4.8b           // p0 + delta
        ssubw           v22.8h, v22.8h, v4.8b           // q0 - delta
        sqxtun          v16.8b, v28.8h                  // saturate to unsigned bytes
        sqxtun          v0.8b,  v22.8h
.endm
211
// Chroma loop filter across an edge between rows; 8 bytes per row.
//   x0 = address of the q0 row, w1 = row stride (sign-extended below)
//   w2, w3, x4 are consumed by h264_loop_filter_start and the filter macro.
// Two rows on each side of the edge are read; only the two rows
// adjacent to the edge are written back.
function ff_h264_v_loop_filter_chroma_neon, export=1
        h264_loop_filter_start                          // may return early
        sxtw            x1,  w1

        sub             x0,  x0,  x1, lsl #1            // two rows before the q0 row
        ld1             {v18.8b}, [x0], x1              // p1
        ld1             {v16.8b}, [x0], x1              // p0
        ld1             {v0.8b},  [x0], x1              // q0
        ld1             {v2.8b},  [x0]                  // q1, pointer not advanced

        h264_loop_filter_chroma

        sub             x0,  x0,  x1, lsl #1            // back to the p0 row
        st1             {v16.8b}, [x0], x1              // p0
        st1             {v0.8b},  [x0], x1              // q0

        ret
endfunc
230
// Chroma loop filter across an edge between columns; 8 rows are processed.
//   x0 = address of the first byte right of the edge in the first row
//   w1 = row stride (sign-extended below)
//   w2, w3, x4 are consumed by h264_loop_filter_start and the filter macro.
// Four bytes per row (two on each side of the edge) are loaded, passed
// through transpose_4x8B (defined outside this file), filtered, passed
// through transpose_4x8B again and stored to the same addresses.
function ff_h264_h_loop_filter_chroma_neon, export=1
        h264_loop_filter_start                          // may return early
        sxtw            x1,  w1

        sub             x0,  x0,  #2                    // two bytes left of the edge
        ld1             {v18.s}[0], [x0], x1
        ld1             {v16.s}[0], [x0], x1
        ld1             {v0.s}[0],  [x0], x1
        ld1             {v2.s}[0],  [x0], x1
        ld1             {v18.s}[1], [x0], x1
        ld1             {v16.s}[1], [x0], x1
        ld1             {v0.s}[1],  [x0], x1
        ld1             {v2.s}[1],  [x0], x1

        transpose_4x8B  v18, v16, v0, v2, v28, v29, v30, v31

        h264_loop_filter_chroma

        transpose_4x8B  v18, v16, v0, v2, v28, v29, v30, v31

        sub             x0,  x0,  x1, lsl #3            // rewind the 8 rows just read
        st1             {v18.s}[0], [x0], x1
        st1             {v16.s}[0], [x0], x1
        st1             {v0.s}[0],  [x0], x1
        st1             {v2.s}[0],  [x0], x1
        st1             {v18.s}[1], [x0], x1
        st1             {v16.s}[1], [x0], x1
        st1             {v0.s}[1],  [x0], x1
        st1             {v2.s}[1],  [x0], x1

        ret
endfunc
263
// Bi-weighted blend of two 16-byte-wide blocks, two rows per iteration.
//   macs, macd: widening multiply-accumulate mnemonics (umlal or umlsl)
//               used with the w6 weight and the w5 weight respectively
//   In:  x0 = first source (rows multiplied by the low byte of w5)
//        x1 = second source (rows multiplied by the low byte of w6)
//        x7 = store pointer, x2 = row stride for all three pointers
//        w3 = row count; the loop ends when it reaches exactly zero
//        v16 = initial value of each 16-bit accumulator lane
//        v18 = per-lane shift for sshl (negative count = right shift)
//   Ends with "ret". Clobbers v0, v1, v4, v6, v20-v30 (even), w3, flags.
.macro  biweight_16     macs, macd
        dup             v0.16b,  w5
        dup             v1.16b,  w6
        mov             v4.16b,  v16.16b
        mov             v6.16b,  v16.16b
1:      subs            w3,  w3,  #2                    // flags survive until b.ne
        ld1             {v20.16b}, [x0], x2
        \macd           v4.8h,   v0.8b,  v20.8b
        \macd\()2       v6.8h,   v0.16b, v20.16b
        ld1             {v22.16b}, [x1], x2
        \macs           v4.8h,   v1.8b,  v22.8b
        \macs\()2       v6.8h,   v1.16b, v22.16b
        mov             v24.16b, v16.16b                // accumulators for the second row
        ld1             {v28.16b}, [x0], x2
        mov             v26.16b, v16.16b
        \macd           v24.8h,  v0.8b,  v28.8b
        \macd\()2       v26.8h,  v0.16b, v28.16b
        ld1             {v30.16b}, [x1], x2
        \macs           v24.8h,  v1.8b,  v30.8b
        \macs\()2       v26.8h,  v1.16b, v30.16b
        sshl            v4.8h,   v4.8h,  v18.8h
        sshl            v6.8h,   v6.8h,  v18.8h
        sqxtun          v4.8b,   v4.8h                  // saturate to unsigned bytes
        sqxtun2         v4.16b,  v6.8h
        sshl            v24.8h,  v24.8h, v18.8h
        sshl            v26.8h,  v26.8h, v18.8h
        sqxtun          v24.8b,  v24.8h
        sqxtun2         v24.16b, v26.8h
        mov             v6.16b,  v16.16b                // re-arm accumulators for the next pass
        st1             {v4.16b},  [x7], x2
        mov             v4.16b,  v16.16b
        st1             {v24.16b}, [x7], x2
        b.ne            1b
        ret
.endm
299
// Bi-weighted blend of two 8-byte-wide blocks, two rows per iteration.
//   macs, macd: widening multiply-accumulate mnemonics (umlal or umlsl)
//               used with the w6 weight and the w5 weight respectively
//   In:  x0 = first source (rows multiplied by the low byte of w5)
//        x1 = second source (rows multiplied by the low byte of w6)
//        x7 = store pointer, x2 = row stride for all three pointers
//        w3 = row count; the loop ends when it reaches exactly zero
//        v16 = initial value of each 16-bit accumulator lane
//        v18 = per-lane shift for sshl (negative count = right shift)
//   Ends with "ret". Clobbers v0-v2, v4-v7, v20, w3, flags.
.macro  biweight_8      macs, macd
        dup             v0.8b,   w5
        dup             v1.8b,   w6
        mov             v2.16b,  v16.16b
        mov             v20.16b, v16.16b
1:      subs            w3,  w3,  #2                    // flags survive until b.ne
        ld1             {v4.8b}, [x0], x2
        \macd           v2.8h,   v0.8b,  v4.8b
        ld1             {v5.8b}, [x1], x2
        \macs           v2.8h,   v1.8b,  v5.8b
        ld1             {v6.8b}, [x0], x2
        \macd           v20.8h,  v0.8b,  v6.8b
        ld1             {v7.8b}, [x1], x2
        \macs           v20.8h,  v1.8b,  v7.8b
        sshl            v2.8h,   v2.8h,  v18.8h
        sqxtun          v2.8b,   v2.8h                  // saturate to unsigned bytes
        sshl            v20.8h,  v20.8h, v18.8h
        sqxtun          v4.8b,   v20.8h
        mov             v20.16b, v16.16b                // re-arm accumulators for the next pass
        st1             {v2.8b}, [x7], x2
        mov             v2.16b,  v16.16b
        st1             {v4.8b}, [x7], x2
        b.ne            1b
        ret
.endm
325
// Bi-weighted blend of two 4-byte-wide blocks. Two rows share one
// 8-byte vector (lanes s[0] and s[1]); a full iteration handles four rows.
//   macs, macd: widening multiply-accumulate mnemonics (umlal or umlsl)
//               used with the w6 weight and the w5 weight respectively
//   In:  x0 = first source (rows multiplied by the low byte of w5)
//        x1 = second source (rows multiplied by the low byte of w6)
//        x7 = store pointer, x2 = row stride for all three pointers
//        w3 = row count
//        v16 = initial value of each 16-bit accumulator lane
//        v18 = per-lane shift for sshl (negative count = right shift)
//   If the count drops below zero after subtracting 4, only the two
//   rows already accumulated are finished (label 2) and written.
//   Both exits end with "ret". Clobbers v0-v2, v4-v7, v20, w3, flags.
.macro  biweight_4      macs, macd
        dup             v0.8b,   w5
        dup             v1.8b,   w6
        mov             v2.16b,  v16.16b
        mov             v20.16b, v16.16b
1:      subs            w3,  w3,  #4                    // flags used by b.lt and b.ne
        ld1             {v4.s}[0], [x0], x2
        ld1             {v4.s}[1], [x0], x2
        \macd           v2.8h,   v0.8b,  v4.8b
        ld1             {v5.s}[0], [x1], x2
        ld1             {v5.s}[1], [x1], x2
        \macs           v2.8h,   v1.8b,  v5.8b
        b.lt            2f                              // fewer than four rows were left
        ld1             {v6.s}[0], [x0], x2
        ld1             {v6.s}[1], [x0], x2
        \macd           v20.8h,  v0.8b,  v6.8b
        ld1             {v7.s}[0], [x1], x2
        ld1             {v7.s}[1], [x1], x2
        \macs           v20.8h,  v1.8b,  v7.8b
        sshl            v2.8h,   v2.8h,  v18.8h
        sqxtun          v2.8b,   v2.8h                  // saturate to unsigned bytes
        sshl            v20.8h,  v20.8h, v18.8h
        sqxtun          v4.8b,   v20.8h
        mov             v20.16b, v16.16b                // re-arm accumulators for the next pass
        st1             {v2.s}[0], [x7], x2
        st1             {v2.s}[1], [x7], x2
        mov             v2.16b,  v16.16b
        st1             {v4.s}[0], [x7], x2
        st1             {v4.s}[1], [x7], x2
        b.ne            1b
        ret
2:      sshl            v2.8h,   v2.8h,  v18.8h         // tail: two rows only
        sqxtun          v2.8b,   v2.8h
        st1             {v2.s}[0], [x7], x2
        st1             {v2.s}[1], [x7], x2
        ret
.endm
363
// Emits ff_biweight_h264_pixels_<w>_neon for block width w.
//   x0 = first source and destination (copied to x7 for the stores)
//   x1 = second source, w2 = row stride (sign-extended below)
//   w3 = row count, w4 = shift amount, w5 / w6 = signed weights applied
//   to the x0 / x1 rows, w7 = offset
// Setup performed here:
//   v16 = (((w7 + 1) | 1) << w4) in every 16-bit lane (accumulator seed)
//   v18 = ~w4 = -(w4 + 1) in every 16-bit lane (right shift by w4 + 1)
// The multiplies are unsigned, so negative weights are negated and the
// matching accumulate is switched to umlsl. w8 selects the variant:
//   w8 = (w5 >> 31) ^ (w6 >> 30), which for negative w6 with bits 31:30
//   both set (any small-magnitude negative value) yields
//     0: both non-negative   1: only w5 negative
//     2: both negative       3: only w6 negative
// Every biweight_<w> expansion ends in "ret", so the labelled variants
// never fall through into each other.
.macro  biweight_func   w
function ff_biweight_h264_pixels_\w\()_neon, export=1
        sxtw            x2,  w2
        lsr             w8,  w5,  #31                   // sign of w5
        add             w7,  w7,  #1
        eor             w8,  w8,  w6,  lsr #30          // mix in the top bits of w6
        orr             w7,  w7,  #1
        dup             v18.8h,   w4
        lsl             w7,  w7,  w4
        not             v18.16b,  v18.16b               // -(w4 + 1)
        dup             v16.8h,   w7
        mov             x7,  x0                         // separate store pointer
        cbz             w8,  10f
        subs            w8,  w8,  #1
        b.eq            20f
        subs            w8,  w8,  #1
        b.eq            30f
        b               40f
10:     biweight_\w     umlal, umlal
20:     neg             w5,  w5
        biweight_\w     umlal, umlsl
30:     neg             w5,  w5
        neg             w6,  w6
        biweight_\w     umlsl, umlsl
40:     neg             w6,  w6
        biweight_\w     umlsl, umlal
endfunc
.endm

        biweight_func   16
        biweight_func   8
        biweight_func   4
396
// Weighted scaling of a 16-byte-wide block, two rows per iteration.
//   add: mnemonic combining the offset with the product as
//        "add dst, v16, product" (add, sub, shadd or shsub)
//   In:  x0 = read pointer, x5 = store pointer, x1 = row stride
//        w2 = row count; the loop ends when it reaches exactly zero
//        w4 = weight (low byte is used as an unsigned multiplier)
//        v16 = offset term in every 16-bit lane
//        v18 = per-lane count for srshl (negative = rounding right shift)
//   Ends with "ret". Clobbers v0, v4, v6, v20, v24, v26, v28, w2, flags.
.macro  weight_16       add
        dup             v0.16b,  w4
1:      subs            w2,  w2,  #2                    // flags survive until b.ne
        ld1             {v20.16b}, [x0], x1
        umull           v4.8h,   v0.8b,  v20.8b
        umull2          v6.8h,   v0.16b, v20.16b
        ld1             {v28.16b}, [x0], x1
        umull           v24.8h,  v0.8b,  v28.8b
        umull2          v26.8h,  v0.16b, v28.16b
        \add            v4.8h,   v16.8h, v4.8h
        srshl           v4.8h,   v4.8h,  v18.8h
        \add            v6.8h,   v16.8h, v6.8h
        srshl           v6.8h,   v6.8h,  v18.8h
        sqxtun          v4.8b,   v4.8h                  // saturate to unsigned bytes
        sqxtun2         v4.16b,  v6.8h
        \add            v24.8h,  v16.8h, v24.8h
        srshl           v24.8h,  v24.8h, v18.8h
        \add            v26.8h,  v16.8h, v26.8h
        srshl           v26.8h,  v26.8h, v18.8h
        sqxtun          v24.8b,  v24.8h
        sqxtun2         v24.16b, v26.8h
        st1             {v4.16b},  [x5], x1
        st1             {v24.16b}, [x5], x1
        b.ne            1b
        ret
.endm
423
// Weighted scaling of an 8-byte-wide block, two rows per iteration.
//   add: mnemonic combining the offset with the product as
//        "add dst, v16, product" (add, sub, shadd or shsub)
//   In:  x0 = read pointer, x5 = store pointer, x1 = row stride
//        w2 = row count; the loop ends when it reaches exactly zero
//        w4 = weight (low byte is used as an unsigned multiplier)
//        v16 = offset term in every 16-bit lane
//        v18 = per-lane count for srshl (negative = rounding right shift)
//   Ends with "ret". Clobbers v0, v2, v4, v6, v20, w2, flags.
.macro  weight_8        add
        dup             v0.8b,   w4
1:      subs            w2,  w2,  #2                    // flags survive until b.ne
        ld1             {v4.8b}, [x0], x1
        umull           v2.8h,   v0.8b,  v4.8b
        ld1             {v6.8b}, [x0], x1
        umull           v20.8h,  v0.8b,  v6.8b
        \add            v2.8h,   v16.8h, v2.8h
        srshl           v2.8h,   v2.8h,  v18.8h
        sqxtun          v2.8b,   v2.8h                  // saturate to unsigned bytes
        \add            v20.8h,  v16.8h, v20.8h
        srshl           v20.8h,  v20.8h, v18.8h
        sqxtun          v4.8b,   v20.8h
        st1             {v2.8b}, [x5], x1
        st1             {v4.8b}, [x5], x1
        b.ne            1b
        ret
.endm
442
// Weighted scaling of a 4-byte-wide block. Two rows share one 8-byte
// vector (lanes s[0] and s[1]); a full iteration handles four rows.
//   add: mnemonic combining the offset with the product as
//        "add dst, v16, product" (add, sub, shadd or shsub)
//   In:  x0 = read pointer, x5 = store pointer, x1 = row stride
//        w2 = row count
//        w4 = weight (low byte is used as an unsigned multiplier)
//        v16 = offset term in every 16-bit lane
//        v18 = per-lane count for srshl (negative = rounding right shift)
//   If the count drops below zero after subtracting 4, only the two
//   rows already multiplied are finished (label 2) and written.
//   Both exits end with "ret". Clobbers v0, v2, v4, v6, v20, w2, flags.
.macro  weight_4        add
        dup             v0.8b,   w4
1:      subs            w2,  w2,  #4                    // flags used by b.lt and b.ne
        ld1             {v4.s}[0], [x0], x1
        ld1             {v4.s}[1], [x0], x1
        umull           v2.8h,   v0.8b,  v4.8b
        b.lt            2f                              // fewer than four rows were left
        ld1             {v6.s}[0], [x0], x1
        ld1             {v6.s}[1], [x0], x1
        umull           v20.8h,  v0.8b,  v6.8b
        \add            v2.8h,   v16.8h, v2.8h
        srshl           v2.8h,   v2.8h,  v18.8h
        sqxtun          v2.8b,   v2.8h                  // saturate to unsigned bytes
        \add            v20.8h,  v16.8h, v20.8h
        srshl           v20.8h,  v20.8h, v18.8h
        sqxtun          v4.8b,   v20.8h
        st1             {v2.s}[0], [x5], x1
        st1             {v2.s}[1], [x5], x1
        st1             {v4.s}[0], [x5], x1
        st1             {v4.s}[1], [x5], x1
        b.ne            1b
        ret
2:      \add            v2.8h,   v16.8h, v2.8h          // tail: two rows only
        srshl           v2.8h,   v2.8h,  v18.8h
        sqxtun          v2.8b,   v2.8h
        st1             {v2.s}[0], [x5], x1
        st1             {v2.s}[1], [x5], x1
        ret
.endm
472
// Emits ff_weight_h264_pixels_<w>_neon for block width w.
//   x0 = block, filtered in place (copied to x5 for the stores)
//   w1 = row stride (sign-extended below), w2 = row count
//   w3 = shift amount, w4 = signed weight, w5 = offset
// Setup performed here:
//   v16 = (w5 << w3) in every 16-bit lane
//   w3 >  1: v18 = 1 - w3; a halving add/sub (shadd/shsub) supplies the
//            remaining one bit of right shift
//   w3 <= 1: v18 = -w3 with a plain add/sub
// The multiply is unsigned, so a negative weight is negated and the
// product is subtracted from the offset instead of added.
// The flags of "cmp w3, #1" are consumed by the later b.le; none of the
// instructions in between updates them. Every weight_<w> expansion ends
// in "ret", so the variants never fall through into each other. Each
// "b.lt 10f" binds to the nearest following label 10.
.macro  weight_func     w
function ff_weight_h264_pixels_\w\()_neon, export=1
        sxtw            x1,  w1
        cmp             w3,  #1
        mov             w6,  #1
        lsl             w5,  w5,  w3
        dup             v16.8h,  w5
        mov             x5,  x0                         // separate store pointer
        b.le            20f
        sub             w6,  w6,  w3                    // 1 - w3
        dup             v18.8h,  w6
        cmp             w4,  #0
        b.lt            10f
        weight_\w       shadd
10:     neg             w4,  w4
        weight_\w       shsub
20:     neg             w6,  w3                         // -w3
        dup             v18.8h,  w6
        cmp             w4,  #0
        b.lt            10f
        weight_\w       add
10:     neg             w4,  w4
        weight_\w       sub
endfunc
.endm

        weight_func     16
        weight_func     8
        weight_func     4