// [ffmpeg.git] libavcodec/aarch64/h264cmc_neon.S
// (from merge commit b8d5070db6313f985562865edcfd08a01c2d7503)
1 /*
2  * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
3  * Copyright (c) 2013 Janne Grunau <janne-libav@jannau.net>
4  *
5  * This file is part of FFmpeg.
6  *
7  * FFmpeg is free software; you can redistribute it and/or
8  * modify it under the terms of the GNU Lesser General Public
9  * License as published by the Free Software Foundation; either
10  * version 2.1 of the License, or (at your option) any later version.
11  *
12  * FFmpeg is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15  * Lesser General Public License for more details.
16  *
17  * You should have received a copy of the GNU Lesser General Public
18  * License along with FFmpeg; if not, write to the Free Software
19  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20  */
21
22 #include "libavutil/aarch64/asm.S"
23
/* chroma_mc8(uint8_t *dst, uint8_t *src, ptrdiff_t stride, int h, int x, int y) */
// 8-pixel-wide chroma MC. \type is put/avg; \codec selects the rounding:
//   h264: round-to-nearest shift (rshrn #6)
//   rv40: add per-position bias from rv40bias, then truncating shift
//   vc1:  add fixed bias 28, then truncating shift
.macro  h264_chroma_mc8 type, codec=h264
function ff_\type\()_\codec\()_chroma_mc8_neon, export=1
  .ifc \type,avg
        mov             x8,  x0                 // keep dst; avg re-reads it via x8
  .endif
        prfm            pldl1strm, [x1]
        prfm            pldl1strm, [x1, x2]
  .ifc \codec,rv40
        // bias = rv40bias[(y>>1)*4 + (x>>1)], broadcast to all 8 lanes of v22
        movrel          x6,  rv40bias
        lsr             w9,  w5,  #1
        lsr             w10, w4,  #1
        lsl             w9,  w9,  #3            // (y>>1) * 8 bytes (4 .short per row)
        lsl             w10, w10, #1            // (x>>1) * 2 bytes
        add             w9,  w9,  w10
        add             x6,  x6,  w9, UXTW
        ld1r            {v22.8H}, [x6]
  .endif
  .ifc \codec,vc1
        movi            v22.8H,   #28           // VC-1 fixed rounding bias
  .endif
        // Bilinear weights from the fractional position (x=w4, y=w5):
        //   A=(8-x)*(8-y)  B=x*(8-y)  C=(8-x)*y  D=x*y   (A+B+C+D = 64)
        mul             w7,  w4,  w5            // w7  = D = x*y
        lsl             w14, w5,  #3            // w14 = 8*y
        lsl             w13, w4,  #3            // w13 = 8*x
        cmp             w7,  #0
        sub             w6,  w14, w7            // w6  = C = (8-x)*y
        sub             w12, w13, w7            // w12 = B = x*(8-y)
        sub             w4,  w7,  w13
        sub             w4,  w4,  w14
        add             w4,  w4,  #64           // w4  = A = (8-x)*(8-y)
        b.eq            2f                      // D==0 -> x==0 or y==0: cheaper paths

        // Full 2-D bilinear loop: two output rows per iteration.
        // v4/v5 = current row and the same row shifted left by 1 pixel.
        dup             v0.8B,  w4
        dup             v1.8B,  w12
        ld1             {v4.8B, v5.8B}, [x1], x2
        dup             v2.8B,  w6
        dup             v3.8B,  w7
        ext             v5.8B,  v4.8B,  v5.8B,  #1
1:      ld1             {v6.8B, v7.8B}, [x1], x2
        umull           v16.8H, v4.8B,  v0.8B   // row n   : A*cur
        umlal           v16.8H, v5.8B,  v1.8B   //          + B*cur>>1px
        ext             v7.8B,  v6.8B,  v7.8B,  #1
        ld1             {v4.8B, v5.8B}, [x1], x2
        umlal           v16.8H, v6.8B,  v2.8B   //          + C*next
        prfm            pldl1strm, [x1]
        ext             v5.8B,  v4.8B,  v5.8B,  #1
        umlal           v16.8H, v7.8B,  v3.8B   //          + D*next>>1px
        umull           v17.8H, v6.8B,  v0.8B   // row n+1 accumulated the same way
        subs            w3,  w3,  #2
        umlal           v17.8H, v7.8B, v1.8B
        umlal           v17.8H, v4.8B, v2.8B
        umlal           v17.8H, v5.8B, v3.8B
        prfm            pldl1strm, [x1, x2]
  .ifc \codec,h264
        rshrn           v16.8B, v16.8H, #6      // (sum + 32) >> 6
        rshrn           v17.8B, v17.8H, #6
  .else
        add             v16.8H, v16.8H, v22.8H  // (sum + bias) >> 6, no rounding
        add             v17.8H, v17.8H, v22.8H
        shrn            v16.8B, v16.8H, #6
        shrn            v17.8B, v17.8H, #6
  .endif
  .ifc \type,avg
        ld1             {v20.8B}, [x8], x2      // average with existing dst
        ld1             {v21.8B}, [x8], x2
        urhadd          v16.8B, v16.8B, v20.8B
        urhadd          v17.8B, v17.8B, v21.8B
  .endif
        st1             {v16.8B}, [x0], x2
        st1             {v17.8B}, [x0], x2
        b.gt            1b
        ret

        // D == 0: interpolation is 1-D (or a plain copy).
2:      adds            w12, w12, w6            // w12 = B + C (one of them is 0)
        dup             v0.8B, w4
        b.eq            5f                      // B+C==0 too -> x==y==0: copy
        tst             w6,  w6
        dup             v1.8B, w12
        b.eq            4f                      // C==0 -> y==0: horizontal only

        // Vertical-only loop (x==0): blend row n with row n+1.
        ld1             {v4.8B}, [x1], x2
3:      ld1             {v6.8B}, [x1], x2
        umull           v16.8H, v4.8B,  v0.8B
        umlal           v16.8H, v6.8B,  v1.8B
        ld1             {v4.8B}, [x1], x2
        umull           v17.8H, v6.8B,  v0.8B
        umlal           v17.8H, v4.8B,  v1.8B
        prfm            pldl1strm, [x1]
  .ifc \codec,h264
        rshrn           v16.8B, v16.8H, #6
        rshrn           v17.8B, v17.8H, #6
  .else
        add             v16.8H, v16.8H, v22.8H
        add             v17.8H, v17.8H, v22.8H
        shrn            v16.8B, v16.8H, #6
        shrn            v17.8B, v17.8H, #6
  .endif
        prfm            pldl1strm, [x1, x2]
  .ifc \type,avg
        ld1             {v20.8B}, [x8], x2
        ld1             {v21.8B}, [x8], x2
        urhadd          v16.8B, v16.8B, v20.8B
        urhadd          v17.8B, v17.8B, v21.8B
  .endif
        subs            w3,  w3,  #2
        st1             {v16.8B}, [x0], x2
        st1             {v17.8B}, [x0], x2
        b.gt            3b
        ret

        // Horizontal-only loop (y==0): blend pixel with its right neighbour.
4:      ld1             {v4.8B, v5.8B}, [x1], x2
        ld1             {v6.8B, v7.8B}, [x1], x2
        ext             v5.8B,  v4.8B,  v5.8B,  #1
        ext             v7.8B,  v6.8B,  v7.8B,  #1
        prfm            pldl1strm, [x1]
        subs            w3,  w3,  #2
        umull           v16.8H, v4.8B, v0.8B
        umlal           v16.8H, v5.8B, v1.8B
        umull           v17.8H, v6.8B, v0.8B
        umlal           v17.8H, v7.8B, v1.8B
        prfm            pldl1strm, [x1, x2]
  .ifc \codec,h264
        rshrn           v16.8B, v16.8H, #6
        rshrn           v17.8B, v17.8H, #6
  .else
        add             v16.8H, v16.8H, v22.8H
        add             v17.8H, v17.8H, v22.8H
        shrn            v16.8B, v16.8H, #6
        shrn            v17.8B, v17.8H, #6
  .endif
  .ifc \type,avg
        ld1             {v20.8B}, [x8], x2
        ld1             {v21.8B}, [x8], x2
        urhadd          v16.8B, v16.8B, v20.8B
        urhadd          v17.8B, v17.8B, v21.8B
  .endif
        st1             {v16.8B}, [x0], x2
        st1             {v17.8B}, [x0], x2
        b.gt            4b
        ret

        // Copy loop (x==y==0): weight A==64, still goes through the
        // codec-specific rounding so rv40/vc1 bias is applied.
5:      ld1             {v4.8B}, [x1], x2
        ld1             {v5.8B}, [x1], x2
        prfm            pldl1strm, [x1]
        subs            w3,  w3,  #2
        umull           v16.8H, v4.8B, v0.8B
        umull           v17.8H, v5.8B, v0.8B
        prfm            pldl1strm, [x1, x2]
  .ifc \codec,h264
        rshrn           v16.8B, v16.8H, #6
        rshrn           v17.8B, v17.8H, #6
  .else
        add             v16.8H, v16.8H, v22.8H
        add             v17.8H, v17.8H, v22.8H
        shrn            v16.8B, v16.8H, #6
        shrn            v17.8B, v17.8H, #6
  .endif
  .ifc \type,avg
        ld1             {v20.8B}, [x8], x2
        ld1             {v21.8B}, [x8], x2
        urhadd          v16.8B, v16.8B, v20.8B
        urhadd          v17.8B, v17.8B, v21.8B
  .endif
        st1             {v16.8B}, [x0], x2
        st1             {v17.8B}, [x0], x2
        b.gt            5b
        ret
endfunc
.endm
193
/* chroma_mc4(uint8_t *dst, uint8_t *src, ptrdiff_t stride, int h, int x, int y) */
// 4-pixel-wide chroma MC. Two 4-byte rows are packed into the low/high
// 32-bit halves of one d-register (trn1 .2S) so the 8-lane multiplies do
// two rows' worth of work per instruction; the two partial sums are then
// recombined with trn1/trn2 on the .2D halves and an add.
.macro  h264_chroma_mc4 type, codec=h264
function ff_\type\()_\codec\()_chroma_mc4_neon, export=1
  .ifc \type,avg
        mov             x8,  x0                 // keep dst; avg re-reads it via x8
  .endif
        prfm            pldl1strm, [x1]
        prfm            pldl1strm, [x1, x2]
  .ifc \codec,rv40
        // bias = rv40bias[(y>>1)*4 + (x>>1)], broadcast to all 8 lanes of v22
        movrel          x6,  rv40bias
        lsr             w9,  w5,  #1
        lsr             w10, w4,  #1
        lsl             w9,  w9,  #3            // (y>>1) * 8 bytes
        lsl             w10, w10, #1            // (x>>1) * 2 bytes
        add             w9,  w9,  w10
        add             x6,  x6,  w9, UXTW
        ld1r            {v22.8H}, [x6]
  .endif
  .ifc \codec,vc1
        movi            v22.8H,   #28           // VC-1 fixed rounding bias
  .endif
        // Bilinear weights (see mc8): A=(8-x)(8-y) B=x(8-y) C=(8-x)y D=xy
        mul             w7,  w4,  w5            // w7  = D
        lsl             w14, w5,  #3
        lsl             w13, w4,  #3
        cmp             w7,  #0
        sub             w6,  w14, w7            // w6  = C
        sub             w12, w13, w7            // w12 = B
        sub             w4,  w7,  w13
        sub             w4,  w4,  w14
        add             w4,  w4,  #64           // w4  = A
        b.eq            2f                      // D==0 -> 1-D / copy paths

        // Full 2-D bilinear: v0 = {A..A|B..B}, v2 = {C..C|D..D},
        // v4 = {row | row shifted 1px}.
        dup             v24.8B,  w4
        dup             v25.8B,  w12
        ld1             {v4.8B}, [x1], x2
        dup             v26.8B,  w6
        dup             v27.8B,  w7
        ext             v5.8B,  v4.8B,  v5.8B, #1
        trn1            v0.2S,  v24.2S, v25.2S
        trn1            v2.2S,  v26.2S, v27.2S
        trn1            v4.2S,  v4.2S,  v5.2S
1:      ld1             {v6.8B}, [x1], x2
        ext             v7.8B,  v6.8B,  v7.8B, #1
        trn1            v6.2S,  v6.2S,  v7.2S
        umull           v18.8H, v4.8B,  v0.8B   // row n  : A,B terms
        umlal           v18.8H, v6.8B,  v2.8B   //        + C,D terms
        ld1             {v4.8B}, [x1], x2
        ext             v5.8B,  v4.8B,  v5.8B, #1
        trn1            v4.2S,  v4.2S,  v5.2S
        prfm            pldl1strm, [x1]
        umull           v19.8H, v6.8B,  v0.8B   // row n+1
        umlal           v19.8H, v4.8B,  v2.8B
        trn1            v30.2D, v18.2D, v19.2D  // gather low halves of both rows
        trn2            v31.2D, v18.2D, v19.2D  // gather high halves
        add             v18.8H, v30.8H, v31.8H  // combine the split partial sums
  .ifc \codec,h264
        rshrn           v16.8B, v18.8H, #6      // (sum + 32) >> 6
  .else
        add             v18.8H, v18.8H, v22.8H  // (sum + bias) >> 6
        shrn            v16.8B, v18.8H, #6
  .endif
        subs            w3,  w3,  #2
        prfm            pldl1strm, [x1, x2]
  .ifc \type,avg
        ld1             {v20.S}[0], [x8], x2    // average with existing dst
        ld1             {v20.S}[1], [x8], x2
        urhadd          v16.8B, v16.8B, v20.8B
  .endif
        st1             {v16.S}[0], [x0], x2    // two 4-byte rows per iteration
        st1             {v16.S}[1], [x0], x2
        b.gt            1b
        ret

        // D == 0: 1-D interpolation or copy.
2:      adds            w12, w12, w6            // w12 = B + C (one of them is 0)
        dup             v30.8B, w4
        b.eq            5f                      // B+C==0 -> x==y==0: copy
        tst             w6,  w6
        dup             v31.8B, w12
        trn1            v0.2S,  v30.2S, v31.2S  // v0 = {A..A|(B+C)..}
        trn2            v1.2S,  v30.2S, v31.2S
        b.eq            4f                      // C==0 -> y==0: horizontal only

        // Vertical-only loop (x==0): v1 = v0 with halves swapped, so the
        // shared register holding {row n | row n+1} is weighted both ways.
        ext             v1.8B,  v0.8B,  v1.8B, #4
        ld1             {v4.S}[0], [x1], x2
3:      ld1             {v4.S}[1], [x1], x2
        umull           v18.8H, v4.8B,  v0.8B
        ld1             {v4.S}[0], [x1], x2
        umull           v19.8H, v4.8B,  v1.8B
        trn1            v30.2D, v18.2D, v19.2D
        trn2            v31.2D, v18.2D, v19.2D
        add             v18.8H, v30.8H, v31.8H
        prfm            pldl1strm, [x1]
  .ifc \codec,h264
        rshrn           v16.8B, v18.8H, #6
  .else
        add             v18.8H, v18.8H, v22.8H
        shrn            v16.8B, v18.8H, #6
  .endif
  .ifc \type,avg
        ld1             {v20.S}[0], [x8], x2
        ld1             {v20.S}[1], [x8], x2
        urhadd          v16.8B, v16.8B, v20.8B
  .endif
        subs            w3,  w3,  #2
        prfm            pldl1strm, [x1, x2]
        st1             {v16.S}[0], [x0], x2
        st1             {v16.S}[1], [x0], x2
        b.gt            3b
        ret

        // Horizontal-only loop (y==0): pack {row | row>>1px} and weight A|B.
4:      ld1             {v4.8B}, [x1], x2
        ld1             {v6.8B}, [x1], x2
        ext             v5.8B,  v4.8B,  v5.8B, #1
        ext             v7.8B,  v6.8B,  v7.8B, #1
        trn1            v4.2S,  v4.2S,  v5.2S
        trn1            v6.2S,  v6.2S,  v7.2S
        umull           v18.8H, v4.8B,  v0.8B
        umull           v19.8H, v6.8B,  v0.8B
        subs            w3,  w3,  #2
        trn1            v30.2D, v18.2D, v19.2D
        trn2            v31.2D, v18.2D, v19.2D
        add             v18.8H, v30.8H, v31.8H
        prfm            pldl1strm, [x1]
  .ifc \codec,h264
        rshrn           v16.8B, v18.8H, #6
  .else
        add             v18.8H, v18.8H, v22.8H
        shrn            v16.8B, v18.8H, #6
  .endif
  .ifc \type,avg
        ld1             {v20.S}[0], [x8], x2
        ld1             {v20.S}[1], [x8], x2
        urhadd          v16.8B, v16.8B, v20.8B
  .endif
        prfm            pldl1strm, [x1]
        st1             {v16.S}[0], [x0], x2
        st1             {v16.S}[1], [x0], x2
        b.gt            4b
        ret

        // Copy loop (x==y==0): weight A==64; rv40/vc1 bias still applied.
5:      ld1             {v4.S}[0], [x1], x2
        ld1             {v4.S}[1], [x1], x2
        umull           v18.8H, v4.8B,  v30.8B
        subs            w3,  w3,  #2
        prfm            pldl1strm, [x1]
  .ifc \codec,h264
        rshrn           v16.8B, v18.8H, #6
  .else
        add             v18.8H, v18.8H, v22.8H
        shrn            v16.8B, v18.8H, #6
  .endif
  .ifc \type,avg
        ld1             {v20.S}[0], [x8], x2
        ld1             {v20.S}[1], [x8], x2
        urhadd          v16.8B, v16.8B, v20.8B
  .endif
        prfm            pldl1strm, [x1]
        st1             {v16.S}[0], [x0], x2
        st1             {v16.S}[1], [x0], x2
        b.gt            5b
        ret
endfunc
.endm
357
// chroma_mc2(uint8_t *dst, uint8_t *src, ptrdiff_t stride, int h, int x, int y)
// 2-pixel-wide chroma MC (H.264 only; no codec parameter). Two output
// rows are computed per iteration by interleaving pixel pairs and weights
// at halfword granularity, then folding with rev64 + add.
.macro  h264_chroma_mc2 type
function ff_\type\()_h264_chroma_mc2_neon, export=1
        prfm            pldl1strm, [x1]
        prfm            pldl1strm, [x1, x2]
        orr             w7,  w4,  w5
        cbz             w7,  2f                 // x==0 && y==0: plain copy path

        // Bilinear weights (see mc8): A=(8-x)(8-y) B=x(8-y) C=(8-x)y D=xy
        mul             w7,  w4,  w5            // w7  = D
        lsl             w14, w5,  #3
        lsl             w13, w4,  #3
        sub             w6,  w14, w7            // w6  = C
        sub             w12, w13, w7            // w12 = B
        sub             w4,  w7,  w13
        sub             w4,  w4,  w14
        add             w4,  w4,  #64           // w4  = A
        dup             v0.8B,  w4
        dup             v2.8B,  w12
        dup             v1.8B,  w6
        dup             v3.8B,  w7
        trn1            v0.4H,  v0.4H,  v2.4H   // v0 = {A,A,B,B,A,A,B,B}
        trn1            v1.4H,  v1.4H,  v3.4H   // v1 = {C,C,D,D,C,C,D,D}
1:
        // v4 = {row0 | row1}; v5 = {row1 | row2} built via rev64 + lane load,
        // so v4 weighted by A/B plus v5 weighted by C/D covers both rows.
        ld1             {v4.S}[0],  [x1], x2
        ld1             {v4.S}[1],  [x1], x2
        rev64           v5.2S,  v4.2S
        ld1             {v5.S}[1],  [x1]
        ext             v6.8B,  v4.8B,  v5.8B,  #1
        ext             v7.8B,  v5.8B,  v4.8B,  #1
        trn1            v4.4H,  v4.4H,  v6.4H   // interleave pixels with right neighbours
        trn1            v5.4H,  v5.4H,  v7.4H
        umull           v16.8H, v4.8B,  v0.8B
        umlal           v16.8H, v5.8B,  v1.8B
  .ifc \type,avg
        ld1             {v18.H}[0], [x0], x2    // existing dst for averaging
        ld1             {v18.H}[2], [x0]
        sub             x0,  x0,  x2
  .endif
        rev64           v17.4S, v16.4S          // fold the mirrored partial sums
        add             v16.8H, v16.8H, v17.8H
        rshrn           v16.8B, v16.8H, #6      // (sum + 32) >> 6
  .ifc \type,avg
        urhadd          v16.8B, v16.8B, v18.8B
  .endif
        st1             {v16.H}[0], [x0], x2    // two 2-byte rows per iteration
        st1             {v16.H}[2], [x0], x2
        subs            w3,  w3,  #2
        b.gt            1b
        ret

2:
        // Copy path: 2 bytes per row, two rows per iteration.
        ld1             {v16.H}[0], [x1], x2
        ld1             {v16.H}[1], [x1], x2
  .ifc \type,avg
        ld1             {v18.H}[0], [x0], x2
        ld1             {v18.H}[1], [x0]
        sub             x0,  x0,  x2
        urhadd          v16.8B, v16.8B, v18.8B
  .endif
        st1             {v16.H}[0], [x0], x2
        st1             {v16.H}[1], [x0], x2
        subs            w3,  w3,  #2
        b.gt            2b
        ret
endfunc
.endm
423
// Instantiate the baseline H.264 put/avg variants (default codec=h264).
        h264_chroma_mc8 put
        h264_chroma_mc8 avg
        h264_chroma_mc4 put
        h264_chroma_mc4 avg
        h264_chroma_mc2 put
        h264_chroma_mc2 avg
430
#if CONFIG_RV40_DECODER
// RV40 rounding-bias table, indexed by the macros above as
// rv40bias[(y>>1)*4 + (x>>1)]: one row of 4 .short per (y>>1) value.
const   rv40bias
        .short           0, 16, 32, 16
        .short          32, 28, 32, 28
        .short           0, 32, 16, 32
        .short          32, 28, 32, 28
endconst

// RV40 variants: same kernels, bias-then-truncate rounding.
        h264_chroma_mc8 put, rv40
        h264_chroma_mc8 avg, rv40
        h264_chroma_mc4 put, rv40
        h264_chroma_mc4 avg, rv40
#endif
444
#if CONFIG_VC1DSP
// VC-1 variants: same kernels, fixed bias 28 then truncating shift.
        h264_chroma_mc8 put, vc1
        h264_chroma_mc8 avg, vc1
        h264_chroma_mc4 put, vc1
        h264_chroma_mc4 avg, vc1
#endif