ARMv6 optimised pix_abs8
[ffmpeg.git] / libavcodec / arm / dsputil_armv6.S
1 /*
2  * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
3  *
4  * This file is part of FFmpeg.
5  *
6  * FFmpeg is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU Lesser General Public
8  * License as published by the Free Software Foundation; either
9  * version 2.1 of the License, or (at your option) any later version.
10  *
11  * FFmpeg is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14  * Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with FFmpeg; if not, write to the Free Software
18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19  */
20
21 #include "asm.S"
22
23         .text
24
/*
 * call_2x_pixels type, subp
 *
 * Emits the exported function ff_<type>_pixels16<subp>_armv6.  It runs
 * ff_<type>_pixels8<subp>_armv6 twice: first with the incoming r0-r3, then
 * again with r0 and r1 each advanced by 8 bytes and r2/r3 unchanged.  The
 * second run is a tail branch, so the 8-wide routine returns straight to
 * the original caller.
 *
 * NOTE(review): five words are pushed around the first call, so sp is moved
 * by 20 bytes while the callee runs -- confirm that no callee depends on an
 * 8-byte-aligned stack pointer.
 */
25 .macro  call_2x_pixels  type, subp
26 function ff_\type\()_pixels16\subp\()_armv6, export=1
27         push            {r0-r3, lr}                     /* keep all four arguments and the return address */
28         bl              ff_\type\()_pixels8\subp\()_armv6
29         pop             {r0-r3, lr}                     /* restore the original arguments and lr */
30         add             r0,  r0,  #8                    /* step both pointers 8 bytes to the right */
31         add             r1,  r1,  #8
32         b               ff_\type\()_pixels8\subp\()_armv6   /* tail call: no return to this wrapper */
33 .endfunc
34 .endm
35
/*
 * Instantiate the pixels16 wrappers.  Each line defines
 * ff_<type>_pixels16<subp>_armv6 in terms of the matching pixels8 routine;
 * the first one passes no suffix.
 */
36 call_2x_pixels          avg
37 call_2x_pixels          put, _x2
38 call_2x_pixels          put, _y2
39 call_2x_pixels          put, _x2_no_rnd
40 call_2x_pixels          put, _y2_no_rnd
41
/*
 * ff_put_pixels16_armv6
 *
 * Copies rows of 16 bytes from r1 to r0, two rows per loop pass.
 *   r0  destination pointer (written with doubleword stores)
 *   r1  source pointer (read with word loads)
 *   r2  byte step added to both pointers after every row
 *   r3  row count; it is decremented by 2 per pass and the loop only exits
 *       when it reaches exactly zero, so it is presumably always a positive
 *       even number -- TODO confirm against callers
 * r4-r11 are saved on entry and restored before returning.
 */
42 function ff_put_pixels16_armv6, export=1
43         push            {r4-r11}
44 1:
45         ldr             r5,  [r1, #4]
46         ldr             r6,  [r1, #8]
47         ldr             r7,  [r1, #12]
48         ldr             r4,  [r1], r2                   /* bytes 0..3 last, then r1 moves to the next row */
49         strd            r6,  r7,  [r0, #8]              /* upper 8 bytes of the first row */
50         ldr             r9,  [r1, #4]
51         strd            r4,  r5,  [r0],  r2             /* lower 8 bytes, then r0 moves to the next row */
52         ldr             r10, [r1, #8]
53         ldr             r11, [r1, #12]
54         ldr             r8,  [r1], r2
55         strd            r10, r11, [r0, #8]
56         subs            r3,  r3,  #2                    /* two rows done; sets the flags tested by bne */
57         strd            r8,  r9,  [r0],  r2
58         bne             1b
59
60         pop             {r4-r11}
61         bx              lr
62 .endfunc
63
/*
 * ff_put_pixels8_armv6
 *
 * Copies rows of 8 bytes from r1 to r0, two rows per loop pass.
 *   r0  destination pointer (written with doubleword stores)
 *   r1  source pointer (read with word loads)
 *   r2  byte step added to both pointers after every row
 *   r3  row count; decremented by 2 per pass and tested for exactly zero,
 *       so presumably a positive even number -- TODO confirm
 * r4-r7 are saved on entry and restored before returning.
 */
64 function ff_put_pixels8_armv6, export=1
65         push            {r4-r7}
66 1:
67         ldr             r5,  [r1, #4]
68         ldr             r4,  [r1], r2                   /* post-indexed: r1 moves to the next row */
69         ldr             r7,  [r1, #4]
70         strd            r4,  r5,  [r0],  r2             /* store the first row, r0 moves to the next row */
71         ldr             r6,  [r1], r2
72         subs            r3,  r3,  #2                    /* flags survive the strd below and feed bne */
73         strd            r6,  r7,  [r0],  r2
74         bne             1b
75
76         pop             {r4-r7}
77         bx              lr
78 .endfunc
79
/*
 * ff_put_pixels8_x2_armv6
 *
 * For each row writes 8 bytes to r0, each the rounded-up average
 * (a + b + 1) >> 1 of a source byte and its right-hand neighbour, so nine
 * source bytes are read per row.  Two rows are handled per loop pass.
 *   r0  destination pointer (written with doubleword stores)
 *   r1  source pointer
 *   r2  byte step between rows for both pointers
 *   r3  row count; decremented by 2 and tested for exactly zero, so
 *       presumably a positive even number -- TODO confirm
 *
 * The average is built per byte lane as
 *   uhadd8(a, b) + ((a ^ b) & 1)
 * where uhadd8 yields (a + b) >> 1 and the masked XOR restores the bit that
 * the halving dropped.  r12 holds the 0x01010101 lane mask and lr (r14) is
 * used as scratch, which is why lr is pushed and pc is popped on return.
 * The shift/or sequences that build the "bytes 1..4" word rely on the words
 * having been loaded in little-endian byte order.
 */
80 function ff_put_pixels8_x2_armv6, export=1
81         push            {r4-r11, lr}
82         mov             r12, #1
83         orr             r12, r12, r12, lsl #8
84         orr             r12, r12, r12, lsl #16          /* r12 = 0x01010101 */
85 1:
86         ldr             r4,  [r1]                       /* row 0, bytes 0..3 */
87         subs            r3,  r3,  #2                    /* flags are consumed by the bne at the bottom */
88         ldr             r5,  [r1, #4]                   /* row 0, bytes 4..7 */
89         ldr             r7,  [r1, #5]                   /* row 0, bytes 5..8 (offset is not a multiple of 4) */
90         lsr             r6,  r4,  #8
91         ldr             r8,  [r1, r2]!                  /* r1 moves to row 1; bytes 0..3 */
92         orr             r6,  r6,  r5,  lsl #24         /* r6 = row 0, bytes 1..4 */
93         ldr             r9,  [r1, #4]                   /* row 1, bytes 4..7 */
94         ldr             r11, [r1, #5]                   /* row 1, bytes 5..8 */
95         lsr             r10, r8,  #8
96         add             r1,  r1,  r2                    /* r1 moves to the row for the next pass */
97         orr             r10, r10, r9,  lsl #24         /* r10 = row 1, bytes 1..4 */
98         eor             r14, r4,  r6
99         uhadd8          r4,  r4,  r6                   /* truncated per-byte average */
100         eor             r6,  r5,  r7
101         uhadd8          r5,  r5,  r7
102         and             r14, r14, r12                  /* lanes where the halving dropped a bit */
103         and             r6,  r6,  r12
104         uadd8           r4,  r4,  r14                  /* add the dropped bit back: rounds up */
105         eor             r14, r8,  r10
106         uadd8           r5,  r5,  r6
107         eor             r6,  r9,  r11
108         uhadd8          r8,  r8,  r10
109         and             r14, r14, r12
110         uhadd8          r9,  r9,  r11
111         and             r6,  r6,  r12
112         uadd8           r8,  r8,  r14
113         strd            r4,  r5,  [r0],  r2            /* store row 0, r0 moves to row 1 */
114         uadd8           r9,  r9,  r6
115         strd            r8,  r9,  [r0],  r2            /* store row 1, r0 moves to the next row */
116         bne             1b
117
118         pop             {r4-r11, pc}
119 .endfunc
120
/*
 * ff_put_pixels8_y2_armv6
 *
 * For each row writes 8 bytes to r0, each the rounded-up average
 * (a + b + 1) >> 1 of a source byte and the byte r2 bytes further on (the
 * same column of the next row).  Two output rows are produced per loop pass
 * and each source row is loaded once and reused for two outputs.
 *   r0  destination pointer (written with doubleword stores)
 *   r1  source pointer
 *   r2  byte step between rows for both pointers
 *   r3  row count; decremented by 2 and tested for exactly zero, so
 *       presumably a positive even number -- TODO confirm
 *
 * Rounding uses uhadd8(a, b) + ((a ^ b) & 1) per byte lane with the
 * 0x01010101 mask in r12.
 *
 * NOTE(review): every pass ends by loading the source row needed by the
 * following pass, so the last pass loads one row (row r3 + 1 counted from
 * the entry value of r1) whose contents are never used -- confirm callers
 * keep that row readable.
 */
121 function ff_put_pixels8_y2_armv6, export=1
122         push            {r4-r11}
123         mov             r12, #1
124         orr             r12, r12, r12, lsl #8
125         orr             r12, r12, r12, lsl #16         /* r12 = 0x01010101 */
126         ldr             r4,  [r1]                      /* r4:r5 = first source row */
127         ldr             r5,  [r1, #4]
128         ldr             r6,  [r1, r2]!                 /* r6:r7 = second source row, r1 follows it */
129         ldr             r7,  [r1, #4]
130 1:
131         subs            r3,  r3,  #2                   /* flags are consumed by the bne at the bottom */
132         uhadd8          r8,  r4,  r6                   /* first output row: rows in r4:r5 and r6:r7 */
133         eor             r10, r4,  r6
134         uhadd8          r9,  r5,  r7
135         eor             r11, r5,  r7
136         and             r10, r10, r12
137         ldr             r4,  [r1, r2]!                 /* r4:r5 = next source row */
138         uadd8           r8,  r8,  r10
139         and             r11, r11, r12
140         uadd8           r9,  r9,  r11
141         ldr             r5,  [r1, #4]
142         uhadd8          r10, r4,  r6                   /* second output row: rows in r6:r7 and new r4:r5 */
143         eor             r6,  r4,  r6
144         uhadd8          r11, r5,  r7
145         and             r6,  r6,  r12
146         eor             r7,  r5,  r7
147         uadd8           r10, r10, r6
148         and             r7,  r7,  r12
149         ldr             r6,  [r1, r2]!                 /* r6:r7 = source row for the next pass */
150         uadd8           r11, r11, r7
151         strd            r8,  r9,  [r0],  r2
152         ldr             r7,  [r1, #4]
153         strd            r10, r11, [r0],  r2
154         bne             1b
155
156         pop             {r4-r11}
157         bx              lr
158 .endfunc
159
/*
 * ff_put_pixels8_x2_no_rnd_armv6
 *
 * For each row writes 8 bytes to r0, each the truncated average
 * (a + b) >> 1 of a source byte and its right-hand neighbour (uhadd8 with
 * no rounding correction).  Nine source bytes are read per row and two rows
 * are handled per loop pass.
 *   r0  destination pointer (written with stm)
 *   r1  source pointer
 *   r2  byte step between rows for both pointers
 *   r3  row count; decremented by 2 and tested for exactly zero, so
 *       presumably a positive even number -- TODO confirm
 * r12 and lr (r14) are used as scratch; lr is pushed and pc popped on
 * return.  Building the "bytes 1..4" words by shifting relies on
 * little-endian word loads.
 */
160 function ff_put_pixels8_x2_no_rnd_armv6, export=1
161         push            {r4-r9, lr}
162 1:
163         subs            r3,  r3,  #2                   /* flags are consumed by the bne at the bottom */
164         ldr             r4,  [r1]                      /* row 0, bytes 0..3 */
165         ldr             r5,  [r1, #4]                  /* row 0, bytes 4..7 */
166         ldr             r7,  [r1, #5]                  /* row 0, bytes 5..8 */
167         ldr             r8,  [r1, r2]!                 /* r1 moves to row 1; bytes 0..3 */
168         ldr             r9,  [r1, #4]                  /* row 1, bytes 4..7 */
169         ldr             r14, [r1, #5]                  /* row 1, bytes 5..8 */
170         add             r1,  r1,  r2                   /* r1 moves to the row for the next pass */
171         lsr             r6,  r4,  #8
172         orr             r6,  r6,  r5,  lsl #24        /* r6 = row 0, bytes 1..4 */
173         lsr             r12, r8,  #8
174         orr             r12, r12, r9,  lsl #24        /* r12 = row 1, bytes 1..4 */
175         uhadd8          r4,  r4,  r6                   /* per-byte (a + b) >> 1 */
176         uhadd8          r5,  r5,  r7
177         uhadd8          r8,  r8,  r12
178         uhadd8          r9,  r9,  r14
179         stm             r0,  {r4,r5}                   /* store row 0 */
180         add             r0,  r0,  r2
181         stm             r0,  {r8,r9}                   /* store row 1 */
182         add             r0,  r0,  r2
183         bne             1b
184
185         pop             {r4-r9, pc}
186 .endfunc
187
/*
 * ff_put_pixels8_y2_no_rnd_armv6
 *
 * For each row writes 8 bytes to r0, each the truncated average
 * (a + b) >> 1 of a source byte and the byte r2 bytes further on (the same
 * column of the next row).  Two output rows are produced per loop pass.
 *   r0  destination pointer (written with stm)
 *   r1  source pointer
 *   r2  byte step between rows for both pointers
 *   r3  row count; decremented by 2 and tested for exactly zero, so
 *       presumably a positive even number -- TODO confirm
 * r12 and lr (r14) hold the second output row; lr is pushed and pc popped
 * on return.
 *
 * NOTE(review): each pass ends by loading the source row needed by the
 * following pass, so the last pass loads one row (row r3 + 1 counted from
 * the entry value of r1) whose contents are never used -- confirm callers
 * keep that row readable.
 */
188 function ff_put_pixels8_y2_no_rnd_armv6, export=1
189         push            {r4-r9, lr}
190         ldr             r4,  [r1]                      /* r4:r5 = first source row */
191         ldr             r5,  [r1, #4]
192         ldr             r6,  [r1, r2]!                 /* r6:r7 = second source row, r1 follows it */
193         ldr             r7,  [r1, #4]
194 1:
195         subs            r3,  r3,  #2                   /* flags are consumed by the bne at the bottom */
196         uhadd8          r8,  r4,  r6                   /* first output row, low word */
197         ldr             r4,  [r1, r2]!                 /* r4:r5 = next source row */
198         uhadd8          r9,  r5,  r7                   /* uses the old r5 before it is reloaded */
199         ldr             r5,  [r1, #4]
200         uhadd8          r12, r4,  r6                   /* second output row, low word */
201         ldr             r6,  [r1, r2]!                 /* r6:r7 = source row for the next pass */
202         uhadd8          r14, r5,  r7                   /* uses the old r7 before it is reloaded */
203         ldr             r7,  [r1, #4]
204         stm             r0,  {r8,r9}
205         add             r0,  r0,  r2
206         stm             r0,  {r12,r14}
207         add             r0,  r0,  r2
208         bne             1b
209
210         pop             {r4-r9, pc}
211 .endfunc
212
/*
 * ff_avg_pixels8_armv6
 *
 * For each row replaces 8 bytes at r0 with the rounded-up average
 * (a + b + 1) >> 1 of the byte already at r0 and the corresponding byte at
 * r1.  Two rows are handled per loop pass.
 *   r0  destination pointer (read with ldrd, written with strd)
 *   r1  source pointer (read with word loads)
 *   r2  byte step between rows for both pointers
 *   r3  row count; decremented by 2 and tested for exactly zero, so
 *       presumably a positive even number -- TODO confirm
 *
 * Rounding uses uhadd8(a, b) + ((a ^ b) & 1) per byte lane with the
 * 0x01010101 mask kept in lr.  The loop is software-pipelined: the loads
 * for the next pair of rows are issued while the second row of the current
 * pair is being finished, and they are skipped entirely on the last pass
 * (label 2), so no row beyond the requested count is loaded.  The pld
 * instructions are prefetch hints only.
 */
213 function ff_avg_pixels8_armv6, export=1
214         pld             [r1, r2]
215         push            {r4-r10, lr}
216         mov             lr,  #1
217         orr             lr,  lr,  lr,  lsl #8
218         orr             lr,  lr,  lr,  lsl #16         /* lr = 0x01010101 */
219         ldrd            r4,  r5,  [r0]                 /* r4:r5 = destination row 0 */
220         ldr             r10, [r1, #4]                  /* r9:r10 = source row 0 */
221         ldr             r9,  [r1], r2
222         subs            r3,  r3,  #2                   /* flags for the beq inside the loop */
223 1:
224         pld             [r1, r2]
225         eor             r8,  r4,  r9
226         uhadd8          r4,  r4,  r9
227         eor             r12, r5,  r10
228         ldrd            r6,  r7,  [r0, r2]             /* r6:r7 = destination row 1 */
229         uhadd8          r5,  r5,  r10
230         and             r8,  r8,  lr
231         ldr             r10, [r1, #4]                  /* r9:r10 = source row 1 */
232         and             r12, r12, lr
233         uadd8           r4,  r4,  r8
234         ldr             r9,  [r1], r2
235         eor             r8,  r6,  r9
236         uadd8           r5,  r5,  r12
237         pld             [r1, r2,  lsl #1]
238         eor             r12, r7,  r10
239         uhadd8          r6,  r6,  r9
240         strd            r4,  r5,  [r0], r2             /* store row 0, r0 moves to row 1 */
241         uhadd8          r7,  r7,  r10
242         beq             2f                             /* last pair: finish row 1 without loading more */
243         and             r8,  r8,  lr
244         ldrd            r4,  r5,  [r0, r2]             /* destination row for the next pass */
245         uadd8           r6,  r6,  r8
246         ldr             r10, [r1, #4]                  /* source row for the next pass */
247         and             r12, r12, lr
248         subs            r3,  r3,  #2                   /* flags for the beq of the next pass */
249         uadd8           r7,  r7,  r12
250         ldr             r9,  [r1], r2
251         strd            r6,  r7,  [r0], r2             /* store row 1 */
252         b               1b
253 2:
254         and             r8,  r8,  lr
255         and             r12, r12, lr
256         uadd8           r6,  r6,  r8
257         uadd8           r7,  r7,  r12
258         strd            r6,  r7,  [r0], r2             /* store the final row */
259
260         pop             {r4-r10, pc}
261 .endfunc
262
/*
 * ff_add_pixels_clamped_armv6
 *
 * Runs 8 passes.  Each pass reads eight consecutive 16-bit values from r0,
 * adds them to the eight bytes at r1, saturates every sum to 0..255 and
 * stores the eight result bytes back at r1.
 *   r0  pointer to the 16-bit values; advanced by 16 bytes per pass
 *   r1  pointer to the bytes that are read, updated and written
 *   r2  byte step added to r1 after every pass
 * r3 is overwritten with the pass count (8), so any incoming value in r3 is
 * ignored.  r12 and lr are used as data registers; lr is pushed and pc
 * popped on return.
 *
 * The halfwords are first regrouped so that one register holds the values
 * for byte lanes 0 and 2 and another those for lanes 1 and 3; uxtab16 can
 * then add two zero-extended bytes at a time.  The lane pairing matches the
 * memory order when the words are loaded little-endian.
 */
263 function ff_add_pixels_clamped_armv6, export=1
264         push            {r4-r8,lr}
265         mov             r3,  #8                        /* pass counter */
266 1:
267         ldm             r0!, {r4,r5,r12,lr}            /* eight 16-bit values, r0 += 16 */
268         ldrd            r6,  r7,  [r1]                 /* eight bytes to update */
269         pkhbt           r8,  r4,  r5,  lsl #16        /* r8 = low halves of r4 and r5 */
270         pkhtb           r5,  r5,  r4,  asr #16        /* r5 = high halves of r4 and r5 */
271         pkhbt           r4,  r12, lr,  lsl #16        /* r4 = low halves of r12 and lr */
272         pkhtb           lr,  lr,  r12, asr #16        /* lr = high halves of r12 and lr */
273         pld             [r1, r2]
274         uxtab16         r8,  r8,  r6                   /* add bytes 0 and 2 of r6 */
275         uxtab16         r5,  r5,  r6,  ror #8         /* add bytes 1 and 3 of r6 */
276         uxtab16         r4,  r4,  r7                   /* add bytes 0 and 2 of r7 */
277         uxtab16         lr,  lr,  r7,  ror #8         /* add bytes 1 and 3 of r7 */
278         usat16          r8,  #8,  r8                   /* clamp each halfword to 0..255 */
279         usat16          r5,  #8,  r5
280         usat16          r4,  #8,  r4
281         usat16          lr,  #8,  lr
282         orr             r6,  r8,  r5,  lsl #8         /* interleave the lanes back into four bytes */
283         orr             r7,  r4,  lr,  lsl #8
284         subs            r3,  r3,  #1                   /* flags survive the strd and feed bgt */
285         strd            r6,  r7,  [r1],  r2            /* write back, r1 moves by r2 */
286         bgt             1b
287         pop             {r4-r8,pc}
288 .endfunc
289
/*
 * ff_pix_abs16_armv6
 *
 * Returns in r0 the sum of absolute differences between two blocks that
 * are 16 bytes wide.
 *   r0       not read; overwritten with the row count
 *   r1       first block (each row is read with ldm)
 *   r2       second block (each row is read with word loads)
 *   r3       byte step between rows for both blocks
 *   [sp]     row count, read from the stack before anything is pushed
 * The count is decremented by 1 per row and tested for exactly zero, so it
 * is presumably always positive -- TODO confirm.
 *
 * usada8 adds the four per-byte absolute differences of its two source
 * words to an accumulator.  Two accumulators (r12 and lr) are used
 * alternately and added together at the end.
 */
290 function ff_pix_abs16_armv6, export=1
291         ldr             r0,  [sp]                      /* row count from the stack */
292         push            {r4-r9, lr}
293         mov             r12, #0                        /* accumulator for words 0 and 2 */
294         mov             lr,  #0                        /* accumulator for words 1 and 3 */
295         ldm             r1,  {r4-r7}                   /* first row of the r1 block */
296         ldr             r8,  [r2]
297 1:
298         ldr             r9,  [r2, #4]
299         pld             [r1, r3]
300         usada8          r12, r4,  r8,  r12
301         ldr             r8,  [r2, #8]
302         pld             [r2, r3]
303         usada8          lr,  r5,  r9,  lr
304         ldr             r9,  [r2, #12]
305         usada8          r12, r6,  r8,  r12
306         subs            r0,  r0,  #1                   /* flags survive the usada8 and feed beq */
307         usada8          lr,  r7,  r9,  lr
308         beq             2f                             /* last row done: skip loading another row */
309         add             r1,  r1,  r3
310         ldm             r1,  {r4-r7}                   /* next row of the r1 block */
311         add             r2,  r2,  r3
312         ldr             r8,  [r2]
313         b               1b
314 2:
315         add             r0,  r12, lr                   /* result = sum of both accumulators */
316         pop             {r4-r9, pc}
317 .endfunc
318
/*
 * ff_pix_abs16_x2_armv6
 *
 * Returns in r0 the sum of absolute differences between a 16-byte-wide
 * block at r1 and a horizontally averaged version of the block at r2, in
 * which every byte is the rounded-up average (a + b + 1) >> 1 of a byte and
 * its right-hand neighbour.  Seventeen bytes of each r2 row are read.
 *   r0       not read; used as the accumulator and return value
 *   r1       first block (word loads)
 *   r2       block that is averaged (word loads plus one byte load)
 *   r3       byte step between rows for both blocks
 *   [sp]     row count, read from the stack before anything is pushed
 * The loop decrements the count by 1 and repeats while it is greater than
 * zero; the body always runs at least once.
 *
 * Rounding uses uhadd8(a, b) + ((a ^ b) & 1) per byte lane with the
 * 0x01010101 mask in lr.  The shifted "bytes n+1..n+4" words are assembled
 * from two neighbouring words, which relies on little-endian word loads.
 */
319 function ff_pix_abs16_x2_armv6, export=1
320         ldr             r12, [sp]                      /* row count from the stack */
321         push            {r4-r11, lr}
322         mov             r0,  #0                        /* SAD accumulator */
323         mov             lr,  #1
324         orr             lr,  lr,  lr,  lsl #8
325         orr             lr,  lr,  lr,  lsl #16         /* lr = 0x01010101 */
326 1:
327         ldr             r8,  [r2]                      /* r2 row, bytes 0..3 */
328         ldr             r9,  [r2, #4]                  /* r2 row, bytes 4..7 */
329         lsr             r10, r8,  #8
330         ldr             r4,  [r1]                      /* r1 row, bytes 0..3 */
331         lsr             r6,  r9,  #8
332         orr             r10, r10, r9,  lsl #24        /* r10 = r2 row, bytes 1..4 */
333         ldr             r5,  [r2, #8]                  /* r2 row, bytes 8..11 */
334         eor             r11, r8,  r10
335         uhadd8          r7,  r8,  r10
336         orr             r6,  r6,  r5,  lsl #24        /* r6 = r2 row, bytes 5..8 */
337         and             r11, r11, lr
338         uadd8           r7,  r7,  r11                  /* rounded average for bytes 0..3 */
339         ldr             r8,  [r1, #4]                  /* r1 row, bytes 4..7 */
340         usada8          r0,  r4,  r7,  r0
341         eor             r7,  r9,  r6
342         lsr             r10, r5,  #8
343         and             r7,  r7,  lr
344         uhadd8          r4,  r9,  r6
345         ldr             r6,  [r2, #12]                 /* r2 row, bytes 12..15 */
346         uadd8           r4,  r4,  r7                   /* rounded average for bytes 4..7 */
347         pld             [r1, r3]
348         orr             r10, r10, r6,  lsl #24        /* r10 = r2 row, bytes 9..12 */
349         usada8          r0,  r8,  r4,  r0
350         ldr             r4,  [r1, #8]                  /* r1 row, bytes 8..11 */
351         eor             r11, r5,  r10
352         ldrb            r7,  [r2, #16]                 /* r2 row, byte 16: neighbour of the last column */
353         and             r11, r11, lr
354         uhadd8          r8,  r5,  r10
355         ldr             r5,  [r1, #12]                 /* r1 row, bytes 12..15 */
356         uadd8           r8,  r8,  r11                  /* rounded average for bytes 8..11 */
357         pld             [r2, r3]
358         lsr             r10, r6,  #8
359         usada8          r0,  r4,  r8,  r0
360         orr             r10, r10, r7,  lsl #24        /* r10 = r2 row, bytes 13..16 */
361         subs            r12,  r12,  #1                 /* flags are consumed by the bgt at the bottom */
362         eor             r11, r6,  r10
363         add             r1,  r1,  r3
364         uhadd8          r9,  r6,  r10
365         and             r11, r11, lr
366         uadd8           r9,  r9,  r11                  /* rounded average for bytes 12..15 */
367         add             r2,  r2,  r3
368         usada8          r0,  r5,  r9,  r0
369         bgt             1b
370
371         pop             {r4-r11, pc}
372 .endfunc
373
/*
 * usad_y2 p0, p1, p2, p3, n0, n1, n2, n3
 *
 * Handles one 16-byte row for the vertical half-sample SAD.
 *   p0-p3  on entry: the four words of the previous row of the r2 block;
 *          destroyed (reused as scratch and for the r1 row words)
 *   n0-n3  on exit: the four words of the row just read from r2, ready to
 *          be passed as p0-p3 of the next invocation
 * Uses from the surrounding code: r0 as the SAD accumulator, r1 and r2 as
 * row pointers (both advanced by r3 at the end), lr as the 0x01010101 lane
 * mask.
 *
 * For each word the rounded-up average (a + b + 1) >> 1 of the previous and
 * current r2 rows is formed as uhadd8(a, b) + ((a ^ b) & 1) and its
 * absolute byte differences against the r1 row are accumulated with usada8.
 * No instruction in the body updates the condition flags tested by
 * conditional branches, so a subs placed before an invocation can be tested
 * after it.
 */
374 .macro  usad_y2         p0,  p1,  p2,  p3,  n0,  n1,  n2,  n3
375         ldr             \n0, [r2]                      /* current r2 row, word 0 */
376         eor             \n1, \p0, \n0
377         uhadd8          \p0, \p0, \n0
378         and             \n1, \n1, lr
379         ldr             \n2, [r1]                      /* r1 row, word 0 (n2 is only scratch here) */
380         uadd8           \p0, \p0, \n1                  /* rounded average, word 0 */
381         ldr             \n1, [r2, #4]                  /* current r2 row, word 1 */
382         usada8          r0,  \p0, \n2, r0
383         pld             [r1,  r3]
384         eor             \n3, \p1, \n1
385         uhadd8          \p1, \p1, \n1
386         and             \n3, \n3, lr
387         ldr             \p0, [r1, #4]                  /* r1 row, word 1 (p0 is free again) */
388         uadd8           \p1, \p1, \n3                  /* rounded average, word 1 */
389         ldr             \n2, [r2, #8]                  /* current r2 row, word 2 */
390         usada8          r0,  \p1, \p0, r0
391         pld             [r2,  r3]
392         eor             \p0, \p2, \n2
393         uhadd8          \p2, \p2, \n2
394         and             \p0, \p0, lr
395         ldr             \p1, [r1, #8]                  /* r1 row, word 2 */
396         uadd8           \p2, \p2, \p0                  /* rounded average, word 2 */
397         ldr             \n3, [r2, #12]                 /* current r2 row, word 3 */
398         usada8          r0,  \p2, \p1, r0
399         eor             \p1, \p3, \n3
400         uhadd8          \p3, \p3, \n3
401         and             \p1, \p1, lr
402         ldr             \p0,  [r1, #12]                /* r1 row, word 3 */
403         uadd8           \p3, \p3, \p1                  /* rounded average, word 3 */
404         add             r1,  r1,  r3                   /* both pointers move to the next row */
405         usada8          r0,  \p3, \p0,  r0
406         add             r2,  r2,  r3
407 .endm
408
/*
 * ff_pix_abs16_y2_armv6
 *
 * Returns in r0 the sum of absolute differences between a 16-byte-wide
 * block at r1 and a vertically averaged version of the block at r2, in
 * which every byte is the rounded-up average of a byte and the byte r3
 * bytes further on (the same column of the next row).
 *   r0       not read; used as the accumulator and return value
 *   r1       first block
 *   r2       block that is averaged
 *   r3       byte step between rows for both blocks
 *   [sp]     row count, read from the stack before anything is pushed
 * Two rows are processed per loop pass by expanding usad_y2 twice with the
 * register groups r4-r7 and r8-r11 swapping roles, so the row loaded by one
 * expansion becomes the "previous row" of the next.  The count is
 * decremented by 2 per pass and the loop repeats while it is greater than
 * zero; the body always runs at least once.
 */
409 function ff_pix_abs16_y2_armv6, export=1
410         pld             [r1]
411         pld             [r2]
412         ldr             r12, [sp]                      /* row count from the stack */
413         push            {r4-r11, lr}
414         mov             r0,  #0                        /* SAD accumulator */
415         mov             lr,  #1
416         orr             lr,  lr,  lr,  lsl #8
417         orr             lr,  lr,  lr,  lsl #16         /* lr = 0x01010101 */
418         ldr             r4,  [r2]                      /* r4-r7 = first row of the r2 block */
419         ldr             r5,  [r2, #4]
420         ldr             r6,  [r2, #8]
421         ldr             r7,  [r2, #12]
422         add             r2,  r2,  r3                   /* r2 now points at the second row */
423 1:
424         usad_y2         r4,  r5,  r6,  r7,  r8,  r9,  r10, r11
425         subs            r12, r12, #2                   /* flags pass through the next expansion to bgt */
426         usad_y2         r8,  r9,  r10, r11, r4,  r5,  r6,  r7
427         bgt             1b
428
429         pop             {r4-r11, pc}
430 .endfunc
431
/*
 * ff_pix_abs8_armv6
 *
 * Returns in r0 the sum of absolute differences between two blocks that
 * are 8 bytes wide.
 *   r0       not read; used as an accumulator and for the return value
 *   r1       first block (each row is read with a post-indexed ldrd)
 *   r2       second block (each row is read with word loads)
 *   r3       byte step between rows for both blocks
 *   [sp]     row count, read from the stack before anything is pushed
 * Two rows are processed per loop pass.  The count is decremented by 2 and
 * tested for exactly zero, so it is presumably a positive even number --
 * TODO confirm.
 *
 * Two accumulators (r0 and lr) are used alternately and added together at
 * the end.  The loop is software-pipelined: the r1 row for the next pass is
 * loaded only when another pass follows, and the last row pair is finished
 * at label 2.
 */
432 function ff_pix_abs8_armv6, export=1
433         pld             [r2, r3]
434         ldr             r12, [sp]                      /* row count from the stack */
435         push            {r4-r9, lr}
436         mov             r0,  #0                        /* accumulator for the low words */
437         mov             lr,  #0                        /* accumulator for the high words */
438         ldrd            r4,  r5,  [r1], r3             /* first r1 row, r1 moves to the next row */
439 1:
440         subs            r12, r12, #2                   /* flags are consumed by the beq below */
441         ldr             r7,  [r2, #4]                  /* r6:r7 = r2 row paired with r4:r5 */
442         ldr             r6,  [r2], r3
443         ldrd            r8,  r9,  [r1], r3             /* second r1 row of this pass */
444         usada8          r0,  r4,  r6,  r0
445         pld             [r2, r3]
446         usada8          lr,  r5,  r7,  lr
447         ldr             r7,  [r2, #4]                  /* r6:r7 = r2 row paired with r8:r9 */
448         ldr             r6,  [r2], r3
449         beq             2f                             /* last pair: no further r1 row is loaded */
450         ldrd            r4,  r5,  [r1], r3             /* first r1 row of the next pass */
451         usada8          r0,  r8,  r6,  r0
452         pld             [r2, r3]
453         usada8          lr,  r9,  r7,  lr
454         b               1b
455 2:
456         usada8          r0,  r8,  r6,  r0
457         usada8          lr,  r9,  r7,  lr
458         add             r0,  r0,  lr                   /* result = sum of both accumulators */
459         pop             {r4-r9, pc}
460 .endfunc