Replace FFmpeg with Libav in licence headers
[ffmpeg.git] / libavcodec / arm / dsputil_armv6.S
1 /*
2  * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
3  *
4  * This file is part of Libav.
5  *
6  * Libav is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU Lesser General Public
8  * License as published by the Free Software Foundation; either
9  * version 2.1 of the License, or (at your option) any later version.
10  *
11  * Libav is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14  * Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with Libav; if not, write to the Free Software
18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19  */
20
21 #include "asm.S"
22
@ preserve8 is not defined in this file; it is presumably a macro supplied by
@ the included asm.S.  NOTE(review): confirm what it emits (the name suggests
@ an 8-byte stack alignment declaration).
23         preserve8
24
@ Everything that follows is assembled into the text (code) section.
25         .text
26
@ call_2x_pixels type, subp
@
@ Emits ff_TYPE_pixels16SUBP_armv6, which processes a 16-byte-wide block by
@ running the matching 8-byte-wide routine ff_TYPE_pixels8SUBP_armv6 twice:
@ first on the left half with the arguments exactly as received, then on the
@ right half with r0 and r1 advanced by 8 bytes.
@
@ The first pass is a real call (bl), so the argument registers r0-r3 and lr
@ are saved around it.  The second pass is a tail call (b): the 8-wide
@ routine returns directly to the original caller.
@
@ r4 is pushed purely as padding.  Six saved words keep sp a multiple of
@ 8 bytes at the bl (given an 8-byte aligned sp on entry), which the ARM
@ procedure call standard requires when calling an externally visible
@ function; saving only r0-r3 and lr would leave sp 4 bytes off.  Popping r4
@ again simply restores the value it had on entry.
27 .macro  call_2x_pixels  type, subp
28 function ff_\type\()_pixels16\subp\()_armv6, export=1
29         push            {r0-r4, lr}
30         bl              ff_\type\()_pixels8\subp\()_armv6
31         pop             {r0-r4, lr}
32         add             r0,  r0,  #8            @ step both pointers to the
33         add             r1,  r1,  #8            @ right-hand 8-byte column
34         b               ff_\type\()_pixels8\subp\()_armv6
35 endfunc
36 .endm
37
@ Instantiate the 16-byte-wide wrappers.  Each line defines
@ ff_TYPE_pixels16SUBP_armv6 on top of ff_TYPE_pixels8SUBP_armv6, e.g. the
@ first line (empty subp) defines ff_avg_pixels16_armv6 using
@ ff_avg_pixels8_armv6.
38 call_2x_pixels          avg
39 call_2x_pixels          put, _x2
40 call_2x_pixels          put, _y2
41 call_2x_pixels          put, _x2_no_rnd
42 call_2x_pixels          put, _y2_no_rnd
43
@ ff_put_pixels16_armv6: copy a block that is 16 bytes wide.
@   r0 = destination pointer
@   r1 = source pointer
@   r2 = byte stride, added to both pointers after every row
@   r3 = row count
@ Two rows are copied per loop iteration and the loop only exits when r3
@ reaches exactly zero, so r3 must be a non-zero even number.
@ The source is read with four word loads per row; the destination is
@ written with two doubleword stores (strd) per row.
@ r4-r11 are saved and restored.  No return value is produced.
44 function ff_put_pixels16_armv6, export=1
45         push            {r4-r11}
46 1:
@ First row of the pair: words 1-3 are fetched first, word 0 last with a
@ post-increment so that r1 already points at the following row.
47         ldr             r5,  [r1, #4]
48         ldr             r6,  [r1, #8]
49         ldr             r7,  [r1, #12]
50         ldr             r4,  [r1], r2
51         strd            r6,  r7,  [r0, #8]      @ upper 8 bytes of the row
@ The loads of the second row are interleaved with the stores of the first.
52         ldr             r9,  [r1, #4]
53         strd            r4,  r5,  [r0],  r2     @ lower 8 bytes, then next row
54         ldr             r10, [r1, #8]
55         ldr             r11, [r1, #12]
56         ldr             r8,  [r1], r2
57         strd            r10, r11, [r0, #8]
58         subs            r3,  r3,  #2            @ two rows done; sets Z for bne
59         strd            r8,  r9,  [r0],  r2
60         bne             1b
61
62         pop             {r4-r11}
63         bx              lr
64 endfunc
65
@ ff_put_pixels8_armv6: copy a block that is 8 bytes wide.
@   r0 = destination pointer
@   r1 = source pointer
@   r2 = byte stride, added to both pointers after every row
@   r3 = row count
@ Two rows are copied per loop iteration and the loop only exits when r3
@ reaches exactly zero, so r3 must be a non-zero even number.
@ Each row is read with two word loads and written with one doubleword
@ store (strd).  r4-r7 are saved and restored.  No return value is produced.
66 function ff_put_pixels8_armv6, export=1
67         push            {r4-r7}
68 1:
69         ldr             r5,  [r1, #4]           @ first row, bytes 4-7
70         ldr             r4,  [r1], r2           @ first row, bytes 0-3; r1 += stride
71         ldr             r7,  [r1, #4]           @ second row, bytes 4-7
72         strd            r4,  r5,  [r0],  r2
73         ldr             r6,  [r1], r2           @ second row, bytes 0-3; r1 += stride
74         subs            r3,  r3,  #2            @ two rows done; sets Z for bne
75         strd            r6,  r7,  [r0],  r2
76         bne             1b
77
78         pop             {r4-r7}
79         bx              lr
80 endfunc
81
@ ff_put_pixels8_x2_armv6: horizontal two-tap average with rounding,
@ 8 output bytes per row.  Output byte i is (src[i] + src[i+1] + 1) >> 1,
@ so nine source bytes are read per row.
@   r0 = destination pointer
@   r1 = source pointer
@   r2 = byte stride for both pointers
@   r3 = row count
@ Two rows are produced per iteration and the loop only exits when r3
@ reaches exactly zero, so r3 must be a non-zero even number.
@
@ Rounded average per byte lane: uhadd8 gives floor((a + b) / 2); the bit
@ that the halving drops is (a ^ b) & 1, which is added back with uadd8.
@ r12 holds the per-byte mask 0x01010101.
@ lr (r14) is used as a scratch register, so it is pushed and the function
@ returns by popping the saved value into pc.
82 function ff_put_pixels8_x2_armv6, export=1
83         push            {r4-r11, lr}
84         mov             r12, #1
85         orr             r12, r12, r12, lsl #8
86         orr             r12, r12, r12, lsl #16  @ r12 = 0x01010101
87 1:
88         ldr             r4,  [r1]               @ first row, bytes 0-3
89         subs            r3,  r3,  #2            @ flags are consumed by the final bne
90         ldr             r5,  [r1, #4]           @ first row, bytes 4-7
91         ldr             r7,  [r1, #5]           @ first row, bytes 5-8
@ r6 = (r4 >> 8) | (r5 << 24): bytes 1-4 of the first row (little-endian
@ byte order assumed), i.e. the right-hand neighbours of r4.
92         lsr             r6,  r4,  #8
93         ldr             r8,  [r1, r2]!          @ r1 += stride; second row, bytes 0-3
94         orr             r6,  r6,  r5,  lsl #24
95         ldr             r9,  [r1, #4]           @ second row, bytes 4-7
96         ldr             r11, [r1, #5]           @ second row, bytes 5-8
97         lsr             r10, r8,  #8
98         add             r1,  r1,  r2            @ r1 now points at the next row pair
99         orr             r10, r10, r9,  lsl #24  @ r10 = second row, bytes 1-4
@ First row: r4 = avg(bytes 0-3, bytes 1-4), r5 = avg(bytes 4-7, bytes 5-8).
100         eor             r14, r4,  r6
101         uhadd8          r4,  r4,  r6
102         eor             r6,  r5,  r7
103         uhadd8          r5,  r5,  r7
104         and             r14, r14, r12
105         and             r6,  r6,  r12
106         uadd8           r4,  r4,  r14
@ Second row, same computation on r8/r10 and r9/r11, interleaved with the
@ tail of the first row.
107         eor             r14, r8,  r10
108         uadd8           r5,  r5,  r6
109         eor             r6,  r9,  r11
110         uhadd8          r8,  r8,  r10
111         and             r14, r14, r12
112         uhadd8          r9,  r9,  r11
113         and             r6,  r6,  r12
114         uadd8           r8,  r8,  r14
115         strd            r4,  r5,  [r0],  r2     @ store first row, r0 += stride
116         uadd8           r9,  r9,  r6
117         strd            r8,  r9,  [r0],  r2     @ store second row, r0 += stride
118         bne             1b
119
120         pop             {r4-r11, pc}
121 endfunc
122
@ ff_put_pixels8_y2_armv6: vertical two-tap average with rounding,
@ 8 bytes per row.  Output row n, byte i is
@ (src[n][i] + src[n+1][i] + 1) >> 1.
@   r0 = destination pointer
@   r1 = source pointer
@   r2 = byte stride for both pointers
@   r3 = row count
@ Two rows are produced per iteration and the loop only exits when r3
@ reaches exactly zero, so r3 must be a non-zero even number.
@
@ Register roles inside the loop: r4/r5 and r6/r7 alternately hold the
@ older and the newer of the two source rows being averaged; r12 is the
@ per-byte mask 0x01010101 used to add back the bit dropped by uhadd8.
@ Note: the loads at the bottom of the loop also execute on the final
@ iteration, so one source row beyond the last one used for output is read.
123 function ff_put_pixels8_y2_armv6, export=1
124         push            {r4-r11}
125         mov             r12, #1
126         orr             r12, r12, r12, lsl #8
127         orr             r12, r12, r12, lsl #16  @ r12 = 0x01010101
128         ldr             r4,  [r1]               @ row 0, bytes 0-3
129         ldr             r5,  [r1, #4]           @ row 0, bytes 4-7
130         ldr             r6,  [r1, r2]!          @ r1 += stride; row 1, bytes 0-3
131         ldr             r7,  [r1, #4]           @ row 1, bytes 4-7
132 1:
133         subs            r3,  r3,  #2            @ flags are consumed by the final bne
@ First output row: rounded average of r4/r5 with r6/r7 into r8/r9.
134         uhadd8          r8,  r4,  r6
135         eor             r10, r4,  r6
136         uhadd8          r9,  r5,  r7
137         eor             r11, r5,  r7
138         and             r10, r10, r12
139         ldr             r4,  [r1, r2]!          @ next source row replaces r4
140         uadd8           r8,  r8,  r10
141         and             r11, r11, r12
142         uadd8           r9,  r9,  r11
143         ldr             r5,  [r1, #4]           @ next source row replaces r5
@ Second output row: rounded average of the new r4/r5 with r6/r7 into
@ r10/r11.  r6/r7 are reused as temporaries for the rounding bits and are
@ then reloaded with the following source row.
144         uhadd8          r10, r4,  r6
145         eor             r6,  r4,  r6
146         uhadd8          r11, r5,  r7
147         and             r6,  r6,  r12
148         eor             r7,  r5,  r7
149         uadd8           r10, r10, r6
150         and             r7,  r7,  r12
151         ldr             r6,  [r1, r2]!
152         uadd8           r11, r11, r7
153         strd            r8,  r9,  [r0],  r2
154         ldr             r7,  [r1, #4]
155         strd            r10, r11, [r0],  r2
156         bne             1b
157
158         pop             {r4-r11}
159         bx              lr
160 endfunc
161
@ ff_put_pixels8_x2_no_rnd_armv6: horizontal two-tap average without
@ rounding, 8 output bytes per row.  Output byte i is
@ (src[i] + src[i+1]) >> 1 (uhadd8 only, no correction term), so nine
@ source bytes are read per row.
@   r0 = destination pointer
@   r1 = source pointer
@   r2 = byte stride for both pointers
@   r3 = row count
@ Two rows are produced per iteration and the loop only exits when r3
@ reaches exactly zero, so r3 must be a non-zero even number.
@ r12 and lr (r14) are scratch; lr is pushed and the function returns by
@ popping the saved value into pc.
162 function ff_put_pixels8_x2_no_rnd_armv6, export=1
163         push            {r4-r9, lr}
164 1:
165         subs            r3,  r3,  #2            @ flags are consumed by the final bne
166         ldr             r4,  [r1]               @ first row, bytes 0-3
167         ldr             r5,  [r1, #4]           @ first row, bytes 4-7
168         ldr             r7,  [r1, #5]           @ first row, bytes 5-8
169         ldr             r8,  [r1, r2]!          @ r1 += stride; second row, bytes 0-3
170         ldr             r9,  [r1, #4]           @ second row, bytes 4-7
171         ldr             r14, [r1, #5]           @ second row, bytes 5-8
172         add             r1,  r1,  r2            @ r1 now points at the next row pair
@ Build bytes 1-4 of each row from the two aligned words (little-endian
@ byte order assumed): (lo >> 8) | (hi << 24).
173         lsr             r6,  r4,  #8
174         orr             r6,  r6,  r5,  lsl #24
175         lsr             r12, r8,  #8
176         orr             r12, r12, r9,  lsl #24
177         uhadd8          r4,  r4,  r6
178         uhadd8          r5,  r5,  r7
179         uhadd8          r8,  r8,  r12
180         uhadd8          r9,  r9,  r14
181         stm             r0,  {r4,r5}
182         add             r0,  r0,  r2
183         stm             r0,  {r8,r9}
184         add             r0,  r0,  r2
185         bne             1b
186
187         pop             {r4-r9, pc}
188 endfunc
189
@ ff_put_pixels8_y2_no_rnd_armv6: vertical two-tap average without
@ rounding, 8 bytes per row.  Output row n, byte i is
@ (src[n][i] + src[n+1][i]) >> 1 (uhadd8 only, no correction term).
@   r0 = destination pointer
@   r1 = source pointer
@   r2 = byte stride for both pointers
@   r3 = row count
@ Two rows are produced per iteration and the loop only exits when r3
@ reaches exactly zero, so r3 must be a non-zero even number.
@ r4/r5 and r6/r7 alternately hold the older and newer source row.
@ r12 and lr (r14) receive the second output row; lr is pushed and the
@ function returns by popping the saved value into pc.
@ Note: the loads into r6/r7 inside the loop also execute on the final
@ iteration, so one source row beyond the last one used for output is read.
190 function ff_put_pixels8_y2_no_rnd_armv6, export=1
191         push            {r4-r9, lr}
192         ldr             r4,  [r1]               @ row 0, bytes 0-3
193         ldr             r5,  [r1, #4]           @ row 0, bytes 4-7
194         ldr             r6,  [r1, r2]!          @ r1 += stride; row 1, bytes 0-3
195         ldr             r7,  [r1, #4]           @ row 1, bytes 4-7
196 1:
197         subs            r3,  r3,  #2            @ flags are consumed by the final bne
198         uhadd8          r8,  r4,  r6            @ first output row, bytes 0-3
199         ldr             r4,  [r1, r2]!          @ next source row replaces r4
200         uhadd8          r9,  r5,  r7            @ first output row, bytes 4-7
201         ldr             r5,  [r1, #4]
202         uhadd8          r12, r4,  r6            @ second output row, bytes 0-3
203         ldr             r6,  [r1, r2]!          @ following source row replaces r6
204         uhadd8          r14, r5,  r7            @ second output row, bytes 4-7
205         ldr             r7,  [r1, #4]
206         stm             r0,  {r8,r9}
207         add             r0,  r0,  r2
208         stm             r0,  {r12,r14}
209         add             r0,  r0,  r2
210         bne             1b
211
212         pop             {r4-r9, pc}
213 endfunc
214
@ ff_avg_pixels8_armv6: replace each destination byte by the rounded
@ average of itself and the corresponding source byte,
@ dst[i] = (dst[i] + src[i] + 1) >> 1, for a block 8 bytes wide.
@   r0 = destination pointer (read with ldrd and written with strd)
@   r1 = source pointer (read with word loads)
@   r2 = byte stride for both pointers
@   r3 = row count
@ Rows are handled in pairs.  r3 is decremented by 2 before the loop and
@ again inside it; the only exit is the beq in the middle of the loop body,
@ taken when the count has reached exactly zero, so r3 must be a non-zero
@ even number.
@
@ The loop is software pipelined: r4/r5 (destination) and r9/r10 (source)
@ of the first row of a pair are already loaded when the loop body starts,
@ and the loads for the next pair are issued while the second row of the
@ current pair is still being finished.  Label 2 is the epilogue that
@ finishes the last row without issuing further loads.
@ lr holds the per-byte mask 0x01010101 used to add back the bit dropped by
@ uhadd8; r8 and r12 hold the a ^ b terms.
215 function ff_avg_pixels8_armv6, export=1
216         pld             [r1, r2]
217         push            {r4-r10, lr}
218         mov             lr,  #1
219         orr             lr,  lr,  lr,  lsl #8
220         orr             lr,  lr,  lr,  lsl #16  @ lr = 0x01010101
221         ldrd            r4,  r5,  [r0]          @ destination row 0
222         ldr             r10, [r1, #4]           @ source row 0, bytes 4-7
223         ldr             r9,  [r1], r2           @ source row 0, bytes 0-3; r1 += stride
224         subs            r3,  r3,  #2            @ Z set here means this is the last pair
225 1:
226         pld             [r1, r2]
@ First row of the pair: r4/r5 = rounded average of r4/r5 and r9/r10.
227         eor             r8,  r4,  r9
228         uhadd8          r4,  r4,  r9
229         eor             r12, r5,  r10
230         ldrd            r6,  r7,  [r0, r2]      @ destination, second row of the pair
231         uhadd8          r5,  r5,  r10
232         and             r8,  r8,  lr
233         ldr             r10, [r1, #4]           @ source, second row of the pair
234         and             r12, r12, lr
235         uadd8           r4,  r4,  r8
236         ldr             r9,  [r1], r2
@ Second row of the pair: start averaging r6/r7 with r9/r10.
237         eor             r8,  r6,  r9
238         uadd8           r5,  r5,  r12
239         pld             [r1, r2,  lsl #1]
240         eor             r12, r7,  r10
241         uhadd8          r6,  r6,  r9
242         strd            r4,  r5,  [r0], r2      @ store first row, r0 += stride
243         uhadd8          r7,  r7,  r10
@ The flags tested here come from the most recent subs; none of the
@ instructions in between carries an S suffix.
244         beq             2f
@ More rows follow: finish the second row while loading the next pair.
245         and             r8,  r8,  lr
246         ldrd            r4,  r5,  [r0, r2]      @ destination, first row of next pair
247         uadd8           r6,  r6,  r8
248         ldr             r10, [r1, #4]           @ source, first row of next pair
249         and             r12, r12, lr
250         subs            r3,  r3,  #2
251         uadd8           r7,  r7,  r12
252         ldr             r9,  [r1], r2
253         strd            r6,  r7,  [r0], r2      @ store second row, r0 += stride
254         b               1b
255 2:
@ Last pair: finish and store the second row, no further loads.
256         and             r8,  r8,  lr
257         and             r12, r12, lr
258         uadd8           r6,  r6,  r8
259         uadd8           r7,  r7,  r12
260         strd            r6,  r7,  [r0], r2
261
262         pop             {r4-r10, pc}
263 endfunc
264
@ ff_add_pixels_clamped_armv6: add a block of 16-bit values to an 8x8 block
@ of bytes in place, clamping every result to 0..255.
@   r0 = pointer to the 16-bit values, read as 8 consecutive halfwords per
@        row (ldm of four words with writeback)
@   r1 = pointer to the byte block that is read and then overwritten
@   r2 = byte stride of the byte block
@ The row count is fixed at 8 (r3 is loaded with 8 and used as the counter).
@ r12 and lr are used as data registers; lr is pushed and the function
@ returns by popping the saved value into pc.
@ The halfword/byte lane descriptions below assume little-endian byte order.
265 function ff_add_pixels_clamped_armv6, export=1
266         push            {r4-r8,lr}
267         mov             r3,  #8
268 1:
269         ldm             r0!, {r4,r5,r12,lr}     @ 8 halfwords; r0 += 16
270         ldrd            r6,  r7,  [r1]          @ the 8 current bytes of this row
@ Regroup the halfwords so that they line up with the byte lanes that
@ uxtab16 extracts: low halves (values 0,2 and 4,6) in r8 and r4, high
@ halves (values 1,3 and 5,7) in r5 and lr.
271         pkhbt           r8,  r4,  r5,  lsl #16
272         pkhtb           r5,  r5,  r4,  asr #16
273         pkhbt           r4,  r12, lr,  lsl #16
274         pkhtb           lr,  lr,  r12, asr #16
275         pld             [r1, r2]
@ Add the zero-extended bytes: plain uxtab16 takes bytes 0 and 2 of the
@ word, the ror 8 form takes bytes 1 and 3.
276         uxtab16         r8,  r8,  r6
277         uxtab16         r5,  r5,  r6,  ror #8
278         uxtab16         r4,  r4,  r7
279         uxtab16         lr,  lr,  r7,  ror #8
@ Clamp each 16-bit sum to the unsigned 8-bit range.
280         usat16          r8,  #8,  r8
281         usat16          r5,  #8,  r5
282         usat16          r4,  #8,  r4
283         usat16          lr,  #8,  lr
@ Interleave the even and odd lanes back into two words of 4 bytes each.
284         orr             r6,  r8,  r5,  lsl #8
285         orr             r7,  r4,  lr,  lsl #8
286         subs            r3,  r3,  #1
287         strd            r6,  r7,  [r1],  r2     @ write the row back, r1 += stride
288         bgt             1b
289         pop             {r4-r8,pc}
290 endfunc
291
@ ff_get_pixels_armv6: expand an 8x8 block of bytes to 16-bit values.
@   r0 = destination; 8 halfwords (16 bytes) are stored per row with
@        writeback, so the output rows are contiguous
@   r1 = source byte pointer (read with ldrd)
@   r2 = source byte stride
@ The row count is fixed at 8 (lr is loaded with 8 and used as the counter).
@ Every byte is zero-extended to a halfword; the pkhbt/pkhtb pairs restore
@ the original byte order after uxtb16 has split even and odd bytes
@ (lane descriptions assume little-endian byte order).
@ lr is pushed and the function returns by popping the saved value into pc.
292 function ff_get_pixels_armv6, export=1
293         pld             [r1, r2]
294         push            {r4-r8, lr}
295         mov             lr,  #8
296 1:
297         ldrd            r4,  r5,  [r1],  r2     @ 8 source bytes; r1 += stride
298         subs            lr,  lr,  #1            @ flags are consumed by the final bgt
299         uxtb16          r6,  r4                 @ bytes 0 and 2
300         uxtb16          r4,  r4,  ror #8        @ bytes 1 and 3
301         uxtb16          r12, r5                 @ bytes 4 and 6
302         uxtb16          r8,  r5,  ror #8        @ bytes 5 and 7
303         pld             [r1, r2]
304         pkhbt           r5,  r6,  r4,  lsl #16  @ bytes 0,1 as halfwords
305         pkhtb           r6,  r4,  r6,  asr #16  @ bytes 2,3 as halfwords
306         pkhbt           r7,  r12, r8,  lsl #16  @ bytes 4,5 as halfwords
307         pkhtb           r12, r8,  r12, asr #16  @ bytes 6,7 as halfwords
308         stm             r0!, {r5,r6,r7,r12}
309         bgt             1b
310
311         pop             {r4-r8, pc}
312 endfunc
313
@ ff_diff_pixels_armv6: byte-wise difference of two 8x8 blocks, stored as
@ 16-bit values.  Output element i of a row is (byte i at r1) minus
@ (byte i at r2).
@   r0 = destination; 8 halfwords (16 bytes) are stored per row with
@        writeback, so the output rows are contiguous
@   r1 = first source byte pointer (minuend, read with ldrd)
@   r2 = second source byte pointer (subtrahend, read with ldrd)
@   r3 = byte stride applied to both sources
@ The row count is fixed at 8 (lr is loaded with 8 and used as the counter).
@ uxtb16 splits even and odd bytes into halfword lanes, ssub16 subtracts
@ lane-wise, and the pkhbt/pkhtb pairs restore the original element order
@ (lane descriptions assume little-endian byte order).
@ lr is pushed and the function returns by popping the saved value into pc.
314 function ff_diff_pixels_armv6, export=1
315         pld             [r1, r3]
316         pld             [r2, r3]
317         push            {r4-r9, lr}
318         mov             lr,  #8
319 1:
320         ldrd            r4,  r5,  [r1],  r3     @ 8 bytes of the first source
321         ldrd            r6,  r7,  [r2],  r3     @ 8 bytes of the second source
@ Bytes 0-3: even lanes in r8/r9, odd lanes in r4/r6.
322         uxtb16          r8,  r4
323         uxtb16          r4,  r4,  ror #8
324         uxtb16          r9,  r6
325         uxtb16          r6,  r6,  ror #8
326         pld             [r1, r3]
327         ssub16          r9,  r8,  r9            @ differences for bytes 0 and 2
328         ssub16          r6,  r4,  r6            @ differences for bytes 1 and 3
@ Bytes 4-7 are started while the first half is being repacked.
329         uxtb16          r8,  r5
330         uxtb16          r5,  r5,  ror #8
331         pld             [r2, r3]
332         pkhbt           r4,  r9,  r6,  lsl #16  @ differences 0,1
333         pkhtb           r6,  r6,  r9,  asr #16  @ differences 2,3
334         uxtb16          r9,  r7
335         uxtb16          r7,  r7,  ror #8
336         ssub16          r9,  r8,  r9            @ differences for bytes 4 and 6
337         ssub16          r5,  r5,  r7            @ differences for bytes 5 and 7
338         subs            lr,  lr,  #1            @ flags are consumed by the final bgt
339         pkhbt           r8,  r9,  r5,  lsl #16  @ differences 4,5
340         pkhtb           r9,  r5,  r9,  asr #16  @ differences 6,7
341         stm             r0!, {r4,r6,r8,r9}
342         bgt             1b
343
344         pop             {r4-r9, pc}
345 endfunc
346
@ ff_pix_abs16_armv6: sum of absolute differences between two blocks that
@ are 16 bytes wide.
@   r0      = not used; overwritten on entry with the row count
@   r1, r2  = pointers to the two blocks (r1 is read with ldm, r2 with
@             word loads)
@   r3      = byte stride applied to both pointers
@   [sp]    = row count, read from the first stack slot on entry (the
@             fifth argument if the caller follows the standard ARM
@             procedure call convention)
@ Returns the sum in r0.
@ The loop exits when the count reaches exactly zero after a decrement
@ (beq), so the row count must be at least 1.
@ Two accumulators (r12 and lr) are used alternately and added at the end.
@ lr is pushed and the function returns by popping the saved value into pc.
347 function ff_pix_abs16_armv6, export=1
348         ldr             r0,  [sp]               @ row count, read before the push moves sp
349         push            {r4-r9, lr}
350         mov             r12, #0
351         mov             lr,  #0
352         ldm             r1,  {r4-r7}            @ first row of the r1 block
353         ldr             r8,  [r2]               @ first word of the r2 block
354 1:
355         ldr             r9,  [r2, #4]
356         pld             [r1, r3]
357         usada8          r12, r4,  r8,  r12      @ bytes 0-3
358         ldr             r8,  [r2, #8]
359         pld             [r2, r3]
360         usada8          lr,  r5,  r9,  lr       @ bytes 4-7
361         ldr             r9,  [r2, #12]
362         usada8          r12, r6,  r8,  r12      @ bytes 8-11
363         subs            r0,  r0,  #1
364         usada8          lr,  r7,  r9,  lr       @ bytes 12-15
365         beq             2f
@ More rows follow: advance both pointers and preload the next row.
366         add             r1,  r1,  r3
367         ldm             r1,  {r4-r7}
368         add             r2,  r2,  r3
369         ldr             r8,  [r2]
370         b               1b
371 2:
372         add             r0,  r12, lr            @ combine the two accumulators
373         pop             {r4-r9, pc}
374 endfunc
375
@ ff_pix_abs16_x2_armv6: sum of absolute differences, 16 bytes wide, between
@ the block at r1 and a horizontally interpolated version of the block at
@ r2.  The interpolated byte i is (p[i] + p[i+1] + 1) >> 1 with p = r2, so
@ 17 bytes are read from r2 per row (the 17th with ldrb).
@   r0      = not used as an input; it is the accumulator and return value
@   r1      = pointer to the block compared against
@   r2      = pointer to the block that is interpolated
@   r3      = byte stride applied to both pointers
@   [sp]    = row count, read from the first stack slot on entry
@ Returns the sum in r0.  One row per iteration; the loop continues while
@ the decremented count is greater than zero (bgt).
@ lr holds the per-byte mask 0x01010101 used to add back the bit dropped by
@ uhadd8.  The shifted words are built as (lo >> 8) | (hi << 24), which
@ yields the bytes at offset +1 for little-endian byte order (assumed).
@ lr is pushed and the function returns by popping the saved value into pc.
376 function ff_pix_abs16_x2_armv6, export=1
377         ldr             r12, [sp]               @ row count, read before the push moves sp
378         push            {r4-r11, lr}
379         mov             r0,  #0
380         mov             lr,  #1
381         orr             lr,  lr,  lr,  lsl #8
382         orr             lr,  lr,  lr,  lsl #16  @ lr = 0x01010101
383 1:
@ Bytes 0-3: r10 = r2 bytes 1-4, r7 = rounded average of r8 and r10.
384         ldr             r8,  [r2]
385         ldr             r9,  [r2, #4]
386         lsr             r10, r8,  #8
387         ldr             r4,  [r1]
388         lsr             r6,  r9,  #8
389         orr             r10, r10, r9,  lsl #24
390         ldr             r5,  [r2, #8]
391         eor             r11, r8,  r10
392         uhadd8          r7,  r8,  r10
393         orr             r6,  r6,  r5,  lsl #24  @ r6 = r2 bytes 5-8
394         and             r11, r11, lr
395         uadd8           r7,  r7,  r11
396         ldr             r8,  [r1, #4]
397         usada8          r0,  r4,  r7,  r0
@ Bytes 4-7: r4 = rounded average of r9 and r6.
398         eor             r7,  r9,  r6
399         lsr             r10, r5,  #8
400         and             r7,  r7,  lr
401         uhadd8          r4,  r9,  r6
402         ldr             r6,  [r2, #12]
403         uadd8           r4,  r4,  r7
404         pld             [r1, r3]
405         orr             r10, r10, r6,  lsl #24  @ r10 = r2 bytes 9-12
406         usada8          r0,  r8,  r4,  r0
@ Bytes 8-11: r8 = rounded average of r5 and r10.
407         ldr             r4,  [r1, #8]
408         eor             r11, r5,  r10
409         ldrb            r7,  [r2, #16]          @ 17th byte, right neighbour of byte 15
410         and             r11, r11, lr
411         uhadd8          r8,  r5,  r10
412         ldr             r5,  [r1, #12]
413         uadd8           r8,  r8,  r11
414         pld             [r2, r3]
415         lsr             r10, r6,  #8
416         usada8          r0,  r4,  r8,  r0
@ Bytes 12-15: r10 = r2 bytes 13-16, r9 = rounded average of r6 and r10.
417         orr             r10, r10, r7,  lsl #24
418         subs            r12,  r12,  #1
419         eor             r11, r6,  r10
420         add             r1,  r1,  r3
421         uhadd8          r9,  r6,  r10
422         and             r11, r11, lr
423         uadd8           r9,  r9,  r11
424         add             r2,  r2,  r3
425         usada8          r0,  r5,  r9,  r0
426         bgt             1b
427
428         pop             {r4-r11, pc}
429 endfunc
430
@ usad_y2 p0, p1, p2, p3, n0, n1, n2, n3
@
@ Processes one 16-byte row for the vertically interpolated SAD.
@ On entry p0-p3 hold the four words of the previous row read from r2.
@ The macro loads the current row from r2, forms the rounded per-byte
@ average of the previous and current row, and accumulates into r0 the
@ absolute differences between that average and the 16 bytes at r1.
@ On exit n0-n3 hold the four words of the current r2 row (ready to be
@ passed as p0-p3 of the next expansion), p0-p3 are clobbered, and r1 and
@ r2 have each been advanced by r3.
@ Expects: r0 = running sum, lr = 0x01010101, r3 = stride.
@ The n and p registers are also reused as temporaries partway through, so
@ the statement order matters.  No instruction in the macro carries an
@ S suffix.
431 .macro  usad_y2         p0,  p1,  p2,  p3,  n0,  n1,  n2,  n3
@ Word 0: n0 = current row, n1 = rounding bits, n2 = word from r1.
432         ldr             \n0, [r2]
433         eor             \n1, \p0, \n0
434         uhadd8          \p0, \p0, \n0
435         and             \n1, \n1, lr
436         ldr             \n2, [r1]
437         uadd8           \p0, \p0, \n1
438         ldr             \n1, [r2, #4]
439         usada8          r0,  \p0, \n2, r0
440         pld             [r1,  r3]
@ Word 1: n1 = current row, n3 = rounding bits, p0 = word from r1.
441         eor             \n3, \p1, \n1
442         uhadd8          \p1, \p1, \n1
443         and             \n3, \n3, lr
444         ldr             \p0, [r1, #4]
445         uadd8           \p1, \p1, \n3
446         ldr             \n2, [r2, #8]
447         usada8          r0,  \p1, \p0, r0
448         pld             [r2,  r3]
@ Word 2: n2 = current row, p0 = rounding bits, p1 = word from r1.
449         eor             \p0, \p2, \n2
450         uhadd8          \p2, \p2, \n2
451         and             \p0, \p0, lr
452         ldr             \p1, [r1, #8]
453         uadd8           \p2, \p2, \p0
454         ldr             \n3, [r2, #12]
455         usada8          r0,  \p2, \p1, r0
@ Word 3: n3 = current row, p1 = rounding bits, p0 = word from r1.
456         eor             \p1, \p3, \n3
457         uhadd8          \p3, \p3, \n3
458         and             \p1, \p1, lr
459         ldr             \p0,  [r1, #12]
460         uadd8           \p3, \p3, \p1
461         add             r1,  r1,  r3
462         usada8          r0,  \p3, \p0,  r0
463         add             r2,  r2,  r3
464 .endm
465
@ ff_pix_abs16_y2_armv6: sum of absolute differences, 16 bytes wide, between
@ the block at r1 and a vertically interpolated version of the block at r2
@ (each byte averaged, with rounding, with the byte one stride below it).
@   r0      = not used as an input; it is the accumulator and return value
@   r1      = pointer to the block compared against
@   r2      = pointer to the block that is interpolated
@   r3      = byte stride applied to both pointers
@   [sp]    = row count, read from the first stack slot on entry
@ Returns the sum in r0.
@ Two rows are handled per iteration by expanding usad_y2 twice with the
@ two register sets r4-r7 and r8-r11 swapped, so the row loaded by one
@ expansion is the previous row of the next.  The count is decremented by
@ 2 and the loop continues while the result is greater than zero (bgt).
@ One more row is read from r2 than from r1 (the row preloaded before the
@ loop).
@ lr holds the per-byte mask 0x01010101; it is pushed and the function
@ returns by popping the saved value into pc.
466 function ff_pix_abs16_y2_armv6, export=1
467         pld             [r1]
468         pld             [r2]
469         ldr             r12, [sp]               @ row count, read before the push moves sp
470         push            {r4-r11, lr}
471         mov             r0,  #0
472         mov             lr,  #1
473         orr             lr,  lr,  lr,  lsl #8
474         orr             lr,  lr,  lr,  lsl #16  @ lr = 0x01010101
@ Preload the first r2 row into r4-r7 and step r2 to the second row.
475         ldr             r4,  [r2]
476         ldr             r5,  [r2, #4]
477         ldr             r6,  [r2, #8]
478         ldr             r7,  [r2, #12]
479         add             r2,  r2,  r3
480 1:
481         usad_y2         r4,  r5,  r6,  r7,  r8,  r9,  r10, r11
@ The flags set here are consumed by bgt after the second expansion; the
@ macro body contains no instruction with an S suffix.
482         subs            r12, r12, #2
483         usad_y2         r8,  r9,  r10, r11, r4,  r5,  r6,  r7
484         bgt             1b
485
486         pop             {r4-r11, pc}
487 endfunc
488
@ ff_pix_abs8_armv6: sum of absolute differences between two blocks that
@ are 8 bytes wide.
@   r0      = not used as an input; it is an accumulator and the return
@             value
@   r1      = pointer to the first block (read with ldrd)
@   r2      = pointer to the second block (read with word loads)
@   r3      = byte stride applied to both pointers
@   [sp]    = row count, read from the first stack slot on entry
@ Returns the sum in r0.
@ Rows are handled in pairs: the count is decremented by 2 at the top of
@ the loop and the only exit is the beq taken when it has reached exactly
@ zero, so the row count must be a non-zero even number.
@ Two accumulators are used (r0 for bytes 0-3, lr for bytes 4-7) and added
@ at the end.  The loop is software pipelined: the first r1 row is loaded
@ before the loop and the next one inside it only when more rows follow.
@ lr is pushed and the function returns by popping the saved value into pc.
489 function ff_pix_abs8_armv6, export=1
490         pld             [r2, r3]
491         ldr             r12, [sp]               @ row count, read before the push moves sp
492         push            {r4-r9, lr}
493         mov             r0,  #0
494         mov             lr,  #0
495         ldrd            r4,  r5,  [r1], r3      @ first row of the r1 block
496 1:
497         subs            r12, r12, #2            @ flags are consumed by beq below
498         ldr             r7,  [r2, #4]
499         ldr             r6,  [r2], r3           @ first row of the pair from r2
500         ldrd            r8,  r9,  [r1], r3      @ second row of the pair from r1
501         usada8          r0,  r4,  r6,  r0
502         pld             [r2, r3]
503         usada8          lr,  r5,  r7,  lr
504         ldr             r7,  [r2, #4]
505         ldr             r6,  [r2], r3           @ second row of the pair from r2
506         beq             2f
@ More rows follow: load the first r1 row of the next pair, then finish
@ the second row of this pair.
507         ldrd            r4,  r5,  [r1], r3
508         usada8          r0,  r8,  r6,  r0
509         pld             [r2, r3]
510         usada8          lr,  r9,  r7,  lr
511         b               1b
512 2:
@ Last pair: finish the second row without further loads.
513         usada8          r0,  r8,  r6,  r0
514         usada8          lr,  r9,  r7,  lr
515         add             r0,  r0,  lr            @ combine the two accumulators
516         pop             {r4-r9, pc}
517 endfunc
518
@ ff_sse16_armv6: sum of squared differences between two blocks that are
@ 16 bytes wide.
@   r0      = not used as an input; it is the accumulator and return value
@   r1      = pointer to the first block (read with ldrd)
@   r2      = pointer to the second block (read with word loads)
@   r3      = byte stride applied to both pointers
@   [sp]    = row count, read from the first stack slot on entry
@ Returns the sum in r0.  One row per iteration; the loop continues while
@ the decremented count is greater than zero (bgt).
@
@ Per word: uxtb16 splits the bytes into two registers of two halfword
@ lanes each (even bytes, and odd bytes via ror 8), usub16 forms the
@ lane-wise differences, and smlad with the same register for both
@ operands adds the squares of both lanes to r0.
@ lr is used as a data register; it is pushed and the function returns by
@ popping the saved value into pc.
519 function ff_sse16_armv6, export=1
520         ldr             r12, [sp]               @ row count, read before the push moves sp
521         push            {r4-r9, lr}
522         mov             r0,  #0
523 1:
@ Bytes 0-3 (r4 against r8).
524         ldrd            r4,  r5,  [r1]
525         ldr             r8,  [r2]
526         uxtb16          lr,  r4
527         uxtb16          r4,  r4,  ror #8
528         uxtb16          r9,  r8
529         uxtb16          r8,  r8,  ror #8
530         ldr             r7,  [r2, #4]
531         usub16          lr,  lr,  r9
532         usub16          r4,  r4,  r8
533         smlad           r0,  lr,  lr,  r0
@ Bytes 4-7 (r5 against r7).
534         uxtb16          r6,  r5
535         uxtb16          lr,  r5,  ror #8
536         uxtb16          r8,  r7
537         uxtb16          r9,  r7,  ror #8
538         smlad           r0,  r4,  r4,  r0
539         ldrd            r4,  r5,  [r1, #8]
540         usub16          r6,  r6,  r8
541         usub16          r8,  lr,  r9
542         ldr             r7,  [r2, #8]
543         smlad           r0,  r6,  r6,  r0
@ Bytes 8-11 (r4 against r7).
544         uxtb16          lr,  r4
545         uxtb16          r4,  r4,  ror #8
546         uxtb16          r9,  r7
547         uxtb16          r7,  r7, ror #8
548         smlad           r0,  r8,  r8,  r0
549         ldr             r8,  [r2, #12]
550         usub16          lr,  lr,  r9
551         usub16          r4,  r4,  r7
552         smlad           r0,  lr,  lr,  r0
@ Bytes 12-15 (r5 against r8).
553         uxtb16          r6,  r5
554         uxtb16          r5,  r5,  ror #8
555         uxtb16          r9,  r8
556         uxtb16          r8,  r8,  ror #8
557         smlad           r0,  r4,  r4,  r0
558         usub16          r6,  r6,  r9
559         usub16          r5,  r5,  r8
560         smlad           r0,  r6,  r6,  r0
561         add             r1,  r1,  r3
562         add             r2,  r2,  r3
563         subs            r12, r12, #1
564         smlad           r0,  r5,  r5,  r0
565         bgt             1b
566
567         pop             {r4-r9, pc}
568 endfunc
569
@ ff_pix_norm1_armv6: sum of the squares of all bytes in a block that is
@ 16 bytes wide and 16 rows high.
@   r0 = pointer to the block (read with ldm, four words per row)
@   r1 = byte stride
@ The row count is fixed at 16 (r12 is loaded with 16 and used as the
@ counter).  Returns the sum in r0.
@ uxtb16 splits each word into even and odd bytes as halfword lanes; smlad
@ with the same register for both operands adds the squares of both lanes
@ to the accumulator in lr.
@ r2 and r3 are overwritten by the ldm.  lr is pushed and the function
@ returns by popping the saved value into pc.
570 function ff_pix_norm1_armv6, export=1
571         push            {r4-r6, lr}
572         mov             r12, #16
573         mov             lr,  #0
574 1:
575         ldm             r0,  {r2-r5}            @ 16 bytes of the current row
576         uxtb16          r6,  r2
577         uxtb16          r2,  r2,  ror #8
578         smlad           lr,  r6,  r6,  lr
579         uxtb16          r6,  r3
580         smlad           lr,  r2,  r2,  lr
581         uxtb16          r3,  r3,  ror #8
582         smlad           lr,  r6,  r6,  lr
583         uxtb16          r6,  r4
584         smlad           lr,  r3,  r3,  lr
585         uxtb16          r4,  r4,  ror #8
586         smlad           lr,  r6,  r6,  lr
587         uxtb16          r6,  r5
588         smlad           lr,  r4,  r4,  lr
589         uxtb16          r5,  r5,  ror #8
590         smlad           lr,  r6,  r6,  lr
591         subs            r12, r12, #1            @ flags are consumed by the final bgt
592         add             r0,  r0,  r1            @ next row
593         smlad           lr,  r5,  r5,  lr
594         bgt             1b
595
596         mov             r0,  lr
597         pop             {r4-r6, pc}
598 endfunc
599
@ ff_pix_sum_armv6: sum of all bytes in a block that is 16 bytes wide and
@ 16 rows high.
@   r0 = pointer to the block (read with word loads)
@   r1 = byte stride
@ The row count is fixed at 16 (r12 is loaded with 16 and used as the
@ counter).  Returns the sum in r0.
@ The bytes are summed with usada8 against a zero register (lr = 0), which
@ adds the four bytes of a word to the accumulator.  Two accumulators (r2
@ and r3) are used alternately and added at the end.
@ The loop is software pipelined: the first word of the next row is loaded
@ only when another row follows, and label 2 finishes the last row.
@ lr is pushed and the function returns by popping the saved value into pc.
600 function ff_pix_sum_armv6, export=1
601         push            {r4-r7, lr}
602         mov             r12, #16
603         mov             r2,  #0
604         mov             r3,  #0
605         mov             lr,  #0                 @ zero operand for usada8
606         ldr             r4,  [r0]               @ first word of the first row
607 1:
608         subs            r12, r12, #1            @ flags are consumed by beq and bgt below
609         ldr             r5,  [r0, #4]
610         usada8          r2,  r4,  lr,  r2       @ bytes 0-3
611         ldr             r6,  [r0, #8]
612         usada8          r3,  r5,  lr,  r3       @ bytes 4-7
613         ldr             r7,  [r0, #12]
614         usada8          r2,  r6,  lr,  r2       @ bytes 8-11
615         beq             2f
616         ldr             r4,  [r0, r1]!          @ r0 += stride; first word of next row
617         usada8          r3,  r7,  lr,  r3       @ bytes 12-15
618         bgt             1b
619 2:
620         usada8          r3,  r7,  lr,  r3       @ bytes 12-15 of the last row
621         add             r0,  r2,  r3            @ combine the two accumulators
622         pop             {r4-r7, pc}
623 endfunc