arm: hpeldsp: Move half-pel assembly from dsputil to hpeldsp
[ffmpeg.git] / libavcodec / arm / dsputil_armv6.S
1 /*
2  * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
3  *
4  * This file is part of Libav.
5  *
6  * Libav is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU Lesser General Public
8  * License as published by the Free Software Foundation; either
9  * version 2.1 of the License, or (at your option) any later version.
10  *
11  * Libav is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14  * Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with Libav; if not, write to the Free Software
18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19  */
20
21 #include "libavutil/arm/asm.S"
22
/*
 * ff_add_pixels_clamped_armv6
 *
 * For each of 8 rows: load eight signed 16-bit values from [r0], add them
 * to the eight bytes at [r1], saturate every sum to 0..255 and store the
 * eight resulting bytes back to [r1].
 *
 * In:    r0 = pointer to the 16-bit values (advanced 16 bytes per row)
 *        r1 = pointer to the byte row that is read and overwritten
 *        r2 = offset used for the pld look-ahead and handed to strd_post
 * Regs:  r3 = row counter; r4-r8, r12, lr = scratch
 *        (r4-r8 and lr are pushed on entry and restored on return)
 *
 * NOTE(review): strd_post is a macro that is not defined in this file
 * (presumably in libavutil/arm/asm.S); it looks like a post-indexed strd
 * that advances r1 by r2 and leaves the flags alone -- confirm.
 */
23 function ff_add_pixels_clamped_armv6, export=1
24         push            {r4-r8,lr}
25         mov             r3,  #8  /* r3 = rows remaining */
26 1:
27         ldm             r0!, {r4,r5,r12,lr}  /* four words = eight 16-bit values */
28         ldrd            r6,  r7,  [r1]  /* r6, r7 = the eight bytes of this row */
29         pkhbt           r8,  r4,  r5,  lsl #16  /* r8 = low halfwords of r4 and r5 */
30         pkhtb           r5,  r5,  r4,  asr #16  /* r5 = high halfwords of r4 and r5 */
31         pkhbt           r4,  r12, lr,  lsl #16  /* r4 = low halfwords of r12 and lr */
32         pkhtb           lr,  lr,  r12, asr #16  /* lr = high halfwords of r12 and lr */
33         pld             [r1, r2]
34         uxtab16         r8,  r8,  r6  /* add bytes 0 and 2 of r6 */
35         uxtab16         r5,  r5,  r6,  ror #8  /* add bytes 1 and 3 of r6 */
36         uxtab16         r4,  r4,  r7  /* add bytes 0 and 2 of r7 */
37         uxtab16         lr,  lr,  r7,  ror #8  /* add bytes 1 and 3 of r7 */
38         usat16          r8,  #8,  r8  /* clamp both halfwords to 0..255 */
39         usat16          r5,  #8,  r5
40         usat16          r4,  #8,  r4
41         usat16          lr,  #8,  lr
42         orr             r6,  r8,  r5,  lsl #8  /* re-interleave even and odd bytes */
43         orr             r7,  r4,  lr,  lsl #8
44         subs            r3,  r3,  #1  /* flags are tested by the bgt after the store */
45         strd_post       r6,  r7,  r1,  r2
46         bgt             1b
47         pop             {r4-r8,pc}
48 endfunc
49
/*
 * ff_get_pixels_armv6
 *
 * For each of 8 rows: read eight bytes from [r1], zero-extend each one to
 * 16 bits and store the eight halfwords (16 bytes) to [r0], keeping the
 * byte order of the source.
 *
 * In:    r0 = destination for the 16-bit values (advanced 16 bytes per row)
 *        r1 = source bytes
 *        r2 = offset used for the pld look-ahead and handed to ldrd_post
 * Regs:  lr = row counter; r4-r8, r12 = scratch
 *        (r4-r8 and lr are pushed on entry and restored on return)
 *
 * NOTE(review): ldrd_post is a macro that is not defined in this file
 * (presumably in libavutil/arm/asm.S); it looks like a post-indexed ldrd
 * that advances r1 by r2 -- confirm.
 */
50 function ff_get_pixels_armv6, export=1
51         pld             [r1, r2]
52         push            {r4-r8, lr}
53         mov             lr,  #8  /* lr = rows remaining */
54 1:
55         ldrd_post       r4,  r5,  r1,  r2  /* r4, r5 = eight source bytes */
56         subs            lr,  lr,  #1  /* nothing below alters N/Z/V, so bgt still sees this */
57         uxtb16          r6,  r4  /* r6 = bytes 0 and 2 of r4 as halfwords */
58         uxtb16          r4,  r4,  ror #8  /* r4 = bytes 1 and 3 of r4 as halfwords */
59         uxtb16          r12, r5  /* same split for the second word */
60         uxtb16          r8,  r5,  ror #8
61         pld             [r1, r2]
62         pkhbt           r5,  r6,  r4,  lsl #16  /* r5 = bytes 0,1 */
63         pkhtb           r6,  r4,  r6,  asr #16  /* r6 = bytes 2,3 */
64         pkhbt           r7,  r12, r8,  lsl #16  /* r7 = bytes 4,5 */
65         pkhtb           r12, r8,  r12, asr #16  /* r12 = bytes 6,7 */
66         stm             r0!, {r5,r6,r7,r12}
67         bgt             1b
68
69         pop             {r4-r8, pc}
70 endfunc
71
/*
 * ff_diff_pixels_armv6
 *
 * For each of 8 rows: read eight bytes from [r1] and eight bytes from
 * [r2], compute the per-byte difference ([r1] byte minus [r2] byte) as a
 * 16-bit value and store the eight halfwords (16 bytes) to [r0].
 *
 * In:    r0 = destination for the 16-bit differences (advanced 16 bytes
 *             per row)
 *        r1 = minuend bytes
 *        r2 = subtrahend bytes
 *        r3 = offset used for the pld look-ahead and handed to ldrd_post
 *             for both sources
 * Regs:  lr = row counter; r4-r9 = scratch
 *        (r4-r9 and lr are pushed on entry and restored on return)
 *
 * NOTE(review): ldrd_post is a macro that is not defined in this file
 * (presumably in libavutil/arm/asm.S); it looks like a post-indexed ldrd
 * that advances the base register by r3 -- confirm.
 */
72 function ff_diff_pixels_armv6, export=1
73         pld             [r1, r3]
74         pld             [r2, r3]
75         push            {r4-r9, lr}
76         mov             lr,  #8  /* lr = rows remaining */
77 1:
78         ldrd_post       r4,  r5,  r1,  r3  /* r4, r5 = eight bytes from the r1 source */
79         ldrd_post       r6,  r7,  r2,  r3  /* r6, r7 = eight bytes from the r2 source */
80         uxtb16          r8,  r4  /* bytes 0,2 of r4 */
81         uxtb16          r4,  r4,  ror #8  /* bytes 1,3 of r4 */
82         uxtb16          r9,  r6  /* bytes 0,2 of r6 */
83         uxtb16          r6,  r6,  ror #8  /* bytes 1,3 of r6 */
84         pld             [r1, r3]
85         ssub16          r9,  r8,  r9  /* differences for bytes 0 and 2 */
86         ssub16          r6,  r4,  r6  /* differences for bytes 1 and 3 */
87         uxtb16          r8,  r5  /* bytes 0,2 of r5 */
88         uxtb16          r5,  r5,  ror #8  /* bytes 1,3 of r5 */
89         pld             [r2, r3]
90         pkhbt           r4,  r9,  r6,  lsl #16  /* r4 = differences 0,1 */
91         pkhtb           r6,  r6,  r9,  asr #16  /* r6 = differences 2,3 */
92         uxtb16          r9,  r7  /* bytes 0,2 of r7 */
93         uxtb16          r7,  r7,  ror #8  /* bytes 1,3 of r7 */
94         ssub16          r9,  r8,  r9  /* differences for bytes 4 and 6 */
95         ssub16          r5,  r5,  r7  /* differences for bytes 5 and 7 */
96         subs            lr,  lr,  #1  /* nothing below alters N/Z/V, so bgt still sees this */
97         pkhbt           r8,  r9,  r5,  lsl #16  /* r8 = differences 4,5 */
98         pkhtb           r9,  r5,  r9,  asr #16  /* r9 = differences 6,7 */
99         stm             r0!, {r4,r6,r8,r9}
100         bgt             1b
101
102         pop             {r4-r9, pc}
103 endfunc
104
/*
 * ff_pix_abs16_armv6
 *
 * Sum of absolute byte differences between two blocks that are 16 bytes
 * wide; the number of rows is the word found at [sp] on entry.
 *
 * In:    r0   = ignored (overwritten by the row count before being read)
 *        r1   = first block, read four words at a time with ldm
 *        r2   = second block, read with individual ldr
 *        r3   = offset added to r1 and r2 after every row
 *        [sp] = number of rows (presumably the first stack-passed argument
 *               under the usual ARM calling convention -- confirm)
 * Out:   r0   = accumulated sum
 * Regs:  r12 and lr are two independent partial sums, added at the end;
 *        r4-r9 = scratch (r4-r9 and lr are pushed and restored).
 *
 * The loop leaves only when the counter hits exactly zero (beq), so a row
 * count of 0 on entry is not handled.
 */
105 function ff_pix_abs16_armv6, export=1
106         ldr             r0,  [sp]  /* r0 = row count; must be read before push moves sp */
107         push            {r4-r9, lr}
108         mov             r12, #0  /* partial sum A */
109         mov             lr,  #0  /* partial sum B */
110         ldm             r1,  {r4-r7}  /* first row of the r1 block */
111         ldr             r8,  [r2]  /* first word of the r2 block */
112 1:
113         ldr             r9,  [r2, #4]
114         pld             [r1, r3]
115         usada8          r12, r4,  r8,  r12  /* bytes 0-3 */
116         ldr             r8,  [r2, #8]
117         pld             [r2, r3]
118         usada8          lr,  r5,  r9,  lr  /* bytes 4-7 */
119         ldr             r9,  [r2, #12]
120         usada8          r12, r6,  r8,  r12  /* bytes 8-11 */
121         subs            r0,  r0,  #1  /* usada8 leaves the flags alone, so beq sees this */
122         usada8          lr,  r7,  r9,  lr  /* bytes 12-15 */
123         beq             2f
124         add             r1,  r1,  r3  /* step to the next row and preload it */
125         ldm             r1,  {r4-r7}
126         add             r2,  r2,  r3
127         ldr             r8,  [r2]
128         b               1b
129 2:
130         add             r0,  r12, lr  /* combine the two partial sums */
131         pop             {r4-r9, pc}
132 endfunc
133
/*
 * ff_pix_abs16_x2_armv6
 *
 * Sum of absolute byte differences between a 16-byte-wide block at r1 and
 * a horizontally interpolated version of the block at r2.  For every row,
 * 17 bytes are read from r2 (four words plus one ldrb at offset 16); each
 * of the 16 compared values is the rounded-up average of a byte and the
 * byte that follows it in the loaded word stream (the next byte in memory,
 * assuming little-endian loads).
 *
 * The average is built as uhadd8(a, b) + ((a ^ b) & 0x01) per byte, i.e.
 * (a + b + 1) >> 1, using lr = 0x01010101 as the per-byte mask.
 *
 * In:    r0   = ignored (cleared and used as the accumulator)
 *        r1   = block compared against the interpolated values
 *        r2   = block that is interpolated
 *        r3   = offset added to r1 and r2 after every row
 *        [sp] = number of rows (presumably the first stack-passed argument
 *               under the usual ARM calling convention -- confirm)
 * Out:   r0   = accumulated sum
 * Regs:  r12 = row counter, lr = 0x01010101, r4-r11 = scratch
 *        (r4-r11 and lr are pushed on entry and restored on return)
 */
134 function ff_pix_abs16_x2_armv6, export=1
135         ldr             r12, [sp]  /* r12 = row count; must be read before push moves sp */
136         push            {r4-r11, lr}
137         mov             r0,  #0  /* accumulator */
138         mov             lr,  #1
139         orr             lr,  lr,  lr,  lsl #8
140         orr             lr,  lr,  lr,  lsl #16  /* lr = 0x01010101 */
141 1:
142         ldr             r8,  [r2]  /* r8 = r2 word 0 */
143         ldr             r9,  [r2, #4]  /* r9 = r2 word 1 */
144         lsr             r10, r8,  #8
145         ldr             r4,  [r1]
146         lsr             r6,  r9,  #8
147         orr             r10, r10, r9,  lsl #24  /* r10 = word 0 shifted by one byte */
148         ldr             r5,  [r2, #8]  /* r5 = r2 word 2 */
149         eor             r11, r8,  r10
150         uhadd8          r7,  r8,  r10  /* truncating per-byte average */
151         orr             r6,  r6,  r5,  lsl #24  /* r6 = word 1 shifted by one byte */
152         and             r11, r11, lr  /* 1 where the byte sum was odd */
153         uadd8           r7,  r7,  r11  /* round the average up */
154         ldr             r8,  [r1, #4]
155         usada8          r0,  r4,  r7,  r0  /* bytes 0-3 */
156         eor             r7,  r9,  r6
157         lsr             r10, r5,  #8
158         and             r7,  r7,  lr
159         uhadd8          r4,  r9,  r6
160         ldr             r6,  [r2, #12]  /* r6 = r2 word 3 */
161         uadd8           r4,  r4,  r7
162         pld             [r1, r3]
163         orr             r10, r10, r6,  lsl #24  /* r10 = word 2 shifted by one byte */
164         usada8          r0,  r8,  r4,  r0  /* bytes 4-7 */
165         ldr             r4,  [r1, #8]
166         eor             r11, r5,  r10
167         ldrb            r7,  [r2, #16]  /* 17th byte, neighbour of byte 15 */
168         and             r11, r11, lr
169         uhadd8          r8,  r5,  r10
170         ldr             r5,  [r1, #12]
171         uadd8           r8,  r8,  r11
172         pld             [r2, r3]
173         lsr             r10, r6,  #8
174         usada8          r0,  r4,  r8,  r0  /* bytes 8-11 */
175         orr             r10, r10, r7,  lsl #24  /* r10 = word 3 shifted by one byte */
176         subs            r12,  r12,  #1  /* nothing below alters N/Z/V, so bgt still sees this */
177         eor             r11, r6,  r10
178         add             r1,  r1,  r3
179         uhadd8          r9,  r6,  r10
180         and             r11, r11, lr
181         uadd8           r9,  r9,  r11
182         add             r2,  r2,  r3
183         usada8          r0,  r5,  r9,  r0  /* bytes 12-15 */
184         bgt             1b
185
186         pop             {r4-r11, pc}
187 endfunc
188
/*
 * usad_y2 p0, p1, p2, p3, n0, n1, n2, n3
 *
 * Processes one 16-byte row for the vertical-interpolation SAD.
 *
 *   p0-p3  on entry: the four words of the previous row read from r2.
 *          They are overwritten (used for the averages and as scratch).
 *   n0-n3  on exit: the four words of the row just read from [r2], so the
 *          caller can pass them as p0-p3 of the next expansion.
 *
 * Each word of the previous row is averaged per byte with the matching
 * word of the current r2 row, rounding up:
 *   uhadd8(p, n) + ((p ^ n) & 0x01)  ==  (p + n + 1) >> 1 per byte.
 * The absolute differences between that average and the word from [r1]
 * are added to r0.
 *
 * Expects: lr = 0x01010101, r0 = running sum, r3 = row offset.
 * Side effects: r1 += r3, r2 += r3.  No instruction here changes the
 * N/Z/C/V flags, so a preceding subs stays valid across an expansion.
 */
189 .macro  usad_y2         p0,  p1,  p2,  p3,  n0,  n1,  n2,  n3
190         ldr             \n0, [r2]  /* current row, word 0 */
191         eor             \n1, \p0, \n0
192         uhadd8          \p0, \p0, \n0
193         and             \n1, \n1, lr  /* rounding bits */
194         ldr             \n2, [r1]
195         uadd8           \p0, \p0, \n1  /* rounded-up average, word 0 */
196         ldr             \n1, [r2, #4]  /* current row, word 1 */
197         usada8          r0,  \p0, \n2, r0
198         pld             [r1,  r3]
199         eor             \n3, \p1, \n1
200         uhadd8          \p1, \p1, \n1
201         and             \n3, \n3, lr
202         ldr             \p0, [r1, #4]  /* p0 is free now; reused as scratch */
203         uadd8           \p1, \p1, \n3  /* rounded-up average, word 1 */
204         ldr             \n2, [r2, #8]  /* current row, word 2 */
205         usada8          r0,  \p1, \p0, r0
206         pld             [r2,  r3]
207         eor             \p0, \p2, \n2
208         uhadd8          \p2, \p2, \n2
209         and             \p0, \p0, lr
210         ldr             \p1, [r1, #8]  /* p1 is free now; reused as scratch */
211         uadd8           \p2, \p2, \p0  /* rounded-up average, word 2 */
212         ldr             \n3, [r2, #12]  /* current row, word 3 */
213         usada8          r0,  \p2, \p1, r0
214         eor             \p1, \p3, \n3
215         uhadd8          \p3, \p3, \n3
216         and             \p1, \p1, lr
217         ldr             \p0,  [r1, #12]
218         uadd8           \p3, \p3, \p1  /* rounded-up average, word 3 */
219         add             r1,  r1,  r3
220         usada8          r0,  \p3, \p0,  r0
221         add             r2,  r2,  r3
222 .endm
223
/*
 * ff_pix_abs16_y2_armv6
 *
 * Sum of absolute byte differences between a 16-byte-wide block at r1 and
 * a vertically interpolated version of the block at r2: every compared
 * row is the rounded-up per-byte average of two consecutive rows of r2
 * (see the usad_y2 macro).
 *
 * In:    r0   = ignored (cleared and used as the accumulator)
 *        r1   = block compared against the interpolated values
 *        r2   = block that is interpolated; one row more than the row
 *               count is read from it
 *        r3   = offset between consecutive rows
 *        [sp] = number of rows (presumably the first stack-passed argument
 *               under the usual ARM calling convention -- confirm)
 * Out:   r0   = accumulated sum
 * Regs:  r12 = row counter, lr = 0x01010101, r4-r11 = row buffers/scratch
 *        (r4-r11 and lr are pushed on entry and restored on return)
 *
 * Two rows are handled per loop iteration and the counter drops by 2, so
 * an odd row count would process one row more than requested.
 */
224 function ff_pix_abs16_y2_armv6, export=1
225         pld             [r1]
226         pld             [r2]
227         ldr             r12, [sp]  /* r12 = row count; must be read before push moves sp */
228         push            {r4-r11, lr}
229         mov             r0,  #0  /* accumulator */
230         mov             lr,  #1
231         orr             lr,  lr,  lr,  lsl #8
232         orr             lr,  lr,  lr,  lsl #16  /* lr = 0x01010101 */
233         ldr             r4,  [r2]  /* r4-r7 = first row of the r2 block */
234         ldr             r5,  [r2, #4]
235         ldr             r6,  [r2, #8]
236         ldr             r7,  [r2, #12]
237         add             r2,  r2,  r3
238 1:
239         usad_y2         r4,  r5,  r6,  r7,  r8,  r9,  r10, r11  /* previous row in r4-r7, new row lands in r8-r11 */
240         subs            r12, r12, #2  /* the macro does not alter N/Z/V, so bgt still sees this */
241         usad_y2         r8,  r9,  r10, r11, r4,  r5,  r6,  r7  /* roles swapped: new row lands in r4-r7 */
242         bgt             1b
243
244         pop             {r4-r11, pc}
245 endfunc
246
/*
 * ff_pix_abs8_armv6
 *
 * Sum of absolute byte differences between two blocks that are 8 bytes
 * wide; the number of rows is the word found at [sp] on entry.
 *
 * In:    r0   = ignored (cleared and used as an accumulator)
 *        r1   = first block, read through ldrd_post
 *        r2   = second block, read through ldr / ldr_post
 *        r3   = offset handed to the *_post macros and used for pld
 *        [sp] = number of rows (presumably the first stack-passed argument
 *               under the usual ARM calling convention -- confirm)
 * Out:   r0   = accumulated sum
 * Regs:  r0 and lr are two partial sums, added at the end; r12 = row
 *        counter; r4-r9 = scratch (r4-r9 and lr are pushed and restored).
 *
 * Two rows are handled per iteration and the loop leaves only when the
 * counter hits exactly zero (beq), so the row count has to be a positive
 * even number.
 *
 * NOTE(review): ldrd_post and ldr_post are macros that are not defined in
 * this file (presumably in libavutil/arm/asm.S); they look like
 * post-indexed loads that advance the base register by r3 without
 * touching the flags -- confirm.
 */
247 function ff_pix_abs8_armv6, export=1
248         pld             [r2, r3]
249         ldr             r12, [sp]  /* r12 = row count; must be read before push moves sp */
250         push            {r4-r9, lr}
251         mov             r0,  #0  /* partial sum A */
252         mov             lr,  #0  /* partial sum B */
253         ldrd_post       r4,  r5,  r1,  r3  /* first row of the r1 block */
254 1:
255         subs            r12, r12, #2  /* flags are tested by the beq below */
256         ldr             r7,  [r2, #4]
257         ldr_post        r6,  r2,  r3  /* r6, r7 = row of the r2 block */
258         ldrd_post       r8,  r9,  r1,  r3  /* r8, r9 = next row of the r1 block */
259         usada8          r0,  r4,  r6,  r0
260         pld             [r2, r3]
261         usada8          lr,  r5,  r7,  lr
262         ldr             r7,  [r2, #4]
263         ldr_post        r6,  r2,  r3  /* r6, r7 = next row of the r2 block */
264         beq             2f
265         ldrd_post       r4,  r5,  r1,  r3  /* preload the r1 row for the next iteration */
266         usada8          r0,  r8,  r6,  r0
267         pld             [r2, r3]
268         usada8          lr,  r9,  r7,  lr
269         b               1b
270 2:
271         usada8          r0,  r8,  r6,  r0  /* last row, no further preloading */
272         usada8          lr,  r9,  r7,  lr
273         add             r0,  r0,  lr  /* combine the two partial sums */
274         pop             {r4-r9, pc}
275 endfunc
276
/*
 * ff_sse16_armv6
 *
 * Sum of squared byte differences between two blocks that are 16 bytes
 * wide; the number of rows is the word found at [sp] on entry.
 *
 * Bytes are zero-extended to halfword pairs with uxtb16, subtracted with
 * usub16, and each pair of differences is squared and accumulated with
 * smlad (r0 += lo * lo + hi * hi).
 *
 * In:    r0   = ignored (cleared and used as the accumulator)
 *        r1   = first block, read with ldrd
 *        r2   = second block, read with ldr
 *        r3   = offset added to r1 and r2 after every row
 *        [sp] = number of rows (presumably the first stack-passed argument
 *               under the usual ARM calling convention -- confirm)
 * Out:   r0   = accumulated sum
 * Regs:  r12 = row counter; r4-r9, lr = scratch
 *        (r4-r9 and lr are pushed on entry and restored on return)
 */
277 function ff_sse16_armv6, export=1
278         ldr             r12, [sp]  /* r12 = row count; must be read before push moves sp */
279         push            {r4-r9, lr}
280         mov             r0,  #0  /* accumulator */
281 1:
282         ldrd            r4,  r5,  [r1]  /* r1 block, bytes 0-7 */
283         ldr             r8,  [r2]  /* r2 block, bytes 0-3 */
284         uxtb16          lr,  r4
285         uxtb16          r4,  r4,  ror #8
286         uxtb16          r9,  r8
287         uxtb16          r8,  r8,  ror #8
288         ldr             r7,  [r2, #4]  /* r2 block, bytes 4-7 */
289         usub16          lr,  lr,  r9  /* differences, bytes 0 and 2 */
290         usub16          r4,  r4,  r8  /* differences, bytes 1 and 3 */
291         smlad           r0,  lr,  lr,  r0
292         uxtb16          r6,  r5
293         uxtb16          lr,  r5,  ror #8
294         uxtb16          r8,  r7
295         uxtb16          r9,  r7,  ror #8
296         smlad           r0,  r4,  r4,  r0
297         ldrd            r4,  r5,  [r1, #8]  /* r1 block, bytes 8-15 */
298         usub16          r6,  r6,  r8  /* differences, bytes 4 and 6 */
299         usub16          r8,  lr,  r9  /* differences, bytes 5 and 7 */
300         ldr             r7,  [r2, #8]  /* r2 block, bytes 8-11 */
301         smlad           r0,  r6,  r6,  r0
302         uxtb16          lr,  r4
303         uxtb16          r4,  r4,  ror #8
304         uxtb16          r9,  r7
305         uxtb16          r7,  r7, ror #8
306         smlad           r0,  r8,  r8,  r0
307         ldr             r8,  [r2, #12]  /* r2 block, bytes 12-15 */
308         usub16          lr,  lr,  r9  /* differences, bytes 8 and 10 */
309         usub16          r4,  r4,  r7  /* differences, bytes 9 and 11 */
310         smlad           r0,  lr,  lr,  r0
311         uxtb16          r6,  r5
312         uxtb16          r5,  r5,  ror #8
313         uxtb16          r9,  r8
314         uxtb16          r8,  r8,  ror #8
315         smlad           r0,  r4,  r4,  r0
316         usub16          r6,  r6,  r9  /* differences, bytes 12 and 14 */
317         usub16          r5,  r5,  r8  /* differences, bytes 13 and 15 */
318         smlad           r0,  r6,  r6,  r0
319         add             r1,  r1,  r3
320         add             r2,  r2,  r3
321         subs            r12, r12, #1  /* flags are tested by the bgt below */
322         smlad           r0,  r5,  r5,  r0
323         bgt             1b
324
325         pop             {r4-r9, pc}
326 endfunc
327
/*
 * ff_pix_norm1_armv6
 *
 * Sum of the squares of all bytes in a block of 16 rows, 16 bytes per
 * row (the row count is fixed at 16 in the code).
 *
 * Every word is split into two halfword pairs with uxtb16 and each pair
 * is squared and accumulated with smlad (lr += lo * lo + hi * hi).
 *
 * In:    r0 = block, read four words at a time with ldm
 *        r1 = offset added to r0 after every row
 * Out:   r0 = accumulated sum
 * Regs:  r12 = row counter, lr = accumulator, r2-r6 = scratch
 *        (r4-r6 and lr are pushed on entry and restored on return)
 */
328 function ff_pix_norm1_armv6, export=1
329         push            {r4-r6, lr}
330         mov             r12, #16  /* r12 = rows remaining */
331         mov             lr,  #0  /* accumulator */
332 1:
333         ldm             r0,  {r2-r5}  /* one row: 16 bytes */
334         uxtb16          r6,  r2  /* bytes 0,2 of word 0 */
335         uxtb16          r2,  r2,  ror #8  /* bytes 1,3 of word 0 */
336         smlad           lr,  r6,  r6,  lr
337         uxtb16          r6,  r3
338         smlad           lr,  r2,  r2,  lr
339         uxtb16          r3,  r3,  ror #8
340         smlad           lr,  r6,  r6,  lr
341         uxtb16          r6,  r4
342         smlad           lr,  r3,  r3,  lr
343         uxtb16          r4,  r4,  ror #8
344         smlad           lr,  r6,  r6,  lr
345         uxtb16          r6,  r5
346         smlad           lr,  r4,  r4,  lr
347         uxtb16          r5,  r5,  ror #8
348         smlad           lr,  r6,  r6,  lr
349         subs            r12, r12, #1  /* flags are tested by the bgt below */
350         add             r0,  r0,  r1  /* next row */
351         smlad           lr,  r5,  r5,  lr
352         bgt             1b
353
354         mov             r0,  lr  /* return the sum */
355         pop             {r4-r6, pc}
356 endfunc
357
/*
 * ff_pix_sum_armv6
 *
 * Sum of all bytes in a block of 16 rows, 16 bytes per row (the row
 * count is fixed at 16 in the code).
 *
 * The per-word byte sum is obtained with usada8 against lr = 0, i.e. the
 * sum of |byte - 0| over the four bytes of the word.
 *
 * In:    r0 = block, read with ldr / ldr_pre
 *        r1 = offset handed to ldr_pre to reach the next row
 * Out:   r0 = accumulated sum
 * Regs:  r2 and r3 are two partial sums, added at the end; r12 = row
 *        counter; lr = constant 0; r4-r7 = row data
 *        (r4-r7 and lr are pushed on entry and restored on return)
 *
 * NOTE(review): ldr_pre is a macro that is not defined in this file
 * (presumably in libavutil/arm/asm.S); it looks like a pre-indexed ldr
 * with writeback that advances r0 by r1 before loading and leaves the
 * flags alone -- confirm.
 */
358 function ff_pix_sum_armv6, export=1
359         push            {r4-r7, lr}
360         mov             r12, #16  /* r12 = rows remaining */
361         mov             r2,  #0  /* partial sum A */
362         mov             r3,  #0  /* partial sum B */
363         mov             lr,  #0  /* zero operand for usada8 */
364         ldr             r4,  [r0]  /* first word of the first row */
365 1:
366         subs            r12, r12, #1  /* flags are tested by beq and bgt below */
367         ldr             r5,  [r0, #4]
368         usada8          r2,  r4,  lr,  r2  /* bytes 0-3 */
369         ldr             r6,  [r0, #8]
370         usada8          r3,  r5,  lr,  r3  /* bytes 4-7 */
371         ldr             r7,  [r0, #12]
372         usada8          r2,  r6,  lr,  r2  /* bytes 8-11 */
373         beq             2f  /* last row: skip the preload of a following row */
374         ldr_pre         r4,  r0,  r1  /* first word of the next row */
375         usada8          r3,  r7,  lr,  r3  /* bytes 12-15 */
376         bgt             1b
377 2:
378         usada8          r3,  r7,  lr,  r3  /* bytes 12-15 of the last row */
379         add             r0,  r2,  r3  /* combine the two partial sums */
380         pop             {r4-r7, pc}
381 endfunc