Merge commit 'f46bb608d9d76c543e4929dc8cffe36b84bd789e'
[ffmpeg.git] / libavcodec / arm / dsputil_armv6.S
1 /*
2  * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
3  *
4  * This file is part of FFmpeg.
5  *
6  * FFmpeg is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU Lesser General Public
8  * License as published by the Free Software Foundation; either
9  * version 2.1 of the License, or (at your option) any later version.
10  *
11  * FFmpeg is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14  * Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with FFmpeg; if not, write to the Free Software
18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19  */
20
21 #include "libavutil/arm/asm.S"
22
/*
 * ff_pix_abs16_armv6: sum of absolute byte differences (SAD) between two
 * 16-byte-wide blocks, accumulated over a number of rows.
 *
 * Registers as used by the code:
 *   r0    incoming value is never read; it is overwritten with the row count
 *   r1    first block; each row is fetched with one ldm of four words
 *   r2    second block; each row is fetched with four separate ldr
 *   r3    byte offset added to r1 and r2 to step to the next row
 *   [sp]  row count at entry (presumably the fifth C argument under the
 *         ARM procedure call standard -- confirm against the C prototype)
 * Returns r0 = r12 + lr, the sum of the two partial accumulators.
 *
 * r4-r9 and lr are saved and restored; r12 and lr serve as accumulators.
 * The loop leaves only when the decremented count is exactly zero, so a
 * count of 0 at entry is not handled (the counter would wrap).
 * NOTE(review): r1 is read with ldm while r2 uses plain ldr; this looks
 * like r1 is expected to be word-aligned and r2 is not -- confirm.
 */
23 function ff_pix_abs16_armv6, export=1
24         ldr             r0,  [sp]                    /* r0 = row count, read before push moves sp */
25         push            {r4-r9, lr}
26         mov             r12, #0                      /* accumulator for words 0 and 2 of each row */
27         mov             lr,  #0                      /* accumulator for words 1 and 3 of each row */
28         ldm             r1,  {r4-r7}                 /* r4-r7 = first 16-byte row from r1 */
29         ldr             r8,  [r2]                    /* r8 = word 0 of the first row from r2 */
30 1:
31         ldr             r9,  [r2, #4]
32         pld             [r1, r3]                     /* prefetch hint for the next r1 row */
33         usada8          r12, r4,  r8,  r12           /* r12 += SAD of word 0 */
34         ldr             r8,  [r2, #8]
35         pld             [r2, r3]                     /* prefetch hint for the next r2 row */
36         usada8          lr,  r5,  r9,  lr            /* lr  += SAD of word 1 */
37         ldr             r9,  [r2, #12]
38         usada8          r12, r6,  r8,  r12           /* r12 += SAD of word 2 */
39         subs            r0,  r0,  #1                 /* one row done; Z set when none remain */
40         usada8          lr,  r7,  r9,  lr            /* lr  += SAD of word 3 */
41         beq             2f                           /* uses the flags from the subs above */
42         add             r1,  r1,  r3                 /* advance both pointers and preload the next row */
43         ldm             r1,  {r4-r7}
44         add             r2,  r2,  r3
45         ldr             r8,  [r2]
46         b               1b
47 2:
48         add             r0,  r12, lr                 /* combine the two partial sums into the result */
49         pop             {r4-r9, pc}
50 endfunc
51
/*
 * ff_pix_abs16_x2_armv6: 16-byte-wide SAD in which every byte of the r1 row
 * is compared against the rounded-up average of two horizontally adjacent
 * bytes of the r2 row, i.e. (b[i] + b[i+1] + 1) >> 1.
 *
 * Registers as used by the code:
 *   r0    incoming value is never read; used as the accumulator and result
 *   r1    first block, four words read per row
 *   r2    second block, four words plus one extra byte (offset 16) per row
 *   r3    byte offset added to r1 and r2 after each row
 *   [sp]  row count at entry (presumably the fifth C argument -- confirm)
 *   lr    constant 0x01010101, used to isolate the per-byte rounding bit
 *
 * The average of words x and y is computed per byte as
 *   uhadd8(x, y) + ((x ^ y) & 0x01010101)
 * where uhadd8 gives the truncated mean and the masked xor restores the
 * bit lost when the byte sum is odd.
 * The "one byte later" word is built as (w >> 8) | (next << 24); this
 * corresponds to b[i+1] only for little-endian loads -- assumption, confirm
 * for the target configuration.
 *
 * The loop body always runs once before the count is tested.
 */
52 function ff_pix_abs16_x2_armv6, export=1
53         ldr             r12, [sp]                    /* r12 = row count, read before push moves sp */
54         push            {r4-r11, lr}
55         mov             r0,  #0                      /* SAD accumulator */
56         mov             lr,  #1
57         orr             lr,  lr,  lr,  lsl #8
58         orr             lr,  lr,  lr,  lsl #16       /* lr = 0x01010101 */
59 1:
60         ldr             r8,  [r2]                    /* r8 = r2 bytes 0..3 */
61         ldr             r9,  [r2, #4]                /* r9 = r2 bytes 4..7 */
62         lsr             r10, r8,  #8
63         ldr             r4,  [r1]                    /* r4 = r1 bytes 0..3 */
64         lsr             r6,  r9,  #8
65         orr             r10, r10, r9,  lsl #24       /* r10 = r8 shifted by one byte, topped up from r9 */
66         ldr             r5,  [r2, #8]                /* r5 = r2 bytes 8..11 */
67         eor             r11, r8,  r10
68         uhadd8          r7,  r8,  r10                /* truncated per-byte mean */
69         orr             r6,  r6,  r5,  lsl #24       /* r6 = r9 shifted by one byte, topped up from r5 */
70         and             r11, r11, lr                 /* per-byte rounding bit */
71         uadd8           r7,  r7,  r11                /* r7 = rounded average for word 0 */
72         ldr             r8,  [r1, #4]                /* r8 = r1 bytes 4..7 */
73         usada8          r0,  r4,  r7,  r0            /* accumulate word 0 */
74         eor             r7,  r9,  r6
75         lsr             r10, r5,  #8
76         and             r7,  r7,  lr
77         uhadd8          r4,  r9,  r6
78         ldr             r6,  [r2, #12]               /* r6 = r2 bytes 12..15 */
79         uadd8           r4,  r4,  r7                 /* r4 = rounded average for word 1 */
80         pld             [r1, r3]
81         orr             r10, r10, r6,  lsl #24       /* r10 = r5 shifted by one byte, topped up from r6 */
82         usada8          r0,  r8,  r4,  r0            /* accumulate word 1 */
83         ldr             r4,  [r1, #8]                /* r4 = r1 bytes 8..11 */
84         eor             r11, r5,  r10
85         ldrb            r7,  [r2, #16]               /* 17th byte of the r2 row, needed for the last average */
86         and             r11, r11, lr
87         uhadd8          r8,  r5,  r10
88         ldr             r5,  [r1, #12]               /* r5 = r1 bytes 12..15 */
89         uadd8           r8,  r8,  r11                /* r8 = rounded average for word 2 */
90         pld             [r2, r3]
91         lsr             r10, r6,  #8
92         usada8          r0,  r4,  r8,  r0            /* accumulate word 2 */
93         orr             r10, r10, r7,  lsl #24       /* r10 = r6 shifted by one byte, topped up with byte 16 */
94         subs            r12,  r12,  #1               /* row counter; flags consumed by the bgt below */
95         eor             r11, r6,  r10
96         add             r1,  r1,  r3                 /* step both pointers to the next row */
97         uhadd8          r9,  r6,  r10
98         and             r11, r11, lr
99         uadd8           r9,  r9,  r11                /* r9 = rounded average for word 3 */
100         add             r2,  r2,  r3
101         usada8          r0,  r5,  r9,  r0            /* accumulate word 3 */
102         bgt             1b                           /* no S-suffixed instruction since the subs, so its flags still apply */
103
104         pop             {r4-r11, pc}
105 endfunc
106
/*
 * usad_y2: process one 16-byte row for the vertically averaged SAD.
 *
 * Arguments (all register names):
 *   p0-p3  on entry: the four words of the previous r2 row.
 *          Clobbered: they are reused for the averages and for the r1 words.
 *   n0-n3  scratch on entry; on exit they hold the four words of the r2 row
 *          just read, so the caller can pass them as p0-p3 next time.
 *
 * Implicit registers:
 *   r0  SAD accumulator (updated)
 *   r1  first block pointer, advanced by r3 at the end
 *   r2  second block pointer, advanced by r3 at the end
 *   r3  byte offset between rows
 *   lr  must hold 0x01010101 (per-byte rounding-bit mask)
 *
 * For each word the rounded-up per-byte average of the previous and current
 * r2 rows is formed as uhadd8(prev, cur) + ((prev ^ cur) & lr), and usada8
 * adds its absolute difference against the matching r1 word to r0.
 * No instruction in the macro has an S suffix.
 */
107 .macro  usad_y2         p0,  p1,  p2,  p3,  n0,  n1,  n2,  n3
108         ldr             \n0, [r2]                    /* n0 = current r2 word 0 (kept for the next call) */
109         eor             \n1, \p0, \n0
110         uhadd8          \p0, \p0, \n0
111         and             \n1, \n1, lr
112         ldr             \n2, [r1]                    /* n2 = r1 word 0 (temporary) */
113         uadd8           \p0, \p0, \n1                /* p0 = rounded average, word 0 */
114         ldr             \n1, [r2, #4]                /* n1 = current r2 word 1 (kept) */
115         usada8          r0,  \p0, \n2, r0
116         pld             [r1,  r3]
117         eor             \n3, \p1, \n1
118         uhadd8          \p1, \p1, \n1
119         and             \n3, \n3, lr
120         ldr             \p0, [r1, #4]                /* p0 is free again: reuse it for r1 word 1 */
121         uadd8           \p1, \p1, \n3                /* p1 = rounded average, word 1 */
122         ldr             \n2, [r2, #8]                /* n2 = current r2 word 2 (kept) */
123         usada8          r0,  \p1, \p0, r0
124         pld             [r2,  r3]
125         eor             \p0, \p2, \n2
126         uhadd8          \p2, \p2, \n2
127         and             \p0, \p0, lr
128         ldr             \p1, [r1, #8]                /* p1 reused for r1 word 2 */
129         uadd8           \p2, \p2, \p0                /* p2 = rounded average, word 2 */
130         ldr             \n3, [r2, #12]               /* n3 = current r2 word 3 (kept) */
131         usada8          r0,  \p2, \p1, r0
132         eor             \p1, \p3, \n3
133         uhadd8          \p3, \p3, \n3
134         and             \p1, \p1, lr
135         ldr             \p0,  [r1, #12]              /* p0 reused for r1 word 3 */
136         uadd8           \p3, \p3, \p1                /* p3 = rounded average, word 3 */
137         add             r1,  r1,  r3                 /* step to the next r1 row */
138         usada8          r0,  \p3, \p0,  r0
139         add             r2,  r2,  r3                 /* step to the next r2 row */
140 .endm
141
/*
 * ff_pix_abs16_y2_armv6: 16-byte-wide SAD in which every row of the r1 block
 * is compared against the rounded-up per-byte average of two vertically
 * adjacent rows of the r2 block (row k and row k+1).
 *
 * Registers as used by the code:
 *   r0    incoming value is never read; used as the accumulator and result
 *   r1    first block
 *   r2    second block; one row more than the number of processed rows is read
 *   r3    byte offset between rows
 *   [sp]  row count at entry (presumably the fifth C argument -- confirm)
 *   lr    constant 0x01010101 required by usad_y2
 *
 * The first r2 row is preloaded into r4-r7. Each loop iteration then expands
 * usad_y2 twice with the r4-r7 / r8-r11 register sets swapped, so the row
 * read by one expansion becomes the "previous" row of the next.
 * Two rows are handled per iteration and the count is reduced by 2, so the
 * count is presumably expected to be even and >= 2; an odd count would
 * process one row more than requested.
 */
142 function ff_pix_abs16_y2_armv6, export=1
143         pld             [r1]
144         pld             [r2]
145         ldr             r12, [sp]                    /* r12 = row count, read before push moves sp */
146         push            {r4-r11, lr}
147         mov             r0,  #0                      /* SAD accumulator */
148         mov             lr,  #1
149         orr             lr,  lr,  lr,  lsl #8
150         orr             lr,  lr,  lr,  lsl #16       /* lr = 0x01010101 */
151         ldr             r4,  [r2]                    /* r4-r7 = first row of the r2 block */
152         ldr             r5,  [r2, #4]
153         ldr             r6,  [r2, #8]
154         ldr             r7,  [r2, #12]
155         add             r2,  r2,  r3                 /* r2 now points at the second row */
156 1:
157         usad_y2         r4,  r5,  r6,  r7,  r8,  r9,  r10, r11
158         subs            r12, r12, #2                 /* flags survive the expansion below (it has no S-suffixed instruction) */
159         usad_y2         r8,  r9,  r10, r11, r4,  r5,  r6,  r7
160         bgt             1b
161
162         pop             {r4-r11, pc}
163 endfunc
164
/*
 * ff_pix_abs8_armv6: sum of absolute byte differences (SAD) between two
 * 8-byte-wide blocks, two rows per loop iteration.
 *
 * Registers as used by the code:
 *   r0    incoming value is never read; accumulator for word 0, and result
 *   r1    first block, read through ldrd_post
 *   r2    second block, read with ldr [r2, #4] followed by ldr_post
 *   r3    row step passed to the *_post macros and used for pld
 *   [sp]  row count at entry (presumably the fifth C argument -- confirm)
 *   lr    accumulator for word 1
 *
 * ldrd_post and ldr_post are not defined in this file (presumably they come
 * from the included libavutil/arm/asm.S). The code relies on them loading
 * from the base register and then advancing it by r3, and on them leaving
 * the condition flags untouched -- confirm against their definitions.
 *
 * The loop is software-pipelined: the r1 row for the next step is loaded
 * before the current row is accumulated. The exit test is "count - 2 == 0",
 * so the count is expected to be a positive even number; an odd or zero
 * count would not hit the exit condition at the intended point.
 */
165 function ff_pix_abs8_armv6, export=1
166         pld             [r2, r3]
167         ldr             r12, [sp]                    /* r12 = row count, read before push moves sp */
168         push            {r4-r9, lr}
169         mov             r0,  #0                      /* accumulator for the first word of each row */
170         mov             lr,  #0                      /* accumulator for the second word of each row */
171         ldrd_post       r4,  r5,  r1,  r3            /* r4,r5 = first r1 row */
172 1:
173         subs            r12, r12, #2                 /* two rows per iteration; flags used by the beq below */
174         ldr             r7,  [r2, #4]                /* second word first, while r2 still points at this row */
175         ldr_post        r6,  r2,  r3                 /* first word, then step r2 */
176         ldrd_post       r8,  r9,  r1,  r3            /* r8,r9 = following r1 row */
177         usada8          r0,  r4,  r6,  r0
178         pld             [r2, r3]
179         usada8          lr,  r5,  r7,  lr
180         ldr             r7,  [r2, #4]                /* r6,r7 = r2 row matching r8,r9 */
181         ldr_post        r6,  r2,  r3
182         beq             2f                           /* last pair: finish without loading another r1 row */
183         ldrd_post       r4,  r5,  r1,  r3            /* preload the r1 row for the next iteration */
184         usada8          r0,  r8,  r6,  r0
185         pld             [r2, r3]
186         usada8          lr,  r9,  r7,  lr
187         b               1b
188 2:
189         usada8          r0,  r8,  r6,  r0            /* accumulate the final row */
190         usada8          lr,  r9,  r7,  lr
191         add             r0,  r0,  lr                 /* combine the two partial sums into the result */
192         pop             {r4-r9, pc}
193 endfunc
194
/*
 * ff_sse16_armv6: sum of squared byte differences between two 16-byte-wide
 * blocks, accumulated over a number of rows.
 *
 * Registers as used by the code:
 *   r0    incoming value is never read; used as the accumulator and result
 *   r1    first block, read with two ldrd per row
 *   r2    second block, read with four ldr per row
 *   r3    byte offset added to r1 and r2 after each row
 *   [sp]  row count at entry (presumably the fifth C argument -- confirm)
 *
 * Per 32-bit word:
 *   uxtb16 x           -> bytes 0 and 2 zero-extended into two halfwords
 *   uxtb16 x, ror #8   -> bytes 1 and 3 zero-extended into two halfwords
 *   usub16             -> two 16-bit differences (two's complement)
 *   smlad r0, d, d, r0 -> r0 += d.lo * d.lo + d.hi * d.hi
 * so every word contributes four squared differences through two smlad.
 *
 * The loop body always runs once before the count is tested.
 * NOTE(review): r1 is read with ldrd while r2 uses plain ldr; this looks
 * like r1 is expected to be suitably aligned and r2 is not -- confirm.
 */
195 function ff_sse16_armv6, export=1
196         ldr             r12, [sp]                    /* r12 = row count, read before push moves sp */
197         push            {r4-r9, lr}
198         mov             r0,  #0                      /* squared-error accumulator */
199 1:
200         ldrd            r4,  r5,  [r1]               /* r4,r5 = r1 words 0 and 1 */
201         ldr             r8,  [r2]                    /* r8 = r2 word 0 */
202         uxtb16          lr,  r4                      /* word 0: even bytes of r1 */
203         uxtb16          r4,  r4,  ror #8             /* word 0: odd bytes of r1 */
204         uxtb16          r9,  r8                      /* word 0: even bytes of r2 */
205         uxtb16          r8,  r8,  ror #8             /* word 0: odd bytes of r2 */
206         ldr             r7,  [r2, #4]                /* r7 = r2 word 1 */
207         usub16          lr,  lr,  r9
208         usub16          r4,  r4,  r8
209         smlad           r0,  lr,  lr,  r0            /* word 0, even bytes */
210         uxtb16          r6,  r5
211         uxtb16          lr,  r5,  ror #8
212         uxtb16          r8,  r7
213         uxtb16          r9,  r7,  ror #8
214         smlad           r0,  r4,  r4,  r0            /* word 0, odd bytes */
215         ldrd            r4,  r5,  [r1, #8]           /* r4,r5 = r1 words 2 and 3 */
216         usub16          r6,  r6,  r8
217         usub16          r8,  lr,  r9
218         ldr             r7,  [r2, #8]                /* r7 = r2 word 2 */
219         smlad           r0,  r6,  r6,  r0            /* word 1, even bytes */
220         uxtb16          lr,  r4
221         uxtb16          r4,  r4,  ror #8
222         uxtb16          r9,  r7
223         uxtb16          r7,  r7, ror #8
224         smlad           r0,  r8,  r8,  r0            /* word 1, odd bytes */
225         ldr             r8,  [r2, #12]               /* r8 = r2 word 3 */
226         usub16          lr,  lr,  r9
227         usub16          r4,  r4,  r7
228         smlad           r0,  lr,  lr,  r0            /* word 2, even bytes */
229         uxtb16          r6,  r5
230         uxtb16          r5,  r5,  ror #8
231         uxtb16          r9,  r8
232         uxtb16          r8,  r8,  ror #8
233         smlad           r0,  r4,  r4,  r0            /* word 2, odd bytes */
234         usub16          r6,  r6,  r9
235         usub16          r5,  r5,  r8
236         smlad           r0,  r6,  r6,  r0            /* word 3, even bytes */
237         add             r1,  r1,  r3                 /* step both pointers to the next row */
238         add             r2,  r2,  r3
239         subs            r12, r12, #1                 /* row counter; flags consumed by the bgt below */
240         smlad           r0,  r5,  r5,  r0            /* word 3, odd bytes */
241         bgt             1b
242
243         pop             {r4-r9, pc}
244 endfunc