arm: dsputil: fix overreads in put/avg_pixels functions
libavcodec/arm/dsputil_neon.S

/*
 * ARM NEON optimised DSP functions
 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of Libav.
 *
 * Libav is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * Libav is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with Libav; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "config.h"
#include "asm.S"

        preserve8

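@ ff_clear_block_neon: zero one 8x8 block of 16-bit coefficients
@ (8 stores of 16 bytes = 128 bytes).  r0 = block, 16-byte aligned.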
function ff_clear_block_neon, export=1
        vmov.i16        q0,  #0
        .rept           8
        vst1.16         {q0}, [r0,:128]!
        .endr
        bx              lr
endfunc

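@ ff_clear_blocks_neon: zero six consecutive 8x8 coefficient blocks.
@ r0 = block pointer, 16-byte aligned.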
function ff_clear_blocks_neon, export=1
        vmov.i16        q0,  #0
        .rept           8*6
        vst1.16         {q0}, [r0,:128]!
        .endr
        bx              lr
endfunc

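@ put/avg pixel macros, instantiated by pixfunc below.  Register usage
@ as set up by the callers: r0 = dst, r1 = src, r2 = line size, r3 = h.
@ The plain copy variants process four rows per iteration (h a multiple
@ of 4); the interpolating _x2/_y2/_xy2 variants process two (h even).
@ With avg=1 the result is averaged with the bytes already at dst.

@ pixels16: copy (or average into) a 16-byte-wide block, dst 16-byte
@ aligned, four rows per loop iteration.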
.macro  pixels16        rnd=1, avg=0
  .if \avg
        mov             r12, r0
  .endif
1:      vld1.64         {q0},     [r1], r2
        vld1.64         {q1},     [r1], r2
        vld1.64         {q2},     [r1], r2
        pld             [r1, r2, lsl #2]
        vld1.64         {q3},     [r1], r2
        pld             [r1]
        pld             [r1, r2]
        pld             [r1, r2, lsl #1]
  .if \avg
        vld1.64         {q8},     [r12,:128], r2
        vrhadd.u8       q0,  q0,  q8
        vld1.64         {q9},     [r12,:128], r2
        vrhadd.u8       q1,  q1,  q9
        vld1.64         {q10},    [r12,:128], r2
        vrhadd.u8       q2,  q2,  q10
        vld1.64         {q11},    [r12,:128], r2
        vrhadd.u8       q3,  q3,  q11
  .endif
        subs            r3,  r3,  #4
        vst1.64         {q0},     [r0,:128], r2
        vst1.64         {q1},     [r0,:128], r2
        vst1.64         {q2},     [r0,:128], r2
        vst1.64         {q3},     [r0,:128], r2
        bne             1b
        bx              lr
.endm

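@ pixels16_x2: horizontal half-pel interpolation, averaging each pixel
@ with its right-hand neighbour.  Loading 24 bytes (d0-d2) per row keeps
@ the 17th source byte available without over-reading a full 32 bytes.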
.macro  pixels16_x2     rnd=1, avg=0
1:      vld1.64         {d0-d2},  [r1], r2
        vld1.64         {d4-d6},  [r1], r2
        pld             [r1]
        pld             [r1, r2]
        subs            r3,  r3,  #2
        vext.8          q1,  q0,  q1,  #1
        avg             q0,  q0,  q1
        vext.8          q3,  q2,  q3,  #1
        avg             q2,  q2,  q3
  .if \avg
        vld1.8          {q1},     [r0,:128], r2
        vld1.8          {q3},     [r0,:128]
        vrhadd.u8       q0,  q0,  q1
        vrhadd.u8       q2,  q2,  q3
        sub             r0,  r0,  r2
  .endif
        vst1.64         {q0},     [r0,:128], r2
        vst1.64         {q2},     [r0,:128], r2
        bne             1b
        bx              lr
.endm

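@ pixels16_y2: vertical half-pel interpolation, averaging each row with
@ the one below it.  The last two rows are handled outside the loop so
@ only h+1 source rows are read in total.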
.macro  pixels16_y2     rnd=1, avg=0
        sub             r3,  r3,  #2
        vld1.64         {q0},     [r1], r2
        vld1.64         {q1},     [r1], r2
1:      subs            r3,  r3,  #2
        avg             q2,  q0,  q1
        vld1.64         {q0},     [r1], r2
        avg             q3,  q0,  q1
        vld1.64         {q1},     [r1], r2
        pld             [r1]
        pld             [r1, r2]
  .if \avg
        vld1.8          {q8},     [r0,:128], r2
        vld1.8          {q9},     [r0,:128]
        vrhadd.u8       q2,  q2,  q8
        vrhadd.u8       q3,  q3,  q9
        sub             r0,  r0,  r2
  .endif
        vst1.64         {q2},     [r0,:128], r2
        vst1.64         {q3},     [r0,:128], r2
        bne             1b

        avg             q2,  q0,  q1
        vld1.64         {q0},     [r1], r2
        avg             q3,  q0,  q1
  .if \avg
        vld1.8          {q8},     [r0,:128], r2
        vld1.8          {q9},     [r0,:128]
        vrhadd.u8       q2,  q2,  q8
        vrhadd.u8       q3,  q3,  q9
        sub             r0,  r0,  r2
  .endif
        vst1.64         {q2},     [r0,:128], r2
        vst1.64         {q3},     [r0,:128], r2

        bx              lr
.endm

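@ pixels16_xy2: 2D half-pel interpolation, averaging a 2x2 neighbourhood.
@ Horizontal pair sums (vaddl.u8) are carried between rows so each source
@ row is summed only once.  With rnd=1 the average is (sum+2)>>2 via
@ vrshrn; with rnd=0 a #1 bias is added and vshrn gives (sum+1)>>2.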
.macro  pixels16_xy2    rnd=1, avg=0
        sub             r3,  r3,  #2
        vld1.64         {d0-d2},  [r1], r2
        vld1.64         {d4-d6},  [r1], r2
  .ifeq \rnd
        vmov.i16        q13, #1
  .endif
        pld             [r1]
        pld             [r1, r2]
        vext.8          q1,  q0,  q1,  #1
        vext.8          q3,  q2,  q3,  #1
        vaddl.u8        q8,  d0,  d2
        vaddl.u8        q10, d1,  d3
        vaddl.u8        q9,  d4,  d6
        vaddl.u8        q11, d5,  d7
1:      subs            r3,  r3,  #2
        vld1.64         {d0-d2},  [r1], r2
        vadd.u16        q12, q8,  q9
        pld             [r1]
  .ifeq \rnd
        vadd.u16        q12, q12, q13
  .endif
        vext.8          q15, q0,  q1,  #1
        vadd.u16        q1 , q10, q11
        shrn            d28, q12, #2
  .ifeq \rnd
        vadd.u16        q1,  q1,  q13
  .endif
        shrn            d29, q1,  #2
  .if \avg
        vld1.8          {q8},     [r0,:128]
        vrhadd.u8       q14, q14, q8
  .endif
        vaddl.u8        q8,  d0,  d30
        vld1.64         {d2-d4},  [r1], r2
        vaddl.u8        q10, d1,  d31
        vst1.64         {q14},    [r0,:128], r2
        vadd.u16        q12, q8,  q9
        pld             [r1, r2]
  .ifeq \rnd
        vadd.u16        q12, q12, q13
  .endif
        vext.8          q2,  q1,  q2,  #1
        vadd.u16        q0,  q10, q11
        shrn            d30, q12, #2
  .ifeq \rnd
        vadd.u16        q0,  q0,  q13
  .endif
        shrn            d31, q0,  #2
  .if \avg
        vld1.8          {q9},     [r0,:128]
        vrhadd.u8       q15, q15, q9
  .endif
        vaddl.u8        q9,  d2,  d4
        vaddl.u8        q11, d3,  d5
        vst1.64         {q15},    [r0,:128], r2
        bgt             1b

        vld1.64         {d0-d2},  [r1], r2
        vadd.u16        q12, q8,  q9
  .ifeq \rnd
        vadd.u16        q12, q12, q13
  .endif
        vext.8          q15, q0,  q1,  #1
        vadd.u16        q1 , q10, q11
        shrn            d28, q12, #2
  .ifeq \rnd
        vadd.u16        q1,  q1,  q13
  .endif
        shrn            d29, q1,  #2
  .if \avg
        vld1.8          {q8},     [r0,:128]
        vrhadd.u8       q14, q14, q8
  .endif
        vaddl.u8        q8,  d0,  d30
        vaddl.u8        q10, d1,  d31
        vst1.64         {q14},    [r0,:128], r2
        vadd.u16        q12, q8,  q9
  .ifeq \rnd
        vadd.u16        q12, q12, q13
  .endif
        vadd.u16        q0,  q10, q11
        shrn            d30, q12, #2
  .ifeq \rnd
        vadd.u16        q0,  q0,  q13
  .endif
        shrn            d31, q0,  #2
  .if \avg
        vld1.8          {q9},     [r0,:128]
        vrhadd.u8       q15, q15, q9
  .endif
        vst1.64         {q15},    [r0,:128], r2

        bx              lr
.endm

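@ pixels8 and the _x2/_y2/_xy2 variants below are the 8-pixel-wide
@ counterparts of the macros above, using d registers instead of q
@ registers with the same interpolation and rounding scheme.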
.macro  pixels8         rnd=1, avg=0
1:      vld1.64         {d0},     [r1], r2
        vld1.64         {d1},     [r1], r2
        vld1.64         {d2},     [r1], r2
        pld             [r1, r2, lsl #2]
        vld1.64         {d3},     [r1], r2
        pld             [r1]
        pld             [r1, r2]
        pld             [r1, r2, lsl #1]
  .if \avg
        vld1.64         {d4},     [r0,:64], r2
        vrhadd.u8       d0,  d0,  d4
        vld1.64         {d5},     [r0,:64], r2
        vrhadd.u8       d1,  d1,  d5
        vld1.64         {d6},     [r0,:64], r2
        vrhadd.u8       d2,  d2,  d6
        vld1.64         {d7},     [r0,:64], r2
        vrhadd.u8       d3,  d3,  d7
        sub             r0,  r0,  r2,  lsl #2
  .endif
        subs            r3,  r3,  #4
        vst1.64         {d0},     [r0,:64], r2
        vst1.64         {d1},     [r0,:64], r2
        vst1.64         {d2},     [r0,:64], r2
        vst1.64         {d3},     [r0,:64], r2
        bne             1b
        bx              lr
.endm

.macro  pixels8_x2      rnd=1, avg=0
1:      vld1.64         {q0},     [r1], r2
        vext.8          d1,  d0,  d1,  #1
        vld1.64         {q1},     [r1], r2
        vext.8          d3,  d2,  d3,  #1
        pld             [r1]
        pld             [r1, r2]
        subs            r3,  r3,  #2
        vswp            d1,  d2
        avg             q0,  q0,  q1
  .if \avg
        vld1.8          {d4},     [r0,:64], r2
        vld1.8          {d5},     [r0,:64]
        vrhadd.u8       q0,  q0,  q2
        sub             r0,  r0,  r2
  .endif
        vst1.64         {d0},     [r0,:64], r2
        vst1.64         {d1},     [r0,:64], r2
        bne             1b
        bx              lr
.endm

.macro  pixels8_y2      rnd=1, avg=0
        sub             r3,  r3,  #2
        vld1.64         {d0},     [r1], r2
        vld1.64         {d1},     [r1], r2
1:      subs            r3,  r3,  #2
        avg             d4,  d0,  d1
        vld1.64         {d0},     [r1], r2
        avg             d5,  d0,  d1
        vld1.64         {d1},     [r1], r2
        pld             [r1]
        pld             [r1, r2]
  .if \avg
        vld1.8          {d2},     [r0,:64], r2
        vld1.8          {d3},     [r0,:64]
        vrhadd.u8       q2,  q2,  q1
        sub             r0,  r0,  r2
  .endif
        vst1.64         {d4},     [r0,:64], r2
        vst1.64         {d5},     [r0,:64], r2
        bne             1b

        avg             d4,  d0,  d1
        vld1.64         {d0},     [r1], r2
        avg             d5,  d0,  d1
  .if \avg
        vld1.8          {d2},     [r0,:64], r2
        vld1.8          {d3},     [r0,:64]
        vrhadd.u8       q2,  q2,  q1
        sub             r0,  r0,  r2
  .endif
        vst1.64         {d4},     [r0,:64], r2
        vst1.64         {d5},     [r0,:64], r2

        bx              lr
.endm

.macro  pixels8_xy2     rnd=1, avg=0
        sub             r3,  r3,  #2
        vld1.64         {q0},     [r1], r2
        vld1.64         {q1},     [r1], r2
  .ifeq \rnd
        vmov.i16        q11, #1
  .endif
        pld             [r1]
        pld             [r1, r2]
        vext.8          d4,  d0,  d1,  #1
        vext.8          d6,  d2,  d3,  #1
        vaddl.u8        q8,  d0,  d4
        vaddl.u8        q9,  d2,  d6
1:      subs            r3,  r3,  #2
        vld1.64         {q0},     [r1], r2
        pld             [r1]
        vadd.u16        q10, q8,  q9
        vext.8          d4,  d0,  d1,  #1
  .ifeq \rnd
        vadd.u16        q10, q10, q11
  .endif
        vaddl.u8        q8,  d0,  d4
        shrn            d5,  q10, #2
        vld1.64         {q1},     [r1], r2
        vadd.u16        q10, q8,  q9
        pld             [r1, r2]
  .if \avg
        vld1.8          {d7},     [r0,:64]
        vrhadd.u8       d5,  d5,  d7
  .endif
  .ifeq \rnd
        vadd.u16        q10, q10, q11
  .endif
        vst1.64         {d5},     [r0,:64], r2
        shrn            d7,  q10, #2
  .if \avg
        vld1.8          {d5},     [r0,:64]
        vrhadd.u8       d7,  d7,  d5
  .endif
        vext.8          d6,  d2,  d3,  #1
        vaddl.u8        q9,  d2,  d6
        vst1.64         {d7},     [r0,:64], r2
        bgt             1b

        vld1.64         {q0},     [r1], r2
        vadd.u16        q10, q8,  q9
        vext.8          d4,  d0,  d1,  #1
  .ifeq \rnd
        vadd.u16        q10, q10, q11
  .endif
        vaddl.u8        q8,  d0,  d4
        shrn            d5,  q10, #2
        vadd.u16        q10, q8,  q9
  .if \avg
        vld1.8          {d7},     [r0,:64]
        vrhadd.u8       d5,  d5,  d7
  .endif
  .ifeq \rnd
        vadd.u16        q10, q10, q11
  .endif
        vst1.64         {d5},     [r0,:64], r2
        shrn            d7,  q10, #2
  .if \avg
        vld1.8          {d5},     [r0,:64]
        vrhadd.u8       d7,  d7,  d5
  .endif
        vst1.64         {d7},     [r0,:64], r2

        bx              lr
.endm

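@ pixfunc instantiates one of the macros above as an exported function,
@ binding the avg and shrn helpers to rounding (vrhadd/vrshrn) or
@ truncating (vhadd/vshrn) forms according to rnd.  pixfunc2 emits both
@ the rounding and the _no_rnd variant.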
.macro  pixfunc         pfx, name, suf, rnd=1, avg=0
  .if \rnd
    .macro avg  rd, rn, rm
        vrhadd.u8       \rd, \rn, \rm
    .endm
    .macro shrn rd, rn, rm
        vrshrn.u16      \rd, \rn, \rm
    .endm
  .else
    .macro avg  rd, rn, rm
        vhadd.u8        \rd, \rn, \rm
    .endm
    .macro shrn rd, rn, rm
        vshrn.u16       \rd, \rn, \rm
    .endm
  .endif
function ff_\pfx\name\suf\()_neon, export=1
        \name           \rnd, \avg
endfunc
        .purgem         avg
        .purgem         shrn
.endm

.macro  pixfunc2        pfx, name, avg=0
        pixfunc         \pfx, \name,          rnd=1, avg=\avg
        pixfunc         \pfx, \name, _no_rnd, rnd=0, avg=\avg
.endm

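@ The H.264 qpel16/qpel8 mc00 cases are plain copies with a fixed height,
@ so each wrapper below just sets r3 and falls through into the pixels16
@ or pixels8 function emitted immediately after it.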
function ff_put_h264_qpel16_mc00_neon, export=1
        mov             r3,  #16
endfunc

        pixfunc         put_, pixels16,     avg=0
        pixfunc2        put_, pixels16_x2,  avg=0
        pixfunc2        put_, pixels16_y2,  avg=0
        pixfunc2        put_, pixels16_xy2, avg=0

function ff_avg_h264_qpel16_mc00_neon, export=1
        mov             r3,  #16
endfunc

        pixfunc         avg_, pixels16,     avg=1
        pixfunc2        avg_, pixels16_x2,  avg=1
        pixfunc2        avg_, pixels16_y2,  avg=1
        pixfunc2        avg_, pixels16_xy2, avg=1

function ff_put_h264_qpel8_mc00_neon, export=1
        mov             r3,  #8
endfunc

        pixfunc         put_, pixels8,     avg=0
        pixfunc2        put_, pixels8_x2,  avg=0
        pixfunc2        put_, pixels8_y2,  avg=0
        pixfunc2        put_, pixels8_xy2, avg=0

function ff_avg_h264_qpel8_mc00_neon, export=1
        mov             r3,  #8
endfunc

        pixfunc         avg_, pixels8,     avg=1
        pixfunc2        avg_, pixels8_x2,  avg=1
        pixfunc2        avg_, pixels8_y2,  avg=1
        pixfunc2        avg_, pixels8_xy2, avg=1

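@ ff_put_pixels_clamped_neon: narrow an 8x8 block of 16-bit coefficients
@ (r0) to unsigned bytes with saturation (vqmovun) and store eight rows
@ at r1 with stride r2.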
function ff_put_pixels_clamped_neon, export=1
        vld1.64         {d16-d19}, [r0,:128]!
        vqmovun.s16     d0, q8
        vld1.64         {d20-d23}, [r0,:128]!
        vqmovun.s16     d1, q9
        vld1.64         {d24-d27}, [r0,:128]!
        vqmovun.s16     d2, q10
        vld1.64         {d28-d31}, [r0,:128]!
        vqmovun.s16     d3, q11
        vst1.64         {d0},      [r1,:64], r2
        vqmovun.s16     d4, q12
        vst1.64         {d1},      [r1,:64], r2
        vqmovun.s16     d5, q13
        vst1.64         {d2},      [r1,:64], r2
        vqmovun.s16     d6, q14
        vst1.64         {d3},      [r1,:64], r2
        vqmovun.s16     d7, q15
        vst1.64         {d4},      [r1,:64], r2
        vst1.64         {d5},      [r1,:64], r2
        vst1.64         {d6},      [r1,:64], r2
        vst1.64         {d7},      [r1,:64], r2
        bx              lr
endfunc

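@ ff_put_signed_pixels_clamped_neon: as above, but saturate to signed
@ bytes (vqmovn) and add a 128 bias to map the result into 0..255.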
function ff_put_signed_pixels_clamped_neon, export=1
        vmov.u8         d31, #128
        vld1.64         {d16-d17}, [r0,:128]!
        vqmovn.s16      d0, q8
        vld1.64         {d18-d19}, [r0,:128]!
        vqmovn.s16      d1, q9
        vld1.64         {d16-d17}, [r0,:128]!
        vqmovn.s16      d2, q8
        vld1.64         {d18-d19}, [r0,:128]!
        vadd.u8         d0, d0, d31
        vld1.64         {d20-d21}, [r0,:128]!
        vadd.u8         d1, d1, d31
        vld1.64         {d22-d23}, [r0,:128]!
        vadd.u8         d2, d2, d31
        vst1.64         {d0},      [r1,:64], r2
        vqmovn.s16      d3, q9
        vst1.64         {d1},      [r1,:64], r2
        vqmovn.s16      d4, q10
        vst1.64         {d2},      [r1,:64], r2
        vqmovn.s16      d5, q11
        vld1.64         {d24-d25}, [r0,:128]!
        vadd.u8         d3, d3, d31
        vld1.64         {d26-d27}, [r0,:128]!
        vadd.u8         d4, d4, d31
        vadd.u8         d5, d5, d31
        vst1.64         {d3},      [r1,:64], r2
        vqmovn.s16      d6, q12
        vst1.64         {d4},      [r1,:64], r2
        vqmovn.s16      d7, q13
        vst1.64         {d5},      [r1,:64], r2
        vadd.u8         d6, d6, d31
        vadd.u8         d7, d7, d31
        vst1.64         {d6},      [r1,:64], r2
        vst1.64         {d7},      [r1,:64], r2
        bx              lr
endfunc

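@ ff_add_pixels_clamped_neon: widen the existing pixels at r1 (stride
@ r2), add the 16-bit coefficients from r0 and store back with unsigned
@ saturation.  r3 keeps the original r1 as the store pointer.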
function ff_add_pixels_clamped_neon, export=1
        mov             r3, r1
        vld1.64         {d16},   [r1,:64], r2
        vld1.64         {d0-d1}, [r0,:128]!
        vaddw.u8        q0, q0, d16
        vld1.64         {d17},   [r1,:64], r2
        vld1.64         {d2-d3}, [r0,:128]!
        vqmovun.s16     d0, q0
        vld1.64         {d18},   [r1,:64], r2
        vaddw.u8        q1, q1, d17
        vld1.64         {d4-d5}, [r0,:128]!
        vaddw.u8        q2, q2, d18
        vst1.64         {d0},    [r3,:64], r2
        vqmovun.s16     d2, q1
        vld1.64         {d19},   [r1,:64], r2
        vld1.64         {d6-d7}, [r0,:128]!
        vaddw.u8        q3, q3, d19
        vqmovun.s16     d4, q2
        vst1.64         {d2},    [r3,:64], r2
        vld1.64         {d16},   [r1,:64], r2
        vqmovun.s16     d6, q3
        vld1.64         {d0-d1}, [r0,:128]!
        vaddw.u8        q0, q0, d16
        vst1.64         {d4},    [r3,:64], r2
        vld1.64         {d17},   [r1,:64], r2
        vld1.64         {d2-d3}, [r0,:128]!
        vaddw.u8        q1, q1, d17
        vst1.64         {d6},    [r3,:64], r2
        vqmovun.s16     d0, q0
        vld1.64         {d18},   [r1,:64], r2
        vld1.64         {d4-d5}, [r0,:128]!
        vaddw.u8        q2, q2, d18
        vst1.64         {d0},    [r3,:64], r2
        vqmovun.s16     d2, q1
        vld1.64         {d19},   [r1,:64], r2
        vqmovun.s16     d4, q2
        vld1.64         {d6-d7}, [r0,:128]!
        vaddw.u8        q3, q3, d19
        vst1.64         {d2},    [r3,:64], r2
        vqmovun.s16     d6, q3
        vst1.64         {d4},    [r3,:64], r2
        vst1.64         {d6},    [r3,:64], r2
        bx              lr
endfunc

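@ ff_vector_fmul_neon: dst[i] = src0[i] * src1[i].  r0 = dst, r1 = src0,
@ r2 = src1, r3 = len (multiple of 8); the main loop handles 16 elements
@ per iteration, with entry/exit paths covering the remainder.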
function ff_vector_fmul_neon, export=1
        subs            r3,  r3,  #8
        vld1.64         {d0-d3},  [r1,:128]!
        vld1.64         {d4-d7},  [r2,:128]!
        vmul.f32        q8,  q0,  q2
        vmul.f32        q9,  q1,  q3
        beq             3f
        bics            ip,  r3,  #15
        beq             2f
1:      subs            ip,  ip,  #16
        vld1.64         {d0-d1},  [r1,:128]!
        vld1.64         {d4-d5},  [r2,:128]!
        vmul.f32        q10, q0,  q2
        vld1.64         {d2-d3},  [r1,:128]!
        vld1.64         {d6-d7},  [r2,:128]!
        vmul.f32        q11, q1,  q3
        vst1.64         {d16-d19},[r0,:128]!
        vld1.64         {d0-d1},  [r1,:128]!
        vld1.64         {d4-d5},  [r2,:128]!
        vmul.f32        q8,  q0,  q2
        vld1.64         {d2-d3},  [r1,:128]!
        vld1.64         {d6-d7},  [r2,:128]!
        vmul.f32        q9,  q1,  q3
        vst1.64         {d20-d23},[r0,:128]!
        bne             1b
        ands            r3,  r3,  #15
        beq             3f
2:      vld1.64         {d0-d1},  [r1,:128]!
        vld1.64         {d4-d5},  [r2,:128]!
        vst1.64         {d16-d17},[r0,:128]!
        vmul.f32        q8,  q0,  q2
        vld1.64         {d2-d3},  [r1,:128]!
        vld1.64         {d6-d7},  [r2,:128]!
        vst1.64         {d18-d19},[r0,:128]!
        vmul.f32        q9,  q1,  q3
3:      vst1.64         {d16-d19},[r0,:128]!
        bx              lr
endfunc

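@ ff_vector_fmul_window_neon: MDCT overlap-add windowing.  r0 = dst,
@ r1 = src0, r2 = src1, r3 = win, len on the stack.  For i from both
@ ends (j = len-1-i): dst[i] = s0[i]*win[j] - s1[j]*win[i] and
@ dst[j] = s0[i]*win[i] + s1[j]*win[j].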
function ff_vector_fmul_window_neon, export=1
        push            {r4,r5,lr}
        ldr             lr,  [sp, #12]
        sub             r2,  r2,  #8
        sub             r5,  lr,  #2
        add             r2,  r2,  r5, lsl #2
        add             r4,  r3,  r5, lsl #3
        add             ip,  r0,  r5, lsl #3
        mov             r5,  #-16
        vld1.64         {d0,d1},  [r1,:128]!
        vld1.64         {d2,d3},  [r2,:128], r5
        vld1.64         {d4,d5},  [r3,:128]!
        vld1.64         {d6,d7},  [r4,:128], r5
1:      subs            lr,  lr,  #4
        vmul.f32        d22, d0,  d4
        vrev64.32       q3,  q3
        vmul.f32        d23, d1,  d5
        vrev64.32       q1,  q1
        vmul.f32        d20, d0,  d7
        vmul.f32        d21, d1,  d6
        beq             2f
        vmla.f32        d22, d3,  d7
        vld1.64         {d0,d1},  [r1,:128]!
        vmla.f32        d23, d2,  d6
        vld1.64         {d18,d19},[r2,:128], r5
        vmls.f32        d20, d3,  d4
        vld1.64         {d24,d25},[r3,:128]!
        vmls.f32        d21, d2,  d5
        vld1.64         {d6,d7},  [r4,:128], r5
        vmov            q1,  q9
        vrev64.32       q11, q11
        vmov            q2,  q12
        vswp            d22, d23
        vst1.64         {d20,d21},[r0,:128]!
        vst1.64         {d22,d23},[ip,:128], r5
        b               1b
2:      vmla.f32        d22, d3,  d7
        vmla.f32        d23, d2,  d6
        vmls.f32        d20, d3,  d4
        vmls.f32        d21, d2,  d5
        vrev64.32       q11, q11
        vswp            d22, d23
        vst1.64         {d20,d21},[r0,:128]!
        vst1.64         {d22,d23},[ip,:128], r5
        pop             {r4,r5,pc}
endfunc

#if CONFIG_VORBIS_DECODER
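@ ff_vorbis_inverse_coupling_neon: Vorbis inverse channel coupling,
@ reconstructing the two channels in place from the magnitude (r0) and
@ angle (r1) vectors of length r2, using sign masks instead of branches.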
function ff_vorbis_inverse_coupling_neon, export=1
        vmov.i32        q10, #1<<31
        subs            r2,  r2,  #4
        mov             r3,  r0
        mov             r12, r1
        beq             3f

        vld1.32         {d24-d25},[r1,:128]!
        vld1.32         {d22-d23},[r0,:128]!
        vcle.s32        q8,  q12, #0
        vand            q9,  q11, q10
        veor            q12, q12, q9
        vand            q2,  q12, q8
        vbic            q3,  q12, q8
        vadd.f32        q12, q11, q2
        vsub.f32        q11, q11, q3
1:      vld1.32         {d2-d3},  [r1,:128]!
        vld1.32         {d0-d1},  [r0,:128]!
        vcle.s32        q8,  q1,  #0
        vand            q9,  q0,  q10
        veor            q1,  q1,  q9
        vst1.32         {d24-d25},[r3, :128]!
        vst1.32         {d22-d23},[r12,:128]!
        vand            q2,  q1,  q8
        vbic            q3,  q1,  q8
        vadd.f32        q1,  q0,  q2
        vsub.f32        q0,  q0,  q3
        subs            r2,  r2,  #8
        ble             2f
        vld1.32         {d24-d25},[r1,:128]!
        vld1.32         {d22-d23},[r0,:128]!
        vcle.s32        q8,  q12, #0
        vand            q9,  q11, q10
        veor            q12, q12, q9
        vst1.32         {d2-d3},  [r3, :128]!
        vst1.32         {d0-d1},  [r12,:128]!
        vand            q2,  q12, q8
        vbic            q3,  q12, q8
        vadd.f32        q12, q11, q2
        vsub.f32        q11, q11, q3
        b               1b

2:      vst1.32         {d2-d3},  [r3, :128]!
        vst1.32         {d0-d1},  [r12,:128]!
        it              lt
        bxlt            lr

3:      vld1.32         {d2-d3},  [r1,:128]
        vld1.32         {d0-d1},  [r0,:128]
        vcle.s32        q8,  q1,  #0
        vand            q9,  q0,  q10
        veor            q1,  q1,  q9
        vand            q2,  q1,  q8
        vbic            q3,  q1,  q8
        vadd.f32        q1,  q0,  q2
        vsub.f32        q0,  q0,  q3
        vst1.32         {d2-d3},  [r0,:128]!
        vst1.32         {d0-d1},  [r1,:128]!
        bx              lr
endfunc
#endif

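@ ff_vector_fmul_scalar_neon: dst[i] = src[i] * mul.  The scalar arrives
@ in s0 with a hard-float ABI (VFP) or in r2 otherwise (NOVFP), which
@ also shifts the len argument between r2 and r3.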
function ff_vector_fmul_scalar_neon, export=1
VFP     len .req r2
NOVFP   len .req r3
VFP     vdup.32         q8,  d0[0]
NOVFP   vdup.32         q8,  r2
        bics            r12, len, #15
        beq             3f
        vld1.32         {q0},[r1,:128]!
        vld1.32         {q1},[r1,:128]!
1:      vmul.f32        q0,  q0,  q8
        vld1.32         {q2},[r1,:128]!
        vmul.f32        q1,  q1,  q8
        vld1.32         {q3},[r1,:128]!
        vmul.f32        q2,  q2,  q8
        vst1.32         {q0},[r0,:128]!
        vmul.f32        q3,  q3,  q8
        vst1.32         {q1},[r0,:128]!
        subs            r12, r12, #16
        beq             2f
        vld1.32         {q0},[r1,:128]!
        vst1.32         {q2},[r0,:128]!
        vld1.32         {q1},[r1,:128]!
        vst1.32         {q3},[r0,:128]!
        b               1b
2:      vst1.32         {q2},[r0,:128]!
        vst1.32         {q3},[r0,:128]!
        ands            len, len, #15
        it              eq
        bxeq            lr
3:      vld1.32         {q0},[r1,:128]!
        vmul.f32        q0,  q0,  q8
        vst1.32         {q0},[r0,:128]!
        subs            len, len, #4
        bgt             3b
        bx              lr
        .unreq          len
endfunc

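@ ff_vector_fmac_scalar_neon: dst[i] += src[i] * mul.  acc is a second
@ pointer into dst used to load the accumulator values ahead of the
@ stores; register assignment again depends on the float ABI.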
function ff_vector_fmac_scalar_neon, export=1
VFP     len .req r2
VFP     acc .req r3
NOVFP   len .req r3
NOVFP   acc .req r2
VFP     vdup.32         q15, d0[0]
NOVFP   vdup.32         q15, r2
        bics            r12, len, #15
        mov             acc, r0
        beq             3f
        vld1.32         {q0},     [r1,:128]!
        vld1.32         {q8},     [acc,:128]!
        vld1.32         {q1},     [r1,:128]!
        vld1.32         {q9},     [acc,:128]!
1:      vmla.f32        q8,  q0,  q15
        vld1.32         {q2},     [r1,:128]!
        vld1.32         {q10},    [acc,:128]!
        vmla.f32        q9,  q1,  q15
        vld1.32         {q3},     [r1,:128]!
        vld1.32         {q11},    [acc,:128]!
        vmla.f32        q10, q2,  q15
        vst1.32         {q8},     [r0,:128]!
        vmla.f32        q11, q3,  q15
        vst1.32         {q9},     [r0,:128]!
        subs            r12, r12, #16
        beq             2f
        vld1.32         {q0},     [r1,:128]!
        vld1.32         {q8},     [acc,:128]!
        vst1.32         {q10},    [r0,:128]!
        vld1.32         {q1},     [r1,:128]!
        vld1.32         {q9},     [acc,:128]!
        vst1.32         {q11},    [r0,:128]!
        b               1b
2:      vst1.32         {q10},    [r0,:128]!
        vst1.32         {q11},    [r0,:128]!
        ands            len, len, #15
        it              eq
        bxeq            lr
3:      vld1.32         {q0},     [r1,:128]!
        vld1.32         {q8},     [acc,:128]!
        vmla.f32        q8,  q0,  q15
        vst1.32         {q8},     [r0,:128]!
        subs            len, len, #4
        bgt             3b
        bx              lr
        .unreq          len
endfunc

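@ ff_butterflies_float_neon: in-place butterflies over two vectors:
@ (v1[i], v2[i]) becomes (v1[i]+v2[i], v1[i]-v2[i]).  r2 = len.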
function ff_butterflies_float_neon, export=1
1:      vld1.32         {q0},[r0,:128]
        vld1.32         {q1},[r1,:128]
        vsub.f32        q2,  q0,  q1
        vadd.f32        q1,  q0,  q1
        vst1.32         {q2},[r1,:128]!
        vst1.32         {q1},[r0,:128]!
        subs            r2,  r2,  #4
        bgt             1b
        bx              lr
endfunc

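@ ff_scalarproduct_float_neon: dot product of the vectors at r0 and r1,
@ length r2.  The four partial sums in q2 are reduced with vpadd; the
@ result is returned in s0 (hard-float) or r0 (soft-float).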
function ff_scalarproduct_float_neon, export=1
        vmov.f32        q2,  #0.0
1:      vld1.32         {q0},[r0,:128]!
        vld1.32         {q1},[r1,:128]!
        vmla.f32        q2,  q0,  q1
        subs            r2,  r2,  #4
        bgt             1b
        vadd.f32        d0,  d4,  d5
        vpadd.f32       d0,  d0,  d0
NOVFP   vmov.32         r0,  d0[0]
        bx              lr
endfunc

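@ ff_vector_fmul_reverse_neon: dst[i] = src0[i] * src1[len-1-i].
@ src1 is read backwards in 32-byte steps and reversed with vrev64.32.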
function ff_vector_fmul_reverse_neon, export=1
        add             r2,  r2,  r3,  lsl #2
        sub             r2,  r2,  #32
        mov             r12, #-32
        vld1.32         {q0-q1},  [r1,:128]!
        vld1.32         {q2-q3},  [r2,:128], r12
1:      pld             [r1, #32]
        vrev64.32       q3,  q3
        vmul.f32        d16, d0,  d7
        vmul.f32        d17, d1,  d6
        pld             [r2, #-32]
        vrev64.32       q2,  q2
        vmul.f32        d18, d2,  d5
        vmul.f32        d19, d3,  d4
        subs            r3,  r3,  #8
        beq             2f
        vld1.32         {q0-q1},  [r1,:128]!
        vld1.32         {q2-q3},  [r2,:128], r12
        vst1.32         {q8-q9},  [r0,:128]!
        b               1b
2:      vst1.32         {q8-q9},  [r0,:128]!
        bx              lr
endfunc

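@ ff_vector_fmul_add_neon: dst[i] = src0[i] * src1[i] + src2[i].
@ r0 = dst, r1 = src0, r2 = src1, r3 = src2, len on the stack.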
function ff_vector_fmul_add_neon, export=1
        ldr             r12, [sp]
        vld1.32         {q0-q1},  [r1,:128]!
        vld1.32         {q8-q9},  [r2,:128]!
        vld1.32         {q2-q3},  [r3,:128]!
        vmul.f32        q10, q0,  q8
        vmul.f32        q11, q1,  q9
1:      vadd.f32        q12, q2,  q10
        vadd.f32        q13, q3,  q11
        pld             [r1, #16]
        pld             [r2, #16]
        pld             [r3, #16]
        subs            r12, r12, #8
        beq             2f
        vld1.32         {q0},     [r1,:128]!
        vld1.32         {q8},     [r2,:128]!
        vmul.f32        q10, q0,  q8
        vld1.32         {q1},     [r1,:128]!
        vld1.32         {q9},     [r2,:128]!
        vmul.f32        q11, q1,  q9
        vld1.32         {q2-q3},  [r3,:128]!
        vst1.32         {q12-q13},[r0,:128]!
        b               1b
2:      vst1.32         {q12-q13},[r0,:128]!
        bx              lr
endfunc

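@ ff_vector_clipf_neon: clamp each element of src to [min, max].
@ Hard-float passes min/max in d0; soft-float passes them in r2/r3,
@ with len then coming from the stack.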
function ff_vector_clipf_neon, export=1
VFP     vdup.32         q1,  d0[1]
VFP     vdup.32         q0,  d0[0]
NOVFP   vdup.32         q0,  r2
NOVFP   vdup.32         q1,  r3
NOVFP   ldr             r2,  [sp]
        vld1.f32        {q2},[r1,:128]!
        vmin.f32        q10, q2,  q1
        vld1.f32        {q3},[r1,:128]!
        vmin.f32        q11, q3,  q1
1:      vmax.f32        q8,  q10, q0
        vmax.f32        q9,  q11, q0
        subs            r2,  r2,  #8
        beq             2f
        vld1.f32        {q2},[r1,:128]!
        vmin.f32        q10, q2,  q1
        vld1.f32        {q3},[r1,:128]!
        vmin.f32        q11, q3,  q1
        vst1.f32        {q8},[r0,:128]!
        vst1.f32        {q9},[r0,:128]!
        b               1b
2:      vst1.f32        {q8},[r0,:128]!
        vst1.f32        {q9},[r0,:128]!
        bx              lr
endfunc

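@ ff_apply_window_int16_neon: multiply 16-bit samples (r1) by a symmetric
@ window (r2) into r0, r3 = len, using Q15 rounding multiplies
@ (vqrdmulh).  Samples are processed 8 at a time from both ends, so only
@ the first half of the window is read; it is reused reversed
@ (vrev64.16) for the tail, walking backwards from r4/lr.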
function ff_apply_window_int16_neon, export=1
        push            {r4,lr}
        add             r4,  r1,  r3,  lsl #1
        add             lr,  r0,  r3,  lsl #1
        sub             r4,  r4,  #16
        sub             lr,  lr,  #16
        mov             r12, #-16
1:
        vld1.16         {q0},     [r1,:128]!
        vld1.16         {q2},     [r2,:128]!
        vld1.16         {q1},     [r4,:128], r12
        vrev64.16       q3,  q2
        vqrdmulh.s16    q0,  q0,  q2
        vqrdmulh.s16    d2,  d2,  d7
        vqrdmulh.s16    d3,  d3,  d6
        vst1.16         {q0},     [r0,:128]!
        vst1.16         {q1},     [lr,:128], r12
        subs            r3,  r3,  #16
        bgt             1b

        pop             {r4,pc}
endfunc

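@ ff_vector_clip_int32_neon: clamp 32-bit integers to [min, max].
@ r0 = dst, r1 = src, r2 = min, r3 = max, len on the stack.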
function ff_vector_clip_int32_neon, export=1
        vdup.32         q0,  r2
        vdup.32         q1,  r3
        ldr             r2,  [sp]
1:
        vld1.32         {q2-q3},  [r1,:128]!
        vmin.s32        q2,  q2,  q1
        vmin.s32        q3,  q3,  q1
        vmax.s32        q2,  q2,  q0
        vmax.s32        q3,  q3,  q0
        vst1.32         {q2-q3},  [r0,:128]!
        subs            r2,  r2,  #8
        bgt             1b
        bx              lr
endfunc