/*
 * Copyright (c) 2008 Siarhei Siamashka <ssvb@users.sourceforge.net>
 *
 * This file is part of Libav.
 *
 * Libav is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * Libav is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with Libav; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "config.h"
#include "asm.S"
/*
 * VFP is a floating-point coprocessor used in some ARM cores. VFP11 has
 * 1-cycle throughput for almost all instructions (except for double-precision
 * arithmetic), but rather high latency: 4 cycles for loads and 8 cycles for
 * arithmetic operations. Scheduling code to avoid pipeline stalls is therefore
 * very important for performance. Another interesting feature is that VFP has
 * independent load/store and arithmetic pipelines, so it is possible to make
 * them work simultaneously and achieve more than 1 operation per cycle. The
 * load/store pipeline can process 2 single-precision floating-point values per
 * cycle and supports bulk loads and stores of large register sets. Arithmetic
 * operations can be performed on short vectors, which keeps the arithmetic
 * pipeline busy while the processor issues and executes other instructions.
 * Detailed optimization manuals can be found at http://www.arm.com
 */

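/*
 * ff_vector_fmul_vfp below relies on the short-vector mode described above.
 * As a sketch of the banking rules (see the VFP11 TRM for the authoritative
 * description): once the LEN field of FPSCR selects a vector length of 4, an
 * arithmetic instruction whose destination lies outside the scalar bank
 * s0-s7 operates on four consecutive registers at once, so the single
 *
 *     vmul.f32 s8, s0, s8
 *
 * issued before the loop expands to four multiplications:
 *
 *     s8 = s0 * s8; s9 = s1 * s9; s10 = s2 * s10; s11 = s3 * s11;
 */
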
/**
 * ARM VFP optimized implementation of the 'vector_fmul_c' function.
 * Assumes that len is a positive number and a multiple of 8.
 */
@ void ff_vector_fmul_vfp(float *dst, const float *src0, const float *src1, int len)
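/*
 * For reference, a minimal C sketch of what this routine computes (mirroring
 * the generic vector_fmul_c implementation; not part of the build):
 *
 *     static void vector_fmul_c(float *dst, const float *src0,
 *                               const float *src1, int len)
 *     {
 *         int i;
 *         for (i = 0; i < len; i++)
 *             dst[i] = src0[i] * src1[i];
 *     }
 */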
function ff_vector_fmul_vfp, export=1
        vpush           {d8-d15}
        fmrx            r12, fpscr
        orr             r12, r12, #(3 << 16) /* set vector size to 4 */
        fmxr            fpscr, r12

        @ preload the first 8 elements of src0 (into s0-s7) and src1 (s8-s15)
        vldmia          r1!, {s0-s3}
        vldmia          r2!, {s8-s11}
        vldmia          r1!, {s4-s7}
        vldmia          r2!, {s12-s15}
        vmul.f32        s8,  s0,  s8    @ vector op: s8-s11 = s0-s3 * s8-s11
1:
        @ software-pipelined loop, 16 elements per iteration: store the
        @ current batch while conditionally loading and multiplying the next
        subs            r3,  r3,  #16
        vmul.f32        s12, s4,  s12   @ vector op: s12-s15 = s4-s7 * s12-s15
        itttt           ge
        vldmiage        r1!, {s16-s19}
        vldmiage        r2!, {s24-s27}
        vldmiage        r1!, {s20-s23}
        vldmiage        r2!, {s28-s31}
        it              ge
        vmulge.f32      s24, s16, s24
        vstmia          r0!, {s8-s11}
        vstmia          r0!, {s12-s15}
        it              ge
        vmulge.f32      s28, s20, s28
        itttt           gt
        vldmiagt        r1!, {s0-s3}
        vldmiagt        r2!, {s8-s11}
        vldmiagt        r1!, {s4-s7}
        vldmiagt        r2!, {s12-s15}
        ittt            ge
        vmulge.f32      s8,  s0,  s8
        vstmiage        r0!, {s24-s27}
        vstmiage        r0!, {s28-s31}
        bgt             1b

        bic             r12, r12, #(7 << 16) /* set vector size back to 1 */
        fmxr            fpscr, r12
        vpop            {d8-d15}
        bx              lr
endfunc

/**
 * ARM VFP optimized implementation of the 'vector_fmul_reverse_c' function.
 * Assumes that len is a positive number and a multiple of 8.
 */
@ void ff_vector_fmul_reverse_vfp(float *dst, const float *src0,
@                                 const float *src1, int len)
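/*
 * For reference, a minimal C sketch of what this routine computes (mirroring
 * the generic vector_fmul_reverse_c implementation; not part of the build):
 *
 *     static void vector_fmul_reverse_c(float *dst, const float *src0,
 *                                       const float *src1, int len)
 *     {
 *         int i;
 *         src1 += len;
 *         for (i = 0; i < len; i++)
 *             dst[i] = src0[i] * src1[-i - 1];
 *     }
 */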
function ff_vector_fmul_reverse_vfp, export=1
        vpush           {d8-d15}
        add             r2,  r2,  r3, lsl #2    @ point src1 just past its last element
        @ preload 8 elements: src1 is read backwards with vldmdb, src0 forwards
        vldmdb          r2!, {s0-s3}
        vldmia          r1!, {s8-s11}
        vldmdb          r2!, {s4-s7}
        vldmia          r1!, {s12-s15}
        @ element-by-element multiplies, reversing the order of src1 elements
        vmul.f32        s8,  s3,  s8
        vmul.f32        s9,  s2,  s9
        vmul.f32        s10, s1,  s10
        vmul.f32        s11, s0,  s11
1:
        @ software-pipelined loop, 16 elements per iteration
        subs            r3,  r3,  #16
        it              ge
        vldmdbge        r2!, {s16-s19}
        vmul.f32        s12, s7,  s12
        it              ge
        vldmiage        r1!, {s24-s27}
        vmul.f32        s13, s6,  s13
        it              ge
        vldmdbge        r2!, {s20-s23}
        vmul.f32        s14, s5,  s14
        it              ge
        vldmiage        r1!, {s28-s31}
        vmul.f32        s15, s4,  s15
        it              ge
        vmulge.f32      s24, s19, s24
        it              gt
        vldmdbgt        r2!, {s0-s3}
        it              ge
        vmulge.f32      s25, s18, s25
        vstmia          r0!, {s8-s13}
        it              ge
        vmulge.f32      s26, s17, s26
        it              gt
        vldmiagt        r1!, {s8-s11}
        itt             ge
        vmulge.f32      s27, s16, s27
        vmulge.f32      s28, s23, s28
        it              gt
        vldmdbgt        r2!, {s4-s7}
        it              ge
        vmulge.f32      s29, s22, s29
        vstmia          r0!, {s14-s15}
        ittt            ge
        vmulge.f32      s30, s21, s30
        vmulge.f32      s31, s20, s31
        vmulge.f32      s8,  s3,  s8
        it              gt
        vldmiagt        r1!, {s12-s15}
        itttt           ge
        vmulge.f32      s9,  s2,  s9
        vmulge.f32      s10, s1,  s10
        vstmiage        r0!, {s24-s27}
        vmulge.f32      s11, s0,  s11
        it              ge
        vstmiage        r0!, {s28-s31}
        bgt             1b

        vpop            {d8-d15}
        bx              lr
endfunc