/*
 * Copyright (c) 2011 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of Libav.
 *
 * Libav is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * Libav is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with Libav; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

/*
 * The helper names used in this file (function, endfunc, X, preserve8) are
 * not defined here; presumably they are provided by asm.S.
 */
#include "asm.S"

        @ Presumably records that this object keeps the stack 8-byte aligned;
        @ both functions in this file push ten words (40 bytes) on entry.
        preserve8

/*
 * prerot dst, rt
 *
 * MDCT pre-rotation.  Folds the 16-bit input samples into complex values,
 * rotates each one by a (cos, sin) twiddle pair and scatters the 16-bit
 * complex results into the buffer at dst, using revtab entries as element
 * indices (4 bytes per element).
 *
 * Macro arguments:
 *   dst  register holding the output buffer base; only read
 *   rt   scratch register; set to revtab + n4 and walked downwards
 *   Both must be distinct from r2-r4, r6-r12 and lr.
 *
 * Registers on entry:
 *   r2   in      (16-bit samples)
 *   r4   revtab  (16-bit indices)
 *   r6   n       (mdct size)
 *   r7   tcos    (interleaved 16-bit cos, sin pairs)
 *
 * Each pass of the loop produces eight complex outputs: four for revtab
 * entries read upwards through r4 and four for entries read downwards
 * through rt.  r6 drops by 32 per pass and the loop repeats while it is
 * still positive, so the body always runs at least once.
 *
 * Every vector load carries a :128 qualifier, so in + n4, in + n3, tcos and
 * tcos + n4 are all asserted to be 16-byte aligned.
 *
 * Clobbers: r2-r4, r6-r12, lr, rt, q0-q3, q8-q13, flags.
 */
.macro  prerot          dst, rt
        lsr             r3,  r6,  #2            @ n4
        add             \rt, r4,  r6,  lsr #1   @ revtab + n4
        add             r9,  r3,  r3,  lsl #1   @ n3
        add             r8,  r7,  r6            @ tcos + n4
        add             r3,  r2,  r6,  lsr #1   @ in + n4
        add             r9,  r2,  r9,  lsl #1   @ in + n3
        sub             r8,  r8,  #16           @ last 4 twiddle pairs below tcos + n4
        sub             r10, r3,  #16           @ last 8 samples below in + n4
        sub             r11, r9,  #16           @ last 8 samples below in + n3
        mov             r12, #-16               @ stride for the downward streams
1:
        @ vld2.16 splits even-indexed and odd-indexed halfwords into the
        @ first and second register of each pair.
        vld2.16         {d0,d1},  [r9, :128]!   @ in + n3, upwards
        vld2.16         {d2,d3},  [r11,:128], r12 @ in + n3, downwards
        vld2.16         {d4,d5},  [r3, :128]!   @ in + n4, upwards
        vld2.16         {d6,d7},  [r10,:128], r12 @ in + n4, downwards
        vld2.16         {d16,d17},[r7, :128]!   @ cos, sin
        vld2.16         {d18,d19},[r8, :128], r12 @ cos, sin from the top end
        @ Reverse the lanes of everything fetched from a downward stream so
        @ that lane 0 holds the element closest to the stream start.
        vrev64.16       q1,  q1
        vrev64.16       q3,  q3
        vrev64.16       q9,  q9
        vneg.s16        d0,  d0
        vneg.s16        d2,  d2
        vneg.s16        d16, d16                @ -cos, ascending set
        vneg.s16        d18, d18                @ -cos, descending set
        @ Halving subtracts: each result is (a - b) >> 1.
        vhsub.s16       d0,  d0,  d3            @ re
        vhsub.s16       d4,  d7,  d4            @ im
        vhsub.s16       d6,  d6,  d5            @ re, descending set
        vhsub.s16       d2,  d2,  d1            @ im, descending set
        @ Complex multiply by (-cos, sin) with 32-bit accumulation.
        vmull.s16       q10, d0,  d16
        vmlsl.s16       q10, d4,  d17           @ q10 = re*(-cos) - im*sin
        vmull.s16       q11, d0,  d17
        vmlal.s16       q11, d4,  d16           @ q11 = re*sin + im*(-cos)
        vmull.s16       q12, d6,  d18
        vmlsl.s16       q12, d2,  d19           @ same for the descending set
        vmull.s16       q13, d6,  d19
        vmlal.s16       q13, d2,  d18
        @ Shift out 15 bits and narrow the products back to 16 bits.
        vshrn.s32       d0,  q10, #15
        vshrn.s32       d1,  q11, #15
        vshrn.s32       d2,  q12, #15
        vshrn.s32       d3,  q13, #15
        @ Interleave re and im so every 32-bit lane is one complex value.
        vzip.16         d0,  d1
        vzip.16         d2,  d3
        @ Scatter: next index upwards via r4, previous index downwards via
        @ rt; the target address is dst + 4 * index.
        ldrh            lr,  [r4], #2
        ldrh            r2,  [\rt, #-2]!
        add             lr,  \dst, lr,  lsl #2
        add             r2,  \dst, r2,  lsl #2
        vst1.32         {d0[0]},  [lr,:32]
        vst1.32         {d2[0]},  [r2,:32]
        ldrh            lr,  [r4], #2
        ldrh            r2,  [\rt, #-2]!
        add             lr,  \dst, lr,  lsl #2
        add             r2,  \dst, r2,  lsl #2
        vst1.32         {d0[1]},  [lr,:32]
        vst1.32         {d2[1]},  [r2,:32]
        ldrh            lr,  [r4], #2
        ldrh            r2,  [\rt, #-2]!
        add             lr,  \dst, lr,  lsl #2
        add             r2,  \dst, r2,  lsl #2
        vst1.32         {d1[0]},  [lr,:32]
        vst1.32         {d3[0]},  [r2,:32]
        ldrh            lr,  [r4], #2
        ldrh            r2,  [\rt, #-2]!
        add             lr,  \dst, lr,  lsl #2
        add             r2,  \dst, r2,  lsl #2
        vst1.32         {d1[1]},  [lr,:32]
        vst1.32         {d3[1]},  [r2,:32]
        subs            r6,  r6,  #32           @ 8 outputs done per 32 units of n
        bgt             1b
.endm

/*
 * ff_mdct_fixed_calc_neon
 *
 * Fixed-point MDCT with 16-bit output: pre-rotation (prerot macro) into the
 * output buffer, a call to ff_fft_fixed_calc_neon on that buffer, then a
 * post-rotation performed in place.
 *
 * In:   r0 = context; the words at offsets 8, 16 and 24 are read as
 *            revtab, mdct_size (n) and tcos
 *       r1 = output buffer of 16-bit complex values
 *       r2 = input samples (consumed by prerot)
 *       NOTE(review): presumably called from C as (context, out, in);
 *       confirm against the C prototype.
 * Out:  n/4 complex values written at r1.
 * Regs: r4-r11 are saved and restored.  r1-r3, r12, q0-q3 and q8-q13 are
 *       overwritten here, in addition to whatever the FFT routine clobbers.
 *
 * The ten-word push keeps the stack 8-byte aligned for the call; r1 is part
 * of it so that the output pointer can be reloaded after the FFT.
 */
function ff_mdct_fixed_calc_neon, export=1
        push            {r1,r4-r11,lr}

        ldr             r4,  [r0, #8]           @ revtab
        ldr             r6,  [r0, #16]          @ mdct_size; n
        ldr             r7,  [r0, #24]          @ tcos

        prerot          r1,  r5                 @ dst = out, r5 is scratch

        mov             r4,  r0                 @ keep the context across the call
        bl              X(ff_fft_fixed_calc_neon) @ r0 = context, r1 = out

        pop             {r5}                    @ r5 = out (the r1 pushed on entry)
        mov             r12, #-16               @ stride for the downward streams
        ldr             r6,  [r4, #16]          @ mdct_size; n
        ldr             r7,  [r4, #24]          @ tcos
        add             r5,  r5,  r6,  lsr #1   @ middle of the n/4 complex values
        add             r7,  r7,  r6,  lsr #1   @ middle of the twiddle table
        sub             r1,  r5,  #16           @ 4 values just below the middle
        sub             r2,  r7,  #16           @ 4 twiddle pairs just below the middle
        @ Post-rotation: walk outwards from the middle, four values upwards
        @ (r5, r7) and four downwards (r1, r2) per pass.
1:
        vld2.16         {d4,d5},  [r7,:128]!    @ cos, sin for the upper values
        vld2.16         {d6,d7},  [r2,:128], r12 @ cos, sin for the lower values
        vld2.16         {d0,d1},  [r5,:128]     @ upper re, im (pointer moves at the store)
        vld2.16         {d2,d3},  [r1,:128]     @ lower re, im (pointer moves at the store)
        vrev64.16       q3,  q3                 @ mirror the lower data so lane i
        vrev64.16       q1,  q1                 @ pairs with lane i of the upper data
        vneg.s16        q3,  q3                 @ -cos, -sin (lower)
        vneg.s16        q2,  q2                 @ -cos, -sin (upper)
        vmull.s16       q11, d2,  d6
        vmlal.s16       q11, d3,  d7            @ lower: -(re*cos + im*sin)
        vmull.s16       q8,  d0,  d5
        vmlsl.s16       q8,  d1,  d4            @ upper: im*cos - re*sin
        vmull.s16       q9,  d0,  d4
        vmlal.s16       q9,  d1,  d5            @ upper: -(re*cos + im*sin)
        vmull.s16       q10, d2,  d7
        vmlsl.s16       q10, d3,  d6            @ lower: im*cos - re*sin
        @ Shift out 15 bits and narrow to 16 bits.  The im results swap
        @ sides: d1 (from upper input) is stored with the lower re and
        @ d3 (from lower input) with the upper re.
        vshrn.s32       d0,  q11, #15
        vshrn.s32       d1,  q8,  #15
        vshrn.s32       d2,  q9,  #15
        vshrn.s32       d3,  q10, #15
        vrev64.16       q0,  q0                 @ lane order for the lower store
        vst2.16         {d2,d3},  [r5,:128]!    @ upper values, move up
        vst2.16         {d0,d1},  [r1,:128], r12 @ lower values, move down
        subs            r6,  r6,  #32           @ 8 values done per 32 units of n
        bgt             1b

        pop             {r4-r11,pc}
endfunc

/*
 * ff_mdct_fixed_calcw_neon
 *
 * Fixed-point MDCT with 32-bit output.  Pre-rotation and FFT run in the
 * 16-bit temporary buffer taken from the context; the post-rotation reads
 * that buffer and stores the un-narrowed 32-bit products to the output.
 *
 * In:   r0 = context; the words at offsets 8, 12, 16 and 24 are read as
 *            revtab, tmp_buf, mdct_size (n) and tcos
 *       r1 = output buffer of 32-bit complex values
 *       r2 = input samples (consumed by prerot)
 *       NOTE(review): presumably called from C as (context, out, in);
 *       confirm against the C prototype.
 * Out:  n/4 complex values of 8 bytes each written at r1.
 * Regs: r4-r11 are saved and restored.  r1-r3, r12, q0-q3 and q8-q13 are
 *       overwritten here, in addition to whatever the FFT routine clobbers.
 *
 * The ten-word push keeps the stack 8-byte aligned for the call; r1 is part
 * of it so that the output pointer can be reloaded after the FFT.
 */
function ff_mdct_fixed_calcw_neon, export=1
        push            {r1,r4-r11,lr}

        ldrd            r4,  r5,  [r0, #8]      @ revtab, tmp_buf
        ldr             r6,  [r0, #16]          @ mdct_size; n
        ldr             r7,  [r0, #24]          @ tcos

        prerot          r5,  r1                 @ dst = tmp_buf, r1 is scratch

        mov             r4,  r0                 @ keep the context across the call
        mov             r1,  r5                 @ FFT operates on tmp_buf
        bl              X(ff_fft_fixed_calc_neon)

        pop             {r7}                    @ r7 = out (the r1 pushed on entry)
        mov             r12, #-16               @ stride for the downward streams
        ldr             r6,  [r4, #16]          @ mdct_size; n
        ldr             r9,  [r4, #24]          @ tcos
        add             r5,  r5,  r6,  lsr #1   @ middle of tmp_buf (4-byte values)
        add             r7,  r7,  r6            @ middle of out (8-byte values)
        add             r9,  r9,  r6,  lsr #1   @ middle of the twiddle table
        sub             r3,  r5,  #16           @ 4 values just below the tmp_buf middle
        sub             r1,  r7,  #16           @ 2 values just below the out middle
        sub             r2,  r9,  #16           @ 4 twiddle pairs just below the middle
        @ Post-rotation: walk outwards from the middle, four values upwards
        @ and four downwards per pass.
1:
        vld2.16         {d4,d5},  [r9,:128]!    @ cos, sin for the upper values
        vld2.16         {d6,d7},  [r2,:128], r12 @ cos, sin for the lower values
        vld2.16         {d0,d1},  [r5,:128]!    @ upper re, im
        vld2.16         {d2,d3},  [r3,:128], r12 @ lower re, im
        vrev64.16       q3,  q3                 @ mirror the lower data so lane i
        vrev64.16       q1,  q1                 @ pairs with lane i of the upper data
        vneg.s16        q3,  q3                 @ -cos, -sin (lower)
        vneg.s16        q2,  q2                 @ -cos, -sin (upper)
        vmull.s16       q8,  d2,  d6
        vmlal.s16       q8,  d3,  d7            @ lower: -(re*cos + im*sin)
        vmull.s16       q9,  d0,  d5
        vmlsl.s16       q9,  d1,  d4            @ upper: im*cos - re*sin
        vmull.s16       q10, d0,  d4
        vmlal.s16       q10, d1,  d5            @ upper: -(re*cos + im*sin)
        vmull.s16       q11, d2,  d7
        vmlsl.s16       q11, d3,  d6            @ lower: im*cos - re*sin
        @ The im results swap sides: q9 (from upper input) is stored with the
        @ lower re and q11 (from lower input) with the upper re.  Products
        @ are kept at 32 bits, with no shift or narrowing.
        vrev64.32       q8,  q8                 @ swap lanes in each half for
        vrev64.32       q9,  q9                 @ the two downward stores
        vst2.32         {q10,q11},[r7,:128]!    @ 4 upper values (32 bytes), move up
        vst2.32         {d16,d18},[r1,:128], r12 @ 2 lower values, move down
        vst2.32         {d17,d19},[r1,:128], r12 @ next 2 lower values
        subs            r6,  r6,  #32           @ 8 values done per 32 units of n
        bgt             1b

        pop             {r4-r11,pc}
endfunc
