/*
 * Copyright (c) 2011 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/arm/asm.S"

/*
 * prerot dst, rt
 *
 * Pre-rotation stage shared by the two MDCT entry points in this file.
 * Each loop iteration reads 8 samples from each of four input streams
 * (ascending and descending around in + n4 and in + n3), combines them
 * with halving subtracts, multiplies the result by the twiddle table and
 * scatters eight 16-bit complex values (one 32-bit store each) to
 * dst + 4 * revtab[k].
 *
 * Macro arguments:
 *   dst  register holding the base of the destination buffer (only read)
 *   rt   scratch register, used as the descending revtab pointer
 *
 * Register inputs:
 *   r2   in      input samples, read as 16-bit elements
 *   r4   revtab  table of 16-bit indices, read with ldrh
 *   r6   n       transform size; 32 is subtracted per iteration, so n is
 *                expected to be a positive multiple of 32
 *   r7   tcos    twiddle table, deinterleaved by vld2.16 into (cos, sin)
 *
 * Clobbers: r2-r4, r6-r12, lr, rt, d0-d7, q8-q13, flags.
 *
 * Every vector load uses a :128 alignment qualifier, so in, tcos and the
 * derived pointers must be 16-byte aligned.
 */
.macro  prerot          dst, rt
        lsr             r3,  r6,  #2            @ n4
        add             \rt, r4,  r6,  lsr #1   @ revtab + n4
        add             r9,  r3,  r3,  lsl #1   @ n3
        add             r8,  r7,  r6            @ tcos + n4
        add             r3,  r2,  r6,  lsr #1   @ in + n4
        add             r9,  r2,  r9,  lsl #1   @ in + n3
        @ r8, r10 and r11 start one 16-byte vector below their base and
        @ walk downwards with the post-index stride held in r12
        sub             r8,  r8,  #16
        sub             r10, r3,  #16
        sub             r11, r9,  #16
        mov             r12, #-16
1:
        vld2.16         {d0,d1},  [r9, :128]!
        vld2.16         {d2,d3},  [r11,:128], r12
        vld2.16         {d4,d5},  [r3, :128]!
        vld2.16         {d6,d7},  [r10,:128], r12
        vld2.16         {d16,d17},[r7, :128]!   @ cos, sin
        vld2.16         {d18,d19},[r8, :128], r12
        @ vectors fetched through descending pointers are element-reversed
        vrev64.16       q1,  q1
        vrev64.16       q3,  q3
        vrev64.16       q9,  q9
        vneg.s16        d0,  d0
        vneg.s16        d2,  d2
        vneg.s16        d16, d16
        vneg.s16        d18, d18
        vhsub.s16       d0,  d0,  d3            @ re
        vhsub.s16       d4,  d7,  d4            @ im
        vhsub.s16       d6,  d6,  d5
        vhsub.s16       d2,  d2,  d1
        @ two complex multiplies by the twiddles, widened to 32 bits
        vmull.s16       q10, d0,  d16
        vmlsl.s16       q10, d4,  d17
        vmull.s16       q11, d0,  d17
        vmlal.s16       q11, d4,  d16
        vmull.s16       q12, d6,  d18
        vmlsl.s16       q12, d2,  d19
        vmull.s16       q13, d6,  d19
        vmlal.s16       q13, d2,  d18
        @ narrow back to 16 bits, dropping 15 fraction bits
        vshrn.s32       d0,  q10, #15
        vshrn.s32       d1,  q11, #15
        vshrn.s32       d2,  q12, #15
        vshrn.s32       d3,  q13, #15
        @ interleave so that each 32-bit lane holds one complex value
        vzip.16         d0,  d1
        vzip.16         d2,  d3
        @ scatter: r4 walks revtab upwards (post-increment), rt walks it
        @ downwards (pre-decrement); each index is scaled by 4 bytes
        ldrh            lr,  [r4], #2
        ldrh            r2,  [\rt, #-2]!
        add             lr,  \dst, lr,  lsl #2
        add             r2,  \dst, r2,  lsl #2
        vst1.32         {d0[0]},  [lr,:32]
        vst1.32         {d2[0]},  [r2,:32]
        ldrh            lr,  [r4], #2
        ldrh            r2,  [\rt, #-2]!
        add             lr,  \dst, lr,  lsl #2
        add             r2,  \dst, r2,  lsl #2
        vst1.32         {d0[1]},  [lr,:32]
        vst1.32         {d2[1]},  [r2,:32]
        ldrh            lr,  [r4], #2
        ldrh            r2,  [\rt, #-2]!
        add             lr,  \dst, lr,  lsl #2
        add             r2,  \dst, r2,  lsl #2
        vst1.32         {d1[0]},  [lr,:32]
        vst1.32         {d3[0]},  [r2,:32]
        ldrh            lr,  [r4], #2
        ldrh            r2,  [\rt, #-2]!
        add             lr,  \dst, lr,  lsl #2
        add             r2,  \dst, r2,  lsl #2
        vst1.32         {d1[1]},  [lr,:32]
        vst1.32         {d3[1]},  [r2,:32]
        subs            r6,  r6,  #32
        bgt             1b
.endm

/*
 * ff_mdct_fixed_calc_neon
 *
 * Fixed-point MDCT with 16-bit output: pre-rotation (prerot macro),
 * a call to ff_fft_fixed_calc_neon, then an in-place post-rotation.
 *
 * Register arguments as used below:
 *   r0   context; revtab, mdct_size and tcos are read from byte offsets
 *        8, 16 and 24 (presumably an FFTContext, confirm against the C
 *        prototype)
 *   r1   output buffer; receives the pre-rotated data, is handed to the
 *        FFT and is post-rotated in place
 *   r2   input samples (consumed by prerot)
 *
 * The output buffer and tcos are accessed with :128 alignment qualifiers
 * and must be 16-byte aligned. mdct_size is expected to be a positive
 * multiple of 32, since both loops subtract 32 per iteration.
 */
function ff_mdct_fixed_calc_neon, export=1
        @ r1 is pushed as well so the output pointer can be popped back
        @ after the FFT call; ten words keep the stack 8-byte aligned
        push            {r1,r4-r11,lr}

        ldr             r4,  [r0, #8]           @ revtab
        ldr             r6,  [r0, #16]          @ mdct_size; n
        ldr             r7,  [r0, #24]          @ tcos

        prerot          r1,  r5

        @ keep the context in r4, which is expected to survive the call
        mov             r4,  r0
        bl              X(ff_fft_fixed_calc_neon)

        pop             {r5}                    @ output pointer saved at entry
        mov             r12, #-16
        ldr             r6,  [r4, #16]          @ mdct_size; n
        ldr             r7,  [r4, #24]          @ tcos
        @ start at the middle of the buffer and of the twiddle table:
        @ r5/r7 walk upwards, r1/r2 walk downwards
        add             r5,  r5,  r6,  lsr #1
        add             r7,  r7,  r6,  lsr #1
        sub             r1,  r5,  #16
        sub             r2,  r7,  #16
1:
        vld2.16         {d4,d5},  [r7,:128]!
        vld2.16         {d6,d7},  [r2,:128], r12
        @ data is loaded without writeback; the stores below advance r5/r1
        vld2.16         {d0,d1},  [r5,:128]
        vld2.16         {d2,d3},  [r1,:128]
        vrev64.16       q3,  q3
        vrev64.16       q1,  q1
        vneg.s16        q3,  q3
        vneg.s16        q2,  q2
        vmull.s16       q11, d2,  d6
        vmlal.s16       q11, d3,  d7
        vmull.s16       q8,  d0,  d5
        vmlsl.s16       q8,  d1,  d4
        vmull.s16       q9,  d0,  d4
        vmlal.s16       q9,  d1,  d5
        vmull.s16       q10, d2,  d7
        vmlsl.s16       q10, d3,  d6
        @ narrow the 32-bit products to 16 bits, dropping 15 fraction bits
        vshrn.s32       d0,  q11, #15
        vshrn.s32       d1,  q8,  #15
        vshrn.s32       d2,  q9,  #15
        vshrn.s32       d3,  q10, #15
        vrev64.16       q0,  q0
        vst2.16         {d2,d3},  [r5,:128]!
        vst2.16         {d0,d1},  [r1,:128], r12
        subs            r6,  r6,  #32
        bgt             1b

        pop             {r4-r11,pc}
endfunc

/*
 * ff_mdct_fixed_calcw_neon
 *
 * Variant of the fixed-point MDCT whose post-rotation stores the full
 * 32-bit products (vst2.32, no narrowing shift). Pre-rotation and the
 * FFT run in the context scratch buffer; only the post-rotation writes
 * to the caller buffer.
 *
 * Register arguments as used below:
 *   r0   context; revtab, tmp_buf, mdct_size and tcos are read from
 *        byte offsets 8, 12, 16 and 24 (presumably an FFTContext,
 *        confirm against the C prototype)
 *   r1   output buffer for the 32-bit results
 *   r2   input samples (consumed by prerot)
 *
 * tmp_buf, tcos and the output buffer are accessed with :128 alignment
 * qualifiers and must be 16-byte aligned. mdct_size is expected to be a
 * positive multiple of 32, since both loops subtract 32 per iteration.
 */
function ff_mdct_fixed_calcw_neon, export=1
        @ r1 is pushed as well so the output pointer can be popped back
        @ after the FFT call; ten words keep the stack 8-byte aligned
        push            {r1,r4-r11,lr}

        ldrd            r4,  r5,  [r0, #8]      @ revtab, tmp_buf
        ldr             r6,  [r0, #16]          @ mdct_size; n
        ldr             r7,  [r0, #24]          @ tcos

        @ r1 is free to serve as the prerot scratch register here because
        @ its value was saved by the push above
        prerot          r5,  r1

        @ r4 (context) and r5 (tmp_buf) are expected to survive the call
        mov             r4,  r0
        mov             r1,  r5
        bl              X(ff_fft_fixed_calc_neon)

        pop             {r7}                    @ output pointer saved at entry
        mov             r12, #-16
        ldr             r6,  [r4, #16]          @ mdct_size; n
        ldr             r9,  [r4, #24]          @ tcos
        @ start at the middle of each buffer; the output offset is n
        @ bytes, twice that of tmp_buf, as its elements are 32 bits wide
        add             r5,  r5,  r6,  lsr #1
        add             r7,  r7,  r6
        add             r9,  r9,  r6,  lsr #1
        sub             r3,  r5,  #16
        sub             r1,  r7,  #16
        sub             r2,  r9,  #16
1:
        vld2.16         {d4,d5},  [r9,:128]!
        vld2.16         {d6,d7},  [r2,:128], r12
        vld2.16         {d0,d1},  [r5,:128]!
        vld2.16         {d2,d3},  [r3,:128], r12
        vrev64.16       q3,  q3
        vrev64.16       q1,  q1
        vneg.s16        q3,  q3
        vneg.s16        q2,  q2
        vmull.s16       q8,  d2,  d6
        vmlal.s16       q8,  d3,  d7
        vmull.s16       q9,  d0,  d5
        vmlsl.s16       q9,  d1,  d4
        vmull.s16       q10, d0,  d4
        vmlal.s16       q10, d1,  d5
        vmull.s16       q11, d2,  d7
        vmlsl.s16       q11, d3,  d6
        vrev64.32       q8,  q8
        vrev64.32       q9,  q9
        @ 32 bytes go upwards through r7, then two 16-byte stores go
        @ downwards through r1
        vst2.32         {q10,q11},[r7,:128]!
        vst2.32         {d16,d18},[r1,:128], r12
        vst2.32         {d17,d19},[r1,:128], r12
        subs            r6,  r6,  #32
        bgt             1b

        pop             {r4-r11,pc}
endfunc
