/*
 * Copyright (c) 2011 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

/*
 * Target/syntax: 32-bit ARM with NEON, GNU as syntax, run through the C
 * preprocessor.  '@' starts a comment that extends to the end of the line,
 * so every instruction must sit on its own line.
 *
 * The directives function/endfunc/const/endconst and the helpers movrel,
 * movrelx and X() are not defined here; they come from the included asm.S.
 *
 * All data elements are 16-bit (.s16/.16 operations).  Every add/sub in the
 * butterflies is a halving one (vhadd/vhsub), i.e. each stage scales its
 * result by 1/2.  None of the code below touches d8-d15.
 */

#include "libavutil/arm/asm.S"

/*
 * bflies: butterfly step.
 *   d0, d1  in/out registers
 *   r0, r1  scratch registers (overwritten)
 * d1 is combined with a 32-bit-swapped copy of itself (halving sum and
 * difference), the result is rearranged into r0, and finally
 *   d1 = (d0 - r0) / 2,  d0 = (d0 + r0) / 2.
 * The t-names in the lane comments are the original author's notation.
 */
.macro  bflies          d0,  d1,  r0,  r1
        vrev64.32       \r0, \d1                @ t5, t6, t1, t2
        vhsub.s16       \r1, \d1, \r0           @ t1-t5, t2-t6, t5-t1, t6-t2
        vhadd.s16       \r0, \d1, \r0           @ t1+t5, t2+t6, t5+t1, t6+t2
        vext.16         \r1, \r1, \r1, #1       @ t2-t6, t5-t1, t6-t2, t1-t5
        vtrn.32         \r0, \r1                @ t1+t5, t2+t6, t2-t6, t5-t1
                                                @ t5,    t6,    t4,    t3
        vhsub.s16       \d1, \d0, \r0
        vhadd.s16       \d0, \d0, \r0
.endm

/*
 * transform01: coefficient multiply of one D register followed by bflies.
 *   q0, q1  registers handed to bflies as its in/out pair
 *   d3      in/out: replaced by (d3 * c0 + swap16(d3) * c1) >> 15, where
 *           swap16 exchanges the two 16-bit lanes of every 32-bit pair
 *   c0, c1  16-bit coefficient vectors
 *   r0      scratch D register
 *   w0, w1  Q registers: w0 holds the 32-bit products, then both become
 *           the bflies scratch pair
 * NOTE(review): the >> 15 suggests Q15 coefficients and a complex
 * multiplication by a twiddle factor - confirm against the C reference.
 */
.macro  transform01     q0,  q1,  d3,  c0,  c1,  r0,  w0,  w1
        vrev32.16       \r0, \d3
        vmull.s16       \w0, \d3, \c0
        vmlal.s16       \w0, \r0, \c1
        vshrn.s32       \d3, \w0, #15
        bflies          \q0, \q1, \w0, \w1
.endm

/*
 * transform2: same as transform01 but for two D registers at once.
 *   d1 is multiplied with the coefficient pair (c0, c1),
 *   d3 is multiplied with the coefficient pair (c2, c3),
 *   r0, r1 are scratch D registers, w0, w1 scratch Q registers,
 *   q0, q1 are handed to bflies.
 * The parameters d0 and d2 are accepted but not referenced by the body.
 */
.macro  transform2      d0,  d1,  d2,  d3,  q0,  q1,  c0,  c1,  c2,  c3, \
                        r0,  r1,  w0,  w1
        vrev32.16       \r0, \d1
        vrev32.16       \r1, \d3
        vmull.s16       \w0, \d1, \c0
        vmlal.s16       \w0, \r0, \c1
        vmull.s16       \w1, \d3, \c2
        vmlal.s16       \w1, \r1, \c3
        vshrn.s32       \d1, \w0, #15
        vshrn.s32       \d3, \w1, #15
        bflies          \q0, \q1, \w0, \w1
.endm

/*
 * fft4: in-register transform of the data held in d0/d1.
 *   d0, d1  in/out
 *   r0, r1  scratch (overwritten)
 * The output lane order is given by the comments on the last two
 * instructions.  The macro is also invoked with Q registers (see
 * fft16_neon), in which case it processes two data sets in parallel.
 */
.macro  fft4            d0,  d1,  r0,  r1
        vhsub.s16       \r0, \d0, \d1           @ t3, t4, t8, t7
        vhsub.s16       \r1, \d1, \d0
        vhadd.s16       \d0, \d0, \d1           @ t1, t2, t6, t5
        vmov.i64        \d1, #0xffff00000000    @ bit mask selecting 16-bit lane 2
        vbit            \r0, \r1, \d1           @ take the negated difference in that lane
        vrev64.16       \r1, \r0                @ t7, t8, t4, t3
        vtrn.32         \r0, \r1                @ t3, t4, t7, t8
        vtrn.32         \d0, \r0                @ t1, t2, t3, t4, t6, t5, t8, t7
        vhsub.s16       \d1, \d0, \r0           @ r2, i2, r3, i1
        vhadd.s16       \d0, \d0, \r0           @ r0, i0, r1, i3
.endm

/*
 * fft8: fft4 on d0/d1, sum/difference of d2/d3, then transform01.
 *   d0-d3   in/out data
 *   q0, q1  the Q registers passed on to transform01 (the callers in this
 *           file pass the Q registers that alias d0-d3)
 *   c0, c1  coefficient vectors for transform01
 *   r0, r1  scratch D registers, w0, w1 scratch Q registers
 */
.macro  fft8            d0,  d1,  d2,  d3,  q0,  q1,  c0,  c1,  r0,  r1, w0, w1
        fft4            \d0, \d1, \r0, \r1
        vtrn.32         \d0, \d1                @ z0, z2, z1, z3
        vhadd.s16       \r0, \d2, \d3           @ t1, t2, t3, t4
        vhsub.s16       \d3, \d2, \d3           @ z5, z7
        vmov            \d2, \r0
        transform01     \q0, \q1, \d3, \c0, \c1, \r0, \w0, \w1
.endm

/*
 * fft4_neon: in-place transform of the 16 bytes at r0.
 *   In:     r0 = data pointer (no alignment hint is used here)
 *   Clobb:  d0-d3
 */
function fft4_neon
        vld1.16         {d0-d1},  [r0]
        fft4            d0,  d1,  d2,  d3
        vst1.16         {d0-d1},  [r0]
        bx              lr
endfunc

/*
 * fft8_neon: in-place transform of the 32 bytes at r0.
 *   In:     r0 = data pointer, accessed with a 128-bit alignment hint
 *   Clobb:  r1, d0-d3, q8-q10 (d20/d21 as scratch), d30, d31
 */
function fft8_neon
        vld1.16         {d0-d3},  [r0,:128]
        movrel          r1,  coefs
        vld1.16         {d30},    [r1,:64]      @ first row of coefs
        vdup.16         d31, d30[0]             @ first entry of that row in all lanes
        fft8            d0,  d1,  d2,  d3,  q0,  q1,  d31, d30, d20, d21, q8, q9
        vtrn.32         d0,  d1
        vtrn.32         d2,  d3
        vst1.16         {d0-d3},  [r0,:128]
        bx              lr
endfunc

/*
 * fft16_neon: in-place transform of the 64 bytes at r0.
 *   In:     r0 = data pointer, accessed with a 128-bit alignment hint
 *   Clobb:  r1, q0-q3, q8-q10, d26-d31
 * r0 is advanced by the first load and store (post-increment) and is
 * rewound by 32 in between, so both halves are written back in place.
 */
function fft16_neon
        vld1.16         {d0-d3},  [r0,:128]!
        vld1.16         {d4-d7},  [r0,:128]
        movrel          r1,  coefs
        sub             r0,  r0,  #32           @ undo the post-increment of the first load
        vld1.16         {d28-d31},[r1,:128]     @ all four rows of coefs
        vdup.16         d31, d28[0]
        fft8            d0,  d1,  d2,  d3,  q0,  q1,  d31, d28, d20, d21, q8, q9
        vswp            d5,  d6
        fft4            q2,  q3,  q8,  q9       @ two fft4 in parallel on the upper half
        vswp            d5,  d6
        vtrn.32         q0,  q1             @ z0, z4, z2, z6, z1, z5, z3, z7
        vtrn.32         q2,  q3             @ z8, z12,z10,z14,z9, z13,z11,z15
        vswp            d1,  d2
        vdup.16         d31, d28[0]             @ d31 was overwritten as scratch above
        transform01     q0,  q2,  d5,  d31, d28, d20, q8, q9
        vdup.16         d26, d29[0]
        vdup.16         d27, d30[0]
        transform2      d2,  d6,  d3,  d7,  q1,  q3,  d26, d30, d27, d29, \
                        d20, d21, q8,  q9
        vtrn.32         q0,  q1
        vtrn.32         q2,  q3
        vst1.16         {d0-d3},  [r0,:128]!
        vst1.16         {d4-d7},  [r0,:128]
        bx              lr
endfunc

/*
 * fft_pass_neon: combining pass, reached by a tail branch from every
 * function generated by def_fft.
 *
 *   In:     r0 = base of the data block
 *           r1 = table pointer (def_fft passes the address of
 *                ff_cos_N_fixed); read forwards through r1 and
 *                backwards through r3
 *           r2 = loop count; 2 is subtracted per iteration and the loop
 *                continues while the result is > 0
 *   Clobb:  r0-r3, r12, flags, q0-q3, q8-q10, q12-q15
 *   Stack:  r4 and lr are pushed on entry; the return is pop {r4,pc}
 *
 * Each iteration loads 16 bytes from four places r12 = 8 * r2 bytes apart,
 * processes them, stores them back to the same places and advances r0 by 16.
 * The first iteration is peeled: it runs the code before label 1 (using
 * transform01 for the first half) and joins the loop body at label 2.
 */
function fft_pass_neon
        push            {r4,lr}
        movrel          lr,  coefs + 24         @ byte offset 24 = fourth row: 1, -1, -1, 1
        vld1.16         {d30},    [lr,:64]
        lsl             r12, r2,  #3            @ r12 = byte stride between the four loads
        vmov            d31, d30                @ q15 = sign pattern in both halves
        add             r3,  r1,  r2,  lsl #2
        mov             lr,  #-8                @ post-index step for the backward table reads
        sub             r3,  r3,  #2            @ r3 = r1 + 4*r2 - 2
        mov             r4,  r0
        vld1.16         {d27[]},  [r3,:16]      @ one halfword replicated to all lanes
        sub             r3,  r3,  #6            @ r3 = r1 + 4*r2 - 8
        vld1.16         {q0},     [r4,:128], r12
        vld1.16         {q1},     [r4,:128], r12
        vld1.16         {q2},     [r4,:128], r12
        vld1.16         {q3},     [r4,:128], r12
        vld1.16         {d28},    [r1,:64]!     @ next four table entries, ascending
        vld1.16         {d29},    [r3,:64], lr  @ four table entries, descending
        vswp            d1,  d2
        vswp            d5,  d6
        vtrn.32         d0,  d1
        vtrn.32         d4,  d5
        vdup.16         d25, d28[1]
        vmul.s16        d27, d27, d31           @ apply the 1, -1, -1, 1 signs
        transform01     q0,  q2,  d5,  d25, d27, d20, q8,  q9
        b               2f
1:
        mov             r4,  r0
        vdup.16         d26, d29[0]             @ taken from the d29 loaded in the previous iteration
        vld1.16         {q0},     [r4,:128], r12
        vld1.16         {q1},     [r4,:128], r12
        vld1.16         {q2},     [r4,:128], r12
        vld1.16         {q3},     [r4,:128], r12
        vld1.16         {d28},    [r1,:64]!
        vld1.16         {d29},    [r3,:64], lr
        vswp            d1,  d2
        vswp            d5,  d6
        vtrn.32         d0,  d1
        vtrn.32         d4,  d5
        vdup.16         d24, d28[0]
        vdup.16         d25, d28[1]
        vdup.16         d27, d29[3]
        vmul.s16        q13, q13, q15           @ apply the signs to d26/d27
        transform2      d0,  d4,  d1,  d5,  q0,  q2,  d24, d26, d25, d27, \
                        d16, d17, q9,  q10
2:
        vtrn.32         d2,  d3
        vtrn.32         d6,  d7
        vdup.16         d24, d28[2]
        vdup.16         d26, d29[2]
        vdup.16         d25, d28[3]
        vdup.16         d27, d29[1]
        vmul.s16        q13, q13, q15           @ apply the signs to d26/d27
        transform2      d2,  d6,  d3,  d7,  q1,  q3,  d24, d26, d25, d27, \
                        d16, d17, q9,  q10
        vtrn.32         d0,  d1
        vtrn.32         d2,  d3
        vtrn.32         d4,  d5
        vtrn.32         d6,  d7
        vswp            d1,  d2
        vswp            d5,  d6
        mov             r4,  r0
        vst1.16         {q0},     [r4,:128], r12
        vst1.16         {q1},     [r4,:128], r12
        vst1.16         {q2},     [r4,:128], r12
        vst1.16         {q3},     [r4,:128], r12
        add             r0,  r0,  #16
        subs            r2,  r2,  #2
        bgt             1b
        pop             {r4,pc}
endfunc

/*
 * Coefficients scaled by 2^15:
 *   F_SQRT1_2  = round(32768 * sqrt(1/2))
 *   F_COS_16_1 = round(32768 * cos(1 * pi / 8))
 *   F_COS_16_3 = round(32768 * cos(3 * pi / 8))
 */
#define F_SQRT1_2   23170
#define F_COS_16_1  30274
#define F_COS_16_3  12540

/*
 * Four rows of four 16-bit values (8 bytes per row), each row carrying the
 * sign pattern +, -, -, +.  fft8_neon reads the first row, fft16_neon all
 * four, fft_pass_neon only the last one (at byte offset 24).
 * The table is loaded with a :128 alignment hint, so it has to be 16-byte
 * aligned.  NOTE(review): align=4 is presumably a power-of-two exponent;
 * the const macro is defined in asm.S - confirm there.
 */
const   coefs, align=4
        .short          F_SQRT1_2, -F_SQRT1_2, -F_SQRT1_2,  F_SQRT1_2
        .short          F_COS_16_1,-F_COS_16_1,-F_COS_16_1, F_COS_16_1
        .short          F_COS_16_3,-F_COS_16_3,-F_COS_16_3, F_COS_16_3
        .short          1,         -1,         -1,          1
endconst

/*
 * def_fft n, n2, n4: emit the function fft<n>_neon.
 *   n   transform size, n2 = n/2 and n4 = n/4 in every use below
 * The generated function
 *   1. calls fft<n2>_neon on the block at r0,
 *   2. calls fft<n4>_neon on the block at r0 + n4*2*4 bytes,
 *   3. calls fft<n4>_neon on the block at r0 + n4*3*4 bytes,
 *   4. tail-branches to fft_pass_neon with r0 = original pointer,
 *      r1 = address of ff_cos_<n>_fixed and r2 = n4/2.
 * r4 keeps the data pointer alive across the calls; r4 and lr are restored
 * before the tail branch, so fft_pass_neon returns to the original caller.
 * The byte offsets imply 4 bytes per data element.
 */
.macro  def_fft n, n2, n4
function fft\n\()_neon
        push            {r4, lr}
        mov             r4,  r0
        bl              fft\n2\()_neon
        add             r0,  r4,  #\n4*2*4
        bl              fft\n4\()_neon
        add             r0,  r4,  #\n4*3*4
        bl              fft\n4\()_neon
        mov             r0,  r4
        pop             {r4, lr}
        movrelx         r1,  X(ff_cos_\n\()_fixed)
        mov             r2,  #\n4/2
        b               fft_pass_neon
endfunc
.endm

        def_fft    32,    16,     8
        def_fft    64,    32,    16
        def_fft   128,    64,    32
        def_fft   256,   128,    64
        def_fft   512,   256,   128
        def_fft  1024,   512,   256
        def_fft  2048,  1024,   512
        def_fft  4096,  2048,  1024
        def_fft  8192,  4096,  2048
        def_fft 16384,  8192,  4096
        def_fft 32768, 16384,  8192
        def_fft 65536, 32768, 16384

/*
 * ff_fft_fixed_calc_neon: exported entry point; dispatches on transform size.
 *   In:     r0 = pointer to a structure whose first 32-bit word selects the
 *                size (NOTE(review): presumably the nbits field of the FFT
 *                context, i.e. log2 of the size - confirm with the C side)
 *           r1 = data pointer, handed to the selected function in r0
 *   Clobb:  r2, r3 plus everything the selected function clobbers
 * The word minus 2 is the index into fft_fixed_tab_neon, so 2 selects
 * fft4_neon and 16 selects fft65536_neon.  No range check is performed;
 * any other value reads outside the table.  The selected function is
 * entered with bx, so it returns directly to the caller of this function.
 */
function ff_fft_fixed_calc_neon, export=1
        ldr             r2,  [r0]
        sub             r2,  r2,  #2            @ table starts at the 4-point transform
        movrel          r3,  fft_fixed_tab_neon
        ldr             r3,  [r3, r2, lsl #2]   @ 4 bytes per table entry
        mov             r0,  r1
        bx              r3
endfunc

/*
 * Dispatch table for ff_fft_fixed_calc_neon: one code address per size,
 * ordered from 4 to 65536 points.
 */
const   fft_fixed_tab_neon
        .word fft4_neon
        .word fft8_neon
        .word fft16_neon
        .word fft32_neon
        .word fft64_neon
        .word fft128_neon
        .word fft256_neon
        .word fft512_neon
        .word fft1024_neon
        .word fft2048_neon
        .word fft4096_neon
        .word fft8192_neon
        .word fft16384_neon
        .word fft32768_neon
        .word fft65536_neon
endconst