/*
 * Copyright (c) 2013 RISC OS Open Ltd
 * Author: Ben Avison <bavison@riscosopen.org>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/arm/asm.S"

@ In-place power-of-two FFTs (4/8/16 points) for classic ARM VFP.
@
@ Data layout: a1 points to an array of complex floats stored as
@ interleaved {re, im} pairs, so element z[n] lives at offset n*2*4
@ bytes — which is why every load/store offset below is written
@ "#n * 2*4".  A vldr/vstr of a d-register moves one whole complex
@ element (re into the even s-register, im into the odd one).
@
@ fft8_vfp and ff_fft16_vfp run with FPSCR = 0x03030000 ("RunFast"
@ mode, vector LEN=4, STRIDE=1).  In VFP short-vector mode an
@ arithmetic instruction whose operands lie in banks 1-3 (s8-s31)
@ operates on 4 consecutive registers at once, while bank 0 (s0-s7)
@ always behaves as scalars — hence the "@ vector op" and
@ "@ vector * scalar op" annotations.  The caller's FPSCR is saved
@ in a2 and restored before returning.
@
@ The "@ stall" comments mark cycles the author expects the VFP
@ pipeline to idle on result latency; they document the schedule,
@ they do not emit anything.

@ TODO: * FFTs wider than 16
@       * dispatch code

@ void fft4_vfp(FFTComplex *z)  [a1 = z]
@ In-place 4-point FFT, plain scalar VFP (no FPSCR trickery needed).
@ Uses only caller-saved s0-s15; a1 is left unchanged; returns via lr.
function fft4_vfp
        vldr    d0, [a1, #0*2*4]   @ s0,s1   = z[0]
        vldr    d4, [a1, #1*2*4]   @ s8,s9   = z[1]
        vldr    d1, [a1, #2*2*4]   @ s2,s3   = z[2]
        vldr    d5, [a1, #3*2*4]   @ s10,s11 = z[3]
        @ stall
        vadd.f  s12, s0, s8        @ i0 = z[0].re + z[1].re
        vadd.f  s13, s1, s9        @ i1 = z[0].im + z[1].im
        vadd.f  s14, s2, s10       @ i2 = z[2].re + z[3].re
        vadd.f  s15, s3, s11       @ i3 = z[2].im + z[3].im
        vsub.f  s8, s0, s8         @ i4 = z[0].re - z[1].re
        vsub.f  s9, s1, s9         @ i5 = z[0].im - z[1].im
        vsub.f  s10, s2, s10       @ i6 = z[2].re - z[3].re
        vsub.f  s11, s3, s11       @ i7 = z[2].im - z[3].im
        @ stall
        @ stall
        vadd.f  s0, s12, s14       @ z[0].re = i0 + i2
        vsub.f  s4, s12, s14       @ z[2].re = i0 - i2
        vadd.f  s1, s13, s15       @ z[0].im = i1 + i3
        vsub.f  s5, s13, s15       @ z[2].im = i1 - i3
        vadd.f  s7, s9, s10        @ z[3].im = i5 + i6
        vsub.f  s3, s9, s10        @ z[1].im = i5 - i6
        vadd.f  s2, s8, s11        @ z[1].re = i4 + i7
        vsub.f  s6, s8, s11        @ z[3].re = i4 - i7
        @ stall
        @ stall
        vstr    d0, [a1, #0*2*4]   @ store z[0]
        vstr    d2, [a1, #2*2*4]   @ store z[2] (s4,s5)
        @ stall
        @ stall
        vstr    d1, [a1, #1*2*4]   @ store z[1] (s2,s3)
        vstr    d3, [a1, #3*2*4]   @ store z[3] (s6,s7)

        bx      lr
endfunc

@ Body of an in-place 8-point FFT on z = [a1], split into head + tail
@ so that ff_fft16_vfp can interleave its own loads for z[8..15] into
@ the gap between them (hiding load latency behind the final stores).
@ Must execute with FPSCR vector mode enabled (LEN=4, STRIDE=1) and
@ s16-s31 saved by the caller of the macro; clobbers s0-s31.
@ The "transfer ... via memory" store/load pairs move values between
@ register banks because vmov cannot do a 4-wide bank-to-bank copy.
.macro macro_fft8_head
        @ FFT4
        vldr    d4, [a1, #0 * 2*4]
        vldr    d6, [a1, #1 * 2*4]
        vldr    d5, [a1, #2 * 2*4]
        vldr    d7, [a1, #3 * 2*4]
        @ BF
        vldr    d12, [a1, #4 * 2*4]
        vadd.f  s16, s8, s12    @ vector op
        vldr    d14, [a1, #5 * 2*4]
        vldr    d13, [a1, #6 * 2*4]
        vldr    d15, [a1, #7 * 2*4]
        vsub.f  s20, s8, s12    @ vector op
        vadd.f  s0, s16, s18
        vsub.f  s2, s16, s18
        vadd.f  s1, s17, s19
        vsub.f  s3, s17, s19
        vadd.f  s7, s21, s22
        vsub.f  s5, s21, s22
        vadd.f  s4, s20, s23
        vsub.f  s6, s20, s23
        vsub.f  s20, s24, s28   @ vector op
        vstr    d0, [a1, #0 * 2*4]    @ transfer s0-s7 to s24-s31 via memory
        vstr    d1, [a1, #1 * 2*4]
        vldr    s0, cos1pi4
        vadd.f  s16, s24, s28   @ vector op
        vstr    d2, [a1, #2 * 2*4]
        vstr    d3, [a1, #3 * 2*4]
        vldr    d12, [a1, #0 * 2*4]
        @ TRANSFORM
        vmul.f  s20, s20, s0    @ vector x scalar op (x cos1pi4)
        vldr    d13, [a1, #1 * 2*4]
        vldr    d14, [a1, #2 * 2*4]
        vldr    d15, [a1, #3 * 2*4]
        @ BUTTERFLIES
        vadd.f  s0, s18, s16
        vadd.f  s1, s17, s19
        vsub.f  s2, s17, s19
        vsub.f  s3, s18, s16
        vadd.f  s4, s21, s20
        vsub.f  s5, s21, s20
        vadd.f  s6, s22, s23
        vsub.f  s7, s22, s23
        vadd.f  s8, s0, s24     @ vector op
        vstr    d0, [a1, #0 * 2*4]    @ transfer s0-s3 to s12-s15 via memory
        vstr    d1, [a1, #1 * 2*4]
        vldr    d6, [a1, #0 * 2*4]
        vldr    d7, [a1, #1 * 2*4]
        vadd.f  s1, s5, s6
        vadd.f  s0, s7, s4
        vsub.f  s2, s5, s6
        vsub.f  s3, s7, s4
        vsub.f  s12, s24, s12   @ vector op
        vsub.f  s5, s29, s1
        vsub.f  s4, s28, s0
        vsub.f  s6, s30, s2
        vsub.f  s7, s31, s3
        vadd.f  s16, s0, s28    @ vector op
        vstr    d6, [a1, #4 * 2*4]
        vstr    d7, [a1, #6 * 2*4]
        vstr    d4, [a1, #0 * 2*4]
        vstr    d5, [a1, #2 * 2*4]
        vstr    d2, [a1, #5 * 2*4]
        vstr    d3, [a1, #7 * 2*4]
.endm

@ Final two stores of the 8-point FFT: z[1] and z[3] from s16-s19,
@ which macro_fft8_head left live across the gap.
.macro macro_fft8_tail
        vstr    d8, [a1, #1 * 2*4]
        vstr    d9, [a1, #3 * 2*4]
.endm

@ void fft8_vfp(FFTComplex *z)  [a1 = z]
@ In-place 8-point FFT.  Switches FPSCR into short-vector mode,
@ saves the AAPCS callee-saved VFP registers s16-s31, runs the
@ head+tail macro pair back to back, then restores both.
@ Clobbers a3 and s0-s15; a2 holds the caller's FPSCR meanwhile.
function fft8_vfp
        ldr     a3, =0x03030000     @ RunFast mode, vector length 4, stride 1
        fmrx    a2, FPSCR
        fmxr    FPSCR, a3
        vpush   {s16-s31}

        macro_fft8_head
        macro_fft8_tail

        vpop    {s16-s31}
        fmxr    FPSCR, a2
        bx      lr
endfunc

@ Twiddle-factor constants.  .align 3 gives 8-byte alignment so the
@ adjacent cos1pi4/cos1pi8 pair can be fetched with one vldr of a
@ d-register (see "vldr d1, cos1pi4" in ff_fft16_vfp).
.align 3
cos1pi4:    @ cos(1*pi/4) = sqrt(2)/2
        .float  0.707106769084930419921875
cos1pi8:    @ cos(1*pi/8) = sqrt(2+sqrt(2))/2
        .float  0.92387950420379638671875
cos3pi8:    @ cos(3*pi/8) = sqrt(2-sqrt(2))/2
        .float  0.3826834261417388916015625

@ void ff_fft16_vfp(FFTComplex *z)  [a1 = z]  (exported)
@ In-place 16-point FFT: an 8-point FFT on z[0..7] (via the macros,
@ with the z[8..15] loads for the two sub-FFT4s interleaved into the
@ macro gap), 4-point FFTs on z[8..11] and z[12..15], then the
@ radix-split TRANSFORM butterflies combining all three, using the
@ cos1pi4/cos1pi8/cos3pi8 twiddles.  Same FPSCR/s16-s31 save/restore
@ protocol and clobbers as fft8_vfp.
function ff_fft16_vfp
        ldr     a3, =0x03030000     @ RunFast mode, vector length 4, stride 1
        fmrx    a2, FPSCR
        fmxr    FPSCR, a3
        vpush   {s16-s31}

        macro_fft8_head
        @ FFT4(z+8)
        vldr    d10, [a1, #8 * 2*4]
        vldr    d12, [a1, #9 * 2*4]
        vldr    d11, [a1, #10 * 2*4]
        vldr    d13, [a1, #11 * 2*4]
        macro_fft8_tail
        vadd.f  s16, s20, s24   @ vector op
        @ FFT4(z+12)
        vldr    d4, [a1, #12 * 2*4]
        vldr    d6, [a1, #13 * 2*4]
        vldr    d5, [a1, #14 * 2*4]
        vsub.f  s20, s20, s24   @ vector op
        vldr    d7, [a1, #15 * 2*4]
        vadd.f  s0, s16, s18
        vsub.f  s4, s16, s18
        vadd.f  s1, s17, s19
        vsub.f  s5, s17, s19
        vadd.f  s7, s21, s22
        vsub.f  s3, s21, s22
        vadd.f  s2, s20, s23
        vsub.f  s6, s20, s23
        vadd.f  s16, s8, s12    @ vector op
        vstr    d0, [a1, #8 * 2*4]
        vstr    d2, [a1, #10 * 2*4]
        vstr    d1, [a1, #9 * 2*4]
        vsub.f  s20, s8, s12    @ vector op
        vstr    d3, [a1, #11 * 2*4]
        @ TRANSFORM(z[2],z[6],z[10],z[14],cos1pi4,cos1pi4)
        vldr    d12, [a1, #10 * 2*4]
        vadd.f  s0, s16, s18
        vadd.f  s1, s17, s19
        vsub.f  s6, s16, s18
        vsub.f  s7, s17, s19
        vsub.f  s3, s21, s22
        vadd.f  s2, s20, s23
        vadd.f  s5, s21, s22
        vsub.f  s4, s20, s23
        vstr    d0, [a1, #12 * 2*4]
        vmov    s0, s6          @ stash s6 in scalar bank before it is overwritten
        @ TRANSFORM(z[1],z[5],z[9],z[13],cos1pi8,cos3pi8)
        vldr    d6, [a1, #9 * 2*4]
        vstr    d1, [a1, #13 * 2*4]
        vldr    d1, cos1pi4     @ s2 = cos1pi4, s3 = cos1pi8 (8-byte pair load)
        vstr    d2, [a1, #15 * 2*4]
        vldr    d7, [a1, #13 * 2*4]
        vadd.f  s4, s25, s24
        vsub.f  s5, s25, s24
        vsub.f  s6, s0, s7
        vadd.f  s7, s0, s7
        vmul.f  s20, s12, s3    @ vector * scalar op (x cos1pi8)
        @ TRANSFORM(z[3],z[7],z[11],z[15],cos3pi8,cos1pi8)
        vldr    d4, [a1, #11 * 2*4]
        vldr    d5, [a1, #15 * 2*4]
        vldr    s1, cos3pi8
        vmul.f  s24, s4, s2     @ vector * scalar op (x cos1pi4)
        vmul.f  s28, s12, s1    @ vector * scalar op (x cos3pi8)
        vmul.f  s12, s8, s1     @ vector * scalar op (x cos3pi8)
        vadd.f  s4, s20, s29
        vsub.f  s5, s21, s28
        vsub.f  s6, s22, s31
        vadd.f  s7, s23, s30
        vmul.f  s8, s8, s3      @ vector * scalar op (x cos1pi8)
        vldr    d8, [a1, #1 * 2*4]
        vldr    d9, [a1, #5 * 2*4]
        vldr    d10, [a1, #3 * 2*4]
        vldr    d11, [a1, #7 * 2*4]
        vldr    d14, [a1, #2 * 2*4]
        vadd.f  s0, s6, s4
        vadd.f  s1, s5, s7
        vsub.f  s2, s5, s7
        vsub.f  s3, s6, s4
        vadd.f  s4, s12, s9
        vsub.f  s5, s13, s8
        vsub.f  s6, s14, s11
        vadd.f  s7, s15, s10
        vadd.f  s12, s0, s16    @ vector op
        vstr    d0, [a1, #1 * 2*4]    @ scratch: round-trip s0-s3 through z[1],z[5]
        vstr    d1, [a1, #5 * 2*4]
        vldr    d4, [a1, #1 * 2*4]
        vldr    d5, [a1, #5 * 2*4]
        vadd.f  s0, s6, s4
        vadd.f  s1, s5, s7
        vsub.f  s2, s5, s7
        vsub.f  s3, s6, s4
        vsub.f  s8, s16, s8     @ vector op
        vstr    d6, [a1, #1 * 2*4]    @ z[1] final
        vstr    d7, [a1, #5 * 2*4]    @ z[5] final
        vldr    d15, [a1, #6 * 2*4]
        vsub.f  s4, s20, s0
        vsub.f  s5, s21, s1
        vsub.f  s6, s22, s2
        vsub.f  s7, s23, s3
        vadd.f  s20, s0, s20    @ vector op
        vstr    d4, [a1, #9 * 2*4]
        @ TRANSFORM_ZERO(z[0],z[4],z[8],z[12])
        vldr    d6, [a1, #8 * 2*4]
        vstr    d5, [a1, #13 * 2*4]
        vldr    d7, [a1, #12 * 2*4]
        vstr    d2, [a1, #11 * 2*4]
        vldr    d8, [a1, #0 * 2*4]
        vstr    d3, [a1, #15 * 2*4]
        vldr    d9, [a1, #4 * 2*4]
        vadd.f  s0, s26, s24
        vadd.f  s1, s25, s27
        vsub.f  s2, s25, s27
        vsub.f  s3, s26, s24
        vadd.f  s4, s14, s12
        vadd.f  s5, s13, s15
        vsub.f  s6, s13, s15
        vsub.f  s7, s14, s12
        vadd.f  s8, s0, s28     @ vector op
        vstr    d0, [a1, #3 * 2*4]    @ scratch: round-trip s0-s3 through z[3],z[7]
        vstr    d1, [a1, #7 * 2*4]
        vldr    d6, [a1, #3 * 2*4]
        vldr    d7, [a1, #7 * 2*4]
        vsub.f  s0, s16, s4
        vsub.f  s1, s17, s5
        vsub.f  s2, s18, s6
        vsub.f  s3, s19, s7
        vsub.f  s12, s28, s12   @ vector op
        vadd.f  s16, s4, s16    @ vector op
        vstr    d10, [a1, #3 * 2*4]   @ z[3] final
        vstr    d11, [a1, #7 * 2*4]   @ z[7] final
        vstr    d4, [a1, #2 * 2*4]
        vstr    d5, [a1, #6 * 2*4]
        vstr    d0, [a1, #8 * 2*4]
        vstr    d1, [a1, #12 * 2*4]
        vstr    d6, [a1, #10 * 2*4]
        vstr    d7, [a1, #14 * 2*4]
        vstr    d8, [a1, #0 * 2*4]
        vstr    d9, [a1, #4 * 2*4]

        vpop    {s16-s31}
        fmxr    FPSCR, a2
        bx      lr
endfunc