/*
 * Copyright (c) 2014 Janne Grunau <janne-libav@jannau.net>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/aarch64/asm.S"

#include "asm-offsets.h"

// Register conventions shared by the internal (non-exported) routines below.
// They are established by fft_b15_calc_neon and are NOT AAPCS64 argument
// registers:
//   x10       context pointer (base for the CELT_* offsets)
//   x11-x14   addresses of shuffle_0213 / _1032 / _2301 / _3120
//   v8-v13    de-interleaved re/im halves of exptab[0] entries 1-4, 6-9, 11-14
//   v14       exptab[0] entries 5 and 10, interleaved
//   v15       the four floats of fact5

// Emit a 16-byte constant named shuffle_<a><b><c><d>: a tbl index vector that
// picks the 32-bit lanes a, b, c, d (in that order) of a 128-bit source.
.macro shuffle a, b, c, d
const shuffle_\a\b\c\d align=4
        .byte (\a * 4), (\a * 4 + 1), (\a * 4 + 2), (\a * 4 + 3)
        .byte (\b * 4), (\b * 4 + 1), (\b * 4 + 2), (\b * 4 + 3)
        .byte (\c * 4), (\c * 4 + 1), (\c * 4 + 2), (\c * 4 + 3)
        .byte (\d * 4), (\d * 4 + 1), (\d * 4 + 2), (\d * 4 + 3)
endconst
.endm

shuffle 0, 2, 1, 3
shuffle 1, 0, 3, 2
shuffle 2, 3, 0, 1
shuffle 3, 1, 2, 0


// 5-point FFT kernel (internal helper of fft15_neon).
// In:    x1  = address of the first input element (re/im float pair)
//        x2  = distance between inputs in 8-byte elements (scaled by 8 here)
//        x11-x14, v15 as described at the top of the file
// Out:   v0.2s = out[0]
//        v6.4s / v7.4s = values accumulated from the "real" / "imag" tbl
//        results; fft15_neon combines them into the remaining outputs
// Clobb: x1, x2, v16-v31
function fft5_neon
        lsl             x2,  x2,  #3
        ld1             {v24.2s},         [x1], x2
        ld2             {v25.s,v26.s}[0], [x1], x2
        ld2             {v25.s,v26.s}[1], [x1], x2
        ld2             {v25.s,v26.s}[2], [x1], x2
        ld2             {v25.s,v26.s}[3], [x1]
        dup             v6.4s,  v24.s[0]
        dup             v7.4s,  v24.s[1]

        faddp           v0.4s,  v25.4s, v26.4s
        // z[][0], z[][3]
        fmul            v16.4s, v25.4s, v15.s[0]        // rr
        fmul            v17.4s, v25.4s, v15.s[1]        // ri
        fmul            v18.4s, v26.4s, v15.s[0]        // ir
        fmul            v19.4s, v26.4s, v15.s[1]        // ii
        faddp           v0.4s,  v0.4s,  v0.4s
        // z[][1], z[][2]
        fmul            v20.4s, v25.4s, v15.s[2]        // rr
        fmul            v21.4s, v25.4s, v15.s[3]        // ri
        fmul            v22.4s, v26.4s, v15.s[2]        // ir
        fmul            v23.4s, v26.4s, v15.s[3]        // ii
        fadd            v0.2s,  v24.2s, v0.2s           // out[0]

        // z[0123][0], z[0123][3]
        fsub            v24.4s, v16.4s, v19.4s          //    (c).re =  rr - ii;
        fadd            v27.4s, v16.4s, v19.4s          //    (d).re =  rr + ii;
        ld1             {v16.16b}, [x11]
        ld1             {v19.16b}, [x14]
        fadd            v28.4s, v17.4s, v18.4s          //    (c).im =  ri + ir;
        fsub            v31.4s, v18.4s, v17.4s          //    (d).im = -ri + ir;
        ld1             {v17.16b}, [x12]
        // z[0123][1], z[0123][2]
        fsub            v25.4s, v20.4s, v23.4s          //    (c).re =  rr - ii;
        fadd            v26.4s, v20.4s, v23.4s          //    (d).re =  rr + ii;
        ld1             {v18.16b}, [x13]
        fadd            v29.4s, v21.4s, v22.4s          //    (c).im =  ri + ir;
        fsub            v30.4s, v22.4s, v21.4s          //    (d).im = -ri + ir;

        //real
        tbl             v20.16b, {v24.16b}, v16.16b
        tbl             v21.16b, {v25.16b}, v17.16b
        tbl             v22.16b, {v26.16b}, v18.16b
        tbl             v23.16b, {v27.16b}, v19.16b
        //imag
        tbl             v16.16b, {v28.16b}, v16.16b
        tbl             v17.16b, {v29.16b}, v17.16b
        tbl             v18.16b, {v30.16b}, v18.16b
        tbl             v19.16b, {v31.16b}, v19.16b

        fadd            v6.4s,  v6.4s,  v20.4s
        fadd            v22.4s, v22.4s, v23.4s
        fadd            v7.4s,  v7.4s,  v16.4s
        fadd            v18.4s, v18.4s, v19.4s

        fadd            v21.4s, v21.4s, v22.4s
        fadd            v17.4s, v17.4s, v18.4s
        fadd            v6.4s,  v6.4s,  v21.4s
        fadd            v7.4s,  v7.4s,  v17.4s

        ret
endfunc

// 15-point FFT built from three fft5_neon calls (inputs taken 3 * stride
// apart, starting at in + 1, in + 2 and in + 0 strides).
// In:    x0  = output pointer; 15 re/im float pairs (120 bytes) are stored
//              and x0 is post-incremented past them
//        x1  = input pointer
//        x3  = input stride in 8-byte elements
//        x11-x14, v8-v15 as described at the top of the file
// Clobb: x1, x2, x8, x9, x30, v0-v7, v16-v31
// The return address is kept in x9 because bl overwrites x30 and no stack
// frame is used; the function returns with "ret x9".
function fft15_neon
        mov             x8,  x1
        mov             x9,  x30
        add             x2,  x3,  x3,  lsl #1           // 3 * stride

        add             x1,  x8,  x3,  lsl #3           // in + 1 * stride
        bl              fft5_neon
        mov             v1.8b,  v0.8b
        mov             v2.16b, v6.16b
        mov             v3.16b, v7.16b

        add             x1,  x8,  x3,  lsl #4           // in + 2 * stride
        add             x2,  x3,  x3,  lsl #1           // 3 * stride
        bl              fft5_neon
        zip1            v1.4s,  v1.4s,  v0.4s
        mov             v4.16b, v6.16b
        mov             v5.16b, v7.16b

        mov             x1,  x8                         // in + 0 * stride
        add             x2,  x3,  x3,  lsl #1           // 3 * stride
        bl              fft5_neon

        faddp           v20.4s, v1.4s,  v1.4s

        ext             v18.16b, v8.16b, v8.16b, #4
        ext             v19.16b, v9.16b, v9.16b, #4
        mov             v16.16b, v6.16b
        mov             v17.16b, v7.16b
        fadd            v20.2s, v20.2s, v0.2s

        uzp1            v18.4s, v18.4s, v10.4s          // exp[2,4,6,8].re
        uzp1            v19.4s, v19.4s, v11.4s          // exp[2,4,6,8].im

        st1             {v20.2s}, [x0], #8              // out[0]

        fmla            v16.4s, v2.4s,  v8.4s
        fmls            v16.4s, v3.4s,  v9.4s

        fmla            v17.4s, v2.4s,  v9.4s
        fmla            v17.4s, v3.4s,  v8.4s

        fmla            v16.4s, v4.4s,  v18.4s
        fmls            v16.4s, v5.4s,  v19.4s

        fmla            v17.4s, v4.4s,  v19.4s
        fmla            v17.4s, v5.4s,  v18.4s

        zip1            v18.4s, v16.4s, v17.4s
        zip2            v19.4s, v16.4s, v17.4s

        rev64           v31.4s, v14.4s
        trn1            v28.2d, v1.2d,  v1.2d
        trn2            v29.2d, v1.2d,  v1.2d
        zip1            v30.2d, v14.2d, v31.2d
        zip2            v31.2d, v14.2d, v31.2d

        st1             {v18.4s,v19.4s}, [x0], #32      // out[1-4]

        fmul            v16.4s, v28.4s, v30.4s
        fmul            v17.4s, v29.4s, v30.4s
        fmls            v16.4s, v29.4s, v31.4s
        fmla            v17.4s, v28.4s, v31.4s
        faddp           v16.4s, v16.4s, v16.4s
        faddp           v17.4s, v17.4s, v17.4s
        zip1            v18.2s, v16.2s, v17.2s
        zip2            v19.2s, v16.2s, v17.2s

        fadd            v18.2s, v18.2s, v0.2s
        fadd            v0.2s,  v19.2s, v0.2s

        ext             v30.16b, v12.16b, v12.16b, #4
        ext             v31.16b, v13.16b, v13.16b, #4
        mov             v16.16b, v6.16b
        mov             v17.16b, v7.16b

        uzp1            v30.4s, v30.4s, v8.4s
        uzp1            v31.4s, v31.4s, v9.4s

        st1             {v18.2s}, [x0], #8              // out[5]

        fmla            v16.4s, v2.4s,  v10.4s
        fmls            v16.4s, v3.4s,  v11.4s

        fmla            v17.4s, v2.4s,  v11.4s
        fmla            v17.4s, v3.4s,  v10.4s

        fmla            v16.4s, v4.4s,  v30.4s
        fmls            v16.4s, v5.4s,  v31.4s

        fmla            v17.4s, v4.4s,  v31.4s
        fmla            v17.4s, v5.4s,  v30.4s

        zip1            v18.4s, v16.4s, v17.4s
        zip2            v19.4s, v16.4s, v17.4s

        ext             v30.16b, v10.16b, v10.16b, #4
        ext             v31.16b, v11.16b, v11.16b, #4

        fmla            v6.4s,  v2.4s,  v12.4s
        fmls            v6.4s,  v3.4s,  v13.4s

        st1             {v18.4s,v19.4s}, [x0], #32      // out[6-9]

        uzp1            v30.4s, v30.4s, v12.4s
        uzp1            v31.4s, v31.4s, v13.4s

        fmla            v7.4s,  v2.4s,  v13.4s
        fmla            v7.4s,  v3.4s,  v12.4s

        st1             {v0.2s}, [x0], #8               // out[10]

        fmla            v6.4s,  v4.4s,  v30.4s
        fmls            v6.4s,  v5.4s,  v31.4s

        fmla            v7.4s,  v4.4s,  v31.4s
        fmla            v7.4s,  v5.4s,  v30.4s

        zip1            v18.4s, v6.4s,  v7.4s
        zip2            v19.4s, v6.4s,  v7.4s

        st1             {v18.4s,v19.4s}, [x0], #32      // out[11-14]

        ret             x9
endfunc

// Combine two half-size transforms in place: for each index i,
//   t = out[len2 + i] * exptab[i]
//   out[len2 + i] = out[i] - t
//   out[i]        = out[i] + t
// x0: out, x1: out+len2, x2: exptab, x3: len2
// The first (len2 & 3) elements are handled one at a time, the rest four at
// a time. Element 0 of the scalar head is combined without a multiply and
// its exptab entry is skipped (this relies on exptab[0] being 1 -- the table
// contents are not visible in this file).
// Clobb: x0-x6, v0-v7, v16-v23, flags
function fft15_pass
        ands            x6,  x3,  #3
        mov             x4,  x0
        mov             x5,  x1
        b.eq            9f
        ld1             {v0.2s}, [x0], #8
        ld1             {v1.2s}, [x1], #8
        sub             x3,  x3,  x6
        subs            x6,  x6,  #1
        fadd            v2.2s,  v0.2s,  v1.2s
        fsub            v3.2s,  v0.2s,  v1.2s
        add             x2,  x2,  #8
        st1             {v2.2s}, [x4], #8
        st1             {v3.2s}, [x5], #8
        b.eq            9f
1:      // scalar head, one complex element per iteration
        subs            x6,  x6,  #1
        ldp             s4,  s5,  [x2], #8
        ldp             s2,  s3,  [x1], #8
        ldp             s0,  s1,  [x0], #8

        fmul            s6,  s2,  s4
        fmul            s7,  s2,  s5
        fmls            s6,  s3,  v5.s[0]
        fmla            s7,  s3,  v4.s[0]

        fsub            s2,  s0,  s6
        fsub            s3,  s1,  s7
        fadd            s0,  s0,  s6
        fadd            s1,  s1,  s7

        stp             s2,  s3,  [x5], #8
        stp             s0,  s1,  [x4], #8
        b.gt            1b
9:      // preload the first group of four elements for the vector loop
        ld1             {v4.4s,v5.4s},   [x2],  #32
        ld2             {v2.4s,v3.4s},   [x1],  #32
        uzp1            v6.4s,  v4.4s,  v5.4s
        uzp2            v7.4s,  v4.4s,  v5.4s
        ld2             {v0.4s,v1.4s},   [x0],  #32
8:      // vector loop: two groups of four per iteration, loads run ahead
        subs            x3,  x3,  #8

        fmul            v4.4s,  v2.4s,  v6.4s
        fmul            v5.4s,  v2.4s,  v7.4s
        b.lt            4f

        ld1             {v18.4s,v19.4s}, [x2],  #32

        fmls            v4.4s,  v3.4s,  v7.4s
        fmla            v5.4s,  v3.4s,  v6.4s

        ld2             {v22.4s,v23.4s}, [x1],  #32

        fsub            v2.4s,  v0.4s,  v4.4s
        fadd            v0.4s,  v0.4s,  v4.4s
        fsub            v3.4s,  v1.4s,  v5.4s
        fadd            v1.4s,  v1.4s,  v5.4s

        uzp1            v16.4s, v18.4s, v19.4s
        uzp2            v17.4s, v18.4s, v19.4s

        st2             {v2.4s,v3.4s},   [x5],  #32
        st2             {v0.4s,v1.4s},   [x4],  #32
        ld2             {v20.4s,v21.4s}, [x0],  #32

        fmul            v18.4s, v22.4s, v16.4s
        fmul            v19.4s, v22.4s, v17.4s
        b.eq            0f              // flags still from the subs above

        ld1             {v4.4s,v5.4s},   [x2],  #32

        fmls            v18.4s, v23.4s, v17.4s
        fmla            v19.4s, v23.4s, v16.4s

        ld2             {v2.4s,v3.4s},   [x1],  #32

        fsub            v22.4s, v20.4s, v18.4s
        fadd            v20.4s, v20.4s, v18.4s
        fsub            v23.4s, v21.4s, v19.4s
        fadd            v21.4s, v21.4s, v19.4s

        uzp1            v6.4s,  v4.4s,  v5.4s
        uzp2            v7.4s,  v4.4s,  v5.4s

        st2             {v22.4s,v23.4s}, [x5],  #32
        st2             {v20.4s,v21.4s}, [x4],  #32
        ld2             {v0.4s,v1.4s},   [x0],  #32

        b               8b
4:      // finish the last group held in v0-v7
        fmls            v4.4s,  v3.4s,  v7.4s
        fmla            v5.4s,  v3.4s,  v6.4s

        fsub            v2.4s,  v0.4s,  v4.4s
        fadd            v0.4s,  v0.4s,  v4.4s
        fsub            v3.4s,  v1.4s,  v5.4s
        fadd            v1.4s,  v1.4s,  v5.4s

        st2             {v2.4s,v3.4s},   [x5],  #32
        st2             {v0.4s,v1.4s},   [x4],  #32

        ret
0:      // finish the last group held in v16-v23
        fmls            v18.4s, v23.4s, v17.4s
        fmla            v19.4s, v23.4s, v16.4s

        fsub            v22.4s, v20.4s, v18.4s
        fadd            v20.4s, v20.4s, v18.4s
        fsub            v23.4s, v21.4s, v19.4s
        fadd            v21.4s, v21.4s, v19.4s

        st2             {v22.4s,v23.4s}, [x5],  #32
        st2             {v20.4s,v21.4s}, [x4],  #32

        ret
endfunc

// 30-point FFT: two 15-point FFTs (input stride doubled, second one starting
// one stride later) written to out[0..14] and out[15..29], then merged by a
// tail call to fft15_pass with exptab[1].
// In:    x1 = out, x2 = in, x4 = input stride in 8-byte elements,
//        x10 = context pointer
// x20-x22 and x30 are saved on the stack and restored before the tail call.
function fft30_neon align=6
        sub             sp,  sp,  #0x20
        stp             x20, x21, [sp]
        stp             x22, x30, [sp, #0x10]
        mov             x21, x1
        mov             x22, x2
        mov             x20, x4
        mov             x0,  x21
        mov             x1,  x22
        lsl             x3,  x20, #1
        bl              fft15_neon

        add             x0,  x21, #15*8
        add             x1,  x22, x20, lsl #3
        lsl             x3,  x20, #1
        bl              fft15_neon

        ldr             x2,  [x10, #(CELT_EXPTAB + 8)]  // s->exptab[1]
        add             x0,  x21, #0
        add             x1,  x21, #15*8
        mov             x3,  #15
        ldp             x20, x21, [sp]
        ldp             x22, x30, [sp, #0x10]
        add             sp,  sp,  #0x20
        b               fft15_pass
endfunc

// def_fft n, n2: define fft<n>_neon as two fft<n2>_neon calls (input stride
// doubled, second one starting one stride later, table index decremented)
// followed by a tail call to fft15_pass with exptab[N].
// In:    x1 = out, x2 = in, x3 = N (index into the exptab pointer array),
//        x4 = input stride in 8-byte elements, x10 = context pointer
// x20-x24 and x30 are saved on the stack and restored before the tail call
// (x24 is saved but not otherwise used here).
.macro def_fft n, n2
function fft\n\()_neon align=6
        sub             sp,  sp,  #0x30
        stp             x20, x21, [sp]
        stp             x22, x30, [sp, #0x10]
        stp             x23, x24, [sp, #0x20]
        mov             x21, x1
        mov             x22, x2
        mov             x23, x3
        mov             x20, x4
        sub             x3,  x3,  #1
        lsl             x4,  x4,  #1
        bl              fft\n2\()_neon

        add             x1,  x21, #(\n2 * 8)
        add             x2,  x22, x20, lsl #3
        sub             x3,  x23, #1
        lsl             x4,  x20, #1
        bl              fft\n2\()_neon

        add             x5,  x10, #CELT_EXPTAB
        mov             x0,  x21
        ldr             x2,  [x5, x23, lsl #3]          // s->exptab[N]
        add             x1,  x21, #(\n2 * 8)
        mov             x3,  #\n2
        ldp             x20, x21, [sp]
        ldp             x22, x30, [sp, #0x10]
        ldp             x23, x24, [sp, #0x20]
        add             sp,  sp,  #0x30
        b               fft15_pass
endfunc
.endm

        def_fft    60,  30
        def_fft   120,  60
        def_fft   240, 120
        def_fft   480, 240
        def_fft   960, 480

// Load the shared constants (see the top of the file) and dispatch to
// fft_tab_neon[x3].
// In:    x0 = context pointer, x3 = index into fft_tab_neon;
//        x1, x2 and x4 are passed through unchanged to the selected routine
// Saves x20, x30 and d8-d15 on the stack and restores them before returning.
// On return x10 still holds the context pointer (none of the FFT routines in
// this file write x10); ff_celt_imdct_half_neon relies on this.
function fft_b15_calc_neon
        sub             sp,  sp,  #0x50
        ldr             x8,  [x0, #CELT_EXPTAB]         // s->exptab[0]
        movrel          x6,  fact5
        movrel          x11, shuffle_0213
        movrel          x12, shuffle_1032
        movrel          x13, shuffle_2301
        movrel          x14, shuffle_3120
        add             x8,  x8,  #8                    // skip the first entry
        movrel          x5,  fft_tab_neon
        stp             x20, x30, [sp]
        stp             d8,  d9,  [sp, #0x10]
        stp             d10, d11, [sp, #0x20]
        stp             d12, d13, [sp, #0x30]
        stp             d14, d15, [sp, #0x40]
        ld1             {v15.4s}, [x6]
        ld1             {v0.4s,v1.4s}, [x8], #32
        ld1             {v6.2s}, [x8], #8
        ld1             {v2.4s,v3.4s}, [x8], #32
        ld1             {v7.2s}, [x8], #8
        ld1             {v4.4s,v5.4s}, [x8], #32
        uzp1            v8.4s,  v0.4s,  v1.4s           // exp[ 1 -  4].re
        uzp2            v9.4s,  v0.4s,  v1.4s           // exp[ 1 -  4].im
        uzp1            v10.4s, v2.4s,  v3.4s           // exp[ 6 -  9].re
        uzp2            v11.4s, v2.4s,  v3.4s           // exp[ 6 -  9].im
        uzp1            v12.4s, v4.4s,  v5.4s           // exp[11 - 14].re
        uzp2            v13.4s, v4.4s,  v5.4s           // exp[11 - 14].im
        zip1            v14.4s, v6.4s,  v7.4s           // exp[5,10].re/exp[5,10].im
        add             x5,  x5,  x3,  lsl #3
        ldr             x5,  [x5]
        mov             x10, x0
        blr             x5
        ldp             x20, x30, [sp]
        ldp             d8,  d9,  [sp, #0x10]
        ldp             d10, d11, [sp, #0x20]
        ldp             d12, d13, [sp, #0x30]
        ldp             d14, d15, [sp, #0x40]
        add             sp,  sp,  #0x50
        ret
endfunc

// Dispatch table for fft_b15_calc_neon: entry N is the (15 << N)-point FFT.
const fft_tab_neon
        .quad fft15_neon
        .quad fft30_neon
        .quad fft60_neon
        .quad fft120_neon
        .quad fft240_neon
        .quad fft480_neon
        .quad fft960_neon
endconst

// Exported entry point: half IMDCT in three stages (pre-rotation into the
// context's tmp buffer, FFT via fft_b15_calc_neon, post-rotation in place in
// the destination).
// In:    x0 = context pointer (CELT_LEN2/LEN4, CELT_TMP/TWIDDLE, CELT_FFT_N
//             and CELT_EXPTAB offsets are read from it)
//        x1 = destination, x2 = source, x3 = source stride in floats,
//        s0 = scale factor applied in the post-rotation
//        (presumably matches the C prototype of the generic imdct_half
//        implementation -- confirm against the caller)
// Stack: 0x20 bytes holding x21, x30 and the spilled scale (s0).
function ff_celt_imdct_half_neon, export=1
        sub             sp,  sp,  #0x20
        stp             x21, x30, [sp]
        str             s0,  [sp, #0x10]

        ldp             w5,  w6,  [x0, #CELT_LEN2]      // CELT_LEN4
        mov             x21, x1
        sub             w5,  w5,  #1
        lsl             x7,  x3,  #3                    //  2 * stride * sizeof(float)
        sub             x8,  xzr, x3,  lsl #3           // -2 * stride * sizeof(float)
        mul             x5,  x5,  x3
        ldp             x9,  x10, [x0, #CELT_TMP]       // CELT_TWIDDLE
        ldr             w3,  [x0, #CELT_FFT_N]
        add             x5,  x2,  x5,  lsl #2           // src + (len2 - 1) * stride
        mov             x11, x9                         // keep the start of tmp

        // Pre-rotation: x2 walks the source forwards, x5 backwards; four
        // products with the twiddles at x10 are stored to tmp (x9) at a time.
        sub             w6,  w6,  #4
        ld1             {v0.s}[0],  [x5], x8
        ld1             {v1.s}[0],  [x2], x7
        ld1             {v4.4s,v5.4s}, [x10], #32
        ld1             {v0.s}[1],  [x5], x8
        ld1             {v1.s}[1],  [x2], x7
        uzp1            v2.4s,  v4.4s,  v5.4s
        ld1             {v0.s}[2],  [x5], x8
        ld1             {v1.s}[2],  [x2], x7
        uzp2            v3.4s,  v4.4s,  v5.4s
        ld1             {v0.s}[3],  [x5], x8
        ld1             {v1.s}[3],  [x2], x7
1:
        subs            w6,  w6,  #4

        ld1             {v20.s}[0], [x5], x8
        ld1             {v21.s}[0], [x2], x7
        ld1             {v4.4s,v5.4s}, [x10], #32

        fmul            v6.4s,  v0.4s,  v2.4s
        fmul            v7.4s,  v0.4s,  v3.4s

        ld1             {v20.s}[1], [x5], x8
        ld1             {v21.s}[1], [x2], x7

        fmls            v6.4s,  v1.4s,  v3.4s
        fmla            v7.4s,  v1.4s,  v2.4s

        ld1             {v20.s}[2], [x5], x8
        ld1             {v21.s}[2], [x2], x7

        uzp1            v2.4s,  v4.4s,  v5.4s
        uzp2            v3.4s,  v4.4s,  v5.4s
        ld1             {v20.s}[3], [x5], x8
        ld1             {v21.s}[3], [x2], x7

        zip1            v4.4s,  v6.4s,  v7.4s
        zip2            v5.4s,  v6.4s,  v7.4s

        fmul            v6.4s,  v20.4s, v2.4s
        fmul            v7.4s,  v20.4s, v3.4s

        st1             {v4.4s,v5.4s}, [x9], #32

        fmls            v6.4s,  v21.4s, v3.4s
        fmla            v7.4s,  v21.4s, v2.4s

        b.eq            3f

        subs            w6,  w6,  #4
        ld1             {v4.4s,v5.4s}, [x10], #32
        ld1             {v0.s}[0],  [x5], x8
        ld1             {v1.s}[0],  [x2], x7
        uzp1            v2.4s,  v4.4s,  v5.4s
        ld1             {v0.s}[1],  [x5], x8
        ld1             {v1.s}[1],  [x2], x7
        uzp2            v3.4s,  v4.4s,  v5.4s
        ld1             {v0.s}[2],  [x5], x8
        ld1             {v1.s}[2],  [x2], x7
        zip1            v4.4s,  v6.4s,  v7.4s
        zip2            v5.4s,  v6.4s,  v7.4s
        ld1             {v0.s}[3],  [x5], x8
        ld1             {v1.s}[3],  [x2], x7

        st1             {v4.4s,v5.4s}, [x9], #32

        b.gt            1b

        fmul            v6.4s,  v0.4s,  v2.4s
        fmul            v7.4s,  v0.4s,  v3.4s
        fmls            v6.4s,  v1.4s,  v3.4s
        fmla            v7.4s,  v1.4s,  v2.4s
3:
        zip1            v4.4s,  v6.4s,  v7.4s
        zip2            v5.4s,  v6.4s,  v7.4s
        st1             {v4.4s,v5.4s}, [x9], #32

        // FFT: x0 = context, x1 = destination (untouched so far),
        // x2 = tmp, x3 = fft_n, x4 = stride 1.
        mov             x2,  x11
        mov             x4,  #1

        bl              fft_b15_calc_neon

        // x10 = context pointer, as left by fft_b15_calc_neon
        ldr             w5,  [x10, #CELT_LEN4]
        ldr             x6,  [x10, #CELT_TWIDDLE]
        ldr             s31, [sp, #0x10]                // reload the scale

        // Post-rotation: walk outwards from the middle of the destination
        // and of the twiddle table, two elements down and two up per step.
        add             x1,  x21, x5,  lsl #2
        add             x3,  x6,  x5,  lsl #2
        sub             x0,  x1,  #16
        sub             x2,  x3,  #16
        mov             x8,  #-16
        mov             x7,  #16
        mov             x10, x0                         // store pointers trail
        mov             x11, x1                         // the load pointers

        sub             w5,  w5,  #4

        ld1             {v0.4s},  [x0], x8
        ld1             {v1.4s},  [x1], x7
        ld1             {v2.4s},  [x2], x8
        ld1             {v3.4s},  [x3], x7

        uzp1            v4.4s,  v0.4s,  v1.4s           // z[-i-2, -i-1, +i, i+1].re
        uzp2            v6.4s,  v0.4s,  v1.4s           // z[-i-2, -i-1, +i, i+1].im

        uzp1            v5.4s,  v2.4s,  v3.4s           // twiddle_exptab[-i-2, -i-1, +i, i+1].re
        uzp2            v7.4s,  v2.4s,  v3.4s           // twiddle_exptab[-i-2, -i-1, +i, i+1].im

        fmul            v1.4s,  v6.4s,  v5.4s
        fmul            v0.4s,  v6.4s,  v7.4s
2:
        subs            w5,  w5,  #4

        ld1             {v20.4s}, [x0], x8

        fmla            v1.4s,  v4.4s,  v7.4s
        fmls            v0.4s,  v4.4s,  v5.4s

        ld1             {v21.4s}, [x1], x7

        ext             v1.16b, v1.16b, v1.16b, #8
        fmul            v0.4s,  v0.4s,  v31.s[0]

        ld1             {v2.4s},  [x2], x8

        rev64           v1.4s,  v1.4s
        fmul            v1.4s,  v1.4s,  v31.s[0]

        ld1             {v3.4s},  [x3], x7

        zip1            v5.4s,  v0.4s,  v1.4s
        zip2            v7.4s,  v0.4s,  v1.4s

        uzp1            v4.4s,  v20.4s, v21.4s          // z[-i-2, -i-1, +i, i+1].re
        uzp2            v6.4s,  v20.4s, v21.4s          // z[-i-2, -i-1, +i, i+1].im

        st1             {v5.4s},  [x10], x8
        st1             {v7.4s},  [x11], x7

        uzp1            v5.4s,  v2.4s,  v3.4s           // twiddle_exptab[-i-2, -i-1, +i, i+1].re
        uzp2            v7.4s,  v2.4s,  v3.4s           // twiddle_exptab[-i-2, -i-1, +i, i+1].im

        fmul            v1.4s,  v6.4s,  v5.4s
        fmul            v0.4s,  v6.4s,  v7.4s
        b.gt            2b

        fmla            v1.4s,  v4.4s,  v7.4s
        fmls            v0.4s,  v4.4s,  v5.4s
        ext             v1.16b, v1.16b, v1.16b, #8
        fmul            v0.4s,  v0.4s,  v31.s[0]
        rev64           v1.4s,  v1.4s
        fmul            v1.4s,  v1.4s,  v31.s[0]
        zip1            v5.4s,  v0.4s,  v1.4s
        zip2            v7.4s,  v0.4s,  v1.4s
        st1             {v5.4s},  [x10], x8
        st1             {v7.4s},  [x11], x7

        ldp             x21, x30, [sp]
        add             sp,  sp,  #0x20
        ret
endfunc

// [0] = exp(2 * i * pi / 5), [1] = exp(2 * i * pi * 2 / 5)
const fact5 align=4
        .float           0.30901699437494745, 0.95105651629515353
        .float          -0.80901699437494734, 0.58778525229247325
endconst