/*
 * Copyright (c) 2013 RISC OS Open Ltd
 * Author: Ben Avison <bavison@riscosopen.org>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/arm/asm.S"

@ Register roles for the LFE FIR functions below.
POUT     .req    a1              @ output sample pointer (advanced by vstmia)
PIN      .req    a2              @ input sample pointer (read with negative offsets)
PCOEF    .req    a3              @ coefficient matrix pointer
OLDFPSCR .req    a4              @ caller's FPSCR, restored on exit
COUNTER  .req    ip              @ inner-loop trip counter

@ Input samples are held as scalars (s0-s7 form the scalar bank in VFP
@ short-vector mode); IN4-IN7 are only loaded when decifactor == 32.
IN0      .req    s4
IN1      .req    s5
IN2      .req    s6
IN3      .req    s7
IN4      .req    s0
IN5      .req    s1
IN6      .req    s2
IN7      .req    s3
COEF0    .req    s8              @ coefficient elements
COEF1    .req    s9
COEF2    .req    s10
COEF3    .req    s11
COEF4    .req    s12
COEF5    .req    s13
COEF6    .req    s14
COEF7    .req    s15
ACCUM0   .req    s16             @ double-buffered multiply-accumulate results
ACCUM4   .req    s20
POST0    .req    s24             @ do long-latency post-multiply in this vector in parallel
POST1    .req    s25
POST2    .req    s26
POST3    .req    s27


@ One software-pipelined iteration of the LFE FIR inner loop.  Assumes the
@ FPSCR has been set up for short vectors of length 4 (done in dca_lfe_fir
@ below), so each vmul.f/vmla.f/vadd.f below operates on 4 registers at once.
@
@   decifactor  4 taps are folded per head when != 32, 8 taps when == 32
@   dir         "up" walks the coefficient matrix forwards (Y = +4 bytes),
@               anything else walks it backwards from the last element
@   tail        non-empty: emit the tail phase — sum the two accumulator
@               vectors (ACCUM0-3 + ACCUM4-7) and store 4 results via POUT
@   head        non-empty: emit the head phase — load coefficients and start
@               the next group's multiply-accumulate chains
@
@ Head and tail phases are interleaved so that a "tail, head" invocation
@ overlaps the previous group's post-add/store with the next group's loads
@ and multiplies.
.macro inner_loop decifactor, dir, tail, head
 .ifc "\dir","up"
        .set X, 0
        .set Y, 4
 .else
        .set X, 4*JMAX*4 - 4
        .set Y, -4
 .endif
 .ifnc "\head",""
        vldr    COEF0, [PCOEF, #X + (0*JMAX + 0) * Y]
        vldr    COEF1, [PCOEF, #X + (1*JMAX + 0) * Y]
        vldr    COEF2, [PCOEF, #X + (2*JMAX + 0) * Y]
        vldr    COEF3, [PCOEF, #X + (3*JMAX + 0) * Y]
 .endif
 .ifnc "\tail",""
        vadd.f  POST0, ACCUM0, ACCUM4  @ vector op: POST0-3 = ACCUM0-3 + ACCUM4-7
 .endif
 .ifnc "\head",""
        vmul.f  ACCUM0, COEF0, IN0     @ vector = vector * scalar
        vldr    COEF4, [PCOEF, #X + (0*JMAX + 1) * Y]
        vldr    COEF5, [PCOEF, #X + (1*JMAX + 1) * Y]
        vldr    COEF6, [PCOEF, #X + (2*JMAX + 1) * Y]
 .endif
 .ifnc "\head",""
        vldr    COEF7, [PCOEF, #X + (3*JMAX + 1) * Y]
  @ The same vmul is emitted in one of two slots depending on whether a tail
  @ phase (vadd above / vstmia below) is also in flight, to balance the
  @ load/arithmetic schedule; exactly one copy is assembled either way.
  .ifc "\tail",""
        vmul.f  ACCUM4, COEF4, IN1     @ vector = vector * scalar
  .endif
        vldr    COEF0, [PCOEF, #X + (0*JMAX + 2) * Y]
        vldr    COEF1, [PCOEF, #X + (1*JMAX + 2) * Y]
  .ifnc "\tail",""
        vmul.f  ACCUM4, COEF4, IN1     @ vector = vector * scalar
  .endif
        vldr    COEF2, [PCOEF, #X + (2*JMAX + 2) * Y]
        vldr    COEF3, [PCOEF, #X + (3*JMAX + 2) * Y]
 .endif
 .ifnc "\tail",""
        vstmia  POUT!, {POST0-POST3}
 .endif
 .ifnc "\head",""
        vmla.f  ACCUM0, COEF0, IN2     @ vector += vector * scalar
        vldr    COEF4, [PCOEF, #X + (0*JMAX + 3) * Y]
        vldr    COEF5, [PCOEF, #X + (1*JMAX + 3) * Y]
        vldr    COEF6, [PCOEF, #X + (2*JMAX + 3) * Y]
        vldr    COEF7, [PCOEF, #X + (3*JMAX + 3) * Y]
        vmla.f  ACCUM4, COEF4, IN3     @ vector += vector * scalar
  .if \decifactor == 32
        @ Fold in taps 4-7 as well (JMAX == 8 in this configuration).
        vldr    COEF0, [PCOEF, #X + (0*JMAX + 4) * Y]
        vldr    COEF1, [PCOEF, #X + (1*JMAX + 4) * Y]
        vldr    COEF2, [PCOEF, #X + (2*JMAX + 4) * Y]
        vldr    COEF3, [PCOEF, #X + (3*JMAX + 4) * Y]
        vmla.f  ACCUM0, COEF0, IN4     @ vector += vector * scalar
        vldr    COEF4, [PCOEF, #X + (0*JMAX + 5) * Y]
        vldr    COEF5, [PCOEF, #X + (1*JMAX + 5) * Y]
        vldr    COEF6, [PCOEF, #X + (2*JMAX + 5) * Y]
        vldr    COEF7, [PCOEF, #X + (3*JMAX + 5) * Y]
        vmla.f  ACCUM4, COEF4, IN5     @ vector += vector * scalar
        vldr    COEF0, [PCOEF, #X + (0*JMAX + 6) * Y]
        vldr    COEF1, [PCOEF, #X + (1*JMAX + 6) * Y]
        vldr    COEF2, [PCOEF, #X + (2*JMAX + 6) * Y]
        vldr    COEF3, [PCOEF, #X + (3*JMAX + 6) * Y]
        vmla.f  ACCUM0, COEF0, IN6     @ vector += vector * scalar
        vldr    COEF4, [PCOEF, #X + (0*JMAX + 7) * Y]
        vldr    COEF5, [PCOEF, #X + (1*JMAX + 7) * Y]
        vldr    COEF6, [PCOEF, #X + (2*JMAX + 7) * Y]
        vldr    COEF7, [PCOEF, #X + (3*JMAX + 7) * Y]
        vmla.f  ACCUM4, COEF4, IN7     @ vector += vector * scalar
  .endif
 .endif
.endm

125.macro dca_lfe_fir decifactor 126function ff_dca_lfe_fir\decifactor\()_vfp, export=1 127 fmrx OLDFPSCR, FPSCR 128 ldr ip, =0x03030000 @ RunFast mode, short vectors of length 4, stride 1 129 fmxr FPSCR, ip 130 vldr IN0, [PIN, #-0*4] 131 vldr IN1, [PIN, #-1*4] 132 vldr IN2, [PIN, #-2*4] 133 vldr IN3, [PIN, #-3*4] 134 .if \decifactor == 32 135 .set JMAX, 8 136 vpush {s16-s31} 137 vldr IN4, [PIN, #-4*4] 138 vldr IN5, [PIN, #-5*4] 139 vldr IN6, [PIN, #-6*4] 140 vldr IN7, [PIN, #-7*4] 141 .else 142 .set JMAX, 4 143 vpush {s16-s27} 144 .endif 145 146 mov COUNTER, #\decifactor/4 - 1 147 inner_loop \decifactor, up,, head 1481: add PCOEF, PCOEF, #4*JMAX*4 149 subs COUNTER, COUNTER, #1 150 inner_loop \decifactor, up, tail, head 151 bne 1b 152 inner_loop \decifactor, up, tail 153 154 mov COUNTER, #\decifactor/4 - 1 155 inner_loop \decifactor, down,, head 1561: sub PCOEF, PCOEF, #4*JMAX*4 157 subs COUNTER, COUNTER, #1 158 inner_loop \decifactor, down, tail, head 159 bne 1b 160 inner_loop \decifactor, down, tail 161 162 .if \decifactor == 32 163 vpop {s16-s31} 164 .else 165 vpop {s16-s27} 166 .endif 167 fmxr FPSCR, OLDFPSCR 168 bx lr 169endfunc 170.endm 171 172 dca_lfe_fir 64 173 .ltorg 174 dca_lfe_fir 32 175 176 .unreq POUT 177 .unreq PIN 178 .unreq PCOEF 179 .unreq OLDFPSCR 180 .unreq COUNTER 181 182 .unreq IN0 183 .unreq IN1 184 .unreq IN2 185 .unreq IN3 186 .unreq IN4 187 .unreq IN5 188 .unreq IN6 189 .unreq IN7 190 .unreq COEF0 191 .unreq COEF1 192 .unreq COEF2 193 .unreq COEF3 194 .unreq COEF4 195 .unreq COEF5 196 .unreq COEF6 197 .unreq COEF7 198 .unreq ACCUM0 199 .unreq ACCUM4 200 .unreq POST0 201 .unreq POST1 202 .unreq POST2 203 .unreq POST3 204 205 206IN .req a1 207SBACT .req a2 208OLDFPSCR .req a3 209IMDCT .req a4 210WINDOW .req v1 211OUT .req v2 212BUF .req v3 213SCALEINT .req v4 @ only used in softfp case 214COUNT .req v5 215 216SCALE .req s0 217 218/* Stack layout differs in softfp and hardfp cases: 219 * 220 * hardfp 221 * fp -> 6 arg words saved by caller 222 
 *            a3,a4,v1-v3,v5,fp,lr on entry (a3 just to pad to 8 bytes)
 *            s16-s23 on entry
 *            align 16
 *     buf -> 8*32*4 bytes buffer
 *            s0 on entry
 *      sp -> 3 arg words for callee
 *
 * softfp
 *      fp -> 7 arg words saved by caller
 *            a4,v1-v5,fp,lr on entry
 *            s16-s23 on entry
 *            align 16
 *     buf -> 8*32*4 bytes buffer
 *      sp -> 4 arg words for callee
 */

/* void ff_dca_qmf_32_subbands_vfp(float samples_in[32][8], int sb_act,
 *                                 SynthFilterContext *synth, FFTContext *imdct,
 *                                 float (*synth_buf_ptr)[512],
 *                                 int *synth_buf_offset, float (*synth_buf2)[32],
 *                                 const float (*window)[512], float *samples_out,
 *                                 float (*raXin)[32], float scale);
 *
 * Transposes the 32x8 input matrix into an 8x32 stack buffer (negating
 * rows 0 and 3 of every group of 4 input rows, zero-filling beyond sb_act),
 * then calls ff_synth_filter_float_vfp once per 32-sample output row.
 */
function ff_dca_qmf_32_subbands_vfp, export=1
VFP     push    {a3-a4,v1-v3,v5,fp,lr}
NOVFP   push    {a4,v1-v5,fp,lr}
        add     fp, sp, #8*4            @ fp -> caller's stacked arg words
        vpush   {s16-s23}
        @ The buffer pointed at by raXin isn't big enough for us to do a
        @ complete matrix transposition as we want to, so allocate an
        @ alternative buffer from the stack. Align to 4 words for speed.
        sub     BUF, sp, #8*32*4
        bic     BUF, BUF, #15
        mov     sp, BUF
        ldr     lr, =0x03330000         @ RunFast mode, short vectors of length 4, stride 2
        fmrx    OLDFPSCR, FPSCR
        fmxr    FPSCR, lr
        @ COUNT is used to count down 2 things at once:
        @ bits 0-4 are the number of word pairs remaining in the output row
        @ bits 5-31 are the number of words to copy (with possible negation)
        @ from the source matrix before we start zeroing the remainder
        mov     COUNT, #(-4 << 5) + 16
        adds    COUNT, COUNT, SBACT, lsl #5
        bmi     2f
1:
        @ Rows 0 and 1 of a group of 4: even s-regs take row 0, odd s-regs
        @ row 1, so each vstr dN writes one transposed element from each.
        @ With len-4/stride-2 vectors, vneg.f s8 negates s8,s10,s12,s14 and
        @ vneg.f s16 negates s16,s18,s20,s22 — i.e. all 8 row-0 values.
        vldr    s8, [IN, #(0*8+0)*4]
        vldr    s10, [IN, #(0*8+1)*4]
        vldr    s12, [IN, #(0*8+2)*4]
        vldr    s14, [IN, #(0*8+3)*4]
        vldr    s16, [IN, #(0*8+4)*4]
        vldr    s18, [IN, #(0*8+5)*4]
        vldr    s20, [IN, #(0*8+6)*4]
        vldr    s22, [IN, #(0*8+7)*4]
        vneg.f  s8, s8
        vldr    s9, [IN, #(1*8+0)*4]
        vldr    s11, [IN, #(1*8+1)*4]
        vldr    s13, [IN, #(1*8+2)*4]
        vldr    s15, [IN, #(1*8+3)*4]
        vneg.f  s16, s16
        vldr    s17, [IN, #(1*8+4)*4]
        vldr    s19, [IN, #(1*8+5)*4]
        vldr    s21, [IN, #(1*8+6)*4]
        vldr    s23, [IN, #(1*8+7)*4]
        vstr    d4, [BUF, #(0*32+0)*4]
        vstr    d5, [BUF, #(1*32+0)*4]
        vstr    d6, [BUF, #(2*32+0)*4]
        vstr    d7, [BUF, #(3*32+0)*4]
        vstr    d8, [BUF, #(4*32+0)*4]
        vstr    d9, [BUF, #(5*32+0)*4]
        vstr    d10, [BUF, #(6*32+0)*4]
        vstr    d11, [BUF, #(7*32+0)*4]
        @ Rows 2 and 3: row 3 goes to the odd regs and is the negated one.
        vldr    s9, [IN, #(3*8+0)*4]
        vldr    s11, [IN, #(3*8+1)*4]
        vldr    s13, [IN, #(3*8+2)*4]
        vldr    s15, [IN, #(3*8+3)*4]
        vldr    s17, [IN, #(3*8+4)*4]
        vldr    s19, [IN, #(3*8+5)*4]
        vldr    s21, [IN, #(3*8+6)*4]
        vldr    s23, [IN, #(3*8+7)*4]
        vneg.f  s9, s9
        vldr    s8, [IN, #(2*8+0)*4]
        vldr    s10, [IN, #(2*8+1)*4]
        vldr    s12, [IN, #(2*8+2)*4]
        vldr    s14, [IN, #(2*8+3)*4]
        vneg.f  s17, s17
        vldr    s16, [IN, #(2*8+4)*4]
        vldr    s18, [IN, #(2*8+5)*4]
        vldr    s20, [IN, #(2*8+6)*4]
        vldr    s22, [IN, #(2*8+7)*4]
        vstr    d4, [BUF, #(0*32+2)*4]
        vstr    d5, [BUF, #(1*32+2)*4]
        vstr    d6, [BUF, #(2*32+2)*4]
        vstr    d7, [BUF, #(3*32+2)*4]
        vstr    d8, [BUF, #(4*32+2)*4]
        vstr    d9, [BUF, #(5*32+2)*4]
        vstr    d10, [BUF, #(6*32+2)*4]
        vstr    d11, [BUF, #(7*32+2)*4]
        add     IN, IN, #4*8*4          @ advance 4 input rows
        add     BUF, BUF, #4*4          @ advance 4 output columns
        subs    COUNT, COUNT, #(4 << 5) + 2
        bpl     1b
2:      @ Now deal with trailing < 4 samples
        adds    COUNT, COUNT, #3 << 5
        bmi     4f                      @ sb_act was a multiple of 4
        bics    lr, COUNT, #0x1F
        bne     3f
        @ sb_act was n*4+1: copy one row (negated), zero-fill its partner.
        vldr    s8, [IN, #(0*8+0)*4]
        vldr    s10, [IN, #(0*8+1)*4]
        vldr    s12, [IN, #(0*8+2)*4]
        vldr    s14, [IN, #(0*8+3)*4]
        vldr    s16, [IN, #(0*8+4)*4]
        vldr    s18, [IN, #(0*8+5)*4]
        vldr    s20, [IN, #(0*8+6)*4]
        vldr    s22, [IN, #(0*8+7)*4]
        vneg.f  s8, s8
        vldr    s9, zero
        vldr    s11, zero
        vldr    s13, zero
        vldr    s15, zero
        vneg.f  s16, s16
        vldr    s17, zero
        vldr    s19, zero
        vldr    s21, zero
        vldr    s23, zero
        vstr    d4, [BUF, #(0*32+0)*4]
        vstr    d5, [BUF, #(1*32+0)*4]
        vstr    d6, [BUF, #(2*32+0)*4]
        vstr    d7, [BUF, #(3*32+0)*4]
        vstr    d8, [BUF, #(4*32+0)*4]
        vstr    d9, [BUF, #(5*32+0)*4]
        vstr    d10, [BUF, #(6*32+0)*4]
        vstr    d11, [BUF, #(7*32+0)*4]
        add     BUF, BUF, #2*4
        sub     COUNT, COUNT, #1
        b       4f
3:      @ sb_act was n*4+2 or n*4+3, so do the first 2
        vldr    s8, [IN, #(0*8+0)*4]
        vldr    s10, [IN, #(0*8+1)*4]
        vldr    s12, [IN, #(0*8+2)*4]
        vldr    s14, [IN, #(0*8+3)*4]
        vldr    s16, [IN, #(0*8+4)*4]
        vldr    s18, [IN, #(0*8+5)*4]
        vldr    s20, [IN, #(0*8+6)*4]
        vldr    s22, [IN, #(0*8+7)*4]
        vneg.f  s8, s8
        vldr    s9, [IN, #(1*8+0)*4]
        vldr    s11, [IN, #(1*8+1)*4]
        vldr    s13, [IN, #(1*8+2)*4]
        vldr    s15, [IN, #(1*8+3)*4]
        vneg.f  s16, s16
        vldr    s17, [IN, #(1*8+4)*4]
        vldr    s19, [IN, #(1*8+5)*4]
        vldr    s21, [IN, #(1*8+6)*4]
        vldr    s23, [IN, #(1*8+7)*4]
        vstr    d4, [BUF, #(0*32+0)*4]
        vstr    d5, [BUF, #(1*32+0)*4]
        vstr    d6, [BUF, #(2*32+0)*4]
        vstr    d7, [BUF, #(3*32+0)*4]
        vstr    d8, [BUF, #(4*32+0)*4]
        vstr    d9, [BUF, #(5*32+0)*4]
        vstr    d10, [BUF, #(6*32+0)*4]
        vstr    d11, [BUF, #(7*32+0)*4]
        add     BUF, BUF, #2*4
        sub     COUNT, COUNT, #(2 << 5) + 1
        bics    lr, COUNT, #0x1F
        bne     4f
        @ sb_act was n*4+3: copy row 2 (no negation — only rows 0 and 3 of
        @ each group are negated), zero-fill row 3's slots.
        vldr    s8, [IN, #(2*8+0)*4]
        vldr    s10, [IN, #(2*8+1)*4]
        vldr    s12, [IN, #(2*8+2)*4]
        vldr    s14, [IN, #(2*8+3)*4]
        vldr    s16, [IN, #(2*8+4)*4]
        vldr    s18, [IN, #(2*8+5)*4]
        vldr    s20, [IN, #(2*8+6)*4]
        vldr    s22, [IN, #(2*8+7)*4]
        vldr    s9, zero
        vldr    s11, zero
        vldr    s13, zero
        vldr    s15, zero
        vldr    s17, zero
        vldr    s19, zero
        vldr    s21, zero
        vldr    s23, zero
        vstr    d4, [BUF, #(0*32+0)*4]
        vstr    d5, [BUF, #(1*32+0)*4]
        vstr    d6, [BUF, #(2*32+0)*4]
        vstr    d7, [BUF, #(3*32+0)*4]
        vstr    d8, [BUF, #(4*32+0)*4]
        vstr    d9, [BUF, #(5*32+0)*4]
        vstr    d10, [BUF, #(6*32+0)*4]
        vstr    d11, [BUF, #(7*32+0)*4]
        add     BUF, BUF, #2*4
        sub     COUNT, COUNT, #1
4:      @ Now fill the remainder with 0
        vldr    s8, zero
        vldr    s9, zero
        ands    COUNT, COUNT, #0x1F     @ word pairs left in the output row
        beq     6f
5:      vstr    d4, [BUF, #(0*32+0)*4]
        vstr    d4, [BUF, #(1*32+0)*4]
        vstr    d4, [BUF, #(2*32+0)*4]
        vstr    d4, [BUF, #(3*32+0)*4]
        vstr    d4, [BUF, #(4*32+0)*4]
        vstr    d4, [BUF, #(5*32+0)*4]
        vstr    d4, [BUF, #(6*32+0)*4]
        vstr    d4, [BUF, #(7*32+0)*4]
        add     BUF, BUF, #2*4
        subs    COUNT, COUNT, #1
        bne     5b
6:
        fmxr    FPSCR, OLDFPSCR         @ back to scalar mode before calling out
        ldr     WINDOW, [fp, #3*4]
        ldr     OUT, [fp, #4*4]
        sub     BUF, BUF, #32*4         @ rewind to start of transposed buffer
NOVFP   ldr     SCALEINT, [fp, #6*4]
        mov     COUNT, #8               @ one synth-filter call per buffer row
VFP     vpush   {SCALE}
VFP     sub     sp, sp, #3*4            @ outgoing stack args: window, out, buf
NOVFP   sub     sp, sp, #4*4            @ ... plus integer scale in softfp
7:
VFP     ldr     a1, [fp, #-7*4]         @ imdct
NOVFP   ldr     a1, [fp, #-8*4]
        ldmia   fp, {a2-a4}             @ next 3 stacked args for the callee
VFP     stmia   sp, {WINDOW, OUT, BUF}
NOVFP   stmia   sp, {WINDOW, OUT, BUF, SCALEINT}
VFP     vldr    SCALE, [sp, #3*4]       @ reload scale (s0 clobbered per call)
        bl      X(ff_synth_filter_float_vfp)
        add     OUT, OUT, #32*4
        add     BUF, BUF, #32*4
        subs    COUNT, COUNT, #1
        bne     7b

        @ Unwind the stack-allocated buffer and outgoing-arg space.
A       sub     sp, fp, #(8+8)*4
T       sub     fp, fp, #(8+8)*4
T       mov     sp, fp
        vpop    {s16-s23}
VFP     pop     {a3-a4,v1-v3,v5,fp,pc}
NOVFP   pop     {a4,v1-v5,fp,pc}
endfunc

        .unreq  IN
        .unreq  SBACT
.unreq OLDFPSCR 466 .unreq IMDCT 467 .unreq WINDOW 468 .unreq OUT 469 .unreq BUF 470 .unreq SCALEINT 471 .unreq COUNT 472 473 .unreq SCALE 474 475 .align 2 476zero: .word 0 477