/*
 * ARM NEON optimised DSP functions
 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of Libav.
 *
 * Libav is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * Libav is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with Libav; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

/* NOTE(review): the function/endfunc, VFP/NOVFP and preserve8 helpers come
 * from asm.S (not visible here).  VFP/NOVFP lines are assembled only for the
 * hard-float / soft-float ABI respectively — presumably selecting whether
 * float arguments arrive in d0 or in core registers; confirm against asm.S. */

#include "config.h"
#include "asm.S"

        preserve8

@ void ff_clear_block_neon(int16_t *block)
@ r0 = block (16-byte aligned); zeroes 8 rows x 16 bytes = one 128-byte block.
function ff_clear_block_neon, export=1
        vmov.i16        q0,  #0
        .rept           8
        vst1.16         {q0}, [r0,:128]!
        .endr
        bx              lr
endfunc

@ void ff_clear_blocks_neon(int16_t *blocks)
@ Same as above but clears six consecutive blocks (8*6 stores of 16 bytes).
function ff_clear_blocks_neon, export=1
        vmov.i16        q0,  #0
        .rept           8*6
        vst1.16         {q0}, [r0,:128]!
        .endr
        bx              lr
endfunc

@ ---------------------------------------------------------------------------
@ Pixel copy/average macro bodies.  All share the C-style contract
@   (uint8_t *dst /*r0*/, const uint8_t *src /*r1*/, int stride /*r2*/,
@    int h /*r3*/)
@ rnd=1 selects rounding averages (vrhadd/vrshrn via the avg/shrn wrapper
@ macros defined in pixfunc below); rnd=0 selects truncating ones.
@ avg=1 additionally averages the result with the existing dst contents.
@ ---------------------------------------------------------------------------

@ 16xh straight copy (or dst-average).  h is decremented by 4 per iteration.
.macro pixels16 rnd=1, avg=0
  .if \avg
        mov             r12, r0         @ second dst pointer for the read side
  .endif
1:      vld1.64         {q0},     [r1], r2
        vld1.64         {q1},     [r1], r2
        vld1.64         {q2},     [r1], r2
        pld             [r1, r2, lsl #2]
        vld1.64         {q3},     [r1], r2
        pld             [r1]
        pld             [r1, r2]
        pld             [r1, r2, lsl #1]
  .if \avg
        vld1.64         {q8},     [r12,:128], r2
        vrhadd.u8       q0,  q0,  q8
        vld1.64         {q9},     [r12,:128], r2
        vrhadd.u8       q1,  q1,  q9
        vld1.64         {q10},    [r12,:128], r2
        vrhadd.u8       q2,  q2,  q10
        vld1.64         {q11},    [r12,:128], r2
        vrhadd.u8       q3,  q3,  q11
  .endif
        subs            r3,  r3,  #4    @ four rows per pass
        vst1.64         {q0},     [r0,:128], r2
        vst1.64         {q1},     [r0,:128], r2
        vst1.64         {q2},     [r0,:128], r2
        vst1.64         {q3},     [r0,:128], r2
        bne             1b
        bx              lr
.endm

@ 16xh, half-pel in x: dst = avg(src[x], src[x+1]).
@ Loads 17 bytes (d0-d2), vext builds the +1-shifted copy.
.macro pixels16_x2 rnd=1, avg=0
1:      vld1.64         {d0-d2},  [r1], r2
        vld1.64         {d4-d6},  [r1], r2
        pld             [r1]
        pld             [r1, r2]
        subs            r3,  r3,  #2
        vext.8          q1,  q0,  q1,  #1   @ q1 = row shifted left by 1 byte
        avg             q0,  q0,  q1
        vext.8          q3,  q2,  q3,  #1
        avg             q2,  q2,  q3
  .if \avg
        vld1.8          {q1},     [r0,:128], r2
        vld1.8          {q3},     [r0,:128]
        vrhadd.u8       q0,  q0,  q1
        vrhadd.u8       q2,  q2,  q3
        sub             r0,  r0,  r2        @ rewind dst after the two loads
  .endif
        vst1.64         {q0},     [r0,:128], r2
        vst1.64         {q2},     [r0,:128], r2
        bne             1b
        bx              lr
.endm

@ 16xh, half-pel in y: dst = avg(src[y], src[y+1]).
@ Keeps two source rows live in q0/q1 across iterations.
.macro pixels16_y2 rnd=1, avg=0
        vld1.64         {q0},     [r1], r2
        vld1.64         {q1},     [r1], r2
1:      subs            r3,  r3,  #2
        avg             q2,  q0,  q1
        vld1.64         {q0},     [r1], r2
        avg             q3,  q0,  q1
        vld1.64         {q1},     [r1], r2
        pld             [r1]
        pld             [r1, r2]
  .if \avg
        vld1.8          {q8},     [r0,:128], r2
        vld1.8          {q9},     [r0,:128]
        vrhadd.u8       q2,  q2,  q8
        vrhadd.u8       q3,  q3,  q9
        sub             r0,  r0,  r2
  .endif
        vst1.64         {q2},     [r0,:128], r2
        vst1.64         {q3},     [r0,:128], r2
        bne             1b
        bx              lr
.endm

@ 16xh, half-pel in x and y: each output is (a+b+c+d [+bias]) >> 2 over the
@ 2x2 source neighbourhood.  Widened horizontal sums of the current row pair
@ are carried in q8-q11 between iterations; shrn is vrshrn (rnd=1, implicit
@ +2 rounding) or vshrn after adding q13=#1 (rnd=0, i.e. +1 bias only).
.macro pixels16_xy2 rnd=1, avg=0
        vld1.64         {d0-d2},  [r1], r2
        vld1.64         {d4-d6},  [r1], r2
  .ifeq \rnd
        vmov.i16        q13, #1             @ bias for the no-rounding variant
  .endif
        pld             [r1]
        pld             [r1, r2]
        vext.8          q1,  q0,  q1,  #1
        vext.8          q3,  q2,  q3,  #1
        vaddl.u8        q8,  d0,  d2        @ row0: src[x]+src[x+1], widened
        vaddl.u8        q10, d1,  d3
        vaddl.u8        q9,  d4,  d6        @ row1 likewise
        vaddl.u8        q11, d5,  d7
1:      subs            r3,  r3,  #2
        vld1.64         {d0-d2},  [r1], r2
        vadd.u16        q12, q8,  q9        @ vertical sum of the two row sums
        pld             [r1]
  .ifeq \rnd
        vadd.u16        q12, q12, q13
  .endif
        vext.8          q15, q0,  q1,  #1
        vadd.u16        q1 , q10, q11
        shrn            d28, q12, #2
  .ifeq \rnd
        vadd.u16        q1,  q1,  q13
  .endif
        shrn            d29, q1,  #2
  .if \avg
        vld1.8          {q8},     [r0,:128]
        vrhadd.u8       q14, q14, q8
  .endif
        vaddl.u8        q8,  d0,  d30       @ start sums for the new row
        vld1.64         {d2-d4},  [r1], r2
        vaddl.u8        q10, d1,  d31
        vst1.64         {q14},    [r0,:128], r2
        vadd.u16        q12, q8,  q9
        pld             [r1, r2]
  .ifeq \rnd
        vadd.u16        q12, q12, q13
  .endif
        vext.8          q2,  q1,  q2,  #1
        vadd.u16        q0,  q10, q11
        shrn            d30, q12, #2
  .ifeq \rnd
        vadd.u16        q0,  q0,  q13
  .endif
        shrn            d31, q0,  #2
  .if \avg
        vld1.8          {q9},     [r0,:128]
        vrhadd.u8       q15, q15, q9
  .endif
        vaddl.u8        q9,  d2,  d4
        vaddl.u8        q11, d3,  d5
        vst1.64         {q15},    [r0,:128], r2
        bgt             1b
        bx              lr
.endm

@ 8xh straight copy (or dst-average); 8-byte-wide analogue of pixels16.
.macro pixels8 rnd=1, avg=0
1:      vld1.64         {d0},     [r1], r2
        vld1.64         {d1},     [r1], r2
        vld1.64         {d2},     [r1], r2
        pld             [r1, r2, lsl #2]
        vld1.64         {d3},     [r1], r2
        pld             [r1]
        pld             [r1, r2]
        pld             [r1, r2, lsl #1]
  .if \avg
        vld1.64         {d4},     [r0,:64], r2
        vrhadd.u8       d0,  d0,  d4
        vld1.64         {d5},     [r0,:64], r2
        vrhadd.u8       d1,  d1,  d5
        vld1.64         {d6},     [r0,:64], r2
        vrhadd.u8       d2,  d2,  d6
        vld1.64         {d7},     [r0,:64], r2
        vrhadd.u8       d3,  d3,  d7
        sub             r0,  r0,  r2,  lsl #2   @ rewind dst 4 rows
  .endif
        subs            r3,  r3,  #4
        vst1.64         {d0},     [r0,:64], r2
        vst1.64         {d1},     [r0,:64], r2
        vst1.64         {d2},     [r0,:64], r2
        vst1.64         {d3},     [r0,:64], r2
        bne             1b
        bx              lr
.endm

@ 8xh half-pel x.  vswp pairs (row0,row0+1)/(row1,row1+1) into q0/q1 so one
@ avg covers both rows.
.macro pixels8_x2 rnd=1, avg=0
1:      vld1.64         {q0},     [r1], r2
        vext.8          d1,  d0,  d1,  #1
        vld1.64         {q1},     [r1], r2
        vext.8          d3,  d2,  d3,  #1
        pld             [r1]
        pld             [r1, r2]
        subs            r3,  r3,  #2
        vswp            d1,  d2
        avg             q0,  q0,  q1
  .if \avg
        vld1.8          {d4},     [r0,:64], r2
        vld1.8          {d5},     [r0,:64]
        vrhadd.u8       q0,  q0,  q2
        sub             r0,  r0,  r2
  .endif
        vst1.64         {d0},     [r0,:64], r2
        vst1.64         {d1},     [r0,:64], r2
        bne             1b
        bx              lr
.endm

@ 8xh half-pel y; 8-byte analogue of pixels16_y2.
.macro pixels8_y2 rnd=1, avg=0
        vld1.64         {d0},     [r1], r2
        vld1.64         {d1},     [r1], r2
1:      subs            r3,  r3,  #2
        avg             d4,  d0,  d1
        vld1.64         {d0},     [r1], r2
        avg             d5,  d0,  d1
        vld1.64         {d1},     [r1], r2
        pld             [r1]
        pld             [r1, r2]
  .if \avg
        vld1.8          {d2},     [r0,:64], r2
        vld1.8          {d3},     [r0,:64]
        vrhadd.u8       q2,  q2,  q1
        sub             r0,  r0,  r2
  .endif
        vst1.64         {d4},     [r0,:64], r2
        vst1.64         {d5},     [r0,:64], r2
        bne             1b
        bx              lr
.endm

@ 8xh half-pel x+y; 8-byte analogue of pixels16_xy2 (row sums carried in
@ q8/q9, bias register q11 for rnd=0).
.macro pixels8_xy2 rnd=1, avg=0
        vld1.64         {q0},     [r1], r2
        vld1.64         {q1},     [r1], r2
  .ifeq \rnd
        vmov.i16        q11, #1
  .endif
        pld             [r1]
        pld             [r1, r2]
        vext.8          d4,  d0,  d1,  #1
        vext.8          d6,  d2,  d3,  #1
        vaddl.u8        q8,  d0,  d4
        vaddl.u8        q9,  d2,  d6
1:      subs            r3,  r3,  #2
        vld1.64         {q0},     [r1], r2
        pld             [r1]
        vadd.u16        q10, q8,  q9
        vext.8          d4,  d0,  d1,  #1
  .ifeq \rnd
        vadd.u16        q10, q10, q11
  .endif
        vaddl.u8        q8,  d0,  d4
        shrn            d5,  q10, #2
        vld1.64         {q1},     [r1], r2
        vadd.u16        q10, q8,  q9
        pld             [r1, r2]
  .if \avg
        vld1.8          {d7},     [r0,:64]
        vrhadd.u8       d5,  d5,  d7
  .endif
  .ifeq \rnd
        vadd.u16        q10, q10, q11
  .endif
        vst1.64         {d5},     [r0,:64], r2
        shrn            d7,  q10, #2
  .if \avg
        vld1.8          {d5},     [r0,:64]
        vrhadd.u8       d7,  d7,  d5
  .endif
        vext.8          d6,  d2,  d3,  #1
        vaddl.u8        q9,  d2,  d6
        vst1.64         {d7},     [r0,:64], r2
        bgt             1b
        bx              lr
.endm

@ Instantiate one exported function ff_<pfx><name><suf>_neon from a macro
@ body above, first binding the avg/shrn helper macros to the rounding
@ (vrhadd/vrshrn) or truncating (vhadd/vshrn) instructions per \rnd.
.macro pixfunc pfx, name, suf, rnd=1, avg=0
  .if \rnd
    .macro avg rd, rn, rm
        vrhadd.u8       \rd, \rn, \rm
    .endm
    .macro shrn rd, rn, rm
        vrshrn.u16      \rd, \rn, \rm
    .endm
  .else
    .macro avg rd, rn, rm
        vhadd.u8        \rd, \rn, \rm
    .endm
    .macro shrn rd, rn, rm
        vshrn.u16       \rd, \rn, \rm
    .endm
  .endif
function ff_\pfx\name\suf\()_neon, export=1
        \name           \rnd, \avg
endfunc
        .purgem         avg
        .purgem         shrn
.endm

@ Instantiate both the rounding and the _no_rnd variant of a pixel function.
.macro pixfunc2 pfx, name, avg=0
        pixfunc         \pfx, \name,          rnd=1, avg=\avg
        pixfunc         \pfx, \name, _no_rnd, rnd=0, avg=\avg
.endm

@ H.264 qpel mc00 (full-pel) is a plain 16x16 copy: preset h=16 in r3 and
@ fall through into the ff_put_pixels16_neon body emitted just below.
function ff_put_h264_qpel16_mc00_neon, export=1
        mov             r3, #16
endfunc

        pixfunc         put_, pixels16,     avg=0
        pixfunc2        put_, pixels16_x2,  avg=0
        pixfunc2        put_, pixels16_y2,  avg=0
        pixfunc2        put_, pixels16_xy2, avg=0

@ Fall-through stub as above, for the averaging 16x16 variant.
function ff_avg_h264_qpel16_mc00_neon, export=1
        mov             r3, #16
endfunc

        pixfunc         avg_, pixels16,     avg=1
        pixfunc2        avg_, pixels16_x2,  avg=1
        pixfunc2        avg_, pixels16_y2,  avg=1
        pixfunc2        avg_, pixels16_xy2, avg=1

@ Fall-through stub: 8x8 full-pel copy, h preset to 8.
function ff_put_h264_qpel8_mc00_neon, export=1
        mov             r3, #8
endfunc

        pixfunc         put_, pixels8,     avg=0
        pixfunc2        put_, pixels8_x2,  avg=0
        pixfunc2        put_, pixels8_y2,  avg=0
        pixfunc2        put_, pixels8_xy2, avg=0

@ Fall-through stub: 8x8 full-pel average, h preset to 8.
function ff_avg_h264_qpel8_mc00_neon, export=1
        mov             r3, #8
endfunc

        pixfunc         avg_, pixels8,     avg=1
        pixfunc2        avg_, pixels8_x2,  avg=1
        pixfunc2        avg_, pixels8_y2,  avg=1
        pixfunc2        avg_, pixels8_xy2, avg=1

@ void ff_put_pixels_clamped_neon(const int16_t *block, uint8_t *pixels,
@                                 int line_size)
@ r0 = 8x8 int16 block, r1 = dst, r2 = stride.  Saturating-narrows each row
@ to u8 (vqmovun) and stores 8 rows; loads and stores are interleaved for
@ scheduling.
function ff_put_pixels_clamped_neon, export=1
        vld1.64         {d16-d19},[r0,:128]!
        vqmovun.s16     d0, q8
        vld1.64         {d20-d23},[r0,:128]!
        vqmovun.s16     d1, q9
        vld1.64         {d24-d27},[r0,:128]!
        vqmovun.s16     d2, q10
        vld1.64         {d28-d31},[r0,:128]!
        vqmovun.s16     d3, q11
        vst1.64         {d0},     [r1,:64], r2
        vqmovun.s16     d4, q12
        vst1.64         {d1},     [r1,:64], r2
        vqmovun.s16     d5, q13
        vst1.64         {d2},     [r1,:64], r2
        vqmovun.s16     d6, q14
        vst1.64         {d3},     [r1,:64], r2
        vqmovun.s16     d7, q15
        vst1.64         {d4},     [r1,:64], r2
        vst1.64         {d5},     [r1,:64], r2
        vst1.64         {d6},     [r1,:64], r2
        vst1.64         {d7},     [r1,:64], r2
        bx              lr
endfunc

@ Signed variant: narrow with signed saturation (vqmovn) then add 128 (d31)
@ to re-bias [-128,127] into [0,255].  Same r0/r1/r2 contract as above.
function ff_put_signed_pixels_clamped_neon, export=1
        vmov.u8         d31, #128
        vld1.64         {d16-d17},[r0,:128]!
        vqmovn.s16      d0, q8
        vld1.64         {d18-d19},[r0,:128]!
        vqmovn.s16      d1, q9
        vld1.64         {d16-d17},[r0,:128]!
        vqmovn.s16      d2, q8
        vld1.64         {d18-d19},[r0,:128]!
        vadd.u8         d0, d0, d31
        vld1.64         {d20-d21},[r0,:128]!
        vadd.u8         d1, d1, d31
        vld1.64         {d22-d23},[r0,:128]!
        vadd.u8         d2, d2, d31
        vst1.64         {d0},     [r1,:64], r2
        vqmovn.s16      d3, q9
        vst1.64         {d1},     [r1,:64], r2
        vqmovn.s16      d4, q10
        vst1.64         {d2},     [r1,:64], r2
        vqmovn.s16      d5, q11
        vld1.64         {d24-d25},[r0,:128]!
        vadd.u8         d3, d3, d31
        vld1.64         {d26-d27},[r0,:128]!
        vadd.u8         d4, d4, d31
        vadd.u8         d5, d5, d31
        vst1.64         {d3},     [r1,:64], r2
        vqmovn.s16      d6, q12
        vst1.64         {d4},     [r1,:64], r2
        vqmovn.s16      d7, q13
        vst1.64         {d5},     [r1,:64], r2
        vadd.u8         d6, d6, d31
        vadd.u8         d7, d7, d31
        vst1.64         {d6},     [r1,:64], r2
        vst1.64         {d7},     [r1,:64], r2
        bx              lr
endfunc

@ void ff_add_pixels_clamped_neon(const int16_t *block, uint8_t *pixels,
@                                 int line_size)
@ pixels[i] = clamp_u8(pixels[i] + block[i]) over an 8x8 block.
@ r1 reads the dst rows, r3 (copy of r1) writes them back one row behind.
function ff_add_pixels_clamped_neon, export=1
        mov             r3,  r1
        vld1.64         {d16},    [r1,:64], r2
        vld1.64         {d0-d1},  [r0,:128]!
        vaddw.u8        q0,  q0,  d16       @ widen pixels, add coefficients
        vld1.64         {d17},    [r1,:64], r2
        vld1.64         {d2-d3},  [r0,:128]!
        vqmovun.s16     d0,  q0
        vld1.64         {d18},    [r1,:64], r2
        vaddw.u8        q1,  q1,  d17
        vld1.64         {d4-d5},  [r0,:128]!
        vaddw.u8        q2,  q2,  d18
        vst1.64         {d0},     [r3,:64], r2
        vqmovun.s16     d2,  q1
        vld1.64         {d19},    [r1,:64], r2
        vld1.64         {d6-d7},  [r0,:128]!
        vaddw.u8        q3,  q3,  d19
        vqmovun.s16     d4,  q2
        vst1.64         {d2},     [r3,:64], r2
        vld1.64         {d16},    [r1,:64], r2
        vqmovun.s16     d6,  q3
        vld1.64         {d0-d1},  [r0,:128]!
        vaddw.u8        q0,  q0,  d16
        vst1.64         {d4},     [r3,:64], r2
        vld1.64         {d17},    [r1,:64], r2
        vld1.64         {d2-d3},  [r0,:128]!
        vaddw.u8        q1,  q1,  d17
        vst1.64         {d6},     [r3,:64], r2
        vqmovun.s16     d0,  q0
        vld1.64         {d18},    [r1,:64], r2
        vld1.64         {d4-d5},  [r0,:128]!
        vaddw.u8        q2,  q2,  d18
        vst1.64         {d0},     [r3,:64], r2
        vqmovun.s16     d2,  q1
        vld1.64         {d19},    [r1,:64], r2
        vqmovun.s16     d4,  q2
        vld1.64         {d6-d7},  [r0,:128]!
        vaddw.u8        q3,  q3,  d19
        vst1.64         {d2},     [r3,:64], r2
        vqmovun.s16     d6,  q3
        vst1.64         {d4},     [r3,:64], r2
        vst1.64         {d6},     [r3,:64], r2
        bx              lr
endfunc

@ void ff_vector_fmul_neon(float *dst, const float *src0, const float *src1,
@                          int len)
@ dst[i] = src0[i] * src1[i].  First 8 elements are done up front; the rest
@ run through a software-pipelined 16-per-iteration loop (1:) with an 8-wide
@ tail (2:).  len assumed to be a multiple of 8 (subs #8 / and #15 logic).
function ff_vector_fmul_neon, export=1
        subs            r3,  r3,  #8
        vld1.64         {d0-d3},  [r1,:128]!
        vld1.64         {d4-d7},  [r2,:128]!
        vmul.f32        q8,  q0,  q2
        vmul.f32        q9,  q1,  q3
        beq             3f
        bics            ip,  r3,  #15       @ ip = count of 16-wide iterations
        beq             2f
1:      subs            ip,  ip,  #16
        vld1.64         {d0-d1},  [r1,:128]!
        vld1.64         {d4-d5},  [r2,:128]!
        vmul.f32        q10, q0,  q2
        vld1.64         {d2-d3},  [r1,:128]!
        vld1.64         {d6-d7},  [r2,:128]!
        vmul.f32        q11, q1,  q3
        vst1.64         {d16-d19},[r0,:128]!    @ store previous results
        vld1.64         {d0-d1},  [r1,:128]!
        vld1.64         {d4-d5},  [r2,:128]!
        vmul.f32        q8,  q0,  q2
        vld1.64         {d2-d3},  [r1,:128]!
        vld1.64         {d6-d7},  [r2,:128]!
        vmul.f32        q9,  q1,  q3
        vst1.64         {d20-d23},[r0,:128]!
        bne             1b
        ands            r3,  r3,  #15
        beq             3f
2:      vld1.64         {d0-d1},  [r1,:128]!
        vld1.64         {d4-d5},  [r2,:128]!
        vst1.64         {d16-d17},[r0,:128]!
        vmul.f32        q8,  q0,  q2
        vld1.64         {d2-d3},  [r1,:128]!
        vld1.64         {d6-d7},  [r2,:128]!
        vst1.64         {d18-d19},[r0,:128]!
        vmul.f32        q9,  q1,  q3
3:      vst1.64         {d16-d19},[r0,:128]!    @ flush last 8 products
        bx              lr
endfunc

@ void ff_vector_fmul_window_neon(float *dst, const float *src0,
@                                 const float *src1, const float *win,
@                                 int len)
@ MDCT overlap-add windowing: walks src0/win forward and src1/win backward
@ (r5 = -16 byte step after vrev64.32 reversal), producing dst from both
@ ends toward the middle (r0 forward, ip backward).  len from [sp, #12]
@ after the push; 4 elements of each half per iteration.
function ff_vector_fmul_window_neon, export=1
        push            {r4,r5,lr}
        ldr             lr,  [sp, #12]      @ len (5th argument)
        sub             r2,  r2,  #8
        sub             r5,  lr,  #2
        add             r2,  r2,  r5, lsl #2    @ r2 -> end of src1
        add             r4,  r3,  r5, lsl #3    @ r4 -> end of win
        add             ip,  r0,  r5, lsl #3    @ ip -> end of dst
        mov             r5,  #-16
        vld1.64         {d0,d1},  [r1,:128]!
        vld1.64         {d2,d3},  [r2,:128], r5
        vld1.64         {d4,d5},  [r3,:128]!
        vld1.64         {d6,d7},  [r4,:128], r5
1:      subs            lr,  lr,  #4
        vmul.f32        d22, d0,  d4
        vrev64.32       q3,  q3             @ reverse the backward-read window
        vmul.f32        d23, d1,  d5
        vrev64.32       q1,  q1             @ reverse the backward-read src1
        vmul.f32        d20, d0,  d7
        vmul.f32        d21, d1,  d6
        beq             2f
        vmla.f32        d22, d3,  d7
        vld1.64         {d0,d1},  [r1,:128]!
        vmla.f32        d23, d2,  d6
        vld1.64         {d18,d19},[r2,:128], r5
        vmls.f32        d20, d3,  d4
        vld1.64         {d24,d25},[r3,:128]!
        vmls.f32        d21, d2,  d5
        vld1.64         {d6,d7},  [r4,:128], r5
        vmov            q1,  q9
        vrev64.32       q11, q11
        vmov            q2,  q12
        vswp            d22, d23
        vst1.64         {d20,d21},[r0,:128]!
        vst1.64         {d22,d23},[ip,:128], r5
        b               1b
2:      vmla.f32        d22, d3,  d7       @ epilogue: finish the final group
        vmla.f32        d23, d2,  d6
        vmls.f32        d20, d3,  d4
        vmls.f32        d21, d2,  d5
        vrev64.32       q11, q11
        vswp            d22, d23
        vst1.64         {d20,d21},[r0,:128]!
        vst1.64         {d22,d23},[ip,:128], r5
        pop             {r4,r5,pc}
endfunc

#if CONFIG_VORBIS_DECODER
@ void ff_vorbis_inverse_coupling_neon(float *mag, float *ang, intptr_t bs)
@ Vorbis magnitude/angle decoupling.  q10 = sign-bit mask (1<<31);
@ per element: flip ang's sign by mag's sign bit (vand/veor), then split on
@ ang<=0 (vcle mask q8) to add/subtract into the two channels.  Main loop is
@ software-pipelined 2x4 floats; 3: handles the bs==4 case.
function ff_vorbis_inverse_coupling_neon, export=1
        vmov.i32        q10, #1<<31
        subs            r2,  r2,  #4
        mov             r3,  r0
        mov             r12, r1
        beq             3f

        vld1.32         {d24-d25},[r1,:128]!
        vld1.32         {d22-d23},[r0,:128]!
        vcle.s32        q8,  q12, #0
        vand            q9,  q11, q10
        veor            q12, q12, q9
        vand            q2,  q12, q8
        vbic            q3,  q12, q8
        vadd.f32        q12, q11, q2
        vsub.f32        q11, q11, q3
1:      vld1.32         {d2-d3},  [r1,:128]!
        vld1.32         {d0-d1},  [r0,:128]!
        vcle.s32        q8,  q1,  #0
        vand            q9,  q0,  q10
        veor            q1,  q1,  q9
        vst1.32         {d24-d25},[r3, :128]!
        vst1.32         {d22-d23},[r12,:128]!
        vand            q2,  q1,  q8
        vbic            q3,  q1,  q8
        vadd.f32        q1,  q0,  q2
        vsub.f32        q0,  q0,  q3
        subs            r2,  r2,  #8
        ble             2f
        vld1.32         {d24-d25},[r1,:128]!
        vld1.32         {d22-d23},[r0,:128]!
        vcle.s32        q8,  q12, #0
        vand            q9,  q11, q10
        veor            q12, q12, q9
        vst1.32         {d2-d3},  [r3, :128]!
        vst1.32         {d0-d1},  [r12,:128]!
        vand            q2,  q12, q8
        vbic            q3,  q12, q8
        vadd.f32        q12, q11, q2
        vsub.f32        q11, q11, q3
        b               1b

2:      vst1.32         {d2-d3},  [r3, :128]!
        vst1.32         {d0-d1},  [r12,:128]!
        it              lt
        bxlt            lr                  @ done unless exactly 4 remain

3:      vld1.32         {d2-d3},  [r1,:128]
        vld1.32         {d0-d1},  [r0,:128]
        vcle.s32        q8,  q1,  #0
        vand            q9,  q0,  q10
        veor            q1,  q1,  q9
        vand            q2,  q1,  q8
        vbic            q3,  q1,  q8
        vadd.f32        q1,  q0,  q2
        vsub.f32        q0,  q0,  q3
        vst1.32         {d2-d3},  [r0,:128]!
        vst1.32         {d0-d1},  [r1,:128]!
        bx              lr
endfunc
#endif

@ void ff_vector_fmul_scalar_neon(float *dst, const float *src, float mul,
@                                 int len)
@ dst[i] = src[i] * mul.  Scalar arrives in d0[0] (hard-float) or r2
@ (soft-float) — hence the shifted position of len.  16-wide pipelined loop,
@ 4-wide tail at 3:.
function ff_vector_fmul_scalar_neon, export=1
VFP     len .req r2
NOVFP   len .req r3
VFP     vdup.32         q8,  d0[0]
NOVFP   vdup.32         q8,  r2
        bics            r12, len, #15
        beq             3f
        vld1.32         {q0},[r1,:128]!
        vld1.32         {q1},[r1,:128]!
1:      vmul.f32        q0,  q0,  q8
        vld1.32         {q2},[r1,:128]!
        vmul.f32        q1,  q1,  q8
        vld1.32         {q3},[r1,:128]!
        vmul.f32        q2,  q2,  q8
        vst1.32         {q0},[r0,:128]!
        vmul.f32        q3,  q3,  q8
        vst1.32         {q1},[r0,:128]!
        subs            r12, r12, #16
        beq             2f
        vld1.32         {q0},[r1,:128]!
        vst1.32         {q2},[r0,:128]!
        vld1.32         {q1},[r1,:128]!
        vst1.32         {q3},[r0,:128]!
        b               1b
2:      vst1.32         {q2},[r0,:128]!
        vst1.32         {q3},[r0,:128]!
        ands            len, len, #15
        it              eq
        bxeq            lr
3:      vld1.32         {q0},[r1,:128]!
        vmul.f32        q0,  q0,  q8
        vst1.32         {q0},[r0,:128]!
        subs            len, len, #4
        bgt             3b
        bx              lr
        .unreq          len
endfunc

@ void ff_vector_fmac_scalar_neon(float *dst, const float *src, float mul,
@                                 int len)
@ dst[i] += src[i] * mul.  acc aliases the second dst read pointer; register
@ assignment of len/acc again depends on the float ABI.
function ff_vector_fmac_scalar_neon, export=1
VFP     len .req r2
VFP     acc .req r3
NOVFP   len .req r3
NOVFP   acc .req r2
VFP     vdup.32         q15, d0[0]
NOVFP   vdup.32         q15, r2
        bics            r12, len, #15
        mov             acc, r0
        beq             3f
        vld1.32         {q0},  [r1,:128]!
        vld1.32         {q8},  [acc,:128]!
        vld1.32         {q1},  [r1,:128]!
        vld1.32         {q9},  [acc,:128]!
1:      vmla.f32        q8,  q0,  q15
        vld1.32         {q2},  [r1,:128]!
        vld1.32         {q10}, [acc,:128]!
        vmla.f32        q9,  q1,  q15
        vld1.32         {q3},  [r1,:128]!
        vld1.32         {q11}, [acc,:128]!
        vmla.f32        q10, q2,  q15
        vst1.32         {q8},  [r0,:128]!
        vmla.f32        q11, q3,  q15
        vst1.32         {q9},  [r0,:128]!
        subs            r12, r12, #16
        beq             2f
        vld1.32         {q0},  [r1,:128]!
        vld1.32         {q8},  [acc,:128]!
        vst1.32         {q10}, [r0,:128]!
        vld1.32         {q1},  [r1,:128]!
        vld1.32         {q9},  [acc,:128]!
        vst1.32         {q11}, [r0,:128]!
        b               1b
2:      vst1.32         {q10}, [r0,:128]!
        vst1.32         {q11}, [r0,:128]!
        ands            len, len, #15
        it              eq
        bxeq            lr
3:      vld1.32         {q0},  [r1,:128]!
        vld1.32         {q8},  [acc,:128]!
        vmla.f32        q8,  q0,  q15
        vst1.32         {q8},  [r0,:128]!
        subs            len, len, #4
        bgt             3b
        bx              lr
        .unreq          len
endfunc

@ void ff_butterflies_float_neon(float *v1, float *v2, int len)
@ In-place butterfly: (v1, v2) <- (v1+v2, v1-v2), 4 floats per iteration.
function ff_butterflies_float_neon, export=1
1:      vld1.32         {q0},[r0,:128]
        vld1.32         {q1},[r1,:128]
        vsub.f32        q2,  q0,  q1
        vadd.f32        q1,  q0,  q1
        vst1.32         {q2},[r1,:128]!
        vst1.32         {q1},[r0,:128]!
        subs            r2,  r2,  #4
        bgt             1b
        bx              lr
endfunc

@ float ff_scalarproduct_float_neon(const float *v1, const float *v2, int len)
@ Dot product; accumulates in q2, reduces with vadd+vpadd.  Result stays in
@ s0 for hard-float; NOVFP moves it to r0 for the soft-float return.
function ff_scalarproduct_float_neon, export=1
        vmov.f32        q2,  #0.0
1:      vld1.32         {q0},[r0,:128]!
        vld1.32         {q1},[r1,:128]!
        vmla.f32        q2,  q0,  q1
        subs            r2,  r2,  #4
        bgt             1b
        vadd.f32        d0,  d4,  d5
        vpadd.f32       d0,  d0,  d0
NOVFP   vmov.32         r0,  d0[0]
        bx              lr
endfunc

@ void ff_vector_fmul_reverse_neon(float *dst, const float *src0,
@                                  const float *src1, int len)
@ dst[i] = src0[i] * src1[len-1-i]; src1 is walked backwards (r12 = -32)
@ and each 4-float group reversed with vrev64.32 + crossed d-register muls.
function ff_vector_fmul_reverse_neon, export=1
        add             r2,  r2,  r3,  lsl #2
        sub             r2,  r2,  #32
        mov             r12, #-32
        vld1.32         {q0-q1},  [r1,:128]!
        vld1.32         {q2-q3},  [r2,:128], r12
1:      pld             [r1, #32]
        vrev64.32       q3,  q3
        vmul.f32        d16, d0,  d7
        vmul.f32        d17, d1,  d6
        pld             [r2, #-32]
        vrev64.32       q2,  q2
        vmul.f32        d18, d2,  d5
        vmul.f32        d19, d3,  d4
        subs            r3,  r3,  #8
        beq             2f
        vld1.32         {q0-q1},  [r1,:128]!
        vld1.32         {q2-q3},  [r2,:128], r12
        vst1.32         {q8-q9},  [r0,:128]!
        b               1b
2:      vst1.32         {q8-q9},  [r0,:128]!
        bx              lr
endfunc

@ void ff_vector_fmul_add_neon(float *dst, const float *src0,
@                              const float *src1, const float *src2, int len)
@ dst[i] = src0[i]*src1[i] + src2[i]; len from [sp], 8 per iteration.
function ff_vector_fmul_add_neon, export=1
        ldr             r12, [sp]           @ len (5th argument)
        vld1.32         {q0-q1},  [r1,:128]!
        vld1.32         {q8-q9},  [r2,:128]!
        vld1.32         {q2-q3},  [r3,:128]!
        vmul.f32        q10, q0,  q8
        vmul.f32        q11, q1,  q9
1:      vadd.f32        q12, q2,  q10
        vadd.f32        q13, q3,  q11
        pld             [r1, #16]
        pld             [r2, #16]
        pld             [r3, #16]
        subs            r12, r12, #8
        beq             2f
        vld1.32         {q0},     [r1,:128]!
        vld1.32         {q8},     [r2,:128]!
        vmul.f32        q10, q0,  q8
        vld1.32         {q1},     [r1,:128]!
        vld1.32         {q9},     [r2,:128]!
        vmul.f32        q11, q1,  q9
        vld1.32         {q2-q3},  [r3,:128]!
        vst1.32         {q12-q13},[r0,:128]!
        b               1b
2:      vst1.32         {q12-q13},[r0,:128]!
        bx              lr
endfunc

@ void ff_vector_clipf_neon(float *dst, const float *src, float min,
@                           float max, int len)
@ dst[i] = clamp(src[i], min, max) via vmin/vmax; bounds come from d0
@ (hard-float) or r2/r3 with len on the stack (soft-float).
function ff_vector_clipf_neon, export=1
VFP     vdup.32         q1,  d0[1]          @ q1 = max
VFP     vdup.32         q0,  d0[0]          @ q0 = min
NOVFP   vdup.32         q0,  r2
NOVFP   vdup.32         q1,  r3
NOVFP   ldr             r2,  [sp]
        vld1.f32        {q2},[r1,:128]!
        vmin.f32        q10, q2,  q1
        vld1.f32        {q3},[r1,:128]!
        vmin.f32        q11, q3,  q1
1:      vmax.f32        q8,  q10, q0
        vmax.f32        q9,  q11, q0
        subs            r2,  r2,  #8
        beq             2f
        vld1.f32        {q2},[r1,:128]!
        vmin.f32        q10, q2,  q1
        vld1.f32        {q3},[r1,:128]!
        vmin.f32        q11, q3,  q1
        vst1.f32        {q8},[r0,:128]!
        vst1.f32        {q9},[r0,:128]!
        b               1b
2:      vst1.f32        {q8},[r0,:128]!
        vst1.f32        {q9},[r0,:128]!
        bx              lr
endfunc

@ void ff_apply_window_int16_neon(int16_t *dst, const int16_t *src,
@                                 const int16_t *window, unsigned n)
@ Applies a symmetric int16 window with Q15 rounding multiplies (vqrdmulh):
@ the first half forward through r0/r1, the mirrored second half backward
@ through lr/r4 (stride -16, window reversed per 8 with vrev64.16).
function ff_apply_window_int16_neon, export=1
        push            {r4,lr}
        add             r4,  r1,  r3,  lsl #1   @ r4 -> end of src
        add             lr,  r0,  r3,  lsl #1   @ lr -> end of dst
        sub             r4,  r4,  #16
        sub             lr,  lr,  #16
        mov             r12, #-16
1:
        vld1.16         {q0},     [r1,:128]!
        vld1.16         {q2},     [r2,:128]!
        vld1.16         {q1},     [r4,:128], r12
        vrev64.16       q3,  q2
        vqrdmulh.s16    q0,  q0,  q2
        vqrdmulh.s16    d2,  d2,  d7        @ mirrored half uses reversed win
        vqrdmulh.s16    d3,  d3,  d6
        vst1.16         {q0},     [r0,:128]!
        vst1.16         {q1},     [lr,:128], r12
        subs            r3,  r3,  #16
        bgt             1b

        pop             {r4,pc}
endfunc

@ void ff_vector_clip_int32_neon(int32_t *dst, const int32_t *src,
@                                int32_t min, int32_t max, unsigned len)
@ dst[i] = clamp(src[i], min, max); 8 int32 per iteration, len from [sp].
function ff_vector_clip_int32_neon, export=1
        vdup.32         q0,  r2             @ q0 = min
        vdup.32         q1,  r3             @ q1 = max
        ldr             r2,  [sp]           @ len (5th argument)
1:
        vld1.32         {q2-q3},  [r1,:128]!
        vmin.s32        q2,  q2,  q1
        vmin.s32        q3,  q3,  q1
        vmax.s32        q2,  q2,  q0
        vmax.s32        q3,  q3,  q0
        vst1.32         {q2-q3},  [r0,:128]!
        subs            r2,  r2,  #8
        bgt             1b
        bx              lr
endfunc