/*
 * ARM NEON optimised DSP functions
 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "asm.S"

        preserve8
        .fpu neon
        .text
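@ The pixels* macros below implement dsputil pixel copy/average kernels.
@ Register convention, per the dsputil op_pixels_func prototype these are
@ instantiated as (argument names are the dsputil ones, an assumption not
@ spelled out in this file):
@   r0 = block (dst), r1 = pixels (src), r2 = line_size, r3 = h
@ h is assumed to be a multiple of 4 for pixels16/pixels8 and of 2 for the
@ _x2/_y2/_xy2 variants; _no_rnd instantiations truncate where the default
@ rounds.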
        .macro pixels16 avg=0
.if \avg
        mov             ip,  r0
.endif
1:      vld1.64         {d0, d1},  [r1], r2
        vld1.64         {d2, d3},  [r1], r2
        vld1.64         {d4, d5},  [r1], r2
        pld             [r1, r2, lsl #2]
        vld1.64         {d6, d7},  [r1], r2
        pld             [r1]
        pld             [r1, r2]
        pld             [r1, r2, lsl #1]
.if \avg
        vld1.64         {d16,d17}, [ip], r2
        vrhadd.u8       q0,  q0,  q8
        vld1.64         {d18,d19}, [ip], r2
        vrhadd.u8       q1,  q1,  q9
        vld1.64         {d20,d21}, [ip], r2
        vrhadd.u8       q2,  q2,  q10
        vld1.64         {d22,d23}, [ip], r2
        vrhadd.u8       q3,  q3,  q11
.endif
        subs            r3,  r3,  #4
        vst1.64         {d0, d1},  [r0,:128], r2
        vst1.64         {d2, d3},  [r0,:128], r2
        vst1.64         {d4, d5},  [r0,:128], r2
        vst1.64         {d6, d7},  [r0,:128], r2
        bne             1b
        bx              lr
        .endm

        .macro pixels16_x2 vhadd=vrhadd.u8
1:      vld1.64         {d0-d2},   [r1], r2
        vld1.64         {d4-d6},   [r1], r2
        pld             [r1]
        pld             [r1, r2]
        subs            r3,  r3,  #2
        vext.8          q1,  q0,  q1,  #1
        \vhadd          q0,  q0,  q1
        vext.8          q3,  q2,  q3,  #1
        \vhadd          q2,  q2,  q3
        vst1.64         {d0, d1},  [r0,:128], r2
        vst1.64         {d4, d5},  [r0,:128], r2
        bne             1b
        bx              lr
        .endm

        .macro pixels16_y2 vhadd=vrhadd.u8
        push            {lr}
        add             ip,  r1,  r2
        lsl             lr,  r2,  #1
        vld1.64         {d0, d1},  [r1], lr
        vld1.64         {d2, d3},  [ip], lr
1:      subs            r3,  r3,  #2
        \vhadd          q2,  q0,  q1
        vld1.64         {d0, d1},  [r1], lr
        \vhadd          q3,  q0,  q1
        vld1.64         {d2, d3},  [ip], lr
        pld             [r1]
        pld             [ip]
        vst1.64         {d4, d5},  [r0,:128], r2
        vst1.64         {d6, d7},  [r0,:128], r2
        bne             1b
        pop             {pc}
        .endm

        .macro pixels16_xy2 vshrn=vrshrn.u16 no_rnd=0
        push            {lr}
        lsl             lr,  r2,  #1
        add             ip,  r1,  r2
        vld1.64         {d0-d2},   [r1], lr
        vld1.64         {d4-d6},   [ip], lr
.if \no_rnd
        vmov.i16        q13, #1
.endif
        pld             [r1]
        pld             [ip]
        vext.8          q1,  q0,  q1,  #1
        vext.8          q3,  q2,  q3,  #1
        vaddl.u8        q8,  d0,  d2
        vaddl.u8        q10, d1,  d3
        vaddl.u8        q9,  d4,  d6
        vaddl.u8        q11, d5,  d7
1:      subs            r3,  r3,  #2
        vld1.64         {d0-d2},   [r1], lr
        vadd.u16        q12, q8,  q9
        pld             [r1]
.if \no_rnd
        vadd.u16        q12, q12, q13
.endif
        vext.8          q15, q0,  q1,  #1
        vadd.u16        q1,  q10, q11
        \vshrn          d28, q12, #2
.if \no_rnd
        vadd.u16        q1,  q1,  q13
.endif
        \vshrn          d29, q1,  #2
        vaddl.u8        q8,  d0,  d30
        vld1.64         {d2-d4},   [ip], lr
        vaddl.u8        q10, d1,  d31
        vst1.64         {d28,d29}, [r0,:128], r2
        vadd.u16        q12, q8,  q9
        pld             [ip]
.if \no_rnd
        vadd.u16        q12, q12, q13
.endif
        vext.8          q2,  q1,  q2,  #1
        vadd.u16        q0,  q10, q11
        \vshrn          d30, q12, #2
.if \no_rnd
        vadd.u16        q0,  q0,  q13
.endif
        \vshrn          d31, q0,  #2
        vaddl.u8        q9,  d2,  d4
        vaddl.u8        q11, d3,  d5
        vst1.64         {d30,d31}, [r0,:128], r2
        bgt             1b
        pop             {pc}
        .endm

        .macro pixels8
1:      vld1.64         {d0}, [r1], r2
        vld1.64         {d1}, [r1], r2
        vld1.64         {d2}, [r1], r2
        pld             [r1, r2, lsl #2]
        vld1.64         {d3}, [r1], r2
        pld             [r1]
        pld             [r1, r2]
        pld             [r1, r2, lsl #1]
        subs            r3,  r3,  #4
        vst1.64         {d0}, [r0,:64], r2
        vst1.64         {d1}, [r0,:64], r2
        vst1.64         {d2}, [r0,:64], r2
        vst1.64         {d3}, [r0,:64], r2
        bne             1b
        bx              lr
        .endm

        .macro pixels8_x2 vhadd=vrhadd.u8
1:      vld1.64         {d0, d1},  [r1], r2
        vext.8          d1,  d0,  d1,  #1
        vld1.64         {d2, d3},  [r1], r2
        vext.8          d3,  d2,  d3,  #1
        pld             [r1]
        pld             [r1, r2]
        subs            r3,  r3,  #2
        vswp            d1,  d2
        \vhadd          q0,  q0,  q1
        vst1.64         {d0},      [r0,:64], r2
        vst1.64         {d1},      [r0,:64], r2
        bne             1b
        bx              lr
        .endm

        .macro pixels8_y2 vhadd=vrhadd.u8
        push            {lr}
        add             ip,  r1,  r2
        lsl             lr,  r2,  #1
        vld1.64         {d0}, [r1], lr
        vld1.64         {d1}, [ip], lr
1:      subs            r3,  r3,  #2
        \vhadd          d4,  d0,  d1
        vld1.64         {d0}, [r1], lr
        \vhadd          d5,  d0,  d1
        vld1.64         {d1}, [ip], lr
        pld             [r1]
        pld             [ip]
        vst1.64         {d4}, [r0,:64], r2
        vst1.64         {d5}, [r0,:64], r2
        bne             1b
        pop             {pc}
        .endm

        .macro pixels8_xy2 vshrn=vrshrn.u16 no_rnd=0
        push            {lr}
        lsl             lr,  r2,  #1
        add             ip,  r1,  r2
        vld1.64         {d0, d1},  [r1], lr
        vld1.64         {d2, d3},  [ip], lr
.if \no_rnd
        vmov.i16        q11, #1
.endif
        pld             [r1]
        pld             [ip]
        vext.8          d4,  d0,  d1,  #1
        vext.8          d6,  d2,  d3,  #1
        vaddl.u8        q8,  d0,  d4
        vaddl.u8        q9,  d2,  d6
1:      subs            r3,  r3,  #2
        vld1.64         {d0, d1},  [r1], lr
        pld             [r1]
        vadd.u16        q10, q8,  q9
        vext.8          d4,  d0,  d1,  #1
.if \no_rnd
        vadd.u16        q10, q10, q11
.endif
        vaddl.u8        q8,  d0,  d4
        \vshrn          d5,  q10, #2
        vld1.64         {d2, d3},  [ip], lr
        vadd.u16        q10, q8,  q9
        pld             [ip]
.if \no_rnd
        vadd.u16        q10, q10, q11
.endif
        vst1.64         {d5},      [r0,:64], r2
        \vshrn          d7,  q10, #2
        vext.8          d6,  d2,  d3,  #1
        vaddl.u8        q9,  d2,  d6
        vst1.64         {d7},      [r0,:64], r2
        bgt             1b
        pop             {pc}
        .endm

        .macro pixfunc pfx name suf rnd_op args:vararg
function ff_\pfx\name\suf\()_neon, export=1
        \name \rnd_op \args
        .endfunc
        .endm

        .macro pixfunc2 pfx name args:vararg
        pixfunc \pfx \name
        pixfunc \pfx \name \args
        .endm

function ff_put_h264_qpel16_mc00_neon, export=1
        mov             r3,  #16
        .endfunc

        pixfunc  put_ pixels16
        pixfunc2 put_ pixels16_x2,  _no_rnd, vhadd.u8
        pixfunc2 put_ pixels16_y2,  _no_rnd, vhadd.u8
        pixfunc2 put_ pixels16_xy2, _no_rnd, vshrn.u16, 1

function ff_avg_h264_qpel16_mc00_neon, export=1
        mov             r3,  #16
        .endfunc

        pixfunc  avg_ pixels16,, 1

function ff_put_h264_qpel8_mc00_neon, export=1
        mov             r3,  #8
        .endfunc

        pixfunc  put_ pixels8
        pixfunc2 put_ pixels8_x2,   _no_rnd, vhadd.u8
        pixfunc2 put_ pixels8_y2,   _no_rnd, vhadd.u8
        pixfunc2 put_ pixels8_xy2,  _no_rnd, vshrn.u16, 1
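@ Assumed prototype (per dsputil of this FFmpeg version):
@   void ff_float_to_int16_neon(int16_t *dst, const float *src, long len)
@ vcvt with 16 fraction bits saturates to the s32 range, and the following
@ vshrn #16 keeps the integer part, so out-of-range inputs clamp to
@ [-32768,32767].  len is assumed to be a multiple of 8.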
function ff_float_to_int16_neon, export=1
        subs            r2,  r2,  #8
        vld1.64         {d0-d1},   [r1,:128]!
        vcvt.s32.f32    q8,  q0,  #16
        vld1.64         {d2-d3},   [r1,:128]!
        vcvt.s32.f32    q9,  q1,  #16
        beq             3f
        bics            ip,  r2,  #15
        beq             2f
1:      subs            ip,  ip,  #16
        vshrn.s32       d4,  q8,  #16
        vld1.64         {d0-d1},   [r1,:128]!
        vcvt.s32.f32    q0,  q0,  #16
        vshrn.s32       d5,  q9,  #16
        vld1.64         {d2-d3},   [r1,:128]!
        vcvt.s32.f32    q1,  q1,  #16
        vshrn.s32       d6,  q0,  #16
        vst1.64         {d4-d5},   [r0,:128]!
        vshrn.s32       d7,  q1,  #16
        vld1.64         {d16-d17}, [r1,:128]!
        vcvt.s32.f32    q8,  q8,  #16
        vld1.64         {d18-d19}, [r1,:128]!
        vcvt.s32.f32    q9,  q9,  #16
        vst1.64         {d6-d7},   [r0,:128]!
        bne             1b
        ands            r2,  r2,  #15
        beq             3f
2:      vld1.64         {d0-d1},   [r1,:128]!
        vshrn.s32       d4,  q8,  #16
        vcvt.s32.f32    q0,  q0,  #16
        vld1.64         {d2-d3},   [r1,:128]!
        vshrn.s32       d5,  q9,  #16
        vcvt.s32.f32    q1,  q1,  #16
        vshrn.s32       d6,  q0,  #16
        vst1.64         {d4-d5},   [r0,:128]!
        vshrn.s32       d7,  q1,  #16
        vst1.64         {d6-d7},   [r0,:128]!
        bx              lr
3:      vshrn.s32       d4,  q8,  #16
        vshrn.s32       d5,  q9,  #16
        vst1.64         {d4-d5},   [r0,:128]!
        bx              lr
        .endfunc
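@ Assumed prototype (per dsputil of this FFmpeg version):
@   void ff_float_to_int16_interleave_neon(int16_t *dst, const float **src,
@                                          long len, int channels)
@ channels < 2 tail-calls ff_float_to_int16_neon on src[0]; channels == 2
@ uses the dedicated path below; larger counts are handled in groups of 4,
@ then 2, then 1 channel, interleaving with vsri/vzip before the strided
@ stores.  len is assumed to be a multiple of 8.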
function ff_float_to_int16_interleave_neon, export=1
        cmp             r3,  #2
        ldrlt           r1,  [r1]
        blt             ff_float_to_int16_neon
        bne             4f

        ldr             r3,  [r1]
        ldr             r1,  [r1, #4]

        subs            r2,  r2,  #8
        vld1.64         {d0-d1},   [r3,:128]!
        vcvt.s32.f32    q8,  q0,  #16
        vld1.64         {d2-d3},   [r3,:128]!
        vcvt.s32.f32    q9,  q1,  #16
        vld1.64         {d20-d21}, [r1,:128]!
        vcvt.s32.f32    q10, q10, #16
        vld1.64         {d22-d23}, [r1,:128]!
        vcvt.s32.f32    q11, q11, #16
        beq             3f
        bics            ip,  r2,  #15
        beq             2f
1:      subs            ip,  ip,  #16
        vld1.64         {d0-d1},   [r3,:128]!
        vcvt.s32.f32    q0,  q0,  #16
        vsri.32         q10, q8,  #16
        vld1.64         {d2-d3},   [r3,:128]!
        vcvt.s32.f32    q1,  q1,  #16
        vld1.64         {d24-d25}, [r1,:128]!
        vcvt.s32.f32    q12, q12, #16
        vld1.64         {d26-d27}, [r1,:128]!
        vsri.32         q11, q9,  #16
        vst1.64         {d20-d21}, [r0,:128]!
        vcvt.s32.f32    q13, q13, #16
        vst1.64         {d22-d23}, [r0,:128]!
        vsri.32         q12, q0,  #16
        vld1.64         {d16-d17}, [r3,:128]!
        vsri.32         q13, q1,  #16
        vst1.64         {d24-d25}, [r0,:128]!
        vcvt.s32.f32    q8,  q8,  #16
        vld1.64         {d18-d19}, [r3,:128]!
        vcvt.s32.f32    q9,  q9,  #16
        vld1.64         {d20-d21}, [r1,:128]!
        vcvt.s32.f32    q10, q10, #16
        vld1.64         {d22-d23}, [r1,:128]!
        vcvt.s32.f32    q11, q11, #16
        vst1.64         {d26-d27}, [r0,:128]!
        bne             1b
        ands            r2,  r2,  #15
        beq             3f
2:      vsri.32         q10, q8,  #16
        vld1.64         {d0-d1},   [r3,:128]!
        vcvt.s32.f32    q0,  q0,  #16
        vld1.64         {d2-d3},   [r3,:128]!
        vcvt.s32.f32    q1,  q1,  #16
        vld1.64         {d24-d25}, [r1,:128]!
        vcvt.s32.f32    q12, q12, #16
        vsri.32         q11, q9,  #16
        vld1.64         {d26-d27}, [r1,:128]!
        vcvt.s32.f32    q13, q13, #16
        vst1.64         {d20-d21}, [r0,:128]!
        vsri.32         q12, q0,  #16
        vst1.64         {d22-d23}, [r0,:128]!
        vsri.32         q13, q1,  #16
        vst1.64         {d24-d27}, [r0,:128]!
        bx              lr
3:      vsri.32         q10, q8,  #16
        vsri.32         q11, q9,  #16
        vst1.64         {d20-d23}, [r0,:128]!
        bx              lr

4:      push            {r4-r8,lr}
        cmp             r3,  #4
        lsl             ip,  r3,  #1
        blt             4f

        @ 4 channels
5:      ldmia           r1!, {r4-r7}
        mov             lr,  r2
        mov             r8,  r0
        vld1.64         {d16-d17}, [r4,:128]!
        vcvt.s32.f32    q8,  q8,  #16
        vld1.64         {d18-d19}, [r5,:128]!
        vcvt.s32.f32    q9,  q9,  #16
        vld1.64         {d20-d21}, [r6,:128]!
        vcvt.s32.f32    q10, q10, #16
        vld1.64         {d22-d23}, [r7,:128]!
        vcvt.s32.f32    q11, q11, #16
6:      subs            lr,  lr,  #8
        vld1.64         {d0-d1},   [r4,:128]!
        vcvt.s32.f32    q0,  q0,  #16
        vsri.32         q9,  q8,  #16
        vld1.64         {d2-d3},   [r5,:128]!
        vcvt.s32.f32    q1,  q1,  #16
        vsri.32         q11, q10, #16
        vld1.64         {d4-d5},   [r6,:128]!
        vcvt.s32.f32    q2,  q2,  #16
        vzip.32         d18, d22
        vld1.64         {d6-d7},   [r7,:128]!
        vcvt.s32.f32    q3,  q3,  #16
        vzip.32         d19, d23
        vst1.64         {d18},     [r8], ip
        vsri.32         q1,  q0,  #16
        vst1.64         {d22},     [r8], ip
        vsri.32         q3,  q2,  #16
        vst1.64         {d19},     [r8], ip
        vzip.32         d2,  d6
        vst1.64         {d23},     [r8], ip
        vzip.32         d3,  d7
        beq             7f
        vld1.64         {d16-d17}, [r4,:128]!
        vcvt.s32.f32    q8,  q8,  #16
        vst1.64         {d2},      [r8], ip
        vld1.64         {d18-d19}, [r5,:128]!
        vcvt.s32.f32    q9,  q9,  #16
        vst1.64         {d6},      [r8], ip
        vld1.64         {d20-d21}, [r6,:128]!
        vcvt.s32.f32    q10, q10, #16
        vst1.64         {d3},      [r8], ip
        vld1.64         {d22-d23}, [r7,:128]!
        vcvt.s32.f32    q11, q11, #16
        vst1.64         {d7},      [r8], ip
        b               6b
7:      vst1.64         {d2},      [r8], ip
        vst1.64         {d6},      [r8], ip
        vst1.64         {d3},      [r8], ip
        vst1.64         {d7},      [r8], ip
        subs            r3,  r3,  #4
        popeq           {r4-r8,pc}
        cmp             r3,  #4
        add             r0,  r0,  #8
        bge             5b

        @ 2 channels
4:      cmp             r3,  #2
        blt             4f
        ldmia           r1!, {r4-r5}
        mov             lr,  r2
        mov             r8,  r0
        tst             lr,  #8
        vld1.64         {d16-d17}, [r4,:128]!
        vcvt.s32.f32    q8,  q8,  #16
        vld1.64         {d18-d19}, [r5,:128]!
        vcvt.s32.f32    q9,  q9,  #16
        vld1.64         {d20-d21}, [r4,:128]!
        vcvt.s32.f32    q10, q10, #16
        vld1.64         {d22-d23}, [r5,:128]!
        vcvt.s32.f32    q11, q11, #16
        beq             6f
        subs            lr,  lr,  #8
        beq             7f
        vsri.32         d18, d16, #16
        vsri.32         d19, d17, #16
        vld1.64         {d16-d17}, [r4,:128]!
        vcvt.s32.f32    q8,  q8,  #16
        vst1.32         {d18[0]},  [r8], ip
        vsri.32         d22, d20, #16
        vst1.32         {d18[1]},  [r8], ip
        vsri.32         d23, d21, #16
        vst1.32         {d19[0]},  [r8], ip
        vst1.32         {d19[1]},  [r8], ip
        vld1.64         {d18-d19}, [r5,:128]!
        vcvt.s32.f32    q9,  q9,  #16
        vst1.32         {d22[0]},  [r8], ip
        vst1.32         {d22[1]},  [r8], ip
        vld1.64         {d20-d21}, [r4,:128]!
        vcvt.s32.f32    q10, q10, #16
        vst1.32         {d23[0]},  [r8], ip
        vst1.32         {d23[1]},  [r8], ip
        vld1.64         {d22-d23}, [r5,:128]!
        vcvt.s32.f32    q11, q11, #16
6:      subs            lr,  lr,  #16
        vld1.64         {d0-d1},   [r4,:128]!
        vcvt.s32.f32    q0,  q0,  #16
        vsri.32         d18, d16, #16
        vld1.64         {d2-d3},   [r5,:128]!
        vcvt.s32.f32    q1,  q1,  #16
        vsri.32         d19, d17, #16
        vld1.64         {d4-d5},   [r4,:128]!
        vcvt.s32.f32    q2,  q2,  #16
        vld1.64         {d6-d7},   [r5,:128]!
        vcvt.s32.f32    q3,  q3,  #16
        vst1.32         {d18[0]},  [r8], ip
        vsri.32         d22, d20, #16
        vst1.32         {d18[1]},  [r8], ip
        vsri.32         d23, d21, #16
        vst1.32         {d19[0]},  [r8], ip
        vsri.32         d2,  d0,  #16
        vst1.32         {d19[1]},  [r8], ip
        vsri.32         d3,  d1,  #16
        vst1.32         {d22[0]},  [r8], ip
        vsri.32         d6,  d4,  #16
        vst1.32         {d22[1]},  [r8], ip
        vsri.32         d7,  d5,  #16
        vst1.32         {d23[0]},  [r8], ip
        vst1.32         {d23[1]},  [r8], ip
        beq             6f
        vld1.64         {d16-d17}, [r4,:128]!
        vcvt.s32.f32    q8,  q8,  #16
        vst1.32         {d2[0]},   [r8], ip
        vst1.32         {d2[1]},   [r8], ip
        vld1.64         {d18-d19}, [r5,:128]!
        vcvt.s32.f32    q9,  q9,  #16
        vst1.32         {d3[0]},   [r8], ip
        vst1.32         {d3[1]},   [r8], ip
        vld1.64         {d20-d21}, [r4,:128]!
        vcvt.s32.f32    q10, q10, #16
        vst1.32         {d6[0]},   [r8], ip
        vst1.32         {d6[1]},   [r8], ip
        vld1.64         {d22-d23}, [r5,:128]!
        vcvt.s32.f32    q11, q11, #16
        vst1.32         {d7[0]},   [r8], ip
        vst1.32         {d7[1]},   [r8], ip
        bgt             6b
6:      vst1.32         {d2[0]},   [r8], ip
        vst1.32         {d2[1]},   [r8], ip
        vst1.32         {d3[0]},   [r8], ip
        vst1.32         {d3[1]},   [r8], ip
        vst1.32         {d6[0]},   [r8], ip
        vst1.32         {d6[1]},   [r8], ip
        vst1.32         {d7[0]},   [r8], ip
        vst1.32         {d7[1]},   [r8], ip
        b               8f
7:      vsri.32         d18, d16, #16
        vsri.32         d19, d17, #16
        vst1.32         {d18[0]},  [r8], ip
        vsri.32         d22, d20, #16
        vst1.32         {d18[1]},  [r8], ip
        vsri.32         d23, d21, #16
        vst1.32         {d19[0]},  [r8], ip
        vst1.32         {d19[1]},  [r8], ip
        vst1.32         {d22[0]},  [r8], ip
        vst1.32         {d22[1]},  [r8], ip
        vst1.32         {d23[0]},  [r8], ip
        vst1.32         {d23[1]},  [r8], ip
8:      subs            r3,  r3,  #2
        add             r0,  r0,  #4
        popeq           {r4-r8,pc}

        @ 1 channel
4:      ldr             r4,  [r1], #4
        tst             r2,  #8
        mov             lr,  r2
        mov             r5,  r0
        vld1.64         {d0-d1},   [r4,:128]!
        vcvt.s32.f32    q0,  q0,  #16
        vld1.64         {d2-d3},   [r4,:128]!
        vcvt.s32.f32    q1,  q1,  #16
        bne             8f
6:      subs            lr,  lr,  #16
        vld1.64         {d4-d5},   [r4,:128]!
        vcvt.s32.f32    q2,  q2,  #16
        vld1.64         {d6-d7},   [r4,:128]!
        vcvt.s32.f32    q3,  q3,  #16
        vst1.16         {d0[1]},   [r5,:16], ip
        vst1.16         {d0[3]},   [r5,:16], ip
        vst1.16         {d1[1]},   [r5,:16], ip
        vst1.16         {d1[3]},   [r5,:16], ip
        vst1.16         {d2[1]},   [r5,:16], ip
        vst1.16         {d2[3]},   [r5,:16], ip
        vst1.16         {d3[1]},   [r5,:16], ip
        vst1.16         {d3[3]},   [r5,:16], ip
        beq             7f
        vld1.64         {d0-d1},   [r4,:128]!
        vcvt.s32.f32    q0,  q0,  #16
        vld1.64         {d2-d3},   [r4,:128]!
        vcvt.s32.f32    q1,  q1,  #16
7:      vst1.16         {d4[1]},   [r5,:16], ip
        vst1.16         {d4[3]},   [r5,:16], ip
        vst1.16         {d5[1]},   [r5,:16], ip
        vst1.16         {d5[3]},   [r5,:16], ip
        vst1.16         {d6[1]},   [r5,:16], ip
        vst1.16         {d6[3]},   [r5,:16], ip
        vst1.16         {d7[1]},   [r5,:16], ip
        vst1.16         {d7[3]},   [r5,:16], ip
        bgt             6b
        pop             {r4-r8,pc}
8:      subs            lr,  lr,  #8
        vst1.16         {d0[1]},   [r5,:16], ip
        vst1.16         {d0[3]},   [r5,:16], ip
        vst1.16         {d1[1]},   [r5,:16], ip
        vst1.16         {d1[3]},   [r5,:16], ip
        vst1.16         {d2[1]},   [r5,:16], ip
        vst1.16         {d2[3]},   [r5,:16], ip
        vst1.16         {d3[1]},   [r5,:16], ip
        vst1.16         {d3[3]},   [r5,:16], ip
        popeq           {r4-r8,pc}
        vld1.64         {d0-d1},   [r4,:128]!
        vcvt.s32.f32    q0,  q0,  #16
        vld1.64         {d2-d3},   [r4,:128]!
        vcvt.s32.f32    q1,  q1,  #16
        b               6b
        .endfunc
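@ Assumed prototype (per dsputil of this FFmpeg version):
@   void ff_vector_fmul_neon(float *dst, const float *src, int len)
@ Multiplies in place: dst[i] *= src[i].  r3 keeps the original dst for
@ the stores while r0 advances as the first source pointer.  len is
@ assumed to be a multiple of 8.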
function ff_vector_fmul_neon, export=1
        mov             r3,  r0
        subs            r2,  r2,  #8
        vld1.64         {d0-d3},   [r0,:128]!
        vld1.64         {d4-d7},   [r1,:128]!
        vmul.f32        q8,  q0,  q2
        vmul.f32        q9,  q1,  q3
        beq             3f
        bics            ip,  r2,  #15
        beq             2f
1:      subs            ip,  ip,  #16
        vld1.64         {d0-d1},   [r0,:128]!
        vld1.64         {d4-d5},   [r1,:128]!
        vmul.f32        q10, q0,  q2
        vld1.64         {d2-d3},   [r0,:128]!
        vld1.64         {d6-d7},   [r1,:128]!
        vmul.f32        q11, q1,  q3
        vst1.64         {d16-d19}, [r3,:128]!
        vld1.64         {d0-d1},   [r0,:128]!
        vld1.64         {d4-d5},   [r1,:128]!
        vmul.f32        q8,  q0,  q2
        vld1.64         {d2-d3},   [r0,:128]!
        vld1.64         {d6-d7},   [r1,:128]!
        vmul.f32        q9,  q1,  q3
        vst1.64         {d20-d23}, [r3,:128]!
        bne             1b
        ands            r2,  r2,  #15
        beq             3f
2:      vld1.64         {d0-d1},   [r0,:128]!
        vld1.64         {d4-d5},   [r1,:128]!
        vst1.64         {d16-d17}, [r3,:128]!
        vmul.f32        q8,  q0,  q2
        vld1.64         {d2-d3},   [r0,:128]!
        vld1.64         {d6-d7},   [r1,:128]!
        vst1.64         {d18-d19}, [r3,:128]!
        vmul.f32        q9,  q1,  q3
3:      vst1.64         {d16-d19}, [r3,:128]!
        bx              lr
        .endfunc

@ Assumed prototype (per dsputil of this FFmpeg version):
@   void ff_vector_fmul_window_neon(float *dst, const float *src0,
@                                   const float *src1, const float *win,
@                                   float add_bias, int len)
@ add_bias and len are passed on the stack; the bias is splatted into q8
@ and pre-loaded into both accumulators each iteration.
function ff_vector_fmul_window_neon, export=1
        vld1.32         {d16[],d17[]}, [sp,:32]
        push            {r4,r5,lr}
        ldr             lr,  [sp, #16]
        sub             r2,  r2,  #8
        sub             r5,  lr,  #2
        add             r2,  r2,  r5,  lsl #2
        add             r4,  r3,  r5,  lsl #3
        add             ip,  r0,  r5,  lsl #3
        mov             r5,  #-16
        vld1.64         {d0,d1},   [r1,:128]!
        vld1.64         {d2,d3},   [r2,:128], r5
        vld1.64         {d4,d5},   [r3,:128]!
        vld1.64         {d6,d7},   [r4,:128], r5
1:      subs            lr,  lr,  #4
        vmov            q11, q8
        vmla.f32        d22, d0,  d4
        vmov            q10, q8
        vmla.f32        d23, d1,  d5
        vrev64.32       q3,  q3
        vmla.f32        d20, d0,  d7
        vrev64.32       q1,  q1
        vmla.f32        d21, d1,  d6
        beq             2f
        vmla.f32        d22, d3,  d7
        vld1.64         {d0,d1},   [r1,:128]!
        vmla.f32        d23, d2,  d6
        vld1.64         {d18,d19}, [r2,:128], r5
        vmls.f32        d20, d3,  d4
        vld1.64         {d24,d25}, [r3,:128]!
        vmls.f32        d21, d2,  d5
        vld1.64         {d6,d7},   [r4,:128], r5
        vmov            q1,  q9
        vrev64.32       q11, q11
        vmov            q2,  q12
        vswp            d22, d23
        vst1.64         {d20,d21}, [r0,:128]!
        vst1.64         {d22,d23}, [ip,:128], r5
        b               1b
2:      vmla.f32        d22, d3,  d7
        vmla.f32        d23, d2,  d6
        vmls.f32        d20, d3,  d4
        vmls.f32        d21, d2,  d5
        vrev64.32       q11, q11
        vswp            d22, d23
        vst1.64         {d20,d21}, [r0,:128]!
        vst1.64         {d22,d23}, [ip,:128], r5
        pop             {r4,r5,pc}
        .endfunc