/*
 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "asm.S"

        .fpu neon

@ Transpose an 8x8 byte matrix held in eight d registers (one row per
@ register), done as 32-bit, then 16-bit, then 8-bit element swaps.
        .macro transpose_8x8 r0 r1 r2 r3 r4 r5 r6 r7
        vtrn.32         \r0, \r4
        vtrn.32         \r1, \r5
        vtrn.32         \r2, \r6
        vtrn.32         \r3, \r7
        vtrn.16         \r0, \r2
        vtrn.16         \r1, \r3
        vtrn.16         \r4, \r6
        vtrn.16         \r5, \r7
        vtrn.8          \r0, \r1
        vtrn.8          \r2, \r3
        vtrn.8          \r4, \r5
        vtrn.8          \r6, \r7
        .endm

@ Swap the first four registers with the last four (used to pair up
@ halves before a 16-bit 4x4 transpose).
        .macro swap4 r0 r1 r2 r3 r4 r5 r6 r7
        vswp            \r0, \r4
        vswp            \r1, \r5
        vswp            \r2, \r6
        vswp            \r3, \r7
        .endm

@ Transpose 4x4 blocks of 16-bit elements across eight q registers.
        .macro transpose16_4x4 r0 r1 r2 r3 r4 r5 r6 r7
        vtrn.32         \r0, \r2
        vtrn.32         \r1, \r3
        vtrn.32         \r4, \r6
        vtrn.32         \r5, \r7
        vtrn.16         \r0, \r1
        vtrn.16         \r2, \r3
        vtrn.16         \r4, \r5
        vtrn.16         \r6, \r7
        .endm

/* chroma_mc8(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y) */
@ 8x8 chroma motion compensation, put or avg variant selected by \type.
@ r0 = dst, r1 = src, r2 = stride, r3 = h; x and y live on the stack
@ (loaded into r4/r5 below; #20 skips the five pushed registers).
        .macro  h264_chroma_mc8 type
function ff_\type\()_h264_chroma_mc8_neon, export=1
        push            {r4-r7, lr}
        ldrd            r4,  [sp, #20]          @ r4 = x, r5 = y
.ifc \type,avg
        mov             lr,  r0                 @ keep dst for the averaging loads
.endif
        pld             [r1]
        pld             [r1, r2]

        @ Bilinear weights (scaled by 1, summed to 64):
        @   r7 = D = x*y            r6 = C = (8-x)*y
        @   ip = B = x*(8-y)        r4 = A = (8-x)*(8-y)
        @ muls also sets the flags: Z set when x*y == 0.
        muls            r7,  r4,  r5
        rsb             r6,  r7,  r5,  lsl #3
        rsb             ip,  r7,  r4,  lsl #3
        sub             r4,  r7,  r4,  lsl #3
        sub             r4,  r4,  r5,  lsl #3
        add             r4,  r4,  #64

        beq             2f                      @ x*y == 0: 2-tap (or copy) path

        add             r5,  r1,  r2            @ second source row pointer

        vdup.8          d0,  r4                 @ weight A
@ h264_chroma_mc8 (cont.): full bilinear path, two rows per iteration.
        lsl             r4,  r2,  #1            @ step two rows per load
        vdup.8          d1,  ip                 @ weight B
        vld1.64         {d4, d5}, [r1], r4
        vdup.8          d2,  r6                 @ weight C
        vld1.64         {d6, d7}, [r5], r4
        vdup.8          d3,  r7                 @ weight D

        vext.8          d5,  d4,  d5,  #1       @ d5 = row shifted left one pixel
        vext.8          d7,  d6,  d7,  #1

1:      pld             [r5]
        @ q8/q9 = A*cur + B*cur+1 + C*next + D*next+1 for two output rows
        vmull.u8        q8,  d4,  d0
        vmlal.u8        q8,  d5,  d1
        vld1.64         {d4, d5}, [r1], r4
        vmlal.u8        q8,  d6,  d2
        vext.8          d5,  d4,  d5,  #1
        vmlal.u8        q8,  d7,  d3
        vmull.u8        q9,  d6,  d0
        subs            r3,  r3,  #2            @ two rows per iteration
        vmlal.u8        q9,  d7,  d1
        vmlal.u8        q9,  d4,  d2
        vmlal.u8        q9,  d5,  d3
        vrshrn.u16      d16, q8,  #6            @ (sum + 32) >> 6
        vld1.64         {d6, d7}, [r5], r4
        pld             [r1]
        vrshrn.u16      d17, q9,  #6
.ifc \type,avg
        vld1.64         {d20}, [lr,:64], r2     @ existing dst rows
        vld1.64         {d21}, [lr,:64], r2
        vrhadd.u8       q8,  q8,  q10           @ rounding average with dst
.endif
        vext.8          d7,  d6,  d7,  #1
        vst1.64         {d16}, [r0,:64], r2
        vst1.64         {d17}, [r0,:64], r2
        bgt             1b

        pop             {r4-r7, pc}

@ One of x, y is zero: 2-tap filter.  ip = B + C collapses to the single
@ non-trivial weight; r6 = (8-x)*y selects vertical (r6 != 0) vs
@ horizontal (r6 == 0, i.e. y == 0) filtering.
2:      tst             r6,  r6
        add             ip,  ip,  r6
        vdup.8          d0,  r4
        vdup.8          d1,  ip

        beq             4f                      @ y == 0: horizontal-only path

        @ Vertical 2-tap filter, two rows per iteration.
        add             r5,  r1,  r2
        lsl             r4,  r2,  #1
        vld1.64         {d4}, [r1], r4
        vld1.64         {d6}, [r5], r4

3:      pld             [r5]
        vmull.u8        q8,  d4,  d0
        vmlal.u8        q8,  d6,  d1
        vld1.64         {d4}, [r1], r4
        vmull.u8        q9,  d6,  d0
        vmlal.u8        q9,  d4,  d1
        vld1.64         {d6}, [r5], r4
        vrshrn.u16      d16, q8,  #6
        vrshrn.u16      d17, q9,  #6
.ifc \type,avg
        vld1.64         {d20}, [lr,:64], r2
        vld1.64         {d21}, [lr,:64], r2
        vrhadd.u8       q8,  q8,  q10
.endif
        subs            r3,  r3,  #2
        pld             [r1]
        vst1.64         {d16}, [r0,:64], r2
        vst1.64         {d17}, [r0,:64], r2
        bgt             3b

        pop             {r4-r7, pc}

@ Horizontal 2-tap filter (also handles the plain-copy case x == y == 0,
@ where d0 = 64 and d1 = 0).
4:      vld1.64         {d4, d5}, [r1], r2
        vld1.64         {d6, d7}, [r1], r2
        vext.8          d5,  d4,  d5,  #1
        vext.8          d7,  d6,  d7,  #1

5:      pld             [r1]
        subs            r3,  r3,  #2
        vmull.u8        q8,  d4,  d0
        vmlal.u8        q8,  d5,  d1
        vld1.64         {d4, d5}, [r1], r2
        vmull.u8        q9,  d6,  d0
        vmlal.u8        q9,  d7,  d1
        pld             [r1]
        vext.8          d5,  d4,  d5,  #1
        vrshrn.u16      d16, q8,  #6
        vrshrn.u16      d17, q9,  #6
.ifc \type,avg
        vld1.64         {d20}, [lr,:64], r2
        vld1.64         {d21}, [lr,:64], r2
        vrhadd.u8       q8,  q8,  q10
.endif
        vld1.64         {d6, d7}, [r1], r2
        vext.8          d7,  d6,  d7,  #1
        vst1.64         {d16}, [r0,:64], r2
        vst1.64         {d17}, [r0,:64], r2
        bgt             5b

        pop             {r4-r7, pc}
        .endfunc
        .endm

/* chroma_mc4(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y) */
@ 4-wide chroma MC.  Same weight setup as mc8, but two 4-pixel rows are
@ packed into one d register (vtrn.32) so each multiply covers both rows.
        .macro  h264_chroma_mc4 type
function ff_\type\()_h264_chroma_mc4_neon, export=1
        push            {r4-r7, lr}
        ldrd            r4,  [sp, #20]          @ r4 = x, r5 = y
.ifc \type,avg
        mov             lr,  r0
.endif
        pld             [r1]
        pld             [r1, r2]

        @ Same bilinear weight derivation as mc8: A, B, C, D in r4/ip/r6/r7.
        muls            r7,  r4,  r5
        rsb             r6,  r7,  r5,  lsl #3
        rsb             ip,  r7,  r4,  lsl #3
        sub             r4,  r7,  r4,  lsl #3
        sub             r4,  r4,  r5,  lsl #3
        add             r4,  r4,  #64

        beq             2f

        add             r5,  r1,  r2

        vdup.8          d0,  r4
        lsl             r4,  r2,  #1
        vdup.8          d1,  ip
        vld1.64         {d4}, [r1], r4
        vdup.8          d2,  r6
        vld1.64         {d6}, [r5], r4
        vdup.8          d3,  r7

        @ Interleave row and row+1-pixel so halves of d4/d6 hold both taps.
        vext.8          d5,  d4,  d5,  #1
        vext.8          d7,  d6,  d7,  #1
        vtrn.32         d4,  d5
        vtrn.32         d6,  d7

        vtrn.32         d0,  d1                 @ pack weights to match
        vtrn.32         d2,  d3

1:      pld             [r5]
        vmull.u8        q8,  d4,  d0
        vmlal.u8        q8,  d6,  d2
        vld1.64         {d4}, [r1], r4
        vext.8          d5,  d4,  d5,  #1
        vtrn.32         d4,  d5
        vmull.u8        q9,  d6,  d0
        vmlal.u8        q9,  d4,  d2
        vld1.64         {d6}, [r5], r4
        vadd.i16        d16, d16, d17           @ fold the two packed halves
        vadd.i16        d17, d18, d19
        vrshrn.u16      d16, q8,  #6
        subs            r3,  r3,  #2
        pld             [r1]
.ifc \type,avg
        vld1.32         {d20[0]}, [lr,:32], r2
        vld1.32         {d20[1]}, [lr,:32], r2
        vrhadd.u8       d16, d16, d20
.endif
        vext.8          d7,  d6,  d7,  #1
        vtrn.32         d6,  d7
        vst1.32         {d16[0]}, [r0,:32], r2
        vst1.32         {d16[1]}, [r0,:32], r2
        bgt             1b

        pop             {r4-r7, pc}

@ x*y == 0: 2-tap path; r6 == 0 selects the horizontal-only variant.
2:      tst             r6,  r6
        add             ip,  ip,  r6
        vdup.8          d0,  r4
        vdup.8          d1,  ip
        vtrn.32         d0,  d1

        beq             4f

        @ Vertical 2-tap filter on packed row pairs.
        vext.32         d1,  d0,  d1,  #1
        add             r5,  r1,  r2
        lsl             r4,  r2,  #1
        vld1.32         {d4[0]}, [r1], r4
        vld1.32         {d4[1]}, [r5], r4

3:      pld             [r5]
        vmull.u8        q8,  d4,  d0
        vld1.32         {d4[0]}, [r1], r4
        vmull.u8        q9,  d4,  d1
        vld1.32         {d4[1]}, [r5], r4
        vadd.i16        d16, d16, d17
        vadd.i16        d17, d18, d19
        vrshrn.u16      d16, q8,  #6
.ifc \type,avg
        vld1.32         {d20[0]}, [lr,:32], r2
        vld1.32         {d20[1]}, [lr,:32], r2
        vrhadd.u8       d16, d16, d20
.endif
        subs            r3,  r3,  #2
        pld             [r1]
        vst1.32         {d16[0]}, [r0,:32], r2
        vst1.32         {d16[1]}, [r0,:32], r2
        bgt             3b

        pop             {r4-r7, pc}

@ Horizontal 2-tap filter (and plain copy when both fractions are 0).
4:      vld1.64         {d4}, [r1], r2
        vld1.64         {d6}, [r1], r2
        vext.8          d5,  d4,  d5,  #1
        vext.8          d7,  d6,  d7,  #1
        vtrn.32         d4,  d5
        vtrn.32         d6,  d7

5:      vmull.u8        q8,  d4,  d0
        vmull.u8        q9,  d6,  d0
        subs            r3,  r3,  #2
        vld1.64         {d4}, [r1], r2
        vext.8          d5,  d4,  d5,  #1
        vtrn.32         d4,  d5
        vadd.i16        d16, d16, d17
        vadd.i16        d17, d18, d19
        pld             [r1]
        vrshrn.u16      d16, q8,  #6
.ifc \type,avg
        vld1.32         {d20[0]}, [lr,:32], r2
        vld1.32         {d20[1]}, [lr,:32], r2
        vrhadd.u8       d16, d16, d20
.endif
        vld1.64         {d6}, [r1], r2
        vext.8          d7,  d6,  d7,  #1
        vtrn.32         d6,  d7
        pld             [r1]
        vst1.32         {d16[0]}, [r0,:32], r2
        vst1.32         {d16[1]}, [r0,:32], r2
        bgt             5b

        pop             {r4-r7, pc}
        .endfunc
        .endm

        .text
        .align

        h264_chroma_mc8 put
        h264_chroma_mc8 avg
        h264_chroma_mc4 put
        h264_chroma_mc4 avg

        /* H.264 loop filter */

@ Common loop-filter entry: r2 = alpha, r3 = beta, [sp] = tc0 pointer.
@ Loads the four tc0 bytes into d24[0] for the filter macros and returns
@ early when alpha or beta is zero, or when the byte-replication trick
@ (and/ands with shifted copies) leaves the sign bit set — i.e. the tc0
@ values disable filtering for this edge (NOTE(review): sign-trick
@ semantics inferred from the bxlt; confirm against the C reference).
        .macro h264_loop_filter_start
        ldr             ip,  [sp]
        tst             r2,  r2
        ldr             ip,  [ip]
        tstne           r3,  r3
        vmov.32         d24[0], ip
        and             ip,  ip,  ip, lsl #16
        bxeq            lr
        ands            ip,  ip,  ip, lsl #8
        bxlt            lr
        .endm

@ Spill q4-q7 (callee-saved d8-d15) to a 16-byte-aligned stack area;
@ ip keeps the alignment slack so align_pop_regs can undo it.
        .macro align_push_regs
        and             ip,  sp,  #15
        add             ip,  ip,  #32
        sub             sp,  sp,  ip
        vst1.64         {d12-d15}, [sp,:128]
        sub             sp,  sp,  #32
        vst1.64         {d8-d11},  [sp,:128]
        .endm

@ Restore d8-d15 and release the aligned spill area (pairs with
@ align_push_regs; the trailing ", ip" post-increment pops the slack).
        .macro align_pop_regs
        vld1.64         {d8-d11},  [sp,:128]!
@ align_pop_regs (cont.): second half of the d8-d15 reload.
        vld1.64         {d12-d15}, [sp,:128], ip
        .endm

@ Luma deblocking filter for one 16-pixel edge.
@ In:  q8 = p0, q9 = p1, q10 = p2, q0 = q0, q1 = q1, q2 = q2,
@      r2 = alpha, r3 = beta, d24[0] = four tc0 bytes.
@ Out: q8 = new p0, q0 = new q0, q4 = new p1 (via vbsl), q5 = new q1.
        .macro h264_loop_filter_luma
        vdup.8          q11, r2         @ alpha
        vmovl.u8        q12, d24        @ spread tc0 bytes to 16 lanes
        vabd.u8         q6,  q8,  q0    @ abs(p0 - q0)
        vmovl.u16       q12, d24
        vabd.u8         q14, q9,  q8    @ abs(p1 - p0)
        vsli.16         q12, q12, #8
        vabd.u8         q15, q1,  q0    @ abs(q1 - q0)
        vsli.32         q12, q12, #16
        vclt.u8         q6,  q6,  q11   @ < alpha
        vdup.8          q11, r3         @ beta
        vclt.s8         q7,  q12, #0    @ lanes with negative tc0 -> no filter
        vclt.u8         q14, q14, q11   @ < beta
        vclt.u8         q15, q15, q11   @ < beta
        vbic            q6,  q6,  q7
        vabd.u8         q4,  q10, q8    @ abs(p2 - p0)
        vand            q6,  q6,  q14
        vabd.u8         q5,  q2,  q0    @ abs(q2 - q0)
        vclt.u8         q4,  q4,  q11   @ < beta
        vand            q6,  q6,  q15   @ q6 = overall filter mask
        vclt.u8         q5,  q5,  q11   @ < beta
        vand            q4,  q4,  q6    @ p-side strong mask
        vand            q5,  q5,  q6    @ q-side strong mask
        vand            q12, q12, q6    @ tc0 under mask
        vrhadd.u8       q14, q8,  q0
        vsub.i8         q6,  q12, q4    @ tc = tc0 (+1 per extra side filtered)
        vqadd.u8        q7,  q9,  q12
        vhadd.u8        q10, q10, q14
        vsub.i8         q6,  q6,  q5
        vhadd.u8        q14, q2,  q14
        vmin.u8         q7,  q7,  q10   @ clip p1' to p1 +/- tc0
        vqsub.u8        q11, q9,  q12
        vqadd.u8        q2,  q1,  q12
        vmax.u8         q7,  q7,  q11
        vqsub.u8        q11, q1,  q12
        vmin.u8         q14, q2,  q14   @ clip q1' to q1 +/- tc0
        vmovl.u8        q2,  d0
        vmax.u8         q14, q14, q11
        vmovl.u8        q10, d1
        @ delta = clip( ((q0 - p0) * 4 + (p1 - q1) + 4) >> 3, -tc, tc )
        vsubw.u8        q2,  q2,  d16
        vsubw.u8        q10, q10, d17
        vshl.i16        q2,  q2,  #2
        vshl.i16        q10, q10, #2
        vaddw.u8        q2,  q2,  d18
        vaddw.u8        q10, q10, d19
        vsubw.u8        q2,  q2,  d2
        vsubw.u8        q10, q10, d3
        vrshrn.i16      d4,  q2,  #3
        vrshrn.i16      d5,  q10, #3
        vbsl            q4,  q7,  q9    @ q4 = p1' where strong mask, else p1
        vbsl            q5,  q14, q1    @ q5 = q1' where strong mask, else q1
        vneg.s8         q7,  q6
        vmovl.u8        q14, d16
        vmin.s8         q2,  q2,  q6
        vmovl.u8        q6,  d17
        vmax.s8         q2,  q2,  q7
        vmovl.u8        q11, d0
        vmovl.u8        q12, d1
        vaddw.s8        q14, q14, d4    @ p0 + delta
        vaddw.s8        q6,  q6,  d5
        vsubw.s8        q11, q11, d4    @ q0 - delta
        vsubw.s8        q12, q12, d5
        vqmovun.s16     d16, q14
        vqmovun.s16     d17, q6
        vqmovun.s16     d0,  q11
        vqmovun.s16     d1,  q12
        .endm

@ Vertical-edge luma filter: r0 = pixel at the edge, r1 = stride.
@ Loads q0/q1/q2 rows below the edge, rewinds, loads p2/p1/p0 above it.
function ff_h264_v_loop_filter_luma_neon, export=1
        h264_loop_filter_start

        vld1.64         {d0, d1},  [r0,:128], r1
        vld1.64         {d2, d3},  [r0,:128], r1
        vld1.64         {d4, d5},  [r0,:128], r1
        sub             r0,  r0,  r1, lsl #2
        sub             r0,  r0,  r1, lsl #1    @ back up 6 rows to p2
        vld1.64         {d20,d21}, [r0,:128], r1
        vld1.64         {d18,d19}, [r0,:128], r1
        vld1.64         {d16,d17}, [r0,:128], r1

        align_push_regs

        h264_loop_filter_luma

        @ Store the four modified rows: p1, p0, q0, q1.
        sub             r0,  r0,  r1, lsl #1
        vst1.64         {d8, d9},  [r0,:128], r1
        vst1.64         {d16,d17}, [r0,:128], r1
        vst1.64         {d0, d1},  [r0,:128], r1
        vst1.64         {d10,d11}, [r0,:128]

        align_pop_regs
        bx              lr
        .endfunc

@ Horizontal-edge luma filter: loads 16 8-pixel rows straddling the edge
@ and transposes so the shared macro can filter along registers.
function ff_h264_h_loop_filter_luma_neon, export=1
        h264_loop_filter_start

        sub             r0,  r0,  #4            @ start 4 pixels left of edge
        vld1.64         {d6},  [r0], r1
        vld1.64         {d20}, [r0], r1
        vld1.64         {d18}, [r0], r1
        vld1.64         {d16}, [r0], r1
        vld1.64         {d0},  [r0], r1
        vld1.64         {d2},  [r0], r1
        vld1.64         {d4},  [r0], r1
        vld1.64         {d26}, [r0], r1
        vld1.64         {d7},  [r0], r1
        vld1.64         {d21}, [r0], r1
        vld1.64         {d19}, [r0], r1
        vld1.64         {d17}, [r0], r1
        vld1.64         {d1},  [r0], r1
        vld1.64         {d3},  [r0], r1
        vld1.64         {d5},  [r0], r1
        vld1.64         {d27}, [r0], r1

        transpose_8x8   q3, q10, q9, q8, q0, q1, q2, q13

        align_push_regs
        @ The filter macro overwrites q2/q10; park their pre-filter values
        @ (outermost columns) on the stack.
        sub             sp,  sp,  #16
        vst1.64         {d4, d5},  [sp,:128]
        sub             sp,  sp,  #16
        vst1.64         {d20,d21}, [sp,:128]

        h264_loop_filter_luma

        vld1.64         {d20,d21}, [sp,:128]!
        vld1.64         {d4, d5},  [sp,:128]!
@ ff_h264_h_loop_filter_luma_neon (cont.): transpose back (q4/q5 now hold
@ the filtered p1/q1 columns) and write the 16 rows out again.
        transpose_8x8   q3, q10, q4, q8, q0, q5, q2, q13

        sub             r0,  r0,  r1, lsl #4    @ rewind 16 rows
        vst1.64         {d6},  [r0], r1
        vst1.64         {d20}, [r0], r1
        vst1.64         {d8},  [r0], r1
        vst1.64         {d16}, [r0], r1
        vst1.64         {d0},  [r0], r1
        vst1.64         {d10}, [r0], r1
        vst1.64         {d4},  [r0], r1
        vst1.64         {d26}, [r0], r1
        vst1.64         {d7},  [r0], r1
        vst1.64         {d21}, [r0], r1
        vst1.64         {d9},  [r0], r1
        vst1.64         {d17}, [r0], r1
        vst1.64         {d1},  [r0], r1
        vst1.64         {d11}, [r0], r1
        vst1.64         {d5},  [r0], r1
        vst1.64         {d27}, [r0], r1

        align_pop_regs
        bx              lr
        .endfunc

@ Chroma deblocking filter for one 8-pixel edge.
@ In:  d16 = p0, d18 = p1, d0 = q0, d2 = q1, r2 = alpha, r3 = beta,
@      d24[0] = tc0 bytes.  Out: d16 = new p0, d0 = new q0.
        .macro h264_loop_filter_chroma
        vdup.8          d22, r2         @ alpha
        vmovl.u8        q12, d24
        vabd.u8         d26, d16, d0    @ abs(p0 - q0)
        vmovl.u8        q2,  d0
        vabd.u8         d28, d18, d16   @ abs(p1 - p0)
        vsubw.u8        q2,  q2,  d16
        vsli.16         d24, d24, #8    @ replicate tc0 per pixel pair
        vshl.i16        q2,  q2,  #2
        vabd.u8         d30, d2,  d0    @ abs(q1 - q0)
        vaddw.u8        q2,  q2,  d18
        vclt.u8         d26, d26, d22   @ < alpha
        vsubw.u8        q2,  q2,  d2
        vdup.8          d22, r3         @ beta
        vclt.s8         d25, d24, #0    @ negative tc0 -> lane disabled
        vrshrn.i16      d4,  q2,  #3    @ delta before clipping
        vclt.u8         d28, d28, d22   @ < beta
        vbic            d26, d26, d25
        vclt.u8         d30, d30, d22   @ < beta
        vand            d26, d26, d28
        vneg.s8         d25, d24
        vand            d26, d26, d30   @ d26 = filter mask
        vmin.s8         d4,  d4,  d24   @ clip delta to +/- tc0
        vmovl.u8        q14, d16
        vand            d4,  d4,  d26
        vmax.s8         d4,  d4,  d25
        vmovl.u8        q11, d0
        vaddw.s8        q14, q14, d4    @ p0 + delta
        vsubw.s8        q11, q11, d4    @ q0 - delta
        vqmovun.s16     d16, q14
        vqmovun.s16     d0,  q11
        .endm

@ Vertical-edge chroma filter: two rows above and below the edge.
function ff_h264_v_loop_filter_chroma_neon, export=1
        h264_loop_filter_start

        sub             r0,  r0,  r1, lsl #1
        vld1.64         {d18}, [r0,:64], r1     @ p1
        vld1.64         {d16}, [r0,:64], r1     @ p0
        vld1.64         {d0},  [r0,:64], r1     @ q0
        vld1.64         {d2},  [r0,:64]         @ q1

        h264_loop_filter_chroma

        sub             r0,  r0,  r1, lsl #1
        vst1.64         {d16}, [r0,:64], r1
        vst1.64         {d0},  [r0,:64], r1

        bx              lr
        .endfunc

@ Horizontal-edge chroma filter: gather 4 pixels from 8 rows, transpose,
@ filter, transpose back, scatter.
function ff_h264_h_loop_filter_chroma_neon, export=1
        h264_loop_filter_start

        sub             r0,  r0,  #2            @ 2 pixels left of the edge
        vld1.32         {d18[0]}, [r0], r1
        vld1.32         {d16[0]}, [r0], r1
        vld1.32         {d0[0]},  [r0], r1
        vld1.32         {d2[0]},  [r0], r1
        vld1.32         {d18[1]}, [r0], r1
        vld1.32         {d16[1]}, [r0], r1
        vld1.32         {d0[1]},  [r0], r1
        vld1.32         {d2[1]},  [r0], r1

        vtrn.16         d18, d0
        vtrn.16         d16, d2
        vtrn.8          d18, d16
        vtrn.8          d0,  d2

        h264_loop_filter_chroma

        vtrn.16         d18, d0
        vtrn.16         d16, d2
        vtrn.8          d18, d16
        vtrn.8          d0,  d2

        sub             r0,  r0,  r1, lsl #3    @ rewind the 8 rows
        vst1.32         {d18[0]}, [r0], r1
        vst1.32         {d16[0]}, [r0], r1
        vst1.32         {d0[0]},  [r0], r1
        vst1.32         {d2[0]},  [r0], r1
        vst1.32         {d18[1]}, [r0], r1
        vst1.32         {d16[1]}, [r0], r1
        vst1.32         {d0[1]},  [r0], r1
        vst1.32         {d2[1]},  [r0], r1

        bx              lr
        .endfunc

        /* H.264 qpel MC */

@ Load the 6-tap filter constants into d6: d6[0] = 5, d6[1] = 20
@ (movw writes the low halfword, movt the high one).
        .macro lowpass_const r
        movw            \r,  #5
        movt            \r,  #20
        vmov.32         d6[0], \r
        .endm

@ 6-tap (1,-5,20,20,-5,1) horizontal lowpass over two 8-pixel rows
@ (\r0:\r1 and \r2:\r3).  With narrow=1 the rounded result is saturated
@ to bytes in \d0/\d1; with narrow=0 the 16-bit sums stay in \d0/\d1
@ (q registers) for a second filter pass.
        .macro lowpass_8 r0, r1, r2, r3, d0, d1, narrow=1
.if \narrow
        t0 .req q0
        t1 .req q8
.else
        t0 .req \d0
        t1 .req \d1
.endif
        vext.8          d2,  \r0, \r1, #2
        vext.8          d3,  \r0, \r1, #3
        vaddl.u8        q1,  d2,  d3            @ src[2] + src[3]
        vext.8          d4,  \r0, \r1, #1
        vext.8          d5,  \r0, \r1, #4
        vaddl.u8        q2,  d4,  d5            @ src[1] + src[4]
        vext.8          d30, \r0, \r1, #5
        vaddl.u8        t0,  \r0, d30           @ src[0] + src[5]
        vext.8          d18, \r2, \r3, #2
        vmla.i16        t0,  q1,  d6[1]         @ + 20 * centre pair
        vext.8          d19, \r2, \r3, #3
        vaddl.u8        q9,  d18, d19
        vext.8          d20, \r2, \r3, #1
        vmls.i16        t0,  q2,  d6[0]         @ - 5 * inner pair
        vext.8          d21, \r2, \r3, #4
        vaddl.u8        q10, d20, d21
        vext.8          d31, \r2, \r3, #5
        vaddl.u8        t1,  \r2, d31
        vmla.i16        t1,  q9,  d6[1]
        vmls.i16        t1,  q10, d6[0]
.if \narrow
        vqrshrun.s16    \d0, t0,  #5            @ (sum + 16) >> 5, clamped
        vqrshrun.s16    \d1, t1,  #5
.endif
        .unreq t0
        .unreq t1
        .endm

@ Single-row variant of lowpass_8 (used for the 13th row of the HV pass).
        .macro lowpass_8_1 r0, r1, d0, narrow=1
.if \narrow
        t0 .req q0
.else
        t0 .req \d0
.endif
        vext.8          d2,  \r0, \r1, #2
        vext.8          d3,  \r0, \r1, #3
        vaddl.u8        q1,  d2,  d3
        vext.8          d4,  \r0, \r1, #1
        vext.8          d5,  \r0, \r1, #4
        vaddl.u8        q2,  d4,  d5
        vext.8          d30, \r0, \r1, #5
        vaddl.u8        t0,  \r0, d30
        vmla.i16        t0,  q1,  d6[1]
        vmls.i16        t0,  q2,  d6[0]
@ lowpass_8_1 (cont.): optional narrowing of the 16-bit sums to bytes.
.if \narrow
        vqrshrun.s16    \d0, t0,  #5
.endif
        .unreq t0
        .endm

@ Second-pass 6-tap filter over 16-bit intermediates (one output row \d).
@ Multiplies by 20 via (x<<4)+(x<<2) and by 5 via (x<<2)+x, then rounds
@ by 10 bits total (5 per pass) and saturates to bytes.
        .macro lowpass_8.16 r0, r1, l0, h0, l1, h1, d
        vext.16         q1,  \r0, \r1, #2
        vext.16         q0,  \r0, \r1, #3
        vaddl.s16       q9,  d2,  d0
        vext.16         q2,  \r0, \r1, #1
        vaddl.s16       q1,  d3,  d1
        vext.16         q3,  \r0, \r1, #4
        vaddl.s16       q10, d4,  d6
        vext.16         \r1, \r0, \r1, #5
        vaddl.s16       q2,  d5,  d7
        vaddl.s16       q0,  \h0, \h1
        vaddl.s16       q8,  \l0, \l1

        vshl.i32        q3,  q9,  #4            @ centre * 16
        vshl.i32        q9,  q9,  #2            @ centre * 4
        vshl.i32        q15, q10, #2
        vadd.i32        q9,  q9,  q3            @ centre * 20
        vadd.i32        q10, q10, q15           @ inner * 5

        vshl.i32        q3,  q1,  #4
        vshl.i32        q1,  q1,  #2
        vshl.i32        q15, q2,  #2
        vadd.i32        q1,  q1,  q3
        vadd.i32        q2,  q2,  q15

        vadd.i32        q9,  q9,  q8            @ + outer taps
        vsub.i32        q9,  q9,  q10           @ - inner taps

        vadd.i32        q1,  q1,  q0
        vsub.i32        q1,  q1,  q2

        vrshrn.s32      d18, q9,  #10           @ round both filter passes
        vrshrn.s32      d19, q1,  #10

        vqmovun.s16     \d,  q9
        .endm

@ 16-wide horizontal lowpass emitting two 8-wide halves back to back
@ (packed layout for the HV path).  r4 is used as scratch link register.
function put_h264_qpel16_h_lowpass_neon_packed
        mov             r4,  lr
        mov             ip,  #16
        mov             r3,  #8
        bl              put_h264_qpel8_h_lowpass_neon
        sub             r1,  r1,  r2, lsl #4    @ rewind 16 src rows
        add             r1,  r1,  #8            @ right half
        mov             ip,  #16
        mov             lr,  r4
        b               put_h264_qpel8_h_lowpass_neon
        .endfunc

@ 16x16 horizontal lowpass = two 8-wide column passes.
function put_h264_qpel16_h_lowpass_neon
        push            {lr}
        mov             ip,  #16
        bl              put_h264_qpel8_h_lowpass_neon
        sub             r0,  r0,  r3, lsl #4
        sub             r1,  r1,  r2, lsl #4
        add             r0,  r0,  #8
        add             r1,  r1,  #8
        mov             ip,  #16
        pop             {lr}
        @ falls through into put_h264_qpel8_h_lowpass_neon below
        .endfunc

@ 8-wide horizontal lowpass.  r0 = dst, r1 = src, r2 = src stride,
@ r3 = dst stride, ip = row count (two rows per iteration).
function put_h264_qpel8_h_lowpass_neon
1:      vld1.64         {d0, d1},  [r1], r2
        vld1.64         {d16,d17}, [r1], r2
        subs            ip,  ip,  #2
        lowpass_8       d0,  d1,  d16, d17, d0, d16
        vst1.64         {d0},  [r0,:64], r3
        vst1.64         {d16}, [r0,:64], r3
        bne             1b
        bx              lr
        .endfunc

@ 16x16 horizontal lowpass averaged with a second source (r3).
function put_h264_qpel16_h_lowpass_l2_neon
        push            {lr}
        mov             ip,  #16
        bl              put_h264_qpel8_h_lowpass_l2_neon
        sub             r0,  r0,  r2, lsl #4
        sub             r1,  r1,  r2, lsl #4
        sub             r3,  r3,  r2, lsl #4
        add             r0,  r0,  #8
        add             r1,  r1,  #8
        add             r3,  r3,  #8
        mov             ip,  #16
        pop             {lr}
        @ falls through into put_h264_qpel8_h_lowpass_l2_neon below
        .endfunc

@ 8-wide horizontal lowpass, rounding-averaged with rows from r3
@ (the "l2" = average with the second prediction).
function put_h264_qpel8_h_lowpass_l2_neon
1:      vld1.64         {d0, d1},  [r1], r2
        vld1.64         {d16,d17}, [r1], r2
        vld1.64         {d28}, [r3], r2
        vld1.64         {d29}, [r3], r2
        subs            ip,  ip,  #2
        lowpass_8       d0,  d1,  d16, d17, d0, d1
        vrhadd.u8       q0,  q0,  q14
        vst1.64         {d0},  [r0,:64], r2
        vst1.64         {d1},  [r0,:64], r2
        bne             1b
        bx              lr
        .endfunc

@ 16-tall vertical lowpass in packed 8x8 quadrants (four 8x8 calls).
function put_h264_qpel16_v_lowpass_neon_packed
        mov             r4,  lr
        mov             r2,  #8
        bl              put_h264_qpel8_v_lowpass_neon
        sub             r1,  r1,  r3, lsl #2
        bl              put_h264_qpel8_v_lowpass_neon
        sub             r1,  r1,  r3, lsl #4
        sub             r1,  r1,  r3, lsl #2
        add             r1,  r1,  #8
        bl              put_h264_qpel8_v_lowpass_neon
        sub             r1,  r1,  r3, lsl #2
        mov             lr,  r4
        b               put_h264_qpel8_v_lowpass_neon
        .endfunc

@ 16x16 vertical lowpass = four 8x8 quadrant passes.
function put_h264_qpel16_v_lowpass_neon
        mov             r4,  lr
        bl              put_h264_qpel8_v_lowpass_neon
        sub             r1,  r1,  r3, lsl #2
        bl              put_h264_qpel8_v_lowpass_neon
        sub             r0,  r0,  r2, lsl #4
        add             r0,  r0,  #8
        sub             r1,  r1,  r3, lsl #4
        sub             r1,  r1,  r3, lsl #2
        add             r1,  r1,  #8
        bl              put_h264_qpel8_v_lowpass_neon
        sub             r1,  r1,  r3, lsl #2
        mov             lr,  r4
        @ falls through into put_h264_qpel8_v_lowpass_neon below
        .endfunc

@ 8-wide vertical lowpass: load 13 rows, transpose so columns become
@ rows, run the horizontal 6-tap filter, transpose back and store.
@ r0 = dst, r1 = src, r2 = dst stride, r3 = src stride.
function put_h264_qpel8_v_lowpass_neon
        vld1.64         {d8},  [r1], r3
        vld1.64         {d10}, [r1], r3
        vld1.64         {d12}, [r1], r3
        vld1.64         {d14}, [r1], r3
        vld1.64         {d22}, [r1], r3
        vld1.64         {d24}, [r1], r3
        vld1.64         {d26}, [r1], r3
        vld1.64         {d28}, [r1], r3
        vld1.64         {d9},  [r1], r3
        vld1.64         {d11}, [r1], r3
        vld1.64         {d13}, [r1], r3
        vld1.64         {d15}, [r1], r3
        vld1.64         {d23}, [r1]

        transpose_8x8   q4, q5, q6, q7, q11, q12, q13, q14
        lowpass_8       d8,  d9,  d10, d11, d8,  d10
        lowpass_8       d12, d13, d14, d15, d12, d14
        lowpass_8       d22, d23, d24, d25, d22, d24
        lowpass_8       d26, d27, d28, d29, d26, d28
        transpose_8x8   d8, d10, d12, d14, d22, d24, d26, d28

        vst1.64         {d8},  [r0,:64], r2
        vst1.64         {d10}, [r0,:64], r2
        vst1.64         {d12}, [r0,:64], r2
        vst1.64         {d14}, [r0,:64], r2
        vst1.64         {d22}, [r0,:64], r2
        vst1.64         {d24}, [r0,:64], r2
        vst1.64         {d26}, [r0,:64], r2
        vst1.64         {d28}, [r0,:64], r2

        bx              lr
        .endfunc

@ 16x16 vertical lowpass averaged with a second source (ip).
function put_h264_qpel16_v_lowpass_l2_neon
        mov             r4,  lr
        bl              put_h264_qpel8_v_lowpass_l2_neon
        sub             r1,  r1,  r3, lsl #2
        bl              put_h264_qpel8_v_lowpass_l2_neon
        sub             r0,  r0,  r3, lsl #4
        sub             ip,  ip,  r2, lsl #4
        add             r0,  r0,  #8
        add             ip,  ip,  #8
        sub             r1,  r1,  r3, lsl #4
        sub             r1,  r1,  r3, lsl #2
        add             r1,  r1,  #8
        bl              put_h264_qpel8_v_lowpass_l2_neon
        sub             r1,  r1,  r3, lsl #2
        mov             lr,  r4
        @ falls through into put_h264_qpel8_v_lowpass_l2_neon below
        .endfunc

@ 8-wide vertical lowpass, rounding-averaged with rows from ip.
function put_h264_qpel8_v_lowpass_l2_neon
        vld1.64         {d8},  [r1], r3
        vld1.64         {d10}, [r1], r3
        vld1.64         {d12}, [r1], r3
        vld1.64         {d14}, [r1], r3
        vld1.64         {d22}, [r1], r3
        vld1.64         {d24}, [r1], r3
        vld1.64         {d26}, [r1], r3
        vld1.64         {d28}, [r1], r3
        vld1.64         {d9},  [r1], r3
        vld1.64         {d11}, [r1], r3
        vld1.64         {d13}, [r1], r3
        vld1.64         {d15}, [r1], r3
        vld1.64         {d23}, [r1]

        transpose_8x8   q4, q5, q6, q7, q11, q12, q13, q14
        lowpass_8       d8,  d9,  d10, d11, d8,  d9
        lowpass_8       d12, d13, d14, d15, d12, d13
        lowpass_8       d22, d23, d24, d25, d22, d23
        lowpass_8       d26, d27, d28, d29, d26, d27
        transpose_8x8   d8, d9, d12, d13, d22, d23, d26, d27

        vld1.64         {d0},  [ip], r2
        vld1.64         {d1},  [ip], r2
        vld1.64         {d2},  [ip], r2
        vld1.64         {d3},  [ip], r2
        vld1.64         {d4},  [ip], r2
        vrhadd.u8       q0,  q0,  q4
        vld1.64         {d5},  [ip], r2
        vrhadd.u8       q1,  q1,  q6
        vld1.64         {d10}, [ip], r2
        vrhadd.u8       q2,  q2,  q11
        vld1.64         {d11}, [ip], r2

        vst1.64         {d0},  [r0,:64], r3
        vst1.64         {d1},  [r0,:64], r3
        vrhadd.u8       q5,  q5,  q13
        vst1.64         {d2},  [r0,:64], r3
        vst1.64         {d3},  [r0,:64], r3
        vst1.64         {d4},  [r0,:64], r3
        vst1.64         {d5},  [r0,:64], r3
        vst1.64         {d10}, [r0,:64], r3
        vst1.64         {d11}, [r0,:64], r3

        bx              lr
        .endfunc

@ Shared core of the 8x8 H+V ("hv") lowpass: horizontal-filter 13 rows
@ into a 16-bit scratch buffer at r4 (narrow=0 keeps full precision),
@ then transpose and run the 16-bit vertical filter.  Result rows end up
@ in d8-d15 for the callers to store/average.
function put_h264_qpel8_hv_lowpass_neon_top
        lowpass_const   ip
        mov             ip,  #12
1:      vld1.64         {d0, d1},  [r1], r3
        vld1.64         {d16,d17}, [r1], r3
        subs            ip,  ip,  #2
        lowpass_8       d0,  d1,  d16, d17, q11, q12, narrow=0
        vst1.64         {d22-d25}, [r4,:128]!
        bne             1b

        vld1.64         {d0, d1},  [r1]
        lowpass_8_1     d0,  d1,  q12, narrow=0

        mov             ip,  #-16               @ walk the scratch backwards
        add             r4,  r4,  ip
        vld1.64         {d30,d31}, [r4,:128], ip
        vld1.64         {d20,d21}, [r4,:128], ip
        vld1.64         {d18,d19}, [r4,:128], ip
        vld1.64         {d16,d17}, [r4,:128], ip
        vld1.64         {d14,d15}, [r4,:128], ip
        vld1.64         {d12,d13}, [r4,:128], ip
        vld1.64         {d10,d11}, [r4,:128], ip
        vld1.64         {d8, d9},  [r4,:128], ip
        vld1.64         {d6, d7},  [r4,:128], ip
        vld1.64         {d4, d5},  [r4,:128], ip
        vld1.64         {d2, d3},  [r4,:128], ip
        vld1.64         {d0, d1},  [r4,:128]

        @ 16-bit transpose of the 12 intermediate rows, in two halves.
        swap4           d1,  d3,  d5,  d7,  d8,  d10, d12, d14
        transpose16_4x4 q0,  q1,  q2,  q3,  q4,  q5,  q6,  q7

        swap4           d17, d19, d21, d31, d24, d26, d28, d22
        transpose16_4x4 q8,  q9,  q10, q15, q12, q13, q14, q11

        @ Spill the second half back so registers free up for filtering.
        vst1.64         {d30,d31}, [r4,:128]!
        vst1.64         {d6, d7},  [r4,:128]!
        vst1.64         {d20,d21}, [r4,:128]!
        vst1.64         {d4, d5},  [r4,:128]!
        vst1.64         {d18,d19}, [r4,:128]!
        vst1.64         {d2, d3},  [r4,:128]!
        vst1.64         {d16,d17}, [r4,:128]!
@ put_h264_qpel8_hv_lowpass_neon_top (cont.): finish the spill, then run
@ the 16-bit vertical filter; output rows accumulate in d8-d15.
        vst1.64         {d0, d1},  [r4,:128]

        lowpass_8.16    q4,  q12, d8,  d9,  d24, d25, d8
        lowpass_8.16    q5,  q13, d10, d11, d26, d27, d9
        lowpass_8.16    q6,  q14, d12, d13, d28, d29, d10
        lowpass_8.16    q7,  q11, d14, d15, d22, d23, d11

        vld1.64         {d16,d17}, [r4,:128], ip
        vld1.64         {d30,d31}, [r4,:128], ip
        lowpass_8.16    q8,  q15, d16, d17, d30, d31, d12
        vld1.64         {d16,d17}, [r4,:128], ip
        vld1.64         {d30,d31}, [r4,:128], ip
        lowpass_8.16    q8,  q15, d16, d17, d30, d31, d13
        vld1.64         {d16,d17}, [r4,:128], ip
        vld1.64         {d30,d31}, [r4,:128], ip
        lowpass_8.16    q8,  q15, d16, d17, d30, d31, d14
        vld1.64         {d16,d17}, [r4,:128], ip
        vld1.64         {d30,d31}, [r4,:128]
        lowpass_8.16    q8,  q15, d16, d17, d30, d31, d15

        transpose_8x8   d12, d13, d14, d15, d8, d9, d10, d11

        bx              lr
        .endfunc

@ 8x8 HV lowpass: run the core then store the eight result rows.
function put_h264_qpel8_hv_lowpass_neon
        mov             r10, lr
        bl              put_h264_qpel8_hv_lowpass_neon_top
        vst1.64         {d12}, [r0,:64], r2
        vst1.64         {d13}, [r0,:64], r2
        vst1.64         {d14}, [r0,:64], r2
        vst1.64         {d15}, [r0,:64], r2
        vst1.64         {d8},  [r0,:64], r2
        vst1.64         {d9},  [r0,:64], r2
        vst1.64         {d10}, [r0,:64], r2
        vst1.64         {d11}, [r0,:64], r2

        mov             lr,  r10
        bx              lr
        .endfunc

@ 8x8 HV lowpass averaged with a packed second prediction at r2.
function put_h264_qpel8_hv_lowpass_l2_neon
        mov             r10, lr
        bl              put_h264_qpel8_hv_lowpass_neon_top

        vld1.64         {d0, d1},  [r2,:128]!
        vld1.64         {d2, d3},  [r2,:128]!
        vrhadd.u8       q0,  q0,  q6
        vld1.64         {d4, d5},  [r2,:128]!
        vrhadd.u8       q1,  q1,  q7
        vld1.64         {d6, d7},  [r2,:128]!
        vrhadd.u8       q2,  q2,  q4

        vst1.64         {d0},  [r0,:64], r3
        vrhadd.u8       q3,  q3,  q5
        vst1.64         {d1},  [r0,:64], r3
        vst1.64         {d2},  [r0,:64], r3
        vst1.64         {d3},  [r0,:64], r3
        vst1.64         {d4},  [r0,:64], r3
        vst1.64         {d5},  [r0,:64], r3
        vst1.64         {d6},  [r0,:64], r3
        vst1.64         {d7},  [r0,:64], r3

        mov             lr,  r10
        bx              lr
        .endfunc

@ 16x16 HV lowpass = four 8x8 quadrant passes.
function put_h264_qpel16_hv_lowpass_neon
        mov             r9,  lr
        bl              put_h264_qpel8_hv_lowpass_neon
        sub             r1,  r1,  r3, lsl #2
        bl              put_h264_qpel8_hv_lowpass_neon
        sub             r1,  r1,  r3, lsl #4
        sub             r1,  r1,  r3, lsl #2
        add             r1,  r1,  #8
        sub             r0,  r0,  r2, lsl #4
        add             r0,  r0,  #8
        bl              put_h264_qpel8_hv_lowpass_neon
        sub             r1,  r1,  r3, lsl #2
        mov             lr,  r9
        b               put_h264_qpel8_hv_lowpass_neon
        .endfunc

@ 16x16 HV lowpass averaged with a packed second prediction
@ (r2 = r4 - 256, the packed buffer preceding the scratch area).
function put_h264_qpel16_hv_lowpass_l2_neon
        mov             r9,  lr
        sub             r2,  r4,  #256
        bl              put_h264_qpel8_hv_lowpass_l2_neon
        sub             r1,  r1,  r3, lsl #2
        bl              put_h264_qpel8_hv_lowpass_l2_neon
        sub             r1,  r1,  r3, lsl #4
        sub             r1,  r1,  r3, lsl #2
        add             r1,  r1,  #8
        sub             r0,  r0,  r3, lsl #4
        add             r0,  r0,  #8
        bl              put_h264_qpel8_hv_lowpass_l2_neon
        sub             r1,  r1,  r3, lsl #2
        mov             lr,  r9
        b               put_h264_qpel8_hv_lowpass_l2_neon
        .endfunc

@ ---- Exported qpel(8) MC entry points: ff_put_h264_qpel8_mcXY_neon ----
@ X = horizontal quarter-pel phase, Y = vertical phase; each sets up
@ source offsets / scratch and tail-calls the lowpass helpers.
@ r0 = dst, r1 = src, r2 = stride.

@ mc10: average of full-pel and horizontal half-pel (src as second ref).
function ff_put_h264_qpel8_mc10_neon, export=1
        lowpass_const   r3
        mov             r3,  r1
        sub             r1,  r1,  #2
        mov             ip,  #8
        b               put_h264_qpel8_h_lowpass_l2_neon
        .endfunc

@ mc20: plain horizontal half-pel.
function ff_put_h264_qpel8_mc20_neon, export=1
        lowpass_const   r3
        sub             r1,  r1,  #2
        mov             r3,  r2
        mov             ip,  #8
        b               put_h264_qpel8_h_lowpass_neon
        .endfunc

@ mc30: like mc10 but averaged with src+1.
function ff_put_h264_qpel8_mc30_neon, export=1
        lowpass_const   r3
        add             r3,  r1,  #1
        sub             r1,  r1,  #2
        mov             ip,  #8
        b               put_h264_qpel8_h_lowpass_l2_neon
        .endfunc

@ mc01: vertical quarter-pel, averaged with the unfiltered source (ip).
function ff_put_h264_qpel8_mc01_neon, export=1
        push            {lr}
        mov             ip,  r1
put_h264_qpel8_mc01:
        lowpass_const   r3
        mov             r3,  r2
        sub             r1,  r1,  r2, lsl #1    @ back up 2 rows for the 6 taps
        vpush           {d8-d15}
        bl              put_h264_qpel8_v_lowpass_l2_neon
        vpop            {d8-d15}
        pop             {pc}
        .endfunc

@ mc11: H half-pel into a 64-byte stack buffer, then V half-pel
@ averaged with it.
function ff_put_h264_qpel8_mc11_neon, export=1
        push            {r0, r1, r2, lr}
put_h264_qpel8_mc11:
        lowpass_const   r3
        sub             sp,  sp,  #64
        mov             r0,  sp
        sub             r1,  r1,  #2
        mov             r3,  #8
        mov             ip,  #8
        vpush           {d8-d15}
        bl              put_h264_qpel8_h_lowpass_neon
        ldrd            r0,  [sp, #128]         @ reload saved dst/src
        mov             r3,  r2
        add             ip,  sp,  #64           @ H result (above the vpush area)
        sub             r1,  r1,  r2, lsl #1
        mov             r2,  #8
        bl              put_h264_qpel8_v_lowpass_l2_neon
        vpop            {d8-d15}
        add             sp,  sp,  #76           @ 64 buffer + 12 of the pushed regs
        pop             {pc}
        .endfunc

@ mc21: H half-pel into scratch, then HV lowpass averaged with it.
function ff_put_h264_qpel8_mc21_neon, export=1
        push            {r0, r1, r4, r10, r11, lr}
put_h264_qpel8_mc21:
        lowpass_const   r3
        mov             r11, sp                 @ r11 = frame pointer
        bic             sp,  sp,  #15           @ 16-byte align scratch
        sub             sp,  sp,  #(8*8+16*12)  @ 8x8 bytes + 12 rows of 16-bit
        sub             r1,  r1,  #2
        mov             r3,  #8
        mov             r0,  sp
        mov             ip,  #8
        vpush           {d8-d15}
        bl              put_h264_qpel8_h_lowpass_neon
        mov             r4,  r0                 @ r4 = scratch for hv pass
        ldrd            r0,  [r11]              @ original dst/src
        sub             r1,  r1,  r2, lsl #1
        sub             r1,  r1,  #2
        mov             r3,  r2
        sub             r2,  r4,  #64           @ second ref = H result
        bl              put_h264_qpel8_hv_lowpass_l2_neon
        vpop            {d8-d15}
        add             sp,  r11, #8
        pop             {r4, r10, r11, pc}
        .endfunc

@ mc31: mc11 with the horizontal reference shifted one pixel right.
function ff_put_h264_qpel8_mc31_neon, export=1
        add             r1,  r1,  #1
        push            {r0, r1, r2, lr}        @ push the +1 src for the V pass
        sub             r1,  r1,  #1
        b               put_h264_qpel8_mc11
        .endfunc

@ mc02: plain vertical half-pel.
function ff_put_h264_qpel8_mc02_neon, export=1
        push            {lr}
        lowpass_const   r3
        sub             r1,  r1,  r2, lsl #1
        mov             r3,  r2
        vpush           {d8-d15}
        bl              put_h264_qpel8_v_lowpass_neon
        vpop            {d8-d15}
        pop             {pc}
        .endfunc

@ mc12: V half-pel into scratch, then HV lowpass averaged with it.
function ff_put_h264_qpel8_mc12_neon, export=1
        push            {r0, r1, r4, r10, r11, lr}
put_h264_qpel8_mc12:
        lowpass_const   r3
        mov             r11, sp
        bic             sp,  sp,  #15
        sub             sp,  sp,  #(8*8+16*12)
        sub             r1,  r1,  r2, lsl #1
        mov             r3,  r2
        mov             r2,  #8
        mov             r0,  sp
        vpush           {d8-d15}
        bl              put_h264_qpel8_v_lowpass_neon
        mov             r4,  r0
        ldrd            r0,  [r11]
        sub             r1,  r1,  r3, lsl #1
        sub             r1,  r1,  #2
        sub             r2,  r4,  #64
        bl              put_h264_qpel8_hv_lowpass_l2_neon
        vpop            {d8-d15}
        add             sp,  r11, #8
        pop             {r4, r10, r11, pc}
        .endfunc

@ mc22: pure HV half-pel (no averaging), scratch for intermediates only.
function ff_put_h264_qpel8_mc22_neon, export=1
        push            {r4, r10, r11, lr}
        mov             r11, sp
        bic             sp,  sp,  #15
        sub             r1,  r1,  r2, lsl #1
        sub             r1,  r1,  #2
        mov             r3,  r2
        sub             sp,  sp,  #(16*12)
        mov             r4,  sp
        vpush           {d8-d15}
        bl              put_h264_qpel8_hv_lowpass_neon
        vpop            {d8-d15}
        mov             sp,  r11
        pop             {r4, r10, r11, pc}
        .endfunc

@ mc32: mc12 shifted one pixel right.
function ff_put_h264_qpel8_mc32_neon, export=1
        push            {r0, r1, r4, r10, r11, lr}
        add             r1,  r1,  #1
        b               put_h264_qpel8_mc12
        .endfunc

@ mc03: mc01 with the unfiltered reference one row down.
function ff_put_h264_qpel8_mc03_neon, export=1
        push            {lr}
        add             ip,  r1,  r2
        b               put_h264_qpel8_mc01
        .endfunc

@ mc13: mc11 one source row down.
function ff_put_h264_qpel8_mc13_neon, export=1
        push            {r0, r1, r2, lr}
        add             r1,  r1,  r2
        b               put_h264_qpel8_mc11
        .endfunc

@ mc23: mc21 one source row down.
function ff_put_h264_qpel8_mc23_neon, export=1
        push            {r0, r1, r4, r10, r11, lr}
        add             r1,  r1,  r2
        b               put_h264_qpel8_mc21
        .endfunc

@ mc33: mc11 shifted one right and one down.
function ff_put_h264_qpel8_mc33_neon, export=1
        add             r1,  r1,  #1
        push            {r0, r1, r2, lr}
        add             r1,  r1,  r2
        sub             r1,  r1,  #1
        b               put_h264_qpel8_mc11
        .endfunc

@ ---- Exported qpel(16) MC entry points, same scheme at 16x16 ----

function ff_put_h264_qpel16_mc10_neon, export=1
        lowpass_const   r3
        mov             r3,  r1
        sub             r1,  r1,  #2
        b               put_h264_qpel16_h_lowpass_l2_neon
        .endfunc

function ff_put_h264_qpel16_mc20_neon, export=1
        lowpass_const   r3
        sub             r1,  r1,  #2
        mov             r3,  r2
        b               put_h264_qpel16_h_lowpass_neon
        .endfunc

function ff_put_h264_qpel16_mc30_neon, export=1
        lowpass_const   r3
        add             r3,  r1,  #1
        sub             r1,  r1,  #2
        b               put_h264_qpel16_h_lowpass_l2_neon
        .endfunc

function ff_put_h264_qpel16_mc01_neon, export=1
        push            {r4, lr}
        mov             ip,  r1
put_h264_qpel16_mc01:
        lowpass_const   r3
        mov             r3,  r2
        sub             r1,  r1,  r2, lsl #1
        vpush           {d8-d15}
        bl              put_h264_qpel16_v_lowpass_l2_neon
        vpop            {d8-d15}
        pop             {r4, pc}
        .endfunc

function ff_put_h264_qpel16_mc11_neon, export=1
        push            {r0, r1, r4, lr}
put_h264_qpel16_mc11:
        lowpass_const   r3
        sub             sp,  sp,  #256          @ 16x16 intermediate buffer
        mov             r0,  sp
        sub             r1,  r1,  #2
        mov             r3,  #16
        vpush           {d8-d15}
        bl              put_h264_qpel16_h_lowpass_neon
        add             r0,  sp,  #256
        ldrd            r0,  [r0, #64]          @ saved dst/src above vpush area
        mov             r3,  r2
        add             ip,  sp,  #64
        sub             r1,  r1,  r2, lsl #1
        mov             r2,  #16
        bl              put_h264_qpel16_v_lowpass_l2_neon
        vpop            {d8-d15}
        add             sp,  sp,  #(256+8)
        pop             {r4, pc}
        .endfunc

function ff_put_h264_qpel16_mc21_neon, export=1
        push            {r0, r1, r4-r5, r9-r11, lr}
put_h264_qpel16_mc21:
        lowpass_const   r3
        mov             r11, sp
        bic             sp,  sp,  #15
        sub             sp,  sp,  #(16*16+16*12) @ packed H result + hv scratch
        sub             r1,  r1,  #2
        mov             r0,  sp
        vpush           {d8-d15}
        bl              put_h264_qpel16_h_lowpass_neon_packed
        mov             r4,  r0
        ldrd            r0,  [r11]
        sub             r1,  r1,  r2, lsl #1
        sub             r1,  r1,  #2
        mov             r3,  r2
        bl              put_h264_qpel16_hv_lowpass_l2_neon
        vpop            {d8-d15}
        add             sp,  r11, #8
        pop             {r4-r5, r9-r11, pc}
        .endfunc

function ff_put_h264_qpel16_mc31_neon, export=1
        add             r1,  r1,  #1
        push            {r0, r1, r4, lr}
        sub             r1,  r1,  #1
        b               put_h264_qpel16_mc11
        .endfunc

function ff_put_h264_qpel16_mc02_neon, export=1
        push            {r4, lr}
        lowpass_const   r3
        sub             r1,  r1,  r2, lsl #1
        mov             r3,  r2
        vpush           {d8-d15}
        bl              put_h264_qpel16_v_lowpass_neon
        vpop            {d8-d15}
        pop             {r4, pc}
        .endfunc

@ mc12 (NOTE(review): body continues past the end of this chunk).
function ff_put_h264_qpel16_mc12_neon, export=1
        push            {r0, r1, r4-r5, r9-r11, lr}
put_h264_qpel16_mc12:
        lowpass_const   r3
        mov             r11, sp
        bic             sp,  sp,  #15
        sub             sp,  sp,  #(16*16+16*12)
        sub             r1,  r1,  r2, lsl #1
        mov             r0,  sp
        mov             r3,  r2
        vpush           {d8-d15}
        bl              put_h264_qpel16_v_lowpass_neon_packed
        mov             r4,  r0
        ldrd            r0,  [r11]
        sub             r1,  r1,  r3, lsl #1
        sub             r1,  r1,  #2
        mov             r2,  r3
1317 bl put_h264_qpel16_hv_lowpass_l2_neon 1318 vpop {d8-d15} 1319 add sp, r11, #8 1320 pop {r4-r5, r9-r11, pc} 1321 .endfunc 1322 1323function ff_put_h264_qpel16_mc22_neon, export=1 1324 push {r4, r9-r11, lr} 1325 lowpass_const r3 1326 mov r11, sp 1327 bic sp, sp, #15 1328 sub r1, r1, r2, lsl #1 1329 sub r1, r1, #2 1330 mov r3, r2 1331 sub sp, sp, #(16*12) 1332 mov r4, sp 1333 vpush {d8-d15} 1334 bl put_h264_qpel16_hv_lowpass_neon 1335 vpop {d8-d15} 1336 mov sp, r11 1337 pop {r4, r9-r11, pc} 1338 .endfunc 1339 1340function ff_put_h264_qpel16_mc32_neon, export=1 1341 push {r0, r1, r4-r5, r9-r11, lr} 1342 add r1, r1, #1 1343 b put_h264_qpel16_mc12 1344 .endfunc 1345 1346function ff_put_h264_qpel16_mc03_neon, export=1 1347 push {r4, lr} 1348 add ip, r1, r2 1349 b put_h264_qpel16_mc01 1350 .endfunc 1351 1352function ff_put_h264_qpel16_mc13_neon, export=1 1353 push {r0, r1, r4, lr} 1354 add r1, r1, r2 1355 b put_h264_qpel16_mc11 1356 .endfunc 1357 1358function ff_put_h264_qpel16_mc23_neon, export=1 1359 push {r0, r1, r4-r5, r9-r11, lr} 1360 add r1, r1, r2 1361 b put_h264_qpel16_mc21 1362 .endfunc 1363 1364function ff_put_h264_qpel16_mc33_neon, export=1 1365 add r1, r1, #1 1366 push {r0, r1, r4, lr} 1367 add r1, r1, r2 1368 sub r1, r1, #1 1369 b put_h264_qpel16_mc11 1370 .endfunc 1371 1372@ Biweighted prediction 1373 1374 .macro biweight_16 macs, macd 1375 vdup.8 d0, r4 1376 vdup.8 d1, r5 1377 vmov q2, q8 1378 vmov q3, q8 13791: subs ip, ip, #2 1380 vld1.8 {d20-d21},[r0,:128], r2 1381 \macd q2, d0, d20 1382 pld [r0] 1383 \macd q3, d0, d21 1384 vld1.8 {d22-d23},[r1,:128], r2 1385 \macs q2, d1, d22 1386 pld [r1] 1387 \macs q3, d1, d23 1388 vmov q12, q8 1389 vld1.8 {d28-d29},[r0,:128], r2 1390 vmov q13, q8 1391 \macd q12, d0, d28 1392 pld [r0] 1393 \macd q13, d0, d29 1394 vld1.8 {d30-d31},[r1,:128], r2 1395 \macs q12, d1, d30 1396 pld [r1] 1397 \macs q13, d1, d31 1398 vshl.s16 q2, q2, q9 1399 vshl.s16 q3, q3, q9 1400 vqmovun.s16 d4, q2 1401 vqmovun.s16 d5, q3 1402 vshl.s16 
q12, q12, q9 1403 vshl.s16 q13, q13, q9 1404 vqmovun.s16 d24, q12 1405 vqmovun.s16 d25, q13 1406 vmov q3, q8 1407 vst1.8 {d4- d5}, [r6,:128], r2 1408 vmov q2, q8 1409 vst1.8 {d24-d25},[r6,:128], r2 1410 bne 1b 1411 pop {r4-r6, pc} 1412 .endm 1413 1414 .macro biweight_8 macs, macd 1415 vdup.8 d0, r4 1416 vdup.8 d1, r5 1417 vmov q1, q8 1418 vmov q10, q8 14191: subs ip, ip, #2 1420 vld1.8 {d4},[r0,:64], r2 1421 \macd q1, d0, d4 1422 pld [r0] 1423 vld1.8 {d5},[r1,:64], r2 1424 \macs q1, d1, d5 1425 pld [r1] 1426 vld1.8 {d6},[r0,:64], r2 1427 \macd q10, d0, d6 1428 pld [r0] 1429 vld1.8 {d7},[r1,:64], r2 1430 \macs q10, d1, d7 1431 pld [r1] 1432 vshl.s16 q1, q1, q9 1433 vqmovun.s16 d2, q1 1434 vshl.s16 q10, q10, q9 1435 vqmovun.s16 d4, q10 1436 vmov q10, q8 1437 vst1.8 {d2},[r6,:64], r2 1438 vmov q1, q8 1439 vst1.8 {d4},[r6,:64], r2 1440 bne 1b 1441 pop {r4-r6, pc} 1442 .endm 1443 1444 .macro biweight_4 macs, macd 1445 vdup.8 d0, r4 1446 vdup.8 d1, r5 1447 vmov q1, q8 1448 vmov q10, q8 14491: subs ip, ip, #4 1450 vld1.32 {d4[0]},[r0,:32], r2 1451 vld1.32 {d4[1]},[r0,:32], r2 1452 \macd q1, d0, d4 1453 pld [r0] 1454 vld1.32 {d5[0]},[r1,:32], r2 1455 vld1.32 {d5[1]},[r1,:32], r2 1456 \macs q1, d1, d5 1457 pld [r1] 1458 blt 2f 1459 vld1.32 {d6[0]},[r0,:32], r2 1460 vld1.32 {d6[1]},[r0,:32], r2 1461 \macd q10, d0, d6 1462 pld [r0] 1463 vld1.32 {d7[0]},[r1,:32], r2 1464 vld1.32 {d7[1]},[r1,:32], r2 1465 \macs q10, d1, d7 1466 pld [r1] 1467 vshl.s16 q1, q1, q9 1468 vqmovun.s16 d2, q1 1469 vshl.s16 q10, q10, q9 1470 vqmovun.s16 d4, q10 1471 vmov q10, q8 1472 vst1.32 {d2[0]},[r6,:32], r2 1473 vst1.32 {d2[1]},[r6,:32], r2 1474 vmov q1, q8 1475 vst1.32 {d4[0]},[r6,:32], r2 1476 vst1.32 {d4[1]},[r6,:32], r2 1477 bne 1b 1478 pop {r4-r6, pc} 14792: vshl.s16 q1, q1, q9 1480 vqmovun.s16 d2, q1 1481 vst1.32 {d2[0]},[r6,:32], r2 1482 vst1.32 {d2[1]},[r6,:32], r2 1483 pop {r4-r6, pc} 1484 .endm 1485 1486 .macro biweight_func w 1487function biweight_h264_pixels_\w\()_neon 1488 push {r4-r6, 
lr} 1489 add r4, sp, #16 1490 ldm r4, {r4-r6} 1491 lsr lr, r4, #31 1492 add r6, r6, #1 1493 eors lr, lr, r5, lsr #30 1494 orr r6, r6, #1 1495 vdup.16 q9, r3 1496 lsl r6, r6, r3 1497 vmvn q9, q9 1498 vdup.16 q8, r6 1499 mov r6, r0 1500 beq 10f 1501 subs lr, lr, #1 1502 beq 20f 1503 subs lr, lr, #1 1504 beq 30f 1505 b 40f 150610: biweight_\w vmlal.u8, vmlal.u8 150720: rsb r4, r4, #0 1508 biweight_\w vmlal.u8, vmlsl.u8 150930: rsb r4, r4, #0 1510 rsb r5, r5, #0 1511 biweight_\w vmlsl.u8, vmlsl.u8 151240: rsb r5, r5, #0 1513 biweight_\w vmlsl.u8, vmlal.u8 1514 .endfunc 1515 .endm 1516 1517 .macro biweight_entry w, h, b=1 1518function ff_biweight_h264_pixels_\w\()x\h\()_neon, export=1 1519 mov ip, #\h 1520.if \b 1521 b biweight_h264_pixels_\w\()_neon 1522.endif 1523 .endfunc 1524 .endm 1525 1526 biweight_entry 16, 8 1527 biweight_entry 16, 16, b=0 1528 biweight_func 16 1529 1530 biweight_entry 8, 16 1531 biweight_entry 8, 4 1532 biweight_entry 8, 8, b=0 1533 biweight_func 8 1534 1535 biweight_entry 4, 8 1536 biweight_entry 4, 2 1537 biweight_entry 4, 4, b=0 1538 biweight_func 4 1539 1540@ Weighted prediction 1541 1542 .macro weight_16 add 1543 vdup.8 d0, r3 15441: subs ip, ip, #2 1545 vld1.8 {d20-d21},[r0,:128], r1 1546 vmull.u8 q2, d0, d20 1547 pld [r0] 1548 vmull.u8 q3, d0, d21 1549 vld1.8 {d28-d29},[r0,:128], r1 1550 vmull.u8 q12, d0, d28 1551 pld [r0] 1552 vmull.u8 q13, d0, d29 1553 \add q2, q8, q2 1554 vrshl.s16 q2, q2, q9 1555 \add q3, q8, q3 1556 vrshl.s16 q3, q3, q9 1557 vqmovun.s16 d4, q2 1558 vqmovun.s16 d5, q3 1559 \add q12, q8, q12 1560 vrshl.s16 q12, q12, q9 1561 \add q13, q8, q13 1562 vrshl.s16 q13, q13, q9 1563 vqmovun.s16 d24, q12 1564 vqmovun.s16 d25, q13 1565 vst1.8 {d4- d5}, [r4,:128], r1 1566 vst1.8 {d24-d25},[r4,:128], r1 1567 bne 1b 1568 pop {r4, pc} 1569 .endm 1570 1571 .macro weight_8 add 1572 vdup.8 d0, r3 15731: subs ip, ip, #2 1574 vld1.8 {d4},[r0,:64], r1 1575 vmull.u8 q1, d0, d4 1576 pld [r0] 1577 vld1.8 {d6},[r0,:64], r1 1578 vmull.u8 q10, 
d0, d6 1579 \add q1, q8, q1 1580 pld [r0] 1581 vrshl.s16 q1, q1, q9 1582 vqmovun.s16 d2, q1 1583 \add q10, q8, q10 1584 vrshl.s16 q10, q10, q9 1585 vqmovun.s16 d4, q10 1586 vst1.8 {d2},[r4,:64], r1 1587 vst1.8 {d4},[r4,:64], r1 1588 bne 1b 1589 pop {r4, pc} 1590 .endm 1591 1592 .macro weight_4 add 1593 vdup.8 d0, r3 1594 vmov q1, q8 1595 vmov q10, q8 15961: subs ip, ip, #4 1597 vld1.32 {d4[0]},[r0,:32], r1 1598 vld1.32 {d4[1]},[r0,:32], r1 1599 vmull.u8 q1, d0, d4 1600 pld [r0] 1601 blt 2f 1602 vld1.32 {d6[0]},[r0,:32], r1 1603 vld1.32 {d6[1]},[r0,:32], r1 1604 vmull.u8 q10, d0, d6 1605 pld [r0] 1606 \add q1, q8, q1 1607 vrshl.s16 q1, q1, q9 1608 vqmovun.s16 d2, q1 1609 \add q10, q8, q10 1610 vrshl.s16 q10, q10, q9 1611 vqmovun.s16 d4, q10 1612 vmov q10, q8 1613 vst1.32 {d2[0]},[r4,:32], r1 1614 vst1.32 {d2[1]},[r4,:32], r1 1615 vmov q1, q8 1616 vst1.32 {d4[0]},[r4,:32], r1 1617 vst1.32 {d4[1]},[r4,:32], r1 1618 bne 1b 1619 pop {r4, pc} 16202: \add q1, q8, q1 1621 vrshl.s16 q1, q1, q9 1622 vqmovun.s16 d2, q1 1623 vst1.32 {d2[0]},[r4,:32], r1 1624 vst1.32 {d2[1]},[r4,:32], r1 1625 pop {r4, pc} 1626 .endm 1627 1628 .macro weight_func w 1629function weight_h264_pixels_\w\()_neon 1630 push {r4, lr} 1631 ldr r4, [sp, #8] 1632 cmp r2, #1 1633 lsl r4, r4, r2 1634 vdup.16 q8, r4 1635 mov r4, r0 1636 ble 20f 1637 rsb lr, r2, #1 1638 vdup.16 q9, lr 1639 cmp r3, #0 1640 blt 10f 1641 weight_\w vhadd.s16 164210: rsb r3, r3, #0 1643 weight_\w vhsub.s16 164420: rsb lr, r2, #0 1645 vdup.16 q9, lr 1646 cmp r3, #0 1647 blt 10f 1648 weight_\w vadd.s16 164910: rsb r3, r3, #0 1650 weight_\w vsub.s16 1651 .endfunc 1652 .endm 1653 1654 .macro weight_entry w, h, b=1 1655function ff_weight_h264_pixels_\w\()x\h\()_neon, export=1 1656 mov ip, #\h 1657.if \b 1658 b weight_h264_pixels_\w\()_neon 1659.endif 1660 .endfunc 1661 .endm 1662 1663 weight_entry 16, 8 1664 weight_entry 16, 16, b=0 1665 weight_func 16 1666 1667 weight_entry 8, 16 1668 weight_entry 8, 4 1669 weight_entry 8, 8, b=0 1670 
@ 8x8 (b=0) entry above falls through into this dispatcher.
        weight_func     8

@ 4-wide entries; the 4x4 (b=0) entry falls through into weight_func 4.
        weight_entry    4,  8
        weight_entry    4,  2
        weight_entry    4,  4,  b=0
        weight_func     4