/*
 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of Libav.
 *
 * Libav is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * Libav is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with Libav; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "asm.S"
#include "neon.S"

        /* H.264 loop filter */

/*
 * Common prologue of the four loop filter entry points.
 *
 * In:   r2 = alpha, r3 = beta, [sp] = pointer to four signed bytes that
 *       the filter macros use as per-group clipping bounds.
 * Out:  d24[0] = those four bytes (32-bit lane 0).
 * Returns to the caller (bx lr) without filtering when alpha == 0 or
 * beta == 0, or when all four bound bytes are negative.
 * Clobbers r12 and the flags.
 */
.macro  h264_loop_filter_start
        ldr             r12, [sp]
        tst             r2,  r2                 @ Z set if alpha == 0
        ldr             r12, [r12]
        it              ne
        tstne           r3,  r3                 @ ... or if beta == 0
        vmov.32         d24[0], r12
        and             r12, r12, r12, lsl #16  @ flags untouched: still the alpha/beta test
        it              eq
        bxeq            lr
        ands            r12, r12, r12, lsl #8   @ bit 31 = AND of the four sign bits
        it              lt
        bxlt            lr
.endm

/*
 * Luma edge filter on 16 pixels across the edge.
 *
 * In:   q10 = p2, q9 = p1, q8 = p0, q0 = q0, q1 = q1, q2 = q2,
 *       r2 = alpha, r3 = beta, d24[0] = four bound bytes
 *       (as left by h264_loop_filter_start).
 * Out:  q4 = new p1, q8 = new p0, q0 = new q0, q5 = new q1.
 * Clobbers q2, q4-q7, q10-q15.  q4-q7 are d8-d15, so callers wrap the
 * macro in vpush/vpop {d8-d15}.
 * The instruction order interleaves independent computations; keep it.
 */
.macro  h264_loop_filter_luma
        vdup.8          q11, r2         @ alpha
        vmovl.u8        q12, d24
        vabd.u8         q6,  q8,  q0    @ abs(p0 - q0)
        vmovl.u16       q12, d24
        vabd.u8         q14, q9,  q8    @ abs(p1 - p0)
        vsli.16         q12, q12, #8
        vabd.u8         q15, q1,  q0    @ abs(q1 - q0)
        vsli.32         q12, q12, #16   @ each bound byte now replicated over 4 lanes
        vclt.u8         q6,  q6,  q11   @ < alpha
        vdup.8          q11, r3         @ beta
        vclt.s8         q7,  q12, #0    @ lanes whose bound is negative
        vclt.u8         q14, q14, q11   @ < beta
        vclt.u8         q15, q15, q11   @ < beta
        vbic            q6,  q6,  q7
        vabd.u8         q4,  q10, q8    @ abs(p2 - p0)
        vand            q6,  q6,  q14
        vabd.u8         q5,  q2,  q0    @ abs(q2 - q0)
        vclt.u8         q4,  q4,  q11   @ < beta
        vand            q6,  q6,  q15   @ q6 = mask of lanes to filter
        vclt.u8         q5,  q5,  q11   @ < beta
        vand            q4,  q4,  q6
        vand            q5,  q5,  q6
        vand            q12, q12, q6
        vrhadd.u8       q14, q8,  q0
        vsub.i8         q6,  q12, q4    @ masks are 0 or -1: bound + 1 where p1 is filtered
        vqadd.u8        q7,  q9,  q12
        vhadd.u8        q10, q10, q14
        vsub.i8         q6,  q6,  q5    @ ... and + 1 where q1 is filtered
        vhadd.u8        q14, q2,  q14
        vmin.u8         q7,  q7,  q10
        vqsub.u8        q11, q9,  q12
        vqadd.u8        q2,  q1,  q12
        vmax.u8         q7,  q7,  q11   @ q7 = candidate p1, clipped to p1 +/- bound
        vqsub.u8        q11, q1,  q12
        vmin.u8         q14, q2,  q14
        vmovl.u8        q2,  d0
        vmax.u8         q14, q14, q11   @ q14 = candidate q1, clipped to q1 +/- bound
        vmovl.u8        q10, d1
        vsubw.u8        q2,  q2,  d16
        vsubw.u8        q10, q10, d17
        vshl.i16        q2,  q2,  #2
        vshl.i16        q10, q10, #2
        vaddw.u8        q2,  q2,  d18
        vaddw.u8        q10, q10, d19
        vsubw.u8        q2,  q2,  d2
        vsubw.u8        q10, q10, d3
        vrshrn.i16      d4,  q2,  #3    @ delta = (4*(q0 - p0) + p1 - q1 + 4) >> 3
        vrshrn.i16      d5,  q10, #3
        vbsl            q4,  q7,  q9    @ new p1 where selected, else old p1
        vbsl            q5,  q14, q1    @ new q1 where selected, else old q1
        vneg.s8         q7,  q6
        vmovl.u8        q14, d16
        vmin.s8         q2,  q2,  q6    @ clip delta to [-q6, q6]
        vmovl.u8        q6,  d17
        vmax.s8         q2,  q2,  q7
        vmovl.u8        q11, d0
        vmovl.u8        q12, d1
        vaddw.s8        q14, q14, d4    @ p0 + delta
        vaddw.s8        q6,  q6,  d5
        vsubw.s8        q11, q11, d4    @ q0 - delta
        vsubw.s8        q12, q12, d5
        vqmovun.s16     d16, q14
        vqmovun.s16     d17, q6
        vqmovun.s16     d0,  q11
        vqmovun.s16     d1,  q12
.endm

/*
 * Filter a 16 pixel wide luma edge lying between two rows.
 *
 * In:   r0 = first row below the edge (16-byte aligned, see the :128
 *       hints), r1 = row stride, r2 = alpha, r3 = beta,
 *       [sp] = pointer to the four bound bytes.
 * Rewrites the two rows on each side of the edge.  d8-d15 are saved
 * and restored around the filter.
 */
function ff_h264_v_loop_filter_luma_neon, export=1
        h264_loop_filter_start

        vld1.8          {d0, d1},  [r0,:128], r1        @ q0
        vld1.8          {d2, d3},  [r0,:128], r1        @ q1
        vld1.8          {d4, d5},  [r0,:128], r1        @ q2
        sub             r0,  r0,  r1, lsl #2
        sub             r0,  r0,  r1, lsl #1            @ back 6 rows: 3 above the edge
        vld1.8          {d20,d21}, [r0,:128], r1        @ p2
        vld1.8          {d18,d19}, [r0,:128], r1        @ p1
        vld1.8          {d16,d17}, [r0,:128], r1        @ p0

        vpush           {d8-d15}

        h264_loop_filter_luma

        sub             r0,  r0,  r1, lsl #1
        vst1.8          {d8, d9},  [r0,:128], r1        @ p1
        vst1.8          {d16,d17}, [r0,:128], r1        @ p0
        vst1.8          {d0, d1},  [r0,:128], r1        @ q0
        vst1.8          {d10,d11}, [r0,:128]            @ q1

        vpop            {d8-d15}
        bx              lr
endfunc

/*
 * Filter a 16 pixel tall luma edge lying between two columns.
 *
 * In:   r0 = first pixel right of the edge in the first row,
 *       r1 = row stride, r2 = alpha, r3 = beta,
 *       [sp] = pointer to the four bound bytes.
 * Loads 8 bytes from each of 16 rows starting 4 bytes left of the edge,
 * transposes so the shared luma macro can be used, and writes back the
 * 4 centre bytes of every row.
 */
function ff_h264_h_loop_filter_luma_neon, export=1
        h264_loop_filter_start

        sub             r0,  r0,  #4
        vld1.8          {d6},  [r0], r1
        vld1.8          {d20}, [r0], r1
        vld1.8          {d18}, [r0], r1
        vld1.8          {d16}, [r0], r1
        vld1.8          {d0},  [r0], r1
        vld1.8          {d2},  [r0], r1
        vld1.8          {d4},  [r0], r1
        vld1.8          {d26}, [r0], r1
        vld1.8          {d7},  [r0], r1
        vld1.8          {d21}, [r0], r1
        vld1.8          {d19}, [r0], r1
        vld1.8          {d17}, [r0], r1
        vld1.8          {d1},  [r0], r1
        vld1.8          {d3},  [r0], r1
        vld1.8          {d5},  [r0], r1
        vld1.8          {d27}, [r0], r1

        transpose_8x8   q3, q10, q9, q8, q0, q1, q2, q13

        vpush           {d8-d15}

        h264_loop_filter_luma

        transpose_4x4   q4, q8, q0, q5

        sub             r0,  r0,  r1, lsl #4    @ back to the first row
        add             r0,  r0,  #2            @ skip the two unmodified left columns
        vst1.32         {d8[0]},  [r0], r1
        vst1.32         {d16[0]}, [r0], r1
        vst1.32         {d0[0]},  [r0], r1
        vst1.32         {d10[0]}, [r0], r1
        vst1.32         {d8[1]},  [r0], r1
        vst1.32         {d16[1]}, [r0], r1
        vst1.32         {d0[1]},  [r0], r1
        vst1.32         {d10[1]}, [r0], r1
        vst1.32         {d9[0]},  [r0], r1
        vst1.32         {d17[0]}, [r0], r1
        vst1.32         {d1[0]},  [r0], r1
        vst1.32         {d11[0]}, [r0], r1
        vst1.32         {d9[1]},  [r0], r1
        vst1.32         {d17[1]}, [r0], r1
        vst1.32         {d1[1]},  [r0], r1
        vst1.32         {d11[1]}, [r0], r1

        vpop            {d8-d15}
        bx              lr
endfunc

/*
 * Chroma edge filter on 8 pixels across the edge.
 *
 * In:   d18 = p1, d16 = p0, d0 = q0, d2 = q1, r2 = alpha, r3 = beta,
 *       d24[0] = four bound bytes (from h264_loop_filter_start).
 * Out:  d16 = new p0, d0 = new q0.
 * Clobbers q2, q11-q15 only, so no callee-saved NEON registers are used.
 */
.macro  h264_loop_filter_chroma
        vdup.8          d22, r2         @ alpha
        vmovl.u8        q12, d24
        vabd.u8         d26, d16, d0    @ abs(p0 - q0)
        vmovl.u8        q2,  d0
        vabd.u8         d28, d18, d16   @ abs(p1 - p0)
        vsubw.u8        q2,  q2,  d16
        vsli.16         d24, d24, #8    @ each bound byte replicated over 2 lanes
        vshl.i16        q2,  q2,  #2
        vabd.u8         d30, d2,  d0    @ abs(q1 - q0)
        vaddw.u8        q2,  q2,  d18
        vclt.u8         d26, d26, d22   @ < alpha
        vsubw.u8        q2,  q2,  d2
        vdup.8          d22, r3         @ beta
        vrshrn.i16      d4,  q2,  #3    @ delta = (4*(q0 - p0) + p1 - q1 + 4) >> 3
        vclt.u8         d28, d28, d22   @ < beta
        vclt.u8         d30, d30, d22   @ < beta
        vmin.s8         d4,  d4,  d24
        vneg.s8         d25, d24
        vand            d26, d26, d28
        vmax.s8         d4,  d4,  d25   @ delta clipped to [-bound, bound]
        vand            d26, d26, d30
        vmovl.u8        q11, d0
        vand            d4,  d4,  d26   @ delta = 0 where the thresholds fail
        vmovl.u8        q14, d16
        vaddw.s8        q14, q14, d4
        vsubw.s8        q11, q11, d4
        vqmovun.s16     d16, q14
        vqmovun.s16     d0,  q11
.endm

/*
 * Filter an 8 pixel wide chroma edge lying between two rows.
 *
 * In:   r0 = first row below the edge (8-byte aligned, see the :64
 *       hints), r1 = row stride, r2 = alpha, r3 = beta,
 *       [sp] = pointer to the four bound bytes.
 * Rewrites one row on each side of the edge.
 */
function ff_h264_v_loop_filter_chroma_neon, export=1
        h264_loop_filter_start

        sub             r0,  r0,  r1, lsl #1
        vld1.8          {d18}, [r0,:64], r1     @ p1
        vld1.8          {d16}, [r0,:64], r1     @ p0
        vld1.8          {d0},  [r0,:64], r1     @ q0
        vld1.8          {d2},  [r0,:64]         @ q1

        h264_loop_filter_chroma

        sub             r0,  r0,  r1, lsl #1
        vst1.8          {d16}, [r0,:64], r1
        vst1.8          {d0},  [r0,:64], r1

        bx              lr
endfunc

/*
 * Filter an 8 pixel tall chroma edge lying between two columns.
 *
 * In:   r0 = first pixel right of the edge in the first row,
 *       r1 = row stride, r2 = alpha, r3 = beta,
 *       [sp] = pointer to the four bound bytes.
 * Loads 4 bytes from each of 8 rows, transposes in registers, filters,
 * transposes back and stores all 4 bytes of every row.
 */
function ff_h264_h_loop_filter_chroma_neon, export=1
        h264_loop_filter_start

        sub             r0,  r0,  #2
        vld1.32         {d18[0]}, [r0], r1
        vld1.32         {d16[0]}, [r0], r1
        vld1.32         {d0[0]},  [r0], r1
        vld1.32         {d2[0]},  [r0], r1
        vld1.32         {d18[1]}, [r0], r1
        vld1.32         {d16[1]}, [r0], r1
        vld1.32         {d0[1]},  [r0], r1
        vld1.32         {d2[1]},  [r0], r1

        vtrn.16         d18, d0
        vtrn.16         d16, d2
        vtrn.8          d18, d16
        vtrn.8          d0,  d2

        h264_loop_filter_chroma

        vtrn.16         d18, d0
        vtrn.16         d16, d2
        vtrn.8          d18, d16
        vtrn.8          d0,  d2

        sub             r0,  r0,  r1, lsl #3    @ back to the first row
        vst1.32         {d18[0]}, [r0], r1
        vst1.32         {d16[0]}, [r0], r1
        vst1.32         {d0[0]},  [r0], r1
        vst1.32         {d2[0]},  [r0], r1
        vst1.32         {d18[1]}, [r0], r1
        vst1.32         {d16[1]}, [r0], r1
        vst1.32         {d0[1]},  [r0], r1
        vst1.32         {d2[1]},  [r0], r1

        bx              lr
endfunc

        /* H.264 qpel MC */

/*
 * Load the filter multipliers used by lowpass_8 and lowpass_8_1:
 * 16-bit lane d6[0] = 5 and d6[1] = 20.  Clobbers the scratch core
 * register given as argument.
 */
.macro  lowpass_const   r
        movw            \r,  #5
        movt            \r,  #20
        vmov.32         d6[0], \r
.endm

/*
 * Horizontal 6-tap filter (1, -5, 20, 20, -5, 1) on two rows at once.
 *
 * r0:r1 and r2:r3 are D-register pairs each holding 16 consecutive
 * source bytes; 8 outputs are produced per row.
 * narrow=1: results are rounded, shifted right by 5, saturated to u8
 *           and written to D registers d0 / d1 (q0 and q8 are scratch).
 * narrow=0: the unshifted 16-bit sums are left in Q registers d0 / d1.
 * Needs d6 from lowpass_const.  Clobbers q1, q2, q9, q10, q15 (and
 * q0, q8 when narrowing).
 */
.macro  lowpass_8       r0,  r1,  r2,  r3,  d0,  d1,  narrow=1
  .if \narrow
        t0 .req q0
        t1 .req q8
  .else
        t0 .req \d0
        t1 .req \d1
  .endif
        vext.8          d2,  \r0, \r1, #2
        vext.8          d3,  \r0, \r1, #3
        vaddl.u8        q1,  d2,  d3            @ the two centre taps
        vext.8          d4,  \r0, \r1, #1
        vext.8          d5,  \r0, \r1, #4
        vaddl.u8        q2,  d4,  d5            @ the two inner taps
        vext.8          d30, \r0, \r1, #5
        vaddl.u8        t0,  \r0, d30           @ the two outer taps
        vext.8          d18, \r2, \r3, #2
        vmla.i16        t0,  q1,  d6[1]         @ + 20 * centre
        vext.8          d19, \r2, \r3, #3
        vaddl.u8        q9,  d18, d19
        vext.8          d20, \r2, \r3, #1
        vmls.i16        t0,  q2,  d6[0]         @ - 5 * inner
        vext.8          d21, \r2, \r3, #4
        vaddl.u8        q10, d20, d21
        vext.8          d31, \r2, \r3, #5
        vaddl.u8        t1,  \r2, d31
        vmla.i16        t1,  q9,  d6[1]
        vmls.i16        t1,  q10, d6[0]
  .if \narrow
        vqrshrun.s16    \d0, t0,  #5
        vqrshrun.s16    \d1, t1,  #5
  .endif
        .unreq  t0
        .unreq  t1
.endm

/*
 * Single-row variant of lowpass_8: r0:r1 hold 16 source bytes, the
 * result goes to d0 (a D register when narrow=1, a Q register of
 * 16-bit sums when narrow=0).  Clobbers q1, q2, d30 (and q0 when
 * narrowing).
 */
.macro  lowpass_8_1     r0,  r1,  d0,  narrow=1
  .if \narrow
        t0 .req q0
  .else
        t0 .req \d0
  .endif
        vext.8          d2,  \r0, \r1, #2
        vext.8          d3,  \r0, \r1, #3
        vaddl.u8        q1,  d2,  d3
        vext.8          d4,  \r0, \r1, #1
        vext.8          d5,  \r0, \r1, #4
        vaddl.u8        q2,  d4,  d5
        vext.8          d30, \r0, \r1, #5
        vaddl.u8        t0,  \r0, d30
        vmla.i16        t0,  q1,  d6[1]
        vmls.i16        t0,  q2,  d6[0]
  .if \narrow
        vqrshrun.s16    \d0, t0,  #5
  .endif
        .unreq  t0
.endm

/*
 * Second pass of the two-dimensional filter: the same 6 taps applied
 * to signed 16-bit intermediates with 32-bit accumulation.
 *
 * r0:r1 are Q registers holding 16 consecutive 16-bit values; l0/h0
 * are the D halves of r0 and l1/h1 the D halves of r1.  The sum is
 * rounded, shifted right by 10, saturated to u8 and written to D
 * register d.  The multiplications by 20 and 5 are built from shifts
 * and adds.
 * Clobbers q0-q3, q8-q10, q15 and r1.  q3 contains d6, so the
 * lowpass_const value is destroyed.
 */
.macro  lowpass_8.16    r0,  r1,  l0,  h0,  l1,  h1,  d
        vext.16         q1,  \r0, \r1, #2
        vext.16         q0,  \r0, \r1, #3
        vaddl.s16       q9,  d2,  d0            @ centre taps, low half
        vext.16         q2,  \r0, \r1, #1
        vaddl.s16       q1,  d3,  d1            @ centre taps, high half
        vext.16         q3,  \r0, \r1, #4
        vaddl.s16       q10, d4,  d6            @ inner taps, low half
        vext.16         \r1, \r0, \r1, #5
        vaddl.s16       q2,  d5,  d7            @ inner taps, high half
        vaddl.s16       q0,  \h0, \h1           @ outer taps, high half
        vaddl.s16       q8,  \l0, \l1           @ outer taps, low half

        vshl.i32        q3,  q9,  #4
        vshl.i32        q9,  q9,  #2
        vshl.i32        q15, q10, #2
        vadd.i32        q9,  q9,  q3            @ 20 * centre
        vadd.i32        q10, q10, q15           @ 5 * inner

        vshl.i32        q3,  q1,  #4
        vshl.i32        q1,  q1,  #2
        vshl.i32        q15, q2,  #2
        vadd.i32        q1,  q1,  q3
        vadd.i32        q2,  q2,  q15

        vadd.i32        q9,  q9,  q8
        vsub.i32        q9,  q9,  q10

        vadd.i32        q1,  q1,  q0
        vsub.i32        q1,  q1,  q2

        vrshrn.s32      d18, q9,  #10
        vrshrn.s32      d19, q1,  #10

        vqmovun.s16     \d,  q9
.endm

/*
 * Horizontal lowpass of a 16x16 block into a packed buffer: the left
 * 8 columns are written as 16 rows of 8 bytes, followed by the right
 * 8 columns in the same layout.
 *
 * In:   r0 = destination buffer, r1 = source, r2 = source stride,
 *       d6 = lowpass_const.
 * Out:  r0 advanced past the data written.
 * Clobbers r3, r4 (holds the return address) and r12, plus whatever
 * put_h264_qpel8_h_lowpass_neon clobbers.  Returns through a tail call.
 */
function put_h264_qpel16_h_lowpass_neon_packed
        mov             r4,  lr
        mov             r12, #16
        mov             r3,  #8                 @ packed destination stride
        bl              put_h264_qpel8_h_lowpass_neon
        sub             r1,  r1,  r2, lsl #4    @ back to the first row ...
        add             r1,  r1,  #8            @ ... of the right half
        mov             r12, #16
        mov             lr,  r4
        b               put_h264_qpel8_h_lowpass_neon
endfunc

/*
 * Horizontal lowpass, instantiated below for type = put and type = avg.
 *
 * In:   r0 = destination (8-byte aligned), r1 = source (first of the
 *       13 bytes read per output row of 8), r2 = source stride,
 *       r3 = destination stride, d6 = lowpass_const.
 *       The 8-wide function also takes r12 = number of rows (even).
 * avg additionally averages the result with the existing destination.
 *
 * The 16-wide function filters the left half with a call, then sets up
 * the right half and deliberately falls through into the 8-wide
 * function that follows it, which performs the return.
 */
.macro  h264_qpel_h_lowpass type
function \type\()_h264_qpel16_h_lowpass_neon
        push            {lr}
        mov             r12, #16
        bl              \type\()_h264_qpel8_h_lowpass_neon
        sub             r0,  r0,  r3, lsl #4
        sub             r1,  r1,  r2, lsl #4
        add             r0,  r0,  #8
        add             r1,  r1,  #8
        mov             r12, #16
        pop             {lr}
        @ falls through
endfunc

function \type\()_h264_qpel8_h_lowpass_neon
1:      vld1.8          {d0, d1},  [r1], r2
        vld1.8          {d16,d17}, [r1], r2
        subs            r12, r12, #2            @ two rows per iteration
        lowpass_8       d0,  d1,  d16, d17, d0,  d16
  .ifc \type,avg
        vld1.8          {d2},     [r0,:64], r3
        vrhadd.u8       d0,  d0,  d2
        vld1.8          {d3},     [r0,:64]
        vrhadd.u8       d16, d16, d3
        sub             r0,  r0,  r3            @ undo the look-ahead
  .endif
        vst1.8          {d0},     [r0,:64], r3
        vst1.8          {d16},    [r0,:64], r3
        bne             1b
        bx              lr
endfunc
.endm

        h264_qpel_h_lowpass put
/* avg instantiation of the horizontal lowpass functions */
        h264_qpel_h_lowpass avg

/*
 * Horizontal lowpass averaged with a second source, instantiated for
 * type = put and type = avg.
 *
 * In:   r0 = destination (8-byte aligned), r1 = source to filter,
 *       r2 = stride, used for the destination and both sources,
 *       r3 = second source, d6 = lowpass_const.
 *       The 8-wide function also takes r12 = number of rows (even).
 * avg additionally averages the result with the existing destination.
 *
 * The 16-wide function falls through into the 8-wide one for the right
 * half, which performs the return.
 */
.macro  h264_qpel_h_lowpass_l2 type
function \type\()_h264_qpel16_h_lowpass_l2_neon
        push            {lr}
        mov             r12, #16
        bl              \type\()_h264_qpel8_h_lowpass_l2_neon
        sub             r0,  r0,  r2, lsl #4
        sub             r1,  r1,  r2, lsl #4
        sub             r3,  r3,  r2, lsl #4
        add             r0,  r0,  #8
        add             r1,  r1,  #8
        add             r3,  r3,  #8
        mov             r12, #16
        pop             {lr}
        @ falls through
endfunc

function \type\()_h264_qpel8_h_lowpass_l2_neon
1:      vld1.8          {d0, d1},  [r1], r2
        vld1.8          {d16,d17}, [r1], r2
        vld1.8          {d28},     [r3], r2
        vld1.8          {d29},     [r3], r2
        subs            r12, r12, #2            @ two rows per iteration
        lowpass_8       d0,  d1,  d16, d17, d0,  d1
        vrhadd.u8       q0,  q0,  q14           @ average with the second source
  .ifc \type,avg
        vld1.8          {d2},      [r0,:64], r2
        vrhadd.u8       d0,  d0,  d2
        vld1.8          {d3},      [r0,:64]
        vrhadd.u8       d1,  d1,  d3
        sub             r0,  r0,  r2            @ undo the look-ahead
  .endif
        vst1.8          {d0},      [r0,:64], r2
        vst1.8          {d1},      [r0,:64], r2
        bne             1b
        bx              lr
endfunc
.endm

        h264_qpel_h_lowpass_l2 put
        h264_qpel_h_lowpass_l2 avg

/*
 * Vertical lowpass of a 16x16 block into a packed buffer (left 8
 * columns as 16 rows of 8 bytes, then the right 8 columns).
 *
 * In:   r0 = destination buffer, r1 = first source row read (the 8-row
 *       function reads 13 rows), r3 = source stride, d6 = lowpass_const.
 * Clobbers r2 (set to the packed stride 8) and r4 (return address),
 * plus d8-d15 through the 8-row function.  Returns through a tail call.
 */
function put_h264_qpel16_v_lowpass_neon_packed
        mov             r4,  lr
        mov             r2,  #8
        bl              put_h264_qpel8_v_lowpass_neon
        sub             r1,  r1,  r3, lsl #2    @ r1 was left 12 rows in; next block starts at 8
        bl              put_h264_qpel8_v_lowpass_neon
        sub             r1,  r1,  r3, lsl #4
        sub             r1,  r1,  r3, lsl #2    @ back 20 rows to the top ...
        add             r1,  r1,  #8            @ ... of the right half
        bl              put_h264_qpel8_v_lowpass_neon
        sub             r1,  r1,  r3, lsl #2
        mov             lr,  r4
        b               put_h264_qpel8_v_lowpass_neon
endfunc

/*
 * Vertical lowpass, instantiated for type = put and type = avg.
 *
 * In:   r0 = destination (8-byte aligned), r1 = first source row read
 *       (13 rows of 8 bytes are read per 8 output rows),
 *       r2 = destination stride, r3 = source stride, d6 = lowpass_const.
 * Out:  r0 advanced by 8 destination rows, r1 left at the 13th source row.
 * The 8x8 function transposes the block so that the horizontal
 * lowpass_8 macro can do the filtering, then transposes back.
 * Clobbers d8-d15 without saving them; callers must preserve them.
 * The 16x16 function keeps the return address in r4 and falls through
 * into the 8x8 function for the last quarter.
 */
.macro  h264_qpel_v_lowpass type
function \type\()_h264_qpel16_v_lowpass_neon
        mov             r4,  lr
        bl              \type\()_h264_qpel8_v_lowpass_neon
        sub             r1,  r1,  r3, lsl #2
        bl              \type\()_h264_qpel8_v_lowpass_neon
        sub             r0,  r0,  r2, lsl #4
        add             r0,  r0,  #8
        sub             r1,  r1,  r3, lsl #4
        sub             r1,  r1,  r3, lsl #2
        add             r1,  r1,  #8
        bl              \type\()_h264_qpel8_v_lowpass_neon
        sub             r1,  r1,  r3, lsl #2
        mov             lr,  r4
        @ falls through
endfunc

function \type\()_h264_qpel8_v_lowpass_neon
        vld1.8          {d8},  [r1], r3
        vld1.8          {d10}, [r1], r3
        vld1.8          {d12}, [r1], r3
        vld1.8          {d14}, [r1], r3
        vld1.8          {d22}, [r1], r3
        vld1.8          {d24}, [r1], r3
        vld1.8          {d26}, [r1], r3
        vld1.8          {d28}, [r1], r3
        vld1.8          {d9},  [r1], r3
        vld1.8          {d11}, [r1], r3
        vld1.8          {d13}, [r1], r3
        vld1.8          {d15}, [r1], r3
        vld1.8          {d23}, [r1]

        transpose_8x8   q4,  q5,  q6,  q7,  q11, q12, q13, q14
        lowpass_8       d8,  d9,  d10, d11, d8,  d10
        lowpass_8       d12, d13, d14, d15, d12, d14
        lowpass_8       d22, d23, d24, d25, d22, d24
        lowpass_8       d26, d27, d28, d29, d26, d28
        transpose_8x8   d8,  d10, d12, d14, d22, d24, d26, d28

  .ifc \type,avg
        vld1.8          {d9},  [r0,:64], r2
        vrhadd.u8       d8,  d8,  d9
        vld1.8          {d11}, [r0,:64], r2
        vrhadd.u8       d10, d10, d11
        vld1.8          {d13}, [r0,:64], r2
        vrhadd.u8       d12, d12, d13
        vld1.8          {d15}, [r0,:64], r2
        vrhadd.u8       d14, d14, d15
        vld1.8          {d23}, [r0,:64], r2
        vrhadd.u8       d22, d22, d23
        vld1.8          {d25}, [r0,:64], r2
        vrhadd.u8       d24, d24, d25
        vld1.8          {d27}, [r0,:64], r2
        vrhadd.u8       d26, d26, d27
        vld1.8          {d29}, [r0,:64], r2
        vrhadd.u8       d28, d28, d29
        sub             r0,  r0,  r2, lsl #3    @ back to the first row for the stores
  .endif

        vst1.8          {d8},  [r0,:64], r2
        vst1.8          {d10}, [r0,:64], r2
        vst1.8          {d12}, [r0,:64], r2
        vst1.8          {d14}, [r0,:64], r2
        vst1.8          {d22}, [r0,:64], r2
        vst1.8          {d24}, [r0,:64], r2
        vst1.8          {d26}, [r0,:64], r2
        vst1.8          {d28}, [r0,:64], r2

        bx              lr
endfunc
.endm

        h264_qpel_v_lowpass put
        h264_qpel_v_lowpass avg

/*
 * Vertical lowpass averaged with a second source, instantiated for
 * type = put and type = avg.
 *
 * In:   r0 = destination (8-byte aligned), r1 = first source row read,
 *       r2 = stride of the second source, r3 = stride of the filtered
 *       source and of the destination, r12 = second source,
 *       d6 = lowpass_const.
 * Out:  r0 and r12 advanced by 8 rows, r1 left at the 13th source row.
 * Clobbers d8-d15 without saving them; callers must preserve them.
 * The 16x16 function keeps the return address in r4 and falls through
 * into the 8x8 function for the last quarter.
 */
.macro  h264_qpel_v_lowpass_l2 type
function \type\()_h264_qpel16_v_lowpass_l2_neon
        mov             r4,  lr
        bl              \type\()_h264_qpel8_v_lowpass_l2_neon
        sub             r1,  r1,  r3, lsl #2
        bl              \type\()_h264_qpel8_v_lowpass_l2_neon
        sub             r0,  r0,  r3, lsl #4
        sub             r12, r12, r2, lsl #4
        add             r0,  r0,  #8
        add             r12, r12, #8
        sub             r1,  r1,  r3, lsl #4
        sub             r1,  r1,  r3, lsl #2
        add             r1,  r1,  #8
        bl              \type\()_h264_qpel8_v_lowpass_l2_neon
        sub             r1,  r1,  r3, lsl #2
        mov             lr,  r4
        @ falls through
endfunc

function \type\()_h264_qpel8_v_lowpass_l2_neon
        vld1.8          {d8},  [r1], r3
        vld1.8          {d10}, [r1], r3
        vld1.8          {d12}, [r1], r3
        vld1.8          {d14}, [r1], r3
        vld1.8          {d22}, [r1], r3
        vld1.8          {d24}, [r1], r3
        vld1.8          {d26}, [r1], r3
        vld1.8          {d28}, [r1], r3
        vld1.8          {d9},  [r1], r3
        vld1.8          {d11}, [r1], r3
        vld1.8          {d13}, [r1], r3
        vld1.8          {d15}, [r1], r3
        vld1.8          {d23}, [r1]

        transpose_8x8   q4,  q5,  q6,  q7,  q11, q12, q13, q14
        lowpass_8       d8,  d9,  d10, d11, d8,  d9
        lowpass_8       d12, d13, d14, d15, d12, d13
        lowpass_8       d22, d23, d24, d25, d22, d23
        lowpass_8       d26, d27, d28, d29, d26, d27
        transpose_8x8   d8,  d9,  d12, d13, d22, d23, d26, d27

        vld1.8          {d0},  [r12], r2
        vld1.8          {d1},  [r12], r2
        vld1.8          {d2},  [r12], r2
        vld1.8          {d3},  [r12], r2
        vld1.8          {d4},  [r12], r2
        vrhadd.u8       q0,  q0,  q4
        vld1.8          {d5},  [r12], r2
        vrhadd.u8       q1,  q1,  q6
        vld1.8          {d10}, [r12], r2
        vrhadd.u8       q2,  q2,  q11
        vld1.8          {d11}, [r12], r2
        vrhadd.u8       q5,  q5,  q13

  .ifc \type,avg
        vld1.8          {d16}, [r0,:64], r3
        vrhadd.u8       d0,  d0,  d16
        vld1.8          {d17}, [r0,:64], r3
        vrhadd.u8       d1,  d1,  d17
        vld1.8          {d16}, [r0,:64], r3
        vrhadd.u8       d2,  d2,  d16
        vld1.8          {d17}, [r0,:64], r3
        vrhadd.u8       d3,  d3,  d17
        vld1.8          {d16}, [r0,:64], r3
        vrhadd.u8       d4,  d4,  d16
        vld1.8          {d17}, [r0,:64], r3
        vrhadd.u8       d5,  d5,  d17
        vld1.8          {d16}, [r0,:64], r3
        vrhadd.u8       d10, d10, d16
        vld1.8          {d17}, [r0,:64], r3
        vrhadd.u8       d11, d11, d17
        sub             r0,  r0,  r3, lsl #3    @ back to the first row for the stores
  .endif

        vst1.8          {d0},  [r0,:64], r3
        vst1.8          {d1},  [r0,:64], r3
        vst1.8          {d2},  [r0,:64], r3
        vst1.8          {d3},  [r0,:64], r3
        vst1.8          {d4},  [r0,:64], r3
        vst1.8          {d5},  [r0,:64], r3
        vst1.8          {d10}, [r0,:64], r3
        vst1.8          {d11}, [r0,:64], r3

        bx              lr
endfunc
.endm

        h264_qpel_v_lowpass_l2 put
        h264_qpel_v_lowpass_l2 avg

/*
 * Two-dimensional lowpass of one 8x8 block, result left in registers.
 *
 * In:   r1 = first source row read (13 rows of 16 bytes are read),
 *       r3 = source stride, r4 = 16-byte aligned scratch buffer of
 *       16*12 bytes.
 * Out:  output rows 0-3 in d12-d15 and rows 4-7 in d8-d11 (this is the
 *       order in which the callers store them); r1 left at the 13th
 *       source row; r4 back at the start of the scratch buffer.
 * The horizontal pass keeps 16-bit intermediates (12 rows spilled to
 * the scratch buffer, the 13th kept in q12); these are transposed and
 * filtered again with lowpass_8.16.
 * Loads its own lowpass_const, because lowpass_8.16 overwrites d6.
 * Clobbers r12, q0-q15 (including callee-saved d8-d15).
 */
function put_h264_qpel8_hv_lowpass_neon_top
        lowpass_const   r12
        mov             r12, #12
1:      vld1.8          {d0, d1},  [r1], r3
        vld1.8          {d16,d17}, [r1], r3
        subs            r12, r12, #2
        lowpass_8       d0,  d1,  d16, d17, q11, q12, narrow=0
        vst1.8          {d22-d25}, [r4,:128]!
        bne             1b

        vld1.8          {d0, d1},  [r1]
        lowpass_8_1     d0,  d1,  q12, narrow=0

        mov             r12, #-16
        add             r4,  r4,  r12           @ last spilled row; read back downwards
        vld1.8          {d30,d31}, [r4,:128], r12
        vld1.8          {d20,d21}, [r4,:128], r12
        vld1.8          {d18,d19}, [r4,:128], r12
        vld1.8          {d16,d17}, [r4,:128], r12
        vld1.8          {d14,d15}, [r4,:128], r12
        vld1.8          {d12,d13}, [r4,:128], r12
        vld1.8          {d10,d11}, [r4,:128], r12
        vld1.8          {d8, d9},  [r4,:128], r12
        vld1.8          {d6, d7},  [r4,:128], r12
        vld1.8          {d4, d5},  [r4,:128], r12
        vld1.8          {d2, d3},  [r4,:128], r12
        vld1.8          {d0, d1},  [r4,:128]

        swap4           d1,  d3,  d5,  d7,  d8,  d10, d12, d14
        transpose16_4x4 q0,  q1,  q2,  q3,  q4,  q5,  q6,  q7

        swap4           d17, d19, d21, d31, d24, d26, d28, d22
        transpose16_4x4 q8,  q9,  q10, q15, q12, q13, q14, q11

        @ spill the transposed data for output columns 4-7
        vst1.8          {d30,d31}, [r4,:128]!
        vst1.8          {d6, d7},  [r4,:128]!
        vst1.8          {d20,d21}, [r4,:128]!
        vst1.8          {d4, d5},  [r4,:128]!
        vst1.8          {d18,d19}, [r4,:128]!
        vst1.8          {d2, d3},  [r4,:128]!
        vst1.8          {d16,d17}, [r4,:128]!
        vst1.8          {d0, d1},  [r4,:128]

        lowpass_8.16    q4,  q12, d8,  d9,  d24, d25, d8
        lowpass_8.16    q5,  q13, d10, d11, d26, d27, d9
        lowpass_8.16    q6,  q14, d12, d13, d28, d29, d10
        lowpass_8.16    q7,  q11, d14, d15, d22, d23, d11

        vld1.8          {d16,d17}, [r4,:128], r12
        vld1.8          {d30,d31}, [r4,:128], r12
        lowpass_8.16    q8,  q15, d16, d17, d30, d31, d12
        vld1.8          {d16,d17}, [r4,:128], r12
        vld1.8          {d30,d31}, [r4,:128], r12
        lowpass_8.16    q8,  q15, d16, d17, d30, d31, d13
        vld1.8          {d16,d17}, [r4,:128], r12
        vld1.8          {d30,d31}, [r4,:128], r12
        lowpass_8.16    q8,  q15, d16, d17, d30, d31, d14
        vld1.8          {d16,d17}, [r4,:128], r12
        vld1.8          {d30,d31}, [r4,:128]
        lowpass_8.16    q8,  q15, d16, d17, d30, d31, d15

        transpose_8x8   d12, d13, d14, d15, d8,  d9,  d10, d11

        bx              lr
endfunc

/*
 * Two-dimensional lowpass of one 8x8 block, instantiated for
 * type = put and type = avg.
 *
 * In:   r0 = destination (8-byte aligned), r1 = first source row read,
 *       r2 = destination stride, r3 = source stride,
 *       r4 = scratch buffer (see put_h264_qpel8_hv_lowpass_neon_top).
 * Out:  r0 advanced by 8 rows.
 * Clobbers r10 (return address), r12 and all NEON registers.
 */
.macro  h264_qpel8_hv_lowpass type
function \type\()_h264_qpel8_hv_lowpass_neon
        mov             r10, lr
        bl              put_h264_qpel8_hv_lowpass_neon_top
  .ifc \type,avg
        vld1.8          {d0},      [r0,:64], r2
        vrhadd.u8       d12, d12, d0
        vld1.8          {d1},      [r0,:64], r2
        vrhadd.u8       d13, d13, d1
        vld1.8          {d2},      [r0,:64], r2
        vrhadd.u8       d14, d14, d2
        vld1.8          {d3},      [r0,:64], r2
        vrhadd.u8       d15, d15, d3
        vld1.8          {d4},      [r0,:64], r2
        vrhadd.u8       d8,  d8,  d4
        vld1.8          {d5},      [r0,:64], r2
        vrhadd.u8       d9,  d9,  d5
        vld1.8          {d6},      [r0,:64], r2
        vrhadd.u8       d10, d10, d6
        vld1.8          {d7},      [r0,:64], r2
        vrhadd.u8       d11, d11, d7
        sub             r0,  r0,  r2, lsl #3
  .endif

        vst1.8          {d12},     [r0,:64], r2
        vst1.8          {d13},     [r0,:64], r2
        vst1.8          {d14},     [r0,:64], r2
        vst1.8          {d15},     [r0,:64], r2
        vst1.8          {d8},      [r0,:64], r2
        vst1.8          {d9},      [r0,:64], r2
        vst1.8          {d10},     [r0,:64], r2
        vst1.8          {d11},     [r0,:64], r2

        mov             lr,  r10
        bx              lr
endfunc
.endm

        h264_qpel8_hv_lowpass put
        h264_qpel8_hv_lowpass avg

/*
 * Two-dimensional lowpass of one 8x8 block averaged with a packed
 * second source, instantiated for type = put and type = avg.
 *
 * In:   r0 = destination (8-byte aligned), r1 = first source row read,
 *       r2 = second source: 8 rows of 8 bytes stored contiguously,
 *       16-byte aligned, r3 = stride of the source and the destination,
 *       r4 = scratch buffer (see put_h264_qpel8_hv_lowpass_neon_top).
 * Out:  r0 advanced by 8 rows, r2 advanced by 64 bytes.
 * Clobbers r10 (return address), r12 and all NEON registers.
 */
.macro  h264_qpel8_hv_lowpass_l2 type
function \type\()_h264_qpel8_hv_lowpass_l2_neon
        mov             r10, lr
        bl              put_h264_qpel8_hv_lowpass_neon_top

        vld1.8          {d0, d1},  [r2,:128]!
        vld1.8          {d2, d3},  [r2,:128]!
        vrhadd.u8       q0,  q0,  q6
        vld1.8          {d4, d5},  [r2,:128]!
        vrhadd.u8       q1,  q1,  q7
        vld1.8          {d6, d7},  [r2,:128]!
        vrhadd.u8       q2,  q2,  q4
        vrhadd.u8       q3,  q3,  q5
  .ifc \type,avg
        vld1.8          {d16},     [r0,:64], r3
        vrhadd.u8       d0,  d0,  d16
        vld1.8          {d17},     [r0,:64], r3
        vrhadd.u8       d1,  d1,  d17
        vld1.8          {d18},     [r0,:64], r3
        vrhadd.u8       d2,  d2,  d18
        vld1.8          {d19},     [r0,:64], r3
        vrhadd.u8       d3,  d3,  d19
        vld1.8          {d20},     [r0,:64], r3
        vrhadd.u8       d4,  d4,  d20
        vld1.8          {d21},     [r0,:64], r3
        vrhadd.u8       d5,  d5,  d21
        vld1.8          {d22},     [r0,:64], r3
        vrhadd.u8       d6,  d6,  d22
        vld1.8          {d23},     [r0,:64], r3
        vrhadd.u8       d7,  d7,  d23
        sub             r0,  r0,  r3, lsl #3
  .endif
        vst1.8          {d0},      [r0,:64], r3
        vst1.8          {d1},      [r0,:64], r3
        vst1.8          {d2},      [r0,:64], r3
        vst1.8          {d3},      [r0,:64], r3
        vst1.8          {d4},      [r0,:64], r3
        vst1.8          {d5},      [r0,:64], r3
        vst1.8          {d6},      [r0,:64], r3
        vst1.8          {d7},      [r0,:64], r3

        mov             lr,  r10
        bx              lr
endfunc
.endm

        h264_qpel8_hv_lowpass_l2 put
        h264_qpel8_hv_lowpass_l2 avg

/*
 * 16x16 two-dimensional lowpass built from four 8x8 calls (top-left,
 * bottom-left, top-right, bottom-right), instantiated for type = put
 * and type = avg.  The return address is kept in r9 and the last
 * quarter is a tail call.
 *
 * Plain variant:  r0 = destination, r1 = first source row read,
 *                 r2 = destination stride, r3 = source stride,
 *                 r4 = scratch buffer.
 * l2 variant:     r0 = destination, r1 = first source row read,
 *                 r3 = stride of source and destination, r4 = scratch
 *                 buffer; the packed second source is taken from the
 *                 256 bytes immediately below r4 (r2 is overwritten).
 */
.macro  h264_qpel16_hv  type
function \type\()_h264_qpel16_hv_lowpass_neon
        mov             r9,  lr
        bl              \type\()_h264_qpel8_hv_lowpass_neon
        sub             r1,  r1,  r3, lsl #2
        bl              \type\()_h264_qpel8_hv_lowpass_neon
        sub             r1,  r1,  r3, lsl #4
        sub             r1,  r1,  r3, lsl #2
        add             r1,  r1,  #8
        sub             r0,  r0,  r2, lsl #4
        add             r0,  r0,  #8
        bl              \type\()_h264_qpel8_hv_lowpass_neon
        sub             r1,  r1,  r3, lsl #2
        mov             lr,  r9
        b               \type\()_h264_qpel8_hv_lowpass_neon
endfunc

function \type\()_h264_qpel16_hv_lowpass_l2_neon
        mov             r9,  lr
        sub             r2,  r4,  #256
        bl              \type\()_h264_qpel8_hv_lowpass_l2_neon
        sub             r1,  r1,  r3, lsl #2
        bl              \type\()_h264_qpel8_hv_lowpass_l2_neon
        sub             r1,  r1,  r3, lsl #4
        sub             r1,  r1,  r3, lsl #2
        add             r1,  r1,  #8
        sub             r0,  r0,  r3, lsl #4
        add             r0,  r0,  #8
        bl              \type\()_h264_qpel8_hv_lowpass_l2_neon
        sub             r1,  r1,  r3, lsl #2
        mov             lr,  r9
        b               \type\()_h264_qpel8_hv_lowpass_l2_neon
endfunc
.endm

        h264_qpel16_hv put
        h264_qpel16_hv avg

/*
 * Exported 8x8 entry points ff_<type>_h264_qpel8_mcXY_neon for
 * type = put and type = avg.
 *
 * In (all entry points): r0 = destination, r1 = source, r2 = stride.
 * NOTE(review): X/Y look like the horizontal/vertical quarter-sample
 * offsets; confirm against the C prototypes, which are not visible here.
 *
 * Several entry points only adjust the second-source pointer and then
 * branch to a shared body at a local label (mc01, mc11, mc21, mc12);
 * the register list pushed before the branch must match what the
 * shared body pops.
 *
 * Lines starting with the single letters A and T are provided by the
 * included asm.S; presumably they select the ARM-only and Thumb-only
 * variants of the stack alignment sequence - confirm there.
 *
 * Bodies that need a temporary buffer save sp in r11, align sp down to
 * 16 bytes, and reserve the buffer before vpush {d8-d15}; after the
 * vpush the buffer therefore starts at sp + 64.
 */
.macro  h264_qpel8      type
function ff_\type\()_h264_qpel8_mc10_neon, export=1
        lowpass_const   r3
        mov             r3,  r1                 @ second source = unfiltered source
        sub             r1,  r1,  #2
        mov             r12, #8
        b               \type\()_h264_qpel8_h_lowpass_l2_neon
endfunc

function ff_\type\()_h264_qpel8_mc20_neon, export=1
        lowpass_const   r3
        sub             r1,  r1,  #2
        mov             r3,  r2
        mov             r12, #8
        b               \type\()_h264_qpel8_h_lowpass_neon
endfunc

function ff_\type\()_h264_qpel8_mc30_neon, export=1
        lowpass_const   r3
        add             r3,  r1,  #1            @ second source = source + 1 pixel
        sub             r1,  r1,  #2
        mov             r12, #8
        b               \type\()_h264_qpel8_h_lowpass_l2_neon
endfunc

function ff_\type\()_h264_qpel8_mc01_neon, export=1
        push            {lr}
        mov             r12, r1                 @ second source = unfiltered source
\type\()_h264_qpel8_mc01:
        lowpass_const   r3
        mov             r3,  r2
        sub             r1,  r1,  r2, lsl #1
        vpush           {d8-d15}
        bl              \type\()_h264_qpel8_v_lowpass_l2_neon
        vpop            {d8-d15}
        pop             {pc}
endfunc

function ff_\type\()_h264_qpel8_mc11_neon, export=1
        push            {r0, r1, r11, lr}
\type\()_h264_qpel8_mc11:
        lowpass_const   r3
        mov             r11, sp
A       bic             sp,  sp,  #15
T       bic             r0,  r11, #15
T       mov             sp,  r0
        sub             sp,  sp,  #64           @ 8x8 temporary
        mov             r0,  sp
        sub             r1,  r1,  #2
        mov             r3,  #8
        mov             r12, #8
        vpush           {d8-d15}
        bl              put_h264_qpel8_h_lowpass_neon
        ldrd            r0,  r1,  [r11], #8     @ reload the pushed r0/r1 (explicit UAL register pair)
        mov             r3,  r2
        add             r12, sp,  #64           @ temporary, above the vpush area
        sub             r1,  r1,  r2, lsl #1
        mov             r2,  #8
        bl              \type\()_h264_qpel8_v_lowpass_l2_neon
        vpop            {d8-d15}
        mov             sp,  r11
        pop             {r11, pc}
endfunc

function ff_\type\()_h264_qpel8_mc21_neon, export=1
        push            {r0, r1, r4, r10, r11, lr}
\type\()_h264_qpel8_mc21:
        lowpass_const   r3
        mov             r11, sp
A       bic             sp,  sp,  #15
T       bic             r0,  r11, #15
T       mov             sp,  r0
        sub             sp,  sp,  #(8*8+16*12)  @ 8x8 temporary + hv scratch
        sub             r1,  r1,  #2
        mov             r3,  #8
        mov             r0,  sp
        mov             r12, #8
        vpush           {d8-d15}
        bl              put_h264_qpel8_h_lowpass_neon
        mov             r4,  r0                 @ hv scratch follows the temporary
        ldrd            r0,  r1,  [r11], #8     @ reload the pushed r0/r1 (explicit UAL register pair)
        sub             r1,  r1,  r2, lsl #1
        sub             r1,  r1,  #2
        mov             r3,  r2
        sub             r2,  r4,  #64           @ second source = the temporary
        bl              \type\()_h264_qpel8_hv_lowpass_l2_neon
        vpop            {d8-d15}
        mov             sp,  r11
        pop             {r4, r10, r11, pc}
endfunc

function ff_\type\()_h264_qpel8_mc31_neon, export=1
        add             r1,  r1,  #1
        push            {r0, r1, r11, lr}       @ saved source is one pixel to the right
        sub             r1,  r1,  #1
        b               \type\()_h264_qpel8_mc11
endfunc

function ff_\type\()_h264_qpel8_mc02_neon, export=1
        push            {lr}
        lowpass_const   r3
        sub             r1,  r1,  r2, lsl #1
        mov             r3,  r2
        vpush           {d8-d15}
        bl              \type\()_h264_qpel8_v_lowpass_neon
        vpop            {d8-d15}
        pop             {pc}
endfunc

function ff_\type\()_h264_qpel8_mc12_neon, export=1
        push            {r0, r1, r4, r10, r11, lr}
\type\()_h264_qpel8_mc12:
        lowpass_const   r3
        mov             r11, sp
A       bic             sp,  sp,  #15
T       bic             r0,  r11, #15
T       mov             sp,  r0
        sub             sp,  sp,  #(8*8+16*12)  @ 8x8 temporary + hv scratch
        sub             r1,  r1,  r2, lsl #1
        mov             r3,  r2
        mov             r2,  #8
        mov             r0,  sp
        vpush           {d8-d15}
        bl              put_h264_qpel8_v_lowpass_neon
        mov             r4,  r0                 @ hv scratch follows the temporary
        ldrd            r0,  r1,  [r11], #8     @ reload the pushed r0/r1 (explicit UAL register pair)
        sub             r1,  r1,  r3, lsl #1
        sub             r1,  r1,  #2
        sub             r2,  r4,  #64           @ second source = the temporary
        bl              \type\()_h264_qpel8_hv_lowpass_l2_neon
        vpop            {d8-d15}
        mov             sp,  r11
        pop             {r4, r10, r11, pc}
endfunc

function ff_\type\()_h264_qpel8_mc22_neon, export=1
        push            {r4, r10, r11, lr}
        mov             r11, sp
A       bic             sp,  sp,  #15
T       bic             r4,  r11, #15
T       mov             sp,  r4
        sub             r1,  r1,  r2, lsl #1
        sub             r1,  r1,  #2
        mov             r3,  r2
        sub             sp,  sp,  #(16*12)      @ hv scratch
        mov             r4,  sp
        vpush           {d8-d15}
        bl              \type\()_h264_qpel8_hv_lowpass_neon
        vpop            {d8-d15}
        mov             sp,  r11
        pop             {r4, r10, r11, pc}
endfunc

function ff_\type\()_h264_qpel8_mc32_neon, export=1
        push            {r0, r1, r4, r10, r11, lr}
        add             r1,  r1,  #1
        b               \type\()_h264_qpel8_mc12
endfunc

function ff_\type\()_h264_qpel8_mc03_neon, export=1
        push            {lr}
        add             r12, r1,  r2            @ second source = source + 1 row
        b               \type\()_h264_qpel8_mc01
endfunc

function ff_\type\()_h264_qpel8_mc13_neon, export=1
        push            {r0, r1, r11, lr}
        add             r1,  r1,  r2
        b               \type\()_h264_qpel8_mc11
endfunc

function ff_\type\()_h264_qpel8_mc23_neon, export=1
        push            {r0, r1, r4, r10, r11, lr}
        add             r1,  r1,  r2
        b               \type\()_h264_qpel8_mc21
endfunc

function ff_\type\()_h264_qpel8_mc33_neon, export=1
        add             r1,  r1,  #1
        push            {r0, r1, r11, lr}       @ saved source is one pixel to the right
        add             r1,  r1,  r2
        sub             r1,  r1,  #1
        b               \type\()_h264_qpel8_mc11
endfunc
.endm

        h264_qpel8 put
        h264_qpel8 avg

/*
 * Exported 16x16 entry points ff_<type>_h264_qpel16_mcXY_neon for
 * type = put and type = avg.  Same register interface and structure as
 * the 8x8 family: r0 = destination, r1 = source, r2 = stride.
 *
 * r4 and r9 are saved here because the 16x16 helpers keep return
 * addresses in them.  Shared bodies live at the local labels mc01,
 * mc11, mc21 and mc12; the register list pushed before branching to
 * one must match what that body pops.
 */
.macro  h264_qpel16     type
function ff_\type\()_h264_qpel16_mc10_neon, export=1
        lowpass_const   r3
        mov             r3,  r1                 @ second source = unfiltered source
        sub             r1,  r1,  #2
        b               \type\()_h264_qpel16_h_lowpass_l2_neon
endfunc

function ff_\type\()_h264_qpel16_mc20_neon, export=1
        lowpass_const   r3
        sub             r1,  r1,  #2
        mov             r3,  r2
        b               \type\()_h264_qpel16_h_lowpass_neon
endfunc

function ff_\type\()_h264_qpel16_mc30_neon, export=1
        lowpass_const   r3
        add             r3,  r1,  #1            @ second source = source + 1 pixel
        sub             r1,  r1,  #2
        b               \type\()_h264_qpel16_h_lowpass_l2_neon
endfunc

function ff_\type\()_h264_qpel16_mc01_neon, export=1
        push            {r4, lr}
        mov             r12, r1                 @ second source = unfiltered source
\type\()_h264_qpel16_mc01:
        lowpass_const   r3
        mov             r3,  r2
        sub             r1,  r1,  r2, lsl #1
        vpush           {d8-d15}
        bl              \type\()_h264_qpel16_v_lowpass_l2_neon
        vpop            {d8-d15}
        pop             {r4, pc}
endfunc

function ff_\type\()_h264_qpel16_mc11_neon, export=1
        push            {r0, r1, r4, r11, lr}
\type\()_h264_qpel16_mc11:
        lowpass_const   r3
        mov             r11, sp
A       bic             sp,  sp,  #15
T       bic             r0,  r11, #15
T       mov             sp,  r0
        sub             sp,  sp,  #256          @ 16x16 temporary
        mov             r0,  sp
        sub             r1,  r1,  #2
        mov             r3,  #16
        vpush           {d8-d15}
        bl              put_h264_qpel16_h_lowpass_neon
        ldrd            r0,  r1,  [r11], #8     @ reload the pushed r0/r1 (explicit UAL register pair)
        mov             r3,  r2
        add             r12, sp,  #64           @ temporary, above the vpush area
        sub             r1,  r1,  r2, lsl #1
        mov             r2,  #16
        bl              \type\()_h264_qpel16_v_lowpass_l2_neon
        vpop            {d8-d15}
        mov             sp,  r11
        pop             {r4, r11, pc}
endfunc

function ff_\type\()_h264_qpel16_mc21_neon, export=1
        push            {r0, r1, r4-r5, r9-r11, lr}
\type\()_h264_qpel16_mc21:
        lowpass_const   r3
        mov             r11, sp
A       bic             sp,  sp,  #15
T       bic             r0,  r11, #15
T       mov             sp,  r0
        sub             sp,  sp,  #(16*16+16*12)        @ packed temporary + hv scratch
        sub             r1,  r1,  #2
        mov             r0,  sp
        vpush           {d8-d15}
        bl              put_h264_qpel16_h_lowpass_neon_packed
        mov             r4,  r0                 @ hv scratch follows the temporary
        ldrd            r0,  r1,  [r11], #8     @ reload the pushed r0/r1 (explicit UAL register pair)
        sub             r1,  r1,  r2, lsl #1
        sub             r1,  r1,  #2
        mov             r3,  r2
        bl              \type\()_h264_qpel16_hv_lowpass_l2_neon
        vpop            {d8-d15}
        mov             sp,  r11
        pop             {r4-r5, r9-r11, pc}
endfunc

function ff_\type\()_h264_qpel16_mc31_neon, export=1
        add             r1,  r1,  #1
        push            {r0, r1, r4, r11, lr}   @ saved source is one pixel to the right
        sub             r1,  r1,  #1
        b               \type\()_h264_qpel16_mc11
endfunc

function ff_\type\()_h264_qpel16_mc02_neon, export=1
        push            {r4, lr}
        lowpass_const   r3
        sub             r1,  r1,  r2, lsl #1
        mov             r3,  r2
        vpush           {d8-d15}
        bl              \type\()_h264_qpel16_v_lowpass_neon
        vpop            {d8-d15}
        pop             {r4, pc}
endfunc

function ff_\type\()_h264_qpel16_mc12_neon, export=1
        push            {r0, r1, r4-r5, r9-r11, lr}
\type\()_h264_qpel16_mc12:
        lowpass_const   r3
        mov             r11, sp
A       bic             sp,  sp,  #15
T       bic             r0,  r11, #15
T       mov             sp,  r0
        sub             sp,  sp,  #(16*16+16*12)        @ packed temporary + hv scratch
        sub             r1,  r1,  r2, lsl #1
        mov             r0,  sp
        mov             r3,  r2
        vpush           {d8-d15}
        bl              put_h264_qpel16_v_lowpass_neon_packed
        mov             r4,  r0                 @ hv scratch follows the temporary
        ldrd            r0,  r1,  [r11], #8     @ reload the pushed r0/r1 (explicit UAL register pair)
        sub             r1,  r1,  r3, lsl #1
        sub             r1,  r1,  #2
        mov             r2,  r3
        bl              \type\()_h264_qpel16_hv_lowpass_l2_neon
        vpop            {d8-d15}
        mov             sp,  r11
        pop             {r4-r5, r9-r11, pc}
endfunc

function ff_\type\()_h264_qpel16_mc22_neon, export=1
        push            {r4, r9-r11, lr}
        lowpass_const   r3
        mov             r11, sp
A       bic             sp,  sp,  #15
T       bic             r4,  r11, #15
T       mov             sp,  r4
        sub             r1,  r1,  r2, lsl #1
        sub             r1,  r1,  #2
        mov             r3,  r2
        sub             sp,  sp,  #(16*12)      @ hv scratch
        mov             r4,  sp
        vpush           {d8-d15}
        bl              \type\()_h264_qpel16_hv_lowpass_neon
        vpop            {d8-d15}
        mov             sp,  r11
        pop             {r4, r9-r11, pc}
endfunc

function ff_\type\()_h264_qpel16_mc32_neon, export=1
        push            {r0, r1, r4-r5, r9-r11, lr}
        add             r1,  r1,  #1
        b               \type\()_h264_qpel16_mc12
endfunc

function ff_\type\()_h264_qpel16_mc03_neon, export=1
        push            {r4, lr}
        add             r12, r1,  r2            @ second source = source + 1 row
        b               \type\()_h264_qpel16_mc01
endfunc

function ff_\type\()_h264_qpel16_mc13_neon, export=1
        push            {r0, r1, r4, r11, lr}
        add             r1,  r1,  r2
        b               \type\()_h264_qpel16_mc11
endfunc

function ff_\type\()_h264_qpel16_mc23_neon, export=1
        push            {r0, r1, r4-r5, r9-r11, lr}
        add             r1,  r1,  r2
        b               \type\()_h264_qpel16_mc21
endfunc

function ff_\type\()_h264_qpel16_mc33_neon, export=1
        add             r1,  r1,  #1
        push            {r0, r1, r4, r11, lr}   @ saved source is one pixel to the right
        add             r1,  r1,  r2
        sub             r1,  r1,  #1
        b               \type\()_h264_qpel16_mc11
endfunc
.endm

        h264_qpel16 put
        h264_qpel16 avg

@ Biweighted prediction

/*
 * Loop bodies of ff_biweight_h264_pixels_<w>_neon.  Each expands to a
 * complete loop followed by the function epilogue (pop {r4-r6, pc}),
 * so nothing placed after an expansion is reached by fall-through.
 *
 * In:   r0, r1 = the two sources (post-incremented by r2 per row),
 *       r6 = store pointer, r2 = stride, r3 = row count,
 *       r4, r5 = magnitudes of the two weights (low 8 bits are used),
 *       q8 = initial accumulator value, q9 = per-lane shift count.
 * macd / macs are the widening multiply-accumulate (or multiply-
 * subtract) instructions applied to the r0 / r1 data respectively.
 */
.macro  biweight_16     macs, macd
        vdup.8          d0,  r4
        vdup.8          d1,  r5
        vmov            q2,  q8
        vmov            q3,  q8
1:      subs            r3,  r3,  #2            @ two rows per iteration
        vld1.8          {d20-d21},[r0,:128], r2
        \macd           q2,  d0,  d20
        pld             [r0]
        \macd           q3,  d0,  d21
        vld1.8          {d22-d23},[r1,:128], r2
        \macs           q2,  d1,  d22
        pld             [r1]
        \macs           q3,  d1,  d23
        vmov            q12, q8
        vld1.8          {d28-d29},[r0,:128], r2
        vmov            q13, q8
        \macd           q12, d0,  d28
        pld             [r0]
        \macd           q13, d0,  d29
        vld1.8          {d30-d31},[r1,:128], r2
        \macs           q12, d1,  d30
        pld             [r1]
        \macs           q13, d1,  d31
        vshl.s16        q2,  q2,  q9
        vshl.s16        q3,  q3,  q9
        vqmovun.s16     d4,  q2
        vqmovun.s16     d5,  q3
        vshl.s16        q12, q12, q9
        vshl.s16        q13, q13, q9
        vqmovun.s16     d24, q12
        vqmovun.s16     d25, q13
        vmov            q3,  q8                 @ re-seed accumulators for the next pass
        vst1.8          {d4- d5}, [r6,:128], r2
        vmov            q2,  q8
        vst1.8          {d24-d25},[r6,:128], r2
        bne             1b
        pop             {r4-r6, pc}
.endm

.macro  biweight_8      macs, macd
        vdup.8          d0,  r4
        vdup.8          d1,  r5
        vmov            q1,  q8
        vmov            q10, q8
1:      subs            r3,  r3,  #2            @ two rows per iteration
        vld1.8          {d4},[r0,:64], r2
        \macd           q1,  d0,  d4
        pld             [r0]
        vld1.8          {d5},[r1,:64], r2
        \macs           q1,  d1,  d5
        pld             [r1]
        vld1.8          {d6},[r0,:64], r2
        \macd           q10, d0,  d6
        pld             [r0]
        vld1.8          {d7},[r1,:64], r2
        \macs           q10, d1,  d7
        pld             [r1]
        vshl.s16        q1,  q1,  q9
        vqmovun.s16     d2,  q1
        vshl.s16        q10, q10, q9
        vqmovun.s16     d4,  q10
        vmov            q10, q8                 @ re-seed accumulators for the next pass
        vst1.8          {d2},[r6,:64], r2
        vmov            q1,  q8
        vst1.8          {d4},[r6,:64], r2
        bne             1b
        pop             {r4-r6, pc}
.endm

.macro  biweight_4      macs, macd
        vdup.8          d0,  r4
        vdup.8          d1,  r5
        vmov            q1,  q8
        vmov            q10, q8
1:      subs            r3,  r3,  #4            @ four rows per iteration
        vld1.32         {d4[0]},[r0,:32], r2
        vld1.32         {d4[1]},[r0,:32], r2
        \macd           q1,  d0,  d4
        pld             [r0]
        vld1.32         {d5[0]},[r1,:32], r2
        vld1.32         {d5[1]},[r1,:32], r2
        \macs           q1,  d1,  d5
        pld             [r1]
        blt             2f                      @ fewer than four rows were left: finish two
        vld1.32         {d6[0]},[r0,:32], r2
        vld1.32         {d6[1]},[r0,:32], r2
        \macd           q10, d0,  d6
        pld             [r0]
        vld1.32         {d7[0]},[r1,:32], r2
        vld1.32         {d7[1]},[r1,:32], r2
        \macs           q10, d1,  d7
        pld             [r1]
        vshl.s16        q1,  q1,  q9
        vqmovun.s16     d2,  q1
        vshl.s16        q10, q10, q9
        vqmovun.s16     d4,  q10
        vmov            q10, q8
        vst1.32         {d2[0]},[r6,:32], r2
        vst1.32         {d2[1]},[r6,:32], r2
        vmov            q1,  q8
        vst1.32         {d4[0]},[r6,:32], r2
        vst1.32         {d4[1]},[r6,:32], r2
        bne             1b
        pop             {r4-r6, pc}
2:      vshl.s16        q1,  q1,  q9
        vqmovun.s16     d2,  q1
        vst1.32         {d2[0]},[r6,:32], r2
        vst1.32         {d2[1]},[r6,:32], r2
        pop             {r4-r6, pc}
.endm

/*
 * ff_biweight_h264_pixels_<w>_neon for w = 16, 8, 4.
 *
 * In:   r0 = first source, also the destination, r1 = second source,
 *       r2 = stride, r3 = number of rows,
 *       stack: shift count (log2 of the weight denominator), weight
 *       applied to the r0 data, weight applied to the r1 data, offset.
 * Result per pixel:
 *   clip_u8((((offset + 1) | 1) << shift) + w0 * a + w1 * b) >> (shift + 1))
 * q9 holds the bitwise NOT of the shift, i.e. -(shift + 1), so vshl
 * performs the right shift.
 *
 * The weights are splatted as unsigned bytes, so the code dispatches on
 * their signs: a negative weight is negated and its multiply-accumulate
 * is replaced by a multiply-subtract.  Every biweight_<w> expansion
 * ends with the function return.
 */
.macro  biweight_func   w
function ff_biweight_h264_pixels_\w\()_neon, export=1
        push            {r4-r6, lr}
        ldr             r12, [sp, #16]          @ shift count
        add             r4,  sp,  #20
        ldm             r4,  {r4-r6}            @ weight for r0 data, weight for r1 data, offset
        lsr             lr,  r4,  #31           @ sign of the first weight
        add             r6,  r6,  #1
        eors            lr,  lr,  r5,  lsr #30  @ 0: both >= 0, 1: first < 0, 2: both < 0, 3: second < 0
        orr             r6,  r6,  #1
        vdup.16         q9,  r12
        lsl             r6,  r6,  r12
        vmvn            q9,  q9
        vdup.16         q8,  r6
        mov             r6,  r0
        beq             10f
        subs            lr,  lr,  #1
        beq             20f
        subs            lr,  lr,  #1
        beq             30f
        b               40f
10:     biweight_\w     vmlal.u8, vmlal.u8
20:     rsb             r4,  r4,  #0
        biweight_\w     vmlal.u8, vmlsl.u8
30:     rsb             r4,  r4,  #0
        rsb             r5,  r5,  #0
        biweight_\w     vmlsl.u8, vmlsl.u8
40:     rsb             r5,  r5,  #0
        biweight_\w     vmlsl.u8, vmlal.u8
endfunc
.endm

        biweight_func   16
        biweight_func   8
        biweight_func   4

@ Weighted prediction

/*
 * Loop bodies of ff_weight_h264_pixels_<w>_neon.  Each expands to a
 * complete loop followed by the function epilogue (pop {r4, pc}).
 *
 * In:   r0 = load pointer, r4 = store pointer, r1 = stride,
 *       r2 = row count, r12 = weight magnitude (low 8 bits are used),
 *       q8 = offset term, q9 = per-lane rounding shift count.
 * add is the instruction that combines the offset term (first source
 * operand) with the weighted pixels (second source operand).
 */
.macro  weight_16       add
        vdup.8          d0,  r12
1:      subs            r2,  r2,  #2            @ two rows per iteration
        vld1.8          {d20-d21},[r0,:128], r1
        vmull.u8        q2,  d0,  d20
        pld             [r0]
        vmull.u8        q3,  d0,  d21
        vld1.8          {d28-d29},[r0,:128], r1
        vmull.u8        q12, d0,  d28
        pld             [r0]
        vmull.u8        q13, d0,  d29
        \add            q2,  q8,  q2
        vrshl.s16       q2,  q2,  q9
        \add            q3,  q8,  q3
        vrshl.s16       q3,  q3,  q9
        vqmovun.s16     d4,  q2
        vqmovun.s16     d5,  q3
        \add            q12, q8,  q12
        vrshl.s16       q12, q12, q9
        \add            q13, q8,  q13
        vrshl.s16       q13, q13, q9
        vqmovun.s16     d24, q12
        vqmovun.s16     d25, q13
        vst1.8          {d4- d5}, [r4,:128], r1
        vst1.8          {d24-d25},[r4,:128], r1
        bne             1b
        pop             {r4, pc}
.endm

.macro  weight_8        add
        vdup.8          d0,  r12
1:      subs            r2,  r2,  #2            @ two rows per iteration
        vld1.8          {d4},[r0,:64], r1
        vmull.u8        q1,  d0,  d4
        pld             [r0]
        vld1.8          {d6},[r0,:64], r1
        vmull.u8        q10, d0,  d6
        \add            q1,  q8,  q1
        pld             [r0]
        vrshl.s16       q1,  q1,  q9
        vqmovun.s16     d2,  q1
        \add            q10, q8,  q10
        vrshl.s16       q10, q10, q9
        vqmovun.s16     d4,  q10
        vst1.8          {d2},[r4,:64], r1
        vst1.8          {d4},[r4,:64], r1
        bne             1b
        pop             {r4, pc}
.endm

.macro  weight_4        add
        vdup.8          d0,  r12
        vmov            q1,  q8
        vmov            q10, q8
1:      subs            r2,  r2,  #4            @ four rows per iteration
        vld1.32         {d4[0]},[r0,:32], r1
        vld1.32         {d4[1]},[r0,:32], r1
        vmull.u8        q1,  d0,  d4
        pld             [r0]
        blt             2f                      @ fewer than four rows were left: finish two
        vld1.32         {d6[0]},[r0,:32], r1
        vld1.32         {d6[1]},[r0,:32], r1
        vmull.u8        q10, d0,  d6
        pld             [r0]
        \add            q1,  q8,  q1
        vrshl.s16       q1,  q1,  q9
        vqmovun.s16     d2,  q1
        \add            q10, q8,  q10
        vrshl.s16       q10, q10, q9
        vqmovun.s16     d4,  q10
        vmov            q10, q8
        vst1.32         {d2[0]},[r4,:32], r1
        vst1.32         {d2[1]},[r4,:32], r1
        vmov            q1,  q8
        vst1.32         {d4[0]},[r4,:32], r1
        vst1.32         {d4[1]},[r4,:32], r1
        bne             1b
        pop             {r4, pc}
2:      \add            q1,  q8,  q1
        vrshl.s16       q1,  q1,  q9
        vqmovun.s16     d2,  q1
        vst1.32         {d2[0]},[r4,:32], r1
        vst1.32         {d2[1]},[r4,:32], r1
        pop             {r4, pc}
.endm

/*
 * ff_weight_h264_pixels_<w>_neon for w = 16, 8, 4: in-place weighting.
 *
 * In:   r0 = block (read and written), r1 = stride, r2 = number of rows,
 *       r3 = shift count (log2 of the weight denominator),
 *       stack: weight, offset.
 * q8 = offset << shift.  For shift > 1 a halving add/subtract is used
 * and the remaining rounding shift is shift - 1; otherwise a plain
 * add/subtract with a rounding shift of shift.  A negative weight is
 * negated and the subtracting variant is selected.
 * The label 10 is defined twice; each "blt 10f" binds to the nearest
 * following definition.  Every weight_<w> expansion ends with the
 * function return.
 */
.macro  weight_func     w
function ff_weight_h264_pixels_\w\()_neon, export=1
        push            {r4, lr}
        ldr             r12, [sp, #8]           @ weight
        ldr             r4,  [sp, #12]          @ offset
        cmp             r3,  #1
        lsl             r4,  r4,  r3
        vdup.16         q8,  r4
        mov             r4,  r0
        ble             20f
        rsb             lr,  r3,  #1
        vdup.16         q9,  lr
        cmp             r12, #0
        blt             10f
        weight_\w       vhadd.s16
10:     rsb             r12, r12, #0
        weight_\w       vhsub.s16
20:     rsb             lr,  r3,  #0
        vdup.16         q9,  lr
        cmp             r12, #0
        blt             10f
        weight_\w       vadd.s16
10:     rsb             r12, r12, #0
        weight_\w       vsub.s16
endfunc
.endm

        weight_func     16
        weight_func     8
        weight_func     4