/*
 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
 * Copyright (c) 2013 Janne Grunau <janne-libav@jannau.net>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/aarch64/asm.S"
#include "neon.S"

        /* H.264 qpel MC */

// Load the two filter multipliers into v6.
// The 32-bit value (20 << 16) | 5 is built in the scratch register r and
// inserted into v6.S[0], which gives v6.H[0] = 5 and v6.H[1] = 20.
// Clobbers: r and lane S[0] of v6.
.macro  lowpass_const   r
        movz            \r,  #20, lsl #16
        movk            \r,  #5
        mov             v6.S[0], \r
.endm

// Six-tap filter over two rows of bytes.
//   r0:r1  row A, 16 source bytes (r0 = bytes 0-7, r1 = bytes 8-15)
//   r2:r3  row B, same layout
//   d0/d1  results for row A / row B
//   narrow when non-zero (default) the 16-bit sums are reduced to bytes
// For each of the 8 output positions i the 16-bit sum
//     (s[i] + s[i+5]) + v6.H[1] * (s[i+2] + s[i+3]) - v6.H[0] * (s[i+1] + s[i+4])
// is formed, so v6 must have been set up by lowpass_const (20 and 5).
// With narrow set, sqrshrun #5 rounds, shifts right by 5 and saturates to
// unsigned bytes; otherwise d0.8H/d1.8H keep the unshifted sums.
// Only bytes 0-4 of r1/r3 are read (the largest ext offset is 5).
// d0 is written only after every read of r0:r1, and d1 only after every read
// of r2:r3, which is why callers in this file may pass d0 = r0 and
// d1 = r1 or r2.
// The instructions of the two rows are interleaved; keep the order as is.
//trashes v0-v5
.macro  lowpass_8       r0,  r1,  r2,  r3,  d0,  d1,  narrow=1
        ext             v2.8B,      \r0\().8B,  \r1\().8B,  #2
        ext             v3.8B,      \r0\().8B,  \r1\().8B,  #3
        uaddl           v2.8H,      v2.8B,      v3.8B
        ext             v4.8B,      \r0\().8B,  \r1\().8B,  #1
        ext             v5.8B,      \r0\().8B,  \r1\().8B,  #4
        uaddl           v4.8H,      v4.8B,      v5.8B
        ext             v1.8B,      \r0\().8B,  \r1\().8B,  #5
        uaddl           \d0\().8H,  \r0\().8B,  v1.8B
        ext             v0.8B,      \r2\().8B,  \r3\().8B,  #2
        mla             \d0\().8H,  v2.8H,      v6.H[1]
        ext             v1.8B,      \r2\().8B,  \r3\().8B,  #3
        uaddl           v0.8H,      v0.8B,      v1.8B
        ext             v1.8B,      \r2\().8B,  \r3\().8B,  #1
        mls             \d0\().8H,  v4.8H,      v6.H[0]
        ext             v3.8B,      \r2\().8B,  \r3\().8B,  #4
        uaddl           v1.8H,      v1.8B,      v3.8B
        ext             v2.8B,      \r2\().8B,  \r3\().8B,  #5
        uaddl           \d1\().8H,  \r2\().8B,  v2.8B
        mla             \d1\().8H,  v0.8H,      v6.H[1]
        mls             \d1\().8H,  v1.8H,      v6.H[0]
  .if \narrow
        sqrshrun        \d0\().8B,  \d0\().8H,  #5
        sqrshrun        \d1\().8B,  \d1\().8H,  #5
  .endif
.endm

// Six-tap filter over two rows, each row held in ONE 16-byte register.
// The taps come from the low 8 bytes of the register rotated by 1..5 bytes
// (ext of the register with itself), so bytes 0-12 of each row are read.
// The unshifted 16-bit sums
//     (s[i] + s[i+5]) + v6.H[1] * (s[i+2] + s[i+3]) - v6.H[0] * (s[i+1] + s[i+4])
// replace r0.8H and r1.8H in place; nothing is narrowed here.
// Expects v6 as set up by lowpass_const.
//trashes v0-v5, v7, v30-v31
.macro  lowpass_8H      r0,  r1
        ext             v0.16B,     \r0\().16B, \r0\().16B, #2
        ext             v1.16B,     \r0\().16B, \r0\().16B, #3
        uaddl           v0.8H,      v0.8B,      v1.8B
        ext             v2.16B,     \r0\().16B, \r0\().16B, #1
        ext             v3.16B,     \r0\().16B, \r0\().16B, #4
        uaddl           v2.8H,      v2.8B,      v3.8B
        ext             v30.16B,    \r0\().16B, \r0\().16B, #5
        uaddl           \r0\().8H,  \r0\().8B,  v30.8B
        ext             v4.16B,     \r1\().16B, \r1\().16B, #2
        mla             \r0\().8H,  v0.8H,      v6.H[1]
        ext             v5.16B,     \r1\().16B, \r1\().16B, #3
        uaddl           v4.8H,      v4.8B,      v5.8B
        ext             v7.16B,     \r1\().16B, \r1\().16B, #1
        mls             \r0\().8H,  v2.8H,      v6.H[0]
        ext             v0.16B,     \r1\().16B, \r1\().16B, #4
        uaddl           v7.8H,      v7.8B,      v0.8B
        ext             v31.16B,    \r1\().16B, \r1\().16B, #5
        uaddl           \r1\().8H,  \r1\().8B,  v31.8B
        mla             \r1\().8H,  v4.8H,      v6.H[1]
        mls             \r1\().8H,  v7.8H,      v6.H[0]
.endm

// Single-row variant of lowpass_8: filters the 16 bytes in r0:r1 into d0.
// With narrow set (default) the result is rounded, shifted right by 5 and
// saturated to unsigned bytes; otherwise d0.8H keeps the 16-bit sums.
// Expects v6 as set up by lowpass_const.
// trashes v2-v5, v30
.macro  lowpass_8_1     r0,  r1,  d0,  narrow=1
        ext             v2.8B,      \r0\().8B,  \r1\().8B,  #2
        ext             v3.8B,      \r0\().8B,  \r1\().8B,  #3
        uaddl           v2.8H,      v2.8B,      v3.8B
        ext             v4.8B,      \r0\().8B,  \r1\().8B,  #1
        ext             v5.8B,      \r0\().8B,  \r1\().8B,  #4
        uaddl           v4.8H,      v4.8B,      v5.8B
        ext             v30.8B,     \r0\().8B,  \r1\().8B,  #5
        uaddl           \d0\().8H,  \r0\().8B,  v30.8B
        mla             \d0\().8H,  v2.8H,      v6.H[1]
        mls             \d0\().8H,  v4.8H,      v6.H[0]
  .if \narrow
        sqrshrun        \d0\().8B,  \d0\().8H,  #5
  .endif
.endm

// Second filter pass on signed 16-bit intermediates, computed in 32 bits.
//   r0:r1  16 halfwords of input (r0 = elements 0-7, r1 = elements 8-15);
//          only elements 0-4 of r1 are read, and r1 is overwritten
//   r2     receives the 8 result bytes (may be the same register as r0)
// For each output i:  c = x[i] + x[i+5],  a = x[i+2] + x[i+3],
// b = x[i+1] + x[i+4];  result = (20 * a + c - 5 * b), rounded and shifted
// right by 10 (rshrn), then saturated to an unsigned byte (sqxtun).
// The multiplications are done with shifts: 20 * a = (a << 4) + (a << 2) and
// 5 * b = b + (b << 2).
// NOTE: v6 is used as scratch, so the constants loaded by lowpass_const are
// destroyed by this macro.
// trashes v0-v7
.macro  lowpass_8.16    r0,  r1,  r2
        ext             v1.16B,     \r0\().16B, \r1\().16B, #4
        ext             v0.16B,     \r0\().16B, \r1\().16B, #6
        saddl           v5.4S,      v1.4H,      v0.4H
        ext             v2.16B,     \r0\().16B, \r1\().16B, #2
        saddl2          v1.4S,      v1.8H,      v0.8H
        ext             v3.16B,     \r0\().16B, \r1\().16B, #8
        saddl           v6.4S,      v2.4H,      v3.4H
        ext             \r1\().16B, \r0\().16B, \r1\().16B, #10
        saddl2          v2.4S,      v2.8H,      v3.8H
        saddl           v0.4S,      \r0\().4H,  \r1\().4H
        saddl2          v4.4S,      \r0\().8H,  \r1\().8H

        // low half: v5 = 20 * a, v6 = 5 * b
        shl             v3.4S,  v5.4S,  #4
        shl             v5.4S,  v5.4S,  #2
        shl             v7.4S,  v6.4S,  #2
        add             v5.4S,  v5.4S,  v3.4S
        add             v6.4S,  v6.4S,  v7.4S

        // high half: v1 = 20 * a, v2 = 5 * b
        shl             v3.4S,  v1.4S,  #4
        shl             v1.4S,  v1.4S,  #2
        shl             v7.4S,  v2.4S,  #2
        add             v1.4S,  v1.4S,  v3.4S
        add             v2.4S,  v2.4S,  v7.4S

        add             v5.4S,  v5.4S,  v0.4S
        sub             v5.4S,  v5.4S,  v6.4S

        add             v1.4S,  v1.4S,  v4.4S
        sub             v1.4S,  v1.4S,  v2.4S

        rshrn           v5.4H,  v5.4S,  #10
        rshrn2          v5.8H,  v1.4S,  #10

        sqxtun          \r2\().8B,  v5.8H
.endm

// Horizontal filter of a 16-wide, 16-row area into a packed buffer.
// Calls the 8-wide put helper twice with x12 = 16 rows and an output stride
// of x3 = 8.  Between the calls x1 is moved back 16 rows (x2 * 16) and 8
// bytes to the right; x0 is NOT rewound, so the right half is written
// directly after the 128 bytes of the left half.
// In:  x0 = output, x1 = source, x2 = source stride.
// x4 holds the return address; the second call is a tail call.
function put_h264_qpel16_h_lowpass_neon_packed
        mov             x4,  x30
        mov             x12, #16
        mov             x3,  #8
        bl              put_h264_qpel8_h_lowpass_neon
        sub             x1,  x1,  x2, lsl #4
        add             x1,  x1,  #8
        mov             x12, #16
        mov             x30, x4
        b               put_h264_qpel8_h_lowpass_neon
endfunc

// Generates <type>_h264_qpel16_h_lowpass_neon and
// <type>_h264_qpel8_h_lowpass_neon for type = put or avg.
// Registers: x0 = destination (stride x3), x1 = source (stride x2),
// x12 = row count for the 8-wide function.
.macro  h264_qpel_h_lowpass type
// 16-wide: filter the left 8 columns for 16 rows, rewind both pointers by
// 16 rows, step 8 bytes right, then continue straight into the 8-wide
// function below for the right half.  There is deliberately no branch before
// endfunc: execution falls through.
// NOTE(review): this relies on endfunc/function emitting no instructions or
// padding between the two bodies - confirm against asm.S.
function \type\()_h264_qpel16_h_lowpass_neon
        mov             x13, x30
        mov             x12, #16
        bl              \type\()_h264_qpel8_h_lowpass_neon
        sub             x0,  x0,  x3, lsl #4
        sub             x1,  x1,  x2, lsl #4
        add             x0,  x0,  #8
        add             x1,  x1,  #8
        mov             x12, #16
        mov             x30, x13
endfunc

// 8-wide: two rows per iteration.  Each row is 16 source bytes, filtered to
// 8 output bytes.  x12 is decremented by 2 per iteration and the loop ends
// when it reaches zero, so it must be a positive even number on entry.
// For avg the result is combined with the bytes already at the destination
// using urhadd (rounding halving add).
function \type\()_h264_qpel8_h_lowpass_neon
1:      ld1             {v28.8B, v29.8B}, [x1], x2
        ld1             {v16.8B, v17.8B}, [x1], x2
        subs            x12, x12, #2
        lowpass_8       v28, v29, v16, v17, v28, v16
  .ifc \type,avg
        ld1             {v2.8B},    [x0], x3
        urhadd          v28.8B, v28.8B, v2.8B
        ld1             {v3.8B},    [x0]
        urhadd          v16.8B, v16.8B, v3.8B
        sub             x0,  x0,  x3            // back to the first of the two rows
  .endif
        st1             {v28.8B},   [x0], x3
        st1             {v16.8B},   [x0], x3
        b.ne            1b                      // flags from subs; ld1/st1/NEON ops leave them alone
        ret
endfunc
.endm

        h264_qpel_h_lowpass put
        h264_qpel_h_lowpass avg

// Generates <type>_h264_qpel16_h_lowpass_l2_neon and
// <type>_h264_qpel8_h_lowpass_l2_neon for type = put or avg.
// Like h264_qpel_h_lowpass, but the filtered row is additionally averaged
// (urhadd) with a second source read from x3.
// Registers: x0 = destination, x1 = filter source, x3 = second source,
// x2 = stride used for all three pointers, x12 = row count.
.macro  h264_qpel_h_lowpass_l2 type
// 16-wide: left half, rewind all three pointers by 16 rows, step 8 bytes
// right, then fall through into the 8-wide function below (no branch before
// endfunc).
// NOTE(review): relies on endfunc/function emitting no instructions or
// padding between the two bodies - confirm against asm.S.
function \type\()_h264_qpel16_h_lowpass_l2_neon
        mov             x13, x30
        mov             x12, #16
        bl              \type\()_h264_qpel8_h_lowpass_l2_neon
        sub             x0,  x0,  x2, lsl #4
        sub             x1,  x1,  x2, lsl #4
        sub             x3,  x3,  x2, lsl #4
        add             x0,  x0,  #8
        add             x1,  x1,  #8
        add             x3,  x3,  #8
        mov             x12, #16
        mov             x30, x13
endfunc

// 8-wide: two rows per iteration; x12 must be a positive even number.
function \type\()_h264_qpel8_h_lowpass_l2_neon
1:      ld1             {v26.8B, v27.8B}, [x1], x2
        ld1             {v16.8B, v17.8B}, [x1], x2
        ld1             {v28.8B},   [x3], x2
        ld1             {v29.8B},   [x3], x2
        subs            x12, x12, #2
        lowpass_8       v26, v27, v16, v17, v26, v27
        urhadd          v26.8B, v26.8B, v28.8B
        urhadd          v27.8B, v27.8B, v29.8B
  .ifc \type,avg
        ld1             {v2.8B},    [x0], x2
        urhadd          v26.8B, v26.8B, v2.8B
        ld1             {v3.8B},    [x0]
        urhadd          v27.8B, v27.8B, v3.8B
        sub             x0,  x0,  x2            // back to the first of the two rows
  .endif
        st1             {v26.8B},   [x0], x2
        st1             {v27.8B},   [x0], x2
        b.ne            1b
        ret
endfunc
.endm

        h264_qpel_h_lowpass_l2 put
        h264_qpel_h_lowpass_l2 avg

// Vertical filter of a 16-wide, 16-row area into a packed buffer.
// Four calls of the 8x8 put helper with an output stride of x2 = 8, in the
// order top-left, bottom-left, top-right, bottom-right.  x0 is never rewound,
// so the four 8x8 results are stored back to back.
// Each helper call leaves x1 12 rows further down; stepping back 4 rows
// (x3 * 4) gives a net advance of 8 rows.  Before the right column x1 is
// moved back 20 rows in total and 8 bytes to the right.
// In:  x0 = output, x1 = source, x3 = source stride.
// x4 holds the return address; the last call is a tail call.
function put_h264_qpel16_v_lowpass_neon_packed
        mov             x4,  x30
        mov             x2,  #8
        bl              put_h264_qpel8_v_lowpass_neon
        sub             x1,  x1,  x3, lsl #2
        bl              put_h264_qpel8_v_lowpass_neon
        sub             x1,  x1,  x3, lsl #4
        sub             x1,  x1,  x3, lsl #2
        add             x1,  x1,  #8
        bl              put_h264_qpel8_v_lowpass_neon
        sub             x1,  x1,  x3, lsl #2
        mov             x30, x4
        b               put_h264_qpel8_v_lowpass_neon
endfunc

// Generates <type>_h264_qpel16_v_lowpass_neon and
// <type>_h264_qpel8_v_lowpass_neon for type = put or avg.
// Registers: x0 = destination (stride x2), x1 = source (stride x3).
.macro  h264_qpel_v_lowpass type
// 16x16: three explicit 8x8 calls (top-left, bottom-left, top-right), then
// fall through into the 8x8 function below for bottom-right (no branch
// before endfunc).  x0 is rewound by 16 rows only before the right column;
// within a column the helper itself leaves x0 8 rows further down.
// NOTE(review): relies on endfunc/function emitting no instructions or
// padding between the two bodies - confirm against asm.S.
function \type\()_h264_qpel16_v_lowpass_neon
        mov             x4,  x30
        bl              \type\()_h264_qpel8_v_lowpass_neon
        sub             x1,  x1,  x3, lsl #2
        bl              \type\()_h264_qpel8_v_lowpass_neon
        sub             x0,  x0,  x2, lsl #4
        add             x0,  x0,  #8
        sub             x1,  x1,  x3, lsl #4
        sub             x1,  x1,  x3, lsl #2
        add             x1,  x1,  #8
        bl              \type\()_h264_qpel8_v_lowpass_neon
        sub             x1,  x1,  x3, lsl #2
        mov             x30, x4
endfunc

// 8x8: loads 13 rows of 8 bytes, transposes them so that the column
// direction lies along the registers, applies the row filter lowpass_8, and
// transposes the 8x8 result back before storing.
// x1 ends 12 rows below its entry value (the last load has no post-increment).
function \type\()_h264_qpel8_v_lowpass_neon
        ld1             {v16.8B}, [x1], x3
        ld1             {v18.8B}, [x1], x3
        ld1             {v20.8B}, [x1], x3
        ld1             {v22.8B}, [x1], x3
        ld1             {v24.8B}, [x1], x3
        ld1             {v26.8B}, [x1], x3
        ld1             {v28.8B}, [x1], x3
        ld1             {v30.8B}, [x1], x3
        ld1             {v17.8B}, [x1], x3
        ld1             {v19.8B}, [x1], x3
        ld1             {v21.8B}, [x1], x3
        ld1             {v23.8B}, [x1], x3
        ld1             {v25.8B}, [x1]

        // v27/v29/v31 are not loaded.  Assuming transpose_8x8B (defined
        // outside this file) is a plain 8x8 byte transpose, their contents
        // end up in bytes 5-7 of the odd registers, which lowpass_8 never
        // reads - TODO confirm against neon.S.
        transpose_8x8B  v16, v18, v20, v22, v24, v26, v28, v30, v0,  v1
        transpose_8x8B  v17, v19, v21, v23, v25, v27, v29, v31, v0,  v1
        lowpass_8       v16, v17, v18, v19, v16, v17
        lowpass_8       v20, v21, v22, v23, v18, v19
        lowpass_8       v24, v25, v26, v27, v20, v21
        lowpass_8       v28, v29, v30, v31, v22, v23
        transpose_8x8B  v16, v17, v18, v19, v20, v21, v22, v23, v0,  v1

  .ifc \type,avg
        ld1             {v24.8B}, [x0], x2
        urhadd          v16.8B, v16.8B, v24.8B
        ld1             {v25.8B}, [x0], x2
        urhadd          v17.8B, v17.8B, v25.8B
        ld1             {v26.8B}, [x0], x2
        urhadd          v18.8B, v18.8B, v26.8B
        ld1             {v27.8B}, [x0], x2
        urhadd          v19.8B, v19.8B, v27.8B
        ld1             {v28.8B}, [x0], x2
        urhadd          v20.8B, v20.8B, v28.8B
        ld1             {v29.8B}, [x0], x2
        urhadd          v21.8B, v21.8B, v29.8B
        ld1             {v30.8B}, [x0], x2
        urhadd          v22.8B, v22.8B, v30.8B
        ld1             {v31.8B}, [x0], x2
        urhadd          v23.8B, v23.8B, v31.8B
        sub             x0,  x0,  x2, lsl #3    // back to the first of the 8 rows
  .endif

        st1             {v16.8B}, [x0], x2
        st1             {v17.8B}, [x0], x2
        st1             {v18.8B}, [x0], x2
        st1             {v19.8B}, [x0], x2
        st1             {v20.8B}, [x0], x2
        st1             {v21.8B}, [x0], x2
        st1             {v22.8B}, [x0], x2
        st1             {v23.8B}, [x0], x2

        ret
endfunc
.endm

        h264_qpel_v_lowpass put
        h264_qpel_v_lowpass avg

// Generates <type>_h264_qpel16_v_lowpass_l2_neon and
// <type>_h264_qpel8_v_lowpass_l2_neon for type = put or avg.
// Like h264_qpel_v_lowpass, but the filtered block is additionally averaged
// (urhadd) with a second source.
// Registers: x0 = destination (stride x3), x1 = filter source (stride x3),
// x12 = second source (stride x2).
.macro  h264_qpel_v_lowpass_l2 type
// 16x16: top-left, bottom-left, top-right, then fall through into the 8x8
// function below for bottom-right (no branch before endfunc).  x0 and x12
// are rewound by 16 rows of their respective strides before the right column.
// NOTE(review): relies on endfunc/function emitting no instructions or
// padding between the two bodies - confirm against asm.S.
function \type\()_h264_qpel16_v_lowpass_l2_neon
        mov             x4,  x30
        bl              \type\()_h264_qpel8_v_lowpass_l2_neon
        sub             x1,  x1,  x3, lsl #2
        bl              \type\()_h264_qpel8_v_lowpass_l2_neon
        sub             x0,  x0,  x3, lsl #4
        sub             x12, x12, x2, lsl #4
        add             x0,  x0,  #8
        add             x12, x12, #8
        sub             x1,  x1,  x3, lsl #4
        sub             x1,  x1,  x3, lsl #2
        add             x1,  x1,  #8
        bl              \type\()_h264_qpel8_v_lowpass_l2_neon
        sub             x1,  x1,  x3, lsl #2
        mov             x30, x4
endfunc

// 8x8: same load / transpose / filter / transpose sequence as the plain
// vertical helper, followed by the average with 8 rows read from x12.
function \type\()_h264_qpel8_v_lowpass_l2_neon
        ld1             {v16.8B}, [x1], x3
        ld1             {v18.8B}, [x1], x3
        ld1             {v20.8B}, [x1], x3
        ld1             {v22.8B}, [x1], x3
        ld1             {v24.8B}, [x1], x3
        ld1             {v26.8B}, [x1], x3
        ld1             {v28.8B}, [x1], x3
        ld1             {v30.8B}, [x1], x3
        ld1             {v17.8B}, [x1], x3
        ld1             {v19.8B}, [x1], x3
        ld1             {v21.8B}, [x1], x3
        ld1             {v23.8B}, [x1], x3
        ld1             {v25.8B}, [x1]

        transpose_8x8B  v16, v18, v20, v22, v24, v26, v28, v30, v0,  v1
        transpose_8x8B  v17, v19, v21, v23, v25, v27, v29, v31, v0,  v1
        lowpass_8       v16, v17, v18, v19, v16, v17
        lowpass_8       v20, v21, v22, v23, v18, v19
        lowpass_8       v24, v25, v26, v27, v20, v21
        lowpass_8       v28, v29, v30, v31, v22, v23
        transpose_8x8B  v16, v17, v18, v19, v20, v21, v22, v23, v0,  v1

        ld1             {v24.8B}, [x12], x2
        ld1             {v25.8B}, [x12], x2
        ld1             {v26.8B}, [x12], x2
        ld1             {v27.8B}, [x12], x2
        ld1             {v28.8B}, [x12], x2
        urhadd          v16.8B, v24.8B, v16.8B
        urhadd          v17.8B, v25.8B, v17.8B
        ld1             {v29.8B}, [x12], x2
        urhadd          v18.8B, v26.8B, v18.8B
        urhadd          v19.8B, v27.8B, v19.8B
        ld1             {v30.8B}, [x12], x2
        urhadd          v20.8B, v28.8B, v20.8B
        urhadd          v21.8B, v29.8B, v21.8B
        ld1             {v31.8B}, [x12], x2
        urhadd          v22.8B, v30.8B, v22.8B
        urhadd          v23.8B, v31.8B, v23.8B

  .ifc \type,avg
        ld1             {v24.8B}, [x0], x3
        urhadd          v16.8B, v16.8B, v24.8B
        ld1             {v25.8B}, [x0], x3
        urhadd          v17.8B, v17.8B, v25.8B
        ld1             {v26.8B}, [x0], x3
        urhadd          v18.8B, v18.8B, v26.8B
        ld1             {v27.8B}, [x0], x3
        urhadd          v19.8B, v19.8B, v27.8B
        ld1             {v28.8B}, [x0], x3
        urhadd          v20.8B, v20.8B, v28.8B
        ld1             {v29.8B}, [x0], x3
        urhadd          v21.8B, v21.8B, v29.8B
        ld1             {v30.8B}, [x0], x3
        urhadd          v22.8B, v22.8B, v30.8B
        ld1             {v31.8B}, [x0], x3
        urhadd          v23.8B, v23.8B, v31.8B
        sub             x0,  x0,  x3, lsl #3    // back to the first of the 8 rows
  .endif

        st1             {v16.8B}, [x0], x3
        st1             {v17.8B}, [x0], x3
        st1             {v18.8B}, [x0], x3
        st1             {v19.8B}, [x0], x3
        st1             {v20.8B}, [x0], x3
        st1             {v21.8B}, [x0], x3
        st1             {v22.8B}, [x0], x3
        st1             {v23.8B}, [x0], x3

        ret
endfunc
.endm

        h264_qpel_v_lowpass_l2 put
        h264_qpel_v_lowpass_l2 avg

// Shared core of the 8x8 two-dimensional filter.
// Loads 13 rows of 16 bytes from x1 (stride x3), filters every row
// horizontally to 16-bit intermediates (lowpass_8H), transposes, filters
// again in 32-bit precision (lowpass_8.16) and transposes back.
// Out: the 8 result rows in the low 8 bytes of v16-v23.  Nothing is stored.
// x1 ends 12 rows below its entry value.
// Clobbers x12 (used as scratch by lowpass_const) and v0-v7, v16-v31.
// v6 is reloaded here on every call because lowpass_8.16 destroys it.
function put_h264_qpel8_hv_lowpass_neon_top
        lowpass_const   w12
        ld1             {v16.8H}, [x1], x3
        ld1             {v17.8H}, [x1], x3
        ld1             {v18.8H}, [x1], x3
        ld1             {v19.8H}, [x1], x3
        ld1             {v20.8H}, [x1], x3
        ld1             {v21.8H}, [x1], x3
        ld1             {v22.8H}, [x1], x3
        ld1             {v23.8H}, [x1], x3
        ld1             {v24.8H}, [x1], x3
        ld1             {v25.8H}, [x1], x3
        ld1             {v26.8H}, [x1], x3
        ld1             {v27.8H}, [x1], x3
        ld1             {v28.8H}, [x1]
        lowpass_8H      v16, v17
        lowpass_8H      v18, v19
        lowpass_8H      v20, v21
        lowpass_8H      v22, v23
        lowpass_8H      v24, v25
        lowpass_8H      v26, v27
        // v29 was never loaded; its result and the scratch values left in
        // v30/v31 only reach elements 5-7 of the second lowpass_8.16 operand
        // (assuming transpose_8x8H is a plain 8x8 halfword transpose - TODO
        // confirm), and lowpass_8.16 reads elements 0-4 only.
        lowpass_8H      v28, v29

        transpose_8x8H  v16, v17, v18, v19, v20, v21, v22, v23, v0,  v1
        transpose_8x8H  v24, v25, v26, v27, v28, v29, v30, v31, v0,  v1

        lowpass_8.16    v16, v24, v16
        lowpass_8.16    v17, v25, v17

        lowpass_8.16    v18, v26, v18
        lowpass_8.16    v19, v27, v19

        lowpass_8.16    v20, v28, v20
        lowpass_8.16    v21, v29, v21

        lowpass_8.16    v22, v30, v22
        lowpass_8.16    v23, v31, v23

        transpose_8x8B  v16, v17, v18, v19, v20, v21, v22, v23, v0,  v1

        ret
endfunc

// Generates <type>_h264_qpel8_hv_lowpass_neon for type = put or avg.
// Runs the shared core, optionally averages with the destination (avg) and
// stores 8 rows of 8 bytes.
// Registers: x0 = destination (stride x2), x1 = source (stride x3).
// x10 holds the return address across the call to the core.
.macro  h264_qpel8_hv_lowpass type
function \type\()_h264_qpel8_hv_lowpass_neon
        mov             x10, x30
        bl              put_h264_qpel8_hv_lowpass_neon_top
  .ifc \type,avg
        ld1             {v0.8B},  [x0], x2
        urhadd          v16.8B, v16.8B, v0.8B
        ld1             {v1.8B},  [x0], x2
        urhadd          v17.8B, v17.8B, v1.8B
        ld1             {v2.8B},  [x0], x2
        urhadd          v18.8B, v18.8B, v2.8B
        ld1             {v3.8B},  [x0], x2
        urhadd          v19.8B, v19.8B, v3.8B
        ld1             {v4.8B},  [x0], x2
        urhadd          v20.8B, v20.8B, v4.8B
        ld1             {v5.8B},  [x0], x2
        urhadd          v21.8B, v21.8B, v5.8B
        ld1             {v6.8B},  [x0], x2
        urhadd          v22.8B, v22.8B, v6.8B
        ld1             {v7.8B},  [x0], x2
        urhadd          v23.8B, v23.8B, v7.8B
        sub             x0,  x0,  x2, lsl #3    // back to the first of the 8 rows
  .endif

        st1             {v16.8B}, [x0], x2
        st1             {v17.8B}, [x0], x2
        st1             {v18.8B}, [x0], x2
        st1             {v19.8B}, [x0], x2
        st1             {v20.8B}, [x0], x2
        st1             {v21.8B}, [x0], x2
        st1             {v22.8B}, [x0], x2
        st1             {v23.8B}, [x0], x2

        ret             x10
endfunc
.endm

        h264_qpel8_hv_lowpass put
        h264_qpel8_hv_lowpass avg

// Generates <type>_h264_qpel8_hv_lowpass_l2_neon for type = put or avg.
// Runs the shared core and averages the result with a packed 8x8 block read
// from x2 (64 contiguous bytes, x2 advances by 64), then optionally with the
// destination (avg), and stores 8 rows of 8 bytes.
// Registers: x0 = destination (stride x3), x1 = source (stride x3),
// x2 = packed second source.  x10 holds the return address.
.macro  h264_qpel8_hv_lowpass_l2 type
function \type\()_h264_qpel8_hv_lowpass_l2_neon
        mov             x10, x30
        bl              put_h264_qpel8_hv_lowpass_neon_top

        ld1             {v0.8B, v1.8B},  [x2], #16
        ld1             {v2.8B, v3.8B},  [x2], #16
        urhadd          v0.8B,  v0.8B,  v16.8B
        urhadd          v1.8B,  v1.8B,  v17.8B
        ld1             {v4.8B, v5.8B},  [x2], #16
        urhadd          v2.8B,  v2.8B,  v18.8B
        urhadd          v3.8B,  v3.8B,  v19.8B
        ld1             {v6.8B, v7.8B},  [x2], #16
        urhadd          v4.8B,  v4.8B,  v20.8B
        urhadd          v5.8B,  v5.8B,  v21.8B
        urhadd          v6.8B,  v6.8B,  v22.8B
        urhadd          v7.8B,  v7.8B,  v23.8B
  .ifc \type,avg
        ld1             {v16.8B}, [x0], x3
        urhadd          v0.8B,  v0.8B,  v16.8B
        ld1             {v17.8B}, [x0], x3
        urhadd          v1.8B,  v1.8B,  v17.8B
        ld1             {v18.8B}, [x0], x3
        urhadd          v2.8B,  v2.8B,  v18.8B
        ld1             {v19.8B}, [x0], x3
        urhadd          v3.8B,  v3.8B,  v19.8B
        ld1             {v20.8B}, [x0], x3
        urhadd          v4.8B,  v4.8B,  v20.8B
        ld1             {v21.8B}, [x0], x3
        urhadd          v5.8B,  v5.8B,  v21.8B
        ld1             {v22.8B}, [x0], x3
        urhadd          v6.8B,  v6.8B,  v22.8B
        ld1             {v23.8B}, [x0], x3
        urhadd          v7.8B,  v7.8B,  v23.8B
        sub             x0,  x0,  x3, lsl #3    // back to the first of the 8 rows
  .endif
        st1             {v0.8B},  [x0], x3
        st1             {v1.8B},  [x0], x3
        st1             {v2.8B},  [x0], x3
        st1             {v3.8B},  [x0], x3
        st1             {v4.8B},  [x0], x3
        st1             {v5.8B},  [x0], x3
        st1             {v6.8B},  [x0], x3
        st1             {v7.8B},  [x0], x3

        ret             x10
endfunc
.endm

        h264_qpel8_hv_lowpass_l2 put
        h264_qpel8_hv_lowpass_l2 avg

// Generates the 16x16 two-dimensional filters for type = put or avg by
// running the 8x8 version four times: top-left, bottom-left, top-right,
// bottom-right.  x13 holds the return address; the last call is a tail call.
// Each 8x8 call leaves x1 12 rows and x0 8 rows further down.
.macro  h264_qpel16_hv type
// Registers: x0 = destination (stride x2), x1 = source (stride x3).
function \type\()_h264_qpel16_hv_lowpass_neon
        mov             x13, x30
        bl              \type\()_h264_qpel8_hv_lowpass_neon
        sub             x1,  x1,  x3, lsl #2
        bl              \type\()_h264_qpel8_hv_lowpass_neon
        sub             x1,  x1,  x3, lsl #4
        sub             x1,  x1,  x3, lsl #2
        add             x1,  x1,  #8
        sub             x0,  x0,  x2, lsl #4
        add             x0,  x0,  #8
        bl              \type\()_h264_qpel8_hv_lowpass_neon
        sub             x1,  x1,  x3, lsl #2
        mov             x30, x13
        b               \type\()_h264_qpel8_hv_lowpass_neon
endfunc

// Registers: x0 = destination (stride x3), x1 = source (stride x3),
// x4 = address just past a 256-byte packed second source; x2 is set to its
// start and then consumed 64 bytes per 8x8 call, in the call order above.
function \type\()_h264_qpel16_hv_lowpass_l2_neon
        mov             x13, x30
        sub             x2,  x4,  #256
        bl              \type\()_h264_qpel8_hv_lowpass_l2_neon
        sub             x1,  x1,  x3, lsl #2
        bl              \type\()_h264_qpel8_hv_lowpass_l2_neon
        sub             x1,  x1,  x3, lsl #4
        sub             x1,  x1,  x3, lsl #2
        add             x1,  x1,  #8
        sub             x0,  x0,  x3, lsl #4
        add             x0,  x0,  #8
        bl              \type\()_h264_qpel8_hv_lowpass_l2_neon
        sub             x1,  x1,  x3, lsl #2
        mov             x30, x13
        b               \type\()_h264_qpel8_hv_lowpass_l2_neon
endfunc
.endm

        h264_qpel16_hv put
        h264_qpel16_hv avg

// Exported 8x8 entry points ff_<type>_h264_qpel8_mcXY_neon, type = put or avg.
// From the way the registers are used, the arguments are x0 = destination,
// x1 = source and x2 = stride, shared by source and destination
// (NOTE(review): confirm against the C prototype, which is not in this file).
// Conventions used below:
//   x14     return address for entry points that make calls
//   x8, x9  saved destination / source pointer
//   x11     saved sp while a temporary buffer lives on the stack
// Several entry points only set up registers and branch to the shared tail
// label of another entry point (<type>_h264_qpel8_mc01/mc11/mc12/mc21).
.macro  h264_qpel8 type
// horizontal filter averaged with the source itself (x3 = x1)
function ff_\type\()_h264_qpel8_mc10_neon, export=1
        lowpass_const   w3
        mov             x3,  x1
        sub             x1,  x1,  #2
        mov             x12, #8
        b               \type\()_h264_qpel8_h_lowpass_l2_neon
endfunc

// horizontal filter only
function ff_\type\()_h264_qpel8_mc20_neon, export=1
        lowpass_const   w3
        sub             x1,  x1,  #2
        mov             x3,  x2
        mov             x12, #8
        b               \type\()_h264_qpel8_h_lowpass_neon
endfunc

// horizontal filter averaged with the source one byte to the right
function ff_\type\()_h264_qpel8_mc30_neon, export=1
        lowpass_const   w3
        add             x3,  x1,  #1
        sub             x1,  x1,  #2
        mov             x12, #8
        b               \type\()_h264_qpel8_h_lowpass_l2_neon
endfunc

// vertical filter averaged with the source itself (x12 = x1)
function ff_\type\()_h264_qpel8_mc01_neon, export=1
        mov             x14, x30
        mov             x12, x1
\type\()_h264_qpel8_mc01:
        lowpass_const   w3
        mov             x3,  x2
        sub             x1,  x1,  x2, lsl #1
        bl              \type\()_h264_qpel8_v_lowpass_l2_neon
        ret             x14
endfunc

// Horizontal filter into a 64-byte stack buffer (stride 8), then vertical
// filter of the saved source x9 averaged with that buffer.
// Shared tail: x1 = start of the rows for the horizontal pass, x9 = start of
// the columns for the vertical pass.
function ff_\type\()_h264_qpel8_mc11_neon, export=1
        mov             x14, x30
        mov             x8,  x0
        mov             x9,  x1
\type\()_h264_qpel8_mc11:
        lowpass_const   w3
        mov             x11, sp
        sub             sp,  sp,  #64
        mov             x0,  sp
        sub             x1,  x1,  #2
        mov             x3,  #8
        mov             x12, #8
        bl              put_h264_qpel8_h_lowpass_neon
        mov             x0,  x8
        mov             x3,  x2
        mov             x12, sp
        sub             x1,  x9,  x2, lsl #1
        mov             x2,  #8
        bl              \type\()_h264_qpel8_v_lowpass_l2_neon
        mov             sp,  x11
        ret             x14
endfunc

// Horizontal filter into the stack buffer, then two-dimensional filter
// averaged with that buffer.  After the first pass x0 points 64 bytes past
// the buffer start, hence x2 = x4 - 64.
function ff_\type\()_h264_qpel8_mc21_neon, export=1
        mov             x14, x30
        mov             x8,  x0
        mov             x9,  x1
\type\()_h264_qpel8_mc21:
        lowpass_const   w3
        mov             x11, sp
        sub             sp,  sp,  #(8*8+16*12)
        sub             x1,  x1,  #2
        mov             x3,  #8
        mov             x0,  sp
        mov             x12, #8
        bl              put_h264_qpel8_h_lowpass_neon
        mov             x4,  x0
        mov             x0,  x8
        sub             x1,  x9,  x2, lsl #1
        sub             x1,  x1,  #2
        mov             x3,  x2
        sub             x2,  x4,  #64
        bl              \type\()_h264_qpel8_hv_lowpass_l2_neon
        mov             sp,  x11
        ret             x14
endfunc

// as mc11, with the vertical pass taken one byte to the right (x9 = x1 + 1)
function ff_\type\()_h264_qpel8_mc31_neon, export=1
        add             x1,  x1,  #1
        mov             x14, x30
        mov             x8,  x0
        mov             x9,  x1
        sub             x1,  x1,  #1
        b               \type\()_h264_qpel8_mc11
endfunc

// vertical filter only
function ff_\type\()_h264_qpel8_mc02_neon, export=1
        mov             x14, x30
        lowpass_const   w3
        sub             x1,  x1,  x2, lsl #1
        mov             x3,  x2
        bl              \type\()_h264_qpel8_v_lowpass_neon
        ret             x14
endfunc

// Vertical filter into the stack buffer (stride 8), then two-dimensional
// filter of the saved source x9 averaged with that buffer.
function ff_\type\()_h264_qpel8_mc12_neon, export=1
        mov             x14, x30
        mov             x8,  x0
        mov             x9,  x1
\type\()_h264_qpel8_mc12:
        lowpass_const   w3
        mov             x11, sp
        sub             sp,  sp,  #(8*8+16*12)
        sub             x1,  x1,  x2, lsl #1
        mov             x3,  x2
        mov             x2,  #8
        mov             x0,  sp
        bl              put_h264_qpel8_v_lowpass_neon
        mov             x4,  x0
        mov             x0,  x8
        sub             x1,  x9,  x3, lsl #1
        sub             x1,  x1,  #2
        sub             x2,  x4,  #64
        bl              \type\()_h264_qpel8_hv_lowpass_l2_neon
        mov             sp,  x11
        ret             x14
endfunc

// Two-dimensional filter only.  No lowpass_const here: the shared hv core
// loads v6 itself.  sp is not changed on this path, so the x11 save/restore
// has no effect.
function ff_\type\()_h264_qpel8_mc22_neon, export=1
        mov             x14, x30
        mov             x11, sp
        sub             x1,  x1,  x2, lsl #1
        sub             x1,  x1,  #2
        mov             x3,  x2
        bl              \type\()_h264_qpel8_hv_lowpass_neon
        mov             sp,  x11
        ret             x14
endfunc

// as mc12, with the vertical pass taken one byte to the right
function ff_\type\()_h264_qpel8_mc32_neon, export=1
        mov             x14, x30
        mov             x8,  x0
        mov             x9,  x1
        add             x1,  x1,  #1
        b               \type\()_h264_qpel8_mc12
endfunc

// as mc01, averaged with the source one row down (x12 = x1 + x2)
function ff_\type\()_h264_qpel8_mc03_neon, export=1
        mov             x14, x30
        add             x12, x1,  x2
        b               \type\()_h264_qpel8_mc01
endfunc

// as mc11, with the horizontal pass taken one row down
function ff_\type\()_h264_qpel8_mc13_neon, export=1
        mov             x14, x30
        mov             x8,  x0
        mov             x9,  x1
        add             x1,  x1,  x2
        b               \type\()_h264_qpel8_mc11
endfunc

// as mc21, with the horizontal pass taken one row down
function ff_\type\()_h264_qpel8_mc23_neon, export=1
        mov             x14, x30
        mov             x8,  x0
        mov             x9,  x1
        add             x1,  x1,  x2
        b               \type\()_h264_qpel8_mc21
endfunc

// as mc11, with the horizontal pass one row down and the vertical pass one
// byte to the right
function ff_\type\()_h264_qpel8_mc33_neon, export=1
        add             x1,  x1,  #1
        mov             x14, x30
        mov             x8,  x0
        mov             x9,  x1
        add             x1,  x1,  x2
        sub             x1,  x1,  #1
        b               \type\()_h264_qpel8_mc11
endfunc
.endm

        h264_qpel8 put
        h264_qpel8 avg

// Exported 16x16 entry points ff_<type>_h264_qpel16_mcXY_neon, type = put or
// avg.  Same structure and register conventions as the 8x8 entry points
// (x0 = destination, x1 = source, x2 = stride as far as the register usage
// shows; x14 = return address, x8/x9 = saved pointers, x11 = saved sp).
// The 16-wide helpers set their own row count, so x12 is not loaded here.
.macro  h264_qpel16 type
// horizontal filter averaged with the source itself
function ff_\type\()_h264_qpel16_mc10_neon, export=1
        lowpass_const   w3
        mov             x3,  x1
        sub             x1,  x1,  #2
        b               \type\()_h264_qpel16_h_lowpass_l2_neon
endfunc

// horizontal filter only
function ff_\type\()_h264_qpel16_mc20_neon, export=1
        lowpass_const   w3
        sub             x1,  x1,  #2
        mov             x3,  x2
        b               \type\()_h264_qpel16_h_lowpass_neon
endfunc

// horizontal filter averaged with the source one byte to the right
function ff_\type\()_h264_qpel16_mc30_neon, export=1
        lowpass_const   w3
        add             x3,  x1,  #1
        sub             x1,  x1,  #2
        b               \type\()_h264_qpel16_h_lowpass_l2_neon
endfunc

// vertical filter averaged with the source itself (x12 = x1)
function ff_\type\()_h264_qpel16_mc01_neon, export=1
        mov             x14, x30
        mov             x12, x1
\type\()_h264_qpel16_mc01:
        lowpass_const   w3
        mov             x3,  x2
        sub             x1,  x1,  x2, lsl #1
        bl              \type\()_h264_qpel16_v_lowpass_l2_neon
        ret             x14
endfunc

// Horizontal filter into a 256-byte stack buffer (stride 16), then vertical
// filter of the saved source x9 averaged with that buffer.
function ff_\type\()_h264_qpel16_mc11_neon, export=1
        mov             x14, x30
        mov             x8,  x0
        mov             x9,  x1
\type\()_h264_qpel16_mc11:
        lowpass_const   w3
        mov             x11, sp
        sub             sp,  sp,  #256
        mov             x0,  sp
        sub             x1,  x1,  #2
        mov             x3,  #16
        bl              put_h264_qpel16_h_lowpass_neon
        mov             x0,  x8
        mov             x3,  x2
        mov             x12, sp
        sub             x1,  x9,  x2, lsl #1
        mov             x2,  #16
        bl              \type\()_h264_qpel16_v_lowpass_l2_neon
        mov             sp,  x11
        ret             x14
endfunc

// Packed horizontal filter into the stack buffer, then two-dimensional
// filter averaged with that buffer.  x4 = end of the packed data; the hv
// helper derives the start from it.
function ff_\type\()_h264_qpel16_mc21_neon, export=1
        mov             x14, x30
        mov             x8,  x0
        mov             x9,  x1
\type\()_h264_qpel16_mc21:
        lowpass_const   w3
        mov             x11, sp
        sub             sp,  sp,  #(16*16+16*12)
        sub             x1,  x1,  #2
        mov             x0,  sp
        bl              put_h264_qpel16_h_lowpass_neon_packed
        mov             x4,  x0
        mov             x0,  x8
        sub             x1,  x9,  x2, lsl #1
        sub             x1,  x1,  #2
        mov             x3,  x2
        bl              \type\()_h264_qpel16_hv_lowpass_l2_neon
        mov             sp,  x11
        ret             x14
endfunc

// as mc11, with the vertical pass taken one byte to the right
function ff_\type\()_h264_qpel16_mc31_neon, export=1
        add             x1,  x1,  #1
        mov             x14, x30
        mov             x8,  x0
        mov             x9,  x1
        sub             x1,  x1,  #1
        b               \type\()_h264_qpel16_mc11
endfunc

// vertical filter only
function ff_\type\()_h264_qpel16_mc02_neon, export=1
        mov             x14, x30
        lowpass_const   w3
        sub             x1,  x1,  x2, lsl #1
        mov             x3,  x2
        bl              \type\()_h264_qpel16_v_lowpass_neon
        ret             x14
endfunc

// Packed vertical filter into the stack buffer, then two-dimensional filter
// of the saved source x9 averaged with that buffer.
function ff_\type\()_h264_qpel16_mc12_neon, export=1
        mov             x14, x30
        mov             x8,  x0
        mov             x9,  x1
\type\()_h264_qpel16_mc12:
        lowpass_const   w3
        mov             x11, sp
        sub             sp,  sp,  #(16*16+16*12)
        sub             x1,  x1,  x2, lsl #1
        mov             x0,  sp
        mov             x3,  x2
        bl              put_h264_qpel16_v_lowpass_neon_packed
        mov             x4,  x0
        mov             x0,  x8
        sub             x1,  x9,  x3, lsl #1
        sub             x1,  x1,  #2
        mov             x2,  x3
        bl              \type\()_h264_qpel16_hv_lowpass_l2_neon
        mov             sp,  x11
        ret             x14
endfunc

// Two-dimensional filter only.  The hv core reloads v6 on every 8x8 call, and
// w3 is overwritten right below, so the lowpass_const here is not relied on
// by anything visible in this file.  sp is not changed on this path either.
function ff_\type\()_h264_qpel16_mc22_neon, export=1
        mov             x14, x30
        lowpass_const   w3
        mov             x11, sp
        sub             x1,  x1,  x2, lsl #1
        sub             x1,  x1,  #2
        mov             x3,  x2
        bl              \type\()_h264_qpel16_hv_lowpass_neon
        mov             sp,  x11 // restore stack
        ret             x14
endfunc

// as mc12, with the vertical pass taken one byte to the right
function ff_\type\()_h264_qpel16_mc32_neon, export=1
        mov             x14, x30
        mov             x8,  x0
        mov             x9,  x1
        add             x1,  x1,  #1
        b               \type\()_h264_qpel16_mc12
endfunc

// as mc01, averaged with the source one row down (x12 = x1 + x2)
function ff_\type\()_h264_qpel16_mc03_neon, export=1
        mov             x14, x30
        add             x12, x1,  x2
        b               \type\()_h264_qpel16_mc01
endfunc

// as mc11, with the horizontal pass taken one row down
function ff_\type\()_h264_qpel16_mc13_neon, export=1
        mov             x14, x30
        mov             x8,  x0
        mov             x9,  x1
        add             x1,  x1,  x2
        b               \type\()_h264_qpel16_mc11
endfunc

// as mc21, with the horizontal pass taken one row down
function ff_\type\()_h264_qpel16_mc23_neon, export=1
        mov             x14, x30
        mov             x8,  x0
        mov             x9,  x1
        add             x1,  x1,  x2
        b               \type\()_h264_qpel16_mc21
endfunc

// as mc11, with the horizontal pass one row down and the vertical pass one
// byte to the right
function ff_\type\()_h264_qpel16_mc33_neon, export=1
        add             x1,  x1,  #1
        mov             x14, x30
        mov             x8,  x0
        mov             x9,  x1
        add             x1,  x1,  x2
        sub             x1,  x1,  #1
        b               \type\()_h264_qpel16_mc11
endfunc
.endm

        h264_qpel16 put
        h264_qpel16 avg