/*
 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
 * Copyright (c) 2013 Janne Grunau <janne-libav@jannau.net>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/aarch64/asm.S"
#include "neon.S"

// Common prologue of the four loop-filter entry points.
//   In:    w2 = alpha, w3 = beta, x4 = address of a 32-bit word
//          (presumably the four tc0 bytes -- confirm against the caller)
//   Out:   v24.S[0] = the word loaded from [x4]
//   Clobb: w6, flags
// Executes "ret" for the enclosing function when
//   - w2 != 0 and w3 == 0 (b.eq after cmp/ccmp; when w2 == 0 the ccmp
//     forces NZCV to 0, so that case continues), or
//   - bit 7 is set in all four bytes of the loaded word (the two shifted
//     ANDs fold the four bytes into the top byte; b.ge continues only
//     when the resulting sign bit is clear).
.macro  h264_loop_filter_start
        cmp             w2,  #0
        ldr             w6,  [x4]
        ccmp            w3,  #0,  #0,  ne
        mov             v24.S[0], w6
        and             w6,  w6,  w6,  lsl #16
        b.eq            1f
        ands            w6,  w6,  w6,  lsl #8
        b.ge            2f
1:
        ret
2:
.endm

// Luma edge filter on 16 byte lanes.
//   In:    v20 = p2, v18 = p1, v16 = p0, v0 = q0, v2 = q1, v4 = q2
//          v24.S[0] = four tc0 bytes, w2 = alpha, w3 = beta
//   Out:   v17 = p1 result, v16 = p0 result, v0 = q0 result, v19 = q1 result
//   Clobb: v4, v20-v24, v28, v30
// The uxtl/sli sequence replicates each of the four tc0 bytes over four
// consecutive lanes. Instructions are interleaved for scheduling; keep the
// order when editing.
.macro  h264_loop_filter_luma
        dup             v22.16B, w2                     // alpha
        uxtl            v24.8H,  v24.8B
        uabd            v21.16B, v16.16B, v0.16B        // abs(p0 - q0)
        uxtl            v24.4S,  v24.4H
        uabd            v28.16B, v18.16B, v16.16B       // abs(p1 - p0)
        sli             v24.8H,  v24.8H,  #8
        uabd            v30.16B, v2.16B,  v0.16B        // abs(q1 - q0)
        sli             v24.4S,  v24.4S,  #16
        cmhi            v21.16B, v22.16B, v21.16B       // < alpha
        dup             v22.16B, w3                     // beta
        cmlt            v23.16B, v24.16B, #0            // lanes with tc0 < 0
        cmhi            v28.16B, v22.16B, v28.16B       // < beta
        cmhi            v30.16B, v22.16B, v30.16B       // < beta
        bic             v21.16B, v21.16B, v23.16B       // drop lanes with tc0 < 0
        uabd            v17.16B, v20.16B, v16.16B       // abs(p2 - p0)
        and             v21.16B, v21.16B, v28.16B
        uabd            v19.16B, v4.16B,  v0.16B        // abs(q2 - q0)
        cmhi            v17.16B, v22.16B, v17.16B       // < beta
        and             v21.16B, v21.16B, v30.16B       // v21 = per-lane filter mask
        cmhi            v19.16B, v22.16B, v19.16B       // < beta
        and             v17.16B, v17.16B, v21.16B       // mask: update p1
        and             v19.16B, v19.16B, v21.16B       // mask: update q1
        and             v24.16B, v24.16B, v21.16B       // tc0, zero where not filtered
        urhadd          v28.16B, v16.16B, v0.16B        // (p0 + q0 + 1) >> 1
        sub             v21.16B, v24.16B, v17.16B       // masks are 0/-1: tc = tc0 + 1
        uqadd           v23.16B, v18.16B, v24.16B       // p1 + tc0
        uhadd           v20.16B, v20.16B, v28.16B
        sub             v21.16B, v21.16B, v19.16B       // ... per set p1/q1 mask
        uhadd           v28.16B, v4.16B,  v28.16B
        umin            v23.16B, v23.16B, v20.16B
        uqsub           v22.16B, v18.16B, v24.16B       // p1 - tc0
        uqadd           v4.16B,  v2.16B,  v24.16B       // q1 + tc0
        umax            v23.16B, v23.16B, v22.16B       // p1 candidate in [p1-tc0, p1+tc0]
        uqsub           v22.16B, v2.16B,  v24.16B       // q1 - tc0
        umin            v28.16B, v4.16B,  v28.16B
        uxtl            v4.8H,   v0.8B
        umax            v28.16B, v28.16B, v22.16B       // q1 candidate in [q1-tc0, q1+tc0]
        uxtl2           v20.8H,  v0.16B
        usubw           v4.8H,   v4.8H,   v16.8B        // q0 - p0
        usubw2          v20.8H,  v20.8H,  v16.16B
        shl             v4.8H,   v4.8H,   #2
        shl             v20.8H,  v20.8H,  #2
        uaddw           v4.8H,   v4.8H,   v18.8B        // + p1
        uaddw2          v20.8H,  v20.8H,  v18.16B
        usubw           v4.8H,   v4.8H,   v2.8B         // - q1
        usubw2          v20.8H,  v20.8H,  v2.16B
        rshrn           v4.8B,   v4.8H,   #3            // delta = (... + 4) >> 3
        rshrn2          v4.16B,  v20.8H,  #3
        bsl             v17.16B, v23.16B, v18.16B       // p1 result, else original p1
        bsl             v19.16B, v28.16B, v2.16B        // q1 result, else original q1
        neg             v23.16B, v21.16B                // -tc
        uxtl            v28.8H,  v16.8B
        smin            v4.16B,  v4.16B,  v21.16B
        uxtl2           v21.8H,  v16.16B
        smax            v4.16B,  v4.16B,  v23.16B       // delta clamped to [-tc, tc]
        uxtl            v22.8H,  v0.8B
        uxtl2           v24.8H,  v0.16B
        saddw           v28.8H,  v28.8H,  v4.8B         // p0 + delta
        saddw2          v21.8H,  v21.8H,  v4.16B
        ssubw           v22.8H,  v22.8H,  v4.8B         // q0 - delta
        ssubw2          v24.8H,  v24.8H,  v4.16B
        sqxtun          v16.8B,  v28.8H                 // saturate to unsigned bytes
        sqxtun2         v16.16B, v21.8H
        sqxtun          v0.8B,   v22.8H
        sqxtun2         v0.16B,  v24.8H
.endm

// Luma filter across a horizontal edge (rows above/below x0).
//   In: x0 = address of the q0 row, w1 = stride (sign-extended here),
//       w2 = alpha, w3 = beta, x4 = address of the tc0 word
// Loads three 16-byte rows at x0 and three above it, filters, and stores
// the rows from x0 - 2*stride to x0 + stride.
function ff_h264_v_loop_filter_luma_neon, export=1
        h264_loop_filter_start
        sxtw            x1,  w1

        ld1             {v0.16B},  [x0], x1             // q0
        ld1             {v2.16B},  [x0], x1             // q1
        ld1             {v4.16B},  [x0], x1             // q2
        sub             x0,  x0,  x1, lsl #2
        sub             x0,  x0,  x1, lsl #1            // back 6 rows: x0 = p2 row
        ld1             {v20.16B}, [x0], x1             // p2
        ld1             {v18.16B}, [x0], x1             // p1
        ld1             {v16.16B}, [x0], x1             // p0

        h264_loop_filter_luma

        sub             x0,  x0,  x1, lsl #1            // x0 = p1 row
        st1             {v17.16B}, [x0], x1
        st1             {v16.16B}, [x0], x1
        st1             {v0.16B},  [x0], x1
        st1             {v19.16B}, [x0]

        ret
endfunc

// Luma filter across a vertical edge (columns left/right of x0).
//   In: x0 = address of the first pixel right of the edge, w1 = stride,
//       w2 = alpha, w3 = beta, x4 = address of the tc0 word
// Loads 16 rows of 8 bytes starting at x0 - 4, transposes them so the
// filter macro sees one register per column, then transposes the four
// modified columns back and stores 4 bytes per row at x0 - 2.
function ff_h264_h_loop_filter_luma_neon, export=1
        h264_loop_filter_start
        // Stride is a 32-bit value in w1; widen it before using x1 for
        // addressing, as ff_h264_v_loop_filter_luma_neon does.
        sxtw            x1,  w1

        sub             x0,  x0,  #4
        ld1             {v6.8B},   [x0], x1
        ld1             {v20.8B},  [x0], x1
        ld1             {v18.8B},  [x0], x1
        ld1             {v16.8B},  [x0], x1
        ld1             {v0.8B},   [x0], x1
        ld1             {v2.8B},   [x0], x1
        ld1             {v4.8B},   [x0], x1
        ld1             {v26.8B},  [x0], x1
        ld1             {v6.D}[1],  [x0], x1
        ld1             {v20.D}[1], [x0], x1
        ld1             {v18.D}[1], [x0], x1
        ld1             {v16.D}[1], [x0], x1
        ld1             {v0.D}[1],  [x0], x1
        ld1             {v2.D}[1],  [x0], x1
        ld1             {v4.D}[1],  [x0], x1
        ld1             {v26.D}[1], [x0], x1

        transpose_8x16B v6, v20, v18, v16, v0, v2, v4, v26, v21, v23

        h264_loop_filter_luma

        transpose_4x16B v17, v16, v0, v19, v21, v23, v25, v27

        sub             x0,  x0,  x1, lsl #4            // rewind the 16 rows
        add             x0,  x0,  #2                    // skip the two unmodified columns
        st1             {v17.S}[0], [x0], x1
        st1             {v16.S}[0], [x0], x1
        st1             {v0.S}[0],  [x0], x1
        st1             {v19.S}[0], [x0], x1
        st1             {v17.S}[1], [x0], x1
        st1             {v16.S}[1], [x0], x1
        st1             {v0.S}[1],  [x0], x1
        st1             {v19.S}[1], [x0], x1
        st1             {v17.S}[2], [x0], x1
        st1             {v16.S}[2], [x0], x1
        st1             {v0.S}[2],  [x0], x1
        st1             {v19.S}[2], [x0], x1
        st1             {v17.S}[3], [x0], x1
        st1             {v16.S}[3], [x0], x1
        st1             {v0.S}[3],  [x0], x1
        st1             {v19.S}[3], [x0], x1

        ret
endfunc

// Chroma edge filter on 8 byte lanes.
//   In:    v18 = p1, v16 = p0, v0 = q0, v2 = q1
//          v24.S[0] = four tc bytes, w2 = alpha, w3 = beta
//   Out:   v16 = p0 result, v0 = q0 result
//   Clobb: v4, v22, v24-v26, v28, v30
// The uxtl/sli pair replicates each tc byte over two consecutive lanes.
.macro  h264_loop_filter_chroma
        dup             v22.8B,  w2                     // alpha
        uxtl            v24.8H,  v24.8B
        uabd            v26.8B,  v16.8B,  v0.8B         // abs(p0 - q0)
        uxtl            v4.8H,   v0.8B
        uabd            v28.8B,  v18.8B,  v16.8B        // abs(p1 - p0)
        usubw           v4.8H,   v4.8H,   v16.8B        // q0 - p0
        sli             v24.8H,  v24.8H,  #8
        shl             v4.8H,   v4.8H,   #2
        uabd            v30.8B,  v2.8B,   v0.8B         // abs(q1 - q0)
        uaddw           v4.8H,   v4.8H,   v18.8B        // + p1
        cmhi            v26.8B,  v22.8B,  v26.8B        // < alpha
        usubw           v4.8H,   v4.8H,   v2.8B         // - q1
        dup             v22.8B,  w3                     // beta
        rshrn           v4.8B,   v4.8H,   #3            // delta = (... + 4) >> 3
        cmhi            v28.8B,  v22.8B,  v28.8B        // < beta
        cmhi            v30.8B,  v22.8B,  v30.8B        // < beta
        smin            v4.8B,   v4.8B,   v24.8B
        neg             v25.8B,  v24.8B
        and             v26.8B,  v26.8B,  v28.8B
        smax            v4.8B,   v4.8B,   v25.8B        // delta clamped to [-tc, tc]
        and             v26.8B,  v26.8B,  v30.8B        // v26 = per-lane filter mask
        uxtl            v22.8H,  v0.8B
        and             v4.8B,   v4.8B,   v26.8B        // zero delta where not filtered
        uxtl            v28.8H,  v16.8B
        saddw           v28.8H,  v28.8H,  v4.8B         // p0 + delta
        ssubw           v22.8H,  v22.8H,  v4.8B         // q0 - delta
        sqxtun          v16.8B,  v28.8H                 // saturate to unsigned bytes
        sqxtun          v0.8B,   v22.8H
.endm

// Chroma filter across a horizontal edge.
//   In: x0 = address of the q0 row, w1 = stride, w2 = alpha, w3 = beta,
//       x4 = address of the tc word
// Loads four 8-byte rows starting at x0 - 2*stride and writes back the two
// middle rows (p0, q0).
function ff_h264_v_loop_filter_chroma_neon, export=1
        h264_loop_filter_start
        // Stride is a 32-bit value in w1; widen it before using x1 for
        // addressing, as ff_h264_v_loop_filter_luma_neon does.
        sxtw            x1,  w1

        sub             x0,  x0,  x1, lsl #1
        ld1             {v18.8B}, [x0], x1              // p1
        ld1             {v16.8B}, [x0], x1              // p0
        ld1             {v0.8B},  [x0], x1              // q0
        ld1             {v2.8B},  [x0]                  // q1

        h264_loop_filter_chroma

        sub             x0,  x0,  x1, lsl #1            // x0 = p0 row
        st1             {v16.8B}, [x0], x1
        st1             {v0.8B},  [x0], x1

        ret
endfunc

// Chroma filter across a vertical edge.
//   In: x0 = address of the first pixel right of the edge, w1 = stride,
//       w2 = alpha, w3 = beta, x4 = address of the tc word
// Loads 8 rows of 4 bytes starting at x0 - 2, transposes, filters,
// transposes back and rewrites all 4 bytes of each row.
function ff_h264_h_loop_filter_chroma_neon, export=1
        h264_loop_filter_start
        // Stride is a 32-bit value in w1; widen it before using x1 for
        // addressing, as ff_h264_v_loop_filter_luma_neon does.
        sxtw            x1,  w1

        sub             x0,  x0,  #2
        ld1             {v18.S}[0], [x0], x1
        ld1             {v16.S}[0], [x0], x1
        ld1             {v0.S}[0],  [x0], x1
        ld1             {v2.S}[0],  [x0], x1
        ld1             {v18.S}[1], [x0], x1
        ld1             {v16.S}[1], [x0], x1
        ld1             {v0.S}[1],  [x0], x1
        ld1             {v2.S}[1],  [x0], x1

        transpose_4x8B  v18, v16, v0, v2, v28, v29, v30, v31

        h264_loop_filter_chroma

        transpose_4x8B  v18, v16, v0, v2, v28, v29, v30, v31

        sub             x0,  x0,  x1, lsl #3            // rewind the 8 rows
        st1             {v18.S}[0], [x0], x1
        st1             {v16.S}[0], [x0], x1
        st1             {v0.S}[0],  [x0], x1
        st1             {v2.S}[0],  [x0], x1
        st1             {v18.S}[1], [x0], x1
        st1             {v16.S}[1], [x0], x1
        st1             {v0.S}[1],  [x0], x1
        st1             {v2.S}[1],  [x0], x1

        ret
endfunc

// Bi-weighted prediction loop body, 16 pixels per row, two rows per
// iteration. Expanded by biweight_func, which sets up:
//   x0, x1 = the two source row pointers, x7 = store pointer (copy of x0),
//   x2 = stride, w3 = remaining rows, w5/w6 = weight magnitudes applied to
//   the x0/x1 rows, v16 = initial accumulator, v18 = per-lane shift count.
// macd/macs are the multiply-accumulate mnemonics (umlal or umlsl) used
// for the x0 and x1 rows respectively. Ends with "ret".
.macro  biweight_16     macs, macd
        dup             v0.16B,  w5
        dup             v1.16B,  w6
        mov             v4.16B,  v16.16B
        mov             v6.16B,  v16.16B
1:      subs            w3,  w3,  #2                    // flags consumed by b.ne below
        ld1             {v20.16B}, [x0], x2
        \macd           v4.8H,   v0.8B,  v20.8B
        \macd\()2       v6.8H,   v0.16B, v20.16B
        ld1             {v22.16B}, [x1], x2
        \macs           v4.8H,   v1.8B,  v22.8B
        \macs\()2       v6.8H,   v1.16B, v22.16B
        mov             v24.16B, v16.16B
        ld1             {v28.16B}, [x0], x2
        mov             v26.16B, v16.16B
        \macd           v24.8H,  v0.8B,  v28.8B
        \macd\()2       v26.8H,  v0.16B, v28.16B
        ld1             {v30.16B}, [x1], x2
        \macs           v24.8H,  v1.8B,  v30.8B
        \macs\()2       v26.8H,  v1.16B, v30.16B
        sshl            v4.8H,   v4.8H,  v18.8H
        sshl            v6.8H,   v6.8H,  v18.8H
        sqxtun          v4.8B,   v4.8H
        sqxtun2         v4.16B,  v6.8H
        sshl            v24.8H,  v24.8H, v18.8H
        sshl            v26.8H,  v26.8H, v18.8H
        sqxtun          v24.8B,  v24.8H
        sqxtun2         v24.16B, v26.8H
        mov             v6.16B,  v16.16B                // re-arm accumulators
        st1             {v4.16B},  [x7], x2
        mov             v4.16B,  v16.16B
        st1             {v24.16B}, [x7], x2
        b.ne            1b
        ret
.endm

// Same as biweight_16 for 8 pixels per row, two rows per iteration.
.macro  biweight_8      macs, macd
        dup             v0.8B,   w5
        dup             v1.8B,   w6
        mov             v2.16B,  v16.16B
        mov             v20.16B, v16.16B
1:      subs            w3,  w3,  #2                    // flags consumed by b.ne below
        ld1             {v4.8B},  [x0], x2
        \macd           v2.8H,   v0.8B,  v4.8B
        ld1             {v5.8B},  [x1], x2
        \macs           v2.8H,   v1.8B,  v5.8B
        ld1             {v6.8B},  [x0], x2
        \macd           v20.8H,  v0.8B,  v6.8B
        ld1             {v7.8B},  [x1], x2
        \macs           v20.8H,  v1.8B,  v7.8B
        sshl            v2.8H,   v2.8H,  v18.8H
        sqxtun          v2.8B,   v2.8H
        sshl            v20.8H,  v20.8H, v18.8H
        sqxtun          v4.8B,   v20.8H
        mov             v20.16B, v16.16B                // re-arm accumulators
        st1             {v2.8B},  [x7], x2
        mov             v2.16B,  v16.16B
        st1             {v4.8B},  [x7], x2
        b.ne            1b
        ret
.endm

// Same as biweight_16 for 4 pixels per row. Each iteration handles four
// rows (two rows packed per 64-bit register); if fewer than four rows
// remain after the first pair is accumulated, label 2 finishes that pair
// alone and returns.
.macro  biweight_4      macs, macd
        dup             v0.8B,   w5
        dup             v1.8B,   w6
        mov             v2.16B,  v16.16B
        mov             v20.16B, v16.16B
1:      subs            w3,  w3,  #4                    // flags consumed by b.lt / b.ne
        ld1             {v4.S}[0], [x0], x2
        ld1             {v4.S}[1], [x0], x2
        \macd           v2.8H,   v0.8B,  v4.8B
        ld1             {v5.S}[0], [x1], x2
        ld1             {v5.S}[1], [x1], x2
        \macs           v2.8H,   v1.8B,  v5.8B
        b.lt            2f
        ld1             {v6.S}[0], [x0], x2
        ld1             {v6.S}[1], [x0], x2
        \macd           v20.8H,  v0.8B,  v6.8B
        ld1             {v7.S}[0], [x1], x2
        ld1             {v7.S}[1], [x1], x2
        \macs           v20.8H,  v1.8B,  v7.8B
        sshl            v2.8H,   v2.8H,  v18.8H
        sqxtun          v2.8B,   v2.8H
        sshl            v20.8H,  v20.8H, v18.8H
        sqxtun          v4.8B,   v20.8H
        mov             v20.16B, v16.16B                // re-arm accumulators
        st1             {v2.S}[0], [x7], x2
        st1             {v2.S}[1], [x7], x2
        mov             v2.16B,  v16.16B
        st1             {v4.S}[0], [x7], x2
        st1             {v4.S}[1], [x7], x2
        b.ne            1b
        ret
2:      sshl            v2.8H,   v2.8H,  v18.8H
        sqxtun          v2.8B,   v2.8H
        st1             {v2.S}[0], [x7], x2
        st1             {v2.S}[1], [x7], x2
        ret
.endm

// Defines ff_biweight_h264_pixels_<w>_neon.
//   In: x0, x1 = row pointers (results are stored back through x0),
//       w2 = stride (sign-extended here), w3 = row count,
//       w4 = shift amount (presumably log2_denom), w5/w6 = signed weights
//       for the x0/x1 rows, w7 = offset
// Setup:
//   v16 = ((w7 + 1) | 1) << w4 in every 16-bit lane (accumulator start)
//   v18 = ~w4 = -(w4 + 1) in every lane, so sshl shifts right by w4 + 1
//   w8  = sign selector: 0 both weights non-negative, 1 only w5 negative,
//         2 both negative, 3 only w6 negative (assumes bits 31 and 30 of
//         w6 agree, i.e. a small-magnitude weight)
// Negative weights are negated and applied with umlsl instead of umlal.
// Each biweight_<w> expansion ends in "ret", so the labelled variants do
// not fall through into each other.
.macro  biweight_func   w
function ff_biweight_h264_pixels_\w\()_neon, export=1
        sxtw            x2,  w2
        lsr             w8,  w5,  #31
        add             w7,  w7,  #1
        eor             w8,  w8,  w6,  lsr #30
        orr             w7,  w7,  #1
        dup             v18.8H,  w4
        lsl             w7,  w7,  w4
        not             v18.16B, v18.16B
        dup             v16.8H,  w7
        mov             x7,  x0
        cbz             w8,  10f
        subs            w8,  w8,  #1
        b.eq            20f
        subs            w8,  w8,  #1
        b.eq            30f
        b               40f
10:     biweight_\w     umlal, umlal
20:     neg             w5,  w5
        biweight_\w     umlal, umlsl
30:     neg             w5,  w5
        neg             w6,  w6
        biweight_\w     umlsl, umlsl
40:     neg             w6,  w6
        biweight_\w     umlsl, umlal
endfunc
.endm

        biweight_func   16
        biweight_func   8
        biweight_func   4

// Weighted prediction loop body, 16 pixels per row, two rows per
// iteration. Expanded by weight_func, which sets up:
//   x0 = load pointer, x5 = store pointer (copy of x0), x1 = stride,
//   w2 = remaining rows, w4 = weight magnitude, v16 = shifted offset,
//   v18 = per-lane shift count for srshl.
// "add" is the mnemonic combining offset and product (add, sub, shadd or
// shsub). Ends with "ret".
.macro  weight_16       add
        dup             v0.16B,  w4
1:      subs            w2,  w2,  #2                    // flags consumed by b.ne below
        ld1             {v20.16B}, [x0], x1
        umull           v4.8H,   v0.8B,  v20.8B
        umull2          v6.8H,   v0.16B, v20.16B
        ld1             {v28.16B}, [x0], x1
        umull           v24.8H,  v0.8B,  v28.8B
        umull2          v26.8H,  v0.16B, v28.16B
        \add            v4.8H,   v16.8H, v4.8H
        srshl           v4.8H,   v4.8H,  v18.8H
        \add            v6.8H,   v16.8H, v6.8H
        srshl           v6.8H,   v6.8H,  v18.8H
        sqxtun          v4.8B,   v4.8H
        sqxtun2         v4.16B,  v6.8H
        \add            v24.8H,  v16.8H, v24.8H
        srshl           v24.8H,  v24.8H, v18.8H
        \add            v26.8H,  v16.8H, v26.8H
        srshl           v26.8H,  v26.8H, v18.8H
        sqxtun          v24.8B,  v24.8H
        sqxtun2         v24.16B, v26.8H
        st1             {v4.16B},  [x5], x1
        st1             {v24.16B}, [x5], x1
        b.ne            1b
        ret
.endm

// Same as weight_16 for 8 pixels per row, two rows per iteration.
.macro  weight_8        add
        dup             v0.8B,   w4
1:      subs            w2,  w2,  #2                    // flags consumed by b.ne below
        ld1             {v4.8B},  [x0], x1
        umull           v2.8H,   v0.8B,  v4.8B
        ld1             {v6.8B},  [x0], x1
        umull           v20.8H,  v0.8B,  v6.8B
        \add            v2.8H,   v16.8H, v2.8H
        srshl           v2.8H,   v2.8H,  v18.8H
        sqxtun          v2.8B,   v2.8H
        \add            v20.8H,  v16.8H, v20.8H
        srshl           v20.8H,  v20.8H, v18.8H
        sqxtun          v4.8B,   v20.8H
        st1             {v2.8B},  [x5], x1
        st1             {v4.8B},  [x5], x1
        b.ne            1b
        ret
.endm

// Same as weight_16 for 4 pixels per row. Each iteration handles four
// rows (two rows packed per 64-bit register); if fewer than four rows
// remain after the first pair is multiplied, label 2 finishes that pair
// alone and returns.
.macro  weight_4        add
        dup             v0.8B,   w4
1:      subs            w2,  w2,  #4                    // flags consumed by b.lt / b.ne
        ld1             {v4.S}[0], [x0], x1
        ld1             {v4.S}[1], [x0], x1
        umull           v2.8H,   v0.8B,  v4.8B
        b.lt            2f
        ld1             {v6.S}[0], [x0], x1
        ld1             {v6.S}[1], [x0], x1
        umull           v20.8H,  v0.8B,  v6.8B
        \add            v2.8H,   v16.8H, v2.8H
        srshl           v2.8H,   v2.8H,  v18.8H
        sqxtun          v2.8B,   v2.8H
        \add            v20.8H,  v16.8H, v20.8H
        srshl           v20.8H,  v20.8H, v18.8H
        sqxtun          v4.8B,   v20.8H
        st1             {v2.S}[0], [x5], x1
        st1             {v2.S}[1], [x5], x1
        st1             {v4.S}[0], [x5], x1
        st1             {v4.S}[1], [x5], x1
        b.ne            1b
        ret
2:      \add            v2.8H,   v16.8H, v2.8H
        srshl           v2.8H,   v2.8H,  v18.8H
        sqxtun          v2.8B,   v2.8H
        st1             {v2.S}[0], [x5], x1
        st1             {v2.S}[1], [x5], x1
        ret
.endm

// Defines ff_weight_h264_pixels_<w>_neon.
//   In: x0 = block pointer (filtered in place), w1 = stride (sign-extended
//       here), w2 = row count, w3 = shift amount (presumably log2_denom),
//       w4 = signed weight, w5 = offset
// Setup:
//   v16 = w5 << w3 in every 16-bit lane
//   w3 > 1:  v18 = 1 - w3 and the halving shadd/shsub supplies the
//            remaining one bit of right shift
//   w3 <= 1: v18 = -w3 with plain add/sub
// The flags of "cmp w3, #1" are still valid at b.le because the
// intervening mov/lsl/dup do not write NZCV. A negative weight is negated
// and applied with the subtracting variant. Each weight_<w> expansion ends
// in "ret", so the labelled variants do not fall through into each other;
// the two "10:" labels are distinct numeric locals resolved by "10f".
.macro  weight_func     w
function ff_weight_h264_pixels_\w\()_neon, export=1
        sxtw            x1,  w1
        cmp             w3,  #1
        mov             w6,  #1
        lsl             w5,  w5,  w3
        dup             v16.8H,  w5
        mov             x5,  x0
        b.le            20f
        sub             w6,  w6,  w3
        dup             v18.8H,  w6
        cmp             w4,  #0
        b.lt            10f
        weight_\w       shadd
10:     neg             w4,  w4
        weight_\w       shsub
20:     neg             w6,  w3
        dup             v18.8H,  w6
        cmp             w4,  #0
        b.lt            10f
        weight_\w       add
10:     neg             w4,  w4
        weight_\w       sub
endfunc
.endm

        weight_func     16
        weight_func     8
        weight_func     4