/*
 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
 * Copyright (c) 2013 Janne Grunau <janne-libav@jannau.net>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/aarch64/asm.S"
#include "neon.S"

// H.264 4x4 inverse transform; residual is added to dst.
// In:  x0 = dst (uint8_t *), x1 = block (16 x int16 coefficients),
//      w2 = stride (sign-extended to x2 below).
// The coefficient block is cleared as it is consumed (the two 16-byte
// stores of v30), and x1 is rewound by 32 bytes before returning.
function ff_h264_idct_add_neon, export=1
        ld1             {v0.4H, v1.4H, v2.4H, v3.4H}, [x1]
        sxtw            x2,  w2                 // stride may be negative; keep sign
        movi            v30.8H, #0              // zero used to clear the block

        // First (column) pass of the 4x4 butterfly, interleaved with
        // clearing the coefficient block.
        add             v4.4H,  v0.4H,  v2.4H
        sshr            v16.4H, v1.4H,  #1      // z2 = b1 >> 1
        st1             {v30.8H}, [x1], #16
        sshr            v17.4H, v3.4H,  #1      // z3 = b3 >> 1
        st1             {v30.8H}, [x1], #16
        sub             v5.4H,  v0.4H,  v2.4H
        add             v6.4H,  v1.4H,  v17.4H
        sub             v7.4H,  v16.4H, v3.4H
        add             v0.4H,  v4.4H,  v6.4H
        add             v1.4H,  v5.4H,  v7.4H
        sub             v2.4H,  v4.4H,  v6.4H
        sub             v3.4H,  v5.4H,  v7.4H

        transpose_4x4H  v0, v1, v2, v3, v4, v5, v6, v7

        // Second (row) pass, interleaved with loading four dst rows.
        // Rows are packed two per 64-bit lane (note [0]/[1] lane order
        // matches the interleaved stores at the end).
        add             v4.4H,  v0.4H,  v3.4H
        ld1             {v18.S}[0], [x0], x2
        sshr            v16.4H, v2.4H,  #1
        sshr            v17.4H, v1.4H,  #1
        ld1             {v19.S}[1], [x0], x2
        sub             v5.4H,  v0.4H,  v3.4H
        ld1             {v18.S}[1], [x0], x2
        add             v6.4H,  v16.4H, v1.4H
        ins             v4.D[1], v5.D[0]        // pack rows 0/1 into one q-reg
        sub             v7.4H,  v2.4H,  v17.4H
        ld1             {v19.S}[0], [x0], x2
        ins             v6.D[1], v7.D[0]        // pack rows 2/3
        sub             x0,  x0,  x2, lsl #2    // rewind dst to the first row
        add             v0.8H,  v4.8H,  v6.8H
        sub             v1.8H,  v4.8H,  v6.8H

        srshr           v0.8H,  v0.8H,  #6     // round: (x + 32) >> 6
        srshr           v1.8H,  v1.8H,  #6

        uaddw           v0.8H,  v0.8H,  v18.8B // add residual to dst pixels
        uaddw           v1.8H,  v1.8H,  v19.8B

        sqxtun          v0.8B,  v0.8H          // saturate back to u8
        sqxtun          v1.8B,  v1.8H

        st1             {v0.S}[0], [x0], x2
        st1             {v1.S}[1], [x0], x2
        st1             {v0.S}[1], [x0], x2
        st1             {v1.S}[0], [x0], x2

        sub             x1,  x1,  #32          // restore block pointer
        ret
endfunc

// DC-only variant of the 4x4 add: broadcast block[0], clear it,
// round (>> 6 with rounding) and add to the 4x4 dst area.
// In: x0 = dst, x1 = block, w2 = stride.
function ff_h264_idct_dc_add_neon, export=1
        sxtw            x2,  w2
        mov             w3,  #0
        ld1r            {v2.8H},   [x1]        // splat the DC coefficient
        strh            w3,  [x1]              // clear block[0]
        srshr           v2.8H, v2.8H, #6
        ld1             {v0.S}[0], [x0], x2
        ld1             {v0.S}[1], [x0], x2
        uaddw           v3.8H, v2.8H, v0.8B
        ld1             {v1.S}[0], [x0], x2
        ld1             {v1.S}[1], [x0], x2
        uaddw           v4.8H, v2.8H, v1.8B
        sqxtun          v0.8B, v3.8H
        sqxtun          v1.8B, v4.8H
        sub             x0,  x0,  x2, lsl #2   // back to the first row
        st1             {v0.S}[0], [x0], x2
        st1             {v0.S}[1], [x0], x2
        st1             {v1.S}[0], [x0], x2
        st1             {v1.S}[1], [x0], x2
        ret
endfunc

// Add 16 4x4 blocks: for each block i, consult nnzc[scan8[i]] (x4) and
// dispatch to the full IDCT, the DC-only path, or skip.
// In: x0 = dest, x1 = block_offset, x2 = block, w3 = stride, x4 = nnzc.
// Uses blr through x15; x12 holds the return address across the calls.
function ff_h264_idct_add16_neon, export=1
        mov             x12, x30
        mov             x6,  x0         // dest
        mov             x5,  x1         // block_offset
        mov             x1,  x2         // block
        mov             w9,  w3         // stride
        movrel          x7,  scan8
        mov             x10, #16
        movrel          x13, X(ff_h264_idct_dc_add_neon)
        movrel          x14, X(ff_h264_idct_add_neon)
1:      mov             w2,  w9
        ldrb            w3,  [x7], #1
        ldrsw           x0,  [x5], #4
        ldrb            w3,  [x4, w3, uxtw]    // nnzc[ scan8[i] ]
        subs            w3,  w3,  #1
        b.lt            2f                     // nnzc == 0: skip block
        ldrsh           w3,  [x1]              // block[i*16] (DC coeff)
        add             x0,  x0,  x6
        ccmp            w3,  #0,  #4,  eq      // nnzc==1 && DC!=0 -> DC path
        csel            x15, x13, x14, ne
        blr             x15
2:      subs            x10, x10, #1
        add             x1,  x1,  #32
        b.ne            1b
        ret             x12
endfunc

// Same as add16 but for intra macroblocks: when nnzc[scan8[i]] is zero
// the DC path is still taken if block[i*16] is non-zero.
// In: x0 = dest, x1 = block_offset, x2 = block, w3 = stride, x4 = nnzc.
function ff_h264_idct_add16intra_neon, export=1
        mov             x12, x30
        mov             x6,  x0         // dest
        mov             x5,  x1         // block_offset
        mov             x1,  x2         // block
        mov             w9,  w3         // stride
        movrel          x7,  scan8
        mov             x10, #16
        movrel          x13, X(ff_h264_idct_dc_add_neon)
        movrel          x14, X(ff_h264_idct_add_neon)
1:      mov             w2,  w9
        ldrb            w3,  [x7], #1
        ldrsw           x0,  [x5], #4
        ldrb            w3,  [x4, w3, uxtw]
        add             x0,  x0,  x6
        cmp             w3,  #0
        ldrsh           w3,  [x1]
        csel            x15, x13, x14, eq      // nnzc==0 -> DC-only candidate
        ccmp            w3,  #0,  #0,  eq      // ... but only if DC != 0
        b.eq            2f
        blr             x15
2:      subs            x10, x10, #1
        add             x1,  x1,  #32
        b.ne            1b
        ret             x12
endfunc

// Chroma variant: iterates i = 16..19 then 20..23, switching from
// dest[0] to dest[1] after the first four blocks.
// In: x0 = dest pair, x1 = block_offset, x2 = block, w3 = stride,
//     x4 = nnzc. x19/x20 are callee-saved and preserved on the stack.
function ff_h264_idct_add8_neon, export=1
        sub             sp,  sp,  #0x40
        stp             x19, x20, [sp]
        mov             x12, x30
        ldp             x6,  x15, [x0]         // dest[0], dest[1]
        add             x5,  x1,  #16*4        // block_offset
        add             x9,  x2,  #16*32       // block
        mov             w19, w3                // stride
        movrel          x13, X(ff_h264_idct_dc_add_neon)
        movrel          x14, X(ff_h264_idct_add_neon)
        movrel          x7,  scan8+16
        mov             x10, #0
        mov             x11, #16
1:      mov             w2,  w19
        ldrb            w3,  [x7, x10]         // scan8[i]
        ldrsw           x0,  [x5, x10, lsl #2] // block_offset[i]
        ldrb            w3,  [x4, w3, uxtw]    // nnzc[ scan8[i] ]
        add             x0,  x0,  x6           // block_offset[i] + dst[j-1]
        add             x1,  x9,  x10, lsl #5  // block + i * 16
        cmp             w3,  #0
        ldrsh           w3,  [x1]              // block[i*16]
        csel            x20, x13, x14, eq
        ccmp            w3,  #0,  #0,  eq
        b.eq            2f
        blr             x20
2:      add             x10, x10, #1
        cmp             x10, #4
        csel            x10, x11, x10, eq      // mov x10, #16
        csel            x6,  x15, x6,  eq      // switch to second chroma plane
        cmp             x10, #20
        b.lt            1b
        ldp             x19, x20, [sp]
        add             sp,  sp,  #0x40
        ret             x12
endfunc

// One pass (8 columns or 8 rows) of the 8x8 inverse transform over
// v24..v31. Pass 0 also loads the last two coefficient rows and clears
// them; va/vb swap between v18 and v30 so the two passes can reuse the
// same butterfly body with different row placement.
.macro  idct8x8_cols pass
  .if \pass == 0
        va      .req    v18
        vb      .req    v30
        sshr            v18.8H, v26.8H, #1
        add             v16.8H, v24.8H, v28.8H
        ld1             {v30.8H, v31.8H}, [x1]
        st1             {v19.8H}, [x1], #16
        st1             {v19.8H}, [x1], #16
        sub             v17.8H, v24.8H, v28.8H
        sshr            v19.8H, v30.8H, #1
        sub             v18.8H, v18.8H, v30.8H
        add             v19.8H, v19.8H, v26.8H
  .else
        va      .req    v30
        vb      .req    v18
        sshr            v30.8H, v26.8H, #1
        sshr            v19.8H, v18.8H, #1
        add             v16.8H, v24.8H, v28.8H
        sub             v17.8H, v24.8H, v28.8H
        sub             v30.8H, v30.8H, v18.8H
        add             v19.8H, v19.8H, v26.8H
  .endif
        // Even part
        add             v26.8H, v17.8H, va.8H
        sub             v28.8H, v17.8H, va.8H
        add             v24.8H, v16.8H, v19.8H
        sub             vb.8H,  v16.8H, v19.8H
        // Odd part
        sub             v16.8H, v29.8H, v27.8H
        add             v17.8H, v31.8H, v25.8H
        sub             va.8H,  v31.8H, v25.8H
        add             v19.8H, v29.8H, v27.8H
        sub             v16.8H, v16.8H, v31.8H
        sub             v17.8H, v17.8H, v27.8H
        add             va.8H,  va.8H,  v29.8H
        add             v19.8H, v19.8H, v25.8H
        sshr            v25.8H, v25.8H, #1
        sshr            v27.8H, v27.8H, #1
        sshr            v29.8H, v29.8H, #1
        sshr            v31.8H, v31.8H, #1
        sub             v16.8H, v16.8H, v31.8H
        sub             v17.8H, v17.8H, v27.8H
        add             va.8H,  va.8H,  v29.8H
        add             v19.8H, v19.8H, v25.8H
        sshr            v25.8H, v16.8H, #2
        sshr            v27.8H, v17.8H, #2
        sshr            v29.8H, va.8H,  #2
        sshr            v31.8H, v19.8H, #2
        sub             v19.8H, v19.8H, v25.8H
        sub             va.8H,  v27.8H, va.8H
        add             v17.8H, v17.8H, v29.8H
        add             v16.8H, v16.8H, v31.8H
        // Final butterfly combining even and odd parts
  .if \pass == 0
        sub             v31.8H, v24.8H, v19.8H
        add             v24.8H, v24.8H, v19.8H
        add             v25.8H, v26.8H, v18.8H
        sub             v18.8H, v26.8H, v18.8H
        add             v26.8H, v28.8H, v17.8H
        add             v27.8H, v30.8H, v16.8H
        sub             v29.8H, v28.8H, v17.8H
        sub             v28.8H, v30.8H, v16.8H
  .else
        sub             v31.8H, v24.8H, v19.8H
        add             v24.8H, v24.8H, v19.8H
        add             v25.8H, v26.8H, v30.8H
        sub             v30.8H, v26.8H, v30.8H
        add             v26.8H, v28.8H, v17.8H
        sub             v29.8H, v28.8H, v17.8H
        add             v27.8H, v18.8H, v16.8H
        sub             v28.8H, v18.8H, v16.8H
  .endif
        .unreq          va
        .unreq          vb
.endm

// H.264 8x8 inverse transform; residual is added to dst.
// In:  x0 = dst, x1 = block (64 x int16; cleared on return, x1 rewound
//      by 128), x2 = stride.
function ff_h264_idct8_add_neon, export=1
        movi            v19.8H, #0
        // Load the first six coefficient rows, clearing as we go
        // (the last two rows are loaded/cleared inside idct8x8_cols 0).
        ld1             {v24.8H, v25.8H}, [x1]
        st1             {v19.8H}, [x1], #16
        st1             {v19.8H}, [x1], #16
        ld1             {v26.8H, v27.8H}, [x1]
        st1             {v19.8H}, [x1], #16
        st1             {v19.8H}, [x1], #16
        ld1             {v28.8H, v29.8H}, [x1]
        st1             {v19.8H}, [x1], #16
        st1             {v19.8H}, [x1], #16

        idct8x8_cols    0
        transpose_8x8H  v24, v25, v26, v27, v28, v29, v18, v31, v6, v7
        idct8x8_cols    1

        // Round, add to the eight dst rows and store with saturation.
        // x3 keeps the original dst for the stores while x0 advances
        // through the loads.
        mov             x3,  x0
        srshr           v24.8H, v24.8H, #6
        ld1             {v0.8B},  [x0], x2
        srshr           v25.8H, v25.8H, #6
        ld1             {v1.8B},  [x0], x2
        srshr           v26.8H, v26.8H, #6
        ld1             {v2.8B},  [x0], x2
        srshr           v27.8H, v27.8H, #6
        ld1             {v3.8B},  [x0], x2
        srshr           v28.8H, v28.8H, #6
        ld1             {v4.8B},  [x0], x2
        srshr           v29.8H, v29.8H, #6
        ld1             {v5.8B},  [x0], x2
        srshr           v30.8H, v30.8H, #6
        ld1             {v6.8B},  [x0], x2
        srshr           v31.8H, v31.8H, #6
        ld1             {v7.8B},  [x0], x2
        uaddw           v24.8H, v24.8H, v0.8B
        uaddw           v25.8H, v25.8H, v1.8B
        uaddw           v26.8H, v26.8H, v2.8B
        sqxtun          v0.8B,  v24.8H
        uaddw           v27.8H, v27.8H, v3.8B
        sqxtun          v1.8B,  v25.8H
        uaddw           v28.8H, v28.8H, v4.8B
        sqxtun          v2.8B,  v26.8H
        st1             {v0.8B},  [x3], x2
        uaddw           v29.8H, v29.8H, v5.8B
        sqxtun          v3.8B,  v27.8H
        st1             {v1.8B},  [x3], x2
        uaddw           v30.8H, v30.8H, v6.8B
        sqxtun          v4.8B,  v28.8H
        st1             {v2.8B},  [x3], x2
        uaddw           v31.8H, v31.8H, v7.8B
        sqxtun          v5.8B,  v29.8H
        st1             {v3.8B},  [x3], x2
        sqxtun          v6.8B,  v30.8H
        sqxtun          v7.8B,  v31.8H
        st1             {v4.8B},  [x3], x2
        st1             {v5.8B},  [x3], x2
        st1             {v6.8B},  [x3], x2
        st1             {v7.8B},  [x3], x2

        sub             x1,  x1,  #128         // restore block pointer
        ret
endfunc

// DC-only variant of the 8x8 add: broadcast block[0], clear it, round
// and add to all 64 dst pixels.
// In: x0 = dst, x1 = block, w2 = stride.
function ff_h264_idct8_dc_add_neon, export=1
        mov             w3,  #0
        sxtw            x2,  w2
        ld1r            {v31.8H}, [x1]         // splat the DC coefficient
        strh            w3,  [x1]              // clear block[0]
        ld1             {v0.8B},  [x0], x2
        srshr           v31.8H, v31.8H, #6
        ld1             {v1.8B},  [x0], x2
        ld1             {v2.8B},  [x0], x2
        uaddw           v24.8H, v31.8H, v0.8B
        ld1             {v3.8B},  [x0], x2
        uaddw           v25.8H, v31.8H, v1.8B
        ld1             {v4.8B},  [x0], x2
        uaddw           v26.8H, v31.8H, v2.8B
        ld1             {v5.8B},  [x0], x2
        uaddw           v27.8H, v31.8H, v3.8B
        ld1             {v6.8B},  [x0], x2
        uaddw           v28.8H, v31.8H, v4.8B
        ld1             {v7.8B},  [x0], x2
        uaddw           v29.8H, v31.8H, v5.8B
        uaddw           v30.8H, v31.8H, v6.8B
        uaddw           v31.8H, v31.8H, v7.8B
        sqxtun          v0.8B,  v24.8H
        sqxtun          v1.8B,  v25.8H
        sqxtun          v2.8B,  v26.8H
        sqxtun          v3.8B,  v27.8H
        sub             x0,  x0,  x2, lsl #3   // back to the first row
        st1             {v0.8B},  [x0], x2
        sqxtun          v4.8B,  v28.8H
        st1             {v1.8B},  [x0], x2
        sqxtun          v5.8B,  v29.8H
        st1             {v2.8B},  [x0], x2
        sqxtun          v6.8B,  v30.8H
        st1             {v3.8B},  [x0], x2
        sqxtun          v7.8B,  v31.8H
        st1             {v4.8B},  [x0], x2
        st1             {v5.8B},  [x0], x2
        st1             {v6.8B},  [x0], x2
        st1             {v7.8B},  [x0], x2
        ret
endfunc

// Add four 8x8 blocks, dispatching per block to the full 8x8 IDCT, the
// DC-only path, or skipping, based on nnzc[scan8[i]] and the DC coeff.
// In: x0 = dest, x1 = block_offset, x2 = block, w3 = stride, x4 = nnzc.
function ff_h264_idct8_add4_neon, export=1
        mov             x12, x30
        mov             x6,  x0                // dest
        mov             x5,  x1                // block_offset
        mov             x1,  x2                // block
        mov             w2,  w3                // stride
        movrel          x7,  scan8
        mov             w10, #16
        movrel          x13, X(ff_h264_idct8_dc_add_neon)
        movrel          x14, X(ff_h264_idct8_add_neon)
1:      ldrb            w9,  [x7], #4          // scan8 advances 4 per 8x8 block
        ldrsw           x0,  [x5], #16
        ldrb            w9,  [x4, w9, UXTW]
        subs            w9,  w9,  #1
        b.lt            2f                     // nnzc == 0: skip block
        ldrsh           w11, [x1]              // block[i*16] (DC coeff)
        add             x0,  x6,  x0
        ccmp            w11, #0,  #4,  eq      // nnzc==1 && DC!=0 -> DC path
        csel            x15, x13, x14, ne
        blr             x15
2:      subs            w10, w10, #4
        add             x1,  x1,  #128
        b.ne            1b
        ret             x12
endfunc

// scan8[]: maps block index to its position in the nnzc table
// (values are x + y*8 coordinates within the cache layout).
const   scan8
        .byte           4+ 1*8, 5+ 1*8, 4+ 2*8, 5+ 2*8
        .byte           6+ 1*8, 7+ 1*8, 6+ 2*8, 7+ 2*8
        .byte           4+ 3*8, 5+ 3*8, 4+ 4*8, 5+ 4*8
        .byte           6+ 3*8, 7+ 3*8, 6+ 4*8, 7+ 4*8
        .byte           4+ 6*8, 5+ 6*8, 4+ 7*8, 5+ 7*8
        .byte           6+ 6*8, 7+ 6*8, 6+ 7*8, 7+ 7*8
        .byte           4+ 8*8, 5+ 8*8, 4+ 9*8, 5+ 9*8
        .byte           6+ 8*8, 7+ 8*8, 6+ 9*8, 7+ 9*8
        .byte           4+11*8, 5+11*8, 4+12*8, 5+12*8
        .byte           6+11*8, 7+11*8, 6+12*8, 7+12*8
        .byte           4+13*8, 5+13*8, 4+14*8, 5+14*8
        .byte           6+13*8, 7+13*8, 6+14*8, 7+14*8
endconst