/*
 * VP8 NEON optimisations
 *
 * Copyright (c) 2010 Rob Clark <rob@ti.com>
 * Copyright (c) 2011 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/arm/asm.S"
#include "neon.S"

function ff_vp8_luma_dc_wht_neon, export=1
        vld1.16         {q0-q1}, [r1,:128]
        vmov.i16        q15, #0

        vadd.i16        d4, d0, d3
        vadd.i16        d6, d1, d2
        vst1.16         {q15}, [r1,:128]!
        vsub.i16        d7, d1, d2
        vsub.i16        d5, d0, d3
        vst1.16         {q15}, [r1,:128]
        vadd.i16        q0, q2, q3
        vsub.i16        q1, q2, q3

        vmov.i16        q8, #3

        vtrn.32         d0, d2
        vtrn.32         d1, d3
        vtrn.16         d0, d1
        vtrn.16         d2, d3

        vadd.i16        d0, d0, d16

        vadd.i16        d4, d0, d3
        vadd.i16        d6, d1, d2
        vsub.i16        d7, d1, d2
        vsub.i16        d5, d0, d3
        vadd.i16        q0, q2, q3
        vsub.i16        q1, q2, q3

        vshr.s16        q0, q0, #3
        vshr.s16        q1, q1, #3

        mov             r3, #32
        vst1.16         {d0[0]}, [r0,:16], r3
        vst1.16         {d1[0]}, [r0,:16], r3
        vst1.16         {d2[0]}, [r0,:16], r3
        vst1.16         {d3[0]}, [r0,:16], r3
        vst1.16         {d0[1]}, [r0,:16], r3
        vst1.16         {d1[1]}, [r0,:16], r3
        vst1.16         {d2[1]}, [r0,:16], r3
        vst1.16         {d3[1]}, [r0,:16], r3
        vst1.16         {d0[2]}, [r0,:16], r3
        vst1.16         {d1[2]}, [r0,:16], r3
        vst1.16         {d2[2]}, [r0,:16], r3
        vst1.16         {d3[2]}, [r0,:16], r3
        vst1.16         {d0[3]}, [r0,:16], r3
        vst1.16         {d1[3]}, [r0,:16], r3
        vst1.16         {d2[3]}, [r0,:16], r3
        vst1.16         {d3[3]}, [r0,:16], r3

        bx              lr
endfunc
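@ The iDCT below uses the fixed-point constants from the VP8 spec:
@ 20091/65536 ~= sqrt(2)*cos(pi/8) - 1 and 35468/65536 ~= sqrt(2)*sin(pi/8).
@ The second constant is stored halved because vqdmulh doubles its result;
@ the "- 1" bias of the first is compensated by adding the input back in.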
function ff_vp8_idct_add_neon, export=1
        vld1.16         {q0-q1}, [r1,:128]
        movw            r3, #20091
        movt            r3, #35468/2
        vdup.32         d4, r3

        vmull.s16       q12, d1, d4[0]
        vmull.s16       q13, d3, d4[0]
        vqdmulh.s16     d20, d1, d4[1]
        vqdmulh.s16     d23, d3, d4[1]
        vshrn.s32       d21, q12, #16
        vshrn.s32       d22, q13, #16
        vadd.s16        d21, d21, d1
        vadd.s16        d22, d22, d3

        vadd.s16        d16, d0, d2
        vsub.s16        d17, d0, d2
        vadd.s16        d18, d21, d23
        vsub.s16        d19, d20, d22
        vadd.s16        q0, q8, q9
        vsub.s16        q1, q8, q9

        vtrn.32         d0, d3
        vtrn.32         d1, d2
        vtrn.16         d0, d1
        vtrn.16         d3, d2

        vmov.i16        q15, #0
        vmull.s16       q12, d1, d4[0]
        vst1.16         {q15}, [r1,:128]!
        vmull.s16       q13, d2, d4[0]
        vst1.16         {q15}, [r1,:128]
        vqdmulh.s16     d21, d1, d4[1]
        vqdmulh.s16     d23, d2, d4[1]
        vshrn.s32       d20, q12, #16
        vshrn.s32       d22, q13, #16
        vadd.i16        d20, d20, d1
        vadd.i16        d22, d22, d2

        vadd.i16        d16, d0, d3
        vsub.i16        d17, d0, d3
        vadd.i16        d18, d20, d23
        vld1.32         {d20[]}, [r0,:32], r2
        vsub.i16        d19, d21, d22
        vld1.32         {d22[]}, [r0,:32], r2
        vadd.s16        q0, q8, q9
        vld1.32         {d23[]}, [r0,:32], r2
        vsub.s16        q1, q8, q9
        vld1.32         {d21[]}, [r0,:32], r2
        vrshr.s16       q0, q0, #3
        vtrn.32         q10, q11
        vrshr.s16       q1, q1, #3

        sub             r0, r0, r2, lsl #2

        vtrn.32         d0, d3
        vtrn.32         d1, d2
        vtrn.16         d0, d1
        vtrn.16         d3, d2

        vaddw.u8        q0, q0, d20
        vaddw.u8        q1, q1, d21
        vqmovun.s16     d0, q0
        vqmovun.s16     d1, q1

        vst1.32         {d0[0]}, [r0,:32], r2
        vst1.32         {d0[1]}, [r0,:32], r2
        vst1.32         {d1[1]}, [r0,:32], r2
        vst1.32         {d1[0]}, [r0,:32], r2

        bx              lr
endfunc

function ff_vp8_idct_dc_add_neon, export=1
        mov             r3, #0
        ldrsh           r12, [r1]
        strh            r3, [r1]
        vdup.16         q1, r12
        vrshr.s16       q1, q1, #3
        vld1.32         {d0[]}, [r0,:32], r2
        vld1.32         {d1[]}, [r0,:32], r2
        vld1.32         {d0[1]}, [r0,:32], r2
        vld1.32         {d1[1]}, [r0,:32], r2
        vaddw.u8        q2, q1, d0
        vaddw.u8        q3, q1, d1
        sub             r0, r0, r2, lsl #2
        vqmovun.s16     d0, q2
        vqmovun.s16     d1, q3
        vst1.32         {d0[0]}, [r0,:32], r2
        vst1.32         {d1[0]}, [r0,:32], r2
        vst1.32         {d0[1]}, [r0,:32], r2
        vst1.32         {d1[1]}, [r0,:32], r2
        bx              lr
endfunc

function ff_vp8_idct_dc_add4uv_neon, export=1
        vmov.i16        d0, #0
        mov             r3, #32
        vld1.16         {d16[]}, [r1,:16]
        vst1.16         {d0[0]}, [r1,:16], r3
        vld1.16         {d17[]}, [r1,:16]
        vst1.16         {d0[0]}, [r1,:16], r3
        vld1.16         {d18[]}, [r1,:16]
        vst1.16         {d0[0]}, [r1,:16], r3
        vld1.16         {d19[]}, [r1,:16]
        vst1.16         {d0[0]}, [r1,:16], r3
        mov             r3, r0
        vrshr.s16       q8, q8, #3              @ dc >>= 3
        vld1.8          {d0}, [r0,:64], r2
        vrshr.s16       q9, q9, #3
        vld1.8          {d1}, [r0,:64], r2
        vaddw.u8        q10, q8, d0
        vld1.8          {d2}, [r0,:64], r2
        vaddw.u8        q0, q8, d1
        vld1.8          {d3}, [r0,:64], r2
        vaddw.u8        q11, q8, d2
        vld1.8          {d4}, [r0,:64], r2
        vaddw.u8        q1, q8, d3
        vld1.8          {d5}, [r0,:64], r2
        vaddw.u8        q12, q9, d4
        vld1.8          {d6}, [r0,:64], r2
        vaddw.u8        q2, q9, d5
        vld1.8          {d7}, [r0,:64], r2
        vaddw.u8        q13, q9, d6
        vqmovun.s16     d20, q10
        vaddw.u8        q3, q9, d7
        vqmovun.s16     d21, q0
        vqmovun.s16     d22, q11
        vst1.8          {d20}, [r3,:64], r2
        vqmovun.s16     d23, q1
        vst1.8          {d21}, [r3,:64], r2
        vqmovun.s16     d24, q12
        vst1.8          {d22}, [r3,:64], r2
        vqmovun.s16     d25, q2
        vst1.8          {d23}, [r3,:64], r2
        vqmovun.s16     d26, q13
        vst1.8          {d24}, [r3,:64], r2
        vqmovun.s16     d27, q3
        vst1.8          {d25}, [r3,:64], r2
        vst1.8          {d26}, [r3,:64], r2
        vst1.8          {d27}, [r3,:64], r2

        bx              lr
endfunc
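@ As in the 4uv version above, but the four DC-only blocks cover one
@ 16-pixel-wide luma row, so whole q registers are processed per line.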
function ff_vp8_idct_dc_add4y_neon, export=1
        vmov.i16        d0, #0
        mov             r3, #32
        vld1.16         {d16[]}, [r1,:16]
        vst1.16         {d0[0]}, [r1,:16], r3
        vld1.16         {d17[]}, [r1,:16]
        vst1.16         {d0[0]}, [r1,:16], r3
        vld1.16         {d18[]}, [r1,:16]
        vst1.16         {d0[0]}, [r1,:16], r3
        vld1.16         {d19[]}, [r1,:16]
        vst1.16         {d0[0]}, [r1,:16], r3
        vrshr.s16       q8, q8, #3              @ dc >>= 3
        vld1.8          {q0}, [r0,:128], r2
        vrshr.s16       q9, q9, #3
        vld1.8          {q1}, [r0,:128], r2
        vaddw.u8        q10, q8, d0
        vld1.8          {q2}, [r0,:128], r2
        vaddw.u8        q0, q9, d1
        vld1.8          {q3}, [r0,:128], r2
        vaddw.u8        q11, q8, d2
        vaddw.u8        q1, q9, d3
        vaddw.u8        q12, q8, d4
        vaddw.u8        q2, q9, d5
        vaddw.u8        q13, q8, d6
        vaddw.u8        q3, q9, d7
        sub             r0, r0, r2, lsl #2
        vqmovun.s16     d20, q10
        vqmovun.s16     d21, q0
        vqmovun.s16     d22, q11
        vqmovun.s16     d23, q1
        vqmovun.s16     d24, q12
        vst1.8          {q10}, [r0,:128], r2
        vqmovun.s16     d25, q2
        vst1.8          {q11}, [r0,:128], r2
        vqmovun.s16     d26, q13
        vst1.8          {q12}, [r0,:128], r2
        vqmovun.s16     d27, q3
        vst1.8          {q13}, [r0,:128], r2

        bx              lr
endfunc

@ Register layout:
@   P3..Q3 -> q0..q7
@   flim_E -> q14
@   flim_I -> q15
@   hev_thresh -> r12
@
.macro vp8_loop_filter, inner=0, simple=0
    .if \simple
        vabd.u8         q9, q3, q4              @ abs(P0-Q0)
        vabd.u8         q15, q2, q5             @ abs(P1-Q1)
        vqadd.u8        q9, q9, q9              @ abs(P0-Q0) * 2
        vshr.u8         q10, q15, #1            @ abs(P1-Q1) / 2
        vqadd.u8        q11, q9, q10            @ (abs(P0-Q0)*2) + (abs(P1-Q1)/2)
        vmov.i8         q13, #0x80
        vcle.u8         q8, q11, q14            @ (abs(P0-Q0)*2) + (abs(P1-Q1)/2) <= flim
    .else
        @ calculate hev and normal_limit:
        vabd.u8         q12, q2, q3             @ abs(P1-P0)
        vabd.u8         q13, q5, q4             @ abs(Q1-Q0)
        vabd.u8         q10, q0, q1             @ abs(P3-P2)
        vabd.u8         q11, q1, q2             @ abs(P2-P1)
        vcle.u8         q8, q12, q15            @ abs(P1-P0) <= flim_I
        vcle.u8         q9, q13, q15            @ abs(Q1-Q0) <= flim_I
        vcle.u8         q10, q10, q15           @ abs(P3-P2) <= flim_I
        vcle.u8         q11, q11, q15           @ abs(P2-P1) <= flim_I
        vand            q8, q8, q9
        vabd.u8         q9, q7, q6              @ abs(Q3-Q2)
        vand            q8, q8, q11
        vabd.u8         q11, q6, q5             @ abs(Q2-Q1)
        vand            q8, q8, q10
        vcle.u8         q10, q9, q15            @ abs(Q3-Q2) <= flim_I
        vcle.u8         q11, q11, q15           @ abs(Q2-Q1) <= flim_I
        vabd.u8         q9, q3, q4              @ abs(P0-Q0)
        vabd.u8         q15, q2, q5             @ abs(P1-Q1)
        vand            q8, q8, q10
        vqadd.u8        q9, q9, q9              @ abs(P0-Q0) * 2
        vand            q8, q8, q11
        vshr.u8         q10, q15, #1            @ abs(P1-Q1) / 2
        vdup.8          q15, r12                @ hev_thresh
        vqadd.u8        q11, q9, q10            @ (abs(P0-Q0)*2) + (abs(P1-Q1)/2)
        vcgt.u8         q12, q12, q15           @ abs(P1-P0) > hev_thresh
        vcle.u8         q11, q11, q14           @ (abs(P0-Q0)*2) + (abs(P1-Q1)/2) <= flim_E
        vcgt.u8         q14, q13, q15           @ abs(Q1-Q0) > hev_thresh
        vand            q8, q8, q11
        vmov.i8         q13, #0x80
        vorr            q9, q12, q14
    .endif

        @ at this point:
        @   q8: normal_limit
        @   q9: hev

        @ convert to signed value:
        veor            q3, q3, q13             @ PS0 = P0 ^ 0x80
        veor            q4, q4, q13             @ QS0 = Q0 ^ 0x80

        vmov.i16        q12, #3
        vsubl.s8        q10, d8, d6             @ QS0 - PS0
        vsubl.s8        q11, d9, d7             @   (widened to 16bit)
        veor            q2, q2, q13             @ PS1 = P1 ^ 0x80
        veor            q5, q5, q13             @ QS1 = Q1 ^ 0x80
        vmul.i16        q10, q10, q12           @ w = 3 * (QS0 - PS0)
        vmul.i16        q11, q11, q12

        vqsub.s8        q12, q2, q5             @ clamp(PS1-QS1)
        vmov.i8         q14, #4
        vmov.i8         q15, #3
    .if \inner
        vand            q12, q12, q9            @ if(hev) w += clamp(PS1-QS1)
    .endif
        vaddw.s8        q10, q10, d24           @ w += clamp(PS1-QS1)
        vaddw.s8        q11, q11, d25
        vqmovn.s16      d20, q10                @ narrow result back into q10
        vqmovn.s16      d21, q11
    .if !\inner && !\simple
        veor            q1, q1, q13             @ PS2 = P2 ^ 0x80
        veor            q6, q6, q13             @ QS2 = Q2 ^ 0x80
    .endif
        vand            q10, q10, q8            @ w &= normal_limit

        @ registers used at this point..
        @   q0 -> P3  (don't corrupt)
        @   q1-q6 -> PS2-QS2
        @   q7 -> Q3  (don't corrupt)
        @   q9 -> hev
        @   q10 -> w
        @   q13 -> #0x80
        @   q14 -> #4
        @   q15 -> #3
        @   q8, q11, q12 -> unused

        @ filter_common:   is4tap==1
        @   c1 = clamp(w + 4) >> 3;
        @   c2 = clamp(w + 3) >> 3;
        @   Q0 = s2u(QS0 - c1);
        @   P0 = s2u(PS0 + c2);

    .if \simple
        vqadd.s8        q11, q10, q14           @ c1 = clamp((w&hev)+4)
        vqadd.s8        q12, q10, q15           @ c2 = clamp((w&hev)+3)
        vshr.s8         q11, q11, #3            @ c1 >>= 3
        vshr.s8         q12, q12, #3            @ c2 >>= 3
        vqsub.s8        q4, q4, q11             @ QS0 = clamp(QS0-c1)
        vqadd.s8        q3, q3, q12             @ PS0 = clamp(PS0+c2)
        veor            q4, q4, q13             @ Q0 = QS0 ^ 0x80
        veor            q3, q3, q13             @ P0 = PS0 ^ 0x80
        veor            q5, q5, q13             @ Q1 = QS1 ^ 0x80
        veor            q2, q2, q13             @ P1 = PS1 ^ 0x80
    .elseif \inner
        @ the !is4tap case of filter_common, only used for inner blocks
        @   c3 = ((c1&~hev) + 1) >> 1;
        @   Q1 = s2u(QS1 - c3);
        @   P1 = s2u(PS1 + c3);
        vqadd.s8        q11, q10, q14           @ c1 = clamp((w&hev)+4)
        vqadd.s8        q12, q10, q15           @ c2 = clamp((w&hev)+3)
        vshr.s8         q11, q11, #3            @ c1 >>= 3
        vshr.s8         q12, q12, #3            @ c2 >>= 3
        vqsub.s8        q4, q4, q11             @ QS0 = clamp(QS0-c1)
        vqadd.s8        q3, q3, q12             @ PS0 = clamp(PS0+c2)
        vbic            q11, q11, q9            @ c1 & ~hev
        veor            q4, q4, q13             @ Q0 = QS0 ^ 0x80
        vrshr.s8        q11, q11, #1            @ c3 >>= 1
        veor            q3, q3, q13             @ P0 = PS0 ^ 0x80
        vqsub.s8        q5, q5, q11             @ QS1 = clamp(QS1-c3)
        vqadd.s8        q2, q2, q11             @ PS1 = clamp(PS1+c3)
        veor            q5, q5, q13             @ Q1 = QS1 ^ 0x80
        veor            q2, q2, q13             @ P1 = PS1 ^ 0x80
    .else
        vand            q12, q10, q9            @ w & hev
        vqadd.s8        q11, q12, q14           @ c1 = clamp((w&hev)+4)
        vqadd.s8        q12, q12, q15           @ c2 = clamp((w&hev)+3)
        vshr.s8         q11, q11, #3            @ c1 >>= 3
        vshr.s8         q12, q12, #3            @ c2 >>= 3
        vbic            q10, q10, q9            @ w &= ~hev
        vqsub.s8        q4, q4, q11             @ QS0 = clamp(QS0-c1)
        vqadd.s8        q3, q3, q12             @ PS0 = clamp(PS0+c2)

        @ filter_mbedge:
        @   a = clamp((27*w + 63) >> 7);
        @   Q0 = s2u(QS0 - a);
        @   P0 = s2u(PS0 + a);
        @   a = clamp((18*w + 63) >> 7);
        @   Q1 = s2u(QS1 - a);
        @   P1 = s2u(PS1 + a);
        @   a = clamp((9*w + 63) >> 7);
        @   Q2 = s2u(QS2 - a);
        @   P2 = s2u(PS2 + a);
        vmov.i16        q9, #63
        vshll.s8        q14, d20, #3
        vshll.s8        q15, d21, #3
        vaddw.s8        q14, q14, d20
        vaddw.s8        q15, q15, d21
        vadd.s16        q8, q9, q14
        vadd.s16        q9, q9, q15             @  9*w + 63
        vadd.s16        q11, q8, q14
        vadd.s16        q12, q9, q15            @ 18*w + 63
        vadd.s16        q14, q11, q14
        vadd.s16        q15, q12, q15           @ 27*w + 63
        vqshrn.s16      d16, q8, #7
        vqshrn.s16      d17, q9, #7             @ clamp(( 9*w + 63)>>7)
        vqshrn.s16      d22, q11, #7
        vqshrn.s16      d23, q12, #7            @ clamp((18*w + 63)>>7)
        vqshrn.s16      d28, q14, #7
        vqshrn.s16      d29, q15, #7            @ clamp((27*w + 63)>>7)
        vqadd.s8        q1, q1, q8              @ PS2 = clamp(PS2+a)
        vqsub.s8        q6, q6, q8              @ QS2 = clamp(QS2-a)
        vqadd.s8        q2, q2, q11             @ PS1 = clamp(PS1+a)
        vqsub.s8        q5, q5, q11             @ QS1 = clamp(QS1-a)
        vqadd.s8        q3, q3, q14             @ PS0 = clamp(PS0+a)
        vqsub.s8        q4, q4, q14             @ QS0 = clamp(QS0-a)
        veor            q3, q3, q13             @ P0 = PS0 ^ 0x80
        veor            q4, q4, q13             @ Q0 = QS0 ^ 0x80
        veor            q2, q2, q13             @ P1 = PS1 ^ 0x80
        veor            q5, q5, q13             @ Q1 = QS1 ^ 0x80
        veor            q1, q1, q13             @ P2 = PS2 ^ 0x80
        veor            q6, q6, q13             @ Q2 = QS2 ^ 0x80
    .endif
.endm
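@ The loop filter functions below push q4-q7 (64 bytes), so the first
@ stack-passed argument is found at [sp, #64] inside the function body.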
.macro vp8_v_loop_filter16 name, inner=0, simple=0
function ff_vp8_v_loop_filter16\name\()_neon, export=1
        vpush           {q4-q7}
        sub             r0, r0, r1, lsl #1+!\simple

        @ Load pixels:
    .if !\simple
        ldr             r12, [sp, #64]          @ hev_thresh
        vld1.8          {q0}, [r0,:128], r1     @ P3
        vld1.8          {q1}, [r0,:128], r1     @ P2
    .endif
        vld1.8          {q2}, [r0,:128], r1     @ P1
        vld1.8          {q3}, [r0,:128], r1     @ P0
        vld1.8          {q4}, [r0,:128], r1     @ Q0
        vld1.8          {q5}, [r0,:128], r1     @ Q1
    .if !\simple
        vld1.8          {q6}, [r0,:128], r1     @ Q2
        vld1.8          {q7}, [r0,:128]         @ Q3
        vdup.8          q15, r3                 @ flim_I
    .endif
        vdup.8          q14, r2                 @ flim_E

        vp8_loop_filter inner=\inner, simple=\simple

        @ back up to P2:  dst -= stride * 6
        sub             r0, r0, r1, lsl #2
    .if !\simple
        sub             r0, r0, r1, lsl #1

        @ Store pixels:
        vst1.8          {q1}, [r0,:128], r1     @ P2
    .endif
        vst1.8          {q2}, [r0,:128], r1     @ P1
        vst1.8          {q3}, [r0,:128], r1     @ P0
        vst1.8          {q4}, [r0,:128], r1     @ Q0
        vst1.8          {q5}, [r0,:128], r1     @ Q1
    .if !\simple
        vst1.8          {q6}, [r0,:128]         @ Q2
    .endif

        vpop            {q4-q7}
        bx              lr
endfunc
.endm

vp8_v_loop_filter16
vp8_v_loop_filter16 _inner, inner=1
vp8_v_loop_filter16 _simple, simple=1

.macro vp8_v_loop_filter8uv name, inner=0
function ff_vp8_v_loop_filter8uv\name\()_neon, export=1
        vpush           {q4-q7}
        sub             r0, r0, r2, lsl #2
        sub             r1, r1, r2, lsl #2
        ldr             r12, [sp, #64]          @ flim_I

        @ Load pixels:
        vld1.8          {d0}, [r0,:64], r2      @ P3
        vld1.8          {d1}, [r1,:64], r2      @ P3
        vld1.8          {d2}, [r0,:64], r2      @ P2
        vld1.8          {d3}, [r1,:64], r2      @ P2
        vld1.8          {d4}, [r0,:64], r2      @ P1
        vld1.8          {d5}, [r1,:64], r2      @ P1
        vld1.8          {d6}, [r0,:64], r2      @ P0
        vld1.8          {d7}, [r1,:64], r2      @ P0
        vld1.8          {d8}, [r0,:64], r2      @ Q0
        vld1.8          {d9}, [r1,:64], r2      @ Q0
        vld1.8          {d10}, [r0,:64], r2     @ Q1
        vld1.8          {d11}, [r1,:64], r2     @ Q1
        vld1.8          {d12}, [r0,:64], r2     @ Q2
        vld1.8          {d13}, [r1,:64], r2     @ Q2
        vld1.8          {d14}, [r0,:64]         @ Q3
        vld1.8          {d15}, [r1,:64]         @ Q3

        vdup.8          q14, r3                 @ flim_E
        vdup.8          q15, r12                @ flim_I
        ldr             r12, [sp, #68]          @ hev_thresh

        vp8_loop_filter inner=\inner

        @ back up to P2:  u,v -= stride * 6
        sub             r0, r0, r2, lsl #2
        sub             r1, r1, r2, lsl #2
        sub             r0, r0, r2, lsl #1
        sub             r1, r1, r2, lsl #1

        @ Store pixels:
        vst1.8          {d2}, [r0,:64], r2      @ P2
        vst1.8          {d3}, [r1,:64], r2      @ P2
        vst1.8          {d4}, [r0,:64], r2      @ P1
        vst1.8          {d5}, [r1,:64], r2      @ P1
        vst1.8          {d6}, [r0,:64], r2      @ P0
        vst1.8          {d7}, [r1,:64], r2      @ P0
        vst1.8          {d8}, [r0,:64], r2      @ Q0
        vst1.8          {d9}, [r1,:64], r2      @ Q0
        vst1.8          {d10}, [r0,:64], r2     @ Q1
        vst1.8          {d11}, [r1,:64], r2     @ Q1
        vst1.8          {d12}, [r0,:64]         @ Q2
        vst1.8          {d13}, [r1,:64]         @ Q2

        vpop            {q4-q7}
        bx              lr
endfunc
.endm

vp8_v_loop_filter8uv
vp8_v_loop_filter8uv _inner, inner=1
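@ Horizontal filters work on vertical edges: load 8 pixels straddling the
@ edge from each of 16 rows, transpose so each q register holds one pixel
@ column (P3..Q3), run the same filter, then transpose back before storing.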
.macro vp8_h_loop_filter16 name, inner=0, simple=0
function ff_vp8_h_loop_filter16\name\()_neon, export=1
        vpush           {q4-q7}
        sub             r0, r0, #4
    .if !\simple
        ldr             r12, [sp, #64]          @ hev_thresh
    .endif

        @ Load pixels:
        vld1.8          {d0}, [r0], r1          @ load first 8-line src data
        vld1.8          {d2}, [r0], r1
        vld1.8          {d4}, [r0], r1
        vld1.8          {d6}, [r0], r1
        vld1.8          {d8}, [r0], r1
        vld1.8          {d10}, [r0], r1
        vld1.8          {d12}, [r0], r1
        vld1.8          {d14}, [r0], r1
        vld1.8          {d1}, [r0], r1          @ load second 8-line src data
        vld1.8          {d3}, [r0], r1
        vld1.8          {d5}, [r0], r1
        vld1.8          {d7}, [r0], r1
        vld1.8          {d9}, [r0], r1
        vld1.8          {d11}, [r0], r1
        vld1.8          {d13}, [r0], r1
        vld1.8          {d15}, [r0], r1

        transpose_8x8   q0, q1, q2, q3, q4, q5, q6, q7

        vdup.8          q14, r2                 @ flim_E
    .if !\simple
        vdup.8          q15, r3                 @ flim_I
    .endif

        vp8_loop_filter inner=\inner, simple=\simple

        sub             r0, r0, r1, lsl #4      @ backup 16 rows

        transpose_8x8   q0, q1, q2, q3, q4, q5, q6, q7

        @ Store pixels:
        vst1.8          {d0}, [r0], r1
        vst1.8          {d2}, [r0], r1
        vst1.8          {d4}, [r0], r1
        vst1.8          {d6}, [r0], r1
        vst1.8          {d8}, [r0], r1
        vst1.8          {d10}, [r0], r1
        vst1.8          {d12}, [r0], r1
        vst1.8          {d14}, [r0], r1
        vst1.8          {d1}, [r0], r1
        vst1.8          {d3}, [r0], r1
        vst1.8          {d5}, [r0], r1
        vst1.8          {d7}, [r0], r1
        vst1.8          {d9}, [r0], r1
        vst1.8          {d11}, [r0], r1
        vst1.8          {d13}, [r0], r1
        vst1.8          {d15}, [r0]

        vpop            {q4-q7}
        bx              lr
endfunc
.endm

vp8_h_loop_filter16
vp8_h_loop_filter16 _inner, inner=1
vp8_h_loop_filter16 _simple, simple=1

.macro vp8_h_loop_filter8uv name, inner=0
function ff_vp8_h_loop_filter8uv\name\()_neon, export=1
        vpush           {q4-q7}
        sub             r0, r0, #4
        sub             r1, r1, #4
        ldr             r12, [sp, #64]          @ flim_I

        @ Load pixels:
        vld1.8          {d0}, [r0], r2          @ load u
        vld1.8          {d1}, [r1], r2          @ load v
        vld1.8          {d2}, [r0], r2
        vld1.8          {d3}, [r1], r2
        vld1.8          {d4}, [r0], r2
        vld1.8          {d5}, [r1], r2
        vld1.8          {d6}, [r0], r2
        vld1.8          {d7}, [r1], r2
        vld1.8          {d8}, [r0], r2
        vld1.8          {d9}, [r1], r2
        vld1.8          {d10}, [r0], r2
        vld1.8          {d11}, [r1], r2
        vld1.8          {d12}, [r0], r2
        vld1.8          {d13}, [r1], r2
        vld1.8          {d14}, [r0], r2
        vld1.8          {d15}, [r1], r2

        transpose_8x8   q0, q1, q2, q3, q4, q5, q6, q7

        vdup.8          q14, r3                 @ flim_E
        vdup.8          q15, r12                @ flim_I
        ldr             r12, [sp, #68]          @ hev_thresh

        vp8_loop_filter inner=\inner

        sub             r0, r0, r2, lsl #3      @ backup u 8 rows
        sub             r1, r1, r2, lsl #3      @ backup v 8 rows

        transpose_8x8   q0, q1, q2, q3, q4, q5, q6, q7

        @ Store pixels:
        vst1.8          {d0}, [r0], r2
        vst1.8          {d1}, [r1], r2
        vst1.8          {d2}, [r0], r2
        vst1.8          {d3}, [r1], r2
        vst1.8          {d4}, [r0], r2
        vst1.8          {d5}, [r1], r2
        vst1.8          {d6}, [r0], r2
        vst1.8          {d7}, [r1], r2
        vst1.8          {d8}, [r0], r2
        vst1.8          {d9}, [r1], r2
        vst1.8          {d10}, [r0], r2
        vst1.8          {d11}, [r1], r2
        vst1.8          {d12}, [r0], r2
        vst1.8          {d13}, [r1], r2
        vst1.8          {d14}, [r0]
        vst1.8          {d15}, [r1]

        vpop            {q4-q7}
        bx              lr
endfunc
.endm

vp8_h_loop_filter8uv
vp8_h_loop_filter8uv _inner, inner=1

function ff_put_vp8_pixels16_neon, export=1
        ldr             r12, [sp, #0]           @ h
1:
        subs            r12, r12, #4
        vld1.8          {q0}, [r2], r3
        vld1.8          {q1}, [r2], r3
        vld1.8          {q2}, [r2], r3
        vld1.8          {q3}, [r2], r3
        vst1.8          {q0}, [r0,:128], r1
        vst1.8          {q1}, [r0,:128], r1
        vst1.8          {q2}, [r0,:128], r1
        vst1.8          {q3}, [r0,:128], r1
        bgt             1b
        bx              lr
endfunc

function ff_put_vp8_pixels8_neon, export=1
        ldr             r12, [sp, #0]           @ h
1:
        subs            r12, r12, #4
        vld1.8          {d0}, [r2], r3
        vld1.8          {d1}, [r2], r3
        vld1.8          {d2}, [r2], r3
        vld1.8          {d3}, [r2], r3
        vst1.8          {d0}, [r0,:64], r1
        vst1.8          {d1}, [r0,:64], r1
        vst1.8          {d2}, [r0,:64], r1
        vst1.8          {d3}, [r0,:64], r1
        bgt             1b
        bx              lr
endfunc

/* 4/6-tap 8th-pel MC */
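@ The subpel_filters table at the end of this file holds the six tap
@ magnitudes; the sign pattern (+, -, +, +, -, +) is baked into the
@ vmla/vmls sequences below.  Two accumulators are combined with a
@ saturating add, and vqrshrun performs the final rounded shift by 7.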
.macro vp8_epel8_h6 d, a, b
        vext.8          d27, \a, \b, #1
        vmovl.u8        q8, \a
        vext.8          d28, \a, \b, #2
        vmovl.u8        q9, d27
        vext.8          d29, \a, \b, #3
        vmovl.u8        q10, d28
        vext.8          d30, \a, \b, #4
        vmovl.u8        q11, d29
        vext.8          d31, \a, \b, #5
        vmovl.u8        q12, d30
        vmul.u16        q10, q10, d0[2]
        vmovl.u8        q13, d31
        vmul.u16        q11, q11, d0[3]
        vmls.u16        q10, q9, d0[1]
        vmls.u16        q11, q12, d1[0]
        vmla.u16        q10, q8, d0[0]
        vmla.u16        q11, q13, d1[1]
        vqadd.s16       q11, q10, q11
        vqrshrun.s16    \d, q11, #7
.endm

.macro vp8_epel16_h6 d0, d1, s0, s1, s2, q0, q1
        vext.8          q14, \q0, \q1, #3
        vext.8          q15, \q0, \q1, #4
        vmovl.u8        q11, d28
        vmovl.u8        q14, d29
        vext.8          q3, \q0, \q1, #2
        vmovl.u8        q12, d30
        vmovl.u8        q15, d31
        vext.8          q8, \q0, \q1, #1
        vmovl.u8        q10, d6
        vmovl.u8        q3, d7
        vext.8          q2, \q0, \q1, #5
        vmovl.u8        q13, d4
        vmovl.u8        q2, d5
        vmovl.u8        q9, d16
        vmovl.u8        q8, d17
        vmul.u16        q11, q11, d0[3]
        vmul.u16        q10, q10, d0[2]
        vmul.u16        q3, q3, d0[2]
        vmul.u16        q14, q14, d0[3]
        vmls.u16        q11, q12, d1[0]
        vmovl.u8        q12, \s0
        vmovl.u8        q1, \s1
        vmls.u16        q10, q9, d0[1]
        vmls.u16        q3, q8, d0[1]
        vmls.u16        q14, q15, d1[0]
        vmla.u16        q10, q12, d0[0]
        vmla.u16        q11, q13, d1[1]
        vmla.u16        q3, q1, d0[0]
        vmla.u16        q14, q2, d1[1]
        vqadd.s16       q11, q10, q11
        vqadd.s16       q14, q3, q14
        vqrshrun.s16    \d0, q11, #7
        vqrshrun.s16    \d1, q14, #7
.endm

.macro vp8_epel8_v6 d0, s0, s1, s2, s3, s4, s5
        vmovl.u8        q10, \s2
        vmovl.u8        q11, \s3
        vmovl.u8        q9, \s1
        vmovl.u8        q12, \s4
        vmovl.u8        q8, \s0
        vmovl.u8        q13, \s5
        vmul.u16        q10, q10, d0[2]
        vmul.u16        q11, q11, d0[3]
        vmls.u16        q10, q9, d0[1]
        vmls.u16        q11, q12, d1[0]
        vmla.u16        q10, q8, d0[0]
        vmla.u16        q11, q13, d1[1]
        vqadd.s16       q11, q10, q11
        vqrshrun.s16    \d0, q11, #7
.endm

.macro vp8_epel8_v6_y2 d0, d1, s0, s1, s2, s3, s4, s5, s6
        vmovl.u8        q10, \s0
        vmovl.u8        q11, \s3
        vmovl.u8        q14, \s6
        vmovl.u8        q9, \s1
        vmovl.u8        q12, \s4
        vmovl.u8        q8, \s2
        vmovl.u8        q13, \s5
        vmul.u16        q10, q10, d0[0]
        vmul.u16        q15, q11, d0[3]
        vmul.u16        q11, q11, d0[2]
        vmul.u16        q14, q14, d1[1]
        vmls.u16        q10, q9, d0[1]
        vmls.u16        q15, q12, d1[0]
        vmls.u16        q11, q8, d0[1]
        vmls.u16        q14, q13, d1[0]
        vmla.u16        q10, q8, d0[2]
        vmla.u16        q15, q13, d1[1]
        vmla.u16        q11, q9, d0[0]
        vmla.u16        q14, q12, d0[3]
        vqadd.s16       q15, q10, q15
        vqadd.s16       q14, q11, q14
        vqrshrun.s16    \d0, q15, #7
        vqrshrun.s16    \d1, q14, #7
.endm

.macro vp8_epel8_h4 d, a, b
        vext.8          d28, \a, \b, #1
        vmovl.u8        q9, \a
        vext.8          d29, \a, \b, #2
        vmovl.u8        q10, d28
        vext.8          d30, \a, \b, #3
        vmovl.u8        q11, d29
        vmovl.u8        q12, d30
        vmul.u16        q10, q10, d0[2]
        vmul.u16        q11, q11, d0[3]
        vmls.u16        q10, q9, d0[1]
        vmls.u16        q11, q12, d1[0]
        vqadd.s16       q11, q10, q11
        vqrshrun.s16    \d, q11, #7
.endm

.macro vp8_epel8_v4_y2 d0, d1, s0, s1, s2, s3, s4
        vmovl.u8        q9, \s0
        vmovl.u8        q10, \s1
        vmovl.u8        q11, \s2
        vmovl.u8        q12, \s3
        vmovl.u8        q13, \s4
        vmul.u16        q8, q10, d0[2]
        vmul.u16        q14, q11, d0[3]
        vmul.u16        q11, q11, d0[2]
        vmul.u16        q15, q12, d0[3]
        vmls.u16        q8, q9, d0[1]
        vmls.u16        q14, q12, d1[0]
        vmls.u16        q11, q10, d0[1]
        vmls.u16        q15, q13, d1[0]
        vqadd.s16       q8, q8, q14
        vqadd.s16       q11, q11, q15
        vqrshrun.s16    \d0, q8, #7
        vqrshrun.s16    \d1, q11, #7
.endm

function ff_put_vp8_epel16_v6_neon, export=1
        sub             r2, r2, r3, lsl #1
        push            {r4,lr}
        vpush           {d8-d15}

        ldr             r4, [sp, #80]           @ my
        movrel          lr, subpel_filters-16
        ldr             r12, [sp, #72]          @ h
        add             r4, lr, r4, lsl #4
        vld1.16         {q0}, [r4,:128]
1:
        vld1.8          {d2-d3}, [r2], r3
        vld1.8          {d4-d5}, [r2], r3
        vld1.8          {d6-d7}, [r2], r3
        vld1.8          {d8-d9}, [r2], r3
        vld1.8          {d10-d11}, [r2], r3
        vld1.8          {d12-d13}, [r2], r3
        vld1.8          {d14-d15}, [r2]
        sub             r2, r2, r3, lsl #2

        vp8_epel8_v6_y2 d2, d4, d2, d4, d6, d8, d10, d12, d14
        vp8_epel8_v6_y2 d3, d5, d3, d5, d7, d9, d11, d13, d15

        vst1.8          {d2-d3}, [r0,:128], r1
        vst1.8          {d4-d5}, [r0,:128], r1
        subs            r12, r12, #2
        bne             1b

        vpop            {d8-d15}
        pop             {r4,pc}
endfunc

function ff_put_vp8_epel16_h6_neon, export=1
        sub             r2, r2, #2
        push            {r4,lr}

        ldr             r4, [sp, #12]           @ mx
        movrel          lr, subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4, lr, r4, lsl #4
        vld1.16         {q0}, [r4,:128]
1:
        vld1.8          {d2-d4}, [r2], r3

        vp8_epel16_h6   d2, d3, d2, d3, d4, q1, q2

        vst1.8          {d2-d3}, [r0,:128], r1
        subs            r12, r12, #1
        bne             1b

        pop             {r4,pc}
endfunc
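@ Combined 6-tap H and 6-tap V filtering is done in two passes: the
@ horizontal pass filters h+5 rows into a 16-byte-aligned scratch buffer
@ on the stack (at most (16+5)*16 = 336 bytes), and the vertical pass then
@ filters columns out of it.  After the "sub sp" the stack-passed
@ arguments are found #336+16 further up.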
function ff_put_vp8_epel16_h6v6_neon, export=1
        sub             r2, r2, r3, lsl #1
        sub             r2, r2, #2
        push            {r4,lr}
        vpush           {d8-d9}

        @ first pass (horizontal):
        ldr             r4, [sp, #28]           @ mx
        movrel          lr, subpel_filters-16
        ldr             r12, [sp, #24]          @ h
        add             r4, lr, r4, lsl #4
        sub             sp, sp, #336+16
        vld1.16         {q0}, [r4,:128]
        add             lr, sp, #15
        add             r12, r12, #5
        bic             lr, lr, #15
1:
        vld1.8          {d2,d3,d4}, [r2], r3

        vp8_epel16_h6   d2, d3, d2, d3, d4, q1, q2

        vst1.8          {d2-d3}, [lr,:128]!
        subs            r12, r12, #1
        bne             1b

        @ second pass (vertical):
        ldr             r4, [sp, #336+16+32]    @ my
        movrel          lr, subpel_filters-16
        ldr             r12, [sp, #336+16+24]   @ h
        add             r4, lr, r4, lsl #4
        add             lr, sp, #15
        vld1.16         {q0}, [r4,:128]
        bic             lr, lr, #15
2:
        vld1.8          {d2-d5}, [lr,:128]!
        vld1.8          {d6-d9}, [lr,:128]!
        vld1.8          {d28-d31}, [lr,:128]
        sub             lr, lr, #48

        vp8_epel8_v6    d2, d2, d4, d6, d8, d28, d30
        vp8_epel8_v6    d3, d3, d5, d7, d9, d29, d31

        vst1.8          {d2-d3}, [r0,:128], r1
        subs            r12, r12, #1
        bne             2b

        add             sp, sp, #336+16
        vpop            {d8-d9}
        pop             {r4,pc}
endfunc

function ff_put_vp8_epel8_v6_neon, export=1
        sub             r2, r2, r3, lsl #1
        push            {r4,lr}

        ldr             r4, [sp, #16]           @ my
        movrel          lr, subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4, lr, r4, lsl #4
        vld1.16         {q0}, [r4,:128]
1:
        vld1.8          {d2}, [r2], r3
        vld1.8          {d3}, [r2], r3
        vld1.8          {d4}, [r2], r3
        vld1.8          {d5}, [r2], r3
        vld1.8          {d6}, [r2], r3
        vld1.8          {d7}, [r2], r3
        vld1.8          {d28}, [r2]

        sub             r2, r2, r3, lsl #2

        vp8_epel8_v6_y2 d2, d3, d2, d3, d4, d5, d6, d7, d28

        vst1.8          {d2}, [r0,:64], r1
        vst1.8          {d3}, [r0,:64], r1
        subs            r12, r12, #2
        bne             1b

        pop             {r4,pc}
endfunc

function ff_put_vp8_epel8_h6_neon, export=1
        sub             r2, r2, #2
        push            {r4,lr}

        ldr             r4, [sp, #12]           @ mx
        movrel          lr, subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4, lr, r4, lsl #4
        vld1.16         {q0}, [r4,:128]
1:
        vld1.8          {d2,d3}, [r2], r3

        vp8_epel8_h6    d2, d2, d3

        vst1.8          {d2}, [r0,:64], r1
        subs            r12, r12, #1
        bne             1b

        pop             {r4,pc}
endfunc
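@ 8-pixel-wide version of the same two-pass scheme; the scratch buffer
@ holds h+5 rows of 8 bytes (at most (16+5)*8 = 168), and the vertical
@ pass emits two output rows per iteration.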
function ff_put_vp8_epel8_h6v6_neon, export=1
        sub             r2, r2, r3, lsl #1
        sub             r2, r2, #2
        push            {r4,lr}

        @ first pass (horizontal):
        ldr             r4, [sp, #12]           @ mx
        movrel          lr, subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4, lr, r4, lsl #4
        sub             sp, sp, #168+16
        vld1.16         {q0}, [r4,:128]
        add             lr, sp, #15
        add             r12, r12, #5
        bic             lr, lr, #15
1:
        vld1.8          {d2,d3}, [r2], r3

        vp8_epel8_h6    d2, d2, d3

        vst1.8          {d2}, [lr,:64]!
        subs            r12, r12, #1
        bne             1b

        @ second pass (vertical):
        ldr             r4, [sp, #168+16+16]    @ my
        movrel          lr, subpel_filters-16
        ldr             r12, [sp, #168+16+8]    @ h
        add             r4, lr, r4, lsl #4
        add             lr, sp, #15
        vld1.16         {q0}, [r4,:128]
        bic             lr, lr, #15
2:
        vld1.8          {d2-d5}, [lr,:128]!
        vld1.8          {d6-d7}, [lr,:128]!
        vld1.8          {d30}, [lr,:64]
        sub             lr, lr, #32

        vp8_epel8_v6_y2 d2, d3, d2, d3, d4, d5, d6, d7, d30

        vst1.8          {d2}, [r0,:64], r1
        vst1.8          {d3}, [r0,:64], r1
        subs            r12, r12, #2
        bne             2b

        add             sp, sp, #168+16
        pop             {r4,pc}
endfunc

function ff_put_vp8_epel8_v4_neon, export=1
        sub             r2, r2, r3
        push            {r4,lr}

        ldr             r4, [sp, #16]           @ my
        movrel          lr, subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4, lr, r4, lsl #4
        vld1.16         {q0}, [r4,:128]
1:
        vld1.8          {d2}, [r2], r3
        vld1.8          {d3}, [r2], r3
        vld1.8          {d4}, [r2], r3
        vld1.8          {d5}, [r2], r3
        vld1.8          {d6}, [r2]
        sub             r2, r2, r3, lsl #1

        vp8_epel8_v4_y2 d2, d3, d2, d3, d4, d5, d6

        vst1.8          {d2}, [r0,:64], r1
        vst1.8          {d3}, [r0,:64], r1
        subs            r12, r12, #2
        bne             1b

        pop             {r4,pc}
endfunc

function ff_put_vp8_epel8_h4_neon, export=1
        sub             r2, r2, #1
        push            {r4,lr}

        ldr             r4, [sp, #12]           @ mx
        movrel          lr, subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4, lr, r4, lsl #4
        vld1.16         {q0}, [r4,:128]
1:
        vld1.8          {d2,d3}, [r2], r3

        vp8_epel8_h4    d2, d2, d3

        vst1.8          {d2}, [r0,:64], r1
        subs            r12, r12, #1
        bne             1b

        pop             {r4,pc}
endfunc
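@ With a 4-tap vertical filter only h+3 source rows are needed (one above,
@ two below), hence "add r12, r12, #3" in the first passes below.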
function ff_put_vp8_epel8_h4v4_neon, export=1
        sub             r2, r2, r3
        sub             r2, r2, #1
        push            {r4,lr}

        @ first pass (horizontal):
        ldr             r4, [sp, #12]           @ mx
        movrel          lr, subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4, lr, r4, lsl #4
        sub             sp, sp, #168+16
        vld1.16         {q0}, [r4,:128]
        add             lr, sp, #15
        add             r12, r12, #3
        bic             lr, lr, #15
1:
        vld1.8          {d2,d3}, [r2], r3

        vp8_epel8_h4    d2, d2, d3

        vst1.8          {d2}, [lr,:64]!
        subs            r12, r12, #1
        bne             1b

        @ second pass (vertical):
        ldr             r4, [sp, #168+16+16]    @ my
        movrel          lr, subpel_filters-16
        ldr             r12, [sp, #168+16+8]    @ h
        add             r4, lr, r4, lsl #4
        add             lr, sp, #15
        vld1.16         {q0}, [r4,:128]
        bic             lr, lr, #15
2:
        vld1.8          {d2-d5}, [lr,:128]!
        vld1.8          {d6}, [lr,:64]
        sub             lr, lr, #16

        vp8_epel8_v4_y2 d2, d3, d2, d3, d4, d5, d6

        vst1.8          {d2}, [r0,:64], r1
        vst1.8          {d3}, [r0,:64], r1
        subs            r12, r12, #2
        bne             2b

        add             sp, sp, #168+16
        pop             {r4,pc}
endfunc

function ff_put_vp8_epel8_h6v4_neon, export=1
        sub             r2, r2, r3
        sub             r2, r2, #2
        push            {r4,lr}

        @ first pass (horizontal):
        ldr             r4, [sp, #12]           @ mx
        movrel          lr, subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4, lr, r4, lsl #4
        sub             sp, sp, #168+16
        vld1.16         {q0}, [r4,:128]
        add             lr, sp, #15
        add             r12, r12, #3
        bic             lr, lr, #15
1:
        vld1.8          {d2,d3}, [r2], r3

        vp8_epel8_h6    d2, d2, d3

        vst1.8          {d2}, [lr,:64]!
        subs            r12, r12, #1
        bne             1b

        @ second pass (vertical):
        ldr             r4, [sp, #168+16+16]    @ my
        movrel          lr, subpel_filters-16
        ldr             r12, [sp, #168+16+8]    @ h
        add             r4, lr, r4, lsl #4
        add             lr, sp, #15
        vld1.16         {q0}, [r4,:128]
        bic             lr, lr, #15
2:
        vld1.8          {d2-d5}, [lr,:128]!
        vld1.8          {d6}, [lr,:64]
        sub             lr, lr, #16

        vp8_epel8_v4_y2 d2, d3, d2, d3, d4, d5, d6

        vst1.8          {d2}, [r0,:64], r1
        vst1.8          {d3}, [r0,:64], r1
        subs            r12, r12, #2
        bne             2b

        add             sp, sp, #168+16
        pop             {r4,pc}
endfunc

function ff_put_vp8_epel8_h4v6_neon, export=1
        sub             r2, r2, r3, lsl #1
        sub             r2, r2, #1
        push            {r4,lr}

        @ first pass (horizontal):
        ldr             r4, [sp, #12]           @ mx
        movrel          lr, subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4, lr, r4, lsl #4
        sub             sp, sp, #168+16
        vld1.16         {q0}, [r4,:128]
        add             lr, sp, #15
        add             r12, r12, #5
        bic             lr, lr, #15
1:
        vld1.8          {d2,d3}, [r2], r3

        vp8_epel8_h4    d2, d2, d3

        vst1.8          {d2}, [lr,:64]!
        subs            r12, r12, #1
        bne             1b

        @ second pass (vertical):
        ldr             r4, [sp, #168+16+16]    @ my
        movrel          lr, subpel_filters-16
        ldr             r12, [sp, #168+16+8]    @ h
        add             r4, lr, r4, lsl #4
        add             lr, sp, #15
        vld1.16         {q0}, [r4,:128]
        bic             lr, lr, #15
2:
        vld1.8          {d2-d5}, [lr,:128]!
        vld1.8          {d6-d7}, [lr,:128]!
        vld1.8          {d30}, [lr,:64]
        sub             lr, lr, #32

        vp8_epel8_v6_y2 d2, d3, d2, d3, d4, d5, d6, d7, d30

        vst1.8          {d2}, [r0,:64], r1
        vst1.8          {d3}, [r0,:64], r1
        subs            r12, r12, #2
        bne             2b

        add             sp, sp, #168+16
        pop             {r4,pc}
endfunc

.ltorg
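@ 4-pixel-wide variants: two output rows share each d register (lanes [0]
@ and [1]), letting the 8-wide filter macros produce several rows at once.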
function ff_put_vp8_epel4_v6_neon, export=1
        sub             r2, r2, r3, lsl #1
        push            {r4,lr}

        ldr             r4, [sp, #16]           @ my
        movrel          lr, subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4, lr, r4, lsl #4
        vld1.16         {q0}, [r4,:128]
1:
        vld1.32         {d2[]}, [r2], r3
        vld1.32         {d3[]}, [r2], r3
        vld1.32         {d4[]}, [r2], r3
        vld1.32         {d5[]}, [r2], r3
        vld1.32         {d6[]}, [r2], r3
        vld1.32         {d7[]}, [r2], r3
        vld1.32         {d28[]}, [r2]
        sub             r2, r2, r3, lsl #2
        vld1.32         {d2[1]}, [r2], r3
        vld1.32         {d3[1]}, [r2], r3
        vld1.32         {d4[1]}, [r2], r3
        vld1.32         {d5[1]}, [r2], r3
        vld1.32         {d6[1]}, [r2], r3
        vld1.32         {d7[1]}, [r2], r3
        vld1.32         {d28[1]}, [r2]
        sub             r2, r2, r3, lsl #2

        vp8_epel8_v6_y2 d2, d3, d2, d3, d4, d5, d6, d7, d28

        vst1.32         {d2[0]}, [r0,:32], r1
        vst1.32         {d3[0]}, [r0,:32], r1
        vst1.32         {d2[1]}, [r0,:32], r1
        vst1.32         {d3[1]}, [r0,:32], r1
        subs            r12, r12, #4
        bne             1b

        pop             {r4,pc}
endfunc

function ff_put_vp8_epel4_h6_neon, export=1
        sub             r2, r2, #2
        push            {r4,lr}

        ldr             r4, [sp, #12]           @ mx
        movrel          lr, subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4, lr, r4, lsl #4
        vld1.16         {q0}, [r4,:128]
1:
        vld1.8          {q1}, [r2], r3
        vp8_epel8_h6    d2, d2, d3
        vst1.32         {d2[0]}, [r0,:32], r1
        subs            r12, r12, #1
        bne             1b

        pop             {r4,pc}
endfunc

function ff_put_vp8_epel4_h6v6_neon, export=1
        sub             r2, r2, r3, lsl #1
        sub             r2, r2, #2
        push            {r4,lr}

        ldr             r4, [sp, #12]           @ mx
        movrel          lr, subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4, lr, r4, lsl #4
        sub             sp, sp, #52+16
        vld1.16         {q0}, [r4,:128]
        add             lr, sp, #15
        add             r12, r12, #5
        bic             lr, lr, #15
1:
        vld1.8          {q1}, [r2], r3
        vp8_epel8_h6    d2, d2, d3
        vst1.32         {d2[0]}, [lr,:32]!
        subs            r12, r12, #1
        bne             1b

        ldr             r4, [sp, #52+16+16]     @ my
        movrel          lr, subpel_filters-16
        ldr             r12, [sp, #52+16+8]     @ h
        add             r4, lr, r4, lsl #4
        add             lr, sp, #15
        vld1.16         {q0}, [r4,:128]
        bic             lr, lr, #15
2:
        vld1.8          {d2-d3}, [lr,:128]!
        vld1.8          {d6}, [lr,:64]!
        vld1.32         {d28[]}, [lr,:32]
        sub             lr, lr, #16
        vld1.8          {d4-d5}, [lr]!
        vld1.8          {d7}, [lr,:64]!
        vld1.32         {d28[1]}, [lr,:32]
        sub             lr, lr, #16
        vtrn.32         q1, q2
        vtrn.32         d6, d7
        vp8_epel8_v6_y2 d2, d3, d2, d4, d3, d5, d6, d7, d28
        vst1.32         {d2[0]}, [r0,:32], r1
        vst1.32         {d3[0]}, [r0,:32], r1
        vst1.32         {d2[1]}, [r0,:32], r1
        vst1.32         {d3[1]}, [r0,:32], r1
        subs            r12, r12, #4
        bne             2b

        add             sp, sp, #52+16
        pop             {r4,pc}
endfunc

function ff_put_vp8_epel4_h4v6_neon, export=1
        sub             r2, r2, r3, lsl #1
        sub             r2, r2, #1
        push            {r4,lr}

        ldr             r4, [sp, #12]           @ mx
        movrel          lr, subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4, lr, r4, lsl #4
        sub             sp, sp, #52+16
        vld1.16         {q0}, [r4,:128]
        add             lr, sp, #15
        add             r12, r12, #5
        bic             lr, lr, #15
1:
        vld1.8          {d2}, [r2], r3
        vp8_epel8_h4    d2, d2, d2
        vst1.32         {d2[0]}, [lr,:32]!
        subs            r12, r12, #1
        bne             1b

        ldr             r4, [sp, #52+16+16]     @ my
        movrel          lr, subpel_filters-16
        ldr             r12, [sp, #52+16+8]     @ h
        add             r4, lr, r4, lsl #4
        add             lr, sp, #15
        vld1.16         {q0}, [r4,:128]
        bic             lr, lr, #15
2:
        vld1.8          {d2-d3}, [lr,:128]!
        vld1.8          {d6}, [lr,:64]!
        vld1.32         {d28[]}, [lr,:32]
        sub             lr, lr, #16
        vld1.8          {d4-d5}, [lr]!
        vld1.8          {d7}, [lr,:64]!
        vld1.32         {d28[1]}, [lr,:32]
        sub             lr, lr, #16
        vtrn.32         q1, q2
        vtrn.32         d6, d7
        vp8_epel8_v6_y2 d2, d3, d2, d4, d3, d5, d6, d7, d28
        vst1.32         {d2[0]}, [r0,:32], r1
        vst1.32         {d3[0]}, [r0,:32], r1
        vst1.32         {d2[1]}, [r0,:32], r1
        vst1.32         {d3[1]}, [r0,:32], r1
        subs            r12, r12, #4
        bne             2b

        add             sp, sp, #52+16
        pop             {r4,pc}
endfunc
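@ In the second passes below, overlapping loads and vtrn.32 place rows
@ n..n+4 in lane 0 and rows n+2..n+6 in lane 1 of the source registers,
@ so a single vp8_epel8_v4_y2 call produces four output rows.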
function ff_put_vp8_epel4_h6v4_neon, export=1
        sub             r2, r2, r3
        sub             r2, r2, #2
        push            {r4,lr}

        ldr             r4, [sp, #12]           @ mx
        movrel          lr, subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4, lr, r4, lsl #4
        sub             sp, sp, #44+16
        vld1.16         {q0}, [r4,:128]
        add             lr, sp, #15
        add             r12, r12, #3
        bic             lr, lr, #15
1:
        vld1.8          {q1}, [r2], r3
        vp8_epel8_h6    d2, d2, d3
        vst1.32         {d2[0]}, [lr,:32]!
        subs            r12, r12, #1
        bne             1b

        ldr             r4, [sp, #44+16+16]     @ my
        movrel          lr, subpel_filters-16
        ldr             r12, [sp, #44+16+8]     @ h
        add             r4, lr, r4, lsl #4
        add             lr, sp, #15
        vld1.16         {q0}, [r4,:128]
        bic             lr, lr, #15
2:
        vld1.8          {d2-d3}, [lr,:128]!
        vld1.32         {d6[]}, [lr,:32]
        sub             lr, lr, #8
        vld1.8          {d4-d5}, [lr]!
        vld1.32         {d6[1]}, [lr,:32]
        sub             lr, lr, #8
        vtrn.32         q1, q2
        vp8_epel8_v4_y2 d2, d3, d2, d4, d3, d5, d6
        vst1.32         {d2[0]}, [r0,:32], r1
        vst1.32         {d3[0]}, [r0,:32], r1
        vst1.32         {d2[1]}, [r0,:32], r1
        vst1.32         {d3[1]}, [r0,:32], r1
        subs            r12, r12, #4
        bne             2b

        add             sp, sp, #44+16
        pop             {r4,pc}
endfunc

function ff_put_vp8_epel4_h4_neon, export=1
        sub             r2, r2, #1
        push            {r4,lr}

        ldr             r4, [sp, #12]           @ mx
        movrel          lr, subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4, lr, r4, lsl #4
        vld1.16         {q0}, [r4,:128]
1:
        vld1.8          {d2}, [r2], r3
        vp8_epel8_h4    d2, d2, d2
        vst1.32         {d2[0]}, [r0,:32], r1
        subs            r12, r12, #1
        bne             1b

        pop             {r4,pc}
endfunc

function ff_put_vp8_epel4_v4_neon, export=1
        sub             r2, r2, r3
        push            {r4,lr}

        ldr             r4, [sp, #16]           @ my
        movrel          lr, subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4, lr, r4, lsl #4
        vld1.16         {q0}, [r4,:128]
1:
        vld1.32         {d2[]}, [r2], r3
        vld1.32         {d3[]}, [r2], r3
        vld1.32         {d4[]}, [r2], r3
        vld1.32         {d5[]}, [r2], r3
        vld1.32         {d6[]}, [r2]
        sub             r2, r2, r3, lsl #1
        vld1.32         {d2[1]}, [r2], r3
        vld1.32         {d3[1]}, [r2], r3
        vld1.32         {d4[1]}, [r2], r3
        vld1.32         {d5[1]}, [r2], r3
        vld1.32         {d6[1]}, [r2]
        sub             r2, r2, r3, lsl #1

        vp8_epel8_v4_y2 d2, d3, d2, d3, d4, d5, d6

        vst1.32         {d2[0]}, [r0,:32], r1
        vst1.32         {d3[0]}, [r0,:32], r1
        vst1.32         {d2[1]}, [r0,:32], r1
        vst1.32         {d3[1]}, [r0,:32], r1
        subs            r12, r12, #4
        bne             1b

        pop             {r4,pc}
endfunc
function ff_put_vp8_epel4_h4v4_neon, export=1
        sub             r2, r2, r3
        sub             r2, r2, #1
        push            {r4,lr}

        ldr             r4, [sp, #12]           @ mx
        movrel          lr, subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4, lr, r4, lsl #4
        sub             sp, sp, #44+16
        vld1.16         {q0}, [r4,:128]
        add             lr, sp, #15
        add             r12, r12, #3
        bic             lr, lr, #15
1:
        vld1.8          {d2}, [r2], r3
        vp8_epel8_h4    d2, d2, d3
        vst1.32         {d2[0]}, [lr,:32]!
        subs            r12, r12, #1
        bne             1b

        ldr             r4, [sp, #44+16+16]     @ my
        movrel          lr, subpel_filters-16
        ldr             r12, [sp, #44+16+8]     @ h
        add             r4, lr, r4, lsl #4
        add             lr, sp, #15
        vld1.16         {q0}, [r4,:128]
        bic             lr, lr, #15
2:
        vld1.8          {d2-d3}, [lr,:128]!
        vld1.32         {d6[]}, [lr,:32]
        sub             lr, lr, #8
        vld1.8          {d4-d5}, [lr]!
        vld1.32         {d6[1]}, [lr,:32]
        sub             lr, lr, #8
        vtrn.32         q1, q2
        vp8_epel8_v4_y2 d2, d3, d2, d4, d3, d5, d6
        vst1.32         {d2[0]}, [r0,:32], r1
        vst1.32         {d3[0]}, [r0,:32], r1
        vst1.32         {d2[1]}, [r0,:32], r1
        vst1.32         {d3[1]}, [r0,:32], r1
        subs            r12, r12, #4
        bne             2b

        add             sp, sp, #44+16
        pop             {r4,pc}
endfunc

@ note: worst case sum of all 6-tap filter values * 255 is 0x7f80 so 16 bit
@ arithmetic can be used to apply filters
const subpel_filters, align=4
        .short          0,  6, 123, 12,  1,  0,  0,  0
        .short          2, 11, 108, 36,  8,  1,  0,  0
        .short          0,  9,  93, 50,  6,  0,  0,  0
        .short          3, 16,  77, 77, 16,  3,  0,  0
        .short          0,  6,  50, 93,  9,  0,  0,  0
        .short          1,  8,  36, 108, 11, 2,  0,  0
        .short          0,  1,  12, 123,  6, 0,  0,  0
endconst

/* Bilinear MC */
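@ Bilinear taps: each pixel pair is weighted by mx and 8-mx (or my and
@ 8-my) and the 3-bit fixed-point sum is rounded back down with vrshrn #3.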
function ff_put_vp8_bilin16_h_neon, export=1
        ldr             r12, [sp, #4]           @ mx
        vdup.8          d0, r12
        rsb             r12, r12, #8
        vdup.8          d1, r12
        ldr             r12, [sp]               @ h
1:
        subs            r12, r12, #2
        vld1.8          {d2-d4}, [r2], r3
        vext.8          q2, q1, q2, #1
        vmull.u8        q8, d2, d1
        vmlal.u8        q8, d4, d0
        vld1.8          {d18-d20}, [r2], r3
        vmull.u8        q3, d3, d1
        vmlal.u8        q3, d5, d0
        vext.8          q10, q9, q10, #1
        vmull.u8        q11, d18, d1
        vmlal.u8        q11, d20, d0
        vmull.u8        q12, d19, d1
        vmlal.u8        q12, d21, d0
        vrshrn.u16      d4, q8, #3
        vrshrn.u16      d5, q3, #3
        vrshrn.u16      d6, q11, #3
        vrshrn.u16      d7, q12, #3
        vst1.8          {q2}, [r0,:128], r1
        vst1.8          {q3}, [r0,:128], r1
        bgt             1b

        bx              lr
endfunc

function ff_put_vp8_bilin16_v_neon, export=1
        ldr             r12, [sp, #8]           @ my
        vdup.8          d0, r12
        rsb             r12, r12, #8
        vdup.8          d1, r12
        ldr             r12, [sp]               @ h
        vld1.8          {q1}, [r2], r3
1:
        subs            r12, r12, #2
        vld1.8          {q2}, [r2], r3
        vmull.u8        q3, d2, d1
        vmlal.u8        q3, d4, d0
        vmull.u8        q8, d3, d1
        vmlal.u8        q8, d5, d0
        vld1.8          {q1}, [r2], r3
        vmull.u8        q9, d4, d1
        vmlal.u8        q9, d2, d0
        vmull.u8        q10, d5, d1
        vmlal.u8        q10, d3, d0
        vrshrn.u16      d4, q3, #3
        vrshrn.u16      d5, q8, #3
        vrshrn.u16      d6, q9, #3
        vrshrn.u16      d7, q10, #3
        vst1.8          {q2}, [r0,:128], r1
        vst1.8          {q3}, [r0,:128], r1
        bgt             1b

        bx              lr
endfunc

function ff_put_vp8_bilin16_hv_neon, export=1
        ldr             r12, [sp, #4]           @ mx
        vdup.8          d0, r12
        rsb             r12, r12, #8
        vdup.8          d1, r12
        ldr             r12, [sp, #8]           @ my
        vdup.8          d2, r12
        rsb             r12, r12, #8
        vdup.8          d3, r12
        ldr             r12, [sp]               @ h

        vld1.8          {d4-d6}, [r2], r3
        vext.8          q3, q2, q3, #1
        vmull.u8        q8, d4, d1
        vmlal.u8        q8, d6, d0
        vmull.u8        q9, d5, d1
        vmlal.u8        q9, d7, d0
        vrshrn.u16      d4, q8, #3
        vrshrn.u16      d5, q9, #3
1:
        subs            r12, r12, #2
        vld1.8          {d18-d20}, [r2], r3
        vext.8          q10, q9, q10, #1
        vmull.u8        q11, d18, d1
        vmlal.u8        q11, d20, d0
        vld1.8          {d26-d28}, [r2], r3
        vmull.u8        q12, d19, d1
        vmlal.u8        q12, d21, d0
        vext.8          q14, q13, q14, #1
        vmull.u8        q8, d26, d1
        vmlal.u8        q8, d28, d0
        vmull.u8        q9, d27, d1
        vmlal.u8        q9, d29, d0
        vrshrn.u16      d6, q11, #3
        vrshrn.u16      d7, q12, #3
        vmull.u8        q12, d4, d3
        vmlal.u8        q12, d6, d2
        vmull.u8        q15, d5, d3
        vmlal.u8        q15, d7, d2
        vrshrn.u16      d4, q8, #3
        vrshrn.u16      d5, q9, #3
        vmull.u8        q10, d6, d3
        vmlal.u8        q10, d4, d2
        vmull.u8        q11, d7, d3
        vmlal.u8        q11, d5, d2
        vrshrn.u16      d24, q12, #3
        vrshrn.u16      d25, q15, #3
        vst1.8          {q12}, [r0,:128], r1
        vrshrn.u16      d20, q10, #3
        vrshrn.u16      d21, q11, #3
        vst1.8          {q10}, [r0,:128], r1
        bgt             1b

        bx              lr
endfunc

function ff_put_vp8_bilin8_h_neon, export=1
        ldr             r12, [sp, #4]           @ mx
        vdup.8          d0, r12
        rsb             r12, r12, #8
        vdup.8          d1, r12
        ldr             r12, [sp]               @ h
1:
        subs            r12, r12, #2
        vld1.8          {q1}, [r2], r3
        vext.8          d3, d2, d3, #1
        vmull.u8        q2, d2, d1
        vmlal.u8        q2, d3, d0
        vld1.8          {q3}, [r2], r3
        vext.8          d7, d6, d7, #1
        vmull.u8        q8, d6, d1
        vmlal.u8        q8, d7, d0
        vrshrn.u16      d4, q2, #3
        vrshrn.u16      d16, q8, #3
        vst1.8          {d4}, [r0,:64], r1
        vst1.8          {d16}, [r0,:64], r1
        bgt             1b

        bx              lr
endfunc

function ff_put_vp8_bilin8_v_neon, export=1
        ldr             r12, [sp, #8]           @ my
        vdup.8          d0, r12
        rsb             r12, r12, #8
        vdup.8          d1, r12
        ldr             r12, [sp]               @ h
        vld1.8          {d2}, [r2], r3
1:
        subs            r12, r12, #2
        vld1.8          {d3}, [r2], r3
        vmull.u8        q2, d2, d1
        vmlal.u8        q2, d3, d0
        vld1.8          {d2}, [r2], r3
        vmull.u8        q3, d3, d1
        vmlal.u8        q3, d2, d0
        vrshrn.u16      d4, q2, #3
        vrshrn.u16      d6, q3, #3
        vst1.8          {d4}, [r0,:64], r1
        vst1.8          {d6}, [r0,:64], r1
        bgt             1b

        bx              lr
endfunc

function ff_put_vp8_bilin8_hv_neon, export=1
        ldr             r12, [sp, #4]           @ mx
        vdup.8          d0, r12
        rsb             r12, r12, #8
        vdup.8          d1, r12
        ldr             r12, [sp, #8]           @ my
        vdup.8          d2, r12
        rsb             r12, r12, #8
        vdup.8          d3, r12
        ldr             r12, [sp]               @ h

        vld1.8          {q2}, [r2], r3
        vext.8          d5, d4, d5, #1
        vmull.u8        q9, d4, d1
        vmlal.u8        q9, d5, d0
        vrshrn.u16      d22, q9, #3
1:
        subs            r12, r12, #2
        vld1.8          {q3}, [r2], r3
        vext.8          d7, d6, d7, #1
        vmull.u8        q8, d6, d1
        vmlal.u8        q8, d7, d0
        vld1.8          {q2}, [r2], r3
        vext.8          d5, d4, d5, #1
        vmull.u8        q9, d4, d1
        vmlal.u8        q9, d5, d0
        vrshrn.u16      d16, q8, #3
        vmull.u8        q10, d22, d3
        vmlal.u8        q10, d16, d2
        vrshrn.u16      d22, q9, #3
        vmull.u8        q12, d16, d3
        vmlal.u8        q12, d22, d2
        vrshrn.u16      d20, q10, #3
        vst1.8          {d20}, [r0,:64], r1
        vrshrn.u16      d23, q12, #3
        vst1.8          {d23}, [r0,:64], r1
        bgt             1b

        bx              lr
endfunc

function ff_put_vp8_bilin4_h_neon, export=1
        ldr             r12, [sp, #4]           @ mx
        vdup.8          d0, r12
        rsb             r12, r12, #8
        vdup.8          d1, r12
        ldr             r12, [sp]               @ h
1:
        subs            r12, r12, #2
        vld1.8          {d2}, [r2], r3
        vext.8          d3, d2, d3, #1
        vld1.8          {d6}, [r2], r3
        vext.8          d7, d6, d7, #1
        vtrn.32         q1, q3
        vmull.u8        q2, d2, d1
        vmlal.u8        q2, d3, d0
        vrshrn.u16      d4, q2, #3
        vst1.32         {d4[0]}, [r0,:32], r1
        vst1.32         {d4[1]}, [r0,:32], r1
        bgt             1b

        bx              lr
endfunc

function ff_put_vp8_bilin4_v_neon, export=1
        ldr             r12, [sp, #8]           @ my
        vdup.8          d0, r12
        rsb             r12, r12, #8
        vdup.8          d1, r12
        ldr             r12, [sp]               @ h
        vld1.32         {d2[]}, [r2], r3
1:
        vld1.32         {d3[]}, [r2]
        vld1.32         {d2[1]}, [r2], r3
        vld1.32         {d3[1]}, [r2], r3
        vmull.u8        q2, d2, d1
        vmlal.u8        q2, d3, d0
        vtrn.32         d3, d2
        vrshrn.u16      d4, q2, #3
        vst1.32         {d4[0]}, [r0,:32], r1
        vst1.32         {d4[1]}, [r0,:32], r1
        subs            r12, r12, #2
        bgt             1b

        bx              lr
endfunc
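@ As in the other hv functions, the horizontally filtered row from the
@ previous iteration stays live in d22, so each loop trip only filters
@ two new input rows.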
function ff_put_vp8_bilin4_hv_neon, export=1
        ldr             r12, [sp, #4]           @ mx
        vdup.8          d0, r12
        rsb             r12, r12, #8
        vdup.8          d1, r12
        ldr             r12, [sp, #8]           @ my
        vdup.8          d2, r12
        rsb             r12, r12, #8
        vdup.8          d3, r12
        ldr             r12, [sp]               @ h

        vld1.8          {d4}, [r2], r3
        vext.8          d5, d4, d4, #1
        vmull.u8        q9, d4, d1
        vmlal.u8        q9, d5, d0
        vrshrn.u16      d22, q9, #3
1:
        subs            r12, r12, #2
        vld1.8          {d6}, [r2], r3
        vext.8          d7, d6, d6, #1
        vld1.8          {d4}, [r2], r3
        vext.8          d5, d4, d4, #1
        vtrn.32         q3, q2
        vmull.u8        q8, d6, d1
        vmlal.u8        q8, d7, d0
        vrshrn.u16      d16, q8, #3
        vmull.u8        q10, d16, d2
        vtrn.32         d22, d16
        vmlal.u8        q10, d22, d3
        vrev64.32       d22, d16
        vrshrn.u16      d20, q10, #3
        vst1.32         {d20[0]}, [r0,:32], r1
        vst1.32         {d20[1]}, [r0,:32], r1
        bgt             1b

        bx              lr
endfunc