/*
 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of Libav.
 *
 * Libav is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * Libav is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with Libav; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "asm.S"

/* chroma_mc8(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y) */
.macro  h264_chroma_mc8 type, codec=h264
function ff_\type\()_\codec\()_chroma_mc8_neon, export=1
        push            {r4-r7, lr}
        ldrd            r4,  [sp, #20]
  .ifc \type,avg
        mov             lr,  r0                 @ keep dst for the averaging pass
  .endif
        pld             [r1]
        pld             [r1, r2]

  .ifc \codec,rv40
        movrel          r6,  rv40bias
        lsr             r7,  r5,  #1
        add             r6,  r6,  r7,  lsl #3   @ row (y>>1), 4 shorts per row
        lsr             r7,  r4,  #1
        add             r6,  r6,  r7,  lsl #1   @ column (x>>1)
        vld1.16         {d22[],d23[]}, [r6,:16]
  .endif

A       muls            r7,  r4,  r5
T       mul             r7,  r4,  r5
T       cmp             r7,  #0
        rsb             r6,  r7,  r5,  lsl #3
        rsb             r12, r7,  r4,  lsl #3
        sub             r4,  r7,  r4,  lsl #3
        sub             r4,  r4,  r5,  lsl #3
        add             r4,  r4,  #64           @ r4 = (8-x)(8-y), r12 = x(8-y), r6 = (8-x)y, r7 = xy

        beq             2f                      @ x*y == 0: 1-D filter or copy

        add             r5,  r1,  r2

        vdup.8          d0,  r4
        lsl             r4,  r2,  #1
        vdup.8          d1,  r12
        vld1.8          {d4, d5}, [r1], r4
        vdup.8          d2,  r6
        vld1.8          {d6, d7}, [r5], r4
        vdup.8          d3,  r7

        vext.8          d5,  d4,  d5,  #1
        vext.8          d7,  d6,  d7,  #1

1:      pld             [r5]
        vmull.u8        q8,  d4,  d0
        vmlal.u8        q8,  d5,  d1
        vld1.8          {d4, d5}, [r1], r4
        vmlal.u8        q8,  d6,  d2
        vext.8          d5,  d4,  d5,  #1
        vmlal.u8        q8,  d7,  d3
        vmull.u8        q9,  d6,  d0
        subs            r3,  r3,  #2
        vmlal.u8        q9,  d7,  d1
        vmlal.u8        q9,  d4,  d2
        vmlal.u8        q9,  d5,  d3
        vld1.8          {d6, d7}, [r5], r4
        pld             [r1]
  .ifc \codec,h264
        vrshrn.u16      d16, q8,  #6
        vrshrn.u16      d17, q9,  #6
  .else
        vadd.u16        q8,  q8,  q11
        vadd.u16        q9,  q9,  q11
        vshrn.u16       d16, q8,  #6
        vshrn.u16       d17, q9,  #6
  .endif
  .ifc \type,avg
        vld1.8          {d20}, [lr,:64], r2
        vld1.8          {d21}, [lr,:64], r2
        vrhadd.u8       q8,  q8,  q10
  .endif
        vext.8          d7,  d6,  d7,  #1
        vst1.8          {d16}, [r0,:64], r2
        vst1.8          {d17}, [r0,:64], r2
        bgt             1b

        pop             {r4-r7, pc}

2:      tst             r6,  r6                 @ here x == 0 or y == 0
        add             r12, r12, r6            @ r12 = 8*(x+y), the second tap weight
        vdup.8          d0,  r4
        vdup.8          d1,  r12

        beq             4f                      @ y == 0: horizontal-only

        add             r5,  r1,  r2
        lsl             r4,  r2,  #1
        vld1.8          {d4}, [r1], r4
        vld1.8          {d6}, [r5], r4

3:      pld             [r5]                    @ vertical-only filter (x == 0)
        vmull.u8        q8,  d4,  d0
        vmlal.u8        q8,  d6,  d1
        vld1.8          {d4}, [r1], r4
        vmull.u8        q9,  d6,  d0
        vmlal.u8        q9,  d4,  d1
        vld1.8          {d6}, [r5], r4
  .ifc \codec,h264
        vrshrn.u16      d16, q8,  #6
        vrshrn.u16      d17, q9,  #6
  .else
        vadd.u16        q8,  q8,  q11
        vadd.u16        q9,  q9,  q11
        vshrn.u16       d16, q8,  #6
        vshrn.u16       d17, q9,  #6
  .endif
  .ifc \type,avg
        vld1.8          {d20}, [lr,:64], r2
        vld1.8          {d21}, [lr,:64], r2
        vrhadd.u8       q8,  q8,  q10
  .endif
        subs            r3,  r3,  #2
        pld             [r1]
        vst1.8          {d16}, [r0,:64], r2
        vst1.8          {d17}, [r0,:64], r2
        bgt             3b

        pop             {r4-r7, pc}

4:      vld1.8          {d4, d5}, [r1], r2      @ horizontal-only filter (y == 0)
        vld1.8          {d6, d7}, [r1], r2
        vext.8          d5,  d4,  d5,  #1
        vext.8          d7,  d6,  d7,  #1

5:      pld             [r1]
        subs            r3,  r3,  #2
        vmull.u8        q8,  d4,  d0
        vmlal.u8        q8,  d5,  d1
        vld1.8          {d4, d5}, [r1], r2
        vmull.u8        q9,  d6,  d0
        vmlal.u8        q9,  d7,  d1
        pld             [r1]
        vext.8          d5,  d4,  d5,  #1
  .ifc \codec,h264
        vrshrn.u16      d16, q8,  #6
        vrshrn.u16      d17, q9,  #6
  .else
        vadd.u16        q8,  q8,  q11
        vadd.u16        q9,  q9,  q11
        vshrn.u16       d16, q8,  #6
        vshrn.u16       d17, q9,  #6
  .endif
  .ifc \type,avg
        vld1.8          {d20}, [lr,:64], r2
        vld1.8          {d21}, [lr,:64], r2
        vrhadd.u8       q8,  q8,  q10
  .endif
        vld1.8          {d6, d7}, [r1], r2
        vext.8          d7,  d6,  d7,  #1
        vst1.8          {d16}, [r0,:64], r2
        vst1.8          {d17}, [r0,:64], r2
        bgt             5b

        pop             {r4-r7, pc}
endfunc
.endm
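
/* For reference, a minimal C sketch of what the mc8 loops above compute:
 * bilinear chroma interpolation with the standard H.264 weights
 * A = (8-x)(8-y), B = x(8-y), C = (8-x)y, D = xy, rounded to nearest via
 * the +32 term that vrshrn folds into the final >>6.  Only the h264
 * "put" case is shown and the function name is hypothetical.
 *
 *   #include <stdint.h>
 *
 *   static void put_h264_chroma_mc8_ref(uint8_t *dst, const uint8_t *src,
 *                                       int stride, int h, int x, int y)
 *   {
 *       const int A = (8 - x) * (8 - y);
 *       const int B =      x  * (8 - y);
 *       const int C = (8 - x) *      y;
 *       const int D =      x  *      y;
 *
 *       for (int i = 0; i < h; i++) {
 *           for (int j = 0; j < 8; j++)
 *               dst[j] = (A * src[j]          + B * src[j + 1] +
 *                         C * src[j + stride] + D * src[j + stride + 1] +
 *                         32) >> 6;
 *           dst += stride;
 *           src += stride;
 *       }
 *   }
 */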
/* chroma_mc4(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y) */
.macro  h264_chroma_mc4 type, codec=h264
function ff_\type\()_\codec\()_chroma_mc4_neon, export=1
        push            {r4-r7, lr}
        ldrd            r4,  [sp, #20]
  .ifc \type,avg
        mov             lr,  r0                 @ keep dst for the averaging pass
  .endif
        pld             [r1]
        pld             [r1, r2]

  .ifc \codec,rv40
        movrel          r6,  rv40bias
        lsr             r7,  r5,  #1
        add             r6,  r6,  r7,  lsl #3
        lsr             r7,  r4,  #1
        add             r6,  r6,  r7,  lsl #1
        vld1.16         {d22[],d23[]}, [r6,:16]
  .endif

A       muls            r7,  r4,  r5
T       mul             r7,  r4,  r5
T       cmp             r7,  #0
        rsb             r6,  r7,  r5,  lsl #3
        rsb             r12, r7,  r4,  lsl #3
        sub             r4,  r7,  r4,  lsl #3
        sub             r4,  r4,  r5,  lsl #3
        add             r4,  r4,  #64

        beq             2f                      @ x*y == 0: 1-D filter or copy

        add             r5,  r1,  r2

        vdup.8          d0,  r4
        lsl             r4,  r2,  #1
        vdup.8          d1,  r12
        vld1.8          {d4}, [r1], r4
        vdup.8          d2,  r6
        vld1.8          {d6}, [r5], r4
        vdup.8          d3,  r7

        vext.8          d5,  d4,  d5,  #1
        vext.8          d7,  d6,  d7,  #1
        vtrn.32         d4,  d5                 @ pack two 4-pixel phases per d-register
        vtrn.32         d6,  d7

        vtrn.32         d0,  d1                 @ interleave the weights to match
        vtrn.32         d2,  d3

1:      pld             [r5]
        vmull.u8        q8,  d4,  d0
        vmlal.u8        q8,  d6,  d2
        vld1.8          {d4}, [r1], r4
        vext.8          d5,  d4,  d5,  #1
        vtrn.32         d4,  d5
        vmull.u8        q9,  d6,  d0
        vmlal.u8        q9,  d4,  d2
        vld1.8          {d6}, [r5], r4
        vadd.i16        d16, d16, d17
        vadd.i16        d17, d18, d19
  .ifc \codec,h264
        vrshrn.u16      d16, q8,  #6
  .else
        vadd.u16        q8,  q8,  q11
        vshrn.u16       d16, q8,  #6
  .endif
        subs            r3,  r3,  #2
        pld             [r1]
  .ifc \type,avg
        vld1.32         {d20[0]}, [lr,:32], r2
        vld1.32         {d20[1]}, [lr,:32], r2
        vrhadd.u8       d16, d16, d20
  .endif
        vext.8          d7,  d6,  d7,  #1
        vtrn.32         d6,  d7
        vst1.32         {d16[0]}, [r0,:32], r2
        vst1.32         {d16[1]}, [r0,:32], r2
        bgt             1b

        pop             {r4-r7, pc}

2:      tst             r6,  r6                 @ here x == 0 or y == 0
        add             r12, r12, r6
        vdup.8          d0,  r4
        vdup.8          d1,  r12
        vtrn.32         d0,  d1

        beq             4f                      @ y == 0: horizontal-only

        vext.32         d1,  d0,  d1,  #1
        add             r5,  r1,  r2
        lsl             r4,  r2,  #1
        vld1.32         {d4[0]}, [r1], r4
        vld1.32         {d4[1]}, [r5], r4

3:      pld             [r5]                    @ vertical-only filter (x == 0)
        vmull.u8        q8,  d4,  d0
        vld1.32         {d4[0]}, [r1], r4
        vmull.u8        q9,  d4,  d1
        vld1.32         {d4[1]}, [r5], r4
        vadd.i16        d16, d16, d17
        vadd.i16        d17, d18, d19
  .ifc \codec,h264
        vrshrn.u16      d16, q8,  #6
  .else
        vadd.u16        q8,  q8,  q11
        vshrn.u16       d16, q8,  #6
  .endif
  .ifc \type,avg
        vld1.32         {d20[0]}, [lr,:32], r2
        vld1.32         {d20[1]}, [lr,:32], r2
        vrhadd.u8       d16, d16, d20
  .endif
        subs            r3,  r3,  #2
        pld             [r1]
        vst1.32         {d16[0]}, [r0,:32], r2
        vst1.32         {d16[1]}, [r0,:32], r2
        bgt             3b

        pop             {r4-r7, pc}

4:      vld1.8          {d4}, [r1], r2          @ horizontal-only filter (y == 0)
        vld1.8          {d6}, [r1], r2
        vext.8          d5,  d4,  d5,  #1
        vext.8          d7,  d6,  d7,  #1
        vtrn.32         d4,  d5
        vtrn.32         d6,  d7

5:      vmull.u8        q8,  d4,  d0
        vmull.u8        q9,  d6,  d0
        subs            r3,  r3,  #2
        vld1.8          {d4}, [r1], r2
        vext.8          d5,  d4,  d5,  #1
        vtrn.32         d4,  d5
        vadd.i16        d16, d16, d17
        vadd.i16        d17, d18, d19
        pld             [r1]
  .ifc \codec,h264
        vrshrn.u16      d16, q8,  #6
  .else
        vadd.u16        q8,  q8,  q11
        vshrn.u16       d16, q8,  #6
  .endif
  .ifc \type,avg
        vld1.32         {d20[0]}, [lr,:32], r2
        vld1.32         {d20[1]}, [lr,:32], r2
        vrhadd.u8       d16, d16, d20
  .endif
        vld1.8          {d6}, [r1], r2
        vext.8          d7,  d6,  d7,  #1
        vtrn.32         d6,  d7
        pld             [r1]
        vst1.32         {d16[0]}, [r0,:32], r2
        vst1.32         {d16[1]}, [r0,:32], r2
        bgt             5b

        pop             {r4-r7, pc}
endfunc
.endm
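
/* The RV40 flavour of these macros rounds differently from H.264: the
 * implicit +32 of vrshrn is replaced by a per-position bias loaded from
 * the rv40bias table at the end of this file, followed by a truncating
 * narrowing shift (vadd + vshrn).  A scalar sketch of that lookup,
 * mirroring the movrel/lsr/add indexing above (helper name hypothetical):
 *
 *   #include <stdint.h>
 *
 *   static const uint16_t rv40bias[4][4] = {
 *       {  0, 16, 32, 16 },
 *       { 32, 28, 32, 28 },
 *       {  0, 32, 16, 32 },
 *       { 32, 28, 32, 28 },
 *   };
 *
 *   static inline uint8_t rv40_round(int sum, int x, int y)
 *   {
 *       return (uint8_t)((sum + rv40bias[y >> 1][x >> 1]) >> 6);
 *   }
 */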
/* chroma_mc2(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y) */
.macro  h264_chroma_mc2 type
function ff_\type\()_h264_chroma_mc2_neon, export=1
        push            {r4-r6, lr}
        ldr             r4,  [sp, #16]
        ldr             lr,  [sp, #20]
        pld             [r1]
        pld             [r1, r2]
        orrs            r5,  r4,  lr
        beq             2f                      @ x == y == 0: copy (put) or average (avg)

        mul             r5,  r4,  lr
        rsb             r6,  r5,  lr,  lsl #3
        rsb             r12, r5,  r4,  lsl #3
        sub             r4,  r5,  r4,  lsl #3
        sub             r4,  r4,  lr,  lsl #3
        add             r4,  r4,  #64
        vdup.8          d0,  r4
        vdup.8          d2,  r12
        vdup.8          d1,  r6
        vdup.8          d3,  r5
        vtrn.16         q0,  q1
1:
        vld1.32         {d4[0]},  [r1], r2
        vld1.32         {d4[1]},  [r1], r2
        vrev64.32       d5,  d4
        vld1.32         {d5[1]},  [r1]
        vext.8          q3,  q2,  q2,  #1
        vtrn.16         q2,  q3
        vmull.u8        q8,  d4,  d0
        vmlal.u8        q8,  d5,  d1
  .ifc \type,avg
        vld1.16         {d18[0]}, [r0,:16], r2
        vld1.16         {d18[1]}, [r0,:16]
        sub             r0,  r0,  r2
  .endif
        vtrn.32         d16, d17
        vadd.i16        d16, d16, d17
        vrshrn.u16      d16, q8,  #6
  .ifc \type,avg
        vrhadd.u8       d16, d16, d18
  .endif
        vst1.16         {d16[0]}, [r0,:16], r2
        vst1.16         {d16[1]}, [r0,:16], r2
        subs            r3,  r3,  #2
        bgt             1b
        pop             {r4-r6, pc}
2:
  .ifc \type,put
        ldrh_post       r5,  r1,  r2
        strh_post       r5,  r0,  r2
        ldrh_post       r6,  r1,  r2
        strh_post       r6,  r0,  r2
  .else
        vld1.16         {d16[0]}, [r1], r2
        vld1.16         {d16[1]}, [r1], r2
        vld1.16         {d18[0]}, [r0,:16], r2
        vld1.16         {d18[1]}, [r0,:16]
        sub             r0,  r0,  r2
        vrhadd.u8       d16, d16, d18
        vst1.16         {d16[0]}, [r0,:16], r2
        vst1.16         {d16[1]}, [r0,:16], r2
  .endif
        subs            r3,  r3,  #2
        bgt             2b
        pop             {r4-r6, pc}
endfunc
.endm

#if CONFIG_H264_DECODER
        h264_chroma_mc8 put
        h264_chroma_mc8 avg
        h264_chroma_mc4 put
        h264_chroma_mc4 avg
        h264_chroma_mc2 put
        h264_chroma_mc2 avg
#endif

#if CONFIG_RV40_DECODER
const   rv40bias
        .short           0, 16, 32, 16
        .short          32, 28, 32, 28
        .short           0, 32, 16, 32
        .short          32, 28, 32, 28
endconst

        h264_chroma_mc8 put, rv40
        h264_chroma_mc8 avg, rv40
        h264_chroma_mc4 put, rv40
        h264_chroma_mc4 avg, rv40
#endif
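
/* All avg_* variants generated above differ from put_* only in the final
 * vrhadd.u8, which round-averages the freshly interpolated block into
 * what dst already holds.  The scalar equivalent per byte (helper name
 * hypothetical):
 *
 *   #include <stdint.h>
 *
 *   static void avg_combine(uint8_t *dst, const uint8_t *pred, int n)
 *   {
 *       for (int i = 0; i < n; i++)
 *           dst[i] = (dst[i] + pred[i] + 1) >> 1;
 *   }
 */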