/*
 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "asm.S"

        preserve8
        .fpu neon

        .text

/*
 * ff_h264_idct_add_neon
 *
 * Runs two butterfly passes over a 4x4 block of 16-bit coefficients,
 * shifts the result right by 6 and adds it, with unsigned saturation,
 * to a 4x4 block of bytes in memory.
 *
 * In:       r0 = destination, four rows of 4 bytes (32-bit aligned rows)
 *           r1 = 16 signed 16-bit coefficients (128-bit aligned)
 *           r2 = byte distance between destination rows
 * Out:      r0 = r0 + 4 * r2
 * Clobbers: r3, q0-q3, q8, q9
 */
function ff_h264_idct_add_neon, export=1
        mov             r3,  #(1<<5)
        vmov.i16        d16, #0
        vmov.16         d16[0],   r3            @ d16 = { 32, 0, 0, 0 }
        vld1.64         {d0-d3},  [r1,:128]
        vadd.i16        d0,  d0,  d16           @ bias coefficient 0 by 32 ahead of the final >> 6

        @ first butterfly pass
        vswp            d1,  d2
        vadd.i16        d4,  d0,  d1
        vshr.s16        q8,  q1,  #1
        vsub.i16        d5,  d0,  d1
        vadd.i16        d6,  d2,  d17
        vsub.i16        d7,  d16, d3
        vadd.i16        q0,  q2,  q3
        vsub.i16        q1,  q2,  q3

        @ exchange lanes between registers before the second pass
        vtrn.16         d0,  d1
        vtrn.16         d3,  d2
        vtrn.32         d0,  d3
        vtrn.32         d1,  d2

        @ second butterfly pass, interleaved with the destination loads;
        @ rows 0..3 go to d18[0], d19[1], d18[1], d19[0], the same lane
        @ order the stores at the end use for d0/d1
        vadd.i16        d4,  d0,  d3
        vld1.32         {d18[0]}, [r0,:32], r2
        vswp            d1,  d3
        vshr.s16        q8,  q1,  #1
        vld1.32         {d19[1]}, [r0,:32], r2
        vsub.i16        d5,  d0,  d1
        vld1.32         {d18[1]}, [r0,:32], r2
        vadd.i16        d6,  d16, d3
        vld1.32         {d19[0]}, [r0,:32], r2
        vsub.i16        d7,  d2,  d17
        sub             r0,  r0,  r2, lsl #2    @ rewind r0 to the first row
        vadd.i16        q0,  q2,  q3
        vsub.i16        q1,  q2,  q3

        vshr.s16        q0,  q0,  #6
        vshr.s16        q1,  q1,  #6

        vaddw.u8        q0,  q0,  d18           @ widen the bytes and add the residual
        vaddw.u8        q1,  q1,  d19

        vqmovun.s16     d0,  q0                 @ saturate back to unsigned bytes
        vqmovun.s16     d1,  q1

        vst1.32         {d0[0]},  [r0,:32], r2
        vst1.32         {d1[1]},  [r0,:32], r2
        vst1.32         {d0[1]},  [r0,:32], r2
        vst1.32         {d1[0]},  [r0,:32], r2

        bx              lr
        .endfunc

/*
 * ff_h264_idct_dc_add_neon
 *
 * Reads a single 16-bit value, applies a rounding shift right by 6 and
 * adds the result, with unsigned saturation, to every byte of a 4x4
 * block in memory.
 *
 * In:       r0 = destination, four rows of 4 bytes (32-bit aligned rows)
 *           r1 = pointer to the 16-bit value (16-bit aligned)
 *           r2 = byte distance between destination rows
 * Out:      r0 = r0 + 4 * r2
 * Clobbers: q0-q2
 */
function ff_h264_idct_dc_add_neon, export=1
        vld1.16         {d2[],d3[]}, [r1,:16]   @ replicate the value into all lanes of q1
        vrshr.s16       q1,  q1,  #6
        vld1.32         {d0[0]},  [r0,:32], r2
        vld1.32         {d0[1]},  [r0,:32], r2
        vaddw.u8        q2,  q1,  d0            @ rows 0 and 1
        vld1.32         {d1[0]},  [r0,:32], r2
        vld1.32         {d1[1]},  [r0,:32], r2
        vaddw.u8        q1,  q1,  d1            @ rows 2 and 3
        vqmovun.s16     d0,  q2
        vqmovun.s16     d1,  q1
        sub             r0,  r0,  r2, lsl #2    @ rewind r0 to the first row
        vst1.32         {d0[0]},  [r0,:32], r2
        vst1.32         {d0[1]},  [r0,:32], r2
        vst1.32         {d1[0]},  [r0,:32], r2
        vst1.32         {d1[1]},  [r0,:32], r2
        bx              lr
        .endfunc

/*
 * ff_h264_idct_add16_neon
 *
 * Walks 16 consecutive 4x4 blocks and, per block, calls one of the two
 * routines above or nothing at all.
 *
 * In:       r0   = destination base pointer
 *           r1   = table of 32-bit offsets added to r0, one per block
 *           r2   = coefficients, 32 bytes per block
 *           r3   = destination stride, handed to the callee in r2
 *           [sp] = byte table indexed through scan8[0..15]
 *                  (presumably per-block non-zero coefficient counts;
 *                  confirm against the C caller)
 *
 * Per block, with n = table[scan8[i]] and c = first coefficient:
 *           n == 0            -> skip
 *           n == 1 and c != 0 -> ff_h264_idct_dc_add_neon
 *           otherwise         -> ff_h264_idct_add_neon
 *
 * ip holds the loop counter across the calls; neither callee writes it.
 */
function ff_h264_idct_add16_neon, export=1
        push            {r4-r8,lr}
        mov             r4,  r0                 @ r4 = destination base
        mov             r5,  r1                 @ r5 = offset table cursor
        mov             r1,  r2                 @ r1 = coefficient cursor
        mov             r2,  r3                 @ r2 = stride
        ldr             r6,  [sp, #24]          @ first stack argument, above the 6 pushed words
        movrel          r7,  scan8
        mov             ip,  #16
1:      ldrb            r8,  [r7], #1           @ r8 = scan8[i]
        ldr             r0,  [r5], #4           @ r0 = offset[i]
        ldrb            r8,  [r6, r8]           @ r8 = table[scan8[i]]
        subs            r8,  r8,  #1
        blt             2f                      @ table entry was 0: nothing to add
        ldrsh           lr,  [r1]               @ lr = first coefficient
        add             r0,  r0,  r4
        movne           lr,  #0                 @ entry was not 1: force the full transform
        cmp             lr,  #0
        adrne           lr,  ff_h264_idct_dc_add_neon
        adreq           lr,  ff_h264_idct_add_neon
        blx             lr
2:      subs            ip,  ip,  #1
        add             r1,  r1,  #32           @ next coefficient block
        bne             1b
        pop             {r4-r8,pc}
        .endfunc

/*
 * ff_h264_idct_add16intra_neon
 *
 * Same register interface and block walk as ff_h264_idct_add16_neon,
 * with a different per-block decision.
 *
 * Per block, with n = table[scan8[i]] and c = first coefficient:
 *           n != 0            -> ff_h264_idct_add_neon
 *           n == 0 and c != 0 -> ff_h264_idct_dc_add_neon
 *           otherwise         -> skip
 */
function ff_h264_idct_add16intra_neon, export=1
        push            {r4-r8,lr}
        mov             r4,  r0                 @ r4 = destination base
        mov             r5,  r1                 @ r5 = offset table cursor
        mov             r1,  r2                 @ r1 = coefficient cursor
        mov             r2,  r3                 @ r2 = stride
        ldr             r6,  [sp, #24]          @ first stack argument, above the 6 pushed words
        movrel          r7,  scan8
        mov             ip,  #16
1:      ldrb            r8,  [r7], #1           @ r8 = scan8[i]
        ldr             r0,  [r5], #4           @ r0 = offset[i]
        ldrb            r8,  [r6, r8]           @ r8 = table[scan8[i]]
        add             r0,  r0,  r4
        cmp             r8,  #0
        ldrsh           r8,  [r1]               @ r8 = first coefficient (flags untouched)
        adrne           lr,  ff_h264_idct_add_neon
        adreq           lr,  ff_h264_idct_dc_add_neon
        cmpeq           r8,  #0                 @ only the DC path depends on the coefficient
        blxne           lr
        subs            ip,  ip,  #1
        add             r1,  r1,  #32           @ next coefficient block
        bne             1b
        pop             {r4-r8,pc}
        .endfunc

/*
 * ff_h264_idct_add8_neon
 *
 * Walks the eight 4x4 blocks that follow the first sixteen (offset table
 * entries 16..23, coefficient blocks 16..23, scan8[16..23]) and applies
 * the same per-block decision as ff_h264_idct_add16intra_neon.
 *
 * In:       r0   = pointer to two destination base pointers
 *           r1   = table of 32-bit offsets, indexed from entry 16
 *           r2   = coefficients, 32 bytes per block, indexed from block 16
 *           r3   = destination stride, handed to the callee in r2
 *           [sp] = byte table indexed through scan8[16..23]
 *
 * The first four blocks (scan8[16..19]) are added to the first base
 * pointer and the last four (scan8[20..23]) to the second one.  The
 * counter therefore runs 7..0, so that bit 2 is set for exactly the
 * first four iterations; a counter running 8..1 would select the
 * pointers in the order 0,1,1,1,1,0,0,0.
 * NOTE(review): the 4/4 split is assumed to match the C implementation
 * (index bit 2 selecting the plane) - confirm against the caller.
 *
 * ip holds the loop counter across the calls; neither callee writes it.
 */
function ff_h264_idct_add8_neon, export=1
        push            {r4-r10,lr}
        ldm             r0,  {r4,r9}            @ r4 = first base, r9 = second base
        add             r5,  r1,  #16*4         @ skip 16 offset entries
        add             r1,  r2,  #16*32        @ skip 16 coefficient blocks
        mov             r2,  r3                 @ r2 = stride
        ldr             r6,  [sp, #32]          @ first stack argument, above the 8 pushed words
        movrel          r7,  scan8+16
        mov             ip,  #7                 @ eight iterations: ip = 7..0
1:      ldrb            r8,  [r7], #1           @ r8 = scan8[i]
        ldr             r0,  [r5], #4           @ r0 = offset[i]
        ldrb            r8,  [r6, r8]           @ r8 = table[scan8[i]]
        tst             ip,  #4
        addne           r0,  r0,  r4            @ ip = 7..4: blocks 16..19, first base
        addeq           r0,  r0,  r9            @ ip = 3..0: blocks 20..23, second base
        cmp             r8,  #0
        ldrsh           r8,  [r1]               @ r8 = first coefficient (flags untouched)
        adrne           lr,  ff_h264_idct_add_neon
        adreq           lr,  ff_h264_idct_dc_add_neon
        cmpeq           r8,  #0                 @ only the DC path depends on the coefficient
        blxne           lr
        subs            ip,  ip,  #1
        add             r1,  r1,  #32           @ next coefficient block
        bge             1b                      @ leave once ip drops below zero
        pop             {r4-r10,pc}
        .endfunc

        .section .rodata
/*
 * Byte indices of the form x + y*8 used to look up the per-block table
 * passed on the stack.  Entries 0..15 serve the 16-block walkers,
 * entries 16..23 serve ff_h264_idct_add8_neon.
 */
scan8:  .byte           4+1*8, 5+1*8, 4+2*8, 5+2*8
        .byte           6+1*8, 7+1*8, 6+2*8, 7+2*8
        .byte           4+3*8, 5+3*8, 4+4*8, 5+4*8
        .byte           6+3*8, 7+3*8, 6+4*8, 7+4*8
        .byte           1+1*8, 2+1*8
        .byte           1+2*8, 2+2*8
        .byte           1+4*8, 2+4*8
        .byte           1+5*8, 2+5*8