/*
 * ARM NEON optimised Float DSP functions
 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "config.h"
#include "asm.S"

/*
 * General notes for this file:
 *  - Every vector load/store carries a :128 alignment qualifier, so all
 *    buffer pointers must be 16-byte aligned.
 *  - Only q0-q3 and q8-q15 are used as vector registers.
 *  - The VFP / NOVFP line prefixes are defined outside this file (asm.S).
 *    NOTE(review): presumably they select the hard-float vs soft-float
 *    argument passing variant; confirm against asm.S.
 */

/*
 * ff_vector_fmul_neon: element-wise product, dst[i] = src0[i] * src1[i].
 *
 * In:  r0 = dst, r1 = src0, r2 = src1, r3 = element count
 * Clobbers: r0-r3, ip, q0-q3, q8-q11, flags
 *
 * The first 8 elements are always loaded and the last 8 products are always
 * stored at label 3, so the count has to be at least 8.  The main loop
 * consumes 16 elements per pass and the tail at label 2 consumes 8;
 * NOTE(review): this implies the count must be a multiple of 8 - confirm
 * with the callers.
 */
function ff_vector_fmul_neon, export=1
        subs            r3,  r3,  #8            @ r3 = elements left after the first 8
        vld1.32         {d0-d3},  [r1,:128]!
        vld1.32         {d4-d7},  [r2,:128]!
        vmul.f32        q8,  q0,  q2
        vmul.f32        q9,  q1,  q3
        beq             3f                      @ exactly 8 elements: just store
        bics            ip,  r3,  #15           @ ip = remaining count rounded down to 16
        beq             2f                      @ fewer than 16 left: tail only
1:      subs            ip,  ip,  #16           @ flags consumed by the bne below
        vld1.32         {d0-d1},  [r1,:128]!
        vld1.32         {d4-d5},  [r2,:128]!
        vmul.f32        q10, q0,  q2
        vld1.32         {d2-d3},  [r1,:128]!
        vld1.32         {d6-d7},  [r2,:128]!
        vmul.f32        q11, q1,  q3
        vst1.32         {d16-d19},[r0,:128]!    @ store products from the previous step
        vld1.32         {d0-d1},  [r1,:128]!
        vld1.32         {d4-d5},  [r2,:128]!
        vmul.f32        q8,  q0,  q2
        vld1.32         {d2-d3},  [r1,:128]!
        vld1.32         {d6-d7},  [r2,:128]!
        vmul.f32        q9,  q1,  q3
        vst1.32         {d20-d23},[r0,:128]!
        bne             1b
        ands            r3,  r3,  #15           @ anything left beyond the 16-blocks?
        beq             3f
2:      vld1.32         {d0-d1},  [r1,:128]!    @ tail: 8 more elements
        vld1.32         {d4-d5},  [r2,:128]!
        vst1.32         {d16-d17},[r0,:128]!
        vmul.f32        q8,  q0,  q2
        vld1.32         {d2-d3},  [r1,:128]!
        vld1.32         {d6-d7},  [r2,:128]!
        vst1.32         {d18-d19},[r0,:128]!
        vmul.f32        q9,  q1,  q3
3:      vst1.32         {d16-d19},[r0,:128]!    @ flush the last 8 pending products
        bx              lr
endfunc

/*
 * ff_vector_fmac_scalar_neon: multiply-accumulate with a scalar,
 * dst[i] += src[i] * mul.
 *
 * In:  r0 = dst, r1 = src
 *      VFP variant:   mul in d0[0], element count in r2
 *      NOVFP variant: mul in r2,    element count in r3
 * Clobbers: r0-r3, r12, q0-q3, q8-q11, q15, flags
 *
 * "acc" is a second pointer into dst used for reading, while r0 is used for
 * writing.  In the NOVFP variant acc aliases r2, which is free once the
 * scalar has been broadcast into q15.
 *
 * Blocks of 16 elements go through the loop at label 1; any remainder is
 * handled 4 elements at a time at label 3.  The remainder loop runs at least
 * once whenever it is entered, so NOTE(review): the count is presumably a
 * non-zero multiple of 4 - confirm with the callers.
 */
function ff_vector_fmac_scalar_neon, export=1
VFP     len .req r2
VFP     acc .req r3
NOVFP   len .req r3
NOVFP   acc .req r2
VFP     vdup.32         q15, d0[0]
NOVFP   vdup.32         q15, r2
        bics            r12, len, #15           @ r12 = count rounded down to 16
        mov             acc, r0                 @ separate read pointer into dst
        beq             3f                      @ fewer than 16: remainder loop only
        vld1.32         {q0},     [r1,:128]!
        vld1.32         {q8},     [acc,:128]!
        vld1.32         {q1},     [r1,:128]!
        vld1.32         {q9},     [acc,:128]!
1:      vmla.f32        q8,  q0,  q15
        vld1.32         {q2},     [r1,:128]!
        vld1.32         {q10},    [acc,:128]!
        vmla.f32        q9,  q1,  q15
        vld1.32         {q3},     [r1,:128]!
        vld1.32         {q11},    [acc,:128]!
        vmla.f32        q10, q2,  q15
        vst1.32         {q8},     [r0,:128]!
        vmla.f32        q11, q3,  q15
        vst1.32         {q9},     [r0,:128]!
        subs            r12, r12, #16
        beq             2f
        vld1.32         {q0},     [r1,:128]!    @ preload first half of the next block
        vld1.32         {q8},     [acc,:128]!
        vst1.32         {q10},    [r0,:128]!
        vld1.32         {q1},     [r1,:128]!
        vld1.32         {q9},     [acc,:128]!
        vst1.32         {q11},    [r0,:128]!
        b               1b
2:      vst1.32         {q10},    [r0,:128]!    @ flush second half of the last block
        vst1.32         {q11},    [r0,:128]!
        ands            len, len, #15           @ remainder after the 16-blocks
        it              eq
        bxeq            lr
3:      vld1.32         {q0},     [r1,:128]!    @ remainder: 4 elements per pass
        vld1.32         {q8},     [acc,:128]!
        vmla.f32        q8,  q0,  q15
        vst1.32         {q8},     [r0,:128]!
        subs            len, len, #4
        bgt             3b
        bx              lr
        .unreq          len
        .unreq          acc                     @ do not leak the alias past this function
endfunc

/*
 * ff_vector_fmul_scalar_neon: scale by a scalar, dst[i] = src[i] * mul.
 *
 * In:  r0 = dst, r1 = src
 *      VFP variant:   mul in d0[0], element count in r2
 *      NOVFP variant: mul in r2,    element count in r3
 * Clobbers: r0-r3, r12, q0-q3, q8, flags
 *
 * Same structure as ff_vector_fmac_scalar_neon: 16 elements per pass in the
 * loop at label 1, then 4 elements per pass at label 3 for the remainder.
 * NOTE(review): the count is presumably a non-zero multiple of 4 - confirm.
 */
function ff_vector_fmul_scalar_neon, export=1
VFP     len .req r2
NOVFP   len .req r3
VFP     vdup.32         q8,  d0[0]
NOVFP   vdup.32         q8,  r2
        bics            r12, len, #15           @ r12 = count rounded down to 16
        beq             3f
        vld1.32         {q0},[r1,:128]!
        vld1.32         {q1},[r1,:128]!
1:      vmul.f32        q0,  q0,  q8
        vld1.32         {q2},[r1,:128]!
        vmul.f32        q1,  q1,  q8
        vld1.32         {q3},[r1,:128]!
        vmul.f32        q2,  q2,  q8
        vst1.32         {q0},[r0,:128]!
        vmul.f32        q3,  q3,  q8
        vst1.32         {q1},[r0,:128]!
        subs            r12, r12, #16
        beq             2f
        vld1.32         {q0},[r1,:128]!         @ preload first half of the next block
        vst1.32         {q2},[r0,:128]!
        vld1.32         {q1},[r1,:128]!
        vst1.32         {q3},[r0,:128]!
        b               1b
2:      vst1.32         {q2},[r0,:128]!         @ flush second half of the last block
        vst1.32         {q3},[r0,:128]!
        ands            len, len, #15           @ remainder after the 16-blocks
        it              eq
        bxeq            lr
3:      vld1.32         {q0},[r1,:128]!         @ remainder: 4 elements per pass
        vmul.f32        q0,  q0,  q8
        vst1.32         {q0},[r0,:128]!
        subs            len, len, #4
        bgt             3b
        bx              lr
        .unreq          len
endfunc

/*
 * ff_vector_fmul_window_neon: windowed combination of two blocks.
 *
 * In:  r0 = dst, r1 = src0, r2 = src1, r3 = win,
 *      [sp] on entry = count (read as [sp, #12] after the push)
 * Clobbers: r0-r3, ip, q0-q3, q9-q12, flags (r4, r5, lr are saved/restored)
 *
 * src0 and the first half of win are walked forwards; src1 and the second
 * half of win are walked backwards, 4 elements per pass from each end.
 * With i counting up from the start and j counting down from the end, each
 * pass produces
 *      dst[i] = src0[i] * win[j] - src1[j] * win[i]
 *      dst[j] = src0[i] * win[i] + src1[j] * win[j]
 * where dst and win hold twice as many elements as src0 and src1.
 * The loop exits only when the counter reaches exactly zero, so the count
 * has to be a non-zero multiple of 4.
 */
function ff_vector_fmul_window_neon, export=1
        push            {r4,r5,lr}
        ldr             lr,  [sp, #12]          @ count (first stack argument)
        sub             r2,  r2,  #8
        sub             r5,  lr,  #2
        add             r2,  r2,  r5, lsl #2    @ r2 = last 4 elements of src1
        add             r4,  r3,  r5, lsl #3    @ r4 = last 4 elements of win
        add             ip,  r0,  r5, lsl #3    @ ip = last 4 elements of dst
        mov             r5,  #-16               @ backward stride for r2, r4, ip
        vld1.32         {d0,d1},  [r1,:128]!
        vld1.32         {d2,d3},  [r2,:128], r5
        vld1.32         {d4,d5},  [r3,:128]!
        vld1.32         {d6,d7},  [r4,:128], r5
1:      subs            lr,  lr,  #4
        vmul.f32        d22, d0,  d4
        vrev64.32       q3,  q3                 @ reverse the backward-read win block
        vmul.f32        d23, d1,  d5
        vrev64.32       q1,  q1                 @ reverse the backward-read src1 block
        vmul.f32        d20, d0,  d7
        vmul.f32        d21, d1,  d6
        beq             2f
        vmla.f32        d22, d3,  d7
        vld1.32         {d0,d1},  [r1,:128]!
        vmla.f32        d23, d2,  d6
        vld1.32         {d18,d19},[r2,:128], r5
        vmls.f32        d20, d3,  d4
        vld1.32         {d24,d25},[r3,:128]!
        vmls.f32        d21, d2,  d5
        vld1.32         {d6,d7},  [r4,:128], r5
        vmov            q1,  q9
        vrev64.32       q11, q11                @ put the back-half result in memory order
        vmov            q2,  q12
        vswp            d22, d23
        vst1.32         {d20,d21},[r0,:128]!
        vst1.32         {d22,d23},[ip,:128], r5
        b               1b
2:      vmla.f32        d22, d3,  d7            @ final pass: no further loads
        vmla.f32        d23, d2,  d6
        vmls.f32        d20, d3,  d4
        vmls.f32        d21, d2,  d5
        vrev64.32       q11, q11
        vswp            d22, d23
        vst1.32         {d20,d21},[r0,:128]!
        vst1.32         {d22,d23},[ip,:128], r5
        pop             {r4,r5,pc}
endfunc

/*
 * ff_vector_fmul_add_neon: multiply-add, dst[i] = src0[i] * src1[i] + src2[i].
 *
 * In:  r0 = dst, r1 = src0, r2 = src1, r3 = src2, [sp] = element count
 * Clobbers: r0-r3, r12, q0-q3, q8-q13, flags
 *
 * 8 elements are processed per pass.  The loop exits only when the counter
 * reaches exactly zero, so the count has to be a non-zero multiple of 8.
 */
function ff_vector_fmul_add_neon, export=1
        ldr             r12, [sp]               @ count (first stack argument)
        vld1.32         {q0-q1},  [r1,:128]!
        vld1.32         {q8-q9},  [r2,:128]!
        vld1.32         {q2-q3},  [r3,:128]!
        vmul.f32        q10, q0,  q8
        vmul.f32        q11, q1,  q9
1:      vadd.f32        q12, q2,  q10
        vadd.f32        q13, q3,  q11
        pld             [r1, #16]
        pld             [r2, #16]
        pld             [r3, #16]
        subs            r12, r12, #8
        beq             2f
        vld1.32         {q0},     [r1,:128]!
        vld1.32         {q8},     [r2,:128]!
        vmul.f32        q10, q0,  q8
        vld1.32         {q1},     [r1,:128]!
        vld1.32         {q9},     [r2,:128]!
        vmul.f32        q11, q1,  q9
        vld1.32         {q2-q3},  [r3,:128]!
        vst1.32         {q12-q13},[r0,:128]!
        b               1b
2:      vst1.32         {q12-q13},[r0,:128]!    @ store the final 8 results
        bx              lr
endfunc

/*
 * ff_vector_fmul_reverse_neon: product with the second input reversed,
 * dst[i] = src0[i] * src1[count - 1 - i].
 *
 * In:  r0 = dst, r1 = src0, r2 = src1, r3 = element count
 * Clobbers: r0-r3, r12, q0-q3, q8-q9, flags
 *
 * src1 is read backwards in blocks of 8; each block is reversed with
 * vrev64 plus reversed d-register pairing in the multiplies.  The loop
 * exits only when the counter reaches exactly zero, so the count has to be
 * a non-zero multiple of 8.
 */
function ff_vector_fmul_reverse_neon, export=1
        add             r2,  r2,  r3,  lsl #2
        sub             r2,  r2,  #32           @ r2 = last 8 elements of src1
        mov             r12, #-32               @ backward stride for r2
        vld1.32         {q0-q1},  [r1,:128]!
        vld1.32         {q2-q3},  [r2,:128], r12
1:      pld             [r1, #32]
        vrev64.32       q3,  q3
        vmul.f32        d16, d0,  d7
        vmul.f32        d17, d1,  d6
        pld             [r2, #-32]
        vrev64.32       q2,  q2
        vmul.f32        d18, d2,  d5
        vmul.f32        d19, d3,  d4
        subs            r3,  r3,  #8
        beq             2f
        vld1.32         {q0-q1},  [r1,:128]!
        vld1.32         {q2-q3},  [r2,:128], r12
        vst1.32         {q8-q9},  [r0,:128]!
        b               1b
2:      vst1.32         {q8-q9},  [r0,:128]!    @ store the final 8 results
        bx              lr
endfunc

/*
 * ff_butterflies_float_neon: in-place sum/difference of two vectors,
 *      t = v1[i] - v2[i];  v1[i] = v1[i] + v2[i];  v2[i] = t.
 *
 * In:  r0 = v1, r1 = v2, r2 = element count
 * Clobbers: r0-r2, q0-q2, flags
 *
 * 4 elements per pass; the body runs at least once and repeats while the
 * decremented counter is still positive.
 */
function ff_butterflies_float_neon, export=1
1:      vld1.32         {q0},[r0,:128]
        vld1.32         {q1},[r1,:128]
        vsub.f32        q2,  q0,  q1
        vadd.f32        q1,  q0,  q1
        vst1.32         {q2},[r1,:128]!         @ difference goes to the second vector
        vst1.32         {q1},[r0,:128]!         @ sum goes to the first vector
        subs            r2,  r2,  #4
        bgt             1b
        bx              lr
endfunc

/*
 * ff_scalarproduct_float_neon: dot product of two float vectors.
 *
 * In:  r0 = v1, r1 = v2, r2 = element count
 * Out: sum of v1[i] * v2[i] in d0[0]; the NOVFP variant also copies it to r0
 * Clobbers: r0-r2, q0-q2, flags
 *
 * Four partial sums are accumulated in q2 (4 elements per pass) and reduced
 * to a single value after the loop.  The body runs at least once and repeats
 * while the decremented counter is still positive.
 */
function ff_scalarproduct_float_neon, export=1
        vmov.f32        q2,  #0.0               @ four partial sums
1:      vld1.32         {q0},[r0,:128]!
        vld1.32         {q1},[r1,:128]!
        vmla.f32        q2,  q0,  q1
        subs            r2,  r2,  #4
        bgt             1b
        vadd.f32        d0,  d4,  d5            @ 4 partial sums -> 2
        vpadd.f32       d0,  d0,  d0            @ 2 partial sums -> 1
NOVFP   vmov.32         r0,  d0[0]
        bx              lr
endfunc