/*
 * ARM NEON optimised Format Conversion Utils
 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of Libav.
 *
 * Libav is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * Libav is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with Libav; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "config.h"
#include "asm.S"

        preserve8

/*
 * ff_float_to_int16_neon
 *
 * Converts a contiguous run of single-precision floats to signed 16-bit
 * integers.
 *
 * In:    r0 = destination (16-bit elements), accessed with a :128 qualifier
 *        r1 = source (32-bit floats), accessed with a :128 qualifier
 *        r2 = element count
 * Clobb: r0-r2, ip, q0-q3, q8-q9, flags
 *
 * Method: VCVT with 16 fraction bits turns each float into a signed 32-bit
 * fixed-point value; VSHRN by 16 then keeps only the upper halfword, which
 * is the 16-bit result.  Per the ARM architecture manual the float to
 * fixed-point conversion saturates, so out-of-range input ends up clamped.
 *
 * The code works in groups of 8 elements and keeps one converted group in
 * flight in q8/q9 while the next one is being loaded.
 *
 * NOTE(review): only "exactly 8 left" and "multiple of 16 left" are tested
 * for, so the count is presumably required to be a positive multiple of 8
 * and both pointers 16-byte aligned -- confirm against the C callers.
 * NOTE(review): presumably declared in C as
 * (int16_t *dst, const float *src, long len) -- confirm.
 */
function ff_float_to_int16_neon, export=1
        @ Prime the pipeline: load and convert the first 8 elements.
        subs            r2,  r2,  #8            @ r2 = elements left after this group
        vld1.64         {d0-d1},  [r1,:128]!
        vcvt.s32.f32    q8,  q0,  #16
        vld1.64         {d2-d3},  [r1,:128]!
        vcvt.s32.f32    q9,  q1,  #16
        beq             3f                      @ count was 8: only flush q8/q9
        bics            ip,  r2,  #15           @ ip = remaining count rounded down to 16
        beq             2f                      @ fewer than 16 left: single tail
        @ Main loop, 16 elements per pass.  The flags set by this SUBS are
        @ consumed by the BNE at the bottom; nothing in between writes them.
1:      subs            ip,  ip,  #16
        vshrn.s32       d4,  q8,  #16
        vld1.64         {d0-d1},  [r1,:128]!
        vcvt.s32.f32    q0,  q0,  #16
        vshrn.s32       d5,  q9,  #16
        vld1.64         {d2-d3},  [r1,:128]!
        vcvt.s32.f32    q1,  q1,  #16
        vshrn.s32       d6,  q0,  #16
        vst1.64         {d4-d5},  [r0,:128]!
        vshrn.s32       d7,  q1,  #16
        @ Refill q8/q9 with the group that the next pass (or a tail) narrows.
        vld1.64         {d16-d17},[r1,:128]!
        vcvt.s32.f32    q8,  q8,  #16
        vld1.64         {d18-d19},[r1,:128]!
        vcvt.s32.f32    q9,  q9,  #16
        vst1.64         {d6-d7},  [r0,:128]!
        bne             1b
        ands            r2,  r2,  #15           @ anything beyond the group in flight?
        beq             3f
        @ Tail A: the group in flight plus 8 more elements still in memory.
2:      vld1.64         {d0-d1},  [r1,:128]!
        vshrn.s32       d4,  q8,  #16
        vcvt.s32.f32    q0,  q0,  #16
        vld1.64         {d2-d3},  [r1,:128]!
        vshrn.s32       d5,  q9,  #16
        vcvt.s32.f32    q1,  q1,  #16
        vshrn.s32       d6,  q0,  #16
        vst1.64         {d4-d5},  [r0,:128]!
        vshrn.s32       d7,  q1,  #16
        vst1.64         {d6-d7},  [r0,:128]!
        bx              lr
        @ Tail B: only the group in flight remains.
3:      vshrn.s32       d4,  q8,  #16
        vshrn.s32       d5,  q9,  #16
        vst1.64         {d4-d5},  [r0,:128]!
        bx              lr
endfunc

/*
 * ff_float_to_int16_interleave_neon
 *
 * Converts several separate float arrays to signed 16-bit integers and
 * writes them interleaved, one 16-bit value per channel per output frame.
 * The conversion is the same VCVT-with-16-fraction-bits scheme used by
 * ff_float_to_int16_neon; the upper halfword of each fixed-point word is
 * what gets stored.
 *
 * In:    r0 = destination (16-bit elements)
 *        r1 = address of an array of source pointers, one per channel
 *        r2 = element count per channel
 *        r3 = channel count
 * Clobb: r0-r3, ip, q0-q3, q8-q13, flags (r4-r8 and lr are saved on the
 *        stack by the paths that use them)
 *
 * Dispatch:
 *   r3 <  2  tail-branches to ff_float_to_int16_neon with the first pointer
 *   r3 == 2  dedicated path with contiguous 128-bit stores
 *   r3 >  2  generic path: channels are consumed 4 at a time, then 2, then
 *            1, each pass writing its columns with a stride of r3 * 2 bytes
 *
 * NOTE(review): as in ff_float_to_int16_neon, r2 is presumably a positive
 * multiple of 8 and every source pointer 16-byte aligned (all loads use
 * :128) -- confirm against the C callers.
 */
function ff_float_to_int16_interleave_neon, export=1
        cmp             r3, #2
        itt             lt
        ldrlt           r1, [r1]                @ fewer than 2 channels: use src[0] ...
        blt             ff_float_to_int16_neon  @ ... and let the plain routine return
        bne             4f                      @ more than 2 channels: generic path

        @ ---- exactly 2 channels ----
        ldr             r3, [r1]                @ r3 = first channel
        ldr             r1, [r1, #4]            @ r1 = second channel

        @ Prime the pipeline with 8 elements of each channel.
        subs            r2,  r2,  #8
        vld1.64         {d0-d1},  [r3,:128]!
        vcvt.s32.f32    q8,  q0,  #16
        vld1.64         {d2-d3},  [r3,:128]!
        vcvt.s32.f32    q9,  q1,  #16
        vld1.64         {d20-d21},[r1,:128]!
        vcvt.s32.f32    q10, q10, #16
        vld1.64         {d22-d23},[r1,:128]!
        vcvt.s32.f32    q11, q11, #16
        beq             3f                      @ count was 8
        bics            ip,  r2,  #15
        beq             2f
        @ Main loop, 16 frames per pass.  VSRI by 16 drops the upper halfword
        @ of the first channel into the lower halfword of the second channel
        @ word, giving one packed frame per 32-bit lane.
1:      subs            ip,  ip,  #16
        vld1.64         {d0-d1},  [r3,:128]!
        vcvt.s32.f32    q0,  q0,  #16
        vsri.32         q10, q8,  #16
        vld1.64         {d2-d3},  [r3,:128]!
        vcvt.s32.f32    q1,  q1,  #16
        vld1.64         {d24-d25},[r1,:128]!
        vcvt.s32.f32    q12, q12, #16
        vld1.64         {d26-d27},[r1,:128]!
        vsri.32         q11, q9,  #16
        vst1.64         {d20-d21},[r0,:128]!
        vcvt.s32.f32    q13, q13, #16
        vst1.64         {d22-d23},[r0,:128]!
        vsri.32         q12, q0,  #16
        vld1.64         {d16-d17},[r3,:128]!
        vsri.32         q13, q1,  #16
        vst1.64         {d24-d25},[r0,:128]!
        vcvt.s32.f32    q8,  q8,  #16
        vld1.64         {d18-d19},[r3,:128]!
        vcvt.s32.f32    q9,  q9,  #16
        vld1.64         {d20-d21},[r1,:128]!
        vcvt.s32.f32    q10, q10, #16
        vld1.64         {d22-d23},[r1,:128]!
        vcvt.s32.f32    q11, q11, #16
        vst1.64         {d26-d27},[r0,:128]!
        bne             1b                      @ flags still from the SUBS at 1:
        ands            r2,  r2,  #15
        beq             3f
        @ Tail A: 8 frames in flight plus 8 more still in memory.
2:      vsri.32         q10, q8,  #16
        vld1.64         {d0-d1},  [r3,:128]!
        vcvt.s32.f32    q0,  q0,  #16
        vld1.64         {d2-d3},  [r3,:128]!
        vcvt.s32.f32    q1,  q1,  #16
        vld1.64         {d24-d25},[r1,:128]!
        vcvt.s32.f32    q12, q12, #16
        vsri.32         q11, q9,  #16
        vld1.64         {d26-d27},[r1,:128]!
        vcvt.s32.f32    q13, q13, #16
        vst1.64         {d20-d21},[r0,:128]!
        vsri.32         q12, q0,  #16
        vst1.64         {d22-d23},[r0,:128]!
        vsri.32         q13, q1,  #16
        vst1.64         {d24-d27},[r0,:128]!
        bx              lr
        @ Tail B: only the 8 frames in flight remain.
3:      vsri.32         q10, q8,  #16
        vsri.32         q11, q9,  #16
        vst1.64         {d20-d23},[r0,:128]!
        bx              lr

        @ ---- more than 2 channels ----
        @ ip is computed once from the full channel count and used as the
        @ post-increment of every store below, i.e. the distance in bytes
        @ between the same channel in consecutive output frames.
        @ lr is saved here, which frees it for use as the per-pass counter.
4:      push            {r4-r8,lr}
        cmp             r3,  #4
        lsl             ip,  r3,  #1
        blt             4f

        @ 4 channels
        @ r4-r7 = next four source pointers, lr = elements left in this pass,
        @ r8 = write cursor for this group of channels.
5:      ldmia           r1!, {r4-r7}
        mov             lr,  r2
        mov             r8,  r0
        vld1.64         {d16-d17},[r4,:128]!
        vcvt.s32.f32    q8,  q8,  #16
        vld1.64         {d18-d19},[r5,:128]!
        vcvt.s32.f32    q9,  q9,  #16
        vld1.64         {d20-d21},[r6,:128]!
        vcvt.s32.f32    q10, q10, #16
        vld1.64         {d22-d23},[r7,:128]!
        vcvt.s32.f32    q11, q11, #16
        @ 8 frames per pass.  VSRI packs channel pairs into 32-bit lanes and
        @ VZIP.32 then pairs the two halves so that each d register holds the
        @ four 16-bit values of one frame.
6:      subs            lr,  lr,  #8
        vld1.64         {d0-d1},  [r4,:128]!
        vcvt.s32.f32    q0,  q0,  #16
        vsri.32         q9,  q8,  #16
        vld1.64         {d2-d3},  [r5,:128]!
        vcvt.s32.f32    q1,  q1,  #16
        vsri.32         q11, q10, #16
        vld1.64         {d4-d5},  [r6,:128]!
        vcvt.s32.f32    q2,  q2,  #16
        vzip.32         d18, d22
        vld1.64         {d6-d7},  [r7,:128]!
        vcvt.s32.f32    q3,  q3,  #16
        vzip.32         d19, d23
        vst1.64         {d18},    [r8], ip
        vsri.32         q1,  q0,  #16
        vst1.64         {d22},    [r8], ip
        vsri.32         q3,  q2,  #16
        vst1.64         {d19},    [r8], ip
        vzip.32         d2,  d6
        vst1.64         {d23},    [r8], ip
        vzip.32         d3,  d7
        beq             7f                      @ flags from the SUBS at 6:
        vld1.64         {d16-d17},[r4,:128]!
        vcvt.s32.f32    q8,  q8,  #16
        vst1.64         {d2},     [r8], ip
        vld1.64         {d18-d19},[r5,:128]!
        vcvt.s32.f32    q9,  q9,  #16
        vst1.64         {d6},     [r8], ip
        vld1.64         {d20-d21},[r6,:128]!
        vcvt.s32.f32    q10, q10, #16
        vst1.64         {d3},     [r8], ip
        vld1.64         {d22-d23},[r7,:128]!
        vcvt.s32.f32    q11, q11, #16
        vst1.64         {d7},     [r8], ip
        b               6b
        @ Last four frames of this pass, then account for the 4 channels done.
7:      vst1.64         {d2},     [r8], ip
        vst1.64         {d6},     [r8], ip
        vst1.64         {d3},     [r8], ip
        vst1.64         {d7},     [r8], ip
        subs            r3,  r3,  #4
        it              eq
        popeq           {r4-r8,pc}              @ no channels left: return
        cmp             r3,  #4
        add             r0,  r0,  #8            @ skip the 4 columns just written
        bge             5b

        @ 2 channels
        @ Frames are written one 32-bit lane (two 16-bit values) at a time.
        @ The TST result is consumed by the BEQ after the loads.
4:      cmp             r3,  #2
        blt             4f
        ldmia           r1!, {r4-r5}
        mov             lr,  r2
        mov             r8,  r0
        tst             lr,  #8
        vld1.64         {d16-d17},[r4,:128]!
        vcvt.s32.f32    q8,  q8,  #16
        vld1.64         {d18-d19},[r5,:128]!
        vcvt.s32.f32    q9,  q9,  #16
        vld1.64         {d20-d21},[r4,:128]!
        vcvt.s32.f32    q10, q10, #16
        vld1.64         {d22-d23},[r5,:128]!
        vcvt.s32.f32    q11, q11, #16
        beq             6f                      @ bit 3 clear: go straight to the 16-frame loop
        subs            lr,  lr,  #8
        beq             7f                      @ count was 8: flush and finish this pass
        @ Bit 3 set and more to come: emit 8 frames and refill q8-q11 so the
        @ 16-frame loop below starts from the same state as the direct entry.
        vsri.32         d18, d16, #16
        vsri.32         d19, d17, #16
        vld1.64         {d16-d17},[r4,:128]!
        vcvt.s32.f32    q8,  q8,  #16
        vst1.32         {d18[0]}, [r8], ip
        vsri.32         d22, d20, #16
        vst1.32         {d18[1]}, [r8], ip
        vsri.32         d23, d21, #16
        vst1.32         {d19[0]}, [r8], ip
        vst1.32         {d19[1]}, [r8], ip
        vld1.64         {d18-d19},[r5,:128]!
        vcvt.s32.f32    q9,  q9,  #16
        vst1.32         {d22[0]}, [r8], ip
        vst1.32         {d22[1]}, [r8], ip
        vld1.64         {d20-d21},[r4,:128]!
        vcvt.s32.f32    q10, q10, #16
        vst1.32         {d23[0]}, [r8], ip
        vst1.32         {d23[1]}, [r8], ip
        vld1.64         {d22-d23},[r5,:128]!
        vcvt.s32.f32    q11, q11, #16
        @ 16 frames per pass: 8 from q8-q11 (already converted) and 8 from
        @ q0-q3 (loaded here).
6:      subs            lr,  lr,  #16
        vld1.64         {d0-d1},  [r4,:128]!
        vcvt.s32.f32    q0,  q0,  #16
        vsri.32         d18, d16, #16
        vld1.64         {d2-d3},  [r5,:128]!
        vcvt.s32.f32    q1,  q1,  #16
        vsri.32         d19, d17, #16
        vld1.64         {d4-d5},  [r4,:128]!
        vcvt.s32.f32    q2,  q2,  #16
        vld1.64         {d6-d7},  [r5,:128]!
        vcvt.s32.f32    q3,  q3,  #16
        vst1.32         {d18[0]}, [r8], ip
        vsri.32         d22, d20, #16
        vst1.32         {d18[1]}, [r8], ip
        vsri.32         d23, d21, #16
        vst1.32         {d19[0]}, [r8], ip
        vsri.32         d2,  d0,  #16
        vst1.32         {d19[1]}, [r8], ip
        vsri.32         d3,  d1,  #16
        vst1.32         {d22[0]}, [r8], ip
        vsri.32         d6,  d4,  #16
        vst1.32         {d22[1]}, [r8], ip
        vsri.32         d7,  d5,  #16
        vst1.32         {d23[0]}, [r8], ip
        vst1.32         {d23[1]}, [r8], ip
        beq             6f                      @ forward: final 8 frames, no reload
        vld1.64         {d16-d17},[r4,:128]!
        vcvt.s32.f32    q8,  q8,  #16
        vst1.32         {d2[0]},  [r8], ip
        vst1.32         {d2[1]},  [r8], ip
        vld1.64         {d18-d19},[r5,:128]!
        vcvt.s32.f32    q9,  q9,  #16
        vst1.32         {d3[0]},  [r8], ip
        vst1.32         {d3[1]},  [r8], ip
        vld1.64         {d20-d21},[r4,:128]!
        vcvt.s32.f32    q10, q10, #16
        vst1.32         {d6[0]},  [r8], ip
        vst1.32         {d6[1]},  [r8], ip
        vld1.64         {d22-d23},[r5,:128]!
        vcvt.s32.f32    q11, q11, #16
        vst1.32         {d7[0]},  [r8], ip
        vst1.32         {d7[1]},  [r8], ip
        bgt             6b                      @ backward: flags still from the SUBS at 6:
        @ Flush of the second 8 frames when the pass ends inside the loop.
6:      vst1.32         {d2[0]},  [r8], ip
        vst1.32         {d2[1]},  [r8], ip
        vst1.32         {d3[0]},  [r8], ip
        vst1.32         {d3[1]},  [r8], ip
        vst1.32         {d6[0]},  [r8], ip
        vst1.32         {d6[1]},  [r8], ip
        vst1.32         {d7[0]},  [r8], ip
        vst1.32         {d7[1]},  [r8], ip
        b               8f
        @ Count was exactly 8: pack and store the primed group only.
7:      vsri.32         d18, d16, #16
        vsri.32         d19, d17, #16
        vst1.32         {d18[0]}, [r8], ip
        vsri.32         d22, d20, #16
        vst1.32         {d18[1]}, [r8], ip
        vsri.32         d23, d21, #16
        vst1.32         {d19[0]}, [r8], ip
        vst1.32         {d19[1]}, [r8], ip
        vst1.32         {d22[0]}, [r8], ip
        vst1.32         {d22[1]}, [r8], ip
        vst1.32         {d23[0]}, [r8], ip
        vst1.32         {d23[1]}, [r8], ip
8:      subs            r3,  r3,  #2
        add             r0,  r0,  #4            @ skip the 2 columns just written
        it              eq
        popeq           {r4-r8,pc}              @ no channels left: return

        @ 1 channel
        @ Reached with one channel left over.  Lanes [1] and [3] of each d
        @ register are the upper halfwords of the two fixed-point words, so
        @ storing them directly performs the narrowing.  r5 is the write
        @ cursor, lr the element counter.
4:      ldr             r4,  [r1],#4
        tst             r2,  #8
        mov             lr,  r2
        mov             r5,  r0
        vld1.64         {d0-d1},  [r4,:128]!
        vcvt.s32.f32    q0,  q0,  #16
        vld1.64         {d2-d3},  [r4,:128]!
        vcvt.s32.f32    q1,  q1,  #16
        bne             8f                      @ bit 3 set: emit one group of 8 first
        @ 16 elements per pass: 8 from q0/q1, then 8 from q2/q3.
6:      subs            lr,  lr,  #16
        vld1.64         {d4-d5},  [r4,:128]!
        vcvt.s32.f32    q2,  q2,  #16
        vld1.64         {d6-d7},  [r4,:128]!
        vcvt.s32.f32    q3,  q3,  #16
        vst1.16         {d0[1]},  [r5,:16], ip
        vst1.16         {d0[3]},  [r5,:16], ip
        vst1.16         {d1[1]},  [r5,:16], ip
        vst1.16         {d1[3]},  [r5,:16], ip
        vst1.16         {d2[1]},  [r5,:16], ip
        vst1.16         {d2[3]},  [r5,:16], ip
        vst1.16         {d3[1]},  [r5,:16], ip
        vst1.16         {d3[3]},  [r5,:16], ip
        beq             7f                      @ last pass: skip the reload
        vld1.64         {d0-d1},  [r4,:128]!
        vcvt.s32.f32    q0,  q0,  #16
        vld1.64         {d2-d3},  [r4,:128]!
        vcvt.s32.f32    q1,  q1,  #16
7:      vst1.16         {d4[1]},  [r5,:16], ip
        vst1.16         {d4[3]},  [r5,:16], ip
        vst1.16         {d5[1]},  [r5,:16], ip
        vst1.16         {d5[3]},  [r5,:16], ip
        vst1.16         {d6[1]},  [r5,:16], ip
        vst1.16         {d6[3]},  [r5,:16], ip
        vst1.16         {d7[1]},  [r5,:16], ip
        vst1.16         {d7[3]},  [r5,:16], ip
        bgt             6b                      @ flags still from the SUBS at 6:
        pop             {r4-r8,pc}
        @ Leading group of 8 when bit 3 of the count is set.
8:      subs            lr,  lr,  #8
        vst1.16         {d0[1]},  [r5,:16], ip
        vst1.16         {d0[3]},  [r5,:16], ip
        vst1.16         {d1[1]},  [r5,:16], ip
        vst1.16         {d1[3]},  [r5,:16], ip
        vst1.16         {d2[1]},  [r5,:16], ip
        vst1.16         {d2[3]},  [r5,:16], ip
        vst1.16         {d3[1]},  [r5,:16], ip
        vst1.16         {d3[3]},  [r5,:16], ip
        it              eq
        popeq           {r4-r8,pc}              @ count was 8: done
        vld1.64         {d0-d1},  [r4,:128]!
        vcvt.s32.f32    q0,  q0,  #16
        vld1.64         {d2-d3},  [r4,:128]!
        vcvt.s32.f32    q1,  q1,  #16
        b               6b
endfunc

/*
 * ff_int32_to_float_fmul_scalar_neon
 *
 * Converts signed 32-bit integers to single-precision floats and multiplies
 * every result by one scalar.
 *
 * In:    r0 = destination (32-bit floats), accessed with a :128 qualifier
 *        r1 = source (32-bit integers), accessed with a :128 qualifier
 *        scalar and element count: see the VFP / NOVFP lines below
 * Clobb: r0, r1, the count register, q0-q3, q8-q10, flags
 *
 * The VFP and NOVFP prefixes are macros defined outside this file; exactly
 * one of each pair is expected to be active.  With VFP the scalar is taken
 * from d0[0] and the count from r2; with NOVFP the scalar is taken from r2
 * and the count from r3.
 * NOTE(review): presumably these select hard-float versus soft-float
 * argument passing -- confirm in asm.S.
 *
 * NOTE(review): 8 elements are handled per pass and the loop only exits
 * when the count reaches exactly zero, so the count is presumably a
 * positive multiple of 8 -- confirm against the C callers.
 */
function ff_int32_to_float_fmul_scalar_neon, export=1
        @ Broadcast the scalar to all four lanes of q0 and name the counter.
VFP     vdup.32         q0,  d0[0]
VFP     len     .req    r2
NOVFP   vdup.32         q0,  r2
NOVFP   len     .req    r3

        @ Prime the pipeline with the first 8 converted values.
        vld1.32         {q1},[r1,:128]!
        vcvt.f32.s32    q3,  q1
        vld1.32         {q2},[r1,:128]!
        vcvt.f32.s32    q8,  q2
1:      subs            len, len, #8
        pld             [r1, #16]               @ prefetch hint for upcoming input
        vmul.f32        q9,  q3,  q0
        vmul.f32        q10, q8,  q0
        beq             2f                      @ last group: store without reloading
        vld1.32         {q1},[r1,:128]!
        vcvt.f32.s32    q3,  q1
        vld1.32         {q2},[r1,:128]!
        vcvt.f32.s32    q8,  q2
        vst1.32         {q9}, [r0,:128]!
        vst1.32         {q10},[r0,:128]!
        b               1b
2:      vst1.32         {q9}, [r0,:128]!
        vst1.32         {q10},[r0,:128]!
        bx              lr
        .unreq  len
endfunc