/*
 * ARM NEON optimised integer operations
 * Copyright (c) 2009 Kostya Shishkov
 *
 * This file is part of Libav.
 *
 * Libav is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * Libav is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with Libav; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "asm.S"

        preserve8
        .fpu neon

@ int32_t ff_scalarproduct_int16(const int16_t *v1, const int16_t *v2,
@                                int order, int shift)
@ In:    r0 = v1, r1 = v2 (16-byte aligned), r2 = order (assumed to be a
@        multiple of 16 -- the loop consumes 16 elements per iteration),
@        r3 = shift
@ Out:   r0 = sum of (v1[i] * v2[i]) >> shift
@ NEON:  only volatile registers q0-q3 and q8-q15 are used; the AAPCS
@        callee-saved bank d8-d15 (q4-q7) is never touched, so nothing
@        needs to be saved on the stack.
function ff_scalarproduct_int16_neon, export=1
        vmov.i16        q0,  #0                 @ four parallel accumulators
        vmov.i16        q1,  #0
        vmov.i16        q2,  #0
        vmov.i16        q3,  #0
        negs            r3,  r3                 @ VSHL shifts left; negate count to
                                                @ get an arithmetic right shift
        beq             2f                      @ shift == 0: cheaper vmlal loop

        vdup.s32        q12, r3                 @ q12 = shift count in every lane;
                                                @ live across the whole loop, no
                                                @ other instruction may write it
1:      vld1.16         {d16-d17}, [r0]!
        vld1.16         {d20-d21}, [r1,:128]!
        vmull.s16       q13, d16,  d20          @ products go to q13-q15/q8 so the
        vld1.16         {d18-d19}, [r0]!        @ shift held in q12 stays intact
        vmull.s16       q14, d17,  d21
        vld1.16         {d22-d23}, [r1,:128]!
        vmull.s16       q15, d18,  d22
        vmull.s16       q8,  d19,  d23          @ d16/d17 already consumed above,
                                                @ so q8 is free for reuse here
        vshl.s32        q13, q13,  q12          @ >> shift (negative left shift)
        vshl.s32        q14, q14,  q12
        vadd.s32        q0,  q0,   q13
        vshl.s32        q15, q15,  q12
        vadd.s32        q1,  q1,   q14
        vshl.s32        q8,  q8,   q12
        vadd.s32        q2,  q2,   q15
        vadd.s32        q3,  q3,   q8
        subs            r2,  r2,   #16          @ 16 int16 elements per iteration
        bne             1b
        b               3f

@ shift == 0 fast path: multiply-accumulate straight into the accumulators
2:      vld1.16         {d16-d17}, [r0]!
        vld1.16         {d20-d21}, [r1,:128]!
        vmlal.s16       q0,  d16,  d20
        vld1.16         {d18-d19}, [r0]!
        vmlal.s16       q1,  d17,  d21
        vld1.16         {d22-d23}, [r1,:128]!
        vmlal.s16       q2,  d18,  d22
        vmlal.s16       q3,  d19,  d23
        subs            r2,  r2,   #16
        bne             2b

@ Horizontal reduction of q0-q3 down to one 32-bit sum in r0.
@ Scratch lives in the volatile d16-d19; d8-d15 are callee-saved under
@ AAPCS and must not be clobbered here.
3:      vpadd.s32       d16, d0,  d1
        vpadd.s32       d17, d2,  d3
        vpadd.s32       d18, d4,  d5
        vpadd.s32       d19, d6,  d7
        vpadd.s32       d0,  d16, d17
        vpadd.s32       d1,  d18, d19
        vpadd.s32       d2,  d0,  d1
        vpaddl.s32      d3,  d2                 @ final pairwise add -> 64-bit lane
        vmov.32         r0,  d3[0]              @ return low 32 bits
        bx              lr
endfunc

@ int32_t ff_scalarproduct_and_madd_int16(int16_t *v1 /*aligned*/,
@                                         const int16_t *v2,
@                                         const int16_t *v3,
@                                         int order, int mul)
@ In:    r0 = v1 (16-byte aligned; updated in place: v1[i] += v3[i] * mul),
@        r1 = v2, r2 = v3, r3 = order (multiple of 16), [sp] = mul
@ Out:   r0 = sum of v1[i] * v2[i], using the ORIGINAL v1 values
@        (each block's dot product is accumulated before v1 is rewritten)
@ NEON:  uses only volatile q0-q3 and q8-q14; callee-saved d8-d15 untouched.
function ff_scalarproduct_and_madd_int16_neon, export=1
        vld1.16         {d28[],d29[]}, [sp]     @ q14 = mul broadcast to all lanes
        vmov.i16        q0,  #0                 @ four parallel accumulators
        vmov.i16        q1,  #0
        vmov.i16        q2,  #0
        vmov.i16        q3,  #0
        mov             r12, r0                 @ r12 = write pointer into v1
                                                @ (r0 stays the read pointer)

1:      vld1.16         {d16-d17}, [r0,:128]!   @ q8  = v1[ 0.. 7]
        vld1.16         {d18-d19}, [r1]!        @ q9  = v2[ 0.. 7]
        vld1.16         {d20-d21}, [r2]!        @ q10 = v3[ 0.. 7]
        vld1.16         {d22-d23}, [r0,:128]!   @ q11 = v1[ 8..15]
        vld1.16         {d24-d25}, [r1]!        @ q12 = v2[ 8..15]
        vld1.16         {d26-d27}, [r2]!        @ q13 = v3[ 8..15]
        vmul.s16        q10, q10,  q14          @ v3 * mul
        vmul.s16        q13, q13,  q14
        vmlal.s16       q0,  d16,  d18          @ accumulate v1 . v2 (old v1)
        vmlal.s16       q1,  d17,  d19
        vadd.s16        q10, q8,   q10          @ v1 += v3 * mul
        vadd.s16        q13, q11,  q13
        vmlal.s16       q2,  d22,  d24
        vmlal.s16       q3,  d23,  d25
        vst1.16         {q10}, [r12,:128]!      @ store updated v1[ 0.. 7]
        subs            r3,  r3,   #16
        vst1.16         {q13}, [r12,:128]!      @ store updated v1[ 8..15]
        bne             1b

        @ Horizontal reduction; scratch in volatile d16-d19 only, since
        @ d8-d15 (q4-q7) are AAPCS callee-saved and must be preserved.
        vpadd.s32       d16, d0,  d1
        vpadd.s32       d17, d2,  d3
        vpadd.s32       d18, d4,  d5
        vpadd.s32       d19, d6,  d7
        vpadd.s32       d0,  d16, d17
        vpadd.s32       d1,  d18, d19
        vpadd.s32       d2,  d0,  d1
        vpaddl.s32      d3,  d2
        vmov.32         r0,  d3[0]
        bx              lr
endfunc