/*
 * Copyright (c) 2008 Siarhei Siamashka <ssvb@users.sourceforge.net>
 *
 * This file is part of Libav.
 *
 * Libav is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * Libav is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with Libav; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "config.h"
#include "asm.S"

/*
 * VFP is a floating point coprocessor used in some ARM cores. VFP11 has
 * 1-cycle throughput for almost all instructions (except for double
 * precision arithmetic), but rather high latency: 4 cycles for loads and
 * 8 cycles for arithmetic operations. Scheduling code to avoid pipeline
 * stalls is therefore very important for performance. One more interesting
 * feature is that VFP has independent load/store and arithmetic pipelines,
 * so it is possible to make them work simultaneously and get more than one
 * operation per cycle. The load/store pipeline can process 2 single
 * precision floating point values per cycle and supports bulk loads and
 * stores for large sets of registers. Arithmetic operations can be done on
 * vectors, which makes it possible to keep the arithmetic pipeline busy
 * while the processor may issue and execute other instructions. Detailed
 * optimization manuals can be found at http://www.arm.com
 */

/**
 * ARM VFP optimized implementation of 'vector_fmul_c' function.
 * Assume that len is a positive number and is multiple of 8
 */
@ void ff_vector_fmul_vfp(float *dst, const float *src0, const float *src1, int len)
@ Register assignment (AAPCS, per the prototype above):
@   r0 = dst, r1 = src0, r2 = src1, r3 = len
function ff_vector_fmul_vfp, export=1
        vpush           {d8-d15}                @ s16-s31 are callee-saved; preserve them
        fmrx            r12, fpscr
        orr             r12, r12, #(3 << 16)    /* set vector size to 4 */
        fmxr            fpscr, r12              @ short-vector mode: each vmul.f32 below
                                                @ processes 4 consecutive registers

        @ Prologue of the software pipeline: preload the first 8 elements of
        @ src0 (s0-s7) and src1 (s8-s15) and start the first 4-wide multiply.
        vldmia          r1!, {s0-s3}
        vldmia          r2!, {s8-s11}
        vldmia          r1!, {s4-s7}
        vldmia          r2!, {s12-s15}
        vmul.f32        s8, s0, s8              @ s8-s11 = s0-s3 * s8-s11 (vector op)
1:
        @ Main loop: 16 elements per iteration, using two register groups
        @ (s0-s15 and s16-s31) in ping-pong fashion. Loads/multiplies for the
        @ NEXT iteration are predicated on ge/gt and interleaved with the
        @ stores of the CURRENT results, so the independent load/store and
        @ arithmetic pipelines overlap (see the file header comment).
        subs            r3, r3, #16             @ ge: at least one more group to load
                                                @ gt: loop will run again
        vmul.f32        s12, s4, s12            @ s12-s15 = s4-s7 * s12-s15
        itttt           ge
        vldmiage        r1!, {s16-s19}
        vldmiage        r2!, {s24-s27}
        vldmiage        r1!, {s20-s23}
        vldmiage        r2!, {s28-s31}
        it              ge
        vmulge.f32      s24, s16, s24           @ next group, first half
        vstmia          r0!, {s8-s11}           @ store current results
        vstmia          r0!, {s12-s15}
        it              ge
        vmulge.f32      s28, s20, s28           @ next group, second half
        itttt           gt
        vldmiagt        r1!, {s0-s3}            @ refill the first register group
        vldmiagt        r2!, {s8-s11}
        vldmiagt        r1!, {s4-s7}
        vldmiagt        r2!, {s12-s15}
        ittt            ge
        vmulge.f32      s8, s0, s8
        vstmiage        r0!, {s24-s27}
        vstmiage        r0!, {s28-s31}
        bgt             1b

        bic             r12, r12, #(7 << 16)    /* set vector size back to 1 */
        fmxr            fpscr, r12
        vpop            {d8-d15}
        bx              lr
endfunc

/**
 * ARM VFP optimized implementation of 'vector_fmul_reverse_c' function.
 * Assume that len is a positive number and is multiple of 8
 */
@ void ff_vector_fmul_reverse_vfp(float *dst, const float *src0,
@                                 const float *src1, int len)
@ Register assignment (AAPCS, per the prototype above):
@   r0 = dst, r1 = src0, r2 = src1, r3 = len
@ Computes dst[i] = src0[i] * src1[len-1-i].
function ff_vector_fmul_reverse_vfp, export=1
        vpush           {d8-d15}                @ s16-s31 are callee-saved; preserve them
        add             r2, r2, r3, lsl #2      @ r2 = src1 + len floats; src1 is then
                                                @ walked backwards with vldmdb
        @ Prologue of the software pipeline: fetch 8 elements from each input
        @ (src1 descending into s0-s7, src0 ascending into s8-s15) and start
        @ the first four reversed products. Unlike ff_vector_fmul_vfp, no
        @ FPSCR short-vector mode is used here: the element order of one
        @ operand is reversed, so each product is a separate scalar multiply.
        vldmdb          r2!, {s0-s3}
        vldmia          r1!, {s8-s11}
        vldmdb          r2!, {s4-s7}
        vldmia          r1!, {s12-s15}
        vmul.f32        s8, s3, s8              @ note the crossed indices: s3*s8, s2*s9, ...
        vmul.f32        s9, s2, s9
        vmul.f32        s10, s1, s10
        vmul.f32        s11, s0, s11
1:
        @ Main loop: 16 elements per iteration, two register groups (s0-s15
        @ and s16-s31) used in ping-pong fashion. Predicated (ge/gt) loads and
        @ multiplies for the NEXT iteration are interleaved one-by-one with
        @ the stores of the CURRENT results so the load/store and arithmetic
        @ pipelines stay busy simultaneously (see the file header comment).
        subs            r3, r3, #16             @ ge: one more group to process
                                                @ gt: loop will run again
        it              ge
        vldmdbge        r2!, {s16-s19}
        vmul.f32        s12, s7, s12
        it              ge
        vldmiage        r1!, {s24-s27}
        vmul.f32        s13, s6, s13
        it              ge
        vldmdbge        r2!, {s20-s23}
        vmul.f32        s14, s5, s14
        it              ge
        vldmiage        r1!, {s28-s31}
        vmul.f32        s15, s4, s15
        it              ge
        vmulge.f32      s24, s19, s24           @ next group: again crossed indices
        it              gt
        vldmdbgt        r2!, {s0-s3}            @ refill the first register group
        it              ge
        vmulge.f32      s25, s18, s25
        vstmia          r0!, {s8-s13}           @ store first 6 current results
        it              ge
        vmulge.f32      s26, s17, s26
        it              gt
        vldmiagt        r1!, {s8-s11}
        itt             ge
        vmulge.f32      s27, s16, s27
        vmulge.f32      s28, s23, s28
        it              gt
        vldmdbgt        r2!, {s4-s7}
        it              ge
        vmulge.f32      s29, s22, s29
        vstmia          r0!, {s14-s15}          @ store remaining 2 current results
        ittt            ge
        vmulge.f32      s30, s21, s30
        vmulge.f32      s31, s20, s31
        vmulge.f32      s8, s3, s8              @ restart pipeline for refilled group
        it              gt
        vldmiagt        r1!, {s12-s15}
        itttt           ge
        vmulge.f32      s9, s2, s9
        vmulge.f32      s10, s1, s10
        vstmiage        r0!, {s24-s27}
        vmulge.f32      s11, s0, s11
        it              ge
        vstmiage        r0!, {s28-s31}
        bgt             1b

        vpop            {d8-d15}
        bx              lr
endfunc