1/* 2 * ARM NEON optimised MDCT 3 * Copyright (c) 2009 Mans Rullgard <mans@mansr.com> 4 * 5 * This file is part of Libav. 6 * 7 * Libav is free software; you can redistribute it and/or 8 * modify it under the terms of the GNU Lesser General Public 9 * License as published by the Free Software Foundation; either 10 * version 2.1 of the License, or (at your option) any later version. 11 * 12 * Libav is distributed in the hope that it will be useful, 13 * but WITHOUT ANY WARRANTY; without even the implied warranty of 14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 15 * Lesser General Public License for more details. 16 * 17 * You should have received a copy of the GNU Lesser General Public 18 * License along with Libav; if not, write to the Free Software 19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 20 */ 21 22#include "asm.S" 23 24 preserve8 25 26#define ff_fft_calc_neon X(ff_fft_calc_neon) 27 28function ff_imdct_half_neon, export=1 29 push {r4-r8,lr} 30 31 mov r12, #1 32 ldr lr, [r0, #20] @ mdct_bits 33 ldr r4, [r0, #24] @ tcos 34 ldr r3, [r0, #8] @ revtab 35 lsl r12, r12, lr @ n = 1 << nbits 36 lsr lr, r12, #2 @ n4 = n >> 2 37 add r7, r2, r12, lsl #1 38 mov r12, #-16 39 sub r7, r7, #16 40 41 vld2.32 {d16-d17},[r7,:128],r12 @ d16=x,n1 d17=x,n0 42 vld2.32 {d0-d1}, [r2,:128]! @ d0 =m0,x d1 =m1,x 43 vrev64.32 d17, d17 44 vld2.32 {d2,d3}, [r4,:128]! @ d2=c0,c1 d3=s0,s2 45 vmul.f32 d6, d17, d2 46 vmul.f32 d7, d0, d2 471: 48 subs lr, lr, #2 49 ldr r6, [r3], #4 50 vmul.f32 d4, d0, d3 51 vmul.f32 d5, d17, d3 52 vsub.f32 d4, d6, d4 53 vadd.f32 d5, d5, d7 54 uxth r8, r6, ror #16 55 uxth r6, r6 56 add r8, r1, r8, lsl #3 57 add r6, r1, r6, lsl #3 58 beq 1f 59 vld2.32 {d16-d17},[r7,:128],r12 60 vld2.32 {d0-d1}, [r2,:128]! 61 vrev64.32 d17, d17 62 vld2.32 {d2,d3}, [r4,:128]! @ d2=c0,c1 d3=s0,s2 63 vmul.f32 d6, d17, d2 64 vmul.f32 d7, d0, d2 65 vst2.32 {d4[0],d5[0]}, [r6,:64] 66 vst2.32 {d4[1],d5[1]}, [r8,:64] 67 b 1b 681: 69 vst2.32 {d4[0],d5[0]}, [r6,:64] 70 vst2.32 {d4[1],d5[1]}, [r8,:64] 71 72 mov r4, r0 73 mov r6, r1 74 bl ff_fft_calc_neon 75 76 mov r12, #1 77 ldr lr, [r4, #20] @ mdct_bits 78 ldr r4, [r4, #24] @ tcos 79 lsl r12, r12, lr @ n = 1 << nbits 80 lsr lr, r12, #3 @ n8 = n >> 3 81 82 add r4, r4, lr, lsl #3 83 add r6, r6, lr, lsl #3 84 sub r1, r4, #16 85 sub r3, r6, #16 86 87 mov r7, #-16 88 mov r8, r6 89 mov r0, r3 90 91 vld2.32 {d0-d1}, [r3,:128], r7 @ d0 =i1,r1 d1 =i0,r0 92 vld2.32 {d20-d21},[r6,:128]! @ d20=i2,r2 d21=i3,r3 93 vld2.32 {d16,d18},[r1,:128], r7 @ d16=c1,c0 d18=s1,s0 941: 95 subs lr, lr, #2 96 vmul.f32 d7, d0, d18 97 vld2.32 {d17,d19},[r4,:128]! @ d17=c2,c3 d19=s2,s3 98 vmul.f32 d4, d1, d18 99 vmul.f32 d5, d21, d19 100 vmul.f32 d6, d20, d19 101 vmul.f32 d22, d1, d16 102 vmul.f32 d23, d21, d17 103 vmul.f32 d24, d0, d16 104 vmul.f32 d25, d20, d17 105 vadd.f32 d7, d7, d22 106 vadd.f32 d6, d6, d23 107 vsub.f32 d4, d4, d24 108 vsub.f32 d5, d5, d25 109 beq 1f 110 vld2.32 {d0-d1}, [r3,:128], r7 111 vld2.32 {d20-d21},[r6,:128]! 112 vld2.32 {d16,d18},[r1,:128], r7 @ d16=c1,c0 d18=s1,s0 113 vrev64.32 q3, q3 114 vst2.32 {d4,d6}, [r0,:128], r7 115 vst2.32 {d5,d7}, [r8,:128]! 116 b 1b 1171: 118 vrev64.32 q3, q3 119 vst2.32 {d4,d6}, [r0,:128] 120 vst2.32 {d5,d7}, [r8,:128] 121 122 pop {r4-r8,pc} 123endfunc 124 125function ff_imdct_calc_neon, export=1 126 push {r4-r6,lr} 127 128 ldr r3, [r0, #20] 129 mov r4, #1 130 mov r5, r1 131 lsl r4, r4, r3 132 add r1, r1, r4 133 134 bl ff_imdct_half_neon 135 136 add r0, r5, r4, lsl #2 137 add r1, r5, r4, lsl #1 138 sub r0, r0, #8 139 sub r2, r1, #16 140 mov r3, #-16 141 mov r6, #-8 142 vmov.i32 d30, #1<<31 1431: 144 vld1.32 {d0-d1}, [r2,:128], r3 145 pld [r0, #-16] 146 vrev64.32 q0, q0 147 vld1.32 {d2-d3}, [r1,:128]! 148 veor d4, d1, d30 149 pld [r2, #-16] 150 vrev64.32 q1, q1 151 veor d5, d0, d30 152 vst1.32 {d2}, [r0,:64], r6 153 vst1.32 {d3}, [r0,:64], r6 154 vst1.32 {d4-d5}, [r5,:128]! 155 subs r4, r4, #16 156 bgt 1b 157 158 pop {r4-r6,pc} 159endfunc 160 161function ff_mdct_calc_neon, export=1 162 push {r4-r10,lr} 163 164 mov r12, #1 165 ldr lr, [r0, #20] @ mdct_bits 166 ldr r4, [r0, #24] @ tcos 167 ldr r3, [r0, #8] @ revtab 168 lsl lr, r12, lr @ n = 1 << nbits 169 add r7, r2, lr @ in4u 170 sub r9, r7, #16 @ in4d 171 add r2, r7, lr, lsl #1 @ in3u 172 add r8, r9, lr, lsl #1 @ in3d 173 add r5, r4, lr, lsl #1 174 sub r5, r5, #16 175 sub r3, r3, #4 176 mov r12, #-16 177 178 vld2.32 {d16,d18},[r9,:128],r12 @ in0u0,in0u1 in4d1,in4d0 179 vld2.32 {d17,d19},[r8,:128],r12 @ in2u0,in2u1 in3d1,in3d0 180 vld2.32 {d0, d2}, [r7,:128]! @ in4u0,in4u1 in2d1,in2d0 181 vrev64.32 q9, q9 @ in4d0,in4d1 in3d0,in3d1 182 vld2.32 {d1, d3}, [r2,:128]! @ in3u0,in3u1 in1d1,in1d0 183 vsub.f32 d0, d18, d0 @ in4d-in4u I 184 vld2.32 {d20,d21},[r4,:128]! @ c0,c1 s0,s1 185 vrev64.32 q1, q1 @ in2d0,in2d1 in1d0,in1d1 186 vld2.32 {d30,d31},[r5,:128],r12 @ c2,c3 s2,s3 187 vadd.f32 d1, d1, d19 @ in3u+in3d -R 188 vsub.f32 d16, d16, d2 @ in0u-in2d R 189 vadd.f32 d17, d17, d3 @ in2u+in1d -I 1901: 191 vmul.f32 d7, d0, d21 @ I*s 192A ldr r10, [r3, lr, lsr #1] 193T lsr r10, lr, #1 194T ldr r10, [r3, r10] 195 vmul.f32 d6, d1, d20 @ -R*c 196 ldr r6, [r3, #4]! 197 vmul.f32 d4, d1, d21 @ -R*s 198 vmul.f32 d5, d0, d20 @ I*c 199 vmul.f32 d24, d16, d30 @ R*c 200 vmul.f32 d25, d17, d31 @ -I*s 201 vmul.f32 d22, d16, d31 @ R*s 202 vmul.f32 d23, d17, d30 @ I*c 203 subs lr, lr, #16 204 vsub.f32 d6, d6, d7 @ -R*c-I*s 205 vadd.f32 d7, d4, d5 @ -R*s+I*c 206 vsub.f32 d24, d25, d24 @ I*s-R*c 207 vadd.f32 d25, d22, d23 @ R*s-I*c 208 beq 1f 209 mov r12, #-16 210 vld2.32 {d16,d18},[r9,:128],r12 @ in0u0,in0u1 in4d1,in4d0 211 vld2.32 {d17,d19},[r8,:128],r12 @ in2u0,in2u1 in3d1,in3d0 212 vneg.f32 d7, d7 @ R*s-I*c 213 vld2.32 {d0, d2}, [r7,:128]! @ in4u0,in4u1 in2d1,in2d0 214 vrev64.32 q9, q9 @ in4d0,in4d1 in3d0,in3d1 215 vld2.32 {d1, d3}, [r2,:128]! @ in3u0,in3u1 in1d1,in1d0 216 vsub.f32 d0, d18, d0 @ in4d-in4u I 217 vld2.32 {d20,d21},[r4,:128]! @ c0,c1 s0,s1 218 vrev64.32 q1, q1 @ in2d0,in2d1 in1d0,in1d1 219 vld2.32 {d30,d31},[r5,:128],r12 @ c2,c3 s2,s3 220 vadd.f32 d1, d1, d19 @ in3u+in3d -R 221 vsub.f32 d16, d16, d2 @ in0u-in2d R 222 vadd.f32 d17, d17, d3 @ in2u+in1d -I 223 uxth r12, r6, ror #16 224 uxth r6, r6 225 add r12, r1, r12, lsl #3 226 add r6, r1, r6, lsl #3 227 vst2.32 {d6[0],d7[0]}, [r6,:64] 228 vst2.32 {d6[1],d7[1]}, [r12,:64] 229 uxth r6, r10, ror #16 230 uxth r10, r10 231 add r6 , r1, r6, lsl #3 232 add r10, r1, r10, lsl #3 233 vst2.32 {d24[0],d25[0]},[r10,:64] 234 vst2.32 {d24[1],d25[1]},[r6,:64] 235 b 1b 2361: 237 vneg.f32 d7, d7 @ R*s-I*c 238 uxth r12, r6, ror #16 239 uxth r6, r6 240 add r12, r1, r12, lsl #3 241 add r6, r1, r6, lsl #3 242 vst2.32 {d6[0],d7[0]}, [r6,:64] 243 vst2.32 {d6[1],d7[1]}, [r12,:64] 244 uxth r6, r10, ror #16 245 uxth r10, r10 246 add r6 , r1, r6, lsl #3 247 add r10, r1, r10, lsl #3 248 vst2.32 {d24[0],d25[0]},[r10,:64] 249 vst2.32 {d24[1],d25[1]},[r6,:64] 250 251 mov r4, r0 252 mov r6, r1 253 bl ff_fft_calc_neon 254 255 mov r12, #1 256 ldr lr, [r4, #20] @ mdct_bits 257 ldr r4, [r4, #24] @ tcos 258 lsl r12, r12, lr @ n = 1 << nbits 259 lsr lr, r12, #3 @ n8 = n >> 3 260 261 add r4, r4, lr, lsl #3 262 add r6, r6, lr, lsl #3 263 sub r1, r4, #16 264 sub r3, r6, #16 265 266 mov r7, #-16 267 mov r8, r6 268 mov r0, r3 269 270 vld2.32 {d0-d1}, [r3,:128], r7 @ d0 =r1,i1 d1 =r0,i0 271 vld2.32 {d20-d21},[r6,:128]! @ d20=r2,i2 d21=r3,i3 272 vld2.32 {d16,d18},[r1,:128], r7 @ c1,c0 s1,s0 2731: 274 subs lr, lr, #2 275 vmul.f32 d7, d0, d18 @ r1*s1,r0*s0 276 vld2.32 {d17,d19},[r4,:128]! @ c2,c3 s2,s3 277 vmul.f32 d4, d1, d18 @ i1*s1,i0*s0 278 vmul.f32 d5, d21, d19 @ i2*s2,i3*s3 279 vmul.f32 d6, d20, d19 @ r2*s2,r3*s3 280 vmul.f32 d24, d0, d16 @ r1*c1,r0*c0 281 vmul.f32 d25, d20, d17 @ r2*c2,r3*c3 282 vmul.f32 d22, d21, d17 @ i2*c2,i3*c3 283 vmul.f32 d23, d1, d16 @ i1*c1,i0*c0 284 vadd.f32 d4, d4, d24 @ i1*s1+r1*c1,i0*s0+r0*c0 285 vadd.f32 d5, d5, d25 @ i2*s2+r2*c2,i3*s3+r3*c3 286 vsub.f32 d6, d22, d6 @ i2*c2-r2*s2,i3*c3-r3*s3 287 vsub.f32 d7, d23, d7 @ i1*c1-r1*s1,i0*c0-r0*s0 288 vneg.f32 q2, q2 289 beq 1f 290 vld2.32 {d0-d1}, [r3,:128], r7 291 vld2.32 {d20-d21},[r6,:128]! 292 vld2.32 {d16,d18},[r1,:128], r7 @ c1,c0 s1,s0 293 vrev64.32 q3, q3 294 vst2.32 {d4,d6}, [r0,:128], r7 295 vst2.32 {d5,d7}, [r8,:128]! 296 b 1b 2971: 298 vrev64.32 q3, q3 299 vst2.32 {d4,d6}, [r0,:128] 300 vst2.32 {d5,d7}, [r8,:128] 301 302 pop {r4-r10,pc} 303endfunc 304