/*
 * Copyright (C) 2010, Google Inc. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY APPLE INC. AND ITS CONTRIBUTORS ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR ITS CONTRIBUTORS BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "config.h"

#if ENABLE(WEB_AUDIO)

#include "VectorMath.h"

#if OS(DARWIN)
#include <Accelerate/Accelerate.h>
#endif

#ifdef __SSE2__
#include <emmintrin.h>
#endif

#if HAVE(ARM_NEON_INTRINSICS)
#include <arm_neon.h>
#endif

#include <algorithm>
#include <math.h>

namespace WebCore {

namespace VectorMath {

#if OS(DARWIN)
// On the Mac we use the highly optimized versions in Accelerate.framework.
// In 32-bit mode (__ppc__ or __i386__), <Accelerate/Accelerate.h> includes <vecLib/vDSP_translate.h>, which defines macros
// with the same names as our namespaced functions, so we must handle this case differently. Other architectures (64-bit, ARM, etc.) do not include this header file.
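// Each wrapper below forwards directly to the corresponding vDSP routine; for
// example, vsmul() computes destP[i] = *scale * sourceP[i] for each of the
// framesToProcess frames, honoring the given element strides.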

void vsmul(const float* sourceP, int sourceStride, const float* scale, float* destP, int destStride, size_t framesToProcess)
{
#if defined(__ppc__) || defined(__i386__)
    ::vsmul(sourceP, sourceStride, scale, destP, destStride, framesToProcess);
#else
    vDSP_vsmul(sourceP, sourceStride, scale, destP, destStride, framesToProcess);
#endif
}

void vadd(const float* source1P, int sourceStride1, const float* source2P, int sourceStride2, float* destP, int destStride, size_t framesToProcess)
{
#if defined(__ppc__) || defined(__i386__)
    ::vadd(source1P, sourceStride1, source2P, sourceStride2, destP, destStride, framesToProcess);
#else
    vDSP_vadd(source1P, sourceStride1, source2P, sourceStride2, destP, destStride, framesToProcess);
#endif
}

void vmul(const float* source1P, int sourceStride1, const float* source2P, int sourceStride2, float* destP, int destStride, size_t framesToProcess)
{
#if defined(__ppc__) || defined(__i386__)
    ::vmul(source1P, sourceStride1, source2P, sourceStride2, destP, destStride, framesToProcess);
#else
    vDSP_vmul(source1P, sourceStride1, source2P, sourceStride2, destP, destStride, framesToProcess);
#endif
}

void zvmul(const float* real1P, const float* imag1P, const float* real2P, const float* imag2P, float* realDestP, float* imagDestP, size_t framesToProcess)
{
    DSPSplitComplex sc1;
    DSPSplitComplex sc2;
    DSPSplitComplex dest;
    sc1.realp = const_cast<float*>(real1P);
    sc1.imagp = const_cast<float*>(imag1P);
    sc2.realp = const_cast<float*>(real2P);
    sc2.imagp = const_cast<float*>(imag2P);
    dest.realp = realDestP;
    dest.imagp = imagDestP;
#if defined(__ppc__) || defined(__i386__)
    ::zvmul(&sc1, 1, &sc2, 1, &dest, 1, framesToProcess, 1);
#else
    vDSP_zvmul(&sc1, 1, &sc2, 1, &dest, 1, framesToProcess, 1);
#endif
}

void vsma(const float* sourceP, int sourceStride, const float* scale, float* destP, int destStride, size_t framesToProcess)
{
    vDSP_vsma(sourceP, sourceStride, scale, destP, destStride, destP, destStride, framesToProcess);
}

void vmaxmgv(const float* sourceP, int sourceStride, float* maxP, size_t framesToProcess)
{
    vDSP_maxmgv(sourceP, sourceStride, maxP, framesToProcess);
}

void vsvesq(const float* sourceP, int sourceStride, float* sumP, size_t framesToProcess)
{
    vDSP_svesq(const_cast<float*>(sourceP), sourceStride, sumP, framesToProcess);
}

void vclip(const float* sourceP, int sourceStride, const float* lowThresholdP, const float* highThresholdP, float* destP, int destStride, size_t framesToProcess)
{
    vDSP_vclip(const_cast<float*>(sourceP), sourceStride, const_cast<float*>(lowThresholdP), const_cast<float*>(highThresholdP), destP, destStride, framesToProcess);
}
#else

void vsma(const float* sourceP, int sourceStride, const float* scale, float* destP, int destStride, size_t framesToProcess)
{
    int n = framesToProcess;

#ifdef __SSE2__
    if ((sourceStride == 1) && (destStride == 1)) {
        float k = *scale;

        // If the sourceP address is not 16-byte aligned, the first several frames (at most three) should be processed separately.
        while ((reinterpret_cast<uintptr_t>(sourceP) & 0x0F) && n) {
            *destP += k * *sourceP;
            sourceP++;
            destP++;
            n--;
        }

        // Now sourceP is aligned; use SSE.
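        // Process the bulk of the buffer four frames per iteration; the
        // remainder (fewer than four frames) falls through to the scalar loop below.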
        int tailFrames = n % 4;
        const float* endP = destP + n - tailFrames;

        __m128 pSource;
        __m128 dest;
        __m128 temp;
        __m128 mScale = _mm_set_ps1(k);

        bool destAligned = !(reinterpret_cast<uintptr_t>(destP) & 0x0F);

#define SSE2_MULT_ADD(loadInstr, storeInstr) \
    while (destP < endP) \
    { \
        pSource = _mm_load_ps(sourceP); \
        temp = _mm_mul_ps(pSource, mScale); \
        dest = _mm_##loadInstr##_ps(destP); \
        dest = _mm_add_ps(dest, temp); \
        _mm_##storeInstr##_ps(destP, dest); \
        sourceP += 4; \
        destP += 4; \
    }

        if (destAligned)
            SSE2_MULT_ADD(load, store)
        else
            SSE2_MULT_ADD(loadu, storeu)

        n = tailFrames;
    }
#elif HAVE(ARM_NEON_INTRINSICS)
    if ((sourceStride == 1) && (destStride == 1)) {
        int tailFrames = n % 4;
        const float* endP = destP + n - tailFrames;

        float32x4_t k = vdupq_n_f32(*scale);
        while (destP < endP) {
            float32x4_t source = vld1q_f32(sourceP);
            float32x4_t dest = vld1q_f32(destP);

            dest = vmlaq_f32(dest, source, k);
            vst1q_f32(destP, dest);

            sourceP += 4;
            destP += 4;
        }
        n = tailFrames;
    }
#endif
    while (n) {
        *destP += *sourceP * *scale;
        sourceP += sourceStride;
        destP += destStride;
        n--;
    }
}

void vsmul(const float* sourceP, int sourceStride, const float* scale, float* destP, int destStride, size_t framesToProcess)
{
    int n = framesToProcess;

#ifdef __SSE2__
    if ((sourceStride == 1) && (destStride == 1)) {
        float k = *scale;

        // If the sourceP address is not 16-byte aligned, the first several frames (at most three) should be processed separately.
        while ((reinterpret_cast<uintptr_t>(sourceP) & 0x0F) && n) {
            *destP = k * *sourceP;
            sourceP++;
            destP++;
            n--;
        }

        // Now the sourceP address is aligned; use SSE.
        int group = n / 4;
        __m128 mScale = _mm_set_ps1(k);
        __m128* pSource;
        __m128* pDest;
        __m128 dest;

        if (reinterpret_cast<uintptr_t>(destP) & 0x0F) {
            while (group--) {
                pSource = reinterpret_cast<__m128*>(const_cast<float*>(sourceP));
                dest = _mm_mul_ps(*pSource, mScale);
                _mm_storeu_ps(destP, dest);

                sourceP += 4;
                destP += 4;
            }
        } else {
            while (group--) {
                pSource = reinterpret_cast<__m128*>(const_cast<float*>(sourceP));
                pDest = reinterpret_cast<__m128*>(destP);
                *pDest = _mm_mul_ps(*pSource, mScale);

                sourceP += 4;
                destP += 4;
            }
        }

        // Scalar handling for the remaining frames (fewer than four).
        n %= 4;
        while (n) {
            *destP = k * *sourceP;
            sourceP++;
            destP++;
            n--;
        }
    } else { // If strides are not 1, fall back to the scalar algorithm.
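        // With non-unit strides the frames are not contiguous in memory, so the
        // aligned vector loads above cannot be used.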
#elif HAVE(ARM_NEON_INTRINSICS)
    if ((sourceStride == 1) && (destStride == 1)) {
        float k = *scale;
        int tailFrames = n % 4;
        const float* endP = destP + n - tailFrames;

        while (destP < endP) {
            float32x4_t source = vld1q_f32(sourceP);
            vst1q_f32(destP, vmulq_n_f32(source, k));

            sourceP += 4;
            destP += 4;
        }
        n = tailFrames;
    }
#endif
    float k = *scale;
    while (n--) {
        *destP = k * *sourceP;
        sourceP += sourceStride;
        destP += destStride;
    }
#ifdef __SSE2__
    }
#endif
}

void vadd(const float* source1P, int sourceStride1, const float* source2P, int sourceStride2, float* destP, int destStride, size_t framesToProcess)
{
    int n = framesToProcess;

#ifdef __SSE2__
    if ((sourceStride1 == 1) && (sourceStride2 == 1) && (destStride == 1)) {
        // If the source1P address is not 16-byte aligned, the first several frames (at most three) should be processed separately.
        while ((reinterpret_cast<uintptr_t>(source1P) & 0x0F) && n) {
            *destP = *source1P + *source2P;
            source1P++;
            source2P++;
            destP++;
            n--;
        }

        // Now the source1P address is aligned; use SSE.
        int group = n / 4;
        __m128* pSource1;
        __m128* pSource2;
        __m128* pDest;
        __m128 source2;
        __m128 dest;

        bool source2Aligned = !(reinterpret_cast<uintptr_t>(source2P) & 0x0F);
        bool destAligned = !(reinterpret_cast<uintptr_t>(destP) & 0x0F);

        if (source2Aligned && destAligned) { // All aligned.
            while (group--) {
                pSource1 = reinterpret_cast<__m128*>(const_cast<float*>(source1P));
                pSource2 = reinterpret_cast<__m128*>(const_cast<float*>(source2P));
                pDest = reinterpret_cast<__m128*>(destP);
                *pDest = _mm_add_ps(*pSource1, *pSource2);

                source1P += 4;
                source2P += 4;
                destP += 4;
            }
        } else if (source2Aligned && !destAligned) { // source2 aligned but dest not aligned.
            while (group--) {
                pSource1 = reinterpret_cast<__m128*>(const_cast<float*>(source1P));
                pSource2 = reinterpret_cast<__m128*>(const_cast<float*>(source2P));
                dest = _mm_add_ps(*pSource1, *pSource2);
                _mm_storeu_ps(destP, dest);

                source1P += 4;
                source2P += 4;
                destP += 4;
            }
        } else if (!source2Aligned && destAligned) { // source2 not aligned but dest aligned.
            while (group--) {
                pSource1 = reinterpret_cast<__m128*>(const_cast<float*>(source1P));
                source2 = _mm_loadu_ps(source2P);
                pDest = reinterpret_cast<__m128*>(destP);
                *pDest = _mm_add_ps(*pSource1, source2);

                source1P += 4;
                source2P += 4;
                destP += 4;
            }
        } else { // Neither source2 nor dest aligned.
            while (group--) {
                pSource1 = reinterpret_cast<__m128*>(const_cast<float*>(source1P));
                source2 = _mm_loadu_ps(source2P);
                dest = _mm_add_ps(*pSource1, source2);
                _mm_storeu_ps(destP, dest);

                source1P += 4;
                source2P += 4;
                destP += 4;
            }
        }
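
        // Four separate loops are emitted because the aligned load/store forms
        // (_mm_load_ps / _mm_store_ps and direct __m128 dereference) are cheaper
        // than the unaligned ones, and the alignment of source2P and destP is
        // only known at run time.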

        // Scalar handling for the remaining frames (fewer than four).
        n %= 4;
        while (n) {
            *destP = *source1P + *source2P;
            source1P++;
            source2P++;
            destP++;
            n--;
        }
    } else { // If strides are not 1, fall back to the scalar algorithm.
#elif HAVE(ARM_NEON_INTRINSICS)
    if ((sourceStride1 == 1) && (sourceStride2 == 1) && (destStride == 1)) {
        int tailFrames = n % 4;
        const float* endP = destP + n - tailFrames;

        while (destP < endP) {
            float32x4_t source1 = vld1q_f32(source1P);
            float32x4_t source2 = vld1q_f32(source2P);
            vst1q_f32(destP, vaddq_f32(source1, source2));

            source1P += 4;
            source2P += 4;
            destP += 4;
        }
        n = tailFrames;
    }
#endif
    while (n--) {
        *destP = *source1P + *source2P;
        source1P += sourceStride1;
        source2P += sourceStride2;
        destP += destStride;
    }
#ifdef __SSE2__
    }
#endif
}

void vmul(const float* source1P, int sourceStride1, const float* source2P, int sourceStride2, float* destP, int destStride, size_t framesToProcess)
{
    int n = framesToProcess;

#ifdef __SSE2__
    if ((sourceStride1 == 1) && (sourceStride2 == 1) && (destStride == 1)) {
        // If the source1P address is not 16-byte aligned, the first several frames (at most three) should be processed separately.
        while ((reinterpret_cast<uintptr_t>(source1P) & 0x0F) && n) {
            *destP = *source1P * *source2P;
            source1P++;
            source2P++;
            destP++;
            n--;
        }

        // Now the source1P address is aligned; use SSE.
        int tailFrames = n % 4;
        const float* endP = destP + n - tailFrames;
        __m128 pSource1;
        __m128 pSource2;
        __m128 dest;

        bool source2Aligned = !(reinterpret_cast<uintptr_t>(source2P) & 0x0F);
        bool destAligned = !(reinterpret_cast<uintptr_t>(destP) & 0x0F);

#define SSE2_MULT(loadInstr, storeInstr) \
    while (destP < endP) \
    { \
        pSource1 = _mm_load_ps(source1P); \
        pSource2 = _mm_##loadInstr##_ps(source2P); \
        dest = _mm_mul_ps(pSource1, pSource2); \
        _mm_##storeInstr##_ps(destP, dest); \
        source1P += 4; \
        source2P += 4; \
        destP += 4; \
    }

        if (source2Aligned && destAligned) // Both aligned.
            SSE2_MULT(load, store)
        else if (source2Aligned && !destAligned) // source2 is aligned but dest is not.
            SSE2_MULT(load, storeu)
        else if (!source2Aligned && destAligned) // dest is aligned but source2 is not.
            SSE2_MULT(loadu, store)
        else // Neither aligned.
            SSE2_MULT(loadu, storeu)

        n = tailFrames;
    }
#elif HAVE(ARM_NEON_INTRINSICS)
    if ((sourceStride1 == 1) && (sourceStride2 == 1) && (destStride == 1)) {
        int tailFrames = n % 4;
        const float* endP = destP + n - tailFrames;

        while (destP < endP) {
            float32x4_t source1 = vld1q_f32(source1P);
            float32x4_t source2 = vld1q_f32(source2P);
            vst1q_f32(destP, vmulq_f32(source1, source2));

            source1P += 4;
            source2P += 4;
            destP += 4;
        }
        n = tailFrames;
    }
#endif
    while (n) {
        *destP = *source1P * *source2P;
        source1P += sourceStride1;
        source2P += sourceStride2;
        destP += destStride;
        n--;
    }
}

void zvmul(const float* real1P, const float* imag1P, const float* real2P, const float* imag2P, float* realDestP, float* imagDestP, size_t framesToProcess)
{
    unsigned i = 0;
#ifdef __SSE2__
    // Only use the SSE optimization in the very common case that all addresses are 16-byte aligned.
    // Otherwise, fall through to the scalar code below.
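    // Elementwise complex multiply: (a + bi)(c + di) = (ac - bd) + (ad + bc)i.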
    if (!(reinterpret_cast<uintptr_t>(real1P) & 0x0F)
        && !(reinterpret_cast<uintptr_t>(imag1P) & 0x0F)
        && !(reinterpret_cast<uintptr_t>(real2P) & 0x0F)
        && !(reinterpret_cast<uintptr_t>(imag2P) & 0x0F)
        && !(reinterpret_cast<uintptr_t>(realDestP) & 0x0F)
        && !(reinterpret_cast<uintptr_t>(imagDestP) & 0x0F)) {

        unsigned endSize = framesToProcess - framesToProcess % 4;
        while (i < endSize) {
            __m128 real1 = _mm_load_ps(real1P + i);
            __m128 real2 = _mm_load_ps(real2P + i);
            __m128 imag1 = _mm_load_ps(imag1P + i);
            __m128 imag2 = _mm_load_ps(imag2P + i);
            __m128 real = _mm_mul_ps(real1, real2);
            real = _mm_sub_ps(real, _mm_mul_ps(imag1, imag2));
            __m128 imag = _mm_mul_ps(real1, imag2);
            imag = _mm_add_ps(imag, _mm_mul_ps(imag1, real2));
            _mm_store_ps(realDestP + i, real);
            _mm_store_ps(imagDestP + i, imag);
            i += 4;
        }
    }
#elif HAVE(ARM_NEON_INTRINSICS)
    unsigned endSize = framesToProcess - framesToProcess % 4;
    while (i < endSize) {
        float32x4_t real1 = vld1q_f32(real1P + i);
        float32x4_t real2 = vld1q_f32(real2P + i);
        float32x4_t imag1 = vld1q_f32(imag1P + i);
        float32x4_t imag2 = vld1q_f32(imag2P + i);

        float32x4_t realResult = vmlsq_f32(vmulq_f32(real1, real2), imag1, imag2);
        float32x4_t imagResult = vmlaq_f32(vmulq_f32(real1, imag2), imag1, real2);

        vst1q_f32(realDestP + i, realResult);
        vst1q_f32(imagDestP + i, imagResult);

        i += 4;
    }
#endif
    for (; i < framesToProcess; ++i) {
        // Read and compute the results before storing them, in case a
        // destination aliases one of the sources.
        float realResult = real1P[i] * real2P[i] - imag1P[i] * imag2P[i];
        float imagResult = real1P[i] * imag2P[i] + imag1P[i] * real2P[i];

        realDestP[i] = realResult;
        imagDestP[i] = imagResult;
    }
}

void vsvesq(const float* sourceP, int sourceStride, float* sumP, size_t framesToProcess)
{
    int n = framesToProcess;
    float sum = 0;

#ifdef __SSE2__
    if (sourceStride == 1) {
        // If the sourceP address is not 16-byte aligned, the first several frames (at most three) should be processed separately.
        while ((reinterpret_cast<uintptr_t>(sourceP) & 0x0F) && n) {
            float sample = *sourceP;
            sum += sample * sample;
            sourceP++;
            n--;
        }

        // Now sourceP is aligned; use SSE.
        int tailFrames = n % 4;
        const float* endP = sourceP + n - tailFrames;
        __m128 source;
        __m128 mSum = _mm_setzero_ps();

        while (sourceP < endP) {
            source = _mm_load_ps(sourceP);
            source = _mm_mul_ps(source, source);
            mSum = _mm_add_ps(mSum, source);
            sourceP += 4;
        }

        // Sum the four partial sums accumulated in the SSE register.
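        // Reinterpreting the __m128 as four floats lets us fold the vector
        // lanes into the scalar accumulator without a horizontal-add intrinsic
        // (SSE2 has none; _mm_hadd_ps only arrived with SSE3).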
        const float* groupSumP = reinterpret_cast<float*>(&mSum);
        sum += groupSumP[0] + groupSumP[1] + groupSumP[2] + groupSumP[3];

        n = tailFrames;
    }
#elif HAVE(ARM_NEON_INTRINSICS)
    if (sourceStride == 1) {
        int tailFrames = n % 4;
        const float* endP = sourceP + n - tailFrames;

        float32x4_t fourSum = vdupq_n_f32(0);
        while (sourceP < endP) {
            float32x4_t source = vld1q_f32(sourceP);
            fourSum = vmlaq_f32(fourSum, source, source);
            sourceP += 4;
        }
        float32x2_t twoSum = vadd_f32(vget_low_f32(fourSum), vget_high_f32(fourSum));

        float groupSum[2];
        vst1_f32(groupSum, twoSum);
        sum += groupSum[0] + groupSum[1];

        n = tailFrames;
    }
#endif

    while (n--) {
        float sample = *sourceP;
        sum += sample * sample;
        sourceP += sourceStride;
    }

    ASSERT(sumP);
    *sumP = sum;
}

void vmaxmgv(const float* sourceP, int sourceStride, float* maxP, size_t framesToProcess)
{
    int n = framesToProcess;
    float max = 0;

#ifdef __SSE2__
    if (sourceStride == 1) {
        // If the sourceP address is not 16-byte aligned, the first several frames (at most three) should be processed separately.
        while ((reinterpret_cast<uintptr_t>(sourceP) & 0x0F) && n) {
            max = std::max(max, fabsf(*sourceP));
            sourceP++;
            n--;
        }

        // Now sourceP is aligned; use SSE.
        int tailFrames = n % 4;
        const float* endP = sourceP + n - tailFrames;
        __m128 source;
        __m128 mMax = _mm_setzero_ps();
        int mask = 0x7FFFFFFF;
        __m128 mMask = _mm_set1_ps(*reinterpret_cast<float*>(&mask));

        while (sourceP < endP) {
            source = _mm_load_ps(sourceP);
            // Compute the absolute value by ANDing the source with the mask, which clears the sign bit.
            source = _mm_and_ps(source, mMask);
            mMax = _mm_max_ps(mMax, source);
            sourceP += 4;
        }

        // Get the max from the SSE results.
        const float* groupMaxP = reinterpret_cast<float*>(&mMax);
        max = std::max(max, groupMaxP[0]);
        max = std::max(max, groupMaxP[1]);
        max = std::max(max, groupMaxP[2]);
        max = std::max(max, groupMaxP[3]);

        n = tailFrames;
    }
#elif HAVE(ARM_NEON_INTRINSICS)
    if (sourceStride == 1) {
        int tailFrames = n % 4;
        const float* endP = sourceP + n - tailFrames;

        float32x4_t fourMax = vdupq_n_f32(0);
        while (sourceP < endP) {
            float32x4_t source = vld1q_f32(sourceP);
            fourMax = vmaxq_f32(fourMax, vabsq_f32(source));
            sourceP += 4;
        }
        float32x2_t twoMax = vmax_f32(vget_low_f32(fourMax), vget_high_f32(fourMax));

        float groupMax[2];
        vst1_f32(groupMax, twoMax);
        max = std::max(groupMax[0], groupMax[1]);

        n = tailFrames;
    }
#endif

    while (n--) {
        max = std::max(max, fabsf(*sourceP));
        sourceP += sourceStride;
    }

    ASSERT(maxP);
    *maxP = max;
}

void vclip(const float* sourceP, int sourceStride, const float* lowThresholdP, const float* highThresholdP, float* destP, int destStride, size_t framesToProcess)
{
    int n = framesToProcess;
    float lowThreshold = *lowThresholdP;
    float highThreshold = *highThresholdP;

    // FIXME: Optimize for SSE2.
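    // An SSE2 version could follow the same shape as the NEON loop below,
    // using _mm_min_ps and _mm_max_ps to clamp four frames at a time.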
#if HAVE(ARM_NEON_INTRINSICS)
    if ((sourceStride == 1) && (destStride == 1)) {
        int tailFrames = n % 4;
        const float* endP = destP + n - tailFrames;

        float32x4_t low = vdupq_n_f32(lowThreshold);
        float32x4_t high = vdupq_n_f32(highThreshold);
        while (destP < endP) {
            float32x4_t source = vld1q_f32(sourceP);
            vst1q_f32(destP, vmaxq_f32(vminq_f32(source, high), low));
            sourceP += 4;
            destP += 4;
        }
        n = tailFrames;
    }
#endif
    while (n--) {
        *destP = std::max(std::min(*sourceP, highThreshold), lowThreshold);
        sourceP += sourceStride;
        destP += destStride;
    }
}

#endif // OS(DARWIN)

} // namespace VectorMath

} // namespace WebCore

#endif // ENABLE(WEB_AUDIO)