/*
 * Copyright (C) 2010, Google Inc. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1.  Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2.  Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY APPLE INC. AND ITS CONTRIBUTORS ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR ITS CONTRIBUTORS BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "config.h"

#if ENABLE(WEB_AUDIO)

#include "VectorMath.h"

#if OS(DARWIN)
#include <Accelerate/Accelerate.h>
#endif

#ifdef __SSE2__
#include <emmintrin.h>
#endif

#if HAVE(ARM_NEON_INTRINSICS)
#include <arm_neon.h>
#endif

#include <algorithm>
#include <math.h>

namespace WebCore {

namespace VectorMath {

#if OS(DARWIN)
// On the Mac we use the highly optimized versions in Accelerate.framework.
// In 32-bit mode (__ppc__ or __i386__) <Accelerate/Accelerate.h> includes <vecLib/vDSP_translate.h>, which defines macros with the same names as
// our namespaced function names, so we must handle this case differently. Other architectures (64-bit, ARM, etc.) do not include this header file.

void vsmul(const float* sourceP, int sourceStride, const float* scale, float* destP, int destStride, size_t framesToProcess)
{
#if defined(__ppc__) || defined(__i386__)
    ::vsmul(sourceP, sourceStride, scale, destP, destStride, framesToProcess);
#else
    vDSP_vsmul(sourceP, sourceStride, scale, destP, destStride, framesToProcess);
#endif
}

void vadd(const float* source1P, int sourceStride1, const float* source2P, int sourceStride2, float* destP, int destStride, size_t framesToProcess)
{
#if defined(__ppc__) || defined(__i386__)
    ::vadd(source1P, sourceStride1, source2P, sourceStride2, destP, destStride, framesToProcess);
#else
    vDSP_vadd(source1P, sourceStride1, source2P, sourceStride2, destP, destStride, framesToProcess);
#endif
}

void vmul(const float* source1P, int sourceStride1, const float* source2P, int sourceStride2, float* destP, int destStride, size_t framesToProcess)
{
#if defined(__ppc__) || defined(__i386__)
    ::vmul(source1P, sourceStride1, source2P, sourceStride2, destP, destStride, framesToProcess);
#else
    vDSP_vmul(source1P, sourceStride1, source2P, sourceStride2, destP, destStride, framesToProcess);
#endif
}

void zvmul(const float* real1P, const float* imag1P, const float* real2P, const float* imag2P, float* realDestP, float* imagDestP, size_t framesToProcess)
{
    DSPSplitComplex sc1;
    DSPSplitComplex sc2;
    DSPSplitComplex dest;
    sc1.realp = const_cast<float*>(real1P);
    sc1.imagp = const_cast<float*>(imag1P);
    sc2.realp = const_cast<float*>(real2P);
    sc2.imagp = const_cast<float*>(imag2P);
    dest.realp = realDestP;
    dest.imagp = imagDestP;
#if defined(__ppc__) || defined(__i386__)
    ::zvmul(&sc1, 1, &sc2, 1, &dest, 1, framesToProcess, 1);
#else
    vDSP_zvmul(&sc1, 1, &sc2, 1, &dest, 1, framesToProcess, 1);
#endif
}

void vsma(const float* sourceP, int sourceStride, const float* scale, float* destP, int destStride, size_t framesToProcess)
{
    vDSP_vsma(sourceP, sourceStride, scale, destP, destStride, destP, destStride, framesToProcess);
}

void vmaxmgv(const float* sourceP, int sourceStride, float* maxP, size_t framesToProcess)
{
    vDSP_maxmgv(sourceP, sourceStride, maxP, framesToProcess);
}

void vsvesq(const float* sourceP, int sourceStride, float* sumP, size_t framesToProcess)
{
    vDSP_svesq(const_cast<float*>(sourceP), sourceStride, sumP, framesToProcess);
}

void vclip(const float* sourceP, int sourceStride, const float* lowThresholdP, const float* highThresholdP, float* destP, int destStride, size_t framesToProcess)
{
    vDSP_vclip(const_cast<float*>(sourceP), sourceStride, const_cast<float*>(lowThresholdP), const_cast<float*>(highThresholdP), destP, destStride, framesToProcess);
}
#else

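// vsma (vector scalar multiply-add): destP[k] += *scale * sourceP[k] for each of the
// framesToProcess frames, stepping the pointers by the given strides.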
void vsma(const float* sourceP, int sourceStride, const float* scale, float* destP, int destStride, size_t framesToProcess)
{
    int n = framesToProcess;

#ifdef __SSE2__
    if ((sourceStride == 1) && (destStride == 1)) {
        float k = *scale;

        // If the sourceP address is not 16-byte aligned, the first several frames (at most three) should be processed separately.
        while ((reinterpret_cast<uintptr_t>(sourceP) & 0x0F) && n) {
            *destP += k * *sourceP;
            sourceP++;
            destP++;
            n--;
        }

        // Now the sourceP address is aligned; use SSE.
        int tailFrames = n % 4;
        const float* endP = destP + n - tailFrames;

        __m128 pSource;
        __m128 dest;
        __m128 temp;
        __m128 mScale = _mm_set_ps1(k);

        bool destAligned = !(reinterpret_cast<uintptr_t>(destP) & 0x0F);

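        // SSE2_MULT_ADD processes four frames per iteration. sourceP is known to be 16-byte
        // aligned at this point; the destination load/store instructions are macro arguments
        // so the same loop body can be expanded for an aligned or unaligned destP.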
#define SSE2_MULT_ADD(loadInstr, storeInstr)        \
            while (destP < endP)                    \
            {                                       \
                pSource = _mm_load_ps(sourceP);     \
                temp = _mm_mul_ps(pSource, mScale); \
                dest = _mm_##loadInstr##_ps(destP); \
                dest = _mm_add_ps(dest, temp);      \
                _mm_##storeInstr##_ps(destP, dest); \
                sourceP += 4;                       \
                destP += 4;                         \
            }

        if (destAligned)
            SSE2_MULT_ADD(load, store)
        else
            SSE2_MULT_ADD(loadu, storeu)

        n = tailFrames;
    }
#elif HAVE(ARM_NEON_INTRINSICS)
    if ((sourceStride == 1) && (destStride == 1)) {
        int tailFrames = n % 4;
        const float* endP = destP + n - tailFrames;

        float32x4_t k = vdupq_n_f32(*scale);
        while (destP < endP) {
            float32x4_t source = vld1q_f32(sourceP);
            float32x4_t dest = vld1q_f32(destP);

            dest = vmlaq_f32(dest, source, k);
            vst1q_f32(destP, dest);

            sourceP += 4;
            destP += 4;
        }
        n = tailFrames;
    }
#endif
    while (n) {
        *destP += *sourceP * *scale;
        sourceP += sourceStride;
        destP += destStride;
        n--;
    }
}

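// vsmul (vector scalar multiply): destP[k] = *scale * sourceP[k].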
void vsmul(const float* sourceP, int sourceStride, const float* scale, float* destP, int destStride, size_t framesToProcess)
{
    int n = framesToProcess;

#ifdef __SSE2__
    if ((sourceStride == 1) && (destStride == 1)) {
        float k = *scale;

        // If the sourceP address is not 16-byte aligned, the first several frames (at most three) should be processed separately.
        while ((reinterpret_cast<uintptr_t>(sourceP) & 0x0F) && n) {
            *destP = k * *sourceP;
            sourceP++;
            destP++;
            n--;
        }

        // Now the sourceP address is aligned; use SSE.
        int group = n / 4;
        __m128 mScale = _mm_set_ps1(k);
        __m128* pSource;
        __m128* pDest;
        __m128 dest;

        if (reinterpret_cast<uintptr_t>(destP) & 0x0F) {
            while (group--) {
                pSource = reinterpret_cast<__m128*>(const_cast<float*>(sourceP));
                dest = _mm_mul_ps(*pSource, mScale);
                _mm_storeu_ps(destP, dest);

                sourceP += 4;
                destP += 4;
            }
        } else {
            while (group--) {
                pSource = reinterpret_cast<__m128*>(const_cast<float*>(sourceP));
                pDest = reinterpret_cast<__m128*>(destP);
                *pDest = _mm_mul_ps(*pSource, mScale);

                sourceP += 4;
                destP += 4;
            }
        }

        // Non-SSE handling for the remaining frames (fewer than four).
        n %= 4;
        while (n) {
            *destP = k * *sourceP;
            sourceP++;
            destP++;
            n--;
        }
    } else { // If the strides are not 1, fall back to the scalar algorithm.
#elif HAVE(ARM_NEON_INTRINSICS)
    if ((sourceStride == 1) && (destStride == 1)) {
        float k = *scale;
        int tailFrames = n % 4;
        const float* endP = destP + n - tailFrames;

        while (destP < endP) {
            float32x4_t source = vld1q_f32(sourceP);
            vst1q_f32(destP, vmulq_n_f32(source, k));

            sourceP += 4;
            destP += 4;
        }
        n = tailFrames;
    }
#endif
    float k = *scale;
    while (n--) {
        *destP = k * *sourceP;
        sourceP += sourceStride;
        destP += destStride;
    }
#ifdef __SSE2__
    }
#endif
}

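// vadd (vector add): destP[k] = source1P[k] + source2P[k].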
void vadd(const float* source1P, int sourceStride1, const float* source2P, int sourceStride2, float* destP, int destStride, size_t framesToProcess)
{
    int n = framesToProcess;

#ifdef __SSE2__
    if ((sourceStride1 == 1) && (sourceStride2 == 1) && (destStride == 1)) {
        // If the source1P address is not 16-byte aligned, the first several frames (at most three) should be processed separately.
        while ((reinterpret_cast<uintptr_t>(source1P) & 0x0F) && n) {
            *destP = *source1P + *source2P;
            source1P++;
            source2P++;
            destP++;
            n--;
        }

        // Now the source1P address is aligned; use SSE.
        int group = n / 4;
        __m128* pSource1;
        __m128* pSource2;
        __m128* pDest;
        __m128 source2;
        __m128 dest;

        bool source2Aligned = !(reinterpret_cast<uintptr_t>(source2P) & 0x0F);
        bool destAligned = !(reinterpret_cast<uintptr_t>(destP) & 0x0F);

        if (source2Aligned && destAligned) { // All aligned.
            while (group--) {
                pSource1 = reinterpret_cast<__m128*>(const_cast<float*>(source1P));
                pSource2 = reinterpret_cast<__m128*>(const_cast<float*>(source2P));
                pDest = reinterpret_cast<__m128*>(destP);
                *pDest = _mm_add_ps(*pSource1, *pSource2);

                source1P += 4;
                source2P += 4;
                destP += 4;
            }

        } else if (source2Aligned && !destAligned) { // source2 aligned but dest not aligned.
            while (group--) {
                pSource1 = reinterpret_cast<__m128*>(const_cast<float*>(source1P));
                pSource2 = reinterpret_cast<__m128*>(const_cast<float*>(source2P));
                dest = _mm_add_ps(*pSource1, *pSource2);
                _mm_storeu_ps(destP, dest);

                source1P += 4;
                source2P += 4;
                destP += 4;
            }

        } else if (!source2Aligned && destAligned) { // source2 not aligned but dest aligned.
            while (group--) {
                pSource1 = reinterpret_cast<__m128*>(const_cast<float*>(source1P));
                source2 = _mm_loadu_ps(source2P);
                pDest = reinterpret_cast<__m128*>(destP);
                *pDest = _mm_add_ps(*pSource1, source2);

                source1P += 4;
                source2P += 4;
                destP += 4;
            }
        } else if (!source2Aligned && !destAligned) { // Neither source2 nor dest is aligned.
            while (group--) {
                pSource1 = reinterpret_cast<__m128*>(const_cast<float*>(source1P));
                source2 = _mm_loadu_ps(source2P);
                dest = _mm_add_ps(*pSource1, source2);
                _mm_storeu_ps(destP, dest);

                source1P += 4;
                source2P += 4;
                destP += 4;
            }
        }

        // Non-SSE handling for the remaining frames (fewer than four).
        n %= 4;
        while (n) {
            *destP = *source1P + *source2P;
            source1P++;
            source2P++;
            destP++;
            n--;
        }
    } else { // If the strides are not 1, fall back to the scalar algorithm.
#elif HAVE(ARM_NEON_INTRINSICS)
    if ((sourceStride1 == 1) && (sourceStride2 == 1) && (destStride == 1)) {
        int tailFrames = n % 4;
        const float* endP = destP + n - tailFrames;

        while (destP < endP) {
            float32x4_t source1 = vld1q_f32(source1P);
            float32x4_t source2 = vld1q_f32(source2P);
            vst1q_f32(destP, vaddq_f32(source1, source2));

            source1P += 4;
            source2P += 4;
            destP += 4;
        }
        n = tailFrames;
    }
#endif
    while (n--) {
        *destP = *source1P + *source2P;
        source1P += sourceStride1;
        source2P += sourceStride2;
        destP += destStride;
    }
#ifdef __SSE2__
    }
#endif
}

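// vmul (element-wise vector multiply): destP[k] = source1P[k] * source2P[k].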
void vmul(const float* source1P, int sourceStride1, const float* source2P, int sourceStride2, float* destP, int destStride, size_t framesToProcess)
{
    int n = framesToProcess;

#ifdef __SSE2__
    if ((sourceStride1 == 1) && (sourceStride2 == 1) && (destStride == 1)) {
        // If the source1P address is not 16-byte aligned, the first several frames (at most three) should be processed separately.
        while ((reinterpret_cast<uintptr_t>(source1P) & 0x0F) && n) {
            *destP = *source1P * *source2P;
            source1P++;
            source2P++;
            destP++;
            n--;
        }

        // Now the source1P address is aligned; use SSE.
        int tailFrames = n % 4;
        const float* endP = destP + n - tailFrames;
        __m128 pSource1;
        __m128 pSource2;
        __m128 dest;

        bool source2Aligned = !(reinterpret_cast<uintptr_t>(source2P) & 0x0F);
        bool destAligned = !(reinterpret_cast<uintptr_t>(destP) & 0x0F);

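        // SSE2_MULT processes four frames per iteration. source1P is known to be 16-byte aligned
        // at this point; the source2 load and dest store instructions are chosen below based on
        // their alignment.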
#define SSE2_MULT(loadInstr, storeInstr)                   \
            while (destP < endP)                           \
            {                                              \
                pSource1 = _mm_load_ps(source1P);          \
                pSource2 = _mm_##loadInstr##_ps(source2P); \
                dest = _mm_mul_ps(pSource1, pSource2);     \
                _mm_##storeInstr##_ps(destP, dest);        \
                source1P += 4;                             \
                source2P += 4;                             \
                destP += 4;                                \
            }

        if (source2Aligned && destAligned) // Both aligned.
            SSE2_MULT(load, store)
        else if (source2Aligned && !destAligned) // Source2 is aligned but dest is not.
            SSE2_MULT(load, storeu)
        else if (!source2Aligned && destAligned) // Dest is aligned but source2 is not.
            SSE2_MULT(loadu, store)
        else // Neither is aligned.
            SSE2_MULT(loadu, storeu)

        n = tailFrames;
    }
#elif HAVE(ARM_NEON_INTRINSICS)
    if ((sourceStride1 == 1) && (sourceStride2 == 1) && (destStride == 1)) {
        int tailFrames = n % 4;
        const float* endP = destP + n - tailFrames;

        while (destP < endP) {
            float32x4_t source1 = vld1q_f32(source1P);
            float32x4_t source2 = vld1q_f32(source2P);
            vst1q_f32(destP, vmulq_f32(source1, source2));

            source1P += 4;
            source2P += 4;
            destP += 4;
        }
        n = tailFrames;
    }
#endif
    while (n) {
        *destP = *source1P * *source2P;
        source1P += sourceStride1;
        source2P += sourceStride2;
        destP += destStride;
        n--;
    }
}

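// zvmul (complex vector multiply) on split real/imaginary arrays, using
// (a + bi)(c + di) = (ac - bd) + (ad + bc)i.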
void zvmul(const float* real1P, const float* imag1P, const float* real2P, const float* imag2P, float* realDestP, float* imagDestP, size_t framesToProcess)
{
    unsigned i = 0;
#ifdef __SSE2__
    // Only use the SSE optimization in the very common case that all addresses are 16-byte aligned.
    // Otherwise, fall through to the scalar code below.
    if (!(reinterpret_cast<uintptr_t>(real1P) & 0x0F)
        && !(reinterpret_cast<uintptr_t>(imag1P) & 0x0F)
        && !(reinterpret_cast<uintptr_t>(real2P) & 0x0F)
        && !(reinterpret_cast<uintptr_t>(imag2P) & 0x0F)
        && !(reinterpret_cast<uintptr_t>(realDestP) & 0x0F)
        && !(reinterpret_cast<uintptr_t>(imagDestP) & 0x0F)) {

        unsigned endSize = framesToProcess - framesToProcess % 4;
        while (i < endSize) {
            __m128 real1 = _mm_load_ps(real1P + i);
            __m128 real2 = _mm_load_ps(real2P + i);
            __m128 imag1 = _mm_load_ps(imag1P + i);
            __m128 imag2 = _mm_load_ps(imag2P + i);
            __m128 real = _mm_mul_ps(real1, real2);
            real = _mm_sub_ps(real, _mm_mul_ps(imag1, imag2));
            __m128 imag = _mm_mul_ps(real1, imag2);
            imag = _mm_add_ps(imag, _mm_mul_ps(imag1, real2));
            _mm_store_ps(realDestP + i, real);
            _mm_store_ps(imagDestP + i, imag);
            i += 4;
        }
    }
#elif HAVE(ARM_NEON_INTRINSICS)
    unsigned endSize = framesToProcess - framesToProcess % 4;
    while (i < endSize) {
        float32x4_t real1 = vld1q_f32(real1P + i);
        float32x4_t real2 = vld1q_f32(real2P + i);
        float32x4_t imag1 = vld1q_f32(imag1P + i);
        float32x4_t imag2 = vld1q_f32(imag2P + i);

        float32x4_t realResult = vmlsq_f32(vmulq_f32(real1, real2), imag1, imag2);
        float32x4_t imagResult = vmlaq_f32(vmulq_f32(real1, imag2), imag1, real2);

        vst1q_f32(realDestP + i, realResult);
        vst1q_f32(imagDestP + i, imagResult);

        i += 4;
    }
#endif
    for (; i < framesToProcess; ++i) {
        // Read and compute the results before storing them, in case the
        // destination is the same as one of the sources.
        float realResult = real1P[i] * real2P[i] - imag1P[i] * imag2P[i];
        float imagResult = real1P[i] * imag2P[i] + imag1P[i] * real2P[i];

        realDestP[i] = realResult;
        imagDestP[i] = imagResult;
    }
}

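// vsvesq (sum of vector elements' squares): *sumP = sum over k of sourceP[k] * sourceP[k].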
void vsvesq(const float* sourceP, int sourceStride, float* sumP, size_t framesToProcess)
{
    int n = framesToProcess;
    float sum = 0;

#ifdef __SSE2__
    if (sourceStride == 1) {
        // If the sourceP address is not 16-byte aligned, the first several frames (at most three) should be processed separately.
        while ((reinterpret_cast<uintptr_t>(sourceP) & 0x0F) && n) {
            float sample = *sourceP;
            sum += sample * sample;
            sourceP++;
            n--;
        }

        // Now the sourceP address is aligned; use SSE.
        int tailFrames = n % 4;
        const float* endP = sourceP + n - tailFrames;
        __m128 source;
        __m128 mSum = _mm_setzero_ps();

        while (sourceP < endP) {
            source = _mm_load_ps(sourceP);
            source = _mm_mul_ps(source, source);
            mSum = _mm_add_ps(mSum, source);
            sourceP += 4;
        }

        // Sum the partial results from the four SSE lanes.
        const float* groupSumP = reinterpret_cast<float*>(&mSum);
        sum += groupSumP[0] + groupSumP[1] + groupSumP[2] + groupSumP[3];

        n = tailFrames;
    }
#elif HAVE(ARM_NEON_INTRINSICS)
    if (sourceStride == 1) {
        int tailFrames = n % 4;
        const float* endP = sourceP + n - tailFrames;

        float32x4_t fourSum = vdupq_n_f32(0);
        while (sourceP < endP) {
            float32x4_t source = vld1q_f32(sourceP);
            fourSum = vmlaq_f32(fourSum, source, source);
            sourceP += 4;
        }
        float32x2_t twoSum = vadd_f32(vget_low_f32(fourSum), vget_high_f32(fourSum));

        float groupSum[2];
        vst1_f32(groupSum, twoSum);
        sum += groupSum[0] + groupSum[1];

        n = tailFrames;
    }
#endif

    while (n--) {
        float sample = *sourceP;
        sum += sample * sample;
        sourceP += sourceStride;
    }

    ASSERT(sumP);
    *sumP = sum;
}

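// vmaxmgv (maximum magnitude of vector): *maxP = max over k of |sourceP[k]|.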
void vmaxmgv(const float* sourceP, int sourceStride, float* maxP, size_t framesToProcess)
{
    int n = framesToProcess;
    float max = 0;

#ifdef __SSE2__
    if (sourceStride == 1) {
        // If the sourceP address is not 16-byte aligned, the first several frames (at most three) should be processed separately.
        while ((reinterpret_cast<uintptr_t>(sourceP) & 0x0F) && n) {
            max = std::max(max, fabsf(*sourceP));
            sourceP++;
            n--;
        }

        // Now the sourceP address is aligned; use SSE.
        int tailFrames = n % 4;
        const float* endP = sourceP + n - tailFrames;
        __m128 source;
        __m128 mMax = _mm_setzero_ps();
        int mask = 0x7FFFFFFF;
        __m128 mMask = _mm_set1_ps(*reinterpret_cast<float*>(&mask));

        while (sourceP < endP) {
            source = _mm_load_ps(sourceP);
            // Calculate the absolute value by ANDing source with the mask, which clears the sign bit.
            source = _mm_and_ps(source, mMask);
            mMax = _mm_max_ps(mMax, source);
            sourceP += 4;
        }

        // Take the max across the four SSE lanes.
        const float* groupMaxP = reinterpret_cast<float*>(&mMax);
        max = std::max(max, groupMaxP[0]);
        max = std::max(max, groupMaxP[1]);
        max = std::max(max, groupMaxP[2]);
        max = std::max(max, groupMaxP[3]);

        n = tailFrames;
    }
#elif HAVE(ARM_NEON_INTRINSICS)
    if (sourceStride == 1) {
        int tailFrames = n % 4;
        const float* endP = sourceP + n - tailFrames;

        float32x4_t fourMax = vdupq_n_f32(0);
        while (sourceP < endP) {
            float32x4_t source = vld1q_f32(sourceP);
            fourMax = vmaxq_f32(fourMax, vabsq_f32(source));
            sourceP += 4;
        }
        float32x2_t twoMax = vmax_f32(vget_low_f32(fourMax), vget_high_f32(fourMax));

        float groupMax[2];
        vst1_f32(groupMax, twoMax);
        max = std::max(groupMax[0], groupMax[1]);

        n = tailFrames;
    }
#endif

    while (n--) {
        max = std::max(max, fabsf(*sourceP));
        sourceP += sourceStride;
    }

    ASSERT(maxP);
    *maxP = max;
}

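// vclip: clamp each sample to the range [*lowThresholdP, *highThresholdP].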
void vclip(const float* sourceP, int sourceStride, const float* lowThresholdP, const float* highThresholdP, float* destP, int destStride, size_t framesToProcess)
{
    int n = framesToProcess;
    float lowThreshold = *lowThresholdP;
    float highThreshold = *highThresholdP;

    // FIXME: Optimize for SSE2.
#if HAVE(ARM_NEON_INTRINSICS)
    if ((sourceStride == 1) && (destStride == 1)) {
        int tailFrames = n % 4;
        const float* endP = destP + n - tailFrames;

        float32x4_t low = vdupq_n_f32(lowThreshold);
        float32x4_t high = vdupq_n_f32(highThreshold);
        while (destP < endP) {
            float32x4_t source = vld1q_f32(sourceP);
            vst1q_f32(destP, vmaxq_f32(vminq_f32(source, high), low));
            sourceP += 4;
            destP += 4;
        }
        n = tailFrames;
    }
#endif
    while (n--) {
        *destP = std::max(std::min(*sourceP, highThreshold), lowThreshold);
        sourceP += sourceStride;
        destP += destStride;
    }
}

#endif // OS(DARWIN)

} // namespace VectorMath

} // namespace WebCore

#endif // ENABLE(WEB_AUDIO)