1/*
2 * Copyright (C) 2011 University of Szeged
3 * Copyright (C) 2011 Zoltan Herczeg
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 *    notice, this list of conditions and the following disclaimer in the
12 *    documentation and/or other materials provided with the distribution.
13 *
14 * THIS SOFTWARE IS PROVIDED BY UNIVERSITY OF SZEGED ``AS IS'' AND ANY
15 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
17 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL UNIVERSITY OF SZEGED OR
18 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
19 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
20 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
21 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
22 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
24 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25 */
26
27#include "config.h"
28#include "FELightingNEON.h"
29
30#if CPU(ARM_NEON) && CPU(ARM_TRADITIONAL) && COMPILER(GCC)
31
32#include <wtf/Alignment.h>
33
34namespace WebCore {
35
// Constant table read by the neonDrawLighting assembly routine in this file.
// It is fetched with two loads: the first 16 shorts are copied to ALPHAX_Q
// and ALPHAY_Q, the remaining 8 shorts to REMAPX_D and REMAPY_D.

static WTF_ALIGNED(short, s_FELightingConstantsForNeon[], 16) = {
    // Alpha coefficients: per-lane weights the gathered alpha values are
    // multiplied by. The first row produces the X component of the normal
    // vector, the second row the Y component.
    -2, 1, 0, -1, 2, 1, 0, -1,
    0, -1, -2, -1, 0, 1, 2, 1,
    // Remapping indices: every short packs two byte indices for vtbl.8,
    // which gathers the alpha values from the {TOP, MIDDLE, BOTTOM} row
    // registers (table bytes 0x00-0x07, 0x08-0x0f and 0x10-0x17).
    // NOTE(review): the packing presumes little-endian byte order - confirm.
    0x0f0e, 0x0302, 0x0504, 0x0706,
    0x0b0a, 0x1312, 0x1514, 0x1716,
};
47
48short* feLightingConstantsForNeon()
49{
50    return s_FELightingConstantsForNeon;
51}
52
// Worker entry point: hands the painting data over to neonDrawLighting
// without touching it.
// NOTE(review): neonDrawLighting is presumably the symbol emitted by the
// asm block in this file - confirm against its declaration in the header.
void FELighting::platformApplyNeonWorker(FELightingPaintingDataForNeon* parameters)
{
    neonDrawLighting(parameters);
}
57
// Two-level stringification: ASSTRING turns its argument into a string
// literal exactly as written, TOSTRING expands macros in its argument first.
#define ASSTRING(str) #str
#define TOSTRING(value) ASSTRING(value)

// Byte offsets of the fields read from the painting data structure whose
// address is passed to neonDrawLighting in r0. They are spliced into the
// assembly text as immediate offsets of ldr / vldr instructions.
// NOTE(review): the values presumably mirror the layout of
// FELightingPaintingDataForNeon with 4-byte fields - confirm against the
// header, the two must be changed together.
#define PIXELS_OFFSET TOSTRING(0)
#define YSTART_OFFSET TOSTRING(4)
#define WIDTH_OFFSET TOSTRING(8)
#define HEIGHT_OFFSET TOSTRING(12)
#define FLAGS_OFFSET TOSTRING(16)
#define SPECULAR_EXPONENT_OFFSET TOSTRING(20)
#define CONE_EXPONENT_OFFSET TOSTRING(24)
#define FLOAT_ARGUMENTS_OFFSET TOSTRING(28)
#define PAINTING_CONSTANTS_OFFSET TOSTRING(32)
// Line terminator appended to every line of assembly text.
#define NL "\n"
71
// Register allocation
// ARM core registers with a fixed role. r0-r3 are used as scratch
// registers by the routine.
#define PAINTING_DATA_R       "r11"
// The painting data pointer is only read during setup, so its register is
// reused afterwards to keep the value WIDTH_R is reset to on every row.
#define RESET_WIDTH_R         PAINTING_DATA_R
#define PIXELS_R              "r4"
#define WIDTH_R               "r5"
#define HEIGHT_R              "r6"
#define FLAGS_R               "r7"
#define SPECULAR_EXPONENT_R   "r8"
#define CONE_EXPONENT_R       "r10"
#define SCANLINE_R            "r12"

// Three temporary quad registers. Each one also gets names for its two
// double halves (_D0, _D1) and its four single lanes (_S0.._S3), because
// the helper macros build register names by token pasting these suffixes.
#define TMP1_Q                "q0"
#define TMP1_D0               "d0"
#define TMP1_S0               "s0"
#define TMP1_S1               "s1"
#define TMP1_D1               "d1"
#define TMP1_S2               "s2"
#define TMP1_S3               "s3"
#define TMP2_Q                "q1"
#define TMP2_D0               "d2"
#define TMP2_S0               "s4"
#define TMP2_S1               "s5"
#define TMP2_D1               "d3"
#define TMP2_S2               "s6"
#define TMP2_S3               "s7"
#define TMP3_Q                "q2"
#define TMP3_D0               "d4"
#define TMP3_S0               "s8"
#define TMP3_S1               "s9"
#define TMP3_D1               "d5"
#define TMP3_S2               "s10"
#define TMP3_S3               "s11"

// Scalars used by the spot light and POWF code, and a copy of the light
// color taken before the per-pixel spot light scaling overwrites COLOR_Q.
#define COSINE_OF_ANGLE       "s12"
#define POWF_INT_S            "s13"
#define POWF_FRAC_S           "s14"
#define SPOT_COLOR_Q          "q4"

// The values clamped with VMIN and VMAX live in the high single lane of
// their double register (TMP2_S1, TMP3_S1), and these instructions work on
// whole double registers. CONST_ZERO_S and CONST_ONE_S are therefore both
// placed in the high lane of their double register as well (s23 of d11,
// s31 of d15).

// Current pixel position
#define POSITION_Q            "q5"
#define POSITION_X_S          "s20"
#define POSITION_Y_S          "s21"
#define POSITION_Z_S          "s22"
#define CONST_ZERO_HI_D       "d11"
#define CONST_ZERO_S          "s23"

// -------------------------------
//     Variable arguments
// READ1..READ3 are the register ranges filled from the float arguments
// block by three consecutive loads; the names defined after them alias
// registers inside those ranges.
// Misc arguments
#define READ1_RANGE           "d12-d15"
#define READ2_RANGE           "d16-d19"
#define READ3_RANGE           "d20-d21"

// d12-d13 (s24-s27), part of READ1_RANGE.
#define SCALE_S               "s24"
#define SCALE_DIV4_S          "s25"
#define DIFFUSE_CONST_S       "s26"

// Light source position
// d14-d15 (s28-s31), part of READ1_RANGE.
#define CONE_CUT_OFF_S        "s28"
#define CONE_FULL_LIGHT_S     "s29"
#define CONE_CUT_OFF_RANGE_S  "s30"
#define CONST_ONE_HI_D        "d15"
#define CONST_ONE_S           "s31"

// q8-q9 are READ2_RANGE, q10 is READ3_RANGE.
#define LIGHT_Q               "q8"
#define DIRECTION_Q           "q9"
#define COLOR_Q               "q10"
// -------------------------------
//    Constant coefficients
// READ4 and READ5 are filled from the painting constants table.
#define READ4_RANGE           "d22-d25"
#define READ5_RANGE           "d26-d27"

// q11-q12 are READ4_RANGE, d26-d27 are READ5_RANGE.
#define ALPHAX_Q              "q11"
#define ALPHAY_Q              "q12"
#define REMAPX_D              "d26"
#define REMAPY_D              "d27"
// -------------------------------

// Alpha values of the last three pixels of the row above, the current row
// and the row below. ALL_ROWS_D is the same three registers written as a
// register list, used as the lookup table of vtbl.8.
#define ALL_ROWS_D            "{d28,d29,d30}"
#define TOP_ROW_D             "d28"
#define MIDDLE_ROW_D          "d29"
#define BOTTOM_ROW_D          "d30"
157
// Stores the Euclidean length of the first three lanes of source##_Q in
// source##_S3. temp##_Q is overwritten with the lane-wise squares.
#define GET_LENGTH(source, temp) \
    "vmul.f32 " temp##_Q ", " source##_Q ", " source##_Q NL \
    "vadd.f32 " source##_S3 ", " temp##_S0 ", " temp##_S1 NL \
    "vadd.f32 " source##_S3 ", " source##_S3 ", " temp##_S2 NL \
    "vsqrt.f32 " source##_S3 ", " source##_S3 NL

// Stores the dot product of the first three lanes of source1##_Q and
// source2##_Q in destination##_S0.
// Since the fourth lanes are multiplied as well, destination##_S3 holds the
// product of the two lengths when both sources keep their length in _S3.
#define DOT_PRODUCT(destination, source1, source2) \
    "vmul.f32 " destination##_Q ", " source1##_Q ", " source2##_Q NL \
    "vadd.f32 " destination##_S0 ", " destination##_S0 ", " destination##_S1 NL \
    "vadd.f32 " destination##_S0 ", " destination##_S0 ", " destination##_S2 NL

// Computes TMP2_S1 = first argument / second argument, additionally
// multiplied by DIFFUSE_CONST_S when FLAG_DIFFUSE_CONST_IS_1 is not set
// in FLAGS_R.
// NOTE(review): the parameter names do not describe every call site (the
// first argument is a dot product in some of them) - treat them as plain
// numerator and denominator.
#define MULTIPLY_BY_DIFFUSE_CONST(normalVectorLength, dotProductLength) \
    "tst " FLAGS_R ", #" TOSTRING(FLAG_DIFFUSE_CONST_IS_1) NL \
    "vmuleq.f32 " TMP2_S1 ", " DIFFUSE_CONST_S ", " normalVectorLength NL \
    "vdiveq.f32 " TMP2_S1 ", " TMP2_S1 ", " dotProductLength NL \
    "vdivne.f32 " TMP2_S1 ", " normalVectorLength ", " dotProductLength NL

// One step of the integer part of POWF: multiplies value by POWF_INT_S when
// the `current` bit of exponent is set, then squares POWF_INT_S if any of
// the `remaining` (higher) bits is set.
#define POWF_SQR(value, exponent, current, remaining) \
    "tst " exponent ", #" ASSTRING(current) NL \
    "vmulne.f32 " value ", " value ", " POWF_INT_S NL \
    "tst " exponent ", #" ASSTRING(remaining) NL \
    "vmulne.f32 " POWF_INT_S ", " POWF_INT_S ", " POWF_INT_S NL

// One step of the fraction part of POWF: replaces POWF_FRAC_S by its square
// root if any of the `remaining` bits of exponent is set, then multiplies
// value by POWF_FRAC_S when the `current` bit is set.
#define POWF_SQRT(value, exponent, current, remaining) \
    "tst " exponent ", #" ASSTRING(remaining) NL \
    "vsqrtne.f32 " POWF_FRAC_S ", " POWF_FRAC_S NL \
    "tst " exponent ", #" ASSTRING(current) NL \
    "vmulne.f32 " value ", " value ", " POWF_FRAC_S NL

// This simplified powf function is sufficiently accurate.
// It raises value to the power encoded in the core register exponent and
// leaves the result in value. The exponent is read as a 12 bit fixed-point
// number: bits 6-11 (mask 0xfc0) are handled by repeated squaring, bits 0-5
// (mask 0x03f) by repeated square roots. POWF_INT_S and POWF_FRAC_S are
// overwritten when the corresponding part of the exponent is non-zero.
#define POWF(value, exponent) \
    "tst " exponent ", #0xfc0" NL \
    "vmovne.f32 " POWF_INT_S ", " value NL \
    "tst " exponent ", #0x03f" NL \
    "vmovne.f32 " POWF_FRAC_S ", " value NL \
    "vmov.f32 " value ", " CONST_ONE_S NL \
    \
    POWF_SQR(value, exponent, 0x040, 0xf80) \
    POWF_SQR(value, exponent, 0x080, 0xf00) \
    POWF_SQR(value, exponent, 0x100, 0xe00) \
    POWF_SQR(value, exponent, 0x200, 0xc00) \
    POWF_SQR(value, exponent, 0x400, 0x800) \
    "tst " exponent ", #0x800" NL \
    "vmulne.f32 " value ", " value ", " POWF_INT_S NL \
    \
    POWF_SQRT(value, exponent, 0x20, 0x3f) \
    POWF_SQRT(value, exponent, 0x10, 0x1f) \
    POWF_SQRT(value, exponent, 0x08, 0x0f) \
    POWF_SQRT(value, exponent, 0x04, 0x07) \
    POWF_SQRT(value, exponent, 0x02, 0x03) \
    POWF_SQRT(value, exponent, 0x01, 0x01)
210
// The following algorithm is an ARM-NEON optimized version of
// the main loop found in FELighting.cpp. Since the whole code
// is redesigned to be as efficient as possible (ARM specific
// thinking), it is four times faster than its C++ counterpart.
//
// On entry r0 holds the address of the painting data; its fields are read
// through the *_OFFSET constants. The color bytes of the pixel buffer are
// overwritten in place.

asm ( // NOLINT
".globl " TOSTRING(neonDrawLighting) NL
TOSTRING(neonDrawLighting) ":" NL
    // Because of the clever register allocation, nothing is stored on the stack
    // except the saved registers.
    // Stack must be aligned to 8 bytes.
    "stmdb sp!, {r4-r8, r10, r11, lr}" NL
    "vstmdb sp!, {d8-d15}" NL
    "mov " PAINTING_DATA_R ", r0" NL

    // The following two arguments are loaded to SIMD registers.
    // r0 and r1 only hold their addresses until the vld1 loads below.
    "ldr r0, [" PAINTING_DATA_R ", #" FLOAT_ARGUMENTS_OFFSET "]" NL
    "ldr r1, [" PAINTING_DATA_R ", #" PAINTING_CONSTANTS_OFFSET "]" NL
    "ldr " PIXELS_R ", [" PAINTING_DATA_R ", #" PIXELS_OFFSET "]" NL
    "vldr.f32 " POSITION_Y_S ", [" PAINTING_DATA_R ", #" YSTART_OFFSET "]"  NL
    "ldr " WIDTH_R ", [" PAINTING_DATA_R ", #" WIDTH_OFFSET "]" NL
    "ldr " HEIGHT_R ", [" PAINTING_DATA_R ", #" HEIGHT_OFFSET "]" NL
    "ldr " FLAGS_R ", [" PAINTING_DATA_R ", #" FLAGS_OFFSET "]" NL
    "ldr " SPECULAR_EXPONENT_R ", [" PAINTING_DATA_R ", #" SPECULAR_EXPONENT_OFFSET "]" NL
    "ldr " CONE_EXPONENT_R ", [" PAINTING_DATA_R ", #" CONE_EXPONENT_OFFSET "]" NL

    // Load all data to the SIMD registers with the least number of instructions.
    "vld1.f32 { " READ1_RANGE " }, [r0]!" NL
    "vld1.f32 { " READ2_RANGE " }, [r0]!" NL
    "vld1.f32 { " READ3_RANGE " }, [r0]!" NL
    "vld1.s16 {" READ4_RANGE "}, [r1]!" NL
    "vld1.s16 {" READ5_RANGE "}, [r1]!" NL

    // Initializing local variables.
    // SCANLINE_R = WIDTH_R * 4 + 8 is the byte distance used to reach the
    // rows above and below. PIXELS_R is moved forward by one such row plus
    // 3 bytes, from where it advances 4 bytes per pixel.
    "mov " SCANLINE_R ", " WIDTH_R ", lsl #2" NL
    "add " SCANLINE_R ", " SCANLINE_R ", #8" NL
    "add " PIXELS_R ", " PIXELS_R ", " SCANLINE_R NL
    "add " PIXELS_R ", " PIXELS_R ", #3" NL
    "mov r0, #0" NL
    "vmov.f32 " CONST_ZERO_S ", r0" NL
    // NOTE(review): the flags set by this tst do not appear to be read by
    // any instruction before the subs in .scanline overwrites them -
    // confirm whether it is a leftover.
    "tst " FLAGS_R ", #" TOSTRING(FLAG_SPOT_LIGHT) NL
    // Keep an unscaled copy of the light color; COLOR_Q is rescaled per
    // pixel in the spot light case.
    "vmov.f32 " SPOT_COLOR_Q ", " COLOR_Q NL
    // RESET_WIDTH_R shares its register with PAINTING_DATA_R, which is not
    // read anymore from here on.
    "mov " RESET_WIDTH_R ", " WIDTH_R NL

".mainLoop:" NL
    // Start of a row: three pixels must be fetched before the first result
    // can be computed, and the X position restarts from one.
    "mov r3, #3" NL
    "vmov.f32 " POSITION_X_S ", " CONST_ONE_S NL

".scanline:" NL
    // The ROW registers are storing the alpha channel of the last three pixels.
    // The alpha channel is stored as signed short (sint16) values. The fourth value
    // is garbage. The following instructions are shifting out the unnecessary alpha
    // values and load the next ones.
    "ldrb r0, [" PIXELS_R ", -" SCANLINE_R "]" NL
    "ldrb r1, [" PIXELS_R ", +" SCANLINE_R "]" NL
    "ldrb r2, [" PIXELS_R "], #4" NL
    "vext.s16 " TOP_ROW_D ", " TOP_ROW_D ", " TOP_ROW_D ", #3" NL
    "vext.s16 " MIDDLE_ROW_D ", " MIDDLE_ROW_D ", " MIDDLE_ROW_D ", #3" NL
    "vext.s16 " BOTTOM_ROW_D ", " BOTTOM_ROW_D ", " BOTTOM_ROW_D ", #3" NL
    "vmov.s16 " TOP_ROW_D "[1], r0" NL
    "vmov.s16 " MIDDLE_ROW_D "[1], r2" NL
    "vmov.s16 " BOTTOM_ROW_D "[1], r1" NL

    // The two border pixels (rightmost and leftmost) are skipped when
    // the next scanline is reached. It also jumps, when the algorithm
    // is started, and the first free alpha values are loaded to each row.
    "subs r3, r3, #1" NL
    "bne .scanline" NL

    // The light vector goes to TMP1_Q. It is constant in case of distant light.
    // The fourth value contains the length of the light vector.
    "tst " FLAGS_R ", #" TOSTRING(FLAG_POINT_LIGHT | FLAG_SPOT_LIGHT) NL
    "beq .distantLight" NL

    // Z position of the current pixel: its alpha value (lane 2 of the
    // middle row) converted to float and multiplied by SCALE_S.
    "vmov.s16 r3, " MIDDLE_ROW_D "[2]" NL
    "vmov.f32 " POSITION_Z_S ", r3" NL
    "vcvt.f32.s32 " POSITION_Z_S ", " POSITION_Z_S NL
    "vmul.f32 " POSITION_Z_S ", " POSITION_Z_S ", " SCALE_S NL

    "vsub.f32 " TMP1_Q ", " LIGHT_Q ", " POSITION_Q NL
    GET_LENGTH(TMP1, TMP2)

    "tst " FLAGS_R ", #" TOSTRING(FLAG_SPOT_LIGHT) NL
    "bne .cosineOfAngle" NL
".visiblePixel:" NL

    //     | -1  0  1 |      | -1 -2 -1 |
    // X = | -2  0  2 |  Y = |  0  0  0 |
    //     | -1  0  1 |      |  1  2  1 |

    // Multiply the alpha values by the X and Y matrices.

    // Moving the 8 alpha value to TMP3.
    "vtbl.8 " TMP3_D0 ", " ALL_ROWS_D ", " REMAPX_D NL
    "vtbl.8 " TMP3_D1 ", " ALL_ROWS_D ", " REMAPY_D NL

    // Weighted sum for X: multiply lane-wise, then add all eight lanes
    // together with three pairwise additions.
    "vmul.s16 " TMP2_Q ", " TMP3_Q ", " ALPHAX_Q NL
    "vpadd.s16 " TMP2_D0 ", " TMP2_D0 ", " TMP2_D1 NL
    "vpadd.s16 " TMP2_D0 ", " TMP2_D0 ", " TMP2_D0 NL
    "vpadd.s16 " TMP2_D0 ", " TMP2_D0 ", " TMP2_D0 NL
    "vmov.s16 r0, " TMP2_D0 "[0]" NL

    // The same for Y.
    "vmul.s16 " TMP2_Q ", " TMP3_Q ", " ALPHAY_Q NL
    "vpadd.s16 " TMP2_D0 ", " TMP2_D0 ", " TMP2_D1 NL
    "vpadd.s16 " TMP2_D0 ", " TMP2_D0 ", " TMP2_D0 NL
    "vpadd.s16 " TMP2_D0 ", " TMP2_D0 ", " TMP2_D0 NL
    "vmov.s16 r1, " TMP2_D0 "[0]" NL

    // r0 and r1 contain the X and Y coordinates of the
    // normal vector, respectively.

    // Calculating the spot light strength.
    "tst " FLAGS_R ", #" TOSTRING(FLAG_SPOT_LIGHT) NL
    "beq .endLight" NL

    // TMP3_S1 starts as the negated cosine and is raised to the cone
    // exponent unless the flag says the exponent is 1.
    "vneg.f32 " TMP3_S1 ", " COSINE_OF_ANGLE NL
    "tst " FLAGS_R ", #" TOSTRING(FLAG_CONE_EXPONENT_IS_1) NL
    "beq .coneExpPowf" NL
".coneExpPowfFinished:" NL

    // Smoothing the cone edge if necessary.
    "vcmp.f32 " COSINE_OF_ANGLE ", " CONE_FULL_LIGHT_S NL
    "fmstat" NL
    "bhi .cutOff" NL
".cutOffFinished:" NL

    // Limit the strength in TMP3_S1 to at most one and scale the light
    // color by it.
    "vmin.f32 " TMP3_D0 ", " TMP3_D0 ", " CONST_ONE_HI_D NL
    "vmul.f32 " COLOR_Q ", " SPOT_COLOR_Q ", " TMP3_D0 "[1]" NL

".endLight:" NL
    // Summarize:
    // r0 and r1 contain the normalVector.
    // TMP1_Q contains the light vector and its length.
    // COLOR_Q contains the color of the light vector.

    // Test whether both r0 and r1 are zero (Normal vector is (0, 0, 1)).
    "orrs r2, r0, r1" NL
    "bne .normalVectorIsNonZero" NL

    "tst " FLAGS_R ", #" TOSTRING(FLAG_SPECULAR_LIGHT) NL
    "bne .specularLight1" NL

    // Calculate diffuse light strength.
    MULTIPLY_BY_DIFFUSE_CONST(TMP1_S2, TMP1_S3)
    "b .lightStrengthCalculated" NL

".specularLight1:" NL
    // Calculating specular light strength.
    // The length is added to the Z component of the light vector and the
    // length of the resulting vector is recomputed.
    "vadd.f32 " TMP1_S2 ", " TMP1_S2 ", " TMP1_S3 NL
    GET_LENGTH(TMP1, TMP2)

    // When the exponent is 1, we don't need to call an expensive powf function.
    "tst " FLAGS_R ", #" TOSTRING(FLAG_SPECULAR_EXPONENT_IS_1) NL
    "vdiveq.f32 " TMP2_S1 ", " TMP1_S2 ", " TMP1_S3 NL
    "beq .specularExpPowf" NL

    MULTIPLY_BY_DIFFUSE_CONST(TMP1_S2, TMP1_S3)
    "b .lightStrengthCalculated" NL

".normalVectorIsNonZero:" NL
    // Normal vector goes to TMP2, and its length is calculated as well.
    // X and Y are the integer sums in r0 and r1 multiplied by SCALE_DIV4_S,
    // Z is one.
    "vmov.s32 " TMP2_S0 ", r0" NL
    "vcvt.f32.s32 " TMP2_S0 ", " TMP2_S0 NL
    "vmul.f32 " TMP2_S0 ", " TMP2_S0 ", " SCALE_DIV4_S NL
    "vmov.s32 " TMP2_S1 ", r1" NL
    "vcvt.f32.s32 " TMP2_S1 ", " TMP2_S1 NL
    "vmul.f32 " TMP2_S1 ", " TMP2_S1 ", " SCALE_DIV4_S NL
    "vmov.f32 " TMP2_S2 ", " CONST_ONE_S NL
    GET_LENGTH(TMP2, TMP3)

    "tst " FLAGS_R ", #" TOSTRING(FLAG_SPECULAR_LIGHT) NL
    "bne .specularLight2" NL

    // Calculating diffuse light strength.
    // TMP3_S0 receives the dot product, TMP3_S3 the product of the lengths.
    DOT_PRODUCT(TMP3, TMP2, TMP1)
    MULTIPLY_BY_DIFFUSE_CONST(TMP3_S0, TMP3_S3)
    "b .lightStrengthCalculated" NL

".specularLight2:" NL
    // Calculating specular light strength.
    "vadd.f32 " TMP1_S2 ", " TMP1_S2 ", " TMP1_S3 NL
    GET_LENGTH(TMP1, TMP3)
    DOT_PRODUCT(TMP3, TMP2, TMP1)

    // When the exponent is 1, we don't need to call an expensive powf function.
    "tst " FLAGS_R ", #" TOSTRING(FLAG_SPECULAR_EXPONENT_IS_1) NL
    "vdiveq.f32 " TMP2_S1 ", " TMP3_S0 ", " TMP3_S3 NL
    "beq .specularExpPowf" NL
    MULTIPLY_BY_DIFFUSE_CONST(TMP3_S0, TMP3_S3)

".lightStrengthCalculated:" NL
    // TMP2_S1 contains the light strength. Clamp it to [0, 1]
    "vmax.f32 " TMP2_D0 ", " TMP2_D0 ", " CONST_ZERO_HI_D NL
    "vmin.f32 " TMP2_D0 ", " TMP2_D0 ", " CONST_ONE_HI_D NL
    "vmul.f32 " TMP3_Q ", " COLOR_Q ", " TMP2_D0 "[1]" NL
    "vcvt.u32.f32 " TMP3_Q ", " TMP3_Q NL
    "vmov.u32 r2, r3, " TMP3_S0 ", " TMP3_S1 NL
    // The color values are stored in-place.
    // PIXELS_R is already one pixel past the right neighbour, so offsets
    // -11, -10 and -9 address the three bytes preceding the byte at -8,
    // from which the alpha of the pixel being computed was loaded.
    "strb r2, [" PIXELS_R ", #-11]" NL
    "strb r3, [" PIXELS_R ", #-10]" NL
    "vmov.u32 r2, " TMP3_S2 NL
    "strb r2, [" PIXELS_R ", #-9]" NL

    // Continue to the next pixel.
".blackPixel:" NL
    // Only one new pixel column has to be fetched inside a row.
    "vadd.f32 " POSITION_X_S ", " CONST_ONE_S NL
    "mov r3, #1" NL
    "subs " WIDTH_R ", " WIDTH_R ", #1" NL
    "bne .scanline" NL

    // If the end of the scanline is reached, we continue
    // to the next scanline.
    "vadd.f32 " POSITION_Y_S ", " CONST_ONE_S NL
    "mov " WIDTH_R ", " RESET_WIDTH_R NL
    "subs " HEIGHT_R ", " HEIGHT_R ", #1" NL
    "bne .mainLoop" NL

    // Return.
    "vldmia sp!, {d8-d15}" NL
    "ldmia sp!, {r4-r8, r10, r11, pc}" NL

".distantLight:" NL
    // In case of distant light, the light vector is constant,
    // we simply copy it.
    "vmov.f32 " TMP1_Q ", " LIGHT_Q NL
    "b .visiblePixel" NL

".cosineOfAngle:" NL
    // If the pixel is outside of the cone angle, it is simply a black pixel.
    // COSINE_OF_ANGLE = dot(light vector, direction) / light vector length.
    // The pixel is processed further only when this value is lower than or
    // equal to CONE_CUT_OFF_S; otherwise its three color bytes are zeroed.
    DOT_PRODUCT(TMP3, TMP1, DIRECTION)
    "vdiv.f32 " COSINE_OF_ANGLE ", " TMP3_S0 ", " TMP1_S3 NL
    "vcmp.f32 " COSINE_OF_ANGLE ", " CONE_CUT_OFF_S NL
    "fmstat" NL
    "bls .visiblePixel" NL
    "mov r0, #0" NL
    "strh r0, [" PIXELS_R ", #-11]" NL
    "strb r0, [" PIXELS_R ", #-9]" NL
    "b .blackPixel" NL

".cutOff:" NL
    // Smoothing the light strength on the cone edge.
    // The strength in TMP3_S1 is multiplied by
    // (CONE_CUT_OFF_S - COSINE_OF_ANGLE) / CONE_CUT_OFF_RANGE_S.
    "vsub.f32 " TMP3_S0 ", " CONE_CUT_OFF_S ", " COSINE_OF_ANGLE NL
    "vdiv.f32 " TMP3_S0 ", " TMP3_S0 ", " CONE_CUT_OFF_RANGE_S NL
    "vmul.f32 " TMP3_S1 ", " TMP3_S1 ", " TMP3_S0 NL
    "b .cutOffFinished" NL

".coneExpPowf:" NL
    POWF(TMP3_S1, CONE_EXPONENT_R)
    "b .coneExpPowfFinished" NL

".specularExpPowf:" NL
    // Raise the strength to the specular exponent, then apply
    // DIFFUSE_CONST_S unless the flag says it is 1.
    POWF(TMP2_S1, SPECULAR_EXPONENT_R)
    "tst " FLAGS_R ", #" TOSTRING(FLAG_DIFFUSE_CONST_IS_1) NL
    "vmuleq.f32 " TMP2_S1 ", " TMP2_S1 ", " DIFFUSE_CONST_S NL
    "b .lightStrengthCalculated" NL
); // NOLINT
467
468int FELighting::getPowerCoefficients(float exponent)
469{
470    // Calling a powf function from the assembly code would require to save
471    // and reload a lot of NEON registers. Since the base is in range [0..1]
472    // and only 8 bit precision is required, we use our own powf function.
473    // This is probably not the best, but it uses only a few registers and
474    // gives us enough precision (modifying the exponent field directly would
475    // also be possible).
476
477    // First, we limit the exponent to maximum of 64, which gives us enough
478    // precision. We split the exponent to an integer and fraction part,
479    // since a^x = (a^y)*(a^z) where x = y+z. The integer exponent of the
480    // power is estimated by square, and the fraction exponent of the power
481    // is estimated by square root assembly instructions.
482    int i, result;
483
484    if (exponent < 0)
485        exponent = 1 / (-exponent);
486
487    if (exponent > 63.99)
488        exponent = 63.99;
489
490    exponent /= 64;
491    result = 0;
492    for (i = 11; i >= 0; --i) {
493        exponent *= 2;
494        if (exponent >= 1) {
495            result |= 1 << i;
496            exponent -= 1;
497        }
498    }
499    return result;
500}
501
502} // namespace WebCore
503
#endif // CPU(ARM_NEON) && CPU(ARM_TRADITIONAL) && COMPILER(GCC)
505