/*
 * Copyright (C) 2011 University of Szeged
 * Copyright (C) 2011 Zoltan Herczeg
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY UNIVERSITY OF SZEGED ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL UNIVERSITY OF SZEGED OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "config.h"
#include "FELightingNEON.h"

#if CPU(ARM_NEON) && CPU(ARM_TRADITIONAL) && COMPILER(GCC)

#include <wtf/Alignment.h>

namespace WebCore {

// These constants are copied to the following SIMD registers:
//   ALPHAX_Q ALPHAY_Q REMAPX_D REMAPY_D
// neonDrawLighting loads them with two consecutive vld1.s16 instructions
// (READ4_RANGE, then READ5_RANGE), so the order of the entries below is
// significant.
// NOTE(review): WTF_ALIGNED is a project macro; the trailing 16 is presumably
// the byte alignment of the array - confirm against wtf/Alignment.h.

static WTF_ALIGNED(short, s_FELightingConstantsForNeon[], 16) = {
    // Alpha coefficients: eight multipliers for the X gradient (ALPHAX_Q)
    // followed by eight multipliers for the Y gradient (ALPHAY_Q).
    -2, 1, 0, -1, 2, 1, 0, -1,
    0, -1, -2, -1, 0, 1, 2, 1,
    // Remapping indices: byte-index tables used by the vtbl.8 lookups over
    // ALL_ROWS_D. Every 16-bit entry holds a pair of adjacent byte indices,
    // so each entry selects one 16-bit alpha value from the three row registers.
    0x0f0e, 0x0302, 0x0504, 0x0706,
    0x0b0a, 0x1312, 0x1514, 0x1716,
};

// Returns a pointer to the first element of the constant table above.
short* feLightingConstantsForNeon()
{
    return s_FELightingConstantsForNeon;
}

// Forwards the painting data to neonDrawLighting, the routine defined by the
// file-scope assembly block further down in this file.
void FELighting::platformApplyNeonWorker(FELightingPaintingDataForNeon* parameters)
{
    neonDrawLighting(parameters);
}

// ASSTRING stringifies its argument as written; TOSTRING macro-expands the
// argument first and stringifies the result.
#define ASSTRING(str) #str
#define TOSTRING(value) ASSTRING(value)

// Byte offsets of the fields that neonDrawLighting reads from the structure
// whose address arrives in r0. They are string literals because they are
// concatenated into the assembly text.
// NOTE(review): these must match the layout of FELightingPaintingDataForNeon,
// which is declared outside this file - confirm when either side changes.
#define PIXELS_OFFSET TOSTRING(0)
#define YSTART_OFFSET TOSTRING(4)
#define WIDTH_OFFSET TOSTRING(8)
#define HEIGHT_OFFSET TOSTRING(12)
#define FLAGS_OFFSET TOSTRING(16)
#define SPECULAR_EXPONENT_OFFSET TOSTRING(20)
#define CONE_EXPONENT_OFFSET TOSTRING(24)
#define FLOAT_ARGUMENTS_OFFSET TOSTRING(28)
#define PAINTING_CONSTANTS_OFFSET TOSTRING(32)
// Terminator appended to every assembly statement.
#define NL "\n"

// Register allocation
// PAINTING_DATA_R is only needed while the fields are being loaded; the same
// register is then reused as RESET_WIDTH_R to keep the scanline width that
// WIDTH_R is reset to at the end of every scanline.
#define PAINTING_DATA_R "r11"
#define RESET_WIDTH_R PAINTING_DATA_R
#define PIXELS_R "r4"
#define WIDTH_R "r5"
#define HEIGHT_R "r6"
#define FLAGS_R "r7"
#define SPECULAR_EXPONENT_R "r8"
#define CONE_EXPONENT_R "r10"
#define SCANLINE_R "r12"

// Three scratch quad registers. Each one is also addressable through its two
// D halves (_D0, _D1) and its four S lanes (_S0 .. _S3); the GET_LENGTH and
// DOT_PRODUCT macros rely on this naming scheme for token pasting.
#define TMP1_Q "q0"
#define TMP1_D0 "d0"
#define TMP1_S0 "s0"
#define TMP1_S1 "s1"
#define TMP1_D1 "d1"
#define TMP1_S2 "s2"
#define TMP1_S3 "s3"
#define TMP2_Q "q1"
#define TMP2_D0 "d2"
#define TMP2_S0 "s4"
#define TMP2_S1 "s5"
#define TMP2_D1 "d3"
#define TMP2_S2 "s6"
#define TMP2_S3 "s7"
#define TMP3_Q "q2"
#define TMP3_D0 "d4"
#define TMP3_S0 "s8"
#define TMP3_S1 "s9"
#define TMP3_D1 "d5"
#define TMP3_S2 "s10"
#define TMP3_S3 "s11"

#define COSINE_OF_ANGLE "s12"
#define POWF_INT_S "s13"
#define POWF_FRAC_S "s14"
#define SPOT_COLOR_Q "q4"

// Because of VMIN and VMAX CONST_ZERO_S and CONST_ONE_S
// must be placed on the same side of the double vector
// (both are the upper S lane of their D register, like TMP2_S1 and TMP3_S1,
// the lanes that get clamped against them).

// Current pixel position
#define POSITION_Q "q5"
#define POSITION_X_S "s20"
#define POSITION_Y_S "s21"
#define POSITION_Z_S "s22"
#define CONST_ZERO_HI_D "d11"
#define CONST_ZERO_S "s23"

// -------------------------------
//     Variable arguments
// Misc arguments
// READ1..READ3 are the register lists filled, in this order, by the three
// vld1.f32 loads from the float argument block in neonDrawLighting:
//   READ1_RANGE (d12-d15 = s24-s31): the scalar values named below
//   READ2_RANGE (d16-d19 = q8, q9):  LIGHT_Q and DIRECTION_Q
//   READ3_RANGE (d20-d21 = q10):     COLOR_Q
#define READ1_RANGE "d12-d15"
#define READ2_RANGE "d16-d19"
#define READ3_RANGE "d20-d21"

#define SCALE_S "s24"
#define SCALE_DIV4_S "s25"
#define DIFFUSE_CONST_S "s26"

// Cone parameters used on the spot light path, followed by the constant that
// the assembly uses as 1 (position step and upper clamp bound).
// NOTE(review): CONST_ONE_S is loaded from the caller's argument block and is
// presumably 1.0f - confirm where that block is filled.
#define CONE_CUT_OFF_S "s28"
#define CONE_FULL_LIGHT_S "s29"
#define CONE_CUT_OFF_RANGE_S "s30"
#define CONST_ONE_HI_D "d15"
#define CONST_ONE_S "s31"

#define LIGHT_Q "q8"
#define DIRECTION_Q "q9"
#define COLOR_Q "q10"
// -------------------------------
//     Constant coefficients
// READ4/READ5 are filled by the two vld1.s16 loads from the constant table:
//   READ4_RANGE (d22-d25 = q11, q12): ALPHAX_Q and ALPHAY_Q
//   READ5_RANGE (d26-d27):            REMAPX_D and REMAPY_D
#define READ4_RANGE "d22-d25"
#define READ5_RANGE "d26-d27"

#define ALPHAX_Q "q11"
#define ALPHAY_Q "q12"
#define REMAPX_D "d26"
#define REMAPY_D "d27"
// -------------------------------

// Alpha values of the scanline above, the current scanline and the scanline
// below. ALL_ROWS_D is the three-register table operand of the vtbl.8 lookups.
#define ALL_ROWS_D "{d28,d29,d30}"
#define TOP_ROW_D "d28"
#define MIDDLE_ROW_D "d29"
#define BOTTOM_ROW_D "d30"

// GET_LENGTH(source, temp)
// Stores sqrt(x*x + y*y + z*z), computed from lanes 0..2 of source##_Q, into
// source##_S3. temp##_Q is overwritten with the lane-wise squares.
// Both arguments are name prefixes (e.g. TMP1) for which the _Q and _S0.._S3
// macros exist, because the register names are built by token pasting.
#define GET_LENGTH(source, temp) \
    "vmul.f32 " temp##_Q ", " source##_Q ", " source##_Q NL \
    "vadd.f32 " source##_S3 ", " temp##_S0 ", " temp##_S1 NL \
    "vadd.f32 " source##_S3 ", " source##_S3 ", " temp##_S2 NL \
    "vsqrt.f32 " source##_S3 ", " source##_S3 NL

// destination##_S3 can contain the product of the lengths: the initial vmul
// multiplies the fourth lanes of the two sources as well.
// DOT_PRODUCT(destination, source1, source2)
// Leaves the dot product of lanes 0..2 of source1##_Q and source2##_Q in
// destination##_S0. destination##_Q is overwritten with the lane-wise
// products first, so lanes 1..3 keep those products afterwards.
// destination must be a prefix with _Q and _S0.._S2 macros; the sources only
// need a _Q macro.
#define DOT_PRODUCT(destination, source1, source2) \
    "vmul.f32 " destination##_Q ", " source1##_Q ", " source2##_Q NL \
    "vadd.f32 " destination##_S0 ", " destination##_S0 ", " destination##_S1 NL \
    "vadd.f32 " destination##_S0 ", " destination##_S0 ", " destination##_S2 NL

// MULTIPLY_BY_DIFFUSE_CONST(normalVectorLength, dotProductLength)
// Writes the light strength to TMP2_S1:
//   flag FLAG_DIFFUSE_CONST_IS_1 clear: DIFFUSE_CONST_S * first / second
//   flag FLAG_DIFFUSE_CONST_IS_1 set:   first / second
// Both arguments are S register name strings; the first is the dividend and
// the second the divisor. The condition flags are set by the leading tst and
// consumed by the three conditional VFP instructions that follow.
#define MULTIPLY_BY_DIFFUSE_CONST(normalVectorLength, dotProductLength) \
    "tst " FLAGS_R ", #" TOSTRING(FLAG_DIFFUSE_CONST_IS_1) NL \
    "vmuleq.f32 " TMP2_S1 ", " DIFFUSE_CONST_S ", " normalVectorLength NL \
    "vdiveq.f32 " TMP2_S1 ", " TMP2_S1 ", " dotProductLength NL \
    "vdivne.f32 " TMP2_S1 ", " normalVectorLength ", " dotProductLength NL

// POWF_SQR(value, exponent, current, remaining)
// One step of the integer part of POWF: if bit `current` of the core register
// `exponent` is set, `value` is multiplied by POWF_INT_S; if any bit of
// `remaining` (the higher integer bits) is set, POWF_INT_S is squared for the
// next step. `current` and `remaining` are stringified with ASSTRING, which
// does not macro-expand them, so they must be written as literal immediates.
#define POWF_SQR(value, exponent, current, remaining) \
    "tst " exponent ", #" ASSTRING(current) NL \
    "vmulne.f32 " value ", " value ", " POWF_INT_S NL \
    "tst " exponent ", #" ASSTRING(remaining) NL \
    "vmulne.f32 " POWF_INT_S ", " POWF_INT_S ", " POWF_INT_S NL

// POWF_SQRT(value, exponent, current, remaining)
// One step of the fractional part of POWF: if any bit of `remaining` (the
// `current` bit and the lower fraction bits) is set, POWF_FRAC_S is replaced
// by its square root; then, if bit `current` is set, `value` is multiplied by
// POWF_FRAC_S. The same literal-immediate rule as for POWF_SQR applies.
#define POWF_SQRT(value, exponent, current, remaining) \
    "tst " exponent ", #" ASSTRING(remaining) NL \
    "vsqrtne.f32 " POWF_FRAC_S ", " POWF_FRAC_S NL \
    "tst " exponent ", #" ASSTRING(current) NL \
    "vmulne.f32 " value ", " value ", " POWF_FRAC_S NL

// This simplified powf function is sufficiently accurate.
// POWF(value, exponent)
// Replaces the S register `value` by `value` raised to a fixed-point power.
// `exponent` names a core register holding the 12-bit coefficient produced by
// FELighting::getPowerCoefficients: bits 6..11 (mask 0xfc0) are the integer
// part, handled by repeated squaring, and bits 0..5 (mask 0x03f) are the
// fraction, handled by repeated square roots.
// Clobbers POWF_INT_S, POWF_FRAC_S and the condition flags.
#define POWF(value, exponent) \
    "tst " exponent ", #0xfc0" NL \
    "vmovne.f32 " POWF_INT_S ", " value NL \
    "tst " exponent ", #0x03f" NL \
    "vmovne.f32 " POWF_FRAC_S ", " value NL \
    "vmov.f32 " value ", " CONST_ONE_S NL \
    \
    POWF_SQR(value, exponent, 0x040, 0xf80) \
    POWF_SQR(value, exponent, 0x080, 0xf00) \
    POWF_SQR(value, exponent, 0x100, 0xe00) \
    POWF_SQR(value, exponent, 0x200, 0xc00) \
    POWF_SQR(value, exponent, 0x400, 0x800) \
    "tst " exponent ", #0x800" NL \
    "vmulne.f32 " value ", " value ", " POWF_INT_S NL \
    \
    POWF_SQRT(value, exponent, 0x20, 0x3f) \
    POWF_SQRT(value, exponent, 0x10, 0x1f) \
    POWF_SQRT(value, exponent, 0x08, 0x0f) \
    POWF_SQRT(value, exponent, 0x04, 0x07) \
    POWF_SQRT(value, exponent, 0x02, 0x03) \
    POWF_SQRT(value, exponent, 0x01, 0x01)

// The following algorithm is an ARM-NEON optimized version of
// the main loop found in FELighting.cpp. Since the whole code
// is redesigned to be as effective as possible (ARM specific
// thinking), it is four times faster than its C++ counterpart.
//
// The block below defines the global symbol neonDrawLighting. On entry r0
// holds the address of the painting data, whose fields are read at the
// *_OFFSET byte offsets. r4-r8, r10, r11, lr and d8-d15 are pushed on entry
// and popped on return; the return is performed by popping the saved lr
// into pc.

asm ( // NOLINT
".globl " TOSTRING(neonDrawLighting) NL
TOSTRING(neonDrawLighting) ":" NL
    // Because of the clever register allocation, nothing is stored on the stack
    // except the saved registers.
    // Stack must be aligned to 8 bytes.
    "stmdb sp!, {r4-r8, r10, r11, lr}" NL
    "vstmdb sp!, {d8-d15}" NL
    "mov " PAINTING_DATA_R ", r0" NL

    // The following two arguments are loaded to SIMD registers.
    "ldr r0, [" PAINTING_DATA_R ", #" FLOAT_ARGUMENTS_OFFSET "]" NL
    "ldr r1, [" PAINTING_DATA_R ", #" PAINTING_CONSTANTS_OFFSET "]" NL
    "ldr " PIXELS_R ", [" PAINTING_DATA_R ", #" PIXELS_OFFSET "]" NL
    "vldr.f32 " POSITION_Y_S ", [" PAINTING_DATA_R ", #" YSTART_OFFSET "]" NL
    "ldr " WIDTH_R ", [" PAINTING_DATA_R ", #" WIDTH_OFFSET "]" NL
    "ldr " HEIGHT_R ", [" PAINTING_DATA_R ", #" HEIGHT_OFFSET "]" NL
    "ldr " FLAGS_R ", [" PAINTING_DATA_R ", #" FLAGS_OFFSET "]" NL
    "ldr " SPECULAR_EXPONENT_R ", [" PAINTING_DATA_R ", #" SPECULAR_EXPONENT_OFFSET "]" NL
    "ldr " CONE_EXPONENT_R ", [" PAINTING_DATA_R ", #" CONE_EXPONENT_OFFSET "]" NL

    // Load all data to the SIMD registers with the least number of instructions.
    "vld1.f32 { " READ1_RANGE " }, [r0]!" NL
    "vld1.f32 { " READ2_RANGE " }, [r0]!" NL
    "vld1.f32 { " READ3_RANGE " }, [r0]!" NL
    "vld1.s16 {" READ4_RANGE "}, [r1]!" NL
    "vld1.s16 {" READ5_RANGE "}, [r1]!" NL

    // Initializing local variables.
    // SCANLINE_R = width * 4 + 8 is the byte distance between two scanlines.
    // PIXELS_R is advanced by one scanline plus 3 bytes, so that it addresses
    // byte 3 (the alpha channel) of a pixel on the second scanline.
    "mov " SCANLINE_R ", " WIDTH_R ", lsl #2" NL
    "add " SCANLINE_R ", " SCANLINE_R ", #8" NL
    "add " PIXELS_R ", " PIXELS_R ", " SCANLINE_R NL
    "add " PIXELS_R ", " PIXELS_R ", #3" NL
    "mov r0, #0" NL
    "vmov.f32 " CONST_ZERO_S ", r0" NL
    "tst " FLAGS_R ", #" TOSTRING(FLAG_SPOT_LIGHT) NL
    "vmov.f32 " SPOT_COLOR_Q ", " COLOR_Q NL
    // From here on the painting data pointer is no longer needed; its
    // register keeps the width that WIDTH_R is reset to after each scanline.
    "mov " RESET_WIDTH_R ", " WIDTH_R NL

".mainLoop:" NL
    // r3 counts the alpha loads to perform before the next pixel is computed.
    "mov r3, #3" NL
    "vmov.f32 " POSITION_X_S ", " CONST_ONE_S NL

".scanline:" NL
    // The ROW registers are storing the alpha channel of the last three pixels.
    // The alpha channel is stored as signed short (sint16) values. The fourth value
    // is garbage. The following instructions are shifting out the unnecessary alpha
    // values and load the next ones.
    // (vext rotates each row by one lane and the new alpha is written to
    // lane 1, so lanes 1, 2 and 3 hold the newest, the previous and the
    // oldest alpha value while lane 0 is the unused one.)
    "ldrb r0, [" PIXELS_R ", -" SCANLINE_R "]" NL
    "ldrb r1, [" PIXELS_R ", +" SCANLINE_R "]" NL
    "ldrb r2, [" PIXELS_R "], #4" NL
    "vext.s16 " TOP_ROW_D ", " TOP_ROW_D ", " TOP_ROW_D ", #3" NL
    "vext.s16 " MIDDLE_ROW_D ", " MIDDLE_ROW_D ", " MIDDLE_ROW_D ", #3" NL
    "vext.s16 " BOTTOM_ROW_D ", " BOTTOM_ROW_D ", " BOTTOM_ROW_D ", #3" NL
    "vmov.s16 " TOP_ROW_D "[1], r0" NL
    "vmov.s16 " MIDDLE_ROW_D "[1], r2" NL
    "vmov.s16 " BOTTOM_ROW_D "[1], r1" NL

    // The two border pixels (rightmost and leftmost) are skipped when
    // the next scanline is reached. It also jumps, when the algorithm
    // is started, and the first three alpha values are loaded to each row.
    "subs r3, r3, #1" NL
    "bne .scanline" NL

    // The light vector goes to TMP1_Q. It is constant in case of distant light.
    // The fourth value contains the length of the light vector.
    "tst " FLAGS_R ", #" TOSTRING(FLAG_POINT_LIGHT | FLAG_SPOT_LIGHT) NL
    "beq .distantLight" NL

    // POSITION_Z = alpha of the centre pixel (lane 2 of the middle row) * SCALE.
    "vmov.s16 r3, " MIDDLE_ROW_D "[2]" NL
    "vmov.f32 " POSITION_Z_S ", r3" NL
    "vcvt.f32.s32 " POSITION_Z_S ", " POSITION_Z_S NL
    "vmul.f32 " POSITION_Z_S ", " POSITION_Z_S ", " SCALE_S NL

    "vsub.f32 " TMP1_Q ", " LIGHT_Q ", " POSITION_Q NL
    GET_LENGTH(TMP1, TMP2)

    "tst " FLAGS_R ", #" TOSTRING(FLAG_SPOT_LIGHT) NL
    "bne .cosineOfAngle" NL
".visiblePixel:" NL

    //     | -1  0  1 |      | -1 -2 -1 |
    // X = | -2  0  2 |  Y = |  0  0  0 |
    //     | -1  0  1 |      |  1  2  1 |

    // Multiply the alpha values by the X and Y matrices.

    // Moving the 8 alpha values to TMP3.
    "vtbl.8 " TMP3_D0 ", " ALL_ROWS_D ", " REMAPX_D NL
    "vtbl.8 " TMP3_D1 ", " ALL_ROWS_D ", " REMAPY_D NL

    // Three pairwise additions reduce the eight products to a single sum
    // in lane 0.
    "vmul.s16 " TMP2_Q ", " TMP3_Q ", " ALPHAX_Q NL
    "vpadd.s16 " TMP2_D0 ", " TMP2_D0 ", " TMP2_D1 NL
    "vpadd.s16 " TMP2_D0 ", " TMP2_D0 ", " TMP2_D0 NL
    "vpadd.s16 " TMP2_D0 ", " TMP2_D0 ", " TMP2_D0 NL
    "vmov.s16 r0, " TMP2_D0 "[0]" NL

    "vmul.s16 " TMP2_Q ", " TMP3_Q ", " ALPHAY_Q NL
    "vpadd.s16 " TMP2_D0 ", " TMP2_D0 ", " TMP2_D1 NL
    "vpadd.s16 " TMP2_D0 ", " TMP2_D0 ", " TMP2_D0 NL
    "vpadd.s16 " TMP2_D0 ", " TMP2_D0 ", " TMP2_D0 NL
    "vmov.s16 r1, " TMP2_D0 "[0]" NL

    // r0 and r1 contain the X and Y coordinates of the
    // normal vector, respectively.

    // Calculating the spot light strength.
    "tst " FLAGS_R ", #" TOSTRING(FLAG_SPOT_LIGHT) NL
    "beq .endLight" NL

    "vneg.f32 " TMP3_S1 ", " COSINE_OF_ANGLE NL
    "tst " FLAGS_R ", #" TOSTRING(FLAG_CONE_EXPONENT_IS_1) NL
    "beq .coneExpPowf" NL
".coneExpPowfFinished:" NL

    // Smoothing the cone edge if necessary.
    "vcmp.f32 " COSINE_OF_ANGLE ", " CONE_FULL_LIGHT_S NL
    "fmstat" NL
    "bhi .cutOff" NL
".cutOffFinished:" NL

    // Limit the strength in TMP3_S1 to at most CONST_ONE_S and scale the
    // original light color by it.
    "vmin.f32 " TMP3_D0 ", " TMP3_D0 ", " CONST_ONE_HI_D NL
    "vmul.f32 " COLOR_Q ", " SPOT_COLOR_Q ", " TMP3_D0 "[1]" NL

".endLight:" NL
    // Summarize:
    // r0 and r1 contain the normalVector.
    // TMP1_Q contains the light vector and its length.
    // COLOR_Q contains the color of the light vector.

    // Test whether both r0 and r1 are zero (Normal vector is (0, 0, 1)).
    "orrs r2, r0, r1" NL
    "bne .normalVectorIsNonZero" NL

    "tst " FLAGS_R ", #" TOSTRING(FLAG_SPECULAR_LIGHT) NL
    "bne .specularLight1" NL

    // Calculate diffuse light strength.
    MULTIPLY_BY_DIFFUSE_CONST(TMP1_S2, TMP1_S3)
    "b .lightStrengthCalculated" NL

".specularLight1:" NL
    // Calculating specular light strength.
    "vadd.f32 " TMP1_S2 ", " TMP1_S2 ", " TMP1_S3 NL
    GET_LENGTH(TMP1, TMP2)

    // When the exponent is 1, we don't need to call an expensive powf function.
    "tst " FLAGS_R ", #" TOSTRING(FLAG_SPECULAR_EXPONENT_IS_1) NL
    "vdiveq.f32 " TMP2_S1 ", " TMP1_S2 ", " TMP1_S3 NL
    "beq .specularExpPowf" NL

    MULTIPLY_BY_DIFFUSE_CONST(TMP1_S2, TMP1_S3)
    "b .lightStrengthCalculated" NL

".normalVectorIsNonZero:" NL
    // Normal vector goes to TMP2, and its length is calculated as well.
    "vmov.s32 " TMP2_S0 ", r0" NL
    "vcvt.f32.s32 " TMP2_S0 ", " TMP2_S0 NL
    "vmul.f32 " TMP2_S0 ", " TMP2_S0 ", " SCALE_DIV4_S NL
    "vmov.s32 " TMP2_S1 ", r1" NL
    "vcvt.f32.s32 " TMP2_S1 ", " TMP2_S1 NL
    "vmul.f32 " TMP2_S1 ", " TMP2_S1 ", " SCALE_DIV4_S NL
    "vmov.f32 " TMP2_S2 ", " CONST_ONE_S NL
    GET_LENGTH(TMP2, TMP3)

    "tst " FLAGS_R ", #" TOSTRING(FLAG_SPECULAR_LIGHT) NL
    "bne .specularLight2" NL

    // Calculating diffuse light strength.
    DOT_PRODUCT(TMP3, TMP2, TMP1)
    MULTIPLY_BY_DIFFUSE_CONST(TMP3_S0, TMP3_S3)
    "b .lightStrengthCalculated" NL

".specularLight2:" NL
    // Calculating specular light strength.
    "vadd.f32 " TMP1_S2 ", " TMP1_S2 ", " TMP1_S3 NL
    GET_LENGTH(TMP1, TMP3)
    DOT_PRODUCT(TMP3, TMP2, TMP1)

    // When the exponent is 1, we don't need to call an expensive powf function.
    "tst " FLAGS_R ", #" TOSTRING(FLAG_SPECULAR_EXPONENT_IS_1) NL
    "vdiveq.f32 " TMP2_S1 ", " TMP3_S0 ", " TMP3_S3 NL
    "beq .specularExpPowf" NL
    MULTIPLY_BY_DIFFUSE_CONST(TMP3_S0, TMP3_S3)

".lightStrengthCalculated:" NL
    // TMP2_S1 contains the light strength. Clamp it to [0, 1]
    "vmax.f32 " TMP2_D0 ", " TMP2_D0 ", " CONST_ZERO_HI_D NL
    "vmin.f32 " TMP2_D0 ", " TMP2_D0 ", " CONST_ONE_HI_D NL
    "vmul.f32 " TMP3_Q ", " COLOR_Q ", " TMP2_D0 "[1]" NL
    "vcvt.u32.f32 " TMP3_Q ", " TMP3_Q NL
    "vmov.u32 r2, r3, " TMP3_S0 ", " TMP3_S1 NL
    // The color values are stored in-place.
    // PIXELS_R already addresses byte 3 of the pixel that will be loaded
    // next, two pixels past the one being written, hence the negative
    // offsets for bytes 0..2 of the current pixel.
    "strb r2, [" PIXELS_R ", #-11]" NL
    "strb r3, [" PIXELS_R ", #-10]" NL
    "vmov.u32 r2, " TMP3_S2 NL
    "strb r2, [" PIXELS_R ", #-9]" NL

    // Continue to the next pixel.
".blackPixel:" NL
    "vadd.f32 " POSITION_X_S ", " CONST_ONE_S NL
    "mov r3, #1" NL
    "subs " WIDTH_R ", " WIDTH_R ", #1" NL
    "bne .scanline" NL

    // If the end of the scanline is reached, we continue
    // to the next scanline.
    "vadd.f32 " POSITION_Y_S ", " CONST_ONE_S NL
    "mov " WIDTH_R ", " RESET_WIDTH_R NL
    "subs " HEIGHT_R ", " HEIGHT_R ", #1" NL
    "bne .mainLoop" NL

    // Return.
    "vldmia sp!, {d8-d15}" NL
    "ldmia sp!, {r4-r8, r10, r11, pc}" NL

".distantLight:" NL
    // In case of distant light, the light vector is constant,
    // we simply copy it.
    "vmov.f32 " TMP1_Q ", " LIGHT_Q NL
    "b .visiblePixel" NL

".cosineOfAngle:" NL
    // If the pixel is outside of the cone angle, it is simply a black pixel.
    DOT_PRODUCT(TMP3, TMP1, DIRECTION)
    "vdiv.f32 " COSINE_OF_ANGLE ", " TMP3_S0 ", " TMP1_S3 NL
    "vcmp.f32 " COSINE_OF_ANGLE ", " CONE_CUT_OFF_S NL
    "fmstat" NL
    "bls .visiblePixel" NL
    // Zero bytes 0..2 of the current pixel (a halfword store plus a byte store).
    "mov r0, #0" NL
    "strh r0, [" PIXELS_R ", #-11]" NL
    "strb r0, [" PIXELS_R ", #-9]" NL
    "b .blackPixel" NL

".cutOff:" NL
    // Smoothing the light strength on the cone edge.
    "vsub.f32 " TMP3_S0 ", " CONE_CUT_OFF_S ", " COSINE_OF_ANGLE NL
    "vdiv.f32 " TMP3_S0 ", " TMP3_S0 ", " CONE_CUT_OFF_RANGE_S NL
    "vmul.f32 " TMP3_S1 ", " TMP3_S1 ", " TMP3_S0 NL
    "b .cutOffFinished" NL

".coneExpPowf:" NL
    POWF(TMP3_S1, CONE_EXPONENT_R)
    "b .coneExpPowfFinished" NL

".specularExpPowf:" NL
    POWF(TMP2_S1, SPECULAR_EXPONENT_R)
    // POWF changes the condition flags, so the diffuse constant flag is
    // tested again before the multiplication.
    "tst " FLAGS_R ", #" TOSTRING(FLAG_DIFFUSE_CONST_IS_1) NL
    "vmuleq.f32 " TMP2_S1 ", " TMP2_S1 ", " DIFFUSE_CONST_S NL
    "b .lightStrengthCalculated" NL
); // NOLINT

// Converts a floating point exponent to the 12-bit fixed-point coefficient
// consumed by the POWF assembly macro: bits 6..11 hold the integer part and
// bits 0..5 the fraction in 1/64 units.
// A negative exponent is first replaced by 1 / (-exponent), and anything
// above 63.99 is limited to 63.99, so the result is always in [0, 0xfff].
int FELighting::getPowerCoefficients(float exponent)
{
    // Calling a powf function from the assembly code would require to save
    // and reload a lot of NEON registers. Since the base is in range [0..1]
    // and only 8 bit precision is required, we use our own powf function.
    // This is probably not the best, but it uses only a few registers and
    // gives us enough precision (modifying the exponent field directly would
    // also be possible).

    // First, we limit the exponent to maximum of 64, which gives us enough
    // precision. We split the exponent to an integer and fraction part,
    // since a^x = (a^y)*(a^z) where x = y+z. The integer exponent of the
    // power is estimated by square, and the fraction exponent of the power
    // is estimated by square root assembly instructions.
    int i, result;

    if (exponent < 0)
        exponent = 1 / (-exponent);

    if (exponent > 63.99)
        exponent = 63.99;

    // Scale into [0, 1) and extract twelve binary digits, most significant
    // first: each doubling moves the next digit in front of the binary point.
    exponent /= 64;
    result = 0;
    for (i = 11; i >= 0; --i) {
        exponent *= 2;
        if (exponent >= 1) {
            result |= 1 << i;
            exponent -= 1;
        }
    }
    return result;
}

} // namespace WebCore

#endif // CPU(ARM_NEON) && CPU(ARM_TRADITIONAL) && COMPILER(GCC)