/*
 * Copyright (C) 2002 Frederic 'dilb' Boulay
 *
 * Author: Frederic Boulay <dilb@handhelds.org>
 *
 * The function defined in this file is derived from the simple_idct function
 * from the libavcodec library part of the Libav project.
 *
 * This file is part of Libav.
 *
 * Libav is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * Libav is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with Libav; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "asm.S"

/* Useful constants for the algorithm; they are saved in __constant_ptr__ */
/* at the end of the source code. */
#define W1  22725
#define W2  21407
#define W3  19266
#define W4  16383
#define W5  12873
#define W6   8867
#define W7   4520
#define MASK_MSHW 0xFFFF0000

/* offsets of the constants in the vector */
#define offW1         0
#define offW2         4
#define offW3         8
#define offW4        12
#define offW5        16
#define offW6        20
#define offW7        24
#define offMASK_MSHW 28

#define ROW_SHIFT 11
#define ROW_SHIFT2MSHW (16-11)
#define COL_SHIFT 20
#define ROW_SHIFTED_1 1024   /* 1 << (ROW_SHIFT-1) */
#define COL_SHIFTED_1 524288 /* 1 << (COL_SHIFT-1) */


function ff_simple_idct_arm, export=1
        @@ void simple_idct_arm(int16_t *block)
        @@ R0-R3 are scratch registers, so they need not be saved, but R0 holds the
        @@ pointer to block and must not be clobbered before it has been saved.
        @@ R12 is another scratch register, so it does not need to be saved either.
        @@ Save all callee-saved registers (take all of them).
        stmfd sp!, {r4-r11, r14} @ R14 is also called LR
        @@ at this point, R0=block, the other registers are free.
        add r14, r0, #112        @ R14=&block[8*7]: start from the last row and work back to row 0, i.e. until R14==block.
        adr r12, __constant_ptr__ @ R12=__constant_ptr__, the vector containing the constants.
        @@ add 2 temporary variables on the stack: R0 and R14
        sub sp, sp, #8           @ allow 2 local variables
        str r0, [sp, #0]         @ save block in sp[0]
        @@ stack status
        @@ sp+4   free
        @@ sp+0   R0 (block)


        @@ at this point, R0=block, R14=&block[56], R12=__const_ptr_, R1-R11 free


__row_loop:
        @@ Read the row and check whether it is null, almost null, or neither.
        @@ (According to the StrongARM specs it is not necessary to optimize the ldr
        @@ accesses, i.e. to split each 32-bit load into two 16-bit loads; at least
        @@ this leaves more usable registers.)
        ldr r1, [r14, #0]        @ R1=(int32)(R14)[0]=ROWr32[0] (the row cast to a 32-bit pointer)
        ldr r2, [r14, #4]        @ R2=(int32)(R14)[1]=ROWr32[1]
        ldr r3, [r14, #8]        @ R3=ROWr32[2]
        ldr r4, [r14, #12]       @ R4=ROWr32[3]
        @@ Check whether the words are null: if all of them are, proceed with the next
        @@ row (branch to __end_row_loop); if ROWr16[0] is the only non-null one,
        @@ proceed with that special case (branch to __almost_empty_row);
        @@ otherwise follow the complete algorithm.
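        @@ Rough C sketch of the test below (illustrative only; it assumes the row is
        @@ read through a uint32_t pointer, as the original simple_idct does, and a
        @@ little-endian layout):
        @@   if ((row32[0] | row32[1] | row32[2] | row32[3]) == 0)
        @@       goto end_row_loop;                    /* row entirely zero          */
        @@   if (((row32[0] >> 16) | row32[1] | row32[2] | row32[3]) == 0)
        @@       goto almost_empty_row;                /* only row[0] may be nonzero */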
        @@ at this point, R0=block, R14=&block[n], R12=__const_ptr_, R1=ROWr32[0], R2=ROWr32[1],
        @@ R3=ROWr32[2], R4=ROWr32[3], R5-R11 free
        orr r5, r4, r3           @ R5=R4 | R3
        orr r5, r5, r2           @ R5=R4 | R3 | R2
        orrs r6, r5, r1          @ test R5 | R1 (the aim is to check whether everything is null)
        beq __end_row_loop
        mov r7, r1, asr #16      @ R7=R1>>16=ROWr16[1] (evaluate it now, as it could be useful later)
        ldrsh r6, [r14, #0]      @ R6=ROWr16[0]
        orrs r5, r5, r7          @ R5=R4 | R3 | R2 | R7
        beq __almost_empty_row

__b_evaluation:
        @@ at this point, R0=block (temp), R1 (free), R2=ROWr32[1], R3=ROWr32[2], R4=ROWr32[3],
        @@ R5=(temp), R6=ROWr16[0], R7=ROWr16[1], R8-R11 free,
        @@ R12=__const_ptr_, R14=&block[n]
        @@ to save some registers/calls, proceed with b0-b3 first, followed by a0-a3

        @@ MUL16(b0, W1, row[1]);
        @@ MUL16(b1, W3, row[1]);
        @@ MUL16(b2, W5, row[1]);
        @@ MUL16(b3, W7, row[1]);
        @@ MAC16(b0, W3, row[3]);
        @@ MAC16(b1, -W7, row[3]);
        @@ MAC16(b2, -W1, row[3]);
        @@ MAC16(b3, -W5, row[3]);
        ldr r8, [r12, #offW1]    @ R8=W1
        mov r2, r2, asr #16      @ R2=ROWr16[3]
        mul r0, r8, r7           @ R0=W1*ROWr16[1]=b0 (ROWr16[1] as the second operand can save 1 cycle)
        ldr r9, [r12, #offW3]    @ R9=W3
        ldr r10, [r12, #offW5]   @ R10=W5
        mul r1, r9, r7           @ R1=W3*ROWr16[1]=b1
        ldr r11, [r12, #offW7]   @ R11=W7
        mul r5, r10, r7          @ R5=W5*ROWr16[1]=b2
        mul r7, r11, r7          @ R7=W7*ROWr16[1]=b3
        teq r2, #0               @ if null, avoid the muls
        itttt ne
        mlane r0, r9, r2, r0     @ R0+=W3*ROWr16[3]=b0 (ROWr16[3] as the second operand can save 1 cycle)
        rsbne r2, r2, #0         @ R2=-ROWr16[3]
        mlane r1, r11, r2, r1    @ R1-=W7*ROWr16[3]=b1
        mlane r5, r8, r2, r5     @ R5-=W1*ROWr16[3]=b2
        it ne
        mlane r7, r10, r2, r7    @ R7-=W5*ROWr16[3]=b3

        @@ at this point, R0=b0, R1=b1, R2 (free), R3=ROWr32[2], R4=ROWr32[3],
        @@ R5=b2, R6=ROWr16[0], R7=b3, R8=W1, R9=W3, R10=W5, R11=W7,
        @@ R12=__const_ptr_, R14=&block[n]
        @@ temp = ((uint32_t*)row)[2] | ((uint32_t*)row)[3];
        @@ if (temp != 0) {}
        orrs r2, r3, r4          @ R2=ROWr32[2] | ROWr32[3]
        beq __end_b_evaluation

        @@ at this point, R0=b0, R1=b1, R2 (free), R3=ROWr32[2], R4=ROWr32[3],
        @@ R5=b2, R6=ROWr16[0], R7=b3, R8=W1, R9=W3, R10=W5, R11=W7,
        @@ R12=__const_ptr_, R14=&block[n]
        @@ MAC16(b0, W5, row[5]);
        @@ MAC16(b2, W7, row[5]);
        @@ MAC16(b3, W3, row[5]);
        @@ MAC16(b1, -W1, row[5]);
        @@ MAC16(b0, W7, row[7]);
        @@ MAC16(b2, W3, row[7]);
        @@ MAC16(b3, -W1, row[7]);
        @@ MAC16(b1, -W5, row[7]);
        mov r3, r3, asr #16      @ R3=ROWr16[5]
        teq r3, #0               @ if null, avoid the muls
        it ne
        mlane r0, r10, r3, r0    @ R0+=W5*ROWr16[5]=b0
        mov r4, r4, asr #16      @ R4=ROWr16[7]
        itttt ne
        mlane r5, r11, r3, r5    @ R5+=W7*ROWr16[5]=b2
        mlane r7, r9, r3, r7     @ R7+=W3*ROWr16[5]=b3
        rsbne r3, r3, #0         @ R3=-ROWr16[5]
        mlane r1, r8, r3, r1     @ R1-=W1*ROWr16[5]=b1
        @@ R3 is free now
        teq r4, #0               @ if null, avoid the muls
        itttt ne
        mlane r0, r11, r4, r0    @ R0+=W7*ROWr16[7]=b0
        mlane r5, r9, r4, r5     @ R5+=W3*ROWr16[7]=b2
        rsbne r4, r4, #0         @ R4=-ROWr16[7]
        mlane r7, r8, r4, r7     @ R7-=W1*ROWr16[7]=b3
        it ne
        mlane r1, r10, r4, r1    @ R1-=W5*ROWr16[7]=b1
        @@ R4 is free now
__end_b_evaluation:
        @@ at this point, R0=b0, R1=b1, R2=ROWr32[2] | ROWr32[3] (tmp), R3 (free), R4 (free),
        @@ R5=b2, R6=ROWr16[0], R7=b3, R8 (free), R9 (free), R10 (free), R11 (free),
        @@ R12=__const_ptr_, R14=&block[n]

__a_evaluation:
        @@ a0 = (W4 * row[0]) + (1 << (ROW_SHIFT - 1));
        @@ a1 = a0 + W6 * row[2];
        @@ a2 = a0 - W6 * row[2];
        @@ a3 = a0 - W2 * row[2];
        @@ a0 = a0 + W2 * row[2];
        ldr r9, [r12, #offW4]    @ R9=W4
        mul r6, r9, r6           @ R6=W4*ROWr16[0]
        ldr r10, [r12, #offW6]   @ R10=W6
        ldrsh r4, [r14, #4]      @ R4=ROWr16[2] (a3 not defined yet)
        add r6, r6, #ROW_SHIFTED_1 @ R6=W4*ROWr16[0] + 1<<(ROW_SHIFT-1) (a0)

        mul r11, r10, r4         @ R11=W6*ROWr16[2]
        ldr r8, [r12, #offW2]    @ R8=W2
        sub r3, r6, r11          @ R3=a0-W6*ROWr16[2] (a2)
        @@ temp = ((uint32_t*)row)[2] | ((uint32_t*)row)[3];
        @@ if (temp != 0) {}
        teq r2, #0
        beq __end_bef_a_evaluation

        add r2, r6, r11          @ R2=a0+W6*ROWr16[2] (a1)
        mul r11, r8, r4          @ R11=W2*ROWr16[2]
        sub r4, r6, r11          @ R4=a0-W2*ROWr16[2] (a3)
        add r6, r6, r11          @ R6=a0+W2*ROWr16[2] (a0)


        @@ at this point, R0=b0, R1=b1, R2=a1, R3=a2, R4=a3,
        @@ R5=b2, R6=a0, R7=b3, R8=W2, R9=W4, R10=W6, R11 (free),
        @@ R12=__const_ptr_, R14=&block[n]


        @@ a0 += W4*row[4]
        @@ a1 -= W4*row[4]
        @@ a2 -= W4*row[4]
        @@ a3 += W4*row[4]
        ldrsh r11, [r14, #8]     @ R11=ROWr16[4]
        teq r11, #0              @ if null, avoid the muls
        it ne
        mulne r11, r9, r11       @ R11=W4*ROWr16[4]
        @@ R9 is free now
        ldrsh r9, [r14, #12]     @ R9=ROWr16[6]
        itttt ne
        addne r6, r6, r11        @ R6+=W4*ROWr16[4] (a0)
        subne r2, r2, r11        @ R2-=W4*ROWr16[4] (a1)
        subne r3, r3, r11        @ R3-=W4*ROWr16[4] (a2)
        addne r4, r4, r11        @ R4+=W4*ROWr16[4] (a3)
        @@ W6 on its own is no longer useful, so keep W2*ROWr16[6] in its register instead
        teq r9, #0               @ if null, avoid the muls
        itttt ne
        mulne r11, r10, r9       @ R11=W6*ROWr16[6]
        addne r6, r6, r11        @ R6+=W6*ROWr16[6] (a0)
        mulne r10, r8, r9        @ R10=W2*ROWr16[6]
        @@ a0 += W6*row[6];
        @@ a3 -= W6*row[6];
        @@ a1 -= W2*row[6];
        @@ a2 += W2*row[6];
        subne r4, r4, r11        @ R4-=W6*ROWr16[6] (a3)
        itt ne
        subne r2, r2, r10        @ R2-=W2*ROWr16[6] (a1)
        addne r3, r3, r10        @ R3+=W2*ROWr16[6] (a2)

__end_a_evaluation:
        @@ at this point, R0=b0, R1=b1, R2=a1, R3=a2, R4=a3,
        @@ R5=b2, R6=a0, R7=b3, R8 (free), R9 (free), R10 (free), R11 (free),
        @@ R12=__const_ptr_, R14=&block[n]
        @@ row[0] = (a0 + b0) >> ROW_SHIFT;
        @@ row[1] = (a1 + b1) >> ROW_SHIFT;
        @@ row[2] = (a2 + b2) >> ROW_SHIFT;
        @@ row[3] = (a3 + b3) >> ROW_SHIFT;
        @@ row[4] = (a3 - b3) >> ROW_SHIFT;
        @@ row[5] = (a2 - b2) >> ROW_SHIFT;
        @@ row[6] = (a1 - b1) >> ROW_SHIFT;
        @@ row[7] = (a0 - b0) >> ROW_SHIFT;
        add r8, r6, r0           @ R8=a0+b0
        add r9, r2, r1           @ R9=a1+b1
        @@ pack two 16-bit halfwords into one 32-bit word
        @@ ROWr32[0]=ROWr16[0] | (ROWr16[1]<<16) (little-endian only!)
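        @@ Hedged C sketch of the fused store performed below (illustrative only,
        @@ not the exact libavcodec source; little-endian, MASK_MSHW as defined above):
        @@   row[0] = (int16_t)((a0 + b0) >> ROW_SHIFT);
        @@   row[1] = (int16_t)((a1 + b1) >> ROW_SHIFT);
        @@ is implemented as a single 32-bit store:
        @@   row32[0] = ((uint32_t)((a0 + b0) >> ROW_SHIFT) & ~MASK_MSHW)
        @@            | (((uint32_t)(a1 + b1) << ROW_SHIFT2MSHW) & MASK_MSHW);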
        ldr r10, [r12, #offMASK_MSHW] @ R10=0xFFFF0000
        and r9, r10, r9, lsl #ROW_SHIFT2MSHW @ R9=0xFFFF0000 & ((a1+b1)<<5)
        mvn r11, r10             @ R11=NOT R10=0x0000FFFF
        and r8, r11, r8, asr #ROW_SHIFT @ R8=0x0000FFFF & ((a0+b0)>>11)
        orr r8, r8, r9
        str r8, [r14, #0]

        add r8, r3, r5           @ R8=a2+b2
        add r9, r4, r7           @ R9=a3+b3
        and r9, r10, r9, lsl #ROW_SHIFT2MSHW @ R9=0xFFFF0000 & ((a3+b3)<<5)
        and r8, r11, r8, asr #ROW_SHIFT @ R8=0x0000FFFF & ((a2+b2)>>11)
        orr r8, r8, r9
        str r8, [r14, #4]

        sub r8, r4, r7           @ R8=a3-b3
        sub r9, r3, r5           @ R9=a2-b2
        and r9, r10, r9, lsl #ROW_SHIFT2MSHW @ R9=0xFFFF0000 & ((a2-b2)<<5)
        and r8, r11, r8, asr #ROW_SHIFT @ R8=0x0000FFFF & ((a3-b3)>>11)
        orr r8, r8, r9
        str r8, [r14, #8]

        sub r8, r2, r1           @ R8=a1-b1
        sub r9, r6, r0           @ R9=a0-b0
        and r9, r10, r9, lsl #ROW_SHIFT2MSHW @ R9=0xFFFF0000 & ((a0-b0)<<5)
        and r8, r11, r8, asr #ROW_SHIFT @ R8=0x0000FFFF & ((a1-b1)>>11)
        orr r8, r8, r9
        str r8, [r14, #12]

        bal __end_row_loop

__almost_empty_row:
        @@ the row was empty except for ROWr16[0]; handle this special case
        @@ at this point, R0=block, R14=&block[n], R12=__const_ptr_, R1=ROWr32[0], R2=ROWr32[1],
        @@ R3=ROWr32[2], R4=ROWr32[3], R5=(temp), R6=ROWr16[0], R7=ROWr16[1],
        @@ R8=0xFFFF (temp), R9-R11 free
        mov r8, #0x10000         @ R8=0x10000 (building 0xFFFF in 2 steps saves an ldr and its load delay)
        sub r8, r8, #1           @ R8=0xFFFF, ready now
        and r5, r8, r6, lsl #3   @ R5=R8 & (R6<<3)=(ROWr16[0]<<3) & 0xFFFF
        orr r5, r5, r5, lsl #16  @ R5=R5 | (R5<<16)
        str r5, [r14, #0]        @ R14[0]=ROWr32[0]=R5
        str r5, [r14, #4]        @ R14[4]=ROWr32[1]=R5
        str r5, [r14, #8]        @ R14[8]=ROWr32[2]=R5
        str r5, [r14, #12]       @ R14[12]=ROWr32[3]=R5

__end_row_loop:
        @@ at this point, R0-R11 (free)
        @@ R12=__const_ptr_, R14=&block[n]
        ldr r0, [sp, #0]         @ R0=block
        teq r0, r14              @ compare the current &block[8*n] to block; when block is reached, the loop is finished.
        sub r14, r14, #16
        bne __row_loop



        @@ at this point, R0=block, R1-R11 (free)
        @@ R12=__const_ptr_, R14=&block[n]
        add r14, r0, #14         @ R14=&block[7]: start from the last column and work back to column 0, i.e. until R14==block.
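        @@ Column pass: the same a/b butterfly as the row pass, but consecutive
        @@ elements of one column are 8 int16_t (16 bytes) apart, hence the #16,
        @@ #32, ... offsets below, and the results are narrowed with COL_SHIFT (20)
        @@ instead of ROW_SHIFT (11). A rough C sketch of the loop structure
        @@ (idct_col is an illustrative name, not a function of this file):
        @@   for (i = 7; i >= 0; i--)
        @@       idct_col(block + i);   /* uses block[i], block[i+8], ..., block[i+56] */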
__col_loop:

__b_evaluation2:
        @@ at this point, R0=block (temp), R1-R11 (free)
        @@ R12=__const_ptr_, R14=&block[n]
        @@ proceed with b0-b3 first, followed by a0-a3
        @@ MUL16(b0, W1, col[8x1]);
        @@ MUL16(b1, W3, col[8x1]);
        @@ MUL16(b2, W5, col[8x1]);
        @@ MUL16(b3, W7, col[8x1]);
        @@ MAC16(b0, W3, col[8x3]);
        @@ MAC16(b1, -W7, col[8x3]);
        @@ MAC16(b2, -W1, col[8x3]);
        @@ MAC16(b3, -W5, col[8x3]);
        ldr r8, [r12, #offW1]    @ R8=W1
        ldrsh r7, [r14, #16]
        mul r0, r8, r7           @ R0=W1*COLr16[8x1]=b0 (COLr16[8x1] as the second operand can save 1 cycle)
        ldr r9, [r12, #offW3]    @ R9=W3
        ldr r10, [r12, #offW5]   @ R10=W5
        mul r1, r9, r7           @ R1=W3*COLr16[8x1]=b1
        ldr r11, [r12, #offW7]   @ R11=W7
        mul r5, r10, r7          @ R5=W5*COLr16[8x1]=b2
        ldrsh r2, [r14, #48]
        mul r7, r11, r7          @ R7=W7*COLr16[8x1]=b3
        teq r2, #0               @ if 0, avoid the muls
        itttt ne
        mlane r0, r9, r2, r0     @ R0+=W3*COLr16[8x3]=b0
        rsbne r2, r2, #0         @ R2=-COLr16[8x3]
        mlane r1, r11, r2, r1    @ R1-=W7*COLr16[8x3]=b1
        mlane r5, r8, r2, r5     @ R5-=W1*COLr16[8x3]=b2
        it ne
        mlane r7, r10, r2, r7    @ R7-=W5*COLr16[8x3]=b3

        @@ at this point, R0=b0, R1=b1, R2 (free), R3 (free), R4 (free),
        @@ R5=b2, R6 (free), R7=b3, R8=W1, R9=W3, R10=W5, R11=W7,
        @@ R12=__const_ptr_, R14=&block[n]
        @@ MAC16(b0, W5, col[5x8]);
        @@ MAC16(b2, W7, col[5x8]);
        @@ MAC16(b3, W3, col[5x8]);
        @@ MAC16(b1, -W1, col[5x8]);
        @@ MAC16(b0, W7, col[7x8]);
        @@ MAC16(b2, W3, col[7x8]);
        @@ MAC16(b3, -W1, col[7x8]);
        @@ MAC16(b1, -W5, col[7x8]);
        ldrsh r3, [r14, #80]     @ R3=COLr16[5x8]
        teq r3, #0               @ if 0, avoid the muls
        itttt ne
        mlane r0, r10, r3, r0    @ R0+=W5*COLr16[5x8]=b0
        mlane r5, r11, r3, r5    @ R5+=W7*COLr16[5x8]=b2
        mlane r7, r9, r3, r7     @ R7+=W3*COLr16[5x8]=b3
        rsbne r3, r3, #0         @ R3=-COLr16[5x8]
        ldrsh r4, [r14, #112]    @ R4=COLr16[7x8]
        it ne
        mlane r1, r8, r3, r1     @ R1-=W1*COLr16[5x8]=b1
        @@ R3 is free now
        teq r4, #0               @ if 0, avoid the muls
        itttt ne
        mlane r0, r11, r4, r0    @ R0+=W7*COLr16[7x8]=b0
        mlane r5, r9, r4, r5     @ R5+=W3*COLr16[7x8]=b2
        rsbne r4, r4, #0         @ R4=-COLr16[7x8]
        mlane r7, r8, r4, r7     @ R7-=W1*COLr16[7x8]=b3
        it ne
        mlane r1, r10, r4, r1    @ R1-=W5*COLr16[7x8]=b1
        @@ R4 is free now
__end_b_evaluation2:
        @@ at this point, R0=b0, R1=b1, R2 (free), R3 (free), R4 (free),
        @@ R5=b2, R6 (free), R7=b3, R8 (free), R9 (free), R10 (free), R11 (free),
        @@ R12=__const_ptr_, R14=&block[n]

__a_evaluation2:
        @@ a0 = (W4 * col[8x0]) + (1 << (COL_SHIFT - 1));
        @@ a1 = a0 + W6 * col[8x2];
        @@ a2 = a0 - W6 * col[8x2];
        @@ a3 = a0 - W2 * col[8x2];
        @@ a0 = a0 + W2 * col[8x2];
        ldrsh r6, [r14, #0]
        ldr r9, [r12, #offW4]    @ R9=W4
        mul r6, r9, r6           @ R6=W4*COLr16[0]
        ldr r10, [r12, #offW6]   @ R10=W6
        ldrsh r4, [r14, #32]     @ R4=COLr16[8x2] (a3 not defined yet)
        add r6, r6, #COL_SHIFTED_1 @ R6=W4*COLr16[0] + 1<<(COL_SHIFT-1) (a0)
        mul r11, r10, r4         @ R11=W6*COLr16[8x2]
        ldr r8, [r12, #offW2]    @ R8=W2
        add r2, r6, r11          @ R2=a0+W6*COLr16[8x2] (a1)
        sub r3, r6, r11          @ R3=a0-W6*COLr16[8x2] (a2)
        mul r11, r8, r4          @ R11=W2*COLr16[8x2]
        sub r4, r6, r11          @ R4=a0-W2*COLr16[8x2] (a3)
        add r6, r6, r11          @ R6=a0+W2*COLr16[8x2] (a0)

        @@ at this point, R0=b0, R1=b1, R2=a1, R3=a2, R4=a3,
        @@ R5=b2, R6=a0, R7=b3, R8=W2, R9=W4, R10=W6, R11 (free),
        @@ R12=__const_ptr_, R14=&block[n]
        @@ a0 += W4*col[8x4]
        @@ a1 -= W4*col[8x4]
        @@ a2 -= W4*col[8x4]
        @@ a3 += W4*col[8x4]
        ldrsh r11, [r14, #64]    @ R11=COLr16[8x4]
        teq r11, #0              @ if null, avoid the muls
        itttt ne
        mulne r11, r9, r11       @ R11=W4*COLr16[8x4]
        @@ R9 is free now
        addne r6, r6, r11        @ R6+=W4*COLr16[8x4] (a0)
        subne r2, r2, r11        @ R2-=W4*COLr16[8x4] (a1)
        subne r3, r3, r11        @ R3-=W4*COLr16[8x4] (a2)
        ldrsh r9, [r14, #96]     @ R9=COLr16[8x6]
        it ne
        addne r4, r4, r11        @ R4+=W4*COLr16[8x4] (a3)
        @@ W6 on its own is no longer useful, so keep W2*COLr16[8x6] in its register instead
        teq r9, #0               @ if null, avoid the muls
        itttt ne
        mulne r11, r10, r9       @ R11=W6*COLr16[8x6]
        addne r6, r6, r11        @ R6+=W6*COLr16[8x6] (a0)
        mulne r10, r8, r9        @ R10=W2*COLr16[8x6]
        @@ a0 += W6*col[8x6];
        @@ a3 -= W6*col[8x6];
        @@ a1 -= W2*col[8x6];
        @@ a2 += W2*col[8x6];
        subne r4, r4, r11        @ R4-=W6*COLr16[8x6] (a3)
        itt ne
        subne r2, r2, r10        @ R2-=W2*COLr16[8x6] (a1)
        addne r3, r3, r10        @ R3+=W2*COLr16[8x6] (a2)
__end_a_evaluation2:
        @@ at this point, R0=b0, R1=b1, R2=a1, R3=a2, R4=a3,
        @@ R5=b2, R6=a0, R7=b3, R8 (free), R9 (free), R10 (free), R11 (free),
        @@ R12=__const_ptr_, R14=&block[n]
        @@ col[0 ] = ((a0 + b0) >> COL_SHIFT);
        @@ col[8 ] = ((a1 + b1) >> COL_SHIFT);
        @@ col[16] = ((a2 + b2) >> COL_SHIFT);
        @@ col[24] = ((a3 + b3) >> COL_SHIFT);
        @@ col[32] = ((a3 - b3) >> COL_SHIFT);
        @@ col[40] = ((a2 - b2) >> COL_SHIFT);
        @@ col[48] = ((a1 - b1) >> COL_SHIFT);
        @@ col[56] = ((a0 - b0) >> COL_SHIFT);
        @@@@@ no optimization here @@@@@
        add r8, r6, r0           @ R8=a0+b0
        add r9, r2, r1           @ R9=a1+b1
        mov r8, r8, asr #COL_SHIFT
        mov r9, r9, asr #COL_SHIFT
        strh r8, [r14, #0]
        strh r9, [r14, #16]
        add r8, r3, r5           @ R8=a2+b2
        add r9, r4, r7           @ R9=a3+b3
        mov r8, r8, asr #COL_SHIFT
        mov r9, r9, asr #COL_SHIFT
        strh r8, [r14, #32]
        strh r9, [r14, #48]
        sub r8, r4, r7           @ R8=a3-b3
        sub r9, r3, r5           @ R9=a2-b2
        mov r8, r8, asr #COL_SHIFT
        mov r9, r9, asr #COL_SHIFT
        strh r8, [r14, #64]
        strh r9, [r14, #80]
        sub r8, r2, r1           @ R8=a1-b1
        sub r9, r6, r0           @ R9=a0-b0
        mov r8, r8, asr #COL_SHIFT
        mov r9, r9, asr #COL_SHIFT
        strh r8, [r14, #96]
        strh r9, [r14, #112]

__end_col_loop:
        @@ at this point, R0-R11 (free)
        @@ R12=__const_ptr_, R14=&block[n]
        ldr r0, [sp, #0]         @ R0=block
        teq r0, r14              @ compare the current &block[n] to block; when block is reached, the loop is finished.
        sub r14, r14, #2
        bne __col_loop




__end_simple_idct_arm:
        @@ restore the registers to their previous status!
        add sp, sp, #8           @@ the local variables!
        ldmfd sp!, {r4-r11, r15} @@ update PC with the LR content.



@@ a kind of sub-function, kept here so as not to burden the common case.
__end_bef_a_evaluation:
        add r2, r6, r11          @ R2=a0+W6*ROWr16[2] (a1)
        mul r11, r8, r4          @ R11=W2*ROWr16[2]
        sub r4, r6, r11          @ R4=a0-W2*ROWr16[2] (a3)
        add r6, r6, r11          @ R6=a0+W2*ROWr16[2] (a0)
        bal __end_a_evaluation


        .align
__constant_ptr__:                @@ see the #defines at the beginning of the source code for the values.
        .word W1
        .word W2
        .word W3
        .word W4
        .word W5
        .word W6
        .word W7
        .word MASK_MSHW
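@@ Note on the table above: W1-W7 are the fixed-point IDCT weights used by the C
@@ simple_idct, i.e. approximately round(sqrt(2) * cos(i*pi/16) * (1 << 14)) for
@@ i = 1..7 (W4 is defined as 16383 rather than the exact 16384), and MASK_MSHW
@@ selects the most significant halfword when two 16-bit results are packed into
@@ one 32-bit word during the row pass.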