/*
 * simple_idct_arm.S
 * Copyright (C) 2002 Frederic 'dilb' Boulay
 *
 * Author: Frederic Boulay <dilb@handhelds.org>
 *
 * The function defined in this file is derived from the simple_idct function
 * from the libavcodec library part of the FFmpeg project.
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "asm.S"

/*
 * Fixed-point weights used by the IDCT.
 * For i = 1..7, Wi corresponds to cos(i*pi/16) * sqrt(2) * (1 << 14), rounded
 * to the nearest integer, except W4 which is 16383 (one less than the rounded
 * value 16384).
 * The values are emitted as 32-bit words in the __constant_ptr__ table at the
 * end of this file and are loaded from that table at run time.
 */
#define W1  22725
#define W2  21407
#define W3  19266
#define W4  16383
#define W5  12873
#define W6  8867
#define W7  4520
#define MASK_MSHW 0xFFFF0000 /* keeps only the most significant half-word */

/*
 * Byte offsets of the constants inside the __constant_ptr__ table.
 * Each entry is one 32-bit word and the order must match the .word list.
 */
#define offW1  0
#define offW2  4
#define offW3  8
#define offW4  12
#define offW5  16
#define offW6  20
#define offW7  24
#define offMASK_MSHW 28

/* Descaling shifts applied after the row pass and after the column pass. */
#define ROW_SHIFT 11
/* Left shift that moves a row result (>> ROW_SHIFT) into the upper half-word. */
#define ROW_SHIFT2MSHW (16-ROW_SHIFT)
#define COL_SHIFT 20
/* Rounding terms added before the shifts; keep in sync with the shifts above. */
#define ROW_SHIFTED_1 1024 /* 1<< (ROW_SHIFT-1) */
#define COL_SHIFTED_1 524288 /* 1<< (COL_SHIFT-1) */


        .text

function simple_idct_ARM, export=1
/*
 * void simple_idct_ARM(int16_t *block)
 *
 * In-place 8x8 inverse DCT on 64 signed 16-bit coefficients.
 * The block is addressed with a row stride of 16 bytes (8 half-words).
 *
 * Pass 1 (__row_loop): the 8 rows are transformed from row 7 down to row 0.
 *   Results are descaled by ROW_SHIFT and written back over the row.
 *   An all-zero row is skipped; a row where only element 0 is non-zero takes
 *   the __almost_empty_row shortcut.
 * Pass 2 (__col_loop): the 8 columns are transformed from column 7 down to
 *   column 0.  Results are descaled by COL_SHIFT and written back.
 *
 * In:      r0 = block
 * Out:     block is overwritten; no return value is produced
 * Saved:   r4-r11 and lr are pushed on entry; the final ldmfd restores
 *          r4-r11 and loads the saved lr straight into pc
 * Scratch: r0-r3, r12, condition flags
 * Stack:   9 saved registers + 8 bytes of locals (sp+0 = block, sp+4 unused)
 *
 * Notes:
 *  - Rows are read and written with 32-bit ldr/str and two half-words are
 *    packed per word, so the row pass is little-endian only.
 *  - NOTE(review): the word accesses presumably require block to be 4-byte
 *    aligned; confirm against the callers.
 *  - The constant table is addressed pc-relative with a single instruction.
 *    The distance to __constant_ptr__ must stay encodable as an ARM
 *    immediate, so do not add or remove instructions without rechecking.
 */
        @@ ---------------------------------------------------------------
        @@ prologue
        @@ ---------------------------------------------------------------
        @@ R0-R3 and R12 are scratch registers and need no saving, but R0 holds
        @@ block and is reused below, so block is kept in a stack slot.
        stmfd sp!, {r4-r11, r14} @ save r4-r11 and LR (R14)
        @@ at this point, R0=block, other registers are free.
        add r14, r0, #112        @ R14=&block[8*7]: start from the last row and step down until R14=block
        adr r12, __constant_ptr__ @ R12=address of the constant table (pc-relative)
        sub sp, sp, #8           @ room for 2 local words
        str r0, [sp, #0]         @ sp[0]=block
        @@ stack status
        @@ sp+4   free
        @@ sp+0   R0  (block)


        @@ at this point, R0=block, R14=&block[56], R12=__constant_ptr__, R1-R11 free


        @@ ---------------------------------------------------------------
        @@ pass 1: rows
        @@ ---------------------------------------------------------------
__row_loop:
        @@ Read the whole row as four 32-bit words. Each word holds two
        @@ coefficients: the even one in the low half, the odd one in the high half.
        ldr r1, [r14, #0]        @ R1=ROWr32[0] (row[0] | row[1]<<16)
        ldr r2, [r14, #4]        @ R2=ROWr32[1] (row[2] | row[3]<<16)
        ldr r3, [r14, #8]        @ R3=ROWr32[2] (row[4] | row[5]<<16)
        ldr r4, [r14, #12]       @ R4=ROWr32[3] (row[6] | row[7]<<16)
        @@ Classify the row:
        @@   everything zero        -> nothing to do (branch __end_row_loop)
        @@   only ROWr16[0] non-zero -> shortcut (branch __almost_empty_row)
        @@   otherwise              -> full computation
        orr r5, r4, r3           @ R5=R4 | R3
        orr r5, r5, r2           @ R5=R4 | R3 | R2
        orrs r6, r5, r1          @ flags from R5 | R1: Z set when the whole row is zero
        beq __end_row_loop
        mov r7, r1, asr #16      @ R7=ROWr16[1] (sign-extended), needed on both remaining paths
        ldrsh r6, [r14, #0]      @ R6=ROWr16[0]
        orrs r5, r5, r7          @ R5=R4 | R3 | R2 | R7: Z set when only ROWr16[0] is non-zero
        beq __almost_empty_row

__b_evaluation:
        @@ at this point, R0, R1, R5 (free), R2=ROWr32[1], R3=ROWr32[2], R4=ROWr32[3],
        @@     R6=ROWr16[0], R7=ROWr16[1], R8-R11 free,
        @@     R12=__constant_ptr__, R14=&block[n]
        @@ The odd part (b0-b3) is computed first, then the even part (a0-a3).

        @@ MUL16(b0, W1, row[1]);
        @@ MUL16(b1, W3, row[1]);
        @@ MUL16(b2, W5, row[1]);
        @@ MUL16(b3, W7, row[1]);
        @@ MAC16(b0, W3, row[3]);
        @@ MAC16(b1, -W7, row[3]);
        @@ MAC16(b2, -W1, row[3]);
        @@ MAC16(b3, -W5, row[3]);
        @@ The coefficient is always given as the second multiplier operand
        @@ (the original author notes this may save a cycle).
        ldr r8, [r12, #offW1]    @ R8=W1
        mov r2, r2, asr #16      @ R2=ROWr16[3]
        mul r0, r8, r7           @ R0=W1*ROWr16[1] (b0)
        ldr r9, [r12, #offW3]    @ R9=W3
        ldr r10, [r12, #offW5]   @ R10=W5
        mul r1, r9, r7           @ R1=W3*ROWr16[1] (b1)
        ldr r11, [r12, #offW7]   @ R11=W7
        mul r5, r10, r7          @ R5=W5*ROWr16[1] (b2)
        mul r7, r11, r7          @ R7=W7*ROWr16[1] (b3)
        teq r2, #0               @ ROWr16[3]==0 -> the conditional instructions below are skipped
        mlane r0, r9, r2, r0     @ R0+=W3*ROWr16[3] (b0)
        rsbne r2, r2, #0         @ R2=-ROWr16[3], so the next three accumulate a subtraction
        mlane r1, r11, r2, r1    @ R1-=W7*ROWr16[3] (b1)
        mlane r5, r8, r2, r5     @ R5-=W1*ROWr16[3] (b2)
        mlane r7, r10, r2, r7    @ R7-=W5*ROWr16[3] (b3)

        @@ at this point, R0=b0,  R1=b1, R2 (free), R3=ROWr32[2], R4=ROWr32[3],
        @@     R5=b2, R6=ROWr16[0], R7=b3, R8=W1, R9=W3, R10=W5, R11=W7,
        @@     R12=__constant_ptr__, R14=&block[n]
        @@ temp = ((uint32_t*)row)[2] | ((uint32_t*)row)[3];
        @@ if (temp != 0) {}
        orrs r2, r3, r4          @ R2=ROWr32[2] | ROWr32[3]; kept, it is tested again in __a_evaluation
        beq __end_b_evaluation   @ row[4..7] all zero: no further odd terms

        @@ MAC16(b0, W5, row[5]);
        @@ MAC16(b2, W7, row[5]);
        @@ MAC16(b3, W3, row[5]);
        @@ MAC16(b1, -W1, row[5]);
        @@ MAC16(b0, W7, row[7]);
        @@ MAC16(b2, W3, row[7]);
        @@ MAC16(b3, -W1, row[7]);
        @@ MAC16(b1, -W5, row[7]);
        mov r3, r3, asr #16      @ R3=ROWr16[5]
        teq r3, #0               @ ROWr16[5]==0 -> skip its products
        mlane r0, r10, r3, r0    @ R0+=W5*ROWr16[5] (b0)
        mov r4, r4, asr #16      @ R4=ROWr16[7] (mov without s: flags untouched)
        mlane r5, r11, r3, r5    @ R5+=W7*ROWr16[5] (b2)
        mlane r7, r9, r3, r7     @ R7+=W3*ROWr16[5] (b3)
        rsbne r3, r3, #0         @ R3=-ROWr16[5]
        mlane r1, r8, r3, r1     @ R1-=W1*ROWr16[5] (b1)
        @@ R3 is free now
        teq r4, #0               @ ROWr16[7]==0 -> skip its products
        mlane r0, r11, r4, r0    @ R0+=W7*ROWr16[7] (b0)
        mlane r5, r9, r4, r5     @ R5+=W3*ROWr16[7] (b2)
        rsbne r4, r4, #0         @ R4=-ROWr16[7]
        mlane r7, r8, r4, r7     @ R7-=W1*ROWr16[7] (b3)
        mlane r1, r10, r4, r1    @ R1-=W5*ROWr16[7] (b1)
        @@ R4 is free now
__end_b_evaluation:
        @@ at this point, R0=b0,  R1=b1, R2=ROWr32[2] | ROWr32[3] (tmp), R3 (free), R4 (free),
        @@     R5=b2, R6=ROWr16[0], R7=b3, R8 (free), R9 (free), R10 (free), R11 (free),
        @@     R12=__constant_ptr__, R14=&block[n]

__a_evaluation:
        @@ a0 = (W4 * row[0]) + (1 << (ROW_SHIFT - 1));
        @@ a1 = a0 + W6 * row[2];
        @@ a2 = a0 - W6 * row[2];
        @@ a3 = a0 - W2 * row[2];
        @@ a0 = a0 + W2 * row[2];
        ldr r9, [r12, #offW4]    @ R9=W4
        mul r6, r9, r6           @ R6=W4*ROWr16[0]
        ldr r10, [r12, #offW6]   @ R10=W6
        ldrsh r4, [r14, #4]      @ R4=ROWr16[2] (R4 becomes a3 later)
        add r6, r6, #ROW_SHIFTED_1 @ R6=W4*ROWr16[0] + (1<<(ROW_SHIFT-1)): common base of a0-a3

        mul r11, r10, r4         @ R11=W6*ROWr16[2]
        ldr r8, [r12, #offW2]    @ R8=W2
        sub r3, r6, r11          @ R3=base-W6*ROWr16[2] (a2)
        @@ temp = ((uint32_t*)row)[2] | ((uint32_t*)row)[3];
        @@ if (temp != 0) {}
        teq r2, #0               @ R2 still holds ROWr32[2] | ROWr32[3]
        beq __end_bef_a_evaluation @ row[4..7] all zero: finish a0-a3 out of line, skip row[4]/row[6]

        add r2, r6, r11          @ R2=base+W6*ROWr16[2] (a1)
        mul r11, r8, r4          @ R11=W2*ROWr16[2]
        sub r4, r6, r11          @ R4=base-W2*ROWr16[2] (a3)
        add r6, r6, r11          @ R6=base+W2*ROWr16[2] (a0)


        @@ at this point, R0=b0,  R1=b1, R2=a1, R3=a2, R4=a3,
        @@     R5=b2, R6=a0, R7=b3, R8=W2, R9=W4, R10=W6, R11 (free),
        @@     R12=__constant_ptr__, R14=&block[n]


        @@ a0 += W4*row[4]
        @@ a1 -= W4*row[4]
        @@ a2 -= W4*row[4]
        @@ a3 += W4*row[4]
        ldrsh r11, [r14, #8]     @ R11=ROWr16[4]
        teq r11, #0              @ ROWr16[4]==0 -> skip its contribution
        mulne r11, r9, r11       @ R11=W4*ROWr16[4]
        @@ R9 is free now
        ldrsh r9, [r14, #12]     @ R9=ROWr16[6] (a load leaves the flags untouched)
        addne r6, r6, r11        @ R6+=W4*ROWr16[4] (a0)
        subne r2, r2, r11        @ R2-=W4*ROWr16[4] (a1)
        subne r3, r3, r11        @ R3-=W4*ROWr16[4] (a2)
        addne r4, r4, r11        @ R4+=W4*ROWr16[4] (a3)
        @@ a0 += W6*row[6];
        @@ a3 -= W6*row[6];
        @@ a1 -= W2*row[6];
        @@ a2 += W2*row[6];
        teq r9, #0               @ ROWr16[6]==0 -> skip its contribution
        mulne r11, r10, r9       @ R11=W6*ROWr16[6]
        addne r6, r6, r11        @ R6+=W6*ROWr16[6] (a0)
        mulne r10, r8, r9        @ R10=W2*ROWr16[6] (W6 itself is no longer needed)
        subne r4, r4, r11        @ R4-=W6*ROWr16[6] (a3)
        subne r2, r2, r10        @ R2-=W2*ROWr16[6] (a1)
        addne r3, r3, r10        @ R3+=W2*ROWr16[6] (a2)

__end_a_evaluation:
        @@ at this point, R0=b0,  R1=b1, R2=a1, R3=a2, R4=a3,
        @@     R5=b2, R6=a0, R7=b3, R8 (free), R9 (free), R10 (free), R11 (free),
        @@     R12=__constant_ptr__, R14=&block[n]
        @@ row[0] = (a0 + b0) >> ROW_SHIFT;
        @@ row[1] = (a1 + b1) >> ROW_SHIFT;
        @@ row[2] = (a2 + b2) >> ROW_SHIFT;
        @@ row[3] = (a3 + b3) >> ROW_SHIFT;
        @@ row[4] = (a3 - b3) >> ROW_SHIFT;
        @@ row[5] = (a2 - b2) >> ROW_SHIFT;
        @@ row[6] = (a1 - b1) >> ROW_SHIFT;
        @@ row[7] = (a0 - b0) >> ROW_SHIFT;
        @@ Two results are packed per 32-bit store: the even element goes to the
        @@ low half-word (value >> ROW_SHIFT, masked), the odd element to the high
        @@ half-word (value << ROW_SHIFT2MSHW, masked). Little-endian only.
        add r8, r6, r0           @ R8=a0+b0
        add r9, r2, r1           @ R9=a1+b1
        ldr r10, [r12, #offMASK_MSHW] @ R10=0xFFFF0000
        and r9, r10, r9, lsl #ROW_SHIFT2MSHW @ R9=0xFFFF0000 & ((a1+b1)<<5)
        mvn r11, r10             @ R11=NOT R10=0x0000FFFF
        and r8, r11, r8, asr #ROW_SHIFT @ R8=0x0000FFFF & ((a0+b0)>>11)
        orr r8, r8, r9           @ R8=row[0] | row[1]<<16
        str r8, [r14, #0]        @ ROWr32[0]

        add r8, r3, r5           @ R8=a2+b2
        add r9, r4, r7           @ R9=a3+b3
        and r9, r10, r9, lsl #ROW_SHIFT2MSHW @ R9=0xFFFF0000 & ((a3+b3)<<5)
        and r8, r11, r8, asr #ROW_SHIFT @ R8=0x0000FFFF & ((a2+b2)>>11)
        orr r8, r8, r9           @ R8=row[2] | row[3]<<16
        str r8, [r14, #4]        @ ROWr32[1]

        sub r8, r4, r7           @ R8=a3-b3
        sub r9, r3, r5           @ R9=a2-b2
        and r9, r10, r9, lsl #ROW_SHIFT2MSHW @ R9=0xFFFF0000 & ((a2-b2)<<5)
        and r8, r11, r8, asr #ROW_SHIFT @ R8=0x0000FFFF & ((a3-b3)>>11)
        orr r8, r8, r9           @ R8=row[4] | row[5]<<16
        str r8, [r14, #8]        @ ROWr32[2]

        sub r8, r2, r1           @ R8=a1-b1
        sub r9, r6, r0           @ R9=a0-b0
        and r9, r10, r9, lsl #ROW_SHIFT2MSHW @ R9=0xFFFF0000 & ((a0-b0)<<5)
        and r8, r11, r8, asr #ROW_SHIFT @ R8=0x0000FFFF & ((a1-b1)>>11)
        orr r8, r8, r9           @ R8=row[6] | row[7]<<16
        str r8, [r14, #12]       @ ROWr32[3]

        bal __end_row_loop

__almost_empty_row:
        @@ Only ROWr16[0] is non-zero: every output of the row is ROWr16[0]<<3
        @@ truncated to 16 bits.
        @@ at this point, R14=&block[n], R12=__constant_ptr__, R1=ROWr32[0], R2=ROWr32[1],
        @@                R3=ROWr32[2], R4=ROWr32[3], R5=(temp), R6=ROWr16[0], R7=ROWr16[1],
        @@                R8-R11 free
        mov r8, #0x10000         @ build 0xFFFF in two data-processing steps instead of a load
        sub r8, r8, #1           @ R8=0xFFFF
        and r5, r8, r6, lsl #3   @ R5=(ROWr16[0]<<3) & 0xFFFF
        orr r5, r5, r5, lsl #16  @ R5=R5 | (R5<<16): same value in both half-words
        str r5, [r14, #0]        @ ROWr32[0]=R5
        str r5, [r14, #4]        @ ROWr32[1]=R5
        str r5, [r14, #8]        @ ROWr32[2]=R5
        str r5, [r14, #12]       @ ROWr32[3]=R5

__end_row_loop:
        @@ at this point, R0-R11 (free)
        @@     R12=__constant_ptr__, R14=&block[n]
        ldr r0, [sp, #0]         @ R0=block
        teq r0, r14              @ Z set when the row just processed was row 0
        sub r14, r14, #16        @ step to the previous row (no s suffix: flags from teq are kept)
        bne __row_loop



        @@ ---------------------------------------------------------------
        @@ pass 2: columns
        @@ ---------------------------------------------------------------
        @@ at this point, R0=block, R1-R11 (free)
        @@     R12=__constant_ptr__
        add r14, r0, #14         @ R14=&block[7]: start from the last column and step down until R14=block
__col_loop:

__b_evaluation2:
        @@ at this point, R0-R11 (free)
        @@     R12=__constant_ptr__, R14=&block[n] (top of the current column)
        @@ Element k of the column lives at byte offset 16*k from R14.
        @@ The odd part (b0-b3) is computed first, then the even part (a0-a3).
        @@ MUL16(b0, W1, col[8x1]);
        @@ MUL16(b1, W3, col[8x1]);
        @@ MUL16(b2, W5, col[8x1]);
        @@ MUL16(b3, W7, col[8x1]);
        @@ MAC16(b0, W3, col[8x3]);
        @@ MAC16(b1, -W7, col[8x3]);
        @@ MAC16(b2, -W1, col[8x3]);
        @@ MAC16(b3, -W5, col[8x3]);
        ldr r8, [r12, #offW1]    @ R8=W1
        ldrsh r7, [r14, #16]     @ R7=COLr16[8x1]
        mul r0, r8, r7           @ R0=W1*COLr16[8x1] (b0)
        ldr r9, [r12, #offW3]    @ R9=W3
        ldr r10, [r12, #offW5]   @ R10=W5
        mul r1, r9, r7           @ R1=W3*COLr16[8x1] (b1)
        ldr r11, [r12, #offW7]   @ R11=W7
        mul r5, r10, r7          @ R5=W5*COLr16[8x1] (b2)
        ldrsh r2, [r14, #48]     @ R2=COLr16[8x3]
        mul r7, r11, r7          @ R7=W7*COLr16[8x1] (b3)
        teq r2, #0               @ COLr16[8x3]==0 -> skip its products
        mlane r0, r9, r2, r0     @ R0+=W3*COLr16[8x3] (b0)
        rsbne r2, r2, #0         @ R2=-COLr16[8x3]
        mlane r1, r11, r2, r1    @ R1-=W7*COLr16[8x3] (b1)
        mlane r5, r8, r2, r5     @ R5-=W1*COLr16[8x3] (b2)
        mlane r7, r10, r2, r7    @ R7-=W5*COLr16[8x3] (b3)

        @@ at this point, R0=b0,  R1=b1, R2 (free), R3 (free), R4 (free),
        @@     R5=b2, R6 (free), R7=b3, R8=W1, R9=W3, R10=W5, R11=W7,
        @@     R12=__constant_ptr__, R14=&block[n]
        @@ MAC16(b0, W5, col[5x8]);
        @@ MAC16(b2, W7, col[5x8]);
        @@ MAC16(b3, W3, col[5x8]);
        @@ MAC16(b1, -W1, col[5x8]);
        @@ MAC16(b0, W7, col[7x8]);
        @@ MAC16(b2, W3, col[7x8]);
        @@ MAC16(b3, -W1, col[7x8]);
        @@ MAC16(b1, -W5, col[7x8]);
        ldrsh r3, [r14, #80]     @ R3=COLr16[5x8]
        teq r3, #0               @ COLr16[5x8]==0 -> skip its products
        mlane r0, r10, r3, r0    @ R0+=W5*COLr16[5x8] (b0)
        mlane r5, r11, r3, r5    @ R5+=W7*COLr16[5x8] (b2)
        mlane r7, r9, r3, r7     @ R7+=W3*COLr16[5x8] (b3)
        rsbne r3, r3, #0         @ R3=-COLr16[5x8]
        ldrsh r4, [r14, #112]    @ R4=COLr16[7x8] (a load leaves the flags untouched)
        mlane r1, r8, r3, r1     @ R1-=W1*COLr16[5x8] (b1)
        @@ R3 is free now
        teq r4, #0               @ COLr16[7x8]==0 -> skip its products
        mlane r0, r11, r4, r0    @ R0+=W7*COLr16[7x8] (b0)
        mlane r5, r9, r4, r5     @ R5+=W3*COLr16[7x8] (b2)
        rsbne r4, r4, #0         @ R4=-COLr16[7x8]
        mlane r7, r8, r4, r7     @ R7-=W1*COLr16[7x8] (b3)
        mlane r1, r10, r4, r1    @ R1-=W5*COLr16[7x8] (b1)
        @@ R4 is free now
__end_b_evaluation2:
        @@ at this point, R0=b0,  R1=b1, R2 (free), R3 (free), R4 (free),
        @@     R5=b2, R6 (free), R7=b3, R8 (free), R9 (free), R10 (free), R11 (free),
        @@     R12=__constant_ptr__, R14=&block[n]

__a_evaluation2:
        @@ a0 = (W4 * col[8x0]) + (1 << (COL_SHIFT - 1));
        @@ a1 = a0 + W6 * col[8x2];
        @@ a2 = a0 - W6 * col[8x2];
        @@ a3 = a0 - W2 * col[8x2];
        @@ a0 = a0 + W2 * col[8x2];
        ldrsh r6, [r14, #0]      @ R6=COLr16[8x0]
        ldr r9, [r12, #offW4]    @ R9=W4
        mul r6, r9, r6           @ R6=W4*COLr16[8x0]
        ldr r10, [r12, #offW6]   @ R10=W6
        ldrsh r4, [r14, #32]     @ R4=COLr16[8x2] (R4 becomes a3 later)
        add r6, r6, #COL_SHIFTED_1 @ R6=W4*COLr16[8x0] + (1<<(COL_SHIFT-1)): common base of a0-a3
        mul r11, r10, r4         @ R11=W6*COLr16[8x2]
        ldr r8, [r12, #offW2]    @ R8=W2
        add r2, r6, r11          @ R2=base+W6*COLr16[8x2] (a1)
        sub r3, r6, r11          @ R3=base-W6*COLr16[8x2] (a2)
        mul r11, r8, r4          @ R11=W2*COLr16[8x2]
        sub r4, r6, r11          @ R4=base-W2*COLr16[8x2] (a3)
        add r6, r6, r11          @ R6=base+W2*COLr16[8x2] (a0)

        @@ at this point, R0=b0,  R1=b1, R2=a1, R3=a2, R4=a3,
        @@     R5=b2, R6=a0, R7=b3, R8=W2, R9=W4, R10=W6, R11 (free),
        @@     R12=__constant_ptr__, R14=&block[n]
        @@ a0 += W4*col[8x4]
        @@ a1 -= W4*col[8x4]
        @@ a2 -= W4*col[8x4]
        @@ a3 += W4*col[8x4]
        ldrsh r11, [r14, #64]    @ R11=COLr16[8x4]
        teq r11, #0              @ COLr16[8x4]==0 -> skip its contribution
        mulne r11, r9, r11       @ R11=W4*COLr16[8x4]
        @@ R9 is free now
        addne r6, r6, r11        @ R6+=W4*COLr16[8x4] (a0)
        subne r2, r2, r11        @ R2-=W4*COLr16[8x4] (a1)
        subne r3, r3, r11        @ R3-=W4*COLr16[8x4] (a2)
        ldrsh r9, [r14, #96]     @ R9=COLr16[8x6] (a load leaves the flags untouched)
        addne r4, r4, r11        @ R4+=W4*COLr16[8x4] (a3)
        @@ a0 += W6*col[8x6];
        @@ a3 -= W6*col[8x6];
        @@ a1 -= W2*col[8x6];
        @@ a2 += W2*col[8x6];
        teq r9, #0               @ COLr16[8x6]==0 -> skip its contribution
        mulne r11, r10, r9       @ R11=W6*COLr16[8x6]
        addne r6, r6, r11        @ R6+=W6*COLr16[8x6] (a0)
        mulne r10, r8, r9        @ R10=W2*COLr16[8x6] (W6 itself is no longer needed)
        subne r4, r4, r11        @ R4-=W6*COLr16[8x6] (a3)
        subne r2, r2, r10        @ R2-=W2*COLr16[8x6] (a1)
        addne r3, r3, r10        @ R3+=W2*COLr16[8x6] (a2)
__end_a_evaluation2:
        @@ at this point, R0=b0,  R1=b1, R2=a1, R3=a2, R4=a3,
        @@     R5=b2, R6=a0, R7=b3, R8 (free), R9 (free), R10 (free), R11 (free),
        @@     R12=__constant_ptr__, R14=&block[n]
        @@ col[0 ] = ((a0 + b0) >> COL_SHIFT);
        @@ col[8 ] = ((a1 + b1) >> COL_SHIFT);
        @@ col[16] = ((a2 + b2) >> COL_SHIFT);
        @@ col[24] = ((a3 + b3) >> COL_SHIFT);
        @@ col[32] = ((a3 - b3) >> COL_SHIFT);
        @@ col[40] = ((a2 - b2) >> COL_SHIFT);
        @@ col[48] = ((a1 - b1) >> COL_SHIFT);
        @@ col[56] = ((a0 - b0) >> COL_SHIFT);
        @@ Each result is stored with its own half-word store (no packing here).
        add r8, r6, r0           @ R8=a0+b0
        add r9, r2, r1           @ R9=a1+b1
        mov r8, r8, asr #COL_SHIFT
        mov r9, r9, asr #COL_SHIFT
        strh r8, [r14, #0]       @ col[0]
        strh r9, [r14, #16]      @ col[8]
        add r8, r3, r5           @ R8=a2+b2
        add r9, r4, r7           @ R9=a3+b3
        mov r8, r8, asr #COL_SHIFT
        mov r9, r9, asr #COL_SHIFT
        strh r8, [r14, #32]      @ col[16]
        strh r9, [r14, #48]      @ col[24]
        sub r8, r4, r7           @ R8=a3-b3
        sub r9, r3, r5           @ R9=a2-b2
        mov r8, r8, asr #COL_SHIFT
        mov r9, r9, asr #COL_SHIFT
        strh r8, [r14, #64]      @ col[32]
        strh r9, [r14, #80]      @ col[40]
        sub r8, r2, r1           @ R8=a1-b1
        sub r9, r6, r0           @ R9=a0-b0
        mov r8, r8, asr #COL_SHIFT
        mov r9, r9, asr #COL_SHIFT
        strh r8, [r14, #96]      @ col[48]
        strh r9, [r14, #112]     @ col[56]

__end_col_loop:
        @@ at this point, R0-R11 (free)
        @@     R12=__constant_ptr__, R14=&block[n]
        ldr r0, [sp, #0]         @ R0=block
        teq r0, r14              @ Z set when the column just processed was column 0
        sub r14, r14, #2         @ step to the previous column (flags from teq are kept)
        bne __col_loop




__end_simple_idct_ARM:
        @@ epilogue: drop the locals, restore the saved registers and return
        add sp, sp, #8           @ release the 2 local words
        ldmfd sp!, {r4-r11, r15} @ restore r4-r11 and load the saved LR into PC



@@ Out-of-line tail of __a_evaluation, taken when row[4..7] are all zero.
@@ It finishes a1, a3 and a0 exactly like the in-line code and rejoins at
@@ __end_a_evaluation, so the row[4]/row[6] terms are skipped.
@@ Kept here so the common case falls straight through.
__end_bef_a_evaluation:
        add r2, r6, r11          @ R2=base+W6*ROWr16[2] (a1)
        mul r11, r8, r4          @ R11=W2*ROWr16[2]
        sub r4, r6, r11          @ R4=base-W2*ROWr16[2] (a3)
        add r6, r6, r11          @ R6=base+W2*ROWr16[2] (a0)
        bal __end_a_evaluation


@@ Constant table addressed through R12; the entry order must match the off*
@@ definitions at the top of the file. The label is placed after the
@@ alignment directive so that it always names the first table word.
        .align
__constant_ptr__:
        .word   W1
        .word   W2
        .word   W3
        .word   W4
        .word   W5
        .word   W6
        .word   W7
        .word   MASK_MSHW
