/*
 * Copyright (C) 2002 Frederic 'dilb' Boulay
 *
 * Author: Frederic Boulay <dilb@handhelds.org>
 *
 * The function defined in this file is derived from the simple_idct function
 * from the libavcodec library part of the Libav project.
 *
 * This file is part of Libav.
 *
 * Libav is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * Libav is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with Libav; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "asm.S"

/* Useful constants for the algorithm; they are saved in __constant_ptr__ at */
/* the end of the source code. */
#define W1  22725
#define W2  21407
#define W3  19266
#define W4  16383
#define W5  12873
#define W6  8867
#define W7  4520
#define MASK_MSHW 0xFFFF0000
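
/* A sketch of where these values come from (an added note, not part of the
 * original comments): Wi is approximately round(sqrt(2) * cos(i*pi/16) * 2^14),
 * e.g. W1 = 22725 ~= 23170.5 * 0.98079, with W4 kept at 16383 rather than
 * 16384 to reduce the risk of overflow. MASK_MSHW selects the most
 * significant halfword of a 32-bit word. */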

/* offsets of the constants in the vector */
#define offW1  0
#define offW2  4
#define offW3  8
#define offW4  12
#define offW5  16
#define offW6  20
#define offW7  24
#define offMASK_MSHW 28

#define ROW_SHIFT 11
#define ROW_SHIFT2MSHW (16-11)
#define COL_SHIFT 20
#define ROW_SHIFTED_1 1024   /* 1 << (ROW_SHIFT-1) */
#define COL_SHIFTED_1 524288 /* 1 << (COL_SHIFT-1) */
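
/* How the shift constants are used (an added sketch, my reading of the code):
 * a fixed-point value is rounded to nearest while being downshifted by adding
 * half of the divisor first, i.e.
 *     out = (x + ROW_SHIFTED_1) >> ROW_SHIFT;    // (x + 2^10) >> 11
 *     out = (x + COL_SHIFTED_1) >> COL_SHIFT;    // (x + 2^19) >> 20
 * ROW_SHIFT2MSHW = 16-ROW_SHIFT: left-shifting a row result by it places the
 * 16-bit quotient directly in the upper halfword of a 32-bit register. */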


function ff_simple_idct_arm, export=1
        @@ void ff_simple_idct_arm(int16_t *block)
        @@ save the registers we need (take all of them):
        @@ R0-R3 are scratch registers, so they need not be saved, but R0 holds the
        @@ pointer to block, so it must not be overwritten before it is saved!
        @@ R12 is another scratch register, so it need not be saved either.
        @@ save all callee-saved registers
        stmfd sp!, {r4-r11, r14} @ R14 is also called LR
        @@ at this point, R0=block, other registers are free.
        add r14, r0, #112        @ R14=&block[8*7]; better to start from the last row and decrease the value until row=0, i.e. R14=block.
        adr r12, __constant_ptr__ @ R12=__constant_ptr__, the vector containing the constants; probably not necessary to reserve a register for it
        @@ reserve 2 words of temporary storage on the stack (only sp[0] is used, to hold block)
        sub sp, sp, #8          @ allow 2 local variables
        str r0, [sp, #0]        @ save block in sp[0]
        @@ stack status
        @@ sp+4   free
        @@ sp+0   R0  (block)


        @@ at this point, R0=block, R14=&block[56], R12=__const_ptr_, R1-R11 free


__row_loop:
        @@ read the row and check whether it is null, almost null, or neither. According to the
        @@ StrongARM specs it is not necessary to optimize the ldr accesses (i.e. to split each
        @@ 32-bit load into two 16-bit loads); at least this way gives more usable registers :)
        ldr r1, [r14, #0]        @ R1=(int32)(R14)[0]=ROWr32[0] (relative row cast to a 32b pointer)
        ldr r2, [r14, #4]        @ R2=(int32)(R14)[1]=ROWr32[1]
        ldr r3, [r14, #8]        @ R3=ROWr32[2]
        ldr r4, [r14, #12]       @ R4=ROWr32[3]
        @@ check whether the words are null: if all of them are null, proceed with the next row (branch __end_row_loop);
        @@ if ROWr16[0] is the only one that is not null, proceed with this special case (branch __almost_empty_row);
        @@ else follow the complete algorithm.
        @@ at this point, R0=block, R14=&block[n], R12=__const_ptr_, R1=ROWr32[0], R2=ROWr32[1],
        @@                R3=ROWr32[2], R4=ROWr32[3], R5-R11 free
        orr r5, r4, r3           @ R5=R4 | R3
        orr r5, r5, r2           @ R5=R4 | R3 | R2
        orrs r6, r5, r1          @ Test R5 | R1 (the aim is to check whether everything is null)
        beq __end_row_loop
        mov r7, r1, asr #16      @ R7=R1>>16=ROWr16[1] (evaluate it now, as it could be useful later)
        ldrsh r6, [r14, #0]      @ R6=ROWr16[0]
        orrs r5, r5, r7          @ R5=R4 | R3 | R2 | R7
        beq __almost_empty_row

__b_evaluation:
        @@ at this point, R0=block (temp),  R1(free), R2=ROWr32[1], R3=ROWr32[2], R4=ROWr32[3],
        @@     R5=(temp), R6=ROWr16[0], R7=ROWr16[1], R8-R11 free,
        @@     R12=__const_ptr_, R14=&block[n]
        @@ to save some registers/calls, proceed with b0-b3 first, followed by a0-a3
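        @@ The MUL16/MAC16 macros quoted in the comments below come from the C
        @@ reference (simple_idct); they are not defined in this file, but
        @@ presumably expand to:
        @@     MUL16(rt, ra, rb):  rt  = ra * rb;
        @@     MAC16(rt, ra, rb):  rt += ra * rb;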
        @@ MUL16(b0, W1, row[1]);
        @@ MUL16(b1, W3, row[1]);
        @@ MUL16(b2, W5, row[1]);
        @@ MUL16(b3, W7, row[1]);
        @@ MAC16(b0, W3, row[3]);
        @@ MAC16(b1, -W7, row[3]);
        @@ MAC16(b2, -W1, row[3]);
        @@ MAC16(b3, -W5, row[3]);
        ldr r8, [r12, #offW1]    @ R8=W1
        mov r2, r2, asr #16      @ R2=ROWr16[3]
        mul r0, r8, r7           @ R0=W1*ROWr16[1]=b0 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle)
        ldr r9, [r12, #offW3]    @ R9=W3
        ldr r10, [r12, #offW5]   @ R10=W5
        mul r1, r9, r7           @ R1=W3*ROWr16[1]=b1 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle)
        ldr r11, [r12, #offW7]   @ R11=W7
        mul r5, r10, r7          @ R5=W5*ROWr16[1]=b2 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle)
        mul r7, r11, r7          @ R7=W7*ROWr16[1]=b3 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle)
        teq r2, #0               @ if null avoid muls
        itttt ne
        mlane r0, r9, r2, r0     @ R0+=W3*ROWr16[3]=b0 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
        rsbne r2, r2, #0         @ R2=-ROWr16[3]
        mlane r1, r11, r2, r1    @ R1-=W7*ROWr16[3]=b1 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
        mlane r5, r8, r2, r5     @ R5-=W1*ROWr16[3]=b2 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
        it    ne
        mlane r7, r10, r2, r7    @ R7-=W5*ROWr16[3]=b3 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)

        @@ at this point, R0=b0,  R1=b1, R2 (free), R3=ROWr32[2], R4=ROWr32[3],
        @@     R5=b2, R6=ROWr16[0], R7=b3, R8=W1, R9=W3, R10=W5, R11=W7,
        @@     R12=__const_ptr_, R14=&block[n]
        @@ temp = ((uint32_t*)row)[2] | ((uint32_t*)row)[3];
        @@ if (temp != 0) {}
        orrs r2, r3, r4          @ R2=ROWr32[2] | ROWr32[3]
        beq __end_b_evaluation

        @@ at this point, R0=b0,  R1=b1, R2 (free), R3=ROWr32[2], R4=ROWr32[3],
        @@     R5=b2, R6=ROWr16[0], R7=b3, R8=W1, R9=W3, R10=W5, R11=W7,
        @@     R12=__const_ptr_, R14=&block[n]
        @@ MAC16(b0, W5, row[5]);
        @@ MAC16(b2, W7, row[5]);
        @@ MAC16(b3, W3, row[5]);
        @@ MAC16(b1, -W1, row[5]);
        @@ MAC16(b0, W7, row[7]);
        @@ MAC16(b2, W3, row[7]);
        @@ MAC16(b3, -W1, row[7]);
        @@ MAC16(b1, -W5, row[7]);
        mov r3, r3, asr #16      @ R3=ROWr16[5]
        teq r3, #0               @ if null avoid muls
        it    ne
        mlane r0, r10, r3, r0    @ R0+=W5*ROWr16[5]=b0
        mov r4, r4, asr #16      @ R4=ROWr16[7]
        itttt ne
        mlane r5, r11, r3, r5    @ R5+=W7*ROWr16[5]=b2
        mlane r7, r9, r3, r7     @ R7+=W3*ROWr16[5]=b3
        rsbne r3, r3, #0         @ R3=-ROWr16[5]
        mlane r1, r8, r3, r1     @ R1-=W1*ROWr16[5]=b1
        @@ R3 is free now
        teq r4, #0               @ if null avoid muls
        itttt ne
        mlane r0, r11, r4, r0    @ R0+=W7*ROWr16[7]=b0
        mlane r5, r9, r4, r5     @ R5+=W3*ROWr16[7]=b2
        rsbne r4, r4, #0         @ R4=-ROWr16[7]
        mlane r7, r8, r4, r7     @ R7-=W1*ROWr16[7]=b3
        it    ne
        mlane r1, r10, r4, r1    @ R1-=W5*ROWr16[7]=b1
        @@ R4 is free now
__end_b_evaluation:
        @@ at this point, R0=b0,  R1=b1, R2=ROWr32[2] | ROWr32[3] (tmp), R3 (free), R4 (free),
        @@     R5=b2, R6=ROWr16[0], R7=b3, R8 (free), R9 (free), R10 (free), R11 (free),
        @@     R12=__const_ptr_, R14=&block[n]

__a_evaluation:
        @@ a0 = (W4 * row[0]) + (1 << (ROW_SHIFT - 1));
        @@ a1 = a0 + W6 * row[2];
        @@ a2 = a0 - W6 * row[2];
        @@ a3 = a0 - W2 * row[2];
        @@ a0 = a0 + W2 * row[2];
        ldr r9, [r12, #offW4]    @ R9=W4
        mul r6, r9, r6           @ R6=W4*ROWr16[0]
        ldr r10, [r12, #offW6]   @ R10=W6
        ldrsh r4, [r14, #4]      @ R4=ROWr16[2] (a3 not defined yet)
        add r6, r6, #ROW_SHIFTED_1 @ R6=W4*ROWr16[0] + 1<<(ROW_SHIFT-1) (a0)

        mul r11, r10, r4         @ R11=W6*ROWr16[2]
        ldr r8, [r12, #offW2]    @ R8=W2
        sub r3, r6, r11          @ R3=a0-W6*ROWr16[2] (a2)
        @@ temp = ((uint32_t*)row)[2] | ((uint32_t*)row)[3];
        @@ if (temp != 0) {}
        teq r2, #0
        beq __end_bef_a_evaluation
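        @@ (an added reasoning sketch) R2 still holds ROWr32[2]|ROWr32[3], i.e.
        @@ row[4..7] packed; if it is zero, the W4*row[4] and W2/W6*row[6]
        @@ terms below all vanish, so __end_bef_a_evaluation only finishes
        @@ a1/a3/a0 and jumps straight back to __end_a_evaluation.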

        add r2, r6, r11          @ R2=a0+W6*ROWr16[2] (a1)
        mul r11, r8, r4          @ R11=W2*ROWr16[2]
        sub r4, r6, r11          @ R4=a0-W2*ROWr16[2] (a3)
        add r6, r6, r11          @ R6=a0+W2*ROWr16[2] (a0)


        @@ at this point, R0=b0,  R1=b1, R2=a1, R3=a2, R4=a3,
        @@     R5=b2, R6=a0, R7=b3, R8=W2, R9=W4, R10=W6, R11 (free),
        @@     R12=__const_ptr_, R14=&block[n]


        @@ a0 += W4*row[4]
        @@ a1 -= W4*row[4]
        @@ a2 -= W4*row[4]
        @@ a3 += W4*row[4]
        ldrsh r11, [r14, #8]     @ R11=ROWr16[4]
        teq r11, #0              @ if null avoid muls
        it    ne
        mulne r11, r9, r11       @ R11=W4*ROWr16[4]
        @@ R9 is free now
        ldrsh r9, [r14, #12]     @ R9=ROWr16[6]
        itttt ne
        addne r6, r6, r11        @ R6+=W4*ROWr16[4] (a0)
        subne r2, r2, r11        @ R2-=W4*ROWr16[4] (a1)
        subne r3, r3, r11        @ R3-=W4*ROWr16[4] (a2)
        addne r4, r4, r11        @ R4+=W4*ROWr16[4] (a3)
        @@ W6 is no longer needed on its own, so reuse R10 to hold W2*ROWr16[6] instead
        teq r9, #0               @ if null avoid muls
        itttt ne
        mulne r11, r10, r9       @ R11=W6*ROWr16[6]
        addne r6, r6, r11        @ R6+=W6*ROWr16[6] (a0)
        mulne r10, r8, r9        @ R10=W2*ROWr16[6]
        @@ a0 += W6*row[6];
        @@ a3 -= W6*row[6];
        @@ a1 -= W2*row[6];
        @@ a2 += W2*row[6];
        subne r4, r4, r11        @ R4-=W6*ROWr16[6] (a3)
        itt   ne
        subne r2, r2, r10        @ R2-=W2*ROWr16[6] (a1)
        addne r3, r3, r10        @ R3+=W2*ROWr16[6] (a2)

__end_a_evaluation:
        @@ at this point, R0=b0,  R1=b1, R2=a1, R3=a2, R4=a3,
        @@     R5=b2, R6=a0, R7=b3, R8 (free), R9 (free), R10 (free), R11 (free),
        @@     R12=__const_ptr_, R14=&block[n]
        @@ row[0] = (a0 + b0) >> ROW_SHIFT;
        @@ row[1] = (a1 + b1) >> ROW_SHIFT;
        @@ row[2] = (a2 + b2) >> ROW_SHIFT;
        @@ row[3] = (a3 + b3) >> ROW_SHIFT;
        @@ row[4] = (a3 - b3) >> ROW_SHIFT;
        @@ row[5] = (a2 - b2) >> ROW_SHIFT;
        @@ row[6] = (a1 - b1) >> ROW_SHIFT;
        @@ row[7] = (a0 - b0) >> ROW_SHIFT;
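        @@ (an added note) the outputs form a butterfly: the even part a[i] and
        @@ the odd part b[i] combine as row[i]=a[i]+b[i] and row[7-i]=a[i]-b[i],
        @@ the usual even/odd decomposition of the 8-point IDCT.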
        add r8, r6, r0           @ R8=a0+b0
        add r9, r2, r1           @ R9=a1+b1
        @@ pack two 16-bit halfwords into one 32-bit word
        @@ ROWr32[0]=ROWr16[0] | (ROWr16[1]<<16) (little-endian only!)
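        @@ in C, this packing is roughly (an added sketch, little-endian only):
        @@     ((uint32_t *)row)[0] = (((a0 + b0) >> ROW_SHIFT) & 0xFFFF)
        @@                          | (((a1 + b1) << ROW_SHIFT2MSHW) & 0xFFFF0000);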
        ldr r10, [r12, #offMASK_MSHW] @ R10=0xFFFF0000
        and r9, r10, r9, lsl #ROW_SHIFT2MSHW @ R9=0xFFFF0000 & ((a1+b1)<<5)
        mvn r11, r10             @ R11= NOT R10= 0x0000FFFF
        and r8, r11, r8, asr #ROW_SHIFT @ R8=0x0000FFFF & ((a0+b0)>>11)
        orr r8, r8, r9
        str r8, [r14, #0]

        add r8, r3, r5           @ R8=a2+b2
        add r9, r4, r7           @ R9=a3+b3
        and r9, r10, r9, lsl #ROW_SHIFT2MSHW @ R9=0xFFFF0000 & ((a3+b3)<<5)
        and r8, r11, r8, asr #ROW_SHIFT @ R8=0x0000FFFF & ((a2+b2)>>11)
        orr r8, r8, r9
        str r8, [r14, #4]

        sub r8, r4, r7           @ R8=a3-b3
        sub r9, r3, r5           @ R9=a2-b2
        and r9, r10, r9, lsl #ROW_SHIFT2MSHW @ R9=0xFFFF0000 & ((a2-b2)<<5)
        and r8, r11, r8, asr #ROW_SHIFT @ R8=0x0000FFFF & ((a3-b3)>>11)
        orr r8, r8, r9
        str r8, [r14, #8]

        sub r8, r2, r1           @ R8=a1-b1
        sub r9, r6, r0           @ R9=a0-b0
        and r9, r10, r9, lsl #ROW_SHIFT2MSHW @ R9=0xFFFF0000 & ((a0-b0)<<5)
        and r8, r11, r8, asr #ROW_SHIFT @ R8=0x0000FFFF & ((a1-b1)>>11)
        orr r8, r8, r9
        str r8, [r14, #12]

        bal __end_row_loop

__almost_empty_row:
        @@ the row was empty except for ROWr16[0]; handle this special case
        @@ at this point, R0=block, R14=&block[n], R12=__const_ptr_, R1=ROWr32[0], R2=ROWr32[1],
        @@                R3=ROWr32[2], R4=ROWr32[3], R5=(temp), R6=ROWr16[0], R7=ROWr16[1],
        @@                R8=0xFFFF (temp), R9-R11 free
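        @@ (an added reasoning sketch) with only row[0] non-zero, every output
        @@ of the row is (W4*row[0] + ROW_SHIFTED_1) >> ROW_SHIFT; since
        @@ W4 = 16383 ~= 2^14 and ROW_SHIFT = 11, this is row[0] << 3 (up to
        @@ rounding), so the code below just shifts once and duplicates.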
        mov r8, #0x10000         @ building R8=0xFFFF takes 2 steps, but still saves an ldr (and its load delay)
        sub r8, r8, #1           @ R8 is now ready.
        and r5, r8, r6, lsl #3   @ R5=R8 & (R6<<3)= (ROWr16[0]<<3) & 0xFFFF
        orr r5, r5, r5, lsl #16  @ R5=R5 | (R5<<16)
        str r5, [r14, #0]        @ R14[0]=ROWr32[0]=R5
        str r5, [r14, #4]        @ R14[4]=ROWr32[1]=R5
        str r5, [r14, #8]        @ R14[8]=ROWr32[2]=R5
        str r5, [r14, #12]       @ R14[12]=ROWr32[3]=R5

__end_row_loop:
        @@ at this point, R0-R11 (free)
        @@     R12=__const_ptr_, R14=&block[n]
        ldr r0, [sp, #0]         @ R0=block
        teq r0, r14              @ compare the current &block[8*n] to block; when block is reached, the loop is finished.
        sub r14, r14, #16
        bne __row_loop



        @@ at this point, R0=block, R1-R11 (free)
        @@     R12=__const_ptr_, R14=&block[n]
        add r14, r0, #14        @ R14=&block[7]; better to start from the last column and decrease the value until col=0, i.e. R14=block.
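        @@ (an added layout note) the block is 8x8 int16_t, so one step down a
        @@ column is 8*sizeof(int16_t) = 16 bytes: col[8*i] lives at byte
        @@ offset 16*i from R14, hence the #16/#32/.../#112 offsets used below.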
__col_loop:

__b_evaluation2:
        @@ at this point, R0=block (temp),  R1-R11 (free)
        @@     R12=__const_ptr_, R14=&block[n]
        @@ proceed with b0-b3 first, followed by a0-a3
        @@ MUL16(b0, W1, col[8x1]);
        @@ MUL16(b1, W3, col[8x1]);
        @@ MUL16(b2, W5, col[8x1]);
        @@ MUL16(b3, W7, col[8x1]);
        @@ MAC16(b0, W3, col[8x3]);
        @@ MAC16(b1, -W7, col[8x3]);
        @@ MAC16(b2, -W1, col[8x3]);
        @@ MAC16(b3, -W5, col[8x3]);
        ldr r8, [r12, #offW1]    @ R8=W1
        ldrsh r7, [r14, #16]     @ R7=COLr16[8x1]
        mul r0, r8, r7           @ R0=W1*COLr16[8x1]=b0 (COLr16[8x1] must be the second arg, to have the possibility to save 1 cycle)
        ldr r9, [r12, #offW3]    @ R9=W3
        ldr r10, [r12, #offW5]   @ R10=W5
        mul r1, r9, r7           @ R1=W3*COLr16[8x1]=b1 (COLr16[8x1] must be the second arg, to have the possibility to save 1 cycle)
        ldr r11, [r12, #offW7]   @ R11=W7
        mul r5, r10, r7          @ R5=W5*COLr16[8x1]=b2 (COLr16[8x1] must be the second arg, to have the possibility to save 1 cycle)
        ldrsh r2, [r14, #48]     @ R2=COLr16[8x3]
        mul r7, r11, r7          @ R7=W7*COLr16[8x1]=b3 (COLr16[8x1] must be the second arg, to have the possibility to save 1 cycle)
        teq r2, #0               @ if 0, then avoid muls
        itttt ne
        mlane r0, r9, r2, r0     @ R0+=W3*COLr16[8x3]=b0 (COLr16[8x3] must be the second arg, to have the possibility to save 1 cycle)
        rsbne r2, r2, #0         @ R2=-COLr16[8x3]
        mlane r1, r11, r2, r1    @ R1-=W7*COLr16[8x3]=b1 (COLr16[8x3] must be the second arg, to have the possibility to save 1 cycle)
        mlane r5, r8, r2, r5     @ R5-=W1*COLr16[8x3]=b2 (COLr16[8x3] must be the second arg, to have the possibility to save 1 cycle)
        it    ne
        mlane r7, r10, r2, r7    @ R7-=W5*COLr16[8x3]=b3 (COLr16[8x3] must be the second arg, to have the possibility to save 1 cycle)

        @@ at this point, R0=b0,  R1=b1, R2 (free), R3 (free), R4 (free),
        @@     R5=b2, R6 (free), R7=b3, R8=W1, R9=W3, R10=W5, R11=W7,
        @@     R12=__const_ptr_, R14=&block[n]
        @@ MAC16(b0, W5, col[8x5]);
        @@ MAC16(b2, W7, col[8x5]);
        @@ MAC16(b3, W3, col[8x5]);
        @@ MAC16(b1, -W1, col[8x5]);
        @@ MAC16(b0, W7, col[8x7]);
        @@ MAC16(b2, W3, col[8x7]);
        @@ MAC16(b3, -W1, col[8x7]);
        @@ MAC16(b1, -W5, col[8x7]);
        ldrsh r3, [r14, #80]     @ R3=COLr16[8x5]
        teq r3, #0               @ if 0 then avoid muls
        itttt ne
        mlane r0, r10, r3, r0    @ R0+=W5*COLr16[8x5]=b0
        mlane r5, r11, r3, r5    @ R5+=W7*COLr16[8x5]=b2
        mlane r7, r9, r3, r7     @ R7+=W3*COLr16[8x5]=b3
        rsbne r3, r3, #0         @ R3=-COLr16[8x5]
        ldrsh r4, [r14, #112]    @ R4=COLr16[8x7]
        it    ne
        mlane r1, r8, r3, r1     @ R1-=W1*COLr16[8x5]=b1
        @@ R3 is free now
        teq r4, #0               @ if 0 then avoid muls
        itttt ne
        mlane r0, r11, r4, r0    @ R0+=W7*COLr16[8x7]=b0
        mlane r5, r9, r4, r5     @ R5+=W3*COLr16[8x7]=b2
        rsbne r4, r4, #0         @ R4=-COLr16[8x7]
        mlane r7, r8, r4, r7     @ R7-=W1*COLr16[8x7]=b3
        it    ne
        mlane r1, r10, r4, r1    @ R1-=W5*COLr16[8x7]=b1
        @@ R4 is free now
__end_b_evaluation2:
        @@ at this point, R0=b0,  R1=b1, R2 (free), R3 (free), R4 (free),
        @@     R5=b2, R6 (free), R7=b3, R8 (free), R9 (free), R10 (free), R11 (free),
        @@     R12=__const_ptr_, R14=&block[n]

__a_evaluation2:
        @@ a0 = (W4 * col[8x0]) + (1 << (COL_SHIFT - 1));
        @@ a1 = a0 + W6 * col[8x2];
        @@ a2 = a0 - W6 * col[8x2];
        @@ a3 = a0 - W2 * col[8x2];
        @@ a0 = a0 + W2 * col[8x2];
        ldrsh r6, [r14, #0]      @ R6=COLr16[8x0]
        ldr r9, [r12, #offW4]    @ R9=W4
        mul r6, r9, r6           @ R6=W4*COLr16[8x0]
        ldr r10, [r12, #offW6]   @ R10=W6
        ldrsh r4, [r14, #32]     @ R4=COLr16[8x2] (a3 not defined yet)
        add r6, r6, #COL_SHIFTED_1 @ R6=W4*COLr16[8x0] + 1<<(COL_SHIFT-1) (a0)
        mul r11, r10, r4         @ R11=W6*COLr16[8x2]
        ldr r8, [r12, #offW2]    @ R8=W2
        add r2, r6, r11          @ R2=a0+W6*COLr16[8x2] (a1)
        sub r3, r6, r11          @ R3=a0-W6*COLr16[8x2] (a2)
        mul r11, r8, r4          @ R11=W2*COLr16[8x2]
        sub r4, r6, r11          @ R4=a0-W2*COLr16[8x2] (a3)
        add r6, r6, r11          @ R6=a0+W2*COLr16[8x2] (a0)

        @@ at this point, R0=b0,  R1=b1, R2=a1, R3=a2, R4=a3,
        @@     R5=b2, R6=a0, R7=b3, R8=W2, R9=W4, R10=W6, R11 (free),
        @@     R12=__const_ptr_, R14=&block[n]
        @@ a0 += W4*col[8x4]
        @@ a1 -= W4*col[8x4]
        @@ a2 -= W4*col[8x4]
        @@ a3 += W4*col[8x4]
        ldrsh r11, [r14, #64]    @ R11=COLr16[8x4]
        teq r11, #0              @ if null avoid muls
        itttt ne
        mulne r11, r9, r11       @ R11=W4*COLr16[8x4]
        @@ R9 is free now
        addne r6, r6, r11        @ R6+=W4*COLr16[8x4] (a0)
        subne r2, r2, r11        @ R2-=W4*COLr16[8x4] (a1)
        subne r3, r3, r11        @ R3-=W4*COLr16[8x4] (a2)
        ldrsh r9, [r14, #96]     @ R9=COLr16[8x6]
        it    ne
        addne r4, r4, r11        @ R4+=W4*COLr16[8x4] (a3)
        @@ W6 is no longer needed on its own, so reuse R10 to hold W2*COLr16[8x6] instead
        teq r9, #0               @ if null avoid muls
        itttt ne
        mulne r11, r10, r9       @ R11=W6*COLr16[8x6]
        addne r6, r6, r11        @ R6+=W6*COLr16[8x6] (a0)
        mulne r10, r8, r9        @ R10=W2*COLr16[8x6]
        @@ a0 += W6*col[8x6];
        @@ a3 -= W6*col[8x6];
        @@ a1 -= W2*col[8x6];
        @@ a2 += W2*col[8x6];
        subne r4, r4, r11        @ R4-=W6*COLr16[8x6] (a3)
        itt   ne
        subne r2, r2, r10        @ R2-=W2*COLr16[8x6] (a1)
        addne r3, r3, r10        @ R3+=W2*COLr16[8x6] (a2)
__end_a_evaluation2:
        @@ at this point, R0=b0,  R1=b1, R2=a1, R3=a2, R4=a3,
        @@     R5=b2, R6=a0, R7=b3, R8 (free), R9 (free), R10 (free), R11 (free),
        @@     R12=__const_ptr_, R14=&block[n]
        @@ col[0 ] = ((a0 + b0) >> COL_SHIFT);
        @@ col[8 ] = ((a1 + b1) >> COL_SHIFT);
        @@ col[16] = ((a2 + b2) >> COL_SHIFT);
        @@ col[24] = ((a3 + b3) >> COL_SHIFT);
        @@ col[32] = ((a3 - b3) >> COL_SHIFT);
        @@ col[40] = ((a2 - b2) >> COL_SHIFT);
        @@ col[48] = ((a1 - b1) >> COL_SHIFT);
        @@ col[56] = ((a0 - b0) >> COL_SHIFT);
        @@@@@ no optimization here @@@@@
        add r8, r6, r0           @ R8=a0+b0
        add r9, r2, r1           @ R9=a1+b1
        mov r8, r8, asr #COL_SHIFT
        mov r9, r9, asr #COL_SHIFT
        strh r8, [r14, #0]
        strh r9, [r14, #16]
        add r8, r3, r5           @ R8=a2+b2
        add r9, r4, r7           @ R9=a3+b3
        mov r8, r8, asr #COL_SHIFT
        mov r9, r9, asr #COL_SHIFT
        strh r8, [r14, #32]
        strh r9, [r14, #48]
        sub r8, r4, r7           @ R8=a3-b3
        sub r9, r3, r5           @ R9=a2-b2
        mov r8, r8, asr #COL_SHIFT
        mov r9, r9, asr #COL_SHIFT
        strh r8, [r14, #64]
        strh r9, [r14, #80]
        sub r8, r2, r1           @ R8=a1-b1
        sub r9, r6, r0           @ R9=a0-b0
        mov r8, r8, asr #COL_SHIFT
        mov r9, r9, asr #COL_SHIFT
        strh r8, [r14, #96]
        strh r9, [r14, #112]

__end_col_loop:
        @@ at this point, R0-R11 (free)
        @@     R12=__const_ptr_, R14=&block[n]
        ldr r0, [sp, #0]         @ R0=block
        teq r0, r14              @ compare the current &block[n] to block; when block is reached, the loop is finished.
        sub r14, r14, #2
        bne __col_loop




__end_simple_idct_arm:
        @@ restore the registers to their previous state!
        add sp, sp, #8 @@ free the 2 local variables
        ldmfd sp!, {r4-r11, r15} @@ update PC with the LR content.



@@ a kind of sub-function, placed here so as not to burden the common case.
__end_bef_a_evaluation:
        add r2, r6, r11          @ R2=a0+W6*ROWr16[2] (a1)
        mul r11, r8, r4          @ R11=W2*ROWr16[2]
        sub r4, r6, r11          @ R4=a0-W2*ROWr16[2] (a3)
        add r6, r6, r11          @ R6=a0+W2*ROWr16[2] (a0)
        bal __end_a_evaluation


        .align
__constant_ptr__:  @@ see the #defines at the beginning of the source code for the values.
        .word   W1
        .word   W2
        .word   W3
        .word   W4
        .word   W5
        .word   W6
        .word   W7
        .word   MASK_MSHW
