1/*
2   C-like prototype :
3        void j_rev_dct_ARM(DCTBLOCK data)
4
5   With DCTBLOCK being a pointer to an array of 64 'signed shorts'
6
7   Copyright (c) 2001 Lionel Ulmer (lionel.ulmer@free.fr / bbrox@bbrox.org)
8
9   Permission is hereby granted, free of charge, to any person obtaining a copy
10   of this software and associated documentation files (the "Software"), to deal
11   in the Software without restriction, including without limitation the rights
12   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
13   copies of the Software, and to permit persons to whom the Software is
14   furnished to do so, subject to the following conditions:
15
16   The above copyright notice and this permission notice shall be included in
17   all copies or substantial portions of the Software.
18
19   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
20   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
22   COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
23   IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
24   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25
26*/
27
28#include "asm.S"
29
/*
 * Fixed-point multipliers used by both IDCT passes.
 * Each FIX_a_bcd value is the real number a.bcd multiplied by 8192 (2^13)
 * and rounded to the nearest integer, e.g. 0.298631336 * 8192 = 2446.39,
 * stored as 2446.  The FIX_M_ names hold the negated constant.
 * FIX_0xFFFF is not a multiplier: it is the 16-bit mask loaded by the
 * empty_row path.
 */
30#define FIX_0_298631336 2446
31#define FIX_0_541196100 4433
32#define FIX_0_765366865 6270
33#define FIX_1_175875602 9633
34#define FIX_1_501321110 12299
35#define FIX_2_053119869 16819
36#define FIX_3_072711026 25172
37#define FIX_M_0_390180644 -3196
38#define FIX_M_0_899976223 -7373
39#define FIX_M_1_847759065 -15137
40#define FIX_M_1_961570560 -16069
41#define FIX_M_2_562915447 -20995
42#define FIX_0xFFFF 0xFFFF
43
/*
 * Byte offsets of the values above inside const_array.  The table stores one
 * 32-bit word per constant in the same order as this list, so each offset is
 * the table index times 4.  They are used as the immediate displacement in
 * "ldr rX, [r11, #..._ID]", with r11 holding the address of const_array.
 * These offsets must be kept in sync with the order of the .word entries.
 */
44#define FIX_0_298631336_ID      0
45#define FIX_0_541196100_ID      4
46#define FIX_0_765366865_ID      8
47#define FIX_1_175875602_ID     12
48#define FIX_1_501321110_ID     16
49#define FIX_2_053119869_ID     20
50#define FIX_3_072711026_ID     24
51#define FIX_M_0_390180644_ID   28
52#define FIX_M_0_899976223_ID   32
53#define FIX_M_1_847759065_ID   36
54#define FIX_M_1_961570560_ID   40
55#define FIX_M_2_562915447_ID   44
56#define FIX_0xFFFF_ID          48
/* Everything below is emitted into the code section, aligned before the entry point. */
57        .text
58        .align
59
/*
 * void j_rev_dct_ARM(DCTBLOCK data)
 *
 * In-place two-pass inverse DCT on a block of 64 signed 16-bit values.
 *
 * In:    r0 = pointer to the block (64 halfwords, 16 bytes per row).
 * Out:   nothing in registers; the block is overwritten with the result.
 * Saved: r4-r12 and lr are pushed on entry and restored on exit.
 * Stack: 40 bytes of saved registers, 4 bytes holding the block pointer,
 *        plus a temporary 16-byte spill inside each loop iteration.
 *
 * Register roles that stay fixed for the whole function:
 *   lr  = pointer to the row (pass 1) or column (pass 2) being processed
 *   r12 = remaining iterations of the current pass (counts down from 8)
 *   r11 = address of const_array (the fixed-point multiplier table)
 *   r0-r10 are scratch.
 *
 * Pass 1 (row_loop) reads each row with the even terms d0,d2,d4,d6 at byte
 * offsets 0,2,4,6 and the odd terms d1,d3,d5,d7 at byte offsets 8,10,12,14,
 * and writes the eight results back at offsets 0..14 after a rounding
 * shift of 11 bits.
 * Pass 2 (column_loop) reads term dN of a column from row N (byte offset
 * N*16) and writes the results back after a rounding shift of 18 bits.
 *
 * NOTE(review): the "function" macro is not defined in this file; it is
 * presumably provided by the included asm.S - confirm there what
 * export=1 emits.
 * NOTE(review): the pc-relative address computation at the top of the
 * function assumes ARM (not Thumb) encoding, where pc reads as the
 * current instruction address plus 8.
 */
60function j_rev_dct_ARM, export=1
61        stmdb   sp!, { r4 - r12, lr }   @ save r4-r12 and the return address
62
63        sub sp, sp, #4                  @ reserve one word on the stack
64        str r0, [ sp ]                  @ keep the block pointer there; reloaded before the column pass
65
66        mov lr, r0                      @ lr = pointer to the current row
67        mov r12, #8                     @ r12 = row-counter
68        add r11, pc, #(const_array-.-8) @ r11 = base pointer to the constants array (pc reads as . + 8)
        @ ------------------------------------------------------------------
        @ Pass 1: one iteration per row, lr advances by 16 bytes each time.
        @ ------------------------------------------------------------------
69row_loop:
70        ldrsh r0, [lr, # 0]             @ r0 = 'd0'
71        ldrsh r2, [lr, # 2]             @ r2 = 'd2'
72
73        @ Shortcut for rows in which every item except the first is 0.
74        @ The row is read as four 32-bit words, which relies on the block
        @ being 4-byte aligned.
        @ NOTE(review): that alignment is a requirement on the caller; it is
        @ not checked here.
75        ldr r5, [lr, # 0]
76        ldr r6, [lr, # 4]
77        ldr r3, [lr, # 8]
78        ldr r4, [lr, #12]
79        orr r3, r3, r4
80        orr r3, r3, r6                  @ r3 = OR of the last three words (bytes 4..15)
81        orrs r5, r3, r5                 @ OR in the first word: zero means the whole row is 0
82        beq end_of_row_loop             @ nothing to be done as ALL of them are '0'
83        orrs r3, r3, r2                 @ bytes 4..15 OR d2: zero means only d0 is non-zero
84        beq empty_row
85
86        ldrsh r1, [lr, # 8]             @ r1 = 'd1'
87        ldrsh r4, [lr, # 4]             @ r4 = 'd4'
88        ldrsh r6, [lr, # 6]             @ r6 = 'd6'
89
        @ Even part.  Constant loads are interleaved with the multiplies, so
        @ r3 and r5 are reused as soon as the previous value has been consumed.
90        ldr r3, [r11, #FIX_0_541196100_ID]
91        add r7, r2, r6
92        ldr r5, [r11, #FIX_M_1_847759065_ID]
93        mul r7, r3, r7                      @ r7 = z1
94        ldr r3, [r11, #FIX_0_765366865_ID]
95        mla r6, r5, r6, r7                  @ r6 = tmp2
96        add r5, r0, r4                      @ r5 = tmp0
97        mla r2, r3, r2, r7                  @ r2 = tmp3
98        sub r3, r0, r4                      @ r3 = tmp1
99
        @ tmp0 and tmp1 are scaled by 2^13 to match the multiplier scale.
100        add r0, r2, r5, lsl #13             @ r0 = tmp10
101        rsb r2, r2, r5, lsl #13             @ r2 = tmp13
102        add r4, r6, r3, lsl #13             @ r4 = tmp11
103        rsb r3, r6, r3, lsl #13             @ r3 = tmp12
104
105        stmdb   sp!, { r0, r2, r3, r4 } @ save on the stack tmp10, tmp13, tmp12, tmp11
106
        @ Odd part.
107        ldrsh r3, [lr, #10]             @ r3 = 'd3'
108        ldrsh r5, [lr, #12]             @ r5 = 'd5'
109        ldrsh r7, [lr, #14]             @ r7 = 'd7'
110
111        add r0, r3, r5                        @ r0 = 'z2'
112        add r2, r1, r7                  @ r2 = 'z1'
113        add r4, r3, r7                  @ r4 = 'z3'
114        add r6, r1, r5                  @ r6 = 'z4'
115        ldr r9, [r11, #FIX_1_175875602_ID]
116        add r8, r4, r6                  @ r8 = z3 + z4
117        ldr r10, [r11, #FIX_M_0_899976223_ID]
118        mul r8, r9, r8                  @ r8 = 'z5'
119        ldr r9, [r11, #FIX_M_2_562915447_ID]
120        mul r2, r10, r2                 @ r2 = 'z1'
121        ldr r10, [r11, #FIX_M_1_961570560_ID]
122        mul r0, r9, r0                  @ r0 = 'z2'
123        ldr r9, [r11, #FIX_M_0_390180644_ID]
124        mla r4, r10, r4, r8             @ r4 = 'z3'
125        ldr r10, [r11, #FIX_0_298631336_ID]
126        mla r6, r9, r6, r8              @ r6 = 'z4'
127        ldr r9, [r11, #FIX_2_053119869_ID]
128        mla r7, r10, r7, r2             @ r7 = tmp0 + z1
129        ldr r10, [r11, #FIX_3_072711026_ID]
130        mla r5, r9, r5, r0              @ r5 = tmp1 + z2
131        ldr r9, [r11, #FIX_1_501321110_ID]
132        mla r3, r10, r3, r0             @ r3 = tmp2 + z2
133        add r7, r7, r4                  @ r7 = tmp0
134        mla r1, r9, r1, r2              @ r1 = tmp3 + z1
135        add r5,        r5, r6                  @ r5 = tmp1
136        add r3, r3, r4                  @ r3 = tmp2
137        add r1, r1, r6                  @ r1 = tmp3
138
        @ The pop uses a different register list than the push, so the values
        @ saved from r3 and r4 come back in r4 and r6.
139        ldmia sp!, { r0, r2, r4, r6 } @ r0 = tmp10 / r2 = tmp13 / r4 = tmp12 / r6 = tmp11
140                                      @ r1 = tmp3  / r3 = tmp2  / r5 = tmp1  / r7 = tmp0
141
        @ Each result below is rounded by adding 1<<10 and then shifted right
        @ arithmetically by 11 before being stored as a halfword.
142        @ Compute DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS)
143        add r8, r0, r1
144        add r8, r8, #(1<<10)
145        mov r8, r8, asr #11
146        strh r8, [lr, # 0]
147
148        @ Compute DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS)
149        sub r8, r0, r1
150        add r8, r8, #(1<<10)
151        mov r8, r8, asr #11
152        strh r8, [lr, #14]
153
154        @ Compute DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS)
155        add r8, r6, r3
156        add r8, r8, #(1<<10)
157        mov r8, r8, asr #11
158        strh r8, [lr, # 2]
159
160        @ Compute DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS)
161        sub r8, r6, r3
162        add r8, r8, #(1<<10)
163        mov r8, r8, asr #11
164        strh r8, [lr, #12]
165
166        @ Compute DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS)
167        add r8, r4, r5
168        add r8, r8, #(1<<10)
169        mov r8, r8, asr #11
170        strh r8, [lr, # 4]
171
172        @ Compute DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS)
173        sub r8, r4, r5
174        add r8, r8, #(1<<10)
175        mov r8, r8, asr #11
176        strh r8, [lr, #10]
177
178        @ Compute DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS)
179        add r8, r2, r7
180        add r8, r8, #(1<<10)
181        mov r8, r8, asr #11
182        strh r8, [lr, # 6]
183
184        @ Compute DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS)
185        sub r8, r2, r7
186        add r8, r8, #(1<<10)
187        mov r8, r8, asr #11
188        strh r8, [lr, # 8]
189
190        @ End of row loop
191        add lr, lr, #16                 @ advance to the next row (8 halfwords)
192        subs r12, r12, #1
193        bne row_loop
        @ NOTE(review): reached only with Z set, so this beq is always taken;
        @ it skips over the empty_row code and acts as an unconditional branch.
194        beq start_column_loop
195
        @ Only d0 is non-zero in this row: every output of the row equals
        @ d0 << 2 truncated to 16 bits.
196empty_row:
197        ldr r1, [r11, #FIX_0xFFFF_ID]   @ r1 = 0x0000FFFF
198        mov r0, r0, lsl #2              @ r0 = d0 << 2
199        and r0, r0, r1                  @ keep the low 16 bits only
200        add r0, r0, r0, lsl #16         @ replicate the halfword into both halves of the word
201        str r0, [lr, # 0]               @ four word stores fill all 8 entries of the row
202        str r0, [lr, # 4]
203        str r0, [lr, # 8]
204        str r0, [lr, #12]
205
        @ An all-zero row jumps straight here and is left untouched.
206end_of_row_loop:
207        @ End of loop
208        add lr, lr, #16
209        subs r12, r12, #1
210        bne row_loop
211
        @ ------------------------------------------------------------------
        @ Pass 2: one iteration per column, lr advances by 2 bytes each time.
        @ ------------------------------------------------------------------
212start_column_loop:
213        @ Start of column loop
214        ldr lr, [ sp ]                  @ lr = block pointer saved at function entry
215        mov r12, #8                     @ r12 = column-counter
216column_loop:
        @ Offsets are written as (2*N)*8, i.e. N rows of 16 bytes each.
217        ldrsh r0, [lr, #( 0*8)]             @ r0 = 'd0'
218        ldrsh r2, [lr, #( 4*8)]             @ r2 = 'd2'
219        ldrsh r4, [lr, #( 8*8)]             @ r4 = 'd4'
220        ldrsh r6, [lr, #(12*8)]             @ r6 = 'd6'
221
        @ Even part, same arithmetic as in the row pass (z1 lives in r1 here).
222        ldr r3, [r11, #FIX_0_541196100_ID]
223        add r1, r2, r6
224        ldr r5, [r11, #FIX_M_1_847759065_ID]
225        mul r1, r3, r1                      @ r1 = z1
226        ldr r3, [r11, #FIX_0_765366865_ID]
227        mla r6, r5, r6, r1                  @ r6 = tmp2
228        add r5, r0, r4                      @ r5 = tmp0
229        mla r2, r3, r2, r1                  @ r2 = tmp3
230        sub r3, r0, r4                      @ r3 = tmp1
231
232        add r0, r2, r5, lsl #13             @ r0 = tmp10
233        rsb r2, r2, r5, lsl #13             @ r2 = tmp13
234        add r4, r6, r3, lsl #13             @ r4 = tmp11
235        rsb r6, r6, r3, lsl #13             @ r6 = tmp12
236
237        ldrsh r1, [lr, #( 2*8)]             @ r1 = 'd1'
238        ldrsh r3, [lr, #( 6*8)]             @ r3 = 'd3'
239        ldrsh r5, [lr, #(10*8)]             @ r5 = 'd5'
240        ldrsh r7, [lr, #(14*8)]             @ r7 = 'd7'
241
242        @ Check for empty odd column (happens about 20 to 25 % of the time according to my stats)
243        orr r9, r1, r3
244        orr r10, r5, r7
245        orrs r10, r9, r10               @ zero only when d1, d3, d5 and d7 are all 0
246        beq empty_odd_column
247
        @ Unlike the row pass, the same register list is used for push and pop.
248        stmdb   sp!, { r0, r2, r4, r6 } @ save on the stack tmp10, tmp13, tmp11, tmp12
249
        @ Odd part, same arithmetic and scheduling as in the row pass.
250        add r0, r3, r5                  @ r0 = 'z2'
251        add r2, r1, r7                  @ r2 = 'z1'
252        add r4, r3, r7                  @ r4 = 'z3'
253        add r6, r1, r5                  @ r6 = 'z4'
254        ldr r9, [r11, #FIX_1_175875602_ID]
255        add r8, r4, r6                  @ r8 = z3 + z4
256        ldr r10, [r11, #FIX_M_0_899976223_ID]
257        mul r8, r9, r8                  @ r8 = 'z5'
258        ldr r9, [r11, #FIX_M_2_562915447_ID]
259        mul r2, r10, r2                 @ r2 = 'z1'
260        ldr r10, [r11, #FIX_M_1_961570560_ID]
261        mul r0, r9, r0                  @ r0 = 'z2'
262        ldr r9, [r11, #FIX_M_0_390180644_ID]
263        mla r4, r10, r4, r8             @ r4 = 'z3'
264        ldr r10, [r11, #FIX_0_298631336_ID]
265        mla r6, r9, r6, r8              @ r6 = 'z4'
266        ldr r9, [r11, #FIX_2_053119869_ID]
267        mla r7, r10, r7, r2             @ r7 = tmp0 + z1
268        ldr r10, [r11, #FIX_3_072711026_ID]
269        mla r5, r9, r5, r0              @ r5 = tmp1 + z2
270        ldr r9, [r11, #FIX_1_501321110_ID]
271        mla r3, r10, r3, r0             @ r3 = tmp2 + z2
272        add r7, r7, r4                  @ r7 = tmp0
273        mla r1, r9, r1, r2              @ r1 = tmp3 + z1
274        add r5,        r5, r6                  @ r5 = tmp1
275        add r3, r3, r4                  @ r3 = tmp2
276        add r1, r1, r6                  @ r1 = tmp3
277
278        ldmia sp!, { r0, r2, r4, r6 } @ r0 = tmp10 / r2 = tmp13 / r4 = tmp11 / r6 = tmp12
279                                      @ r1 = tmp3  / r3 = tmp2  / r5 = tmp1  / r7 = tmp0
280
        @ Each result below is rounded by adding 1<<17 and then shifted right
        @ arithmetically by 18 before being stored as a halfword.
281        @ Compute DESCALE(tmp10 + tmp3, CONST_BITS+PASS1_BITS+3)
282        add r8, r0, r1
283        add r8, r8, #(1<<17)
284        mov r8, r8, asr #18
285        strh r8, [lr, #( 0*8)]
286
287        @ Compute DESCALE(tmp10 - tmp3, CONST_BITS+PASS1_BITS+3)
288        sub r8, r0, r1
289        add r8, r8, #(1<<17)
290        mov r8, r8, asr #18
291        strh r8, [lr, #(14*8)]
292
293        @ Compute DESCALE(tmp11 + tmp2, CONST_BITS+PASS1_BITS+3)
294        add r8, r4, r3
295        add r8, r8, #(1<<17)
296        mov r8, r8, asr #18
297        strh r8, [lr, #( 2*8)]
298
299        @ Compute DESCALE(tmp11 - tmp2, CONST_BITS+PASS1_BITS+3)
300        sub r8, r4, r3
301        add r8, r8, #(1<<17)
302        mov r8, r8, asr #18
303        strh r8, [lr, #(12*8)]
304
305        @ Compute DESCALE(tmp12 + tmp1, CONST_BITS+PASS1_BITS+3)
306        add r8, r6, r5
307        add r8, r8, #(1<<17)
308        mov r8, r8, asr #18
309        strh r8, [lr, #( 4*8)]
310
311        @ Compute DESCALE(tmp12 - tmp1, CONST_BITS+PASS1_BITS+3)
312        sub r8, r6, r5
313        add r8, r8, #(1<<17)
314        mov r8, r8, asr #18
315        strh r8, [lr, #(10*8)]
316
317        @ Compute DESCALE(tmp13 + tmp0, CONST_BITS+PASS1_BITS+3)
318        add r8, r2, r7
319        add r8, r8, #(1<<17)
320        mov r8, r8, asr #18
321        strh r8, [lr, #( 6*8)]
322
323        @ Compute DESCALE(tmp13 - tmp0, CONST_BITS+PASS1_BITS+3)
324        sub r8, r2, r7
325        add r8, r8, #(1<<17)
326        mov r8, r8, asr #18
327        strh r8, [lr, #( 8*8)]
328
329        @ End of column loop
330        add lr, lr, #2                  @ advance to the next column (one halfword)
331        subs r12, r12, #1
332        bne column_loop
        @ NOTE(review): reached only with Z set, so this beq is always taken;
        @ it skips over the empty_odd_column code.
333        beq the_end
334
        @ All odd terms are zero, so tmp0..tmp3 are 0 and each sum/difference
        @ pair collapses to a single value that is stored twice.  Nothing was
        @ pushed on this path, so r0/r4/r6/r2 still hold tmp10/tmp11/tmp12/tmp13.
335empty_odd_column:
336        @ Compute DESCALE(tmp10 + tmp3, CONST_BITS+PASS1_BITS+3)
337        @ Compute DESCALE(tmp10 - tmp3, CONST_BITS+PASS1_BITS+3)
338        add r0, r0, #(1<<17)
339        mov r0, r0, asr #18
340        strh r0, [lr, #( 0*8)]
341        strh r0, [lr, #(14*8)]
342
343        @ Compute DESCALE(tmp11 + tmp2, CONST_BITS+PASS1_BITS+3)
344        @ Compute DESCALE(tmp11 - tmp2, CONST_BITS+PASS1_BITS+3)
345        add r4, r4, #(1<<17)
346        mov r4, r4, asr #18
347        strh r4, [lr, #( 2*8)]
348        strh r4, [lr, #(12*8)]
349
350        @ Compute DESCALE(tmp12 + tmp1, CONST_BITS+PASS1_BITS+3)
351        @ Compute DESCALE(tmp12 - tmp1, CONST_BITS+PASS1_BITS+3)
352        add r6, r6, #(1<<17)
353        mov r6, r6, asr #18
354        strh r6, [lr, #( 4*8)]
355        strh r6, [lr, #(10*8)]
356
357        @ Compute DESCALE(tmp13 + tmp0, CONST_BITS+PASS1_BITS+3)
358        @ Compute DESCALE(tmp13 - tmp0, CONST_BITS+PASS1_BITS+3)
359        add r2, r2, #(1<<17)
360        mov r2, r2, asr #18
361        strh r2, [lr, #( 6*8)]
362        strh r2, [lr, #( 8*8)]
363
364        @ End of column loop
365        add lr, lr, #2
366        subs r12, r12, #1
367        bne column_loop
368
369the_end:
370        @ The end....
371        add sp, sp, #4                  @ drop the word that held the block pointer
372        ldmia   sp!, { r4 - r12, pc }   @ restore the saved registers and return (saved lr is loaded into pc)
373
/*
 * Table of the fixed-point multipliers, one 32-bit word each.
 * j_rev_dct_ARM keeps the address of this table in r11 and indexes it with
 * the FIX_*_ID byte offsets, so the order of the entries below must match
 * those offsets (entry k lives at byte offset 4*k).
 * The table sits in the same section as the code and is addressed
 * pc-relative from the function prologue.
 *
 * NOTE(review): the label is placed before the .align directive.  If the
 * directive ever emitted padding, const_array would point at the padding
 * rather than at the first .word.  This looks safe only while everything
 * preceding the label is already word aligned - confirm, or move the label
 * below the directive.
 */
374const_array:
375        .align
376        .word FIX_0_298631336
377        .word FIX_0_541196100
378        .word FIX_0_765366865
379        .word FIX_1_175875602
380        .word FIX_1_501321110
381        .word FIX_2_053119869
382        .word FIX_3_072711026
383        .word FIX_M_0_390180644
384        .word FIX_M_0_899976223
385        .word FIX_M_1_847759065
386        .word FIX_M_1_961570560
387        .word FIX_M_2_562915447
388        .word FIX_0xFFFF
389