1/*
2 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
3 *
4 * This file is part of FFmpeg.
5 *
6 * FFmpeg is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2.1 of the License, or (at your option) any later version.
10 *
11 * FFmpeg is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14 * Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with FFmpeg; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19 */
20
21#include "asm.S"
22
/*
 * Assembler set-up for the whole file.
 * preserve8 is not defined in this file; presumably it is a macro from asm.S
 * that records that 8-byte stack alignment is preserved -- TODO confirm.
 * .fpu neon makes the assembler accept the NEON (v*) instructions used by the
 * kernels below, and .text places the functions in the code section.
 */
23        preserve8
24        .fpu neon
25
26        .text
27
/*
 * ff_h264_idct_add_neon
 *
 * Runs a two-pass 4x4 add/subtract butterfly over the sixteen 16-bit values
 * at [r1], scales the result down by 64 with rounding (+32, then >>6), adds
 * it to the 4x4 block of bytes at [r0] and writes the sum back saturated to
 * 0..255.
 *
 * In:    r0 = address of the first 4-byte row to update (accesses carry a
 *             :32 alignment hint)
 *        r1 = address of 16 halfwords (loaded with a :128 alignment hint);
 *             this memory is only read
 *        r2 = byte distance between consecutive rows at r0
 * Out:   nothing in registers; the four rows at r0 are rewritten
 * Clobb: r0 (left at its entry value + 4*r2), r3, q0-q3, q8, q9
 *        r1, r2, ip and the condition flags are not modified.
 */
28function ff_h264_idct_add_neon, export=1
        /*
         * Build d16 = { 32, 0, 0, 0 }, load the sixteen coefficients into
         * d0-d3 and add the 32 to the very first one.  That element reaches
         * every output of the add/sub network below with weight +1, so this
         * is the rounding term for the plain >>6 at the end.
         */
29        mov             r3,  #(1<<5)
30        vmov.i16        d16, #0
31        vmov.16         d16[0],   r3
32        vld1.64         {d0-d3},  [r1,:128]
33        vadd.i16        d0,  d0,  d16
34
        /*
         * First pass.  Call the four loaded 4-halfword groups g0..g3 (memory
         * order).  After the swap d0..d3 = g0, g2, g1, g3, so q1 = {g1, g3}
         * and the shift gives d16 = g1 >> 1, d17 = g3 >> 1:
         *   d4 = g0 + g2              d5 = g0 - g2
         *   d6 = g1 + (g3 >> 1)       d7 = (g1 >> 1) - g3
         *   d0 = d4 + d6   d1 = d5 + d7   d2 = d4 - d6   d3 = d5 - d7
         */
35        vswp            d1,  d2
36        vadd.i16        d4,  d0,  d1
37        vshr.s16        q8,  q1,  #1
38        vsub.i16        d5,  d0,  d1
39        vadd.i16        d6,  d2,  d17
40        vsub.i16        d7,  d16, d3
41        vadd.i16        q0,  q2,  q3
42        vsub.i16        q1,  q2,  q3
43
        /*
         * 4x4 halfword transpose of the matrix whose rows are d0, d1, d3, d2
         * (in that order).
         */
44        vtrn.16         d0,  d1
45        vtrn.16         d3,  d2
46        vtrn.32         d0,  d3
47        vtrn.32         d1,  d2
48
        /*
         * Second pass on the transposed data.  On entry t0 = d0, t1 = d1,
         * t2 = d3, t3 = d2; the vswp then makes d1 = t2 and d3 = t1, so
         * q1 = {t3, t1} and the shift gives d16 = t3 >> 1, d17 = t1 >> 1:
         *   d4 = t0 + t2              d5 = t0 - t2
         *   d6 = t1 + (t3 >> 1)       d7 = t3 - (t1 >> 1)
         *   q0 = {d4 + d6, d5 + d7}   q1 = {d4 - d6, d5 - d7}
         * The four destination rows are loaded in between the arithmetic,
         * into the lanes that pair up with those results in the vaddw below:
         * d18 = {row 0, row 2}, d19 = {row 3, row 1}.  r0 is then moved back
         * by 4*r2 so that it points at row 0 again for the stores.
         */
49        vadd.i16        d4,  d0,  d3
50        vld1.32         {d18[0]}, [r0,:32], r2
51        vswp            d1,  d3
52        vshr.s16        q8,  q1,  #1
53        vld1.32         {d19[1]}, [r0,:32], r2
54        vsub.i16        d5,  d0,  d1
55        vld1.32         {d18[1]}, [r0,:32], r2
56        vadd.i16        d6,  d16, d3
57        vld1.32         {d19[0]}, [r0,:32], r2
58        vsub.i16        d7,  d2,  d17
59        sub             r0,  r0,  r2, lsl #2
60        vadd.i16        q0,  q2,  q3
61        vsub.i16        q1,  q2,  q3
62
        /* Arithmetic >>6; the +32 rounding term was injected before pass one. */
63        vshr.s16        q0,  q0,  #6
64        vshr.s16        q1,  q1,  #6
65
        /* Add the destination bytes, zero-extended to 16 bits. */
66        vaddw.u8        q0,  q0,  d18
67        vaddw.u8        q1,  q1,  d19
68
        /* Narrow back to bytes with unsigned saturation (clamps to 0..255). */
69        vqmovun.s16     d0,  q0
70        vqmovun.s16     d1,  q1
71
        /* Store the rows using the same lane order in which they were loaded. */
72        vst1.32         {d0[0]},  [r0,:32], r2
73        vst1.32         {d1[1]},  [r0,:32], r2
74        vst1.32         {d0[1]},  [r0,:32], r2
75        vst1.32         {d1[0]},  [r0,:32], r2
76
77        bx              lr
78        .endfunc
79
/*
 * ff_h264_idct_dc_add_neon
 *
 * Adds one constant, derived from the single halfword at [r1], to every byte
 * of the 4x4 block at [r0] and writes the result back saturated to 0..255.
 * The constant is (value + 32) >> 6, computed by the rounding shift.
 *
 * In:    r0 = address of the first 4-byte row to update (accesses carry a
 *             :32 alignment hint)
 *        r1 = address of the halfword to use (only this one halfword is
 *             read; the memory is not written)
 *        r2 = byte distance between consecutive rows at r0
 * Out:   nothing in registers; the four rows at r0 are rewritten
 * Clobb: r0 (left at its entry value + 4*r2), q0-q2
 *        r1-r3, ip and the condition flags are not modified.
 */
80function ff_h264_idct_dc_add_neon, export=1
        /* Replicate the halfword into all eight lanes of q1, then round. */
81        vld1.16         {d2[],d3[]}, [r1,:16]
82        vrshr.s16       q1,  q1,  #6
        /*
         * Rows 0 and 1 go into d0, rows 2 and 3 into d1; each pair is
         * zero-extended and added to the constant as soon as it is loaded
         * (q2 = rows 0-1, q1 = rows 2-3).
         */
83        vld1.32         {d0[0]},  [r0,:32], r2
84        vld1.32         {d0[1]},  [r0,:32], r2
85        vaddw.u8        q2,  q1,  d0
86        vld1.32         {d1[0]},  [r0,:32], r2
87        vld1.32         {d1[1]},  [r0,:32], r2
88        vaddw.u8        q1,  q1,  d1
        /* Narrow to bytes with unsigned saturation. */
89        vqmovun.s16     d0,  q2
90        vqmovun.s16     d1,  q1
        /* Rewind r0 to row 0 and store the four rows in order. */
91        sub             r0,  r0,  r2, lsl #2
92        vst1.32         {d0[0]},  [r0,:32], r2
93        vst1.32         {d0[1]},  [r0,:32], r2
94        vst1.32         {d1[0]},  [r0,:32], r2
95        vst1.32         {d1[1]},  [r0,:32], r2
96        bx              lr
97        .endfunc
98
/*
 * ff_h264_idct_add16_neon
 *
 * Walks 16 consecutive coefficient blocks of 32 bytes each.  For block i the
 * byte table[scan8[i]] decides what happens:
 *   0                                  -> block is skipped
 *   1 and first coefficient non-zero   -> ff_h264_idct_dc_add_neon
 *   anything else                      -> ff_h264_idct_add_neon
 * The callee receives r0 = base + offsets[i], r1 = block i, r2 = row pitch.
 *
 * In:    r0   = base destination address
 *        r1   = table of 32-bit offsets, one per block
 *        r2   = address of the first coefficient block
 *        r3   = row pitch, handed to the callee in r2
 *        [sp] = address of the byte table indexed through scan8
 *               NOTE(review): presumably per-block non-zero coefficient
 *               counts -- confirm against the caller
 * Clobb: r0-r3, ip, flags, and the NEON registers used by the callees
 *        (q0-q3, q8, q9); r4-r8 are saved and restored.
 *
 * The loop keeps live state in r1, r2, ip and r4-r7 across the calls; both
 * callees in this file leave those registers untouched.
 */
99function ff_h264_idct_add16_neon, export=1
        /*
         * Six registers are pushed (24 bytes), so the first stack argument
         * is found at [sp, #24].  Register roles inside the loop:
         *   r4 = base destination     r5 = cursor in the offset table
         *   r1 = current block        r2 = row pitch
         *   r6 = byte table           r7 = cursor in scan8
         *   ip = blocks remaining
         */
100        push            {r4-r8,lr}
101        mov             r4,  r0
102        mov             r5,  r1
103        mov             r1,  r2
104        mov             r2,  r3
105        ldr             r6,  [sp, #24]
106        movrel          r7,  scan8
107        mov             ip,  #16
        /*
         * Loop head: r8 = table[scan8[i]], r0 = offsets[i].  The subs turns
         * r8 into value-1, so "lt" means the value was 0 (branch to the skip
         * label) and "ne" means the value was something other than 1.
         */
1081:      ldrb            r8,  [r7], #1
109        ldr             r0,  [r5], #4
110        ldrb            r8,  [r6, r8]
111        subs            r8,  r8,  #1
112        blt             2f
        /*
         * lr = first coefficient of the block, sign-extended; it is forced
         * to 0 unless the table value was exactly 1 (ldrsh and add leave the
         * flags from the subs intact).  A non-zero lr therefore selects the
         * DC-only routine and zero selects the full transform.  blx lr
         * branches to the chosen routine and sets lr to the return address.
         */
113        ldrsh           lr,  [r1]
114        add             r0,  r0,  r4
115        movne           lr,  #0
116        cmp             lr,  #0
117        adrne           lr,  ff_h264_idct_dc_add_neon
118        adreq           lr,  ff_h264_idct_add_neon
119        blx             lr
        /*
         * Skip label: step to the next 32-byte block.  The add does not
         * touch the flags, so bne still tests the result of the subs.
         */
1202:      subs            ip,  ip,  #1
121        add             r1,  r1,  #32
122        bne             1b
123        pop             {r4-r8,pc}
124        .endfunc
125
/*
 * ff_h264_idct_add16intra_neon
 *
 * Walks 16 consecutive coefficient blocks of 32 bytes each.  For block i the
 * byte table[scan8[i]] and the block's first coefficient decide what happens:
 *   table value non-zero                       -> ff_h264_idct_add_neon
 *   table value zero, first coefficient != 0   -> ff_h264_idct_dc_add_neon
 *   both zero                                  -> block is skipped
 * The callee receives r0 = base + offsets[i], r1 = block i, r2 = row pitch.
 *
 * In:    r0   = base destination address
 *        r1   = table of 32-bit offsets, one per block
 *        r2   = address of the first coefficient block
 *        r3   = row pitch, handed to the callee in r2
 *        [sp] = address of the byte table indexed through scan8
 *               NOTE(review): presumably per-block non-zero coefficient
 *               counts -- confirm against the caller
 * Clobb: r0-r3, ip, flags, and the NEON registers used by the callees
 *        (q0-q3, q8, q9); r4-r8 are saved and restored.
 *
 * The loop keeps live state in r1, r2, ip and r4-r7 across the calls; both
 * callees in this file leave those registers untouched.
 */
126function ff_h264_idct_add16intra_neon, export=1
        /*
         * Six registers are pushed (24 bytes), so the first stack argument
         * is found at [sp, #24].  Register roles inside the loop:
         *   r4 = base destination     r5 = cursor in the offset table
         *   r1 = current block        r2 = row pitch
         *   r6 = byte table           r7 = cursor in scan8
         *   ip = blocks remaining
         */
127        push            {r4-r8,lr}
128        mov             r4,  r0
129        mov             r5,  r1
130        mov             r1,  r2
131        mov             r2,  r3
132        ldr             r6,  [sp, #24]
133        movrel          r7,  scan8
134        mov             ip,  #16
        /* Loop head: r8 = table[scan8[i]], r0 = base + offsets[i]. */
1351:      ldrb            r8,  [r7], #1
136        ldr             r0,  [r5], #4
137        ldrb            r8,  [r6, r8]
138        add             r0,  r0,  r4
        /*
         * The cmp tests the table value; the ldrsh then reuses r8 for the
         * block's first coefficient without disturbing the flags.
         *   ne (value != 0): lr = full transform, cmpeq is skipped, so the
         *                    flags stay "ne" and blxne makes the call.
         *   eq (value == 0): lr = DC-only routine, cmpeq tests the first
         *                    coefficient, and blxne calls only if it is
         *                    non-zero.
         */
139        cmp             r8,  #0
140        ldrsh           r8,  [r1]
141        adrne           lr,  ff_h264_idct_add_neon
142        adreq           lr,  ff_h264_idct_dc_add_neon
143        cmpeq           r8,  #0
144        blxne           lr
        /*
         * Step to the next 32-byte block.  The add does not touch the flags,
         * so bne still tests the result of the subs.
         */
145        subs            ip,  ip,  #1
146        add             r1,  r1,  #32
147        bne             1b
148        pop             {r4-r8,pc}
149        .endfunc
150
/*
 * ff_h264_idct_add8_neon
 *
 * Processes the eight coefficient blocks numbered 16..23 (32 bytes each),
 * split over two destination pointers: blocks 16..19 are added to the first
 * pointer, blocks 20..23 to the second.  For block i the byte
 * table[scan8[i]] and the block's first coefficient decide what happens:
 *   table value non-zero                       -> ff_h264_idct_add_neon
 *   table value zero, first coefficient != 0   -> ff_h264_idct_dc_add_neon
 *   both zero                                  -> block is skipped
 * The callee receives r0 = dest + offsets[i], r1 = block i, r2 = row pitch.
 *
 * In:    r0   = address of two consecutive destination pointers
 *        r1   = table of 32-bit offsets; entries 16..23 are used
 *        r2   = address of coefficient block 0; blocks 16..23 are used
 *        r3   = row pitch, handed to the callee in r2
 *        [sp] = address of the byte table indexed through scan8[16..23]
 *               NOTE(review): presumably per-block non-zero coefficient
 *               counts -- confirm against the caller
 * Clobb: r0-r3, ip, flags, and the NEON registers used by the callees
 *        (q0-q3, q8, q9); r4-r10 are saved and restored.
 *
 * The loop keeps live state in r1, r2, ip, r4-r7 and r9 across the calls;
 * both callees in this file leave those registers untouched.
 */
function ff_h264_idct_add8_neon, export=1
        /*
         * Eight registers are pushed (32 bytes), so the first stack argument
         * is found at [sp, #32].  r10 is saved but not otherwise used here.
         * Register roles inside the loop:
         *   r4 = first destination    r9 = second destination
         *   r5 = cursor in the offset table (starting at entry 16)
         *   r1 = current block (starting at block 16)
         *   r2 = row pitch            r6 = byte table
         *   r7 = cursor in scan8 (starting at entry 16)
         *   ip = blocks remaining minus one
         */
        push            {r4-r10,lr}
        ldm             r0,  {r4,r9}
        add             r5,  r1,  #16*4
        add             r1,  r2,  #16*32
        mov             r2,  r3
        ldr             r6,  [sp, #32]
        movrel          r7,  scan8+16
        mov             ip,  #7
        /* Loop head: r8 = table[scan8[i]], r0 = offsets[i]. */
1:      ldrb            r8,  [r7], #1
        ldr             r0,  [r5], #4
        ldrb            r8,  [r6, r8]
        /*
         * ip counts 7..0, so bit 2 is set for exactly the first four
         * iterations (7,6,5,4 -> first pointer) and clear for the last four
         * (3,2,1,0 -> second pointer).  Counting 8..1 with the same bit test
         * would pick the first pointer for iterations 0,5,6,7 only and so
         * send six of the eight blocks to the wrong destination.
         */
        tst             ip,  #4
        addne           r0,  r0,  r4
        addeq           r0,  r0,  r9
        /*
         * The cmp tests the table value; the ldrsh then reuses r8 for the
         * block's first coefficient without disturbing the flags.
         *   ne (value != 0): lr = full transform, cmpeq is skipped, so the
         *                    flags stay "ne" and blxne makes the call.
         *   eq (value == 0): lr = DC-only routine, cmpeq tests the first
         *                    coefficient, and blxne calls only if it is
         *                    non-zero.
         */
        cmp             r8,  #0
        ldrsh           r8,  [r1]
        adrne           lr,  ff_h264_idct_add_neon
        adreq           lr,  ff_h264_idct_dc_add_neon
        cmpeq           r8,  #0
        blxne           lr
        /*
         * Step to the next 32-byte block.  The add does not touch the flags,
         * so bge still tests the subs: the loop ends once ip drops below 0,
         * i.e. after eight iterations.
         */
        subs            ip,  ip,  #1
        add             r1,  r1,  #32
        bge             1b
        pop             {r4-r10,pc}
        .endfunc
177
/*
 * scan8: 24 one-byte indices, each written as x + y*8.
 * NOTE(review): the x + y*8 form suggests column + row*8 in an 8-entry-wide
 * grid -- confirm against the layout of the byte table the callers pass in.
 *
 * The dispatch loops in this file read one entry per coefficient block and
 * use it to index the byte table received as their first stack argument:
 *   entries  0..15  ff_h264_idct_add16_neon, ff_h264_idct_add16intra_neon
 *   entries 16..23  ff_h264_idct_add8_neon (addressed as scan8+16)
 */
178        .section .rodata
179scan8:  .byte           4+1*8, 5+1*8, 4+2*8, 5+2*8
180        .byte           6+1*8, 7+1*8, 6+2*8, 7+2*8
181        .byte           4+3*8, 5+3*8, 4+4*8, 5+4*8
182        .byte           6+3*8, 7+3*8, 6+4*8, 7+4*8
183        .byte           1+1*8, 2+1*8
184        .byte           1+2*8, 2+2*8
185        .byte           1+4*8, 2+4*8
186        .byte           1+5*8, 2+5*8
187