/*
 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
20
21#include "asm.S"
22
23        preserve8
24        .text
25
/*
 * ff_h264_idct_add_neon
 *
 * Runs a two-pass 4x4 butterfly transform on sixteen signed 16-bit
 * coefficients, rounds the result with (x + 32) >> 6, adds it to a 4x4
 * block of unsigned 8-bit pixels and stores the sum back with unsigned
 * saturation.
 *
 * In:   r0 = destination pixels, four rows of 4 bytes (every access is
 *            tagged as 32-bit aligned)
 *       r1 = coefficients, 32 bytes (tagged as 128-bit aligned), only read
 *       r2 = byte distance between two destination rows
 * Out:  r0 = advanced by four rows; r1 and r2 are left untouched
 * Uses: d0-d7 (q0-q3) and d16-d19 (q8-q9); no core register except r0
 *
 * The destination loads are interleaved with the second pass on purpose;
 * keep the instruction order when editing.
 */
function ff_h264_idct_add_neon, export=1
        vld1.64         {d0, d1, d2, d3}, [r1,:128]     @ d0..d3 = c0..c3

        @ Pass 1 -- butterflies across the four loaded vectors.
        vswp            d1,  d2                 @ d1 = c2, d2 = c1 => q1 = {c1, c3}
        vadd.i16        d4,  d1,  d0            @ d4 = c0 + c2
        vshr.s16        q8,  q1,  #1            @ d16 = c1 >> 1, d17 = c3 >> 1
        vsub.i16        d5,  d0,  d1            @ d5 = c0 - c2
        vadd.i16        d6,  d17, d2            @ d6 = c1 + (c3 >> 1)
        vsub.i16        d7,  d16, d3            @ d7 = (c1 >> 1) - c3
        vadd.i16        q0,  q3,  q2            @ d0 = d4 + d6, d1 = d5 + d7
        vsub.i16        q1,  q2,  q3            @ d2 = d4 - d6, d3 = d5 - d7

        @ Transpose the 4x4 matrix whose rows are d0, d1, d3, d2 (in that
        @ order).  Afterwards d0, d1, d3, d2 hold t0, t1, t2, t3.
        vtrn.16         d0,  d1
        vtrn.16         d3,  d2
        vtrn.32         d0,  d3
        vtrn.32         d1,  d2

        @ Pass 2 -- same butterflies on the transposed data, with the four
        @ destination rows fetched in between.  Lane layout after the loads:
        @ d18 = {row0, row2}, d19 = {row3, row1}.
        vadd.i16        d4,  d3,  d0            @ d4 = t0 + t2 (before the swap below)
        vld1.32         {d18[0]}, [r0,:32], r2  @ row 0
        vswp            d1,  d3                 @ d1 = t2, d3 = t1 => q1 = {t3, t1}
        vshr.s16        q8,  q1,  #1            @ d16 = t3 >> 1, d17 = t1 >> 1
        vld1.32         {d19[1]}, [r0,:32], r2  @ row 1
        vsub.i16        d5,  d0,  d1            @ d5 = t0 - t2
        vld1.32         {d18[1]}, [r0,:32], r2  @ row 2
        vadd.i16        d6,  d3,  d16           @ d6 = t1 + (t3 >> 1)
        vld1.32         {d19[0]}, [r0,:32], r2  @ row 3
        vsub.i16        d7,  d2,  d17           @ d7 = t3 - (t1 >> 1)
        sub             r0,  r0,  r2, lsl #2    @ rewind r0 by four rows
        vadd.i16        q0,  q3,  q2            @ q0 = {d4 + d6, d5 + d7}: rows 0 and 2
        vsub.i16        q1,  q2,  q3            @ q1 = {d4 - d6, d5 - d7}: rows 3 and 1

        @ Rounding shift: (x + 32) >> 6 on every lane.
        vrshr.s16       q0,  q0,  #6
        vrshr.s16       q1,  q1,  #6

        @ Widen the pixels to 16 bits and add them to the residual.
        vaddw.u8        q0,  q0,  d18
        vaddw.u8        q1,  q1,  d19

        @ Narrow back to bytes, saturating to the unsigned 8-bit range.
        vqmovun.s16     d0,  q0                 @ d0 = {row0, row2}
        vqmovun.s16     d1,  q1                 @ d1 = {row3, row1}

        @ Write the rows back in order 0, 1, 2, 3.
        vst1.32         {d0[0]},  [r0,:32], r2
        vst1.32         {d1[1]},  [r0,:32], r2
        vst1.32         {d0[1]},  [r0,:32], r2
        vst1.32         {d1[0]},  [r0,:32], r2

        bx              lr
endfunc
73
/*
 * ff_h264_idct_dc_add_neon
 *
 * Adds one rounded value, (coefficient[0] + 32) >> 6, to every pixel of a
 * 4x4 block of unsigned 8-bit pixels and stores the result with unsigned
 * saturation.  Only the first 16-bit coefficient is read.
 *
 * In:   r0 = destination pixels, four rows of 4 bytes (every access is
 *            tagged as 32-bit aligned)
 *       r1 = pointer to the signed 16-bit coefficient (tagged as 16-bit
 *            aligned), only read
 *       r2 = byte distance between two destination rows
 * Out:  r0 = advanced by four rows; r1 and r2 are left untouched
 * Uses: d0-d5 (q0-q2); no core register except r0
 */
function ff_h264_idct_dc_add_neon, export=1
        vld1.16         {d2[],d3[]}, [r1,:16]   @ replicate the coefficient into all 8 lanes of q1
        vrshr.s16       q1,  q1,  #6            @ q1 = (coefficient + 32) >> 6

        vld1.32         {d0[0]},  [r0,:32], r2  @ d0 = {row0, row1}
        vld1.32         {d0[1]},  [r0,:32], r2
        vaddw.u8        q2,  q1,  d0            @ rows 0 and 1, widened to 16 bits
        vld1.32         {d1[0]},  [r0,:32], r2  @ d1 = {row2, row3}
        vld1.32         {d1[1]},  [r0,:32], r2
        sub             r0,  r0,  r2, lsl #2    @ rewind r0 by four rows
        vaddw.u8        q1,  q1,  d1            @ rows 2 and 3; the replicated value is dead after this

        @ Narrow back to bytes, saturating to the unsigned 8-bit range.
        vqmovun.s16     d0,  q2
        vqmovun.s16     d1,  q1

        @ Write the rows back in order 0, 1, 2, 3.
        vst1.32         {d0[0]},  [r0,:32], r2
        vst1.32         {d0[1]},  [r0,:32], r2
        vst1.32         {d1[0]},  [r0,:32], r2
        vst1.32         {d1[1]},  [r0,:32], r2
        bx              lr
endfunc
92
/*
 * ff_h264_idct_add16_neon
 *
 * Walks sixteen 4x4 blocks.  For block i the count byte
 * counts[scan8[i]] selects the work:
 *   count == 0                      -> block is skipped
 *   count == 1 and coefficient 0 of
 *   the block is non-zero           -> ff_h264_idct_dc_add_neon
 *   anything else                   -> ff_h264_idct_add_neon
 *
 * In:   r0      = destination base pointer
 *       r1      = table of 32-bit offsets, one per block, added to r0
 *       r2      = coefficient blocks, 32 bytes per block
 *       r3      = row stride, handed on unchanged to the 4x4 routines
 *       [stack] = first stack argument: byte array of counts
 *
 * Register roles inside the loop:
 *       r4 = destination base      r5 = offset table cursor
 *       r6 = count array           r7 = scan8 cursor
 *       r1 = current coefficients  r2 = stride
 *       r12 = blocks remaining     r8, lr = scratch
 * Both 4x4 routines in this file modify only r0 and NEON registers, so all
 * of the above survive the call.
 */
function ff_h264_idct_add16_neon, export=1
        stmfd           sp!, {r4-r8,lr}
        ldr             r6,  [sp, #24]          @ six registers saved => first stack argument
        mov             r4,  r0
        mov             r5,  r1
        mov             r1,  r2
        mov             r2,  r3
        movrel          r7,  scan8
        mov             r12, #16
1:
        ldrb            r8,  [r7], #1           @ r8 = scan8[i]
        ldr             r0,  [r5], #4           @ r0 = offset of block i
        ldrb            r8,  [r6, r8]           @ r8 = counts[scan8[i]]
        subs            r8,  r8,  #1            @ flags: lt when count == 0, eq when count == 1
        blt             2f
        add             r0,  r0,  r4            @ r0 = destination of block i
        ldrsh           lr,  [r1]               @ lr = coefficient 0 of block i
        movne           lr,  #0                 @ count != 1: force the full transform
        cmp             lr,  #0
        adreq           lr,  ff_h264_idct_add_neon
        adrne           lr,  ff_h264_idct_dc_add_neon
        blx             lr
2:
        subs            r12, r12, #1
        add             r1,  r1,  #32           @ next coefficient block (does not touch the flags)
        bne             1b
        ldmfd           sp!, {r4-r8,pc}
endfunc
119
/*
 * ff_h264_idct_add16intra_neon
 *
 * Walks sixteen 4x4 blocks.  For block i the count byte
 * counts[scan8[i]] selects the work:
 *   count != 0                      -> ff_h264_idct_add_neon
 *   count == 0 and coefficient 0 of
 *   the block is non-zero           -> ff_h264_idct_dc_add_neon
 *   count == 0 and coefficient 0
 *   is zero                         -> block is skipped
 *
 * In:   r0      = destination base pointer
 *       r1      = table of 32-bit offsets, one per block, added to r0
 *       r2      = coefficient blocks, 32 bytes per block
 *       r3      = row stride, handed on unchanged to the 4x4 routines
 *       [stack] = first stack argument: byte array of counts
 *
 * Register roles inside the loop:
 *       r4 = destination base      r5 = offset table cursor
 *       r6 = count array           r7 = scan8 cursor
 *       r1 = current coefficients  r2 = stride
 *       r12 = blocks remaining     r8, lr = scratch
 * Both 4x4 routines in this file modify only r0 and NEON registers, so all
 * of the above survive the call.
 */
function ff_h264_idct_add16intra_neon, export=1
        stmfd           sp!, {r4-r8,lr}
        ldr             r6,  [sp, #24]          @ six registers saved => first stack argument
        mov             r4,  r0
        mov             r5,  r1
        mov             r1,  r2
        mov             r2,  r3
        movrel          r7,  scan8
        mov             r12, #16
1:
        ldrb            r8,  [r7], #1           @ r8 = scan8[i]
        ldr             r0,  [r5], #4           @ r0 = offset of block i
        ldrb            r8,  [r6, r8]           @ r8 = counts[scan8[i]]
        add             r0,  r0,  r4            @ r0 = destination of block i
        cmp             r8,  #0                 @ ne: full transform, eq: maybe first-coefficient-only
        ldrsh           r8,  [r1]               @ r8 = coefficient 0 (load keeps the flags)
        adreq           lr,  ff_h264_idct_dc_add_neon
        adrne           lr,  ff_h264_idct_add_neon
        cmpeq           r8,  #0                 @ count == 0: call only if coefficient 0 is non-zero
        blxne           lr
        subs            r12, r12, #1
        add             r1,  r1,  #32           @ next coefficient block (does not touch the flags)
        bne             1b
        ldmfd           sp!, {r4-r8,pc}
endfunc
144
/*
 * ff_h264_idct_add8_neon
 *
 * Walks eight 4x4 blocks, numbered 16..23 in the offset table, the
 * coefficient array and scan8.  The first four blocks are added to the
 * first destination pointer, the last four to the second one.  For every
 * block the count byte counts[scan8[i]] selects the work:
 *   count != 0                      -> ff_h264_idct_add_neon
 *   count == 0 and coefficient 0 of
 *   the block is non-zero           -> ff_h264_idct_dc_add_neon
 *   count == 0 and coefficient 0
 *   is zero                         -> block is skipped
 *
 * In:   r0      = pointer to two destination base pointers
 *       r1      = table of 32-bit offsets; entries 16..23 are used
 *       r2      = coefficient blocks, 32 bytes each; blocks 16..23 are used
 *       r3      = row stride, handed on unchanged to the 4x4 routines
 *       [stack] = first stack argument: byte array of counts
 *
 * Register roles inside the loop:
 *       r4 = first destination base   r9 = second destination base
 *       r5 = offset table cursor      r6 = count array
 *       r7 = scan8 cursor             r1 = current coefficients
 *       r2 = stride                   ip = blocks remaining (8 down to 1)
 *       r8, lr = scratch; r10 is saved but never used
 * Both 4x4 routines in this file modify only r0 and NEON registers, so all
 * of the above survive the call.
 */
function ff_h264_idct_add8_neon, export=1
        push            {r4-r10,lr}
        ldm             r0,  {r4,r9}            @ r4 = first base, r9 = second base
        add             r5,  r1,  #16*4         @ skip 16 offsets of 4 bytes
        add             r1,  r2,  #16*32        @ skip 16 blocks of 32 bytes
        mov             r2,  r3
        ldr             r6,  [sp, #32]          @ eight registers saved => first stack argument
        movrel          r7,  scan8+16
        mov             ip,  #8
1:      ldrb            r8,  [r7], #1           @ r8 = scan8[i]
        ldr             r0,  [r5], #4           @ r0 = offset of block i
        ldrb            r8,  [r6, r8]           @ r8 = counts[scan8[i]]
        @ Pick the base pointer from the countdown: ip = 8..5 are the first
        @ four blocks, ip = 4..1 the last four.  A plain test of bit 2 of ip
        @ does not work here, because that bit is clear for ip = 8 and set
        @ for ip = 4, which would route blocks 1..4 to the second base.
        cmp             ip,  #4
        addhi           r0,  r0,  r4            @ blocks 0..3 -> first base
        addls           r0,  r0,  r9            @ blocks 4..7 -> second base
        cmp             r8,  #0                 @ ne: full transform, eq: maybe first-coefficient-only
        ldrsh           r8,  [r1]               @ r8 = coefficient 0 (load keeps the flags)
        adrne           lr,  ff_h264_idct_add_neon
        adreq           lr,  ff_h264_idct_dc_add_neon
        cmpeq           r8,  #0                 @ count == 0: call only if coefficient 0 is non-zero
        blxne           lr
        subs            ip,  ip,  #1
        add             r1,  r1,  #32           @ next coefficient block (does not touch the flags)
        bne             1b
        pop             {r4-r10,pc}
endfunc
171
172        .section .rodata
173scan8:  .byte           4+1*8, 5+1*8, 4+2*8, 5+2*8
174        .byte           6+1*8, 7+1*8, 6+2*8, 7+2*8
175        .byte           4+3*8, 5+3*8, 4+4*8, 5+4*8
176        .byte           6+3*8, 7+3*8, 6+4*8, 7+4*8
177        .byte           1+1*8, 2+1*8
178        .byte           1+2*8, 2+2*8
179        .byte           1+4*8, 2+4*8
180        .byte           1+5*8, 2+5*8
181