1/*
2 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
3 * Copyright (c) 2013 Janne Grunau <janne-libav@jannau.net>
4 *
5 * This file is part of FFmpeg.
6 *
7 * FFmpeg is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU Lesser General Public
9 * License as published by the Free Software Foundation; either
10 * version 2.1 of the License, or (at your option) any later version.
11 *
12 * FFmpeg is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15 * Lesser General Public License for more details.
16 *
17 * You should have received a copy of the GNU Lesser General Public
18 * License along with FFmpeg; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20 */
21
22#include "libavutil/aarch64/asm.S"
23#include "neon.S"
24
// 4x4 inverse transform of the coefficient block at x1, added to the 4x4
// pixel block at x0 and written back with unsigned saturation.
//
// In:     x0 = pixels (4 bytes are read and written per row)
//         x1 = sixteen 16-bit coefficients (32 bytes)
//         w2 = row stride in bytes, sign-extended here
// Effect: the 32 coefficient bytes are overwritten with zeros
// Exit:   x1 is back at its entry value, x0 points four rows further on
// Uses:   v0-v7, v16-v19, v30
function ff_h264_idct_add_neon, export=1
        ld1             {v0.4h, v1.4h, v2.4h, v3.4h},  [x1]
        sxtw            x2,     w2
        movi            v30.8h, #0

        // First 1-D pass over v0-v3. The two interleaved stores of v30
        // clear the coefficient block while the arithmetic runs.
        add             v4.4h,  v0.4h,  v2.4h           // in0 + in2
        sshr            v16.4h, v1.4h,  #1              // in1 >> 1
        st1             {v30.8h},    [x1], #16
        sshr            v17.4h, v3.4h,  #1              // in3 >> 1
        st1             {v30.8h},    [x1], #16
        sub             v5.4h,  v0.4h,  v2.4h           // in0 - in2
        add             v6.4h,  v1.4h,  v17.4h          // in1 + (in3 >> 1)
        sub             v7.4h,  v16.4h, v3.4h           // (in1 >> 1) - in3
        add             v0.4h,  v4.4h,  v6.4h
        add             v1.4h,  v5.4h,  v7.4h
        sub             v2.4h,  v4.4h,  v6.4h
        sub             v3.4h,  v5.4h,  v7.4h

        // transpose_4x4H comes from neon.S; v4-v7 are handed over as its
        // remaining operands (presumably temporaries - confirm in neon.S).
        transpose_4x4H  v0, v1, v2, v3, v4, v5, v6, v7

        // Second 1-D pass, interleaved with the pixel loads. It pairs v0
        // with v3 and v1 with v2, so it relies on the register order that
        // transpose_4x4H leaves behind (not visible in this file).
        add             v4.4h,  v0.4h,  v3.4h           // v0 + v3
        ld1             {v18.s}[0], [x0], x2            // pixel row 0
        sshr            v16.4h,  v2.4h,  #1
        sshr            v17.4h,  v1.4h,  #1
        ld1             {v19.s}[1], [x0], x2            // pixel row 1
        sub             v5.4h,  v0.4h,  v3.4h           // v0 - v3
        ld1             {v18.s}[1], [x0], x2            // pixel row 2
        add             v6.4h,  v16.4h, v1.4h           // v1 + (v2 >> 1)
        ins             v4.d[1],  v5.d[0]               // v4 = sum (low) : difference (high)
        sub             v7.4h,  v2.4h,  v17.4h          // v2 - (v1 >> 1)
        ld1             {v19.s}[0], [x0], x2            // pixel row 3
        ins             v6.d[1],  v7.d[0]
        sub             x0,  x0,  x2, lsl #2            // rewind to row 0
        add             v0.8h,  v4.8h,  v6.8h           // row 0 (low half), row 2 (high half)
        sub             v1.8h,  v4.8h,  v6.8h           // row 3 (low half), row 1 (high half)

        // Rounding shift right by 6.
        srshr           v0.8h,  v0.8h,  #6
        srshr           v1.8h,  v1.8h,  #6

        // Add the widened pixels; v18 and v19 use the same row layout as v0 and v1.
        uaddw           v0.8h,  v0.8h,  v18.8b
        uaddw           v1.8h,  v1.8h,  v19.8b

        // Narrow back to bytes with unsigned saturation.
        sqxtun          v0.8b, v0.8h
        sqxtun          v1.8b, v1.8h

        st1             {v0.s}[0],  [x0], x2            // row 0
        st1             {v1.s}[1],  [x0], x2            // row 1
        st1             {v0.s}[1],  [x0], x2            // row 2
        st1             {v1.s}[0],  [x0], x2            // row 3

        sub             x1,  x1,  #32                   // undo the two post-increments
        ret
endfunc
78
// DC-only variant for a 4x4 block: the first coefficient is rounded and
// shifted right by 6, then added to all sixteen pixels with unsigned
// saturation.
//
// In:     x0 = pixels, x1 = coefficient block, w2 = row stride in bytes
// Effect: the 16-bit coefficient at [x1] is set to zero
// Exit:   x0 points four rows further on, x1 is unchanged
// Uses:   w3, v0-v4
function ff_h264_idct_dc_add_neon, export=1
        mov             w3,       #0
        sxtw            x2,  w2
        ld1r            {v2.8h},  [x1]                  // coefficient replicated to all lanes
        strh            w3,       [x1]                  // clear it in memory
        srshr           v2.8h,  v2.8h,  #6
        ld1             {v0.s}[0],  [x0], x2            // rows 0 and 1 into v0
        ld1             {v0.s}[1],  [x0], x2
        uaddw           v3.8h,  v2.8h,  v0.8b
        ld1             {v1.s}[0],  [x0], x2            // rows 2 and 3 into v1
        ld1             {v1.s}[1],  [x0], x2
        uaddw           v4.8h,  v2.8h,  v1.8b
        sqxtun          v0.8b,  v3.8h
        sqxtun          v1.8b,  v4.8h
        sub             x0,  x0,  x2, lsl #2            // rewind to row 0
        st1             {v0.s}[0],  [x0], x2
        st1             {v0.s}[1],  [x0], x2
        st1             {v1.s}[0],  [x0], x2
        st1             {v1.s}[1],  [x0], x2
        ret
endfunc
100
// Runs the 4x4 add routines over sixteen consecutive coefficient blocks.
//
// In:   x0 = dest, x1 = block_offset (32-bit signed entries), x2 = block,
//       w3 = stride, x4 = byte table indexed by scan8[i] (presumably the
//       per-block non-zero coefficient counts - confirm with the caller)
// For each i in 0..15, with n = x4[scan8[i]]:
//       n == 0                         nothing is done
//       n == 1 and block[i*16] != 0    ff_h264_idct_dc_add_neon
//       otherwise                      ff_h264_idct_add_neon
// The return address is parked in x12 across the indirect calls; x5-x7, x9,
// x10 and x12-x14 therefore have to survive both callees.
function ff_h264_idct_add16_neon, export=1
        mov             w9,  w3         // stride
        mov             x12, x30
        mov             x6,  x0         // dest
        mov             x5,  x1         // block_offset
        mov             x1,  x2         // block
        movrel          x7,  scan8
        mov             x10, #16        // blocks left
        movrel          x13, X(ff_h264_idct_dc_add_neon)
        movrel          x14, X(ff_h264_idct_add_neon)
1:      mov             w2,  w9         // callee argument: stride
        ldrb            w3,  [x7], #1   // scan8[i]
        ldrsw           x0,  [x5], #4   // block_offset[i]
        ldrb            w3,  [x4,  w3,  uxtw]
        add             x0,  x6,  x0    // callee argument: dest + block_offset[i]
        subs            w3,  w3,  #1    // lt: entry was 0, eq: entry was 1
        b.lt            2f
        ldrsh           w3,  [x1]       // first coefficient of this block
        ccmp            w3,  #0,  #4,  eq       // entry was 1: test the coefficient, else force Z
        csel            x15, x13, x14, ne       // ne only for entry 1 with a non-zero coefficient
        blr             x15
2:      add             x1,  x1,  #32   // next block (16 coefficients)
        subs            x10, x10, #1
        b.ne            1b
        ret             x12
endfunc
127
// Intra flavour of the sixteen-block 4x4 driver.
//
// In:   x0 = dest, x1 = block_offset (32-bit signed entries), x2 = block,
//       w3 = stride, x4 = byte table indexed by scan8[i] (presumably the
//       per-block non-zero coefficient counts - confirm with the caller)
// For each i in 0..15, with n = x4[scan8[i]]:
//       n != 0                         ff_h264_idct_add_neon
//       n == 0 and block[i*16] != 0    ff_h264_idct_dc_add_neon
//       n == 0 and block[i*16] == 0    nothing is done
// The return address is parked in x12 across the indirect calls.
function ff_h264_idct_add16intra_neon, export=1
        mov             w9,  w3         // stride
        mov             x12, x30
        mov             x6,  x0         // dest
        mov             x5,  x1         // block_offset
        mov             x1,  x2         // block
        movrel          x7,  scan8
        mov             x10, #16        // blocks left
        movrel          x13, X(ff_h264_idct_dc_add_neon)
        movrel          x14, X(ff_h264_idct_add_neon)
1:      mov             w2,  w9         // callee argument: stride
        ldrb            w3,  [x7], #1   // scan8[i]
        ldrsw           x0,  [x5], #4   // block_offset[i]
        add             x0,  x6,  x0    // callee argument: dest + block_offset[i]
        ldrb            w3,  [x4,  w3,  uxtw]
        cmp             w3,  #0         // eq: table entry is zero
        ldrsh           w3,  [x1]       // first coefficient of this block
        csel            x15, x13, x14, eq       // zero entry selects the DC-only routine
        ccmp            w3,  #0,  #0,  eq       // zero entry: test the coefficient, else clear Z
        b.eq            2f              // zero entry and zero coefficient: skip
        blr             x15
2:      add             x1,  x1,  #32   // next block (16 coefficients)
        subs            x10, x10, #1
        b.ne            1b
        ret             x12
endfunc
154
// Driver for two groups of four 4x4 blocks, one group per destination
// pointer.
//
// In:   x0 = pointer to two destination pointers (dest[0], dest[1])
//       x1 = block_offset (32-bit signed entries), x2 = block, w3 = stride
//       x4 = byte table indexed by scan8[i] (labelled nnzc below)
// All three tables are biased by sixteen entries up front, so the loop
// counter values 0..3 and 16..19 address entries 16..19 and 32..35. The
// first four blocks are added to dest[0], the last four to dest[1].
// Per block: a non-zero table entry selects ff_h264_idct_add_neon; a zero
// entry with a non-zero first coefficient selects ff_h264_idct_dc_add_neon;
// otherwise the block is skipped.
// x19 and x20 are saved in a 0x40-byte stack frame; the return address is
// parked in x12 across the indirect calls.
function ff_h264_idct_add8_neon, export=1
        stp             x19, x20, [sp, #-0x40]!
        mov             x12, x30
        ldp             x6,  x15, [x0]          // dest[0], dest[1]
        add             x5,  x1,  #16*4         // block_offset
        add             x9,  x2,  #16*32        // block
        mov             w19, w3                 // stride
        movrel          x13, X(ff_h264_idct_dc_add_neon)
        movrel          x14, X(ff_h264_idct_add_neon)
        movrel          x7,  scan8+16
        mov             x10, #0                 // loop counter i
        mov             x11, #16                // constant for the jump from 4 to 16
1:      mov             w2,  w19                // callee argument: stride
        ldrb            w3,  [x7, x10]          // scan8[i]
        ldrsw           x0,  [x5, x10, lsl #2]  // block_offset[i]
        ldrb            w3,  [x4, w3,  uxtw]    // nnzc[ scan8[i] ]
        add             x0,  x6,  x0            // block_offset[i] + dst[j-1]
        add             x1,  x9,  x10, lsl #5   // block + i * 16
        cmp             w3,  #0                 // eq: table entry is zero
        ldrsh           w3,  [x1]               // block[i*16]
        csel            x20, x13, x14, eq       // zero entry selects the DC-only routine
        ccmp            w3,  #0,  #0,  eq       // zero entry: test the coefficient, else clear Z
        b.eq            2f                      // zero entry and zero coefficient: skip
        blr             x20
2:      add             x10, x10, #1
        cmp             x10, #4
        csel            x6,  x15, x6,  eq       // after the fourth block switch to dest[1]
        csel            x10, x11, x10, eq       // mov x10, #16
        cmp             x10, #20
        b.lt            1b
        ldp             x19, x20, [sp], #0x40
        ret             x12
endfunc
190
// One 1-D pass of the 8x8 inverse transform, working lane-wise on eight
// .8h registers.
//
// pass 0: v24-v29 must already hold the first six inputs. The last two
//         (v30, v31) are loaded here from [x1]; the 32 bytes they came from
//         are then overwritten with v19 as it stands on entry (the one user
//         in this file zeroes v19 beforehand) and x1 advances by 32.
//         Results are left in v24-v29, v18, v31.
// pass 1: inputs are v24-v29, v18, v31; results are left in v24-v31.
// Temporaries in both passes: v16, v17, v19. The aliases va and vb are
// defined at the top and released again at the bottom.
.macro  idct8x8_cols    pass
  .if \pass == 0
        va      .req    v18
        vb      .req    v30
        sshr            v18.8h, v26.8h, #1
        add             v16.8h, v24.8h, v28.8h
        ld1             {v30.8h, v31.8h}, [x1]
        st1             {v19.8h}, [x1],  #16
        st1             {v19.8h}, [x1],  #16
        sub             v17.8h,  v24.8h, v28.8h
        sshr            v19.8h,  v30.8h, #1
        sub             v18.8h,  v18.8h,  v30.8h        // (v26 >> 1) - v30
        add             v19.8h,  v19.8h,  v26.8h        // (v30 >> 1) + v26
  .else
        va      .req    v30
        vb      .req    v18
        sshr            v30.8h, v26.8h, #1
        sshr            v19.8h, v18.8h, #1
        add             v16.8h, v24.8h, v28.8h
        sub             v17.8h, v24.8h, v28.8h
        sub             v30.8h, v30.8h, v18.8h          // (v26 >> 1) - v18
        add             v19.8h, v19.8h, v26.8h          // (v18 >> 1) + v26
  .endif
        // Combine the four terms built from v24, v26, v28 and the seventh input.
        add             v26.8h, v17.8h, va.8h
        sub             v28.8h, v17.8h, va.8h
        add             v24.8h, v16.8h, v19.8h
        sub             vb.8h,  v16.8h, v19.8h
        // Terms built from v25, v27, v29, v31: each is a signed sum of three
        // inputs, followed by a fourth contribution that is halved first.
        sub             v16.8h, v29.8h, v27.8h
        add             v17.8h, v31.8h, v25.8h
        sub             va.8h,  v31.8h, v25.8h
        add             v19.8h, v29.8h, v27.8h
        sub             v16.8h, v16.8h, v31.8h
        sub             v17.8h, v17.8h, v27.8h
        add             va.8h,  va.8h,  v29.8h
        add             v19.8h, v19.8h, v25.8h
        sshr            v25.8h, v25.8h, #1
        sshr            v27.8h, v27.8h, #1
        sshr            v29.8h, v29.8h, #1
        sshr            v31.8h, v31.8h, #1
        sub             v16.8h, v16.8h, v31.8h
        sub             v17.8h, v17.8h, v27.8h
        add             va.8h,  va.8h,  v29.8h
        add             v19.8h, v19.8h, v25.8h
        // Cross-mix the four terms with copies shifted right by 2.
        sshr            v25.8h, v16.8h, #2
        sshr            v27.8h, v17.8h, #2
        sshr            v29.8h, va.8h,  #2
        sshr            v31.8h, v19.8h, #2
        sub             v19.8h, v19.8h, v25.8h
        sub             va.8h,  v27.8h, va.8h
        add             v17.8h, v17.8h, v29.8h
        add             v16.8h, v16.8h, v31.8h
        // Final sums and differences; the two variants differ only in which
        // of v18 and v30 carries a result out of the pass.
  .if \pass == 0
        sub             v31.8h, v24.8h, v19.8h
        add             v24.8h, v24.8h, v19.8h
        add             v25.8h, v26.8h, v18.8h
        sub             v18.8h, v26.8h, v18.8h
        add             v26.8h, v28.8h, v17.8h
        add             v27.8h, v30.8h, v16.8h
        sub             v29.8h, v28.8h, v17.8h
        sub             v28.8h, v30.8h, v16.8h
  .else
        sub             v31.8h, v24.8h, v19.8h
        add             v24.8h, v24.8h, v19.8h
        add             v25.8h, v26.8h, v30.8h
        sub             v30.8h, v26.8h, v30.8h
        add             v26.8h, v28.8h, v17.8h
        sub             v29.8h, v28.8h, v17.8h
        add             v27.8h, v18.8h, v16.8h
        sub             v28.8h, v18.8h, v16.8h
  .endif
        .unreq          va
        .unreq          vb
.endm
264
// 8x8 inverse transform of the coefficient block at x1, added to the 8x8
// pixel block at x0 and written back with unsigned saturation.
//
// In:     x0 = pixels (8 bytes are read and written per row)
//         x1 = sixty-four 16-bit coefficients (128 bytes)
//         w2 = row stride in bytes
// Effect: the 128 coefficient bytes are overwritten with zeros
// Exit:   x1 is back at its entry value; x0 and x3 point eight rows
//         further on; x2 holds the sign-extended stride
// Uses:   x3, v0-v7, v16-v19, v24-v31
function ff_h264_idct8_add_neon, export=1
        movi            v19.8h,   #0
        // The stride arrives as a 32-bit value but x2 is used as a 64-bit
        // post-index register below, so it has to be sign-extended first;
        // otherwise a negative stride (or stale upper bits) gives wrong
        // addresses. The other leaf routines in this file do the same.
        sxtw            x2,       w2
        // Load the first six rows of coefficients, clearing each 32-byte
        // pair right after it has been read. The last two rows are loaded
        // and cleared inside pass 0 of idct8x8_cols.
        ld1             {v24.8h, v25.8h}, [x1]
        st1             {v19.8h},  [x1],   #16
        st1             {v19.8h},  [x1],   #16
        ld1             {v26.8h, v27.8h}, [x1]
        st1             {v19.8h},  [x1],   #16
        st1             {v19.8h},  [x1],   #16
        ld1             {v28.8h, v29.8h}, [x1]
        st1             {v19.8h},  [x1],   #16
        st1             {v19.8h},  [x1],   #16

        // Pass 0 leaves its results in v24-v29, v18, v31, which is the
        // register list handed to the transpose; v6 and v7 are passed as
        // the macro's last two operands (presumably temporaries - the macro
        // lives in neon.S).
        idct8x8_cols    0
        transpose_8x8H  v24, v25, v26, v27, v28, v29, v18, v31, v6, v7
        idct8x8_cols    1

        // Rounding shift right by 6, interleaved with the eight pixel-row
        // loads. x3 keeps the start of the block for the stores.
        mov             x3,  x0
        srshr           v24.8h, v24.8h, #6
        ld1             {v0.8b},     [x0], x2
        srshr           v25.8h, v25.8h, #6
        ld1             {v1.8b},     [x0], x2
        srshr           v26.8h, v26.8h, #6
        ld1             {v2.8b},     [x0], x2
        srshr           v27.8h, v27.8h, #6
        ld1             {v3.8b},     [x0], x2
        srshr           v28.8h, v28.8h, #6
        ld1             {v4.8b},     [x0], x2
        srshr           v29.8h, v29.8h, #6
        ld1             {v5.8b},     [x0], x2
        srshr           v30.8h, v30.8h, #6
        ld1             {v6.8b},     [x0], x2
        srshr           v31.8h, v31.8h, #6
        ld1             {v7.8b},     [x0], x2
        // Add the widened pixels, narrow with unsigned saturation and store
        // the rows back through x3.
        uaddw           v24.8h, v24.8h, v0.8b
        uaddw           v25.8h, v25.8h, v1.8b
        uaddw           v26.8h, v26.8h, v2.8b
        sqxtun          v0.8b,  v24.8h
        uaddw           v27.8h, v27.8h, v3.8b
        sqxtun          v1.8b,  v25.8h
        uaddw           v28.8h, v28.8h, v4.8b
        sqxtun          v2.8b,  v26.8h
        st1             {v0.8b},     [x3], x2
        uaddw           v29.8h, v29.8h, v5.8b
        sqxtun          v3.8b,  v27.8h
        st1             {v1.8b},     [x3], x2
        uaddw           v30.8h, v30.8h, v6.8b
        sqxtun          v4.8b,  v28.8h
        st1             {v2.8b},     [x3], x2
        uaddw           v31.8h, v31.8h, v7.8b
        sqxtun          v5.8b,  v29.8h
        st1             {v3.8b},     [x3], x2
        sqxtun          v6.8b,  v30.8h
        sqxtun          v7.8b,  v31.8h
        st1             {v4.8b},     [x3], x2
        st1             {v5.8b},     [x3], x2
        st1             {v6.8b},     [x3], x2
        st1             {v7.8b},     [x3], x2

        sub             x1,  x1,  #128                  // undo the eight 16-byte post-increments
        ret
endfunc
326
// DC-only variant for an 8x8 block: the first coefficient is rounded and
// shifted right by 6, then added to all sixty-four pixels with unsigned
// saturation.
//
// In:     x0 = pixels, x1 = coefficient block, w2 = row stride in bytes
// Effect: the 16-bit coefficient at [x1] is set to zero
// Exit:   x0 points eight rows further on, x1 is unchanged
// Uses:   w3, v0-v7, v24-v31
function ff_h264_idct8_dc_add_neon, export=1
        sxtw            x2,       w2
        mov             w3,       #0
        ld1r            {v31.8h}, [x1]                  // coefficient replicated to all lanes
        strh            w3,       [x1]                  // clear it in memory
        ld1             {v0.8b},  [x0], x2
        srshr           v31.8h, v31.8h, #6
        ld1             {v1.8b},     [x0], x2
        ld1             {v2.8b},     [x0], x2
        uaddw           v24.8h, v31.8h, v0.8b
        ld1             {v3.8b},     [x0], x2
        uaddw           v25.8h, v31.8h, v1.8b
        ld1             {v4.8b},     [x0], x2
        uaddw           v26.8h, v31.8h, v2.8b
        ld1             {v5.8b},     [x0], x2
        uaddw           v27.8h, v31.8h, v3.8b
        ld1             {v6.8b},     [x0], x2
        uaddw           v28.8h, v31.8h, v4.8b
        ld1             {v7.8b},     [x0], x2
        uaddw           v29.8h, v31.8h, v5.8b
        uaddw           v30.8h, v31.8h, v6.8b
        uaddw           v31.8h, v31.8h, v7.8b           // last use of the DC value, so v31 is reused
        sqxtun          v0.8b,  v24.8h
        sqxtun          v1.8b,  v25.8h
        sqxtun          v2.8b,  v26.8h
        sqxtun          v3.8b,  v27.8h
        sub             x0,  x0,  x2, lsl #3            // rewind to row 0
        st1             {v0.8b},     [x0], x2
        sqxtun          v4.8b,  v28.8h
        st1             {v1.8b},     [x0], x2
        sqxtun          v5.8b,  v29.8h
        st1             {v2.8b},     [x0], x2
        sqxtun          v6.8b,  v30.8h
        st1             {v3.8b},     [x0], x2
        sqxtun          v7.8b,  v31.8h
        st1             {v4.8b},     [x0], x2
        st1             {v5.8b},     [x0], x2
        st1             {v6.8b},     [x0], x2
        st1             {v7.8b},     [x0], x2
        ret
endfunc
368
// Runs the 8x8 add routines over four consecutive 8x8 coefficient blocks.
//
// In:   x0 = dest, x1 = block_offset (32-bit signed entries), x2 = block,
//       w3 = stride, x4 = byte table indexed by scan8[i] (presumably the
//       per-block non-zero coefficient counts - confirm with the caller)
// For i = 0, 4, 8, 12, with n = x4[scan8[i]]:
//       n == 0                         nothing is done
//       n == 1 and block[i*16] != 0    ff_h264_idct8_dc_add_neon
//       otherwise                      ff_h264_idct8_add_neon
// The stride is placed in w2 once, before the loop, so both callees are
// relied upon to leave the low 32 bits of x2 intact. The return address is
// parked in x12 across the indirect calls.
function ff_h264_idct8_add4_neon, export=1
        mov             x12, x30
        mov             w10, #16                // counts down in steps of 4
        mov             x6,  x0                 // dest
        mov             x5,  x1                 // block_offset
        mov             x1,  x2                 // block
        mov             w2,  w3                 // stride
        movrel          x7,  scan8
        movrel          x13, X(ff_h264_idct8_dc_add_neon)
        movrel          x14, X(ff_h264_idct8_add_neon)
1:      ldrb            w9,  [x7], #4           // scan8[i], advance four entries
        ldrsw           x0,  [x5], #16          // block_offset[i], advance four entries
        ldrb            w9,  [x4, w9, uxtw]
        subs            w9,  w9,  #1            // lt: entry was 0, eq: entry was 1
        b.lt            2f
        ldrsh           w11,  [x1]              // first coefficient of this block
        add             x0,  x0,  x6            // callee argument: dest + block_offset[i]
        ccmp            w11, #0,  #4,  eq       // entry was 1: test the coefficient, else force Z
        csel            x15, x13, x14, ne       // ne only for entry 1 with a non-zero coefficient
        blr             x15
2:      add             x1,  x1,  #128          // next 8x8 block (64 coefficients)
        subs            w10, w10, #4
        b.ne            1b
        ret             x12
endfunc
394
// Emits four table bytes for one 2x2 group whose first entry is
// col + row*8: two neighbouring columns on the given row, then the same two
// columns on the following row.
.macro  scan8_2x2       col, row
        .byte           \col+\row*8, \col+1+\row*8, \col+(\row+1)*8, \col+1+(\row+1)*8
.endm

// Forty-eight byte indices, stored as three runs of sixteen. The routines
// in this file read an entry and use it to index the byte table they are
// given in x4. Each index has the form column + row*8.
const   scan8
        scan8_2x2       4,  1
        scan8_2x2       6,  1
        scan8_2x2       4,  3
        scan8_2x2       6,  3
        scan8_2x2       4,  6
        scan8_2x2       6,  6
        scan8_2x2       4,  8
        scan8_2x2       6,  8
        scan8_2x2       4, 11
        scan8_2x2       6, 11
        scan8_2x2       4, 13
        scan8_2x2       6, 13
endconst
409