1/*
2 * Copyright (c) 2014 Janne Grunau <janne-libav@jannau.net>
3 *
4 * This file is part of FFmpeg.
5 *
6 * FFmpeg is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2.1 of the License, or (at your option) any later version.
10 *
11 * FFmpeg is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14 * Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with FFmpeg; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19 */
20
21#include "libavutil/aarch64/asm.S"
22
#define FRAC_BITS   23   // fractional bits for sb_samples and dct
#define WFRAC_BITS  16   // fractional bits for window
// Shift that scales the 64-bit product accumulator back down to the output
// format: the product carries FRAC_BITS + WFRAC_BITS fractional bits, of
// which 15 are kept in the 16-bit output sample.
#define OUT_SHIFT (WFRAC_BITS + FRAC_BITS - 15)
26
// TBL shuffle mask that reverses the order of the four 32-bit lanes of a
// 128-bit vector (byte order within each lane is preserved):
// lane 3 -> 0, lane 2 -> 1, lane 1 -> 2, lane 0 -> 3.
const   tbl_rev128.s align=4
        .byte           12, 13, 14, 15,  8,  9, 10, 11
        .byte            4,  5,  6,  7,  0,  1,  2,  3
endconst
33
/*
 * apply_window type, st — instantiates ff_mpadsp_apply_window_<type>_neon.
 *
 * \type is "fixed" (int32 input, int16 output with dithered rounding) or
 * "float"; \st is the st1 element specifier for output stores (h or s).
 *
 * Register arguments, as evidenced by the uses below:
 *   x0 = synth buffer (its first 64 values are mirrored to offset 512*4,
 *        giving the wrap-around copy for a circular buffer)
 *   x1 = window coefficients  // presumably the 512-entry MPA window — matches the w/w2 pointers below
 *   x2 = dither_state pointer (read/written in the fixed variant only)
 *   x3 = output samples
 *   w4 = incr, output stride in elements (sign-extended, then scaled to bytes)
 */
.macro   apply_window   type, st
function ff_mpadsp_apply_window_\type\()_neon, export=1
        mov             x7,  x0
        sxtw            x4,  w4  // incr, sign-extended to 64 bits
        add             x8,  x0,  #512<<2
        // Mirror buf[0..31] to buf[512..543] so reads never wrap explicitly.
        ld1             {v0.4s,v1.4s,v2.4s,v3.4s},  [x7],  #64
        ld1             {v4.4s,v5.4s,v6.4s,v7.4s},  [x7],  #64
        st1             {v0.4s,v1.4s,v2.4s,v3.4s},  [x8],  #64
        st1             {v4.4s,v5.4s,v6.4s,v7.4s},  [x8],  #64
        movrel          x15, tbl_rev128.s
        ld1             {v27.4s}, [x15]         // TBL mask: reverse four 32-bit lanes
.ifc \type, fixed
        lsl             x4,  x4,  #1            // incr in bytes (int16 output)
.else
        lsl             x4,  x4,  #2            // incr in bytes (float output)
.endif
        add             x10, x0,  #45<<2
        add             x0,  x0,  #16<<2
        add             x1,  x1,  #16<<2
        add             x5,  x3,  x4,  lsl #5
        sub             x5,  x5,  x4            // samples2 = samples + 31*incr
        neg             x13, x4                 // -incr (samples2 walks backwards)
        mov             x9,  #64<<2             // load stride: 64 elements per row
.ifc \type, fixed
        // Build the rounding machinery: v16 is seeded with the saved
        // dither_state; v30/v31 hold the (1<<OUT_SHIFT)-1 remainder mask in
        // the low resp. high 64-bit lane only; v29 is the all-zero helper.
        ld1r            {v16.2s}, [x2]          // dither_state
        sxtl            v16.2d, v16.2s
        movi            v29.2d, #0
        movi            v30.2d, #(1<<OUT_SHIFT)-1
        trn1            v31.2d, v29.2d, v30.2d  // mask in high lane only
        trn2            v30.2d, v30.2d, v29.2d  // mask in low lane only
        trn1            v16.2d, v16.2d, v29.2d  // accumulator low lane = dither
.else
        movi            v16.4s, #0
        movi            v28.4s, #0
.endif
        mov             x14, #4                 // four outer passes
1:
        // Per-pass pointer set-up: x6/x7 point into the window (w/w2),
        // x8/x10 into the synth buffer; x11/x12 are the +32 window halves.
        mov             x8,  x0
        sub             x7,  x1,  #3<<2
        sub             x6,  x1,  x14, lsl #4
        add             x7,  x7,  x14, lsl #4
        add             x11, x6, #(32)<<2      // w  + 32
        add             x12, x7, #(32)<<2      // w2 + 32
        mov             x15, #8                 // eight accumulation steps
        movi            v17.2d, #0
        movi            v18.2d, #0
        movi            v19.2d, #0
2:
        // One tap: v6/v7 are lane-reversed copies (via TBL) of the synth
        // data so the mirrored window half multiplies the right elements.
        subs            x15, x15, #1
        ld1             {v0.4s},  [x8],  x9
        ld1             {v1.4s},  [x10], x9
        ld1             {v2.4s},  [x6],  x9
        ld1             {v3.4s},  [x7],  x9
        tbl             v6.16b, {v0.16b}, v27.16b
        tbl             v7.16b, {v1.16b}, v27.16b
        ld1             {v4.4s},  [x11], x9
        ld1             {v5.4s},  [x12], x9
        MLA             v16, v2, v0
        MLA2            v17, v2, v0
        MLS             v18, v3, v6
        MLS2            v19, v3, v6
        MLS             v16, v4, v7
        MLS2            v17, v4, v7
        MLS             v18, v5, v1
        MLS2            v19, v5, v1
        b.gt            2b

        cmp             x14, #4                 // first outer pass? (flags used twice below)
        sub             x10, x10, #64<<5        // 64 * 8 * sizeof(int32_t)

.ifc \type, fixed
        // Round each 64-bit accumulator down to int16, threading the
        // rounding remainder from sample to sample through v28; the first
        // pass skips the v19 high-lane sample (b.eq), matching the skipped
        // store below.
        and             v28.16b, v16.16b, v30.16b
        ext             v28.16b, v29.16b, v28.16b, #8

        b.eq            4f
        round_sample    v19, 1, 1
4:
        round_sample    v16, 1, 0
        shrn            v16.2s, v16.2d,  #OUT_SHIFT
        round_sample    v19, 0, 0
        shrn            v19.2s, v19.2d,  #OUT_SHIFT
        round_sample    v17, 0, 1
        round_sample    v18, 1, 1
        round_sample    v17, 1, 0
        shrn2           v16.4s, v17.2d,  #OUT_SHIFT
        round_sample    v18, 0, 0
        shrn2           v19.4s, v18.2d,  #OUT_SHIFT
        sqxtn           v16.4h, v16.4s          // saturate to int16
        sqxtn           v18.4h, v19.4s
.else
        ext             v18.16b, v18.16b, v18.16b, #8   // swap halves for lane order below
.endif

        // Interleaved output: v16 lanes go forward from samples (x3, +incr),
        // v18 lanes go backward from samples2 (x5, -incr); the very first
        // backward store is skipped on the first pass.
        st1             {v16.\st\()}[0], [x3], x4
        b.eq            4f
        st1             {v18.\st\()}[1], [x5], x13
4:
        st1             {v16.\st\()}[1], [x3], x4
        st1             {v18.\st\()}[0], [x5], x13
        st1             {v16.\st\()}[2], [x3], x4
        st1             {v18.\st\()}[3], [x5], x13
        st1             {v16.\st\()}[3], [x3], x4
        st1             {v18.\st\()}[2], [x5], x13

        mov             v16.16b, v28.16b        // carry remainder into next pass

        subs            x14, x14, #1
        add             x0,  x0,  #4<<2
        sub             x10, x10, #4<<2
        b.gt            1b

// computing samples[16]
        add             x6,  x1,  #32<<2
        ld1             {v0.2s},  [x6],  x9
        ld1             {v1.2s},  [x0],  x9
.rept   3
        ld1             {v2.2s},  [x6],  x9
        ld1             {v3.2s},  [x0],  x9
        MLS             v16, v0,  v1
        ld1             {v0.2s},  [x6],  x9
        ld1             {v1.2s},  [x0],  x9
        MLS             v16, v2,  v3
.endr
        ld1             {v2.2s},  [x6],  x9
        ld1             {v3.2s},  [x0],  x9
        MLS             v16, v0,  v1
        MLS             v16, v2,  v3

.ifc \type, fixed
        and             v28.16b, v16.16b, v30.16b   // low OUT_SHIFT bits -> new dither
        shrn            v20.2s,  v16.2d,  #OUT_SHIFT
        xtn             v28.2s,  v28.2d
        sqxtn           v20.4h,  v20.4s
        st1             {v28.s}[0], [x2]        // save dither_state
        st1             {v20.h}[0], [x3]
.else
        st1             {v16.s}[0], [x3]
.endif

        ret
endfunc
// Drop the per-variant helper macros so the next apply_window instantiation
// can redefine them for its element type.
.purgem round_sample
.purgem MLA
.purgem MLA2
.purgem MLS
.purgem MLS2
.endm
181
182
// round_sample r, idx, next  (fixed-point variant only)
// Adds the pending rounding remainder (v28) into accumulator \r, then
// captures \r's low OUT_SHIFT bits as the new remainder.  \idx selects
// which 64-bit lane's mask is applied (v30 masks the low lane, v31 the
// high lane; see their trn1/trn2 set-up in apply_window).  When \next
// differs from \idx, the captured remainder is moved to the other lane
// with EXT so it lines up with the lane the following call consumes.
.macro  round_sample    r, idx, next
        add             \r\().2d, \r\().2d, v28.2d
.if \idx == 0
        and             v28.16b,  \r\().16b,  v30.16b   // keep low-lane remainder
.else // \idx == 1
        and             v28.16b,  \r\().16b,  v31.16b   // keep high-lane remainder
.endif
.if \idx != \next
  .if \next == 0
        ext             v28.16b, v28.16b, v29.16b, #8   // move high lane down (v29 = 0)
  .else
        ext             v28.16b, v29.16b, v28.16b, #8   // move low lane up
  .endif
.endif
.endm
// Fixed-point multiply-accumulate helpers: widening 32x32 -> 64-bit
// signed MAC into 2x64-bit accumulators.  MLA/MLS consume the low two
// 32-bit lanes of the sources, MLA2/MLS2 the high two.
.macro  MLA             d, s1, s2
        smlal           \d\().2d, \s1\().2s, \s2\().2s
.endm
.macro  MLA2            d, s1, s2
        smlal2          \d\().2d, \s1\().4s, \s2\().4s
.endm
.macro  MLS             d, s1, s2
        smlsl           \d\().2d, \s1\().2s, \s2\().2s
.endm
.macro  MLS2            d, s1, s2
        smlsl2          \d\().2d, \s1\().4s, \s2\().4s
.endm
// Instantiate the fixed-point variant (int16 output, "h" store elements):
// ff_mpadsp_apply_window_fixed_neon.
apply_window fixed, h
211
212
// Float variant helpers: no dither/rounding is required, so round_sample
// and the high-lane MLA2/MLS2 expand to nothing; MLA/MLS are full 4-lane
// fused multiply-add/subtract (the float path accumulates all four lanes
// in one register instead of two 64-bit accumulator pairs).
.macro  round_sample    r, idx, next
.endm
.macro  MLA2            d, s1, s2
.endm
.macro  MLS2            d, s1, s2
.endm
.macro  MLA             d, s1, s2
        fmla            \d\().4s, \s1\().4s, \s2\().4s
.endm
.macro  MLS             d, s1, s2
        fmls            \d\().4s, \s1\().4s, \s2\().4s
.endm
// Instantiate the float variant ("s" store elements):
// ff_mpadsp_apply_window_float_neon.
apply_window float, s
227