1/*
2 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
3 * Copyright (c) 2013 Janne Grunau <janne-libav@jannau.net>
4 *
5 * This file is part of FFmpeg.
6 *
7 * FFmpeg is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU Lesser General Public
9 * License as published by the Free Software Foundation; either
10 * version 2.1 of the License, or (at your option) any later version.
11 *
12 * FFmpeg is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15 * Lesser General Public License for more details.
16 *
17 * You should have received a copy of the GNU Lesser General Public
18 * License along with FFmpeg; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20 */
21
22#include "libavutil/aarch64/asm.S"
23#include "neon.S"
24
// Common prologue of the H.264 loop filter entry points in this file.
//
// In:   w2 = alpha, w3 = beta (the names used by the comments in the filter
//       macros), x4 = address of a 32-bit word holding four signed bytes
// Out:  v24.s[0] = that word, unmodified
// Uses: w6 and the condition flags as scratch
//
// Returns straight to the caller when there is nothing to filter:
//   - w2 == 0 or w3 == 0, or
//   - all four bytes read from [x4] are negative.
// Otherwise execution continues after the macro (label 2).
.macro  h264_loop_filter_start
        cmp             w2,  #0
        ldr             w6,  [x4]
        // w2 != 0: Z = (w3 == 0).
        // w2 == 0: the flags are forced to #4 (Z set) so that the b.eq below
        // takes the early exit as well.  With #0 here the w2 == 0 case fell
        // through and ran the whole filter with an all-false mask.
        ccmp            w3,  #0, #4, ne
        mov             v24.s[0], w6
        and             w6,  w6,  w6,  lsl #16  // plain AND: flags stay intact
        b.eq            1f
        // After this, bit 31 of w6 (and so the N flag) is the AND of the sign
        // bits of all four bytes; a logical op leaves V clear, so "ge" means
        // that at least one byte is non-negative.
        ands            w6,  w6,  w6,  lsl #8
        b.ge            2f
1:
        ret
2:
.endm
38
// Luma edge filter core for 16 pixels at once; performs no loads or stores.
//
// In:   v20 = p2, v18 = p1, v16 = p0, v0 = q0, v2 = q1, v4 = q2
//       v24.s[0] = four signed clip bytes as left by h264_loop_filter_start
//       w2 = alpha, w3 = beta
// Out:  v17 = p1', v16 = p0', v0 = q0', v19 = q1'
// Uses: v4, v20-v24, v28, v30 as scratch
//
// Each clip byte is spread over four neighbouring lanes.  Lanes that fail
// the alpha/beta tests or have a negative clip byte keep their input values.
.macro  h264_loop_filter_luma
        dup             v22.16b, w2                     // alpha in every lane
        uxtl            v24.8h,  v24.8b
        uabd            v21.16b, v16.16b, v0.16b        // |p0 - q0|
        uxtl            v24.4s,  v24.4h                 // one clip byte per 32-bit lane
        uabd            v28.16b, v18.16b, v16.16b       // |p1 - p0|
        sli             v24.8h,  v24.8h,  #8
        uabd            v30.16b, v2.16b,  v0.16b        // |q1 - q0|
        sli             v24.4s,  v24.4s,  #16           // clip byte now fills 4 lanes
        cmhi            v21.16b, v22.16b, v21.16b       // |p0 - q0| < alpha
        dup             v22.16b, w3                     // beta in every lane
        cmlt            v23.16b, v24.16b, #0            // lanes with clip < 0
        cmhi            v28.16b, v22.16b, v28.16b       // |p1 - p0| < beta
        cmhi            v30.16b, v22.16b, v30.16b       // |q1 - q0| < beta
        bic             v21.16b, v21.16b, v23.16b       // drop lanes with clip < 0
        uabd            v17.16b, v20.16b, v16.16b       // |p2 - p0|
        and             v21.16b, v21.16b, v28.16b
        uabd            v19.16b, v4.16b,  v0.16b        // |q2 - q0|
        cmhi            v17.16b, v22.16b, v17.16b       // |p2 - p0| < beta
        and             v21.16b, v21.16b, v30.16b       // v21 = lanes to filter
        cmhi            v19.16b, v22.16b, v19.16b       // |q2 - q0| < beta
        and             v17.16b, v17.16b, v21.16b       // v17 = lanes where p1 changes
        and             v19.16b, v19.16b, v21.16b       // v19 = lanes where q1 changes
        and             v24.16b, v24.16b, v21.16b       // clip = 0 in unfiltered lanes
        urhadd          v28.16b, v16.16b, v0.16b        // avg = (p0 + q0 + 1) >> 1
        sub             v21.16b, v24.16b, v17.16b       // limit = clip + 1 where v17 set (mask is -1)
        uqadd           v23.16b, v18.16b, v24.16b       // p1 + clip
        uhadd           v20.16b, v20.16b, v28.16b       // (p2 + avg) >> 1
        sub             v21.16b, v21.16b, v19.16b       // limit += 1 where v19 set
        uhadd           v28.16b, v4.16b,  v28.16b       // (q2 + avg) >> 1
        umin            v23.16b, v23.16b, v20.16b
        uqsub           v22.16b, v18.16b, v24.16b       // p1 - clip
        uqadd           v4.16b,  v2.16b,  v24.16b       // q1 + clip
        umax            v23.16b, v23.16b, v22.16b       // candidate p1'
        uqsub           v22.16b, v2.16b,  v24.16b       // q1 - clip
        umin            v28.16b, v4.16b,  v28.16b
        uxtl            v4.8h,   v0.8b                  // widen q0 (low half)
        umax            v28.16b, v28.16b, v22.16b       // candidate q1'
        uxtl2           v20.8h,  v0.16b                 // widen q0 (high half)
        usubw           v4.8h,   v4.8h,   v16.8b        // q0 - p0
        usubw2          v20.8h,  v20.8h,  v16.16b
        shl             v4.8h,   v4.8h,   #2            // 4 * (q0 - p0)
        shl             v20.8h,  v20.8h,  #2
        uaddw           v4.8h,   v4.8h,   v18.8b        // + p1
        uaddw2          v20.8h,  v20.8h,  v18.16b
        usubw           v4.8h,   v4.8h,   v2.8b         // - q1
        usubw2          v20.8h,  v20.8h,  v2.16b
        rshrn           v4.8b,   v4.8h,   #3            // rounding >> 3, narrowed to bytes
        rshrn2          v4.16b,  v20.8h,  #3
        bsl             v17.16b, v23.16b, v18.16b       // p1' where selected, else p1
        bsl             v19.16b, v28.16b, v2.16b        // q1' where selected, else q1
        neg             v23.16b, v21.16b                // -limit
        uxtl            v28.8h,  v16.8b                 // widen p0 (low half)
        smin            v4.16b,  v4.16b,  v21.16b       // delta <= limit
        uxtl2           v21.8h,  v16.16b                // widen p0 (high half); limit no longer needed
        smax            v4.16b,  v4.16b,  v23.16b       // delta >= -limit
        uxtl            v22.8h,  v0.8b                  // widen q0 again
        uxtl2           v24.8h,  v0.16b
        saddw           v28.8h,  v28.8h,  v4.8b         // p0 + delta
        saddw2          v21.8h,  v21.8h,  v4.16b
        ssubw           v22.8h,  v22.8h,  v4.8b         // q0 - delta
        ssubw2          v24.8h,  v24.8h,  v4.16b
        sqxtun          v16.8b,  v28.8h                 // saturate to 0..255
        sqxtun2         v16.16b, v21.8h
        sqxtun          v0.8b,   v22.8h
        sqxtun2         v0.16b,  v24.8h
.endm
106
// Filter a horizontal luma edge, 16 pixels wide.
// In:   x0 = address of the first row below the edge (the q0 row),
//       w1 = stride, w2 = alpha, w3 = beta, x4 = address of the clip bytes
// Reads the three rows on each side of the edge and rewrites the two rows
// on each side (p1, p0, q0, q1).
function ff_h264_v_loop_filter_luma_neon, export=1
        h264_loop_filter_start
        sxtw            x1,  w1                 // widen the 32-bit stride for addressing

        ld1             {v0.16b},  [x0], x1     // q0
        ld1             {v2.16b},  [x0], x1     // q1
        ld1             {v4.16b},  [x0], x1     // q2
        sub             x0,  x0,  x1, lsl #2    // x0 is 3 rows past the edge here;
        sub             x0,  x0,  x1, lsl #1    // go back 6 rows to the p2 row
        ld1             {v20.16b}, [x0], x1     // p2
        ld1             {v18.16b}, [x0], x1     // p1
        ld1             {v16.16b}, [x0], x1     // p0

        h264_loop_filter_luma

        sub             x0,  x0,  x1, lsl #1    // back to the p1 row
        st1             {v17.16b}, [x0], x1     // p1'
        st1             {v16.16b}, [x0], x1     // p0'
        st1             {v0.16b},  [x0], x1     // q0'
        st1             {v19.16b}, [x0]         // q1'

        ret
endfunc
130
// Filter a vertical luma edge over 16 rows.
// In:   x0 = address of the first pixel right of the edge in row 0,
//       w1 = stride, w2 = alpha, w3 = beta, x4 = address of the clip bytes
// Reads 8 bytes per row (4 on each side of the edge) from 16 rows and
// writes back the 4 bytes around the edge (p1, p0, q0, q1) of every row.
function ff_h264_h_loop_filter_luma_neon, export=1
        h264_loop_filter_start
        // Sign-extend the stride the same way ff_h264_v_loop_filter_luma_neon
        // does: x1 is used as a 64-bit post-index below, and the upper half
        // of the register cannot be relied on for a 32-bit argument.
        sxtw            x1,  w1

        sub             x0,  x0,  #4            // start 4 pixels left of the edge
        // Rows 0-7 fill the low halves, rows 8-15 the high halves.
        ld1             {v6.8b},    [x0], x1
        ld1             {v20.8b},   [x0], x1
        ld1             {v18.8b},   [x0], x1
        ld1             {v16.8b},   [x0], x1
        ld1             {v0.8b},    [x0], x1
        ld1             {v2.8b},    [x0], x1
        ld1             {v4.8b},    [x0], x1
        ld1             {v26.8b},   [x0], x1
        ld1             {v6.d}[1],  [x0], x1
        ld1             {v20.d}[1], [x0], x1
        ld1             {v18.d}[1], [x0], x1
        ld1             {v16.d}[1], [x0], x1
        ld1             {v0.d}[1],  [x0], x1
        ld1             {v2.d}[1],  [x0], x1
        ld1             {v4.d}[1],  [x0], x1
        ld1             {v26.d}[1], [x0], x1

        // transpose_8x16B is defined in neon.S; presumably it leaves one
        // pixel column per register so that v20/v18/v16/v0/v2/v4 hold
        // p2/p1/p0/q0/q1/q2 as the filter macro expects (v21, v23 = scratch).
        transpose_8x16B v6, v20, v18, v16, v0, v2, v4, v26, v21, v23

        h264_loop_filter_luma

        // Turn the four result columns back into 4-byte row fragments.
        transpose_4x16B v17, v16, v0, v19, v21, v23, v25, v27

        sub             x0,  x0,  x1, lsl #4    // rewind the 16 rows read above
        add             x0,  x0,  #2            // 2 pixels left of the edge
        st1             {v17.s}[0], [x0], x1
        st1             {v16.s}[0], [x0], x1
        st1             {v0.s}[0],  [x0], x1
        st1             {v19.s}[0], [x0], x1
        st1             {v17.s}[1], [x0], x1
        st1             {v16.s}[1], [x0], x1
        st1             {v0.s}[1],  [x0], x1
        st1             {v19.s}[1], [x0], x1
        st1             {v17.s}[2], [x0], x1
        st1             {v16.s}[2], [x0], x1
        st1             {v0.s}[2],  [x0], x1
        st1             {v19.s}[2], [x0], x1
        st1             {v17.s}[3], [x0], x1
        st1             {v16.s}[3], [x0], x1
        st1             {v0.s}[3],  [x0], x1
        st1             {v19.s}[3], [x0], x1

        ret
endfunc
179
// Chroma edge filter core for 8 pixels at once; performs no loads or stores.
//
// In:   v18 = p1, v16 = p0, v0 = q0, v2 = q1 (low 8 bytes of each)
//       v24.s[0] = four signed clip bytes as left by h264_loop_filter_start
//       w2 = alpha, w3 = beta
// Out:  v16 = p0', v0 = q0'
// Uses: v4, v22, v24-v26, v28, v30 as scratch
//
// Each clip byte is spread over two neighbouring lanes.
.macro  h264_loop_filter_chroma
        dup             v22.8b,  w2                     // alpha in every lane
        uxtl            v24.8h,  v24.8b
        uabd            v26.8b,  v16.8b,  v0.8b         // |p0 - q0|
        uxtl            v4.8h,   v0.8b                  // widen q0
        uabd            v28.8b,  v18.8b,  v16.8b        // |p1 - p0|
        usubw           v4.8h,   v4.8h,   v16.8b        // q0 - p0
        sli             v24.8h,  v24.8h,  #8            // clip byte now fills 2 lanes
        shl             v4.8h,   v4.8h,   #2            // 4 * (q0 - p0)
        uabd            v30.8b,  v2.8b,   v0.8b         // |q1 - q0|
        uaddw           v4.8h,   v4.8h,   v18.8b        // + p1
        cmhi            v26.8b,  v22.8b,  v26.8b        // |p0 - q0| < alpha
        usubw           v4.8h,   v4.8h,   v2.8b         // - q1
        dup             v22.8b,  w3                     // beta in every lane
        rshrn           v4.8b,   v4.8h,   #3            // rounding >> 3, narrowed to bytes
        cmhi            v28.8b,  v22.8b,  v28.8b        // |p1 - p0| < beta
        cmhi            v30.8b,  v22.8b,  v30.8b        // |q1 - q0| < beta
        smin            v4.8b,   v4.8b,   v24.8b        // delta <= clip
        neg             v25.8b,  v24.8b                 // -clip
        and             v26.8b,  v26.8b,  v28.8b
        smax            v4.8b,   v4.8b,   v25.8b        // delta >= -clip
        and             v26.8b,  v26.8b,  v30.8b        // v26 = lanes to filter
        uxtl            v22.8h,  v0.8b                  // widen q0 again
        and             v4.8b,   v4.8b,   v26.8b        // delta = 0 in unfiltered lanes
        uxtl            v28.8h,  v16.8b                 // widen p0
        saddw           v28.8h,  v28.8h,  v4.8b         // p0 + delta
        ssubw           v22.8h,  v22.8h,  v4.8b         // q0 - delta
        sqxtun          v16.8b,  v28.8h                 // saturate to 0..255
        sqxtun          v0.8b,   v22.8h
.endm
210
// Filter a horizontal chroma edge, 8 pixels wide.
// In:   x0 = address of the first row below the edge (the q0 row),
//       w1 = stride, w2 = alpha, w3 = beta, x4 = address of the clip bytes
// Reads two rows on each side of the edge and rewrites one row on each side.
function ff_h264_v_loop_filter_chroma_neon, export=1
        h264_loop_filter_start
        // Sign-extend the stride the same way ff_h264_v_loop_filter_luma_neon
        // does: x1 is used in 64-bit address arithmetic below, and the upper
        // half of the register cannot be relied on for a 32-bit argument.
        sxtw            x1,  w1

        sub             x0,  x0,  x1, lsl #1    // up to the p1 row
        ld1             {v18.8b}, [x0], x1      // p1
        ld1             {v16.8b}, [x0], x1      // p0
        ld1             {v0.8b},  [x0], x1      // q0
        ld1             {v2.8b},  [x0]          // q1 (x0 stays on the q1 row)

        h264_loop_filter_chroma

        sub             x0,  x0,  x1, lsl #1    // back to the p0 row
        st1             {v16.8b}, [x0], x1      // p0'
        st1             {v0.8b},  [x0], x1      // q0'

        ret
endfunc
228
// Filter a vertical chroma edge over 8 rows.
// In:   x0 = address of the first pixel right of the edge in row 0,
//       w1 = stride, w2 = alpha, w3 = beta, x4 = address of the clip bytes
// Reads and rewrites the 4 bytes around the edge (2 on each side) of each
// of the 8 rows.
function ff_h264_h_loop_filter_chroma_neon, export=1
        h264_loop_filter_start
        // Sign-extend the stride the same way ff_h264_v_loop_filter_luma_neon
        // does: x1 is used as a 64-bit post-index below, and the upper half
        // of the register cannot be relied on for a 32-bit argument.
        sxtw            x1,  w1

        sub             x0,  x0,  #2            // start 2 pixels left of the edge
        // Row n goes to register (n mod 4), lane (n / 4).
        ld1             {v18.s}[0], [x0], x1
        ld1             {v16.s}[0], [x0], x1
        ld1             {v0.s}[0],  [x0], x1
        ld1             {v2.s}[0],  [x0], x1
        ld1             {v18.s}[1], [x0], x1
        ld1             {v16.s}[1], [x0], x1
        ld1             {v0.s}[1],  [x0], x1
        ld1             {v2.s}[1],  [x0], x1

        // transpose_4x8B is defined in neon.S; presumably it leaves one
        // pixel column per register so that v18/v16/v0/v2 hold p1/p0/q0/q1
        // as the filter macro expects (v28-v31 = scratch).
        transpose_4x8B  v18, v16, v0, v2, v28, v29, v30, v31

        h264_loop_filter_chroma

        // Back to the row layout used by the loads above.
        transpose_4x8B  v18, v16, v0, v2, v28, v29, v30, v31

        sub             x0,  x0,  x1, lsl #3    // rewind the 8 rows read above
        st1             {v18.s}[0], [x0], x1
        st1             {v16.s}[0], [x0], x1
        st1             {v0.s}[0],  [x0], x1
        st1             {v2.s}[0],  [x0], x1
        st1             {v18.s}[1], [x0], x1
        st1             {v16.s}[1], [x0], x1
        st1             {v0.s}[1],  [x0], x1
        st1             {v2.s}[1],  [x0], x1

        ret
endfunc
260
// Bi-weighted blend of two 16-byte-wide blocks, two rows per iteration.
//
// \macd and \macs are widening multiply-accumulate mnemonics picked by
// biweight_func: \macd is applied to rows read through x0 (weight byte from
// w5 in v0), \macs to rows read through x1 (weight byte from w6 in v1).
//
// In:   x0, x1 = sources, x7 = destination, x2 = stride,
//       w3 = rows left (the loop only stops when it reaches exactly 0),
//       v16 = initial accumulator value, v18 = shift counts for sshl
// Ends with ret.
.macro  biweight_16     macs, macd
        dup             v0.16b,  w5
        dup             v1.16b,  w6
        mov             v4.16b,  v16.16b        // accumulators, row 0
        mov             v6.16b,  v16.16b
1:      subs            w3,  w3,  #2            // flags are consumed by the b.ne at the bottom
        ld1             {v20.16b}, [x0], x2
        \macd           v4.8h,   v0.8b,  v20.8b
        \macd\()2       v6.8h,   v0.16b, v20.16b
        ld1             {v22.16b}, [x1], x2
        \macs           v4.8h,   v1.8b,  v22.8b
        \macs\()2       v6.8h,   v1.16b, v22.16b
        mov             v24.16b, v16.16b        // accumulators, row 1
        ld1             {v28.16b}, [x0], x2
        mov             v26.16b, v16.16b
        \macd           v24.8h,  v0.8b,  v28.8b
        \macd\()2       v26.8h,  v0.16b, v28.16b
        ld1             {v30.16b}, [x1], x2
        \macs           v24.8h,  v1.8b,  v30.8b
        \macs\()2       v26.8h,  v1.16b, v30.16b
        sshl            v4.8h,   v4.8h,  v18.8h
        sshl            v6.8h,   v6.8h,  v18.8h
        sqxtun          v4.8b,   v4.8h          // saturate to 0..255
        sqxtun2         v4.16b,  v6.8h
        sshl            v24.8h,  v24.8h, v18.8h
        sshl            v26.8h,  v26.8h, v18.8h
        sqxtun          v24.8b,  v24.8h
        sqxtun2         v24.16b, v26.8h
        mov             v6.16b,  v16.16b        // re-arm row 0 accumulators for the next pass
        st1             {v4.16b},  [x7], x2
        mov             v4.16b,  v16.16b
        st1             {v24.16b}, [x7], x2
        b.ne            1b
        ret
.endm
296
// Bi-weighted blend of two 8-byte-wide blocks, two rows per iteration.
//
// \macd and \macs are widening multiply-accumulate mnemonics picked by
// biweight_func: \macd is applied to rows read through x0 (weight byte from
// w5 in v0), \macs to rows read through x1 (weight byte from w6 in v1).
//
// In:   x0, x1 = sources, x7 = destination, x2 = stride,
//       w3 = rows left (the loop only stops when it reaches exactly 0),
//       v16 = initial accumulator value, v18 = shift counts for sshl
// Ends with ret.
.macro  biweight_8      macs, macd
        dup             v0.8b,   w5
        dup             v1.8b,   w6
        mov             v2.16b,  v16.16b        // accumulator, row 0
        mov             v20.16b, v16.16b        // accumulator, row 1
1:      subs            w3,  w3,  #2            // flags are consumed by the b.ne at the bottom
        ld1             {v4.8b}, [x0], x2
        \macd           v2.8h,   v0.8b,  v4.8b
        ld1             {v5.8b}, [x1], x2
        \macs           v2.8h,   v1.8b,  v5.8b
        ld1             {v6.8b}, [x0], x2
        \macd           v20.8h,  v0.8b,  v6.8b
        ld1             {v7.8b}, [x1], x2
        \macs           v20.8h,  v1.8b,  v7.8b
        sshl            v2.8h,   v2.8h,  v18.8h
        sqxtun          v2.8b,   v2.8h          // saturate to 0..255
        sshl            v20.8h,  v20.8h, v18.8h
        sqxtun          v4.8b,   v20.8h
        mov             v20.16b, v16.16b        // re-arm the accumulators for the next pass
        st1             {v2.8b}, [x7], x2
        mov             v2.16b,  v16.16b
        st1             {v4.8b}, [x7], x2
        b.ne            1b
        ret
.endm
322
// Bi-weighted blend of two 4-byte-wide blocks.  Two rows are packed into one
// 8-byte vector; a full pass handles four rows, and the tail at label 2
// finishes a block whose remaining row count is two.
//
// \macd and \macs are widening multiply-accumulate mnemonics picked by
// biweight_func: \macd is applied to rows read through x0 (weight byte from
// w5 in v0), \macs to rows read through x1 (weight byte from w6 in v1).
//
// In:   x0, x1 = sources, x7 = destination, x2 = stride, w3 = rows left,
//       v16 = initial accumulator value, v18 = shift counts for sshl
// Ends with ret on both paths.
.macro  biweight_4      macs, macd
        dup             v0.8b,   w5
        dup             v1.8b,   w6
        mov             v2.16b,  v16.16b        // accumulator, rows 0-1
        mov             v20.16b, v16.16b        // accumulator, rows 2-3
1:      subs            w3,  w3,  #4            // flags feed both the b.lt and the b.ne below
        ld1             {v4.s}[0], [x0], x2
        ld1             {v4.s}[1], [x0], x2
        \macd           v2.8h,   v0.8b,  v4.8b
        ld1             {v5.s}[0], [x1], x2
        ld1             {v5.s}[1], [x1], x2
        \macs           v2.8h,   v1.8b,  v5.8b
        b.lt            2f                      // fewer than 4 rows were left
        ld1             {v6.s}[0], [x0], x2
        ld1             {v6.s}[1], [x0], x2
        \macd           v20.8h,  v0.8b,  v6.8b
        ld1             {v7.s}[0], [x1], x2
        ld1             {v7.s}[1], [x1], x2
        \macs           v20.8h,  v1.8b,  v7.8b
        sshl            v2.8h,   v2.8h,  v18.8h
        sqxtun          v2.8b,   v2.8h          // saturate to 0..255
        sshl            v20.8h,  v20.8h, v18.8h
        sqxtun          v4.8b,   v20.8h
        mov             v20.16b, v16.16b        // re-arm the accumulators for the next pass
        st1             {v2.s}[0], [x7], x2
        st1             {v2.s}[1], [x7], x2
        mov             v2.16b,  v16.16b
        st1             {v4.s}[0], [x7], x2
        st1             {v4.s}[1], [x7], x2
        b.ne            1b
        ret
2:      sshl            v2.8h,   v2.8h,  v18.8h // tail: only rows 0-1 were accumulated
        sqxtun          v2.8b,   v2.8h
        st1             {v2.s}[0], [x7], x2
        st1             {v2.s}[1], [x7], x2
        ret
.endm
360
// Emits ff_biweight_h264_pixels_<w>_neon for block width \w.
//
// Register use on entry, as read by the code below:
//   x0 = first source and destination, x1 = second source, w2 = stride,
//   w3 = row count, w4 = shift base, w5 / w6 = signed weights for the
//   x0 / x1 source, w7 = value the accumulators start from (see below).
//
// Set-up shared by all variants:
//   v18 = ~w4 in every halfword, i.e. -(w4 + 1): sshl then shifts right
//         by w4 + 1
//   v16 = ((w7 + 1) | 1) << w4 in every halfword
//   x7  = x0 (write pointer)
//
// The multiplies are unsigned, so the signs of the weights are folded into
// the choice of umlal / umlsl.  w8 becomes 0, 1, 2 or 3 for (w5, w6) =
// (+,+), (-,+), (-,-), (+,-); this relies on a negative w6 having bits 31
// and 30 both set, i.e. on weights of small magnitude.  Every biweight_\w
// expansion ends in ret, so the variants never fall into each other.
.macro  biweight_func   w
function ff_biweight_h264_pixels_\w\()_neon, export=1
        sxtw            x2,  w2                 // widen the 32-bit stride for addressing
        lsr             w8,  w5,  #31           // 1 if w5 < 0
        add             w7,  w7,  #1
        eor             w8,  w8,  w6,  lsr #30  // ^ 3 if w6 < 0
        orr             w7,  w7,  #1
        dup             v18.8h,  w4
        lsl             w7,  w7,  w4
        not             v18.16b, v18.16b
        dup             v16.8h,  w7
        mov             x7,  x0
        cbz             w8,  10f
        subs            w8,  w8,  #1
        b.eq            20f
        subs            w8,  w8,  #1
        b.eq            30f
        b               40f
10:     biweight_\w     umlal, umlal            // w5 >= 0, w6 >= 0
20:     neg             w5,  w5                 // w5 <  0, w6 >= 0
        biweight_\w     umlal, umlsl
30:     neg             w5,  w5                 // w5 <  0, w6 <  0
        neg             w6,  w6
        biweight_\w     umlsl, umlsl
40:     neg             w6,  w6                 // w5 >= 0, w6 <  0
        biweight_\w     umlsl, umlal
endfunc
.endm

        biweight_func   16
        biweight_func   8
        biweight_func   4
393
// Weighted prediction on a 16-byte-wide block, two rows per iteration.
//
// \add is the mnemonic (add / sub / shadd / shsub, picked by weight_func)
// that combines v16 with the product weight * pixel; the result then goes
// through srshl by v18 and is saturated to bytes.
//
// In:   x0 = source, x5 = destination, x1 = stride,
//       w2 = rows left (the loop only stops when it reaches exactly 0),
//       w4 = weight (its low byte is used), v16, v18 as described above
// Ends with ret.
.macro  weight_16       add
        dup             v0.16b,  w4
1:      subs            w2,  w2,  #2            // flags are consumed by the b.ne at the bottom
        ld1             {v20.16b}, [x0], x1
        umull           v4.8h,   v0.8b,  v20.8b
        umull2          v6.8h,   v0.16b, v20.16b
        ld1             {v28.16b}, [x0], x1
        umull           v24.8h,  v0.8b,  v28.8b
        umull2          v26.8h,  v0.16b, v28.16b
        \add            v4.8h,   v16.8h, v4.8h
        srshl           v4.8h,   v4.8h,  v18.8h
        \add            v6.8h,   v16.8h, v6.8h
        srshl           v6.8h,   v6.8h,  v18.8h
        sqxtun          v4.8b,   v4.8h          // saturate to 0..255
        sqxtun2         v4.16b,  v6.8h
        \add            v24.8h,  v16.8h, v24.8h
        srshl           v24.8h,  v24.8h, v18.8h
        \add            v26.8h,  v16.8h, v26.8h
        srshl           v26.8h,  v26.8h, v18.8h
        sqxtun          v24.8b,  v24.8h
        sqxtun2         v24.16b, v26.8h
        st1             {v4.16b},  [x5], x1
        st1             {v24.16b}, [x5], x1
        b.ne            1b
        ret
.endm
420
// Weighted prediction on an 8-byte-wide block, two rows per iteration.
//
// \add is the mnemonic (add / sub / shadd / shsub, picked by weight_func)
// that combines v16 with the product weight * pixel; the result then goes
// through srshl by v18 and is saturated to bytes.
//
// In:   x0 = source, x5 = destination, x1 = stride,
//       w2 = rows left (the loop only stops when it reaches exactly 0),
//       w4 = weight (its low byte is used), v16, v18 as described above
// Ends with ret.
.macro  weight_8        add
        dup             v0.8b,   w4
1:      subs            w2,  w2,  #2            // flags are consumed by the b.ne at the bottom
        ld1             {v4.8b}, [x0], x1
        umull           v2.8h,   v0.8b,  v4.8b
        ld1             {v6.8b}, [x0], x1
        umull           v20.8h,  v0.8b,  v6.8b
        \add            v2.8h,   v16.8h, v2.8h
        srshl           v2.8h,   v2.8h,  v18.8h
        sqxtun          v2.8b,   v2.8h          // saturate to 0..255
        \add            v20.8h,  v16.8h, v20.8h
        srshl           v20.8h,  v20.8h, v18.8h
        sqxtun          v4.8b,   v20.8h
        st1             {v2.8b}, [x5], x1
        st1             {v4.8b}, [x5], x1
        b.ne            1b
        ret
.endm
439
// Weighted prediction on a 4-byte-wide block.  Two rows are packed into one
// 8-byte vector; a full pass handles four rows, and the tail at label 2
// finishes a block whose remaining row count is two.
//
// \add is the mnemonic (add / sub / shadd / shsub, picked by weight_func)
// that combines v16 with the product weight * pixel; the result then goes
// through srshl by v18 and is saturated to bytes.
//
// In:   x0 = source, x5 = destination, x1 = stride, w2 = rows left,
//       w4 = weight (its low byte is used), v16, v18 as described above
// Ends with ret on both paths.
.macro  weight_4        add
        dup             v0.8b,   w4
1:      subs            w2,  w2,  #4            // flags feed both the b.lt and the b.ne below
        ld1             {v4.s}[0], [x0], x1
        ld1             {v4.s}[1], [x0], x1
        umull           v2.8h,   v0.8b,  v4.8b
        b.lt            2f                      // fewer than 4 rows were left
        ld1             {v6.s}[0], [x0], x1
        ld1             {v6.s}[1], [x0], x1
        umull           v20.8h,  v0.8b,  v6.8b
        \add            v2.8h,   v16.8h, v2.8h
        srshl           v2.8h,   v2.8h,  v18.8h
        sqxtun          v2.8b,   v2.8h          // saturate to 0..255
        \add            v20.8h,  v16.8h, v20.8h
        srshl           v20.8h,  v20.8h, v18.8h
        sqxtun          v4.8b,   v20.8h
        st1             {v2.s}[0], [x5], x1
        st1             {v2.s}[1], [x5], x1
        st1             {v4.s}[0], [x5], x1
        st1             {v4.s}[1], [x5], x1
        b.ne            1b
        ret
2:      \add            v2.8h,   v16.8h, v2.8h  // tail: only rows 0-1 were loaded
        srshl           v2.8h,   v2.8h,  v18.8h
        sqxtun          v2.8b,   v2.8h
        st1             {v2.s}[0], [x5], x1
        st1             {v2.s}[1], [x5], x1
        ret
.endm
469
// Emits ff_weight_h264_pixels_<w>_neon for block width \w.
//
// Register use on entry, as read by the code below:
//   x0 = block (read and written), w1 = stride, w2 = row count,
//   w3 = shift amount, w4 = signed weight, w5 = offset
//
// Set-up shared by all variants:
//   v16 = w5 << w3 in every halfword
//   x5  = x0 (write pointer)
//
// w3 > 1:  the halving forms shadd / shsub take care of one bit of the
//          shift and v18 = 1 - w3 makes srshl do the remaining w3 - 1 bits.
// w3 <= 1: plain add / sub, v18 = -w3.
// A negative weight is negated and the subtracting form is used instead,
// because the products are formed with an unsigned multiply.
//
// Label 10 is defined twice on purpose: each "b.lt 10f" binds to the nearest
// following one.  Every weight_\w expansion ends in ret, so the variants
// never fall into each other.
.macro  weight_func     w
function ff_weight_h264_pixels_\w\()_neon, export=1
        sxtw            x1,  w1                 // widen the 32-bit stride for addressing
        cmp             w3,  #1                 // flags survive until the b.le below
        mov             w6,  #1
        lsl             w5,  w5,  w3
        dup             v16.8h,  w5
        mov             x5,  x0
        b.le            20f
        sub             w6,  w6,  w3            // 1 - w3
        dup             v18.8h,  w6
        cmp             w4,  #0
        b.lt            10f
        weight_\w       shadd
10:     neg             w4,  w4
        weight_\w       shsub
20:     neg             w6,  w3                 // -w3
        dup             v18.8h,  w6
        cmp             w4,  #0
        b.lt            10f
        weight_\w       add
10:     neg             w4,  w4
        weight_\w       sub
endfunc
.endm

        weight_func     16
        weight_func     8
        weight_func     4
499