;******************************************************************************
;* Copyright (c) 2008 Loren Merritt
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_TEXT

%macro SCALARPRODUCT 0
; int ff_scalarproduct_and_madd_int16(int16_t *v1, int16_t *v2, int16_t *v3,
;                                     int order, int mul)
cglobal scalarproduct_and_madd_int16, 4,4,8, v1, v2, v3, order, mul
    shl orderq, 1              ; order is in int16_t elements; convert to bytes
    movd    m7, mulm
%if mmsize == 16
    pshuflw m7, m7, 0
    punpcklqdq m7, m7          ; splat mul into all 8 words of m7
%else
    pshufw  m7, m7, 0          ; splat mul into all 4 words of m7
%endif
    pxor    m6, m6             ; m6 = dword accumulator for the dot product
    add v1q, orderq            ; point past the end of each vector and walk
    add v2q, orderq            ; back up with a negative offset, so the loop
    add v3q, orderq            ; counter doubles as the address offset
    neg orderq
.loop:
    movu    m0, [v2q + orderq]
    movu    m1, [v2q + orderq + mmsize]
    mova    m4, [v1q + orderq]
    mova    m5, [v1q + orderq + mmsize]
    movu    m2, [v3q + orderq]
    movu    m3, [v3q + orderq + mmsize]
    pmaddwd m0, m4             ; v1[i]*v2[i] + v1[i+1]*v2[i+1] -> dwords
    pmaddwd m1, m5
    pmullw  m2, m7             ; mul*v3[i], low 16 bits
    pmullw  m3, m7
    paddd   m6, m0             ; accumulate the dot product
    paddd   m6, m1
    paddw   m2, m4             ; v1[i] += mul*v3[i]
    paddw   m3, m5
    mova    [v1q + orderq], m2
    mova    [v1q + orderq + mmsize], m3
    add     orderq, mmsize*2
    jl .loop
    HADDD   m6, m0             ; horizontally sum the dword accumulator
    movd   eax, m6
    RET
%endmacro
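
; For reference, a rough C equivalent of the kernel above (an illustrative
; sketch of the intended semantics, not part of the build; the madd wraps at
; 16 bits, matching pmullw/paddw):
;
;     int32_t sum = 0;
;     for (int i = 0; i < order; i++) {
;         sum   += v1[i] * v2[i];
;         v1[i] += mul * v3[i];
;     }
;     return sum;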

INIT_MMX mmxext
SCALARPRODUCT
INIT_XMM sse2
SCALARPRODUCT

%macro SCALARPRODUCT_LOOP 1
align 16
.loop%1:
    sub     orderq, mmsize*2
%if %1
    ; v2 and v3 were aligned down to 16 bytes, so rebuild each misaligned
    ; vector from two aligned loads with palignr; m4/m5 carry the low
    ; aligned block over from the previous (higher-addressed) iteration
    mova    m1, m4
    mova    m4, [v2q + orderq]
    mova    m0, [v2q + orderq + mmsize]
    palignr m1, m0, %1
    palignr m0, m4, %1
    mova    m3, m5
    mova    m5, [v3q + orderq]
    mova    m2, [v3q + orderq + mmsize]
    palignr m3, m2, %1
    palignr m2, m5, %1
%else
    mova    m0, [v2q + orderq]
    mova    m1, [v2q + orderq + mmsize]
    mova    m2, [v3q + orderq]
    mova    m3, [v3q + orderq + mmsize]
%endif
    %define t0  [v1q + orderq]
    %define t1  [v1q + orderq + mmsize]
%if ARCH_X86_64
    ; x86-64 has xmm8/xmm9 to spare, so load v1 once instead of using the
    ; memory operand twice
    mova    m8, t0
    mova    m9, t1
    %define t0  m8
    %define t1  m9
%endif
    pmaddwd m0, t0
    pmaddwd m1, t1
    pmullw  m2, m7
    pmullw  m3, m7
    paddw   m2, t0
    paddw   m3, t1
    paddd   m6, m0
    paddd   m6, m1
    mova    [v1q + orderq], m2
    mova    [v1q + orderq + mmsize], m3
    jg .loop%1
%if %1
    jmp .end                   ; the %1 == 0 case is last and falls into .end
%endif
%endmacro
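
; Realignment sketch (illustrative, for %1 == 2 and mmsize == 16): let B0 be
; the aligned 16-byte block at v2q+orderq+mmsize and B1 the block above it,
; carried in from the previous iteration. palignr concatenates dst:src and
; shifts right by %1 bytes, so
;     palignr m1, m0, 2       ; m1 = low 16 bytes of (B1:B0) >> 2
; yields exactly the 16 bytes starting 2 bytes past B0's base, i.e. the
; unaligned vector, without any unaligned loads.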

; int ff_scalarproduct_and_madd_int16(int16_t *v1, int16_t *v2, int16_t *v3,
;                                     int order, int mul)
INIT_XMM ssse3
cglobal scalarproduct_and_madd_int16, 4,5,10, v1, v2, v3, order, mul
    shl orderq, 1              ; order is in int16_t elements; convert to bytes
    movd    m7, mulm
    pshuflw m7, m7, 0
    punpcklqdq m7, m7          ; splat mul into all 8 words of m7
    pxor    m6, m6
    mov    r4d, v2d
    and    r4d, 15             ; r4d = misalignment of v2 (even: 0, 2, ..., 14)
    and    v2q, ~15            ; align v2 and v3 down; both must share the
    and    v3q, ~15            ; same misalignment, and v1 must be aligned
    mova    m4, [v2q + orderq] ; prime the realignment carry registers
    mova    m5, [v3q + orderq]
    ; a linear chain of compares is faster than a branch tree or a jump
    ; table, because the branches taken are cyclic (i.e. predictable)
    cmp    r4d, 0
    je .loop0
    cmp    r4d, 2
    je .loop2
    cmp    r4d, 4
    je .loop4
    cmp    r4d, 6
    je .loop6
    cmp    r4d, 8
    je .loop8
    cmp    r4d, 10
    je .loop10
    cmp    r4d, 12
    je .loop12
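    ; r4d == 14 is the only case left, so fall through into .loop14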
SCALARPRODUCT_LOOP 14
SCALARPRODUCT_LOOP 12
SCALARPRODUCT_LOOP 10
SCALARPRODUCT_LOOP 8
SCALARPRODUCT_LOOP 6
SCALARPRODUCT_LOOP 4
SCALARPRODUCT_LOOP 2
SCALARPRODUCT_LOOP 0
.end:
    HADDD   m6, m0             ; horizontally sum the dword accumulator
    movd   eax, m6
    RET