;*****************************************************************************
;* SIMD-optimized MPEG encoding functions
;*****************************************************************************
;* Copyright (c) 2000, 2001 Fabrice Bellard
;* Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;*****************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

cextern pw_1

SECTION .text
; int ff_pix_sum16(uint8_t *pix, int line_size)
; %1 = number of xmm registers used
; %2 = number of loops
; %3 = number of GPRs used
; %4 = number of rows stepped per iteration (unused by the MMX version)
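;
; For reference, a scalar C sketch of what every PIX_SUM16 variant below
; computes (an illustrative equivalent with a hypothetical name, not
; FFmpeg's actual C fallback): the plain sum of a 16x16 block of 8-bit
; pixels.
;
;     int pix_sum16_ref(const uint8_t *pix, int line_size)
;     {
;         int sum = 0;
;         for (int i = 0; i < 16; i++) {
;             for (int j = 0; j < 16; j++)
;                 sum += pix[j];      // bytes widen to int; the maximum
;             pix += line_size;       // total is 16*16*255 = 65280
;         }
;         return sum;
;     }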
%macro PIX_SUM16 4
cglobal pix_sum16, 2, %3, %1
    movsxdifnidn r1, r1d              ; sign-extend line_size if needed
    mov          r2, %2
%if cpuflag(xop)
    lea          r3, [r1*3]           ; r3 = 3 * line_size, for the 4th row
%else
    pxor         m5, m5
%endif
    pxor         m4, m4               ; m4 = running sum
.loop:
%if cpuflag(xop)
    ; vphaddubq horizontally adds unsigned bytes straight into qwords,
    ; so no separate zero-extension pass is needed
    vphaddubq    m0, [r0]
    vphaddubq    m1, [r0+r1]
    vphaddubq    m2, [r0+r1*2]
    vphaddubq    m3, [r0+r3]
%else
    mova         m0, [r0]
%if mmsize == 8
    mova         m1, [r0+8]           ; second half of the same row
%else
    mova         m1, [r0+r1]          ; next row
%endif
    ; zero-extend bytes to words so they can be summed without overflow
    punpckhbw    m2, m0, m5
    punpcklbw    m0, m5
    punpckhbw    m3, m1, m5
    punpcklbw    m1, m5
%endif ; cpuflag(xop)
    paddw        m1, m0
    paddw        m3, m2
    paddw        m3, m1
    paddw        m4, m3
%if mmsize == 8
    add          r0, r1
%else
    lea          r0, [r0+r1*%4]
%endif
    dec r2
    jne .loop
%if cpuflag(xop)
    pshufd       m0, m4, q0032        ; bring the high qword sum down ...
    paddd        m4, m0               ; ... and fold it into the low one
%else
    HADDW        m4, m5
%endif
    movd        eax, m4
    RET
%endmacro

INIT_MMX mmx
PIX_SUM16 0, 16, 3, 0   ; 16 iterations x 1 row
INIT_XMM sse2
PIX_SUM16 6, 8,  3, 2   ; 8 iterations x 2 rows
%if HAVE_XOP_EXTERNAL
INIT_XMM xop
PIX_SUM16 5, 4,  4, 4   ; 4 iterations x 4 rows
%endif

; int ff_pix_norm1(uint8_t *pix, int line_size)
; %1 = number of xmm registers used
; %2 = number of loops
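;
; Likewise, a scalar C sketch of what PIX_NORM1 computes (illustrative
; only, with a hypothetical name; not FFmpeg's actual C fallback): the
; sum of squares of a 16x16 block of 8-bit pixels.
;
;     int pix_norm1_ref(const uint8_t *pix, int line_size)
;     {
;         int sum = 0;
;         for (int i = 0; i < 16; i++) {
;             for (int j = 0; j < 16; j++)
;                 sum += pix[j] * pix[j]; // max 16*16*255^2 = 16646400,
;             pix += line_size;           // which fits in a 32-bit int
;         }
;         return sum;
;     }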
%macro PIX_NORM1 2
cglobal pix_norm1, 2, 3, %1
    movsxdifnidn r1, r1d              ; sign-extend line_size if needed
    mov          r2, %2
    pxor         m0, m0               ; zero register for unpacking
    pxor         m5, m5               ; m5 = running sum of squares
.loop:
    mova         m2, [r0+0]
%if mmsize == 8
    mova         m3, [r0+8]           ; second half of the same row
%else
    mova         m3, [r0+r1]          ; next row
%endif
    ; zero-extend bytes to words; pmaddwd then squares each word and
    ; adds adjacent pairs into dword lanes
    punpckhbw    m1, m2, m0
    punpcklbw    m2, m0
    punpckhbw    m4, m3, m0
    punpcklbw    m3, m0
    pmaddwd      m1, m1
    pmaddwd      m2, m2
    pmaddwd      m3, m3
    pmaddwd      m4, m4
    paddd        m2, m1
    paddd        m4, m3
    paddd        m5, m2
    paddd        m5, m4
%if mmsize == 8
    add          r0, r1
%else
    lea          r0, [r0+r1*2]
%endif
    dec r2
    jne .loop
    HADDD        m5, m1
    movd        eax, m5
    RET
%endmacro

INIT_MMX mmx
PIX_NORM1 0, 16         ; 16 iterations x 1 row
INIT_XMM sse2
PIX_NORM1 6, 8          ; 8 iterations x 2 rows