1;*****************************************************************************
2;* SIMD-optimized pixel operations
3;*****************************************************************************
4;* Copyright (c) 2000, 2001 Fabrice Bellard
5;* Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
6;*
7;* This file is part of FFmpeg.
8;*
9;* FFmpeg is free software; you can redistribute it and/or
10;* modify it under the terms of the GNU Lesser General Public
11;* License as published by the Free Software Foundation; either
12;* version 2.1 of the License, or (at your option) any later version.
13;*
14;* FFmpeg is distributed in the hope that it will be useful,
15;* but WITHOUT ANY WARRANTY; without even the implied warranty of
16;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17;* Lesser General Public License for more details.
18;*
19;* You should have received a copy of the GNU Lesser General Public
20;* License along with FFmpeg; if not, write to the Free Software
21;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22;*****************************************************************************
23
24%include "libavutil/x86/x86util.asm"
25
26SECTION .text
27
INIT_MMX mmx
;-----------------------------------------------------------------------------
; void ff_get_pixels_mmx(int16_t *block, const uint8_t *pixels, int line_size)
;
; Zero-extends an 8x8 block of bytes into 64 consecutive 16-bit words.
;   block  : destination, 128 bytes are written
;   pixels : source, 8 rows of 8 bytes, line_size bytes from row to row
;   stride : line_size
; Two source rows (32 output bytes) are converted per loop pass.
;-----------------------------------------------------------------------------
cglobal get_pixels, 3, 4, 0, block, pixels, stride, ofs
    movsxdifnidn strideq, strided
    pxor         m7, m7                      ; m7 = 0: supplies the high byte of every word
    add          blockq, 128                 ; blockq -> one past the end of the output
    mov          ofsq, -128                  ; negative byte offset, counts up towards 0
.loop:
    mova         m0, [pixelsq]               ; row n,   bytes 0..7
    mova         m1, m0
    mova         m2, [pixelsq+strideq]       ; row n+1, bytes 0..7
    mova         m3, m2
    punpcklbw    m0, m7                      ; row n,   pixels 0..3 as words
    punpckhbw    m1, m7                      ; row n,   pixels 4..7 as words
    punpcklbw    m2, m7                      ; row n+1, pixels 0..3 as words
    punpckhbw    m3, m7                      ; row n+1, pixels 4..7 as words
    mova         [blockq+ofsq+ 0], m0
    mova         [blockq+ofsq+ 8], m1
    mova         [blockq+ofsq+16], m2
    mova         [blockq+ofsq+24], m3
    lea          pixelsq, [pixelsq+strideq*2] ; advance two rows (lea leaves flags alone)
    add          ofsq, 32
    js           .loop                       ; keep going while the offset is still negative
    REP_RET
52
INIT_XMM sse2
;-----------------------------------------------------------------------------
; void ff_get_pixels_sse2(int16_t *block, const uint8_t *pixels, int line_size)
;
; Zero-extends an 8x8 block of bytes into 64 consecutive 16-bit words,
; fully unrolled: four rows are loaded, widened and stored, then four more.
;   block  : destination, 128 bytes are written; mova is an aligned store
;            under INIT_XMM, so block is expected to be 16-byte aligned
;   pixels : source, 8 rows of 8 bytes, line_size bytes from row to row
;   stride : line_size
;-----------------------------------------------------------------------------
cglobal get_pixels, 3, 4, 5, block, pixels, stride, stride3
    movsxdifnidn strideq, strided
    pxor         m4, m4                      ; m4 = 0: supplies the high byte of every word
    lea          stride3q, [strideq*3]       ; offset of the fourth row of each group

    ; ---- rows 0..3 ----
    movh         m0, [pixelsq]
    movh         m1, [pixelsq+strideq]
    movh         m2, [pixelsq+strideq*2]
    movh         m3, [pixelsq+stride3q]
    lea          pixelsq, [pixelsq+strideq*4] ; pixelsq -> row 4
    punpcklbw    m0, m4
    punpcklbw    m1, m4
    punpcklbw    m2, m4
    punpcklbw    m3, m4
    mova         [blockq+0x00], m0
    mova         [blockq+0x10], m1
    mova         [blockq+0x20], m2
    mova         [blockq+0x30], m3

    ; ---- rows 4..7 ----
    movh         m0, [pixelsq]
    movh         m1, [pixelsq+strideq]
    movh         m2, [pixelsq+strideq*2]
    movh         m3, [pixelsq+stride3q]
    punpcklbw    m0, m4
    punpcklbw    m1, m4
    punpcklbw    m2, m4
    punpcklbw    m3, m4
    mova         [blockq+0x40], m0
    mova         [blockq+0x50], m1
    mova         [blockq+0x60], m2
    mova         [blockq+0x70], m3
    RET
84
INIT_MMX mmx
;-----------------------------------------------------------------------------
; void ff_diff_pixels_mmx(int16_t *block, const uint8_t *s1, const uint8_t *s2,
;                         int stride);
;
; Writes the 16-bit differences s1[x] - s2[x] of two 8x8 byte blocks.
;   block      : destination, 128 bytes are written
;   src1, src2 : the s1 / s2 sources, both stepped by stride bytes per row
;   stride     : row pitch shared by both sources
; One row (16 output bytes) is produced per loop pass, 8 passes in total.
;-----------------------------------------------------------------------------
cglobal diff_pixels, 4, 5, 0, block, src1, src2, stride, ofs
    movsxdifnidn strideq, strided
    add          blockq, 128                 ; blockq -> one past the end of the output
    mov          ofsq, -128                  ; negative byte offset, counts up to 0
    pxor         m7, m7                      ; m7 = 0: supplies the high byte of every word
.loop:
    mova         m0, [src1q]                 ; s1 row, bytes 0..7
    mova         m1, m0
    mova         m2, [src2q]                 ; s2 row, bytes 0..7
    mova         m3, m2
    punpcklbw    m0, m7                      ; s1 pixels 0..3 as words
    punpcklbw    m2, m7                      ; s2 pixels 0..3 as words
    punpckhbw    m1, m7                      ; s1 pixels 4..7 as words
    punpckhbw    m3, m7                      ; s2 pixels 4..7 as words
    psubw        m0, m2                      ; s1 - s2, pixels 0..3
    psubw        m1, m3                      ; s1 - s2, pixels 4..7
    mova         [blockq+ofsq+0], m0
    mova         [blockq+ofsq+8], m1
    add          src1q, strideq
    add          src2q, strideq
    add          ofsq, 16                    ; must stay last: its flags drive the branch
    jne          .loop
    REP_RET
111
INIT_XMM sse2
;-----------------------------------------------------------------------------
; void ff_diff_pixels_sse2(int16_t *block, const uint8_t *s1, const uint8_t *s2,
;                          int stride);
;
; Writes the 16-bit differences s1[x] - s2[x] of two 8x8 byte blocks.
;   block      : destination, 128 bytes are written; mova is an aligned store
;                under INIT_XMM, so block is expected to be 16-byte aligned
;   src1, src2 : the s1 / s2 sources, both stepped by stride bytes per row
;   stride     : row pitch shared by both sources
; Two rows (32 output bytes) are produced per loop pass, 4 passes in total.
;-----------------------------------------------------------------------------
cglobal diff_pixels, 4, 5, 5, block, src1, src2, stride, ofs
    movsxdifnidn strideq, strided
    add          blockq, 128                 ; blockq -> one past the end of the output
    mov          ofsq, -128                  ; negative byte offset, counts up to 0
    pxor         m4, m4                      ; m4 = 0: supplies the high byte of every word
.loop:
    movh         m0, [src1q]                 ; s1, row n
    movh         m1, [src1q+strideq]         ; s1, row n+1
    movh         m2, [src2q]                 ; s2, row n
    movh         m3, [src2q+strideq]         ; s2, row n+1
    punpcklbw    m0, m4
    punpcklbw    m2, m4
    punpcklbw    m1, m4
    punpcklbw    m3, m4
    psubw        m0, m2                      ; row n:   s1 - s2
    psubw        m1, m3                      ; row n+1: s1 - s2
    mova         [blockq+ofsq+ 0], m0
    mova         [blockq+ofsq+16], m1
    lea          src1q, [src1q+strideq*2]    ; advance both sources by two rows
    lea          src2q, [src2q+strideq*2]
    add          ofsq, 32                    ; must stay last: its flags drive the branch
    jne          .loop
    RET
136