1;*****************************************************************************
2;* SSE2-optimized H.264 iDCT
3;*****************************************************************************
4;* Copyright (C) 2003-2008 x264 project
5;*
6;* Authors: Laurent Aimar <fenrir@via.ecp.fr>
7;*          Loren Merritt <lorenm@u.washington.edu>
8;*          Holger Lubitz <hal@duncan.ol.sub.de>
9;*          Min Chen <chenm001.163.com>
10;*
11;* This program is free software; you can redistribute it and/or modify
12;* it under the terms of the GNU General Public License as published by
13;* the Free Software Foundation; either version 2 of the License, or
14;* (at your option) any later version.
15;*
16;* This program is distributed in the hope that it will be useful,
17;* but WITHOUT ANY WARRANTY; without even the implied warranty of
18;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
19;* GNU General Public License for more details.
20;*
21;* You should have received a copy of the GNU General Public License
22;* along with this program; if not, write to the Free Software
23;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
24;*****************************************************************************
25
26%include "x86inc.asm"
27%include "x86util.asm"
28
SECTION_RODATA

; Eight 16-bit words, each holding 32 (16 bytes in total, one full XMM
; register's worth). Used below as the memory operand of a paddw; with the
; legacy SSE encoding that operand has to be 16-byte aligned.
; NOTE(review): alignment is presumably provided by SECTION_RODATA -- confirm
; in x86inc.asm before adding any data ahead of this label.
pw_32:
    times 8 dw 32
31
32SECTION .text
33
34INIT_XMM
;-----------------------------------------------------------------------------
; x264_add8x4_idct_sse2
;
; "3,3,8" follows the x86inc cglobal convention: 3 arguments, 3 general
; purpose registers, 8 XMM registers (m0-m5 and m7 are touched here).
;
; Register roles as used by this routine:
;   r0 : address of the first output row; rows are accessed at
;        r0, r0+r2, r0+2*r2 and r0+3*r2
;   r1 : input, 64 bytes read as eight 8-byte groups
;   r2 : byte distance between consecutive output rows
;
; NOTE(review): presumably r1 holds two consecutive 4x4 blocks of 16-bit
; coefficients and r0 points at 8-bit pixels that the residual is added
; to -- confirm against the C prototype and the STORE_DIFF macro.
; NOTE(review): r2 is used at full pointer width in addressing; if the C
; prototype passes the stride as a 32-bit int, x86-64 callers do not
; guarantee the upper half -- verify whether sign extension is needed.
;-----------------------------------------------------------------------------
cglobal x264_add8x4_idct_sse2, 3,3,8
    ; Gather the input: register k takes bytes [8k, 8k+8) of r1 in its low
    ; half and bytes [32+8k, 32+8k+8) in its high half, so each XMM register
    ; carries one 8-byte group from each 32-byte half of the input.
    movq   m0, [r1]
    movhps m0, [r1+32]
    movq   m1, [r1+8]
    movhps m1, [r1+40]
    movq   m2, [r1+16]
    movhps m2, [r1+48]
    movq   m3, [r1+24]
    movhps m3, [r1+56]

    ; First 1-D transform pass, then transpose so the second pass runs
    ; along the other dimension (m4/m5 serve as scratch for the macros).
    IDCT4_1D 0,1,2,3,4,5
    TRANSPOSE2x4x4W 0,1,2,3,4

    ; Add 32 to every word of m0 ahead of the second pass.
    ; NOTE(review): looks like the rounding term for a >>6 performed
    ; inside STORE_DIFF -- confirm in x86util.asm.
    paddw m0, [pw_32 GLOBAL]
    IDCT4_1D 0,1,2,3,4,5

    ; m7 = 0, passed to STORE_DIFF as its third operand.
    pxor  m7, m7

    ; Output rows 0 and 1.
    STORE_DIFF m0, m4, m7, [r0]
    STORE_DIFF m1, m4, m7, [r0+r2]

    ; Step two rows down, then output rows 2 and 3.
    lea   r0, [r0+r2*2]
    STORE_DIFF m2, m4, m7, [r0]
    STORE_DIFF m3, m4, m7, [r0+r2]
    RET
55