;******************************************************************************
;* MMX/SSE2-optimized functions for the RV30 and RV40 decoders
;* Copyright (C) 2012 Christophe Gisquet <christophe.gisquet@gmail.com>
;*
;* This file is part of Libav.
;*
;* Libav is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* Libav is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with Libav; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "x86inc.asm"
%include "x86util.asm"

SECTION .text

; IDCT_DC_NOROUND reg
; In place: reg = (reg * 13*13*3) >> 11   (arithmetic shift, no rounding term)
%macro IDCT_DC_NOROUND 1
    imul   %1, 13*13*3
    sar    %1, 11
%endmacro

; IDCT_DC_ROUND reg
; In place: reg = (reg * 13*13 + 0x200) >> 10   (0x200 is half of 1<<10,
; i.e. round-to-nearest before the arithmetic shift)
%macro IDCT_DC_ROUND 1
    imul   %1, 13*13
    add    %1, 0x200
    sar    %1, 10
%endmacro

; rv34_idct <suffix>
; Emits rv34_idct_<suffix>_mmx2, taking one pointer argument (r0).
; The function reads the signed 16-bit value at [r0], scales it with whatever
; IDCT_DC is %define'd to at expansion time, and overwrites the 32 bytes at
; [r0] (16 words) with the scaled value replicated into every word.
; Clobbers: r1, m0.
%macro rv34_idct 1
cglobal rv34_idct_%1_mmx2, 1, 2, 0
    movsx   r1, word [r0]           ; r1 = sign-extended DC coefficient
    IDCT_DC r1
    ; Use the 32-bit alias: movd moves a dword, and r1 is a 64-bit register
    ; on x86-64 (which would turn this into a REX.W movq or fail to assemble).
    movd    m0, r1d
    pshufw  m0, m0, 0               ; broadcast word 0 to all four words
    movq    [r0+ 0], m0
    movq    [r0+ 8], m0
    movq    [r0+16], m0
    movq    [r0+24], m0
    REP_RET
%endmacro

INIT_MMX
%define IDCT_DC IDCT_DC_ROUND
rv34_idct dc
%define IDCT_DC IDCT_DC_NOROUND
rv34_idct dc_noround

; ff_rv34_idct_dc_add_mmx(uint8_t *dst, int stride, int dc);
;
; Scales dc with IDCT_DC_ROUND and adds it, with unsigned saturation, to four
; rows of pixels at dst, dst+stride, dst+2*stride and dst+3*stride.
; In:    r0 = dst, r1 = stride, r2 = dc
; Clobb: r2, m0-m5
; NOTE(review): the number of bytes touched per row is whatever movh moves
; under INIT_MMX (presumably 4) -- confirm against x86inc/x86util.
cglobal rv34_idct_dc_add_mmx, 3, 3
    ; stride is an int but is used as a full-width address index below; the
    ; upper half of the register is unspecified on x86-64, so sign-extend it.
    ; (Expands to nothing when r1 and r1d are the same register.)
    movsxdifnidn r1, r1d

    ; calculate DC
    IDCT_DC_ROUND r2
    pxor       m1, m1
    movd       m0, r2d              ; dword move: use the 32-bit alias of r2
    psubw      m1, m0               ; m1 = -dc (per word)
    packuswb   m0, m0               ; unsigned-saturate: low byte = max(dc, 0)
    packuswb   m1, m1               ; unsigned-saturate: low byte = max(-dc, 0)
    punpcklbw  m0, m0               ; replicate the low byte ...
    punpcklbw  m1, m1
    punpcklwd  m0, m0               ; ... across the low four bytes
    punpcklwd  m1, m1

    ; add DC: saturating add of the positive part, then saturating subtract
    ; of the negative part (one of the two is always zero)
    lea        r2, [r0+r1*2]        ; r2 = address of row 2
    movh       m2, [r0]
    movh       m3, [r0+r1]
    movh       m4, [r2]
    movh       m5, [r2+r1]
    paddusb    m2, m0
    paddusb    m3, m0
    paddusb    m4, m0
    paddusb    m5, m0
    psubusb    m2, m1
    psubusb    m3, m1
    psubusb    m4, m1
    psubusb    m5, m1
    movh       [r0], m2
    movh       [r0+r1], m3
    movh       [r2], m4
    movh       [r2+r1], m5
    RET

; ff_rv34_idct_dc_add_sse4(uint8_t *dst, int stride, int dc);
;
; Same operation as the MMX version on a 4x4 block of bytes: the four rows
; are gathered into one register, widened to words, offset by the scaled dc,
; then packed back to bytes with unsigned saturation.
; In:    r0 = dst, r1 = stride, r2 = dc
; Clobb: r2, m0-m5
INIT_XMM
cglobal rv34_idct_dc_add_sse4, 3, 3, 6
    ; stride is an int used as a full-width address index: sign-extend it
    ; (no-op when r1 and r1d are the same register).
    movsxdifnidn r1, r1d

    ; calculate DC
    IDCT_DC_ROUND r2
    pxor       m1, m1               ; zero, used to widen bytes to words

    ; load data (interleaved with the DC broadcast)
    movd       m0, r2d              ; dword move: use the 32-bit alias of r2
    lea        r2, [r0+r1*2]        ; r2 = address of row 2
    movd       m2, [r0]
    movd       m3, [r0+r1]
    pshuflw    m0, m0, 0            ; broadcast dc over the low four words
    movd       m4, [r2]
    movd       m5, [r2+r1]
    punpcklqdq m0, m0               ; ... and over all eight words
    punpckldq  m2, m3               ; m2 = rows 0 and 1 (8 bytes)
    punpckldq  m4, m5               ; m4 = rows 2 and 3 (8 bytes)
    punpcklbw  m2, m1               ; zero-extend bytes to words
    punpcklbw  m4, m1

    ; add DC and store
    paddw      m2, m0
    paddw      m4, m0
    packuswb   m2, m4               ; saturate to bytes; m2 = rows 0..3
    movd       [r0], m2
    pextrd     [r0+r1], m2, 1
    pextrd     [r2], m2, 2
    pextrd     [r2+r1], m2, 3
    RET