/*
 * Copyright (c) 2008 Loren Merritt
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

/**
 * SSSE3 optimized version of (put|avg)_h264_chroma_mc8.
 * H264_CHROMA_MC8_TMPL must be defined to the desired function name
 * H264_CHROMA_MC8_MV0 must be defined to a (put|avg)_pixels8 function
 * AVG_OP must be defined to empty for put and the identity for avg
 */

/**
 * 8-wide H.264 chroma MC with bilinear interpolation.
 *
 * x, y are the fractional chroma MV components, asserted to be in [0,8).
 * h rows are processed; the loops decrement h by 2 and write two rows per
 * iteration ("sub $2, %2" / "jg 1b"), so h is expected to be even.
 * rnd selects the rounding bias: the standard bias (4 resp. 32) or the
 * smaller "no-rnd" bias (3 resp. 28).
 *
 * The key trick throughout: two adjacent source bytes are interleaved with
 * punpcklbw, and pmaddubsw multiplies each byte pair by a packed pair of
 * byte coefficients and adds horizontally, computing c0*p0 + c1*p1 per
 * output word in one instruction.
 */
static void H264_CHROMA_MC8_TMPL(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y, int rnd)
{
    if(y==0 && x==0) {
        /* no filter needed */
        H264_CHROMA_MC8_MV0(dst, src, stride, h);
        return;
    }

    assert(x<8 && y<8 && x>=0 && y>=0);

    if(y==0 || x==0)
    {
        /* 1 dimensional filter only */
        /* With a = x+y (the nonzero one of x,y), 255*a+8 == (a<<8) | (8-a),
         * i.e. a byte pair {8-a, a} — both fit in a byte since 1 <= a <= 7.
         * pshuflw + movlhps broadcast that pair across xmm7, so pmaddubsw
         * yields (8-a)*p0 + a*p1 per pixel.  xmm6 gets the rounding bias
         * (4, or 3 for the no-rnd variant) broadcast to all words. */
        __asm__ volatile(
            "movd %0, %%xmm7 \n\t"
            "movq %1, %%xmm6 \n\t"
            "pshuflw $0, %%xmm7, %%xmm7 \n\t"
            "movlhps %%xmm6, %%xmm6 \n\t"
            "movlhps %%xmm7, %%xmm7 \n\t"
            :: "r"(255*(x+y)+8), "m"(*(rnd?&ff_pw_4:&ff_pw_3))
        );

        if(x) {
            /* Horizontal filter: blend src[i] with src[i+1] (offset 1 load),
             * two rows per iteration; result = (blend + bias) >> 3. */
            __asm__ volatile(
                "1: \n\t"
                "movq (%1), %%xmm0 \n\t"
                "movq 1(%1), %%xmm1 \n\t"
                "movq (%1,%3), %%xmm2 \n\t"
                "movq 1(%1,%3), %%xmm3 \n\t"
                "punpcklbw %%xmm1, %%xmm0 \n\t"
                "punpcklbw %%xmm3, %%xmm2 \n\t"
                "pmaddubsw %%xmm7, %%xmm0 \n\t"
                "pmaddubsw %%xmm7, %%xmm2 \n\t"
                AVG_OP("movq (%0), %%xmm4 \n\t")
                AVG_OP("movhps (%0,%3), %%xmm4 \n\t")
                "paddw %%xmm6, %%xmm0 \n\t"
                "paddw %%xmm6, %%xmm2 \n\t"
                "psrlw $3, %%xmm0 \n\t"
                "psrlw $3, %%xmm2 \n\t"
                "packuswb %%xmm2, %%xmm0 \n\t"
                /* avg variant: average with the existing dst pixels */
                AVG_OP("pavgb %%xmm4, %%xmm0 \n\t")
                "movq %%xmm0, (%0) \n\t"
                "movhps %%xmm0, (%0,%3) \n\t"
                "sub $2, %2 \n\t"
                "lea (%1,%3,2), %1 \n\t"
                "lea (%0,%3,2), %0 \n\t"
                "jg 1b \n\t"
                :"+r"(dst), "+r"(src), "+r"(h)
                :"r"((x86_reg)stride)
            );
        } else {
            /* Vertical filter: blend row i with row i+1 (stride-offset load
             * instead of +1 byte offset); otherwise same as the x case. */
            __asm__ volatile(
                "1: \n\t"
                "movq (%1), %%xmm0 \n\t"
                "movq (%1,%3), %%xmm1 \n\t"
                "movdqa %%xmm1, %%xmm2 \n\t"
                "movq (%1,%3,2), %%xmm3 \n\t"
                "punpcklbw %%xmm1, %%xmm0 \n\t"
                "punpcklbw %%xmm3, %%xmm2 \n\t"
                "pmaddubsw %%xmm7, %%xmm0 \n\t"
                "pmaddubsw %%xmm7, %%xmm2 \n\t"
                AVG_OP("movq (%0), %%xmm4 \n\t")
                AVG_OP("movhps (%0,%3), %%xmm4 \n\t")
                "paddw %%xmm6, %%xmm0 \n\t"
                "paddw %%xmm6, %%xmm2 \n\t"
                "psrlw $3, %%xmm0 \n\t"
                "psrlw $3, %%xmm2 \n\t"
                "packuswb %%xmm2, %%xmm0 \n\t"
                AVG_OP("pavgb %%xmm4, %%xmm0 \n\t")
                "movq %%xmm0, (%0) \n\t"
                "movhps %%xmm0, (%0,%3) \n\t"
                "sub $2, %2 \n\t"
                "lea (%1,%3,2), %1 \n\t"
                "lea (%0,%3,2), %0 \n\t"
                "jg 1b \n\t"
                :"+r"(dst), "+r"(src), "+r"(h)
                :"r"((x86_reg)stride)
            );
        }
        return;
    }

    /* general case, bilinear */
    /* (x*255+8) == (x<<8) | (8-x); multiplying by (8-y) resp. y gives the
     * packed byte pairs of bilinear weights (all products <= 49, so no
     * byte overflow):
     *   xmm7 = {(8-x)*(8-y), x*(8-y)}  -- top-row weights
     *   xmm6 = {(8-x)*y,     x*y}      -- bottom-row weights
     * xmm5 = rounding bias 32 (28 for the no-rnd variant), result >> 6,
     * matching ((8-x)(8-y)A + x(8-y)B + (8-x)yC + xyD + 32) >> 6. */
    __asm__ volatile(
        "movd %0, %%xmm7 \n\t"
        "movd %1, %%xmm6 \n\t"
        "movdqa %2, %%xmm5 \n\t"
        "pshuflw $0, %%xmm7, %%xmm7 \n\t"
        "pshuflw $0, %%xmm6, %%xmm6 \n\t"
        "movlhps %%xmm7, %%xmm7 \n\t"
        "movlhps %%xmm6, %%xmm6 \n\t"
        :: "r"((x*255+8)*(8-y)), "r"((x*255+8)*y), "m"(*(rnd?&ff_pw_32:&ff_pw_28))
    );

    /* xmm0 carries the interleaved {p[i], p[i+1]} pair row from the
     * previous iteration (primed before the loop, refreshed at the bottom
     * via xmm4), so each source row is loaded only once. */
    __asm__ volatile(
        "movq (%1), %%xmm0 \n\t"
        "movq 1(%1), %%xmm1 \n\t"
        "punpcklbw %%xmm1, %%xmm0 \n\t"
        "add %3, %1 \n\t"
        "1: \n\t"
        "movq (%1), %%xmm1 \n\t"
        "movq 1(%1), %%xmm2 \n\t"
        "movq (%1,%3), %%xmm3 \n\t"
        "movq 1(%1,%3), %%xmm4 \n\t"
        "lea (%1,%3,2), %1 \n\t"
        "punpcklbw %%xmm2, %%xmm1 \n\t"
        "punpcklbw %%xmm4, %%xmm3 \n\t"
        "movdqa %%xmm1, %%xmm2 \n\t"
        "movdqa %%xmm3, %%xmm4 \n\t"
        /* row pair 1: xmm0 (prev row, top weights) + xmm1 (bottom weights)
         * row pair 2: xmm2 (top weights) + xmm3 (bottom weights) */
        "pmaddubsw %%xmm7, %%xmm0 \n\t"
        "pmaddubsw %%xmm6, %%xmm1 \n\t"
        "pmaddubsw %%xmm7, %%xmm2 \n\t"
        "pmaddubsw %%xmm6, %%xmm3 \n\t"
        "paddw %%xmm5, %%xmm0 \n\t"
        "paddw %%xmm5, %%xmm2 \n\t"
        "paddw %%xmm0, %%xmm1 \n\t"
        "paddw %%xmm2, %%xmm3 \n\t"
        /* save last loaded row for the next iteration before clobbering */
        "movdqa %%xmm4, %%xmm0 \n\t"
        "psrlw $6, %%xmm1 \n\t"
        "psrlw $6, %%xmm3 \n\t"
        AVG_OP("movq (%0), %%xmm2 \n\t")
        AVG_OP("movhps (%0,%3), %%xmm2 \n\t")
        "packuswb %%xmm3, %%xmm1 \n\t"
        AVG_OP("pavgb %%xmm2, %%xmm1 \n\t")
        "movq %%xmm1, (%0)\n\t"
        "movhps %%xmm1, (%0,%3)\n\t"
        "sub $2, %2 \n\t"
        "lea (%0,%3,2), %0 \n\t"
        "jg 1b \n\t"
        :"+r"(dst), "+r"(src), "+r"(h)
        :"r"((x86_reg)stride)
    );
}

/**
 * 4-wide H.264 chroma MC, MMX-register variant.
 *
 * Same bilinear scheme as the general case above (same weight packing,
 * bias 32, >> 6), but on 64-bit mm registers with pshufw for the
 * broadcast — pmaddubsw on mm registers still requires SSSE3.  Unlike the
 * 8-wide template there is no rnd parameter and no special-casing of
 * x==0 / y==0; the full bilinear filter is always applied.
 * h is decremented by 2 per iteration, so it is expected to be even.
 */
static void H264_CHROMA_MC4_TMPL(uint8_t *dst/*align 4*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y)
{
    /* mm7 = top-row weight pair, mm6 = bottom-row pair, mm5 = bias 32 */
    __asm__ volatile(
        "movd %0, %%mm7 \n\t"
        "movd %1, %%mm6 \n\t"
        "movq %2, %%mm5 \n\t"
        "pshufw $0, %%mm7, %%mm7 \n\t"
        "pshufw $0, %%mm6, %%mm6 \n\t"
        :: "r"((x*255+8)*(8-y)), "r"((x*255+8)*y), "m"(ff_pw_32)
    );

    /* mm0 carries the previous interleaved row across iterations, as in
     * the 8-wide bilinear loop (refreshed from mm4 each pass). */
    __asm__ volatile(
        "movd (%1), %%mm0 \n\t"
        "punpcklbw 1(%1), %%mm0 \n\t"
        "add %3, %1 \n\t"
        "1: \n\t"
        "movd (%1), %%mm1 \n\t"
        "movd (%1,%3), %%mm3 \n\t"
        "punpcklbw 1(%1), %%mm1 \n\t"
        "punpcklbw 1(%1,%3), %%mm3 \n\t"
        "lea (%1,%3,2), %1 \n\t"
        "movq %%mm1, %%mm2 \n\t"
        "movq %%mm3, %%mm4 \n\t"
        "pmaddubsw %%mm7, %%mm0 \n\t"
        "pmaddubsw %%mm6, %%mm1 \n\t"
        "pmaddubsw %%mm7, %%mm2 \n\t"
        "pmaddubsw %%mm6, %%mm3 \n\t"
        "paddw %%mm5, %%mm0 \n\t"
        "paddw %%mm5, %%mm2 \n\t"
        "paddw %%mm0, %%mm1 \n\t"
        "paddw %%mm2, %%mm3 \n\t"
        "movq %%mm4, %%mm0 \n\t"
        "psrlw $6, %%mm1 \n\t"
        "psrlw $6, %%mm3 \n\t"
        "packuswb %%mm1, %%mm1 \n\t"
        "packuswb %%mm3, %%mm3 \n\t"
        /* avg variant: pavgb directly against memory operands here */
        AVG_OP("pavgb (%0), %%mm1 \n\t")
        AVG_OP("pavgb (%0,%3), %%mm3 \n\t")
        "movd %%mm1, (%0)\n\t"
        "movd %%mm3, (%0,%3)\n\t"
        "sub $2, %2 \n\t"
        "lea (%0,%3,2), %0 \n\t"
        "jg 1b \n\t"
        :"+r"(dst), "+r"(src), "+r"(h)
        :"r"((x86_reg)stride)
    );
}