1/* 2 * Copyright (c) 2004-2005 Michael Niedermayer, Loren Merritt 3 * Copyright (c) 2011 Daniel Kang 4 * 5 * This file is part of FFmpeg. 6 * 7 * FFmpeg is free software; you can redistribute it and/or 8 * modify it under the terms of the GNU Lesser General Public 9 * License as published by the Free Software Foundation; either 10 * version 2.1 of the License, or (at your option) any later version. 11 * 12 * FFmpeg is distributed in the hope that it will be useful, 13 * but WITHOUT ANY WARRANTY; without even the implied warranty of 14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 15 * Lesser General Public License for more details. 16 * 17 * You should have received a copy of the GNU Lesser General Public 18 * License along with FFmpeg; if not, write to the Free Software 19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 20 */ 21 22#include "libavutil/attributes.h" 23#include "libavutil/cpu.h" 24#include "libavutil/x86/asm.h" 25#include "libavutil/x86/cpu.h" 26#include "libavcodec/h264qpel.h" 27#include "libavcodec/mpegvideo.h" 28#include "libavcodec/pixels.h" 29#include "fpel.h" 30 31#if HAVE_YASM 32void ff_put_pixels4_mmx(uint8_t *block, const uint8_t *pixels, 33 ptrdiff_t line_size, int h); 34void ff_avg_pixels4_mmxext(uint8_t *block, const uint8_t *pixels, 35 ptrdiff_t line_size, int h); 36void ff_put_pixels4_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2, 37 int dstStride, int src1Stride, int h); 38void ff_avg_pixels4_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2, 39 int dstStride, int src1Stride, int h); 40void ff_put_pixels8_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2, 41 int dstStride, int src1Stride, int h); 42void ff_avg_pixels8_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2, 43 int dstStride, int src1Stride, int h); 44void ff_put_pixels16_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2, 45 int dstStride, int src1Stride, int h); 46void ff_avg_pixels16_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2, 47 int dstStride, int src1Stride, int h); 48#define ff_put_pixels8_l2_sse2 ff_put_pixels8_l2_mmxext 49#define ff_avg_pixels8_l2_sse2 ff_avg_pixels8_l2_mmxext 50#define ff_put_pixels16_l2_sse2 ff_put_pixels16_l2_mmxext 51#define ff_avg_pixels16_l2_sse2 ff_avg_pixels16_l2_mmxext 52#define ff_put_pixels16_mmxext ff_put_pixels16_mmx 53#define ff_put_pixels8_mmxext ff_put_pixels8_mmx 54#define ff_put_pixels4_mmxext ff_put_pixels4_mmx 55 56#define DEF_QPEL(OPNAME)\ 57void ff_ ## OPNAME ## _h264_qpel4_h_lowpass_mmxext(uint8_t *dst, uint8_t *src, int dstStride, int srcStride);\ 58void ff_ ## OPNAME ## _h264_qpel8_h_lowpass_mmxext(uint8_t *dst, uint8_t *src, int dstStride, int srcStride);\ 59void ff_ ## OPNAME ## _h264_qpel8_h_lowpass_ssse3(uint8_t *dst, uint8_t *src, int dstStride, int srcStride);\ 60void ff_ ## OPNAME ## _h264_qpel4_h_lowpass_l2_mmxext(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride);\ 61void ff_ ## OPNAME ## _h264_qpel8_h_lowpass_l2_mmxext(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride);\ 62void ff_ ## OPNAME ## _h264_qpel8_h_lowpass_l2_ssse3(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride);\ 63void ff_ ## OPNAME ## _h264_qpel4_v_lowpass_mmxext(uint8_t *dst, uint8_t *src, int dstStride, int srcStride);\ 64void ff_ ## OPNAME ## _h264_qpel8or16_v_lowpass_op_mmxext(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h);\ 65void ff_ ## OPNAME ## _h264_qpel8or16_v_lowpass_sse2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h);\ 66void ff_ ## OPNAME ## _h264_qpel4_hv_lowpass_v_mmxext(uint8_t *src, int16_t *tmp, int srcStride);\ 67void ff_ ## OPNAME ## _h264_qpel4_hv_lowpass_h_mmxext(int16_t *tmp, uint8_t *dst, int dstStride);\ 68void ff_ ## OPNAME ## _h264_qpel8or16_hv1_lowpass_op_mmxext(uint8_t *src, int16_t *tmp, int srcStride, int size);\ 69void ff_ ## OPNAME ## _h264_qpel8or16_hv1_lowpass_op_sse2(uint8_t *src, int16_t *tmp, int srcStride, int size);\ 70void ff_ ## OPNAME ## _h264_qpel8or16_hv2_lowpass_op_mmxext(uint8_t *dst, int16_t *tmp, int dstStride, int unused, int h);\ 71void ff_ ## OPNAME ## _h264_qpel8or16_hv2_lowpass_ssse3(uint8_t *dst, int16_t *tmp, int dstStride, int tmpStride, int size);\ 72void ff_ ## OPNAME ## _pixels4_l2_shift5_mmxext(uint8_t *dst, int16_t *src16, uint8_t *src8, int dstStride, int src8Stride, int h);\ 73void ff_ ## OPNAME ## _pixels8_l2_shift5_mmxext(uint8_t *dst, int16_t *src16, uint8_t *src8, int dstStride, int src8Stride, int h); 74 75DEF_QPEL(avg) 76DEF_QPEL(put) 77 78#define QPEL_H264(OPNAME, OP, MMX)\ 79static av_always_inline void ff_ ## OPNAME ## h264_qpel4_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\ 80 int w=3;\ 81 src -= 2*srcStride+2;\ 82 while(w--){\ 83 ff_ ## OPNAME ## h264_qpel4_hv_lowpass_v_mmxext(src, tmp, srcStride);\ 84 tmp += 4;\ 85 src += 4;\ 86 }\ 87 tmp -= 3*4;\ 88 ff_ ## OPNAME ## h264_qpel4_hv_lowpass_h_mmxext(tmp, dst, dstStride);\ 89}\ 90\ 91static av_always_inline void ff_ ## OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\ 92 src -= 2*srcStride;\ 93 ff_ ## OPNAME ## h264_qpel8or16_v_lowpass_op_mmxext(dst, src, dstStride, srcStride, h);\ 94 src += 4;\ 95 dst += 4;\ 96 ff_ ## OPNAME ## h264_qpel8or16_v_lowpass_op_mmxext(dst, src, dstStride, srcStride, h);\ 97}\ 98static av_always_inline void ff_ ## OPNAME ## h264_qpel8or16_hv1_lowpass_ ## MMX(int16_t *tmp, uint8_t *src, int tmpStride, int srcStride, int size){\ 99 int w = (size+8)>>2;\ 100 src -= 2*srcStride+2;\ 101 while(w--){\ 102 ff_ ## OPNAME ## h264_qpel8or16_hv1_lowpass_op_mmxext(src, tmp, srcStride, size);\ 103 tmp += 4;\ 104 src += 4;\ 105 }\ 106}\ 107static av_always_inline void ff_ ## OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, int dstStride, int tmpStride, int size){\ 108 int w = size>>4;\ 109 do{\ 110 ff_ ## OPNAME ## h264_qpel8or16_hv2_lowpass_op_mmxext(dst, tmp, dstStride, 0, size);\ 111 tmp += 8;\ 112 dst += 8;\ 113 }while(w--);\ 114}\ 115\ 116static av_always_inline void ff_ ## OPNAME ## h264_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ 117 ff_ ## OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst , src , dstStride, srcStride, 8);\ 118}\ 119static av_always_inline void ff_ ## OPNAME ## h264_qpel16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ 120 ff_ ## OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst , src , dstStride, srcStride, 16);\ 121 ff_ ## OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride, 16);\ 122}\ 123\ 124static av_always_inline void ff_ ## OPNAME ## h264_qpel16_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ 125 ff_ ## OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst , src , dstStride, srcStride);\ 126 ff_ ## OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\ 127 src += 8*srcStride;\ 128 dst += 8*dstStride;\ 129 ff_ ## OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst , src , dstStride, srcStride);\ 130 ff_ ## OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\ 131}\ 132\ 133static av_always_inline void ff_ ## OPNAME ## h264_qpel16_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\ 134 ff_ ## OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst , src , src2 , dstStride, src2Stride);\ 135 ff_ ## OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\ 136 src += 8*dstStride;\ 137 dst += 8*dstStride;\ 138 src2 += 8*src2Stride;\ 139 ff_ ## OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst , src , src2 , dstStride, src2Stride);\ 140 ff_ ## OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\ 141}\ 142\ 143static av_always_inline void ff_ ## OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride, int size){\ 144 ff_put_h264_qpel8or16_hv1_lowpass_ ## MMX(tmp, src, tmpStride, srcStride, size);\ 145 ff_ ## OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(dst, tmp, dstStride, tmpStride, size);\ 146}\ 147static av_always_inline void ff_ ## OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\ 148 ff_ ## OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst , tmp , src , dstStride, tmpStride, srcStride, 8);\ 149}\ 150\ 151static av_always_inline void ff_ ## OPNAME ## h264_qpel16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\ 152 ff_ ## OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst , tmp , src , dstStride, tmpStride, srcStride, 16);\ 153}\ 154\ 155static av_always_inline void ff_ ## OPNAME ## pixels16_l2_shift5_ ## MMX(uint8_t *dst, int16_t *src16, uint8_t *src8, int dstStride, int src8Stride, int h)\ 156{\ 157 ff_ ## OPNAME ## pixels8_l2_shift5_ ## MMX(dst , src16 , src8 , dstStride, src8Stride, h);\ 158 ff_ ## OPNAME ## pixels8_l2_shift5_ ## MMX(dst+8, src16+8, src8+8, dstStride, src8Stride, h);\ 159}\ 160 161 162#if ARCH_X86_64 163#define QPEL_H264_H16_XMM(OPNAME, OP, MMX)\ 164 165void ff_avg_h264_qpel16_h_lowpass_l2_ssse3(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride); 166void ff_put_h264_qpel16_h_lowpass_l2_ssse3(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride); 167 168#else // ARCH_X86_64 169#define QPEL_H264_H16_XMM(OPNAME, OP, MMX)\ 170static av_always_inline void ff_ ## OPNAME ## h264_qpel16_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\ 171 ff_ ## OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst , src , src2 , dstStride, src2Stride);\ 172 ff_ ## OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\ 173 src += 8*dstStride;\ 174 dst += 8*dstStride;\ 175 src2 += 8*src2Stride;\ 176 ff_ ## OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst , src , src2 , dstStride, src2Stride);\ 177 ff_ ## OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\ 178} 179#endif // ARCH_X86_64 180 181#define QPEL_H264_H_XMM(OPNAME, OP, MMX)\ 182QPEL_H264_H16_XMM(OPNAME, OP, MMX)\ 183static av_always_inline void ff_ ## OPNAME ## h264_qpel16_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ 184 ff_ ## OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst , src , dstStride, srcStride);\ 185 ff_ ## OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\ 186 src += 8*srcStride;\ 187 dst += 8*dstStride;\ 188 ff_ ## OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst , src , dstStride, srcStride);\ 189 ff_ ## OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\ 190}\ 191 192#define QPEL_H264_V_XMM(OPNAME, OP, MMX)\ 193static av_always_inline void ff_ ## OPNAME ## h264_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ 194 ff_ ## OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst , src , dstStride, srcStride, 8);\ 195}\ 196static av_always_inline void ff_ ## OPNAME ## h264_qpel16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ 197 ff_ ## OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst , src , dstStride, srcStride, 16);\ 198 ff_ ## OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride, 16);\ 199} 200 201static av_always_inline void put_h264_qpel8or16_hv1_lowpass_sse2(int16_t *tmp, 202 uint8_t *src, 203 int tmpStride, 204 int srcStride, 205 int size) 206{ 207 int w = (size+8)>>3; 208 src -= 2*srcStride+2; 209 while(w--){ 210 ff_put_h264_qpel8or16_hv1_lowpass_op_sse2(src, tmp, srcStride, size); 211 tmp += 8; 212 src += 8; 213 } 214} 215 216#define QPEL_H264_HV_XMM(OPNAME, OP, MMX)\ 217static av_always_inline void ff_ ## OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride, int size){\ 218 put_h264_qpel8or16_hv1_lowpass_sse2(tmp, src, tmpStride, srcStride, size);\ 219 ff_ ## OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(dst, tmp, dstStride, tmpStride, size);\ 220}\ 221static av_always_inline void ff_ ## OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\ 222 ff_ ## OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst, tmp, src, dstStride, tmpStride, srcStride, 8);\ 223}\ 224static av_always_inline void ff_ ## OPNAME ## h264_qpel16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\ 225 ff_ ## OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst, tmp, src, dstStride, tmpStride, srcStride, 16);\ 226}\ 227 228#define ff_put_h264_qpel8_h_lowpass_l2_sse2 ff_put_h264_qpel8_h_lowpass_l2_mmxext 229#define ff_avg_h264_qpel8_h_lowpass_l2_sse2 ff_avg_h264_qpel8_h_lowpass_l2_mmxext 230#define ff_put_h264_qpel16_h_lowpass_l2_sse2 ff_put_h264_qpel16_h_lowpass_l2_mmxext 231#define ff_avg_h264_qpel16_h_lowpass_l2_sse2 ff_avg_h264_qpel16_h_lowpass_l2_mmxext 232 233#define ff_put_h264_qpel8_v_lowpass_ssse3 ff_put_h264_qpel8_v_lowpass_sse2 234#define ff_avg_h264_qpel8_v_lowpass_ssse3 ff_avg_h264_qpel8_v_lowpass_sse2 235#define ff_put_h264_qpel16_v_lowpass_ssse3 ff_put_h264_qpel16_v_lowpass_sse2 236#define ff_avg_h264_qpel16_v_lowpass_ssse3 ff_avg_h264_qpel16_v_lowpass_sse2 237 238#define ff_put_h264_qpel8or16_hv2_lowpass_sse2 ff_put_h264_qpel8or16_hv2_lowpass_mmxext 239#define ff_avg_h264_qpel8or16_hv2_lowpass_sse2 ff_avg_h264_qpel8or16_hv2_lowpass_mmxext 240 241#define H264_MC(OPNAME, SIZE, MMX, ALIGN) \ 242H264_MC_C(OPNAME, SIZE, MMX, ALIGN)\ 243H264_MC_V(OPNAME, SIZE, MMX, ALIGN)\ 244H264_MC_H(OPNAME, SIZE, MMX, ALIGN)\ 245H264_MC_HV(OPNAME, SIZE, MMX, ALIGN)\ 246 247static void put_h264_qpel16_mc00_sse2 (uint8_t *dst, uint8_t *src, 248 ptrdiff_t stride) 249{ 250 ff_put_pixels16_sse2(dst, src, stride, 16); 251} 252static void avg_h264_qpel16_mc00_sse2 (uint8_t *dst, uint8_t *src, 253 ptrdiff_t stride) 254{ 255 ff_avg_pixels16_sse2(dst, src, stride, 16); 256} 257#define put_h264_qpel8_mc00_sse2 put_h264_qpel8_mc00_mmxext 258#define avg_h264_qpel8_mc00_sse2 avg_h264_qpel8_mc00_mmxext 259 260#define H264_MC_C(OPNAME, SIZE, MMX, ALIGN) \ 261static void OPNAME ## h264_qpel ## SIZE ## _mc00_ ## MMX (uint8_t *dst, uint8_t *src, ptrdiff_t stride)\ 262{\ 263 ff_ ## OPNAME ## pixels ## SIZE ## _ ## MMX(dst, src, stride, SIZE);\ 264}\ 265 266#define H264_MC_H(OPNAME, SIZE, MMX, ALIGN) \ 267static void OPNAME ## h264_qpel ## SIZE ## _mc10_ ## MMX(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\ 268{\ 269 ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, src, stride, stride);\ 270}\ 271\ 272static void OPNAME ## h264_qpel ## SIZE ## _mc20_ ## MMX(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\ 273{\ 274 ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_ ## MMX(dst, src, stride, stride);\ 275}\ 276\ 277static void OPNAME ## h264_qpel ## SIZE ## _mc30_ ## MMX(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\ 278{\ 279 ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, src+1, stride, stride);\ 280}\ 281 282#define H264_MC_V(OPNAME, SIZE, MMX, ALIGN) \ 283static void OPNAME ## h264_qpel ## SIZE ## _mc01_ ## MMX(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\ 284{\ 285 DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\ 286 ff_put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\ 287 ff_ ## OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, src, temp, stride, stride, SIZE);\ 288}\ 289\ 290static void OPNAME ## h264_qpel ## SIZE ## _mc02_ ## MMX(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\ 291{\ 292 ff_ ## OPNAME ## h264_qpel ## SIZE ## _v_lowpass_ ## MMX(dst, src, stride, stride);\ 293}\ 294\ 295static void OPNAME ## h264_qpel ## SIZE ## _mc03_ ## MMX(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\ 296{\ 297 DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\ 298 ff_put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\ 299 ff_ ## OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, src+stride, temp, stride, stride, SIZE);\ 300}\ 301 302#define H264_MC_HV(OPNAME, SIZE, MMX, ALIGN) \ 303static void OPNAME ## h264_qpel ## SIZE ## _mc11_ ## MMX(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\ 304{\ 305 DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\ 306 ff_put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\ 307 ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, temp, stride, SIZE);\ 308}\ 309\ 310static void OPNAME ## h264_qpel ## SIZE ## _mc31_ ## MMX(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\ 311{\ 312 DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\ 313 ff_put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src+1, SIZE, stride);\ 314 ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, temp, stride, SIZE);\ 315}\ 316\ 317static void OPNAME ## h264_qpel ## SIZE ## _mc13_ ## MMX(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\ 318{\ 319 DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\ 320 ff_put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\ 321 ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src+stride, temp, stride, SIZE);\ 322}\ 323\ 324static void OPNAME ## h264_qpel ## SIZE ## _mc33_ ## MMX(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\ 325{\ 326 DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\ 327 ff_put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src+1, SIZE, stride);\ 328 ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src+stride, temp, stride, SIZE);\ 329}\ 330\ 331static void OPNAME ## h264_qpel ## SIZE ## _mc22_ ## MMX(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\ 332{\ 333 DECLARE_ALIGNED(ALIGN, uint16_t, temp)[SIZE*(SIZE<8?12:24)];\ 334 ff_ ## OPNAME ## h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(dst, temp, src, stride, SIZE, stride);\ 335}\ 336\ 337static void OPNAME ## h264_qpel ## SIZE ## _mc21_ ## MMX(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\ 338{\ 339 DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE];\ 340 uint8_t * const halfHV= temp;\ 341 int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\ 342 av_assert2(((int)temp & 7) == 0);\ 343 ff_put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\ 344 ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, halfHV, stride, SIZE);\ 345}\ 346\ 347static void OPNAME ## h264_qpel ## SIZE ## _mc23_ ## MMX(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\ 348{\ 349 DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE];\ 350 uint8_t * const halfHV= temp;\ 351 int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\ 352 av_assert2(((int)temp & 7) == 0);\ 353 ff_put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\ 354 ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src+stride, halfHV, stride, SIZE);\ 355}\ 356\ 357static void OPNAME ## h264_qpel ## SIZE ## _mc12_ ## MMX(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\ 358{\ 359 DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE];\ 360 uint8_t * const halfHV= temp;\ 361 int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\ 362 av_assert2(((int)temp & 7) == 0);\ 363 ff_put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\ 364 ff_ ## OPNAME ## pixels ## SIZE ## _l2_shift5_mmxext(dst, halfV+2, halfHV, stride, SIZE, SIZE);\ 365}\ 366\ 367static void OPNAME ## h264_qpel ## SIZE ## _mc32_ ## MMX(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\ 368{\ 369 DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE];\ 370 uint8_t * const halfHV= temp;\ 371 int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\ 372 av_assert2(((int)temp & 7) == 0);\ 373 ff_put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\ 374 ff_ ## OPNAME ## pixels ## SIZE ## _l2_shift5_mmxext(dst, halfV+3, halfHV, stride, SIZE, SIZE);\ 375}\ 376 377#define H264_MC_4816(MMX)\ 378H264_MC(put_, 4, MMX, 8)\ 379H264_MC(put_, 8, MMX, 8)\ 380H264_MC(put_, 16,MMX, 8)\ 381H264_MC(avg_, 4, MMX, 8)\ 382H264_MC(avg_, 8, MMX, 8)\ 383H264_MC(avg_, 16,MMX, 8)\ 384 385#define H264_MC_816(QPEL, XMM)\ 386QPEL(put_, 8, XMM, 16)\ 387QPEL(put_, 16,XMM, 16)\ 388QPEL(avg_, 8, XMM, 16)\ 389QPEL(avg_, 16,XMM, 16)\ 390 391QPEL_H264(put_, PUT_OP, mmxext) 392QPEL_H264(avg_, AVG_MMXEXT_OP, mmxext) 393QPEL_H264_V_XMM(put_, PUT_OP, sse2) 394QPEL_H264_V_XMM(avg_,AVG_MMXEXT_OP, sse2) 395QPEL_H264_HV_XMM(put_, PUT_OP, sse2) 396QPEL_H264_HV_XMM(avg_,AVG_MMXEXT_OP, sse2) 397QPEL_H264_H_XMM(put_, PUT_OP, ssse3) 398QPEL_H264_H_XMM(avg_,AVG_MMXEXT_OP, ssse3) 399QPEL_H264_HV_XMM(put_, PUT_OP, ssse3) 400QPEL_H264_HV_XMM(avg_,AVG_MMXEXT_OP, ssse3) 401 402H264_MC_4816(mmxext) 403H264_MC_816(H264_MC_V, sse2) 404H264_MC_816(H264_MC_HV, sse2) 405H264_MC_816(H264_MC_H, ssse3) 406H264_MC_816(H264_MC_HV, ssse3) 407 408 409//10bit 410#define LUMA_MC_OP(OP, NUM, DEPTH, TYPE, OPT) \ 411void ff_ ## OP ## _h264_qpel ## NUM ## _ ## TYPE ## _ ## DEPTH ## _ ## OPT \ 412 (uint8_t *dst, uint8_t *src, ptrdiff_t stride); 413 414#define LUMA_MC_ALL(DEPTH, TYPE, OPT) \ 415 LUMA_MC_OP(put, 4, DEPTH, TYPE, OPT) \ 416 LUMA_MC_OP(avg, 4, DEPTH, TYPE, OPT) \ 417 LUMA_MC_OP(put, 8, DEPTH, TYPE, OPT) \ 418 LUMA_MC_OP(avg, 8, DEPTH, TYPE, OPT) \ 419 LUMA_MC_OP(put, 16, DEPTH, TYPE, OPT) \ 420 LUMA_MC_OP(avg, 16, DEPTH, TYPE, OPT) 421 422#define LUMA_MC_816(DEPTH, TYPE, OPT) \ 423 LUMA_MC_OP(put, 8, DEPTH, TYPE, OPT) \ 424 LUMA_MC_OP(avg, 8, DEPTH, TYPE, OPT) \ 425 LUMA_MC_OP(put, 16, DEPTH, TYPE, OPT) \ 426 LUMA_MC_OP(avg, 16, DEPTH, TYPE, OPT) 427 428LUMA_MC_ALL(10, mc00, mmxext) 429LUMA_MC_ALL(10, mc10, mmxext) 430LUMA_MC_ALL(10, mc20, mmxext) 431LUMA_MC_ALL(10, mc30, mmxext) 432LUMA_MC_ALL(10, mc01, mmxext) 433LUMA_MC_ALL(10, mc11, mmxext) 434LUMA_MC_ALL(10, mc21, mmxext) 435LUMA_MC_ALL(10, mc31, mmxext) 436LUMA_MC_ALL(10, mc02, mmxext) 437LUMA_MC_ALL(10, mc12, mmxext) 438LUMA_MC_ALL(10, mc22, mmxext) 439LUMA_MC_ALL(10, mc32, mmxext) 440LUMA_MC_ALL(10, mc03, mmxext) 441LUMA_MC_ALL(10, mc13, mmxext) 442LUMA_MC_ALL(10, mc23, mmxext) 443LUMA_MC_ALL(10, mc33, mmxext) 444 445LUMA_MC_816(10, mc00, sse2) 446LUMA_MC_816(10, mc10, sse2) 447LUMA_MC_816(10, mc10, sse2_cache64) 448LUMA_MC_816(10, mc10, ssse3_cache64) 449LUMA_MC_816(10, mc20, sse2) 450LUMA_MC_816(10, mc20, sse2_cache64) 451LUMA_MC_816(10, mc20, ssse3_cache64) 452LUMA_MC_816(10, mc30, sse2) 453LUMA_MC_816(10, mc30, sse2_cache64) 454LUMA_MC_816(10, mc30, ssse3_cache64) 455LUMA_MC_816(10, mc01, sse2) 456LUMA_MC_816(10, mc11, sse2) 457LUMA_MC_816(10, mc21, sse2) 458LUMA_MC_816(10, mc31, sse2) 459LUMA_MC_816(10, mc02, sse2) 460LUMA_MC_816(10, mc12, sse2) 461LUMA_MC_816(10, mc22, sse2) 462LUMA_MC_816(10, mc32, sse2) 463LUMA_MC_816(10, mc03, sse2) 464LUMA_MC_816(10, mc13, sse2) 465LUMA_MC_816(10, mc23, sse2) 466LUMA_MC_816(10, mc33, sse2) 467 468#define QPEL16_OPMC(OP, MC, MMX)\ 469void ff_ ## OP ## _h264_qpel16_ ## MC ## _10_ ## MMX(uint8_t *dst, uint8_t *src, ptrdiff_t stride){\ 470 ff_ ## OP ## _h264_qpel8_ ## MC ## _10_ ## MMX(dst , src , stride);\ 471 ff_ ## OP ## _h264_qpel8_ ## MC ## _10_ ## MMX(dst+16, src+16, stride);\ 472 src += 8*stride;\ 473 dst += 8*stride;\ 474 ff_ ## OP ## _h264_qpel8_ ## MC ## _10_ ## MMX(dst , src , stride);\ 475 ff_ ## OP ## _h264_qpel8_ ## MC ## _10_ ## MMX(dst+16, src+16, stride);\ 476} 477 478#define QPEL16_OP(MC, MMX)\ 479QPEL16_OPMC(put, MC, MMX)\ 480QPEL16_OPMC(avg, MC, MMX) 481 482#define QPEL16(MMX)\ 483QPEL16_OP(mc00, MMX)\ 484QPEL16_OP(mc01, MMX)\ 485QPEL16_OP(mc02, MMX)\ 486QPEL16_OP(mc03, MMX)\ 487QPEL16_OP(mc10, MMX)\ 488QPEL16_OP(mc11, MMX)\ 489QPEL16_OP(mc12, MMX)\ 490QPEL16_OP(mc13, MMX)\ 491QPEL16_OP(mc20, MMX)\ 492QPEL16_OP(mc21, MMX)\ 493QPEL16_OP(mc22, MMX)\ 494QPEL16_OP(mc23, MMX)\ 495QPEL16_OP(mc30, MMX)\ 496QPEL16_OP(mc31, MMX)\ 497QPEL16_OP(mc32, MMX)\ 498QPEL16_OP(mc33, MMX) 499 500#if ARCH_X86_32 // ARCH_X86_64 implies SSE2+ 501QPEL16(mmxext) 502#endif 503 504#endif /* HAVE_YASM */ 505 506#define SET_QPEL_FUNCS(PFX, IDX, SIZE, CPU, PREFIX) \ 507 do { \ 508 c->PFX ## _pixels_tab[IDX][ 0] = PREFIX ## PFX ## SIZE ## _mc00_ ## CPU; \ 509 c->PFX ## _pixels_tab[IDX][ 1] = PREFIX ## PFX ## SIZE ## _mc10_ ## CPU; \ 510 c->PFX ## _pixels_tab[IDX][ 2] = PREFIX ## PFX ## SIZE ## _mc20_ ## CPU; \ 511 c->PFX ## _pixels_tab[IDX][ 3] = PREFIX ## PFX ## SIZE ## _mc30_ ## CPU; \ 512 c->PFX ## _pixels_tab[IDX][ 4] = PREFIX ## PFX ## SIZE ## _mc01_ ## CPU; \ 513 c->PFX ## _pixels_tab[IDX][ 5] = PREFIX ## PFX ## SIZE ## _mc11_ ## CPU; \ 514 c->PFX ## _pixels_tab[IDX][ 6] = PREFIX ## PFX ## SIZE ## _mc21_ ## CPU; \ 515 c->PFX ## _pixels_tab[IDX][ 7] = PREFIX ## PFX ## SIZE ## _mc31_ ## CPU; \ 516 c->PFX ## _pixels_tab[IDX][ 8] = PREFIX ## PFX ## SIZE ## _mc02_ ## CPU; \ 517 c->PFX ## _pixels_tab[IDX][ 9] = PREFIX ## PFX ## SIZE ## _mc12_ ## CPU; \ 518 c->PFX ## _pixels_tab[IDX][10] = PREFIX ## PFX ## SIZE ## _mc22_ ## CPU; \ 519 c->PFX ## _pixels_tab[IDX][11] = PREFIX ## PFX ## SIZE ## _mc32_ ## CPU; \ 520 c->PFX ## _pixels_tab[IDX][12] = PREFIX ## PFX ## SIZE ## _mc03_ ## CPU; \ 521 c->PFX ## _pixels_tab[IDX][13] = PREFIX ## PFX ## SIZE ## _mc13_ ## CPU; \ 522 c->PFX ## _pixels_tab[IDX][14] = PREFIX ## PFX ## SIZE ## _mc23_ ## CPU; \ 523 c->PFX ## _pixels_tab[IDX][15] = PREFIX ## PFX ## SIZE ## _mc33_ ## CPU; \ 524 } while (0) 525 526#define H264_QPEL_FUNCS(x, y, CPU) \ 527 do { \ 528 c->put_h264_qpel_pixels_tab[0][x + y * 4] = put_h264_qpel16_mc ## x ## y ## _ ## CPU; \ 529 c->put_h264_qpel_pixels_tab[1][x + y * 4] = put_h264_qpel8_mc ## x ## y ## _ ## CPU; \ 530 c->avg_h264_qpel_pixels_tab[0][x + y * 4] = avg_h264_qpel16_mc ## x ## y ## _ ## CPU; \ 531 c->avg_h264_qpel_pixels_tab[1][x + y * 4] = avg_h264_qpel8_mc ## x ## y ## _ ## CPU; \ 532 } while (0) 533 534#define H264_QPEL_FUNCS_10(x, y, CPU) \ 535 do { \ 536 c->put_h264_qpel_pixels_tab[0][x + y * 4] = ff_put_h264_qpel16_mc ## x ## y ## _10_ ## CPU; \ 537 c->put_h264_qpel_pixels_tab[1][x + y * 4] = ff_put_h264_qpel8_mc ## x ## y ## _10_ ## CPU; \ 538 c->avg_h264_qpel_pixels_tab[0][x + y * 4] = ff_avg_h264_qpel16_mc ## x ## y ## _10_ ## CPU; \ 539 c->avg_h264_qpel_pixels_tab[1][x + y * 4] = ff_avg_h264_qpel8_mc ## x ## y ## _10_ ## CPU; \ 540 } while (0) 541 542av_cold void ff_h264qpel_init_x86(H264QpelContext *c, int bit_depth) 543{ 544#if HAVE_YASM 545 int high_bit_depth = bit_depth > 8; 546 int cpu_flags = av_get_cpu_flags(); 547 548 if (EXTERNAL_MMXEXT(cpu_flags)) { 549 if (!high_bit_depth) { 550 SET_QPEL_FUNCS(put_h264_qpel, 0, 16, mmxext, ); 551 SET_QPEL_FUNCS(put_h264_qpel, 1, 8, mmxext, ); 552 SET_QPEL_FUNCS(put_h264_qpel, 2, 4, mmxext, ); 553 SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, mmxext, ); 554 SET_QPEL_FUNCS(avg_h264_qpel, 1, 8, mmxext, ); 555 SET_QPEL_FUNCS(avg_h264_qpel, 2, 4, mmxext, ); 556 } else if (bit_depth == 10) { 557#if ARCH_X86_32 558 SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, 10_mmxext, ff_); 559 SET_QPEL_FUNCS(put_h264_qpel, 0, 16, 10_mmxext, ff_); 560 SET_QPEL_FUNCS(put_h264_qpel, 1, 8, 10_mmxext, ff_); 561 SET_QPEL_FUNCS(avg_h264_qpel, 1, 8, 10_mmxext, ff_); 562#endif 563 SET_QPEL_FUNCS(put_h264_qpel, 2, 4, 10_mmxext, ff_); 564 SET_QPEL_FUNCS(avg_h264_qpel, 2, 4, 10_mmxext, ff_); 565 } 566 } 567 568 if (EXTERNAL_SSE2(cpu_flags)) { 569 if (!(cpu_flags & AV_CPU_FLAG_SSE2SLOW) && !high_bit_depth) { 570 // these functions are slower than mmx on AMD, but faster on Intel 571 H264_QPEL_FUNCS(0, 0, sse2); 572 } 573 574 if (!high_bit_depth) { 575 H264_QPEL_FUNCS(0, 1, sse2); 576 H264_QPEL_FUNCS(0, 2, sse2); 577 H264_QPEL_FUNCS(0, 3, sse2); 578 H264_QPEL_FUNCS(1, 1, sse2); 579 H264_QPEL_FUNCS(1, 2, sse2); 580 H264_QPEL_FUNCS(1, 3, sse2); 581 H264_QPEL_FUNCS(2, 1, sse2); 582 H264_QPEL_FUNCS(2, 2, sse2); 583 H264_QPEL_FUNCS(2, 3, sse2); 584 H264_QPEL_FUNCS(3, 1, sse2); 585 H264_QPEL_FUNCS(3, 2, sse2); 586 H264_QPEL_FUNCS(3, 3, sse2); 587 } 588 589 if (bit_depth == 10) { 590 SET_QPEL_FUNCS(put_h264_qpel, 0, 16, 10_sse2, ff_); 591 SET_QPEL_FUNCS(put_h264_qpel, 1, 8, 10_sse2, ff_); 592 SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, 10_sse2, ff_); 593 SET_QPEL_FUNCS(avg_h264_qpel, 1, 8, 10_sse2, ff_); 594 H264_QPEL_FUNCS_10(1, 0, sse2_cache64); 595 H264_QPEL_FUNCS_10(2, 0, sse2_cache64); 596 H264_QPEL_FUNCS_10(3, 0, sse2_cache64); 597 } 598 } 599 600 if (EXTERNAL_SSSE3(cpu_flags)) { 601 if (!high_bit_depth) { 602 H264_QPEL_FUNCS(1, 0, ssse3); 603 H264_QPEL_FUNCS(1, 1, ssse3); 604 H264_QPEL_FUNCS(1, 2, ssse3); 605 H264_QPEL_FUNCS(1, 3, ssse3); 606 H264_QPEL_FUNCS(2, 0, ssse3); 607 H264_QPEL_FUNCS(2, 1, ssse3); 608 H264_QPEL_FUNCS(2, 2, ssse3); 609 H264_QPEL_FUNCS(2, 3, ssse3); 610 H264_QPEL_FUNCS(3, 0, ssse3); 611 H264_QPEL_FUNCS(3, 1, ssse3); 612 H264_QPEL_FUNCS(3, 2, ssse3); 613 H264_QPEL_FUNCS(3, 3, ssse3); 614 } 615 616 if (bit_depth == 10) { 617 H264_QPEL_FUNCS_10(1, 0, ssse3_cache64); 618 H264_QPEL_FUNCS_10(2, 0, ssse3_cache64); 619 H264_QPEL_FUNCS_10(3, 0, ssse3_cache64); 620 } 621 } 622 623 if (EXTERNAL_AVX(cpu_flags)) { 624 /* AVX implies 64 byte cache lines without the need to avoid unaligned 625 * memory accesses that cross the boundary between two cache lines. 626 * TODO: Port X264_CPU_CACHELINE_32/64 detection from x264 to avoid 627 * having to treat SSE2 functions with such properties as AVX. */ 628 if (bit_depth == 10) { 629 H264_QPEL_FUNCS_10(1, 0, sse2); 630 H264_QPEL_FUNCS_10(2, 0, sse2); 631 H264_QPEL_FUNCS_10(3, 0, sse2); 632 } 633 } 634#endif 635} 636