1/* 2 * The simplest mpeg encoder (well, it was the simplest!) 3 * Copyright (c) 2000,2001 Fabrice Bellard 4 * 5 * This file is part of FFmpeg. 6 * 7 * FFmpeg is free software; you can redistribute it and/or 8 * modify it under the terms of the GNU Lesser General Public 9 * License as published by the Free Software Foundation; either 10 * version 2.1 of the License, or (at your option) any later version. 11 * 12 * FFmpeg is distributed in the hope that it will be useful, 13 * but WITHOUT ANY WARRANTY; without even the implied warranty of 14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 15 * Lesser General Public License for more details. 16 * 17 * You should have received a copy of the GNU Lesser General Public 18 * License along with FFmpeg; if not, write to the Free Software 19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 20 */ 21 22#include "libavutil/attributes.h" 23#include "libavutil/cpu.h" 24#include "libavutil/x86/asm.h" 25#include "libavutil/x86/cpu.h" 26#include "libavcodec/avcodec.h" 27#include "libavcodec/dct.h" 28#include "libavcodec/mpegvideo.h" 29 30/* not permutated inverse zigzag_direct + 1 for MMX quantizer */ 31DECLARE_ALIGNED(16, static uint16_t, inv_zigzag_direct16)[64]; 32 33#if HAVE_6REGS 34 35#if HAVE_MMX_INLINE 36#define COMPILE_TEMPLATE_MMXEXT 0 37#define COMPILE_TEMPLATE_SSE2 0 38#define COMPILE_TEMPLATE_SSSE3 0 39#define RENAME(a) a ## _mmx 40#define RENAME_FDCT(a) a ## _mmx 41#include "mpegvideoenc_template.c" 42#endif /* HAVE_MMX_INLINE */ 43 44#if HAVE_MMXEXT_INLINE 45#undef COMPILE_TEMPLATE_SSSE3 46#undef COMPILE_TEMPLATE_SSE2 47#undef COMPILE_TEMPLATE_MMXEXT 48#define COMPILE_TEMPLATE_MMXEXT 1 49#define COMPILE_TEMPLATE_SSE2 0 50#define COMPILE_TEMPLATE_SSSE3 0 51#undef RENAME 52#undef RENAME_FDCT 53#define RENAME(a) a ## _mmxext 54#define RENAME_FDCT(a) a ## _mmxext 55#include "mpegvideoenc_template.c" 56#endif /* HAVE_MMXEXT_INLINE */ 57 58#if HAVE_SSE2_INLINE 59#undef COMPILE_TEMPLATE_MMXEXT 60#undef COMPILE_TEMPLATE_SSE2 61#undef COMPILE_TEMPLATE_SSSE3 62#define COMPILE_TEMPLATE_MMXEXT 0 63#define COMPILE_TEMPLATE_SSE2 1 64#define COMPILE_TEMPLATE_SSSE3 0 65#undef RENAME 66#undef RENAME_FDCT 67#define RENAME(a) a ## _sse2 68#define RENAME_FDCT(a) a ## _sse2 69#include "mpegvideoenc_template.c" 70#endif /* HAVE_SSE2_INLINE */ 71 72#if HAVE_SSSE3_INLINE 73#undef COMPILE_TEMPLATE_MMXEXT 74#undef COMPILE_TEMPLATE_SSE2 75#undef COMPILE_TEMPLATE_SSSE3 76#define COMPILE_TEMPLATE_MMXEXT 0 77#define COMPILE_TEMPLATE_SSE2 1 78#define COMPILE_TEMPLATE_SSSE3 1 79#undef RENAME 80#undef RENAME_FDCT 81#define RENAME(a) a ## _ssse3 82#define RENAME_FDCT(a) a ## _sse2 83#include "mpegvideoenc_template.c" 84#endif /* HAVE_SSSE3_INLINE */ 85 86#endif /* HAVE_6REGS */ 87 88#if HAVE_INLINE_ASM 89static void denoise_dct_mmx(MpegEncContext *s, int16_t *block){ 90 const int intra= s->mb_intra; 91 int *sum= s->dct_error_sum[intra]; 92 uint16_t *offset= s->dct_offset[intra]; 93 94 s->dct_count[intra]++; 95 96 __asm__ volatile( 97 "pxor %%mm7, %%mm7 \n\t" 98 "1: \n\t" 99 "pxor %%mm0, %%mm0 \n\t" 100 "pxor %%mm1, %%mm1 \n\t" 101 "movq (%0), %%mm2 \n\t" 102 "movq 8(%0), %%mm3 \n\t" 103 "pcmpgtw %%mm2, %%mm0 \n\t" 104 "pcmpgtw %%mm3, %%mm1 \n\t" 105 "pxor %%mm0, %%mm2 \n\t" 106 "pxor %%mm1, %%mm3 \n\t" 107 "psubw %%mm0, %%mm2 \n\t" 108 "psubw %%mm1, %%mm3 \n\t" 109 "movq %%mm2, %%mm4 \n\t" 110 "movq %%mm3, %%mm5 \n\t" 111 "psubusw (%2), %%mm2 \n\t" 112 "psubusw 8(%2), %%mm3 \n\t" 113 "pxor %%mm0, %%mm2 \n\t" 114 "pxor %%mm1, %%mm3 \n\t" 115 "psubw %%mm0, %%mm2 \n\t" 116 "psubw %%mm1, %%mm3 \n\t" 117 "movq %%mm2, (%0) \n\t" 118 "movq %%mm3, 8(%0) \n\t" 119 "movq %%mm4, %%mm2 \n\t" 120 "movq %%mm5, %%mm3 \n\t" 121 "punpcklwd %%mm7, %%mm4 \n\t" 122 "punpckhwd %%mm7, %%mm2 \n\t" 123 "punpcklwd %%mm7, %%mm5 \n\t" 124 "punpckhwd %%mm7, %%mm3 \n\t" 125 "paddd (%1), %%mm4 \n\t" 126 "paddd 8(%1), %%mm2 \n\t" 127 "paddd 16(%1), %%mm5 \n\t" 128 "paddd 24(%1), %%mm3 \n\t" 129 "movq %%mm4, (%1) \n\t" 130 "movq %%mm2, 8(%1) \n\t" 131 "movq %%mm5, 16(%1) \n\t" 132 "movq %%mm3, 24(%1) \n\t" 133 "add $16, %0 \n\t" 134 "add $32, %1 \n\t" 135 "add $16, %2 \n\t" 136 "cmp %3, %0 \n\t" 137 " jb 1b \n\t" 138 : "+r" (block), "+r" (sum), "+r" (offset) 139 : "r"(block+64) 140 ); 141} 142 143static void denoise_dct_sse2(MpegEncContext *s, int16_t *block){ 144 const int intra= s->mb_intra; 145 int *sum= s->dct_error_sum[intra]; 146 uint16_t *offset= s->dct_offset[intra]; 147 148 s->dct_count[intra]++; 149 150 __asm__ volatile( 151 "pxor %%xmm7, %%xmm7 \n\t" 152 "1: \n\t" 153 "pxor %%xmm0, %%xmm0 \n\t" 154 "pxor %%xmm1, %%xmm1 \n\t" 155 "movdqa (%0), %%xmm2 \n\t" 156 "movdqa 16(%0), %%xmm3 \n\t" 157 "pcmpgtw %%xmm2, %%xmm0 \n\t" 158 "pcmpgtw %%xmm3, %%xmm1 \n\t" 159 "pxor %%xmm0, %%xmm2 \n\t" 160 "pxor %%xmm1, %%xmm3 \n\t" 161 "psubw %%xmm0, %%xmm2 \n\t" 162 "psubw %%xmm1, %%xmm3 \n\t" 163 "movdqa %%xmm2, %%xmm4 \n\t" 164 "movdqa %%xmm3, %%xmm5 \n\t" 165 "psubusw (%2), %%xmm2 \n\t" 166 "psubusw 16(%2), %%xmm3 \n\t" 167 "pxor %%xmm0, %%xmm2 \n\t" 168 "pxor %%xmm1, %%xmm3 \n\t" 169 "psubw %%xmm0, %%xmm2 \n\t" 170 "psubw %%xmm1, %%xmm3 \n\t" 171 "movdqa %%xmm2, (%0) \n\t" 172 "movdqa %%xmm3, 16(%0) \n\t" 173 "movdqa %%xmm4, %%xmm6 \n\t" 174 "movdqa %%xmm5, %%xmm0 \n\t" 175 "punpcklwd %%xmm7, %%xmm4 \n\t" 176 "punpckhwd %%xmm7, %%xmm6 \n\t" 177 "punpcklwd %%xmm7, %%xmm5 \n\t" 178 "punpckhwd %%xmm7, %%xmm0 \n\t" 179 "paddd (%1), %%xmm4 \n\t" 180 "paddd 16(%1), %%xmm6 \n\t" 181 "paddd 32(%1), %%xmm5 \n\t" 182 "paddd 48(%1), %%xmm0 \n\t" 183 "movdqa %%xmm4, (%1) \n\t" 184 "movdqa %%xmm6, 16(%1) \n\t" 185 "movdqa %%xmm5, 32(%1) \n\t" 186 "movdqa %%xmm0, 48(%1) \n\t" 187 "add $32, %0 \n\t" 188 "add $64, %1 \n\t" 189 "add $32, %2 \n\t" 190 "cmp %3, %0 \n\t" 191 " jb 1b \n\t" 192 : "+r" (block), "+r" (sum), "+r" (offset) 193 : "r"(block+64) 194 XMM_CLOBBERS_ONLY("%xmm0", "%xmm1", "%xmm2", "%xmm3", 195 "%xmm4", "%xmm5", "%xmm6", "%xmm7") 196 ); 197} 198#endif /* HAVE_INLINE_ASM */ 199 200av_cold void ff_dct_encode_init_x86(MpegEncContext *s) 201{ 202 const int dct_algo = s->avctx->dct_algo; 203 int i; 204 205 for (i = 0; i < 64; i++) 206 inv_zigzag_direct16[ff_zigzag_direct[i]] = i + 1; 207 208 if (dct_algo == FF_DCT_AUTO || dct_algo == FF_DCT_MMX) { 209#if HAVE_MMX_INLINE 210 int cpu_flags = av_get_cpu_flags(); 211 if (INLINE_MMX(cpu_flags)) { 212#if HAVE_6REGS 213 s->dct_quantize = dct_quantize_mmx; 214#endif 215 s->denoise_dct = denoise_dct_mmx; 216 } 217#endif 218#if HAVE_6REGS && HAVE_MMXEXT_INLINE 219 if (INLINE_MMXEXT(cpu_flags)) 220 s->dct_quantize = dct_quantize_mmxext; 221#endif 222#if HAVE_SSE2_INLINE 223 if (INLINE_SSE2(cpu_flags)) { 224#if HAVE_6REGS 225 s->dct_quantize = dct_quantize_sse2; 226#endif 227 s->denoise_dct = denoise_dct_sse2; 228 } 229#endif 230#if HAVE_6REGS && HAVE_SSSE3_INLINE 231 if (INLINE_SSSE3(cpu_flags)) 232 s->dct_quantize = dct_quantize_ssse3; 233#endif 234 } 235} 236