1/* 2 * MMX optimized DSP utils 3 * Copyright (c) 2007 Aurelien Jacobs <aurel@gnuage.org> 4 * 5 * This file is part of FFmpeg. 6 * 7 * FFmpeg is free software; you can redistribute it and/or 8 * modify it under the terms of the GNU Lesser General Public 9 * License as published by the Free Software Foundation; either 10 * version 2.1 of the License, or (at your option) any later version. 11 * 12 * FFmpeg is distributed in the hope that it will be useful, 13 * but WITHOUT ANY WARRANTY; without even the implied warranty of 14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 15 * Lesser General Public License for more details. 16 * 17 * You should have received a copy of the GNU Lesser General Public 18 * License along with FFmpeg; if not, write to the Free Software 19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 20 */ 21 22#ifndef AVCODEC_X86_DSPUTIL_MMX_H 23#define AVCODEC_X86_DSPUTIL_MMX_H 24 25#include <stdint.h> 26#include "libavcodec/dsputil.h" 27 28typedef struct { uint64_t a, b; } xmm_reg; 29 30extern const uint64_t ff_bone; 31extern const uint64_t ff_wtwo; 32 33extern const uint64_t ff_pdw_80000000[2]; 34 35extern const uint64_t ff_pw_3; 36extern const uint64_t ff_pw_4; 37extern const xmm_reg ff_pw_5; 38extern const xmm_reg ff_pw_8; 39extern const uint64_t ff_pw_15; 40extern const xmm_reg ff_pw_16; 41extern const uint64_t ff_pw_20; 42extern const xmm_reg ff_pw_28; 43extern const xmm_reg ff_pw_32; 44extern const uint64_t ff_pw_42; 45extern const xmm_reg ff_pw_64; 46extern const uint64_t ff_pw_96; 47extern const uint64_t ff_pw_128; 48extern const uint64_t ff_pw_255; 49 50extern const uint64_t ff_pb_1; 51extern const uint64_t ff_pb_3; 52extern const uint64_t ff_pb_7; 53extern const uint64_t ff_pb_1F; 54extern const uint64_t ff_pb_3F; 55extern const uint64_t ff_pb_81; 56extern const uint64_t ff_pb_A1; 57extern const uint64_t ff_pb_FC; 58 59extern const double ff_pd_1[2]; 60extern const double ff_pd_2[2]; 61 62#define LOAD4(stride,in,a,b,c,d)\ 63 "movq 0*"#stride"+"#in", "#a"\n\t"\ 64 "movq 1*"#stride"+"#in", "#b"\n\t"\ 65 "movq 2*"#stride"+"#in", "#c"\n\t"\ 66 "movq 3*"#stride"+"#in", "#d"\n\t" 67 68#define STORE4(stride,out,a,b,c,d)\ 69 "movq "#a", 0*"#stride"+"#out"\n\t"\ 70 "movq "#b", 1*"#stride"+"#out"\n\t"\ 71 "movq "#c", 2*"#stride"+"#out"\n\t"\ 72 "movq "#d", 3*"#stride"+"#out"\n\t" 73 74/* in/out: mma=mma+mmb, mmb=mmb-mma */ 75#define SUMSUB_BA( a, b ) \ 76 "paddw "#b", "#a" \n\t"\ 77 "paddw "#b", "#b" \n\t"\ 78 "psubw "#a", "#b" \n\t" 79 80#define SBUTTERFLY(a,b,t,n,m)\ 81 "mov" #m " " #a ", " #t " \n\t" /* abcd */\ 82 "punpckl" #n " " #b ", " #a " \n\t" /* aebf */\ 83 "punpckh" #n " " #b ", " #t " \n\t" /* cgdh */\ 84 85#define TRANSPOSE4(a,b,c,d,t)\ 86 SBUTTERFLY(a,b,t,wd,q) /* a=aebf t=cgdh */\ 87 SBUTTERFLY(c,d,b,wd,q) /* c=imjn b=kolp */\ 88 SBUTTERFLY(a,c,d,dq,q) /* a=aeim d=bfjn */\ 89 SBUTTERFLY(t,b,c,dq,q) /* t=cgko c=dhlp */ 90 91// e,f,g,h can be memory 92// out: a,d,t,c 93#define TRANSPOSE8x4(a,b,c,d,e,f,g,h,t)\ 94 "punpcklbw " #e ", " #a " \n\t" /* a0 e0 a1 e1 a2 e2 a3 e3 */\ 95 "punpcklbw " #f ", " #b " \n\t" /* b0 f0 b1 f1 b2 f2 b3 f3 */\ 96 "punpcklbw " #g ", " #c " \n\t" /* c0 g0 c1 g1 c2 g2 d3 g3 */\ 97 "punpcklbw " #h ", " #d " \n\t" /* d0 h0 d1 h1 d2 h2 d3 h3 */\ 98 SBUTTERFLY(a, b, t, bw, q) /* a= a0 b0 e0 f0 a1 b1 e1 f1 */\ 99 /* t= a2 b2 e2 f2 a3 b3 e3 f3 */\ 100 SBUTTERFLY(c, d, b, bw, q) /* c= c0 d0 g0 h0 c1 d1 g1 h1 */\ 101 /* b= c2 d2 g2 h2 c3 d3 g3 h3 */\ 102 SBUTTERFLY(a, c, d, wd, q) /* a= a0 b0 c0 d0 e0 f0 g0 h0 */\ 103 /* d= a1 b1 c1 d1 e1 f1 g1 h1 */\ 104 SBUTTERFLY(t, b, c, wd, q) /* t= a2 b2 c2 d2 e2 f2 g2 h2 */\ 105 /* c= a3 b3 c3 d3 e3 f3 g3 h3 */ 106 107#if ARCH_X86_64 108// permutes 01234567 -> 05736421 109#define TRANSPOSE8(a,b,c,d,e,f,g,h,t)\ 110 SBUTTERFLY(a,b,%%xmm8,wd,dqa)\ 111 SBUTTERFLY(c,d,b,wd,dqa)\ 112 SBUTTERFLY(e,f,d,wd,dqa)\ 113 SBUTTERFLY(g,h,f,wd,dqa)\ 114 SBUTTERFLY(a,c,h,dq,dqa)\ 115 SBUTTERFLY(%%xmm8,b,c,dq,dqa)\ 116 SBUTTERFLY(e,g,b,dq,dqa)\ 117 SBUTTERFLY(d,f,g,dq,dqa)\ 118 SBUTTERFLY(a,e,f,qdq,dqa)\ 119 SBUTTERFLY(%%xmm8,d,e,qdq,dqa)\ 120 SBUTTERFLY(h,b,d,qdq,dqa)\ 121 SBUTTERFLY(c,g,b,qdq,dqa)\ 122 "movdqa %%xmm8, "#g" \n\t" 123#else 124#define TRANSPOSE8(a,b,c,d,e,f,g,h,t)\ 125 "movdqa "#h", "#t" \n\t"\ 126 SBUTTERFLY(a,b,h,wd,dqa)\ 127 "movdqa "#h", 16"#t" \n\t"\ 128 "movdqa "#t", "#h" \n\t"\ 129 SBUTTERFLY(c,d,b,wd,dqa)\ 130 SBUTTERFLY(e,f,d,wd,dqa)\ 131 SBUTTERFLY(g,h,f,wd,dqa)\ 132 SBUTTERFLY(a,c,h,dq,dqa)\ 133 "movdqa "#h", "#t" \n\t"\ 134 "movdqa 16"#t", "#h" \n\t"\ 135 SBUTTERFLY(h,b,c,dq,dqa)\ 136 SBUTTERFLY(e,g,b,dq,dqa)\ 137 SBUTTERFLY(d,f,g,dq,dqa)\ 138 SBUTTERFLY(a,e,f,qdq,dqa)\ 139 SBUTTERFLY(h,d,e,qdq,dqa)\ 140 "movdqa "#h", 16"#t" \n\t"\ 141 "movdqa "#t", "#h" \n\t"\ 142 SBUTTERFLY(h,b,d,qdq,dqa)\ 143 SBUTTERFLY(c,g,b,qdq,dqa)\ 144 "movdqa 16"#t", "#g" \n\t" 145#endif 146 147#define MOVQ_WONE(regd) \ 148 __asm__ volatile ( \ 149 "pcmpeqd %%" #regd ", %%" #regd " \n\t" \ 150 "psrlw $15, %%" #regd ::) 151 152void dsputilenc_init_mmx(DSPContext* c, AVCodecContext *avctx); 153void dsputil_init_pix_mmx(DSPContext* c, AVCodecContext *avctx); 154 155void add_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size); 156void put_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size); 157void put_signed_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size); 158 159void ff_cavsdsp_init_mmx2(DSPContext* c, AVCodecContext *avctx); 160void ff_cavsdsp_init_3dnow(DSPContext* c, AVCodecContext *avctx); 161void ff_put_cavs_qpel8_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride); 162void ff_avg_cavs_qpel8_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride); 163void ff_put_cavs_qpel16_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride); 164void ff_avg_cavs_qpel16_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride); 165 166void ff_vc1dsp_init_mmx(DSPContext* dsp, AVCodecContext *avctx); 167void ff_put_vc1_mspel_mc00_mmx(uint8_t *dst, const uint8_t *src, int stride, int rnd); 168void ff_avg_vc1_mspel_mc00_mmx2(uint8_t *dst, const uint8_t *src, int stride, int rnd); 169 170void ff_lpc_compute_autocorr_sse2(const int32_t *data, int len, int lag, 171 double *autoc); 172 173void ff_mmx_idct(DCTELEM *block); 174void ff_mmxext_idct(DCTELEM *block); 175 176#endif /* AVCODEC_X86_DSPUTIL_MMX_H */ 177