1/*
2 * Optimized for ia32 CPUs by Nick Kurshev <nickols_k@mail.ru>
3 * h263, mpeg1, mpeg2 dequantizer & draw_edges by Michael Niedermayer <michaelni@gmx.at>
4 *
5 * This file is part of FFmpeg.
6 *
7 * FFmpeg is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU Lesser General Public
9 * License as published by the Free Software Foundation; either
10 * version 2.1 of the License, or (at your option) any later version.
11 *
12 * FFmpeg is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15 * Lesser General Public License for more details.
16 *
17 * You should have received a copy of the GNU Lesser General Public
18 * License along with FFmpeg; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20 */
21
22#include "libavutil/attributes.h"
23#include "libavutil/cpu.h"
24#include "libavutil/x86/asm.h"
25#include "libavutil/x86/cpu.h"
26#include "libavcodec/avcodec.h"
27#include "libavcodec/mpegvideo.h"
28
29#if HAVE_MMX_INLINE
30
31static void dct_unquantize_h263_intra_mmx(MpegEncContext *s,
32                                  int16_t *block, int n, int qscale)
33{
34    x86_reg level, qmul, qadd, nCoeffs;
35
36    qmul = qscale << 1;
37
38    av_assert2(s->block_last_index[n]>=0 || s->h263_aic);
39
40    if (!s->h263_aic) {
41        if (n < 4)
42            level = block[0] * s->y_dc_scale;
43        else
44            level = block[0] * s->c_dc_scale;
45        qadd = (qscale - 1) | 1;
46    }else{
47        qadd = 0;
48        level= block[0];
49    }
50    if(s->ac_pred)
51        nCoeffs=63;
52    else
53        nCoeffs= s->inter_scantable.raster_end[ s->block_last_index[n] ];
54
55__asm__ volatile(
56                "movd %1, %%mm6                 \n\t" //qmul
57                "packssdw %%mm6, %%mm6          \n\t"
58                "packssdw %%mm6, %%mm6          \n\t"
59                "movd %2, %%mm5                 \n\t" //qadd
60                "pxor %%mm7, %%mm7              \n\t"
61                "packssdw %%mm5, %%mm5          \n\t"
62                "packssdw %%mm5, %%mm5          \n\t"
63                "psubw %%mm5, %%mm7             \n\t"
64                "pxor %%mm4, %%mm4              \n\t"
65                ".p2align 4                     \n\t"
66                "1:                             \n\t"
67                "movq (%0, %3), %%mm0           \n\t"
68                "movq 8(%0, %3), %%mm1          \n\t"
69
70                "pmullw %%mm6, %%mm0            \n\t"
71                "pmullw %%mm6, %%mm1            \n\t"
72
73                "movq (%0, %3), %%mm2           \n\t"
74                "movq 8(%0, %3), %%mm3          \n\t"
75
76                "pcmpgtw %%mm4, %%mm2           \n\t" // block[i] < 0 ? -1 : 0
77                "pcmpgtw %%mm4, %%mm3           \n\t" // block[i] < 0 ? -1 : 0
78
79                "pxor %%mm2, %%mm0              \n\t"
80                "pxor %%mm3, %%mm1              \n\t"
81
82                "paddw %%mm7, %%mm0             \n\t"
83                "paddw %%mm7, %%mm1             \n\t"
84
85                "pxor %%mm0, %%mm2              \n\t"
86                "pxor %%mm1, %%mm3              \n\t"
87
88                "pcmpeqw %%mm7, %%mm0           \n\t" // block[i] == 0 ? -1 : 0
89                "pcmpeqw %%mm7, %%mm1           \n\t" // block[i] == 0 ? -1 : 0
90
91                "pandn %%mm2, %%mm0             \n\t"
92                "pandn %%mm3, %%mm1             \n\t"
93
94                "movq %%mm0, (%0, %3)           \n\t"
95                "movq %%mm1, 8(%0, %3)          \n\t"
96
97                "add $16, %3                    \n\t"
98                "jng 1b                         \n\t"
99                ::"r" (block+nCoeffs), "rm"(qmul), "rm" (qadd), "r" (2*(-nCoeffs))
100                : "memory"
101        );
102        block[0]= level;
103}
104
105
106static void dct_unquantize_h263_inter_mmx(MpegEncContext *s,
107                                  int16_t *block, int n, int qscale)
108{
109    x86_reg qmul, qadd, nCoeffs;
110
111    qmul = qscale << 1;
112    qadd = (qscale - 1) | 1;
113
114    av_assert2(s->block_last_index[n]>=0 || s->h263_aic);
115
116    nCoeffs= s->inter_scantable.raster_end[ s->block_last_index[n] ];
117
118__asm__ volatile(
119                "movd %1, %%mm6                 \n\t" //qmul
120                "packssdw %%mm6, %%mm6          \n\t"
121                "packssdw %%mm6, %%mm6          \n\t"
122                "movd %2, %%mm5                 \n\t" //qadd
123                "pxor %%mm7, %%mm7              \n\t"
124                "packssdw %%mm5, %%mm5          \n\t"
125                "packssdw %%mm5, %%mm5          \n\t"
126                "psubw %%mm5, %%mm7             \n\t"
127                "pxor %%mm4, %%mm4              \n\t"
128                ".p2align 4                     \n\t"
129                "1:                             \n\t"
130                "movq (%0, %3), %%mm0           \n\t"
131                "movq 8(%0, %3), %%mm1          \n\t"
132
133                "pmullw %%mm6, %%mm0            \n\t"
134                "pmullw %%mm6, %%mm1            \n\t"
135
136                "movq (%0, %3), %%mm2           \n\t"
137                "movq 8(%0, %3), %%mm3          \n\t"
138
139                "pcmpgtw %%mm4, %%mm2           \n\t" // block[i] < 0 ? -1 : 0
140                "pcmpgtw %%mm4, %%mm3           \n\t" // block[i] < 0 ? -1 : 0
141
142                "pxor %%mm2, %%mm0              \n\t"
143                "pxor %%mm3, %%mm1              \n\t"
144
145                "paddw %%mm7, %%mm0             \n\t"
146                "paddw %%mm7, %%mm1             \n\t"
147
148                "pxor %%mm0, %%mm2              \n\t"
149                "pxor %%mm1, %%mm3              \n\t"
150
151                "pcmpeqw %%mm7, %%mm0           \n\t" // block[i] == 0 ? -1 : 0
152                "pcmpeqw %%mm7, %%mm1           \n\t" // block[i] == 0 ? -1 : 0
153
154                "pandn %%mm2, %%mm0             \n\t"
155                "pandn %%mm3, %%mm1             \n\t"
156
157                "movq %%mm0, (%0, %3)           \n\t"
158                "movq %%mm1, 8(%0, %3)          \n\t"
159
160                "add $16, %3                    \n\t"
161                "jng 1b                         \n\t"
162                ::"r" (block+nCoeffs), "rm"(qmul), "rm" (qadd), "r" (2*(-nCoeffs))
163                : "memory"
164        );
165}
166
167static void dct_unquantize_mpeg1_intra_mmx(MpegEncContext *s,
168                                     int16_t *block, int n, int qscale)
169{
170    x86_reg nCoeffs;
171    const uint16_t *quant_matrix;
172    int block0;
173
174    av_assert2(s->block_last_index[n]>=0);
175
176    nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ]+1;
177
178    if (n < 4)
179        block0 = block[0] * s->y_dc_scale;
180    else
181        block0 = block[0] * s->c_dc_scale;
182    /* XXX: only mpeg1 */
183    quant_matrix = s->intra_matrix;
184__asm__ volatile(
185                "pcmpeqw %%mm7, %%mm7           \n\t"
186                "psrlw $15, %%mm7               \n\t"
187                "movd %2, %%mm6                 \n\t"
188                "packssdw %%mm6, %%mm6          \n\t"
189                "packssdw %%mm6, %%mm6          \n\t"
190                "mov %3, %%"REG_a"              \n\t"
191                ".p2align 4                     \n\t"
192                "1:                             \n\t"
193                "movq (%0, %%"REG_a"), %%mm0    \n\t"
194                "movq 8(%0, %%"REG_a"), %%mm1   \n\t"
195                "movq (%1, %%"REG_a"), %%mm4    \n\t"
196                "movq 8(%1, %%"REG_a"), %%mm5   \n\t"
197                "pmullw %%mm6, %%mm4            \n\t" // q=qscale*quant_matrix[i]
198                "pmullw %%mm6, %%mm5            \n\t" // q=qscale*quant_matrix[i]
199                "pxor %%mm2, %%mm2              \n\t"
200                "pxor %%mm3, %%mm3              \n\t"
201                "pcmpgtw %%mm0, %%mm2           \n\t" // block[i] < 0 ? -1 : 0
202                "pcmpgtw %%mm1, %%mm3           \n\t" // block[i] < 0 ? -1 : 0
203                "pxor %%mm2, %%mm0              \n\t"
204                "pxor %%mm3, %%mm1              \n\t"
205                "psubw %%mm2, %%mm0             \n\t" // abs(block[i])
206                "psubw %%mm3, %%mm1             \n\t" // abs(block[i])
207                "pmullw %%mm4, %%mm0            \n\t" // abs(block[i])*q
208                "pmullw %%mm5, %%mm1            \n\t" // abs(block[i])*q
209                "pxor %%mm4, %%mm4              \n\t"
210                "pxor %%mm5, %%mm5              \n\t" // FIXME slow
211                "pcmpeqw (%0, %%"REG_a"), %%mm4 \n\t" // block[i] == 0 ? -1 : 0
212                "pcmpeqw 8(%0, %%"REG_a"), %%mm5\n\t" // block[i] == 0 ? -1 : 0
213                "psraw $3, %%mm0                \n\t"
214                "psraw $3, %%mm1                \n\t"
215                "psubw %%mm7, %%mm0             \n\t"
216                "psubw %%mm7, %%mm1             \n\t"
217                "por %%mm7, %%mm0               \n\t"
218                "por %%mm7, %%mm1               \n\t"
219                "pxor %%mm2, %%mm0              \n\t"
220                "pxor %%mm3, %%mm1              \n\t"
221                "psubw %%mm2, %%mm0             \n\t"
222                "psubw %%mm3, %%mm1             \n\t"
223                "pandn %%mm0, %%mm4             \n\t"
224                "pandn %%mm1, %%mm5             \n\t"
225                "movq %%mm4, (%0, %%"REG_a")    \n\t"
226                "movq %%mm5, 8(%0, %%"REG_a")   \n\t"
227
228                "add $16, %%"REG_a"             \n\t"
229                "js 1b                          \n\t"
230                ::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "rm" (qscale), "g" (-2*nCoeffs)
231                : "%"REG_a, "memory"
232        );
233    block[0]= block0;
234}
235
236static void dct_unquantize_mpeg1_inter_mmx(MpegEncContext *s,
237                                     int16_t *block, int n, int qscale)
238{
239    x86_reg nCoeffs;
240    const uint16_t *quant_matrix;
241
242    av_assert2(s->block_last_index[n]>=0);
243
244    nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ]+1;
245
246        quant_matrix = s->inter_matrix;
247__asm__ volatile(
248                "pcmpeqw %%mm7, %%mm7           \n\t"
249                "psrlw $15, %%mm7               \n\t"
250                "movd %2, %%mm6                 \n\t"
251                "packssdw %%mm6, %%mm6          \n\t"
252                "packssdw %%mm6, %%mm6          \n\t"
253                "mov %3, %%"REG_a"              \n\t"
254                ".p2align 4                     \n\t"
255                "1:                             \n\t"
256                "movq (%0, %%"REG_a"), %%mm0    \n\t"
257                "movq 8(%0, %%"REG_a"), %%mm1   \n\t"
258                "movq (%1, %%"REG_a"), %%mm4    \n\t"
259                "movq 8(%1, %%"REG_a"), %%mm5   \n\t"
260                "pmullw %%mm6, %%mm4            \n\t" // q=qscale*quant_matrix[i]
261                "pmullw %%mm6, %%mm5            \n\t" // q=qscale*quant_matrix[i]
262                "pxor %%mm2, %%mm2              \n\t"
263                "pxor %%mm3, %%mm3              \n\t"
264                "pcmpgtw %%mm0, %%mm2           \n\t" // block[i] < 0 ? -1 : 0
265                "pcmpgtw %%mm1, %%mm3           \n\t" // block[i] < 0 ? -1 : 0
266                "pxor %%mm2, %%mm0              \n\t"
267                "pxor %%mm3, %%mm1              \n\t"
268                "psubw %%mm2, %%mm0             \n\t" // abs(block[i])
269                "psubw %%mm3, %%mm1             \n\t" // abs(block[i])
270                "paddw %%mm0, %%mm0             \n\t" // abs(block[i])*2
271                "paddw %%mm1, %%mm1             \n\t" // abs(block[i])*2
272                "paddw %%mm7, %%mm0             \n\t" // abs(block[i])*2 + 1
273                "paddw %%mm7, %%mm1             \n\t" // abs(block[i])*2 + 1
274                "pmullw %%mm4, %%mm0            \n\t" // (abs(block[i])*2 + 1)*q
275                "pmullw %%mm5, %%mm1            \n\t" // (abs(block[i])*2 + 1)*q
276                "pxor %%mm4, %%mm4              \n\t"
277                "pxor %%mm5, %%mm5              \n\t" // FIXME slow
278                "pcmpeqw (%0, %%"REG_a"), %%mm4 \n\t" // block[i] == 0 ? -1 : 0
279                "pcmpeqw 8(%0, %%"REG_a"), %%mm5\n\t" // block[i] == 0 ? -1 : 0
280                "psraw $4, %%mm0                \n\t"
281                "psraw $4, %%mm1                \n\t"
282                "psubw %%mm7, %%mm0             \n\t"
283                "psubw %%mm7, %%mm1             \n\t"
284                "por %%mm7, %%mm0               \n\t"
285                "por %%mm7, %%mm1               \n\t"
286                "pxor %%mm2, %%mm0              \n\t"
287                "pxor %%mm3, %%mm1              \n\t"
288                "psubw %%mm2, %%mm0             \n\t"
289                "psubw %%mm3, %%mm1             \n\t"
290                "pandn %%mm0, %%mm4             \n\t"
291                "pandn %%mm1, %%mm5             \n\t"
292                "movq %%mm4, (%0, %%"REG_a")    \n\t"
293                "movq %%mm5, 8(%0, %%"REG_a")   \n\t"
294
295                "add $16, %%"REG_a"             \n\t"
296                "js 1b                          \n\t"
297                ::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "rm" (qscale), "g" (-2*nCoeffs)
298                : "%"REG_a, "memory"
299        );
300}
301
302static void dct_unquantize_mpeg2_intra_mmx(MpegEncContext *s,
303                                     int16_t *block, int n, int qscale)
304{
305    x86_reg nCoeffs;
306    const uint16_t *quant_matrix;
307    int block0;
308
309    av_assert2(s->block_last_index[n]>=0);
310
311    if(s->alternate_scan) nCoeffs= 63; //FIXME
312    else nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ];
313
314    if (n < 4)
315        block0 = block[0] * s->y_dc_scale;
316    else
317        block0 = block[0] * s->c_dc_scale;
318    quant_matrix = s->intra_matrix;
319__asm__ volatile(
320                "pcmpeqw %%mm7, %%mm7           \n\t"
321                "psrlw $15, %%mm7               \n\t"
322                "movd %2, %%mm6                 \n\t"
323                "packssdw %%mm6, %%mm6          \n\t"
324                "packssdw %%mm6, %%mm6          \n\t"
325                "mov %3, %%"REG_a"              \n\t"
326                ".p2align 4                     \n\t"
327                "1:                             \n\t"
328                "movq (%0, %%"REG_a"), %%mm0    \n\t"
329                "movq 8(%0, %%"REG_a"), %%mm1   \n\t"
330                "movq (%1, %%"REG_a"), %%mm4    \n\t"
331                "movq 8(%1, %%"REG_a"), %%mm5   \n\t"
332                "pmullw %%mm6, %%mm4            \n\t" // q=qscale*quant_matrix[i]
333                "pmullw %%mm6, %%mm5            \n\t" // q=qscale*quant_matrix[i]
334                "pxor %%mm2, %%mm2              \n\t"
335                "pxor %%mm3, %%mm3              \n\t"
336                "pcmpgtw %%mm0, %%mm2           \n\t" // block[i] < 0 ? -1 : 0
337                "pcmpgtw %%mm1, %%mm3           \n\t" // block[i] < 0 ? -1 : 0
338                "pxor %%mm2, %%mm0              \n\t"
339                "pxor %%mm3, %%mm1              \n\t"
340                "psubw %%mm2, %%mm0             \n\t" // abs(block[i])
341                "psubw %%mm3, %%mm1             \n\t" // abs(block[i])
342                "pmullw %%mm4, %%mm0            \n\t" // abs(block[i])*q
343                "pmullw %%mm5, %%mm1            \n\t" // abs(block[i])*q
344                "pxor %%mm4, %%mm4              \n\t"
345                "pxor %%mm5, %%mm5              \n\t" // FIXME slow
346                "pcmpeqw (%0, %%"REG_a"), %%mm4 \n\t" // block[i] == 0 ? -1 : 0
347                "pcmpeqw 8(%0, %%"REG_a"), %%mm5\n\t" // block[i] == 0 ? -1 : 0
348                "psraw $3, %%mm0                \n\t"
349                "psraw $3, %%mm1                \n\t"
350                "pxor %%mm2, %%mm0              \n\t"
351                "pxor %%mm3, %%mm1              \n\t"
352                "psubw %%mm2, %%mm0             \n\t"
353                "psubw %%mm3, %%mm1             \n\t"
354                "pandn %%mm0, %%mm4             \n\t"
355                "pandn %%mm1, %%mm5             \n\t"
356                "movq %%mm4, (%0, %%"REG_a")    \n\t"
357                "movq %%mm5, 8(%0, %%"REG_a")   \n\t"
358
359                "add $16, %%"REG_a"             \n\t"
360                "jng 1b                         \n\t"
361                ::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "rm" (qscale), "g" (-2*nCoeffs)
362                : "%"REG_a, "memory"
363        );
364    block[0]= block0;
365        //Note, we do not do mismatch control for intra as errors cannot accumulate
366}
367
368static void dct_unquantize_mpeg2_inter_mmx(MpegEncContext *s,
369                                     int16_t *block, int n, int qscale)
370{
371    x86_reg nCoeffs;
372    const uint16_t *quant_matrix;
373
374    av_assert2(s->block_last_index[n]>=0);
375
376    if(s->alternate_scan) nCoeffs= 63; //FIXME
377    else nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ];
378
379        quant_matrix = s->inter_matrix;
380__asm__ volatile(
381                "pcmpeqw %%mm7, %%mm7           \n\t"
382                "psrlq $48, %%mm7               \n\t"
383                "movd %2, %%mm6                 \n\t"
384                "packssdw %%mm6, %%mm6          \n\t"
385                "packssdw %%mm6, %%mm6          \n\t"
386                "mov %3, %%"REG_a"              \n\t"
387                ".p2align 4                     \n\t"
388                "1:                             \n\t"
389                "movq (%0, %%"REG_a"), %%mm0    \n\t"
390                "movq 8(%0, %%"REG_a"), %%mm1   \n\t"
391                "movq (%1, %%"REG_a"), %%mm4    \n\t"
392                "movq 8(%1, %%"REG_a"), %%mm5   \n\t"
393                "pmullw %%mm6, %%mm4            \n\t" // q=qscale*quant_matrix[i]
394                "pmullw %%mm6, %%mm5            \n\t" // q=qscale*quant_matrix[i]
395                "pxor %%mm2, %%mm2              \n\t"
396                "pxor %%mm3, %%mm3              \n\t"
397                "pcmpgtw %%mm0, %%mm2           \n\t" // block[i] < 0 ? -1 : 0
398                "pcmpgtw %%mm1, %%mm3           \n\t" // block[i] < 0 ? -1 : 0
399                "pxor %%mm2, %%mm0              \n\t"
400                "pxor %%mm3, %%mm1              \n\t"
401                "psubw %%mm2, %%mm0             \n\t" // abs(block[i])
402                "psubw %%mm3, %%mm1             \n\t" // abs(block[i])
403                "paddw %%mm0, %%mm0             \n\t" // abs(block[i])*2
404                "paddw %%mm1, %%mm1             \n\t" // abs(block[i])*2
405                "pmullw %%mm4, %%mm0            \n\t" // abs(block[i])*2*q
406                "pmullw %%mm5, %%mm1            \n\t" // abs(block[i])*2*q
407                "paddw %%mm4, %%mm0             \n\t" // (abs(block[i])*2 + 1)*q
408                "paddw %%mm5, %%mm1             \n\t" // (abs(block[i])*2 + 1)*q
409                "pxor %%mm4, %%mm4              \n\t"
410                "pxor %%mm5, %%mm5              \n\t" // FIXME slow
411                "pcmpeqw (%0, %%"REG_a"), %%mm4 \n\t" // block[i] == 0 ? -1 : 0
412                "pcmpeqw 8(%0, %%"REG_a"), %%mm5\n\t" // block[i] == 0 ? -1 : 0
413                "psrlw $4, %%mm0                \n\t"
414                "psrlw $4, %%mm1                \n\t"
415                "pxor %%mm2, %%mm0              \n\t"
416                "pxor %%mm3, %%mm1              \n\t"
417                "psubw %%mm2, %%mm0             \n\t"
418                "psubw %%mm3, %%mm1             \n\t"
419                "pandn %%mm0, %%mm4             \n\t"
420                "pandn %%mm1, %%mm5             \n\t"
421                "pxor %%mm4, %%mm7              \n\t"
422                "pxor %%mm5, %%mm7              \n\t"
423                "movq %%mm4, (%0, %%"REG_a")    \n\t"
424                "movq %%mm5, 8(%0, %%"REG_a")   \n\t"
425
426                "add $16, %%"REG_a"             \n\t"
427                "jng 1b                         \n\t"
428                "movd 124(%0, %3), %%mm0        \n\t"
429                "movq %%mm7, %%mm6              \n\t"
430                "psrlq $32, %%mm7               \n\t"
431                "pxor %%mm6, %%mm7              \n\t"
432                "movq %%mm7, %%mm6              \n\t"
433                "psrlq $16, %%mm7               \n\t"
434                "pxor %%mm6, %%mm7              \n\t"
435                "pslld $31, %%mm7               \n\t"
436                "psrlq $15, %%mm7               \n\t"
437                "pxor %%mm7, %%mm0              \n\t"
438                "movd %%mm0, 124(%0, %3)        \n\t"
439
440                ::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "rm" (qscale), "r" (-2*nCoeffs)
441                : "%"REG_a, "memory"
442        );
443}
444
445#endif /* HAVE_MMX_INLINE */
446
447av_cold void ff_MPV_common_init_x86(MpegEncContext *s)
448{
449#if HAVE_MMX_INLINE
450    int cpu_flags = av_get_cpu_flags();
451
452    if (INLINE_MMX(cpu_flags)) {
453        s->dct_unquantize_h263_intra = dct_unquantize_h263_intra_mmx;
454        s->dct_unquantize_h263_inter = dct_unquantize_h263_inter_mmx;
455        s->dct_unquantize_mpeg1_intra = dct_unquantize_mpeg1_intra_mmx;
456        s->dct_unquantize_mpeg1_inter = dct_unquantize_mpeg1_inter_mmx;
457        if(!(s->flags & CODEC_FLAG_BITEXACT))
458            s->dct_unquantize_mpeg2_intra = dct_unquantize_mpeg2_intra_mmx;
459        s->dct_unquantize_mpeg2_inter = dct_unquantize_mpeg2_inter_mmx;
460    }
461#endif /* HAVE_MMX_INLINE */
462}
463