1;******************************************************************************
2;*
3;* Copyright (c) 2000-2001 Fabrice Bellard <fabrice@bellard.org>
4;* Copyright (c)      Nick Kurshev <nickols_k@mail.ru>
5;* Copyright (c) 2002 Michael Niedermayer <michaelni@gmx.at>
6;* Copyright (c) 2002 Zdenek Kabelac <kabi@informatics.muni.cz>
7;* Copyright (c) 2013 Daniel Kang
8;*
9;* SIMD-optimized halfpel functions
10;*
11;* This file is part of FFmpeg.
12;*
13;* FFmpeg is free software; you can redistribute it and/or
14;* modify it under the terms of the GNU Lesser General Public
15;* License as published by the Free Software Foundation; either
16;* version 2.1 of the License, or (at your option) any later version.
17;*
18;* FFmpeg is distributed in the hope that it will be useful,
19;* but WITHOUT ANY WARRANTY; without even the implied warranty of
20;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
21;* Lesser General Public License for more details.
22;*
23;* You should have received a copy of the GNU Lesser General Public
24;* License along with FFmpeg; if not, write to the Free Software
25;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
26;******************************************************************************
27
28%include "libavutil/x86/x86util.asm"
29
SECTION_RODATA
cextern pb_1
cextern pw_2
; 8192 = 2^13; pmulhrsw by this constant computes (x + 2) >> 2, i.e. a
; rounded division by 4 (used by the SSSE3 xy2 kernels below)
pw_8192: times 8 dw (1<<13)
; pshufb masks that interleave the low and high halves of a register,
; restoring pixel order after packuswb merges even/odd result lanes
pb_interleave16: db 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15
pb_interleave8:  db 0, 4, 1, 5, 2, 6, 3, 7

SECTION_TEXT
38
; void ff_put_pixels8_x2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
; Half-pel interpolation in x: dst[x] = rounded average of src[x] and
; src[x+1].  The SSE2 instantiation of this same macro works on 16-pixel
; rows and is therefore named put_pixels16_x2.
; r0 = block, r1 = pixels, r2 = line_size, r3d = h, r4 = 2*line_size.
; The loop is unrolled to 4 rows per iteration, so h must be a multiple of 4.
%macro PUT_PIXELS8_X2 0
%if cpuflag(sse2)
cglobal put_pixels16_x2, 4,5,4
%else
cglobal put_pixels8_x2, 4,5
%endif
    lea          r4, [r2*2]           ; row stride for the 2-row steps below
.loop:
    movu         m0, [r1+1]           ; row 0, shifted right by one pixel
    movu         m1, [r1+r2+1]        ; row 1, shifted right by one pixel
%if cpuflag(sse2)
    ; xmm memory operands would need 16-byte alignment; the +1-shifted
    ; source is unaligned, so load explicitly with movu first
    movu         m2, [r1]
    movu         m3, [r1+r2]
    pavgb        m0, m2               ; avg(src[x], src[x+1]), rounded up
    pavgb        m1, m3
%else
    PAVGB        m0, [r1]             ; avg(src[x], src[x+1]), rounded up
    PAVGB        m1, [r1+r2]
%endif
    mova       [r0], m0
    mova    [r0+r2], m1
    add          r1, r4
    add          r0, r4
    movu         m0, [r1+1]           ; rows 2 and 3, same pattern
    movu         m1, [r1+r2+1]
%if cpuflag(sse2)
    movu         m2, [r1]
    movu         m3, [r1+r2]
    pavgb        m0, m2
    pavgb        m1, m3
%else
    PAVGB        m0, [r1]
    PAVGB        m1, [r1+r2]
%endif
    add          r1, r4
    mova       [r0], m0
    mova    [r0+r2], m1
    add          r0, r4
    sub         r3d, 4                ; 4 rows completed per iteration
    jne .loop
    REP_RET
%endmacro

INIT_MMX mmxext
PUT_PIXELS8_X2
INIT_MMX 3dnow
PUT_PIXELS8_X2
87
88
; void ff_put_pixels16_x2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
; MMX version of the x2 half-pel copy for 16-pixel-wide blocks: each row is
; processed as two 8-byte halves (offsets 0 and 8).  r4 = 2*line_size;
; 4 rows per iteration, so h must be a multiple of 4.
%macro PUT_PIXELS_16 0
cglobal put_pixels16_x2, 4,5
    lea          r4, [r2*2]
.loop:
    mova         m0, [r1]             ; row 0, left half
    mova         m1, [r1+r2]          ; row 1, left half
    mova         m2, [r1+8]           ; row 0, right half
    mova         m3, [r1+r2+8]        ; row 1, right half
    PAVGB        m0, [r1+1]           ; average with the right neighbour (rounded)
    PAVGB        m1, [r1+r2+1]
    PAVGB        m2, [r1+9]
    PAVGB        m3, [r1+r2+9]
    mova       [r0], m0
    mova    [r0+r2], m1
    mova     [r0+8], m2
    mova  [r0+r2+8], m3
    add          r1, r4
    add          r0, r4
    mova         m0, [r1]             ; rows 2 and 3, same pattern
    mova         m1, [r1+r2]
    mova         m2, [r1+8]
    mova         m3, [r1+r2+8]
    PAVGB        m0, [r1+1]
    PAVGB        m1, [r1+r2+1]
    PAVGB        m2, [r1+9]
    PAVGB        m3, [r1+r2+9]
    add          r1, r4
    mova       [r0], m0
    mova    [r0+r2], m1
    mova     [r0+8], m2
    mova  [r0+r2+8], m3
    add          r0, r4
    sub         r3d, 4
    jne .loop
    REP_RET
%endmacro

INIT_MMX mmxext
PUT_PIXELS_16
INIT_MMX 3dnow
PUT_PIXELS_16
; The 8_X2 macro can easily be used here: with xmm registers it covers a
; full 16-pixel row at once (emitted as put_pixels16_x2_sse2)
INIT_XMM sse2
PUT_PIXELS8_X2
134
135
; void ff_put_no_rnd_pixels8_x2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
; x2 half-pel copy with a truncating (no-rounding) average, approximated by
; biasing one operand down before the rounding average:
;   avg(a -(sat) 1, b)  ~=  (a + b) >> 1
; (psubusb saturates, so this is not bit-exact for a == 0; the _exact
; variant below handles that case).  r4 = 2*line_size; 4 rows per iteration.
%macro PUT_NO_RND_PIXELS8_X2 0
cglobal put_no_rnd_pixels8_x2, 4,5
    mova         m6, [pb_1]           ; constant 0x01 bytes for the -1 bias
    lea          r4, [r2*2]
.loop:
    mova         m0, [r1]             ; rows 0/1, pixel x
    mova         m2, [r1+r2]
    mova         m1, [r1+1]           ; rows 0/1, pixel x+1
    mova         m3, [r1+r2+1]
    add          r1, r4
    psubusb      m0, m6               ; a-1 (unsigned saturated)
    psubusb      m2, m6
    PAVGB        m0, m1               ; rounded avg of biased operands
    PAVGB        m2, m3
    mova       [r0], m0
    mova    [r0+r2], m2
    mova         m0, [r1]             ; rows 2/3, same pattern
    mova         m1, [r1+1]
    mova         m2, [r1+r2]
    mova         m3, [r1+r2+1]
    add          r0, r4
    add          r1, r4
    psubusb      m0, m6
    psubusb      m2, m6
    PAVGB        m0, m1
    PAVGB        m2, m3
    mova       [r0], m0
    mova    [r0+r2], m2
    add          r0, r4
    sub         r3d, 4
    jne .loop
    REP_RET
%endmacro

INIT_MMX mmxext
PUT_NO_RND_PIXELS8_X2
INIT_MMX 3dnow
PUT_NO_RND_PIXELS8_X2
175
176
; void ff_put_no_rnd_pixels8_x2_exact(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
; Bit-exact truncating x2 average using the complement identity:
;   (a + b) >> 1  ==  ~pavg(~a, ~b)
; m6 = all-ones bytes (complement mask).  r4 = 3*line_size; the loop handles
; 4 rows per iteration and exits once the row counter reaches <= 0 (jg).
%macro PUT_NO_RND_PIXELS8_X2_EXACT 0
cglobal put_no_rnd_pixels8_x2_exact, 4,5
    lea          r4, [r2*3]
    pcmpeqb      m6, m6               ; m6 = 0xFF.. (complement mask)
.loop:
    mova         m0, [r1]             ; rows 0/1: pixel x and x+1
    mova         m2, [r1+r2]
    mova         m1, [r1+1]
    mova         m3, [r1+r2+1]
    pxor         m0, m6               ; complement all inputs
    pxor         m2, m6
    pxor         m1, m6
    pxor         m3, m6
    PAVGB        m0, m1               ; round-up avg of complements
    PAVGB        m2, m3
    pxor         m0, m6               ; complement back -> truncating avg
    pxor         m2, m6
    mova       [r0], m0
    mova    [r0+r2], m2
    mova         m0, [r1+r2*2]        ; rows 2/3, same pattern
    mova         m1, [r1+r2*2+1]
    mova         m2, [r1+r4]
    mova         m3, [r1+r4+1]
    pxor         m0, m6
    pxor         m1, m6
    pxor         m2, m6
    pxor         m3, m6
    PAVGB        m0, m1
    PAVGB        m2, m3
    pxor         m0, m6
    pxor         m2, m6
    mova  [r0+r2*2], m0
    mova    [r0+r4], m2
    lea          r1, [r1+r2*4]
    lea          r0, [r0+r2*4]
    sub         r3d, 4
    jg .loop
    REP_RET
%endmacro

INIT_MMX mmxext
PUT_NO_RND_PIXELS8_X2_EXACT
INIT_MMX 3dnow
PUT_NO_RND_PIXELS8_X2_EXACT
222
223
; void ff_put_pixels8_y2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
; Half-pel interpolation in y: dst row i = rounded average of source rows
; i and i+1.  The register holding the last source row read (m0 or m2) is
; carried across iterations so every source row is loaded only once.
; r0 is pre-biased by -line_size so all stores use [r0+r2]/[r0+r4].
; r4 = 2*line_size; 4 output rows per iteration (h must be a multiple of 4).
; The SSE2 instantiation works on 16-pixel rows (put_pixels16_y2).
%macro PUT_PIXELS8_Y2 0
%if cpuflag(sse2)
cglobal put_pixels16_y2, 4,5,3
%else
cglobal put_pixels8_y2, 4,5
%endif
    lea          r4, [r2*2]
    movu         m0, [r1]             ; m0 = source row 0
    sub          r0, r2               ; bias dst so stores start at [r0+r2]
.loop:
    movu         m1, [r1+r2]          ; next two source rows
    movu         m2, [r1+r4]
    add          r1, r4
    PAVGB        m0, m1               ; dst row i   = avg(row i, row i+1)
    PAVGB        m1, m2               ; dst row i+1 = avg(row i+1, row i+2)
    mova    [r0+r2], m0
    mova    [r0+r4], m1
    movu         m1, [r1+r2]          ; two more source rows (m2 carries row i+2)
    movu         m0, [r1+r4]          ; m0 also becomes next iteration's carry
    add          r0, r4
    add          r1, r4
    PAVGB        m2, m1
    PAVGB        m1, m0
    mova    [r0+r2], m2
    mova    [r0+r4], m1
    add          r0, r4
    sub         r3d, 4
    jne .loop
    REP_RET
%endmacro

INIT_MMX mmxext
PUT_PIXELS8_Y2
INIT_MMX 3dnow
PUT_PIXELS8_Y2
; actually, put_pixels16_y2_sse2
INIT_XMM sse2
PUT_PIXELS8_Y2
263
264
; void ff_put_no_rnd_pixels8_y2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
; y2 half-pel copy with a truncating (no-rounding) average, approximated by
; subtracting 1 (unsigned saturated) from the shared middle row before both
; rounded averages that use it.  Not bit-exact for 0-valued pixels; see the
; _exact variant below.  Same carried-row structure as put_pixels8_y2:
; r0 pre-biased by -line_size, r4 = 2*line_size, 4 rows per iteration.
%macro PUT_NO_RND_PIXELS8_Y2 0
cglobal put_no_rnd_pixels8_y2, 4,5
    mova         m6, [pb_1]           ; constant 0x01 bytes for the -1 bias
    lea          r4, [r2+r2]
    mova         m0, [r1]             ; m0 = source row 0 (carried)
    sub          r0, r2
.loop:
    mova         m1, [r1+r2]
    mova         m2, [r1+r4]
    add          r1, r4
    psubusb      m1, m6               ; bias the middle row down by 1
    PAVGB        m0, m1               ; dst row i   = avg(row i, row i+1 - 1)
    PAVGB        m1, m2               ; dst row i+1 = avg(row i+1 - 1, row i+2)
    mova    [r0+r2], m0
    mova    [r0+r4], m1
    mova         m1, [r1+r2]
    mova         m0, [r1+r4]          ; m0 = next iteration's carried row
    add          r0, r4
    add          r1, r4
    psubusb      m1, m6
    PAVGB        m2, m1
    PAVGB        m1, m0
    mova    [r0+r2], m2
    mova    [r0+r4], m1
    add          r0, r4
    sub         r3d, 4
    jne .loop
    REP_RET
%endmacro

INIT_MMX mmxext
PUT_NO_RND_PIXELS8_Y2
INIT_MMX 3dnow
PUT_NO_RND_PIXELS8_Y2
300
301
; void ff_put_no_rnd_pixels8_y2_exact(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
; Bit-exact truncating y2 average via the complement identity
;   (a + b) >> 1 == ~pavg(~a, ~b),   with m6 = all-ones mask.
; The carried register (m0 or m2) holds the COMPLEMENT of the last source
; row read, so it can feed the next average without being re-complemented.
; r1 is pre-advanced by line_size; r4 = 3*line_size; 4 rows per iteration,
; loop exits when the counter drops to <= 0 (jg).
%macro PUT_NO_RND_PIXELS8_Y2_EXACT 0
cglobal put_no_rnd_pixels8_y2_exact, 4,5
    lea          r4, [r2*3]
    mova         m0, [r1]
    pcmpeqb      m6, m6               ; m6 = 0xFF.. (complement mask)
    add          r1, r2
    pxor         m0, m6               ; m0 = ~row0 (carried complemented)
.loop:
    mova         m1, [r1]
    mova         m2, [r1+r2]
    pxor         m1, m6
    pxor         m2, m6
    PAVGB        m0, m1               ; pavg of complements
    PAVGB        m1, m2
    pxor         m0, m6               ; complement back for the store
    pxor         m1, m6
    mova       [r0], m0
    mova    [r0+r2], m1
    mova         m1, [r1+r2*2]
    mova         m0, [r1+r4]          ; last row of the group: next carry
    pxor         m1, m6
    pxor         m0, m6
    PAVGB        m2, m1               ; m2 still holds ~row(i+1) from above
    PAVGB        m1, m0
    pxor         m2, m6
    pxor         m1, m6
    mova  [r0+r2*2], m2
    mova    [r0+r4], m1
    lea          r1, [r1+r2*4]
    lea          r0, [r0+r2*4]
    sub         r3d, 4
    jg .loop
    REP_RET
%endmacro

INIT_MMX mmxext
PUT_NO_RND_PIXELS8_Y2_EXACT
INIT_MMX 3dnow
PUT_NO_RND_PIXELS8_Y2_EXACT
342
343
; void ff_avg_pixels8(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
; Motion-compensation "avg" copy: dst = rounded average of dst and src.
; Instantiated for 3dnow only (PAVGB expands to pavgusb there).
; r4 = 2*line_size; 4 rows per iteration, so h must be a multiple of 4.
%macro AVG_PIXELS8 0
cglobal avg_pixels8, 4,5
    lea          r4, [r2*2]
.loop:
    mova         m0, [r0]             ; current dst rows 0/1
    mova         m1, [r0+r2]
    PAVGB        m0, [r1]             ; average in the source rows
    PAVGB        m1, [r1+r2]
    mova       [r0], m0
    mova    [r0+r2], m1
    add          r1, r4
    add          r0, r4
    mova         m0, [r0]             ; rows 2/3, same pattern
    mova         m1, [r0+r2]
    PAVGB        m0, [r1]
    PAVGB        m1, [r1+r2]
    add          r1, r4
    mova       [r0], m0
    mova    [r0+r2], m1
    add          r0, r4
    sub         r3d, 4
    jne .loop
    REP_RET
%endmacro

INIT_MMX 3dnow
AVG_PIXELS8
372
373
; void ff_avg_pixels8_x2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
; x2 half-pel plus averaging with the existing dst:
;   tmp = avg(src[x], src[x+1]);  dst = avg(dst, tmp)   (both rounded)
; The SSE2 instantiation is 16 pixels wide (avg_pixels16_x2).  On plain MMX
; (no native pavgb) the PAVGB macro emulates the average and needs a scratch
; register (m3/m4) and the 0xFE byte mask in m5.
; r4 = 2*line_size; 4 rows per iteration.
%macro AVG_PIXELS8_X2 0
%if cpuflag(sse2)
cglobal avg_pixels16_x2, 4,5,4
%else
cglobal avg_pixels8_x2, 4,5
%endif
    lea          r4, [r2*2]
%if notcpuflag(mmxext)
    pcmpeqd      m5, m5               ; all-ones ...
    paddb        m5, m5               ; ... doubled per byte -> 0xFE mask for PAVGB emulation
%endif
.loop:
    movu         m0, [r1]             ; src rows 0/1
    movu         m2, [r1+r2]
%if cpuflag(sse2)
    ; unaligned xmm sources: load with movu, then register-register pavgb
    movu         m1, [r1+1]
    movu         m3, [r1+r2+1]
    pavgb        m0, m1               ; horizontal half-pel average
    pavgb        m2, m3
%else
    PAVGB        m0, [r1+1], m3, m5
    PAVGB        m2, [r1+r2+1], m4, m5
%endif
    PAVGB        m0, [r0], m3, m5     ; fold in the existing dst
    PAVGB        m2, [r0+r2], m4, m5
    add          r1, r4
    mova       [r0], m0
    mova    [r0+r2], m2
    movu         m0, [r1]             ; rows 2/3, same pattern
    movu         m2, [r1+r2]
%if cpuflag(sse2)
    movu         m1, [r1+1]
    movu         m3, [r1+r2+1]
    pavgb        m0, m1
    pavgb        m2, m3
%else
    PAVGB        m0, [r1+1], m3, m5
    PAVGB        m2, [r1+r2+1], m4, m5
%endif
    add          r0, r4
    add          r1, r4
    PAVGB        m0, [r0], m3, m5
    PAVGB        m2, [r0+r2], m4, m5
    mova       [r0], m0
    mova    [r0+r2], m2
    add          r0, r4
    sub         r3d, 4
    jne .loop
    REP_RET
%endmacro

INIT_MMX mmx
AVG_PIXELS8_X2
INIT_MMX mmxext
AVG_PIXELS8_X2
INIT_MMX 3dnow
AVG_PIXELS8_X2
; actually avg_pixels16_x2
INIT_XMM sse2
AVG_PIXELS8_X2
435
436
; void ff_avg_pixels8_y2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
; y2 half-pel plus averaging with the existing dst:
;   tmp = avg(src row i, src row i+1);  dst = avg(dst, tmp)
; Same carried-row scheme as put_pixels8_y2 (m0/m2 keep the last source row
; so each row is loaded once); r0 pre-biased by -line_size, r4 = 2*line_size,
; 4 rows per iteration.  SSE2 instantiation is avg_pixels16_y2.
%macro AVG_PIXELS8_Y2 0
%if cpuflag(sse2)
cglobal avg_pixels16_y2, 4,5,3
%else
cglobal avg_pixels8_y2, 4,5
%endif
    lea          r4, [r2*2]
    movu         m0, [r1]             ; m0 = source row 0 (carried)
    sub          r0, r2               ; stores use [r0+r2]/[r0+r4]
.loop:
    movu         m1, [r1+r2]
    movu         m2, [r1+r4]
    add          r1, r4
    PAVGB        m0, m1               ; vertical half-pel averages
    PAVGB        m1, m2
    PAVGB        m0, [r0+r2]          ; fold in the existing dst
    PAVGB        m1, [r0+r4]
    mova    [r0+r2], m0
    mova    [r0+r4], m1
    movu         m1, [r1+r2]          ; rows 2/3 (m2 carries the shared row)
    movu         m0, [r1+r4]          ; m0 = next iteration's carry
    PAVGB        m2, m1
    PAVGB        m1, m0
    add          r0, r4
    add          r1, r4
    PAVGB        m2, [r0+r2]
    PAVGB        m1, [r0+r4]
    mova    [r0+r2], m2
    mova    [r0+r4], m1
    add          r0, r4
    sub         r3d, 4
    jne .loop
    REP_RET
%endmacro

INIT_MMX mmxext
AVG_PIXELS8_Y2
INIT_MMX 3dnow
AVG_PIXELS8_Y2
; actually avg_pixels16_y2
INIT_XMM sse2
AVG_PIXELS8_Y2
480
481
; void ff_avg_approx_pixels8_xy2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
; Note this is not correctly rounded, and is therefore used for
; not-bitexact output
; xy2 (centre quarter-pel) built from chained rounded byte averages:
; each row is first averaged with its right neighbour, then adjacent rows
; are averaged, and the result is averaged with the existing dst.  The
; psubusb by pb_1 on one row of each pair biases the accumulated round-up
; error downward (hence "approx").  m0 carries the horizontal average of
; the previous row between iterations; r4 = 2*line_size, 4 rows/iteration.
%macro AVG_APPROX_PIXELS8_XY2 0
cglobal avg_approx_pixels8_xy2, 4,5
    mova         m6, [pb_1]           ; constant 0x01 bytes for the bias
    lea          r4, [r2*2]
    mova         m0, [r1]
    PAVGB        m0, [r1+1]           ; m0 = horizontal avg of row 0 (carried)
.loop:
    mova         m2, [r1+r4]
    mova         m1, [r1+r2]
    psubusb      m2, m6               ; bias one row of the pair down by 1
    PAVGB        m1, [r1+r2+1]        ; horizontal averages of rows i+1, i+2
    PAVGB        m2, [r1+r4+1]
    add          r1, r4
    PAVGB        m0, m1               ; vertical combine
    PAVGB        m1, m2
    PAVGB        m0, [r0]             ; fold in the existing dst
    PAVGB        m1, [r0+r2]
    mova       [r0], m0
    mova    [r0+r2], m1
    mova         m1, [r1+r2]
    mova         m0, [r1+r4]
    PAVGB        m1, [r1+r2+1]        ; rows i+3, i+4 (m2 carries the shared row)
    PAVGB        m0, [r1+r4+1]        ; m0 = next iteration's carry
    add          r0, r4
    add          r1, r4
    PAVGB        m2, m1
    PAVGB        m1, m0
    PAVGB        m2, [r0]
    PAVGB        m1, [r0+r2]
    mova       [r0], m2
    mova    [r0+r2], m1
    add          r0, r4
    sub         r3d, 4
    jne .loop
    REP_RET
%endmacro

INIT_MMX mmxext
AVG_APPROX_PIXELS8_XY2
INIT_MMX 3dnow
AVG_APPROX_PIXELS8_XY2
526
527
; void ff_{put,avg}_pixels8_xy2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
; Bit-exact xy2 (centre quarter-pel) interpolation:
;   dst[x] = (A + B + C + D + 2) >> 2
; computed in 16-bit precision: bytes are unpacked to words against m7 = 0,
; horizontal pair sums (src[x] + src[x+1]) are formed per row, pw_2 supplies
; the rounding bias, and adjacent-row sums are added before the >>2.
; %1 selects put vs avg (avg additionally PAVGBs the packed result with the
; existing dst).  The SSE2 instantiation handles 16-pixel rows and is named
; {put,avg}_pixels16_xy2.  m4/m5 (then m0/m1) carry the previous row's
; low/high horizontal sums between iterations.  r4 is the running row
; offset (starts at 0); 2 rows per iteration, so h must be even.
%macro SET_PIXELS_XY2 1
%if cpuflag(sse2)
cglobal %1_pixels16_xy2, 4,5,8
%else
cglobal %1_pixels8_xy2, 4,5
%endif
    pxor        m7, m7                ; zero, for byte->word unpacking
    mova        m6, [pw_2]            ; rounding bias (+2 before >>2)
    movu        m0, [r1]
    movu        m4, [r1+1]
    mova        m1, m0
    mova        m5, m4
    punpcklbw   m0, m7                ; row 0: low half as words
    punpcklbw   m4, m7
    punpckhbw   m1, m7                ; row 0: high half as words
    punpckhbw   m5, m7
    paddusw     m4, m0                ; m4/m5 = row 0 horizontal sums (carried)
    paddusw     m5, m1
    xor         r4, r4                ; r4 = output row offset
    add         r1, r2                ; r1 -> row 1
.loop:
    movu        m0, [r1+r4]           ; current row and its +1 shift
    movu        m2, [r1+r4+1]
    mova        m1, m0
    mova        m3, m2
    punpcklbw   m0, m7
    punpcklbw   m2, m7
    punpckhbw   m1, m7
    punpckhbw   m3, m7
    paddusw     m0, m2                ; m0/m1 = current row horizontal sums
    paddusw     m1, m3
    paddusw     m4, m6                ; previous row sums + rounding bias
    paddusw     m5, m6
    paddusw     m4, m0                ; + current row sums
    paddusw     m5, m1
    psrlw       m4, 2                 ; (A+B+C+D+2) >> 2
    psrlw       m5, 2
%ifidn %1, avg
    mova        m3, [r0+r4]
    packuswb    m4, m5
    PAVGB       m4, m3                ; avg variant: combine with existing dst
%else
    packuswb    m4, m5
%endif
    mova   [r0+r4], m4
    add         r4, r2

    movu        m2, [r1+r4]           ; next row (m0/m1 now hold the carry)
    movu        m4, [r1+r4+1]
    mova        m3, m2
    mova        m5, m4
    punpcklbw   m2, m7
    punpcklbw   m4, m7
    punpckhbw   m3, m7
    punpckhbw   m5, m7
    paddusw     m4, m2                ; m4/m5 = new row sums (next carry)
    paddusw     m5, m3
    paddusw     m0, m6
    paddusw     m1, m6
    paddusw     m0, m4
    paddusw     m1, m5
    psrlw       m0, 2
    psrlw       m1, 2
%ifidn %1, avg
    mova        m3, [r0+r4]
    packuswb    m0, m1
    PAVGB       m0, m3
%else
    packuswb    m0, m1
%endif
    mova   [r0+r4], m0
    add         r4, r2
    sub        r3d, 2                 ; 2 output rows per iteration
    jnz .loop
    REP_RET
%endmacro

INIT_MMX mmxext
SET_PIXELS_XY2 avg
INIT_MMX 3dnow
SET_PIXELS_XY2 avg
INIT_XMM sse2
SET_PIXELS_XY2 put
SET_PIXELS_XY2 avg
613
; SSSE3 versions of the bit-exact xy2 interpolation (put/avg, 8 or 16
; pixels wide).  pmaddubsw with pb_1 sums adjacent byte pairs into words,
; so [r1] and [r1+1] together yield the horizontal sums src[x] + src[x+1]
; for even and odd x respectively.  Adding two rows' sums and multiplying
; by pw_8192 with pmulhrsw computes (A+B+C+D+2) >> 2 (8192/32768 = 1/4,
; rounded).  packuswb merges the even/odd lanes; pshufb with the interleave
; mask in m4 restores pixel order.  m0/m1 then m2/m3 carry the previous
; row's sums between half-iterations; r4 is the running row offset;
; 2 rows per iteration, so h must be even.
%macro SSSE3_PIXELS_XY2 1-2
%if %0 == 2 ; xmm (16-pixel) variant; %2 = number of xmm registers used
cglobal %1_pixels16_xy2, 4,5,%2
    mova        m4, [pb_interleave16]
%else
cglobal %1_pixels8_xy2, 4,5
    mova        m4, [pb_interleave8]
%endif
    mova        m5, [pb_1]            ; pmaddubsw multiplier: pairwise sum
    movu        m0, [r1]
    movu        m1, [r1+1]
    pmaddubsw   m0, m5                ; row 0 horizontal sums, even x
    pmaddubsw   m1, m5                ; row 0 horizontal sums, odd x
    xor         r4, r4                ; r4 = output row offset
    add         r1, r2                ; r1 -> row 1
.loop:
    movu        m2, [r1+r4]
    movu        m3, [r1+r4+1]
    pmaddubsw   m2, m5                ; current row horizontal sums
    pmaddubsw   m3, m5
    paddusw     m0, m2                ; + previous row sums
    paddusw     m1, m3
    pmulhrsw    m0, [pw_8192]         ; (sum + 2) >> 2, rounded
    pmulhrsw    m1, [pw_8192]
%ifidn %1, avg
    mova        m6, [r0+r4]
    packuswb    m0, m1
    pshufb      m0, m4                ; re-interleave even/odd pixels
    pavgb       m0, m6                ; avg variant: combine with existing dst
%else
    packuswb    m0, m1
    pshufb      m0, m4
%endif
    mova   [r0+r4], m0
    add         r4, r2

    movu        m0, [r1+r4]           ; next row (m2/m3 now hold the carry)
    movu        m1, [r1+r4+1]
    pmaddubsw   m0, m5
    pmaddubsw   m1, m5
    paddusw     m2, m0
    paddusw     m3, m1
    pmulhrsw    m2, [pw_8192]
    pmulhrsw    m3, [pw_8192]
%ifidn %1, avg
    mova        m6, [r0+r4]
    packuswb    m2, m3
    pshufb      m2, m4
    pavgb       m2, m6
%else
    packuswb    m2, m3
    pshufb      m2, m4
%endif
    mova   [r0+r4], m2
    add         r4, r2
    sub        r3d, 2                 ; 2 output rows per iteration
    jnz .loop
    REP_RET
%endmacro

INIT_MMX ssse3
SSSE3_PIXELS_XY2 put
SSSE3_PIXELS_XY2 avg
INIT_XMM ssse3
SSSE3_PIXELS_XY2 put, 6
SSSE3_PIXELS_XY2 avg, 7
680