1;*****************************************************************************
;* MMX/SSE2-optimized 10-bit H.264 qpel code
3;*****************************************************************************
4;* Copyright (C) 2011 x264 project
5;*
6;* Authors: Daniel Kang <daniel.d.kang@gmail.com>
7;*
8;* This file is part of Libav.
9;*
10;* Libav is free software; you can redistribute it and/or
11;* modify it under the terms of the GNU Lesser General Public
12;* License as published by the Free Software Foundation; either
13;* version 2.1 of the License, or (at your option) any later version.
14;*
15;* Libav is distributed in the hope that it will be useful,
16;* but WITHOUT ANY WARRANTY; without even the implied warranty of
17;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
18;* Lesser General Public License for more details.
19;*
20;* You should have received a copy of the GNU Lesser General Public
21;* License along with Libav; if not, write to the Free Software
22;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
23;******************************************************************************
24
25%include "x86inc.asm"
26%include "x86util.asm"
27
28SECTION_RODATA 32
29
30cextern pw_16
31cextern pw_1
32cextern pb_0
33
34pw_pixel_max: times 8 dw ((1 << 10)-1)
35
36pad10: times 8 dw 10*1023
37pad20: times 8 dw 20*1023
38pad30: times 8 dw 30*1023
39depad: times 4 dd 32*20*1023 + 512
40depad2: times 8 dw 20*1023 + 16*1022 + 16
41unpad: times 8 dw 16*1022/32 ; needs to be mod 16
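; rough range check for the bias constants (assuming 10-bit pixels, so the
; a, b, c used by the filters are each sums of two values in [0,1023]): the
; unscaled 6-tap sum a-5*b+20*c spans [-10230, 42966] and would overflow
; signed words, so pad20 = 20*1023 recenters it to [-30690, 22506]; the taps
; sum to 32, so the second pass accumulates a -32*pad20 bias, which
; depad = 32*20*1023 + 512 cancels while adding the rounding term for the
; final psrad 10; likewise depad2 keeps the word non-negative for psrlw 5,
; after which unpad = 16*1022/32 removes the leftover 511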
42
43tap1: times 4 dw  1, -5
44tap2: times 4 dw 20, 20
45tap3: times 4 dw -5,  1
pd_0f: times 4 dd 0xffff ; mask of the low word in each dword
47
48SECTION .text
49
50
51%macro AVG_MOV 2
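; pavgw gives the rounded average (x+y+1)>>1 of the new value and *dst,
; matching the H.264 'avg' prediction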
52    pavgw %2, %1
53    mova  %1, %2
54%endmacro
55
56%macro ADDW 3
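; add a (potentially unaligned) memory operand %2: MMX paddw has no
; alignment requirement, SSE2 must first load it with movu into temp %3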
57%if mmsize == 8
58    paddw %1, %2
59%else
60    movu  %3, %2
61    paddw %1, %3
62%endif
63%endmacro
64
65%macro FILT_H 4
66    paddw  %1, %4
67    psubw  %1, %2  ; a-b
68    psraw  %1, 2   ; (a-b)/4
69    psubw  %1, %2  ; (a-b)/4-b
70    paddw  %1, %3  ; (a-b)/4-b+c
71    psraw  %1, 2   ; ((a-b)/4-b+c)/4
72    paddw  %1, %3  ; ((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
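    ; ignoring truncation, the sequence expands as
    ;   ((a-b)/4 - b + c)/4 + c = (a - 5*b + 4*c)/16 + c = (a-5*b+20*c)/16
    ; with %4 (pw_16 in the full filters) supplying the rounding bias on a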
73%endmacro
74
75%macro PRELOAD_V 0
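; load the five rows src-2*stride .. src+2*stride into m0-m4 and leave r1 at
; src+3*stride, so FILT_V can fetch each successive row from [r1]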
76    lea      r3, [r2*3]
77    sub      r1, r3
78    movu     m0, [r1+r2]
79    movu     m1, [r1+r2*2]
80    add      r1, r3
81    movu     m2, [r1]
82    movu     m3, [r1+r2]
83    movu     m4, [r1+r2*2]
84    add      r1, r3
85%endmacro
86
87%macro FILT_V 8
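; filter one row vertically: %1-%5 hold the sliding six-row window (the
; sixth row comes from [r1]), %6-%8 are scratch; the result is the fully
; rounded half-pel value clip((a - 5*b + 20*c + 16) >> 5)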
88    movu     %6, [r1]
89    paddw    %1, %6
90    mova     %7, %2
91    paddw    %7, %5
92    mova     %8, %3
93    paddw    %8, %4
94    FILT_H   %1, %7, %8, [pw_16]
95    psraw    %1, 1
96    CLIPW    %1, [pb_0], [pw_pixel_max]
97%endmacro
98
99%macro MC 1
100%define OP_MOV mova
101INIT_MMX
102%1 mmxext, put, 4
103INIT_XMM
104%1 sse2  , put, 8
105
106%define OP_MOV AVG_MOV
107INIT_MMX
108%1 mmxext, avg, 4
109INIT_XMM
110%1 sse2  , avg, 8
111%endmacro
112
113%macro MCAxA 8
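; synthesize a 2Nx2N function from four calls to the NxN stub, one per
; quadrant (+%4 pixels across, +%4 rows down); x86_32 reloads dst/src from
; the stack arguments between calls, x86_64 keeps them in r10/r11; the
; mmxext variants are skipped on x86_64, where SSE2 is always available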
114%ifdef ARCH_X86_64
115%ifnidn %1,mmxext
116MCAxA_OP %1,%2,%3,%4,%5,%6,%7,%8
117%endif
118%else
119MCAxA_OP %1,%2,%3,%4,%5,%6,%7,%8
120%endif
121%endmacro
122
123%macro MCAxA_OP 8
124cglobal %2_h264_qpel%5_%3_10_%1, %6,%7,%8
125%ifdef ARCH_X86_32
126    call stub_%2_h264_qpel%4_%3_10_%1
127    mov  r0, r0m
128    mov  r1, r1m
129    add  r0, %4*2
130    add  r1, %4*2
131    call stub_%2_h264_qpel%4_%3_10_%1
132    mov  r0, r0m
133    mov  r1, r1m
134    lea  r0, [r0+r2*%4]
135    lea  r1, [r1+r2*%4]
136    call stub_%2_h264_qpel%4_%3_10_%1
137    mov  r0, r0m
138    mov  r1, r1m
139    lea  r0, [r0+r2*%4+%4*2]
140    lea  r1, [r1+r2*%4+%4*2]
141    call stub_%2_h264_qpel%4_%3_10_%1
142    RET
143%else ; ARCH_X86_64
144    mov r10, r0
145    mov r11, r1
146    call stub_%2_h264_qpel%4_%3_10_%1
147    lea  r0, [r10+%4*2]
148    lea  r1, [r11+%4*2]
149    call stub_%2_h264_qpel%4_%3_10_%1
150    lea  r0, [r10+r2*%4]
151    lea  r1, [r11+r2*%4]
152    call stub_%2_h264_qpel%4_%3_10_%1
153    lea  r0, [r10+r2*%4+%4*2]
154    lea  r1, [r11+r2*%4+%4*2]
155%ifndef UNIX64 ; fall through to function
156    call stub_%2_h264_qpel%4_%3_10_%1
157    RET
158%endif
159%endif
160%endmacro
161
; cglobal_mc: %1 = cpu, %2 = put/avg, %3 = mc position, %4 = 4/8 (block
; width); %5-%7 are passed through to cglobal. The public symbol is a thin
; wrapper around a callable stub_* body, which the other mc positions share
; via call/jmp.
163%macro cglobal_mc 7
164%assign i %4*2
165MCAxA %1, %2, %3, %4, i, %5,%6,%7
166
167cglobal %2_h264_qpel%4_%3_10_%1, %5,%6,%7
168%ifndef UNIX64 ; no prologue or epilogue for UNIX64
169    call stub_%2_h264_qpel%4_%3_10_%1
170    RET
171%endif
172
173stub_%2_h264_qpel%4_%3_10_%1:
174%endmacro
175
176;-----------------------------------------------------------------------------
177; void h264_qpel_mc00(uint8_t *dst, uint8_t *src, int stride)
178;-----------------------------------------------------------------------------
179%macro COPY4 0
180    movu          m0, [r1     ]
181    OP_MOV [r0     ], m0
182    movu          m0, [r1+r2  ]
183    OP_MOV [r0+r2  ], m0
184    movu          m0, [r1+r2*2]
185    OP_MOV [r0+r2*2], m0
186    movu          m0, [r1+r3  ]
187    OP_MOV [r0+r3  ], m0
188%endmacro
189
190%macro MC00 1
191INIT_MMX
192cglobal_mc mmxext, %1, mc00, 4, 3,4,0
193    lea           r3, [r2*3]
194    COPY4
195    ret
196
197INIT_XMM
198cglobal %1_h264_qpel8_mc00_10_sse2, 3,4
199    lea  r3, [r2*3]
200    COPY4
201    lea  r0, [r0+r2*4]
202    lea  r1, [r1+r2*4]
203    COPY4
204    RET
205
206cglobal %1_h264_qpel16_mc00_10_sse2, 3,4
207    mov r3d, 8
208.loop:
209    movu           m0, [r1      ]
210    movu           m1, [r1   +16]
211    OP_MOV [r0      ], m0
212    OP_MOV [r0   +16], m1
213    movu           m0, [r1+r2   ]
214    movu           m1, [r1+r2+16]
215    OP_MOV [r0+r2   ], m0
216    OP_MOV [r0+r2+16], m1
217    lea            r0, [r0+r2*2]
218    lea            r1, [r1+r2*2]
219    dec r3d
220    jg .loop
221    REP_RET
222%endmacro
223
224%define OP_MOV mova
225MC00 put
226
227%define OP_MOV AVG_MOV
228MC00 avg
229
230;-----------------------------------------------------------------------------
231; void h264_qpel_mc20(uint8_t *dst, uint8_t *src, int stride)
232;-----------------------------------------------------------------------------
233%macro MC_CACHE 1
234%define OP_MOV mova
235%define PALIGNR PALIGNR_MMX
236INIT_MMX
237%1 mmxext       , put, 4
238INIT_XMM
239%1 sse2_cache64 , put, 8
240%define PALIGNR PALIGNR_SSSE3
241%1 ssse3_cache64, put, 8
242%1 sse2         , put, 8, 0
243
244%define OP_MOV AVG_MOV
245%define PALIGNR PALIGNR_MMX
246INIT_MMX
247%1 mmxext       , avg, 4
248INIT_XMM
249%1 sse2_cache64 , avg, 8
250%define PALIGNR PALIGNR_SSSE3
251%1 ssse3_cache64, avg, 8
252%1 sse2         , avg, 8, 0
253%endmacro
254
255%macro MC20 3-4
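; horizontal half-pel; the optional 4th argument (plain sse2) selects plain
; unaligned loads, while the other variants build the shifted rows from two
; loads with PALIGNR because misaligned movu is slow on their target CPUs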
256cglobal_mc %1, %2, mc20, %3, 3,4,9
257    mov     r3d, %3
258    mova     m1, [pw_pixel_max]
259%if num_mmregs > 8
260    mova     m8, [pw_16]
261    %define p16 m8
262%else
263    %define p16 [pw_16]
264%endif
.nextrow:
266%if %0 == 4
267    movu     m2, [r1-4]
268    movu     m3, [r1-2]
269    movu     m4, [r1+0]
270    ADDW     m2, [r1+6], m5
271    ADDW     m3, [r1+4], m5
272    ADDW     m4, [r1+2], m5
273%else ; movu is slow on these processors
274%if mmsize==16
275    movu     m2, [r1-4]
276    movu     m0, [r1+6]
277    mova     m6, m0
278    psrldq   m0, 6
279
280    paddw    m6, m2
281    PALIGNR  m3, m0, m2, 2, m5
282    PALIGNR  m7, m0, m2, 8, m5
283    paddw    m3, m7
284    PALIGNR  m4, m0, m2, 4, m5
285    PALIGNR  m7, m0, m2, 6, m5
286    paddw    m4, m7
287    SWAP      2, 6
288%else
289    movu     m2, [r1-4]
290    movu     m6, [r1+4]
291    PALIGNR  m3, m6, m2, 2, m5
292    paddw    m3, m6
293    PALIGNR  m4, m6, m2, 4, m5
294    PALIGNR  m7, m6, m2, 6, m5
295    paddw    m4, m7
296    paddw    m2, [r1+6]
297%endif
298%endif
299
300    FILT_H   m2, m3, m4, p16
301    psraw    m2, 1
302    pxor     m0, m0
303    CLIPW    m2, m0, m1
304    OP_MOV [r0], m2
305    add      r0, r2
306    add      r1, r2
307    dec     r3d
308    jg .nextrow
309    rep ret
310%endmacro
311
312MC_CACHE MC20
313
314;-----------------------------------------------------------------------------
315; void h264_qpel_mc30(uint8_t *dst, uint8_t *src, int stride)
316;-----------------------------------------------------------------------------
317%macro MC30 3-4
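; 3/4-pel horizontal position: as mc10, but averaged with the full-pel one
; pixel to the right, so point r4 at src+2 (bytes) and reuse the mc10 body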
318cglobal_mc %1, %2, mc30, %3, 3,5,9
319    lea r4, [r1+2]
320    jmp stub_%2_h264_qpel%3_mc10_10_%1.body
321%endmacro
322
323MC_CACHE MC30
324
325;-----------------------------------------------------------------------------
326; void h264_qpel_mc10(uint8_t *dst, uint8_t *src, int stride)
327;-----------------------------------------------------------------------------
328%macro MC10 3-4
329cglobal_mc %1, %2, mc10, %3, 3,5,9
330    mov      r4, r1
.body:
332    mov     r3d, %3
333    mova     m1, [pw_pixel_max]
334%if num_mmregs > 8
335    mova     m8, [pw_16]
336    %define p16 m8
337%else
338    %define p16 [pw_16]
339%endif
.nextrow:
341%if %0 == 4
342    movu     m2, [r1-4]
343    movu     m3, [r1-2]
344    movu     m4, [r1+0]
345    ADDW     m2, [r1+6], m5
346    ADDW     m3, [r1+4], m5
347    ADDW     m4, [r1+2], m5
348%else ; movu is slow on these processors
349%if mmsize==16
350    movu     m2, [r1-4]
351    movu     m0, [r1+6]
352    mova     m6, m0
353    psrldq   m0, 6
354
355    paddw    m6, m2
356    PALIGNR  m3, m0, m2, 2, m5
357    PALIGNR  m7, m0, m2, 8, m5
358    paddw    m3, m7
359    PALIGNR  m4, m0, m2, 4, m5
360    PALIGNR  m7, m0, m2, 6, m5
361    paddw    m4, m7
362    SWAP      2, 6
363%else
364    movu     m2, [r1-4]
365    movu     m6, [r1+4]
366    PALIGNR  m3, m6, m2, 2, m5
367    paddw    m3, m6
368    PALIGNR  m4, m6, m2, 4, m5
369    PALIGNR  m7, m6, m2, 6, m5
370    paddw    m4, m7
371    paddw    m2, [r1+6]
372%endif
373%endif
374
375    FILT_H   m2, m3, m4, p16
376    psraw    m2, 1
377    pxor     m0, m0
378    CLIPW    m2, m0, m1
379    movu     m3, [r4]
380    pavgw    m2, m3
381    OP_MOV [r0], m2
382    add      r0, r2
383    add      r1, r2
384    add      r4, r2
385    dec     r3d
386    jg .nextrow
387    rep ret
388%endmacro
389
390MC_CACHE MC10
391
392;-----------------------------------------------------------------------------
393; void h264_qpel_mc02(uint8_t *dst, uint8_t *src, int stride)
394;-----------------------------------------------------------------------------
395%macro V_FILT 11
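; each v_filt stub filters one row (FILT_V) and advances the pointers; the
; six-row window lives in m0-m5, and the SWAP after every instantiation
; below rotates the register permutation, so callers SWAP in lockstep
; instead of moving data between registers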
396v_filt%9_%10_10_%11:
397    add    r4, r2
398.no_addr4:
399    FILT_V m0, m1, m2, m3, m4, m5, m6, m7
400    add    r1, r2
401    add    r0, r2
402    ret
403%endmacro
404
405INIT_MMX
406RESET_MM_PERMUTATION
407%assign i 0
408%rep 4
409V_FILT m0, m1, m2, m3, m4, m5, m6, m7, 4, i, mmxext
410SWAP 0,1,2,3,4,5
411%assign i i+1
412%endrep
413
414INIT_XMM
415RESET_MM_PERMUTATION
416%assign i 0
417%rep 6
418V_FILT m0, m1, m2, m3, m4, m5, m6, m7, 8, i, sse2
419SWAP 0,1,2,3,4,5
420%assign i i+1
421%endrep
422
423%macro MC02 3
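; pure vertical half-pel: one v_filt call per output row, rotating the
; register window with SWAP to match the stubs' permutations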
424cglobal_mc %1, %2, mc02, %3, 3,4,8
425    PRELOAD_V
426
427    sub      r0, r2
428%assign j 0
429%rep %3
430    %assign i (j % 6)
431    call v_filt%3_ %+ i %+ _10_%1.no_addr4
432    OP_MOV [r0], m0
433    SWAP 0,1,2,3,4,5
434    %assign j j+1
435%endrep
436    ret
437%endmacro
438
439MC MC02
440
441;-----------------------------------------------------------------------------
442; void h264_qpel_mc01(uint8_t *dst, uint8_t *src, int stride)
443;-----------------------------------------------------------------------------
444%macro MC01 3
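; vertical quarter-pel: the vertical half-pel averaged with the full-pel row
; at r4 (src here; mc03 reuses this body with r4 = src+stride)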
445cglobal_mc %1, %2, mc01, %3, 3,5,8
446    mov      r4, r1
.body:
448    PRELOAD_V
449
450    sub      r4, r2
451    sub      r0, r2
452%assign j 0
453%rep %3
454    %assign i (j % 6)
455    call v_filt%3_ %+ i %+ _10_%1
456    movu     m7, [r4]
457    pavgw    m0, m7
458    OP_MOV [r0], m0
459    SWAP 0,1,2,3,4,5
460    %assign j j+1
461%endrep
462    ret
463%endmacro
464
465MC MC01
466
467;-----------------------------------------------------------------------------
468; void h264_qpel_mc03(uint8_t *dst, uint8_t *src, int stride)
469;-----------------------------------------------------------------------------
470%macro MC03 3
471cglobal_mc %1, %2, mc03, %3, 3,5,8
472    lea r4, [r1+r2]
473    jmp stub_%2_h264_qpel%3_mc01_10_%1.body
474%endmacro
475
476MC MC03
477
478;-----------------------------------------------------------------------------
479; void h264_qpel_mc11(uint8_t *dst, uint8_t *src, int stride)
480;-----------------------------------------------------------------------------
481%macro H_FILT_AVG 3-4
482h_filt%2_%3_10_%1:
; FILT_H with fewer registers, averaged with the FILT_V result
; m6/m7 are temporaries, m0 is the FILT_V result, and the remaining registers
; carry state into the next iteration
; three registers are needed here, so m5 has to be re-read from memory
486    movu     m5, [r4-4]
487    ADDW     m5, [r4+6], m7
488    movu     m6, [r4-2]
489    ADDW     m6, [r4+4], m7
490    paddw    m5, [pw_16]
491    psubw    m5, m6  ; a-b
492    psraw    m5, 2   ; (a-b)/4
493    psubw    m5, m6  ; (a-b)/4-b
494    movu     m6, [r4+0]
495    ADDW     m6, [r4+2], m7
496    paddw    m5, m6  ; (a-b)/4-b+c
497    psraw    m5, 2   ; ((a-b)/4-b+c)/4
498    paddw    m5, m6  ; ((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
499    psraw    m5, 1
500    CLIPW    m5, [pb_0], [pw_pixel_max]
501;avg FILT_V, FILT_H
502    pavgw    m0, m5
503%if %0!=4
504    movu     m5, [r1+r5]
505%endif
506    ret
507%endmacro
508
509INIT_MMX
510RESET_MM_PERMUTATION
511%assign i 0
512%rep 3
513H_FILT_AVG mmxext, 4, i
514SWAP 0,1,2,3,4,5
515%assign i i+1
516%endrep
517H_FILT_AVG mmxext, 4, i, 0
518
519INIT_XMM
520RESET_MM_PERMUTATION
521%assign i 0
522%rep 6
523%if i==1
524H_FILT_AVG sse2,   8, i, 0
525%else
526H_FILT_AVG sse2,   8, i
527%endif
528SWAP 0,1,2,3,4,5
529%assign i i+1
530%endrep
531
532%macro MC11 3
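; diagonal quarter-pel: the average of the vertical half-pel (v_filt) and
; the horizontal half-pel (h_filt), computed row by row over shared state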
533; this REALLY needs x86_64
534cglobal_mc %1, %2, mc11, %3, 3,6,8
535    mov      r4, r1
.body:
537    PRELOAD_V
538
539    sub      r0, r2
540    sub      r4, r2
541    mov      r5, r2
542    neg      r5
543%assign j 0
544%rep %3
545    %assign i (j % 6)
546    call v_filt%3_ %+ i %+ _10_%1
547    call h_filt%3_ %+ i %+ _10_%1
548%if %3==8 && i==1
549    movu     m5, [r1+r5]
550%endif
551    OP_MOV [r0], m0
552    SWAP 0,1,2,3,4,5
553    %assign j j+1
554%endrep
555    ret
556%endmacro
557
558MC MC11
559
560;-----------------------------------------------------------------------------
561; void h264_qpel_mc31(uint8_t *dst, uint8_t *src, int stride)
562;-----------------------------------------------------------------------------
563%macro MC31 3
564cglobal_mc %1, %2, mc31, %3, 3,6,8
565    mov r4, r1
566    add r1, 2
567    jmp stub_%2_h264_qpel%3_mc11_10_%1.body
568%endmacro
569
570MC MC31
571
572;-----------------------------------------------------------------------------
573; void h264_qpel_mc13(uint8_t *dst, uint8_t *src, int stride)
574;-----------------------------------------------------------------------------
575%macro MC13 3
576cglobal_mc %1, %2, mc13, %3, 3,7,12
577    lea r4, [r1+r2]
578    jmp stub_%2_h264_qpel%3_mc11_10_%1.body
579%endmacro
580
581MC MC13
582
583;-----------------------------------------------------------------------------
584; void h264_qpel_mc33(uint8_t *dst, uint8_t *src, int stride)
585;-----------------------------------------------------------------------------
586%macro MC33 3
587cglobal_mc %1, %2, mc33, %3, 3,6,8
588    lea r4, [r1+r2]
589    add r1, 2
590    jmp stub_%2_h264_qpel%3_mc11_10_%1.body
591%endmacro
592
593MC MC33
594
595;-----------------------------------------------------------------------------
596; void h264_qpel_mc22(uint8_t *dst, uint8_t *src, int stride)
597;-----------------------------------------------------------------------------
598%macro FILT_H2 3
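; exact, unscaled a-5*b+20*c (no rounding, no shift), for the intermediate
; pass of the 2D filter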
599    psubw  %1, %2  ; a-b
600    psubw  %2, %3  ; b-c
601    psllw  %2, 2
602    psubw  %1, %2  ; a-5*b+4*c
603    psllw  %3, 4
604    paddw  %1, %3  ; a-5*b+20*c
605%endmacro
606
607%macro FILT_VNRD 8
608    movu     %6, [r1]
609    paddw    %1, %6
610    mova     %7, %2
611    paddw    %7, %5
612    mova     %8, %3
613    paddw    %8, %4
614    FILT_H2  %1, %7, %8
615%endmacro
616
617%macro HV 2
618%ifidn %1,sse2
619%define PAD 12
620%define COUNT 2
621%else
622%define PAD 4
623%define COUNT 3
624%endif
625put_hv%2_10_%1:
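; first (vertical) pass of the 2D filter: FILT_VNRD leaves the 6-tap sums
; unrounded and unshifted, pad20 is subtracted to keep them within signed
; 16 bits, and the rows are stored to the stack buffer (row stride mmsize*3
; bytes, COUNT strips wide) with the margin the horizontal pass needs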
626    neg      r2           ; This actually saves instructions
627    lea      r1, [r1+r2*2-mmsize+PAD]
628    lea      r4, [rsp+PAD+gprsize]
629    mov     r3d, COUNT
630.v_loop:
631    movu     m0, [r1]
632    sub      r1, r2
633    movu     m1, [r1]
634    sub      r1, r2
635    movu     m2, [r1]
636    sub      r1, r2
637    movu     m3, [r1]
638    sub      r1, r2
639    movu     m4, [r1]
640    sub      r1, r2
641%assign i 0
642%rep %2-1
643    FILT_VNRD m0, m1, m2, m3, m4, m5, m6, m7
644    psubw    m0, [pad20]
645    movu     [r4+i*mmsize*3], m0
646    sub      r1, r2
647    SWAP 0,1,2,3,4,5
648%assign i i+1
649%endrep
650    FILT_VNRD m0, m1, m2, m3, m4, m5, m6, m7
651    psubw    m0, [pad20]
652    movu     [r4+i*mmsize*3], m0
653    add      r4, mmsize
654    lea      r1, [r1+r2*8+mmsize]
655%if %2==8
656    lea      r1, [r1+r2*4]
657%endif
658    dec      r3d
659    jg .v_loop
660    neg      r2
661    ret
662%endmacro
663
664INIT_MMX
665HV mmxext, 4
666INIT_XMM
667HV sse2  , 8
668
669%macro H_LOOP 2
670%if num_mmregs > 8
671    %define s1 m8
672    %define s2 m9
673    %define s3 m10
674    %define d1 m11
675%else
676    %define s1 [tap1]
677    %define s2 [tap2]
678    %define s3 [tap3]
679    %define d1 [depad]
680%endif
681h%2_loop_op_%1:
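; second (horizontal) pass: pmaddwd applies the tap pairs (1,-5), (20,20),
; (-5,1) in dword precision; since the taps sum to 32, depad cancels the
; accumulated -32*pad20 bias and adds the rounding term before psrad 10
; ((x+512)>>10 overall), then the even/odd dwords are repacked into words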
682    movu       m1, [r1+mmsize-4]
683    movu       m2, [r1+mmsize-2]
684    mova       m3, [r1+mmsize+0]
685    movu       m4, [r1+mmsize+2]
686    movu       m5, [r1+mmsize+4]
687    movu       m6, [r1+mmsize+6]
688%if num_mmregs > 8
689    pmaddwd    m1, s1
690    pmaddwd    m2, s1
691    pmaddwd    m3, s2
692    pmaddwd    m4, s2
693    pmaddwd    m5, s3
694    pmaddwd    m6, s3
695    paddd      m1, d1
696    paddd      m2, d1
697%else
698    mova       m0, s1
699    pmaddwd    m1, m0
700    pmaddwd    m2, m0
701    mova       m0, s2
702    pmaddwd    m3, m0
703    pmaddwd    m4, m0
704    mova       m0, s3
705    pmaddwd    m5, m0
706    pmaddwd    m6, m0
707    mova       m0, d1
708    paddd      m1, m0
709    paddd      m2, m0
710%endif
711    paddd      m3, m5
712    paddd      m4, m6
713    paddd      m1, m3
714    paddd      m2, m4
715    psrad      m1, 10
716    psrad      m2, 10
717    pslld      m2, 16
718    pand       m1, [pd_0f]
719    por        m1, m2
720%if num_mmregs <= 8
721    pxor       m0, m0
722%endif
723    CLIPW      m1, m0, m7
724    add        r1, mmsize*3
725    ret
726%endmacro
727
728INIT_MMX
729H_LOOP mmxext, 4
730INIT_XMM
731H_LOOP sse2  , 8
732
733%macro MC22 3
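; centre half-pel: put_hv runs the vertical pass into a stack buffer of
; biased intermediates, then h_loop finishes each row horizontally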
734cglobal_mc %1, %2, mc22, %3, 3,7,12
735%define PAD mmsize*8*4*2      ; SIZE*16*4*sizeof(pixel)
736    mov      r6, rsp          ; backup stack pointer
737    and     rsp, ~(mmsize-1)  ; align stack
738    sub     rsp, PAD
739
740    call put_hv%3_10_%1
741
742    mov       r3d, %3
743    mova       m7, [pw_pixel_max]
744%if num_mmregs > 8
745    pxor       m0, m0
746    mova       m8, [tap1]
747    mova       m9, [tap2]
748    mova      m10, [tap3]
749    mova      m11, [depad]
750%endif
751    mov        r1, rsp
752.h_loop:
753    call h%3_loop_op_%1
754
755    OP_MOV   [r0], m1
756    add        r0, r2
757    dec       r3d
758    jg .h_loop
759
760    mov     rsp, r6          ; restore stack pointer
761    ret
762%endmacro
763
764MC MC22
765
766;-----------------------------------------------------------------------------
767; void h264_qpel_mc12(uint8_t *dst, uint8_t *src, int stride)
768;-----------------------------------------------------------------------------
769%macro MC12 3
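; average of the centre half-pel (hv) and the vertical half-pel; the latter
; is recovered from the same stack buffer via depad2/unpad (see .body)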
770cglobal_mc %1, %2, mc12, %3, 3,7,12
771%define PAD mmsize*8*4*2        ; SIZE*16*4*sizeof(pixel)
772    mov        r6, rsp          ; backup stack pointer
773    and       rsp, ~(mmsize-1)  ; align stack
774    sub       rsp, PAD
775
776    call put_hv%3_10_%1
777
778    xor       r4d, r4d
.body:
780    mov       r3d, %3
781    pxor       m0, m0
782    mova       m7, [pw_pixel_max]
783%if num_mmregs > 8
784    mova       m8, [tap1]
785    mova       m9, [tap2]
786    mova      m10, [tap3]
787    mova      m11, [depad]
788%endif
789    mov        r1, rsp
790.h_loop:
791    call h%3_loop_op_%1
792
    ; recover the rounded vertical half-pel from the biased intermediate:
    ; m3 + depad2 = v + 16*1022 + 16 stays non-negative, so the unsigned
    ; psrlw 5 yields ((v+16)>>5) + 511 and unpad subtracts the leftover 511
    movu       m3, [r1+r4-2*mmsize] ; movu needed for mc32, etc
794    paddw      m3, [depad2]
795    psrlw      m3, 5
796    psubw      m3, [unpad]
797    CLIPW      m3, m0, m7
798    pavgw      m1, m3
799
800    OP_MOV   [r0], m1
801    add        r0, r2
802    dec       r3d
803    jg .h_loop
804
805    mov     rsp, r6          ; restore stack pointer
806    ret
807%endmacro
808
809MC MC12
810
811;-----------------------------------------------------------------------------
812; void h264_qpel_mc32(uint8_t *dst, uint8_t *src, int stride)
813;-----------------------------------------------------------------------------
814%macro MC32 3
815cglobal_mc %1, %2, mc32, %3, 3,7,12
%define PAD mmsize*8*3*2  ; SIZE*16*3*sizeof(pixel)
817    mov  r6, rsp          ; backup stack pointer
818    and rsp, ~(mmsize-1)  ; align stack
819    sub rsp, PAD
820
821    call put_hv%3_10_%1
822
823    mov r4d, 2            ; sizeof(pixel)
824    jmp stub_%2_h264_qpel%3_mc12_10_%1.body
825%endmacro
826
827MC MC32
828
829;-----------------------------------------------------------------------------
830; void h264_qpel_mc21(uint8_t *dst, uint8_t *src, int stride)
831;-----------------------------------------------------------------------------
832%macro H_NRD 2
833put_h%2_10_%1:
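; horizontal-only counterpart of put_hv for mc21/mc23: writes pad20-biased,
; unrounded H-filter rows (source in r5) into a second stack buffer; rsp is
; bumped past the return address so [rsp+r4] addresses the caller's buffer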
834    add       rsp, gprsize
835    mov       r3d, %2
836    xor       r4d, r4d
837    mova       m6, [pad20]
.nextrow:
839    movu       m2, [r5-4]
840    movu       m3, [r5-2]
841    movu       m4, [r5+0]
842    ADDW       m2, [r5+6], m5
843    ADDW       m3, [r5+4], m5
844    ADDW       m4, [r5+2], m5
845
846    FILT_H2    m2, m3, m4
847    psubw      m2, m6
848    mova [rsp+r4], m2
849    add       r4d, mmsize*3
850    add        r5, r2
851    dec       r3d
852    jg .nextrow
853    sub       rsp, gprsize
854    ret
855%endmacro
856
857INIT_MMX
858H_NRD mmxext, 4
859INIT_XMM
860H_NRD sse2  , 8
861
862%macro MC21 3
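; average of the centre half-pel and the horizontal half-pel: put_h fills a
; second stack buffer, and r4 points the mc12 tail at it instead of at the
; vertical intermediates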
863cglobal_mc %1, %2, mc21, %3, 3,7,12
864    mov   r5, r1
.body:
%define PAD mmsize*8*3*2   ; SIZE*16*3*sizeof(pixel)
867    mov   r6, rsp          ; backup stack pointer
868    and  rsp, ~(mmsize-1)  ; align stack
869
870    sub  rsp, PAD
871    call put_h%3_10_%1
872
873    sub  rsp, PAD
874    call put_hv%3_10_%1
875
876    mov r4d, PAD-mmsize    ; H buffer
877    jmp stub_%2_h264_qpel%3_mc12_10_%1.body
878%endmacro
879
880MC MC21
881
882;-----------------------------------------------------------------------------
883; void h264_qpel_mc23(uint8_t *dst, uint8_t *src, int stride)
884;-----------------------------------------------------------------------------
885%macro MC23 3
886cglobal_mc %1, %2, mc23, %3, 3,7,12
887    lea   r5, [r1+r2]
888    jmp stub_%2_h264_qpel%3_mc21_10_%1.body
889%endmacro
890
891MC MC23
892