;******************************************************************************
;* MMX optimized DSP utils
;* Copyright (c) 2008 Loren Merritt
;*
;* This file is part of Libav.
;*
;* Libav is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* Libav is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with Libav; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "x86inc.asm"
%include "x86util.asm"

SECTION_RODATA
pb_f: times 16 db 15
pb_zzzzzzzz77777777: times 8 db -1
pb_7: times 8 db 7
pb_zzzz3333zzzzbbbb: db -1,-1,-1,-1,3,3,3,3,-1,-1,-1,-1,11,11,11,11
pb_zz11zz55zz99zzdd: db -1,-1,1,1,-1,-1,5,5,-1,-1,9,9,-1,-1,13,13
pb_revwords: db 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1
pd_16384: times 4 dd 16384

SECTION_TEXT

%macro SCALARPRODUCT 1
; int scalarproduct_int16(int16_t *v1, int16_t *v2, int order, int shift)
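; A rough scalar sketch of what this routine computes (for orientation only,
; not the project's C reference; note the asm applies the shift to packed
; partial sums, so rounding can differ slightly from a scalar loop):
;   int32_t sum = 0;
;   for (i = 0; i < order; i++)
;       sum += v1[i] * v2[i];    // 32-bit accumulation, as pmaddwd does
;   return sum >> shift;
; order is assumed to be a multiple of the unrolled step (mmsize int16_t's).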
cglobal scalarproduct_int16_%1, 3,3,4, v1, v2, order, shift
    shl orderq, 1
    add v1q, orderq
    add v2q, orderq
    neg orderq
    movd    m3, shiftm
    pxor    m2, m2
.loop:
    movu    m0, [v1q + orderq]
    movu    m1, [v1q + orderq + mmsize]
    pmaddwd m0, [v2q + orderq]
    pmaddwd m1, [v2q + orderq + mmsize]
    paddd   m2, m0
    paddd   m2, m1
    add     orderq, mmsize*2
    jl .loop
%if mmsize == 16
    movhlps m0, m2
    paddd   m2, m0
    psrad   m2, m3
    pshuflw m0, m2, 0x4e
%else
    psrad   m2, m3
    pshufw  m0, m2, 0x4e
%endif
    paddd   m2, m0
    movd   eax, m2
    RET

; int scalarproduct_and_madd_int16(int16_t *v1, int16_t *v2, int16_t *v3, int order, int mul)
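; Rough scalar sketch of the madd variant below (illustrative only): it
; returns the v1.v2 dot product while updating v1 in place with mul*v3:
;   int32_t sum = 0;
;   for (i = 0; i < order; i++) {
;       sum   += v1[i] * v2[i];    // accumulated in 32 bits (pmaddwd)
;       v1[i] += mul * v3[i];      // 16-bit wraparound, as pmullw/paddw
;   }
;   return sum;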
cglobal scalarproduct_and_madd_int16_%1, 4,4,8, v1, v2, v3, order, mul
    shl orderq, 1
    movd    m7, mulm
%if mmsize == 16
    pshuflw m7, m7, 0
    punpcklqdq m7, m7
%else
    pshufw  m7, m7, 0
%endif
    pxor    m6, m6
    add v1q, orderq
    add v2q, orderq
    add v3q, orderq
    neg orderq
.loop:
    movu    m0, [v2q + orderq]
    movu    m1, [v2q + orderq + mmsize]
    mova    m4, [v1q + orderq]
    mova    m5, [v1q + orderq + mmsize]
    movu    m2, [v3q + orderq]
    movu    m3, [v3q + orderq + mmsize]
    pmaddwd m0, m4
    pmaddwd m1, m5
    pmullw  m2, m7
    pmullw  m3, m7
    paddd   m6, m0
    paddd   m6, m1
    paddw   m2, m4
    paddw   m3, m5
    mova    [v1q + orderq], m2
    mova    [v1q + orderq + mmsize], m3
    add     orderq, mmsize*2
    jl .loop
%if mmsize == 16
    movhlps m0, m6
    paddd   m6, m0
    pshuflw m0, m6, 0x4e
%else
    pshufw  m0, m6, 0x4e
%endif
    paddd   m6, m0
    movd   eax, m6
    RET
%endmacro

INIT_MMX
SCALARPRODUCT mmx2
INIT_XMM
SCALARPRODUCT sse2

%macro SCALARPRODUCT_LOOP 1
align 16
.loop%1:
    sub     orderq, mmsize*2
%if %1
    mova    m1, m4
    mova    m4, [v2q + orderq]
    mova    m0, [v2q + orderq + mmsize]
    palignr m1, m0, %1
    palignr m0, m4, %1
    mova    m3, m5
    mova    m5, [v3q + orderq]
    mova    m2, [v3q + orderq + mmsize]
    palignr m3, m2, %1
    palignr m2, m5, %1
%else
    mova    m0, [v2q + orderq]
    mova    m1, [v2q + orderq + mmsize]
    mova    m2, [v3q + orderq]
    mova    m3, [v3q + orderq + mmsize]
%endif
    %define t0  [v1q + orderq]
    %define t1  [v1q + orderq + mmsize]
%ifdef ARCH_X86_64
    mova    m8, t0
    mova    m9, t1
    %define t0  m8
    %define t1  m9
%endif
    pmaddwd m0, t0
    pmaddwd m1, t1
    pmullw  m2, m7
    pmullw  m3, m7
    paddw   m2, t0
    paddw   m3, t1
    paddd   m6, m0
    paddd   m6, m1
    mova    [v1q + orderq], m2
    mova    [v1q + orderq + mmsize], m3
    jg .loop%1
%if %1
    jmp .end
%endif
%endmacro

; int scalarproduct_and_madd_int16(int16_t *v1, int16_t *v2, int16_t *v3, int order, int mul)
cglobal scalarproduct_and_madd_int16_ssse3, 4,5,10, v1, v2, v3, order, mul
    shl orderq, 1
    movd    m7, mulm
    pshuflw m7, m7, 0
    punpcklqdq m7, m7
    pxor    m6, m6
    mov    r4d, v2d
    and    r4d, 15
    and    v2q, ~15
    and    v3q, ~15
    mova    m4, [v2q + orderq]
    mova    m5, [v3q + orderq]
    ; linear is faster than branch tree or jump table, because the branches taken are cyclic (i.e. predictable)
    cmp    r4d, 0
    je .loop0
    cmp    r4d, 2
    je .loop2
    cmp    r4d, 4
    je .loop4
    cmp    r4d, 6
    je .loop6
    cmp    r4d, 8
    je .loop8
    cmp    r4d, 10
    je .loop10
    cmp    r4d, 12
    je .loop12
SCALARPRODUCT_LOOP 14
SCALARPRODUCT_LOOP 12
SCALARPRODUCT_LOOP 10
SCALARPRODUCT_LOOP 8
SCALARPRODUCT_LOOP 6
SCALARPRODUCT_LOOP 4
SCALARPRODUCT_LOOP 2
SCALARPRODUCT_LOOP 0
.end:
    movhlps m0, m6
    paddd   m6, m0
    pshuflw m0, m6, 0x4e
    paddd   m6, m0
    movd   eax, m6
    RET


;-----------------------------------------------------------------------------
; void ff_apply_window_int16(int16_t *output, const int16_t *input,
;                            const int16_t *window, unsigned int len)
;-----------------------------------------------------------------------------
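; A rough scalar sketch of the rounded ("_ba", bit-exact) variants generated
; below; the plain mmxext/sse2 versions omit the +(1<<14) rounding term. The
; window is assumed to hold len/2 coefficients, applied mirror-symmetrically:
;   for (i = 0; i < len/2; i++) {
;       output[i]       = (input[i]       * window[i] + (1 << 14)) >> 15;
;       output[len-1-i] = (input[len-1-i] * window[i] + (1 << 14)) >> 15;
;   }
; len is assumed to be a multiple of the vector width (the loop below walks
; the buffers from both ends, mmsize bytes at a time).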

%macro REVERSE_WORDS_MMXEXT 1-2
    pshufw   %1, %1, 0x1B
%endmacro

%macro REVERSE_WORDS_SSE2 1-2
    pshuflw  %1, %1, 0x1B
    pshufhw  %1, %1, 0x1B
    pshufd   %1, %1, 0x4E
%endmacro

%macro REVERSE_WORDS_SSSE3 2
    pshufb  %1, %2
%endmacro

; dst = (dst * src) >> 15
; pmulhw cuts off the bottom bit, so we have to lshift by 1 and add it back
; in from the pmullw result.
%macro MUL16FIXED_MMXEXT 3 ; dst, src, temp
    mova    %3, %1
    pmulhw  %1, %2
    pmullw  %3, %2
    psrlw   %3, 15
    psllw   %1, 1
    por     %1, %3
%endmacro
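
; Worked example of the trick above (values chosen for illustration):
;   dst = 0x4000, src = 0x7FFF  ->  p = dst*src = 0x1FFFC000
;   pmulhw:  p >> 16     = 0x1FFF   (the bit needed for p >> 15 is lost)
;   pmullw:  p & 0xFFFF  = 0xC000;  psrlw 15 recovers that bit -> 1
;   (0x1FFF << 1) | 1    = 0x3FFF   = p >> 15, as desired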

; dst = ((dst * src) + (1<<14)) >> 15
%macro MUL16FIXED_SSSE3 3 ; dst, src, unused
    pmulhrsw   %1, %2
%endmacro

%macro APPLY_WINDOW_INT16 3 ; %1=instruction set, %2=mmxext/sse2 bit exact version, %3=has_ssse3
cglobal apply_window_int16_%1, 4,5,6, output, input, window, offset, offset2
    lea     offset2q, [offsetq-mmsize]
%if %2
    mova          m5, [pd_16384]
%elifidn %1, ssse3
    mova          m5, [pb_revwords]
    ALIGN 16
%endif
.loop:
%if %2
    ; This version expands 16-bit to 32-bit, multiplies by the window,
    ; adds 16384 for rounding, right shifts 15, then repacks back to words to
    ; save to the output. The window is reversed for the second half.
    mova          m3, [windowq+offset2q]
    mova          m4, [ inputq+offset2q]
    pxor          m0, m0
    punpcklwd     m0, m3
    punpcklwd     m1, m4
    pmaddwd       m0, m1
    paddd         m0, m5
    psrad         m0, 15
    pxor          m2, m2
    punpckhwd     m2, m3
    punpckhwd     m1, m4
    pmaddwd       m2, m1
    paddd         m2, m5
    psrad         m2, 15
    packssdw      m0, m2
    mova  [outputq+offset2q], m0
    REVERSE_WORDS m3
    mova          m4, [ inputq+offsetq]
    pxor          m0, m0
    punpcklwd     m0, m3
    punpcklwd     m1, m4
    pmaddwd       m0, m1
    paddd         m0, m5
    psrad         m0, 15
    pxor          m2, m2
    punpckhwd     m2, m3
    punpckhwd     m1, m4
    pmaddwd       m2, m1
    paddd         m2, m5
    psrad         m2, 15
    packssdw      m0, m2
    mova  [outputq+offsetq], m0
%elif %3
    ; This version does the 16x16->16 multiplication in-place without expanding
    ; to 32-bit. The ssse3 version is bit-identical.
    mova          m0, [windowq+offset2q]
    mova          m1, [ inputq+offset2q]
    pmulhrsw      m1, m0
    REVERSE_WORDS m0, m5
    pmulhrsw      m0, [ inputq+offsetq ]
    mova  [outputq+offset2q], m1
    mova  [outputq+offsetq ], m0
%else
    ; This version does the 16x16->16 multiplication in-place without expanding
    ; to 32-bit. The mmxext and sse2 versions do not use rounding, and
    ; therefore are not bit-identical to the C version.
    mova          m0, [windowq+offset2q]
    mova          m1, [ inputq+offset2q]
    mova          m2, [ inputq+offsetq ]
    MUL16FIXED    m1, m0, m3
    REVERSE_WORDS m0
    MUL16FIXED    m2, m0, m3
    mova  [outputq+offset2q], m1
    mova  [outputq+offsetq ], m2
%endif
    add      offsetd, mmsize
    sub     offset2d, mmsize
    jae .loop
    REP_RET
%endmacro

INIT_MMX
%define REVERSE_WORDS REVERSE_WORDS_MMXEXT
%define MUL16FIXED MUL16FIXED_MMXEXT
APPLY_WINDOW_INT16 mmxext,     0, 0
APPLY_WINDOW_INT16 mmxext_ba,  1, 0
INIT_XMM
%define REVERSE_WORDS REVERSE_WORDS_SSE2
APPLY_WINDOW_INT16 sse2,       0, 0
APPLY_WINDOW_INT16 sse2_ba,    1, 0
APPLY_WINDOW_INT16 ssse3_atom, 0, 1
%define REVERSE_WORDS REVERSE_WORDS_SSSE3
APPLY_WINDOW_INT16 ssse3,      0, 1


; void add_hfyu_median_prediction_mmx2(uint8_t *dst, const uint8_t *top, const uint8_t *diff, int w, int *left, int *left_top)
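; Rough scalar sketch of the median prediction vectorized below
; (illustrative; mid() is the median of its three arguments):
;   l = *left; tl = *left_top;
;   for (i = 0; i < w; i++) {
;       t  = top[i];
;       l  = mid(l, t, (l + t - tl) & 0xFF) + diff[i];   // 8-bit wraparound
;       tl = t;
;       dst[i] = l;
;   }
;   *left = l; *left_top = tl;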
cglobal add_hfyu_median_prediction_mmx2, 6,6,0, dst, top, diff, w, left, left_top
    movq    mm0, [topq]
    movq    mm2, mm0
    movd    mm4, [left_topq]
    psllq   mm2, 8
    movq    mm1, mm0
    por     mm4, mm2
    movd    mm3, [leftq]
    psubb   mm0, mm4 ; t-tl
    add    dstq, wq
    add    topq, wq
    add   diffq, wq
    neg      wq
    jmp .skip
.loop:
    movq    mm4, [topq+wq]
    movq    mm0, mm4
    psllq   mm4, 8
    por     mm4, mm1
    movq    mm1, mm0 ; t
    psubb   mm0, mm4 ; t-tl
.skip:
    movq    mm2, [diffq+wq]
%assign i 0
%rep 8
    movq    mm4, mm0
    paddb   mm4, mm3 ; t-tl+l
    movq    mm5, mm3
    pmaxub  mm3, mm1
    pminub  mm5, mm1
    pminub  mm3, mm4
    pmaxub  mm3, mm5 ; median
    paddb   mm3, mm2 ; +residual
%if i==0
    movq    mm7, mm3
    psllq   mm7, 56
%else
    movq    mm6, mm3
    psrlq   mm7, 8
    psllq   mm6, 56
    por     mm7, mm6
%endif
%if i<7
    psrlq   mm0, 8
    psrlq   mm1, 8
    psrlq   mm2, 8
%endif
%assign i i+1
%endrep
    movq [dstq+wq], mm7
    add      wq, 8
    jl .loop
    movzx   r2d, byte [dstq-1]
    mov [leftq], r2d
    movzx   r2d, byte [topq-1]
    mov [left_topq], r2d
    RET


%macro ADD_HFYU_LEFT_LOOP 1 ; %1 = is_aligned
    add     srcq, wq
    add     dstq, wq
    neg     wq
%%.loop:
    mova    m1, [srcq+wq]
    mova    m2, m1
    psllw   m1, 8
    paddb   m1, m2
    mova    m2, m1
    pshufb  m1, m3
    paddb   m1, m2
    pshufb  m0, m5
    mova    m2, m1
    pshufb  m1, m4
    paddb   m1, m2
%if mmsize == 16
    mova    m2, m1
    pshufb  m1, m6
    paddb   m1, m2
%endif
    paddb   m0, m1
%if %1
    mova    [dstq+wq], m0
%else
    movq    [dstq+wq], m0
    movhps  [dstq+wq+8], m0
%endif
    add     wq, mmsize
    jl %%.loop
    mov     eax, mmsize-1
    sub     eax, wd
    movd    m1, eax
    pshufb  m0, m1
    movd    eax, m0
    RET
%endmacro

; int add_hfyu_left_prediction(uint8_t *dst, const uint8_t *src, int w, int left)
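; Rough scalar sketch (illustrative): a byte-wise prefix sum seeded with
; 'left'; the SIMD loop above computes the same prefix sum with a few
; shift+pshufb+add passes per vector instead of w serial additions:
;   for (i = 0; i < w; i++)
;       left = dst[i] = (src[i] + left) & 0xFF;
;   return left;    // i.e. the value of the last output pixel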
INIT_MMX
cglobal add_hfyu_left_prediction_ssse3, 3,3,7, dst, src, w, left
.skip_prologue:
    mova    m5, [pb_7]
    mova    m4, [pb_zzzz3333zzzzbbbb]
    mova    m3, [pb_zz11zz55zz99zzdd]
    movd    m0, leftm
    psllq   m0, 56
    ADD_HFYU_LEFT_LOOP 1

INIT_XMM
cglobal add_hfyu_left_prediction_sse4, 3,3,7, dst, src, w, left
    mova    m5, [pb_f]
    mova    m6, [pb_zzzzzzzz77777777]
    mova    m4, [pb_zzzz3333zzzzbbbb]
    mova    m3, [pb_zz11zz55zz99zzdd]
    movd    m0, leftm
    pslldq  m0, 15
    test    srcq, 15
    jnz add_hfyu_left_prediction_ssse3.skip_prologue
    test    dstq, 15
    jnz .unaligned
    ADD_HFYU_LEFT_LOOP 1
.unaligned:
    ADD_HFYU_LEFT_LOOP 0


; float scalarproduct_float_sse(const float *v1, const float *v2, int len)
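; Rough scalar sketch (illustrative) of the dot product below; len is
; assumed to be a positive multiple of 4 and both pointers 16-byte aligned
; (movaps), and the float summation order differs from a serial loop:
;   float sum = 0.0f;
;   for (i = 0; i < len; i++)
;       sum += v1[i] * v2[i];
;   return sum;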
cglobal scalarproduct_float_sse, 3,3,2, v1, v2, offset
    neg offsetq
    shl offsetq, 2
    sub v1q, offsetq
    sub v2q, offsetq
    xorps xmm0, xmm0
    .loop:
        movaps   xmm1, [v1q+offsetq]
        mulps    xmm1, [v2q+offsetq]
        addps    xmm0, xmm1
        add      offsetq, 16
        js       .loop
    movhlps xmm1, xmm0
    addps   xmm0, xmm1
    movss   xmm1, xmm0
    shufps  xmm0, xmm0, 1
    addss   xmm0, xmm1
%ifndef ARCH_X86_64
    movd    r0m,  xmm0
    fld     dword r0m
%endif
    RET

; extern void ff_emu_edge_core(uint8_t *buf, const uint8_t *src, x86_reg linesize,
;                              x86_reg start_y, x86_reg end_y, x86_reg block_h,
;                              x86_reg start_x, x86_reg end_x, x86_reg block_w);
;
; The actual function itself is below. It basically wraps a very simple
; w = end_x - start_x
; if (w) {
;   if (w > 22) {
;     jump to the slow loop functions
;   } else {
;     jump to the fast loop functions
;   }
; }
;
; ... and then the same for left/right extend also. See below for loop
; function implementations. Fast are fixed-width, slow is variable-width

%macro EMU_EDGE_FUNC 0
%ifdef ARCH_X86_64
%define w_reg r10
cglobal emu_edge_core, 6, 7, 1
    mov        r11, r5          ; save block_h
%else
%define w_reg r6
cglobal emu_edge_core, 2, 7, 0
    mov         r4, r4m         ; end_y
    mov         r5, r5m         ; block_h
%endif

    ; start with vertical extend (top/bottom) and body pixel copy
    mov      w_reg, r7m
    sub      w_reg, r6m         ; w = end_x - start_x
    sub         r5, r4
%ifdef ARCH_X86_64
    sub         r4, r3
%else
    sub         r4, dword r3m
%endif
    cmp      w_reg, 22
    jg .slow_v_extend_loop
%ifdef ARCH_X86_32
    mov         r2, r2m         ; linesize
%endif
    sal      w_reg, 7           ; w * 128
%ifdef PIC
    lea        rax, [.emuedge_v_extend_1 - (.emuedge_v_extend_2 - .emuedge_v_extend_1)]
    add      w_reg, rax
%else
    lea      w_reg, [.emuedge_v_extend_1 - (.emuedge_v_extend_2 - .emuedge_v_extend_1)+w_reg]
%endif
    call     w_reg              ; fast top extend, body copy and bottom extend
.v_extend_end:

    ; horizontal extend (left/right)
    mov      w_reg, r6m         ; start_x
    sub         r0, w_reg
%ifdef ARCH_X86_64
    mov         r3, r0          ; backup of buf+block_h*linesize
    mov         r5, r11
%else
    mov        r0m, r0          ; backup of buf+block_h*linesize
    mov         r5, r5m
%endif
    test     w_reg, w_reg
    jz .right_extend
    cmp      w_reg, 22
    jg .slow_left_extend_loop
    mov         r1, w_reg
    dec      w_reg
    ; FIXME we can do a if size == 1 here if that makes any speed difference, test me
    sar      w_reg, 1
    sal      w_reg, 6
    ; r0=buf+block_h*linesize,r10(64)/r6(32)=start_x offset for funcs
    ; r6(rax)/r3(ebx)=val,r2=linesize,r1=start_x,r5=block_h
%ifdef PIC
    lea        rax, [.emuedge_extend_left_2]
    add      w_reg, rax
%else
    lea      w_reg, [.emuedge_extend_left_2+w_reg]
%endif
    call     w_reg

    ; now r3(64)/r0(32)=buf,r2=linesize,r11/r5=block_h,r6/r3=val, r10/r6=end_x, r1=block_w
.right_extend:
%ifdef ARCH_X86_32
    mov         r0, r0m
    mov         r5, r5m
%endif
    mov      w_reg, r7m         ; end_x
    mov         r1, r8m         ; block_w
    mov         r4, r1
    sub         r1, w_reg
    jz .h_extend_end            ; if (end_x == block_w) goto h_extend_end
    cmp         r1, 22
    jg .slow_right_extend_loop
    dec         r1
    ; FIXME we can do a if size == 1 here if that makes any speed difference, test me
    sar         r1, 1
    sal         r1, 6
%ifdef PIC
    lea        rax, [.emuedge_extend_right_2]
    add         r1, rax
%else
    lea         r1, [.emuedge_extend_right_2+r1]
%endif
    call        r1
.h_extend_end:
    RET

%ifdef ARCH_X86_64
%define vall  al
%define valh  ah
%define valw  ax
%define valw2 r10w
%define valw3 r3w
%ifdef WIN64
%define valw4 r4w
%else ; unix64
%define valw4 r3w
%endif
%define vald eax
%else
%define vall  bl
%define valh  bh
%define valw  bx
%define valw2 r6w
%define valw3 valw2
%define valw4 valw3
%define vald ebx
%define stack_offset 0x14
%endif

%endmacro

; macro to read/write a horizontal number of pixels (%2) to/from registers
; on x86-64, - fills xmm0-15 for consecutive sets of 16 pixels
;            - if (%2 & 15 == 8) fills the last 8 bytes into rax
;            - else if (%2 & 8)  fills 8 bytes into mm0
;            - if (%2 & 7 == 4)  fills the last 4 bytes into rax
;            - else if (%2 & 4)  fills 4 bytes into mm0-1
;            - if (%2 & 3 == 3)  fills 2 bytes into r10/r3, and 1 into eax
;              (note that we're using r3 for body/bottom because it's a shorter
;               opcode, and then the loop fits in 128 bytes)
;            - else              fills remaining bytes into rax
; on x86-32, - fills mm0-7 for consecutive sets of 8 pixels
;            - if (%2 & 7 == 4)  fills 4 bytes into ebx
;            - else if (%2 & 4)  fills 4 bytes into mm0-7
;            - if (%2 & 3 == 3)  fills 2 bytes into r6, and 1 into ebx
;            - else              fills remaining bytes into ebx
; writing data out works the same way
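; Illustrative example of the decomposition above: a 22-pixel line becomes
;   x86-64 + SSE:  16 bytes -> xmm0 (movups), 4 bytes -> mm0 (movd),
;                   2 bytes -> valw
;   x86-32, MMX:    8+8 bytes -> mm0/mm1 (movq), 4 bytes -> mm2 (movd),
;                   2 bytes -> valw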
%macro READ_NUM_BYTES 2
%assign %%src_off 0 ; offset in source buffer
%assign %%smidx   0 ; mmx register idx
%assign %%sxidx   0 ; xmm register idx

%if cpuflag(sse)
%rep %2/16
    movups xmm %+ %%sxidx, [r1+%%src_off]
%assign %%src_off %%src_off+16
%assign %%sxidx   %%sxidx+1
%endrep ; %2/16
%endif

%ifdef ARCH_X86_64
%if (%2-%%src_off) == 8
    mov           rax, [r1+%%src_off]
%assign %%src_off %%src_off+8
%endif ; (%2-%%src_off) == 8
%endif ; x86-64

%rep (%2-%%src_off)/8
    movq    mm %+ %%smidx, [r1+%%src_off]
%assign %%src_off %%src_off+8
%assign %%smidx   %%smidx+1
%endrep ; (%2-%%src_off)/8

%if (%2-%%src_off) == 4
    mov          vald, [r1+%%src_off]
%elif (%2-%%src_off) & 4
    movd    mm %+ %%smidx, [r1+%%src_off]
%assign %%src_off %%src_off+4
%endif ; (%2-%%src_off) ==/& 4

%if (%2-%%src_off) == 1
    mov          vall, [r1+%%src_off]
%elif (%2-%%src_off) == 2
    mov          valw, [r1+%%src_off]
%elif (%2-%%src_off) == 3
%ifidn %1, top
    mov         valw2, [r1+%%src_off]
%elifidn %1, body
    mov         valw3, [r1+%%src_off]
%elifidn %1, bottom
    mov         valw4, [r1+%%src_off]
%endif ; %1 ==/!= top
    mov          vall, [r1+%%src_off+2]
%endif ; (%2-%%src_off) == 1/2/3
%endmacro ; READ_NUM_BYTES

%macro WRITE_NUM_BYTES 2
%assign %%dst_off 0 ; offset in destination buffer
%assign %%dmidx   0 ; mmx register idx
%assign %%dxidx   0 ; xmm register idx

%if cpuflag(sse)
%rep %2/16
    movups [r0+%%dst_off], xmm %+ %%dxidx
%assign %%dst_off %%dst_off+16
%assign %%dxidx   %%dxidx+1
%endrep ; %2/16
%endif

%ifdef ARCH_X86_64
%if (%2-%%dst_off) == 8
    mov    [r0+%%dst_off], rax
%assign %%dst_off %%dst_off+8
%endif ; (%2-%%dst_off) == 8
%endif ; x86-64

%rep (%2-%%dst_off)/8
    movq   [r0+%%dst_off], mm %+ %%dmidx
%assign %%dst_off %%dst_off+8
%assign %%dmidx   %%dmidx+1
%endrep ; (%2-%%dst_off)/8

%if (%2-%%dst_off) == 4
    mov    [r0+%%dst_off], vald
%elif (%2-%%dst_off) & 4
    movd   [r0+%%dst_off], mm %+ %%dmidx
%assign %%dst_off %%dst_off+4
%endif ; (%2-%%dst_off) ==/& 4

%if (%2-%%dst_off) == 1
    mov    [r0+%%dst_off], vall
%elif (%2-%%dst_off) == 2
    mov    [r0+%%dst_off], valw
%elif (%2-%%dst_off) == 3
%ifidn %1, top
    mov    [r0+%%dst_off], valw2
%elifidn %1, body
    mov    [r0+%%dst_off], valw3
%elifidn %1, bottom
    mov    [r0+%%dst_off], valw4
%endif ; %1 ==/!= top
    mov  [r0+%%dst_off+2], vall
%endif ; (%2-%%dst_off) == 1/2/3
%endmacro ; WRITE_NUM_BYTES

; vertical top/bottom extend and body copy fast loops
; these are function pointers to set-width line copy functions, i.e.
; they read a fixed number of pixels into set registers, and write
; those out into the destination buffer
; r0=buf,r1=src,r2=linesize,r3(64)/r3m(32)=start_x,r4=end_y,r5=block_h
; r6(eax/64)/r3(ebx/32)=val_reg
%macro VERTICAL_EXTEND 0
%assign %%n 1
%rep 22
ALIGN 128
.emuedge_v_extend_ %+ %%n:
    ; extend pixels above body
%ifdef ARCH_X86_64
    test           r3 , r3                   ; if (!start_y)
    jz .emuedge_copy_body_ %+ %%n %+ _loop   ;   goto body
%else ; ARCH_X86_32
    cmp      dword r3m, 0
    je .emuedge_copy_body_ %+ %%n %+ _loop
%endif ; ARCH_X86_64/32
    READ_NUM_BYTES  top,    %%n              ; read bytes
.emuedge_extend_top_ %+ %%n %+ _loop:        ; do {
    WRITE_NUM_BYTES top,    %%n              ;   write bytes
    add            r0 , r2                   ;   dst += linesize
%ifdef ARCH_X86_64
    dec            r3d
%else ; ARCH_X86_32
    dec      dword r3m
%endif ; ARCH_X86_64/32
    jnz .emuedge_extend_top_ %+ %%n %+ _loop ; } while (--start_y)

    ; copy body pixels
.emuedge_copy_body_ %+ %%n %+ _loop:         ; do {
    READ_NUM_BYTES  body,   %%n              ;   read bytes
    WRITE_NUM_BYTES body,   %%n              ;   write bytes
    add            r0 , r2                   ;   dst += linesize
    add            r1 , r2                   ;   src += linesize
    dec            r4d
    jnz .emuedge_copy_body_ %+ %%n %+ _loop  ; } while (--end_y)

    ; copy bottom pixels
    test           r5 , r5                   ; if (!block_h)
    jz .emuedge_v_extend_end_ %+ %%n         ;   goto end
    sub            r1 , r2                   ; src -= linesize
    READ_NUM_BYTES  bottom, %%n              ; read bytes
.emuedge_extend_bottom_ %+ %%n %+ _loop:     ; do {
    WRITE_NUM_BYTES bottom, %%n              ;   write bytes
    add            r0 , r2                   ;   dst += linesize
    dec            r5d
    jnz .emuedge_extend_bottom_ %+ %%n %+ _loop ; } while (--block_h)

.emuedge_v_extend_end_ %+ %%n:
%ifdef ARCH_X86_64
    ret
%else ; ARCH_X86_32
    rep ret
%endif ; ARCH_X86_64/32
%assign %%n %%n+1
%endrep
%endmacro ; VERTICAL_EXTEND

; left/right (horizontal) fast extend functions
; these are essentially identical to the vertical extend ones above,
; just left/right separated because the number of pixels to extend is
; obviously not the same on both sides.
; for reading, pixels are placed in eax (x86-64) or ebx (x86-32) in the
; lowest two bytes of the register (so val*0x0101), and are splatted
; into each byte of mm0 as well if n_pixels >= 8
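; Tiny worked example of the splat described above (illustrative values):
;   uint8_t  p   = src[x];                        // say 0x42
;   uint16_t val = p * 0x0101;                    // 0x4242 in ax/bx
;   uint64_t mm0 = val * 0x0001000100010001ULL;   // 0x4242424242424242,
;                                                 // only when n_pixels >= 8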

%macro READ_V_PIXEL 2
    mov        vall, %2
    mov        valh, vall
%if %1 >= 8
    movd        mm0, vald
%if cpuflag(mmx2)
    pshufw      mm0, mm0, 0
%else ; mmx
    punpcklwd   mm0, mm0
    punpckldq   mm0, mm0
%endif ; mmx2
%endif ; %1 >= 8
%endmacro

%macro WRITE_V_PIXEL 2
%assign %%dst_off 0
%rep %1/8
    movq [%2+%%dst_off], mm0
%assign %%dst_off %%dst_off+8
%endrep
%if %1 & 4
%if %1 >= 8
    movd [%2+%%dst_off], mm0
%else ; %1 < 8
    mov  [%2+%%dst_off]  , valw
    mov  [%2+%%dst_off+2], valw
%endif ; %1 >=/< 8
%assign %%dst_off %%dst_off+4
%endif ; %1 & 4
%if %1&2
    mov  [%2+%%dst_off], valw
%endif ; %1 & 2
%endmacro

; r0=buf+block_h*linesize, r1=start_x, r2=linesize, r5=block_h, r6/r3=val
%macro LEFT_EXTEND 0
%assign %%n 2
%rep 11
ALIGN 64
.emuedge_extend_left_ %+ %%n:          ; do {
    sub         r0, r2                 ;   dst -= linesize
    READ_V_PIXEL  %%n, [r0+r1]         ;   read pixels
    WRITE_V_PIXEL %%n, r0              ;   write pixels
    dec         r5
    jnz .emuedge_extend_left_ %+ %%n   ; } while (--block_h)
%ifdef ARCH_X86_64
    ret
%else ; ARCH_X86_32
    rep ret
%endif ; ARCH_X86_64/32
%assign %%n %%n+2
%endrep
%endmacro ; LEFT_EXTEND

; r3/r0=buf+block_h*linesize, r2=linesize, r11/r5=block_h, r0/r6=end_x, r6/r3=val
%macro RIGHT_EXTEND 0
%assign %%n 2
%rep 11
ALIGN 64
.emuedge_extend_right_ %+ %%n:          ; do {
%ifdef ARCH_X86_64
    sub        r3, r2                   ;   dst -= linesize
    READ_V_PIXEL  %%n, [r3+w_reg-1]     ;   read pixels
    WRITE_V_PIXEL %%n, r3+r4-%%n        ;   write pixels
    dec       r11
%else ; ARCH_X86_32
    sub        r0, r2                   ;   dst -= linesize
    READ_V_PIXEL  %%n, [r0+w_reg-1]     ;   read pixels
    WRITE_V_PIXEL %%n, r0+r4-%%n        ;   write pixels
    dec     r5
%endif ; ARCH_X86_64/32
    jnz .emuedge_extend_right_ %+ %%n   ; } while (--block_h)
%ifdef ARCH_X86_64
    ret
%else ; ARCH_X86_32
    rep ret
%endif ; ARCH_X86_64/32
%assign %%n %%n+2
%endrep

%ifdef ARCH_X86_32
%define stack_offset 0x10
%endif
%endmacro ; RIGHT_EXTEND

; below follow the "slow" copy/extend functions; these act on a non-fixed
; width specified in a register, and run a loop to copy the full amount
; of bytes. They are optimized for copying large numbers of pixels per
; line, so they unconditionally splat data into mm registers to copy 8
; bytes per loop iteration. It could be considered to use xmm for x86-64
; also, but I haven't optimized this as much (i.e. FIXME)
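; Rough C-like sketch of one V_COPY_ROW row pass (illustrative; copyN() are
; hypothetical helpers standing in for the movups/movq/mov of N bytes, and
; the SSE path is shown -- the MMX path loops on 8-byte movq instead):
;   cnt = 0;
;   do { copy16(dst+cnt, src+cnt); cnt += 16; w -= 16; } while (w & ~15);
;   if (w & 8) { copy8(dst+cnt, src+cnt); cnt += 8; }
;   if (w & 4) { copy4(dst+cnt, src+cnt); cnt += 4; }
;   if (w & 2) { copy2(dst+cnt, src+cnt); cnt += 2; }
;   if (w & 1) { copy1(dst+cnt, src+cnt); cnt += 1; }
;   w = cnt;                       // restore the full width for the next row
;   dst += linesize; src += linesize (body rows only); repeat per row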
%macro V_COPY_NPX 4-5
%if %0 == 4
    test     w_reg, %4
    jz .%1_skip_%4_px
%else ; %0 == 5
.%1_%4_px_loop:
%endif
    %3          %2, [r1+cnt_reg]
    %3 [r0+cnt_reg], %2
    add    cnt_reg, %4
%if %0 == 5
    sub      w_reg, %4
    test     w_reg, %5
    jnz .%1_%4_px_loop
%endif
.%1_skip_%4_px:
%endmacro

%macro V_COPY_ROW 2
%ifidn %1, bottom
    sub         r1, linesize
%endif
.%1_copy_loop:
    xor    cnt_reg, cnt_reg
%if notcpuflag(sse)
%define linesize r2m
    V_COPY_NPX %1,  mm0, movq,    8, 0xFFFFFFF8
%else ; sse
    V_COPY_NPX %1, xmm0, movups, 16, 0xFFFFFFF0
%ifdef ARCH_X86_64
%define linesize r2
    V_COPY_NPX %1, rax , mov,     8
%else ; ARCH_X86_32
%define linesize r2m
    V_COPY_NPX %1,  mm0, movq,    8
%endif ; ARCH_X86_64/32
%endif ; sse
    V_COPY_NPX %1, vald, mov,     4
    V_COPY_NPX %1, valw, mov,     2
    V_COPY_NPX %1, vall, mov,     1
    mov      w_reg, cnt_reg
%ifidn %1, body
    add         r1, linesize
%endif
    add         r0, linesize
    dec         %2
    jnz .%1_copy_loop
%endmacro

%macro SLOW_V_EXTEND 0
.slow_v_extend_loop:
; r0=buf,r1=src,r2(64)/r2m(32)=linesize,r3(64)/r3m(32)=start_x,r4=end_y,r5=block_h
; r11(64)/r3(later-64)/r2(32)=cnt_reg,r6(64)/r3(32)=val_reg,r10(64)/r6(32)=w=end_x-start_x
%ifdef ARCH_X86_64
    push       r11              ; save old value of block_h
    test        r3, r3
%define cnt_reg r11
    jz .do_body_copy            ; if (!start_y) goto do_body_copy
    V_COPY_ROW top, r3
%else
    cmp  dword r3m, 0
%define cnt_reg r2
    je .do_body_copy            ; if (!start_y) goto do_body_copy
    V_COPY_ROW top, dword r3m
%endif

.do_body_copy:
    V_COPY_ROW body, r4

%ifdef ARCH_X86_64
    pop        r11              ; restore old value of block_h
%define cnt_reg r3
%endif
    test        r5, r5
%ifdef ARCH_X86_64
    jz .v_extend_end
%else
    jz .skip_bottom_extend
%endif
    V_COPY_ROW bottom, r5
%ifdef ARCH_X86_32
.skip_bottom_extend:
    mov         r2, r2m
%endif
    jmp .v_extend_end
%endmacro

%macro SLOW_LEFT_EXTEND 0
.slow_left_extend_loop:
; r0=buf+block_h*linesize,r2=linesize,r6(64)/r3(32)=val,r5=block_h,r4=cntr,r10/r6=start_x
    mov         r4, 8
    sub         r0, linesize
    READ_V_PIXEL 8, [r0+w_reg]
.left_extend_8px_loop:
    movq [r0+r4-8], mm0
    add         r4, 8
    cmp         r4, w_reg
    jle .left_extend_8px_loop
    sub         r4, 8
    cmp         r4, w_reg
    jge .left_extend_loop_end
.left_extend_2px_loop:
    mov    [r0+r4], valw
    add         r4, 2
    cmp         r4, w_reg
    jl .left_extend_2px_loop
.left_extend_loop_end:
    dec         r5
    jnz .slow_left_extend_loop
%ifdef ARCH_X86_32
    mov         r2, r2m
%endif
    jmp .right_extend
%endmacro

%macro SLOW_RIGHT_EXTEND 0
.slow_right_extend_loop:
; r3(64)/r0(32)=buf+block_h*linesize,r2=linesize,r4=block_w,r11(64)/r5(32)=block_h,
; r10(64)/r6(32)=end_x,r6/r3=val,r1=cntr
%ifdef ARCH_X86_64
%define buf_reg r3
%define bh_reg r11
%else
%define buf_reg r0
%define bh_reg r5
%endif
    lea         r1, [r4-8]
    sub    buf_reg, linesize
    READ_V_PIXEL 8, [buf_reg+w_reg-1]
.right_extend_8px_loop:
    movq [buf_reg+r1], mm0
    sub         r1, 8
    cmp         r1, w_reg
    jge .right_extend_8px_loop
    add         r1, 8
    cmp         r1, w_reg
    je .right_extend_loop_end
.right_extend_2px_loop:
    sub         r1, 2
    mov [buf_reg+r1], valw
    cmp         r1, w_reg
    jg .right_extend_2px_loop
.right_extend_loop_end:
    dec         bh_reg
    jnz .slow_right_extend_loop
    jmp .h_extend_end
%endmacro

%macro emu_edge 1
INIT_XMM %1
EMU_EDGE_FUNC
VERTICAL_EXTEND
LEFT_EXTEND
RIGHT_EXTEND
SLOW_V_EXTEND
SLOW_LEFT_EXTEND
SLOW_RIGHT_EXTEND
%endmacro

emu_edge sse
%ifdef ARCH_X86_32
emu_edge mmx
%endif

;-----------------------------------------------------------------------------
; void ff_vector_clip_int32(int32_t *dst, const int32_t *src, int32_t min,
;                           int32_t max, unsigned int len)
;-----------------------------------------------------------------------------
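; Rough scalar sketch (illustrative) of the clipping kernel generated below:
;   for (i = 0; i < len; i++)
;       dst[i] = av_clip(src[i], min, max);   // FFMAX(min, FFMIN(src[i], max))
; len is assumed to be a multiple of mmsize*(%2+%3) int32_t elements and
; src/dst are assumed to be mmsize-aligned (mova loads/stores).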

; %1 = number of xmm registers used
; %2 = number of inline load/process/store loops per asm loop
; %3 = process 4*mmsize (%3=0) or 8*mmsize (%3=1) bytes per loop
; %4 = CLIPD function takes min/max as float instead of int (CLIPD_SSE2)
; %5 = suffix
%macro VECTOR_CLIP_INT32 4-5
cglobal vector_clip_int32%5, 5,5,%1, dst, src, min, max, len
%if %4
    cvtsi2ss  m4, minm
    cvtsi2ss  m5, maxm
%else
    movd      m4, minm
    movd      m5, maxm
%endif
    SPLATD    m4
    SPLATD    m5
.loop:
%assign %%i 1
%rep %2
    mova      m0,  [srcq+mmsize*0*%%i]
    mova      m1,  [srcq+mmsize*1*%%i]
    mova      m2,  [srcq+mmsize*2*%%i]
    mova      m3,  [srcq+mmsize*3*%%i]
%if %3
    mova      m7,  [srcq+mmsize*4*%%i]
    mova      m8,  [srcq+mmsize*5*%%i]
    mova      m9,  [srcq+mmsize*6*%%i]
    mova      m10, [srcq+mmsize*7*%%i]
%endif
    CLIPD  m0,  m4, m5, m6
    CLIPD  m1,  m4, m5, m6
    CLIPD  m2,  m4, m5, m6
    CLIPD  m3,  m4, m5, m6
%if %3
    CLIPD  m7,  m4, m5, m6
    CLIPD  m8,  m4, m5, m6
    CLIPD  m9,  m4, m5, m6
    CLIPD  m10, m4, m5, m6
%endif
    mova  [dstq+mmsize*0*%%i], m0
    mova  [dstq+mmsize*1*%%i], m1
    mova  [dstq+mmsize*2*%%i], m2
    mova  [dstq+mmsize*3*%%i], m3
%if %3
    mova  [dstq+mmsize*4*%%i], m7
    mova  [dstq+mmsize*5*%%i], m8
    mova  [dstq+mmsize*6*%%i], m9
    mova  [dstq+mmsize*7*%%i], m10
%endif
%assign %%i %%i+1
%endrep
    add     srcq, mmsize*4*(%2+%3)
    add     dstq, mmsize*4*(%2+%3)
    sub     lend, mmsize*(%2+%3)
    jg .loop
    REP_RET
%endmacro

INIT_MMX mmx
%define SPLATD SPLATD_MMX
%define CLIPD CLIPD_MMX
VECTOR_CLIP_INT32 0, 1, 0, 0
INIT_XMM sse2
%define SPLATD SPLATD_SSE2
VECTOR_CLIP_INT32 6, 1, 0, 0, _int
%define CLIPD CLIPD_SSE2
VECTOR_CLIP_INT32 6, 2, 0, 1
INIT_XMM sse4
%define CLIPD CLIPD_SSE41
%ifdef m8
VECTOR_CLIP_INT32 11, 1, 1, 0
%else
VECTOR_CLIP_INT32 6, 1, 0, 0
%endif

;-----------------------------------------------------------------------------
; void ff_butterflies_float_interleave(float *dst, const float *src0,
;                                      const float *src1, int len);
;-----------------------------------------------------------------------------
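; Rough scalar sketch (illustrative) of the butterfly + interleave below:
;   for (i = 0; i < len; i++) {
;       float t    = src0[i] - src1[i];
;       dst[2*i]   = src0[i] + src1[i];
;       dst[2*i+1] = t;
;   }
; len is assumed to be a multiple of mmsize/4 floats, with aligned pointers.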

%macro BUTTERFLIES_FLOAT_INTERLEAVE 0
cglobal butterflies_float_interleave, 4,4,3, dst, src0, src1, len
%ifdef ARCH_X86_64
    movsxd    lenq, lend
%endif
    test      lenq, lenq
    jz .end
    shl       lenq, 2
    lea      src0q, [src0q +   lenq]
    lea      src1q, [src1q +   lenq]
    lea       dstq, [ dstq + 2*lenq]
    neg       lenq
.loop:
    mova        m0, [src0q + lenq]
    mova        m1, [src1q + lenq]
    subps       m2, m0, m1
    addps       m0, m0, m1
    unpcklps    m1, m0, m2
    unpckhps    m0, m0, m2
%if cpuflag(avx)
    vextractf128 [dstq + 2*lenq     ], m1, 0
    vextractf128 [dstq + 2*lenq + 16], m0, 0
    vextractf128 [dstq + 2*lenq + 32], m1, 1
    vextractf128 [dstq + 2*lenq + 48], m0, 1
%else
    mova [dstq + 2*lenq         ], m1
    mova [dstq + 2*lenq + mmsize], m0
%endif
    add       lenq, mmsize
    jl .loop
%if mmsize == 32
    vzeroupper
    RET
%endif
.end:
    REP_RET
%endmacro

INIT_XMM sse
BUTTERFLIES_FLOAT_INTERLEAVE
INIT_YMM avx
BUTTERFLIES_FLOAT_INTERLEAVE